diff --git a/official/cv/cspdarknet53/train.py b/official/cv/cspdarknet53/train.py index edd391d14acfcc68fc0c6fc7ec5ce26cb9e0a4b4..69c8a4976ac43d8dea2fea1a0ddca9b0046e0c7d 100644 --- a/official/cv/cspdarknet53/train.py +++ b/official/cv/cspdarknet53/train.py @@ -47,8 +47,8 @@ class ProgressMonitor(Callback): """monitor loss and cost time.""" def __init__(self, args): super(ProgressMonitor, self).__init__() - self.me_epoch_start_time = 0 - self.me_epoch_start_step_num = 0 + self.me_epoch_start_time = time.time() + self.me_epoch_start_step_num = -1 self.args = args def epoch_end(self, run_context): diff --git a/official/cv/yolov4/train.py b/official/cv/yolov4/train.py index ba42ad92498a994e0de671e6d9b7ac66d02b638e..5111d98144082d66504e0498386bc1d60b3c2023 100644 --- a/official/cv/yolov4/train.py +++ b/official/cv/yolov4/train.py @@ -273,7 +273,7 @@ def run_train(): for i, data in enumerate(data_loader): images = data["image"] input_shape = images.shape[2:4] - config.logger.info('iter[%d], shape%d', i, input_shape[0]) + config.logger.info('iter[%d], shape%d', i + 1, input_shape[0]) images = Tensor.from_numpy(images) batch_y_true_0 = Tensor.from_numpy(data['bbox1']) @@ -294,12 +294,12 @@ def run_train(): cb_params.batch_num = i + 2 ckpt_cb.step_end(run_context) - if i % config.log_interval == 0: + if (i + 1) % config.log_interval == 0: time_used = time.time() - t_end - epoch = int(i / config.steps_per_epoch) + epoch = int((i + 1) / config.steps_per_epoch) fps = config.per_batch_size * (i - old_progress) * config.group_size / time_used if config.rank == 0: - config.logger.info('epoch[{}], iter[{}], {}, pre step time: {:.2f} ms, fps: {:.2f}, lr:{}'.format( + config.logger.info('epoch[{}], iter[{}], {}, per step time: {:.2f} ms, fps: {:.2f}, lr:{}'.format( epoch, i, loss_meter, 1000 * time_used / (i - old_progress), fps, lr[i])) t_end = time.time() loss_meter.reset() diff --git a/research/cv/FaceAttribute/train.py b/research/cv/FaceAttribute/train.py index 
a844e10bd11d2b62140338e115e42e28bb7913d6..213449a3682388579c99321ebd3407fca0986f71 100644 --- a/research/cv/FaceAttribute/train.py +++ b/research/cv/FaceAttribute/train.py @@ -209,7 +209,7 @@ def run_train(): cb_params.batch_num = i + 2 ckpt_cb.step_end(run_context) - if i % config.steps_per_epoch == 0 and config.local_rank == 0: + if (i + 1) % config.steps_per_epoch == 0 and config.local_rank == 0: cb_params.cur_epoch_num += 1 if i == 0: @@ -217,25 +217,25 @@ def run_train(): config.logger.important_info( '{}, graph compile time={:.2f}s'.format(config.backbone, time_for_graph_compile)) - if i % config.log_interval == 0 and config.local_rank == 0: + if (i + 1) % config.log_interval == 0 and config.local_rank == 0: time_used = time.time() - t_end - epoch = int(i / config.steps_per_epoch) + epoch = int((i + 1) / config.steps_per_epoch) fps = config.per_batch_size * (i - old_progress) * config.world_size / time_used - config.logger.info('epoch[{}], iter[{}], {}, {:.2f} imgs/sec'.format(epoch, i, loss_meter, fps)) + config.logger.info('epoch[{}], iter[{}], {}, {:.2f} imgs/sec'.format(epoch, i + 1, loss_meter, fps)) t_end = time.time() loss_meter.reset() old_progress = i - if i % config.steps_per_epoch == 0 and config.local_rank == 0: + if (i + 1) % config.steps_per_epoch == 0 and config.local_rank == 0: epoch_time_used = time.time() - t_epoch - epoch = int(i / config.steps_per_epoch) + epoch = int((i + 1) / config.steps_per_epoch) fps = config.per_batch_size * config.world_size * config.steps_per_epoch / epoch_time_used - pre_step_time = epoch_time_used / config.steps_per_epoch + per_step_time = epoch_time_used / config.steps_per_epoch config.logger.info('=================================================') - config.logger.info('epoch[{}], iter[{}], {:.2f} imgs/sec'.format(epoch, i, fps)) - config.logger.info('epoch[{}], epoch time: {:5.3f} ms, pre step time: {:5.3f} ms'.format( - epoch, epoch_time_used * 1000, pre_step_time * 1000)) + config.logger.info('epoch[{}], 
iter[{}], {:.2f} imgs/sec'.format(epoch, i + 1, fps)) + config.logger.info('epoch[{}], epoch time: {:5.3f} ms, per step time: {:5.3f} ms'.format( + epoch, epoch_time_used * 1000, per_step_time * 1000)) config.logger.info('=================================================') t_epoch = time.time() diff --git a/research/cv/FaceDetection/train.py b/research/cv/FaceDetection/train.py index aec77c4af25e7b1dc18252cc2de32d393817f751..1b8955fcb0408d51b0c3649518bc89f027ac3f42 100644 --- a/research/cv/FaceDetection/train.py +++ b/research/cv/FaceDetection/train.py @@ -169,12 +169,12 @@ def run_train(): else: scale_manager.update_loss_scale(False) config.logger.info('rank[{:d}], iter[{}], loss[{}], overflow:{}, loss_scale:{}, lr:{}, batch_images:{}, ' - 'batch_labels:{}'.format(config.local_rank, i, loss0, overflow, scaling_sens, + 'batch_labels:{}'.format(config.local_rank, i + 1, loss0, overflow, scaling_sens, config.lr[i], batch_images.shape, batch_labels.shape)) else: loss0 = train_net(*input_list) config.logger.info('rank[{:d}], iter[{}], loss[{}], lr:{}, batch_images:{}, ' - 'batch_labels:{}'.format(config.local_rank, i, loss0, + 'batch_labels:{}'.format(config.local_rank, i + 1, loss0, config.lr[i], batch_images.shape, batch_labels.shape)) # save ckpt cb_params.cur_step_num = i + 1 # current step number @@ -187,24 +187,24 @@ def run_train(): time_for_graph_compile = time.time() - create_network_start config.logger.important_info('Yolov3, graph compile time={:.2f}s'.format(time_for_graph_compile)) - if i % config.steps_per_epoch == 0: + if (i + 1) % config.steps_per_epoch == 0: cb_params.cur_epoch_num += 1 - if i % config.log_interval == 0 and config.local_rank == 0: + if (i + 1) % config.log_interval == 0 and config.local_rank == 0: time_used = time.time() - t_end - epoch = int(i / config.steps_per_epoch) + epoch = int((i + 1) / config.steps_per_epoch) fps = config.batch_size * (i - old_progress) * config.world_size / time_used - config.logger.info('epoch[{}], iter[{}], 
loss:[{}], {:.2f} imgs/sec'.format(epoch, i, loss0, fps)) + config.logger.info('epoch[{}], iter[{}], loss:[{}], {:.2f} imgs/sec'.format(epoch, i + 1, loss0, fps)) t_end = time.time() old_progress = i - if i % config.steps_per_epoch == 0 and config.local_rank == 0: + if (i + 1) % config.steps_per_epoch == 0 and config.local_rank == 0: epoch_time_used = time.time() - t_epoch - epoch = int(i / config.steps_per_epoch) + epoch = int((i + 1) / config.steps_per_epoch) fps = config.batch_size * config.world_size * config.steps_per_epoch / epoch_time_used config.logger.info('=================================================') - config.logger.info('epoch[{}], iter[{}], {:.2f} imgs/sec'.format(epoch, i, fps)) - config.logger.info('epoch[{}], epoch time: {:5.3f} ms, pre step time: {:5.3f} ms'.format( + config.logger.info('epoch[{}], iter[{}], {:.2f} imgs/sec'.format(epoch, i + 1, fps)) + config.logger.info('epoch[{}], epoch time: {:5.3f} ms, per step time: {:5.3f} ms'.format( epoch, epoch_time_used * 1000, epoch_time_used * 1000 / config.steps_per_epoch)) config.logger.info('=================================================') t_epoch = time.time() diff --git a/research/cv/FaceQualityAssessment/train.py b/research/cv/FaceQualityAssessment/train.py index 461ab75208ce01586a9da2944dc9916650abf5b8..2f998867efde5881c6a99a75d77b726754cae741 100644 --- a/research/cv/FaceQualityAssessment/train.py +++ b/research/cv/FaceQualityAssessment/train.py @@ -206,24 +206,24 @@ def run_train(): time_for_graph_compile = time.time() - create_network_start cfg.logger.important_info('{}, graph compile time={:.2f}s'.format(cfg.task, time_for_graph_compile)) - if i % cfg.log_interval == 0 and cfg.local_rank == 0: + if (i + 1) % cfg.log_interval == 0 and cfg.local_rank == 0: time_used = time.time() - t_end - epoch = int(i / cfg.steps_per_epoch) + epoch = int((i + 1) / cfg.steps_per_epoch) fps = cfg.per_batch_size * (i - old_progress) * cfg.world_size / time_used - cfg.logger.info('epoch[{}], iter[{}], 
{}, {:.2f} imgs/sec'.format(epoch, i, loss_meter, fps)) + cfg.logger.info('epoch[{}], iter[{}], {}, {:.2f} imgs/sec'.format(epoch, i + 1, loss_meter, fps)) t_end = time.time() loss_meter.reset() old_progress = i - if i % cfg.steps_per_epoch == 0 and cfg.local_rank == 0: + if (i + 1) % cfg.steps_per_epoch == 0 and cfg.local_rank == 0: epoch_time_used = time.time() - t_epoch - epoch = int(i / cfg.steps_per_epoch) + epoch = int((i + 1) / cfg.steps_per_epoch) fps = cfg.per_batch_size * cfg.world_size * cfg.steps_per_epoch / epoch_time_used - pre_step_time = epoch_time_used / cfg.steps_per_epoch + per_step_time = epoch_time_used / cfg.steps_per_epoch cfg.logger.info('=================================================') - cfg.logger.info('epoch[{}], iter[{}], {:.2f} imgs/sec'.format(epoch, i, fps)) - cfg.logger.info('epoch[{}], epoch time: {:5.3f} ms, pre step time: {:5.3f} ms'.format( - epoch, epoch_time_used * 1000, pre_step_time * 1000)) + cfg.logger.info('epoch[{}], iter[{}], {:.2f} imgs/sec'.format(epoch, i + 1, fps)) + cfg.logger.info('epoch[{}], epoch time: {:5.3f} ms, per step time: {:5.3f} ms'.format( + epoch, epoch_time_used * 1000, per_step_time * 1000)) cfg.logger.info('=================================================') t_epoch = time.time() diff --git a/research/cv/FaceRecognition/README.md b/research/cv/FaceRecognition/README.md index afb88b9322657129a46e599a868fc8cdcc029cd9..1e4b20b184e4265ff1f415a0e60276dbc2905e7a 100644 --- a/research/cv/FaceRecognition/README.md +++ b/research/cv/FaceRecognition/README.md @@ -122,6 +122,19 @@ The entire code structure is as following: ### Train +- After installing MindSpore via the official website, you can start training and evaluation as follows. If running on GPU, please add `--device_target=GPU` in the python command or use the "_gpu" shell script ("xxx_gpu.sh"). +- Prepare the hccl_8p.json file before running the network. + - To generate hccl_8p.json, run the script utils/hccl_tools/hccl_tools.py. 
+ The following parameter "[0,8)" indicates that the hccl_8p.json file is generated for cards 0 to 7. + - The name of the json file generated by this command is hccl_8p_01234567_{host_ip}.json. For convenience, hccl_8p.json is used below to refer to this json file. + + ``` + python hccl_tools.py --device_num "[0,8)" + ``` + +- Prepare the dataset and set "data_dir='/path_to_dataset/'" in the xxx_config.yaml file before running the network. +- If training in beta mode, prepare the trained base model (.ckpt file) and set "pretrained='/path_to_checkpoint_path/model.ckpt'" in the beta_config.yaml file before running the network. + - Stand alone mode(Ascend) - base model @@ -400,16 +413,17 @@ You will get the result as following in "./scripts/acc.log" if 'dis_dataset' ran | -------------------------- | ----------------------------------------------------------- | ------------------ | | Model Version | V1 | V1 | | Resource | Ascend 910; CPU 2.60GHz, 192cores; Memory 755G; OS Euler2.8 | NV PCIE V100-32G | -| uploaded Date | 09/30/2020 (month/day/year) | 14/10/2021 (month/day/year) | -| MindSpore Version | 1.0.0 | 1.5.0 | +| uploaded Date | 10/14/2021 (month/day/year) | 10/14/2021 (month/day/year) | +| MindSpore Version | 1.5.0 | 1.5.0 | | Dataset | 4.7 million images | 4.7 million images | -| Training Parameters | epoch=100, batch_size=192, momentum=0.9 | epoch=18(base:9, beta:9), batch_size=192, momentum=0.9 | +| Training Parameters | epoch=18(base:9, beta:9), batch_size=192, momentum=0.9 | epoch=18(base:9, beta:9), batch_size=192, momentum=0.9 | | Optimizer | Momentum | Momentum | | Loss Function | Cross Entropy | Cross Entropy | | outputs | probability | probability | -| Speed | 1pc: 350-600 fps; 8pcs: 2500-4500 fps | base: 1pc: 290-310 fps, 8pcs: 2050-2150 fps; beta: 1pc: 400-430 fps, 8pcs: 2810-2860 fps | +| Speed | base: 1pc: 350-600 fps; 8pcs: 2500-4500 fps; | base: 1pc: 290-310 fps, 8pcs: 2050-2150 fps; | +| | beta: 1pc: 350-600 fps; 8pcs: 2500-4500 fps; | beta: 1pc: 400-430 fps, 8pcs: 
2810-2860 fps; | | Total time | 1pc: NA hours; 8pcs: 10 hours | 1pc: NA hours; 8pcs: 5.6(base) + 4.2(beta) hours | -| Checkpoint for Fine tuning | 584M (.ckpt file) | 768M (.ckpt file, base), 582M (.ckpt file, beta) | +| Checkpoint for Fine tuning | 768M (.ckpt file, base), 582M (.ckpt file, beta) | 768M (.ckpt file, base), 582M (.ckpt file, beta) | ### Evaluation Performance @@ -417,13 +431,13 @@ You will get the result as following in "./scripts/acc.log" if 'dis_dataset' ran | ------------------- | --------------------------- | --------------------------- | | Model Version | V1 | V1 | | Resource | Ascend 910; OS Euler2.8 | NV SMX2 V100-32G | -| Uploaded Date | 09/30/2020 (month/day/year) | 29/07/2021 (month/day/year) | -| MindSpore Version | 1.0.0 | 1.3.0 | +| Uploaded Date | 10/14/2021 (month/day/year) | 07/29/2021 (month/day/year) | +| MindSpore Version | 1.5.0 | 1.3.0 | | Dataset | 1.1 million images | 1.1 million images | | batch_size | 512 | 512 | | outputs | ACC | ACC | | ACC | 0.9 | 0.9 | -| Model for inference | 584M (.ckpt file) | 582M (.ckpt file) | +| Model for inference | 582M (.ckpt file) | 582M (.ckpt file) | # [ModelZoo Homepage](#contents) diff --git a/research/cv/FaceRecognitionForTracking/train.py b/research/cv/FaceRecognitionForTracking/train.py index 14c9bb7172f13bd40edfe6ed69129f3d5d5c8583..3ebad74d846663d4403bebd1d84bb9bb02a33aa0 100644 --- a/research/cv/FaceRecognitionForTracking/train.py +++ b/research/cv/FaceRecognitionForTracking/train.py @@ -237,24 +237,25 @@ def run_train(): time_for_graph_compile = time.time() - create_network_start cfg.logger.important_info('{}, graph compile time={:.2f}s'.format(cfg.task, time_for_graph_compile)) - if i % cfg.log_interval == 0 and cfg.local_rank == 0: + if (i + 1) % cfg.log_interval == 0 and cfg.local_rank == 0: time_used = time.time() - t_end - epoch = int(i / cfg.steps_per_epoch) + epoch = int((i + 1) / cfg.steps_per_epoch) fps = cfg.per_batch_size * (i - old_progress) * cfg.world_size / time_used 
- cfg.logger.info('epoch[{}], iter[{}], {}, {:.2f} imgs/sec, lr={}'.format(epoch, i, loss_meter, fps, lr[i])) + cfg.logger.info('epoch[{}], iter[{}], {}, {:.2f} imgs/sec, lr={}'.format(epoch, i + 1, loss_meter, + fps, lr[i])) t_end = time.time() loss_meter.reset() old_progress = i - if i % cfg.steps_per_epoch == 0 and cfg.local_rank == 0: + if (i + 1) % cfg.steps_per_epoch == 0 and cfg.local_rank == 0: epoch_time_used = time.time() - t_epoch - epoch = int(i / cfg.steps_per_epoch) + epoch = int((i + 1) / cfg.steps_per_epoch) fps = cfg.per_batch_size * cfg.world_size * cfg.steps_per_epoch / epoch_time_used - pre_step_time = epoch_time_used / cfg.steps_per_epoch + per_step_time = epoch_time_used / cfg.steps_per_epoch cfg.logger.info('=================================================') - cfg.logger.info('epoch[{}], iter[{}], fps: {:.2f}'.format(epoch, i, fps)) - cfg.logger.info('epoch[{}], epoch time: {:5.3f} ms, pre step time: {:5.3f} ms'.format( - epoch, epoch_time_used * 1000, pre_step_time * 1000)) + cfg.logger.info('epoch[{}], iter[{}], fps: {:.2f}'.format(epoch, i + 1, fps)) + cfg.logger.info('epoch[{}], epoch time: {:5.3f} ms, per step time: {:5.3f} ms'.format( + epoch, epoch_time_used * 1000, per_step_time * 1000)) cfg.logger.info('=================================================') t_epoch = time.time()