Commit 60400f0f authored by zhanghuiyao

Modify time logging for some networks

parent 1d8fe960
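Taken together, the hunks below make the same two corrections across several training scripts: log messages switch from the 0-based loop index `i` to the 1-based `i + 1` (with the interval and epoch-boundary tests becoming `(i + 1) % ...` to match), and the misspelled "pre step time" label becomes "per step time", with the variable `pre_step_time` renamed to `per_step_time`. Below is a minimal self-contained sketch of the resulting logging pattern; the names (`steps_per_epoch`, `log_interval`, `per_batch_size`, `world_size`) mirror the config fields used in the hunks, while the numbers and the sleep-based "training step" are stand-ins:

```python
import time

# Stand-ins for the config fields referenced by the training scripts.
steps_per_epoch = 100
log_interval = 10
per_batch_size = 192
world_size = 8

t_end = time.time()
t_epoch = time.time()
old_progress = -1

for i in range(steps_per_epoch * 2):      # 0-based loop index, as in the scripts
    time.sleep(0.001)                     # stand-in for one training step
    if (i + 1) % log_interval == 0:       # 1-based interval check
        time_used = time.time() - t_end
        epoch = int((i + 1) / steps_per_epoch)
        fps = per_batch_size * (i - old_progress) * world_size / time_used
        print('epoch[{}], iter[{}], {:.2f} imgs/sec'.format(epoch, i + 1, fps))
        t_end = time.time()
        old_progress = i
    if (i + 1) % steps_per_epoch == 0:    # fires on the last step of each epoch
        epoch_time_used = time.time() - t_epoch
        per_step_time = epoch_time_used / steps_per_epoch
        print('epoch[{}], epoch time: {:5.3f} ms, per step time: {:5.3f} ms'.format(
            int((i + 1) / steps_per_epoch), epoch_time_used * 1000, per_step_time * 1000))
        t_epoch = time.time()
```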
@@ -47,8 +47,8 @@ class ProgressMonitor(Callback):
     """monitor loss and cost time."""
     def __init__(self, args):
         super(ProgressMonitor, self).__init__()
-        self.me_epoch_start_time = 0
-        self.me_epoch_start_step_num = 0
+        self.me_epoch_start_time = time.time()
+        self.me_epoch_start_step_num = -1
         self.args = args

     def epoch_end(self, run_context):
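The two initializers above feed the per-epoch timing that `epoch_end` reports. The method body is outside this diff, but a plausible reading of the change is: seeding the start time with `time.time()` makes the first epoch's measured time real rather than counted from timestamp 0, and seeding the step counter with -1 makes the step delta come out right when the global step count is 0-based. A minimal sketch under those assumptions (the class name, the plain `cur_step_num` argument, and `print` are all stand-ins for the real callback machinery):

```python
import time

class ProgressMonitorSketch:
    """Hedged sketch; not the repo's actual epoch_end, which this diff omits."""

    def __init__(self):
        # Real wall-clock start instead of 0, so the first epoch's time is accurate.
        self.me_epoch_start_time = time.time()
        # -1 rather than 0: if cur_step_num is a 0-based step index, then
        # cur_step_num - (-1) is exactly the number of steps completed.
        self.me_epoch_start_step_num = -1

    def epoch_end(self, cur_step_num):
        time_used = time.time() - self.me_epoch_start_time
        steps = cur_step_num - self.me_epoch_start_step_num
        print('epoch time: {:5.3f} ms, per step time: {:5.3f} ms'.format(
            1000 * time_used, 1000 * time_used / steps))
        self.me_epoch_start_time = time.time()
        self.me_epoch_start_step_num = cur_step_num
```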
@@ -273,7 +273,7 @@ def run_train():
     for i, data in enumerate(data_loader):
         images = data["image"]
         input_shape = images.shape[2:4]
-        config.logger.info('iter[%d], shape%d', i, input_shape[0])
+        config.logger.info('iter[%d], shape%d', i + 1, input_shape[0])
         images = Tensor.from_numpy(images)
         batch_y_true_0 = Tensor.from_numpy(data['bbox1'])
@@ -294,12 +294,12 @@ def run_train():
         cb_params.batch_num = i + 2
         ckpt_cb.step_end(run_context)
-        if i % config.log_interval == 0:
+        if (i + 1) % config.log_interval == 0:
             time_used = time.time() - t_end
-            epoch = int(i / config.steps_per_epoch)
+            epoch = int((i + 1) / config.steps_per_epoch)
             fps = config.per_batch_size * (i - old_progress) * config.group_size / time_used
             if config.rank == 0:
-                config.logger.info('epoch[{}], iter[{}], {}, pre step time: {:.2f} ms, fps: {:.2f}, lr:{}'.format(
+                config.logger.info('epoch[{}], iter[{}], {}, per step time: {:.2f} ms, fps: {:.2f}, lr:{}'.format(
                     epoch, i, loss_meter, 1000 * time_used / (i - old_progress), fps, lr[i]))
             t_end = time.time()
             loss_meter.reset()
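For reference, the throughput line above multiplies the per-device batch size by the number of steps since the last log (`i - old_progress`) and the number of devices (`config.group_size`), then divides by elapsed wall time; the per-step time spreads that wall time over the same steps. A worked instance with illustrative numbers (not taken from any real run):

```python
# Illustrative numbers only: a worked instance of the fps and per-step-time
# formulas used in the logging code above.
per_batch_size = 32   # images per device per step
group_size = 8        # number of devices
steps_since_log = 10  # i - old_progress
time_used = 4.0       # seconds since the last log

fps = per_batch_size * steps_since_log * group_size / time_used
per_step_ms = 1000 * time_used / steps_since_log
print(fps)          # 640.0 images/sec across all devices
print(per_step_ms)  # 400.0 ms per step
```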
@@ -209,7 +209,7 @@ def run_train():
         cb_params.batch_num = i + 2
         ckpt_cb.step_end(run_context)
-        if i % config.steps_per_epoch == 0 and config.local_rank == 0:
+        if (i + 1) % config.steps_per_epoch == 0 and config.local_rank == 0:
             cb_params.cur_epoch_num += 1
         if i == 0:
@@ -217,25 +217,25 @@ def run_train():
             config.logger.important_info(
                 '{}, graph compile time={:.2f}s'.format(config.backbone, time_for_graph_compile))
-        if i % config.log_interval == 0 and config.local_rank == 0:
+        if (i + 1) % config.log_interval == 0 and config.local_rank == 0:
             time_used = time.time() - t_end
-            epoch = int(i / config.steps_per_epoch)
+            epoch = int((i + 1) / config.steps_per_epoch)
             fps = config.per_batch_size * (i - old_progress) * config.world_size / time_used
-            config.logger.info('epoch[{}], iter[{}], {}, {:.2f} imgs/sec'.format(epoch, i, loss_meter, fps))
+            config.logger.info('epoch[{}], iter[{}], {}, {:.2f} imgs/sec'.format(epoch, i + 1, loss_meter, fps))
             t_end = time.time()
             loss_meter.reset()
             old_progress = i
-        if i % config.steps_per_epoch == 0 and config.local_rank == 0:
+        if (i + 1) % config.steps_per_epoch == 0 and config.local_rank == 0:
             epoch_time_used = time.time() - t_epoch
-            epoch = int(i / config.steps_per_epoch)
+            epoch = int((i + 1) / config.steps_per_epoch)
             fps = config.per_batch_size * config.world_size * config.steps_per_epoch / epoch_time_used
-            pre_step_time = epoch_time_used / config.steps_per_epoch
+            per_step_time = epoch_time_used / config.steps_per_epoch
             config.logger.info('=================================================')
-            config.logger.info('epoch[{}], iter[{}], {:.2f} imgs/sec'.format(epoch, i, fps))
-            config.logger.info('epoch[{}], epoch time: {:5.3f} ms, pre step time: {:5.3f} ms'.format(
-                epoch, epoch_time_used * 1000, pre_step_time * 1000))
+            config.logger.info('epoch[{}], iter[{}], {:.2f} imgs/sec'.format(epoch, i + 1, fps))
+            config.logger.info('epoch[{}], epoch time: {:5.3f} ms, per step time: {:5.3f} ms'.format(
+                epoch, epoch_time_used * 1000, per_step_time * 1000))
             config.logger.info('=================================================')
             t_epoch = time.time()
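The switch from `i % config.steps_per_epoch == 0` to `(i + 1) % config.steps_per_epoch == 0` moves the epoch summary from the first step of an epoch to its last step, so `time.time() - t_epoch` now covers a completed epoch, and `int((i + 1) / config.steps_per_epoch)` gives that finished epoch a 1-based number. A tiny check with an illustrative `steps_per_epoch`:

```python
steps_per_epoch = 100  # illustrative value

for i in (0, 99, 100, 199):  # 0-based step indices
    old_fires = i % steps_per_epoch == 0        # True at i=0, 100: epoch start
    new_fires = (i + 1) % steps_per_epoch == 0  # True at i=99, 199: epoch end
    print(i, old_fires, new_fires, int((i + 1) / steps_per_epoch))
# i=99  -> the new check fires and epoch = 1: the first epoch just completed
# i=199 -> the new check fires and epoch = 2
```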
@@ -169,12 +169,12 @@ def run_train():
             else:
                 scale_manager.update_loss_scale(False)
             config.logger.info('rank[{:d}], iter[{}], loss[{}], overflow:{}, loss_scale:{}, lr:{}, batch_images:{}, '
-                               'batch_labels:{}'.format(config.local_rank, i, loss0, overflow, scaling_sens,
+                               'batch_labels:{}'.format(config.local_rank, i + 1, loss0, overflow, scaling_sens,
                                                         config.lr[i], batch_images.shape, batch_labels.shape))
         else:
             loss0 = train_net(*input_list)
             config.logger.info('rank[{:d}], iter[{}], loss[{}], lr:{}, batch_images:{}, '
-                               'batch_labels:{}'.format(config.local_rank, i, loss0,
+                               'batch_labels:{}'.format(config.local_rank, i + 1, loss0,
                                                         config.lr[i], batch_images.shape, batch_labels.shape))
         # save ckpt
         cb_params.cur_step_num = i + 1  # current step number
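The `else: scale_manager.update_loss_scale(False)` branch visible above is one half of the standard dynamic loss-scaling policy: on overflow the scale is reduced, and after enough overflow-free steps it is grown again. The scale manager itself is outside this diff, so the following plain-Python sketch only illustrates that policy; the class name and the factor and window constants are made up:

```python
class DynamicLossScaleSketch:
    """Illustrative dynamic loss-scale policy; not the script's actual manager."""

    def __init__(self, init_scale=2.0 ** 24, scale_factor=2.0, scale_window=2000):
        self.scale = init_scale
        self.scale_factor = scale_factor
        self.scale_window = scale_window
        self.good_steps = 0

    def update_loss_scale(self, overflow):
        if overflow:
            # Gradients overflowed: shrink the scale and restart the window.
            self.scale = max(self.scale / self.scale_factor, 1.0)
            self.good_steps = 0
        else:
            # Clean step: after scale_window of them, try a larger scale.
            self.good_steps += 1
            if self.good_steps >= self.scale_window:
                self.scale *= self.scale_factor
                self.good_steps = 0
```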
@@ -187,24 +187,24 @@ def run_train():
             time_for_graph_compile = time.time() - create_network_start
             config.logger.important_info('Yolov3, graph compile time={:.2f}s'.format(time_for_graph_compile))
-        if i % config.steps_per_epoch == 0:
+        if (i + 1) % config.steps_per_epoch == 0:
             cb_params.cur_epoch_num += 1
-        if i % config.log_interval == 0 and config.local_rank == 0:
+        if (i + 1) % config.log_interval == 0 and config.local_rank == 0:
             time_used = time.time() - t_end
-            epoch = int(i / config.steps_per_epoch)
+            epoch = int((i + 1) / config.steps_per_epoch)
             fps = config.batch_size * (i - old_progress) * config.world_size / time_used
-            config.logger.info('epoch[{}], iter[{}], loss:[{}], {:.2f} imgs/sec'.format(epoch, i, loss0, fps))
+            config.logger.info('epoch[{}], iter[{}], loss:[{}], {:.2f} imgs/sec'.format(epoch, i + 1, loss0, fps))
             t_end = time.time()
             old_progress = i
-        if i % config.steps_per_epoch == 0 and config.local_rank == 0:
+        if (i + 1) % config.steps_per_epoch == 0 and config.local_rank == 0:
             epoch_time_used = time.time() - t_epoch
-            epoch = int(i / config.steps_per_epoch)
+            epoch = int((i + 1) / config.steps_per_epoch)
             fps = config.batch_size * config.world_size * config.steps_per_epoch / epoch_time_used
             config.logger.info('=================================================')
-            config.logger.info('epoch[{}], iter[{}], {:.2f} imgs/sec'.format(epoch, i, fps))
-            config.logger.info('epoch[{}], epoch time: {:5.3f} ms, pre step time: {:5.3f} ms'.format(
+            config.logger.info('epoch[{}], iter[{}], {:.2f} imgs/sec'.format(epoch, i + 1, fps))
+            config.logger.info('epoch[{}], epoch time: {:5.3f} ms, per step time: {:5.3f} ms'.format(
                 epoch, epoch_time_used * 1000, epoch_time_used * 1000 / config.steps_per_epoch))
             config.logger.info('=================================================')
             t_epoch = time.time()
@@ -206,24 +206,24 @@ def run_train():
             time_for_graph_compile = time.time() - create_network_start
             cfg.logger.important_info('{}, graph compile time={:.2f}s'.format(cfg.task, time_for_graph_compile))
-        if i % cfg.log_interval == 0 and cfg.local_rank == 0:
+        if (i + 1) % cfg.log_interval == 0 and cfg.local_rank == 0:
             time_used = time.time() - t_end
-            epoch = int(i / cfg.steps_per_epoch)
+            epoch = int((i + 1) / cfg.steps_per_epoch)
             fps = cfg.per_batch_size * (i - old_progress) * cfg.world_size / time_used
-            cfg.logger.info('epoch[{}], iter[{}], {}, {:.2f} imgs/sec'.format(epoch, i, loss_meter, fps))
+            cfg.logger.info('epoch[{}], iter[{}], {}, {:.2f} imgs/sec'.format(epoch, i + 1, loss_meter, fps))
             t_end = time.time()
             loss_meter.reset()
             old_progress = i
-        if i % cfg.steps_per_epoch == 0 and cfg.local_rank == 0:
+        if (i + 1) % cfg.steps_per_epoch == 0 and cfg.local_rank == 0:
             epoch_time_used = time.time() - t_epoch
-            epoch = int(i / cfg.steps_per_epoch)
+            epoch = int((i + 1) / cfg.steps_per_epoch)
             fps = cfg.per_batch_size * cfg.world_size * cfg.steps_per_epoch / epoch_time_used
-            pre_step_time = epoch_time_used / cfg.steps_per_epoch
+            per_step_time = epoch_time_used / cfg.steps_per_epoch
             cfg.logger.info('=================================================')
-            cfg.logger.info('epoch[{}], iter[{}], {:.2f} imgs/sec'.format(epoch, i, fps))
-            cfg.logger.info('epoch[{}], epoch time: {:5.3f} ms, pre step time: {:5.3f} ms'.format(
-                epoch, epoch_time_used * 1000, pre_step_time * 1000))
+            cfg.logger.info('epoch[{}], iter[{}], {:.2f} imgs/sec'.format(epoch, i + 1, fps))
+            cfg.logger.info('epoch[{}], epoch time: {:5.3f} ms, per step time: {:5.3f} ms'.format(
+                epoch, epoch_time_used * 1000, per_step_time * 1000))
             cfg.logger.info('=================================================')
             t_epoch = time.time()
@@ -122,6 +122,19 @@ The entire code structure is as following:
 ### Train
 - After installing MindSpore via the official website, you can start training and evaluation as follows. If running on GPU, please add `--device_target=GPU` to the python command or use the "_gpu" shell script ("xxx_gpu.sh").
+- Prepare the hccl_8p.json file before running the network.
+    - To generate hccl_8p.json, run the script utils/hccl_tools/hccl_tools.py.
+      The parameter "[0,8)" below indicates that an hccl_8p.json file for cards 0 to 7 is generated.
+    - The json file generated by this command is named hccl_8p_01234567_{host_ip}.json; for brevity, hccl_8p.json is used to refer to it below.
+
+```
+python hccl_tools.py --device_num "[0,8)"
+```
+
+- Prepare the dataset and set "data_dir='/path_to_dataset/'" in the xxx_config.yaml file before running the network.
+- Prepare the trained base model (.ckpt file) if training in beta mode, and set "pretrained='/path_to_checkpoint_path/model.ckpt'" in the beta_config.yaml file before running the network.
 - Stand alone mode(Ascend)
     - base model
@@ -400,16 +413,17 @@ You will get the result as following in "./scripts/acc.log" if 'dis_dataset' ran
 | -------------------------- | ----------------------------------------------------------- | ------------------ |
 | Model Version              | V1                                                          | V1 |
 | Resource                   | Ascend 910; CPU 2.60GHz, 192cores; Memory 755G; OS Euler2.8 | NV PCIE V100-32G |
-| uploaded Date              | 09/30/2020 (month/day/year)                                 | 14/10/2021 (month/day/year) |
-| MindSpore Version          | 1.0.0                                                       | 1.5.0 |
+| uploaded Date              | 14/10/2021 (month/day/year)                                 | 14/10/2021 (month/day/year) |
+| MindSpore Version          | 1.5.0                                                       | 1.5.0 |
 | Dataset                    | 4.7 million images                                          | 4.7 million images |
-| Training Parameters        | epoch=100, batch_size=192, momentum=0.9                     | epoch=18(base:9, beta:9), batch_size=192, momentum=0.9 |
+| Training Parameters        | epoch=18(base:9, beta:9), batch_size=192, momentum=0.9      | epoch=18(base:9, beta:9), batch_size=192, momentum=0.9 |
 | Optimizer                  | Momentum                                                    | Momentum |
 | Loss Function              | Cross Entropy                                               | Cross Entropy |
 | outputs                    | probability                                                 | probability |
-| Speed                      | 1pc: 350-600 fps; 8pcs: 2500-4500 fps                       | base: 1pc: 290-310 fps, 8pcs: 2050-2150 fps; beta: 1pc: 400-430 fps, 8pcs: 2810-2860 fps |
+| Speed                      | base: 1pc: 350-600 fps; 8pcs: 2500-4500 fps;                | base: 1pc: 290-310 fps, 8pcs: 2050-2150 fps; |
+|                            | beta: 1pc: 350-600 fps; 8pcs: 2500-4500 fps;                | beta: 1pc: 400-430 fps, 8pcs: 2810-2860 fps; |
 | Total time                 | 1pc: NA hours; 8pcs: 10 hours                               | 1pc: NA hours; 8pcs: 5.6(base) + 4.2(beta) hours |
-| Checkpoint for Fine tuning | 584M (.ckpt file)                                           | 768M (.ckpt file, base), 582M (.ckpt file, beta) |
+| Checkpoint for Fine tuning | 768M (.ckpt file, base), 582M (.ckpt file, beta)            | 768M (.ckpt file, base), 582M (.ckpt file, beta) |
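The corrected rows are internally consistent with the GPU column's totals: 9 epochs over 4.7 million images at the quoted 8-card GPU speeds work out to roughly the listed hours. A quick check (the midpoint fps values are illustrative):

```python
# Sanity check: hours implied by dataset size, epochs per stage, and 8pcs fps.
images = 4.7e6
epochs_per_stage = 9

for stage, fps in (('base', 2100.0), ('beta', 2835.0)):  # midpoints of GPU ranges
    hours = images * epochs_per_stage / fps / 3600
    print('{}: {:.1f} hours'.format(stage, hours))
# base: ~5.6 hours, beta: ~4.1 hours -- in line with "8pcs: 5.6(base) + 4.2(beta) hours"
```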
### Evaluation Performance
@@ -417,13 +431,13 @@ You will get the result as following in "./scripts/acc.log" if 'dis_dataset' ran
 | ------------------- | --------------------------- | --------------------------- |
 | Model Version       | V1                          | V1 |
 | Resource            | Ascend 910; OS Euler2.8     | NV SMX2 V100-32G |
-| Uploaded Date       | 09/30/2020 (month/day/year) | 29/07/2021 (month/day/year) |
-| MindSpore Version   | 1.0.0                       | 1.3.0 |
+| Uploaded Date       | 14/10/2021 (month/day/year) | 29/07/2021 (month/day/year) |
+| MindSpore Version   | 1.5.0                       | 1.3.0 |
 | Dataset             | 1.1 million images          | 1.1 million images |
 | batch_size          | 512                         | 512 |
 | outputs             | ACC                         | ACC |
 | ACC                 | 0.9                         | 0.9 |
-| Model for inference | 584M (.ckpt file)           | 582M (.ckpt file) |
+| Model for inference | 582M (.ckpt file)           | 582M (.ckpt file) |

 # [ModelZoo Homepage](#contents)
@@ -237,24 +237,25 @@ def run_train():
             time_for_graph_compile = time.time() - create_network_start
             cfg.logger.important_info('{}, graph compile time={:.2f}s'.format(cfg.task, time_for_graph_compile))
-        if i % cfg.log_interval == 0 and cfg.local_rank == 0:
+        if (i + 1) % cfg.log_interval == 0 and cfg.local_rank == 0:
             time_used = time.time() - t_end
-            epoch = int(i / cfg.steps_per_epoch)
+            epoch = int((i + 1) / cfg.steps_per_epoch)
             fps = cfg.per_batch_size * (i - old_progress) * cfg.world_size / time_used
-            cfg.logger.info('epoch[{}], iter[{}], {}, {:.2f} imgs/sec, lr={}'.format(epoch, i, loss_meter, fps, lr[i]))
+            cfg.logger.info('epoch[{}], iter[{}], {}, {:.2f} imgs/sec, lr={}'.format(epoch, i + 1, loss_meter,
+                                                                                     fps, lr[i]))
             t_end = time.time()
             loss_meter.reset()
             old_progress = i
-        if i % cfg.steps_per_epoch == 0 and cfg.local_rank == 0:
+        if (i + 1) % cfg.steps_per_epoch == 0 and cfg.local_rank == 0:
             epoch_time_used = time.time() - t_epoch
-            epoch = int(i / cfg.steps_per_epoch)
+            epoch = int((i + 1) / cfg.steps_per_epoch)
             fps = cfg.per_batch_size * cfg.world_size * cfg.steps_per_epoch / epoch_time_used
-            pre_step_time = epoch_time_used / cfg.steps_per_epoch
+            per_step_time = epoch_time_used / cfg.steps_per_epoch
             cfg.logger.info('=================================================')
-            cfg.logger.info('epoch[{}], iter[{}], fps: {:.2f}'.format(epoch, i, fps))
-            cfg.logger.info('epoch[{}], epoch time: {:5.3f} ms, pre step time: {:5.3f} ms'.format(
-                epoch, epoch_time_used * 1000, pre_step_time * 1000))
+            cfg.logger.info('epoch[{}], iter[{}], fps: {:.2f}'.format(epoch, i + 1, fps))
+            cfg.logger.info('epoch[{}], epoch time: {:5.3f} ms, per step time: {:5.3f} ms'.format(
+                epoch, epoch_time_used * 1000, per_step_time * 1000))
             cfg.logger.info('=================================================')
             t_epoch = time.time()