diff --git a/official/cv/resnet/README.md b/official/cv/resnet/README.md
index 35fca631087266987c878e5896e5cafb935d8ec5..574217a48284a75f2343edb900a9a507a9fe2050 100644
--- a/official/cv/resnet/README.md
+++ b/official/cv/resnet/README.md
@@ -269,6 +269,10 @@ Parameters for both training and evaluation can be set in config file.
"lr_init": 0.01, # initial learning rate
"lr_end": 0.00001, # final learning rate
"lr_max": 0.1, # maximum learning rate
+"save_graphs": False, # save graph results
+"save_graphs_path": "./graphs", # save graph results path
+"has_trained_epoch":0, # epoch size that model has been trained before loading pretrained checkpoint, actual training epoch size is equal to epoch_size minus has_trained_epoch
+"has_trained_step":0, # step size that model has been trained before loading pretrained checkpoint, actual training epoch size is equal to step_size minus has_trained_step
```
- Config for ResNet18 and ResNet50, ImageNet2012 dataset
@@ -291,6 +295,10 @@ Parameters for both training and evaluation can be set in config file.
"lr_init": 0, # initial learning rate
"lr_max": 0.8, # maximum learning rate
"lr_end": 0.0, # minimum learning rate
+"save_graphs": False, # save graph results
+"save_graphs_path": "./graphs", # save graph results path
+"has_trained_epoch":0, # epoch size that model has been trained before loading pretrained checkpoint, actual training epoch size is equal to epoch_size minus has_trained_epoch
+"has_trained_step":0, # step size that model has been trained before loading pretrained checkpoint, actual training epoch size is equal to step_size minus has_trained_step
```
- Config for ResNet34, ImageNet2012 dataset
@@ -333,6 +341,10 @@ Parameters for both training and evaluation can be set in config file.
"use_label_smooth": True, # label_smooth
"label_smooth_factor": 0.1, # label_smooth_factor
"lr": 0.1 # base learning rate
+"save_graphs": False, # save graph results
+"save_graphs_path": "./graphs", # save graph results path
+"has_trained_epoch":0, # epoch size that model has been trained before loading pretrained checkpoint, actual training epoch size is equal to epoch_size minus has_trained_epoch
+"has_trained_step":0, # step size that model has been trained before loading pretrained checkpoint, actual training epoch size is equal to step_size minus has_trained_step
```
- Config for ResNet152, ImageNet2012 dataset
@@ -354,6 +366,10 @@ Parameters for both training and evaluation can be set in config file.
"label_smooth_factor": 0.1, # label_smooth_factor
"lr": 0.1, # base learning rate
"lr_end": 0.0001, # end learning rate
+"save_graphs": False, # save graph results
+"save_graphs_path": "./graphs", # save graph results path
+"has_trained_epoch":0, # epoch size that model has been trained before loading pretrained checkpoint, actual training epoch size is equal to epoch_size minus has_trained_epoch
+"has_trained_step":0, # step size that model has been trained before loading pretrained checkpoint, actual training epoch size is equal to step_size minus has_trained_step
```
- Config for SE-ResNet50, ImageNet2012 dataset
@@ -377,6 +393,10 @@ Parameters for both training and evaluation can be set in config file.
"lr_init": 0.0, # initial learning rate
"lr_max": 0.3, # maximum learning rate
"lr_end": 0.0001, # end learning rate
+"save_graphs": False, # save graph results
+"save_graphs_path": "./graphs", # save graph results path
+"has_trained_epoch":0, # epoch size that model has been trained before loading pretrained checkpoint, actual training epoch size is equal to epoch_size minus has_trained_epoch
+"has_trained_step":0, # step size that model has been trained before loading pretrained checkpoint, actual training epoch size is equal to step_size minus has_trained_step
```
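+
+Taken together, these keys let a resumed run finish the original schedule instead of restarting it. Below is a minimal sketch of the arithmetic train.py applies (the values are illustrative, not defaults):
+
+```python
+# A minimal sketch of the resume arithmetic used by train.py.
+epoch_size = 90         # total epochs in the training schedule
+has_trained_epoch = 30  # epochs already covered by the loaded checkpoint
+remaining_epochs = epoch_size - has_trained_epoch
+print(remaining_epochs)  # 60: only the unfinished part of the schedule is trained
+```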
## [Training Process](#contents)
@@ -465,6 +485,20 @@ By default, a standalone cache server would be started to cache all eval images
Users can choose to shutdown the cache server after training or leave it alone for future usage.
+## [Resume Process](#contents)
+
+### Usage
+
+#### Running on Ascend
+
+```text
+# distributed training
+Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH]
+
+# standalone training
+Usage: bash run_standalone_train.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH]
+```
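+
+For example, resuming an interrupted 8-device ResNet50 run might look like the following (the paths are placeholders; PRETRAINED_CKPT_PATH may be a `.ckpt` file or a directory, and passing a directory makes train.py pick up the newest checkpoint saved by the previous run):
+
+```text
+bash run_distribute_train.sh ~/hccl_8p.json /data/imagenet/train ../config/resnet50_imagenet2012_config.yaml ./output/checkpoint/ckpt_0/
+```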
+
### Result
- Training ResNet18 with CIFAR-10 dataset
diff --git a/official/cv/resnet/README_CN.md b/official/cv/resnet/README_CN.md
index 2f30d6eda6522177462e28fa5c1950196bf68b0b..b9bcfd78add576c4dc6deecc61334f812b068523 100644
--- a/official/cv/resnet/README_CN.md
+++ b/official/cv/resnet/README_CN.md
@@ -249,6 +249,10 @@ bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]
"lr_init":0.01, # 初始学习率
"lr_end":0.0001, # 最终学习率
"lr_max":0.1, # 最大学习率
+"save_graphs":False, # 是否保存图编译结果
+"save_graphs_path":"./graphs", # 图编译结果保存路径
+"has_trained_epoch":0, # 加载已经训练好的模型的epoch大小;实际训练周期大小等于epoch_size减去has_trained_epoch
+"has_trained_step":0, # 加载已经训练好的模型的step大小;实际训练周期大小等于step_size减去has_trained_step
```
- 配置ResNet18、ResNet50和ImageNet2012数据集。
@@ -271,6 +275,10 @@ bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]
"lr_init":0, # 初始学习率
"lr_max":0.8, # 最大学习率
"lr_end":0.0, # 最小学习率
+"save_graphs":False, # 是否保存图编译结果
+"save_graphs_path":"./graphs", # 图编译结果保存路径
+"has_trained_epoch":0, # 加载已经训练好的模型的epoch大小;实际训练周期大小等于epoch_size减去has_trained_epoch
+"has_trained_step":0, # 加载已经训练好的模型的step大小;实际训练周期大小等于step_size减去has_trained_step
```
- 配置ResNet34和ImageNet2012数据集。
@@ -293,6 +301,10 @@ bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]
"lr_init":0, # 初始学习率
"lr_max":1.0, # 最大学习率
"lr_end":0.0, # 最小学习率
+"save_graphs":False, # 是否保存图编译结果
+"save_graphs_path":"./graphs", # 图编译结果保存路径
+"has_trained_epoch":0, # 加载已经训练好的模型的epoch大小;实际训练周期大小等于epoch_size减去has_trained_epoch
+"has_trained_step":0, # 加载已经训练好的模型的step大小;实际训练周期大小等于step_size减去has_trained_step
```
- 配置ResNet101和ImageNet2012数据集。
@@ -313,6 +325,10 @@ bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]
"use_label_smooth":True, # 标签平滑
"label_smooth_factor":0.1, # 标签平滑因子
"lr":0.1 # 基础学习率
+"save_graphs":False, # 是否保存图编译结果
+"save_graphs_path":"./graphs", # 图编译结果保存路径
+"has_trained_epoch":0, # 加载已经训练好的模型的epoch大小;实际训练周期大小等于epoch_size减去has_trained_epoch
+"has_trained_step":0, # 加载已经训练好的模型的step大小;实际训练周期大小等于step_size减去has_trained_step
```
- 配置ResNet152和ImageNet2012数据集。
@@ -334,6 +350,10 @@ bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]
"label_smooth_factor":0.1, # 标签平滑因子
"lr":0.1, # 基础学习率
"lr_end":0.0001, # 最终学习率
+"save_graphs":False, # 是否保存图编译结果
+"save_graphs_path":"./graphs", # 图编译结果保存路径
+"has_trained_epoch":0, # 加载已经训练好的模型的epoch大小;实际训练周期大小等于epoch_size减去has_trained_epoch
+"has_trained_step":0, # 加载已经训练好的模型的step大小;实际训练周期大小等于step_size减去has_trained_step
```
- 配置SE-ResNet50和ImageNet2012数据集。
@@ -357,6 +377,10 @@ bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]
"lr_init":0.0, # 初始学习率
"lr_max":0.3, # 最大学习率
"lr_end":0.0001, # 最终学习率
+"save_graphs":False, # 是否保存图编译结果
+"save_graphs_path":"./graphs", # 图编译结果保存路径
+"has_trained_epoch":0, # 加载已经训练好的模型的epoch大小;实际训练周期大小等于epoch_size减去has_trained_epoch
+"has_trained_step":0, # 加载已经训练好的模型的step大小;实际训练周期大小等于step_size减去has_trained_step
```
## 训练过程
@@ -434,6 +458,20 @@ bash run_standalone_train_gpu.sh [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASE
在训练结束后,可以选择关闭缓存服务器或不关闭它以继续为未来的推理提供缓存服务。
+## 续训过程
+
+### 用法
+
+#### Ascend处理器环境运行
+
+```text
+# 分布式训练
+用法:bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH]
+
+# 单机训练
+用法:bash run_standalone_train.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH]
+```
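+
+例如，恢复一次中断的ResNet50八卡训练（以下路径仅为示例；PRETRAINED_CKPT_PATH既可以是.ckpt文件也可以是目录，传入目录时train.py会加载上一次运行保存的最新检查点）：
+
+```text
+bash run_distribute_train.sh ~/hccl_8p.json /data/imagenet/train ../config/resnet50_imagenet2012_config.yaml ./output/checkpoint/ckpt_0/
+```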
+
### 结果
- 使用CIFAR-10数据集训练ResNet18
diff --git a/official/cv/resnet/config/resnet101_imagenet2012_config.yaml b/official/cv/resnet/config/resnet101_imagenet2012_config.yaml
index d9144b0b42cbd163d27993117b1fc970fdf52aa4..4d9e5750e129ca264ece94a7d49ae3b8e1b33608 100644
--- a/official/cv/resnet/config/resnet101_imagenet2012_config.yaml
+++ b/official/cv/resnet/config/resnet101_imagenet2012_config.yaml
@@ -69,6 +69,12 @@ file_format: "MINDIR"
ckpt_file: ""
network_dataset: "resnet101_imagenet2012"
+# Retrain options
+save_graphs: False
+save_graphs_path: "./graphs"
+has_trained_epoch: 0
+has_trained_step: 0
+
# postprocess resnet inference
result_path: ''
label_path: ''
@@ -88,3 +94,5 @@ batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
+save_graphs: "Whether save graphs during training, default: False."
+save_graphs_path: "Path to save graphs."
diff --git a/official/cv/resnet/config/resnet152_imagenet2012_config.yaml b/official/cv/resnet/config/resnet152_imagenet2012_config.yaml
index 61f7e56605d54fbc92ba1a6073d1ce5f7c23f915..21d230c78c29f0ab04b7dfe4d761d97bd66f9246 100644
--- a/official/cv/resnet/config/resnet152_imagenet2012_config.yaml
+++ b/official/cv/resnet/config/resnet152_imagenet2012_config.yaml
@@ -71,6 +71,12 @@ file_format: "MINDIR"
ckpt_file: ""
network_dataset: "resnet152_imagenet2012"
+# Retrain options
+save_graphs: False
+save_graphs_path: "./graphs"
+has_trained_epoch: 0
+has_trained_step: 0
+
# postprocess resnet inference
result_path: ''
label_path: ''
@@ -93,3 +99,5 @@ checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
result_path: "result files path."
label_path: "image file path."
+save_graphs: "Whether save graphs during training, default: False."
+save_graphs_path: "Path to save graphs."
diff --git a/official/cv/resnet/config/resnet18_cifar10_config.yaml b/official/cv/resnet/config/resnet18_cifar10_config.yaml
index e7c5050e3fe7baf04d819979f56a2001c7beb226..05ee8fc411325416d5806cfaa1489a7b61fd2d5c 100644
--- a/official/cv/resnet/config/resnet18_cifar10_config.yaml
+++ b/official/cv/resnet/config/resnet18_cifar10_config.yaml
@@ -65,6 +65,12 @@ file_format: "MINDIR"
ckpt_file: ""
network_dataset: "resnet18_cifar10"
+# Retrain options
+save_graphs: False
+save_graphs_path: "./graphs"
+has_trained_epoch: 0
+has_trained_step: 0
+
# postprocess resnet inference
result_path: ''
label_path: ''
@@ -84,3 +90,5 @@ batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
+save_graphs: "Whether save graphs during training, default: False."
+save_graphs_path: "Path to save graphs."
diff --git a/official/cv/resnet/config/resnet18_cifar10_config_gpu.yaml b/official/cv/resnet/config/resnet18_cifar10_config_gpu.yaml
index 0c1ebc025ddccd0fd5ec374e09c8fb9a4e5e154a..d58681c60c6b3afab506460632e135c6bdc04eb9 100644
--- a/official/cv/resnet/config/resnet18_cifar10_config_gpu.yaml
+++ b/official/cv/resnet/config/resnet18_cifar10_config_gpu.yaml
@@ -65,6 +65,12 @@ file_format: "MINDIR"
ckpt_file: ""
network_dataset: "resnet18_cifar10"
+# Retrain options
+save_graphs: False
+save_graphs_path: "./graphs"
+has_trained_epoch: 0
+has_trained_step: 0
+
# postprocess resnet inference
result_path: ''
label_path: ''
@@ -84,3 +90,5 @@ batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
+save_graphs: "Whether save graphs during training, default: False."
+save_graphs_path: "Path to save graphs."
diff --git a/official/cv/resnet/config/resnet18_imagenet2012_config.yaml b/official/cv/resnet/config/resnet18_imagenet2012_config.yaml
index e0fcbce8bc0c9c3188d62075cc565b0ad1759c4f..cbd2d213b614a016d505e7f9b782cbe4c11752e6 100644
--- a/official/cv/resnet/config/resnet18_imagenet2012_config.yaml
+++ b/official/cv/resnet/config/resnet18_imagenet2012_config.yaml
@@ -67,6 +67,12 @@ file_format: "MINDIR"
ckpt_file: ""
network_dataset: "resnet18_imagenet2012"
+# Retrain options
+save_graphs: False
+save_graphs_path: "./graphs"
+has_trained_epoch: 0
+has_trained_step: 0
+
# postprocess resnet inference
result_path: ''
label_path: ''
@@ -86,3 +92,5 @@ batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
+save_graphs: "Whether save graphs during training, default: False."
+save_graphs_path: "Path to save graphs."
diff --git a/official/cv/resnet/config/resnet18_imagenet2012_config_gpu.yaml b/official/cv/resnet/config/resnet18_imagenet2012_config_gpu.yaml
index ee04dc202c191195a3bba3fa6dcf3428ee40818f..8b57105b8f6bc438d44754d219391afea401a8d0 100644
--- a/official/cv/resnet/config/resnet18_imagenet2012_config_gpu.yaml
+++ b/official/cv/resnet/config/resnet18_imagenet2012_config_gpu.yaml
@@ -67,6 +67,12 @@ file_format: "MINDIR"
ckpt_file: ""
network_dataset: "resnet18_imagenet2012"
+# Retrain options
+save_graphs: False
+save_graphs_path: "./graphs"
+has_trained_epoch: 0
+has_trained_step: 0
+
# postprocess resnet inference
result_path: ''
label_path: ''
@@ -86,3 +92,5 @@ batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
+save_graphs: "Whether save graphs during training, default: False."
+save_graphs_path: "Path to save graphs."
diff --git a/official/cv/resnet/config/resnet34_imagenet2012_config.yaml b/official/cv/resnet/config/resnet34_imagenet2012_config.yaml
index 2371608aa55b1b15c5b05c6bfd81c43b58b84a44..1f0ed53304244eb50c835323a4ce3f8984724f82 100644
--- a/official/cv/resnet/config/resnet34_imagenet2012_config.yaml
+++ b/official/cv/resnet/config/resnet34_imagenet2012_config.yaml
@@ -67,6 +67,12 @@ file_format: "MINDIR"
ckpt_file: ""
network_dataset: "resnet34_imagenet2012"
+# Retrain options
+save_graphs: False
+save_graphs_path: "./graphs"
+has_trained_epoch: 0
+has_trained_step: 0
+
# postprocess resnet inference
result_path: ''
label_path: ''
@@ -86,3 +92,5 @@ batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
+save_graphs: "Whether save graphs during training, default: False."
+save_graphs_path: "Path to save graphs."
diff --git a/official/cv/resnet/config/resnet50_cifar10_config.yaml b/official/cv/resnet/config/resnet50_cifar10_config.yaml
index 9e0f04001d6a1c73d69f6515cdae3c2925242b38..52e724cb2c88e6320ff4c82331dcc912fdbdb3df 100644
--- a/official/cv/resnet/config/resnet50_cifar10_config.yaml
+++ b/official/cv/resnet/config/resnet50_cifar10_config.yaml
@@ -68,6 +68,12 @@ file_format: "MINDIR"
ckpt_file: ""
network_dataset: "resnet50_cifar10"
+# Retrain options
+save_graphs: False
+save_graphs_path: "./graphs"
+has_trained_epoch: 0
+has_trained_step: 0
+
# postprocess resnet inference
result_path: ''
label_path: ''
@@ -87,3 +93,5 @@ batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
+save_graphs: "Whether save graphs during training, default: False."
+save_graphs_path: "Path to save graphs."
diff --git a/official/cv/resnet/config/resnet50_imagenet2012_Ascend_Thor_config.yaml b/official/cv/resnet/config/resnet50_imagenet2012_Ascend_Thor_config.yaml
index b0761155dc3802dabc1e96059ff24f4153b8fee0..4e85d6edc72e53e1254eb5cf136c4f1954dbf8a6 100644
--- a/official/cv/resnet/config/resnet50_imagenet2012_Ascend_Thor_config.yaml
+++ b/official/cv/resnet/config/resnet50_imagenet2012_Ascend_Thor_config.yaml
@@ -69,6 +69,12 @@ file_format: "MINDIR"
ckpt_file: ""
network_dataset: "resnet50_imagenet2012"
+# Retrain options
+save_graphs: False
+save_graphs_path: "./graphs"
+has_trained_epoch: 0
+has_trained_step: 0
+
# postprocess resnet inference
result_path: ''
label_path: ''
@@ -88,3 +94,5 @@ batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
+save_graphs: "Whether save graphs during training, default: False."
+save_graphs_path: "Path to save graphs."
diff --git a/official/cv/resnet/config/resnet50_imagenet2012_Boost_config.yaml b/official/cv/resnet/config/resnet50_imagenet2012_Boost_config.yaml
index 02ff74ebd72ead15184443bd5c51d2d3eee7cf11..862cb8d4c9297706c2566610b8ec6b1565ce02ad 100644
--- a/official/cv/resnet/config/resnet50_imagenet2012_Boost_config.yaml
+++ b/official/cv/resnet/config/resnet50_imagenet2012_Boost_config.yaml
@@ -70,6 +70,12 @@ file_format: "MINDIR"
ckpt_file: ""
network_dataset: "resnet50_imagenet2012"
+# Retrain options
+save_graphs: False
+save_graphs_path: "./graphs"
+has_trained_epoch: 0
+has_trained_step: 0
+
# postprocess resnet inference
result_path: ''
label_path: ''
@@ -89,3 +95,5 @@ batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
+save_graphs: "Whether save graphs during training, default: False."
+save_graphs_path: "Path to save graphs."
diff --git a/official/cv/resnet/config/resnet50_imagenet2012_GPU_Thor_config.yaml b/official/cv/resnet/config/resnet50_imagenet2012_GPU_Thor_config.yaml
index 37a704efe3b6256d6da21df4309dfdac42f79e03..b9b56a31f6143610094879da3941c6c4ad89925c 100644
--- a/official/cv/resnet/config/resnet50_imagenet2012_GPU_Thor_config.yaml
+++ b/official/cv/resnet/config/resnet50_imagenet2012_GPU_Thor_config.yaml
@@ -69,6 +69,12 @@ file_format: "MINDIR"
ckpt_file: ""
network_dataset: "resnet50_imagenet2012"
+# Retrain options
+save_graphs: False
+save_graphs_path: "./graphs"
+has_trained_epoch: 0
+has_trained_step: 0
+
# postprocess resnet inference
result_path: ''
label_path: ''
@@ -88,3 +94,5 @@ batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
+save_graphs: "Whether save graphs during training, default: False."
+save_graphs_path: "Path to save graphs."
diff --git a/official/cv/resnet/config/resnet50_imagenet2012_config.yaml b/official/cv/resnet/config/resnet50_imagenet2012_config.yaml
index b9db5f7bab7b85aa334551dc78fa066e75549eb6..3096bc0ef8fca228edb69a61eb59268fff0edf91 100644
--- a/official/cv/resnet/config/resnet50_imagenet2012_config.yaml
+++ b/official/cv/resnet/config/resnet50_imagenet2012_config.yaml
@@ -70,6 +70,12 @@ file_format: "MINDIR"
ckpt_file: ""
network_dataset: "resnet50_imagenet2012"
+# Retrain options
+save_graphs: False
+save_graphs_path: "./graphs"
+has_trained_epoch: 0
+has_trained_step: 0
+
# postprocess resnet inference
result_path: ''
label_path: ''
@@ -89,3 +95,5 @@ batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
+save_graphs: "Whether save graphs during training, default: False."
+save_graphs_path: "Path to save graphs."
diff --git a/official/cv/resnet/config/resnet_benchmark_GPU.yaml b/official/cv/resnet/config/resnet_benchmark_GPU.yaml
index 7fc403790cc21b3d66b2604868e144d2c76f07fd..fcb70e9c8136056b7c63db8336bfb7c459a559ce 100644
--- a/official/cv/resnet/config/resnet_benchmark_GPU.yaml
+++ b/official/cv/resnet/config/resnet_benchmark_GPU.yaml
@@ -40,6 +40,12 @@ file_format: "MINDIR"
ckpt_file: ""
network_dataset: "resnet50_imagenet2012"
+# Retrain options
+save_graphs: False
+save_graphs_path: "./graphs"
+has_trained_epoch: 0
+has_trained_step: 0
+
# postprocess resnet inference
result_path: ''
label_path: ''
@@ -59,3 +65,5 @@ batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
+save_graphs: "Whether save graphs during training, default: False."
+save_graphs_path: "Path to save graphs."
diff --git a/official/cv/resnet/config/se-resnet50_imagenet2012_config.yaml b/official/cv/resnet/config/se-resnet50_imagenet2012_config.yaml
index 71f24869d08a6ea26200f753891807aa0f9cc85d..545840c55bac1f99c8edb7ae01194441ba0791b8 100644
--- a/official/cv/resnet/config/se-resnet50_imagenet2012_config.yaml
+++ b/official/cv/resnet/config/se-resnet50_imagenet2012_config.yaml
@@ -71,6 +71,12 @@ file_format: "MINDIR"
ckpt_file: ""
network_dataset: "se-resnet50_imagenet2012"
+# Retrain options
+save_graphs: False
+save_graphs_path: "./graphs"
+has_trained_epoch: 0
+has_trained_step: 0
+
# postprocess resnet inference
result_path: ''
label_path: ''
@@ -90,3 +96,5 @@ batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
+save_graphs: "Whether save graphs during training, default: False."
+save_graphs_path: "Path to save graphs."
diff --git a/official/cv/resnet/train.py b/official/cv/resnet/train.py
index 28bb2e07214b3cbc7f71e75e5756ccd0d928f652..3af39f99732c0d7a156ba5f3f8501ed9b88a20de 100644
--- a/official/cv/resnet/train.py
+++ b/official/cv/resnet/train.py
@@ -13,8 +13,11 @@
# limitations under the License.
# ============================================================================
"""train resnet."""
+import datetime
+import glob
import os
import numpy as np
+
from mindspore import context
from mindspore import Tensor
from mindspore.nn.optim import Momentum, thor, LARS
@@ -31,6 +34,7 @@ from mindspore.parallel import set_algo_parameters
import mindspore.nn as nn
import mindspore.common.initializer as weight_init
import mindspore.log as logger
+
from src.lr_generator import get_lr, warmup_cosine_annealing_lr
from src.CrossEntropySmooth import CrossEntropySmooth
from src.eval_callback import EvalCallBack
@@ -43,6 +47,38 @@ from src.resnet import conv_variance_scaling_initializer
set_seed(1)
+
+class LossCallBack(LossMonitor):
+ """
+ Monitor the loss during training.
+ If the loss is NaN or INF, terminate training.
+ """
+
+ def __init__(self, has_trained_epoch=0):
+ super(LossCallBack, self).__init__()
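+ # Epoch offset for resumed runs, so logging continues from the global epoch index (see step_end below).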
+ self.has_trained_epoch = has_trained_epoch
+
+ def step_end(self, run_context):
+ cb_params = run_context.original_args()
+ loss = cb_params.net_outputs
+
+ if isinstance(loss, (tuple, list)):
+ if isinstance(loss[0], Tensor) and isinstance(loss[0].asnumpy(), np.ndarray):
+ loss = loss[0]
+
+ if isinstance(loss, Tensor) and isinstance(loss.asnumpy(), np.ndarray):
+ loss = np.mean(loss.asnumpy())
+
+ cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1
+
+ if isinstance(loss, float) and (np.isnan(loss) or np.isinf(loss)):
+ raise ValueError("epoch: {} step: {}. Invalid loss, terminating training.".format(
+ cb_params.cur_epoch_num, cur_step_in_epoch))
+ if self._per_print_times != 0 and cb_params.cur_step_num % self._per_print_times == 0:
+ print("epoch: %s step: %s, loss is %s" % (cb_params.cur_epoch_num + int(self.has_trained_epoch),
+ cur_step_in_epoch, loss), flush=True)
+
+
if config.net_name in ("resnet18", "resnet34", "resnet50", "resnet152"):
if config.net_name == "resnet18":
from src.resnet import resnet18 as resnet
@@ -76,6 +112,7 @@ def filter_checkpoint_parameter_by_list(origin_dict, param_filter):
del origin_dict[key]
break
+
def apply_eval(eval_param):
eval_model = eval_param["model"]
eval_ds = eval_param["dataset"]
@@ -83,23 +120,33 @@ def apply_eval(eval_param):
res = eval_model.eval(eval_ds)
return res[metrics_name]
+
def set_graph_kernel_context(run_platform, net_name):
if run_platform == "GPU" and net_name == "resnet101":
context.set_context(enable_graph_kernel=True)
context.set_context(graph_kernel_flags="--enable_parallel_fusion --enable_expand_ops=Conv2D")
+
def set_parameter():
"""set_parameter"""
target = config.device_target
if target == "CPU":
config.run_distribute = False
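+ # save_graphs is derived from pre_trained here, overriding the YAML value: dump IR graphs on fresh runs only.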
+ config.save_graphs = not config.pre_trained
+
# init context
if config.mode_name == 'GRAPH':
- context.set_context(mode=context.GRAPH_MODE, device_target=target, save_graphs=False)
+ if target == "Ascend":
+ rank_save_graphs_path = os.path.join(config.save_graphs_path, "soma", str(os.getenv('DEVICE_ID')))
+ context.set_context(mode=context.GRAPH_MODE, device_target=target, save_graphs=config.save_graphs,
+ save_graphs_path=rank_save_graphs_path)
+ else:
+ context.set_context(mode=context.GRAPH_MODE, device_target=target, save_graphs=config.save_graphs)
set_graph_kernel_context(target, config.net_name)
else:
context.set_context(mode=context.PYNATIVE_MODE, device_target=target, save_graphs=False)
+
if config.parameter_server:
context.set_ps_context(enable_ps=True)
if config.run_distribute:
@@ -124,14 +171,44 @@ def set_parameter():
if config.net_name == "resnet50":
context.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config)
-def init_weight(net):
+
+def load_pre_trained_checkpoint():
+ """
+ Load checkpoint according to pre_trained path.
+ """
+ param_dict = None
+ if config.pre_trained:
+ if os.path.isdir(config.pre_trained):
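+ # A directory enables resume mode: search the default checkpoint save directory for the newest .ckpt file.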
+ ckpt_save_dir = os.path.join(config.output_path, config.checkpoint_path, "ckpt_0")
+ ckpt_pattern = os.path.join(ckpt_save_dir, "*.ckpt")
+ ckpt_files = glob.glob(ckpt_pattern)
+ if not ckpt_files:
+ logger.warning(f"There is no ckpt file in {ckpt_save_dir}, "
+ f"pre_trained is unsupported.")
+ else:
+ ckpt_files.sort(key=os.path.getmtime, reverse=True)
+ time_stamp = datetime.datetime.now()
+ print(f"time stamp {time_stamp.strftime('%Y.%m.%d-%H:%M:%S')}"
+ f" pre trained ckpt model {ckpt_files[0]} loading",
+ flush=True)
+ param_dict = load_checkpoint(ckpt_files[0])
+ elif os.path.isfile(config.pre_trained):
+ param_dict = load_checkpoint(config.pre_trained)
+ else:
+ print(f"Invalid pre_trained {config.pre_trained} parameter.")
+ return param_dict
+
+
+def init_weight(net, param_dict):
"""init_weight"""
if config.pre_trained:
- param_dict = load_checkpoint(config.pre_trained)
- if config.filter_weight:
- filter_list = [x.name for x in net.end_point.get_parameters()]
- filter_checkpoint_parameter_by_list(param_dict, filter_list)
- load_param_into_net(net, param_dict)
+ if param_dict:
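+ # epoch_num and step_num were appended to the checkpoint via CheckpointConfig(append_info=...) at save time.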
+ config.has_trained_epoch = int(param_dict["epoch_num"].data.asnumpy())
+ config.has_trained_step = int(param_dict["step_num"].data.asnumpy())
+ if config.filter_weight:
+ filter_list = [x.name for x in net.end_point.get_parameters()]
+ filter_checkpoint_parameter_by_list(param_dict, filter_list)
+ load_param_into_net(net, param_dict)
else:
for _, cell in net.cells_and_names():
if isinstance(cell, nn.Conv2d):
@@ -156,6 +233,7 @@ def init_weight(net):
weight = Tensor(np.reshape(weight, (out_channel, in_channel)), dtype=cell.weight.dtype)
cell.weight.set_data(weight)
+
def init_lr(step_size):
"""init lr"""
if config.optimizer == "Thor":
@@ -171,6 +249,7 @@ def init_lr(step_size):
config.pretrain_epoch_size * step_size)
return lr
+
def init_loss_scale():
if config.dataset == "imagenet2012":
if not config.use_label_smooth:
@@ -196,6 +275,7 @@ def init_group_params(net):
{'order_params': net.trainable_params()}]
return group_params
+
def run_eval(target, model, ckpt_save_dir, cb):
"""run_eval"""
if config.run_eval:
@@ -230,6 +310,7 @@ def train_net():
"""train net"""
target = config.device_target
set_parameter()
+ ckpt_param_dict = load_pre_trained_checkpoint()
dataset = create_dataset(dataset_path=config.data_path, do_train=True, repeat_num=1,
batch_size=config.batch_size, train_image_size=config.train_image_size,
eval_image_size=config.eval_image_size, target=target,
@@ -238,7 +319,8 @@ def train_net():
net = resnet(class_num=config.class_num)
if config.parameter_server:
net.set_param_ps()
- init_weight(net=net)
+
+ init_weight(net=net, param_dict=ckpt_param_dict)
lr = Tensor(init_lr(step_size=step_size))
# define opt
group_params = init_group_params(net)
@@ -275,12 +357,14 @@ def train_net():
# define callbacks
time_cb = TimeMonitor(data_size=step_size)
- loss_cb = LossMonitor()
+ loss_cb = LossCallBack(config.has_trained_epoch)
cb = [time_cb, loss_cb]
ckpt_save_dir = set_save_ckpt_dir()
if config.save_checkpoint:
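+ # Embed the resume progress into every saved checkpoint; init_weight reads these keys back when pre_trained is set.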
+ ckpt_append_info = [{"epoch_num": config.has_trained_epoch, "step_num": config.has_trained_step}]
config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * step_size,
- keep_checkpoint_max=config.keep_checkpoint_max)
+ keep_checkpoint_max=config.keep_checkpoint_max,
+ append_info=ckpt_append_info)
ckpt_cb = ModelCheckpoint(prefix="resnet", directory=ckpt_save_dir, config=config_ck)
cb += [ckpt_cb]
run_eval(target, model, ckpt_save_dir, cb)
@@ -288,11 +372,13 @@ def train_net():
if config.net_name == "se-resnet50":
config.epoch_size = config.train_epoch_size
dataset_sink_mode = (not config.parameter_server) and target != "CPU"
+ config.pretrain_epoch_size = config.has_trained_epoch
model.train(config.epoch_size - config.pretrain_epoch_size, dataset, callbacks=cb,
sink_size=dataset.get_dataset_size(), dataset_sink_mode=dataset_sink_mode)
if config.run_eval and config.enable_cache:
print("Remember to shut down the cache server via \"cache_admin --stop\"")
+
if __name__ == '__main__':
train_net()
diff --git a/official/nlp/pangu_alpha/src/utils.py b/official/nlp/pangu_alpha/src/utils.py
index 528dca2d23c4147919cef4a534295f19a27cf060..2fbff49d1ae5a16ec7c0abb0d247250ee4b98fd1 100644
--- a/official/nlp/pangu_alpha/src/utils.py
+++ b/official/nlp/pangu_alpha/src/utils.py
@@ -16,6 +16,7 @@
network config setting, gradient clip function and dynamic learning rate function
"""
import argparse
+import ast
import os
import time
import numpy as np
@@ -385,6 +386,44 @@ def add_training_params(opt):
help="Column name of datasets")
+def add_retrain_params(opt):
+ """
+ Add parameters about retrain.
+ """
+ opt.add_argument("--pre_trained",
+ type=str,
+ default=None,
+ help="Pretrained checkpoint path.")
+ opt.add_argument("--save_checkpoint_path",
+ type=str,
+ default=None,
+ help="Save checkpoint path.")
+ opt.add_argument("--keep_checkpoint_max",
+ type=int,
+ default=1,
+ help="Max checkpoint save number.")
+ opt.add_argument("--save_checkpoint_steps",
+ type=int,
+ default=2000,
+ help="Save checkpoint step number.")
+ opt.add_argument("--save_checkpoint",
+ type=ast.literal_eval,
+ default=False,
+ help="Whether save checkpoint in local disk.")
+ opt.add_argument("--ckpt_name_prefix",
+ type=str,
+ default="pangu",
+ help="Saving checkpoint name prefix.")
+ opt.add_argument("--has_trained_epoches",
+ type=int,
+ default=0,
+ help="Epoches has been trained before.")
+ opt.add_argument("--has_trained_steps",
+ type=int,
+ default=0,
+ help="Steps has been trained before.")
+
+
def get_args(inference=False):
"""train function for PanguAlpha"""
parser = argparse.ArgumentParser(description="PanguAlpha training")
@@ -469,6 +508,7 @@ def get_args(inference=False):
default=10,
help="The eval step in train and eval mode. Default 10.")
add_training_params(parser)
+ add_retrain_params(parser)
if inference:
add_inference_params(parser)
args_opt = parser.parse_args()
diff --git a/official/nlp/pangu_alpha/train.py b/official/nlp/pangu_alpha/train.py
index 0b72d10fda10ed9b0dbee6fbc6b246329280228e..9d1ef23acc91137c23491a7016d84a0f29cf3ba5 100644
--- a/official/nlp/pangu_alpha/train.py
+++ b/official/nlp/pangu_alpha/train.py
@@ -16,6 +16,8 @@
PanguAlpha train script
"""
+import datetime
+import glob
import os
import math
from mindspore import context
@@ -30,6 +32,8 @@ from mindspore.parallel import set_algo_parameters
from mindspore.parallel._cost_model_context import _set_multi_subgraphs
from mindspore.nn.wrap.cell_wrapper import PipelineCell, _VirtualDatasetCell
from mindspore.parallel.nn import TransformerOpParallelConfig, CrossEntropyLoss
+from mindspore.train.callback import ModelCheckpoint, CheckpointConfig
+
from src.adam import AdamWeightDecayOp
from src.dataset import create_dataset
from src.pangu_alpha import PanGUAlphaWithLoss, PanguAlphaModel
@@ -64,10 +68,30 @@ def set_weight_decay(params):
return group_params
-def run_train(args_opt):
+def add_checkpoint_callback_policy(args_param, callback, rank_id):
r"""
- The main training process.
+ Add checkpoint policy to callback.
"""
+ if args_param.save_checkpoint:
+ # checkpoint store epoch_num and step_num info
+ ckpt_append_info = [{"epoch_num": args_param.has_trained_epoches, "step_num": args_param.has_trained_steps}]
+ ckpt_config = CheckpointConfig(save_checkpoint_steps=args_param.save_checkpoint_steps,
+ keep_checkpoint_max=args_param.keep_checkpoint_max,
+ integrated_save=False,
+ append_info=ckpt_append_info
+ )
+
+ ckpoint_cb = ModelCheckpoint(prefix=args_param.ckpt_name_prefix + str(rank_id),
+ directory=args_param.save_checkpoint_path,
+ config=ckpt_config)
+
+ callback.append(ckpoint_cb)
+
+
+def run_train(args_opt):
+ r"""The main training process."""
+ os.environ['HCCL_CONNECT_TIMEOUT'] = "6000"
+
# Set execution mode
context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target)
context.set_context(variable_memory_max_size="31GB")
@@ -81,7 +105,7 @@ def run_train(args_opt):
context.set_auto_parallel_context(
parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, gradients_mean=False,
full_batch=bool(args_opt.full_batch), strategy_ckpt_load_file=args_opt.strategy_load_ckpt_path,
- enable_parallel_optimizer=bool(args_opt.optimizer_shard))
+ enable_parallel_optimizer=bool(args_opt.optimizer_shard), strategy_ckpt_save_file='strategy.ckpt')
set_algo_parameters(elementwise_op_strategy_follow=True)
_set_multi_subgraphs()
else:
@@ -125,10 +149,8 @@ def run_train(args_opt):
pangu_alpha_with_loss = _VirtualDatasetCell(pangu_alpha_with_loss_net)
print("=====args_opt is: ", args_opt, flush=True)
# Warm-up and cosine decay learning rate
- lr = LearningRate(learning_rate=args_opt.start_lr,
- end_learning_rate=args_opt.end_lr,
- warmup_steps=args_opt.warmup_step,
- decay_steps=200000)
+ lr = LearningRate(learning_rate=args_opt.start_lr, end_learning_rate=args_opt.end_lr,
+ warmup_steps=args_opt.warmup_step, decay_steps=200000)
params = pangu_alpha_with_loss.trainable_params()
group_params = set_weight_decay(params)
@@ -163,6 +185,12 @@ def run_train(args_opt):
callback.append(EvalCallBack(model, ds_eval, ppl_metric))
else:
model = Model(pangu_alpha_with_grads)
+
+ if args_opt.pre_trained:
+ load_checkpoint(args_opt, args_opt.sink_size, ds, model, device_num)
+
+ add_checkpoint_callback_policy(args_opt, callback, rank)
+
if args_opt.incremental_training:
from mindspore.train.serialization import load_distributed_checkpoint
strategy = model.infer_train_layout(train_dataset=ds, sink_size=args_opt.sink_size)
@@ -176,10 +204,57 @@ def run_train(args_opt):
model.train(actual_epoch_num, ds, callbacks=callback, sink_size=args_opt.sink_size, dataset_sink_mode=True)
-def run_train_pipeline(args_opt):
+def load_checkpoint(args_param, sink_size, dataset, model, device_num):
r"""
- The main training process in pipeline.
+ Load checkpoint process.
"""
+ from mindspore.train.serialization import load_distributed_checkpoint
+ strategy = model.infer_train_layout(train_dataset=dataset, sink_size=sink_size)
+ print("======start load_distributed checkpoint", flush=True)
+ # For 2.6B and 13B models, the number of ckpt files is 512.
+ ckpt_name = args_param.ckpt_name_prefix
+ if os.path.isdir(args_param.pre_trained):
+ ckpt_pattern = os.path.join(args_param.save_checkpoint_path,
+ f"{ckpt_name}*.ckpt")
+ ckpt_files = glob.glob(ckpt_pattern)
+ if not ckpt_files:
+ print(f"There is no ckpt file in {args_param.load_ckpt_path}, "
+ f"pre_trained is unsupported.")
+ else:
+ ckpt_files.sort(key=os.path.getmtime, reverse=True)
+ time_stamp = datetime.datetime.now()
+ print(f"time stamp {time_stamp.strftime('%Y.%m.%d-%H:%M:%S')} pre trained ckpt model {ckpt_files} loading",
+ flush=True)
+ ckpt_file = os.path.basename(ckpt_files[0])
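+ # Checkpoint files are named "{prefix}{rank}-{epoch}_{step}.ckpt"; ModelCheckpoint may insert an extra
+ # "_{n}" duplicate counter after the prefix, hence the two name formats handled below.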
+ ckpt_file_length = ckpt_file.split("_")
+ if len(ckpt_file_length) == 3:
+ duplicate_num = ckpt_file.split("-")[0].split("_")[-1]
+ step_size = ckpt_file.split("-")[-1].split("_")[0]
+ sink_size = ckpt_file.split("-")[-1].split("_")[-1].split(".")[0]
+ ckpt_file_list = [os.path.join(args_param.save_checkpoint_path,
+ f"{ckpt_name}{ckpt_rank}_{depulicate_num}-{step_size}_{sink_size}.ckpt")
+ for ckpt_rank in range(device_num)]
+ # Load checkpoint files
+ load_distributed_checkpoint(model.train_network, ckpt_file_list, strategy)
+ elif len(ckpt_file_length) == 2:
+ step_size = ckpt_file.split("-")[-1].split("_")[0]
+ sink_size = ckpt_file.split("-")[-1].split("_")[-1].split(".")[0]
+ ckpt_file_list = [os.path.join(args_param.save_checkpoint_path,
+ f"{ckpt_name}{ckpt_rank}-{step_size}_{sink_size}.ckpt")
+ for ckpt_rank in range(device_num)]
+ # Load checkpoint files
+ load_distributed_checkpoint(model.train_network, ckpt_file_list, strategy)
+ else:
+ print(f"Please check {args_param.pre_trained} value.")
+ else:
+ print(f"Please check {args_param.pre_trained} value.")
+
+
+def run_train_pipeline(args_opt):
+ r"""The main training process in pipeline."""
+ # Set hccl connect time
+ os.environ['HCCL_CONNECT_TIMEOUT'] = "6000"
+
context.set_context(save_graphs=False, mode=context.GRAPH_MODE, device_target=args_opt.device_target)
context.set_context(variable_memory_max_size="31GB")
if args_opt.distribute == "true":