Commit 409abab6 authored by i-robot, committed by Gitee

!89 [Model Development] Sync the resnet and pangu_alpha code from the r1.3 branch to master

Merge pull request !89 from Atlas_hrp/master
parents 5abf4f57 e9db9dcb
Showing changed files with 404 additions and 19 deletions
......@@ -269,6 +269,10 @@ Parameters for both training and evaluation can be set in config file.
"lr_init": 0.01, # initial learning rate
"lr_end": 0.00001, # final learning rate
"lr_max": 0.1, # maximum learning rate
"save_graphs": False, # save graph results
"save_graphs_path": "./graphs", # save graph results path
"has_trained_epoch":0, # epoch size that model has been trained before loading pretrained checkpoint, actual training epoch size is equal to epoch_size minus has_trained_epoch
"has_trained_step":0, # step size that model has been trained before loading pretrained checkpoint, actual training epoch size is equal to step_size minus has_trained_step
```
- Config for ResNet18 and ResNet50, ImageNet2012 dataset
......@@ -291,6 +295,10 @@ Parameters for both training and evaluation can be set in config file.
"lr_init": 0, # initial learning rate
"lr_max": 0.8, # maximum learning rate
"lr_end": 0.0, # minimum learning rate
"save_graphs": False, # save graph results
"save_graphs_path": "./graphs", # save graph results path
"has_trained_epoch":0, # epoch size that model has been trained before loading pretrained checkpoint, actual training epoch size is equal to epoch_size minus has_trained_epoch
"has_trained_step":0, # step size that model has been trained before loading pretrained checkpoint, actual training epoch size is equal to step_size minus has_trained_step
```
- Config for ResNet34, ImageNet2012 dataset
......@@ -333,6 +341,10 @@ Parameters for both training and evaluation can be set in config file.
"use_label_smooth": True, # label_smooth
"label_smooth_factor": 0.1, # label_smooth_factor
"lr": 0.1 # base learning rate
"save_graphs": False, # save graph results
"save_graphs_path": "./graphs", # save graph results path
"has_trained_epoch":0, # epoch size that model has been trained before loading pretrained checkpoint, actual training epoch size is equal to epoch_size minus has_trained_epoch
"has_trained_step":0, # step size that model has been trained before loading pretrained checkpoint, actual training epoch size is equal to step_size minus has_trained_step
```
- Config for ResNet152, ImageNet2012 dataset
......@@ -354,6 +366,10 @@ Parameters for both training and evaluation can be set in config file.
"label_smooth_factor": 0.1, # label_smooth_factor
"lr": 0.1, # base learning rate
"lr_end": 0.0001, # end learning rate
"save_graphs": False, # save graph results
"save_graphs_path": "./graphs", # save graph results path
"has_trained_epoch":0, # epoch size that model has been trained before loading pretrained checkpoint, actual training epoch size is equal to epoch_size minus has_trained_epoch
"has_trained_step":0, # step size that model has been trained before loading pretrained checkpoint, actual training epoch size is equal to step_size minus has_trained_step
```
- Config for SE-ResNet50, ImageNet2012 dataset
......@@ -377,6 +393,10 @@ Parameters for both training and evaluation can be set in config file.
"lr_init": 0.0, # initial learning rate
"lr_max": 0.3, # maximum learning rate
"lr_end": 0.0001, # end learning rate
"save_graphs": False, # save graph results
"save_graphs_path": "./graphs", # save graph results path
"has_trained_epoch":0, # epoch size that model has been trained before loading pretrained checkpoint, actual training epoch size is equal to epoch_size minus has_trained_epoch
"has_trained_step":0, # step size that model has been trained before loading pretrained checkpoint, actual training epoch size is equal to step_size minus has_trained_step
```
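The has_trained_epoch and has_trained_step fields shown in the configurations above control how much work remains when training resumes from a pretrained checkpoint. A minimal sketch of that arithmetic, using illustrative values only (not taken from any config file):

```python
# Illustration of the resume arithmetic described in the config comments (hypothetical values).
epoch_size = 90          # total epochs requested in the config
has_trained_epoch = 30   # epochs already covered by the loaded checkpoint
remaining_epochs = epoch_size - has_trained_epoch
print(remaining_epochs)  # 60 epochs are actually run after resuming
```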
## [Training Process](#contents)
......@@ -465,6 +485,20 @@ By default, a standalone cache server would be started to cache all eval images
Users can choose to shut down the cache server after training or leave it running for future use.
## [Resume Process](#contents)
### Usage
#### Running on Ascend
```text
# distributed training
Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH]
# standalone training
Usage: bash run_standalone_train.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH]
```
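For instance, a resume run on a single device might look like the following (the dataset path, config file, and checkpoint path are placeholders, not actual repository paths):

```text
bash run_standalone_train.sh /path/to/imagenet /path/to/config.yaml /path/to/pretrained.ckpt
```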
### Result
- Training ResNet18 with CIFAR-10 dataset
......
......@@ -249,6 +249,10 @@ bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]
"lr_init":0.01, # 初始学习率
"lr_end":0.0001, # 最终学习率
"lr_max":0.1, # 最大学习率
"save_graphs":False, # 是否保存图编译结果
"save_graphs_path":"./graphs", # 图编译结果保存路径
"has_trained_epoch":0, # 加载已经训练好的模型的epoch大小;实际训练周期大小等于epoch_size减去has_trained_epoch
"has_trained_step":0, # 加载已经训练好的模型的step大小;实际训练周期大小等于step_size减去has_trained_step
```
- Config for ResNet18 and ResNet50, ImageNet2012 dataset
......@@ -271,6 +275,10 @@ bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]
"lr_init":0, # 初始学习率
"lr_max":0.8, # 最大学习率
"lr_end":0.0, # 最小学习率
"save_graphs":False, # 是否保存图编译结果
"save_graphs_path":"./graphs", # 图编译结果保存路径
"has_trained_epoch":0, # 加载已经训练好的模型的epoch大小;实际训练周期大小等于epoch_size减去has_trained_epoch
"has_trained_step":0, # 加载已经训练好的模型的step大小;实际训练周期大小等于step_size减去has_trained_step
```
- Config for ResNet34, ImageNet2012 dataset
......@@ -293,6 +301,10 @@ bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]
"lr_init":0, # 初始学习率
"lr_max":1.0, # 最大学习率
"lr_end":0.0, # 最小学习率
"save_graphs":False, # 是否保存图编译结果
"save_graphs_path":"./graphs", # 图编译结果保存路径
"has_trained_epoch":0, # 加载已经训练好的模型的epoch大小;实际训练周期大小等于epoch_size减去has_trained_epoch
"has_trained_step":0, # 加载已经训练好的模型的step大小;实际训练周期大小等于step_size减去has_trained_step
```
- Config for ResNet101, ImageNet2012 dataset
......@@ -313,6 +325,10 @@ bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]
"use_label_smooth":True, # 标签平滑
"label_smooth_factor":0.1, # 标签平滑因子
"lr":0.1 # 基础学习率
"save_graphs":False, # 是否保存图编译结果
"save_graphs_path":"./graphs", # 图编译结果保存路径
"has_trained_epoch":0, # 加载已经训练好的模型的epoch大小;实际训练周期大小等于epoch_size减去has_trained_epoch
"has_trained_step":0, # 加载已经训练好的模型的step大小;实际训练周期大小等于step_size减去has_trained_step
```
- Config for ResNet152, ImageNet2012 dataset
......@@ -334,6 +350,10 @@ bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]
"label_smooth_factor":0.1, # 标签平滑因子
"lr":0.1, # 基础学习率
"lr_end":0.0001, # 最终学习率
"save_graphs":False, # 是否保存图编译结果
"save_graphs_path":"./graphs", # 图编译结果保存路径
"has_trained_epoch":0, # 加载已经训练好的模型的epoch大小;实际训练周期大小等于epoch_size减去has_trained_epoch
"has_trained_step":0, # 加载已经训练好的模型的step大小;实际训练周期大小等于step_size减去has_trained_step
```
- Config for SE-ResNet50, ImageNet2012 dataset
......@@ -357,6 +377,10 @@ bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]
"lr_init":0.0, # 初始学习率
"lr_max":0.3, # 最大学习率
"lr_end":0.0001, # 最终学习率
"save_graphs":False, # 是否保存图编译结果
"save_graphs_path":"./graphs", # 图编译结果保存路径
"has_trained_epoch":0, # 加载已经训练好的模型的epoch大小;实际训练周期大小等于epoch_size减去has_trained_epoch
"has_trained_step":0, # 加载已经训练好的模型的step大小;实际训练周期大小等于step_size减去has_trained_step
```
## Training Process
......@@ -434,6 +458,20 @@ bash run_standalone_train_gpu.sh [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASE
After training, you can choose to shut down the cache server or keep it running to serve future inference.
## Resume Process
### Usage
#### Running on Ascend
```text
# distributed training
Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH]
# standalone training
Usage: bash run_standalone_train.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH]
```
### Result
- Training ResNet18 with the CIFAR-10 dataset
......
......@@ -69,6 +69,12 @@ file_format: "MINDIR"
ckpt_file: ""
network_dataset: "resnet101_imagenet2012"
# Retrain options
save_graphs: False
save_graphs_path: "./graphs"
has_trained_epoch: 0
has_trained_step: 0
# postprocess resnet inference
result_path: ''
label_path: ''
......@@ -88,3 +94,5 @@ batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
save_graphs: "Whether save graphs during training, default: False."
save_graphs_path: "Path to save graphs."
......@@ -71,6 +71,12 @@ file_format: "MINDIR"
ckpt_file: ""
network_dataset: "resnet152_imagenet2012"
# Retrain options
save_graphs: False
save_graphs_path: "./graphs"
has_trained_epoch: 0
has_trained_step: 0
# postprocess resnet inference
result_path: ''
label_path: ''
......@@ -93,3 +99,5 @@ checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
result_path: "result files path."
label_path: "image file path."
save_graphs: "Whether save graphs during training, default: False."
save_graphs_path: "Path to save graphs."
......@@ -65,6 +65,12 @@ file_format: "MINDIR"
ckpt_file: ""
network_dataset: "resnet18_cifar10"
# Retrain options
save_graphs: False
save_graphs_path: "./graphs"
has_trained_epoch: 0
has_trained_step: 0
# postprocess resnet inference
result_path: ''
label_path: ''
......@@ -84,3 +90,5 @@ batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
save_graphs: "Whether save graphs during training, default: False."
save_graphs_path: "Path to save graphs."
......@@ -65,6 +65,12 @@ file_format: "MINDIR"
ckpt_file: ""
network_dataset: "resnet18_cifar10"
# Retrain options
save_graphs: False
save_graphs_path: "./graphs"
has_trained_epoch: 0
has_trained_step: 0
# postprocess resnet inference
result_path: ''
label_path: ''
......@@ -84,3 +90,5 @@ batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
save_graphs: "Whether save graphs during training, default: False."
save_graphs_path: "Path to save graphs."
......@@ -67,6 +67,12 @@ file_format: "MINDIR"
ckpt_file: ""
network_dataset: "resnet18_imagenet2012"
# Retrain options
save_graphs: False
save_graphs_path: "./graphs"
has_trained_epoch: 0
has_trained_step: 0
# postprocess resnet inference
result_path: ''
label_path: ''
......@@ -86,3 +92,5 @@ batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
save_graphs: "Whether save graphs during training, default: False."
save_graphs_path: "Path to save graphs."
......@@ -67,6 +67,12 @@ file_format: "MINDIR"
ckpt_file: ""
network_dataset: "resnet18_imagenet2012"
# Retrain options
save_graphs: False
save_graphs_path: "./graphs"
has_trained_epoch: 0
has_trained_step: 0
# postprocess resnet inference
result_path: ''
label_path: ''
......@@ -86,3 +92,5 @@ batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
save_graphs: "Whether save graphs during training, default: False."
save_graphs_path: "Path to save graphs."
......@@ -67,6 +67,12 @@ file_format: "MINDIR"
ckpt_file: ""
network_dataset: "resnet34_imagenet2012"
# Retrain options
save_graphs: False
save_graphs_path: "./graphs"
has_trained_epoch: 0
has_trained_step: 0
# postprocess resnet inference
result_path: ''
label_path: ''
......@@ -86,3 +92,5 @@ batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
save_graphs: "Whether save graphs during training, default: False."
save_graphs_path: "Path to save graphs."
......@@ -68,6 +68,12 @@ file_format: "MINDIR"
ckpt_file: ""
network_dataset: "resnet50_cifar10"
# Retrain options
save_graphs: False
save_graphs_path: "./graphs"
has_trained_epoch: 0
has_trained_step: 0
# postprocess resnet inference
result_path: ''
label_path: ''
......@@ -87,3 +93,5 @@ batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
save_graphs: "Whether save graphs during training, default: False."
save_graphs_path: "Path to save graphs."
......@@ -69,6 +69,12 @@ file_format: "MINDIR"
ckpt_file: ""
network_dataset: "resnet50_imagenet2012"
# Retrain options
save_graphs: False
save_graphs_path: "./graphs"
has_trained_epoch: 0
has_trained_step: 0
# postprocess resnet inference
result_path: ''
label_path: ''
......@@ -88,3 +94,5 @@ batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
save_graphs: "Whether save graphs during training, default: False."
save_graphs_path: "Path to save graphs."
......@@ -70,6 +70,12 @@ file_format: "MINDIR"
ckpt_file: ""
network_dataset: "resnet50_imagenet2012"
# Retrain options
save_graphs: False
save_graphs_path: "./graphs"
has_trained_epoch: 0
has_trained_step: 0
# postprocess resnet inference
result_path: ''
label_path: ''
......@@ -89,3 +95,5 @@ batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
save_graphs: "Whether save graphs during training, default: False."
save_graphs_path: "Path to save graphs."
......@@ -69,6 +69,12 @@ file_format: "MINDIR"
ckpt_file: ""
network_dataset: "resnet50_imagenet2012"
# Retrain options
save_graphs: False
save_graphs_path: "./graphs"
has_trained_epoch: 0
has_trained_step: 0
# postprocess resnet inference
result_path: ''
label_path: ''
......@@ -88,3 +94,5 @@ batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
save_graphs: "Whether save graphs during training, default: False."
save_graphs_path: "Path to save graphs."
......@@ -70,6 +70,12 @@ file_format: "MINDIR"
ckpt_file: ""
network_dataset: "resnet50_imagenet2012"
# Retrain options
save_graphs: False
save_graphs_path: "./graphs"
has_trained_epoch: 0
has_trained_step: 0
# postprocess resnet inference
result_path: ''
label_path: ''
......@@ -89,3 +95,5 @@ batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
save_graphs: "Whether save graphs during training, default: False."
save_graphs_path: "Path to save graphs."
......@@ -40,6 +40,12 @@ file_format: "MINDIR"
ckpt_file: ""
network_dataset: "resnet50_imagenet2012"
# Retrain options
save_graphs: False
save_graphs_path: "./graphs"
has_trained_epoch: 0
has_trained_step: 0
# postprocess resnet inference
result_path: ''
label_path: ''
......@@ -59,3 +65,5 @@ batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
save_graphs: "Whether save graphs during training, default: False."
save_graphs_path: "Path to save graphs."
......@@ -71,6 +71,12 @@ file_format: "MINDIR"
ckpt_file: ""
network_dataset: "se-resnet50_imagenet2012"
# Retrain options
save_graphs: False
save_graphs_path: "./graphs"
has_trained_epoch: 0
has_trained_step: 0
# postprocess resnet inference
result_path: ''
label_path: ''
......@@ -90,3 +96,5 @@ batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
save_graphs: "Whether save graphs during training, default: False."
save_graphs_path: "Path to save graphs."
......@@ -13,8 +13,11 @@
# limitations under the License.
# ============================================================================
"""train resnet."""
import datetime
import glob
import os
import numpy as np
from mindspore import context
from mindspore import Tensor
from mindspore.nn.optim import Momentum, thor, LARS
......@@ -31,6 +34,7 @@ from mindspore.parallel import set_algo_parameters
import mindspore.nn as nn
import mindspore.common.initializer as weight_init
import mindspore.log as logger
from src.lr_generator import get_lr, warmup_cosine_annealing_lr
from src.CrossEntropySmooth import CrossEntropySmooth
from src.eval_callback import EvalCallBack
......@@ -43,6 +47,38 @@ from src.resnet import conv_variance_scaling_initializer
set_seed(1)
class LossCallBack(LossMonitor):
    """
    Monitor the loss in training.

    If the loss is NAN or INF, terminate training.
    """

    def __init__(self, has_trained_epoch=0):
        super(LossCallBack, self).__init__()
        self.has_trained_epoch = has_trained_epoch

    def step_end(self, run_context):
        cb_params = run_context.original_args()
        loss = cb_params.net_outputs

        if isinstance(loss, (tuple, list)):
            if isinstance(loss[0], Tensor) and isinstance(loss[0].asnumpy(), np.ndarray):
                loss = loss[0]

        if isinstance(loss, Tensor) and isinstance(loss.asnumpy(), np.ndarray):
            loss = np.mean(loss.asnumpy())

        cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1

        if isinstance(loss, float) and (np.isnan(loss) or np.isinf(loss)):
            raise ValueError("epoch: {} step: {}. Invalid loss, terminating training.".format(
                cb_params.cur_epoch_num, cur_step_in_epoch))
        if self._per_print_times != 0 and cb_params.cur_step_num % self._per_print_times == 0:
            print("epoch: %s step: %s, loss is %s" % (cb_params.cur_epoch_num + int(self.has_trained_epoch),
                                                      cur_step_in_epoch, loss), flush=True)
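# Note: train_net() further below is expected to register this callback in place of the plain
# LossMonitor, e.g. loss_cb = LossCallBack(config.has_trained_epoch), so that the printed
# epoch counter continues from the epoch recorded in the resumed checkpoint.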
if config.net_name in ("resnet18", "resnet34", "resnet50", "resnet152"):
    if config.net_name == "resnet18":
        from src.resnet import resnet18 as resnet
......@@ -76,6 +112,7 @@ def filter_checkpoint_parameter_by_list(origin_dict, param_filter):
del origin_dict[key]
break
def apply_eval(eval_param):
    eval_model = eval_param["model"]
    eval_ds = eval_param["dataset"]
......@@ -83,23 +120,33 @@ def apply_eval(eval_param):
    res = eval_model.eval(eval_ds)
    return res[metrics_name]
def set_graph_kernel_context(run_platform, net_name):
    if run_platform == "GPU" and net_name == "resnet101":
        context.set_context(enable_graph_kernel=True)
        context.set_context(graph_kernel_flags="--enable_parallel_fusion --enable_expand_ops=Conv2D")


def set_parameter():
    """set_parameter"""
    target = config.device_target
    if target == "CPU":
        config.run_distribute = False

    config.save_graphs = not config.pre_trained
    # init context
    if config.mode_name == 'GRAPH':
        context.set_context(mode=context.GRAPH_MODE, device_target=target, save_graphs=False)
        if target == "Ascend":
            rank_save_graphs_path = os.path.join(config.save_graphs_path, "soma", str(os.getenv('DEVICE_ID')))
            context.set_context(mode=context.GRAPH_MODE, device_target=target, save_graphs=config.save_graphs,
                                save_graphs_path=rank_save_graphs_path)
        else:
            context.set_context(mode=context.GRAPH_MODE, device_target=target, save_graphs=config.save_graphs)
        set_graph_kernel_context(target, config.net_name)
    else:
        context.set_context(mode=context.PYNATIVE_MODE, device_target=target, save_graphs=False)
    if config.parameter_server:
        context.set_ps_context(enable_ps=True)
    if config.run_distribute:
......@@ -124,14 +171,44 @@ def set_parameter():
if config.net_name == "resnet50":
context.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config)
def init_weight(net):
def load_pre_trained_checkpoint():
    """
    Load checkpoint according to pre_trained path.
    """
    param_dict = None
    if config.pre_trained:
        if os.path.isdir(config.pre_trained):
            ckpt_save_dir = os.path.join(config.output_path, config.checkpoint_path, "ckpt_0")
            ckpt_pattern = os.path.join(ckpt_save_dir, "*.ckpt")
            ckpt_files = glob.glob(ckpt_pattern)
            if not ckpt_files:
                logger.warning(f"There is no ckpt file in {ckpt_save_dir}, "
                               f"pre_trained is unsupported.")
            else:
                ckpt_files.sort(key=os.path.getmtime, reverse=True)
                time_stamp = datetime.datetime.now()
                print(f"time stamp {time_stamp.strftime('%Y.%m.%d-%H:%M:%S')}"
                      f" pre trained ckpt model {ckpt_files[0]} loading",
                      flush=True)
                param_dict = load_checkpoint(ckpt_files[0])
        elif os.path.isfile(config.pre_trained):
            param_dict = load_checkpoint(config.pre_trained)
        else:
            print(f"Invalid pre_trained {config.pre_trained} parameter.")
    return param_dict
def init_weight(net, param_dict):
    """init_weight"""
    if config.pre_trained:
        param_dict = load_checkpoint(config.pre_trained)
        if config.filter_weight:
            filter_list = [x.name for x in net.end_point.get_parameters()]
            filter_checkpoint_parameter_by_list(param_dict, filter_list)
        load_param_into_net(net, param_dict)
        if param_dict:
            config.has_trained_epoch = int(param_dict["epoch_num"].data.asnumpy())
            config.has_trained_step = int(param_dict["step_num"].data.asnumpy())
            if config.filter_weight:
                filter_list = [x.name for x in net.end_point.get_parameters()]
                filter_checkpoint_parameter_by_list(param_dict, filter_list)
            load_param_into_net(net, param_dict)
    else:
        for _, cell in net.cells_and_names():
            if isinstance(cell, nn.Conv2d):
......@@ -156,6 +233,7 @@ def init_weight(net):
weight = Tensor(np.reshape(weight, (out_channel, in_channel)), dtype=cell.weight.dtype)
cell.weight.set_data(weight)
def init_lr(step_size):
"""init lr"""
if config.optimizer == "Thor":
......@@ -171,6 +249,7 @@ def init_lr(step_size):
config.pretrain_epoch_size * step_size)
return lr
def init_loss_scale():
if config.dataset == "imagenet2012":
if not config.use_label_smooth:
......@@ -196,6 +275,7 @@ def init_group_params(net):
{'order_params': net.trainable_params()}]
return group_params
def run_eval(target, model, ckpt_save_dir, cb):
"""run_eval"""
if config.run_eval:
......@@ -230,6 +310,7 @@ def train_net():
"""train net"""
target = config.device_target
set_parameter()
ckpt_param_dict = load_pre_trained_checkpoint()
dataset = create_dataset(dataset_path=config.data_path, do_train=True, repeat_num=1,
batch_size=config.batch_size, train_image_size=config.train_image_size,
eval_image_size=config.eval_image_size, target=target,
......@@ -238,7 +319,8 @@ def train_net():
    net = resnet(class_num=config.class_num)
    if config.parameter_server:
        net.set_param_ps()
    init_weight(net=net)
    init_weight(net=net, param_dict=ckpt_param_dict)
    lr = Tensor(init_lr(step_size=step_size))
    # define opt
    group_params = init_group_params(net)
......@@ -275,12 +357,14 @@ def train_net():
    # define callbacks
    time_cb = TimeMonitor(data_size=step_size)
    loss_cb = LossMonitor()
    loss_cb = LossCallBack(config.has_trained_epoch)
    cb = [time_cb, loss_cb]
    ckpt_save_dir = set_save_ckpt_dir()
    if config.save_checkpoint:
        ckpt_append_info = [{"epoch_num": config.has_trained_epoch, "step_num": config.has_trained_step}]
        config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * step_size,
                                     keep_checkpoint_max=config.keep_checkpoint_max)
                                     keep_checkpoint_max=config.keep_checkpoint_max,
                                     append_info=ckpt_append_info)
        ckpt_cb = ModelCheckpoint(prefix="resnet", directory=ckpt_save_dir, config=config_ck)
        cb += [ckpt_cb]
    run_eval(target, model, ckpt_save_dir, cb)
......@@ -288,11 +372,13 @@ def train_net():
if config.net_name == "se-resnet50":
config.epoch_size = config.train_epoch_size
dataset_sink_mode = (not config.parameter_server) and target != "CPU"
config.pretrain_epoch_size = config.has_trained_epoch
model.train(config.epoch_size - config.pretrain_epoch_size, dataset, callbacks=cb,
sink_size=dataset.get_dataset_size(), dataset_sink_mode=dataset_sink_mode)
if config.run_eval and config.enable_cache:
print("Remember to shut down the cache server via \"cache_admin --stop\"")
if __name__ == '__main__':
    train_net()
......@@ -16,6 +16,7 @@
network config setting, gradient clip function and dynamic learning rate function
"""
import argparse
import ast
import os
import time
import numpy as np
......@@ -385,6 +386,44 @@ def add_training_params(opt):
help="Column name of datasets")
def add_retrain_params(opt):
    """
    Add parameters about retrain.
    """
    opt.add_argument("--pre_trained",
                     type=str,
                     default=None,
                     help="Pretrained checkpoint path.")
    opt.add_argument("--save_checkpoint_path",
                     type=str,
                     default=None,
                     help="Path for saving checkpoints.")
    opt.add_argument("--keep_checkpoint_max",
                     type=int,
                     default=1,
                     help="Maximum number of checkpoints to keep.")
    opt.add_argument("--save_checkpoint_steps",
                     type=int,
                     default=2000,
                     help="Save a checkpoint every N steps.")
    opt.add_argument("--save_checkpoint",
                     type=ast.literal_eval,
                     default=False,
                     help="Whether to save checkpoints to local disk.")
    opt.add_argument("--ckpt_name_prefix",
                     type=str,
                     default="pangu",
                     help="Prefix for saved checkpoint names.")
    opt.add_argument("--has_trained_epoches",
                     type=int,
                     default=0,
                     help="Number of epochs already trained before resuming.")
    opt.add_argument("--has_trained_steps",
                     type=int,
                     default=0,
                     help="Number of steps already trained before resuming.")
def get_args(inference=False):
"""train function for PanguAlpha"""
parser = argparse.ArgumentParser(description="PanguAlpha training")
......@@ -469,6 +508,7 @@ def get_args(inference=False):
default=10,
help="The eval step in train and eval mode. Default 10.")
add_training_params(parser)
add_retrain_params(parser)
if inference:
add_inference_params(parser)
args_opt = parser.parse_args()
......
......@@ -16,6 +16,8 @@
PanguAlpha train script
"""
import datetime
import glob
import os
import math
from mindspore import context
......@@ -30,6 +32,8 @@ from mindspore.parallel import set_algo_parameters
from mindspore.parallel._cost_model_context import _set_multi_subgraphs
from mindspore.nn.wrap.cell_wrapper import PipelineCell, _VirtualDatasetCell
from mindspore.parallel.nn import TransformerOpParallelConfig, CrossEntropyLoss
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig
from src.adam import AdamWeightDecayOp
from src.dataset import create_dataset
from src.pangu_alpha import PanGUAlphaWithLoss, PanguAlphaModel
......@@ -64,10 +68,30 @@ def set_weight_decay(params):
return group_params
def run_train(args_opt):
def add_checkpoint_callback_policy(args_param, callback, rank_id):
    r"""
    The main training process.
    Add checkpoint policy to callback.
    """
    if args_param.save_checkpoint:
        # checkpoint store epoch_num and step_num info
        ckpt_append_info = [{"epoch_num": args_param.has_trained_epoches, "step_num": args_param.has_trained_steps}]
        ckpt_config = CheckpointConfig(save_checkpoint_steps=args_param.save_checkpoint_steps,
                                       keep_checkpoint_max=args_param.keep_checkpoint_max,
                                       integrated_save=False,
                                       append_info=ckpt_append_info
                                       )
        ckpoint_cb = ModelCheckpoint(prefix=args_param.ckpt_name_prefix + str(rank_id),
                                     directory=args_param.save_checkpoint_path,
                                     config=ckpt_config)
        callback.append(ckpoint_cb)
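# The values recorded via append_info above come back as ordinary checkpoint parameters on
# reload; a hypothetical round trip (the file name is illustrative only):
#     param_dict = mindspore.load_checkpoint("pangu0_1-2_1000.ckpt")
#     trained_epochs = int(param_dict["epoch_num"].data.asnumpy())
#     trained_steps = int(param_dict["step_num"].data.asnumpy())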
def run_train(args_opt):
    r"""The main training process."""
    os.environ['HCCL_CONNECT_TIMEOUT'] = "6000"
    # Set execution mode
    context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target)
    context.set_context(variable_memory_max_size="31GB")
......@@ -81,7 +105,7 @@ def run_train(args_opt):
context.set_auto_parallel_context(
parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, gradients_mean=False,
full_batch=bool(args_opt.full_batch), strategy_ckpt_load_file=args_opt.strategy_load_ckpt_path,
enable_parallel_optimizer=bool(args_opt.optimizer_shard))
enable_parallel_optimizer=bool(args_opt.optimizer_shard), strategy_ckpt_save_file='strategy.ckpt')
set_algo_parameters(elementwise_op_strategy_follow=True)
_set_multi_subgraphs()
else:
......@@ -125,10 +149,8 @@ def run_train(args_opt):
    pangu_alpha_with_loss = _VirtualDatasetCell(pangu_alpha_with_loss_net)
    print("=====args_opt is: ", args_opt, flush=True)
    # Warm-up and cosine decay learning rate
    lr = LearningRate(learning_rate=args_opt.start_lr,
                      end_learning_rate=args_opt.end_lr,
                      warmup_steps=args_opt.warmup_step,
                      decay_steps=200000)
    lr = LearningRate(learning_rate=args_opt.start_lr, end_learning_rate=args_opt.end_lr,
                      warmup_steps=args_opt.warmup_step, decay_steps=200000)
    params = pangu_alpha_with_loss.trainable_params()
    group_params = set_weight_decay(params)
......@@ -163,6 +185,12 @@ def run_train(args_opt):
        callback.append(EvalCallBack(model, ds_eval, ppl_metric))
    else:
        model = Model(pangu_alpha_with_grads)
    if args_opt.pre_trained:
        load_checkpoint(args_opt, args_opt.sink_size, ds, model, device_num)
    add_checkpoint_callback_policy(args_opt, callback, rank)
    if args_opt.incremental_training:
        from mindspore.train.serialization import load_distributed_checkpoint
        strategy = model.infer_train_layout(train_dataset=ds, sink_size=args_opt.sink_size)
......@@ -176,10 +204,57 @@ def run_train(args_opt):
    model.train(actual_epoch_num, ds, callbacks=callback, sink_size=args_opt.sink_size, dataset_sink_mode=True)
def run_train_pipeline(args_opt):
def load_checkpoint(args_param, sink_size, dataset, model, device_num):
    r"""
    The main training process in pipeline.
    Load checkpoint process.
    """
    from mindspore.train.serialization import load_distributed_checkpoint
    strategy = model.infer_train_layout(train_dataset=dataset, sink_size=sink_size)
    print("======start load_distributed checkpoint", flush=True)
    # For 2.6B and 13B models, the number of ckpt files is 512.
    ckpt_name = args_param.ckpt_name_prefix
    if os.path.isdir(args_param.pre_trained):
        ckpt_pattern = os.path.join(args_param.save_checkpoint_path,
                                    f"{ckpt_name}*.ckpt")
        ckpt_files = glob.glob(ckpt_pattern)
        if not ckpt_files:
            print(f"There is no ckpt file in {args_param.load_ckpt_path}, "
                  f"pre_trained is unsupported.")
        else:
            ckpt_files.sort(key=os.path.getmtime, reverse=True)
            time_stamp = datetime.datetime.now()
            print(f"time stamp {time_stamp.strftime('%Y.%m.%d-%H:%M:%S')} pre trained ckpt model {ckpt_files} loading",
                  flush=True)
            ckpt_file = os.path.basename(ckpt_files[0])
            ckpt_file_length = ckpt_file.split("_")
            if len(ckpt_file_length) == 3:
                depulicate_num = ckpt_file.split("-")[0].split("_")[-1]
                step_size = ckpt_file.split("-")[-1].split("_")[0]
                sink_size = ckpt_file.split("-")[-1].split("_")[-1].split(".")[0]
                ckpt_file_list = [os.path.join(args_param.save_checkpoint_path,
                                               f"{ckpt_name}{ckpt_rank}_{depulicate_num}-{step_size}_{sink_size}.ckpt")
                                  for ckpt_rank in range(device_num)]
                # Load checkpoint files
                load_distributed_checkpoint(model.train_network, ckpt_file_list, strategy)
            elif len(ckpt_file_length) == 2:
                step_size = ckpt_file.split("-")[-1].split("_")[0]
                sink_size = ckpt_file.split("-")[-1].split("_")[-1].split(".")[0]
                ckpt_file_list = [os.path.join(args_param.save_checkpoint_path,
                                               f"{ckpt_name}{ckpt_rank}-{step_size}_{sink_size}.ckpt")
                                  for ckpt_rank in range(device_num)]
                # Load checkpoint files
                load_distributed_checkpoint(model.train_network, ckpt_file_list, strategy)
            else:
                print(f"Please check {args_param.pre_trained} value.")
    else:
        print(f"Please check {args_param.pre_trained} value.")
def run_train_pipeline(args_opt):
    r"""The main training process in pipeline."""
    # Set hccl connect time
    os.environ['HCCL_CONNECT_TIMEOUT'] = "6000"
    context.set_context(save_graphs=False, mode=context.GRAPH_MODE, device_target=args_opt.device_target)
    context.set_context(variable_memory_max_size="31GB")
    if args_opt.distribute == "true":
......