Unverified Commit 15b2586e authored by i-robot, committed by Gitee

!1052 add dpn_test on GPU

Merge pull request !1052 from blingbling/dpn_pr
parents 6e4a6532 bed1bf46
......@@ -118,9 +118,12 @@ The structure of the files in this repository is shown below.
├─ scripts
├─ docker_start.sh
├─ eval.sh // launch ascend standalone evaluation
├─ eval_gpu.sh // launch gpu standalone evaluation
├─ run_infer_310.sh // 310 infer
├─ train_distributed.sh // launch ascend distributed training
└─ train_standalone.sh // launch ascend standalone training
├─ train_distributed_gpu.sh // launch gpu distributed training
├─ train_standalone.sh // launch ascend standalone training
└─ train_standalone_gpu.sh // launch gpu standalone training
├─ src
├─ callbacks.py // user-defined callbacks
├─ crossentropy.py // user-defined loss functions
......@@ -150,7 +153,7 @@ Parameters for both training and evaluation and export can be set in `default_co
- Configurations for DPN92 with ImageNet-1K dataset
```default_config.yaml
```text
# model config
config.image_size = (224,224) # input image size
config.num_classes = 1000 # dataset class number
......@@ -163,7 +166,7 @@ config.rank = 0 # local rank of distributed
config.group_size = 1 # group size of distributed
# training config
config.batch_size = 32 # batch_size
config.batch_size = 32 # batch_size=32 on Ascend; batch_size=16 on GPU
config.global_step = 0 # start step of learning rate
config.epoch_size = 180 # epoch_size
config.loss_scale_num = 1024 # loss scale
......@@ -200,6 +203,15 @@ bash scripts/train_standalone.sh [device_id] [train_data_dir] [ckpt_path_to_save
# example: bash scripts/train_standalone.sh 0 /home/DataSet/ImageNet_Original/train/ ./ckpt 0
```
#### Running on GPU
Run `scripts/train_standalone_gpu.sh` to train the model in standalone mode. The usage of the script is:
```shell
bash scripts/train_standalone_gpu.sh [device_id] [train_data_dir] [ckpt_path_to_save] [eval_each_epoch] [pretrained_ckpt(optional)]
# example: bash scripts/train_standalone_gpu.sh 0 /home/DataSet/ImageNet_Original/train/ ./ckpt 0
```
If eval_each_epoch is 1, the script evaluates after each epoch and keeps the checkpoint with the highest accuracy; each epoch then takes longer.
If eval_each_epoch is 0, it skips evaluation during training and saves checkpoints periodically instead.
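For example, a minimal sketch of the two modes using the documented paths (with eval_each_epoch=1 the validation set location, `eval_data_dir`, presumably also has to be set in `default_config.yaml`, since the standalone script does not pass it):
```shell
# evaluate after every epoch and keep the best checkpoint (each epoch takes longer)
bash scripts/train_standalone_gpu.sh 0 /home/DataSet/ImageNet_Original/train/ ./ckpt 1
# no in-training evaluation; checkpoints are saved periodically instead
bash scripts/train_standalone_gpu.sh 0 /home/DataSet/ImageNet_Original/train/ ./ckpt 0
```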
......@@ -237,7 +249,7 @@ The model checkpoint will be saved into `[ckpt_path_to_save]`.
Run `scripts/train_distributed.sh` to train the model in distributed mode. The usage of the script is:
```text
```shell
bash scripts/train_distributed.sh [rank_table] [train_data_dir] [ckpt_path_to_save] [rank_size] [eval_each_epoch] [pretrained_ckpt(optional)]
# example: bash scripts/train_distributed.sh ~/hccl_8p.json /home/DataSet/ImageNet_Original/train/ ./ckpt/ 8 0
```
......@@ -252,6 +264,15 @@ Epoch time: 1350398.913 ms, per step time: 369.864 ms
...
```
#### Running on GPU
Run `scripts/train_distributed_gpu.sh` to train the model in distributed mode. Note: set the batch size to 16 for GPU training (one way to do this is shown below the usage example). The usage of the script is:
```shell
bash scripts/train_distributed_gpu.sh [DATA_DIR] [SAVE_PATH] [RANK_SIZE] [EVAL_EACH_EPOCH] [PRETRAINED_CKPT_PATH]
# example: bash scripts/train_distributed_gpu.sh /home/DataSet/ImageNet_Original/train/ ./ckpt/ 8 0
```
The model checkpoint will be saved into `[SAVE_PATH]`.
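The GPU runs assume `batch_size: 16`, while `default_config.yaml` ships with 32 (the Ascend setting). One way to switch it before a GPU run, as a sketch to be run from the repository root:
```shell
# switch the global batch size from the Ascend default (32) to the GPU setting (16)
sed -i 's/^batch_size: 32/batch_size: 16/' default_config.yaml
```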
## [Evaluation Process](#contents)
......@@ -265,6 +286,15 @@ bash scripts/eval.sh [device_id] [eval_data_dir] [checkpoint_path]
# example bash scripts/eval.sh 0 /home/DataSet/ImageNet_Original/validation_preprocess/ /home/model/dpn/ckpt/dpn-100_40036.ckpt
```
### [Running on GPU](#contents)
Run `scripts/eval_gpu.sh` to evaluate the model on GPU. Note: set the batch size to 16 for GPU evaluation, as in GPU training. The usage of the script is:
```shell
bash scripts/eval_gpu.sh [device_id] [eval_data_dir] [checkpoint_path]
# example bash scripts/eval_gpu.sh 0 /home/DataSet/ImageNet_Original/validation_preprocess/ /home/model/dpn/ckpt/dpn-100_40036.ckpt
```
The above shell script runs evaluation in the background. You can view the results in the file `eval_log.txt`; they will look as follows:
```text
......@@ -390,31 +420,31 @@ All results are validated at image size of 224x224. The dataset preprocessing an
#### DPN92 (Training)
| Parameters | Ascend |
| ----------------- | --------------------------- |
| Model Version | DPN92 (Train) |
| Resource | Ascend 910; OS Euler2.8 |
| Uploaded Date | 12/20/2020 (month/day/year) |
| MindSpore Version | 1.1.0 |
| Dataset | ImageNet-1K |
| epochs | 180 |
| outputs | probability |
| train performance | Top1:78.91%; Top5:94.53% |
| Parameters | Ascend |GPU |
| ----------------- | --------------------------- | --------------------------- |
| Model Version | DPN92 (Train) | DPN92 (Train) |
| Resource | Ascend 910; OS Euler2.8 | NVIDIA V100-SXM3 32G |
| Uploaded Date | 12/20/2020 (month/day/year) | 03/10/2022 (month/day/year) |
| MindSpore Version | 1.1.0 | 1.6.0 |
| Dataset | ImageNet-1K | ImageNet2012 |
| epochs | 180 | 180 |
| outputs | probability | probability |
| train performance | Top1:78.91%; Top5:94.53% | Top1:78.84%; Top5:94.41% |
### [Efficiency](#contents)
#### DPN92
| Parameters | Ascend |
| ----------------- | --------------------------------- |
| Model Version | DPN92 |
| Resource | Ascend 910; OS Euler2.8 |
| Uploaded Date | 12/20/2020 (month/day/year) |
| MindSpore Version | 1.1.0 |
| Dataset | ImageNet-1K |
| batch_size | 32 |
| outputs | probability |
| speed | 1pc:233 ms/step;8pc:240 ms/step |
| Parameters | Ascend | GPU|
| ----------------- | --------------------------------- | --------------------------------- |
| Model Version | DPN92 | DPN92 |
| Resource | Ascend 910; OS Euler2.8 | NVIDIA V100-SXM3 32G |
| Uploaded Date | 12/20/2020 (month/day/year) | 03/10/2022 (month/day/year) |
| MindSpore Version | 1.1.0 | 1.6.0 |
| Dataset | ImageNet-1K | ImageNet2012 |
| batch_size | 32 | 16 |
| outputs | probability | probability |
| speed | 1pc:233 ms/step;8pc:240 ms/step | 8pc: 305 ms/step |
# [Description of Random Situation](#contents)
......
......@@ -15,7 +15,8 @@ enable_profiling: False
# common options
is_distributed: 0
image_size: [224, 224]
batch_size: 32
batch_size: 32
# batch_size=16 on GPU
num_parallel_workers: 4
rank: 0
group_size: 1
......@@ -54,7 +55,7 @@ keep_checkpoint_max: 3
# ======================================================================================
# Eval options
eval_data_dir: ""
checkpoint_path: ""
checkpoint_path: "./ckpt/"
# export options
......
......@@ -135,7 +135,7 @@ def _parse_args():
help='warmup epochs')
# export configs
parser.add_argument('--export_dir', type=str, default='export_dir',
parser.add_argument('--export_dir', type=str, default='',
help='directory to save exported model, frozen model if not None')
parser.add_argument('--width', type=int, default=224,
help='export width')
......@@ -332,7 +332,8 @@ def main():
dpn_train(config_args, ma_config)
print('DPN training success!')
# export
dpn_export(config_args, ma_config)
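# export the trained model only when an export directory is set (export_dir now defaults to an empty string)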
if config_args.export_dir:
dpn_export(config_args, ma_config)
# data sync
mox.file.copy_parallel(config_args.output_path, config_args.train_url)
......
#!/bin/bash
# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
export DEVICE_ID=$1
DATA_DIR=$2
PATH_CHECKPOINT=$3
BASEPATH=$(cd "`dirname $0`" || exit; pwd)
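# run GPU evaluation in the background; output is redirected to eval_log.txt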
python $BASEPATH/../eval.py \
--device_target=GPU \
--checkpoint_path=$PATH_CHECKPOINT \
--eval_data_dir=$DATA_DIR > eval_log.txt 2>&1 &
#!/bin/bash
# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
#Usage: bash train_distributed_gpu.sh [DATA_DIR] [SAVE_PATH] [RANK_SIZE] [EVAL_EACH_EPOCH] [PRETRAINED_CKPT_PATH](optional)
if [[ $# -lt 4 || $# -gt 5 ]]; then
echo "bash train_distributed_gpu.sh [DATA_DIR] [SAVE_PATH] [RANK_SIZE] [EVAL_EACH_EPOCH] [PRETRAINED_CKPT_PATH](optional)"
exit 1
fi
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
DATA_DIR=$1
export RANK_SIZE=$3
SAVE_PATH=$2
EVAL_EACH_EPOCH=$4
PATH_CHECKPOINT=""
if [ $# == 5 ]; then
PATH_CHECKPOINT=$5
fi
if [ -d "distribute_train_gpu" ]; then
rm -rf ./distribute_train_gpu
fi
BASEPATH=$(cd "`dirname $0`" || exit; pwd)
mkdir ./distribute_train_gpu
cp -r $BASEPATH/../src ./distribute_train_gpu
cp $BASEPATH/../*.yaml ./distribute_train_gpu
cp $BASEPATH/../*.py ./distribute_train_gpu
cd ./distribute_train_gpu || exit
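# launch distributed training with mpirun across RANK_SIZE GPUs; the 5-argument form starts from a pretrained checkpoint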
if [ $# == 4 ]; then
mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
nohup python train.py \
--is_distributed=1 \
--device_target=GPU \
--ckpt_path=$SAVE_PATH \
--eval_each_epoch=$EVAL_EACH_EPOCH \
--train_data_dir=$DATA_DIR \
--eval_data_dir= > log.txt 2>&1 &
fi
if [ $# == 5 ]; then
mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
nohup python train.py \
--is_distributed=1 \
--device_target=GPU \
--ckpt_path=$SAVE_PATH \
--eval_each_epoch=$EVAL_EACH_EPOCH \
--pretrained=$PATH_CHECKPOINT \
--train_data_dir=$DATA_DIR > log.txt 2>&1 &
fi
cd ../
......@@ -23,7 +23,7 @@ if [ $# == 5 ]
then
PATH_CHECKPOINT=$5
fi
cd ../
if [ $# == 4 ]
then
python train.py \
......
#!/bin/bash
# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
#Usage: bash train_standalone_gpu.sh [DEVICE_ID] [DATA_DIR] [SAVE_CKPT_PATH] [EVAL_EACH_EPOCH] [PATH_CHECKPOINT](optional)
export DEVICE_ID=$1
DATA_DIR=$2
SAVE_CKPT_PATH=$3
EVAL_EACH_EPOCH=$4
BASE_PATH=$(cd "`dirname $0`" || exit; pwd)
if [ $# == 5 ]
then
PATH_CHECKPOINT=$5
fi
cd $BASE_PATH/../
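# run standalone GPU training in the background; the optional fifth argument loads a pretrained checkpoint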
if [ $# == 4 ]
then
python train.py \
--is_distributed=0 \
--device_target=GPU \
--ckpt_path=$SAVE_CKPT_PATH \
--eval_each_epoch=$EVAL_EACH_EPOCH \
--train_data_dir=$DATA_DIR > train_log.txt 2>&1 &
echo " python train.py \
--is_distributed=0 \
--device_target=GPU \
--ckpt_path=$SAVE_CKPT_PATH \
--eval_each_epoch=$EVAL_EACH_EPOCH \
--train_data_dir=$DATA_DIR > train_log.txt 2>&1 &"
fi
if [ $# == 5 ]
then
python train.py \
--is_distributed=0 \
--device_target=GPU \
--ckpt_path=$SAVE_CKPT_PATH \
--pretrained=$PATH_CHECKPOINT \
--train_data_dir=$DATA_DIR \
--eval_each_epoch=$EVAL_EACH_EPOCH > train_log.txt 2>&1 &
fi
\ No newline at end of file
......@@ -13,6 +13,7 @@
# limitations under the License.
# ============================================================================
from collections import OrderedDict
import mindspore as ms
import mindspore.nn as nn
import mindspore.ops.operations as F
......@@ -54,9 +55,14 @@ class BottleBlock(nn.Cell):
self.bn1 = nn.BatchNorm2d(in_chs, eps=1e-3, momentum=0.9)
self.conv1 = nn.Conv2d(in_chs, num_1x1_a, 1, stride=1)
self.bn2 = nn.BatchNorm2d(num_1x1_a, eps=1e-3, momentum=0.9)
self.conv2 = nn.CellList()
for _ in range(G):
self.conv2.append(nn.Conv2d(num_1x1_a // G, num_3x3_b // G, 3, key_stride, pad_mode='pad', padding=1))
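# Ascend: emulate the grouped 3x3 convolution with a CellList of per-group Conv2d plus split/concat; other targets (GPU): a single Conv2d with group=G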
if ms.context.get_context('device_target') == "Ascend":
self.split_conv_concat = True
self.conv2 = nn.CellList()
for _ in range(G):
self.conv2.append(nn.Conv2d(num_1x1_a // G, num_3x3_b // G, 3, key_stride, pad_mode='pad', padding=1))
else:
self.split_conv_concat = False
self.conv2 = nn.Conv2d(num_1x1_a, num_3x3_b, 3, key_stride, pad_mode='pad', padding=1, group=G)
self.bn3 = nn.BatchNorm2d(num_3x3_b, eps=1e-3, momentum=0.9)
self.conv3_r = nn.Conv2d(num_3x3_b, num_1x1_c, 1, stride=1)
self.conv3_d = nn.Conv2d(num_3x3_b, inc, 1, stride=1)
......@@ -71,11 +77,14 @@ class BottleBlock(nn.Cell):
x = self.conv1(x)
x = self.bn2(x)
x = self.relu(x)
group_x = ()
input_x = self.split(x)
for i in range(self.G):
group_x = group_x + (self.conv2[i](input_x[i]),)
x = self.concat(group_x)
if self.split_conv_concat:
group_x = ()
input_x = self.split(x)
for i in range(self.G):
group_x = group_x + (self.conv2[i](input_x[i]),)
x = self.concat(group_x)
else:
x = self.conv2(x)
x = self.bn3(x)
x = self.relu(x)
return (self.conv3_r(x), self.conv3_d(x))
......
......@@ -77,10 +77,7 @@ def classification_dataset(data_dir, image_size, per_batch_size, max_epoch, rank
>>> input_mode="txt", root=images_dir)
"""
cv2.setNumThreads(0)
if mode == 'eval':
drop_remainder = False
else:
drop_remainder = True
drop_remainder = True
mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
std = [255 * 0.229, 255 * 0.224, 255 * 0.225]
......
......@@ -20,10 +20,9 @@ from mindspore import Tensor
from mindspore.nn import SGD
from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
from mindspore.train.model import Model
from mindspore.context import ParallelMode
from mindspore.train.callback import LossMonitor, ModelCheckpoint, CheckpointConfig, TimeMonitor
from mindspore.train.loss_scale_manager import FixedLossScaleManager
from mindspore.communication.management import init, get_group_size
from mindspore.communication.management import init, get_group_size, get_rank
from mindspore.common import set_seed
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from src.imagenet_dataset import classification_dataset
......@@ -33,8 +32,6 @@ from src.crossentropy import CrossEntropy
from src.callbacks import SaveCallback
from src.model_utils.config import config
from src.model_utils.moxing_adapter import moxing_wrapper
from src.model_utils.device_adapter import get_device_id, get_rank_id, get_device_num
set_seed(1)
......@@ -42,22 +39,18 @@ set_seed(1)
def modelarts_pre_process():
pass
@moxing_wrapper(pre_process=modelarts_pre_process)
def dpn_train():
# init context
device_id = get_device_id()
context.set_context(mode=context.GRAPH_MODE,
device_target=config.device_target, save_graphs=False, device_id=device_id)
# init distributed
context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False)
if config.is_distributed:
init()
config.rank = get_rank_id()
config.group_size = get_group_size()
config.device_num = get_device_num()
context.set_auto_parallel_context(device_num=config.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
gradients_mean=True)
context.set_auto_parallel_context(parallel_mode=context.ParallelMode.DATA_PARALLEL,
gradients_mean=True, device_num=config.group_size)
config.rank = get_rank()
else:
config.rank = 0
# select for master rank save ckpt or all rank save, compatible for model parallel
config.rank_save_ckpt_flag = 0
if config.is_save_on_master:
......@@ -65,36 +58,27 @@ def dpn_train():
config.rank_save_ckpt_flag = 1
else:
config.rank_save_ckpt_flag = 1
# create dataset
train_dataset = classification_dataset(config.train_data_dir,
image_size=config.image_size,
per_batch_size=config.batch_size,
max_epoch=1,
per_batch_size=config.batch_size, max_epoch=1,
num_parallel_workers=config.num_parallel_workers,
shuffle=True,
rank=config.rank,
shuffle=True, rank=config.rank,
group_size=config.group_size)
if config.eval_each_epoch:
print("create eval_dataset")
eval_dataset = classification_dataset(config.eval_data_dir,
image_size=config.image_size,
per_batch_size=config.batch_size,
max_epoch=1,
per_batch_size=config.batch_size, max_epoch=1,
num_parallel_workers=config.num_parallel_workers,
shuffle=False,
rank=config.rank,
shuffle=False, rank=config.rank,
group_size=config.group_size,
mode='eval')
train_step_size = train_dataset.get_dataset_size()
# choose net
net = dpns[config.backbone](num_classes=config.num_classes)
# load checkpoint
if os.path.isfile(config.pretrained):
print("load ckpt")
load_param_into_net(net, load_checkpoint(config.pretrained))
# learning rate schedule
if config.lr_schedule == 'drop':
print("lr_schedule:drop")
lr = Tensor(get_lr_drop(global_step=config.global_step,
......@@ -110,7 +94,6 @@ def dpn_train():
lr_init=config.lr_init,
lr_max=config.lr_max,
warmup_epochs=config.warmup_epochs))
# optimizer
config.weight_decay = literal_eval(config.weight_decay)
opt = SGD(net.trainable_params(),
......@@ -129,14 +112,20 @@ def dpn_train():
config.label_smooth_factor = 0.0
print("Use Label_smooth CrossEntropy")
loss = CrossEntropy(smooth_factor=config.label_smooth_factor, num_classes=config.num_classes)
# create model
model = Model(net, amp_level="O2",
keep_batchnorm_fp32=False,
loss_fn=loss,
optimizer=opt,
loss_scale_manager=loss_scale,
metrics={'top_1_accuracy', 'top_5_accuracy'})
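# Ascend: O2 mixed precision (batchnorm also in FP16); GPU: enable graph kernel fusion and train without amp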
if config.device_target == "Ascend":
model = Model(net, amp_level="O2",
keep_batchnorm_fp32=False,
loss_fn=loss,
optimizer=opt,
loss_scale_manager=loss_scale,
metrics={'top_1_accuracy', 'top_5_accuracy'})
elif config.device_target == "GPU":
context.set_context(enable_graph_kernel=True)
model = Model(net,
loss_fn=loss,
optimizer=opt,
loss_scale_manager=loss_scale,
metrics={'top_1_accuracy', 'top_5_accuracy'})
# loss/time monitor & ckpt save callback
loss_cb = LossMonitor()
time_cb = TimeMonitor(data_size=train_step_size)
......@@ -145,15 +134,13 @@ def dpn_train():
if config.eval_each_epoch:
save_cb = SaveCallback(model, eval_dataset, config.ckpt_path)
cb += [save_cb]
else:
config_ck = CheckpointConfig(save_checkpoint_steps=train_step_size,
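# without per-epoch evaluation, only rank 0 saves checkpoints, one every 5 epochs (train_step_size * 5 steps)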
elif config.rank == 0:
config_ck = CheckpointConfig(save_checkpoint_steps=train_step_size * 5,
keep_checkpoint_max=config.keep_checkpoint_max)
ckpoint_cb = ModelCheckpoint(prefix="dpn", directory=config.ckpt_path, config=config_ck)
cb.append(ckpoint_cb)
# train model
model.train(config.epoch_size, train_dataset, callbacks=cb)
if __name__ == '__main__':
dpn_train()
print('DPN training success!')