From 3941f8230c723c8dfd23dc7c7b48f898d80f948a Mon Sep 17 00:00:00 2001
From: Vladimir Nechaev <steadymight@gmail.com>
Date: Mon, 24 Jan 2022 17:52:21 +0530
Subject: [PATCH] Implemented training and evaluation of Retinanet-Resnet152 on
 a GPU

---
 research/cv/retinanet_resnet152/README.md     | 507 ++++++++++++++++++
 research/cv/retinanet_resnet152/README_CN.md  | 169 +++---
 .../retinanet_resnet152/default_config.yaml   |  12 +-
 research/cv/retinanet_resnet152/eval.py       |   3 +
 research/cv/retinanet_resnet152/export.py     |   2 +
 .../cv/retinanet_resnet152/postprocess.py     |   4 +
 .../cv/retinanet_resnet152/requirements.txt   |   1 +
 .../scripts/run_distribute_train.sh           |   3 +
 .../scripts/run_distribute_train_gpu.sh       |  77 +++
 .../retinanet_resnet152/scripts/run_eval.sh   |   2 +
 .../scripts/run_eval_gpu.sh                   |  80 +++
 .../scripts/run_single_train.sh               |  36 +-
 .../scripts/run_single_train_gpu.sh           |  76 +++
 .../cv/retinanet_resnet152/src/backbone.py    |   3 +-
 .../cv/retinanet_resnet152/src/bottleneck.py  |   1 +
 .../cv/retinanet_resnet152/src/box_utils.py   |   2 +-
 .../cv/retinanet_resnet152/src/dataset.py     |  24 +-
 .../src/model_utils/config.py                 |   2 +-
 research/cv/retinanet_resnet152/train.py      |  17 +-
 19 files changed, 919 insertions(+), 102 deletions(-)
 create mode 100644 research/cv/retinanet_resnet152/README.md
 create mode 100644 research/cv/retinanet_resnet152/requirements.txt
 create mode 100644 research/cv/retinanet_resnet152/scripts/run_distribute_train_gpu.sh
 create mode 100644 research/cv/retinanet_resnet152/scripts/run_eval_gpu.sh
 create mode 100644 research/cv/retinanet_resnet152/scripts/run_single_train_gpu.sh

diff --git a/research/cv/retinanet_resnet152/README.md b/research/cv/retinanet_resnet152/README.md
new file mode 100644
index 000000000..23e04a27d
--- /dev/null
+++ b/research/cv/retinanet_resnet152/README.md
@@ -0,0 +1,507 @@
+# Contents
+
+<!-- TOC -->
+
+[View Chinese](./README_CN.md)
+
+- [Retinanet Description](#retinanet-description)
+- [Model Architecture](#model-architecture)
+- [Dataset](#dataset)
+- [Environment Requirements](#environment-requirements)
+- [Script Description](#script-description)
+    - [Script and Sample Code](#script-and-sample-code)
+    - [Script Parameters](#script-parameters)
+    - [Training Process](#training-process)
+        - [Usage](#usage1)
+        - [Run](#run1)
+        - [Result](#result1)
+    - [Evaluation Process](#evaluation-process)
+        - [Usage](#usage2)
+        - [Run](#run2)
+        - [Result](#result2)
+    - [Model Export](#model-export)
+        - [Usage](#usage3)
+        - [Run](#run3)
+    - [Inference Process](#inference-process)
+        - [Usage](#usage4)
+        - [Run](#run4)
+        - [Result](#result4)
+    - [Model Description](#model-description)
+        - [Performance](#performance)
+            - [Training Performance](#training-performance)
+            - [Evaluation Performance](#evaluation-performance)
+- [Description of Random State](#description-of-random-state)
+- [ModelZoo Homepage](#modelzoo-homepage)
+
+<!-- /TOC -->
+
+## [Retinanet Description](#content)
+
+RetinaNet was proposed in the 2017 paper "Focal Loss for Dense Object Detection" by Facebook AI Research. Its main contribution is the focal loss, which solves the problem of extreme class imbalance in dense detection; with it, RetinaNet, a one-stage detector, surpasses the accuracy of the classic two-stage Faster R-CNN.
+
+[Paper](https://arxiv.org/pdf/1708.02002.pdf)
+Lin T Y , Goyal P , Girshick R , et al. Focal Loss for Dense Object Detection[C]// 2017 IEEE International Conference on Computer Vision (ICCV). IEEE, 2017:2999-3007.
+
+## [Model Architecture](#content)
+
+The overall network architecture of Retinanet is [here](https://arxiv.org/pdf/1708.02002.pdf)
+
+## [Dataset](#content)
+
+Dataset used (as in the paper): [COCO2017](https://cocodataset.org/#download)
+
+- Dataset size: 19.3 GB, 123287 color images in 80 classes
+    - [train](http://images.cocodataset.org/zips/train2017.zip): 19.3G, 118287 images
+    - [val](http://images.cocodataset.org/zips/val2017.zip): 814.3M, 5000 images
+- Data format: RGB
+
+> Note: The data will be processed by src/dataset.py.
+
+## [Environment Requirements](#content)
+
+- Hardware (Ascend/GPU)
+    - Prepare hardware environment with Ascend or GPU.
+- Framework
+    - [MindSpore](https://www.mindspore.cn/install)
+- For more information about MindSpore, please check the resources below:
+    - [MindSpore Tutorials](https://www.mindspore.cn/tutorials/zh-CN/master/index.html)
+    - [MindSpore Python API](https://www.mindspore.cn/docs/api/zh-CN/master/index.html)
+
+- Training on ModelArts (if you want to run on ModelArts, refer to the [ModelArts documentation](https://support.huaweicloud.com/modelarts/))
+
+    ```python
+    # Train with 8 cards on ModelArts
+    # (1) choose a or b
+    #       a. in default_config.yaml set "enable_modelarts=True"
+    #          in default_config.yaml set "distribute=True"
+    #          in default_config.yaml set "coco_root='/cache/data'"
+    #          in default_config.yaml set "epoch_size=500"
+    #          (optional)in default_config.yaml set "checkpoint_path='s3://dir_to_your_pretrained/'"
+    #          in default_config.yaml set other parameters
+    #       b. Set on the web page  "enable_modelarts=True"
+    #          Set on the web page "distribute=True"
+    #          Set on the web page "coco_root=/cache/data"
+    #          Set on the web page "epoch_size=500"
+    #          (optional)Set on the web page "checkpoint_path='s3://dir_to_your_pretrained/'"
+    #          Set on the web page other parameters
+    # (2) Prepare model code
+    # (3) If you choose to fine-tune your model, upload your pretrained model to the S3 bucket
+    # (4) choose a or b (recommended choice a)
+    #       a. First, compress the dataset into a ".zip" file.
+    #          Second, upload your compressed dataset to the S3 bucket (you can also upload the uncompressed dataset, but that may be slow.)
+    #       b. Upload the original dataset to the S3 bucket.
+    #          (Dataset conversion occurs during the training process, which takes more time. It will be re-transformed every time you train.)
+    # (5) Set your code path on the web page to "/path/retinanet"
+    # (6) Set the startup file on the web page as "train.py"
+    # (7) Set "training dataset", "training output file path", "job log path", etc. on the webpage
+    # (8) Create a training job
+    #
+    # Train with 1 card on ModelArts
+    # (1) choose a or b
+    #       a. in default_config.yaml set "enable_modelarts=True"
+    #          in default_config.yaml set "coco_root='/cache/data'"
+    #          in default_config.yaml set "epoch_size=500"
+    #          (optional)in default_config.yaml set "checkpoint_path='s3://dir_to_your_pretrained/'"
+    #          in default_config.yaml set other parameters
+    #       b. Set on the web page "enable_modelarts=True"
+    #          Set on the web page "coco_root='/cache/data'"
+    #          Set on the web page "epoch_size=500"
+    #          (optional)Set on the web page "checkpoint_path='s3://dir_to_your_pretrained/'"
+    #          Set on the web page other parameters
+    # (2) Prepare model code
+    # (3) If you choose to fine-tune your model, upload your pretrained model to the S3 bucket
+    # (4) choose a or b (recommended choice a)
+    #       a. First, compress the dataset into a ".zip" file.
+    #          Second, upload your compressed dataset to the S3 bucket (you can also upload the uncompressed dataset, but that may be slow.)
+    #       b. Upload the original dataset to the S3 bucket.
+    #          (Data set conversion occurs during the training process, which takes more time. It will be re-transformed every time you train.)
+    # (5) Set your code path on the web page to "/path/retinanet"
+    # (6) Set the startup file on the web page as "train.py"
+    # (7) Set "training dataset", "training output file path", "job log path", etc. on the webpage
+    # (8) Create a training job
+    #
+    # Eval on ModelArts
+    # (1) choose a or b
+    #       a. in default_config.yaml set "enable_modelarts=True"
+    #          in default_config.yaml set "checkpoint_path='s3://dir_to_your_trained_model/'"
+    #          in default_config.yaml set "mindrecord_dir='./MindRecord_COCO'"
+    #          in default_config.yaml set "coco_root='/cache/data'"
+    #          in default_config.yaml set other parameters
+    #       b. Set on the web page "enable_modelarts=True"
+    #          Set on the web page "checkpoint_path='s3://dir_to_your_trained_model/'"
+    #          Set on the web page "mindrecord_dir='./MindRecord_COCO'"
+    #          Set on the web page "coco_root='/cache/data'"
+    #          Set on the web page other parameters
+    # (2) Prepare model code
+    # (3) Upload your pretrained model to the S3 bucket
+    # (4) choose a or b (recommended choice a)
+    #       a. First, compress the dataset into a ".zip" file.
+    #          Second, upload your compressed dataset to the S3 bucket (you can also upload the uncompressed dataset, but that may be slow.)
+    #       b. Upload the original dataset to the S3 bucket.
+    #          (Data set conversion occurs during the training process, which takes more time. It will be re-transformed every time you train.)
+    # (5) Set your code path on the web page to "/path/retinanet"
+    # (6) Set the startup file on the web page as "eval.py"
+    # (7) Set "training dataset", "training output file path", "job log path", etc. on the webpage
+    # (8) Create a training job
+    ```
+
+- Export on ModelArts (if you want to run on ModelArts, refer to the [ModelArts documentation](https://support.huaweicloud.com/modelarts/))
+
+    ```python
+    # (1) choose a or b
+    #       a. in default_config.yaml set "enable_modelarts=True"
+    #          in default_config.yaml set "file_name='retinanet'"
+    #          in default_config.yaml set "file_format='MINDIR'"
+    #          in base_config.yaml set "checkpoint_path='/The path of checkpoint in S3/'"
+    #          in base_config.yaml set other parameters
+    #       b. Set on the web page "enable_modelarts=True"
+    #          Set on the web page "file_name='retinanet'"
+    #          Set on the web page "file_format='MINDIR'"
+    #          Set on the web page "checkpoint_path='/The path of checkpoint in S3/'"
+    #          Set on the web page other parameters
+    # (2) Upload your pretrained model to the S3 bucket
+    # (3) Set your code path on the web page to "/path/retinanet"
+    # (4) Set the startup file on the web page as "export.py"
+    # (5) Set "training dataset", "training output file path", "job log path", etc. on the webpage
+    # (6) Create a training job
+    ```
+
+## [Script Description](#content)
+
+### [Script and Sample Code](#content)
+
+```text
+.
+└─Retinanet_resnet152
+  ├─README.md
+  ├─README_CN.md
+  ├─ascend310_infer
+  ├─scripts
+    ├─run_single_train.sh                     # training on Ascend single card
+    ├─run_single_train_gpu.sh                 # training on GPU single card
+    ├─run_distribute_train.sh                 # training on Ascend, multiple cards
+    ├─run_distribute_train_gpu.sh             # training on GPU, multiple cards
+    ├─run_eval.sh                             # inference on Ascend
+    ├─run_eval_gpu.sh                         # inference on GPU
+    └─run_infer_310.sh
+  ├─src
+    ├─backbone.py
+    ├─bottleneck.py
+    ├─dataset.py
+    ├─retinahead.py
+    ├─init_params.py
+    ├─lr_schedule.py
+    ├─coco_eval
+    ├─box_utils.py
+    └─model_utils
+      ├─config.py
+      ├─device_adapter.py
+      ├─local_adapter.py
+      └─moxing_adapter.py
+  ├─default_config.yaml
+  ├─export.py
+  ├─postprocess.py
+  ├─requirements.txt
+  ├─train.py
+  └─eval.py
+```
+
+### [Script Parameters](#content)
+
+```text
+Main parameters used in train.py and default_config.yaml:
+"img_shape": [640, 640],                                                                        # image size
+"num_retinanet_boxes": 76725,                                                                   # The total number of a priori boxes set
+"match_thershold": 0.5,
+"softnms_sigma": 0.5,
+"nms_thershold": 0.6,
+"min_score": 0.1,
+"max_boxes": 100,                                                                               # Maximum number of detection frames
+"global_step": 0,
+"lr_init": 1e-6,
+"lr_end_rate": 5e-3,                                                                            # The ratio of the final learning rate to the maximum learning rate
+"warmup_epochs1": 2,                                                                            # Number of cycles of the 1st stage warmup
+"warmup_epochs2": 5,                                                                            # Number of cycles of the 2nd stage warmup
+"warmup_epochs3": 23,                                                                           # Number of cycles of the 3d stage warmup
+"warmup_epochs4": 60,                                                                           # Number of cycles of the 4th stage warmup
+"warmup_epochs5": 160,                                                                          # Number of cycles of the 5th stage warmup
+"momentum": 0.9,                                                                                # momentum
+"weight_decay": 1.5e-4,
+"num_default": [9, 9, 9, 9, 9],                                                                 # The number of a priori boxes in a single grid
+"extras_out_channels": [256, 256, 256, 256, 256],                                               # Feature layer output channels
+"feature_size": [75, 38, 19, 10, 5],
+"aspect_ratios": [(0.5,1.0,2.0), (0.5,1.0,2.0), (0.5,1.0,2.0), (0.5,1.0,2.0), (0.5,1.0,2.0)],   # Priori box size change ratio
+"steps": ( 8, 16, 32, 64, 128),                                                                 # Priori box setting step size
+"anchor_size":(32, 64, 128, 256, 512),                                                          # A priori box size
+"prior_scaling": (0.1, 0.2),                                                                    # Used to adjust the ratio of regression and regression in loss
+"gamma": 2.0,                                                                                   # focal loss parameter
+"alpha": 0.75,                                                                                  # focal loss parameter
+"mindrecord_dir": "/opr/root/data/MindRecord_COCO",
+"coco_root": "/opr/root/data/",
+"train_data_type": "train2017",
+"val_data_type": "val2017",
+"instances_set": "annotations_trainval2017/annotations/instances_{}.json",
+"coco_classes": ('background', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
+                 'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
+                 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog',
+                 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
+                 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
+                 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
+                 'kite', 'baseball bat', 'baseball glove', 'skateboard',
+                 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
+                 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
+                 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
+                 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed',
+                 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
+                 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
+                 'refrigerator', 'book', 'clock', 'vase', 'scissors',
+                 'teddy bear', 'hair drier', 'toothbrush'),
+"num_classes": 81,
+"voc_root": "",
+"voc_dir": "",
+"image_dir": "",
+"anno_path": "",
+"save_checkpoint": True,
+"save_checkpoint_epochs": 1,
+"keep_checkpoint_max":1,
+"save_checkpoint_path": "./model",                                                              # Path to save checkpoints
+"finish_epoch":0,                                                                               # number of epoch that have been run
+"checkpoint_path":"/opr/root/reretina/retinanet2/LOG0/model/retinanet-400_458.ckpt"             # checkpoint path for evaluation
+```
+
+### [Training process](#content)
+
+#### Usage
+
+You can use python or shell scripts for training. The usage of the shell script is as follows:
+
+- Ascend:
+
+```bash
+# The dataset path and the path for storing the MindRecord files are set in default_config.yaml
+
+# Before training, convert the dataset to MindRecord format:
+python train.py --only_create_dataset=True --run_platform="Ascend"
+
+# Eight-card parallel training example:
+# create RANK_TABLE_FILE
+bash run_distribute_train.sh DEVICE_NUM EPOCH_SIZE LR DATASET RANK_TABLE_FILE PRE_TRAINED(optional) PRE_TRAINED_EPOCH_SIZE(optional)
+
+# Single card training example:
+bash run_single_train.sh DEVICE_ID EPOCH_SIZE LR DATASET PRE_TRAINED(optional) PRE_TRAINED_EPOCH_SIZE(optional)
+```
+
+> Note: for reference material on RANK_TABLE_FILE, see this [link](https://www.mindspore.cn/docs/programming_guide/zh-CN/master/distributed_training_ascend.html);
+> for details on how to get device_ip, check this [link](https://gitee.com/mindspore/models/tree/master/utils/hccl_tools).
+
+- GPU:
+
+```bash
+# The dataset path and the path for storing the MindRecord files are set in default_config.yaml
+
+# Before training, convert the dataset to MindRecord format:
+python train.py --only_create_dataset=True --run_platform="GPU"
+
+# Eight-card parallel training example:
+bash run_distribute_train_gpu.sh [DEVICE_NUM] [EPOCH_SIZE] [LR] [DATASET] [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)
+
+# Single card training example:
+bash run_single_train_gpu.sh [DEVICE_ID] [EPOCH_SIZE] [LR] [DATASET] [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)
+```
+
+#### Run
+
+- Ascend:
+
+```bash
+# Ascend 8-card parallel training example (run in the retinanet directory):
+bash scripts/run_distribute_train.sh 8 500 0.1 coco scripts/rank_table_8pcs.json /dataset/retinanet-322_458.ckpt 322
+
+# Ascend single card training example (run in the retinanet directory):
+bash scripts/run_single_train.sh 0 500 0.1 coco /dataset/retinanet-322_458.ckpt 322
+```
+
+- GPU:
+
+```bash
+# GPU 8-card parallel training example (run in the scripts directory):
+bash run_distribute_train_gpu.sh 8 500 0.1 coco /dataset/retinanet-322_458.ckpt 322
+
+# GPU single card training example (run in the scripts directory):
+bash run_single_train_gpu.sh 0 500 0.1 coco /dataset/retinanet-322_458.ckpt 322
+```
+
+#### Result
+
+Paths are set in `default_config.yaml`. Checkpoints will be saved in `./model`, and the training log will be recorded to `LOG/train_log.txt`; an example of the training log is as follows:
+
+```text
+epoch: 117 step: 916, loss is 0.8391866
+lr:[0.025187]
+epoch time: 444315.944 ms, per step time: 485.061 ms
+epoch: 118 step: 916, loss is 1.070719
+lr:[0.025749]
+epoch time: 444288.450 ms, per step time: 485.031 ms
+epoch: 119 step: 916, loss is 1.1607553
+lr:[0.026312]
+epoch time: 444339.538 ms, per step time: 485.087 ms
+epoch: 120 step: 916, loss is 1.174742
+lr:[0.026874]
+epoch time: 444237.851 ms, per step time: 484.976 ms
+```
+
+### [Evaluation process](#content)
+
+#### Usage
+
+You can use python or shell scripts for evaluation. The usage of the shell script is as follows:
+
+- Ascend
+
+```bash
+bash scripts/run_eval.sh [DATASET] [DEVICE_ID]
+```
+
+- GPU
+
+```bash
+bash run_eval_gpu.sh [DATASET] [DEVICE_ID] [CHECKPOINT_PATH]
+```
+
+#### Run
+
+- Ascend
+
+```bash
+bash scripts/run_eval.sh coco 0
+```
+
+- GPU
+
+```bash
+bash run_eval_gpu.sh coco 0 LOG/model/retinanet-500_610.ckpt
+```
+
+> Checkpoints are generated during training.
+
+#### Result
+
+Evaluation results are stored in the example path; you can view them in `eval.log`.
+
+```text
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.357
+ Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.503
+ Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.400
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.141
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.381
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.498
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.307
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.447
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.458
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.172
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.487
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.643
+
+========================================
+
+mAP: 0.3571988469737286
+```
+
+### [Model Export](#content)
+
+#### Usage
+
+Before exporting the model, set the `checkpoint_path` item in the `default_config.yaml` file to the path of the checkpoint.
+
+```shell
+python export.py --file_name [FILE_NAME] --file_format [EXPORT_FORMAT] --checkpoint_path [CHECKPOINT_PATH]
+```
+
+`EXPORT_FORMAT`: choose from ["AIR", "MINDIR"].
+
+#### Run
+
+```bash
+python export.py
+```
+
+### [Inference Process](#content)
+
+#### Usage
+
+Before inference, the model must be exported in the Ascend 910 environment. Images with iscrowd set to true must be excluded; the ids of the remaining images are saved in the ascend310_infer directory.
+You also need to set the coco_root, val_data_type, and instances_set configuration items in the `default_config.yaml` file: they are, respectively, the directory of the COCO dataset, the name of the dataset split used for inference, and the annotation file used to compute accuracy after inference finishes. instances_set is spliced with val_data_type, so make sure the resulting file path is correct and the file exists.
+
+```shell
+# Ascend310 inference
+bash run_infer_310.sh [MINDIR_PATH] [DATA_PATH] [ANN_FILE] [DEVICE_ID]
+```
+
+#### Run
+
+```shell
+bash run_infer_310.sh [MINDIR_PATH] [DATA_PATH] [ANN_FILE] [DEVICE_ID]
+```
+
+#### Result
+
+The inference result is saved in acc.log in the current directory; you will find results similar to the following there:
+
+```text
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.356
+ Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.499
+ Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.396
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.145
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.380
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.506
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.308
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.446
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.457
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.179
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.483
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.647
+mAP: 0.35625723922139957
+```
+
+## [Model Description](#content)
+
+### [Performance](#content)
+
+#### Training Performance
+
+| Parameters                 | Ascend (8 pcs)                                                                         | GPU Tesla V100 (1 pcs)                                                                 | GPU Tesla V100 (8 pcs)                                                                 |
+| -------------------------- |----------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------|
+| Model                      | Retinanet-resnet-152                                                                   | Retinanet-resnet-152                                                                   | Retinanet-resnet-152                                                                   |
+| Environment                | Huawei cloud Modelarts                                                                 | Ubuntu 18.04.6, Tesla V100, CPU 2.7 GHz, 56 cores, RAM 504 GB                          | Ubuntu 18.04.6, Tesla V100 (8 pcs), CPU 2.7 GHz, 56 cores, RAM 504 GB                  |
+| Uploaded Date              | 10/03/2021                                                                             | 27/12/2021                                                                             | 27/12/2021                                                                             |
+| MindSpore version          | 1.0.1                                                                                  | 1.6.0                                                                                  | 1.6.0                                                                                  |
+| Dataset                    | 118287 images                                                                          | 118287 images                                                                          | 118287 images                                                                          |
+| Training Parameters        | batch_size=16                                                                          | batch_size=12, epochs=200, lr=0.1, steps_per_epoch=9772                                | batch_size=12*8, epochs=180, lr=0.1, steps_per_epoch=1221                                |
+| Other training parameters  | default_config.yaml                                                                    | default_config.yaml                                                                    | default_config.yaml                                                                    |
+| Optimizer                  | Momentum                                                                               | Momentum                                                                               | Momentum                                                                               |
+| Loss function              | Focal loss                                                                             | Focal loss                                                                             | Focal loss                                                                             |
+| Final loss                 | 0.69                                                                                   | 0.84                                                                                   | 0.84                                                                                   |
+| Speed                      |                                                                                        | 860 ms/step                                                                            | 1205 ms/step                                                                           |
+| Total training time        | 41h 32m 20s                                                                            | 440h                                                                                   | 72h                                                                                    |
+| Script                     | [Link](https://gitee.com/mindspore/models/tree/master/research/cv/retinanet_resnet152) | [Link](https://gitee.com/mindspore/models/tree/master/research/cv/retinanet_resnet152) | [Link](https://gitee.com/mindspore/models/tree/master/research/cv/retinanet_resnet152) |
+
+#### Evaluation Performance
+
+| Parameters        | Ascend                      | GPU                                                           |
+|-------------------| --------------------------- |---------------------------------------------------------------|
+| Model             | Retinanet-resnet-152        | Retinanet-resnet-152                                          |
+| Environment       | Huawei cloud Modelarts      | Ubuntu 18.04.6, Tesla V100, CPU 2.7 GHz, 56 cores, RAM 504 GB |
+| Uploaded Date     | 10/03/2021                  | 27/12/2021                                                    |
+| MindSpore version | 1.0.1                       | 1.6.0                                                         |
+| Dataset           | 5k images                   | 5k images                                                     |
+| Batch_size        | 1                           | 1                                                             |
+| Accuracy          | mAP[0.3571]                 |                                                               |
+| Total time        | 12 min and 03 seconds       | 12 min                                                        |
+
+## [Description of Random State](#content)
+
+The random seed is set in the `create_dataset` function of the `dataset.py` script; it is also set in `train.py`.
+
+## [ModelZoo Homepage](#content)
+
+Please check the official [homepage](https://gitee.com/mindspore/models).
diff --git a/research/cv/retinanet_resnet152/README_CN.md b/research/cv/retinanet_resnet152/README_CN.md
index 14fd64a37..1dda1c52d 100644
--- a/research/cv/retinanet_resnet152/README_CN.md
+++ b/research/cv/retinanet_resnet152/README_CN.md
@@ -2,6 +2,8 @@
 
 <!-- TOC -->
 
+[View English](./README.md)
+
 - [Retinanet 描述](#retinanet描述)
 - [模型架构](#模型架构)
 - [数据集](#数据集)
@@ -48,19 +50,14 @@ Retinanet的整体网络架构如下所示:
 
 ## [数据集](#content)
 
-数据集可参考文献.
-
-MSCOCO2017
+数据集可参考文献. [COCO2017](https://cocodataset.org/#download)
 
 - 数据集大小: 19.3G, 123287张80类彩色图像
-
     - 训练:19.3G, 118287张图片
-
     - 测试:814.3M, 5000张图片
+- 数据格式:RGB图像
 
-- 数据格式:RGB图像.
-
-    - 注意:数据将在src/dataset.py 中被处理
+>注意:数据将在src/dataset.py 中被处理
 
 ## [环境要求](#content)
 
@@ -175,7 +172,7 @@ MSCOCO2017
 
 ### [脚本和示例代码](#content)
 
-```shell
+```text
 .
 └─Retinanet_resnet152
   ├─README.md
@@ -201,12 +198,11 @@ MSCOCO2017
   ├─default_config.yaml                       # 参数配置
   ├─train.py                                  # 网络训练脚本
   └─eval.py                                   # 网络推理脚本
-
 ```
 
 ### [脚本参数](#content)
 
-```python
+```text
 在train.py和default_config.yaml脚本中使用到的主要参数是:
 "img_shape": [640, 640],                                                                        # 图像尺寸
 "num_retinanet_boxes": 76725,                                                                   # 设置的先验框总数
@@ -275,52 +271,65 @@ MSCOCO2017
 
 - Ascend:
 
-```训练
-# 八卡并行训练示例:
+```bash
+# data和存储mindrecord文件的路径在default_config.yaml里设置
 
-创建 RANK_TABLE_FILE
+# 训练以前, 请运行:
+python train.py --only_create_dataset=True --run_platform="Ascend"
+
+# 八卡并行训练示例:
+# 创建 RANK_TABLE_FILE
 bash run_distribute_train.sh DEVICE_NUM EPOCH_SIZE LR DATASET RANK_TABLE_FILE PRE_TRAINED(optional) PRE_TRAINED_EPOCH_SIZE(optional)
 
 # 单卡训练示例:
-
 bash run_distribute_train.sh DEVICE_ID EPOCH_SIZE LR DATASET PRE_TRAINED(optional) PRE_TRAINED_EPOCH_SIZE(optional)
-
 ```
 
-> 注意:
+> 注意: RANK_TABLE_FILE相关参考资料见[链接](https://www.mindspore.cn/docs/programming_guide/zh-CN/master/distributed_training_ascend.html),
+> 获取device_ip方法详见[链接](https://gitee.com/mindspore/models/tree/master/utils/hccl_tools).
 
-  RANK_TABLE_FILE相关参考资料见[链接](https://www.mindspore.cn/docs/programming_guide/zh-CN/master/distributed_training_ascend.html), 获取device_ip方法详见[链接](https://gitee.com/mindspore/models/tree/master/utils/hccl_tools).
+- GPU:
 
-#### 运行
+```bash
+# data和存储mindrecord文件的路径在default_config.yaml里设置
 
-``` 运行
-# 训练示例
+# 训练以前, 请运行:
+python train.py --only_create_dataset=True --run_platform="GPU"
 
-  python:
-    data和存储mindrecord文件的路径在config里设置
+# 八卡并行训练示例:
+bash run_distribute_train_gpu.sh [DEVICE_NUM] [EPOCH_SIZE] [LR] [DATASET] [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)
 
-      # 单卡训练示例:
+# 单卡训练示例:
+bash run_single_train_gpu.sh [DEVICE_ID] [EPOCH_SIZE] [LR] [DATASET] [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)
+```
 
-      python train.py
-  shell:
-      Ascend:
+#### 运行
+
+- Ascend
 
-      # 八卡并行训练示例(在retinanet目录下运行):
+```bash
+# 八卡并行训练示例(在retinanet目录下运行):
+bash scripts/run_distribute_train.sh 8 500 0.1 coco scripts/rank_table_8pcs.json /dataset/retinanet-322_458.ckpt 322
 
-      bash scripts/run_distribute_train.sh 8 500 0.1 coco RANK_TABLE_FILE(创建的RANK_TABLE_FILE的地址) PRE_TRAINED(预训练checkpoint地址) PRE_TRAINED_EPOCH_SIZE(预训练EPOCH大小)
-      例如:bash scripts/run_distribute_train.sh 8 500 0.1 coco scripts/rank_table_8pcs.json /dataset/retinanet-322_458.ckpt 322
+# 单卡训练示例(在retinanet目录下运行):
+bash scripts/run_single_train.sh 0 500 0.1 coco /dataset/retinanet-322_458.ckpt 322
+```
 
-      # 单卡训练示例(在retinanet目录下运行):
+- GPU
 
-      bash scripts/run_single_train.sh 0 500 0.1 coco /dataset/retinanet-322_458.ckpt 322
+```bash
+# 八卡并行训练示例(在scripts目录下运行):
+bash run_distribute_train_gpu.sh 8 500 0.1 coco /dataset/retinanet-322_458.ckpt 322
 
+# 单卡训练示例(在scripts目录下运行):
+bash run_single_train_gpu.sh 0 500 0.1 coco /dataset/retinanet-322_458.ckpt 322
 ```
 
 #### 结果
 
-训练结果将存储在示例路径中。checkpoint将存储在 `./model` 路径下,训练日志将被记录到 `./log.txt` 中,训练日志部分示例如下:
+路径在`default_config.yaml`里设置。checkpoint将存储在 `./model` 路径下,训练日志将被记录到 `./train.log` 中,训练日志部分示例如下:
 
-``` 训练日志
+```text
 epoch: 117 step: 916, loss is 0.8391866
 lr:[0.025187]
 epoch time: 444315.944 ms, per step time: 485.061 ms
@@ -337,33 +346,43 @@ epoch time: 444237.851 ms, per step time: 484.976 ms
 
 ### [评估过程](#content)
 
-#### 用 法
+#### 用法
 
 您可以使用python或shell脚本进行训练。shell脚本的用法如下:
 
-```eval
+- Ascend
+
+```bash
 bash scripts/run_eval.sh [DATASET] [DEVICE_ID]
 ```
 
-#### 运 行
+- GPU
+
+```bash
+bash run_eval_gpu.sh [DATASET] [DEVICE_ID] [CHECKPOINT_PATH]
+```
+
+#### 运行
+
+- Ascend
+
+```bash
+bash scripts/run_eval.sh coco 0
+```
 
-```eval运行
-# 验证示例
+- GPU
 
-  python:
-      Ascend: python eval.py
-  checkpoint 的路径在config里设置
-  shell:
-      Ascend: bash scripts/run_eval.sh coco 0
+```bash
+bash run_eval_gpu.sh coco 0 LOG/model/retinanet-500_610.ckpt
 ```
 
 > checkpoint 可以在训练过程中产生.
 
-#### 结 果
+#### 结果
 
 计算结果将存储在示例路径中,您可以在 `eval.log` 查看.
 
-``` mAP
+```text
  Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.357
  Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.503
  Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.400
@@ -386,7 +405,7 @@ mAP: 0.3571988469737286
 
 #### 用途
 
-导出模型前要修改default_config.yaml文件中的checkpoint_path配置项,值为checkpoint的路径。
+导出模型前要修改`default_config.yaml`文件中的checkpoint_path配置项,值为checkpoint的路径。
 
 ```shell
 python export.py --file_name [RUN_PLATFORM] --file_format[EXPORT_FORMAT] --checkpoint_path [CHECKPOINT PATH]
@@ -396,13 +415,13 @@ python export.py --file_name [RUN_PLATFORM] --file_format[EXPORT_FORMAT] --check
 
 #### 运行方式
 
-```运行
+```python
 python export.py
 ```
 
 ### [推理过程](#content)
 
-#### 用 途
+#### 用途
 
 在推理之前需要在昇腾910环境上完成模型的导出。推理时要将iscrowd为true的图片排除掉。在ascend310_infer目录下保存了去排除后的图片id。
 还需要修改default_config.yaml文件中的coco_root、val_data_type、instances_set配置项,值分别取coco数据集的目录,推理所用数据集的目录名称,推理完成后计算精度用的annotation文件,instances_set是用val_data_type拼接起来的,要保证文件正确并且存在。
@@ -414,7 +433,7 @@ bash run_infer_310.sh [MINDIR_PATH] [DATA_PATH] [ANN_FILE] [DEVICE_ID]
 
 #### 运行命令
 
-```运行
+```shell
 bash run_infer_310.sh  [MINDIR_PATH] [DATASET_PATH] [DEVICE_ID]
 ```
 
@@ -422,7 +441,7 @@ bash run_infer_310.sh  [MINDIR_PATH] [DATASET_PATH] [DEVICE_ID]
 
 推理的结果保存在当前目录下,在acc.log日志文件中可以找到类似以下的结果。
 
-```mAP
+```text
  Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.356
  Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.499
  Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.396
@@ -444,34 +463,34 @@ mAP: 0.35625723922139957
 
 #### 训练性能
 
-| 参数                        | Ascend                                |
-| -------------------------- | ------------------------------------- |
-| 模型名称                    | Retinanet                             |
-| 运行环境                    | 华为云 Modelarts                      |
-| 上传时间                    | 10/03/2021                           |
-| MindSpore 版本             | 1.0.1                                 |
-| 数据集                      | 123287 张图片                          |
-| Batch_size                 | 16                                   |
-| 训练参数                    | default_config.yaml                   |
-| 优化器                      | Momentum                              |
-| 损失函数                    | Focal loss                            |
-| 最终损失                    | 0.69                               |
-| 精确度 (8p)                 | mAP[0.3571]            |
-| 训练总时间 (8p)             | 41h32m20s                        |
+| 参数                        | Ascend (8pcs)        | GPU Tesla V100 (1 pcs)  | GPU Tesla V100 (8 pcs) |
+| -------------------------- | --------------------- | ---- | ---- |
+| 模型名称                    | Retinanet-resnet-152  | Retinanet-resnet-152 | Retinanet-resnet-152 |
+| 运行环境                    | 华为云 Modelarts       | Ubuntu 18.04.6, Tesla V100, CPU 2.7 GHz, 56 cores, RAM 504 GB | Ubuntu 18.04.6, Tesla V100 (8 pcs), CPU 2.7 GHz, 56 cores, RAM 504 GB |
+| 上传时间                    | 10/03/2021            | 27/12/2021 | 27/12/2021 |
+| MindSpore 版本             | 1.0.1                  | 1.6.0 | 1.6.0 |
+| 数据集                      | 123287 张图片          | 123287 张图片 | 123287 张图片 |
+| 训练参数                    | batch_size=16         | batch_size=12, epochs=180, lr=0.1, steps_per_epoch=1221 | batch_size=12*8, epochs=180, lr=0.1, steps_per_epoch=1221 |
+| 其他训练参数                 | default_config.yaml   | default_config.yaml | default_config.yaml |
+| 优化器                      | Momentum              | Momentum | Momentum |
+| 损失函数                    | Focal loss            | Focal loss | Focal loss |
+| 最终损失                    | 0.69                 | 0.84 | 0.84 |
+| 速度 (8p)                  |                      | 860 毫秒/步 | 1205 毫秒/步 |
+| 训练总时间 (8p)             | 41h32m20s             | 440小时 | 72小时 |
-| 脚本                       | [Retianet script](https://gitee.com/mindspore/models/tree/master/research/cv/retinanet_resnet152) |
+| 脚本                       | [Retinanet script](https://gitee.com/mindspore/models/tree/master/research/cv/retinanet_resnet152) | [Retinanet script](https://gitee.com/mindspore/models/tree/master/research/cv/retinanet_resnet152) | [Retinanet script](https://gitee.com/mindspore/models/tree/master/research/cv/retinanet_resnet152) |
 
 #### 推理性能
 
-| 参数                 | Ascend                      |
-| ------------------- | :-------------------------- |
-| 模型名称             | Retinanet                    |
-| 运行环境             | 华为云 Modelarts             |
-| 上传时间             | 10/03/2021                 |
-| MindSpore 版本      | 1.0.1                        |
-| 数据集              | 5k 张图片                   |
-| Batch_size          | 1                          |
-| 精确度              | mAP[0.3571]                  |
-| 总时间              | 12m3s       |
+| 参数                 | Ascend                      | GPU |
+| ------------------- | :-------------------------- | --- |
+| 模型名称             | Retinanet-resnet-152        | Retinanet-resnet-152 |
+| 运行环境             | 华为云 Modelarts             | Ubuntu 18.04.6, Tesla V100, CPU 2.7 GHz, 56 cores, RAM 504 GB |
+| 上传时间             | 10/03/2021                 | 27/12/2021 |
+| MindSpore 版本      | 1.0.1                        | 1.6.0 |
+| 数据集              | 5k 张图片                   | 5k 张图片 |
+| Batch_size          | 1                          | 1 |
+| 精确度              | mAP[0.3571]                |  |
+| 总时间              | 12m3s       | 12m |
 
 # [随机情况的描述](#内容)
 
diff --git a/research/cv/retinanet_resnet152/default_config.yaml b/research/cv/retinanet_resnet152/default_config.yaml
index ed323a565..fb6a952a6 100644
--- a/research/cv/retinanet_resnet152/default_config.yaml
+++ b/research/cv/retinanet_resnet152/default_config.yaml
@@ -49,8 +49,8 @@ gamma: 2.0
 alpha: 0.75
 
 # `mindrecord_dir` and `coco_root` are better to use absolute path.
-mindrecord_dir: "/cache/train/MindRecord_COCO"
-coco_root: "/cache/data"
+mindrecord_dir: "/data/Datasets/COCO2017/MindRecord_COCO"
+coco_root: "/data/Datasets/COCO2017"
 train_data_type: "train2017"
 val_data_type: "val2017"
 instances_set: "annotations/instances_{}.json"
@@ -78,7 +78,7 @@ voc_dir: ""
 image_dir: ""
 anno_path: ""
 save_checkpoint: True
-keep_checkpoint_max: 30
+keep_checkpoint_max: 50
 save_checkpoint_path: "./model"
 finish_epoch: 0
 checkpoint_path: "/cache/train/model/retinanet-10_916.ckpt"
@@ -91,7 +91,7 @@ device_num: 1
 lr: 0.1
 mode: "sink"
 dataset: "coco"
-epoch_size: 500
+epoch_size: 180
 batch_size: 16
 pre_trained: ''
 pre_trained_epoch_size: 0
@@ -126,7 +126,7 @@ pre_trained_epoch_size: 'Pretrained epoch size.'
 save_checkpoint_epochs: 'Save checkpoint epochs, default is 1.'
 loss_scale: 'Loss scale, default is 1024.'
 filter_weight: 'Filter weight parameters, default is False.'
-run_platform: 'Run platform, only support Ascend in training and export, only support Ascend and GPU in evaling.'
+run_platform: 'Run platform, only support Ascend and GPU.'
 file_format: 'file format'
 file_name: 'output file name.'
 result_path: 'result file path.'
@@ -139,5 +139,5 @@ train_url: 'Training output url for obs'
 data_path: 'Dataset path for local'
 
 ---
-run_platform: ['Ascend']
+run_platform: ['Ascend', 'GPU']
 file_format: ["AIR", "MINDIR"]
diff --git a/research/cv/retinanet_resnet152/eval.py b/research/cv/retinanet_resnet152/eval.py
index 5910c4e4b..ae27c0351 100644
--- a/research/cv/retinanet_resnet152/eval.py
+++ b/research/cv/retinanet_resnet152/eval.py
@@ -38,6 +38,7 @@ def retinanet_eval(dataset_path, ckpt_path):
     net = retinahead(backbone, config)
     net = retinanetInferWithDecoder(net, Tensor(default_boxes), config)
     print("Load Checkpoint!")
+    print("ckpt_path: ", ckpt_path)
     param_dict = load_checkpoint(ckpt_path)
     net.init_parameters_data()
     load_param_into_net(net, param_dict)
@@ -77,6 +78,7 @@ def modelarts_process():
         config.coco_root = os.path.join(config.coco_root, config.modelarts_dataset_unzip_name)
         print(os.listdir(os.path.join(config.data_path, config.modelarts_dataset_unzip_name)))
 
+
 @moxing_wrapper(pre_process=modelarts_process)
 def eval_retinanet_resnet152():
     """ eval_retinanet_resnet152 """
@@ -115,5 +117,6 @@ def eval_retinanet_resnet152():
     print("Start Eval!")
     retinanet_eval(mindrecord_file, config.checkpoint_path)
 
+
 if __name__ == '__main__':
     eval_retinanet_resnet152()
diff --git a/research/cv/retinanet_resnet152/export.py b/research/cv/retinanet_resnet152/export.py
index 032d62d07..8a3f66e86 100644
--- a/research/cv/retinanet_resnet152/export.py
+++ b/research/cv/retinanet_resnet152/export.py
@@ -28,6 +28,7 @@ from src.backbone import resnet152
 def modelarts_process():
     pass
 
+
 @moxing_wrapper(pre_process=modelarts_process)
 def export_retinanet_resnet152():
     """ export_retinanet_resnet152 """
@@ -44,5 +45,6 @@ def export_retinanet_resnet152():
     input_data = Tensor(np.zeros(shape), mstype.float32)
     export(net, input_data, file_name=config.file_name, file_format=config.file_format)
 
+
 if __name__ == '__main__':
     export_retinanet_resnet152()
diff --git a/research/cv/retinanet_resnet152/postprocess.py b/research/cv/retinanet_resnet152/postprocess.py
index f3994b894..1e24b3e82 100644
--- a/research/cv/retinanet_resnet152/postprocess.py
+++ b/research/cv/retinanet_resnet152/postprocess.py
@@ -30,10 +30,12 @@ def get_pred(result_path, img_id):
     scores = np.fromfile(scores_file, dtype=np.float32).reshape(76725, config.num_classes)
     return boxes, scores
 
+
 def get_img_size(file_name):
     img = Image.open(file_name)
     return img.size
 
+
 def get_img_id(img_id_file):
     f = open(img_id_file)
     lines = f.readlines()
@@ -44,6 +46,7 @@ def get_img_id(img_id_file):
 
     return ids
 
+
 def cal_acc(result_path, img_path, img_id_file):
     """Calculate acc"""
     ids = get_img_id(img_id_file)
@@ -66,5 +69,6 @@ def cal_acc(result_path, img_path, img_id_file):
     mAP = metrics(pred_data)
     print(f"mAP: {mAP}")
 
+
 if __name__ == '__main__':
     cal_acc(config.result_path, config.img_path, config.img_id_file)
diff --git a/research/cv/retinanet_resnet152/requirements.txt b/research/cv/retinanet_resnet152/requirements.txt
new file mode 100644
index 000000000..99f22cd12
--- /dev/null
+++ b/research/cv/retinanet_resnet152/requirements.txt
@@ -0,0 +1 @@
+pycocotools
diff --git a/research/cv/retinanet_resnet152/scripts/run_distribute_train.sh b/research/cv/retinanet_resnet152/scripts/run_distribute_train.sh
index 7329ad5e5..3629c3b02 100644
--- a/research/cv/retinanet_resnet152/scripts/run_distribute_train.sh
+++ b/research/cv/retinanet_resnet152/scripts/run_distribute_train.sh
@@ -56,9 +56,11 @@ do
     export RANK_ID=$i
     echo "start training for rank $i, device $DEVICE_ID"
     env > env.log
+
     if [ $# == 5 ]
     then
         python train.py  \
+        --run_platform="Ascend" \
         --distribute=True  \
         --lr=$LR \
         --dataset=$DATASET \
@@ -70,6 +72,7 @@ do
     if [ $# == 7 ]
     then
         python train.py  \
+        --run_platform="Ascend" \
         --distribute=True  \
         --lr=$LR \
         --dataset=$DATASET \
diff --git a/research/cv/retinanet_resnet152/scripts/run_distribute_train_gpu.sh b/research/cv/retinanet_resnet152/scripts/run_distribute_train_gpu.sh
new file mode 100644
index 000000000..c32b540ee
--- /dev/null
+++ b/research/cv/retinanet_resnet152/scripts/run_distribute_train_gpu.sh
@@ -0,0 +1,77 @@
+#!/bin/bash
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+echo "=============================================================================================================="
+echo "Please run the script as: "
+echo "sh run_distribute_train_gpu.sh DEVICE_NUM EPOCH_SIZE LR DATASET RANK_TABLE_FILE PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
+echo "for example: sh run_distribute_train_gpu.sh 8 500 0.1 coco /data/hccl.json /opt/retinanet-500_458.ckpt(optional) 200(optional)"
+echo "It is better to use absolute path."
+echo "================================================================================================================="
+
+if [ $# != 4 ] && [ $# != 6 ]
+then
+    echo "Usage: sh run_distribute_train_gpu.sh [DEVICE_NUM] [EPOCH_SIZE] [LR] [DATASET] \
+[PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
+    exit 1
+fi
+
+echo "After running the script, the network runs in the background. The log will be generated in LOG/train_log.txt"
+
+export DEVICE_NUM=$1
+EPOCH_SIZE=$2
+LR=$3
+DATASET=$4
+PRE_TRAINED=$5
+PRE_TRAINED_EPOCH_SIZE=$6
+
+rm -rf LOG
+mkdir ./LOG
+cp ../*.py ./LOG
+cp ../*.yaml ./LOG
+cp -r ../src ./LOG
+cd ./LOG || exit
+
+echo "start training on GPU $DEVICE_NUM devices"
+env > env.log
+
+if [ $# == 4 ]
+then
+  mpirun -n $DEVICE_NUM --output-filename log_output --merge-stderr-to-stdout \
+  python train.py  \
+  --run_platform="GPU" \
+  --batch_size=12 \
+  --distribute=True  \
+  --lr=$LR \
+  --dataset=$DATASET \
+  --device_num=$DEVICE_NUM  \
+  --epoch_size=$EPOCH_SIZE > train_log.txt 2>&1 &
+fi
+
+if [ $# == 6 ]
+then
+  mpirun -n $DEVICE_NUM --output-filename log_output --merge-stderr-to-stdout \
+  python train.py  \
+  --run_platform="GPU" \
+  --batch_size=12 \
+  --distribute=True  \
+  --lr=$LR \
+  --dataset=$DATASET \
+  --device_num=$DEVICE_NUM  \
+  --pre_trained=$PRE_TRAINED \
+  --pre_trained_epoch_size=$PRE_TRAINED_EPOCH_SIZE \
+  --epoch_size=$EPOCH_SIZE > train_log.txt 2>&1 &
+fi
+cd ..
\ No newline at end of file
diff --git a/research/cv/retinanet_resnet152/scripts/run_eval.sh b/research/cv/retinanet_resnet152/scripts/run_eval.sh
index 615828f91..2775ca224 100644
--- a/research/cv/retinanet_resnet152/scripts/run_eval.sh
+++ b/research/cv/retinanet_resnet152/scripts/run_eval.sh
@@ -44,7 +44,9 @@ cp -r ./src ./eval$2
 cd ./eval$2 || exit
 env > env.log
 echo "start inferring for device $DEVICE_ID"
+
 python eval.py \
+    --run_platform="Ascend" \
     --dataset=$DATASET \
     --device_id=$2 > log.txt 2>&1 &
 cd ..
\ No newline at end of file
diff --git a/research/cv/retinanet_resnet152/scripts/run_eval_gpu.sh b/research/cv/retinanet_resnet152/scripts/run_eval_gpu.sh
new file mode 100644
index 000000000..af0a2e1eb
--- /dev/null
+++ b/research/cv/retinanet_resnet152/scripts/run_eval_gpu.sh
@@ -0,0 +1,80 @@
+#!/bin/bash
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+if [ $# != 2 ] && [ $# != 3 ]
+then
+    echo "Usage: sh run_eval_gpu.sh [DATASET] [DEVICE_ID] [CHECKPOINT_PATH](optional)"
+exit 1
+fi
+
+DATASET=$1
+echo $DATASET
+
+export DEVICE_NUM=1
+export DEVICE_ID=$2
+export RANK_SIZE=$DEVICE_NUM
+export RANK_ID=0
+
+get_real_path() {
+  if [ "${1:0:1}" == "/" ]; then
+    echo "$1"
+  else
+    echo "$(realpath -m $PWD/$1)"
+  fi
+}
+
+if [ $# == 3 ]
+then
+    CHECKPOINT_PATH=$(get_real_path $3)
+    if [ ! -f $CHECKPOINT_PATH ]
+    then
+        echo "error: CHECKPOINT_PATH=$CHECKPOINT_PATH is not a file"
+    exit 1
+    fi
+fi
+
+if [ -d "eval$2" ];
+then
+    rm -rf ./eval$2
+fi
+
+mkdir ./eval$2
+cp ../*.py ./eval$2
+cp ../*.yaml ./eval$2
+cp -r ../src ./eval$2
+cd ./eval$2 || exit
+
+env > env.log
+echo "start inferring for device $DEVICE_ID"
+
+if [ $# == 2 ]
+then
+  python eval.py \
+      --run_platform="GPU" \
+      --dataset=$DATASET \
+      --device_id=$2 > eval_log.txt 2>&1 &
+fi
+
+if [ $# == 3 ]
+then
+  python eval.py \
+      --run_platform="GPU" \
+      --dataset=$DATASET \
+      --checkpoint_path=$CHECKPOINT_PATH \
+      --device_id=$2 > eval_log.txt 2>&1 &
+fi
+
+cd ..
\ No newline at end of file
diff --git a/research/cv/retinanet_resnet152/scripts/run_single_train.sh b/research/cv/retinanet_resnet152/scripts/run_single_train.sh
index 27891cedc..745fe7f0d 100644
--- a/research/cv/retinanet_resnet152/scripts/run_single_train.sh
+++ b/research/cv/retinanet_resnet152/scripts/run_single_train.sh
@@ -38,6 +38,8 @@ export DEVICE_ID=$1
 EPOCH_SIZE=$2
 LR=$3
 DATASET=$4
+PRE_TRAINED=$5
+PRE_TRAINED_EPOCH_SIZE=$6
 
 rm -rf LOG$1
 mkdir ./LOG$1
@@ -45,13 +47,33 @@ cp ./*.py ./LOG$1
 cp ./*.yaml ./LOG$1
 cp -r ./src ./LOG$1
 cd ./LOG$1 || exit
+
 echo "start training for device $1"
 env > env.log
-python train.py  \
---distribute=False  \
---lr=$LR \
---dataset=$DATASET \
---device_num=1  \
---device_id=$DEVICE_ID  \
---epoch_size=$EPOCH_SIZE > log.txt 2>&1 &
+
+if [ $# == 4 ]
+then
+      python train.py  \
+      --run_platform="Ascend" \
+      --distribute=False  \
+      --lr=$LR \
+      --dataset=$DATASET \
+      --device_num=1  \
+      --device_id=$DEVICE_ID  \
+      --epoch_size=$EPOCH_SIZE > log.txt 2>&1 &
+fi
+
+if [ $# == 6 ]
+then
+      python train.py  \
+      --run_platform="Ascend" \
+      --distribute=False  \
+      --lr=$LR \
+      --dataset=$DATASET \
+      --device_num=1  \
+      --device_id=$DEVICE_ID  \
+      --pre_trained=$PRE_TRAINED \
+      --pre_trained_epoch_size=$PRE_TRAINED_EPOCH_SIZE \
+      --epoch_size=$EPOCH_SIZE > train_log.txt 2>&1 &
+fi
 cd ../
diff --git a/research/cv/retinanet_resnet152/scripts/run_single_train_gpu.sh b/research/cv/retinanet_resnet152/scripts/run_single_train_gpu.sh
new file mode 100644
index 000000000..bb9651759
--- /dev/null
+++ b/research/cv/retinanet_resnet152/scripts/run_single_train_gpu.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+echo "=============================================================================================================="
+echo "Please run the script as: "
+echo "sh run_single_train_gpu.sh DEVICE_ID EPOCH_SIZE LR DATASET PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
+echo "for example: sh run_single_train_gpu.sh 0 500 0.1 coco /opt/retinanet-500_458.ckpt(optional) 200(optional)"
+echo "It is better to use absolute path."
+echo "================================================================================================================="
+
+if [ $# != 4 ] && [ $# != 6 ]
+then
+    echo "Usage: sh run_single_train_gpu.sh [DEVICE_ID] [EPOCH_SIZE] [LR] [DATASET] \
+[PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
+    exit 1
+fi
+
+echo "After running the script, the network runs in the background. The log will be generated in LOGx/train_log.txt"
+
+export DEVICE_ID=$1
+EPOCH_SIZE=$2
+LR=$3
+DATASET=$4
+PRE_TRAINED=$5
+PRE_TRAINED_EPOCH_SIZE=$6
+
+rm -rf LOG$1
+mkdir ./LOG$1
+cp ../*.py ./LOG$1
+cp ../*.yaml ./LOG$1
+cp -r ../src ./LOG$1
+cd ./LOG$1 || exit
+
+echo "start training for device $1"
+env > env.log
+
+if [ $# == 4 ]
+then
+      python train.py  \
+      --run_platform="GPU" \
+      --batch_size=12 \
+      --distribute=False  \
+      --lr=$LR \
+      --dataset=$DATASET \
+      --device_num=1  \
+      --device_id=$DEVICE_ID  \
+      --epoch_size=$EPOCH_SIZE > train_log.txt 2>&1 &
+fi
+
+if [ $# == 6 ]
+then
+      python train.py  \
+      --run_platform="GPU" \
+      --batch_size=12 \
+      --distribute=False  \
+      --lr=$LR \
+      --dataset=$DATASET \
+      --device_num=1  \
+      --device_id=$DEVICE_ID  \
+      --pre_trained=$PRE_TRAINED \
+      --pre_trained_epoch_size=$PRE_TRAINED_EPOCH_SIZE \
+      --epoch_size=$EPOCH_SIZE > train_log.txt 2>&1 &
+fi
+cd ../
diff --git a/research/cv/retinanet_resnet152/src/backbone.py b/research/cv/retinanet_resnet152/src/backbone.py
index 020a48378..1c30f362f 100644
--- a/research/cv/retinanet_resnet152/src/backbone.py
+++ b/research/cv/retinanet_resnet152/src/backbone.py
@@ -93,7 +93,7 @@ class ResidualBlock(nn.Cell):
             self.down_sample_layer = nn.Conv2dBnAct(in_channel, out_channel,
                                                     kernel_size=1, stride=stride,
                                                     pad_mode='same', padding=0, has_bn=True, activation='relu')
-        self.add = P.TensorAdd()
+        self.add = P.Add()
         self.relu = P.ReLU()
 
     def construct(self, x):
@@ -111,6 +111,7 @@ class ResidualBlock(nn.Cell):
 
         return out
 
+
 class resnet(nn.Cell):
     """
     ResNet architecture.
diff --git a/research/cv/retinanet_resnet152/src/bottleneck.py b/research/cv/retinanet_resnet152/src/bottleneck.py
index f4c46c96e..8dd365c1d 100644
--- a/research/cv/retinanet_resnet152/src/bottleneck.py
+++ b/research/cv/retinanet_resnet152/src/bottleneck.py
@@ -18,6 +18,7 @@
 import mindspore.nn as nn
 from mindspore.ops import operations as P
 
+
 class FPN(nn.Cell):
     """FPN"""
     def __init__(self, config, backbone, is_training=True):
diff --git a/research/cv/retinanet_resnet152/src/box_utils.py b/research/cv/retinanet_resnet152/src/box_utils.py
index 0d7780e77..30258dd7c 100644
--- a/research/cv/retinanet_resnet152/src/box_utils.py
+++ b/research/cv/retinanet_resnet152/src/box_utils.py
@@ -74,7 +74,7 @@ def retinanet_bboxes_encode(boxes):
     Labels anchors with ground truth inputs.
 
     Args:
-        boxex: ground truth with shape [N, 5], for each row, it stores [y, x, h, w, cls].
+        boxes: ground truth with shape [N, 5], for each row, it stores [y, x, h, w, cls].
 
     Returns:
         gt_loc: location ground truth with shape [num_anchors, 4].
diff --git a/research/cv/retinanet_resnet152/src/dataset.py b/research/cv/retinanet_resnet152/src/dataset.py
index eac1a6451..f1dad5f44 100644
--- a/research/cv/retinanet_resnet152/src/dataset.py
+++ b/research/cv/retinanet_resnet152/src/dataset.py
@@ -215,10 +215,13 @@ def create_voc_label(is_training):
             if not is_training:
                 o_width = abs(x_max - x_min)
                 o_height = abs(y_max - y_min)
-                ann = {'area': o_width * o_height, 'iscrowd': 0, 'image_id': \
-                    img_id, 'bbox': [x_min, y_min, o_width, o_height], \
-                       'category_id': cls_map[cls_name], 'id': bnd_id, \
-                       'ignore': 0, \
+                ann = {'area': o_width * o_height,
+                       'iscrowd': 0,
+                       'image_id': img_id,
+                       'bbox': [x_min, y_min, o_width, o_height],
+                       'category_id': cls_map[cls_name],
+                       'id': bnd_id,
+                       'ignore': 0,
                        'segmentation': []}
                 json_dict['annotations'].append(ann)
                 bnd_id = bnd_id + 1
@@ -390,8 +393,8 @@ def data_to_mindrecord_byte_image(dataset="coco", is_training=True, prefix="reti
 
 
 def create_retinanet_dataset(mindrecord_file, batch_size, repeat_num, device_num=1, rank=0,
-                             is_training=True, num_parallel_workers=64):
-    """Creatr retinanet dataset with MindDataset."""
+                             is_training=True, num_parallel_workers=8):
+    """Create retinanet dataset with MindDataset."""
     ds = de.MindDataset(mindrecord_file, columns_list=["img_id", "image", "annotation"], num_shards=device_num,
                         shard_id=rank, num_parallel_workers=num_parallel_workers, shuffle=is_training)
     decode = C.Decode()
@@ -427,6 +430,8 @@ def create_mindrecord(dataset="coco", prefix="retinanet.mindrecord", is_training
 
     mindrecord_dir = config.mindrecord_dir
     mindrecord_file = os.path.join(mindrecord_dir, prefix + "0")
+    if dataset == "voc":
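+        # Downstream helpers read config.coco_root, so point it at the VOC data for the VOC pipeline.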
+        config.coco_root = config.voc_root
     if not os.path.exists(mindrecord_file):
         if not os.path.isdir(mindrecord_dir):
             os.makedirs(mindrecord_dir)
@@ -438,12 +443,12 @@ def create_mindrecord(dataset="coco", prefix="retinanet.mindrecord", is_training
             else:
                 print("coco_root not exits.")
         elif dataset == "voc":
-            if os.path.isdir(config.voc_dir):
+            if os.path.isdir(config.voc_dir) and os.path.isdir(config.voc_root):
                 print("Create Mindrecord.")
                 voc_data_to_mindrecord(mindrecord_dir, is_training, prefix)
                 print("Create Mindrecord Done, at {}".format(mindrecord_dir))
             else:
-                print("voc_dir not exits.")
+                print("voc_root or voc_dir not exits.")
         else:
             if os.path.isdir(config.image_dir) and os.path.exists(config.anno_path):
                 print("Create Mindrecord.")
@@ -451,4 +456,7 @@ def create_mindrecord(dataset="coco", prefix="retinanet.mindrecord", is_training
                 print("Create Mindrecord Done, at {}".format(mindrecord_dir))
             else:
                 print("image_dir or anno_path not exits.")
+    else:
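+        # The MindRecord file already exists, so reuse it instead of regenerating.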
+        print("Mindrecord file exists.")
+
     return mindrecord_file
diff --git a/research/cv/retinanet_resnet152/src/model_utils/config.py b/research/cv/retinanet_resnet152/src/model_utils/config.py
index 7f1ff6e2b..ec9300cfa 100644
--- a/research/cv/retinanet_resnet152/src/model_utils/config.py
+++ b/research/cv/retinanet_resnet152/src/model_utils/config.py
@@ -119,9 +119,9 @@ def get_config():
                         help="Config file path")
     path_args, _ = parser.parse_known_args()
     default, helper, choices = parse_yaml(path_args.config_path)
-    pprint(default)
     args = parse_cli_to_yaml(parser=parser, cfg=default, helper=helper, choices=choices, cfg_path=path_args.config_path)
     final_config = merge(args, default)
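+    # Print the final merged config (CLI arguments override the YAML defaults).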
+    pprint(final_config)
     return Config(final_config)
 
 config = get_config()
diff --git a/research/cv/retinanet_resnet152/train.py b/research/cv/retinanet_resnet152/train.py
index 5402c6a4f..d8b580090 100644
--- a/research/cv/retinanet_resnet152/train.py
+++ b/research/cv/retinanet_resnet152/train.py
@@ -19,7 +19,7 @@ import os
 import mindspore
 import mindspore.nn as nn
 from mindspore import context, Tensor
-from mindspore.communication.management import init, get_rank
+from mindspore.communication.management import init, get_rank, get_group_size
 from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, LossMonitor, TimeMonitor, Callback
 from mindspore.train import Model
 from mindspore.context import ParallelMode
@@ -65,11 +65,12 @@ def modelarts_process():
         config.coco_root = os.path.join(config.coco_root, config.modelarts_dataset_unzip_name)
         print(os.listdir(os.path.join(config.data_path, config.modelarts_dataset_unzip_name)))
 
+
 @moxing_wrapper(pre_process=modelarts_process)
 def train_retinanet_resnet152():
     """ train_retinanet_resnet152 """
+    context.set_context(mode=context.GRAPH_MODE, device_target=config.run_platform)
     if config.run_platform == "Ascend":
-        context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
         if config.distribute:
             if os.getenv("DEVICE_ID", "not_set").isdigit():
                 context.set_context(device_id=int(os.getenv("DEVICE_ID")))
@@ -83,6 +84,15 @@ def train_retinanet_resnet152():
             device_num = 1
             context.set_context(device_id=get_device_id())
 
+    elif config.run_platform == "GPU":
+        rank = config.device_id
+        device_num = config.device_num
+        if config.distribute:
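+            # Under mpirun, init() sets up NCCL; rank and world size then come from the communication group.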
+            init()
+            rank = get_rank()
+            device_num = get_group_size()
+            context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
+                                              device_num=device_num)
     else:
         raise ValueError("Unsupported platform.")
 
@@ -102,7 +112,7 @@ def train_retinanet_resnet152():
         backbone = resnet152(config.num_classes)
         retinanet = retinahead(backbone, config)
         net = retinanetWithLossCell(retinanet, config)
-        net.to_float(mindspore.float16)
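+        # Keep the network in FP32; the previous blanket FP16 cast is not needed and can hurt stability on GPU.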
+        net.to_float(mindspore.float32)
         init_net_param(net)
 
         if config.pre_trained:
@@ -137,5 +147,6 @@ def train_retinanet_resnet152():
             cb += [ckpt_cb]
             model.train(config.epoch_size, dataset, callbacks=cb, dataset_sink_mode=True)
 
+
 if __name__ == '__main__':
     train_retinanet_resnet152()
-- 
GitLab