diff --git a/official/cv/yolov5/README.md b/official/cv/yolov5/README.md
index f9920a040ecaa59a630773572c4ff53f28e1f2fd..1f83f5d08844909a7c37da4bf76b77bb72ce0e12 100644
--- a/official/cv/yolov5/README.md
+++ b/official/cv/yolov5/README.md
@@ -150,8 +150,6 @@ optional arguments:
   --label_smooth_factor Smooth strength of original one-hot. Default: 0.1
   --log_interval        Logging interval steps. Default: 100
   --ckpt_path           Checkpoint save location. Default: outputs/
-  --ckpt_interval       Save checkpoint interval. Default: None
-  --is_save_on_master   Save ckpt on master or all rank, 1 for master, 0 for all ranks. Default: 1
   --is_distributed      Distribute train or not, 1 for yes, 0 for no. Default: 1
   --rank                Local rank of distributed. Default: 0
   --group_size          World size of device. Default: 1
diff --git a/official/cv/yolov5/default_config.yaml b/official/cv/yolov5/default_config.yaml
index dc4664f2adc87ab12a28c519f7e5f194d26075b6..5c43fad4b437fcb5392a8f4fffa7cd1db118d491 100644
--- a/official/cv/yolov5/default_config.yaml
+++ b/official/cv/yolov5/default_config.yaml
@@ -5,7 +5,6 @@ data_url: ""
 train_url: ""
 checkpoint_url: ""
 # Path for local
-output_dir: "/cache"
 data_path: "/cache/data"
 output_path: "/cache/train"
 load_path: "/cache/checkpoint_path"
@@ -21,6 +20,7 @@ yolov5_version: "yolov5s"
 pretrained_backbone: ""
 resume_yolov5: ""
 pretrained_checkpoint: ""
+output_dir: "./output"
 
 lr_scheduler: "cosine_annealing"
 lr: 0.013
@@ -37,24 +37,19 @@ label_smooth: 0
 label_smooth_factor: 0.1
 log_interval: 100
 ckpt_path: "outputs/"
-ckpt_interval: -1
-is_save_on_master: 1
 is_distributed: 0
 bind_cpu: True
 device_num: 8
 rank: 0
 group_size: 1
 need_profiler: 0
-training_shape: ""
 resize_rate: 10
-is_modelArts: 0
 
 # Eval options
 pretrained: ""
 log_path: "outputs/"
 ann_val_file: ""
 eval_nms_thresh: 0.6
-eval_shape: ""
 ignore_threshold: 0.7
 test_ignore_threshold: 0.001
 multi_label: True
@@ -63,7 +58,7 @@ multi_label_thresh: 0.1
 # Export options
 device_id: 0
 batch_size: 1
-testing_shape: 640
+testing_shape: [640, 640]
 ckpt_file: ""
 file_name: "yolov5"
 file_format: "MINDIR"
@@ -77,21 +72,6 @@ saturation: 1.5
 value: 0.4
 jitter: 0.3
 
-multi_scale: [[320, 320],
-              [352, 352],
-              [384, 384],
-              [416, 416],
-              [448, 448],
-              [480, 480],
-              [512, 512],
-              [544, 544],
-              [576, 576],
-              [608, 608],
-              [640, 640],
-              [672, 672],
-              [704, 704],
-              [736, 736],
-              [768, 768]]
 num_classes: 80
 max_box: 150
 
@@ -166,7 +146,6 @@ device_num: "Device numbers per server"
 rank: "Local rank of distributed."
 group_size: "World size of device."
 need_profiler: "Whether use profiler. 0 for no, 1 for yes."
-training_shape: "Fix training shape."
 resize_rate: "Resize rate for multi-scale training."
 ann_file: "path to annotation"
 each_multiscale: "Apply multi-scale for each scale"
diff --git a/official/cv/yolov5/eval.py b/official/cv/yolov5/eval.py
index ea785e6561f7c589ca71d26d3b05995a384c1c47..0f16e218a61bbc4acb8fad2e464ab5729dfac231 100644
--- a/official/cv/yolov5/eval.py
+++ b/official/cv/yolov5/eval.py
@@ -14,412 +14,90 @@
 # ============================================================================
 """YoloV5 eval."""
 import os
-import datetime
 import time
-import sys
-from collections import defaultdict
 
 import numpy as np
-from pycocotools.coco import COCO
-from pycocotools.cocoeval import COCOeval
 
-from mindspore import Tensor
-from mindspore.context import ParallelMode
-from mindspore import context
-from mindspore.train.serialization import load_checkpoint, load_param_into_net
 import mindspore as ms
 
 from src.yolo import YOLOV5
 from src.logger import get_logger
+from src.util import DetectionEngine
 from src.yolo_dataset import create_yolo_dataset
 
 from model_utils.config import config
-from model_utils.moxing_adapter import moxing_wrapper
-from model_utils.device_adapter import get_device_id, get_device_num
 
-config.rank = 0
+# Only needed when running on Huawei Cloud ModelArts.
+from model_utils.moxing_adapter import moxing_wrapper, modelarts_pre_process
 
-if config.is_modelArts:
-    config.data_root = os.path.join(config.data_dir, 'val2017')
-    config.ann_file = os.path.join(config.data_dir, 'annotations')
-    import moxing as mox
-
-    local_data_url = os.path.join(config.data_path, str(config.rank))
-    local_annFile = os.path.join(config.data_path, str(config.rank))
-    local_pretrained = os.path.join(config.data_path, str(config.rank))
-
-    temp_str = config.pretrained.split('/')[-1]
-    config.pretrained = config.pretrained[0:config.pretrained.rfind('/')]
 
-    mox.file.copy_parallel(config.data_root, local_data_url)
-    config.data_root = local_data_url
-
-    mox.file.copy_parallel(config.ann_file, local_annFile)
-    config.ann_file = os.path.join(local_data_url, 'instances_val2017.json')
-
-    mox.file.copy_parallel(config.pretrained, local_pretrained)
-    config.pretrained = os.path.join(local_data_url, temp_str)
-else:
+def eval_preprocess():
     config.data_root = os.path.join(config.data_dir, 'val2017')
     config.ann_file = os.path.join(config.data_dir, 'annotations/instances_val2017.json')
-
-
-class Redirct:
-    def __init__(self):
-        self.content = ""
-
-    def write(self, content):
-        self.content += content
-
-    def flush(self):
-        self.content = ""
-
-
-class DetectionEngine:
-    """Detection engine."""
-
-    def __init__(self, args_detection):
-        self.ignore_threshold = args_detection.test_ignore_threshold
-        self.labels = args_detection.labels
-        self.num_classes = len(self.labels)
-        self.results = {}
-        self.file_path = ''
-        self.save_prefix = args_detection.outputs_dir
-        self.ann_file = args_detection.ann_file
-        self._coco = COCO(self.ann_file)
-        self._img_ids = list(sorted(self._coco.imgs.keys()))
-        self.det_boxes = []
-        self.nms_thresh = args_detection.eval_nms_thresh
-        self.multi_label = args_detection.multi_label
-        self.multi_label_thresh = args_detection.multi_label_thresh
-        self.coco_catids = self._coco.getCatIds()
-        self.coco_catIds = args_detection.coco_ids
-
-    def do_nms_for_results(self):
-        """Get result boxes."""
-        for img_id in self.results:
-            for clsi in self.results[img_id]:
-                dets = self.results[img_id][clsi]
-                dets = np.array(dets)
-                keep_index = self._diou_nms(dets, thresh=self.nms_thresh)
-
-                keep_box = [{'image_id': int(img_id), 'category_id': int(clsi),
-                             'bbox': list(dets[i][:4].astype(float)),
-                             'score': dets[i][4].astype(float)} for i in keep_index]
-                self.det_boxes.extend(keep_box)
-
-    def _nms(self, predicts, threshold):
-        """Calculate NMS."""
-        # convert xywh -> xmin ymin xmax ymax
-        x1 = predicts[:, 0]
-        y1 = predicts[:, 1]
-        x2 = x1 + predicts[:, 2]
-        y2 = y1 + predicts[:, 3]
-        scores = predicts[:, 4]
-
-        areas = (x2 - x1 + 1) * (y2 - y1 + 1)
-        order = scores.argsort()[::-1]
-
-        reserved_boxes = []
-        while order.size > 0:
-            i = order[0]
-            reserved_boxes.append(i)
-            max_x1 = np.maximum(x1[i], x1[order[1:]])
-            max_y1 = np.maximum(y1[i], y1[order[1:]])
-            min_x2 = np.minimum(x2[i], x2[order[1:]])
-            min_y2 = np.minimum(y2[i], y2[order[1:]])
-
-            intersect_w = np.maximum(0.0, min_x2 - max_x1 + 1)
-            intersect_h = np.maximum(0.0, min_y2 - max_y1 + 1)
-            intersect_area = intersect_w * intersect_h
-            ovr = intersect_area / \
-                (areas[i] + areas[order[1:]] - intersect_area)
-
-            indexes = np.where(ovr <= threshold)[0]
-            order = order[indexes + 1]
-        return reserved_boxes
-
-    def _diou_nms(self, dets, thresh=0.5):
-        """
-        convert xywh -> xmin ymin xmax ymax
-        """
-        x1 = dets[:, 0]
-        y1 = dets[:, 1]
-        x2 = x1 + dets[:, 2]
-        y2 = y1 + dets[:, 3]
-        scores = dets[:, 4]
-        areas = (x2 - x1 + 1) * (y2 - y1 + 1)
-        order = scores.argsort()[::-1]
-        keep = []
-        while order.size > 0:
-            i = order[0]
-            keep.append(i)
-            xx1 = np.maximum(x1[i], x1[order[1:]])
-            yy1 = np.maximum(y1[i], y1[order[1:]])
-            xx2 = np.minimum(x2[i], x2[order[1:]])
-            yy2 = np.minimum(y2[i], y2[order[1:]])
-
-            w = np.maximum(0.0, xx2 - xx1 + 1)
-            h = np.maximum(0.0, yy2 - yy1 + 1)
-            inter = w * h
-            ovr = inter / (areas[i] + areas[order[1:]] - inter)
-            center_x1 = (x1[i] + x2[i]) / 2
-            center_x2 = (x1[order[1:]] + x2[order[1:]]) / 2
-            center_y1 = (y1[i] + y2[i]) / 2
-            center_y2 = (y1[order[1:]] + y2[order[1:]]) / 2
-            inter_diag = (center_x2 - center_x1) ** 2 + (center_y2 - center_y1) ** 2
-            out_max_x = np.maximum(x2[i], x2[order[1:]])
-            out_max_y = np.maximum(y2[i], y2[order[1:]])
-            out_min_x = np.minimum(x1[i], x1[order[1:]])
-            out_min_y = np.minimum(y1[i], y1[order[1:]])
-            outer_diag = (out_max_x - out_min_x) ** 2 + (out_max_y - out_min_y) ** 2
-            diou = ovr - inter_diag / outer_diag
-            diou = np.clip(diou, -1, 1)
-            inds = np.where(diou <= thresh)[0]
-            order = order[inds + 1]
-        return keep
-
-    def write_result(self):
-        """Save result to file."""
-        import json
-        t = datetime.datetime.now().strftime('_%Y_%m_%d_%H_%M_%S')
-        try:
-            self.file_path = self.save_prefix + '/predict' + t + '.json'
-            f = open(self.file_path, 'w')
-            json.dump(self.det_boxes, f)
-        except IOError as e:
-            raise RuntimeError("Unable to open json file to dump. What(): {}".format(str(e)))
+    device_id = int(os.getenv('DEVICE_ID', '0'))
+    ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target, device_id=device_id)
+
+    # The logger is stored on config so that other functions can use it, e.g. config.logger.info("xxx").
+    config.logger = get_logger(config.output_dir, device_id)
+
+
+def load_parameters(network, filename):
+    config.logger.info("yolov5 pretrained network model: %s", filename)
+    param_dict = ms.load_checkpoint(filename)
+    param_dict_new = {}
+    for key, values in param_dict.items():
+        if key.startswith('moments.'):
+            continue
+        elif key.startswith('yolo_network.'):
+            param_dict_new[key[13:]] = values
         else:
-            f.close()
-            return self.file_path
+            param_dict_new[key] = values
+    ms.load_param_into_net(network, param_dict_new)
+    config.logger.info('load_model %s success', filename)
 
-    def get_eval_result(self):
-        """Get eval result."""
-        coco_gt = COCO(self.ann_file)
-        coco_dt = coco_gt.loadRes(self.file_path)
-        coco_eval = COCOeval(coco_gt, coco_dt, 'bbox')
-        coco_eval.evaluate()
-        coco_eval.accumulate()
-        rdct = Redirct()
-        stdout = sys.stdout
-        sys.stdout = rdct
-        coco_eval.summarize()
-        sys.stdout = stdout
-        return rdct.content
 
-    def detect(self, outputs, batch, image_shape, image_id):
-        """Detect boxes."""
-        outputs_num = len(outputs)
-        # output [|32, 52, 52, 3, 85| ]
-        for batch_id in range(batch):
-            for out_id in range(outputs_num):
-                # 32, 52, 52, 3, 85
-                out_item = outputs[out_id]
-                # 52, 52, 3, 85
-                out_item_single = out_item[batch_id, :]
-                # get number of items in one head, [B, gx, gy, anchors, 5+80]
-                dimensions = out_item_single.shape[:-1]
-                out_num = 1
-                for d in dimensions:
-                    out_num *= d
-                ori_w, ori_h = image_shape[batch_id]
-                img_id = int(image_id[batch_id])
-                x = out_item_single[..., 0] * ori_w
-                y = out_item_single[..., 1] * ori_h
-                w = out_item_single[..., 2] * ori_w
-                h = out_item_single[..., 3] * ori_h
-
-                conf = out_item_single[..., 4:5]
-                cls_emb = out_item_single[..., 5:]
-                cls_argmax = np.expand_dims(np.argmax(cls_emb, axis=-1), axis=-1)
-                x = x.reshape(-1)
-                y = y.reshape(-1)
-                w = w.reshape(-1)
-                h = h.reshape(-1)
-                x_top_left = x - w / 2.
-                y_top_left = y - h / 2.
-                cls_emb = cls_emb.reshape(-1, self.num_classes)
-                if self.multi_label:
-                    conf = conf.reshape(-1, 1)
-                    # create all False
-                    confidence = cls_emb * conf
-                    flag = cls_emb > self.multi_label_thresh
-                    flag = flag.nonzero()
-                    for index in range(len(flag[0])):
-                        i = flag[0][index]
-                        j = flag[1][index]
-                        confi = confidence[i][j]
-                        if confi < self.ignore_threshold:
-                            continue
-                        if img_id not in self.results:
-                            self.results[img_id] = defaultdict(list)
-                        x_lefti = max(0, x_top_left[i])
-                        y_lefti = max(0, y_top_left[i])
-                        wi = min(w[i], ori_w)
-                        hi = min(h[i], ori_h)
-                        clsi = j
-                        # transform catId to match coco
-                        coco_clsi = self.coco_catIds[clsi]
-                        self.results[img_id][coco_clsi].append([x_lefti, y_lefti, wi, hi, confi])
-                else:
-                    cls_argmax = np.expand_dims(np.argmax(cls_emb, axis=-1), axis=-1)
-                    conf = conf.reshape(-1)
-                    cls_argmax = cls_argmax.reshape(-1)
-
-                    # create all False
-                    flag = np.random.random(cls_emb.shape) > sys.maxsize
-                    for i in range(flag.shape[0]):
-                        c = cls_argmax[i]
-                        flag[i, c] = True
-                    confidence = cls_emb[flag] * conf
-
-                    for x_lefti, y_lefti, wi, hi, confi, clsi in zip(x_top_left, y_top_left,
-                                                                     w, h, confidence, cls_argmax):
-                        if confi < self.ignore_threshold:
-                            continue
-                        if img_id not in self.results:
-                            self.results[img_id] = defaultdict(list)
-                        x_lefti = max(0, x_lefti)
-                        y_lefti = max(0, y_lefti)
-                        wi = min(wi, ori_w)
-                        hi = min(hi, ori_h)
-                        # transform catId to match coco
-                        coco_clsi = self.coco_catids[clsi]
-                        self.results[img_id][coco_clsi].append([x_lefti, y_lefti, wi, hi, confi])
-
-
-def convert_testing_shape(args_testing_shape):
-    """Convert testing shape to list."""
-    testing_shape = [int(args_testing_shape), int(args_testing_shape)]
-    return testing_shape
-
-def modelarts_pre_process():
-    '''modelarts pre process function.'''
-    def unzip(zip_file, save_dir):
-        import zipfile
-        s_time = time.time()
-        if not os.path.exists(os.path.join(save_dir, config.modelarts_dataset_unzip_name)):
-            zip_isexist = zipfile.is_zipfile(zip_file)
-            if zip_isexist:
-                fz = zipfile.ZipFile(zip_file, 'r')
-                data_num = len(fz.namelist())
-                print("Extract Start...")
-                print("unzip file num: {}".format(data_num))
-                data_print = int(data_num / 100) if data_num > 100 else 1
-                i = 0
-                for file in fz.namelist():
-                    if i % data_print == 0:
-                        print("unzip percent: {}%".format(int(i * 100 / data_num)), flush=True)
-                    i += 1
-                    fz.extract(file, save_dir)
-                print("cost time: {}min:{}s.".format(int((time.time() - s_time) / 60),
-                                                     int(int(time.time() - s_time) % 60)))
-                print("Extract Done.")
-            else:
-                print("This is not zip.")
-        else:
-            print("Zip has been extracted.")
-
-    if config.need_modelarts_dataset_unzip:
-        zip_file_1 = os.path.join(config.data_path, config.modelarts_dataset_unzip_name + ".zip")
-        save_dir_1 = os.path.join(config.data_path)
-
-        sync_lock = "/tmp/unzip_sync.lock"
-
-        # Each server contains 8 devices as most.
-        if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
-            print("Zip file path: ", zip_file_1)
-            print("Unzip file save dir: ", save_dir_1)
-            unzip(zip_file_1, save_dir_1)
-            print("===Finish extract data synchronization===")
-            try:
-                os.mknod(sync_lock)
-            except IOError:
-                pass
-
-        while True:
-            if os.path.exists(sync_lock):
-                break
-            time.sleep(1)
-
-        print("Device: {}, Finish sync unzip data from {} to {}.".format(get_device_id(), zip_file_1, save_dir_1))
-
-    config.log_path = os.path.join(config.output_path, config.log_path)
-
-@moxing_wrapper(pre_process=modelarts_pre_process)
+@moxing_wrapper(pre_process=modelarts_pre_process, pre_args=[config])
 def run_eval():
+    eval_preprocess()
     start_time = time.time()
-    device_id = int(os.getenv('DEVICE_ID')) if os.getenv('DEVICE_ID') else 0
-    # device_id = 1
-    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=device_id)
-
-    # logger
-    config.outputs_dir = os.path.join(config.log_path, datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
-    rank_id = int(os.getenv('DEVICE_ID', '0'))
-    config.logger = get_logger(config.outputs_dir, rank_id)
-
-    context.reset_auto_parallel_context()
-    parallel_mode = ParallelMode.STAND_ALONE
-    context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=1)
-
     config.logger.info('Creating Network....')
     dict_version = {'yolov5s': 0, 'yolov5m': 1, 'yolov5l': 2, 'yolov5x': 3}
     network = YOLOV5(is_training=False, version=dict_version[config.yolov5_version])
 
-    config.logger.info(config.pretrained)
     if os.path.isfile(config.pretrained):
-        param_dict = load_checkpoint(config.pretrained)
-        param_dict_new = {}
-        for key, values in param_dict.items():
-            if key.startswith('moments.'):
-                continue
-            elif key.startswith('yolo_network.'):
-                param_dict_new[key[13:]] = values
-            else:
-                param_dict_new[key] = values
-        load_param_into_net(network, param_dict_new)
-        config.logger.info('load_model %s success', config.pretrained)
+        load_parameters(network, config.pretrained)
     else:
-        config.logger.info('%s not exists or not a pre-trained file', config.pretrained)
-        assert FileNotFoundError('{} not exists or not a pre-trained file'.format(config.pretrained))
-        exit(1)
-
-    data_root = config.data_root
-    ann_file = config.ann_file
-
-    if config.eval_shape:
-        config.test_img_shape = convert_testing_shape(config.eval_shape)
+        raise FileNotFoundError(f"{config.pretrained} is not a filename.")
 
-    ds, data_size = create_yolo_dataset(data_root, ann_file, is_training=False, batch_size=config.per_batch_size,
-                                        max_epoch=1, device_num=1, rank=rank_id, shuffle=False, config=config)
+    ds = create_yolo_dataset(config.data_root, config.ann_file, is_training=False, batch_size=config.per_batch_size,
+                             device_num=1, rank=0, shuffle=False, config=config)
 
     config.logger.info('testing shape : %s', config.test_img_shape)
-    config.logger.info('total %d images to eval', data_size)
+    config.logger.info('total %d images to eval', ds.get_dataset_size() * config.per_batch_size)
 
     network.set_train(False)
 
     # init detection engine
-    detection = DetectionEngine(config)
+    detection = DetectionEngine(config, config.test_ignore_threshold)
 
-    input_shape = Tensor(tuple(config.test_img_shape), ms.float32)
+    input_shape = ms.Tensor(tuple(config.test_img_shape), ms.float32)
     config.logger.info('Start inference....')
-    for image_index, data in enumerate(ds.create_dict_iterator(num_epochs=1)):
-        image = data["image"].asnumpy()
+    for index, data in enumerate(ds.create_dict_iterator(output_numpy=True, num_epochs=1)):
+        image = data["image"]
+        # Slice the image 2x2 and stack the pieces along axis 1 to match the network's input layout.
         image = np.concatenate((image[..., ::2, ::2], image[..., 1::2, ::2],
                                 image[..., ::2, 1::2], image[..., 1::2, 1::2]), axis=1)
-        image = Tensor(image)
+        image = ms.Tensor(image)
         image_shape_ = data["image_shape"]
         image_id_ = data["img_id"]
-        prediction = network(image, input_shape)
-        output_big, output_me, output_small = prediction
+        output_big, output_me, output_small = network(image, input_shape)
         output_big = output_big.asnumpy()
         output_me = output_me.asnumpy()
         output_small = output_small.asnumpy()
-        image_id_ = image_id_.asnumpy()
-        image_shape_ = image_shape_.asnumpy()
         detection.detect([output_small, output_me, output_big], config.per_batch_size, image_shape_, image_id_)
-        if image_index % 1000 == 0:
-            config.logger.info('Processing... {:.2f}% '.format(image_index * config.per_batch_size / data_size * 100))
+
+        if index % 50 == 0:
+            config.logger.info('Processing... {:.2f}% '.format(index / ds.get_dataset_size() * 100))
 
     config.logger.info('Calculating mAP...')
     detection.do_nms_for_results()
@@ -432,5 +110,6 @@ def run_eval():
     config.logger.info(eval_log_string)
     config.logger.info('testing cost time %.2f h', cost_time / 3600.)
 
+
 if __name__ == "__main__":
     run_eval()
diff --git a/official/cv/yolov5/export.py b/official/cv/yolov5/export.py
index de04ff4ce3b7aa0a78ec66227f4c17d9617c0836..240453a9cfd1acf240c296cdae9a62e820ee48be 100644
--- a/official/cv/yolov5/export.py
+++ b/official/cv/yolov5/export.py
@@ -12,41 +12,32 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
-import numpy as np
-
-import mindspore
-from mindspore import context, Tensor
-from mindspore.train.serialization import export, load_checkpoint, load_param_into_net
+import mindspore as ms
 
 from src.yolo import YOLOV5s_Infer
 
 from model_utils.config import config
-from model_utils.moxing_adapter import moxing_wrapper
-
-def modelarts_pre_process():
-    '''modelarts pre process function.'''
-    config.file_name = os.path.join(config.output_path, config.file_name)
+from model_utils.moxing_adapter import moxing_wrapper, modelarts_export_preprocess
 
 
-@moxing_wrapper(pre_process=modelarts_pre_process)
+@moxing_wrapper(pre_process=modelarts_export_preprocess, pre_args=[config])
 def run_export():
-    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
+    ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target)
     if config.device_target == "Ascend":
-        context.set_context(device_id=config.device_id)
-    ts_shape = config.testing_shape // 2
+        ms.set_context(device_id=config.device_id)
 
     dict_version = {'yolov5s': 0, 'yolov5m': 1, 'yolov5l': 2, 'yolov5x': 3}
     config.file_name = config.file_name + '_' + config.yolov5_version
 
-    network = YOLOV5s_Infer(config.testing_shape, version=dict_version[config.yolov5_version])
+    network = YOLOV5s_Infer(config.testing_shape[0], version=dict_version[config.yolov5_version])
     network.set_train(False)
 
-    param_dict = load_checkpoint(config.ckpt_file)
-    load_param_into_net(network, param_dict)
+    ms.load_param_into_net(network, ms.load_checkpoint(config.ckpt_file))
 
-    input_data = Tensor(np.zeros([config.batch_size, 12, ts_shape, ts_shape]), mindspore.float32)
+    # Input is the 2x2-sliced 12-channel image (see eval.py), so H and W are half of testing_shape, as before.
+    input_data = ms.numpy.zeros([config.batch_size, 12, *(s // 2 for s in config.testing_shape)], ms.float32)
 
-    export(network, input_data, file_name=config.file_name, file_format=config.file_format)
+    ms.export(network, input_data, file_name=config.file_name, file_format=config.file_format)
     print('==========success export===============')
 
 if __name__ == "__main__":
diff --git a/official/cv/yolov5/model_utils/moxing_adapter.py b/official/cv/yolov5/model_utils/moxing_adapter.py
index 25838a7da99a27a1bb744684c1f75f80f5704688..a2f802f598fac7fb32d13c4d4c556251153766be 100644
--- a/official/cv/yolov5/model_utils/moxing_adapter.py
+++ b/official/cv/yolov5/model_utils/moxing_adapter.py
@@ -17,7 +17,7 @@
 
 import os
 import functools
-from mindspore import context
+import mindspore as ms
 from .config import config
 
 _global_sync_count = 0
@@ -72,8 +72,67 @@ def sync_data(from_path, to_path):
 
     print("Finish sync data from {} to {}.".format(from_path, to_path))
 
-
-def moxing_wrapper(pre_process=None, post_process=None):
+def modelarts_pre_process(args):
+    """Unzip the ModelArts dataset if requested, then root output_dir and ckpt_path under output_path."""
+    import time  # NOTE(review): module top only shows os/functools/mindspore imports, so import here
+    import zipfile
+
+    def unzip(zip_file, save_dir):
+        """Extract zip_file into save_dir unless the unzipped dataset directory already exists."""
+        s_time = time.time()
+        if os.path.exists(os.path.join(save_dir, args.modelarts_dataset_unzip_name)):
+            print("Zip has been extracted.")
+            return
+        if not zipfile.is_zipfile(zip_file):
+            print("This is not zip.")
+            return
+        with zipfile.ZipFile(zip_file, 'r') as fz:  # context manager closes the archive handle
+            names = fz.namelist()
+            data_num = len(names)
+            print("Extract Start...")
+            print("unzip file num: {}".format(data_num))
+            data_print = int(data_num / 100) if data_num > 100 else 1
+            for i, file in enumerate(names):
+                if i % data_print == 0:
+                    print("unzip percent: {}%".format(int(i * 100 / data_num)), flush=True)
+                fz.extract(file, save_dir)
+        print("cost time: {}min:{}s.".format(int((time.time() - s_time) / 60),
+                                             int(int(time.time() - s_time) % 60)))
+        print("Extract Done.")
+
+    if args.need_modelarts_dataset_unzip:
+        zip_file_1 = os.path.join(args.data_path, args.modelarts_dataset_unzip_name + ".zip")
+        save_dir_1 = os.path.join(args.data_path)
+
+        sync_lock = "/tmp/unzip_sync.lock"
+
+        # Each server contains 8 devices at most; only the first device of a server extracts the data.
+        if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
+            print("Zip file path: ", zip_file_1)
+            print("Unzip file save dir: ", save_dir_1)
+            unzip(zip_file_1, save_dir_1)
+            print("===Finish extract data synchronization===")
+            try:
+                os.mknod(sync_lock)
+            except IOError:
+                pass
+
+        # All devices wait here until the lock file shows that extraction has finished.
+        while not os.path.exists(sync_lock):
+            time.sleep(1)
+
+        print("Device: {}, Finish sync unzip data from {} to {}.".format(get_device_id(), zip_file_1, save_dir_1))
+
+    args.output_dir = os.path.join(args.output_path, args.output_dir)
+    args.ckpt_path = os.path.join(args.output_path, args.ckpt_path)
+
+def modelarts_post_process():
+    sync_data(from_path='/cache/output', to_path='obs://hit-cyf/yolov5_npu/outputs/')
+
+def modelarts_export_preprocess(args):
+    args.file_name = os.path.join(args.output_path, args.file_name)
+
+def moxing_wrapper(pre_process=None, post_process=None, **kwargs):
     """
     Moxing wrapper to download dataset and upload outputs.
     """
@@ -92,14 +151,17 @@ def moxing_wrapper(pre_process=None, post_process=None):
                     sync_data(config.train_url, config.output_path)
                     print("Workspace downloaded: ", os.listdir(config.output_path))
 
-                context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
+                ms.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
                 config.device_num = get_device_num()
                 config.device_id = get_device_id()
                 if not os.path.exists(config.output_path):
                     os.makedirs(config.output_path)
 
                 if pre_process:
-                    pre_process()
+                    if "pre_args" in kwargs.keys():
+                        pre_process(*kwargs["pre_args"])
+                    else:
+                        pre_process()
 
             # Run the main function
             run_func(*args, **kwargs)
@@ -107,7 +169,10 @@ def moxing_wrapper(pre_process=None, post_process=None):
             # Upload data to train_url
             if config.enable_modelarts:
                 if post_process:
-                    post_process()
+                    if "post_args" in kwargs.keys():
+                        post_process(*kwargs["post_args"])
+                    else:
+                        post_process()
 
                 if config.train_url:
                     print("Start to copy output directory")
diff --git a/official/cv/yolov5/postprocess.py b/official/cv/yolov5/postprocess.py
index 9cd53c2ed0d9757731b0b514f20ea52398c81a6a..57d310befc003c114e8559f1bab4ab5e5e68dca8 100644
--- a/official/cv/yolov5/postprocess.py
+++ b/official/cv/yolov5/postprocess.py
@@ -14,16 +14,13 @@
 # ============================================================================
 """YoloV5 310 infer."""
 import os
-import sys
 import argparse
-import datetime
 import time
 import ast
-from collections import defaultdict
 import numpy as np
 from pycocotools.coco import COCO
-from pycocotools.cocoeval import COCOeval
 from src.logger import get_logger
+from src.util import DetectionEngine
 
 parser = argparse.ArgumentParser('yolov5 postprocess')
 
@@ -46,240 +43,6 @@ parser.add_argument('--multi_label_thresh', type=float, default=0.1, help='thres
 args, _ = parser.parse_known_args()
 
 
-class Redirct:
-    def __init__(self):
-        self.content = ""
-
-    def write(self, content):
-        self.content += content
-
-    def flush(self):
-        self.content = ""
-
-
-class DetectionEngine:
-    """Detection engine."""
-
-    def __init__(self, args_detection):
-        self.ignore_threshold = args_detection.ignore_threshold
-        self.labels = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat',
-                       'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat',
-                       'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack',
-                       'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
-                       'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
-                       'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
-                       'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair',
-                       'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
-                       'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book',
-                       'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']
-        self.num_classes = len(self.labels)
-        self.results = {}
-        self.file_path = ''
-        self.save_prefix = args_detection.outputs_dir
-        self.ann_file = args_detection.ann_file
-        self._coco = COCO(self.ann_file)
-        self._img_ids = list(sorted(self._coco.imgs.keys()))
-        self.det_boxes = []
-        self.nms_thresh = args_detection.nms_thresh
-        self.multi_label = args_detection.multi_label
-        self.multi_label_thresh = args_detection.multi_label_thresh
-        self.coco_catIds = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27,
-                            28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53,
-                            54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80,
-                            81, 82, 84, 85, 86, 87, 88, 89, 90]
-
-    def do_nms_for_results(self):
-        """Get result boxes."""
-        for image_id in self.results:
-            for clsi in self.results[image_id]:
-                dets = self.results[image_id][clsi]
-                dets = np.array(dets)
-                keep_index = self._diou_nms(dets, thresh=self.nms_thresh)
-
-                keep_box = [{'image_id': int(image_id), 'category_id': int(clsi),
-                             'bbox': list(dets[i][:4].astype(float)),
-                             'score': dets[i][4].astype(float)} for i in keep_index]
-                self.det_boxes.extend(keep_box)
-
-    def _nms(self, predicts, threshold):
-        """Calculate NMS."""
-        # convert xywh -> xmin ymin xmax ymax
-        x1 = predicts[:, 0]
-        y1 = predicts[:, 1]
-        x2 = x1 + predicts[:, 2]
-        y2 = y1 + predicts[:, 3]
-        scores = predicts[:, 4]
-
-        areas = (x2 - x1 + 1) * (y2 - y1 + 1)
-        order = scores.argsort()[::-1]
-
-        reserved_boxes = []
-        while order.size > 0:
-            i = order[0]
-            reserved_boxes.append(i)
-            max_x1 = np.maximum(x1[i], x1[order[1:]])
-            max_y1 = np.maximum(y1[i], y1[order[1:]])
-            min_x2 = np.minimum(x2[i], x2[order[1:]])
-            min_y2 = np.minimum(y2[i], y2[order[1:]])
-
-            intersect_w = np.maximum(0.0, min_x2 - max_x1 + 1)
-            intersect_h = np.maximum(0.0, min_y2 - max_y1 + 1)
-            intersect_area = intersect_w * intersect_h
-            ovr = intersect_area / (areas[i] + areas[order[1:]] - intersect_area)
-
-            indexes = np.where(ovr <= threshold)[0]
-            order = order[indexes + 1]
-        return reserved_boxes
-
-    def _diou_nms(self, dets, thresh=0.5):
-        """
-        convert xywh -> xmin ymin xmax ymax
-        """
-        x1 = dets[:, 0]
-        y1 = dets[:, 1]
-        x2 = x1 + dets[:, 2]
-        y2 = y1 + dets[:, 3]
-        scores = dets[:, 4]
-        areas = (x2 - x1 + 1) * (y2 - y1 + 1)
-        order = scores.argsort()[::-1]
-        keep = []
-        while order.size > 0:
-            i = order[0]
-            keep.append(i)
-            xx1 = np.maximum(x1[i], x1[order[1:]])
-            yy1 = np.maximum(y1[i], y1[order[1:]])
-            xx2 = np.minimum(x2[i], x2[order[1:]])
-            yy2 = np.minimum(y2[i], y2[order[1:]])
-
-            w = np.maximum(0.0, xx2 - xx1 + 1)
-            h = np.maximum(0.0, yy2 - yy1 + 1)
-            inter = w * h
-            ovr = inter / (areas[i] + areas[order[1:]] - inter)
-            center_x1 = (x1[i] + x2[i]) / 2
-            center_x2 = (x1[order[1:]] + x2[order[1:]]) / 2
-            center_y1 = (y1[i] + y2[i]) / 2
-            center_y2 = (y1[order[1:]] + y2[order[1:]]) / 2
-            inter_diag = (center_x2 - center_x1) ** 2 + (center_y2 - center_y1) ** 2
-            out_max_x = np.maximum(x2[i], x2[order[1:]])
-            out_max_y = np.maximum(y2[i], y2[order[1:]])
-            out_min_x = np.minimum(x1[i], x1[order[1:]])
-            out_min_y = np.minimum(y1[i], y1[order[1:]])
-            outer_diag = (out_max_x - out_min_x) ** 2 + (out_max_y - out_min_y) ** 2
-            diou = ovr - inter_diag / outer_diag
-            diou = np.clip(diou, -1, 1)
-            inds = np.where(diou <= thresh)[0]
-            order = order[inds + 1]
-        return keep
-
-    def write_result(self):
-        """Save result to file."""
-        import json
-        t = datetime.datetime.now().strftime('_%Y_%m_%d_%H_%M_%S')
-        try:
-            self.file_path = self.save_prefix + '/predict' + t + '.json'
-            f = open(self.file_path, 'w')
-            json.dump(self.det_boxes, f)
-        except IOError as e:
-            raise RuntimeError("Unable to open json file to dump. What(): {}".format(str(e)))
-        else:
-            f.close()
-            return self.file_path
-
-    def get_eval_result(self):
-        """Get eval result."""
-        coco_gt = COCO(self.ann_file)
-        coco_dt = coco_gt.loadRes(self.file_path)
-        coco_eval = COCOeval(coco_gt, coco_dt, 'bbox')
-        coco_eval.evaluate()
-        coco_eval.accumulate()
-        rdct = Redirct()
-        stdout = sys.stdout
-        sys.stdout = rdct
-        coco_eval.summarize()
-        sys.stdout = stdout
-        return rdct.content
-
-    def detect(self, outputs, batch, img_shape, image_id):
-        """Detect boxes."""
-        outputs_num = len(outputs)
-        # output [|32, 52, 52, 3, 85| ]
-        for batch_id in range(batch):
-            for out_id in range(outputs_num):
-                # 32, 52, 52, 3, 85
-                out_item = outputs[out_id]
-                # 52, 52, 3, 85
-                out_item_single = out_item[batch_id, :]
-                # get number of items in one head, [B, gx, gy, anchors, 5+80]
-                dimensions = out_item_single.shape[:-1]
-                out_num = 1
-                for d in dimensions:
-                    out_num *= d
-                ori_w, ori_h = img_shape[batch_id]
-                img_id = int(image_id[batch_id])
-                x = out_item_single[..., 0] * ori_w
-                y = out_item_single[..., 1] * ori_h
-                w = out_item_single[..., 2] * ori_w
-                h = out_item_single[..., 3] * ori_h
-
-                conf = out_item_single[..., 4:5]
-                cls_emb = out_item_single[..., 5:]
-                cls_argmax = np.expand_dims(np.argmax(cls_emb, axis=-1), axis=-1)
-                x = x.reshape(-1)
-                y = y.reshape(-1)
-                w = w.reshape(-1)
-                h = h.reshape(-1)
-                x_top_left = x - w / 2.
-                y_top_left = y - h / 2.
-                cls_emb = cls_emb.reshape(-1, self.num_classes)
-                if self.multi_label:
-                    conf = conf.reshape(-1, 1)
-                    # create all False
-                    confidence = cls_emb * conf
-                    flag = cls_emb > self.multi_label_thresh
-                    flag = flag.nonzero()
-                    for index in range(len(flag[0])):
-                        i = flag[0][index]
-                        j = flag[1][index]
-                        confi = confidence[i][j]
-                        if confi < self.ignore_threshold:
-                            continue
-                        if img_id not in self.results:
-                            self.results[img_id] = defaultdict(list)
-                        x_lefti = max(0, x_top_left[i])
-                        y_lefti = max(0, y_top_left[i])
-                        wi = min(w[i], ori_w)
-                        hi = min(h[i], ori_h)
-                        clsi = j
-                        # transform catId to match coco
-                        coco_clsi = self.coco_catIds[clsi]
-                        self.results[img_id][coco_clsi].append([x_lefti, y_lefti, wi, hi, confi])
-                else:
-                    cls_argmax = np.expand_dims(np.argmax(cls_emb, axis=-1), axis=-1)
-                    conf = conf.reshape(-1)
-                    cls_argmax = cls_argmax.reshape(-1)
-
-                    # create all False
-                    flag = np.random.random(cls_emb.shape) > sys.maxsize
-                    for i in range(flag.shape[0]):
-                        c = cls_argmax[i]
-                        flag[i, c] = True
-                    confidence = cls_emb[flag] * conf
-
-                    for x_lefti, y_lefti, wi, hi, confi, clsi in zip(x_top_left, y_top_left, w, h, confidence,
-                                                                     cls_argmax):
-                        if confi < self.ignore_threshold:
-                            continue
-                        if img_id not in self.results:
-                            self.results[img_id] = defaultdict(list)
-                        x_lefti = max(0, x_lefti)
-                        y_lefti = max(0, y_lefti)
-                        wi = min(wi, ori_w)
-                        hi = min(hi, ori_h)
-                        # transform catId to match coco
-                        coco_clsi = self.coco_catids[clsi]
-                        self.results[img_id][coco_clsi].append([x_lefti, y_lefti, wi, hi, confi])
-
-
 if __name__ == "__main__":
     start_time = time.time()
 
@@ -288,7 +51,7 @@ if __name__ == "__main__":
     args.logger = get_logger(args.outputs_dir, 0)
 
     # init detection engine
-    detection = DetectionEngine(args)
+    detection = DetectionEngine(args, args.ignore_threshold)
 
     coco = COCO(args.ann_file)
     result_path = args.result_files
diff --git a/official/cv/yolov5/scripts/run_distribute_train.sh b/official/cv/yolov5/scripts/run_distribute_train.sh
index 3ef2f9059c19d7abfd77c0816313b5daca5e1ef5..e40c9d9ffe7a23ef655bcaa8077681c2904c8e54 100644
--- a/official/cv/yolov5/scripts/run_distribute_train.sh
+++ b/official/cv/yolov5/scripts/run_distribute_train.sh
@@ -78,7 +78,6 @@ do
         --max_epoch=300 \
         --warmup_epochs=20 \
         --per_batch_size=16 \
-        --training_shape=640 \
         --lr_scheduler=cosine_annealing  > log.txt 2>&1 &
     cd ..
 done
diff --git a/official/cv/yolov5/scripts/run_distribute_train_gpu.sh b/official/cv/yolov5/scripts/run_distribute_train_gpu.sh
index d16a10af01142bca91984fb4d8811fa60c0a7cda..b34adeb327b88d609badd1b804e00665bc6536b3 100644
--- a/official/cv/yolov5/scripts/run_distribute_train_gpu.sh
+++ b/official/cv/yolov5/scripts/run_distribute_train_gpu.sh
@@ -61,6 +61,5 @@ nohup python train.py \
       --T_max=300 \
       --max_epoch=300 \
       --warmup_epochs=20 \
-      --training_shape=640 \
       --lr_scheduler=cosine_annealing > log.txt 2>&1 &
 cd ..
diff --git a/official/cv/yolov5/scripts/run_eval.sh b/official/cv/yolov5/scripts/run_eval.sh
index c88f7a2138e4fb02f9c739e855b405b94e18b37c..fbe3e65dad4014fbbffb3773929ea3959fe53585 100644
--- a/official/cv/yolov5/scripts/run_eval.sh
+++ b/official/cv/yolov5/scripts/run_eval.sh
@@ -64,6 +64,5 @@ echo "start inferring for device $DEVICE_ID"
 python eval.py \
     --data_dir=$DATASET_PATH \
     --yolov5_version='yolov5s' \
-    --pretrained=$CHECKPOINT_PATH \
-    --eval_shape=640 > log.txt 2>&1 &
+    --pretrained=$CHECKPOINT_PATH > log.txt 2>&1 &
 cd ..
diff --git a/official/cv/yolov5/scripts/run_standalone_train.sh b/official/cv/yolov5/scripts/run_standalone_train.sh
index 39a7dc0521e603e0c15057d4beb9b14871f6106b..2f62484654f3b361333e2d04f03deaffa9a60d28 100644
--- a/official/cv/yolov5/scripts/run_standalone_train.sh
+++ b/official/cv/yolov5/scripts/run_standalone_train.sh
@@ -65,6 +65,5 @@ python train.py \
     --T_max=320 \
     --max_epoch=320 \
     --warmup_epochs=4 \
-    --training_shape=640 \
     --lr_scheduler=cosine_annealing > log.txt 2>&1 &
 cd ..
\ No newline at end of file
diff --git a/official/cv/yolov5/src/backbone.py b/official/cv/yolov5/src/backbone.py
index 49339576d982aeb201be50e9bdd68db4f7e3ed7a..7df0aa8f849e6b60c227f064a54d9b8a053462d2 100644
--- a/official/cv/yolov5/src/backbone.py
+++ b/official/cv/yolov5/src/backbone.py
@@ -14,7 +14,7 @@
 # ============================================================================
 """DarkNet model."""
 import mindspore.nn as nn
-from mindspore.ops import operations as P
+import mindspore.ops as ops
 
 
 class Bottleneck(nn.Cell):
@@ -46,7 +46,7 @@ class BottleneckCSP(nn.Cell):
         self.conv3 = Conv(2 * c_, c2, 1)  # act=FReLU(c2)
         self.m = nn.SequentialCell(
             [Bottleneck(c_, c_, shortcut, e=1.0) for _ in range(n)])
-        self.concat = P.Concat(axis=1)
+        self.concat = ops.Concat(axis=1)
 
     def construct(self, x):
         c1 = self.conv1(x)
@@ -69,7 +69,7 @@ class SPP(nn.Cell):
         self.maxpool1 = nn.MaxPool2d(kernel_size=5, stride=1, pad_mode='same')
         self.maxpool2 = nn.MaxPool2d(kernel_size=9, stride=1, pad_mode='same')
         self.maxpool3 = nn.MaxPool2d(kernel_size=13, stride=1, pad_mode='same')
-        self.concat = P.Concat(axis=1)
+        self.concat = ops.Concat(axis=1)
 
     def construct(self, x):
         c1 = self.conv1(x)
@@ -95,7 +95,7 @@ class Focus(nn.Cell):
 class SiLU(nn.Cell):
     def __init__(self):
         super(SiLU, self).__init__()
-        self.sigmoid = P.Sigmoid()
+        self.sigmoid = ops.Sigmoid()
 
     def construct(self, x):
         return x * self.sigmoid(x)
@@ -134,7 +134,7 @@ class Conv(nn.Cell):
             has_bias=False)
         self.bn = nn.BatchNorm2d(c2, momentum=momentum, eps=eps)
         self.act = SiLU() if act is True else (
-            act if isinstance(act, nn.Cell) else P.Identity())
+            act if isinstance(act, nn.Cell) else ops.Identity())
 
     def construct(self, x):
         return self.act(self.bn(self.conv(x)))
diff --git a/official/cv/yolov5/src/initializer.py b/official/cv/yolov5/src/initializer.py
index 2f49dbfd413a5acb4e8a497d530f53dfab39642a..3dac5d640960947ef0e00d2c262b1bb9620147e6 100644
--- a/official/cv/yolov5/src/initializer.py
+++ b/official/cv/yolov5/src/initializer.py
@@ -16,9 +16,7 @@
 import math
 from functools import reduce
 import numpy as np
-from mindspore.common import initializer as init
-from mindspore.common.initializer import Initializer as MeInitializer
-from mindspore.train.serialization import load_checkpoint, load_param_into_net
+import mindspore as ms
 import mindspore.nn as nn
 
 
@@ -141,7 +139,7 @@ def _calculate_fan_in_and_fan_out(arr):
     return fan_in, fan_out
 
 
-class KaimingUniform(MeInitializer):
+class KaimingUniform(ms.common.initializer.Initializer):
     """Kaiming uniform initializer."""
 
     def __init__(self, a=0, mode='fan_in', nonlinearity='leaky_relu'):
@@ -159,17 +157,21 @@ def default_recurisive_init(custom_cell):
     """Initialize parameter."""
     for _, cell in custom_cell.cells_and_names():
         if isinstance(cell, nn.Conv2d):
-            cell.weight.set_data(init.initializer(KaimingUniform(a=math.sqrt(5)), cell.weight.shape, cell.weight.dtype))
+            cell.weight.set_data(ms.common.initializer.initializer(KaimingUniform(a=math.sqrt(5)),
+                                                                   cell.weight.shape, cell.weight.dtype))
             if cell.bias is not None:
                 fan_in, _ = _calculate_fan_in_and_fan_out(cell.weight)
                 bound = 1 / math.sqrt(fan_in)
-                cell.bias.set_data(init.initializer(init.Uniform(bound), cell.bias.shape, cell.bias.dtype))
+                cell.bias.set_data(ms.common.initializer.initializer(ms.common.initializer.Uniform(bound),
+                                                                     cell.bias.shape, cell.bias.dtype))
         elif isinstance(cell, nn.Dense):
-            cell.weight.set_data(init.initializer(KaimingUniform(a=math.sqrt(5)), cell.weight.shape, cell.weight.dtype))
+            cell.weight.set_data(ms.common.initializer.initializer(KaimingUniform(a=math.sqrt(5)),
+                                                                   cell.weight.shape, cell.weight.dtype))
             if cell.bias is not None:
                 fan_in, _ = _calculate_fan_in_and_fan_out(cell.weight)
                 bound = 1 / math.sqrt(fan_in)
-                cell.bias.set_data(init.initializer(init.Uniform(bound), cell.bias.shape, cell.bias.dtype))
+                cell.bias.set_data(ms.common.initializer.initializer(ms.common.initializer.Uniform(bound),
+                                                                     cell.bias.shape, cell.bias.dtype))
         elif isinstance(cell, (nn.BatchNorm2d, nn.BatchNorm1d)):
             pass
 
@@ -177,7 +179,7 @@ def default_recurisive_init(custom_cell):
 def load_yolov5_params(args, network):
     """Load yolov5 backbone parameter from checkpoint."""
     if args.resume_yolov5:
-        param_dict = load_checkpoint(args.resume_yolov5)
+        param_dict = ms.load_checkpoint(args.resume_yolov5)
         param_dict_new = {}
         for key, values in param_dict.items():
             if key.startswith('moments.'):
@@ -190,11 +192,11 @@ def load_yolov5_params(args, network):
                 args.logger.info('in resume {}'.format(key))
 
         args.logger.info('resume finished')
-        load_param_into_net(network, param_dict_new)
+        ms.load_param_into_net(network, param_dict_new)
         args.logger.info('load_model {} success'.format(args.resume_yolov5))
 
     if args.pretrained_backbone:
-        param_dict = load_checkpoint(args.pretrained_backbone)
+        param_dict = ms.load_checkpoint(args.pretrained_backbone)
         param_dict_new = {}
         for key, values in param_dict.items():
             if key.startswith('moments.'):
@@ -207,5 +209,5 @@ def load_yolov5_params(args, network):
                 args.logger.info('in resume {}'.format(key))
 
         args.logger.info('pretrained finished')
-        load_param_into_net(network, param_dict_new)
+        ms.load_param_into_net(network, param_dict_new)
         args.logger.info('load_model {} success'.format(args.pretrained_backbone))
diff --git a/official/cv/yolov5/src/loss.py b/official/cv/yolov5/src/loss.py
index 6cc099279030b47ff751b5211cc5f1ea9a4f1beb..952f2b8cebb03a08297da041cc88e32381982ace 100644
--- a/official/cv/yolov5/src/loss.py
+++ b/official/cv/yolov5/src/loss.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 # ============================================================================
 """YOLOV5 loss."""
-from mindspore.ops import operations as P
+import mindspore.ops as ops
 import mindspore.nn as nn
 
 
@@ -22,8 +22,8 @@ class ConfidenceLoss(nn.Cell):
 
     def __init__(self):
         super(ConfidenceLoss, self).__init__()
-        self.cross_entropy = P.SigmoidCrossEntropyWithLogits()
-        self.reduce_sum = P.ReduceSum()
+        self.cross_entropy = ops.SigmoidCrossEntropyWithLogits()
+        self.reduce_sum = ops.ReduceSum()
 
     def construct(self, object_mask, predict_confidence, ignore_mask):
         confidence_loss = self.cross_entropy(predict_confidence, object_mask)
@@ -37,8 +37,8 @@ class ClassLoss(nn.Cell):
 
     def __init__(self):
         super(ClassLoss, self).__init__()
-        self.cross_entropy = P.SigmoidCrossEntropyWithLogits()
-        self.reduce_sum = P.ReduceSum()
+        self.cross_entropy = ops.SigmoidCrossEntropyWithLogits()
+        self.reduce_sum = ops.ReduceSum()
 
     def construct(self, object_mask, predict_class, class_probs):
         class_loss = object_mask * self.cross_entropy(predict_class, class_probs)
diff --git a/official/cv/yolov5/src/lr_scheduler.py b/official/cv/yolov5/src/lr_scheduler.py
index 672fd8d5a6c51d699f9ef3fa28210cbdfabe4681..87a63a9ac328a15cdea19e2625748f26638fb3af 100644
--- a/official/cv/yolov5/src/lr_scheduler.py
+++ b/official/cv/yolov5/src/lr_scheduler.py
@@ -144,19 +144,19 @@ def warmup_cosine_annealing_lr_sample(lr, steps_per_epoch, warmup_epochs, max_ep
     return np.array(lr_each_step).astype(np.float32)
 
 
-def get_lr(args):
+def get_lr(args, steps_per_epoch):
     """generate learning rate."""
     if args.lr_scheduler == 'exponential':
-        lr = warmup_step_lr(args.lr, args.lr_epochs, args.steps_per_epoch, args.warmup_epochs, args.max_epoch,
+        lr = warmup_step_lr(args.lr, args.lr_epochs, steps_per_epoch, args.warmup_epochs, args.max_epoch,
                             gamma=args.lr_gamma)
     elif args.lr_scheduler == 'cosine_annealing':
-        lr = warmup_cosine_annealing_lr(args.lr, args.steps_per_epoch, args.warmup_epochs,
+        lr = warmup_cosine_annealing_lr(args.lr, steps_per_epoch, args.warmup_epochs,
                                         args.max_epoch, args.T_max, args.eta_min)
     elif args.lr_scheduler == 'cosine_annealing_V2':
-        lr = warmup_cosine_annealing_lr_V2(args.lr, args.steps_per_epoch, args.warmup_epochs,
+        lr = warmup_cosine_annealing_lr_V2(args.lr, steps_per_epoch, args.warmup_epochs,
                                            args.max_epoch, args.T_max, args.eta_min)
     elif args.lr_scheduler == 'cosine_annealing_sample':
-        lr = warmup_cosine_annealing_lr_sample(args.lr, args.steps_per_epoch, args.warmup_epochs,
+        lr = warmup_cosine_annealing_lr_sample(args.lr, steps_per_epoch, args.warmup_epochs,
                                                args.max_epoch, args.T_max, args.eta_min)
     else:
         raise NotImplementedError(args.lr_scheduler)
diff --git a/official/cv/yolov5/src/transforms.py b/official/cv/yolov5/src/transforms.py
index e3bbbb89e01941f2a69436119fce991130241eed..928dbab2c74a5fcbedee958ab43ad1b0b618badb 100644
--- a/official/cv/yolov5/src/transforms.py
+++ b/official/cv/yolov5/src/transforms.py
@@ -20,7 +20,7 @@ import copy
 import numpy as np
 from PIL import Image
 import cv2
-import mindspore.dataset.vision.py_transforms as PV
+import mindspore.dataset as ds
 
 
 def _rand(a=0., b=1.):
@@ -524,7 +524,7 @@ class MultiScaleTrans:
 
     def __call__(self, img, anno, input_size, mosaic_flag):
         if mosaic_flag[0] == 0:
-            img = PV.Decode()(img)
+            img = ds.vision.py_transforms.Decode()(img)
         img, anno = preprocess_fn(img, anno, self.config, input_size, self.device_num)
         return img, anno, np.array(img.shape[0:2])
 
diff --git a/official/cv/yolov5/src/util.py b/official/cv/yolov5/src/util.py
index 109057bd36ee5d7b41cfdb72fe9a6b5f02547ec4..8acc1011dff5e28e9f6ae97521dedd999ba48684 100644
--- a/official/cv/yolov5/src/util.py
+++ b/official/cv/yolov5/src/util.py
@@ -13,7 +13,14 @@
 # limitations under the License.
 # ============================================================================
 """Util class or function."""
-import mindspore.common.dtype as mstype
+import sys
+from collections import defaultdict
+import datetime
+import numpy as np
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+
+import mindspore as ms
 
 from .yolo import YoloLossBlock
 
@@ -130,4 +137,215 @@ def keep_loss_fp32(network):
     """Keep loss of network with float32"""
     for _, cell in network.cells_and_names():
         if isinstance(cell, (YoloLossBlock,)):
-            cell.to_float(mstype.float32)
+            cell.to_float(ms.float32)
+
+
+class Redirct:  # minimal write()/flush() sink; get_eval_result() installs it as sys.stdout to capture text
+    def __init__(self):
+        self.content = ""  # everything written since construction or the last flush()
+
+    def write(self, content):
+        self.content += content  # accumulate; nothing is forwarded to the real stdout
+
+    def flush(self):
+        self.content = ""  # NOTE: flush() discards the buffered text instead of emitting it
+
+
+def cpu_affinity(rank_id, device_num):
+    """Pin this process to the contiguous slice of CPU cores owned by rank_id; no-op if cores cannot be split."""
+    import psutil
+    cores = psutil.cpu_count()
+    # cpu_count() may return None when undetermined; device_num <= 0 would divide by zero below.
+    if not cores or device_num <= 0 or cores < device_num:
+        return
+    used_cpu_num = cores // device_num
+    rank_id = rank_id % device_num
+    used_cpu_list = list(range(rank_id * used_cpu_num, (rank_id + 1) * used_cpu_num))
+    psutil.Process().cpu_affinity(used_cpu_list)
+    print(f"==== {rank_id}/{device_num} ==== bind cpu: {used_cpu_list}")
+
+
+class DetectionEngine:
+    """Collect per-image detections, filter them with per-category DIoU-NMS and score them with COCOeval."""
+
+    def __init__(self, args_detection, threshold):
+        """Read eval settings from `args_detection`; `threshold` is the minimum score detect() keeps."""
+        def _opt(name, legacy):
+            # Callers such as postprocess.py pass an argparse namespace with the older option names.
+            return getattr(args_detection, name) if hasattr(args_detection, name) else legacy()
+        self.ignore_threshold = threshold
+        self.results = {}  # img_id -> {category_id: [[x_left, y_top, w, h, score], ...]}
+        self.file_path = ''
+        self.det_boxes = []
+        self.ann_file = args_detection.ann_file
+        self._coco = COCO(self.ann_file)
+        self._img_ids = sorted(self._coco.imgs)
+        self.coco_catids = self._coco.getCatIds()
+        self.coco_catIds = _opt('coco_ids', lambda: self.coco_catids)
+        self.labels = _opt('labels', lambda: [c['name'] for c in self._coco.loadCats(self.coco_catids)])
+        self.num_classes = len(self.labels)
+        self.save_prefix = _opt('output_dir', lambda: args_detection.outputs_dir)
+        self.nms_thresh = _opt('eval_nms_thresh', lambda: args_detection.nms_thresh)
+        self.multi_label = args_detection.multi_label
+        self.multi_label_thresh = args_detection.multi_label_thresh
+
+    def do_nms_for_results(self):
+        """Run DIoU-NMS on every (image, category) bucket of self.results and extend self.det_boxes."""
+        for img_id, per_class in self.results.items():
+            for clsi, boxes in per_class.items():
+                dets = np.array(boxes)  # rows: [x_left, y_top, w, h, score]
+                keep_index = self._diou_nms(dets, thresh=self.nms_thresh)
+                # Emit one result dict per box that survived NMS.
+                self.det_boxes.extend(
+                    {'image_id': int(img_id), 'category_id': int(clsi),
+                     'bbox': list(dets[i][:4].astype(float)),
+                     'score': dets[i][4].astype(float)}
+                    for i in keep_index)
+
+    def _nms(self, predicts, threshold):
+        """Greedy IoU NMS over rows [x, y, w, h, score]; returns the kept row indices, best score first."""
+        # convert xywh -> xmin ymin xmax ymax
+        x1 = predicts[:, 0]
+        y1 = predicts[:, 1]
+        x2 = x1 + predicts[:, 2]
+        y2 = y1 + predicts[:, 3]
+        scores = predicts[:, 4]
+
+        areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+        order = scores.argsort()[::-1]
+
+        reserved_boxes = []
+        while order.size > 0:
+            i = order[0]
+            rest = order[1:]  # remaining candidates, sliced once per round
+            reserved_boxes.append(i)
+            max_x1 = np.maximum(x1[i], x1[rest])
+            max_y1 = np.maximum(y1[i], y1[rest])
+            min_x2 = np.minimum(x2[i], x2[rest])
+            min_y2 = np.minimum(y2[i], y2[rest])
+
+            intersect_w = np.maximum(0.0, min_x2 - max_x1 + 1)
+            intersect_h = np.maximum(0.0, min_y2 - max_y1 + 1)
+            intersect_area = intersect_w * intersect_h
+            ovr = intersect_area / (areas[i] + areas[rest] - intersect_area)
+
+            # Keep only candidates whose overlap with box i does not exceed the threshold.
+            order = rest[np.where(ovr <= threshold)[0]]
+        return reserved_boxes
+
+    def _diou_nms(self, dets, thresh=0.5):
+        """DIoU-NMS over rows [x, y, w, h, score] (xywh is converted to corners); returns kept row indices."""
+        x1 = dets[:, 0]
+        y1 = dets[:, 1]
+        x2 = x1 + dets[:, 2]
+        y2 = y1 + dets[:, 3]
+        scores = dets[:, 4]
+        areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+        order = scores.argsort()[::-1]
+        keep = []
+        while order.size > 0:
+            i, rest = order[0], order[1:]
+            keep.append(i)
+            xx1 = np.maximum(x1[i], x1[rest])
+            yy1 = np.maximum(y1[i], y1[rest])
+            xx2 = np.minimum(x2[i], x2[rest])
+            yy2 = np.minimum(y2[i], y2[rest])
+
+            w = np.maximum(0.0, xx2 - xx1 + 1)
+            h = np.maximum(0.0, yy2 - yy1 + 1)
+            inter = w * h
+            ovr = inter / (areas[i] + areas[rest] - inter)
+            center_x1 = (x1[i] + x2[i]) / 2
+            center_x2 = (x1[rest] + x2[rest]) / 2
+            center_y1 = (y1[i] + y2[i]) / 2
+            center_y2 = (y1[rest] + y2[rest]) / 2
+            inter_diag = (center_x2 - center_x1) ** 2 + (center_y2 - center_y1) ** 2
+            out_max_x = np.maximum(x2[i], x2[rest])
+            out_max_y = np.maximum(y2[i], y2[rest])
+            out_min_x = np.minimum(x1[i], x1[rest])
+            out_min_y = np.minimum(y1[i], y1[rest])
+            outer_diag = (out_max_x - out_min_x) ** 2 + (out_max_y - out_min_y) ** 2
+            # Floor the denominator so coincident zero-size boxes give 0 instead of 0/0 = NaN.
+            diou = ovr - inter_diag / np.maximum(outer_diag, np.finfo(float).eps)
+            diou = np.clip(diou, -1, 1)
+            order = rest[np.where(diou <= thresh)[0]]
+        return keep
+
+    def write_result(self):
+        """Dump self.det_boxes to <save_prefix>/predict_<timestamp>.json and return the file path."""
+        import json
+        t = datetime.datetime.now().strftime('_%Y_%m_%d_%H_%M_%S')
+        self.file_path = self.save_prefix + '/predict' + t + '.json'
+        try:
+            # `with` closes the handle even if json.dump raises part-way through.
+            with open(self.file_path, 'w') as f:
+                json.dump(self.det_boxes, f)
+        except IOError as e:
+            raise RuntimeError("Unable to open json file to dump. What(): {}".format(str(e))) from e
+        return self.file_path
+
+    def get_eval_result(self):
+        """Run COCO bbox evaluation on the file written by write_result() and return the summary text."""
+        coco_gt = COCO(self.ann_file)
+        coco_dt = coco_gt.loadRes(self.file_path)
+        coco_eval = COCOeval(coco_gt, coco_dt, 'bbox')
+        coco_eval.evaluate()
+        coco_eval.accumulate()
+        rdct = Redirct()
+        stdout, sys.stdout = sys.stdout, rdct
+        try:
+            coco_eval.summarize()  # output written to stdout lands in rdct.content
+        finally:
+            sys.stdout = stdout  # always restore, even if summarize() raises
+        return rdct.content
+
+    def detect(self, outputs, batch, image_shape, image_id):
+        """
+        Decode head outputs into self.results[img_id][category_id] = [[x_left, y_top, w, h, score], ...].
+
+        Each array in `outputs` is indexed [batch, ..., 5 + num_classes]; its last axis holds box centre x/y
+        and w/h (scaled by the original width/height here), an objectness score and per-class scores.
+        `image_shape[b]` is unpacked as (width, height) and `image_id[b]` is the image id of sample b.
+        Boxes scoring below `ignore_threshold` are dropped.
+        """
+        for batch_id in range(batch):
+            ori_w, ori_h = image_shape[batch_id]
+            img_id = int(image_id[batch_id])
+            # Register the image even when none of its boxes pass the thresholds.
+            if img_id not in self.results:
+                self.results[img_id] = defaultdict(list)
+            per_class = self.results[img_id]
+            for out_item in outputs:
+                out_item_single = out_item[batch_id, :]
+                x = ori_w * out_item_single[..., 0].reshape(-1)
+                y = ori_h * out_item_single[..., 1].reshape(-1)
+                w = ori_w * out_item_single[..., 2].reshape(-1)
+                h = ori_h * out_item_single[..., 3].reshape(-1)
+                conf = out_item_single[..., 4:5]
+                cls_emb = out_item_single[..., 5:].reshape(-1, self.num_classes)
+                # centre -> top-left corner
+                x_top_left = x - w / 2.
+                y_top_left = y - h / 2.
+                if self.multi_label:
+                    confidence = conf.reshape(-1, 1) * cls_emb
+                    # Every class above multi_label_thresh yields its own box; the ignore_threshold test is
+                    # written as ~(a < b) so it mirrors a scalar `if confi < thr: continue` exactly.
+                    keep = (cls_emb > self.multi_label_thresh) & ~(confidence < self.ignore_threshold)
+                    for i, j in zip(*keep.nonzero()):
+                        x_lefti, y_lefti = max(0, x_top_left[i]), max(0, y_top_left[i])
+                        wi, hi = min(w[i], ori_w), min(h[i], ori_h)
+                        # transform class index to the category id listed in coco_catIds
+                        coco_clsi = self.coco_catIds[j]
+                        per_class[coco_clsi].append([x_lefti, y_lefti, wi, hi, confidence[i][j]])
+                else:
+                    # Single label: keep only the best-scoring class of each box.
+                    cls_argmax = cls_emb.argmax(axis=-1)
+                    # Direct row/column indexing picks that class score without a one-hot boolean mask.
+                    confidence = conf.reshape(-1) * cls_emb[np.arange(cls_emb.shape[0]), cls_argmax]
+                    for i in (~(confidence < self.ignore_threshold)).nonzero()[0]:
+                        x_lefti, y_lefti = max(0, x_top_left[i]), max(0, y_top_left[i])
+                        wi, hi = min(w[i], ori_w), min(h[i], ori_h)
+                        # NOTE(review): this branch maps through coco_catids (ids read from the annotation
+                        # file) while the multi-label branch uses coco_catIds - confirm both lists agree.
+                        coco_clsi = self.coco_catids[cls_argmax[i]]
+                        per_class[coco_clsi].append([x_lefti, y_lefti, wi, hi, confidence[i]])
diff --git a/official/cv/yolov5/src/yolo.py b/official/cv/yolov5/src/yolo.py
index 4071e16464118cbbadc26f304d5b786e449f24d6..cda99991f944df3c67a1b1b4d46e5caf6dbc11e9 100644
--- a/official/cv/yolov5/src/yolo.py
+++ b/official/cv/yolov5/src/yolo.py
@@ -15,14 +15,7 @@
 """YOLOv5 based on DarkNet."""
 import mindspore as ms
 import mindspore.nn as nn
-from mindspore.common.tensor import Tensor
-from mindspore import context
-from mindspore.context import ParallelMode
-from mindspore.parallel._auto_parallel_context import auto_parallel_context
-from mindspore.communication.management import get_group_size
-from mindspore.ops import operations as P
-from mindspore.ops import functional as F
-from mindspore.ops import composite as C
+import mindspore.ops as ops
 
 from src.backbone import YOLOv5Backbone, Conv, BottleneckCSP
 from src.loss import ConfidenceLoss, ClassLoss
@@ -47,7 +40,7 @@ class YOLO(nn.Cell):
         self.back_block2 = YoloBlock(shape[4], self.config.out_channel)
         self.back_block3 = YoloBlock(shape[5], self.config.out_channel)
 
-        self.concat = P.Concat(axis=1)
+        self.concat = ops.Concat(axis=1)
 
     def construct(self, x):
         """
@@ -56,17 +49,17 @@ class YOLO(nn.Cell):
         feature_map2 is (batch_size, backbone_shape[3], h/16, w/16)
         feature_map3 is (batch_size, backbone_shape[4], h/32, w/32)
         """
-        img_height = P.Shape()(x)[2] * 2
-        img_width = P.Shape()(x)[3] * 2
+        img_height = ops.Shape()(x)[2] * 2
+        img_width = ops.Shape()(x)[3] * 2
 
         feature_map1, feature_map2, feature_map3 = self.backbone(x)
 
         c1 = self.conv1(feature_map3)
-        ups1 = P.ResizeNearestNeighbor((img_height // 16, img_width // 16))(c1)
+        ups1 = ops.ResizeNearestNeighbor((img_height // 16, img_width // 16))(c1)
         c2 = self.concat((ups1, feature_map2))
         c3 = self.CSP5(c2)
         c4 = self.conv2(c3)
-        ups2 = P.ResizeNearestNeighbor((img_height // 8, img_width // 8))(c4)
+        ups2 = ops.ResizeNearestNeighbor((img_height // 8, img_width // 8))(c4)
         c5 = self.concat((ups2, feature_map1))
         # out
         c6 = self.CSP6(c5)
@@ -144,35 +137,35 @@ class DetectionBlock(nn.Cell):
             self.offset_x_y = 0.025
         else:
             raise KeyError("Invalid scale value for DetectionBlock")
-        self.anchors = Tensor([self.config.anchor_scales[i] for i in idx], ms.float32)
+        self.anchors = ms.Tensor([self.config.anchor_scales[i] for i in idx], ms.float32)
         self.num_anchors_per_scale = 3
         self.num_attrib = 4+1+self.config.num_classes
         self.lambda_coord = 1
 
         self.sigmoid = nn.Sigmoid()
-        self.reshape = P.Reshape()
-        self.tile = P.Tile()
-        self.concat = P.Concat(axis=-1)
-        self.pow = P.Pow()
+        self.reshape = ops.Reshape()
+        self.tile = ops.Tile()
+        self.concat = ops.Concat(axis=-1)
+        self.pow = ops.Pow()
         self.conf_training = is_training
 
     def construct(self, x, input_shape):
         """construct method"""
-        num_batch = P.Shape()(x)[0]
-        grid_size = P.Shape()(x)[2:4]
+        num_batch = ops.Shape()(x)[0]
+        grid_size = ops.Shape()(x)[2:4]
 
         # Reshape and transpose the feature to [n, grid_size[0], grid_size[1], 3, num_attrib]
-        prediction = P.Reshape()(x, (num_batch,
-                                     self.num_anchors_per_scale,
-                                     self.num_attrib,
-                                     grid_size[0],
-                                     grid_size[1]))
-        prediction = P.Transpose()(prediction, (0, 3, 4, 1, 2))
+        prediction = ops.Reshape()(x, (num_batch,
+                                       self.num_anchors_per_scale,
+                                       self.num_attrib,
+                                       grid_size[0],
+                                       grid_size[1]))
+        prediction = ops.Transpose()(prediction, (0, 3, 4, 1, 2))
 
         range_x = range(grid_size[1])
         range_y = range(grid_size[0])
-        grid_x = P.Cast()(F.tuple_to_array(range_x), ms.float32)
-        grid_y = P.Cast()(F.tuple_to_array(range_y), ms.float32)
+        grid_x = ops.Cast()(ops.tuple_to_array(range_x), ms.float32)
+        grid_y = ops.Cast()(ops.tuple_to_array(range_y), ms.float32)
         # Tensor of shape [grid_size[0], grid_size[1], 1, 1] representing the coordinate of x/y axis for each grid
         # [batch, gridx, gridy, 1, 1]
         grid_x = self.tile(self.reshape(grid_x, (1, 1, -1, 1, 1)), (1, grid_size[0], 1, 1, 1))
@@ -188,9 +181,9 @@ class DetectionBlock(nn.Cell):
         # gridsize1 is x
         # gridsize0 is y
         box_xy = (self.scale_x_y * self.sigmoid(box_xy) - self.offset_x_y + grid) / \
-                 P.Cast()(F.tuple_to_array((grid_size[1], grid_size[0])), ms.float32)
+                 ops.Cast()(ops.tuple_to_array((grid_size[1], grid_size[0])), ms.float32)
         # box_wh is w->h
-        box_wh = P.Exp()(box_wh) * self.anchors / input_shape
+        box_wh = ops.Exp()(box_wh) * self.anchors / input_shape
 
         box_confidence = self.sigmoid(box_confidence)
         box_probs = self.sigmoid(box_probs)
@@ -204,8 +197,8 @@ class Iou(nn.Cell):
     """Calculate the iou of boxes"""
     def __init__(self):
         super(Iou, self).__init__()
-        self.min = P.Minimum()
-        self.max = P.Maximum()
+        self.min = ops.Minimum()
+        self.max = ops.Maximum()
 
     def construct(self, box1, box2):
         """
@@ -215,22 +208,24 @@ class Iou(nn.Cell):
         """
         box1_xy = box1[:, :, :, :, :, :2]
         box1_wh = box1[:, :, :, :, :, 2:4]
-        box1_mins = box1_xy - box1_wh / F.scalar_to_array(2.0) # topLeft
-        box1_maxs = box1_xy + box1_wh / F.scalar_to_array(2.0) # rightDown
+        box1_mins = box1_xy - box1_wh / ops.scalar_to_array(2.0) # topLeft
+        box1_maxs = box1_xy + box1_wh / ops.scalar_to_array(2.0) # rightDown
 
         box2_xy = box2[:, :, :, :, :, :2]
         box2_wh = box2[:, :, :, :, :, 2:4]
-        box2_mins = box2_xy - box2_wh / F.scalar_to_array(2.0)
-        box2_maxs = box2_xy + box2_wh / F.scalar_to_array(2.0)
+        box2_mins = box2_xy - box2_wh / ops.scalar_to_array(2.0)
+        box2_maxs = box2_xy + box2_wh / ops.scalar_to_array(2.0)
 
         intersect_mins = self.max(box1_mins, box2_mins)
         intersect_maxs = self.min(box1_maxs, box2_maxs)
-        intersect_wh = self.max(intersect_maxs - intersect_mins, F.scalar_to_array(0.0))
-        # P.squeeze: for effiecient slice
-        intersect_area = P.Squeeze(-1)(intersect_wh[:, :, :, :, :, 0:1]) * \
-                         P.Squeeze(-1)(intersect_wh[:, :, :, :, :, 1:2])
-        box1_area = P.Squeeze(-1)(box1_wh[:, :, :, :, :, 0:1]) * P.Squeeze(-1)(box1_wh[:, :, :, :, :, 1:2])
-        box2_area = P.Squeeze(-1)(box2_wh[:, :, :, :, :, 0:1]) * P.Squeeze(-1)(box2_wh[:, :, :, :, :, 1:2])
+        intersect_wh = self.max(intersect_maxs - intersect_mins, ops.scalar_to_array(0.0))
+        # ops.Squeeze: for efficient slicing
+        intersect_area = ops.Squeeze(-1)(intersect_wh[:, :, :, :, :, 0:1]) * \
+                         ops.Squeeze(-1)(intersect_wh[:, :, :, :, :, 1:2])
+        box1_area = ops.Squeeze(-1)(box1_wh[:, :, :, :, :, 0:1]) * \
+                    ops.Squeeze(-1)(box1_wh[:, :, :, :, :, 1:2])
+        box2_area = ops.Squeeze(-1)(box2_wh[:, :, :, :, :, 0:1]) * \
+                    ops.Squeeze(-1)(box2_wh[:, :, :, :, :, 1:2])
         iou = intersect_area / (box1_area + box2_area - intersect_area)
         # iou : [batch, gx, gy, anchors, maxboxes]
         return iou
@@ -252,15 +247,15 @@ class YoloLossBlock(nn.Cell):
             idx = (6, 7, 8)
         else:
             raise KeyError("Invalid scale value for DetectionBlock")
-        self.anchors = Tensor([self.config.anchor_scales[i] for i in idx], ms.float32)
-        self.ignore_threshold = Tensor(self.config.ignore_threshold, ms.float32)
-        self.concat = P.Concat(axis=-1)
+        self.anchors = ms.Tensor([self.config.anchor_scales[i] for i in idx], ms.float32)
+        self.ignore_threshold = ms.Tensor(self.config.ignore_threshold, ms.float32)
+        self.concat = ops.Concat(axis=-1)
         self.iou = Iou()
-        self.reduce_max = P.ReduceMax(keep_dims=False)
+        self.reduce_max = ops.ReduceMax(keep_dims=False)
         self.confidence_loss = ConfidenceLoss()
         self.class_loss = ClassLoss()
 
-        self.reduce_sum = P.ReduceSum()
+        self.reduce_sum = ops.ReduceSum()
         self.g_iou = GIou()
 
     def construct(self, prediction, pred_xy, pred_wh, y_true, gt_box, input_shape):
@@ -275,24 +270,23 @@ class YoloLossBlock(nn.Cell):
         class_probs = y_true[:, :, :, :, 5:]
         true_boxes = y_true[:, :, :, :, :4]
 
-        grid_shape = P.Shape()(prediction)[1:3]
-        grid_shape = P.Cast()(F.tuple_to_array(grid_shape[::-1]), ms.float32)
+        grid_shape = ops.Shape()(prediction)[1:3]
+        grid_shape = ops.Cast()(ops.tuple_to_array(grid_shape[::-1]), ms.float32)
 
         pred_boxes = self.concat((pred_xy, pred_wh))
         true_wh = y_true[:, :, :, :, 2:4]
-        true_wh = P.Select()(P.Equal()(true_wh, 0.0),
-                             P.Fill()(P.DType()(true_wh),
-                                      P.Shape()(true_wh), 1.0),
-                             true_wh)
-        true_wh = P.Log()(true_wh / self.anchors * input_shape)
+        true_wh = ops.Select()(ops.Equal()(true_wh, 0.0),
+                               ops.Fill()(ops.DType()(true_wh), ops.Shape()(true_wh), 1.0),
+                               true_wh)
+        true_wh = ops.Log()(true_wh / self.anchors * input_shape)
         # 2-w*h for large picture, use small scale, since small obj need more precise
         box_loss_scale = 2 - y_true[:, :, :, :, 2:3] * y_true[:, :, :, :, 3:4]
 
-        gt_shape = P.Shape()(gt_box)
-        gt_box = P.Reshape()(gt_box, (gt_shape[0], 1, 1, 1, gt_shape[1], gt_shape[2]))
+        gt_shape = ops.Shape()(gt_box)
+        gt_box = ops.Reshape()(gt_box, (gt_shape[0], 1, 1, 1, gt_shape[1], gt_shape[2]))
 
         # add one more dimension for broadcast
-        iou = self.iou(P.ExpandDims()(pred_boxes, -2), gt_box)
+        iou = self.iou(ops.ExpandDims()(pred_boxes, -2), gt_box)
         # gt_box is x,y,h,w after normalize
         # [batch, grid[0], grid[1], num_anchor, num_gt]
         best_iou = self.reduce_max(iou, -1)
@@ -300,26 +294,26 @@ class YoloLossBlock(nn.Cell):
 
         # ignore_mask IOU too small
         ignore_mask = best_iou < self.ignore_threshold
-        ignore_mask = P.Cast()(ignore_mask, ms.float32)
-        ignore_mask = P.ExpandDims()(ignore_mask, -1)
+        ignore_mask = ops.Cast()(ignore_mask, ms.float32)
+        ignore_mask = ops.ExpandDims()(ignore_mask, -1)
         # ignore_mask backpro will cause a lot maximunGrad and minimumGrad time consume.
         # so we turn off its gradient
-        ignore_mask = F.stop_gradient(ignore_mask)
+        ignore_mask = ops.stop_gradient(ignore_mask)
 
         confidence_loss = self.confidence_loss(object_mask, prediction[:, :, :, :, 4:5], ignore_mask)
         class_loss = self.class_loss(object_mask, prediction[:, :, :, :, 5:], class_probs)
 
-        object_mask_me = P.Reshape()(object_mask, (-1, 1))  # [8, 72, 72, 3, 1]
-        box_loss_scale_me = P.Reshape()(box_loss_scale, (-1, 1))
+        object_mask_me = ops.Reshape()(object_mask, (-1, 1))  # [8, 72, 72, 3, 1]
+        box_loss_scale_me = ops.Reshape()(box_loss_scale, (-1, 1))
         pred_boxes_me = xywh2x1y1x2y2(pred_boxes)
-        pred_boxes_me = P.Reshape()(pred_boxes_me, (-1, 4))
+        pred_boxes_me = ops.Reshape()(pred_boxes_me, (-1, 4))
         true_boxes_me = xywh2x1y1x2y2(true_boxes)
-        true_boxes_me = P.Reshape()(true_boxes_me, (-1, 4))
+        true_boxes_me = ops.Reshape()(true_boxes_me, (-1, 4))
         c_iou = self.g_iou(pred_boxes_me, true_boxes_me)
         c_iou_loss = object_mask_me * box_loss_scale_me * (1 - c_iou)
         c_iou_loss_me = self.reduce_sum(c_iou_loss, ())
         loss = c_iou_loss_me * 4 + confidence_loss + class_loss
-        batch_size = P.Shape()(prediction)[0]
+        batch_size = ops.Shape()(prediction)[0]
         return loss / batch_size
 
 
@@ -382,11 +376,11 @@ class YoloWithLossCell(nn.Cell):
         self.loss_big = YoloLossBlock('l', self.config)
         self.loss_me = YoloLossBlock('m', self.config)
         self.loss_small = YoloLossBlock('s', self.config)
-        self.tenser_to_array = P.TupleToArray()
+        self.tenser_to_array = ops.TupleToArray()
 
     def construct(self, x, y_true_0, y_true_1, y_true_2, gt_0, gt_1, gt_2, input_shape):
-        input_shape = F.shape(x)[2:4]
-        input_shape = F.cast(self.tenser_to_array(input_shape) * 2, ms.float32)
+        input_shape = ops.shape(x)[2:4]
+        input_shape = ops.cast(self.tenser_to_array(input_shape) * 2, ms.float32)
 
         yolo_out = self.yolo_network(x, input_shape)
         loss_l = self.loss_big(*yolo_out[0], y_true_0, gt_0, input_shape)
@@ -395,50 +389,17 @@ class YoloWithLossCell(nn.Cell):
         return loss_l + loss_m + loss_s * 0.2
 
 
-class TrainingWrapper(nn.Cell):
-    """Training wrapper."""
-    def __init__(self, network, optimizer, sens=1.0):
-        super(TrainingWrapper, self).__init__(auto_prefix=False)
-        self.network = network
-        self.network.set_grad()
-        self.weights = optimizer.parameters
-        self.optimizer = optimizer
-        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
-        self.sens = sens
-        self.reducer_flag = False
-        self.grad_reducer = None
-        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
-        if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
-            self.reducer_flag = True
-        if self.reducer_flag:
-            mean = context.get_auto_parallel_context("gradients_mean")
-            if auto_parallel_context().get_device_num_is_set():
-                degree = context.get_auto_parallel_context("device_num")
-            else:
-                degree = get_group_size()
-            self.grad_reducer = nn.DistributedGradReducer(optimizer.parameters, mean, degree)
-
-    def construct(self, *args):
-        weights = self.weights
-        loss = self.network(*args)
-        sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens)
-        grads = self.grad(self.network, weights)(*args, sens)
-        if self.reducer_flag:
-            grads = self.grad_reducer(grads)
-        return F.depend(loss, self.optimizer(grads))
-
-
 class GIou(nn.Cell):
     """Calculating giou"""
     def __init__(self):
         super(GIou, self).__init__()
-        self.cast = P.Cast()
-        self.reshape = P.Reshape()
-        self.min = P.Minimum()
-        self.max = P.Maximum()
-        self.concat = P.Concat(axis=1)
-        self.mean = P.ReduceMean()
-        self.div = P.RealDiv()
+        self.cast = ops.Cast()
+        self.reshape = ops.Reshape()
+        self.min = ops.Minimum()
+        self.max = ops.Maximum()
+        self.concat = ops.Concat(axis=1)
+        self.mean = ops.ReduceMean()
+        self.div = ops.RealDiv()
         self.eps = 0.000001
 
     def construct(self, box_p, box_gt):
@@ -462,7 +423,7 @@ class GIou(nn.Cell):
         res_mid0 = c_area - union
         res_mid1 = self.div(self.cast(res_mid0, ms.float32), self.cast(c_area, ms.float32))
         giou = iou - res_mid1
-        giou = C.clip_by_value(giou, -1.0, 1.0)
+        giou = ops.clip_by_value(giou, -1.0, 1.0)
         return giou
 
 
@@ -471,6 +432,6 @@ def xywh2x1y1x2y2(box_xywh):
     boxes_y1 = box_xywh[..., 1:2] - box_xywh[..., 3:4] / 2
     boxes_x2 = box_xywh[..., 0:1] + box_xywh[..., 2:3] / 2
     boxes_y2 = box_xywh[..., 1:2] + box_xywh[..., 3:4] / 2
-    boxes_x1y1x2y2 = P.Concat(-1)((boxes_x1, boxes_y1, boxes_x2, boxes_y2))
+    boxes_x1y1x2y2 = ops.Concat(-1)((boxes_x1, boxes_y1, boxes_x2, boxes_y2))
 
     return boxes_x1y1x2y2
diff --git a/official/cv/yolov5/src/yolo_dataset.py b/official/cv/yolov5/src/yolo_dataset.py
index 75234400b0e72aeea015ed0ad0fafbb960a09ee1..f4c602c5123df740f27ed7fd12b81dbb2b309286 100644
--- a/official/cv/yolov5/src/yolo_dataset.py
+++ b/official/cv/yolov5/src/yolo_dataset.py
@@ -20,8 +20,7 @@ import numpy as np
 import cv2
 from PIL import Image
 from pycocotools.coco import COCO
-import mindspore.dataset as de
-import mindspore.dataset.vision.c_transforms as CV
+import mindspore.dataset as ds
 from src.distributed_sampler import DistributedSampler
 from src.transforms import reshape_fn, MultiScaleTrans, PreprocessTrueBox
 
@@ -225,11 +224,11 @@ class COCOYoloDataset:
         return [x_min, y_min, x_min+w, y_min+h]
 
 
-def create_yolo_dataset(image_dir, anno_path, batch_size, max_epoch, device_num, rank,
+def create_yolo_dataset(image_dir, anno_path, batch_size, device_num, rank,
                         config=None, is_training=True, shuffle=True):
     """Create dataset for YOLOV5."""
     cv2.setNumThreads(0)
-    de.config.set_enable_shared_mem(True)
+    ds.config.set_enable_shared_mem(True)
     if is_training:
         filter_crowd = True
         remove_empty_anno = True
@@ -241,7 +240,7 @@ def create_yolo_dataset(image_dir, anno_path, batch_size, max_epoch, device_num,
                                    remove_images_without_annotations=remove_empty_anno, is_training=is_training)
     distributed_sampler = DistributedSampler(len(yolo_dataset), device_num, rank, shuffle=shuffle)
     yolo_dataset.size = len(distributed_sampler)
-    hwc_to_chw = CV.HWC2CHW()
+    hwc_to_chw = ds.vision.c_transforms.HWC2CHW()
 
     config.dataset_size = len(yolo_dataset)
     cores = multiprocessing.cpu_count()
@@ -258,34 +257,34 @@ def create_yolo_dataset(image_dir, anno_path, batch_size, max_epoch, device_num,
         map2_out_column_names = ["annotation", "bbox1", "bbox2", "bbox3",
                                  "gt_box1", "gt_box2", "gt_box3"]
 
-        ds = de.GeneratorDataset(yolo_dataset, column_names=dataset_column_names, sampler=distributed_sampler,
-                                 python_multiprocessing=True, num_parallel_workers=min(4, num_parallel_workers))
-        ds = ds.map(operations=multi_scale_trans, input_columns=dataset_column_names,
-                    output_columns=map1_out_column_names, column_order=map1_out_column_names,
-                    num_parallel_workers=min(12, num_parallel_workers), python_multiprocessing=True)
-        ds = ds.map(operations=PreprocessTrueBox(config), input_columns=map2_in_column_names,
-                    output_columns=map2_out_column_names, column_order=output_column_names,
-                    num_parallel_workers=min(4, num_parallel_workers), python_multiprocessing=False)
+        dataset = ds.GeneratorDataset(yolo_dataset, column_names=dataset_column_names, sampler=distributed_sampler,
+                                      python_multiprocessing=True, num_parallel_workers=min(4, num_parallel_workers))
+        dataset = dataset.map(operations=multi_scale_trans, input_columns=dataset_column_names,
+                              output_columns=map1_out_column_names, column_order=map1_out_column_names,
+                              num_parallel_workers=min(12, num_parallel_workers), python_multiprocessing=True)
+        dataset = dataset.map(operations=PreprocessTrueBox(config), input_columns=map2_in_column_names,
+                              output_columns=map2_out_column_names, column_order=output_column_names,
+                              num_parallel_workers=min(4, num_parallel_workers), python_multiprocessing=False)
         mean = [m * 255 for m in [0.485, 0.456, 0.406]]
         std = [s * 255 for s in [0.229, 0.224, 0.225]]
-        ds = ds.map([CV.Normalize(mean, std),
-                     hwc_to_chw], num_parallel_workers=min(4, num_parallel_workers))
+        dataset = dataset.map([ds.vision.c_transforms.Normalize(mean, std), hwc_to_chw],
+                              num_parallel_workers=min(4, num_parallel_workers))
 
         def concatenate(images):
             images = np.concatenate((images[..., ::2, ::2], images[..., 1::2, ::2],
                                      images[..., ::2, 1::2], images[..., 1::2, 1::2]), axis=0)
             return images
-        ds = ds.map(operations=concatenate, input_columns="image", num_parallel_workers=min(4, num_parallel_workers))
-        ds = ds.batch(batch_size, num_parallel_workers=min(4, num_parallel_workers), drop_remainder=True)
+        dataset = dataset.map(operations=concatenate, input_columns="image",
+                              num_parallel_workers=min(4, num_parallel_workers))
+        dataset = dataset.batch(batch_size, num_parallel_workers=min(4, num_parallel_workers), drop_remainder=True)
     else:
-        ds = de.GeneratorDataset(yolo_dataset, column_names=["image", "img_id"],
-                                 sampler=distributed_sampler)
+        dataset = ds.GeneratorDataset(yolo_dataset, column_names=["image", "img_id"],
+                                      sampler=distributed_sampler)
         compose_map_func = (lambda image, img_id: reshape_fn(image, img_id, config))
-        ds = ds.map(operations=compose_map_func, input_columns=["image", "img_id"],
-                    output_columns=["image", "image_shape", "img_id"],
-                    column_order=["image", "image_shape", "img_id"],
-                    num_parallel_workers=8)
-        ds = ds.map(operations=hwc_to_chw, input_columns=["image"], num_parallel_workers=8)
-        ds = ds.batch(batch_size, drop_remainder=True)
-    ds = ds.repeat(max_epoch)
-    return ds, len(yolo_dataset)
+        dataset = dataset.map(operations=compose_map_func, input_columns=["image", "img_id"],
+                              output_columns=["image", "image_shape", "img_id"],
+                              column_order=["image", "image_shape", "img_id"],
+                              num_parallel_workers=8)
+        dataset = dataset.map(operations=hwc_to_chw, input_columns=["image"], num_parallel_workers=8)
+        dataset = dataset.batch(batch_size, drop_remainder=True)
+    return dataset
diff --git a/official/cv/yolov5/src/yolov5_backbone.py b/official/cv/yolov5/src/yolov5_backbone.py
index 9f3f6fa5618fff42bc5c39db3f9e00763b11baa2..f0875ab0426ac49afde8be199551e2207b691cd4 100644
--- a/official/cv/yolov5/src/yolov5_backbone.py
+++ b/official/cv/yolov5/src/yolov5_backbone.py
@@ -14,7 +14,7 @@
 # ============================================================================
 """DarkNet model."""
 import mindspore.nn as nn
-from mindspore.ops import operations as P
+import mindspore.ops as ops
 
 
 class Concat(nn.Cell):
@@ -22,7 +22,7 @@ class Concat(nn.Cell):
     def __init__(self, dimension=1):
         super(Concat, self).__init__()
         self.d = dimension
-        self.concat = P.Concat(self.d)
+        self.concat = ops.Concat(self.d)
 
     def forward(self, x):
         return self.concat
@@ -53,7 +53,7 @@ class BottleneckCSP(nn.Cell):
         self.bn = nn.BatchNorm2d(2 * c_, momentum=0.9, eps=1e-5)  # applied to cat(cv2, cv3)
         self.act = nn.LeakyReLU(0.1)
         self.m = nn.SequentialCell([Bottleneck(c_, c_, shortcut, e=1.0) for _ in range(n)])
-        self.concat = P.Concat(1)
+        self.concat = ops.Concat(1)
 
     def construct(self, x):
         y1 = self.cv3(self.m(self.cv1(x)))
@@ -71,7 +71,7 @@ class C3(nn.Cell):
         self.cv2 = Conv(c1, c_, 1, 1)
         self.cv3 = Conv(2 * c_, c2, 1)  # act=FReLU(c2)
         self.m = nn.SequentialCell([Bottleneck(c_, c_, shortcut, e=1.0) for _ in range(n)])
-        self.concat = P.Concat(1)
+        self.concat = ops.Concat(1)
 
     def construct(self, x):
         y1 = self.m(self.cv1(x))
@@ -91,7 +91,7 @@ class SPP(nn.Cell):
         self.maxpool1 = nn.MaxPool2d(kernel_size=5, stride=1, pad_mode='same')
         self.maxpool2 = nn.MaxPool2d(kernel_size=9, stride=1, pad_mode='same')
         self.maxpool3 = nn.MaxPool2d(kernel_size=13, stride=1, pad_mode='same')
-        self.concat = P.Concat(1)
+        self.concat = ops.Concat(1)
 
     def construct(self, x):
         x = self.cv1(x)
@@ -107,11 +107,11 @@ class Focus(nn.Cell):
     def __init__(self, c1, c2, k=1, s=1, p=None, act=True):
         super(Focus, self).__init__()
         self.conv = Conv(c1 * 4, c2, k, s, p, act)
-        self.concat = P.Concat(1)
+        self.concat = ops.Concat(1)
 
     def construct(self, x):
-        w = P.Shape()(x)[2]
-        h = P.Shape()(x)[3]
+        w = ops.Shape()(x)[2]
+        h = ops.Shape()(x)[3]
         concat4 = self.concat((x[..., 0:w:2, 0:h:2], x[..., 1:w:2, 0:h:2], x[..., 0:w:2, 1:h:2], x[..., 1:w:2, 1:h:2]))
         return self.conv(concat4)
 
@@ -129,7 +129,7 @@ class Focusv2(nn.Cell):
 class SiLU(nn.Cell):
     def __init__(self):
         super(SiLU, self).__init__()
-        self.sigmoid = P.Sigmoid()
+        self.sigmoid = ops.Sigmoid()
 
     def construct(self, x):
         return x * self.sigmoid(x)
@@ -160,7 +160,7 @@ class Conv(nn.Cell):
             self.pad_mode = 'pad'
         self.conv = nn.Conv2d(c1, c2, k, s, padding=self.padding, pad_mode=self.pad_mode, has_bias=False)
         self.bn = nn.BatchNorm2d(c2, momentum=momentum, eps=eps)
-        self.act = SiLU() if act is True else (act if isinstance(act, nn.Cell) else P.Identity())
+        self.act = SiLU() if act is True else (act if isinstance(act, nn.Cell) else ops.operations.Identity())
 
     def construct(self, x):
         return self.act(self.bn(self.conv(x)))
@@ -172,8 +172,8 @@ class YOLOv5Backbone(nn.Cell):
         super(YOLOv5Backbone, self).__init__()
 
         # self.outchannel = 1024
-        # self.concat = P.Concat(axis=1)
-        # self.add = P.TensorAdd()
+        # self.concat = ops.Concat(axis=1)
+        # self.add = ops.TensorAdd()
 
         self.focusv2 = Focusv2(3, 32, k=3, s=1)
         self.conv1 = Conv(32, 64, k=3, s=2)
diff --git a/official/cv/yolov5/train.py b/official/cv/yolov5/train.py
index 1331afaa294f80ac80d5e8313a947200763f8711..e95abf1db6f2d262a735aa60aa7b2e14173eb071 100644
--- a/official/cv/yolov5/train.py
+++ b/official/cv/yolov5/train.py
@@ -15,248 +15,116 @@
 """YoloV5 train."""
 import os
 import time
-import datetime
 import mindspore as ms
-from mindspore.context import ParallelMode
-from mindspore.nn import Momentum
-from mindspore import Tensor
-from mindspore import context
-from mindspore.communication.management import init, get_rank, get_group_size
-from mindspore.train.callback import ModelCheckpoint, RunContext
-from mindspore.train.callback import _InternalCallbackParam, CheckpointConfig
+import mindspore.nn as nn
+import mindspore.communication as comm
 
-from src.yolo import YOLOV5, YoloWithLossCell, TrainingWrapper
+from src.yolo import YOLOV5, YoloWithLossCell
 from src.logger import get_logger
-from src.util import AverageMeter, get_param_groups
+from src.util import AverageMeter, get_param_groups, cpu_affinity
 from src.lr_scheduler import get_lr
 from src.yolo_dataset import create_yolo_dataset
 from src.initializer import default_recurisive_init, load_yolov5_params
 
 from model_utils.config import config
-from model_utils.moxing_adapter import moxing_wrapper
-from model_utils.device_adapter import get_device_id, get_device_num
+from model_utils.device_adapter import get_device_id
+
+# Only needed when running on Huawei Cloud ModelArts.
+from model_utils.moxing_adapter import moxing_wrapper, modelarts_pre_process, modelarts_post_process
+
 
 ms.set_seed(1)
 
-def cpu_affinity(rank_id, device_num):
-    """Bind CPU cores according to rank_id and device_num."""
-    import psutil
-    cores = psutil.cpu_count()
-    if cores < device_num:
-        return
-    process = psutil.Process()
-    used_cpu_num = cores // device_num
-    rank_id = rank_id % device_num
-    used_cpu_list = [i for i in range(rank_id * used_cpu_num, (rank_id + 1) * used_cpu_num)]
-    process.cpu_affinity(used_cpu_list)
-    print(f"==== {rank_id}/{device_num} ==== bind cpu: {used_cpu_list}")
 
-def set_default():
+def init_distribute():
+    comm.init()
+    config.rank = comm.get_rank()
+    config.group_size = comm.get_group_size()
+    ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.DATA_PARALLEL, gradients_mean=True,
+                                 device_num=config.group_size)
+
+
+def train_preprocess():
     if config.lr_scheduler == 'cosine_annealing' and config.max_epoch > config.T_max:
         config.T_max = config.max_epoch
 
     config.lr_epochs = list(map(int, config.lr_epochs.split(',')))
-
-    if config.is_modelArts:
-        config.data_root = os.path.join(config.data_dir, 'train2017')
-        config.annFile = os.path.join(config.data_dir, 'annotations')
-        outputs_dir = os.path.join(config.outputs_dir, config.ckpt_path)
-    else:
-        config.data_root = os.path.join(config.data_dir, 'train2017')
-        config.annFile = os.path.join(config.data_dir, 'annotations/instances_train2017.json')
-        outputs_dir = config.ckpt_path
-
+    config.data_root = os.path.join(config.data_dir, 'train2017')
+    config.annFile = os.path.join(config.data_dir, 'annotations/instances_train2017.json')
     device_id = get_device_id()
-    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target,
-                        save_graphs=False, device_id=device_id)
-    # init distributed
+    ms.set_context(mode=ms.GRAPH_MODE, device_target=config.device_target, device_id=device_id)
+
     if config.is_distributed:
-        init()
-        config.rank = get_rank()
-        config.group_size = get_group_size()
-        context.reset_auto_parallel_context()
-        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
-                                          device_num=config.group_size)
+        # init distributed
+        init_distribute()
 
+    # bind CPU cores per rank to improve performance on GPU devices
     if config.device_target == "GPU" and config.bind_cpu:
         cpu_affinity(config.rank, min(config.group_size, config.device_num))
 
-    config.rank_save_ckpt_flag = 0
-    if config.is_save_on_master:
-        if config.rank == 0:
-            config.rank_save_ckpt_flag = 1
-    else:
-        config.rank_save_ckpt_flag = 1
-
-    # logger
-    config.outputs_dir = os.path.join(outputs_dir, datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
-    config.logger = get_logger(config.outputs_dir, config.rank)
+    # the logger is stored on config so that other functions can use it, e.g. config.logger.info("xxx")
+    config.logger = get_logger(config.output_dir, config.rank)
     config.logger.save_args(config)
 
-def convert_training_shape(args_training_shape):
-    training_shape = [int(args_training_shape), int(args_training_shape)]
-    return training_shape
 
-def modelarts_pre_process():
-    '''modelarts pre process function.'''
-    def unzip(zip_file, save_dir):
-        import zipfile
-        s_time = time.time()
-        if not os.path.exists(os.path.join(save_dir, config.modelarts_dataset_unzip_name)):
-            zip_isexist = zipfile.is_zipfile(zip_file)
-            if zip_isexist:
-                fz = zipfile.ZipFile(zip_file, 'r')
-                data_num = len(fz.namelist())
-                print("Extract Start...")
-                print("unzip file num: {}".format(data_num))
-                data_print = int(data_num / 100) if data_num > 100 else 1
-                i = 0
-                for file in fz.namelist():
-                    if i % data_print == 0:
-                        print("unzip percent: {}%".format(int(i * 100 / data_num)), flush=True)
-                    i += 1
-                    fz.extract(file, save_dir)
-                print("cost time: {}min:{}s.".format(int((time.time() - s_time) / 60),
-                                                     int(int(time.time() - s_time) % 60)))
-                print("Extract Done.")
-            else:
-                print("This is not zip.")
-        else:
-            print("Zip has been extracted.")
-
-    if config.need_modelarts_dataset_unzip:
-        zip_file_1 = os.path.join(config.data_path, config.modelarts_dataset_unzip_name + ".zip")
-        save_dir_1 = os.path.join(config.data_path)
-
-        sync_lock = "/tmp/unzip_sync.lock"
-
-        # Each server contains 8 devices as most.
-        if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
-            print("Zip file path: ", zip_file_1)
-            print("Unzip file save dir: ", save_dir_1)
-            unzip(zip_file_1, save_dir_1)
-            print("===Finish extract data synchronization===")
-            try:
-                os.mknod(sync_lock)
-            except IOError:
-                pass
-
-        while True:
-            if os.path.exists(sync_lock):
-                break
-            time.sleep(1)
-
-        print("Device: {}, Finish sync unzip data from {} to {}.".format(get_device_id(), zip_file_1, save_dir_1))
-
-    config.ckpt_path = os.path.join(config.output_path, config.ckpt_path)
-
-@moxing_wrapper(pre_process=modelarts_pre_process)
+@moxing_wrapper(pre_process=modelarts_pre_process, post_process=modelarts_post_process, pre_args=[config])
 def run_train():
-    set_default()
-    loss_meter = AverageMeter('loss')
-
-    if config.is_modelArts:
-        import moxing as mox
-        local_data_url = os.path.join(config.data_path, str(config.rank))
-        local_annFile = os.path.join(config.data_path, str(config.rank))
-        mox.file.copy_parallel(config.data_root, local_data_url)
-        config.data_root = local_data_url
-
-        mox.file.copy_parallel(config.annFile, local_annFile)
-        config.annFile = os.path.join(local_data_url, 'instances_train2017.json')
+    train_preprocess()
 
+    loss_meter = AverageMeter('loss')
     dict_version = {'yolov5s': 0, 'yolov5m': 1, 'yolov5l': 2, 'yolov5x': 3}
     network = YOLOV5(is_training=True, version=dict_version[config.yolov5_version])
     # default is kaiming-normal
     default_recurisive_init(network)
     load_yolov5_params(config, network)
-
     network = YoloWithLossCell(network)
 
-    config.label_smooth = config.label_smooth
-    config.label_smooth_factor = config.label_smooth_factor
-
-    if config.training_shape:
-        config.multi_scale = [convert_training_shape(config.training_shape)]
-    if config.resize_rate:
-        config.resize_rate = config.resize_rate
-
-    ds, data_size = create_yolo_dataset(image_dir=config.data_root, anno_path=config.annFile, is_training=True,
-                                        batch_size=config.per_batch_size, max_epoch=config.max_epoch,
-                                        device_num=config.group_size, rank=config.rank, config=config)
-
+    ds = create_yolo_dataset(image_dir=config.data_root, anno_path=config.annFile, is_training=True,
+                             batch_size=config.per_batch_size, device_num=config.group_size,
+                             rank=config.rank, config=config)
     config.logger.info('Finish loading dataset')
 
-    config.steps_per_epoch = int(data_size / config.per_batch_size / config.group_size)
-
-    if config.ckpt_interval <= 0:
-        config.ckpt_interval = config.steps_per_epoch
-
-    lr = get_lr(config)
-
-    opt = Momentum(params=get_param_groups(network), momentum=config.momentum, learning_rate=Tensor(lr),
-                   weight_decay=config.weight_decay, loss_scale=config.loss_scale)
-
-    network = TrainingWrapper(network, opt, config.loss_scale // 2)
+    steps_per_epoch = ds.get_dataset_size()
+    lr = get_lr(config, steps_per_epoch)
+    opt = nn.Momentum(params=get_param_groups(network), momentum=config.momentum, learning_rate=ms.Tensor(lr),
+                      weight_decay=config.weight_decay, loss_scale=config.loss_scale)
+    network = nn.TrainOneStepCell(network, opt, config.loss_scale // 2)
     network.set_train()
 
-    if config.rank_save_ckpt_flag:
-        # checkpoint save
-        ckpt_max_num = config.max_epoch * config.steps_per_epoch // config.ckpt_interval
-        ckpt_config = CheckpointConfig(save_checkpoint_steps=config.ckpt_interval, keep_checkpoint_max=1)
-        save_ckpt_path = os.path.join(config.outputs_dir, 'ckpt_' + str(config.rank) + '/')
-        ckpt_cb = ModelCheckpoint(config=ckpt_config, directory=save_ckpt_path, prefix='{}'.format(config.rank))
-        cb_params = _InternalCallbackParam()
-        cb_params.train_network = network
-        cb_params.epoch_num = ckpt_max_num
-        cb_params.cur_epoch_num = 1
-        run_context = RunContext(cb_params)
-        ckpt_cb.begin(run_context)
-
-    old_progress = -1
+    data_loader = ds.create_dict_iterator()
+    first_step = True
     t_end = time.time()
-    data_loader = ds.create_dict_iterator(output_numpy=True, num_epochs=1)
-
-    for i, data in enumerate(data_loader):
-        images = data["image"]
-        input_shape = images.shape[2:4]
-        images = Tensor.from_numpy(images)
-        batch_y_true_0 = Tensor.from_numpy(data['bbox1'])
-        batch_y_true_1 = Tensor.from_numpy(data['bbox2'])
-        batch_y_true_2 = Tensor.from_numpy(data['bbox3'])
-        batch_gt_box0 = Tensor.from_numpy(data['gt_box1'])
-        batch_gt_box1 = Tensor.from_numpy(data['gt_box2'])
-        batch_gt_box2 = Tensor.from_numpy(data['gt_box3'])
-        input_shape = Tensor(tuple(input_shape[::-1]), ms.float32)
-        loss = network(images, batch_y_true_0, batch_y_true_1, batch_y_true_2, batch_gt_box0, batch_gt_box1,
-                       batch_gt_box2, input_shape)
-        loss_meter.update(loss.asnumpy())
 
-        if config.rank_save_ckpt_flag:
-            # ckpt progress
-            cb_params.cur_step_num = i + 1  # current step number
-            cb_params.batch_num = i + 2
-            ckpt_cb.step_end(run_context)
-
-        if i % config.log_interval == 0:
-            time_used = time.time() - t_end
-            epoch = int(i / config.steps_per_epoch)
-            fps = config.per_batch_size * (i - old_progress) * config.group_size / time_used
-            per_step_time = time_used / config.log_interval * 1000
-            if config.rank == 0:
-                config.logger.info(
-                    'epoch[{}], iter[{}], {}, {:.2f} imgs/sec, lr:{},'
-                    ' per step time:{}ms'.format(epoch, i, loss_meter, fps, lr[i], per_step_time))
-            t_end = time.time()
-            loss_meter.reset()
-            old_progress = i
-
-        if (i + 1) % config.steps_per_epoch == 0 and config.rank_save_ckpt_flag:
-            cb_params.cur_epoch_num += 1
+    for epoch_idx in range(config.max_epoch):
+        for step_idx, data in enumerate(data_loader):
+            images = data["image"]
+            input_shape = ms.Tensor(tuple(images.shape[2:4][::-1]), ms.float32)
+            loss = network(images, data['bbox1'], data['bbox2'], data['bbox3'], data['gt_box1'], data['gt_box2'],
+                           data['gt_box3'], input_shape)
+            loss_meter.update(loss.asnumpy())
+
+            # log loss/performance every config.log_interval steps; lr is the per-step schedule given to the
+            # optimizer, so it is looked up by the global step rather than by the step index inside the epoch.
+            cur_step = epoch_idx * steps_per_epoch + step_idx
+            if cur_step % config.log_interval == 0:
+                time_used = time.time() - t_end
+                if first_step:
+                    fps = config.per_batch_size * config.group_size / time_used
+                    per_step_time = time_used * 1000
+                    first_step = False
+                else:
+                    fps = config.per_batch_size * config.log_interval * config.group_size / time_used
+                    per_step_time = time_used / config.log_interval * 1000
+                config.logger.info('epoch[{}], iter[{}], {}, fps:{:.2f} imgs/sec, '
+                                   'lr:{}, per step time: {}ms'.format(epoch_idx + 1, step_idx + 1,
+                                                                       loss_meter, fps, lr[cur_step], per_step_time))
+                t_end = time.time()
+                loss_meter.reset()
+        if config.rank == 0:
+            ckpt_name = os.path.join(config.output_dir, "yolov5_{}_{}.ckpt".format(epoch_idx + 1, steps_per_epoch))
+            ms.save_checkpoint(network, ckpt_name)
 
-    if config.is_modelArts:
-        mox.file.copy_parallel(src_url='/cache/outputs/', dst_url='obs://hit-cyf/yolov5_npu/outputs/')
     config.logger.info('==========end training===============')
 
+
 if __name__ == "__main__":
     run_train()