diff --git a/official/cv/yolov5/README.md b/official/cv/yolov5/README.md
index fb8d8b39ce7d89d153f065e62cfdfa192d0e2814..21e5517e464f1ec0591e2d4fe6af7258195aab60 100644
--- a/official/cv/yolov5/README.md
+++ b/official/cv/yolov5/README.md
@@ -41,7 +41,7 @@ Note that you can run the scripts with **COCO2017 **or any other datasets with t
 After installing MindSpore via the official website, you can start training and evaluation as follows:

 ```bash
-#run training example(1p) by python command
+# run training example(1p) on Ascend by python command
 python train.py \
     --data_dir=xxx/dataset \
     --is_distributed=0 \
@@ -56,17 +56,28 @@ python train.py \
 ```bash
 # For Ascend device, distributed training example(8p) by shell script
 bash run_distribute_train.sh xxx/dataset/ xxx/cspdarknet.ckpt rank_table_8pcs.json
+
+# For GPU device, distributed training example(8p) by shell script
+bash run_distribute_train_gpu.sh xxx/dataset [RANK_SIZE]
 ```

 ```bash
-# run evaluation by python command
+# run evaluation on Ascend by python command
 python eval.py \
     --data_dir=xxx/dataset \
-    --testing_shape=640 > log.txt 2>&1 &
+    --eval_shape=640 > log.txt 2>&1 &
+
+# run evaluation on GPU by python command
+python eval.py \
+    --device_target="GPU" \
+    --data_dir=xxx/dataset \
+    --yolov5_version='yolov5s' \
+    --pretrained="***/*.ckpt" \
+    --eval_shape=640 > log.txt 2>&1 &
 ```

 ```bash
-# run evaluation by shell script
+# run evaluation on Ascend by shell script
 bash run_eval.sh xxx/dataset xxx/yolov5.ckpt
 ```

@@ -78,14 +89,20 @@ bash run_eval.sh xxx/dataset xxx/yolov5.ckpt
 ├── model_zoo
     ├── README.md                            // descriptions about all the models
     ├── yolov5
-        ├── README.md                        // descriptions about yolov5
+        ├── README.md                        // descriptions about yolov5
         ├── scripts
         │   ├──run_distribute_train.sh       // launch distributed training(8p) in ascend
-        │   ├──run_eval.sh                   // shell script for evaluation
+        │   ├──run_distribute_train_gpu.sh   // launch distributed training(8p) in GPU
+        │   ├──run_standalone_train.sh       // launch 1p training in ascend
+        │   ├──run_eval.sh                   // shell script for evaluation
         │   ├──rank_table_8pcs.json          // the example of rank table settings for 8p training
+        ├──model_utils
+        │   ├──config.py                     // getting config parameters
+        │   ├──device_adapter.py             // getting device info
+        │   ├──local_adapter.py              // getting device info
+        │   ├──moxing_adapter.py             // Decorator
         ├── src
-        │   ├──config.py                     // parameter configuration
-        │   ├──backbone.py                   // backbone of network
+        │   ├──backbone.py                   // backbone of network
         │   ├──distributed_sampler.py        // iterator of dataset
         │   ├──initializer.py                // initializer of parameters
         │   ├──logger.py                     // log function
@@ -95,9 +112,10 @@ bash run_eval.sh xxx/dataset xxx/yolov5.ckpt
         │   ├──util.py                       // util function
         │   ├──yolo.py                       // yolov5 network
         │   ├──yolo_dataset.py               // create dataset for YOLOV5
-        ├── train.py                         // training script
-        ├── eval.py                          // evaluation script
-        ├── export.py                        // export script
+        ├── default_config.yaml              // parameter configuration
+        ├── train.py                         // training script
+        ├── eval.py                          // evaluation script
+        ├── export.py                        // export script
 ```

 ## [Script Parameters](#contents)

@@ -108,7 +126,7 @@ Major parameters in train.py are:
 optional arguments:
   --device_target       device where the code will be implemented: "Ascend", default is "Ascend"
-  --data_dir            Train dataset directory.
+  --data_dir            Train dataset directory.
   --per_batch_size      Batch size for Training. Default: 8.
   --pretrained_backbone The ckpt file of CSPDarknet53. Default: "".
  --resume_yolov5       The ckpt file of YOLOv5, which is used to fine-tune. Default: ""
@@ -157,6 +175,8 @@ python train.py \
     --lr_scheduler=cosine_annealing > log.txt 2>&1 &
 ```

+You should fine-tune the parameters when running 1p training on GPU.
+
 The python command above will run in the background, you can view the results through the file `log.txt`.

 After training, you'll get some checkpoint files under the **outputs** folder by default. The loss value will be achieved as follows:
@@ -224,7 +244,7 @@ Before running the command below, please check the checkpoint path used for eval
 python eval.py \
     --data_dir=xxx/dataset \
     --pretrained=xxx/yolov5.ckpt \
-    --testing_shape=640 > log.txt 2>&1 &
+    --eval_shape=640 > log.txt 2>&1 &
 OR
 # run evaluation by shell script
 bash run_eval.sh xxx/dataset xxx/yolov5.ckpt
@@ -258,34 +278,34 @@ Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.677

 YOLOv5 on 118K images(The annotation and data format must be the same as coco2017)

-| Parameters                 | YOLOv5                                                        |
-| -------------------------- | ------------------------------------------------------------ |
-| Resource                   | Ascend 910 ；CPU 2.60GHz，192cores; Memory, 755G              |
-| uploaded Date              | 7/12/2021 (month/day/year)                                    |
-| MindSpore Version          | 1.2.0                                                         |
-| Dataset                    | 118K images                                                   |
-| Training Parameters        | epoch=300, batch_size=8, lr=0.02,momentum=0.9,warmup_epoch=20 |
-| Optimizer                  | Momentum                                                      |
-| Loss Function              | Sigmoid Cross Entropy with logits, Giou Loss                  |
-| outputs                    | boxes and label                                               |
-| Loss                       | 111.970097                                                    |
-| Speed                      | 8p about 450 FPS                                              |
-| Total time                 | 8p 21h28min                                                   |
-| Checkpoint for Fine tuning | 53.62M (.ckpt file)                                           |
-| Scripts                    | https://gitee.com/mindspore/mindspore/tree/master/model_zoo/  |
+| Parameters                 | YOLOv5s                                                         | YOLOv5s                                                      |
+| -------------------------- | --------------------------------------------------------------- | ------------------------------------------------------------ |
+| Resource                   | Ascend 910 ；CPU 2.60GHz，192cores; Memory, 755G                | GPU NV SMX2 V100-32G                                          |
+| uploaded Date              | 7/12/2021 (month/day/year)                                      | 9/15/2021 (month/day/year)                                    |
+| MindSpore Version          | 1.2.0                                                           | 1.3.0                                                         |
+| Dataset                    | 118K images                                                     | 118K images                                                   |
+| Training Parameters        | epoch=300, batch_size=8, lr=0.02, momentum=0.9, warmup_epoch=20 | epoch=300, batch_size=32, lr=0.025, warmup_epoch=20, 8p       |
+| Optimizer                  | Momentum                                                        | Momentum                                                      |
+| Loss Function              | Sigmoid Cross Entropy with logits, Giou Loss                    | Sigmoid Cross Entropy with logits, Giou Loss                  |
+| outputs                    | boxes and label                                                 | boxes and label                                               |
+| Loss                       | 111.970097                                                      | 85                                                            |
+| Speed                      | 8p about 450 FPS                                                | 8p about 290 FPS                                              |
+| Total time                 | 8p 21h28min                                                     | 8p 35h                                                        |
+| Checkpoint for Fine tuning | 53.62M (.ckpt file)                                             | 58.87M (.ckpt file)                                           |
+| Scripts                    | https://gitee.com/mindspore/mindspore/tree/master/model_zoo/    | https://gitee.com/mindspore/mindspore/tree/master/model_zoo/  |

 ### Inference Performance

-| Parameters          | YOLOv5                                           |
-| ------------------- | ------------------------------------------------ |
-| Resource            | Ascend 910 ；CPU 2.60GHz，192cores; Memory, 755G |
-| Uploaded Date       | 7/12/2021 (month/day/year)                       |
-| MindSpore Version   | 1.2.0                                            |
-| Dataset             | 20K images                                       |
-| batch_size          | 1                                                |
-| outputs             | box position and sorces, and probability         |
-| Accuracy            | mAP >= 36.7%(shape=640)                          |
-| Model for inference | 56.67M (.ckpt file)                              |
+| Parameters          | YOLOv5s                                          | YOLOv5s                                       |
+| ------------------- | ------------------------------------------------ | --------------------------------------------- |
+| Resource            | Ascend 910 ；CPU 2.60GHz，192cores; Memory, 755G | GPU NV SMX2 V100-32G                          |
+| Uploaded Date       | 7/12/2021 (month/day/year)                       | 9/15/2021 (month/day/year)                    |
+| MindSpore Version   | 1.2.0                                            | 1.3.0                                         |
+| Dataset             | 20K images                                       | 20K images                                    |
+| batch_size          | 1                                                | 1                                             |
+| outputs             | box position and scores, and probability         | box position and scores, and probability      |
+| Accuracy            | mAP >= 36.7%(shape=640)                          | mAP >= 36.7%(shape=640)                       |
+| Model for inference | 56.67M (.ckpt file)                              | 58.87M (.ckpt file)                           |

 ### Transfer Learning

diff --git a/official/cv/yolov5/default_config.yaml b/official/cv/yolov5/default_config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..abbfc9d218e9b470471f144a9b5fe37838f88561
--- /dev/null
+++ b/official/cv/yolov5/default_config.yaml
@@ -0,0 +1,185 @@
+# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
+enable_modelarts: False
+# Url for modelarts
+data_url: ""
+train_url: ""
+checkpoint_url: ""
+# Path for local
+output_dir: "/cache"
+data_path: "/cache/data"
+output_path: "/cache/train"
+load_path: "/cache/checkpoint_path"
+device_target: "Ascend"
+need_modelarts_dataset_unzip: True
+modelarts_dataset_unzip_name: "coco"
+
+# ==============================================================================
+# Train options
+data_dir: "/data/coco"
+per_batch_size: 32
+yolov5_version: "yolov5s"
+pretrained_backbone: ""
+resume_yolov5: ""
+pretrained_checkpoint: ""
+
+lr_scheduler: "cosine_annealing"
+lr: 0.013
+lr_epochs: "220,250"
+lr_gamma: 0.1
+eta_min: 0.0
+T_max: 300
+max_epoch: 320
+warmup_epochs: 20
+weight_decay: 0.0005
+momentum: 0.9
+loss_scale: 1024
+label_smooth: 0
+label_smooth_factor: 0.1
+log_interval: 100
+ckpt_path: "outputs/"
+ckpt_interval: -1
+is_save_on_master: 1
+is_distributed: 0
+rank: 0
+group_size: 1
+need_profiler: 0
+training_shape: ""
+resize_rate: 10
+is_modelArts: 0
+
+# Eval options
+pretrained: ""
+log_path: "outputs/"
+ann_val_file: ""
+eval_nms_thresh: 0.6
+eval_shape: ""
+ignore_threshold: 0.7
+test_ignore_threshold: 0.001
+multi_label: True
+multi_label_thresh: 0.1
+
+# Export options
+device_id: 0
+batch_size: 1
+testing_shape: 640
+ckpt_file: ""
+file_name: "yolov5"
+file_format: "MINDIR"
+dataset_path: ""
+ann_file: ""
+
+
+# Other default config
+hue: 0.015
+saturation: 1.5
+value: 0.4
+jitter: 0.3
+
+multi_scale: [[320, 320],
+              [352, 352],
+              [384, 384],
+              [416, 416],
+              [448, 448],
+              [480, 480],
+              [512, 512],
+              [544, 544],
+              [576, 576],
+              [608, 608],
+              [640, 640],
+              [672, 672],
+              [704, 704],
+              [736, 736],
+              [768, 768]]
+num_classes: 80
+max_box: 150
+
+# h->w
+anchor_scales: [[12, 16],
+                [19, 36],
+                [40, 28],
+                [36, 75],
+                [76, 55],
+                [72, 146],
+                [142, 110],
+                [192, 243],
+                [459, 401]]
+
+out_channel: 255  # 3 * (num_classes + 5)
+
+input_shape: [[3, 32, 64, 128, 256, 512, 1],
+              [3, 48, 96, 192, 384, 768, 2],
+              [3, 64, 128, 256, 512, 1024, 3],
+              [3, 80, 160, 320, 640, 1280, 4]]
+
+# test_param
+test_img_shape: [640, 640]
+
+labels: [ 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat',
+          'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat',
+          'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack',
+          'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
+          'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
+          'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
+          'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair',
+          'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
+          'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book',
+          'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush' ]
+
+coco_ids: [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27,
+            28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53,
+            54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80,
+            81, 82, 84, 85, 86, 87, 88, 89, 90 ]
+
+result_files: './result_Files'
+
+---
+
+# Help description for each configuration
+# Train options
+data_dir: "Train dataset directory."
+per_batch_size: "Batch size for Training."
+pretrained_backbone: "The ckpt file of CspDarkNet53."
+resume_yolov5: "The ckpt file of YOLOv5, which is used to fine-tune."
+pretrained_checkpoint: "The ckpt file of YOLOv5CspDarkNet53."
+lr_scheduler: "Learning rate scheduler, options: exponential, cosine_annealing."
+lr: "Learning rate."
+lr_epochs: "Epochs at which lr changes, split with ','."
+lr_gamma: "Decay factor of the exponential lr_scheduler."
+eta_min: "Eta_min in cosine_annealing scheduler."
+T_max: "T-max in cosine_annealing scheduler."
+max_epoch: "Max epoch num to train the model."
+warmup_epochs: "Warmup epochs."
+weight_decay: "Weight decay factor."
+momentum: "Momentum."
+loss_scale: "Static loss scale."
+label_smooth: "Whether to use label smooth in CE."
+label_smooth_factor: "Smooth strength of original one-hot."
+log_interval: "Logging interval steps."
+ckpt_path: "Checkpoint save location."
+ckpt_interval: "Save checkpoint interval."
+is_save_on_master: "Save ckpt on master or all rank, 1 for master, 0 for all ranks."
+is_distributed: "Distribute train or not, 1 for yes, 0 for no."
+rank: "Local rank of distributed."
+group_size: "World size of device."
+need_profiler: "Whether use profiler. 0 for no, 1 for yes."
+training_shape: "Fix training shape."
+resize_rate: "Resize rate for multi-scale training."
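+is_modelArts: "Training in modelArts or not, 1 for yes, 0 for no."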
+ann_file: "path to annotation"
+each_multiscale: "Apply multi-scale for each scale"
+labels: "the label of train data"
+multi_label: "use multi label to nms"
+multi_label_thresh: "multi label thresh"
+
+# Eval options
+pretrained: "model_path, local pretrained model to load"
+log_path: "checkpoint save location"
+ann_val_file: "path to annotation"
+
+# Export options
+device_id: "Device id for export"
+batch_size: "batch size for export"
+testing_shape: "shape for test"
+ckpt_file: "Checkpoint file path for export"
+file_name: "output file name for export"
+file_format: "file format for export"
+result_files: 'path to 310 infer result folder'
diff --git a/official/cv/yolov5/eval.py b/official/cv/yolov5/eval.py
index 8be991e01ac24a2ec9f77ba9a3f4b80c13fb6733..76b1d9dbe0275a2d854cd796cac8e938d3e774d4 100644
--- a/official/cv/yolov5/eval.py
+++ b/official/cv/yolov5/eval.py
@@ -14,11 +14,9 @@
 # ============================================================================
 """YoloV5 eval."""
 import os
-import argparse
 import datetime
 import time
 import sys
-import ast
 from collections import defaultdict

 import numpy as np
@@ -34,64 +32,36 @@ import mindspore as ms
 from src.yolo import YOLOV5
 from src.logger import get_logger
 from src.yolo_dataset import create_yolo_dataset
-from src.config import ConfigYOLOV5
-
-parser = argparse.ArgumentParser('mindspore coco testing')
-
-# device related
-parser.add_argument('--device_target', type=str, default='Ascend',
-                    help='device where the code will be implemented. (Default: Ascend)')
-
-# dataset related
-parser.add_argument('--data_dir', type=str, default='/data/coco', help='train data dir')
-parser.add_argument('--per_batch_size', default=1, type=int, help='batch size for per gpu')
-
-# network related
-parser.add_argument('--pretrained', default='', type=str, help='model_path, local pretrained model to load')
-parser.add_argument('--yolov5_version', default='yolov5s', type=str,
-                    help='The version of YOLOv5, options: yolov5s, yolov5m, yolov5l, yolov5x')
-
-# logging related
-parser.add_argument('--log_path', type=str, default='outputs/', help='checkpoint save location')
-
-# detect_related
-parser.add_argument('--nms_thresh', type=float, default=0.6, help='threshold for NMS')
-parser.add_argument('--ann_file', type=str, default='', help='path to annotation')
-parser.add_argument('--testing_shape', type=str, default='', help='shape for test ')
-parser.add_argument('--ignore_threshold', type=float, default=0.001, help='threshold to throw low quality boxes')
-parser.add_argument('--multi_label', type=ast.literal_eval, default=True, help='whether to use multi label')
-parser.add_argument('--multi_label_thresh', type=float, default=0.1, help='threshhold to throw low quality boxes')
-parser.add_argument('--is_modelArts', type=int, default=0,
-                    help='Trainning in modelArts or not, 1 for yes, 0 for no.
Default: 0') - -args, _ = parser.parse_known_args() -args.rank = 0 - -if args.is_modelArts: - args.data_root = os.path.join(args.data_dir, 'val2017') - args.ann_file = os.path.join(args.data_dir, 'annotations') + +from model_utils.config import config +from model_utils.moxing_adapter import moxing_wrapper +from model_utils.device_adapter import get_device_id, get_device_num + +config.rank = 0 + +if config.is_modelArts: + config.data_root = os.path.join(config.data_dir, 'val2017') + config.ann_file = os.path.join(config.data_dir, 'annotations') import moxing as mox - local_data_url = os.path.join('/cache/data', str(args.rank)) - local_annFile = os.path.join('/cache/data', str(args.rank)) - local_pretrained = os.path.join('/cache/data', str(args.rank)) + local_data_url = os.path.join(config.data_path, str(config.rank)) + local_annFile = os.path.join(config.data_path, str(config.rank)) + local_pretrained = os.path.join(config.data_path, str(config.rank)) - temp_str = args.pretrained.split('/')[-1] - args.pretrained = args.pretrained[0:args.pretrained.rfind('/')] + temp_str = config.pretrained.split('/')[-1] + config.pretrained = config.pretrained[0:config.pretrained.rfind('/')] - mox.file.copy_parallel(args.data_root, local_data_url) - args.data_root = local_data_url + mox.file.copy_parallel(config.data_root, local_data_url) + config.data_root = local_data_url - mox.file.copy_parallel(args.ann_file, local_annFile) - args.ann_file = os.path.join(local_data_url, 'instances_val2017.json') + mox.file.copy_parallel(config.ann_file, local_annFile) + config.ann_file = os.path.join(local_data_url, 'instances_val2017.json') - mox.file.copy_parallel(args.pretrained, local_pretrained) - args.pretrained = os.path.join(local_data_url, temp_str) + mox.file.copy_parallel(config.pretrained, local_pretrained) + config.pretrained = os.path.join(local_data_url, temp_str) else: - args.data_root = os.path.join(args.data_dir, 'val2017') - args.ann_file = os.path.join( - args.data_dir, - 'annotations/instances_val2017.json') + config.data_root = os.path.join(config.data_dir, 'val2017') + config.ann_file = os.path.join(config.data_dir, 'annotations/instances_val2017.json') class Redirct: @@ -109,17 +79,8 @@ class DetectionEngine: """Detection engine.""" def __init__(self, args_detection): - self.ignore_threshold = args_detection.ignore_threshold - self.labels = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', - 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', - 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', - 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', - 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', - 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', - 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', - 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', - 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', - 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'] + self.ignore_threshold = args_detection.test_ignore_threshold + self.labels = args_detection.labels self.num_classes = len(self.labels) self.results = {} self.file_path = '' @@ -128,14 +89,11 @@ class DetectionEngine: self._coco = COCO(self.ann_file) self._img_ids = 
list(sorted(self._coco.imgs.keys()))
         self.det_boxes = []
-        self.nms_thresh = args_detection.nms_thresh
+        self.nms_thresh = args_detection.eval_nms_thresh
         self.multi_label = args_detection.multi_label
         self.multi_label_thresh = args_detection.multi_label_thresh
         self.coco_catids = self._coco.getCatIds()
-        self.coco_catIds = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27,
-                            28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53,
-                            54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80,
-                            81, 82, 84, 85, 86, 87, 88, 89, 90]
+        self.coco_catIds = args_detection.coco_ids

     def do_nms_for_results(self):
         """Get result boxes."""
@@ -335,29 +293,82 @@ def convert_testing_shape(args_testing_shape):
     testing_shape = [int(args_testing_shape), int(args_testing_shape)]
     return testing_shape

+def modelarts_pre_process():
+    '''modelarts pre process function.'''
+    def unzip(zip_file, save_dir):
+        import zipfile
+        s_time = time.time()
+        if not os.path.exists(os.path.join(save_dir, config.modelarts_dataset_unzip_name)):
+            zip_isexist = zipfile.is_zipfile(zip_file)
+            if zip_isexist:
+                fz = zipfile.ZipFile(zip_file, 'r')
+                data_num = len(fz.namelist())
+                print("Extract Start...")
+                print("unzip file num: {}".format(data_num))
+                data_print = int(data_num / 100) if data_num > 100 else 1
+                i = 0
+                for file in fz.namelist():
+                    if i % data_print == 0:
+                        print("unzip percent: {}%".format(int(i * 100 / data_num)), flush=True)
+                    i += 1
+                    fz.extract(file, save_dir)
+                print("cost time: {}min:{}s.".format(int((time.time() - s_time) / 60),
+                                                     int(int(time.time() - s_time) % 60)))
+                print("Extract Done.")
+            else:
+                print("This is not a zip file.")
+        else:
+            print("Zip has been extracted.")

-if __name__ == "__main__":
+    if config.need_modelarts_dataset_unzip:
+        zip_file_1 = os.path.join(config.data_path, config.modelarts_dataset_unzip_name + ".zip")
+        save_dir_1 = os.path.join(config.data_path)
+
+        sync_lock = "/tmp/unzip_sync.lock"
+
+        # Each server contains at most 8 devices.
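+        # Only the first device on each server extracts the archive; the other
+        # devices wait below until the lock file appears.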
+        if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
+            print("Zip file path: ", zip_file_1)
+            print("Unzip file save dir: ", save_dir_1)
+            unzip(zip_file_1, save_dir_1)
+            print("===Finish extract data synchronization===")
+            try:
+                os.mknod(sync_lock)
+            except IOError:
+                pass
+
+        while True:
+            if os.path.exists(sync_lock):
+                break
+            time.sleep(1)
+
+        print("Device: {}, Finish sync unzip data from {} to {}.".format(get_device_id(), zip_file_1, save_dir_1))
+
+    config.log_path = os.path.join(config.output_path, config.log_path)
+
+@moxing_wrapper(pre_process=modelarts_pre_process)
+def run_eval():
     start_time = time.time()
     device_id = int(os.getenv('DEVICE_ID')) if os.getenv('DEVICE_ID') else 0
     # device_id = 1
-    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=device_id)
+    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=device_id)

     # logger
-    args.outputs_dir = os.path.join(args.log_path, datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
+    config.outputs_dir = os.path.join(config.log_path, datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
     rank_id = int(os.getenv('DEVICE_ID', '0'))
-    args.logger = get_logger(args.outputs_dir, rank_id)
+    config.logger = get_logger(config.outputs_dir, rank_id)

     context.reset_auto_parallel_context()
     parallel_mode = ParallelMode.STAND_ALONE
     context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=1)

-    args.logger.info('Creating Network....')
+    config.logger.info('Creating Network....')
     dict_version = {'yolov5s': 0, 'yolov5m': 1, 'yolov5l': 2, 'yolov5x': 3}
-    network = YOLOV5(is_training=False, version=dict_version[args.yolov5_version])
+    network = YOLOV5(is_training=False, version=dict_version[config.yolov5_version])

-    args.logger.info(args.pretrained)
-    if os.path.isfile(args.pretrained):
-        param_dict = load_checkpoint(args.pretrained)
+    config.logger.info(config.pretrained)
+    if os.path.isfile(config.pretrained):
+        param_dict = load_checkpoint(config.pretrained)
         param_dict_new = {}
         for key, values in param_dict.items():
             if key.startswith('moments.'):
@@ -367,32 +378,31 @@ if __name__ == "__main__":
             else:
                 param_dict_new[key] = values
         load_param_into_net(network, param_dict_new)
-        args.logger.info('load_model {} success'.format(args.pretrained))
+        config.logger.info('load_model %s success', config.pretrained)
     else:
-        args.logger.info('{} not exists or not a pre-trained file'.format(args.pretrained))
-        assert FileNotFoundError('{} not exists or not a pre-trained file'.format(args.pretrained))
+        config.logger.info('%s does not exist or is not a pre-trained file', config.pretrained)
+        raise FileNotFoundError('{} does not exist or is not a pre-trained file'.format(config.pretrained))
         exit(1)

-    data_root = args.data_root
-    ann_file = args.ann_file
+    data_root = config.data_root
+    ann_file = config.ann_file

-    config = ConfigYOLOV5()
-    if args.testing_shape:
-        config.test_img_shape = convert_testing_shape(args.testing_shape)
+    if config.eval_shape:
+        config.test_img_shape = convert_testing_shape(config.eval_shape)

-    ds, data_size = create_yolo_dataset(data_root, ann_file, is_training=False, batch_size=args.per_batch_size,
+    ds, data_size = create_yolo_dataset(data_root, ann_file, is_training=False, batch_size=config.per_batch_size,
                                         max_epoch=1, device_num=1, rank=rank_id, shuffle=False, config=config)

-    args.logger.info('testing shape : {}'.format(config.test_img_shape))
-    args.logger.info('total {} images to
eval'.format(data_size))
+    config.logger.info('testing shape : %s', config.test_img_shape)
+    config.logger.info('total %d images to eval', data_size)

     network.set_train(False)

     # init detection engine
-    detection = DetectionEngine(args)
+    detection = DetectionEngine(config)

     input_shape = Tensor(tuple(config.test_img_shape), ms.float32)
-    args.logger.info('Start inference....')
+    config.logger.info('Start inference....')
     for image_index, data in enumerate(ds.create_dict_iterator(num_epochs=1)):
         image = data["image"].asnumpy()
         image = np.concatenate((image[..., ::2, ::2], image[..., 1::2, ::2],
@@ -407,16 +417,20 @@
         output_small = output_small.asnumpy()
         image_id_ = image_id_.asnumpy()
         image_shape_ = image_shape_.asnumpy()
-        detection.detect([output_small, output_me, output_big], args.per_batch_size, image_shape_, image_id_)
+        detection.detect([output_small, output_me, output_big], config.per_batch_size, image_shape_, image_id_)
         if image_index % 1000 == 0:
-            args.logger.info('Processing... {:.2f}% '.format(image_index * args.per_batch_size / data_size * 100))
+            config.logger.info('Processing... {:.2f}% '.format(image_index * config.per_batch_size / data_size * 100))

-    args.logger.info('Calculating mAP...')
+    config.logger.info('Calculating mAP...')
     detection.do_nms_for_results()
     result_file_path = detection.write_result()
-    args.logger.info('result file path: {}'.format(result_file_path))
+    config.logger.info('result file path: %s', result_file_path)
     eval_result = detection.get_eval_result()

     cost_time = time.time() - start_time
-    args.logger.info('\n=============coco eval reulst=========\n' + eval_result)
-    args.logger.info('testing cost time {:.2f}h'.format(cost_time / 3600.))
+    eval_log_string = '\n=============coco eval result=========\n' + eval_result
+    config.logger.info(eval_log_string)
+    config.logger.info('testing cost time %.2f h', cost_time / 3600.)
+
+if __name__ == "__main__":
+    run_eval()
diff --git a/official/cv/yolov5/export.py b/official/cv/yolov5/export.py
index a228cef97e5d6301a20840dd245006652ba6932d..de04ff4ce3b7aa0a78ec66227f4c17d9617c0836 100644
--- a/official/cv/yolov5/export.py
+++ b/official/cv/yolov5/export.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
-import argparse
+import os

 import numpy as np

 import mindspore
@@ -21,37 +20,34 @@ from mindspore.train.serialization import export, load_checkpoint, load_param_in

 from src.yolo import YOLOV5s_Infer

-parser = argparse.ArgumentParser(description='yolov5 export')
-parser.add_argument("--device_id", type=int, default=0, help="Device id")
-parser.add_argument("--batch_size", type=int, default=1, help="batch size")
-parser.add_argument('--yolov5_version', default='yolov5s', type=str,
-                    help='The version of YOLOv5, options: yolov5s, yolov5m, yolov5l, yolov5x')
+from model_utils.config import config
+from model_utils.moxing_adapter import moxing_wrapper

-parser.add_argument("--testing_shape", type=int, default=640, help="test shape")
-parser.add_argument("--ckpt_file", type=str, required=True, help="Checkpoint file path.")
-parser.add_argument("--file_name", type=str, default="yolov5", help="output file name.")
-parser.add_argument('--file_format', type=str, choices=["AIR", "ONNX", "MINDIR"], default='MINDIR', help='file format')
-parser.add_argument("--device_target", type=str, choices=["Ascend", "GPU", "CPU"],
-                    default="Ascend", help="device target")
-args = parser.parse_args()
+def modelarts_pre_process():
+    '''modelarts pre process function.'''
+    config.file_name = os.path.join(config.output_path, config.file_name)

-context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
-if args.device_target == "Ascend":
-    context.set_context(device_id=args.device_id)

-if __name__ == "__main__":
-    ts_shape = args.testing_shape // 2
+@moxing_wrapper(pre_process=modelarts_pre_process)
+def run_export():
+    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
+    if config.device_target == "Ascend":
+        context.set_context(device_id=config.device_id)
+    ts_shape = config.testing_shape // 2

     dict_version = {'yolov5s': 0, 'yolov5m': 1, 'yolov5l': 2, 'yolov5x': 3}
-    args.file_name = args.file_name + '_' + args.yolov5_version
+    config.file_name = config.file_name + '_' + config.yolov5_version

-    network = YOLOV5s_Infer(args.testing_shape, version=dict_version[args.yolov5_version])
+    network = YOLOV5s_Infer(config.testing_shape, version=dict_version[config.yolov5_version])
     network.set_train(False)

-    param_dict = load_checkpoint(args.ckpt_file)
+    param_dict = load_checkpoint(config.ckpt_file)
     load_param_into_net(network, param_dict)

-    input_data = Tensor(np.zeros([args.batch_size, 12, ts_shape, ts_shape]), mindspore.float32)
+    input_data = Tensor(np.zeros([config.batch_size, 12, ts_shape, ts_shape]), mindspore.float32)

-    export(network, input_data, file_name=args.file_name, file_format=args.file_format)
+    export(network, input_data, file_name=config.file_name, file_format=config.file_format)
     print('==========success export===============')
+
+if __name__ == "__main__":
+    run_export()
diff --git a/official/cv/yolov5/model_utils/__init__.py b/official/cv/yolov5/model_utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/official/cv/yolov5/model_utils/config.py b/official/cv/yolov5/model_utils/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad0d7497a8e5996a42c54a0cda599f84576fdc8e
--- /dev/null
+++ b/official/cv/yolov5/model_utils/config.py
@@ -0,0 +1,126 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use
this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Parse arguments""" + +import os +import ast +import argparse +from pprint import pformat +import yaml + +class Config: + """ + Configuration namespace. Convert dictionary to members. + """ + def __init__(self, cfg_dict): + for k, v in cfg_dict.items(): + if isinstance(v, (list, tuple)): + setattr(self, k, [Config(x) if isinstance(x, dict) else x for x in v]) + else: + setattr(self, k, Config(v) if isinstance(v, dict) else v) + + def __str__(self): + return pformat(self.__dict__) + + def __repr__(self): + return self.__str__() + + +def parse_cli_to_yaml(parser, cfg, helper=None, choices=None, cfg_path="default_config.yaml"): + """ + Parse command line arguments to the configuration according to the default yaml. + + Args: + parser: Parent parser. + cfg: Base configuration. + helper: Helper description. + cfg_path: Path to the default yaml config. + """ + parser = argparse.ArgumentParser(description="[REPLACE THIS at config.py]", + parents=[parser]) + helper = {} if helper is None else helper + choices = {} if choices is None else choices + for item in cfg: + if not isinstance(cfg[item], list) and not isinstance(cfg[item], dict): + help_description = helper[item] if item in helper else "Please reference to {}".format(cfg_path) + choice = choices[item] if item in choices else None + if isinstance(cfg[item], bool): + parser.add_argument("--" + item, type=ast.literal_eval, default=cfg[item], choices=choice, + help=help_description) + else: + parser.add_argument("--" + item, type=type(cfg[item]), default=cfg[item], choices=choice, + help=help_description) + args = parser.parse_args() + return args + + +def parse_yaml(yaml_path): + """ + Parse the yaml config file. + + Args: + yaml_path: Path to the yaml config. + """ + with open(yaml_path, 'r') as fin: + try: + cfgs = yaml.load_all(fin.read(), Loader=yaml.FullLoader) + cfgs = [x for x in cfgs] + if len(cfgs) == 1: + cfg_helper = {} + cfg = cfgs[0] + cfg_choices = {} + elif len(cfgs) == 2: + cfg, cfg_helper = cfgs + cfg_choices = {} + elif len(cfgs) == 3: + cfg, cfg_helper, cfg_choices = cfgs + else: + raise ValueError("At most 3 docs (config, description for help, choices) are supported in config yaml") + print(cfg_helper) + except: + raise ValueError("Failed to parse yaml") + return cfg, cfg_helper, cfg_choices + + +def merge(args, cfg): + """ + Merge the base config from yaml file and command line arguments. + + Args: + args: Command line arguments. + cfg: Base configuration. + """ + args_var = vars(args) + for item in args_var: + cfg[item] = args_var[item] + return cfg + + +def get_config(): + """ + Get Config according to the yaml file and cli arguments. 
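+
+    A usage sketch: command-line flags override the yaml defaults, e.g.
+        config = get_config()          # reads ../default_config.yaml by default
+        print(config.per_batch_size)   # 32 unless --per_batch_size is passed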
+ """ + parser = argparse.ArgumentParser(description="default name", add_help=False) + current_dir = os.path.dirname(os.path.abspath(__file__)) + parser.add_argument("--config_path", type=str, default=os.path.join(current_dir, "../default_config.yaml"), + help="Config file path") + path_args, _ = parser.parse_known_args() + default, helper, choices = parse_yaml(path_args.config_path) + args = parse_cli_to_yaml(parser=parser, cfg=default, helper=helper, choices=choices, cfg_path=path_args.config_path) + final_config = merge(args, default) + return Config(final_config) + +config = get_config() diff --git a/official/cv/yolov5/model_utils/device_adapter.py b/official/cv/yolov5/model_utils/device_adapter.py new file mode 100644 index 0000000000000000000000000000000000000000..7c5d7f837ddaa8f53cf8dc5573cac0e36881e7b1 --- /dev/null +++ b/official/cv/yolov5/model_utils/device_adapter.py @@ -0,0 +1,27 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Device adapter for ModelArts""" + +from .config import config + +if config.enable_modelarts: + from .moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id +else: + from .local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id + +__all__ = [ + "get_device_id", "get_device_num", "get_rank_id", "get_job_id" +] diff --git a/official/cv/yolov5/model_utils/local_adapter.py b/official/cv/yolov5/model_utils/local_adapter.py new file mode 100644 index 0000000000000000000000000000000000000000..769fa6dc78e59eb66dbc8e6773accdc1d08b649e --- /dev/null +++ b/official/cv/yolov5/model_utils/local_adapter.py @@ -0,0 +1,36 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================
+
+"""Local adapter"""
+
+import os
+
+def get_device_id():
+    device_id = os.getenv('DEVICE_ID', '0')
+    return int(device_id)
+
+
+def get_device_num():
+    device_num = os.getenv('RANK_SIZE', '1')
+    return int(device_num)
+
+
+def get_rank_id():
+    global_rank_id = os.getenv('RANK_ID', '0')
+    return int(global_rank_id)
+
+
+def get_job_id():
+    return "Local Job"
diff --git a/official/cv/yolov5/model_utils/moxing_adapter.py b/official/cv/yolov5/model_utils/moxing_adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..25838a7da99a27a1bb744684c1f75f80f5704688
--- /dev/null
+++ b/official/cv/yolov5/model_utils/moxing_adapter.py
@@ -0,0 +1,116 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""Moxing adapter for ModelArts"""
+
+import os
+import functools
+from mindspore import context
+from .config import config
+
+_global_sync_count = 0
+
+def get_device_id():
+    device_id = os.getenv('DEVICE_ID', '0')
+    return int(device_id)
+
+
+def get_device_num():
+    device_num = os.getenv('RANK_SIZE', '1')
+    return int(device_num)
+
+
+def get_rank_id():
+    global_rank_id = os.getenv('RANK_ID', '0')
+    return int(global_rank_id)
+
+
+def get_job_id():
+    job_id = os.getenv('JOB_ID')
+    job_id = job_id if job_id != "" else "default"
+    return job_id
+
+def sync_data(from_path, to_path):
+    """
+    Download data from remote obs to a local directory if the first url is a remote url and the second one is a local path.
+    Otherwise, upload data from the local directory to remote obs.
+    """
+    import moxing as mox
+    import time
+    global _global_sync_count
+    sync_lock = "/tmp/copy_sync.lock" + str(_global_sync_count)
+    _global_sync_count += 1
+
+    # Each server contains at most 8 devices.
+    if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
+        print("from path: ", from_path)
+        print("to path: ", to_path)
+        mox.file.copy_parallel(from_path, to_path)
+        print("===finish data synchronization===")
+        try:
+            os.mknod(sync_lock)
+        except IOError:
+            pass
+        print("===save flag===")
+
+    while True:
+        if os.path.exists(sync_lock):
+            break
+        time.sleep(1)
+
+    print("Finish sync data from {} to {}.".format(from_path, to_path))
+
+
+def moxing_wrapper(pre_process=None, post_process=None):
+    """
+    Moxing wrapper to download dataset and upload outputs.
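+
+    A minimal usage sketch:
+        @moxing_wrapper(pre_process=modelarts_pre_process)
+        def run_train():
+            ...  # data is available under config.data_path when enable_modelarts is set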
+ """ + def wrapper(run_func): + @functools.wraps(run_func) + def wrapped_func(*args, **kwargs): + # Download data from data_url + if config.enable_modelarts: + if config.data_url: + sync_data(config.data_url, config.data_path) + print("Dataset downloaded: ", os.listdir(config.data_path)) + if config.checkpoint_url: + sync_data(config.checkpoint_url, config.load_path) + print("Preload downloaded: ", os.listdir(config.load_path)) + if config.train_url: + sync_data(config.train_url, config.output_path) + print("Workspace downloaded: ", os.listdir(config.output_path)) + + context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + config.device_num = get_device_num() + config.device_id = get_device_id() + if not os.path.exists(config.output_path): + os.makedirs(config.output_path) + + if pre_process: + pre_process() + + # Run the main function + run_func(*args, **kwargs) + + # Upload data to train_url + if config.enable_modelarts: + if post_process: + post_process() + + if config.train_url: + print("Start to copy output directory") + sync_data(config.output_path, config.train_url) + return wrapped_func + return wrapper diff --git a/official/cv/yolov5/scripts/run_distribute_train.sh b/official/cv/yolov5/scripts/run_distribute_train.sh index 053c8b73b258c4ca3d90b3c8e643399d04807210..e89e4a427008bc3e88d4d84b1b3b4e31284c13cc 100644 --- a/official/cv/yolov5/scripts/run_distribute_train.sh +++ b/official/cv/yolov5/scripts/run_distribute_train.sh @@ -56,7 +56,9 @@ do rm -rf ./train_parallel$i mkdir ./train_parallel$i cp ../*.py ./train_parallel$i + cp ../*.yaml ./train_parallel$i cp -r ../src ./train_parallel$i + cp -r ../model_utils ./train_parallel$i cd ./train_parallel$i || exit echo "start training for rank $RANK_ID, device $DEVICE_ID" env > env.log diff --git a/official/cv/yolov5/scripts/run_distribute_train_gpu.sh b/official/cv/yolov5/scripts/run_distribute_train_gpu.sh new file mode 100644 index 0000000000000000000000000000000000000000..d16a10af01142bca91984fb4d8811fa60c0a7cda --- /dev/null +++ b/official/cv/yolov5/scripts/run_distribute_train_gpu.sh @@ -0,0 +1,66 @@ +#!/bin/bash +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +if [ $# != 2 ] +then + echo "Usage: sh run_distribute_train_gpu.sh [DATASET_PATH] [RANK_SIZE]" +exit 1 +fi + +get_real_path(){ + if [ "${1:0:1}" == "/" ]; then + echo "$1" + else + echo "$(realpath -m $PWD/$1)" + fi +} + +DATASET_PATH=$(get_real_path $1) +echo $DATASET_PATH + +if [ ! 
-d $DATASET_PATH ] +then + echo "error: DATASET_PATH=$DATASET_PATH is not a directory" +exit 1 +fi + +export RANK_SIZE=$2 + +if [ -d "distribute_train" ]; then + rm -rf ./distribute_train +fi + +mkdir ./distribute_train +cp ../*.py ./distribute_train +cp ../*.yaml ./distribute_train +cp -r ../src ./distribute_train +cp -r ../model_utils ./distribute_train +cd ./distribute_train || exit + +mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ +nohup python train.py \ + --device_target=GPU \ + --per_batch_size=32 \ + --data_dir=$DATASET_PATH \ + --is_distributed=1 \ + --yolov5_version='yolov5s' \ + --lr=0.025 \ + --T_max=300 \ + --max_epoch=300 \ + --warmup_epochs=20 \ + --training_shape=640 \ + --lr_scheduler=cosine_annealing > log.txt 2>&1 & +cd .. diff --git a/official/cv/yolov5/scripts/run_eval.sh b/official/cv/yolov5/scripts/run_eval.sh index 600690de2bb2109a4d6290818fadb74148d05b97..c88f7a2138e4fb02f9c739e855b405b94e18b37c 100644 --- a/official/cv/yolov5/scripts/run_eval.sh +++ b/official/cv/yolov5/scripts/run_eval.sh @@ -55,7 +55,9 @@ then fi mkdir ./eval cp ../*.py ./eval +cp ../*.yaml ./eval cp -r ../src ./eval +cp -r ../model_utils ./eval cd ./eval || exit env > env.log echo "start inferring for device $DEVICE_ID" @@ -63,5 +65,5 @@ python eval.py \ --data_dir=$DATASET_PATH \ --yolov5_version='yolov5s' \ --pretrained=$CHECKPOINT_PATH \ - --testing_shape=640 > log.txt 2>&1 & + --eval_shape=640 > log.txt 2>&1 & cd .. diff --git a/official/cv/yolov5/scripts/run_standalone_train.sh b/official/cv/yolov5/scripts/run_standalone_train.sh index 0d7d8bf7c76b8f52b8381e37e16c774c6c3b410e..39a7dc0521e603e0c15057d4beb9b14871f6106b 100644 --- a/official/cv/yolov5/scripts/run_standalone_train.sh +++ b/official/cv/yolov5/scripts/run_standalone_train.sh @@ -50,7 +50,9 @@ then fi mkdir ./train cp ../*.py ./train +cp ../*.yaml ./train cp -r ../src ./train +cp -r ../model_utils ./train cd ./train || exit echo "start training for device $DEVICE_ID" env > env.log diff --git a/official/cv/yolov5/src/config.py b/official/cv/yolov5/src/config.py deleted file mode 100644 index 3414afa90cd84fc808341065b0e680ac896983cd..0000000000000000000000000000000000000000 --- a/official/cv/yolov5/src/config.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Config parameters for yolov5 models.""" - - -class ConfigYOLOV5: - """ - Config parameters for the yolov5. 
- - Examples: - ConfigYOLOV5() - """ - # train_param - # data augmentation related - hue = 0.015 - saturation = 1.5 - value = 0.4 - jitter = 0.3 - - resize_rate = 10 - multi_scale = [ - [320, 320], - [352, 352], - [384, 384], - [416, 416], - [448, 448], - [480, 480], - [512, 512], - [544, 544], - [576, 576], - [608, 608], - [640, 640], - [672, 672], - [704, 704], - [736, 736], - [768, 768]] - num_classes = 80 - max_box = 150 - - # confidence under ignore_threshold means no object when training - ignore_threshold = 0.7 - - # h->w - anchor_scales = [(12, 16), - (19, 36), - (40, 28), - (36, 75), - (76, 55), - (72, 146), - (142, 110), - (192, 243), - (459, 401)] - out_channel = 3 * (num_classes + 5) - - input_shape = [[3, 32, 64, 128, 256, 512, 1], - [3, 48, 96, 192, 384, 768, 2], - [3, 64, 128, 256, 512, 1024, 3], - [3, 80, 160, 320, 640, 1280, 4]] - - # test_param - test_img_shape = [640, 640] diff --git a/official/cv/yolov5/src/yolo.py b/official/cv/yolov5/src/yolo.py index 245566fe5f98253b372650b9c7e517e1345cf7d5..4071e16464118cbbadc26f304d5b786e449f24d6 100644 --- a/official/cv/yolov5/src/yolo.py +++ b/official/cv/yolov5/src/yolo.py @@ -25,15 +25,15 @@ from mindspore.ops import functional as F from mindspore.ops import composite as C from src.backbone import YOLOv5Backbone, Conv, BottleneckCSP -from src.config import ConfigYOLOV5 from src.loss import ConfidenceLoss, ClassLoss +from model_utils.config import config as default_config class YOLO(nn.Cell): def __init__(self, backbone, shape): super(YOLO, self).__init__() self.backbone = backbone - self.config = ConfigYOLOV5() + self.config = default_config self.conv1 = Conv(shape[5], shape[4], k=1, s=1) self.CSP5 = BottleneckCSP(shape[5], shape[4], n=1*shape[6], shortcut=False) @@ -117,7 +117,7 @@ class DetectionBlock(nn.Cell): Args: scale: Character. - config: ConfigYOLOV5, Configuration instance. + config: config, Configuration instance. is_training: Bool, Whether train or not, default True. Returns: @@ -127,7 +127,7 @@ class DetectionBlock(nn.Cell): DetectionBlock(scale='l',stride=32) """ - def __init__(self, scale, config=ConfigYOLOV5(), is_training=True): + def __init__(self, scale, config=default_config, is_training=True): super(DetectionBlock, self).__init__() self.config = config if scale == 's': @@ -240,7 +240,7 @@ class YoloLossBlock(nn.Cell): """ Loss block cell of YOLOV5 network. 
""" - def __init__(self, scale, config=ConfigYOLOV5()): + def __init__(self, scale, config=default_config): super(YoloLossBlock, self).__init__() self.config = config if scale == 's': @@ -339,7 +339,7 @@ class YOLOV5(nn.Cell): def __init__(self, is_training, version=0): super(YOLOV5, self).__init__() - self.config = ConfigYOLOV5() + self.config = default_config # YOLOv5 network self.shape = self.config.input_shape[version] @@ -378,7 +378,7 @@ class YoloWithLossCell(nn.Cell): def __init__(self, network): super(YoloWithLossCell, self).__init__() self.yolo_network = network - self.config = ConfigYOLOV5() + self.config = default_config self.loss_big = YoloLossBlock('l', self.config) self.loss_me = YoloLossBlock('m', self.config) self.loss_small = YoloLossBlock('s', self.config) diff --git a/official/cv/yolov5/train.py b/official/cv/yolov5/train.py index 532c56cd40bace811d78c73d65f45c145670820b..e3ef1b8a4b982a33710ce3742b3cc45ca5a4785f 100644 --- a/official/cv/yolov5/train.py +++ b/official/cv/yolov5/train.py @@ -15,7 +15,6 @@ """YoloV5 train.""" import os import time -import argparse import datetime import mindspore as ms from mindspore.context import ParallelMode @@ -32,230 +31,222 @@ from src.util import AverageMeter, get_param_groups from src.lr_scheduler import get_lr from src.yolo_dataset import create_yolo_dataset from src.initializer import default_recurisive_init, load_yolov5_params -from src.config import ConfigYOLOV5 -ms.set_seed(1) +from model_utils.config import config +from model_utils.moxing_adapter import moxing_wrapper +from model_utils.device_adapter import get_device_id, get_device_num -parser = argparse.ArgumentParser('mindspore coco training') - -# device related -parser.add_argument('--device_target', type=str, default='Ascend', help='device where the code will be implemented.') - -# dataset related -parser.add_argument('--data_dir', default='/data/coco', type=str, help='Train dataset directory.') -parser.add_argument('--per_batch_size', default=32, type=int, help='Batch size for Training. Default: 8') - -# network related -parser.add_argument('--yolov5_version', default='yolov5s', type=str, - help='The version of YOLOv5, options: yolov5s, yolov5m, yolov5l, yolov5x') -parser.add_argument('--pretrained_backbone', default='', type=str, help='The pretrained file of yolov5. Default: "".') -parser.add_argument('--resume_yolov5', default='', type=str, - help='The ckpt file of YOLOv5, which used to fine tune. Default: ""') - -# optimizer and lr related -parser.add_argument('--lr_scheduler', default='cosine_annealing', type=str, - help='Learning rate scheduler, options: exponential, cosine_annealing. Default: exponential') -parser.add_argument('--lr', default=0.013, type=float, help='Learning rate. Default: 0.01') -parser.add_argument('--lr_epochs', type=str, default='220,250', - help='Epoch of changing of lr changing, split with ",". Default: 220,250') -parser.add_argument('--lr_gamma', type=float, default=0.1, - help='Decrease lr by a factor of exponential lr_scheduler. Default: 0.1') -parser.add_argument('--eta_min', type=float, default=0., help='Eta_min in cosine_annealing scheduler. Default: 0') -parser.add_argument('--T_max', type=int, default=300, help='T-max in cosine_annealing scheduler. Default: 320') -parser.add_argument('--max_epoch', type=int, default=300, help='Max epoch num to train the model. Default: 320') -parser.add_argument('--warmup_epochs', default=20, type=float, help='Warmup epochs. 
Default: 0') -parser.add_argument('--weight_decay', type=float, default=0.0005, help='Weight decay factor. Default: 0.0005') -parser.add_argument('--momentum', type=float, default=0.9, help='Momentum. Default: 0.9') - -# loss related -parser.add_argument('--loss_scale', type=int, default=1024, help='Static loss scale. Default: 1024') -parser.add_argument('--label_smooth', type=int, default=0, help='Whether to use label smooth in CE. Default:0') -parser.add_argument('--label_smooth_factor', type=float, default=0.1, - help='Smooth strength of original one-hot. Default: 0.1') - -# logging related -parser.add_argument('--log_interval', type=int, default=100, help='Logging interval steps. Default: 100') -parser.add_argument('--ckpt_path', type=str, default='outputs/', help='Checkpoint save location. Default: outputs/') -parser.add_argument('--ckpt_interval', type=int, default=None, help='Save checkpoint interval. Default: None') - -parser.add_argument('--is_save_on_master', type=int, default=1, - help='Save ckpt on master or all rank, 1 for master, 0 for all ranks. Default: 1') - -# distributed related -parser.add_argument('--is_distributed', type=int, default=0, - help='Distribute train or not, 1 for yes, 0 for no. Default: 1') -parser.add_argument('--rank', type=int, default=0, help='Local rank of distributed. Default: 0') -parser.add_argument('--group_size', type=int, default=1, help='World size of device. Default: 1') - -# roma obs -parser.add_argument('--train_url', type=str, default="", help='train url') -# profiler init -parser.add_argument('--need_profiler', type=int, default=0, - help='Whether use profiler. 0 for no, 1 for yes. Default: 0') - -# reset default config -parser.add_argument('--training_shape', type=str, default="", help='Fix training shape. Default: ""') -parser.add_argument('--resize_rate', type=int, default=10, help='Resize rate for multi-scale training. Default: None') -parser.add_argument('--is_modelArts', type=int, default=0, - help='Trainning in modelArts or not, 1 for yes, 0 for no. 
Default: 0')
-
-args, _ = parser.parse_known_args()
-
-if args.lr_scheduler == 'cosine_annealing' and args.max_epoch > args.T_max:
-    args.T_max = args.max_epoch
-
-args.lr_epochs = list(map(int, args.lr_epochs.split(',')))
-
-if args.is_modelArts:
-    args.data_root = os.path.join(args.data_dir, 'train2017')
-
-    args.annFile = os.path.join(args.data_dir, 'annotations')
-    outputs_dir = os.path.join('/cache', args.ckpt_path)
-else:
-    args.data_root = os.path.join(args.data_dir, 'train2017')
-    args.annFile = os.path.join(
-        args.data_dir, 'annotations/instances_train2017.json')
-    outputs_dir = args.ckpt_path
-
-deviced = int(os.getenv('DEVICE_ID', '0'))
-context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target,
-                    save_graphs=False, device_id=deviced)
-# init distributed
-if args.is_distributed:
-    if args.device_target == "Ascend":
-        init()
+ms.set_seed(1)

-    else:
-        init("nccl")
-    args.rank = get_rank()
-    args.group_size = get_group_size()
+def set_default():
+    if config.lr_scheduler == 'cosine_annealing' and config.max_epoch > config.T_max:
+        config.T_max = config.max_epoch

-args.rank_save_ckpt_flag = 0
-if args.is_save_on_master:
-    if args.rank == 0:
-        args.rank_save_ckpt_flag = 1
-else:
-    args.rank_save_ckpt_flag = 1
+    config.lr_epochs = list(map(int, config.lr_epochs.split(',')))

-# logger
-args.outputs_dir = os.path.join(outputs_dir, datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
-args.logger = get_logger(args.outputs_dir, args.rank)
-args.logger.save_args(args)
+    if config.is_modelArts:
+        config.data_root = os.path.join(config.data_dir, 'train2017')
+        config.annFile = os.path.join(config.data_dir, 'annotations')
+        outputs_dir = os.path.join(config.output_dir, config.ckpt_path)
+    else:
+        config.data_root = os.path.join(config.data_dir, 'train2017')
+        config.annFile = os.path.join(config.data_dir, 'annotations/instances_train2017.json')
+        outputs_dir = config.ckpt_path
+
+    device_id = get_device_id()
+    context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True, device_target=config.device_target,
+                        save_graphs=False, device_id=device_id)
+    # init distributed
+    if config.is_distributed:
+        if config.device_target == "Ascend":
+            init()
+        else:
+            init("nccl")
+        config.rank = get_rank()
+        config.group_size = get_group_size()
+
+    config.rank_save_ckpt_flag = 0
+    if config.is_save_on_master:
+        if config.rank == 0:
+            config.rank_save_ckpt_flag = 1
+    else:
+        config.rank_save_ckpt_flag = 1
+    # logger
+    config.outputs_dir = os.path.join(outputs_dir, datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
+    config.logger = get_logger(config.outputs_dir, config.rank)
+    config.logger.save_args(config)

 def convert_training_shape(args_training_shape):
     training_shape = [int(args_training_shape), int(args_training_shape)]
     return training_shape

-
-loss_meter = AverageMeter('loss')
-
-if args.is_modelArts:
-    import moxing as mox
-    local_data_url = os.path.join('/cache/data', str(args.rank))
-    local_annFile = os.path.join('/cache/data', str(args.rank))
-    mox.file.copy_parallel(args.data_root, local_data_url)
-    args.data_root = local_data_url
-
-    mox.file.copy_parallel(args.annFile, local_annFile)
-    args.annFile = os.path.join(local_data_url, 'instances_train2017.json')
-
-context.reset_auto_parallel_context()
-parallel_mode = ParallelMode.STAND_ALONE
-degree = 1
-if args.is_distributed:
-    parallel_mode = ParallelMode.DATA_PARALLEL
-    degree = get_group_size()
-context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True,
device_num=degree) - -dict_version = {'yolov5s': 0, 'yolov5m': 1, 'yolov5l': 2, 'yolov5x': 3} -network = YOLOV5(is_training=True, version=dict_version[args.yolov5_version]) -# default is kaiming-normal -default_recurisive_init(network) -load_yolov5_params(args, network) - -network = YoloWithLossCell(network) -config = ConfigYOLOV5() - -config.label_smooth = args.label_smooth -config.label_smooth_factor = args.label_smooth_factor - -if args.training_shape: - config.multi_scale = [convert_training_shape(args.training_shape)] -if args.resize_rate: - config.resize_rate = args.resize_rate - -ds, data_size = create_yolo_dataset(image_dir=args.data_root, anno_path=args.annFile, is_training=True, - batch_size=args.per_batch_size, max_epoch=args.max_epoch, - device_num=args.group_size, rank=args.rank, config=config) - -args.logger.info('Finish loading dataset') - -args.steps_per_epoch = int(data_size / args.per_batch_size / args.group_size) - -if not args.ckpt_interval: - args.ckpt_interval = args.steps_per_epoch - -lr = get_lr(args) - -opt = Momentum(params=get_param_groups(network), momentum=args.momentum, learning_rate=Tensor(lr), - weight_decay=args.weight_decay, loss_scale=args.loss_scale) - -network = TrainingWrapper(network, opt, args.loss_scale // 2) -network.set_train() - -if args.rank_save_ckpt_flag: - # checkpoint save - ckpt_max_num = args.max_epoch * args.steps_per_epoch // args.ckpt_interval - ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval, keep_checkpoint_max=1) - save_ckpt_path = os.path.join(args.outputs_dir, 'ckpt_' + str(args.rank) + '/') - ckpt_cb = ModelCheckpoint(config=ckpt_config, directory=save_ckpt_path, prefix='{}'.format(args.rank)) - cb_params = _InternalCallbackParam() - cb_params.train_network = network - cb_params.epoch_num = ckpt_max_num - cb_params.cur_epoch_num = 1 - run_context = RunContext(cb_params) - ckpt_cb.begin(run_context) - -old_progress = -1 -t_end = time.time() -data_loader = ds.create_dict_iterator(output_numpy=True, num_epochs=1) - -for i, data in enumerate(data_loader): - images = data["image"] - input_shape = images.shape[2:4] - images = Tensor.from_numpy(images) - batch_y_true_0 = Tensor.from_numpy(data['bbox1']) - batch_y_true_1 = Tensor.from_numpy(data['bbox2']) - batch_y_true_2 = Tensor.from_numpy(data['bbox3']) - batch_gt_box0 = Tensor.from_numpy(data['gt_box1']) - batch_gt_box1 = Tensor.from_numpy(data['gt_box2']) - batch_gt_box2 = Tensor.from_numpy(data['gt_box3']) - input_shape = Tensor(tuple(input_shape[::-1]), ms.float32) - loss = network(images, batch_y_true_0, batch_y_true_1, batch_y_true_2, batch_gt_box0, batch_gt_box1, - batch_gt_box2, input_shape) - loss_meter.update(loss.asnumpy()) - - if args.rank_save_ckpt_flag: - # ckpt progress - cb_params.cur_step_num = i + 1 # current step number - cb_params.batch_num = i + 2 - ckpt_cb.step_end(run_context) - - if i % args.log_interval == 0: - time_used = time.time() - t_end - epoch = int(i / args.steps_per_epoch) - fps = args.per_batch_size * (i - old_progress) * args.group_size / time_used - if args.rank == 0: - args.logger.info('epoch[{}], iter[{}], {}, fps:{:.2f} imgs/sec, ' - 'lr:{}'.format(epoch, i, loss_meter, fps, lr[i])) - t_end = time.time() - loss_meter.reset() - old_progress = i - - if (i + 1) % args.steps_per_epoch == 0 and args.rank_save_ckpt_flag: - cb_params.cur_epoch_num += 1 - -if args.is_modelArts: - mox.file.copy_parallel(src_url='/cache/outputs/', dst_url='obs://hit-cyf/yolov5_npu/outputs/') -args.logger.info('==========end training===============') 
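+
+# The ModelArts pre-process below lets all devices on a server share one dataset
+# archive: only the device whose local id is 0 performs the slow unzip, then
+# creates a lock file that the remaining devices poll before proceeding. A
+# minimal sketch of the pattern (names illustrative, not the real API):
+#
+#     if get_device_id() % min(get_device_num(), 8) == 0:   # one worker per server
+#         unzip(archive, save_dir)                          # slow, done exactly once
+#         open("/tmp/unzip_sync.lock", "w").close()         # signal completion
+#     while not os.path.exists("/tmp/unzip_sync.lock"):
+#         time.sleep(1)                                     # everyone else waits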
+def modelarts_pre_process():
+    '''ModelArts pre-process: unzip the dataset once per server and fix up paths.'''
+    def unzip(zip_file, save_dir):
+        import zipfile
+        s_time = time.time()
+        if not os.path.exists(os.path.join(save_dir, config.modelarts_dataset_unzip_name)):
+            zip_isexist = zipfile.is_zipfile(zip_file)
+            if zip_isexist:
+                fz = zipfile.ZipFile(zip_file, 'r')
+                data_num = len(fz.namelist())
+                print("Extract Start...")
+                print("unzip file num: {}".format(data_num))
+                data_print = int(data_num / 100) if data_num > 100 else 1
+                i = 0
+                for file in fz.namelist():
+                    if i % data_print == 0:
+                        print("unzip percent: {}%".format(int(i * 100 / data_num)), flush=True)
+                    i += 1
+                    fz.extract(file, save_dir)
+                print("cost time: {}min:{}s.".format(int((time.time() - s_time) / 60),
+                                                     int(int(time.time() - s_time) % 60)))
+                print("Extract Done.")
+            else:
+                print("This is not a zip file.")
+        else:
+            print("Zip file has already been extracted.")
+
+    if config.need_modelarts_dataset_unzip:
+        zip_file_1 = os.path.join(config.data_path, config.modelarts_dataset_unzip_name + ".zip")
+        save_dir_1 = config.data_path
+
+        sync_lock = "/tmp/unzip_sync.lock"
+
+        # Each server contains at most 8 devices.
+        if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
+            print("Zip file path: ", zip_file_1)
+            print("Unzip file save dir: ", save_dir_1)
+            unzip(zip_file_1, save_dir_1)
+            print("===Finish extract data synchronization===")
+            try:
+                os.mknod(sync_lock)
+            except IOError:
+                pass
+
+        while True:
+            if os.path.exists(sync_lock):
+                break
+            time.sleep(1)
+
+        print("Device: {}, Finish sync unzip data from {} to {}.".format(get_device_id(), zip_file_1, save_dir_1))
+
+    config.ckpt_path = os.path.join(config.output_path, config.ckpt_path)
+
+@moxing_wrapper(pre_process=modelarts_pre_process)
+def run_train():
+    set_default()
+    loss_meter = AverageMeter('loss')
+
+    if config.is_modelArts:
+        import moxing as mox
+        local_data_url = os.path.join(config.data_path, str(config.rank))
+        local_annFile = os.path.join(config.data_path, str(config.rank))
+        mox.file.copy_parallel(config.data_root, local_data_url)
+        config.data_root = local_data_url
+
+        mox.file.copy_parallel(config.annFile, local_annFile)
+        config.annFile = os.path.join(local_data_url, 'instances_train2017.json')
+
+    context.reset_auto_parallel_context()
+    parallel_mode = ParallelMode.STAND_ALONE
+    degree = 1
+    if config.is_distributed:
+        parallel_mode = ParallelMode.DATA_PARALLEL
+        degree = get_group_size()
+    context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=degree)
+
+    dict_version = {'yolov5s': 0, 'yolov5m': 1, 'yolov5l': 2, 'yolov5x': 3}
+    network = YOLOV5(is_training=True, version=dict_version[config.yolov5_version])
+    # default weight init is kaiming-normal
+    default_recurisive_init(network)
+    load_yolov5_params(config, network)
+
+    network = YoloWithLossCell(network)
+
+    # label_smooth, label_smooth_factor and resize_rate are read directly from
+    # the unified config, so no extra copying is needed here.
+    if config.training_shape:
+        config.multi_scale = [convert_training_shape(config.training_shape)]
+
+    ds, data_size = create_yolo_dataset(image_dir=config.data_root, anno_path=config.annFile, is_training=True,
+                                        batch_size=config.per_batch_size, max_epoch=config.max_epoch,
+                                        device_num=config.group_size, rank=config.rank, config=config)
+
+    config.logger.info('Finish loading dataset')
+
+    config.steps_per_epoch = int(data_size / config.per_batch_size / config.group_size)
+
+    if config.ckpt_interval <= 0:
+        config.ckpt_interval = config.steps_per_epoch
+
+    lr = get_lr(config)
+
+    opt = Momentum(params=get_param_groups(network), momentum=config.momentum, learning_rate=Tensor(lr),
+                   weight_decay=config.weight_decay, loss_scale=config.loss_scale)
+
+    network = TrainingWrapper(network, opt, config.loss_scale // 2)
+    network.set_train()
+
+    if config.rank_save_ckpt_flag:
+        # checkpoint save
+        ckpt_max_num = config.max_epoch * config.steps_per_epoch // config.ckpt_interval
+        ckpt_config = CheckpointConfig(save_checkpoint_steps=config.ckpt_interval, keep_checkpoint_max=1)
+        save_ckpt_path = os.path.join(config.outputs_dir, 'ckpt_' + str(config.rank) + '/')
+        ckpt_cb = ModelCheckpoint(config=ckpt_config, directory=save_ckpt_path, prefix='{}'.format(config.rank))
+        cb_params = _InternalCallbackParam()
+        cb_params.train_network = network
+        cb_params.epoch_num = ckpt_max_num
+        cb_params.cur_epoch_num = 1
+        run_context = RunContext(cb_params)
+        ckpt_cb.begin(run_context)
+
+    old_progress = -1
+    t_end = time.time()
+    data_loader = ds.create_dict_iterator(output_numpy=True, num_epochs=1)
+
+    for i, data in enumerate(data_loader):
+        images = data["image"]
+        input_shape = images.shape[2:4]
+        images = Tensor.from_numpy(images)
+        batch_y_true_0 = Tensor.from_numpy(data['bbox1'])
+        batch_y_true_1 = Tensor.from_numpy(data['bbox2'])
+        batch_y_true_2 = Tensor.from_numpy(data['bbox3'])
+        batch_gt_box0 = Tensor.from_numpy(data['gt_box1'])
+        batch_gt_box1 = Tensor.from_numpy(data['gt_box2'])
+        batch_gt_box2 = Tensor.from_numpy(data['gt_box3'])
+        input_shape = Tensor(tuple(input_shape[::-1]), ms.float32)
+        loss = network(images, batch_y_true_0, batch_y_true_1, batch_y_true_2, batch_gt_box0, batch_gt_box1,
+                       batch_gt_box2, input_shape)
+        loss_meter.update(loss.asnumpy())
+
+        if config.rank_save_ckpt_flag:
+            # checkpoint progress
+            cb_params.cur_step_num = i + 1  # current step number
+            cb_params.batch_num = i + 2
+            ckpt_cb.step_end(run_context)
+
+        if i % config.log_interval == 0:
+            time_used = time.time() - t_end
+            epoch = int(i / config.steps_per_epoch)
+            # average throughput over the last log_interval steps
+            fps = config.per_batch_size * (i - old_progress) * config.group_size / time_used
+            if config.rank == 0:
+                config.logger.info('epoch[{}], iter[{}], {}, fps:{:.2f} imgs/sec, '
+                                   'lr:{}'.format(epoch, i, loss_meter, fps, lr[i]))
+            t_end = time.time()
+            loss_meter.reset()
+            old_progress = i
+
+        if (i + 1) % config.steps_per_epoch == 0 and config.rank_save_ckpt_flag:
+            cb_params.cur_epoch_num += 1
+
+    if config.is_modelArts:
+        # NOTE: the OBS destination below is hard-coded; replace it with your own
+        # bucket before training on ModelArts.
+        mox.file.copy_parallel(src_url='/cache/outputs/', dst_url='obs://hit-cyf/yolov5_npu/outputs/')
+    config.logger.info('==========end training===============')
+
+if __name__ == "__main__":
+    run_train()
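+
+# For reference, run_train() is wrapped by moxing_wrapper(pre_process=modelarts_pre_process),
+# presumably defined in model_utils/moxing_adapter.py. Conceptually the decorator behaves
+# roughly like the hypothetical sketch below (the real implementation also syncs the
+# dataset and outputs with OBS around the training run):
+#
+#     def moxing_wrapper(pre_process=None):
+#         def decorator(run_func):
+#             def wrapped(*args, **kwargs):
+#                 if pre_process:
+#                     pre_process()        # e.g. unzip the dataset once per server
+#                 return run_func(*args, **kwargs)
+#             return wrapped
+#         return decorator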