diff --git a/official/cv/retinaface_resnet50/README.md b/official/cv/retinaface_resnet50/README.md
index e37ddade4f2194d9254f2192794f67b7f2817833..e9d98bab920ff9229101132361146421d18827c0 100644
--- a/official/cv/retinaface_resnet50/README.md
+++ b/official/cv/retinaface_resnet50/README.md
@@ -81,7 +81,7 @@ After installing MindSpore via the official website and download the dataset, yo
   # run evaluation example
   export CUDA_VISIBLE_DEVICES=0
-  python eval.py > eval.log 2>&1 &
+  python eval.py > eval.log 2>&1 &
   OR
   bash run_standalone_gpu_eval.sh 0
   ```

diff --git a/official/cv/retinaface_resnet50/scripts/run_distribute_gpu_train.sh b/official/cv/retinaface_resnet50/scripts/run_distribute_gpu_train.sh
index d34dada3e79f6d55284bbfb3f25ae880c1c03908..c8427bb47b0f11c742ef7d0cf8664e37bf846e22 100644
--- a/official/cv/retinaface_resnet50/scripts/run_distribute_gpu_train.sh
+++ b/official/cv/retinaface_resnet50/scripts/run_distribute_gpu_train.sh
@@ -24,4 +24,4 @@
 RANK_SIZE=$1
 export CUDA_VISIBLE_DEVICES="$2"
 mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
-    python train.py > train.log 2>&1 &
+    python train.py --is_distributed > train.log 2>&1 &
diff --git a/official/cv/retinaface_resnet50/src/dataset.py b/official/cv/retinaface_resnet50/src/dataset.py
index ec80e2b3da9363cd32678610343d09bf3e1fe814..18a3126b411d393387a93e1fc05eb5f78f8ed92e 100644
--- a/official/cv/retinaface_resnet50/src/dataset.py
+++ b/official/cv/retinaface_resnet50/src/dataset.py
@@ -109,12 +109,18 @@ def read_dataset(img_path, annotation):
     return img, target


-def create_dataset(data_dir, cfg, batch_size=32, repeat_num=1, shuffle=True, multiprocessing=True, num_worker=4):
+def create_dataset(data_dir, cfg, batch_size=32, repeat_num=1, shuffle=True, multiprocessing=True, num_worker=4,
+                   is_distribute=False):
     dataset = WiderFace(data_dir)

-    init("nccl")
-    rank_id = get_rank()
-    device_num = get_group_size()
+    if is_distribute:
+        init("nccl")
+        rank_id = get_rank()
+        device_num = get_group_size()
+    else:
+        rank_id = 0
+        device_num = 1
+
     if device_num == 1:
         de_dataset = de.GeneratorDataset(dataset, ["image", "annotation"],
                                          shuffle=shuffle,
diff --git a/official/cv/retinaface_resnet50/train.py b/official/cv/retinaface_resnet50/train.py
index 6552effa707d11f33457ff6ecc2cfb496d8db4c3..c3c04e1c4a4cbdce37f436eec8b655ffaa2d4690 100644
--- a/official/cv/retinaface_resnet50/train.py
+++ b/official/cv/retinaface_resnet50/train.py
@@ -15,6 +15,7 @@
 """Train Retinaface_resnet50."""
 from __future__ import print_function
 import math
+import argparse

 import mindspore
 from mindspore import context
@@ -30,19 +31,17 @@ from src.loss import MultiBoxLoss
 from src.dataset import create_dataset
 from src.lr_schedule import adjust_learning_rate


-def train(cfg):
+def train(cfg, args):
     context.set_context(mode=context.GRAPH_MODE, device_target='GPU', save_graphs=False)
     if context.get_context("device_target") == "GPU":
         # Enable graph kernel
         context.set_context(enable_graph_kernel=True, graph_kernel_flags="--enable_parallel_fusion")
-    if cfg['ngpu'] > 1:
+    if args.is_distributed:
         init("nccl")
         context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL,
                                           gradients_mean=True)
         cfg['ckpt_path'] = cfg['ckpt_path'] + "ckpt_" + str(get_rank()) + "/"
-    else:
-        raise ValueError('cfg_num_gpu <= 1')

     batch_size = cfg['batch_size']
     max_epoch = cfg['epoch']
@@ -56,7 +55,8 @@
     negative_ratio = 7
     stepvalues = (cfg['decay1'], cfg['decay2'])

-    ds_train = create_dataset(training_dataset, cfg, batch_size, multiprocessing=True, num_worker=cfg['num_workers'])
+    ds_train = create_dataset(training_dataset, cfg, batch_size, multiprocessing=True, num_worker=cfg['num_workers'],
+                              is_distribute=args.is_distributed)

     print('dataset size is : \n', ds_train.get_dataset_size())
     steps_per_epoch = math.ceil(ds_train.get_dataset_size())
@@ -110,9 +110,12 @@


 if __name__ == '__main__':
+    parser = argparse.ArgumentParser('MindSpore RetinaFace training')
+    parser.add_argument('--is_distributed', action='store_true', help='distributed training')
+    arg, _ = parser.parse_known_args()
     config = cfg_res50

     mindspore.common.seed.set_seed(config['seed'])
     print('train config:\n', config)
-    train(cfg=config)
+    train(cfg=config, args=arg)
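Usage sketch (illustrative, not part of the patch): with these changes, standalone training no longer initializes NCCL, and distributed training opts in through the new --is_distributed flag. The device count and GPU IDs below are example values.

    # standalone: --is_distributed is absent, so create_dataset falls back to rank_id=0, device_num=1
    python train.py > train.log 2>&1 &

    # distributed: the updated script passes --is_distributed; $1 is RANK_SIZE, $2 is CUDA_VISIBLE_DEVICES
    bash scripts/run_distribute_gpu_train.sh 8 0,1,2,3,4,5,6,7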