From 70bb1e9d89c7af3a8d45a6d992f2ecd2d6519aa3 Mon Sep 17 00:00:00 2001 From: maijianqiang <maijianqiang1@huawei.com> Date: Thu, 16 Sep 2021 16:49:00 +0800 Subject: [PATCH] add network run demo --- official/cv/FCN8s/README.md | 2 +- official/cv/MCNN/README.md | 12 ++--- official/cv/MCNN/requirements.txt | 3 ++ official/cv/alexnet/README.md | 2 +- official/cv/alexnet/README_CN.md | 2 +- official/cv/cnn_direction_model/README.md | 2 +- official/cv/cnnctc/README.md | 4 +- official/cv/cnnctc/README_CN.md | 4 +- official/cv/ctpn/README.md | 5 +- official/cv/ctpn/scripts/run_infer_310.sh | 2 +- official/cv/deeplabv3/README.md | 12 +++-- official/cv/deeplabv3/README_CN.md | 9 ++-- official/cv/deeplabv3plus/README_CN.md | 9 ++-- official/cv/densenet/README.md | 13 +++--- official/cv/densenet/README_CN.md | 6 +-- official/cv/dpn/README.md | 2 +- official/cv/east/README.md | 4 +- official/cv/googlenet/README.md | 4 +- official/cv/googlenet/README_CN.md | 4 +- official/cv/inceptionv3/README.md | 2 +- official/cv/inceptionv3/README_CN.md | 4 +- official/cv/inceptionv4/README.md | 4 +- official/cv/maskrcnn_mobilenetv1/README.md | 6 +-- official/cv/mobilenetv1/README.md | 4 +- official/cv/mobilenetv2/README.md | 6 +-- official/cv/mobilenetv2/README_CN.md | 6 +-- official/cv/mobilenetv2_quant/README_CN.md | 16 +++---- official/cv/mobilenetv2_quant/Readme.md | 14 +++--- official/cv/openpose/README.md | 30 ++++++------ official/cv/retinanet/README_CN.md | 2 +- official/cv/shufflenetv1/README_CN.md | 19 ++++---- official/cv/simclr/README.md | 12 +++-- official/cv/simple_pose/README.md | 54 ++++++++++++---------- official/cv/squeezenet/README.md | 48 +++++++++++-------- 34 files changed, 179 insertions(+), 149 deletions(-) create mode 100644 official/cv/MCNN/requirements.txt diff --git a/official/cv/FCN8s/README.md b/official/cv/FCN8s/README.md index 7abe6d9cd..741d6ef26 100644 --- a/official/cv/FCN8s/README.md +++ b/official/cv/FCN8s/README.md @@ -225,7 +225,7 @@ ckpt_file: 
/home/FCN8s/ckpt/FCN8s_1-133_300.ckpt #Ascend八卡并行训练 bash scripts/run_train.sh [DEVICE_NUM] rank_table.json - # example: bash scripts/run_train.sh 8 /home/hccl_8p_01234567_10.155.170.71.json + # example: bash scripts/run_train.sh 8 ~/hccl_8p.json ``` 分布式训练需要提前创建JSON格式的HCCL配置文件,请遵循[链接说明](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools) diff --git a/official/cv/MCNN/README.md b/official/cv/MCNN/README.md index f7e7d94c0..de835793a 100644 --- a/official/cv/MCNN/README.md +++ b/official/cv/MCNN/README.md @@ -71,9 +71,9 @@ After installing MindSpore via the official website, you can start training and ```bash # enter script dir, train MCNN example -sh run_standalone_train_ascend.sh 0 ./formatted_trainval/shanghaitech_part_A_patches_9/train ./formatted_trainval/shanghaitech_part_A_patches_9/train_den ./formatted_trainval/shanghaitech_part_A_patches_9/val ./formatted_trainval/shanghaitech_part_A_patches_9/val_den ./ckpt +bash run_standalone_train_ascend.sh 0 ./formatted_trainval/shanghaitech_part_A_patches_9/train ./formatted_trainval/shanghaitech_part_A_patches_9/train_den ./formatted_trainval/shanghaitech_part_A_patches_9/val ./formatted_trainval/shanghaitech_part_A_patches_9/val_den ./ckpt # enter script dir, evaluate MCNN example -sh run_standalone_eval_ascend.sh 0 ./original/shanghaitech/part_A_final/test_data/images ./original/shanghaitech/part_A_final/test_data/ground_truth_csv ./train/ckpt/best.ckpt +bash run_standalone_eval_ascend.sh 0 ./original/shanghaitech/part_A_final/test_data/images ./original/shanghaitech/part_A_final/test_data/ground_truth_csv ./train/ckpt/best.ckpt ``` # [Script Description](#contents) @@ -126,14 +126,14 @@ Major parameters in train.py and config.py as follows: ```bash # enter script dir, and run the distribute script - sh run_distribute_train.sh ./hccl_table.json ./formatted_trainval/shanghaitech_part_A_patches_9/train ./formatted_trainval/shanghaitech_part_A_patches_9/train_den 
./formatted_trainval/shanghaitech_part_A_patches_9/val ./formatted_trainval/shanghaitech_part_A_patches_9/val_den ./ckpt + bash run_distribute_train.sh ~/hccl_8p.json ./formatted_trainval/shanghaitech_part_A_patches_9/train ./formatted_trainval/shanghaitech_part_A_patches_9/train_den ./formatted_trainval/shanghaitech_part_A_patches_9/val ./formatted_trainval/shanghaitech_part_A_patches_9/val_den ./ckpt # enter script dir, and run the standalone script - sh run_standalone_train_ascend.sh 0 ./formatted_trainval/shanghaitech_part_A_patches_9/train ./formatted_trainval/shanghaitech_part_A_patches_9/train_den ./formatted_trainval/shanghaitech_part_A_patches_9/val ./formatted_trainval/shanghaitech_part_A_patches_9/val_den ./ckpt + bash run_standalone_train_ascend.sh 0 ./formatted_trainval/shanghaitech_part_A_patches_9/train ./formatted_trainval/shanghaitech_part_A_patches_9/train_den ./formatted_trainval/shanghaitech_part_A_patches_9/val ./formatted_trainval/shanghaitech_part_A_patches_9/val_den ./ckpt ``` After training, the loss value will be achieved as follows: - ```text + ```log # grep "loss is " log epoch: 1 step: 305, loss is 0.00041025918 epoch: 2 step: 305, loss is 3.7117527e-05 @@ -161,7 +161,7 @@ Before running the command below, please check the checkpoint path used for eval You can view the results through the file "eval_log". 
The accuracy of the test dataset will be as follows: - ```text + ```log # grep "MAE: " eval_log MAE: 105.87984801910736 MSE: 161.6687899899305 ``` diff --git a/official/cv/MCNN/requirements.txt b/official/cv/MCNN/requirements.txt new file mode 100644 index 000000000..4d2786989 --- /dev/null +++ b/official/cv/MCNN/requirements.txt @@ -0,0 +1,3 @@ +numpy +pandas +opencv-python \ No newline at end of file diff --git a/official/cv/alexnet/README.md b/official/cv/alexnet/README.md index 1ab7d4b7f..6e22c95c4 100644 --- a/official/cv/alexnet/README.md +++ b/official/cv/alexnet/README.md @@ -46,7 +46,7 @@ Dataset used: [CIFAR-10](<http://www.cs.toronto.edu/~kriz/cifar.html>) - Note：Data will be processed in dataset.py - Download the dataset, the directory structure is as follows: -```bash +```cifar10 ├─cifar-10-batches-bin │ └─cifar-10-verify-bin diff --git a/official/cv/alexnet/README_CN.md b/official/cv/alexnet/README_CN.md index 3ab37b04f..5b9ad7a73 100644 --- a/official/cv/alexnet/README_CN.md +++ b/official/cv/alexnet/README_CN.md @@ -48,7 +48,7 @@ AlexNet由5个卷积层和3个全连接层组成。多个卷积核用于提取 - 注意：数据在dataset.py中处理。 - 下载数据集。目录结构如下： -```bash +```cifar10 ├─cifar-10-batches-bin │ └─cifar-10-verify-bin diff --git a/official/cv/cnn_direction_model/README.md b/official/cv/cnn_direction_model/README.md index 338527e7b..aa2241558 100644 --- a/official/cv/cnn_direction_model/README.md +++ b/official/cv/cnn_direction_model/README.md @@ -179,7 +179,7 @@ bash scripts/run_distribute_train_ascend.sh [rank_table] [train_dataset_path] [P For example, you can run the shell command below to launch the training procedure. 
```shell -bash run_distribute_train_ascend.sh /home/hccl_8p_01234567_10.155.170.71.json /home/DataSet/FSNS/train/ +bash run_distribute_train_ascend.sh ~/hccl_8p.json /home/DataSet/FSNS/train/ ``` - running on ModelArts diff --git a/official/cv/cnnctc/README.md b/official/cv/cnnctc/README.md index 74d354f95..80671a1a1 100644 --- a/official/cv/cnnctc/README.md +++ b/official/cv/cnnctc/README.md @@ -147,7 +147,7 @@ bash scripts/run_standalone_train_gpu.sh $PRETRAINED_CKPT(options) ```bash bash scripts/run_distribute_train_ascend.sh $RANK_TABLE_FILE $PRETRAINED_CKPT(options) -# example: bash scripts/run_distribute_train_ascend.sh /home/hccl_8p_01234567_10.155.170.71.json +# example: bash scripts/run_distribute_train_ascend.sh ~/hccl_8p.json ``` - Distributed GPU Training: @@ -254,7 +254,7 @@ Results and checkpoints are written to `./train` folder. Log can be found in `./ ```bash bash scripts/run_distribute_train_ascend.sh [RANK_TABLE_FILE] [PRETRAINED_CKPT(options)] -# example: bash scripts/run_distribute_train_ascend.sh /home/hccl_8p_01234567_10.155.170.71.json +# example: bash scripts/run_distribute_train_ascend.sh ~/hccl_8p.json ``` For distributed training, a hccl configuration file with JSON format needs to be created in advance. 
diff --git a/official/cv/cnnctc/README_CN.md b/official/cv/cnnctc/README_CN.md index 9f081d577..a40fd1c09 100644 --- a/official/cv/cnnctc/README_CN.md +++ b/official/cv/cnnctc/README_CN.md @@ -144,7 +144,7 @@ bash scripts/run_standalone_train_ascend.sh [DEVICE_ID] [PRETRAINED_CKPT(options ```shell bash scripts/run_distribute_train_ascend.sh [RANK_TABLE_FILE] [PRETRAINED_CKPT(options)] -# example: bash scripts/run_distribute_train_ascend.sh /home/hccl_8p_01234567_10.155.170.71.json +# example: bash scripts/run_distribute_train_ascend.sh ~/hccl_8p.json ``` - 璇勪及锛� @@ -239,7 +239,7 @@ bash scripts/run_standalone_train_ascend.sh [DEVICE_ID] [PRETRAINED_CKPT(options ```shell bash scripts/run_distribute_train_ascend.sh [RANK_TABLE_FILE] [PRETRAINED_CKPT(options)] -# example: bash scripts/run_distribute_train_ascend.sh /home/hccl_8p_01234567_10.155.170.71.json +# example: bash scripts/run_distribute_train_ascend.sh ~/hccl_8p.json ``` 缁撴灉鍜屾鏌ョ偣鍒嗗埆鍐欏叆璁惧`i`鐨刞./train_parallel_{i}`鏂囦欢澶广€� diff --git a/official/cv/ctpn/README.md b/official/cv/ctpn/README.md index 13f5d2a06..26b30d481 100644 --- a/official/cv/ctpn/README.md +++ b/official/cv/ctpn/README.md @@ -171,8 +171,7 @@ Modify the parameters according to the actual path ```bash # distribute training bash scripts/run_distribute_train_ascend.sh [RANK_TABLE_FILE] [TASK_TYPE] [PRETRAINED_PATH] -# example: bash scripts/run_distribute_train_ascend.sh /home/hccl_8p_01234567_10.155.170.71.json Pretraining(or Finetune) \ -# /home/DataSet/ctpn_dataset/backbone/0-150_5004.ckpt +# example: bash scripts/run_distribute_train_ascend.sh ~/hccl_8p.json Pretraining(or Finetune) /home/DataSet/ctpn_dataset/backbone/0-150_5004.ckpt # standalone training bash scrpits/run_standalone_train_ascend.sh [TASK_TYPE] [PRETRAINED_PATH] [DEVICE_ID] @@ -226,7 +225,7 @@ ICDAR2013, SCUT-FORU to improve precision and recall, and when doing Finetune, w Ascend: # distribute training example(8p) bash run_distribute_train_ascend.sh [RANK_TABLE_FILE] [TASK_TYPE] 
[PRETRAINED_PATH] - # example: bash scripts/run_distribute_train_ascend.sh /home/hccl_8p_01234567_10.155.170.71.json Pretraining(or Finetune) /home/DataSet/ctpn_dataset/backbone/0-150_5004.ckpt + # example: bash scripts/run_distribute_train_ascend.sh ~/hccl_8p.json Pretraining(or Finetune) /home/DataSet/ctpn_dataset/backbone/0-150_5004.ckpt # standalone training bash run_standalone_train_ascend.sh [TASK_TYPE] [PRETRAINED_PATH] diff --git a/official/cv/ctpn/scripts/run_infer_310.sh b/official/cv/ctpn/scripts/run_infer_310.sh index f7117631b..4f8fcbfe5 100644 --- a/official/cv/ctpn/scripts/run_infer_310.sh +++ b/official/cv/ctpn/scripts/run_infer_310.sh @@ -104,7 +104,7 @@ function cal_acc() fi mkdir output mkdir output_img - python ../postprocess.py --dataset_path=$data_path --result_path=result_Files --label_path=$label_path &> acc.log + python ../postprocess.py --export_dataset_path=$data_path --result_path=result_Files --label_path=$label_path &> acc.log if [ $? -ne 0 ]; then echo "calculate accuracy failed" exit 1 diff --git a/official/cv/deeplabv3/README.md b/official/cv/deeplabv3/README.md index 5a3edb0e7..0d0688422 100644 --- a/official/cv/deeplabv3/README.md +++ b/official/cv/deeplabv3/README.md @@ -113,7 +113,7 @@ After installing MindSpore via the official website, you can start training and - Prepare backbone -Download resnet101 for here(https://download.mindspore.cn/model_zoo/r1.2/resnet101_ascend_v120_imagenet2012_official_cv_bs32_acc78/resnet101_ascend_v120_imagenet2012_official_cv_bs32_acc78.ckpt). +Download resnet101 from [here](https://download.mindspore.cn/model_zoo/r1.2/resnet101_ascend_v120_imagenet2012_official_cv_bs32_acc78/resnet101_ascend_v120_imagenet2012_official_cv_bs32_acc78.ckpt). 
- Running on Ascend @@ -140,7 +140,7 @@ Enter the shell script to modify the data_file and ckpt_pre_trained parameters data_file=/home/DataSet/VOC2012/vocaug_mindrecords/vocaug.mindrecord0 ckpt_pre_trained=/home/model/deeplabv3/predtrained/resnet101_ascend_v120_imagenet2012_official_cv_bs32_acc78.ckpt -bash run_distribute_train_s16_r1.sh +bash run_distribute_train_s16_r1.sh ~/hccl_8p.json ``` 2. Train s8 with vocaug dataset, finetuning from model in previous step, training script is: @@ -151,7 +151,7 @@ Enter the shell script to modify the data_file and ckpt_pre_trained parameters data_file=/home/DataSet/VOC2012/vocaug_mindrecords/vocaug.mindrecord0 ckpt_pre_trained=/home/model/deeplabv3/predtrained/resnet101_ascend_v120_imagenet2012_official_cv_bs32_acc78.ckpt -bash run_distribute_train_s8_r1.sh +bash run_distribute_train_s8_r1.sh ~/hccl_8p.json ``` 3. Train s8 with voctrain dataset, finetuning from model in previous step, training script is: @@ -164,17 +164,19 @@ data_file=/home/DataSet/VOC2012/voctrain_mindrecords/votrain.mindrecord0 ckpt_pre_trained=/home/model/deeplabv3/ckpt/deeplabv3-800_330.ckpt -bash run_distribute_train_s8_r2.sh +bash run_distribute_train_s8_r2.sh ~/hccl_8p.json ``` - For evaluation, evaluating steps are as follows: 1. Enter the shell script to modify the data_file and ckpt_pre_trained parameters -```default_config.yaml +```shell +# modify the parameters according to the local path # example: data_root=/home/DataSet/VOC2012 data_lst=/home/DataSet/VOC2012/voc_val_lst.txt +ckpt_path=/home/model/deeplabv3/ckpt/deeplabv3-800_330.ckpt ``` 2. 
Eval s16 with voc val dataset, eval script is: diff --git a/official/cv/deeplabv3/README_CN.md b/official/cv/deeplabv3/README_CN.md index da2f68ade..51ef16a69 100644 --- a/official/cv/deeplabv3/README_CN.md +++ b/official/cv/deeplabv3/README_CN.md @@ -144,7 +144,7 @@ bash run_standalone_train.sh data_file=/home/DataSet/VOC2012/vocaug_mindrecords/vocaug.mindrecord0 ckpt_pre_trained=/home/model/deeplabv3/predtrained/resnet101_ascend_v120_imagenet2012_official_cv_bs32_acc78.ckpt -bash run_distribute_train_s16_r1.sh +bash run_distribute_train_s16_r1.sh ~/hccl_8p.json ``` 2. 浣跨敤VOCaug鏁版嵁闆嗚缁僺8锛屽井璋冧笂涓€姝ョ殑妯″瀷銆傝剼鏈涓嬶細 @@ -155,7 +155,7 @@ bash run_distribute_train_s16_r1.sh data_file=/home/DataSet/VOC2012/vocaug_mindrecords/vocaug.mindrecord0 ckpt_pre_trained=/home/model/deeplabv3/predtrained/resnet101_ascend_v120_imagenet2012_official_cv_bs32_acc78.ckpt -bash run_distribute_train_s8_r1.sh +bash run_distribute_train_s8_r1.sh ~/hccl_8p.json ``` 3. 浣跨敤VOCtrain鏁版嵁闆嗚缁僺8锛屽井璋冧笂涓€姝ョ殑妯″瀷銆傝剼鏈涓嬶細 @@ -167,17 +167,18 @@ bash run_distribute_train_s8_r1.sh data_file=/home/DataSet/VOC2012/voctrain_mindrecords/votrain.mindrecord0 ckpt_pre_trained=/home/model/deeplabv3/ckpt/deeplabv3-800_330.ckpt -bash run_distribute_train_s8_r2.sh +bash run_distribute_train_s8_r2.sh ~/hccl_8p.json ``` - 璇勪及姝ラ濡備笅锛� 1. 杩涘叆瀵瑰簲鐨剆hell鑴氭湰淇敼鍙傛暟 -```default_config.yaml +```shell # example: data_root=/home/DataSet/VOC2012 data_lst=/home/DataSet/VOC2012/voc_val_lst.txt +ckpt_path=/home/model/deeplabv3/ckpt/deeplabv3-800_330.ckpt ``` 2. 
浣跨敤voc val鏁版嵁闆嗚瘎浼皊16銆傝瘎浼拌剼鏈涓嬶細 diff --git a/official/cv/deeplabv3plus/README_CN.md b/official/cv/deeplabv3plus/README_CN.md index 1ed6221bd..755dc7282 100644 --- a/official/cv/deeplabv3plus/README_CN.md +++ b/official/cv/deeplabv3plus/README_CN.md @@ -134,7 +134,7 @@ bash run_alone_train.sh data_file=/home/DataSet/VOC2012/vocaug_mindrecords/vocaug.mindrecord0 ckpt_pre_trained=/home/model/deeplabv3/predtrained/resnet101_ascend_v120_imagenet2012_official_cv_bs32_acc78.ckpt -bash run_distribute_train_s16_r1.sh +bash run_distribute_train_s16_r1.sh ~/hccl_8p.json ``` 2.浣跨敤VOCaug鏁版嵁闆嗚缁僺8锛屽井璋冧笂涓€姝ョ殑妯″瀷銆傝剼鏈涓嬶細 @@ -145,7 +145,7 @@ bash run_distribute_train_s16_r1.sh data_file=/home/DataSet/VOC2012/vocaug_mindrecords/vocaug.mindrecord0 ckpt_pre_trained=/home/model/deeplabv3/predtrained/resnet101_ascend_v120_imagenet2012_official_cv_bs32_acc78.ckpt -bash run_distribute_train_s8_r1.sh +bash run_distribute_train_s8_r1.sh ~/hccl_8p.json ``` 3.浣跨敤VOCtrain鏁版嵁闆嗚缁僺8锛屽井璋冧笂涓€姝ョ殑妯″瀷銆傝剼鏈涓嬶細 @@ -157,17 +157,18 @@ bash run_distribute_train_s8_r1.sh data_file=/home/DataSet/VOC2012/voctrain_mindrecords/votrain.mindrecord0 ckpt_pre_trained=/home/model/deeplabv3/ckpt/deeplabv3-800_330.ckpt -run_distribute_train_s8_r2.sh +bash run_distribute_train_s8_r2.sh ~/hccl_8p.json ``` 璇勪及姝ラ濡備笅锛� 1. 
杩涘叆瀵瑰簲鐨剆hell鑴氭湰淇敼鍙傛暟 -```default_cofig.yaml +```shell # example: data_root=/home/DataSet/VOC2012 data_lst=/home/DataSet/VOC2012/voc_val_lst.txt +ckpt_path=/home/model/deeplabv3/ckpt/deeplabv3-800_330.ckpt ``` 2.浣跨敤voc val鏁版嵁闆嗚瘎浼皊16銆傝瘎浼拌剼鏈涓嬶細 diff --git a/official/cv/densenet/README.md b/official/cv/densenet/README.md index cdea0a65b..4e5b359b4 100644 --- a/official/cv/densenet/README.md +++ b/official/cv/densenet/README.md @@ -102,13 +102,13 @@ After installing MindSpore via the official website, you can start training and # run distributed training example bash scripts/run_distribute_train.sh [DEVICE_NUM] [RANK_TABLE_FILE] [NET_NAME] [DATASET_NAME] [TRAIN_DATA_DIR] - # example bash scripts/run_distribute_train.sh 8 /root/hccl_8p_01234567_10.155.170.71.json densenet121 imagenet /home/DataSet/ImageNet_Original/train/ + # example bash scripts/run_distribute_train.sh 8 ~/hccl_8p.json densenet121 imagenet /home/DataSet/ImageNet_Original/train/ # run evaluation example python eval.py --net [NET_NAME] --dataset [DATASET_NAME] --eval_data_dir /PATH/TO/DATASET --ckpt_files /PATH/TO/CHECKPOINT > eval.log 2>&1 & OR bash scripts/run_distribute_eval.sh [DEVICE_NUM] [RANDK_TABLE_FILE] [NET_NAME] [DATASET_NAME] [EVAL_DATA_DIR][CKPT_PATH] - # example: bash script/run_distribute_eval.sh 8 /root/hccl_8p_01234567_10.155.170.71.json densenet121 imagenet /home/DataSet/ImageNet_Original/train/validation_preprocess/ /home/model/densenet/ckpt/0-120_500.ckpt + # example: bash script/run_distribute_eval.sh 8 ~/hccl_8p.json densenet121 imagenet /home/DataSet/ImageNet_Original/train/validation_preprocess/ /home/model/densenet/ckpt/0-120_500.ckpt ``` For distributed training, a hccl configuration file with JSON format needs to be created in advance. 
@@ -293,7 +293,7 @@ You can modify the training behaviour through the various flags in the `densenet 2020-08-22 17:02:19,921:INFO:local passed 2020-08-22 17:05:43,112:INFO:epoch[2], iter[15011], loss:3.096, mean_fps:6304.53 imgs/sec 2020-08-22 17:05:43,113:INFO:local passed - + ... ``` - running on GPU @@ -326,7 +326,7 @@ You can modify the training behaviour through the various flags in the `densenet ```bash bash scripts/run_distribute_train.sh [DEVICE_NUM] [RANK_TABLE_FILE] [NET_NAME] [DATASET_NAME] [TRAIN_DATA_DIR] - # example bash scripts/run_distribute_train.sh 8 /root/hccl_8p_01234567_10.155.170.71.json densenet121 imagenet /home/DataSet/ImageNet_Original/train/ + # example bash scripts/run_distribute_train.sh 8 ~/hccl_8p.json densenet121 imagenet /home/DataSet/ImageNet_Original/train/ ``` @@ -340,7 +340,8 @@ You can modify the training behaviour through the various flags in the `densenet 2020-08-22 17:09:05,686:INFO:epoch[3], iter[20015], loss:3.113, mean_fps:6304.37 imgs/sec 2020-08-22 17:12:28,925:INFO:epoch[4], iter[25019], loss:3.29, mean_fps:6303.07 imgs/sec 2020-08-22 17:15:52,167:INFO:epoch[5], iter[30023], loss:2.865, mean_fps:6302.98 imgs/sec - + ... + ... 
``` - running on GPU @@ -367,7 +368,7 @@ You can modify the training behaviour through the various flags in the `densenet python eval.py --net [NET_NAME] --dataset [DATASET_NAME] --eval_data_dir /PATH/TO/DATASET --ckpt_files /PATH/TO/CHECKPOINT > eval.log 2>&1 & OR bash scripts/run_distribute_eval.sh [DEVICE_NUM] [RANDK_TABLE_FILE] [NET_NAME] [DATASET_NAME] [EVAL_DATA_DIR][CKPT_PATH] - # example: bash script/run_distribute_eval.sh 8 /root/hccl_8p_01234567_10.155.170.71.json densenet121 imagenet /home/DataSet/ImageNet_Original/train/validation_preprocess/ /home/model/densenet/ckpt/0-120_500.ckpt + # example: bash script/run_distribute_eval.sh 8 ~/hccl_8p.json densenet121 imagenet /home/DataSet/ImageNet_Original/train/validation_preprocess/ /home/model/densenet/ckpt/0-120_500.ckpt ``` diff --git a/official/cv/densenet/README_CN.md b/official/cv/densenet/README_CN.md index 31191bd80..3301ea23f 100644 --- a/official/cv/densenet/README_CN.md +++ b/official/cv/densenet/README_CN.md @@ -107,13 +107,13 @@ DenseNet-100浣跨敤鐨勬暟鎹泦锛� Cifar-10 # 鍒嗗竷寮忚缁冪ず渚� bash scripts/run_distribute_train.sh [DEVICE_NUM] [RANK_TABLE_FILE] [NET_NAME] [DATASET_NAME] [TRAIN_DATA_DIR] - # example bash scripts/run_distribute_train.sh 8 /root/hccl_8p_01234567_10.155.170.71.json densenet121 imagenet /home/DataSet/ImageNet_Original/train/ + # example bash scripts/run_distribute_train.sh 8 ~/hccl_8p.json densenet121 imagenet /home/DataSet/ImageNet_Original/train/ # 鍗曞崱璇勪及绀轰緥 python eval.py --net [NET_NAME] --dataset [DATASET_NAME] --eval_data_dir /PATH/TO/DATASET --ckpt_files /PATH/TO/CHECKPOINT > eval.log 2>&1 & bash scripts/run_distribute_eval.sh [DEVICE_NUM] [RANDK_TABLE_FILE] [NET_NAME] [DATASET_NAME] [EVAL_DATA_DIR][CKPT_PATH] - # example: bash script/run_distribute_eval.sh 8 /root/hccl_8p_01234567_10.155.170.71.json densenet121 imagenet /home/DataSet/ImageNet_Original/train/validation_preprocess/ /home/model/densenet/ckpt/0-120_500.ckpt + # example: bash script/run_distribute_eval.sh 8 
~/hccl_8p.json densenet121 imagenet /home/DataSet/ImageNet_Original/train/validation_preprocess/ /home/model/densenet/ckpt/0-120_500.ckpt ``` 鍒嗗竷寮忚缁冮渶瑕佹彁鍓嶅垱寤篔SON鏍煎紡鐨凥CCL閰嶇疆鏂囦欢銆� @@ -310,7 +310,7 @@ python train.py --net=[NET_NAME] --dataset=[DATASET_NAME] --train_data_dir=[DATA ```shell bash scripts/run_distribute_train.sh [DEVICE_NUM] [RANK_TABLE_FILE] [NET_NAME] [DATASET_NAME] [TRAIN_DATA_DIR] -# example bash scripts/run_distribute_train.sh 8 /root/hccl_8p_01234567_10.155.170.71.json densenet121 imagenet /home/DataSet/ImageNet_Original/train/ +# example bash scripts/run_distribute_train.sh 8 ~/hccl_8p.json densenet121 imagenet /home/DataSet/ImageNet_Original/train/ ``` 涓婅堪shell鑴氭湰灏嗗湪鍚庡彴杩涜鍒嗗竷寮忚缁冦€傚彲浠ラ€氳繃鏂囦欢`train[X]/output/202x-xx-xx_time_xx_xx_xx/`鏌ョ湅缁撴灉鏃ュ織鍜屾ā鍨嬫鏌ョ偣銆傚湪ImageNet鏁版嵁闆嗕笂璁粌DenseNet-121鐨勬崯澶卞€肩殑瀹炵幇濡備笅锛� diff --git a/official/cv/dpn/README.md b/official/cv/dpn/README.md index a1099021c..71c98709a 100644 --- a/official/cv/dpn/README.md +++ b/official/cv/dpn/README.md @@ -228,7 +228,7 @@ Run `scripts/train_distributed.sh` to train the model distributed. The usage of ```text bash scripts/train_distributed.sh [rank_table] [train_data_dir] [ckpt_path_to_save] [rank_size] [eval_each_epoch] [pretrained_ckpt(optional)] -# example: bash scripts/train_distributed.sh /root/hccl_8p_01234567_10.155.170.71.json /home/DataSet/ImageNet_Original/train/ ./ckpt/ 8 0 +# example: bash scripts/train_distributed.sh ~/hccl_8p.json /home/DataSet/ImageNet_Original/train/ ./ckpt/ 8 0 ``` The above shell script will run distribute training in the background. 
You can view the results through the file `train_parallel[X]/log.txt` as follows: diff --git a/official/cv/east/README.md b/official/cv/east/README.md index 2354fbe1d..7d7c2afb6 100644 --- a/official/cv/east/README.md +++ b/official/cv/east/README.md @@ -81,7 +81,7 @@ Dataset used [ICDAR 2015](https://rrc.cvc.uab.es/?ch=4&com=downloads) ```bash # distribute training example(8p) bash run_distribute_train.sh [DATASET_PATH] [PRETRAINED_BACKBONE] [RANK_TABLE_FILE] -# example: bash run_distribute_train.sh /home/DataSet/ICDAR2015/ic15/ home/model/east/pretrained/0-150_5004.ckpt /root/hccl_8p_01234567_10.155.170.71.json +# example: bash run_distribute_train.sh /home/DataSet/ICDAR2015/ic15/ home/model/east/pretrained/0-150_5004.ckpt ~/hccl_8p.json # standalone training bash run_standalone_train_ascend.sh [DATASET_PATH] [PRETRAINED_BACKBONE] [DEVICE_ID] @@ -106,7 +106,7 @@ bash run_eval_ascend.sh [DATASET_PATH] [CKPT_PATH] [DEVICE_ID] Ascend: # distribute training example(8p) bash run_distribute_train.sh [DATASET_PATH] [PRETRAINED_BACKBONE] [RANK_TABLE_FILE] - # example: bash run_distribute_train.sh /home/DataSet/ICDAR2015/ic15/ home/model/east/pretrained/0-150_5004.ckpt /root/hccl_8p_01234567_10.155.170.71.json + # example: bash run_distribute_train.sh /home/DataSet/ICDAR2015/ic15/ home/model/east/pretrained/0-150_5004.ckpt ~/hccl_8p.json # standalone training bash run_standalone_train_ascend.sh [DATASET_PATH] [PRETRAINED_BACKBONE] [DEVICE_ID] diff --git a/official/cv/googlenet/README.md b/official/cv/googlenet/README.md index dd5f145a3..ca8ce7db2 100644 --- a/official/cv/googlenet/README.md +++ b/official/cv/googlenet/README.md @@ -105,7 +105,7 @@ After installing MindSpore via the official website, you can start training and # run distributed training example bash scripts/run_train.sh [RANK_TABLE_FILE] [DATASET_NAME] - # example: bash scripts/run_train.sh /root/hccl_8p_01234567_10.155.170.71.json cifar10 + # example: bash scripts/run_train.sh ~/hccl_8p.json cifar10 # run 
evaluation example python eval.py > eval.log 2>&1 & @@ -401,7 +401,7 @@ For more configuration details, please refer the script `config.py`. - running on Ascend ```bash - bash scripts/run_train.sh /root/hccl_8p_01234567_10.155.170.71.json cifar10 + bash scripts/run_train.sh ~/hccl_8p.json cifar10 ``` The above shell script will run distribute training in the background. You can view the results through the file `train_parallel[X]/log`. The loss value will be achieved as follows: diff --git a/official/cv/googlenet/README_CN.md b/official/cv/googlenet/README_CN.md index 5a295ffcf..5cd3c07be 100644 --- a/official/cv/googlenet/README_CN.md +++ b/official/cv/googlenet/README_CN.md @@ -107,7 +107,7 @@ GoogleNet鐢卞涓猧nception妯″潡涓茶仈璧锋潵锛屽彲浠ユ洿鍔犳繁鍏ャ€� 闄嶇淮鐨� # 杩愯鍒嗗竷寮忚缁冪ず渚� bash scripts/run_train.sh [RANK_TABLE_FILE] [DATASET_NAME] - # example: bash scripts/run_train.sh /root/hccl_8p_01234567_10.155.170.71.json cifar10 + # example: bash scripts/run_train.sh ~/hccl_8p.json cifar10 # 杩愯璇勪及绀轰緥 python eval.py > eval.log 2>&1 & @@ -371,7 +371,7 @@ GoogleNet鐢卞涓猧nception妯″潡涓茶仈璧锋潵锛屽彲浠ユ洿鍔犳繁鍏ャ€� 闄嶇淮鐨� - Ascend澶勭悊鍣ㄧ幆澧冭繍琛� ```bash - bash scripts/run_train.sh /root/hccl_8p_01234567_10.155.170.71.json cifar10 + bash scripts/run_train.sh ~/hccl_8p.json cifar10 ``` 涓婅堪shell鑴氭湰灏嗗湪鍚庡彴杩愯鍒嗗竷璁粌銆傛偍鍙互閫氳繃train_parallel[X]/log鏂囦欢鏌ョ湅缁撴灉銆傞噰鐢ㄤ互涓嬫柟寮忚揪鍒版崯澶卞€硷細 diff --git a/official/cv/inceptionv3/README.md b/official/cv/inceptionv3/README.md index 6f0e1a2bc..7629b561a 100644 --- a/official/cv/inceptionv3/README.md +++ b/official/cv/inceptionv3/README.md @@ -284,7 +284,7 @@ Take training cifar10 as an example, the ds_type parameter is set to cifar10 ```shell # distribute training(8p) bash run_distribute_train.sh [RANK_TABLE_FILE] [DATA_PATH] [CKPT_PATH] -# example: bash run_distribute_train.sh /root/hccl_8p_012345467_10.155.170.71.json /home/DataSet/cifar10/ ./ckpt/ +# example: bash run_distribute_train.sh ~/hccl_8p.json /home/DataSet/cifar10/ ./ckpt/ # standalone training bash scripts/run_standalone_train.sh 
[DEVICE_ID] [DATA_PATH] [CKPT_PATH] diff --git a/official/cv/inceptionv3/README_CN.md b/official/cv/inceptionv3/README_CN.md index d8423127d..3e92fcfa4 100644 --- a/official/cv/inceptionv3/README_CN.md +++ b/official/cv/inceptionv3/README_CN.md @@ -290,7 +290,7 @@ ds_type:cifar10 ```shell # 鍒嗗竷寮忚缁冪ず渚�(8鍗�) bash run_distribute_train.sh [RANK_TABLE_FILE] [DATA_PATH] [CKPT_PATH] -# example: bash run_distribute_train.sh /root/hccl_8p_012345467_10.155.170.71.json /home/DataSet/cifar10/ ./ckpt/ +# example: bash run_distribute_train.sh ~/hccl_8p.json /home/DataSet/cifar10/ ./ckpt/ # 鍗曟満璁粌 bash scripts/run_standalone_train.sh [DEVICE_ID] [DATA_PATH] [CKPT_PATH] @@ -312,7 +312,7 @@ bash scripts/run_standalone_train.sh [DEVICE_ID] [DATA_PATH] [CKPT_PATH] Ascend: # 鍒嗗竷寮忚缁冪ず渚�(8鍗�) bash run_distribute_train.sh [RANK_TABLE_FILE] [DATA_PATH] [CKPT_PATH] - # example: bash run_distribute_train.sh /root/hccl_8p_012345467_10.155.170.71.json /home/DataSet/cifar10/ ./ckpt/ + # example: bash run_distribute_train.sh ~/hccl_8p.json /home/DataSet/cifar10/ ./ckpt/ # 鍗曟満璁粌 bash scripts/run_standalone_train.sh [DEVICE_ID] [DATA_PATH] [CKPT_PATH] diff --git a/official/cv/inceptionv4/README.md b/official/cv/inceptionv4/README.md index f49028f97..1748aedb9 100644 --- a/official/cv/inceptionv4/README.md +++ b/official/cv/inceptionv4/README.md @@ -255,7 +255,7 @@ Take training cifar10 as an example, the ds_type parameter is set to cifar10 ```bash # distribute training example(8p) bash scripts/run_distribute_train_ascend.sh [RANK_TABLE_FILE] [DATA_DIR] -# example: bash scripts/run_distribute_train_ascend.sh /root/hccl_8p_01234567_10.155.170.71.json /home/DataSet/cifar10/ +# example: bash scripts/run_distribute_train_ascend.sh ~/hccl_8p.json /home/DataSet/cifar10/ # standalone training bash scripts/run_standalone_train_ascend.sh [DEVICE_ID] [DATA_DIR] @@ -289,7 +289,7 @@ bash scripts/run_standalone_train_cpu.sh DATA_PATH Ascend: # distribute training example(8p) bash 
scripts/run_distribute_train_ascend.sh [RANK_TABLE_FILE] [DATA_DIR] - # example: bash scripts/run_distribute_train_ascend.sh /root/hccl_8p_01234567_10.155.170.71.json /home/DataSet/cifar10/ + # example: bash scripts/run_distribute_train_ascend.sh ~/hccl_8p.json /home/DataSet/cifar10/ # standalone training bash scripts/run_standalone_train_ascend.sh [DEVICE_ID] [DATA_DIR] diff --git a/official/cv/maskrcnn_mobilenetv1/README.md b/official/cv/maskrcnn_mobilenetv1/README.md index 51813f4e7..97b5ecaa8 100644 --- a/official/cv/maskrcnn_mobilenetv1/README.md +++ b/official/cv/maskrcnn_mobilenetv1/README.md @@ -107,7 +107,7 @@ pip install mmcv=0.2.14 # distributed training bash run_distribute_train.sh [RANK_TABLE_FILE] [DATA_PATH] [PRETRAINED_CKPT(optional)] - # example: bash run_distribute_train.sh /root/hccl_8p_01234567_10.155.170.71.json /home/DataSet/cocodataset/ + # example: bash run_distribute_train.sh ~/hccl_8p.json /home/DataSet/cocodataset/ # standalone training bash run_standalone_train.sh [DATA_PATH] [PRETRAINED_CKPT(optional)] @@ -351,7 +351,7 @@ On Ascend: # distributed training Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATA_PATH] [PRETRAINED_CKPT(optional)] -# example: bash run_distribute_train.sh /root/hccl_8p_01234567_10.155.170.71.json /home/DataSet/cocodataset/ +# example: bash run_distribute_train.sh ~/hccl_8p.json /home/DataSet/cocodataset/ # standalone training Usage: bash run_standalone_train.sh [DATA_PATH] [PRETRAINED_CKPT(optional)] @@ -532,7 +532,7 @@ bash run_standalone_train_cpu.sh [PRETRAINED_MODEL](optional) ```bash bash run_distribute_train.sh [RANK_TABLE_FILE] [DATA_PATH] [PRETRAINED_MODEL(optional)] -# example: bash run_distribute_train.sh /root/hccl_8p_01234567_10.155.170.71.json /home/DataSet/cocodataset/ +# example: bash run_distribute_train.sh ~/hccl_8p.json /home/DataSet/cocodataset/ ``` > hccl.json which is specified by RANK_TABLE_FILE is needed when you are running a distribute task. 
You can generate it by using the [hccl_tools](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools). diff --git a/official/cv/mobilenetv1/README.md b/official/cv/mobilenetv1/README.md index 59a962199..068e4e50e 100644 --- a/official/cv/mobilenetv1/README.md +++ b/official/cv/mobilenetv1/README.md @@ -264,8 +264,8 @@ Please follow the instructions in the link [hccn_tools](https://gitee.com/mindsp shell: Ascend: bash run_distribute_train.sh [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) - # example: bash run_distribute_train.sh cifar10 /root/hccl_8p_01234567_10.155.170.71.json /home/DataSet/cifar10/cifar-10-batches-bin/ - # example: bash run_distribute_train.sh imagenet2012 /root/hccl_8p_01234567_10.155.170.71.json /home/DataSet/ImageNet_Original/ + # example: bash run_distribute_train.sh cifar10 ~/hccl_8p.json /home/DataSet/cifar10/cifar-10-batches-bin/ + # example: bash run_distribute_train.sh imagenet2012 ~/hccl_8p.json /home/DataSet/ImageNet_Original/ CPU: bash run_train_CPU.sh [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) GPU(single device): bash run_standalone_train_gpu.sh [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) diff --git a/official/cv/mobilenetv2/README.md b/official/cv/mobilenetv2/README.md index 4deadc116..4f21014a3 100644 --- a/official/cv/mobilenetv2/README.md +++ b/official/cv/mobilenetv2/README.md @@ -275,7 +275,7 @@ You can start training using python or shell scripts. 
The usage of shell scripts shell: Ascend: bash run_train.sh Ascend [CONFIG_PATH] [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE] [DATASET_PATH] - # example: bash run_train.sh Ascend default_config.yaml 8 0,1,2,3,4,5,6,7 /root/hccl_8p_01234567_10.155.170.71.json /home/DataSet/ImageNet_Original/ + # example: bash run_train.sh Ascend default_config.yaml 8 0,1,2,3,4,5,6,7 ~/hccl_8p.json /home/DataSet/ImageNet_Original/ GPU: bash run_train.sh GPU 8 0,1,2,3,4,5,6,7 [TRAIN_DATASET_PATH] CPU: bash run_train.sh CPU [TRAIN_DATASET_PATH] @@ -288,7 +288,7 @@ You can start training using python or shell scripts. The usage of shell scripts shell: Ascend: bash run_train.sh Ascend [CONFIG_PATH] [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE] [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER] [FILTER_HEAD] - # example: bash run_train.sh Ascend default_config.yaml 8 0,1,2,3,4,5,6,7 /root/hccl_8p_01234567_10.155.170.71.json /home/DataSet/ImageNet_Original/ /home/model/mobilenetv2/predtrain/mobilenet-200_625.ckpt none True + # example: bash run_train.sh Ascend default_config.yaml 8 0,1,2,3,4,5,6,7 ~/hccl_8p.json /home/DataSet/ImageNet_Original/ /home/model/mobilenetv2/predtrain/mobilenet-200_625.ckpt none True GPU: bash run_train.sh GPU 8 0,1,2,3,4,5,6,7 [TRAIN_DATASET_PATH] [CKPT_PATH] none True CPU: bash run_train.sh CPU [TRAIN_DATASET_PATH] [CKPT_PATH] none True @@ -301,7 +301,7 @@ You can start training using python or shell scripts. 
The usage of shell scripts shell: Ascend: bash run_train.sh Ascend [CONFIG_PATH] [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE] [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER] - # example: bash run_train.sh Ascend default_config.yaml 8 0,1,2,3,4,5,6,7 /root/hccl_8p_01234567_10.155.170.71.json /home/DataSet/ImageNet_Original/ /home/model/mobilenetv2/backbone/mobilenet-200_625.ckpt backbone + # example: bash run_train.sh Ascend default_config.yaml 8 0,1,2,3,4,5,6,7 ~/hccl_8p.json /home/DataSet/ImageNet_Original/ /home/model/mobilenetv2/backbone/mobilenet-200_625.ckpt backbone GPU: bash run_train.sh GPU 8 0,1,2,3,4,5,6,7 [TRAIN_DATASET_PATH] [CKPT_PATH] backbone CPU: bash run_train.sh CPU [TRAIN_DATASET_PATH] [CKPT_PATH] backbone diff --git a/official/cv/mobilenetv2/README_CN.md b/official/cv/mobilenetv2/README_CN.md index e9c4b1055..9085a97fd 100644 --- a/official/cv/mobilenetv2/README_CN.md +++ b/official/cv/mobilenetv2/README_CN.md @@ -276,7 +276,7 @@ MobileNetV2鎬讳綋缃戠粶鏋舵瀯濡備笅锛� shell: Ascend: bash run_train.sh Ascend [CONFIG_PATH] [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE] [DATASET_PATH] - # example: bash run_train.sh Ascend default_config.yaml 8 0,1,2,3,4,5,6,7 /root/hccl_8p_01234567_10.155.170.71.json /home/DataSet/ImageNet_Original/ + # example: bash run_train.sh Ascend default_config.yaml 8 0,1,2,3,4,5,6,7 ~/hccl_8p.json /home/DataSet/ImageNet_Original/ GPU: bash run_train.sh GPU 8 0,1,2,3,4,5,6,7 [TRAIN_DATASET_PATH] CPU: bash run_train.sh CPU [TRAIN_DATASET_PATH] @@ -289,7 +289,7 @@ MobileNetV2鎬讳綋缃戠粶鏋舵瀯濡備笅锛� shell: Ascend: bash run_train.sh Ascend [CONFIG_PATH] [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE] [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER] [FILTER_HEAD] - # example: bash run_train.sh Ascend default_config.yaml 8 0,1,2,3,4,5,6,7 /root/hccl_8p_01234567_10.155.170.71.json /home/DataSet/ImageNet_Original/ /home/model/mobilenetv2/predtrain/mobilenet-200_625.ckpt none True + # example: bash 
run_train.sh Ascend default_config.yaml 8 0,1,2,3,4,5,6,7 ~/hccl_8p.json /home/DataSet/ImageNet_Original/ /home/model/mobilenetv2/predtrain/mobilenet-200_625.ckpt none True GPU: bash run_train.sh GPU 8 0,1,2,3,4,5,6,7 [TRAIN_DATASET_PATH] [CKPT_PATH] none True CPU: bash run_train.sh CPU [TRAIN_DATASET_PATH] [CKPT_PATH] none True @@ -302,7 +302,7 @@ MobileNetV2鎬讳綋缃戠粶鏋舵瀯濡備笅锛� shell: Ascend: bash run_train.sh Ascend [CONFIG_PATH] [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE] [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER] - # example: bash run_train.sh Ascend default_config.yaml 8 0,1,2,3,4,5,6,7 /root/hccl_8p_01234567_10.155.170.71.json /home/DataSet/ImageNet_Original/ /home/model/mobilenetv2/backbone/mobilenet-200_625.ckpt backbone + # example: bash run_train.sh Ascend default_config.yaml 8 0,1,2,3,4,5,6,7 ~/hccl_8p.json /home/DataSet/ImageNet_Original/ /home/model/mobilenetv2/backbone/mobilenet-200_625.ckpt backbone GPU: bash run_train.sh GPU 8 0,1,2,3,4,5,6,7 [TRAIN_DATASET_PATH] [CKPT_PATH] backbone CPU: bash run_train.sh CPU [TRAIN_DATASET_PATH] [CKPT_PATH] backbone ``` diff --git a/official/cv/mobilenetv2_quant/README_CN.md b/official/cv/mobilenetv2_quant/README_CN.md index 45173675e..cbfa9eac4 100644 --- a/official/cv/mobilenetv2_quant/README_CN.md +++ b/official/cv/mobilenetv2_quant/README_CN.md @@ -164,17 +164,17 @@ MobileNetV2鎬讳綋缃戠粶鏋舵瀯濡備笅锛� Ascend: python train.py --device_target Ascend --dataset_path ~/imagenet/train/ GPU: python train.py --device_target GPU --dataset_path ~/imagenet/train/ shell锛� - Ascend: bash run_train.sh Ascend ~/hccl_4p_0123_x.x.x.x.json ~/imagenet/train/ ~/mobilenet.ckpt + Ascend: bash run_train.sh Ascend ~/hccl_8p.json ~/imagenet/train/ GPU: bash run_train.sh GPU 1,2 ~/imagenet/train/ ~/mobilenet.ckpt - # 璁粌绀轰緥-閲忓寲姝ラ暱鍙涔犵殑閲忓寲鎰熺煡璁粌 + # 璁粌绀轰緥-閲忓寲姝ラ暱鍙涔犵殑閲忓寲鎰熺煡璁粌 python锛� Ascend: python train.py --device_target Ascend --dataset_path ~/imagenet/train/ \ --pre_trained ~/mobilenet.ckpt --optim_option "LEARNED_SCALE" GPU: 
python train.py --device_target GPU --dataset_path ~/imagenet/train/ \ --pre_trained ~/mobilenet.ckpt --optim_option "LEARNED_SCALE" shell锛� - Ascend: bash run_lsq_train.sh Ascend ~/hccl_4p_0123_x.x.x.x.json ~/imagenet/train/ ~/mobilenet.ckpt + Ascend: bash run_lsq_train.sh Ascend ~/hccl_8p.json ~/imagenet/train/ ~/mobilenet.ckpt GPU: bash run_lsq_train.sh GPU 1,2 ~/imagenet/train/ ~/mobilenet.ckpt ``` @@ -183,7 +183,7 @@ MobileNetV2鎬讳綋缃戠粶鏋舵瀯濡備笅锛� 璁粌缁撴灉淇濆瓨鍦ㄧず渚嬭矾寰勪腑銆俙Ascend`澶勭悊鍣ㄨ缁冪殑妫€鏌ョ偣榛樿淇濆瓨鍦╜./train/device$i/checkpoint`锛岃缁冩棩蹇楅噸瀹氬悜鍒癭./train/device$i/train.log`銆俙GPU`澶勭悊鍣ㄨ缁冪殑妫€鏌ョ偣榛樿淇濆瓨鍦╜./train/checkpointckpt_$i`涓紝璁粌鏃ュ織閲嶅畾鍚戝埌`./train/train.log`涓€� `train.log`鍐呭濡備笅锛� -```text +```log epoch:[ 0/200], step:[ 624/ 625], loss:[5.258/5.258], time:[140412.236], lr:[0.100] epoch time:140522.500, per step time:224.836, avg loss:5.258 epoch:[ 1/200], step:[ 624/ 625], loss:[3.917/3.917], time:[138221.250], lr:[0.200] @@ -234,13 +234,13 @@ shell: 鎺ㄧ悊缁撴灉淇濆瓨鍦ㄧず渚嬭矾寰勶紝鍙互鍦╜./val/infer.log`涓壘鍒板涓嬬粨鏋滐細 -```text +```log result:{'acc':0.71976314102564111} ``` ## 妯″瀷瀵煎嚭 -```shell +```python python export.py --checkpoint_path [CKPT_PATH] --file_format [EXPORT_FORMAT] --device_target [PLATFORM] --optim_option [OptimizeOption] ``` @@ -253,7 +253,7 @@ python export.py --checkpoint_path [CKPT_PATH] --file_format [EXPORT_FORMAT] --d 鍦ㄦ帹鐞嗕箣鍓嶉渶瑕佸湪鏄囪吘910鐜涓婂畬鎴怉IR妯″瀷鐨勫鍑恒€� 骞朵娇鐢╡xport_bin_file.py瀵煎嚭ImageNet鏁版嵁闆嗙殑bin鏂囦欢鍜屽搴旂殑label鏂囦欢锛� -```shell +```log python export_bin_file.py --dataset_dir [EVAL_DATASET_PATH] --save_dir [SAVE_PATH] ``` @@ -266,7 +266,7 @@ bash run_infer_310.sh [AIR_PATH] [DATA_PATH] [LABEL_PATH] [DEVICE_ID] 鎮ㄥ彲浠ラ€氳繃acc.log鏂囦欢鏌ョ湅缁撴灉銆俀AT閲忓寲鎺ㄧ悊鍑嗙‘鎬у涓嬶細 -```bash +```log 'Accuracy':0.7221 ``` diff --git a/official/cv/mobilenetv2_quant/Readme.md b/official/cv/mobilenetv2_quant/Readme.md index b4fe57023..82719ea7f 100644 --- a/official/cv/mobilenetv2_quant/Readme.md +++ b/official/cv/mobilenetv2_quant/Readme.md @@ -160,7 +160,7 @@ For Learned Step Size Quantization: Ascend: python train.py 
--device_target Ascend --dataset_path ~/imagenet/train/ GPU: python train.py --device_target GPU --dataset_path ~/imagenet/train/ shell锛� - Ascend: bash run_train.sh Ascend ~/hccl_4p_0123_x.x.x.x.json ~/imagenet/train/ ~/mobilenet.ckpt + Ascend: bash run_train.sh Ascend ~/hccl_8p.json ~/imagenet/train/ GPU: bash run_train.sh GPU 1,2 ~/imagenet/train/ ~/mobilenet.ckpt # training example for Learned Step Size Quantization @@ -170,7 +170,7 @@ For Learned Step Size Quantization: GPU: python train.py --device_target GPU --dataset_path ~/imagenet/train/ \ --pre_trained ~/mobilenet.ckpt --optim_option "LEARNED_SCALE" shell锛� - Ascend: bash run_lsq_train.sh Ascend ~/hccl_4p_0123_x.x.x.x.json ~/imagenet/train/ ~/mobilenet.ckpt + Ascend: bash run_lsq_train.sh Ascend ~/hccl_8p.json ~/imagenet/train/ ~/mobilenet.ckpt GPU: bash run_lsq_train.sh GPU 1,2 ~/imagenet/train/ ~/mobilenet.ckpt ``` @@ -179,7 +179,7 @@ For Learned Step Size Quantization: Training result will be stored in the example path. Checkpoints trained by `Ascend` will be stored at `./train/device$i/checkpoint` by default, and training log will be redirected to `./train/device$i/train.log`. Checkpoints trained by `GPU` will be stored in `./train/checkpointckpt_$i` by default, and training log will be redirected to `./train/train.log`. `train.log` is as follows: -``` bash +``` log epoch: [ 0/200], step:[ 624/ 625], loss:[5.258/5.258], time:[140412.236], lr:[0.100] epoch time: 140522.500, per step time: 224.836, avg loss: 5.258 epoch: [ 1/200], step:[ 624/ 625], loss:[3.917/3.917], time:[138221.250], lr:[0.200] @@ -232,13 +232,13 @@ shell: Inference result will be stored in the example path, you can find result like the following in `./val/infer.log`. 
-``` bash +``` log result: {'acc': 0.71976314102564111} ``` ## [Model Export](#contents) -```shell +```python python export.py --checkpoint_path [CKPT_PATH] --file_format [EXPORT_FORMAT] --device_target [PLATFORM] --optim_option [OptimizeOption] ``` @@ -250,7 +250,7 @@ python export.py --checkpoint_path [CKPT_PATH] --file_format [EXPORT_FORMAT] --d You should export AIR model at Ascend 910 before running the command below. You can use export_bin_file.py to export ImageNet bin and label for 310 inference. -```shell +```python python export_bin_file.py --dataset_dir [EVAL_DATASET_PATH] --save_dir [SAVE_PATH] ``` @@ -263,7 +263,7 @@ bash run_infer_310.sh [AIR_PATH] [DATA_PATH] [LABEL_PATH] [DEVICE_ID] You can view the results through the file "acc.log". The accuracy of the test dataset will be as follows: -```bash +```log 'Accuracy':0.7221 ``` diff --git a/official/cv/openpose/README.md b/official/cv/openpose/README.md index 2bca04112..5671daea5 100644 --- a/official/cv/openpose/README.md +++ b/official/cv/openpose/README.md @@ -49,7 +49,7 @@ In the currently provided training script, the coco2017 data set is used as an e Run python gen_ignore_mask.py ````python - python gen_ignore_mask.py --train_ann ../dataset/annotations/person_keypoints_train2017.json --val_ann ../dataset/annotations/person_keypoints_val2017.json --train_dir ../dataset/train2017 --val_dir ../dataset/val2017 + python gen_ignore_mask.py --train_ann /home/DataSet/coco/annotations/person_keypoints_train2017.json --val_ann /home/DataSet/coco/annotations/person_keypoints_val2017.json --train_dir /home/DataSet/coco/train2017 --val_dir /home/DataSet/coco/val2017 ```` - The dataset folder is generated in the root directory and contains the following files: @@ -59,8 +59,8 @@ In the currently provided training script, the coco2017 data set is used as an e 鈹溾攢鈹€ annotations 鈹溾攢person_keypoints_train2017.json 鈹斺攢person_keypoints_val2017.json - 鈹溾攢ignore_mask_train2017 - 鈹溾攢ignore_mask_val2017 + 
鈹溾攢ignore_mask_train + 鈹溾攢ignore_mask_val 鈹溾攢train2017 鈹斺攢val2017 ``` @@ -90,15 +90,17 @@ After installing MindSpore via the official website, you can start training and ```python # run training example - python train.py --imgpath_train ./train2017 --jsonpath_train ./person_keypoints_train2017.json --maskpath_train ./ignore_mask_train2017 --vgg_path ./vgg19-0-97_5004.ckpt > train.log 2>&1 & + python train.py --imgpath_train /home/DataSet/coco/train2017 --jsonpath_train /home/DataSet/coco/annotations/person_keypoints_train2017.json --maskpath_train /home/DataSet/coco/ignore_mask_train --vgg_path /home/model/openpose/vgg19-0-97_5004.ckpt > train.log 2>&1 & # run distributed training example bash run_distribute_train.sh [RANK_TABLE_FILE] [IMGPATH_TRAIN] [JSONPATH_TRAIN] [MASKPATH_TRAIN] [VGG_PATH] + # example: bash run_distribute_train.sh ~/hccl_8p.json /home/DataSet/coco/train2017 /home/DataSet/coco/annotations/person_keypoints_train2017.json /home/DataSet/coco/ignore_mask_train /home/model/openpose/vgg19-0-97_5004.ckpt # run evaluation example - python eval.py --model_path path_to_eval_model.ckpt --imgpath_val ./dataset/val2017 --ann ./dataset/annotations/person_keypoints_val2017.json > eval.log 2>&1 & + python eval.py --model_path /home/model/openpose/ckpt/0-8_663.ckpt --imgpath_val /home/DataSet/coco/val2017 --ann /home/DataSet/coco/annotations/person_keypoints_val2017.json > eval.log 2>&1 & OR bash scripts/run_eval_ascend.sh [MODEL_PATH] [IMGPATH_VAL] [ANN] + # example: bash scripts/run_eval_ascend.sh /home/model/openpose/ckpt/0-8_663.ckpt /home/DataSet/coco/val2017 /home/DataSet/coco/annotations/person_keypoints_val2017.json ``` [RANK_TABLE_FILE] is the path of the multi-card information configuration table in the environment. The configuration table can be automatically generated by the tool [hccl_tool](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools). 
@@ -165,14 +167,14 @@ For more configuration details, please refer the script `default_config.yaml`. - running on Ascend ```python - python train.py --imgpath_train ./train2017 --jsonpath_train ./person_keypoints_train2017.json --maskpath_train ./ignore_mask_train2017 --vgg_path ./vgg19-0-97_5004.ckpt > train.log 2>&1 & + python train.py --imgpath_train /home/DataSet/coco/train2017 --jsonpath_train /home/DataSet/coco/annotations/person_keypoints_train2017.json --maskpath_train /home/DataSet/coco/ignore_mask_train --vgg_path /home/model/openpose/vgg19-0-97_5004.ckpt > train.log 2>&1 & ``` The python command above will run in the background, you can view the results through the file `train.log`. After training, you'll get some checkpoint files under the script folder by default. The loss value will be achieved as follows: - ```python + ```log # grep "epoch " train.log epoch[0], iter[23], mean loss is 0.292112287 epoch[0], iter[123], mean loss is 0.060355084 @@ -183,17 +185,17 @@ For more configuration details, please refer the script `default_config.yaml`. The model checkpoint will be saved in the directory of default_config.yaml: 'save_model_path'. - running on ModelArts -- If you want to train the model on modelarts, you can refer to the [official guidance document] of modelarts (https://support.huaweicloud.com/modelarts/) +- If you want to train the model on modelarts, you can refer to the official guidance document of [modelarts](https://support.huaweicloud.com/modelarts/) -```python +```ModelArts # Example of using distributed training dpn on modelarts : # Data set storage method # ├── openpose_dataset # ├── annotations # ├─person_keypoints_train2017.json # └─person_keypoints_val2017.json -# ├─ignore_mask_train2017 -# ├─ignore_mask_val2017 +# ├─ignore_mask_train +# ├─ignore_mask_val # ├─train2017 # └─val2017 # └─checkpoint @@ -248,14 +250,16 @@ For more configuration details, please refer the script `default_config.yaml`.
Before running the command below, please check the checkpoint path used for evaluation. Please set the checkpoint path to be the absolute full path, e.g., "username/openpose/outputs/\*time*\/0-6_30000.ckpt". ```python - python eval.py --model_path path_to_eval_model.ckpt --imgpath_val ./dataset/val2017 --ann ./dataset/annotations/person_keypoints_val2017.json > eval.log 2>&1 & + # run evaluation example + python eval.py --model_path /home/model/openpose/ckpt/0-8_663.ckpt --imgpath_val /home/DataSet/coco/val2017 --ann /home/DataSet/coco/annotations/person_keypoints_val2017.json > eval.log 2>&1 & OR bash scripts/run_eval_ascend.sh [MODEL_PATH] [IMGPATH_VAL] [ANN] + # example: bash scripts/run_eval_ascend.sh /home/model/openpose/ckpt/0-8_663.ckpt /home/DataSet/coco/val2017 /home/DataSet/coco/annotations/person_keypoints_val2017.json ``` The above python command will run in the background. You can view the results through the file "eval.log". The accuracy of the test dataset will be as follows: - ```python + ```log # grep "AP" eval.log {'AP': 0.40250956300341397, 'Ap .5': 0.6658941566481336, 'AP .75': 0.396047897339743, 'AP (M)': 0.3075356543635785, 'AP (L)': 0.533772768618845, 'AR': 0.4519836272040302, 'AR .5': 0.693639798488665, 'AR .75': 0.4570214105793451, 'AR (M)': 0.32155148866429945, 'AR (L)': 0.6330360460795242} diff --git a/official/cv/retinanet/README_CN.md b/official/cv/retinanet/README_CN.md index cf76f65f9..72076957c 100644 --- a/official/cv/retinanet/README_CN.md +++ b/official/cv/retinanet/README_CN.md @@ -224,7 +224,7 @@ python create_data.py --create_dataset coco --prefix retinanet_eval.mindrecord - Ascend: # 鍏崱骞惰璁粌绀轰緥(鍦╮etinanet鐩綍涓嬭繍琛�)锛� bash scripts/run_distribute_train.sh [DEVICE_NUM] [RANK_TABLE_FILE] [MINDRECORD_DIR] [PRE_TRAINED(optional)] [PRE_TRAINED_EPOCH_SIZE(optional)] -# example: bash scripts/run_distribute_train.sh 8 /root/hccl_8p_01234567_10.155.170.71.json /home/DataSet/MindRecord_COCO/ +# example: bash scripts/run_distribute_train.sh 8 
~/hccl_8p.json /home/DataSet/MindRecord_COCO/ # 鍗曞崱璁粌绀轰緥(鍦╮etinanet鐩綍涓嬭繍琛�)锛� bash scripts/run_single_train.sh [DEVICE_ID] [MINDRECORD_DIR] diff --git a/official/cv/shufflenetv1/README_CN.md b/official/cv/shufflenetv1/README_CN.md index 40b49f099..baca46001 100644 --- a/official/cv/shufflenetv1/README_CN.md +++ b/official/cv/shufflenetv1/README_CN.md @@ -125,11 +125,14 @@ ShuffleNetV1鐨勬牳蹇冮儴鍒嗚鍒嗘垚涓変釜闃舵锛屾瘡涓樁娈甸噸澶嶅爢绉簡 python: Ascend鍗曞崱璁粌绀轰緥锛歱ython train.py --train_dataset_path [DATA_DIR] + # example: python train.py --train_dataset_path /home/DataSet/ImageNet_Original/train shell: -<<<<<<< HEAD - Ascend鍏崱骞惰璁粌: sh scripts/run_distribute_train.sh [RANK_TABLE_FILE] [DATA_DIR] - Ascend鍗曞崱璁粌绀轰緥: sh scripts/run_standalone_train.sh [DEVICE_ID] [DATA_DIR] + Ascend鍏崱骞惰璁粌: bash scripts/run_distribute_train.sh [RANK_TABLE_FILE] [DATA_DIR] + # example: bash scripts/run_distribute_train.sh ~/hccl_8p.json /home/DataSet/ImageNet_Original/train + + Ascend鍗曞崱璁粌绀轰緥: bash scripts/run_standalone_train.sh [DEVICE_ID] [DATA_DIR] + # example: bash scripts/run_standalone_train.sh 0 /home/DataSet/ImageNet_Original/train - running on GPU with gpu default parameters @@ -146,10 +149,6 @@ ShuffleNetV1鐨勬牳蹇冮儴鍒嗚鍒嗘垚涓変釜闃舵锛屾瘡涓樁娈甸噸澶嶅爢绉簡 shell: GPU鍗曞崱璁粌绀轰緥: sh scripts/run_standalone_train_gpu.sh [DEVICE_ID] [DATA_DIR] GPU鍏崱骞惰璁粌: sh scripts/run_distribute_train_gpu.sh [RANK_SIZE] [TRAIN_DATA_DIR] -======= - Ascend鍏崱骞惰璁粌: bash scripts/run_distribute_train.sh [RANK_TABLE_FILE] [DATA_DIR] - Ascend鍗曞崱璁粌绀轰緥: bash scripts/run_standalone_train.sh [DEVICE_ID] [DATA_DIR] ->>>>>>> fe806b7430... 
update bash ``` 鍒嗗竷寮忚缁冮渶瑕佹彁鍓嶅垱寤篔SON鏍煎紡鐨凥CCL閰嶇疆鏂囦欢銆� @@ -162,7 +161,7 @@ ShuffleNetV1鐨勬牳蹇冮儴鍒嗚鍒嗘垚涓変釜闃舵锛屾瘡涓樁娈甸噸澶嶅爢绉簡 ckpt鏂囦欢灏嗗瓨鍌ㄥ湪 `save_ckpt_path` 璺緞涓嬶紝璁粌鏃ュ織灏嗚璁板綍鍒� `log.txt` 涓€傝缁冩棩蹇楅儴鍒嗙ず渚嬪涓嬶細 -```shell +```log epoch time: 99854.980, per step time: 79.820, avg loss: 4.093 epoch time: 99863.734, per step time: 79.827, avg loss: 4.010 epoch time: 99859.792, per step time: 79.824, avg loss: 3.869 @@ -180,9 +179,11 @@ epoch time: 99864.092, per step time: 79.827, avg loss: 3.442 # Ascend璇勪及绀轰緥 python: python eval.py --eval_dataset_path [DATA_DIR] --ckpt_path [PATH_CHECKPOINT] + # example: python eval.py --eval_dataset_path /home/DataSet/ImageNet_Original/validation_preprocess --ckpt_path /home/model/shufflenetv1/ckpt/shufflenetv1-250_1251 shell: bash scripts/run_eval.sh [DEVICE_ID] [DATA_DIR] [PATH_CHECKPOINT] + # example: bash scripts/run_eval.sh 0 /home/DataSet/ImageNet_Original/validation_preprocess /home/model/shufflenetv1/ckpt/shufflenetv1-250_1251 # GPU璇勪及绀轰緥 python: @@ -196,7 +197,7 @@ epoch time: 99864.092, per step time: 79.827, avg loss: 3.442 鍙互鍦� `eval_log.txt` 鏌ョ湅璇勪及缁撴灉銆� -```shell +```log result:{'Loss': 2.0479587888106323, 'Top_1_Acc': 0.7385817307692307, 'Top_5_Acc': 0.9135817307692308}, ckpt:'/home/shufflenetv1/train_parallel0/checkpoint/shufflenetv1-250_1251.ckpt', time: 98560.63866615295 ``` diff --git a/official/cv/simclr/README.md b/official/cv/simclr/README.md index 56553eb82..d14c5dda6 100644 --- a/official/cv/simclr/README.md +++ b/official/cv/simclr/README.md @@ -39,7 +39,7 @@ Dataset used: [CIFAR-10](<http://www.cs.toronto.edu/~kriz/cifar.html>) - Note锛欴ata will be processed in dataset.py - Download the dataset, the directory structure is as follows: -```bash +```cifar10 鈹溾攢cifar-10-batches-bin 鈹� 鈹斺攢cifar-10-verify-bin @@ -62,10 +62,14 @@ After installing MindSpore via the official website, you can start training and ```python # enter script dir, train SimCLR bash run_standalone_train_ascend.sh [cifar10] [TRAIN_DATASET_PATH] [DEVICE_ID] +# 
example: bash run_standalone_train_ascend.sh cifar10 /home/DataSet/cifar10/cifar-10-batches-bin/ 0 or bash run_distribution_ascend.sh [DEVICENUM] [RANK_TABLE_FILE] [cifar10] [TRAIN_DATASET_PATH] +# example: bash run_distribution_ascend.sh 8 ~/hccl_8p.json cifar10 /home/DataSet/cifar10/cifar-10-batches-bin/ + # enter script dir, evaluate SimCLR bash run_standalone_eval_ascend.sh [cifar10] [DEVICE_ID] [SIMCLR_MODEL_PATH] [TRAIN_DATASET_PATH] [EVAL_DATASET_PATH] +# example: bash run_standalone_eval_ascend.sh cifar10 0 /home/model/simclr/ckpt/checkpoint-simclr-100_390.ckpt /home/DataSet/cifar10/cifar-10-batches-bin/ /home/DataSet/cifar10/cifar-10-verify-bin/ ``` ## [Script Description](#contents) @@ -152,11 +156,12 @@ Major parameters in linear_eval.py as follows: ```bash bash run_distribution_ascend.sh [DEVICENUM] [RANK_TABLE_FILE] [cifar10] [TRAIN_DATASET_PATH] + # example: bash run_distribution_ascend.sh 8 ~/hccl_8p.json cifar10 /home/DataSet/cifar10/cifar-10-batches-bin/ ``` After training, the loss value will be achieved as follows: - ```bash + ```log # grep "loss is " log epoch: 1 step: 48, loss is 9.5758915 epoch time: 253236.075 ms, per step time: 5275.752 ms @@ -186,11 +191,12 @@ Before running the command below, please check the checkpoint path used for eval ```bash bash run_standalone_eval_ascend.sh [cifar10] [DEVICE_ID] [SIMCLR_MODEL_PATH] [TRAIN_DATASET_PATH] [EVAL_DATASET_PATH] + # example: bash run_standalone_eval_ascend.sh cifar10 0 /home/model/simclr/ckpt/checkpoint-simclr-100_390.ckpt /home/DataSet/cifar10/cifar-10-batches-bin/ /home/DataSet/cifar10/cifar-10-verify-bin/ ``` You can view the results through the file "eval_log".
The accuracy of the test dataset will be as follows: - ```bash + ```log # grep "Average accuracy: " eval_log 'Accuracy': 0.84505 ``` diff --git a/official/cv/simple_pose/README.md b/official/cv/simple_pose/README.md index e0c182c40..84db42fb4 100644 --- a/official/cv/simple_pose/README.md +++ b/official/cv/simple_pose/README.md @@ -98,6 +98,22 @@ Before you start your training process, you need to obtain mindspore imagenet pr 鈹斺攢resnet50.ckpt ``` +```dataset + 鈹斺攢 cocodataset + 鈹斺攢train2017 + 鈹斺攢val2017 + 鈹斺攢annotations +``` + +```default_config.yaml +DATASET: + ROOT:/home/DataSet/cocodataset +MODEL: + PRETRAINED:./resnet50.ckpt + +# Modify according to local path +``` + ## [Running](#contents) - running on local @@ -106,20 +122,23 @@ Before you start your training process, you need to obtain mindspore imagenet pr ```shell bash scripts/train_standalone.sh [CKPT_SAVE_DIR] [DEVICE_ID] [BATCH_SIZE] + # example: bash scripts/train_standalone.sh ./ 0 128 ``` To validate the model, change the settings in `default_config.yaml` to the path of the model you want to validate or setting that on the terminal. For example: - ```python + ```default_config.yaml TEST: ... 
- MODEL_FILE : './{path}/xxxx.ckpt' + MODEL_FILE : /home/model/simple_pose/ckpt/simplepose-140_1170.ckpt + # Modify according to local path ``` Then, run the shell script `scripts/eval.sh` with the format below: ```shell bash scripts/eval.sh [TEST_MODEL_FILE] [COCO_BBOX_FILE] [DEVICE_ID] + # example: bash scripts/eval.sh /home/model/simple_pose/ckpt/simplepose-140_1170.ckpt ./experiments/COCO_val2017_detections_AP_H_56_person.json 0 ``` - running on ModelArts @@ -211,7 +230,7 @@ Configurations for both training and evaluation are set in `default_config.yaml` - config for SimplePoseNet on COCO2017 dataset: -```python +```default_config.yaml # These parameters can be modified at the terminal ckpt_save_dir: 'checkpoints' # the folder to save the '*.ckpt' file batch_size: 128 # TRAIN.BATCH_SIZE @@ -280,17 +299,12 @@ Run `scripts/train_standalone.sh` to train the model standalone. The usage of th ```shell bash scripts/train_standalone.sh [CKPT_SAVE_DIR] [DEVICE_ID] [BATCH_SIZE] -``` - -For example, you can run the shell command below to launch the training procedure. - -```shell -bash scripts/train_standalone.sh results/standalone/ 0 128 +# example: bash scripts/train_standalone.sh ./ 0 128 ``` The script will run training in the background, you can view the results through the file `train_log[X].txt` as follows: -```text +```log loading parse... batch size :128 loading dataset from /data/coco2017/train2017 @@ -314,17 +328,12 @@ Run `scripts/train_distributed.sh` to train the model distributed. The usage of ```shell bash scripts/train_distributed.sh [MINDSPORE_HCCL_CONFIG_PATH] [CKPT_SAVE_DIR] [RANK_SIZE] -``` - -For example, you can run the shell command below to launch the distributed training procedure. 
- -```shell -bash scripts/train_distributed.sh /home/rank_table.json results/distributed/ 4 +# example: bash scripts/train_distributed.sh ~/hccl_8p.json ./checkpoint 8 ``` The above shell script will run distribute training in the background. You can view the results through the file `train_parallel[X]/log.txt` as follows: -```text +```log loading parse... batch size :64 loading dataset from /data/coco2017/train2017 @@ -348,17 +357,12 @@ run `scripts/eval.sh` to evaluate the model with one Ascend processor. The usage ```shell bash scripts/eval.sh [TEST_MODEL_FILE] [COCO_BBOX_FILE] [DEVICE_ID] -``` - -For example, you can run the shell command below to launch the validation procedure. - -```shell -bash scripts/eval.sh results/distributed/sim-140_1170.ckpt +# example: bash scripts/eval.sh /home/model/simple_pose/ckpt/simplepose-140_1170.ckpt ./experiments/COCO_val2017_detections_AP_H_56_person.json 0 ``` The above shell command will run validation procedure in the background. You can view the results through the file `eval_log[X].txt`. The result will be achieved as follows: -```text +```log use flip test: True loading model ckpt from results/distributed/sim-140_1170.ckpt loading dataset from /data/coco2017/val2017 @@ -419,7 +423,7 @@ bash run_infer_310.sh [MINDIR_PATH] [NEED_PREPROCESS] [DEVICE_ID] Inference result is saved in current path, you can find result like this in acc.log file.
-```bash +```log AP: 0.7036180026660003 ``` diff --git a/official/cv/squeezenet/README.md b/official/cv/squeezenet/README.md index 7e9ec5313..894fc71ba 100644 --- a/official/cv/squeezenet/README.md +++ b/official/cv/squeezenet/README.md @@ -82,14 +82,21 @@ After installing MindSpore via the official website, you can start training and - running on Ascend ```bash + # run squeezenet_residual as example # distributed training Usage: bash scripts/run_distribute_train.sh [squeezenet|squeezenet_residual] [cifar10|imagenet] [RANK_TABLE_FILE] [DATA_PATH] [PRETRAINED_CKPT_PATH](optional) + # example: bash scripts/run_distribute_train.sh squeezenet_residual imagenet ~/hccl_8p.json /home/DataSet/ImageNet_Original/train + # example: bash scripts/run_distribute_train.sh squeezenet_residual cifar10 ~/hccl_8p.json /home/DataSet/cifar10/cifar-10-batches-bin # standalone training Usage: bash scripts/run_standalone_train.sh [squeezenet|squeezenet_residual] [cifar10|imagenet] [DEVICE_ID] [DATA_PATH] [PRETRAINED_CKPT_PATH](optional) + # example: bash scripts/run_standalone_train.sh squeezenet_residual imagenet 0 /home/DataSet/ImageNet_Original/train + # example: bash scripts/run_standalone_train.sh squeezenet_residual cifar10 0 /home/DataSet/cifar10/cifar-10-batches-bin # run evaluation example Usage: bash scripts/run_eval.sh [squeezenet|squeezenet_residual] [cifar10|imagenet] [DEVICE_ID] [DATA_PATH] [CHECKPOINT_PATH] + # example: bash scripts/run_eval.sh squeezenet_residual cifar10 0 /home/DataSet/cifar10/cifar-10-verify-bin /home/model/squeezenet/ckpt/squeezenet_residual_cifar10-120_1562.ckpt + # example: bash scripts/run_eval.sh squeezenet_residual imagenet 0 /home/DataSet/ImageNet_Original/validation_preprocess /home/model/squeezenet/ckpt/squeezenet_residual_imagenet-300_5004.ckpt ``` - running on GPU @@ -117,7 +124,7 @@ After installing MindSpore via the official website, you can start training and If you want to run in modelarts, please check the official documentation of
[modelarts](https://support.huaweicloud.com/modelarts/), and you can start training and evaluation as follows: -```python +```ModelArts # run distributed training on modelarts example # (1) First, Perform a or b. # a. Set "enable_modelarts=True" on yaml file. @@ -293,9 +300,13 @@ For more configuration details, please refer the file `*.yaml`. ```shell # distributed training Usage: bash scripts/run_distribute_train.sh [squeezenet|squeezenet_residual] [cifar10|imagenet] [RANK_TABLE_FILE] [DATA_PATH] [PRETRAINED_CKPT_PATH](optional) + # example: bash scripts/run_distribute_train.sh squeezenet_residual imagenet ~/hccl_8p.json /home/DataSet/ImageNet_Original/train + # example: bash scripts/run_distribute_train.sh squeezenet_residual cifar10 ~/hccl_8p.json /home/DataSet/cifar10/cifar-10-batches-bin # standalone training Usage: bash scripts/run_standalone_train.sh [squeezenet|squeezenet_residual] [cifar10|imagenet] [DEVICE_ID] [DATA_PATH] [PRETRAINED_CKPT_PATH](optional) + # example: bash scripts/run_standalone_train.sh squeezenet_residual imagenet 0 /home/DataSet/ImageNet_Original/train + # example: bash scripts/run_standalone_train.sh squeezenet_residual cifar10 0 /home/DataSet/cifar10/cifar-10-batches-bin ``` For distributed training, a hccl configuration file with JSON format needs to be created in advance. 
@@ -308,7 +319,7 @@ Training result will be stored in the example path, whose folder name begins wit - Training SqueezeNet with CIFAR-10 dataset -```shell +```log # standalone training result epoch: 1 step 1562, loss is 1.7103254795074463 epoch: 2 step 1562, loss is 2.06101131439209 @@ -320,7 +331,7 @@ epoch: 5 step 1562, loss is 1.2140142917633057 - Training SqueezeNet with ImageNet dataset -```shell +```log # distribute training result(8 pcs) epoch: 1 step 5004, loss is 5.716324329376221 epoch: 2 step 5004, loss is 5.350603103637695 @@ -332,7 +343,7 @@ epoch: 5 step 5004, loss is 4.136358261108398 - Training SqueezeNet_Residual with CIFAR-10 dataset -```shell +```log # standalone training result epoch: 1 step 1562, loss is 2.298271656036377 epoch: 2 step 1562, loss is 2.2728664875030518 @@ -344,7 +355,7 @@ epoch: 5 step 1562, loss is 1.3370063304901123 - Training SqueezeNet_Residual with ImageNet dataset -```shell +```log # distribute training result(8 pcs) epoch: 1 step 5004, loss is 6.802495002746582 epoch: 2 step 5004, loss is 6.386072158813477 @@ -363,11 +374,8 @@ epoch: 5 step 5004, loss is 4.888848304748535 ```shell # evaluation Usage: bash scripts/run_eval.sh [squeezenet|squeezenet_residual] [cifar10|imagenet] [DEVICE_ID] [DATA_PATH] [CHECKPOINT_PATH] -``` - -```shell -# evaluation example -bash scripts/run_eval.sh squeezenet cifar10 0 ~/cifar-10-verify-bin train/squeezenet_cifar10-120_1562.ckpt +# example: bash scripts/run_eval.sh squeezenet_residual cifar10 0 /home/DataSet/cifar10/cifar-10-verify-bin /home/model/squeezenet/ckpt/squeezenet_residual_cifar10-120_1562.ckpt +# example: bash scripts/run_eval.sh squeezenet_residual imagenet 0 /home/DataSet/ImageNet_Original/validation_preprocess /home/model/squeezenet/ckpt/squeezenet_residual_imagenet-300_5004.ckpt ``` checkpoint can be produced in training process.
@@ -378,25 +386,25 @@ Evaluation result will be stored in the example path, whose folder name is "eval - Evaluating SqueezeNet with CIFAR-10 dataset -```shell +```log result: {'top_1_accuracy': 0.8896233974358975, 'top_5_accuracy': 0.9965945512820513} ``` - Evaluating SqueezeNet with ImageNet dataset -```shell +```log result: {'top_1_accuracy': 0.5851472471190781, 'top_5_accuracy': 0.8105393725992317} ``` - Evaluating SqueezeNet_Residual with CIFAR-10 dataset -```shell +```log result: {'top_1_accuracy': 0.9077524038461539, 'top_5_accuracy': 0.9969951923076923} ``` - Evaluating SqueezeNet_Residual with ImageNet dataset -```shell +```log result: {'top_1_accuracy': 0.6094950384122919, 'top_5_accuracy': 0.826324423815621} ``` @@ -406,7 +414,7 @@ result: {'top_1_accuracy': 0.6094950384122919, 'top_5_accuracy': 0.8263244238156 Export MindIR on local -```shell +```bash python export.py --checkpoint_file_path [CKPT_PATH] --batch_size [BATCH_SIZE] --net_name [NET] --dataset [DATASET] --file_format [EXPORT_FORMAT] --config_path [CONFIG_PATH] ``` @@ -418,7 +426,7 @@ The checkpoint_file_path parameter is required, Export on ModelArts (If you want to run in modelarts, please check the official documentation of [modelarts](https://support.huaweicloud.com/modelarts/), and you can start as follows) -```python +```ModelArts # Export on ModelArts # (1) Perform a or b. # a. Set "enable_modelarts=True" on default_config.yaml file. @@ -459,25 +467,25 @@ Inference result is saved in current path, you can find result like this in acc.
- Infer SqueezeNet with CIFAR-10 dataset -```bash +```log 'Top1_Accuracy': 83.62% 'Top5_Accuracy': 99.31% ``` - Infer SqueezeNet with ImageNet dataset -```bash +```log 'Top1_Accuracy': 59.30% 'Top5_Accuracy': 81.40% ``` - Infer SqueezeNet_Residual with CIFAR-10 dataset -```bash +```log 'Top1_Accuracy': 87.28% 'Top5_Accuracy': 99.58% ``` - Infer SqueezeNet_Residual with ImageNet dataset -```bash +```log 'Top1_Accuracy': 60.82% 'Top5_Accuracy': 82.56% ``` -- GitLab