From 5daf09a79740dd38394827d5b30d86ac7c10296b Mon Sep 17 00:00:00 2001 From: anzhengqi <anzhengqi1@huawei.com> Date: Wed, 7 Sep 2022 10:31:20 +0800 Subject: [PATCH] modify some network readme and scripts --- official/cv/octsqueeze/README.md | 6 ++--- official/cv/octsqueeze/README_CN.md | 6 ++--- .../scripts/run_train_distribute.sh | 5 ++-- .../nlp/tinybert/src/tinybert_for_gd_td.py | 7 +++++ official/recommend/ncf/README.md | 16 +++-------- official/recommend/ncf/scripts/run_eval.sh | 27 ------------------- .../recommend/ncf/scripts/run_eval_ascend.sh | 4 ++- .../recommend/ncf/scripts/run_eval_gpu.sh | 4 ++- official/recommend/ncf/scripts/run_train.sh | 27 ------------------- .../recommend/ncf/scripts/run_train_ascend.sh | 3 ++- research/cv/retinaface/README_CN.md | 1 + 11 files changed, 29 insertions(+), 77 deletions(-) delete mode 100644 official/recommend/ncf/scripts/run_eval.sh delete mode 100644 official/recommend/ncf/scripts/run_train.sh diff --git a/official/cv/octsqueeze/README.md b/official/cv/octsqueeze/README.md index 4b977f3e4..5bece2444 100644 --- a/official/cv/octsqueeze/README.md +++ b/official/cv/octsqueeze/README.md @@ -84,8 +84,8 @@ bash run_train_standalone.sh [TRAINING_DATASET_PATH] [DEVICE] [CHECKPOINT_SAVE_P # example: bash run_train_standalone.sh /home/ma-user/work/training_dataset/ Ascend ./ckpt/ 0 # or tain OctSqueeze parallelly (8P) -bash run_train_distribute.sh [TRAINING_DATASET_PATH] [CHECKPOINT_SAVE_PATH] -# example: bash run_train_distribute.sh /home/ma-user/work/training_dataset/ ./ckpt/ +bash run_train_distribute.sh [TRAINING_DATASET_PATH] [CHECKPOINT_SAVE_PATH] [RANK_TABLE_FILE] +# example: bash run_train_distribute.sh /home/ma-user/work/training_dataset/ ./ckpt/ /path/hccl_8p.json # evaluate OctSqueeze bash run_eval.sh [TEST_DATASET_PATH] [COMPRESSED_DATA_PATH] [RECONSTRUCTED_DATA_PATH] [MODE] [DEVICE] @@ -190,7 +190,7 @@ Major parameters in eval.py: # or enter script dir, run 1P training script bash bash run_train_standalone.sh 
/home/ma-user/work/training_dataset/ Ascend ./ckpt/ 0 # or enter script dir, run 8P training script - bash run_train_distribute.sh /home/ma-user/work/training_dataset/ ./ckpt/ + bash run_train_distribute.sh /home/ma-user/work/training_dataset/ ./ckpt/ /path/hccl_8p.json ``` After training, the loss value will be achieved as follows: diff --git a/official/cv/octsqueeze/README_CN.md b/official/cv/octsqueeze/README_CN.md index 6aaf67434..bee44b10b 100644 --- a/official/cv/octsqueeze/README_CN.md +++ b/official/cv/octsqueeze/README_CN.md @@ -87,8 +87,8 @@ bash run_train_standalone.sh [TRAINING_DATASET_PATH] [DEVICE] [CHECKPOINT_SAVE_P # example: bash run_train_standalone.sh /home/ma-user/work/training_dataset/ Ascend ./ckpt/ 0 # 或分布式训练OctSqueeze (8P) -bash run_train_distribute.sh [TRAINING_DATASET_PATH] [CHECKPOINT_SAVE_PATH] -# example: bash run_train_distribute.sh /home/ma-user/work/training_dataset/ ./ckpt/ +bash run_train_distribute.sh [TRAINING_DATASET_PATH] [CHECKPOINT_SAVE_PATH] [RANK_TABLE_FILE] +# example: bash run_train_distribute.sh /home/ma-user/work/training_dataset/ ./ckpt/ /path/hccl_8p.json # 评估OctSqueeze bash run_eval.sh [TEST_DATASET_PATH] [COMPRESSED_DATA_PATH] [RECONSTRUCTED_DATA_PATH] [MODE] [DEVICE] @@ -194,7 +194,7 @@ eval.py中的主要参数如下： # 或进入脚本目录，执行1P脚本 bash bash run_train_standalone.sh /home/ma-user/work/training_dataset/ Ascend ./ckpt/ 0 # 或进入脚本目录，执行8P脚本 - bash run_train_distribute.sh /home/ma-user/work/training_dataset/ ./ckpt/ + bash run_train_distribute.sh /home/ma-user/work/training_dataset/ ./ckpt/ /path/hccl_8p.json ``` 经过训练后，损失值如下： diff --git a/official/cv/octsqueeze/scripts/run_train_distribute.sh b/official/cv/octsqueeze/scripts/run_train_distribute.sh index 036f6d683..764d206fb 100644 --- a/official/cv/octsqueeze/scripts/run_train_distribute.sh +++ b/official/cv/octsqueeze/scripts/run_train_distribute.sh @@ -14,9 +14,9 @@ # limitations under the License. 
# ============================================================================ -if [ $# != 2 ] +if [ $# != 3 ] then - echo "Usage: bash run_train_distribute.sh [TRAINING_DATASET_PATH] [CHECKPOINT_SAVE_PATH]" + echo "Usage: bash run_train_distribute.sh [TRAINING_DATASET_PATH] [CHECKPOINT_SAVE_PATH] [RANK_TABLE_FILE]" exit 1 fi @@ -38,6 +38,7 @@ fi data_path=$1 checkpoint=$2 +export RANK_TABLE_FILE=$3 export RANK_SIZE=8 current_exec_path=$(pwd) diff --git a/official/nlp/tinybert/src/tinybert_for_gd_td.py b/official/nlp/tinybert/src/tinybert_for_gd_td.py index b1a70e92b..076fd919f 100644 --- a/official/nlp/tinybert/src/tinybert_for_gd_td.py +++ b/official/nlp/tinybert/src/tinybert_for_gd_td.py @@ -212,6 +212,9 @@ class BertTrainWithLossScaleCell(nn.TrainOneStepWithLossScaleCell): """ def __init__(self, network, optimizer, scale_update_cell=None): super(BertTrainWithLossScaleCell, self).__init__(network, optimizer, scale_update_cell) + self.degree = 1 + if self.reducer_flag: + self.degree = get_group_size() self.cast = P.Cast() def construct(self, @@ -430,7 +433,11 @@ class BertEvaluationWithLossScaleCell(nn.TrainOneStepWithLossScaleCell): """ def __init__(self, network, optimizer, scale_update_cell=None): super(BertEvaluationWithLossScaleCell, self).__init__(network, optimizer, scale_update_cell) + self.degree = 1 + if self.reducer_flag: + self.degree = get_group_size() self.cast = P.Cast() + def construct(self, input_ids, input_mask, diff --git a/official/recommend/ncf/README.md b/official/recommend/ncf/README.md index 9e3c8665a..7f6ced0ba 100644 --- a/official/recommend/ncf/README.md +++ b/official/recommend/ncf/README.md @@ -78,7 +78,7 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil # [Environment Requirements](#contents) -- Hardware(Ascend） +- Hardware(Ascend/GPU/CPU） - Prepare hardware environment with Ascend. 
- Framework - [MindSpore](https://www.mindspore.cn/install/en) @@ -94,9 +94,6 @@ After installing MindSpore via the official website, you can start training and #run data process bash scripts/run_download_dataset.sh -# run training example -bash scripts/run_train.sh - # run training example on Ascend bash scripts/run_train_ascend.sh @@ -106,9 +103,6 @@ bash scripts/run_train_gpu.sh # run training distribute example on Ascend bash scripts/run_distribute_train.sh /path/hccl.json /path/MovieLens -# run evaluation example -bash scripts/run_eval.sh - # run evaluation example on Ascend bash scripts/run_eval_ascend.sh @@ -170,10 +164,8 @@ If you want to run in modelarts, please check the official documentation of [mod │ ├──ascend_distributed_launcher │ ├──__init__.py // init file │ ├──get_distribute_pretrain_cmd.py // create distribute shell script - │ ├──run_train.sh // shell script for train │ ├──run_train_ascend.sh // shell script for train on Ascend │ ├──run_distribute_train.sh // shell script for distribute train - │ ├──run_eval.sh // shell script for evaluation │ ├──run_eval_ascend.sh // shell script for evaluation on Ascend │ ├──run_train_gpu.sh // shell script for train on GPU │ ├──run_eval_gpu.sh // shell script for evaluation on GPU @@ -233,12 +225,12 @@ Parameters for both training and evaluation can be set in config.py. 
```bash bash scripts/run_train_gpu.sh [DATASET_PATH] [CKPT_FILE] [DEVICE_ID] - ``` + ``` - on CPU ```bash - python train.py --data_path=./dataset --dataset=ml-1m --train_epochs=25 --batch_size=256 --output_path=./output/ --checkpoint_path=./checkpoint --device_target=CPU --device_id=0 --num_parallel_workers=2 > train.log 2>&1 & + python train.py --data_path=./dataset --dataset=ml-1m --train_epochs=25 --batch_size=256 --output_path=./output/ --checkpoint_path=./checkpoint --device_target=CPU --device_id=0 --num_parallel_workers=2 > train.log 2>&1 & ``` The python command above will run in the background, you can view the results through the file `train.log`. After training, you'll get some checkpoint files under the script folder by default. The loss value will be achieved as follows: @@ -284,7 +276,7 @@ Parameters for both training and evaluation can be set in config.py. - evaluation on ml-1m dataset when running on CPU ```bash - python eval.py --data_path=./dataset --dataset=ml-1m --eval_batch_size=160000 --output_path=./output/ --eval_file_name=eval.log --checkpoint_file_path=./ckpt --device_target=CPU --device_id=0 > log.txt 2>&1 & + python eval.py --data_path=./dataset --dataset=ml-1m --eval_batch_size=160000 --output_path=./output/ --eval_file_name=eval.log --checkpoint_file_path=./ckpt --device_target=CPU --device_id=0 > eval.log 2>&1 & ``` The accuracy of the test dataset will be as follows: diff --git a/official/recommend/ncf/scripts/run_eval.sh b/official/recommend/ncf/scripts/run_eval.sh deleted file mode 100644 index b7a054adb..000000000 --- a/official/recommend/ncf/scripts/run_eval.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# Copyright 2022 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -if [ $# != 2 ] -then - echo "Please run the script as: " - echo "bash scripts/run_eval.sh DATASET_PATH CKPT_FILE" - echo "for example: bash scripts/run_eval.sh /dataset_path /ncf.ckpt" -exit 1 -fi - - -data_path=$1 -ckpt_file=$2 -python ./eval.py --data_path $data_path --dataset 'ml-1m' --eval_batch_size 160000 --output_path './output/' --eval_file_name 'eval.log' --checkpoint_file_path $ckpt_file > log.txt 2>&1 & diff --git a/official/recommend/ncf/scripts/run_eval_ascend.sh b/official/recommend/ncf/scripts/run_eval_ascend.sh index 7a1c95ec3..7fbb068fd 100644 --- a/official/recommend/ncf/scripts/run_eval_ascend.sh +++ b/official/recommend/ncf/scripts/run_eval_ascend.sh @@ -24,4 +24,6 @@ fi data_path=$1 ckpt_file=$2 export DEVICE_ID=$3 -python ./eval.py --data_path $data_path --dataset 'ml-1m' --eval_batch_size 160000 --output_path './output/' --eval_file_name 'eval.log' --checkpoint_file_path $ckpt_file --device_target=Ascend --device_id $DEVICE_ID > log.txt 2>&1 & +python ./eval.py --data_path $data_path --dataset 'ml-1m' --eval_batch_size 160000 \ + --output_path './output/' --eval_file_name 'eval.log' --checkpoint_file_path $ckpt_file \ + --device_target=Ascend --device_id $DEVICE_ID > eval.log 2>&1 & diff --git a/official/recommend/ncf/scripts/run_eval_gpu.sh b/official/recommend/ncf/scripts/run_eval_gpu.sh index e253b6fa0..357b7677c 100644 --- a/official/recommend/ncf/scripts/run_eval_gpu.sh +++ b/official/recommend/ncf/scripts/run_eval_gpu.sh @@ -25,4 +25,6 @@ fi 
data_path=$1 ckpt_file=$2 export CUDA_VISIBLE_DEVICES=$3 -python ./eval.py --data_path $data_path --dataset 'ml-1m' --eval_batch_size 160000 --output_path './output/' --eval_file_name 'eval.log' --checkpoint_file_path $ckpt_file --device_target=GPU --device_id=0 > log.txt 2>&1 & +python ./eval.py --data_path $data_path --dataset 'ml-1m' --eval_batch_size 160000 \ + --output_path './output/' --eval_file_name 'eval.log' --checkpoint_file_path $ckpt_file \ + --device_target=GPU --device_id=0 > eval.log 2>&1 & diff --git a/official/recommend/ncf/scripts/run_train.sh b/official/recommend/ncf/scripts/run_train.sh deleted file mode 100644 index f7f947a93..000000000 --- a/official/recommend/ncf/scripts/run_train.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# Copyright 2022 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================ - -if [ $# != 2 ] -then - echo "Please run the script as: " - echo "bash scripts/run_train.sh DATASET_PATH CKPT_FILE" - echo "for example: bash scripts/run_train.sh /dataset_path /ncf.ckpt" -exit 1 -fi - -data_path=$1 -ckpt_file=$2 -python ./train.py --data_path $data_path --dataset 'ml-1m' --train_epochs 20 --batch_size 256 --output_path './output/' --checkpoint_path $ckpt_file > train.log 2>&1 & diff --git a/official/recommend/ncf/scripts/run_train_ascend.sh b/official/recommend/ncf/scripts/run_train_ascend.sh index 43626edb2..137f4ebff 100644 --- a/official/recommend/ncf/scripts/run_train_ascend.sh +++ b/official/recommend/ncf/scripts/run_train_ascend.sh @@ -23,4 +23,5 @@ fi data_path=$1 ckpt_file=$2 -python ./train.py --data_path $data_path --dataset 'ml-1m' --train_epochs 25 --batch_size 256 --output_path './output/' --checkpoint_path $ckpt_file --device_target=Ascend > train.log 2>&1 & +python ./train.py --data_path $data_path --dataset 'ml-1m' --train_epochs 25 --batch_size 256 \ + --output_path './output/' --checkpoint_path $ckpt_file --device_target=Ascend > train.log 2>&1 & diff --git a/research/cv/retinaface/README_CN.md b/research/cv/retinaface/README_CN.md index 251836bbf..8043f40a1 100644 --- a/research/cv/retinaface/README_CN.md +++ b/research/cv/retinaface/README_CN.md @@ -274,6 +274,7 @@ RetinaFace可以使用ResNet50或MobileNet0.25骨干提取图像特征进行检 - Ascend处理器环境运行（使用ResNet50作为backbone） ```bash + # 将src/config.py文件中nnpu参数改为1 python train.py --backbone_name 'ResNet50' > train.log 2>&1 & OR bash ./scripts/run_standalone_train_ascend.sh -- GitLab