diff --git a/official/cv/Deepsort/generater-detection.py b/official/cv/Deepsort/generater-detection.py
index 98e4344252bd5646f673894619ce32f201a76cef..660d2e3429334474807e5b1e79e6825d34af6663 100644
--- a/official/cv/Deepsort/generater-detection.py
+++ b/official/cv/Deepsort/generater-detection.py
@@ -217,6 +217,8 @@ if __name__ == "__main__":
     if target not in ('GPU', "Ascend"):
         raise ValueError("Unsupported device target.")
 
+    device_id = int(os.getenv('DEVICE_ID', '0'))
+    device_num = int(os.getenv('RANK_SIZE', '1'))
     context.set_context(mode=context.GRAPH_MODE,
                         device_target=target,
                         save_graphs=False,
@@ -224,8 +226,6 @@ if __name__ == "__main__":
 
     if args.run_modelarts:
         import moxing as mox
-        device_id = int(os.getenv('DEVICE_ID'))
-        device_num = int(os.getenv('RANK_SIZE'))
         context.set_context(device_id=device_id)
         local_data_url = '/cache/data'
         local_ckpt_url = '/cache/ckpt'
@@ -243,17 +243,14 @@ if __name__ == "__main__":
         det_dir = local_det_url + '/'
     elif target == "Ascend":
         if args.run_distribute:
-            device_id = int(os.getenv('DEVICE_ID'))
-            device_num = int(os.getenv('RANK_SIZE'))
             context.set_context(device_id=device_id)
             init()
             context.reset_auto_parallel_context()
             context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
                                               gradients_mean=True)
         else:
-            context.set_context(device_id=args.device_id)
+            context.set_context(device_id=device_id)
             device_num = 1
-            device_id = args.device_id
         DATA_DIR = args.data_url
         local_train_url = args.train_url
         ckpt_dir = args.ckpt_url
@@ -268,7 +265,6 @@ if __name__ == "__main__":
                                               parallel_mode=ParallelMode.DATA_PARALLEL,
                                               gradients_mean=True)
         else:
-            device_id = 0
             context.set_context(device_id=device_id)
         DATA_DIR = args.data_url
         local_train_url = args.train_url
diff --git a/official/cv/Deepsort/scripts/run_standalone_train_ascend.sh b/official/cv/Deepsort/scripts/run_standalone_train_ascend.sh
new file mode 100644
index 0000000000000000000000000000000000000000..3a34d8fc39deb30983f709ebc47569da0365aca9
--- /dev/null
+++ b/official/cv/Deepsort/scripts/run_standalone_train_ascend.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+if [ $# != 2 ]; then
+    echo "Usage: bash run_standalone_train_ascend.sh [DATA_PATH] [CKPT_PATH]"
+    exit 1
+fi
+
+export RANK_SIZE=1
+python src/deep/train.py --data_url $1 --train_url $2 --device=Ascend > out.log 2>&1
\ No newline at end of file
diff --git a/official/cv/Deepsort/src/deep/train.py b/official/cv/Deepsort/src/deep/train.py
index 43eb22876e267c1364a711aa3688314d866dc4c9..0e08218f5c6e3efadd57414b442baffad1dc404c 100644
--- a/official/cv/Deepsort/src/deep/train.py
+++ b/official/cv/Deepsort/src/deep/train.py
@@ -20,7 +20,7 @@ import mindspore.dataset.vision as C
 import mindspore.dataset as ds
 import mindspore.nn as nn
 from mindspore import Tensor, context
-from mindspore.communication.management import init, get_rank
+from mindspore.communication.management import init
 from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, LossMonitor, TimeMonitor
 from mindspore.train.model import Model
 from mindspore.context import ParallelMode
@@ -62,12 +62,12 @@ if target not in ('GPU', "Ascend"):
 context.set_context(mode=context.GRAPH_MODE,
                     device_target=target,
                     save_graphs=False)
-device_num = int(os.getenv('RANK_SIZE'))
+device_num = int(os.getenv('RANK_SIZE', '1'))
+device_id = int(os.getenv('DEVICE_ID', '0'))
+rank = int(os.getenv('RANK_ID', '0'))
 
 if args.run_modelarts:
     import moxing as mox
-    device_id = int(os.getenv('DEVICE_ID'))
-    device_num = int(os.getenv('RANK_SIZE'))
     cfg.batch_size = cfg.batch_size*int(8/device_num)
     context.set_context(device_id=device_id)
     local_data_url = '/cache/data'
@@ -80,8 +80,6 @@ if args.run_modelarts:
     DATA_DIR = local_data_url + '/'
 elif target == "Ascend":
     if args.run_distribute:
-        device_id = int(os.getenv('DEVICE_ID'))
-        device_num = int(os.getenv('RANK_SIZE'))
         cfg.batch_size = cfg.batch_size*int(8/device_num)
         context.set_context(device_id=device_id)
         init()
@@ -89,22 +87,18 @@ elif target == "Ascend":
         context.set_auto_parallel_context(device_num=device_num,\
             parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True)
     else:
-        context.set_context(device_id=args.device_id)
+        context.set_context(device_id=device_id)
         device_num = 1
         cfg.batch_size = cfg.batch_size*int(8/device_num)
-        device_id = args.device_id
     DATA_DIR = args.data_url + '/'
 elif target == "GPU":
     if args.run_distribute:
         init("nccl")
         context.reset_auto_parallel_context()
-        rank = get_rank()
         context.set_auto_parallel_context(device_num=device_num,
                                           parallel_mode=ParallelMode.DATA_PARALLEL,
                                           gradients_mean=True)
     else:
-        rank = 0
-        device_id = int(os.getenv('DEVICE_ID'))
         context.set_context(device_id=device_id)
 
 DATA_DIR = args.data_url
diff --git a/official/cv/c3d/src/tools/ckpt_convert.py b/official/cv/c3d/src/tools/ckpt_convert.py
index 17be761c0fc0d7ada774d93be4698867fc568651..ff00e20a195a0c4ecd7693c6e9b22f229849a456 100644
--- a/official/cv/c3d/src/tools/ckpt_convert.py
+++ b/official/cv/c3d/src/tools/ckpt_convert.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ============================================================================
+import sys
 
 import torch
 
 from mindspore import Tensor
diff --git a/official/nlp/transformer/scripts/run_eval.sh b/official/nlp/transformer/scripts/run_eval.sh
index 0ce68c6ae1ac306aa4a93a9d704760d49db2d6ae..ad70b383861bbc75cf539bc87329530189361b2b 100644
--- a/official/nlp/transformer/scripts/run_eval.sh
+++ b/official/nlp/transformer/scripts/run_eval.sh
@@ -26,6 +26,7 @@ fi
 export DEVICE_TARGET=$1
 export CONFIG_PATH=$5
 DEVICE_ID=$2
+export DEVICE_ID=$2
 
 get_real_path(){
   if [ "${1:0:1}" == "/" ]; then
diff --git a/official/nlp/transformer/scripts/run_standalone_train.sh b/official/nlp/transformer/scripts/run_standalone_train.sh
index 43f11bd7938156e10f7b3d07b4c75d24c81f15c3..96b16a47440fdedc32352e51f6031634a7b3e3d8 100644
--- a/official/nlp/transformer/scripts/run_standalone_train.sh
+++ b/official/nlp/transformer/scripts/run_standalone_train.sh
@@ -35,6 +35,7 @@ fi
 
 export DEVICE_TARGET=$1
 DEVICE_ID=$2
+export DEVICE_ID=$2
 EPOCH_SIZE=$3
 GRADIENT_ACCUMULATE_STEP=$4
 DATA_PATH=$5
diff --git a/research/cv/PSPNet/README.md b/research/cv/PSPNet/README.md
index 9f866fb248e1e5eb3727208b43f5ffa883c57884..83bfac1c692ab149160554eff6f74e7dd473522e 100644
--- a/research/cv/PSPNet/README.md
+++ b/research/cv/PSPNet/README.md
@@ -251,7 +251,7 @@ The ckpt_file parameter is required,
 |training parameter |epoch=100,batch_size=8 |
 |optimizer |SGD optimizer, momentum=0.9, weight_decay=0.0001 |
 |loss function |SoftmaxCrossEntropyLoss |
-|training speed |epoch time: 493974.632 ms, per step time: 464.699 ms(1p for voc2012)|
+|training speed |epoch time: 493974.632 ms, per step time: 464.699 ms(1p for voc2012), 485 ms(8p for voc2012), 998 ms(1p for ADE20K), 1050 ms(8p for ADE20K)|
 |total time |6h10m34s(1pcs) |
 |Script URL |https://gitee.com/mindspore/models/tree/master/research/cv/PSPNet|
 |Random number seed |set_seed = 1234 |
diff --git a/research/cv/Spnas/README.md b/research/cv/Spnas/README.md
index 9801ac1dc6e7103f17cad13d10bb2a483a2ba48c..3d08c96248ba054a5afd7b17284e8d7c543d66d9 100644
--- a/research/cv/Spnas/README.md
+++ b/research/cv/Spnas/README.md
@@ -86,7 +86,7 @@ Spnas
 ### For training
 
 ```bash
-python3 train.py --config_path=src/spnas.yaml
+python3 train.py --config_file=src/spnas.yaml
 ```
 
 > Or one can run following script for all tasks.
diff --git a/research/cv/Spnas/scripts/run_distributed.sh b/research/cv/Spnas/scripts/run_distributed.sh
index dbc589f03db6547263c2285397c64794fdc66abb..a5c5639bcdc60f8e6af136ec4fe6514f936892c6 100644
--- a/research/cv/Spnas/scripts/run_distributed.sh
+++ b/research/cv/Spnas/scripts/run_distributed.sh
@@ -20,4 +20,4 @@ export RANK_SIZE=8
 RANK_TABLE_FILE=$(realpath $1)
 export RANK_TABLE_FILE
 
-python3 ../train.py --config_path=../src/spnas_distributed.yml
\ No newline at end of file
+python3 ../train.py --config_file=../src/spnas_distributed.yml
\ No newline at end of file
diff --git a/research/cv/Spnas/train.py b/research/cv/Spnas/train.py
index 722c5e89193b8ada62fbd419b1a3337251427bd0..3239540d64ef570660d401f4dfd3ce36adaf4ae9 100644
--- a/research/cv/Spnas/train.py
+++ b/research/cv/Spnas/train.py
@@ -19,9 +19,9 @@
 import vega
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="Spnas network")
-    parser.add_argument("--config_path", type=str, required=True, help="spnas config path.")
+    parser.add_argument("--config_file", type=str, required=True, help="spnas config path.")
     args = parser.parse_args()
-    config_path = args.config_path
+    config_file = args.config_file
     vega.set_backend('mindspore', 'NPU')
-    vega.run(config_path)
+    vega.run(config_file)
diff --git a/research/cv/rcnn/README.md b/research/cv/rcnn/README.md
index 5f6b445570557145d18d16cecb5df921ce61d449..4da1d2ed096f5a44448829ff011301dcac56662b 100644
--- a/research/cv/rcnn/README.md
+++ b/research/cv/rcnn/README.md
@@ -105,8 +105,8 @@ python process_data.py
 - running on Ascend:
 
 ```shell
-bash scripts/run_standalone_train_ascend.sh 0
- ```
+bash run_standalone_train_ascend.sh 0
+```
 
 The command above will run in the background, you can view the results through the file logs
 
@@ -137,7 +137,7 @@ bash scripts/run_standalone_train_ascend.sh 0
 - running on GPU:
 
 ```shell
-bash scripts/run_standalone_train_gpu.sh 0
+bash run_standalone_train_gpu.sh 0
 ```
 
 The command above will run in the background, you can view the results through the file logs
@@ -181,7 +181,7 @@ bash scripts/run_standalone_train_gpu.sh 0
 - distributed running on Ascend:
 
 ```shell
-bash ./scripts/run_distribute_train_ascend.sh rank_table.json
+bash run_distribute_train_ascend.sh rank_table.json
 ```
 
 The above shell script will run distribute training in the background. You can view the results through the file train_parallel[X]/logs. The loss value will be achieved as follows:
@@ -207,7 +207,7 @@ bash ./scripts/run_distribute_train_ascend.sh rank_table.json
 - distributed running on GPU:
 
 ```shell
-bash scripts/run_distribute_train_gpu.sh
+bash run_distribute_train_gpu.sh
 ```
 
 The above shell script will run distribute training in the background.
@@ -250,7 +250,7 @@ scripts/log_train_regression:[2021-11-09 14:40:58.586][DEBUG] trainer.py(121)->t
 Before running the command below, please check the checkpoint path used for evaluation. Please set the checkpoint path to be the absolute full path.
 
 ```shell
-bash scripts/run_standalone_eval_ascend.sh 0
+bash run_standalone_eval_ascend.sh 0
 ```
 
 The above python command will run in the background. You can view the results through the file "eval.log". The accuracy of the test dataset will be as follows:
@@ -267,7 +267,7 @@ svm_thresh: 0.6, map: 0.31060216644862054
 - running on GPU:
 
 Before running the command below, please check the checkpoint path used for evaluation. Please set the checkpoint path to be the absolute full path.
 
 ```shell
-bash scripts/run_standalone_eval_gpu.sh 0
+bash run_standalone_eval_gpu.sh 0
 ```
 
 The above python command will run in the background. You can view the results through the file "eval.log". The accuracy of the test dataset will be as follows:
diff --git a/research/cv/rcnn/README_CN.md b/research/cv/rcnn/README_CN.md
index 0755f0b14aaa852c0581fd349a0b13a13a61e033..62bc592996b02856d8f5530bfe58f97a0744fd6b 100644
--- a/research/cv/rcnn/README_CN.md
+++ b/research/cv/rcnn/README_CN.md
@@ -95,7 +95,7 @@ For details of the overall R-CNN network architecture, please refer to the link: [Link](https://arxiv.org/abs
 
 ```shell
 python process_data.py
- ```
+```
 
 ### [Training Process](#contents)
 
@@ -104,7 +104,7 @@
 - training on Ascend:
 
 ```shell
-bash scripts/run_standalone_train_ascend.sh 0
+bash run_standalone_train_ascend.sh 0
 ```
 
 The above command will run in the background; you can view the results in the log file.
@@ -135,7 +135,7 @@ bash scripts/run_standalone_train_ascend.sh 0
 
 Run training by executing the following command:
 
-bash scripts/run_standalone_train_gpu.sh 0
+bash run_standalone_train_gpu.sh 0
 
 After training, some checkpoint files are generated under the script folder by default; the results can be viewed in the log file. The loss is as follows:
 
@@ -174,7 +174,7 @@
 - distributed training on Ascend:
 
 ```shell
-bash ./scripts/run_distribute_train_ascend.sh rank_table.json
+bash run_distribute_train_ascend.sh rank_table.json
 ```
 
 The above script will run in the background. You can view the results through the file train_parallel[X]/logs. The loss is as follows:
@@ -200,7 +200,7 @@
 - distributed training on GPU:
 
 ```shell
-bash scripts/run_distribute_train_gpu.sh
+bash run_distribute_train_gpu.sh
 ```
 
 The above script will run in the background.
@@ -243,7 +243,7 @@ scripts/log_train_regression:[2021-11-09 14:40:58.586][DEBUG] trainer.py(121)->t
 Before running the command below, please check the checkpoint path used for evaluation and set it to an absolute full path.
 
 ```shell
-bash scripts/run_standalone_eval_ascend.sh 0
+bash run_standalone_eval_ascend.sh 0
 ```
 
 The above script will run in the background. You can view the results through "eval.log"; the accuracy on the test dataset is as follows:
@@ -261,7 +261,7 @@
 
 
 ```shell
-bash scripts/run_standalone_eval_gpu.sh 0
+bash run_standalone_eval_gpu.sh 0
 ```
 
diff --git a/research/cv/rcnn/scripts/run_distribute_train_gpu.sh b/research/cv/rcnn/scripts/run_distribute_train_gpu.sh
index 367fc5f72c8675fe0afde3184ec40036b58e187c..33cb93f5303e2336eb83662c67394fbc34094294 100644
--- a/research/cv/rcnn/scripts/run_distribute_train_gpu.sh
+++ b/research/cv/rcnn/scripts/run_distribute_train_gpu.sh
@@ -24,11 +24,11 @@ export RANK_SIZE=8
 echo "start training ... "
 
 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 mpirun --allow-run-as-root -n 8 --output-filename log_output_finetune --merge-stderr-to-stdout \
-python train.py --step 0 >log_train_finetune 2>&1 &
+python ../train.py --step 0 >log_train_finetune 2>&1 &
 wait
 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 mpirun --allow-run-as-root -n 8 --output-filename log_output_svm --merge-stderr-to-stdout \
-python train.py --step 1 >log_train_svm 2>&1 &
+python ../train.py --step 1 >log_train_svm 2>&1 &
 wait
 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 mpirun --allow-run-as-root -n 8 --output-filename log_output_regression --merge-stderr-to-stdout \
-python train.py --step 2 >log_train_regression 2>&1 &
+python ../train.py --step 2 >log_train_regression 2>&1 &
 cd ..
diff --git a/research/cv/rcnn/scripts/run_standalone_eval_gpu.sh b/research/cv/rcnn/scripts/run_standalone_eval_gpu.sh
index 6389d9ebaf912ffa82b9316f5956daa91d1bf514..361fd643525723e9fb261717d7539a006bf9ab1a 100644
--- a/research/cv/rcnn/scripts/run_standalone_eval_gpu.sh
+++ b/research/cv/rcnn/scripts/run_standalone_eval_gpu.sh
@@ -20,5 +20,6 @@ exit 1
 fi
 
 export DEVICE_ID=$1
+cd ../
 python eval.py --device_id=${DEVICE_ID} 1>scripts/result.txt 2>scripts/eval_log.txt &
 cd ..
diff --git a/research/cv/rcnn/scripts/run_standalone_train_gpu.sh b/research/cv/rcnn/scripts/run_standalone_train_gpu.sh
index d227b8fd584b853f1f950c6b346980e5d2790477..c18eb5c04651b83bd6779b1a964959f187d80ebb 100644
--- a/research/cv/rcnn/scripts/run_standalone_train_gpu.sh
+++ b/research/cv/rcnn/scripts/run_standalone_train_gpu.sh
@@ -20,9 +20,9 @@ exit 1
 fi
 
 export DEVICE_ID=$1
-python train.py --device_id=${DEVICE_ID} --step 0 >train_log_finetune 2>&1 &
+python ../train.py --device_id=${DEVICE_ID} --step 0 >train_log_finetune 2>&1 &
 wait
-python train.py --device_id=${DEVICE_ID} --step 1 >train_log_svm 2>&1 &
+python ../train.py --device_id=${DEVICE_ID} --step 1 >train_log_svm 2>&1 &
 wait
-python train.py --device_id=${DEVICE_ID} --step 2 >train_log_regression 2>&1 &
+python ../train.py --device_id=${DEVICE_ID} --step 2 >train_log_regression 2>&1 &
 cd ..
diff --git a/research/cv/wideresnet/train.py b/research/cv/wideresnet/train.py
index 5d455b39e5ba85462278d083c86de24b2ef6b6da..c8efea25211360a863ffd2fda43d648a0fbda2ee 100644
--- a/research/cv/wideresnet/train.py
+++ b/research/cv/wideresnet/train.py
@@ -27,12 +27,12 @@ from mindspore.context import ParallelMode
 from mindspore import Tensor
 from mindspore.nn.optim import Momentum
 from mindspore.train.loss_scale_manager import FixedLossScaleManager
-from mindspore.train.callback import ModelCheckpoint, CheckpointConfig
+from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor
 from mindspore.train.model import Model
 import mindspore.nn as nn
 import mindspore.common.initializer as weight_init
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
-from src.callbacks import CustomLossMonitor, TimeMonitor, EvalCallback
+from src.callbacks import CustomLossMonitor, EvalCallback
 from src.wide_resnet import wideresnet
 from src.dataset import create_dataset
 from src.model_utils.config import config as cfg
diff --git a/research/nlp/ktnet/scripts/run_squad_eval.sh b/research/nlp/ktnet/scripts/run_squad_eval.sh
index d043541c670c782c8fe4448973e4a3b25d736908..574754c9d38d8c79019fefd659dcda9e95ad72e3 100644
--- a/research/nlp/ktnet/scripts/run_squad_eval.sh
+++ b/research/nlp/ktnet/scripts/run_squad_eval.sh
@@ -16,13 +16,18 @@
 
 PWD_DIR=`pwd`
 DATA=$1
+scripts_path=$(dirname $0)
 LOAD_CHECKPOINT_PATH=$2
 
 BERT_DIR=$DATA/cased_L-24_H-1024_A-16
 WN_CPT_EMBEDDING_PATH=$DATA/KB_embeddings/wn_concept2vec.txt
 NELL_CPT_EMBEDDING_PATH=$DATA/KB_embeddings/nell_concept2vec.txt
 
-python3 run_KTNET_squad_eval.py \
+if [ ! -d log ]; then
+    mkdir log
+fi
+
+python3 $scripts_path/../run_KTNET_squad_eval.py \
   --device_target "Ascend" \
   --device_id 0 \
   --batch_size 8 \
diff --git a/research/nlp/ternarybert/scripts/run_standalone_eval_ascend.sh b/research/nlp/ternarybert/scripts/run_standalone_eval_ascend.sh
index d5390a49a3bcd9611d649f5002fa932d79632532..94eaebb8bedb25e36c10c1b6e85b24522d248e73 100644
--- a/research/nlp/ternarybert/scripts/run_standalone_eval_ascend.sh
+++ b/research/nlp/ternarybert/scripts/run_standalone_eval_ascend.sh
@@ -27,10 +27,8 @@ echo "===============================================start evaling==============
 
 task_name=$1
 device_target=$2
-device_id=$3
-model_dir=$4
-data_dir=$5
-device_id=$6
+model_dir=$3
+data_dir=$4
 
 mkdir -p ms_log
 PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd)
@@ -40,6 +38,6 @@ export GLOG_logtostderr=0
 python ${PROJECT_DIR}/../eval.py \
     --task_name=$task_name \
     --device_target=$device_target \
-    --device_id=$device_id \
+    --device_id=$DEVICE_ID \
     --model_dir=$model_dir \
     --data_dir=$data_dir > eval_log.txt 2>&1 &