Commit 130000a6 authored by anzhengqi

modify some network scripts

parent b60f327b
Showing with 72 additions and 52 deletions
......@@ -217,6 +217,8 @@ if __name__ == "__main__":
if target not in ('GPU', "Ascend"):
raise ValueError("Unsupported device target.")
device_id = int(os.getenv('DEVICE_ID', '0'))
device_num = int(os.getenv('RANK_SIZE', '1'))
context.set_context(mode=context.GRAPH_MODE,
device_target=target,
save_graphs=False,
......@@ -224,8 +226,6 @@ if __name__ == "__main__":
if args.run_modelarts:
import moxing as mox
device_id = int(os.getenv('DEVICE_ID'))
device_num = int(os.getenv('RANK_SIZE'))
context.set_context(device_id=device_id)
local_data_url = '/cache/data'
local_ckpt_url = '/cache/ckpt'
......@@ -243,17 +243,14 @@ if __name__ == "__main__":
det_dir = local_det_url + '/'
elif target == "Ascend":
if args.run_distribute:
device_id = int(os.getenv('DEVICE_ID'))
device_num = int(os.getenv('RANK_SIZE'))
context.set_context(device_id=device_id)
init()
context.reset_auto_parallel_context()
context.set_auto_parallel_context(device_num=device_num,
parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True)
else:
context.set_context(device_id=args.device_id)
context.set_context(device_id=device_id)
device_num = 1
device_id = args.device_id
DATA_DIR = args.data_url
local_train_url = args.train_url
ckpt_dir = args.ckpt_url
......@@ -268,7 +265,6 @@ if __name__ == "__main__":
parallel_mode=ParallelMode.DATA_PARALLEL,
gradients_mean=True)
else:
device_id = 0
context.set_context(device_id=device_id)
DATA_DIR = args.data_url
local_train_url = args.train_url
......
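The hunks above move the DEVICE_ID / RANK_SIZE reads to one place and give them single-device defaults, so the standalone branch no longer needs `args.device_id` or a hard-coded id. A minimal, self-contained sketch of that pattern (illustrative only; the target and branch structure of the real file are simplified):

```python
import os
from mindspore import context

# Read the launch environment once, with single-device defaults.
device_id = int(os.getenv('DEVICE_ID', '0'))
device_num = int(os.getenv('RANK_SIZE', '1'))

context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
                    save_graphs=False, device_id=device_id)
```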
#!/bin/bash
# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
if [ $# != 2 ]; then
echo "Usage: bash run_distribute_train_ascend.sh [DATA_PATH] [CKPT_PATH]"
exit 1
fi
export RANK_SIZE=1
# Positional arguments (validated above): $1 = DATA_PATH, $2 = CKPT_PATH
python src/deep/train.py --data_url "$1" --train_url "$2" --device=Ascend > out.log 2>&1
\ No newline at end of file
......@@ -20,7 +20,7 @@ import mindspore.dataset.vision as C
import mindspore.dataset as ds
import mindspore.nn as nn
from mindspore import Tensor, context
from mindspore.communication.management import init, get_rank
from mindspore.communication.management import init
from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, LossMonitor, TimeMonitor
from mindspore.train.model import Model
from mindspore.context import ParallelMode
......@@ -62,12 +62,12 @@ if target not in ('GPU', "Ascend"):
context.set_context(mode=context.GRAPH_MODE, device_target=target, save_graphs=False)
device_num = int(os.getenv('RANK_SIZE'))
device_num = int(os.getenv('RANK_SIZE', '1'))
device_id = int(os.getenv('DEVICE_ID', '0'))
rank = int(os.getenv('RANK_ID', '0'))
if args.run_modelarts:
import moxing as mox
device_id = int(os.getenv('DEVICE_ID'))
device_num = int(os.getenv('RANK_SIZE'))
cfg.batch_size = cfg.batch_size*int(8/device_num)
context.set_context(device_id=device_id)
local_data_url = '/cache/data'
......@@ -80,8 +80,6 @@ if args.run_modelarts:
DATA_DIR = local_data_url + '/'
elif target == "Ascend":
if args.run_distribute:
device_id = int(os.getenv('DEVICE_ID'))
device_num = int(os.getenv('RANK_SIZE'))
cfg.batch_size = cfg.batch_size*int(8/device_num)
context.set_context(device_id=device_id)
init()
......@@ -89,22 +87,18 @@ elif target == "Ascend":
context.set_auto_parallel_context(device_num=device_num,\
parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True)
else:
context.set_context(device_id=args.device_id)
context.set_context(device_id=device_id)
device_num = 1
cfg.batch_size = cfg.batch_size*int(8/device_num)
device_id = args.device_id
DATA_DIR = args.data_url + '/'
elif target == "GPU":
if args.run_distribute:
init("nccl")
context.reset_auto_parallel_context()
rank = get_rank()
context.set_auto_parallel_context(device_num=device_num,
parallel_mode=ParallelMode.DATA_PARALLEL,
gradients_mean=True)
else:
rank = 0
device_id = int(os.getenv('DEVICE_ID'))
context.set_context(device_id=device_id)
DATA_DIR = args.data_url
......
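In the GPU branch above, `get_rank()` is no longer imported; the rank is instead read from the RANK_ID environment variable with a default of 0. A short sketch of the resulting pattern (assumed standalone snippet; dataset and model code omitted):

```python
import os
from mindspore.communication.management import init

# Rank and world size now come from the launch environment rather than
# from get_rank(), so a single-device run needs no init() call.
rank = int(os.getenv('RANK_ID', '0'))
device_num = int(os.getenv('RANK_SIZE', '1'))

if device_num > 1:
    init()  # initialize the communication backend for multi-device runs
```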
......@@ -13,6 +13,7 @@
# limitations under the License.
# ============================================================================
import sys
import torch
from mindspore import Tensor
......
......@@ -26,6 +26,7 @@ fi
export DEVICE_TARGET=$1
export CONFIG_PATH=$5
DEVICE_ID=$2
export DEVICE_ID=$2
get_real_path(){
if [ "${1:0:1}" == "/" ]; then
......
......@@ -35,6 +35,7 @@ fi
export DEVICE_TARGET=$1
DEVICE_ID=$2
export DEVICE_ID=$2
EPOCH_SIZE=$3
GRADIENT_ACCUMULATE_STEP=$4
DATA_PATH=$5
......
......@@ -251,7 +251,7 @@ The ckpt_file parameter is required,
|training parameter |epoch=100,batch_size=8 |
|optimizer |SGD optimizer,momentum=0.9,weight_decay=0.0001 |
|loss function |SoftmaxCrossEntropyLoss |
|training speed |epoch time: 493974.632 ms, per step time: 464.699 ms(1p for voc2012)|
|training speed |epoch time: 493974.632 ms, per step time: 464.699 ms(1p for voc2012), 485 ms(8p for voc2012), 998 ms(1p for ADE20K), 1050 ms(8p for ADE20K)|
|total time |6h10m34s(1pcs) |
|Script URL |https://gitee.com/mindspore/models/tree/master/research/cv/PSPNet|
|Random number seed |set_seed = 1234 |
......
......@@ -86,7 +86,7 @@ Spnas
### For training
```bash
python3 train.py --config_path=src/spnas.yaml
python3 train.py --config_file=src/spnas.yaml
```
> Or one can run the following script for all tasks.
......
......@@ -20,4 +20,4 @@ export RANK_SIZE=8
RANK_TABLE_FILE=$(realpath $1)
export RANK_TABLE_FILE
python3 ../train.py --config_path=../src/spnas_distributed.yml
\ No newline at end of file
python3 ../train.py --config_file=../src/spnas_distributed.yml
\ No newline at end of file
......@@ -19,9 +19,9 @@ import vega
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Spnas network")
parser.add_argument("--config_path", type=str, required=True, help="spnas config path.")
parser.add_argument("--config_file", type=str, required=True, help="spnas config path.")
args = parser.parse_args()
config_path = args.config_path
config_file = args.config_file
vega.set_backend('mindspore', 'NPU')
vega.run(config_path)
vega.run(config_file)
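For reference, a minimal sketch of the renamed option and an example invocation (illustrative only; the vega calls from the file above are omitted):

```python
import argparse

# The option is now --config_file (previously --config_path); its value is
# still the path to the spnas YAML configuration.
parser = argparse.ArgumentParser(description="Spnas network")
parser.add_argument("--config_file", type=str, required=True, help="spnas config path.")

# Example invocation, equivalent to: python3 train.py --config_file=src/spnas.yaml
args = parser.parse_args(["--config_file=src/spnas.yaml"])
print(args.config_file)  # -> src/spnas.yaml
```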
......@@ -105,8 +105,8 @@ python process_data.py
- running on Ascend:
```shell
bash scripts/run_standalone_train_ascend.sh 0
```
bash run_standalone_train_ascend.sh 0
```
The command above will run in the background; you can view the results through the log file.
......@@ -137,7 +137,7 @@ bash scripts/run_standalone_train_ascend.sh 0
- running on GPU:
```shell
bash scripts/run_standalone_train_gpu.sh 0
bash run_standalone_train_gpu.sh 0
```
The command above will run in the background; you can view the results through the log file.
......@@ -181,7 +181,7 @@ bash scripts/run_standalone_train_gpu.sh 0
- distributed running on Ascend:
```shell
bash ./scripts/run_distribute_train_ascend.sh rank_table.json
bash run_distribute_train_ascend.sh rank_table.json
```
The above shell script will run distributed training in the background. You can view the results through the file train_parallel[X]/logs. The loss values will be as follows:
......@@ -207,7 +207,7 @@ bash ./scripts/run_distribute_train_ascend.sh rank_table.json
- distributed running on GPU:
```shell
bash scripts/run_distribute_train_gpu.sh
bash run_distribute_train_gpu.sh
```
The above shell script will run distributed training in the background.
......@@ -250,7 +250,7 @@ scripts/log_train_regression:[2021-11-09 14:40:58.586][DEBUG] trainer.py(121)->t
Before running the command below, please check the checkpoint path used for evaluation and set it to an absolute full path.
```shell
bash scripts/run_standalone_eval_ascend.sh 0
bash run_standalone_eval_ascend.sh 0
```
The above command will run in the background. You can view the results through the file "eval.log". The accuracy on the test dataset will be as follows:
......@@ -267,7 +267,7 @@ svm_thresh: 0.6, map: 0.31060216644862054
Before running the command below, please check the checkpoint path used for evaluation and set it to an absolute full path.
```shell
bash scripts/run_standalone_eval_gpu.sh 0
bash run_standalone_eval_gpu.sh 0
```
The above command will run in the background. You can view the results through the file "eval.log". The accuracy on the test dataset will be as follows:
......
......@@ -95,7 +95,7 @@ For details of the overall R-CNN network architecture, see: [Link](https://arxiv.org/abs
```shell
python process_data.py
```
```
### [Training Process](#contents)
......@@ -104,7 +104,7 @@ python process_data.py
- Training on Ascend:
```shell
bash scripts/run_standalone_train_ascend.sh 0
bash run_standalone_train_ascend.sh 0
```
The above command will run in the background; you can view the results in the log file.
......@@ -135,7 +135,7 @@ bash scripts/run_standalone_train_ascend.sh 0
Run the following command to start training:
bash scripts/run_standalone_train_gpu.sh 0
bash run_standalone_train_gpu.sh 0
After training, checkpoint files are generated under the script folder by default; the results can be viewed in the log file, and the loss is as follows:
......@@ -174,7 +174,7 @@ bash scripts/run_standalone_train_gpu.sh 0
- Distributed training on Ascend:
```shell
bash ./scripts/run_distribute_train_ascend.sh rank_table.json
bash run_distribute_train_ascend.sh rank_table.json
```
The above script will run in the background. You can view the results through the file train_parallel[X]/logs. The loss is as follows:
......@@ -200,7 +200,7 @@ bash ./scripts/run_distribute_train_ascend.sh rank_table.json
- Distributed training on GPU:
```shell
bash scripts/run_distribute_train_gpu.sh
bash run_distribute_train_gpu.sh
```
The above script will run in the background.
......@@ -243,7 +243,7 @@ scripts/log_train_regression:[2021-11-09 14:40:58.586][DEBUG] trainer.py(121)->t
Before running the following command, please check the checkpoint path used for evaluation and set it to an absolute full path.
```shell
bash scripts/run_standalone_eval_ascend.sh 0
bash run_standalone_eval_ascend.sh 0
```
The above script will run in the background. You can view the results through "eval.log"; the accuracy on the test dataset is as follows:
......@@ -261,7 +261,7 @@ svm_thresh: 0.6, map: 0.31060216644862054
```shell
bash scripts/run_standalone_eval_gpu.sh 0
bash run_standalone_eval_gpu.sh 0
```
......
......@@ -24,11 +24,11 @@ export RANK_SIZE=8
echo "start training ... "
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 mpirun --allow-run-as-root -n 8 --output-filename log_output_finetune --merge-stderr-to-stdout \
python train.py --step 0 >log_train_finetune 2>&1 &
python ../train.py --step 0 >log_train_finetune 2>&1 &
wait
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 mpirun --allow-run-as-root -n 8 --output-filename log_output_svm --merge-stderr-to-stdout \
python train.py --step 1 >log_train_svm 2>&1 &
python ../train.py --step 1 >log_train_svm 2>&1 &
wait
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 mpirun --allow-run-as-root -n 8 --output-filename log_output_regression --merge-stderr-to-stdout \
python train.py --step 2 >log_train_regression 2>&1 &
python ../train.py --step 2 >log_train_regression 2>&1 &
cd ..
......@@ -20,5 +20,6 @@ exit 1
fi
export DEVICE_ID=$1
cd ../
python eval.py --device_id=${DEVICE_ID} 1>scripts/result.txt 2>scripts/eval_log.txt &
cd ..
......@@ -20,9 +20,9 @@ exit 1
fi
export DEVICE_ID=$1
python train.py --device_id=${DEVICE_ID} --step 0 >train_log_finetune 2>&1 &
python ../train.py --device_id=${DEVICE_ID} --step 0 >train_log_finetune 2>&1 &
wait
python train.py --device_id=${DEVICE_ID} --step 1 >train_log_svm 2>&1 &
python ../train.py --device_id=${DEVICE_ID} --step 1 >train_log_svm 2>&1 &
wait
python train.py --device_id=${DEVICE_ID} --step 2 >train_log_regression 2>&1 &
python ../train.py --device_id=${DEVICE_ID} --step 2 >train_log_regression 2>&1 &
cd ..
......@@ -27,12 +27,12 @@ from mindspore.context import ParallelMode
from mindspore import Tensor
from mindspore.nn.optim import Momentum
from mindspore.train.loss_scale_manager import FixedLossScaleManager
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor
from mindspore.train.model import Model
import mindspore.nn as nn
import mindspore.common.initializer as weight_init
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from src.callbacks import CustomLossMonitor, TimeMonitor, EvalCallback
from src.callbacks import CustomLossMonitor, EvalCallback
from src.wide_resnet import wideresnet
from src.dataset import create_dataset
from src.model_utils.config import config as cfg
......
......@@ -16,13 +16,18 @@
PWD_DIR=`pwd`
DATA=$1
scripts_path=$(dirname $0)
LOAD_CHECKPOINT_PATH=$2
BERT_DIR=$DATA/cased_L-24_H-1024_A-16
WN_CPT_EMBEDDING_PATH=$DATA/KB_embeddings/wn_concept2vec.txt
NELL_CPT_EMBEDDING_PATH=$DATA/KB_embeddings/nell_concept2vec.txt
python3 run_KTNET_squad_eval.py \
if [ ! -d log ]; then
mkdir log
fi
python3 $scripts_path/../run_KTNET_squad_eval.py \
--device_target "Ascend" \
--device_id 0 \
--batch_size 8 \
......
......@@ -27,10 +27,8 @@ echo "===============================================start evaling==============
task_name=$1
device_target=$2
device_id=$3
model_dir=$4
data_dir=$5
device_id=$6
model_dir=$3
data_dir=$4
mkdir -p ms_log
PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd)
......@@ -40,6 +38,6 @@ export GLOG_logtostderr=0
python ${PROJECT_DIR}/../eval.py \
--task_name=$task_name \
--device_target=$device_target \
--device_id=$device_id \
--device_id=$DEVICE_ID \
--model_dir=$model_dir \
--data_dir=$data_dir > eval_log.txt 2>&1 &