diff --git a/research/cv/single_path_nas/README.md b/research/cv/single_path_nas/README.md
index ae660b649592fc8998de928da8c2c3622efa6051..ac54c8ae3694d5574d98872490f32a61ab5736f2 100644
--- a/research/cv/single_path_nas/README.md
+++ b/research/cv/single_path_nas/README.md
@@ -94,13 +94,16 @@ After installing MindSpore through the official website, you can follow the step
```bash
# Run the training example
- python train.py --device_id=0 > train.log 2>&1 &
+ python train.py --device_id=0 --device_target="Ascend" --data_path=/imagenet/train > train.log 2>&1 &
+
+ # Run the standalone training example
+ bash ./scripts/run_standalone_train_ascend.sh [DEVICE_ID] [DATA_PATH]
# Run a distributed training example
- bash ./scripts/run_train.sh [RANK_TABLE_FILE] imagenet
+ bash ./scripts/run_distribute_train_ascend.sh [RANK_TABLE_FILE] [DEVICE_NUM] [DATA_PATH]
# Run evaluation example
- python eval.py --checkpoint_path ./ckpt_0 > ./eval.log 2>&1 &
+ python eval.py --checkpoint_path=./ckpt_0 --val_data_path=/imagenet/val --device_id=0 --device_target="Ascend" > ./eval.log 2>&1 &
# Run the inference example
bash run_infer_310.sh [MINDIR_PATH] [DATA_PATH] [DEVICE_ID]
@@ -116,27 +119,30 @@ After installing MindSpore through the official website, you can follow the step
```bash
# Run the training example
- python train.py --device_target="GPU" --data_path="/path/to/imagenet/train/" --lr_init=0.26 > train.log 2>&1 &
+ python train.py --device_id=0 --device_target="GPU" --data_path=/imagenet/train > train.log 2>&1 &
+
+ # Run the standalone training example
+ bash ./scripts/run_standalone_train_gpu.sh [DEVICE_ID] [DATA_PATH] > train.log 2>&1 &
# Run a distributed training example
- bash ./scripts/run_distribute_train_gpu.sh "/path/to/imagenet/train/"
+ bash ./scripts/run_distribute_train_gpu.sh [CUDA_VISIBLE_DEVICES] [DEVICE_NUM] [DATA_PATH]
# Run evaluation example
- python eval.py --device_target="GPU" --val_data_path="/path/to/imagenet/val/" --checkpoint_path ./ckpt_0 > ./eval.log 2>&1 &
+ python eval.py --device_target="GPU" --device_id=0 --val_data_path="/path/to/imagenet/val/" --checkpoint_path ./ckpt_0 > ./eval.log 2>&1 &
```
# Scripts Description
## Scripts and sample code
-```bash
+```text
├── model_zoo
├── scripts
- │ ├──run_distribute_train.sh // Shell script for running the Ascend distributed training
- │ ├──run_distribute_train_gou.sh // Shell script for running the GPU distributed training
- │ ├──run_standalone_train.sh // Shell script for running the Ascend standalone training
+ │ ├──run_distribute_train_ascend.sh // Shell script for running the Ascend distributed training
+ │ ├──run_distribute_train_gpu.sh // Shell script for running the GPU distributed training
+ │ ├──run_standalone_train_ascend.sh // Shell script for running the Ascend standalone training
│ ├──run_standalone_train_gpu.sh // Shell script for running the GPU standalone training
- │ ├──run_eval.sh // Shell script for running the Ascend evaluation
+ │ ├──run_eval_ascend.sh // Shell script for running the Ascend evaluation
│ ├──run_eval_gpu.sh // Shell script for running the GPU evaluation
│ ├──run_infer_310.sh // Shell script for running the Ascend 310 inference
├── src
@@ -177,10 +183,6 @@ Training parameters and evaluation parameters can be configured in a `config.py`
'weight_decay':1e-5 # Weight decay value
'image_height':224 # Height of the model input image
'image_width':224 # Width of the model input image
- 'data_path':'/data/ILSVRC2012_train/' # The absolute path to the training dataset
- 'val_data_path':'/data/ILSVRC2012_val/' # The absolute path to the validation dataset
- 'device_target':'Ascend' # Device
- 'device_id':0 # ID of the device used for training/evaluation.
'keep_checkpoint_max':40 # Number of checkpoints to keep
'checkpoint_path':None # The absolute path to the checkpoint file or a directory, where the checkpoints are saved
@@ -205,7 +207,7 @@ For more configuration details, please refer to the script `config.py`.
- Using an Ascend processor environment
```bash
- python train.py --device_id=0 > train.log 2>&1 &
+ python train.py --device_id=0 --device_target="Ascend" --data_path=/imagenet/train > train.log 2>&1 &
```
The above python command will run in the background, and the result can be viewed through the generated train.log file.
@@ -213,7 +215,7 @@ For more configuration details, please refer to the script `config.py`.
- Using an GPU environment
```bash
- python train.py --device_target='GPU' --data_path="/path/to/imagenet/train/" --lr_init=0.26 > train.log 2>&1 &
+ python train.py --device_id=0 --device_target="GPU" --data_path=/imagenet/train > train.log 2>&1 &
```
The above python command will run in the background, and the result can be viewed through the generated train.log file.
@@ -223,7 +225,7 @@ For more configuration details, please refer to the script `config.py`.
- Using an Ascend processor environment
```bash
- bash ./scripts/run_distribute_train.sh [RANK_TABLE_FILE]
+ bash ./scripts/run_distribute_train_ascend.sh [RANK_TABLE_FILE] [DEVICE_NUM] [DATA_PATH]
```
The above shell script will run distributed training in the background.
@@ -231,7 +233,7 @@ For more configuration details, please refer to the script `config.py`.
- Using a GPU environment
```bash
- bash ./scripts/run_distribute_train_gpu.sh [TRAIN_PATH](optional)
+ bash ./scripts/run_distribute_train_gpu.sh [CUDA_VISIBLE_DEVICES] [DEVICE_NUM] [DATA_PATH]
```
> TRAIN_PATH - Path to the directory with the training subset of the dataset.
@@ -249,9 +251,9 @@ the training log files and the checkpoints will be stored.
“./ckpt_0” is a directory, where the trained model is saved in the .ckpt format.
```bash
- python eval.py --checkpoint_path ./ckpt_0 > ./eval.log 2>&1 &
+ python eval.py --checkpoint_path=./ckpt_0 --device_id=0 --device_target="Ascend" --val_data_path=/imagenet/val > ./eval.log 2>&1 &
OR
- bash ./scripts/run_eval.sh
+ bash ./scripts/run_eval_ascend.sh [DEVICE_ID] [DATA_PATH] [CKPT_FILE/CKPT_DIR]
```
- Evaluate the model on the ImageNet-1k dataset using the GPU environment
@@ -259,9 +261,9 @@ the training log files and the checkpoints will be stored.
“./ckpt_0” is a directory, where the trained model is saved in the .ckpt format.
```bash
- python eval.py --device_target="GPU" --checkpoint_path ./ckpt_0 > ./eval.log 2>&1 &
+ python eval.py --checkpoint_path=./ckpt_0 --device_id=0 --device_target="GPU" --val_data_path=/imagenet/val > ./eval.log 2>&1 &
OR
- bash ./scripts/run_eval_gpu.sh [CKPT_FILE_OR_DIR] [VALIDATION_DATASET](optional)
+ bash ./scripts/run_eval_gpu.sh [DEVICE_ID] [DATA_PATH] [CKPT_FILE/CKPT_DIR]
```
> CKPT_FILE_OR_DIR - Path to the trained model checkpoint or to the directory, containing checkpoints.
diff --git a/research/cv/single_path_nas/README_CN.md b/research/cv/single_path_nas/README_CN.md
index 62c6d04c6d95599f77e88920ea26721e403b443f..cb424a576d83166634772f5466994a3b113fcb4c 100644
--- a/research/cv/single_path_nas/README_CN.md
+++ b/research/cv/single_path_nas/README_CN.md
@@ -77,13 +77,16 @@ single-path-nas的作者用一个7x7的大卷积,来代表3x3、5x5和7x7的
```bash
# 运行训练示例
- python train.py --device_id=0 > train.log 2>&1 &
+ python train.py --device_id=0 --data_path=/imagenet/train --device_target="Ascend" > train.log 2>&1 &
+
+ # 运行单卡训练示例
+ bash ./scripts/run_standalone_train_ascend.sh [DEVICE_ID] [DATA_PATH]
# 运行分布式训练示例
- bash ./scripts/run_train.sh [RANK_TABLE_FILE] imagenet
+ bash ./scripts/run_distribute_train_ascend.sh [RANK_TABLE_FILE] [DEVICE_NUM] [DATA_PATH]
# 运行评估示例
- python eval.py --checkpoint_path ./ckpt_0 > ./eval.log 2>&1 &
+ python eval.py --checkpoint_path=./ckpt_0 --device_id=0 --device_target="Ascend" --val_data_path=/imagenet/val > ./eval.log 2>&1 &
# 运行推理示例
bash run_infer_310.sh [MINDIR_PATH] [DATA_PATH] [DEVICE_ID]
@@ -99,14 +102,14 @@ single-path-nas的作者用一个7x7的大卷积,来代表3x3、5x5和7x7的
## 脚本及样例代码
-```bash
+```text
├── model_zoo
├── scripts
- │ ├──run_distribute_train.sh // 分布式到Ascend的shell脚本
- │ ├──run_distribute_train_gou.sh // Shell script for running the GPU distributed training
- │ ├──run_standalone_train.sh // Shell script for running the Ascend standalone training
+ │ ├──run_distribute_train_ascend.sh // 分布式到Ascend的shell脚本
+ │ ├──run_distribute_train_gpu.sh // Shell script for running the GPU distributed training
+ │ ├──run_standalone_train_ascend.sh // Shell script for running the Ascend standalone training
│ ├──run_standalone_train_gpu.sh // Shell script for running the GPU standalone training
- │ ├──run_eval.sh // 测试脚本
+ │ ├──run_eval_ascend.sh // 测试脚本
│ ├──run_eval_gpu.sh // Shell script for running the GPU evaluation
│ ├──run_infer_310.sh // 310推理脚本
├── src
@@ -147,10 +150,6 @@ single-path-nas的作者用一个7x7的大卷积,来代表3x3、5x5和7x7的
'weight_decay':1e-5 # 权重衰减值
'image_height':224 # 输入到模型的图像高度
'image_width':224 # 输入到模型的图像宽度
- 'data_path':'/data/ILSVRC2012_train/' # 训练数据集的绝对全路径
- 'val_data_path':'/data/ILSVRC2012_val/' # 评估数据集的绝对全路径
- 'device_target':'Ascend' # 运行设备
- 'device_id':0 # 用于训练或评估数据集的设备ID使用run_train.sh进行分布式训练时可以忽略。
'keep_checkpoint_max':40 # 最多保存80个ckpt模型文件
'checkpoint_path':None # checkpoint文件保存的绝对全路径
```
@@ -164,7 +163,7 @@ single-path-nas的作者用一个7x7的大卷积,来代表3x3、5x5和7x7的
- Ascend处理器环境运行
```bash
- python train.py --device_id=0 > train.log 2>&1 &
+ python train.py --device_id=0 --device_target="Ascend" --data_path=/imagenet/train > train.log 2>&1 &
```
上述python命令将在后台运行,可以通过生成的train.log文件查看结果。
@@ -174,7 +173,7 @@ single-path-nas的作者用一个7x7的大卷积,来代表3x3、5x5和7x7的
- Ascend处理器环境运行
```bash
- bash ./scripts/run_train.sh [RANK_TABLE_FILE] imagenet
+ bash ./scripts/run_distribute_train_ascend.sh [RANK_TABLE_FILE] [DEVICE_NUM] [DATA_PATH]
```
上述shell脚本将在后台运行分布训练。
@@ -188,9 +187,9 @@ single-path-nas的作者用一个7x7的大卷积,来代表3x3、5x5和7x7的
“./ckpt_0”是保存了训练好的.ckpt模型文件的目录。
```bash
- python eval.py --checkpoint_path ./ckpt_0 > ./eval.log 2>&1 &
+ python eval.py --checkpoint_path=./ckpt_0 --device_id=0 --device_target="Ascend" --val_data_path=/imagenet/val > ./eval.log 2>&1 &
OR
- bash ./scripts/run_eval.sh
+ bash ./scripts/run_eval_ascend.sh [DEVICE_ID] [DATA_PATH] [CKPT_FILE/CKPT_DIR]
```
## 导出过程
diff --git a/research/cv/single_path_nas/eval.py b/research/cv/single_path_nas/eval.py
index ddd8a3e4b6c0f3a0e3783fdb56d481bcce561e38..3a1805014a77cce50582dfaf1ee82a5a44eabb43 100644
--- a/research/cv/single_path_nas/eval.py
+++ b/research/cv/single_path_nas/eval.py
@@ -37,14 +37,15 @@ from src.dataset import create_dataset_imagenet
set_seed(1)
parser = argparse.ArgumentParser(description='single-path-nas')
-parser.add_argument('--dataset_name', type=str, default='imagenet', choices=['imagenet',],
+parser.add_argument('--dataset_name', type=str, default='imagenet', choices=['imagenet'],
help='dataset name.')
-parser.add_argument('--val_data_path', type=str, default=None,
+parser.add_argument('--val_data_path', type=str, default=None, required=True,
help='Path to the validation dataset (e.g. "/datasets/imagenet/val/")')
-parser.add_argument('--device_target', type=str, choices=['Ascend', 'GPU', 'CPU'],
- default=None, help='Target device: Ascend, GPU or CPU')
-parser.add_argument('--checkpoint_path', type=str, default='./ckpt_0', help='Checkpoint file path or dir path')
-parser.add_argument('--device_id', type=int, default=None, help='device id of Ascend. (Default: None)')
+parser.add_argument('--device_target', type=str, choices=['Ascend', 'GPU', 'CPU'], required=True,
+ default="Ascend", help='Target device: Ascend, GPU or CPU')
+parser.add_argument('--checkpoint_path', type=str, default='./ckpt_0', help='Checkpoint file path or dir path',
+ required=True)
+parser.add_argument('--device_id', type=int, default=None, help='device id of Ascend. (Default: None)', required=True)
args_opt = parser.parse_args()
@@ -67,20 +68,10 @@ class CrossEntropySmooth(LossBase):
if __name__ == '__main__':
-
+ device_target = args_opt.device_target
if args_opt.dataset_name == "imagenet":
cfg = imagenet_cfg
-
- if args_opt.val_data_path is not None:
- cfg.val_data_path = args_opt.val_data_path
-
- if args_opt.device_target is not None:
- cfg.device_target = args_opt.device_target
-
- device_target = cfg.device_target
- dataset_drop_reminder = (device_target == 'GPU')
-
- dataset = create_dataset_imagenet(cfg.val_data_path, 1, False, drop_reminder=dataset_drop_reminder)
+ dataset = create_dataset_imagenet(args_opt.val_data_path, 1, False, drop_reminder=True)
else:
raise ValueError("dataset is not support.")
@@ -91,12 +82,8 @@ if __name__ == '__main__':
net = spnasnet.spnasnet(num_classes=cfg.num_classes)
model = Model(net, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'})
- context.set_context(mode=context.GRAPH_MODE, device_target=cfg.device_target)
- if device_target == "Ascend":
- if args_opt.device_id is not None:
- context.set_context(device_id=args_opt.device_id)
- else:
- context.set_context(device_id=cfg.device_id)
+ context.set_context(mode=context.GRAPH_MODE, device_target=device_target)
+ context.set_context(device_id=args_opt.device_id)
print(f'Checkpoint path: {args_opt.checkpoint_path}')
diff --git a/research/cv/single_path_nas/scripts/docker_start.sh b/research/cv/single_path_nas/scripts/docker_start.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ff4ec545577096e72691d0cdbaa4403e2aacadca
--- /dev/null
+++ b/research/cv/single_path_nas/scripts/docker_start.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+
+# Copyright(C) 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+docker_image=$1
+data_dir=$2
+model_dir=$3
+
+docker run -it --ipc=host \
+ --device=/dev/davinci0 \
+ --device=/dev/davinci1 \
+ --device=/dev/davinci2 \
+ --device=/dev/davinci3 \
+ --device=/dev/davinci4 \
+ --device=/dev/davinci5 \
+ --device=/dev/davinci6 \
+ --device=/dev/davinci7 \
+ --device=/dev/davinci_manager \
+ --device=/dev/devmm_svm --device=/dev/hisi_hdc \
+ -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
+ -v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ \
+ -v ${model_dir}:${model_dir} \
+ -v ${data_dir}:${data_dir} \
+ -v ~/ascend/log/npu/conf/slog/slog.conf:/var/log/npu/conf/slog/slog.conf \
+ -v ~/ascend/log/npu/slog/:/var/log/npu/slog -v ~/ascend/log/npu/profiling/:/var/log/npu/profiling \
+ -v ~/ascend/log/npu/dump/:/var/log/npu/dump -v ~/ascend/log/npu/:/usr/slog ${docker_image} \
+ /bin/bash
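For reference, a possible invocation of the new docker_start.sh helper; the image tag and host directories below are placeholders, not values taken from this patch:

```bash
# Arguments map positionally to docker_image ($1), data_dir ($2) and model_dir ($3)
bash scripts/docker_start.sh mindspore-ascend:latest /data/imagenet /home/work/single_path_nas
```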
diff --git a/research/cv/single_path_nas/scripts/run_distribute_train.sh b/research/cv/single_path_nas/scripts/run_distribute_train_ascend.sh
similarity index 81%
rename from research/cv/single_path_nas/scripts/run_distribute_train.sh
rename to research/cv/single_path_nas/scripts/run_distribute_train_ascend.sh
index fab9920c8223c639bf50adbc5e9284cdb9cf1165..22bccb50b9ed315bba78492f88500a4e209c8402 100644
--- a/research/cv/single_path_nas/scripts/run_distribute_train.sh
+++ b/research/cv/single_path_nas/scripts/run_distribute_train_ascend.sh
@@ -14,9 +14,9 @@
# limitations under the License.
# ============================================================================
-if [ $# != 1 ]
+if [ $# != 3 ]
then
- echo "Usage: bash run_train.sh [RANK_TABLE_FILE]"
+ echo "Usage: bash ./scripts/run_distribute_train_ascend.sh [RANK_TABLE_FILE] [DEVICE_NUM] [DATA_PATH]"
exit 1
fi
@@ -31,8 +31,9 @@ dataset_type='imagenet'
ulimit -u unlimited
-export DEVICE_NUM=8
-export RANK_SIZE=8
+export DEVICE_NUM=$2
+export RANK_SIZE=$2
+export DATA_PATH=$3
RANK_TABLE_FILE=$(realpath $1)
export RANK_TABLE_FILE
echo "RANK_TABLE_FILE=${RANK_TABLE_FILE}"
@@ -50,6 +51,6 @@ do
echo "start training for rank $RANK_ID, device $DEVICE_ID, $dataset_type"
cd ./train_parallel$i ||exit
env > env.log
- python train.py --device_id=$i --dataset_name=$dataset_type> log 2>&1 &
+ python train.py --device_id=$i --dataset_name=$dataset_type --data_path=$3 --device_target="Ascend" > log 2>&1 &
cd ..
done
\ No newline at end of file
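A sketch of the updated three-argument launch, assuming an 8-device rank table; the JSON path and dataset location are illustrative only:

```bash
# [RANK_TABLE_FILE] [DEVICE_NUM] [DATA_PATH]
bash ./scripts/run_distribute_train_ascend.sh /path/to/rank_table_8pcs.json 8 /imagenet/train
```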
diff --git a/research/cv/single_path_nas/scripts/run_distribute_train_gpu.sh b/research/cv/single_path_nas/scripts/run_distribute_train_gpu.sh
index 0a64c51536dbe8a692542178305f803053b40f87..098450e59ef893b91d4ff8dbf6411cd81bedbf70 100644
--- a/research/cv/single_path_nas/scripts/run_distribute_train_gpu.sh
+++ b/research/cv/single_path_nas/scripts/run_distribute_train_gpu.sh
@@ -13,50 +13,20 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
-
-
-if [ $# != 0 ] && [ $# != 1 ]
-then
- echo "Usage: bash run_distribute_train_gpu.sh [TRAIN_DATASET](optional)"
- exit 1
-fi
-
-if [ $# == 1 ] && [ ! -d $1 ]
+if [ $# -lt 3 ]
then
- echo "error: TRAIN_DATASET=$1 is not a directory"
- exit 1
+ echo "Usage: bash ./scripts/run_distributed_train_gpu.sh [CUDA_VISIBLE_DEVICES] [DEVICE_NUM] [DATA_PATH]"
+exit 1
fi
-ulimit -u unlimited
+dataset_name="imagenet"
+export RANK_SIZE=$2
+export DEVICE_NUM=$2
+export CUDA_VISIBLE_DEVICES=$1
+DATA_PATH=$3
-rm -rf ./train_parallel
-mkdir ./train_parallel
-cp ./*.py ./train_parallel
-cp -r ./src ./train_parallel
-cd ./train_parallel || exit
-env > env.log
+mpirun -n ${DEVICE_NUM} --allow-run-as-root --output-filename log_output \
+--merge-stderr-to-stdout python train.py \
+--device_target="GPU" --dataset_name=$dataset_name \
+--data_path=$DATA_PATH > log.txt 2>&1 &
-if [ $# == 0 ]
-then
- mpirun -n 8 \
- --allow-run-as-root \
- --output-filename 'log_output' \
- --merge-stderr-to-stdout \
- python ./train.py \
- --use_gpu_distributed=1 \
- --device_target='GPU' \
- --lr_init=1.5 > log.txt 2>&1 &
-fi
-
-if [ $# == 1 ]
-then
- mpirun -n 8 \
- --allow-run-as-root \
- --output-filename 'log_output' \
- --merge-stderr-to-stdout \
- python ./train.py \
- --use_gpu_distributed=1 \
- --device_target='GPU' \
- --data_path="$1" \
- --lr_init=1.5 > log.txt 2>&1 &
-fi
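Likewise, a hypothetical call to the rewritten GPU launcher on an 8-GPU host; the first argument is the comma-separated device list, the second the device count, and the dataset path is illustrative:

```bash
# [CUDA_VISIBLE_DEVICES] [DEVICE_NUM] [DATA_PATH]
bash ./scripts/run_distribute_train_gpu.sh 0,1,2,3,4,5,6,7 8 /imagenet/train
```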
diff --git a/research/cv/single_path_nas/scripts/run_eval.sh b/research/cv/single_path_nas/scripts/run_eval_ascend.sh
similarity index 71%
rename from research/cv/single_path_nas/scripts/run_eval.sh
rename to research/cv/single_path_nas/scripts/run_eval_ascend.sh
index 6cef937954c9c3281d56cf2fca4a56ee75fe9b54..177b5bfd5b14ded25a41e363c916d27e3636fb7b 100644
--- a/research/cv/single_path_nas/scripts/run_eval.sh
+++ b/research/cv/single_path_nas/scripts/run_eval_ascend.sh
@@ -14,25 +14,25 @@
# limitations under the License.
# ============================================================================
-if [ $# != 1 ]
+if [ $# != 3 ]
then
- echo "Usage: bash run_eval.sh checkpoint_path_dir/checkpoint_path_file"
+ echo "Usage: bash ./scripts/run_eval_ascend.sh [DEVICE_ID] [DATA_PATH] [CKPT_FILE/CKPT_DIR]"
exit 1
fi
-if [ ! -d $1 ] && [ ! -f $1 ]
+if [ ! -d $3 ] && [ ! -f $3 ]
then
- echo "error: checkpoint_path=$1 is neither a directory nor a file"
+ echo "error: checkpoint_path=$3 is neither a directory nor a file"
exit 1
fi
ulimit -u unlimited
export DEVICE_NUM=1
-export DEVICE_ID=0
+export DEVICE_ID=$1
export RANK_SIZE=$DEVICE_NUM
export RANK_ID=0
echo "start evaluation for device $DEVICE_ID"
-python eval.py --checkpoint_path=$1 > ./eval.log 2>&1 &
\ No newline at end of file
+python eval.py --checkpoint_path=$3 --val_data_path=$2 --device_id=$DEVICE_ID --device_target="Ascend" > ./eval.log 2>&1 &
\ No newline at end of file
diff --git a/research/cv/single_path_nas/scripts/run_eval_gpu.sh b/research/cv/single_path_nas/scripts/run_eval_gpu.sh
index f30886a8269eb6728a7528e6a9191dd2d0cb2f20..6b30c2a84607997da521eee19a952e06d3a72bb9 100644
--- a/research/cv/single_path_nas/scripts/run_eval_gpu.sh
+++ b/research/cv/single_path_nas/scripts/run_eval_gpu.sh
@@ -14,38 +14,25 @@
# limitations under the License.
# ============================================================================
-if [ $# != 1 ] && [ $# != 2 ]
+if [ $# != 3 ]
then
- echo "Usage: bash run_eval_gpu.sh [CKPT_FILE_OR_DIR] [VALIDATION_DATASET](optional)"
- exit 1
+ echo "Usage: bash ./scripts/run_eval_gpu.sh [DEVICE_ID] [DATA_PATH] [CKPT_FILE/CKPT_DIR]"
+ exit 1
fi
-if [ ! -d $1 ] && [ ! -f $1 ]
+if [ ! -d $3 ] && [ ! -f $3 ]
then
- echo "error: CKPT_FILE_OR_DIR=$1 is neither a directory nor a file"
- exit 1
+ echo "error: checkpoint_path=$3 is neither a directory nor a file"
+ exit 1
fi
-if [ $# == 2 ] && [ ! -d $2 ]
-then
- echo "error: VALIDATION_DATASET=$2 is not a directory"
- exit 1
-fi
ulimit -u unlimited
+export DEVICE_NUM=1
+export DEVICE_ID=$1
+export RANK_SIZE=$DEVICE_NUM
+export RANK_ID=0
-if [ $# == 1 ]
-then
- GLOG_v=3 python eval.py \
- --checkpoint_path="$1" \
- --device_target="GPU" > "./eval.log" 2>&1 &
-fi
-
-if [ $# == 2 ]
-then
- GLOG_v=3 python eval.py \
- --checkpoint_path="$1" \
- --val_data_path="$2" \
- --device_target="GPU" > "./eval.log" 2>&1 &
-fi
+echo "start evaluation for device $DEVICE_ID"
+python eval.py --checkpoint_path=$3 --val_data_path=$2 --device_id=$DEVICE_ID --device_target="GPU" > ./eval.log 2>&1 &
\ No newline at end of file
diff --git a/research/cv/single_path_nas/scripts/run_standalone_train.sh b/research/cv/single_path_nas/scripts/run_standalone_train_ascend.sh
similarity index 76%
rename from research/cv/single_path_nas/scripts/run_standalone_train.sh
rename to research/cv/single_path_nas/scripts/run_standalone_train_ascend.sh
index 884c99f8b3fa8d0f5d664725c78f9e7466e9d069..9cd87039241654989cc95ecfda59361862f695fc 100644
--- a/research/cv/single_path_nas/scripts/run_standalone_train.sh
+++ b/research/cv/single_path_nas/scripts/run_standalone_train_ascend.sh
@@ -14,15 +14,9 @@
# limitations under the License.
# ============================================================================
-if [ $# != 0 ]
+if [ $# != 2 ]
then
- echo "Usage: bash run_train.sh"
-exit 1
-fi
-
-if [ ! -f $1 ]
-then
- echo "error: RANK_TABLE_FILE=$1 is not a file"
+ echo "Usage: bash ./scripts/run_standalone_train_ascend.sh [DEVICE_ID] [DATA_PATH]"
exit 1
fi
@@ -31,10 +25,11 @@ dataset_type='imagenet'
ulimit -u unlimited
-export DEVICE_ID=0
+export DEVICE_ID=$1
+export DATA_PATH=$2
export DEVICE_NUM=1
export RANK_ID=0
export RANK_SIZE=1
echo "start training for device $DEVICE_ID"
-python train.py --device_id=$DEVICE_ID --dataset_name=$dataset_type> log 2>&1 &
\ No newline at end of file
+python train.py --device_id=$DEVICE_ID --data_path=$2 --dataset_name=$dataset_type --device_target="Ascend" > log 2>&1 &
\ No newline at end of file
diff --git a/research/cv/single_path_nas/scripts/run_standalone_train_gpu.sh b/research/cv/single_path_nas/scripts/run_standalone_train_gpu.sh
index 16b3e7fbd4c0d5cf3e7e42a649c9c0b0d9ac22d4..944977ff65a8f42b06726473d190f6eeb47d42cd 100644
--- a/research/cv/single_path_nas/scripts/run_standalone_train_gpu.sh
+++ b/research/cv/single_path_nas/scripts/run_standalone_train_gpu.sh
@@ -1,5 +1,5 @@
#!/bin/bash
-# Copyright 2021 Huawei Technologies Co., Ltd
+# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -14,33 +14,22 @@
# limitations under the License.
# ============================================================================
-if [ $# != 0 ] && [ $# != 1 ]
+if [ $# != 2 ]
then
- echo "Usage: bash run_standalone_train_gpu.sh [TRAIN_DATASET](optional)"
- exit 1
+ echo "Usage: bash ./scripts/run_standalone_train_gpu.sh [DEVICE_ID] [DATA_PATH]"
+ exit 1
fi
-if [ $# == 1 ] && [ ! -d $1 ]
-then
- echo "error: TRAIN_DATASET=$1 is not a directory"
- exit 1
-fi
-ulimit -u unlimited
+dataset_type='imagenet'
-rm -rf ./train_standalone
-mkdir ./train_standalone
-cp ./*.py ./train_standalone
-cp -r ./src ./train_standalone
-cd ./train_standalone || exit
-env > env.log
-if [ $# == 0 ]
-then
- python train.py --device_target='GPU' --lr_init=0.26 > log.txt 2>&1 &
-fi
+ulimit -u unlimited
+export DEVICE_ID=$1
+export DATA_PATH=$2
+export DEVICE_NUM=1
+export RANK_ID=0
+export RANK_SIZE=1
-if [ $# == 1 ]
-then
- python train.py --device_target='GPU' --data_path="$1" --lr_init=0.26 > log.txt 2>&1 &
-fi
+echo "start training for device $DEVICE_ID"
+python train.py --device_id=$DEVICE_ID --data_path=$2 --dataset_name=$dataset_type --device_target="GPU" > log 2>&1 &
\ No newline at end of file
diff --git a/research/cv/single_path_nas/src/config.py b/research/cv/single_path_nas/src/config.py
index 54e9594cfae81bcbae84b0519dc1fd0739ab2fe9..09227a3cf9c7d17ad18b1e4690cc5f64f3c920b4 100644
--- a/research/cv/single_path_nas/src/config.py
+++ b/research/cv/single_path_nas/src/config.py
@@ -28,10 +28,6 @@ imagenet_cfg = edict({
'weight_decay': 1e-5,
'image_height': 224,
'image_width': 224,
- 'data_path': '/data/ILSVRC2012_train/',
- 'val_data_path': '/data/ILSVRC2012_val/',
- 'device_target': 'Ascend',
- 'device_id': 0,
'keep_checkpoint_max': 40,
'checkpoint_path': None,
'onnx_filename': 'single-path-nas',
diff --git a/research/cv/single_path_nas/train.py b/research/cv/single_path_nas/train.py
index 3a54e28f3a794b31935b0e345e90aaff1207955f..f23d741390ee88200bab7680cf2687a7c2971e63 100644
--- a/research/cv/single_path_nas/train.py
+++ b/research/cv/single_path_nas/train.py
@@ -74,15 +74,11 @@ if __name__ == '__main__':
help='dataset name.')
parser.add_argument('--filter_prefix', type=str, default='huawei',
help='filter_prefix name.')
- parser.add_argument('--lr_init', type=float, default=None,
- help='Override the learning rate value in the configuration file')
- parser.add_argument('--device_id', type=int, default=None,
+ parser.add_argument('--device_id', type=int, default=0,
help='device id of Ascend. (Default: None)')
- parser.add_argument('--device_target', type=str, choices=['Ascend', 'GPU'],
- default=None, help='Target device: Ascend or GPU')
- parser.add_argument('--use_gpu_distributed', type=int, default=0,
- help='Enable distributed GPU training.')
- parser.add_argument('--data_path', type=str, default=None,
+ parser.add_argument('--device_target', type=str, choices=['Ascend', 'GPU'], required=True,
+ default="Ascend", help='Target device: Ascend or GPU')
+ parser.add_argument('--data_path', type=str, default=None, required=True,
help='Path to the training dataset (e.g. "/datasets/imagenet/train/")')
args_opt = parser.parse_args()
@@ -92,32 +88,20 @@ if __name__ == '__main__':
else:
raise ValueError("Unsupported dataset.")
- if args_opt.data_path is not None:
- cfg.data_path = args_opt.data_path
-
- # set context
- if args_opt.device_target is not None:
- cfg.device_target = args_opt.device_target
-
- if args_opt.lr_init is not None:
- cfg.lr_init = args_opt.lr_init
-
- device_target = cfg.device_target
+ device_target = args_opt.device_target
# We enabling the graph kernel only for the Ascend device.
- enable_graph_kernel = (device_target == 'Ascend')
+ # enable_graph_kernel = (device_target == 'Ascend')
+ enable_graph_kernel = False
- context.set_context(mode=context.GRAPH_MODE, device_target=cfg.device_target,
+ context.set_context(mode=context.GRAPH_MODE, device_target=device_target,
enable_graph_kernel=enable_graph_kernel)
- device_num = int(os.environ.get("DEVICE_NUM", 1))
+ device_num = int(os.environ.get("DEVICE_NUM", "1"))
rank = 0
if device_target == "Ascend":
- if args_opt.device_id is not None:
- context.set_context(device_id=args_opt.device_id)
- else:
- context.set_context(device_id=cfg.device_id)
+ context.set_context(device_id=args_opt.device_id)
if device_num > 1:
context.reset_auto_parallel_context()
@@ -128,7 +112,7 @@ if __name__ == '__main__':
rank = get_rank()
elif device_target == "GPU":
# Using the rank and devices number determined by the communication module.
- if args_opt.use_gpu_distributed == 1:
+ if device_num > 1:
init('nccl')
device_num = get_group_size()
rank = get_rank()
@@ -136,9 +120,6 @@ if __name__ == '__main__':
context.set_auto_parallel_context(device_num=device_num,
parallel_mode=ParallelMode.DATA_PARALLEL,
gradients_mean=True)
- else:
- device_num = 1
-
else:
raise ValueError("Unsupported platform.")
@@ -146,12 +127,12 @@ if __name__ == '__main__':
if args_opt.dataset_name == "imagenet":
if device_num > 1:
- dataset = create_dataset_imagenet(cfg.data_path, 1, num_parallel_workers=8,
+ dataset = create_dataset_imagenet(args_opt.data_path, 1, num_parallel_workers=8,
device_num=device_num, rank_id=rank,
- drop_reminder=dataset_drop_reminder)
+ drop_reminder=True)
else:
- dataset = create_dataset_imagenet(cfg.data_path, 1, num_parallel_workers=8,
- drop_reminder=dataset_drop_reminder)
+ dataset = create_dataset_imagenet(args_opt.data_path, 1, num_parallel_workers=8,
+ drop_reminder=True)
else:
raise ValueError("Unsupported dataset.")