Commit 130000a6 authored by anzhengqi

modify some network scripts

parent b60f327b
Showing with 72 additions and 52 deletions
......@@ -217,6 +217,8 @@ if __name__ == "__main__":
if target not in ('GPU', "Ascend"):
raise ValueError("Unsupported device target.")
device_id = int(os.getenv('DEVICE_ID', '0'))
device_num = int(os.getenv('RANK_SIZE', '1'))
context.set_context(mode=context.GRAPH_MODE,
device_target=target,
save_graphs=False,
......@@ -224,8 +226,6 @@ if __name__ == "__main__":
if args.run_modelarts:
import moxing as mox
device_id = int(os.getenv('DEVICE_ID'))
device_num = int(os.getenv('RANK_SIZE'))
context.set_context(device_id=device_id)
local_data_url = '/cache/data'
local_ckpt_url = '/cache/ckpt'
......@@ -243,17 +243,14 @@ if __name__ == "__main__":
det_dir = local_det_url + '/'
elif target == "Ascend":
if args.run_distribute:
device_id = int(os.getenv('DEVICE_ID'))
device_num = int(os.getenv('RANK_SIZE'))
context.set_context(device_id=device_id)
init()
context.reset_auto_parallel_context()
context.set_auto_parallel_context(device_num=device_num,
parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True)
else:
context.set_context(device_id=args.device_id)
context.set_context(device_id=device_id)
device_num = 1
device_id = args.device_id
DATA_DIR = args.data_url
local_train_url = args.train_url
ckpt_dir = args.ckpt_url
......@@ -268,7 +265,6 @@ if __name__ == "__main__":
parallel_mode=ParallelMode.DATA_PARALLEL,
gradients_mean=True)
else:
device_id = 0
context.set_context(device_id=device_id)
DATA_DIR = args.data_url
local_train_url = args.train_url
......
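The hunks above move the DEVICE_ID / RANK_SIZE reads to one place and give them single-device defaults, so the standalone branch no longer needs `args.device_id` or a hard-coded id. A minimal, self-contained sketch of that pattern (illustrative only; the target and branch structure of the real file are simplified):

```python
import os
from mindspore import context

# Read the launch environment once, with single-device defaults.
device_id = int(os.getenv('DEVICE_ID', '0'))
device_num = int(os.getenv('RANK_SIZE', '1'))

context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
                    save_graphs=False, device_id=device_id)
```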
#!/bin/bash
# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
if [ $# != 2 ]; then
echo "Usage: bash run_distribute_train_ascend.sh [DATA_PATH] [CKPT_PATH]"
exit 1
fi
export RANK_SIZE=1
# Positional arguments (validated above): $1 = DATA_PATH, $2 = CKPT_PATH
python src/deep/train.py --data_url "$1" --train_url "$2" --device=Ascend > out.log 2>&1
\ No newline at end of file
......@@ -20,7 +20,7 @@ import mindspore.dataset.vision as C
import mindspore.dataset as ds
import mindspore.nn as nn
from mindspore import Tensor, context
from mindspore.communication.management import init, get_rank
from mindspore.communication.management import init
from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, LossMonitor, TimeMonitor
from mindspore.train.model import Model
from mindspore.context import ParallelMode
......@@ -62,12 +62,12 @@ if target not in ('GPU', "Ascend"):
context.set_context(mode=context.GRAPH_MODE, device_target=target, save_graphs=False)
device_num = int(os.getenv('RANK_SIZE'))
device_num = int(os.getenv('RANK_SIZE', '1'))
device_id = int(os.getenv('DEVICE_ID', '0'))
rank = int(os.getenv('RANK_ID', '0'))
if args.run_modelarts:
import moxing as mox
device_id = int(os.getenv('DEVICE_ID'))
device_num = int(os.getenv('RANK_SIZE'))
cfg.batch_size = cfg.batch_size*int(8/device_num)
context.set_context(device_id=device_id)
local_data_url = '/cache/data'
......@@ -80,8 +80,6 @@ if args.run_modelarts:
DATA_DIR = local_data_url + '/'
elif target == "Ascend":
if args.run_distribute:
device_id = int(os.getenv('DEVICE_ID'))
device_num = int(os.getenv('RANK_SIZE'))
cfg.batch_size = cfg.batch_size*int(8/device_num)
context.set_context(device_id=device_id)
init()
......@@ -89,22 +87,18 @@ elif target == "Ascend":
context.set_auto_parallel_context(device_num=device_num,\
parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True)
else:
context.set_context(device_id=args.device_id)
context.set_context(device_id=device_id)
device_num = 1
cfg.batch_size = cfg.batch_size*int(8/device_num)
device_id = args.device_id
DATA_DIR = args.data_url + '/'
elif target == "GPU":
if args.run_distribute:
init("nccl")
context.reset_auto_parallel_context()
rank = get_rank()
context.set_auto_parallel_context(device_num=device_num,
parallel_mode=ParallelMode.DATA_PARALLEL,
gradients_mean=True)
else:
rank = 0
device_id = int(os.getenv('DEVICE_ID'))
context.set_context(device_id=device_id)
DATA_DIR = args.data_url
......
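In the GPU branch above, `get_rank()` is no longer imported; the rank is instead read from the RANK_ID environment variable with a default of 0. A short sketch of the resulting pattern (assumed standalone snippet; dataset and model code omitted):

```python
import os
from mindspore.communication.management import init

# Rank and world size now come from the launch environment rather than
# from get_rank(), so a single-device run needs no init() call.
rank = int(os.getenv('RANK_ID', '0'))
device_num = int(os.getenv('RANK_SIZE', '1'))

if device_num > 1:
    init()  # initialize the communication backend for multi-device runs
```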
......@@ -13,6 +13,7 @@
# limitations under the License.
# ============================================================================
import sys
import torch
from mindspore import Tensor
......
......@@ -26,6 +26,7 @@ fi
export DEVICE_TARGET=$1
export CONFIG_PATH=$5
DEVICE_ID=$2
export DEVICE_ID=$2
get_real_path(){
if [ "${1:0:1}" == "/" ]; then
......
......@@ -35,6 +35,7 @@ fi
export DEVICE_TARGET=$1
DEVICE_ID=$2
export DEVICE_ID=$2
EPOCH_SIZE=$3
GRADIENT_ACCUMULATE_STEP=$4
DATA_PATH=$5
......
......@@ -251,7 +251,7 @@ The ckpt_file parameter is required,
|training parameter |epoch=100,batch_size=8 |
|optimizer |SGD optimizer,momentum=0.9,weight_decay=0.0001 |
|loss function |SoftmaxCrossEntropyLoss |
|training speed |epoch time: 493974.632 ms, per step time: 464.699 ms(1p for voc2012)|
|training speed |epoch time: 493974.632 ms, per step time: 464.699 ms(1p for voc2012), 485 ms(8p for voc2012), 998 ms(1p for ADE20K), 1050 ms(8p for ADE20K)|
|total time |6h10m34s(1pcs) |
|Script URL |https://gitee.com/mindspore/models/tree/master/research/cv/PSPNet|
|Random number seed |set_seed = 1234 |
......
......@@ -86,7 +86,7 @@ Spnas
### For training
```bash
python3 train.py --config_path=src/spnas.yaml
python3 train.py --config_file=src/spnas.yaml
```
> Or one can run the following script for all tasks.
......
......@@ -20,4 +20,4 @@ export RANK_SIZE=8
RANK_TABLE_FILE=$(realpath $1)
export RANK_TABLE_FILE
python3 ../train.py --config_path=../src/spnas_distributed.yml
\ No newline at end of file
python3 ../train.py --config_file=../src/spnas_distributed.yml
\ No newline at end of file
......@@ -19,9 +19,9 @@ import vega
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Spnas network")
parser.add_argument("--config_path", type=str, required=True, help="spnas config path.")
parser.add_argument("--config_file", type=str, required=True, help="spnas config path.")
args = parser.parse_args()
config_path = args.config_path
config_file = args.config_file
vega.set_backend('mindspore', 'NPU')
vega.run(config_path)
vega.run(config_file)
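For reference, a minimal sketch of the renamed option and an example invocation (illustrative only; the vega calls from the file above are omitted):

```python
import argparse

# The option is now --config_file (previously --config_path); its value is
# still the path to the spnas YAML configuration.
parser = argparse.ArgumentParser(description="Spnas network")
parser.add_argument("--config_file", type=str, required=True, help="spnas config path.")

# Example invocation, equivalent to: python3 train.py --config_file=src/spnas.yaml
args = parser.parse_args(["--config_file=src/spnas.yaml"])
print(args.config_file)  # -> src/spnas.yaml
```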
......@@ -105,8 +105,8 @@ python process_data.py
- running on Ascend:
```shell
bash scripts/run_standalone_train_ascend.sh 0
```
bash run_standalone_train_ascend.sh 0
```
The command above will run in the background; you can view the results through the log file.
......@@ -137,7 +137,7 @@ bash scripts/run_standalone_train_ascend.sh 0
- running on GPU:
```shell
bash scripts/run_standalone_train_gpu.sh 0
bash run_standalone_train_gpu.sh 0
```
The command above will run in the background; you can view the results through the log file.
......@@ -181,7 +181,7 @@ bash scripts/run_standalone_train_gpu.sh 0
- distributed running on Ascend:
```shell
bash ./scripts/run_distribute_train_ascend.sh rank_table.json
bash run_distribute_train_ascend.sh rank_table.json
```
The above shell script will run distributed training in the background. You can view the results through the file train_parallel[X]/logs. The loss values will be as follows:
......@@ -207,7 +207,7 @@ bash ./scripts/run_distribute_train_ascend.sh rank_table.json
- distributed running on GPU:
```shell
bash scripts/run_distribute_train_gpu.sh
bash run_distribute_train_gpu.sh
```
The above shell script will run distributed training in the background.
......@@ -250,7 +250,7 @@ scripts/log_train_regression:[2021-11-09 14:40:58.586][DEBUG] trainer.py(121)->t
Before running the command below, please check the checkpoint path used for evaluation and set it to an absolute full path.
```shell
bash scripts/run_standalone_eval_ascend.sh 0
bash run_standalone_eval_ascend.sh 0
```
The above command will run in the background. You can view the results through the file "eval.log". The accuracy on the test dataset will be as follows:
......@@ -267,7 +267,7 @@ svm_thresh: 0.6, map: 0.31060216644862054
Before running the command below, please check the checkpoint path used for evaluation and set it to an absolute full path.
```shell
bash scripts/run_standalone_eval_gpu.sh 0
bash run_standalone_eval_gpu.sh 0
```
The above command will run in the background. You can view the results through the file "eval.log". The accuracy on the test dataset will be as follows:
......
......@@ -95,7 +95,7 @@ For details of the overall R-CNN network architecture, see: [Link](https://arxiv.org/abs
```shell
python process_data.py
```
```
### [Training Process](#contents)
......@@ -104,7 +104,7 @@ python process_data.py
- Training on Ascend:
```shell
bash scripts/run_standalone_train_ascend.sh 0
bash run_standalone_train_ascend.sh 0
```
The above command will run in the background; you can view the results in the log file.
......@@ -135,7 +135,7 @@ bash scripts/run_standalone_train_ascend.sh 0
Run the following command to start training:
bash scripts/run_standalone_train_gpu.sh 0
bash run_standalone_train_gpu.sh 0
After training, checkpoint files are generated under the script folder by default; the results can be viewed in the log file, and the loss is as follows:
......@@ -174,7 +174,7 @@ bash scripts/run_standalone_train_gpu.sh 0
- Distributed training on Ascend:
```shell
bash ./scripts/run_distribute_train_ascend.sh rank_table.json
bash run_distribute_train_ascend.sh rank_table.json
```
The above script will run in the background. You can view the results through the file train_parallel[X]/logs. The loss is as follows:
......@@ -200,7 +200,7 @@ bash ./scripts/run_distribute_train_ascend.sh rank_table.json
- Distributed training on GPU:
```shell
bash scripts/run_distribute_train_gpu.sh
bash run_distribute_train_gpu.sh
```
The above script will run in the background.
......@@ -243,7 +243,7 @@ scripts/log_train_regression:[2021-11-09 14:40:58.586][DEBUG] trainer.py(121)->t
Before running the following command, please check the checkpoint path used for evaluation and set it to an absolute full path.
```shell
bash scripts/run_standalone_eval_ascend.sh 0
bash run_standalone_eval_ascend.sh 0
```
The above script will run in the background. You can view the results through "eval.log"; the accuracy on the test dataset is as follows:
......@@ -261,7 +261,7 @@ svm_thresh: 0.6, map: 0.31060216644862054
```shell
bash scripts/run_standalone_eval_gpu.sh 0
bash run_standalone_eval_gpu.sh 0
```
......
......@@ -24,11 +24,11 @@ export RANK_SIZE=8
echo "start training ... "
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 mpirun --allow-run-as-root -n 8 --output-filename log_output_finetune --merge-stderr-to-stdout \
python train.py --step 0 >log_train_finetune 2>&1 &
python ../train.py --step 0 >log_train_finetune 2>&1 &
wait
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 mpirun --allow-run-as-root -n 8 --output-filename log_output_svm --merge-stderr-to-stdout \
python train.py --step 1 >log_train_svm 2>&1 &
python ../train.py --step 1 >log_train_svm 2>&1 &
wait
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 mpirun --allow-run-as-root -n 8 --output-filename log_output_regression --merge-stderr-to-stdout \
python train.py --step 2 >log_train_regression 2>&1 &
python ../train.py --step 2 >log_train_regression 2>&1 &
cd ..
......@@ -20,5 +20,6 @@ exit 1
fi
export DEVICE_ID=$1
cd ../
python eval.py --device_id=${DEVICE_ID} 1>scripts/result.txt 2>scripts/eval_log.txt &
cd ..
......@@ -20,9 +20,9 @@ exit 1
fi
export DEVICE_ID=$1
python train.py --device_id=${DEVICE_ID} --step 0 >train_log_finetune 2>&1 &
python ../train.py --device_id=${DEVICE_ID} --step 0 >train_log_finetune 2>&1 &
wait
python train.py --device_id=${DEVICE_ID} --step 1 >train_log_svm 2>&1 &
python ../train.py --device_id=${DEVICE_ID} --step 1 >train_log_svm 2>&1 &
wait
python train.py --device_id=${DEVICE_ID} --step 2 >train_log_regression 2>&1 &
python ../train.py --device_id=${DEVICE_ID} --step 2 >train_log_regression 2>&1 &
cd ..
......@@ -27,12 +27,12 @@ from mindspore.context import ParallelMode
from mindspore import Tensor
from mindspore.nn.optim import Momentum
from mindspore.train.loss_scale_manager import FixedLossScaleManager
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor
from mindspore.train.model import Model
import mindspore.nn as nn
import mindspore.common.initializer as weight_init
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from src.callbacks import CustomLossMonitor, TimeMonitor, EvalCallback
from src.callbacks import CustomLossMonitor, EvalCallback
from src.wide_resnet import wideresnet
from src.dataset import create_dataset
from src.model_utils.config import config as cfg
......
......@@ -16,13 +16,18 @@
PWD_DIR=`pwd`
DATA=$1
scripts_path=$(dirname $0)
LOAD_CHECKPOINT_PATH=$2
BERT_DIR=$DATA/cased_L-24_H-1024_A-16
WN_CPT_EMBEDDING_PATH=$DATA/KB_embeddings/wn_concept2vec.txt
NELL_CPT_EMBEDDING_PATH=$DATA/KB_embeddings/nell_concept2vec.txt
python3 run_KTNET_squad_eval.py \
if [ ! -d log ]; then
mkdir log
fi
python3 $scripts_path/../run_KTNET_squad_eval.py \
--device_target "Ascend" \
--device_id 0 \
--batch_size 8 \
......
......@@ -27,10 +27,8 @@ echo "===============================================start evaling==============
task_name=$1
device_target=$2
device_id=$3
model_dir=$4
data_dir=$5
device_id=$6
model_dir=$3
data_dir=$4
mkdir -p ms_log
PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd)
......@@ -40,6 +38,6 @@ export GLOG_logtostderr=0
python ${PROJECT_DIR}/../eval.py \
--task_name=$task_name \
--device_target=$device_target \
--device_id=$device_id \
--device_id=$DEVICE_ID \
--model_dir=$model_dir \
--data_dir=$data_dir > eval_log.txt 2>&1 &