diff --git a/research/cv/E-NET/README_CN.md b/research/cv/E-NET/README_CN.md index 424f488579ae5e0c35f78171cb17dbed7bf1ce51..beaa0942ab7b8dc415bce0a7daeec6c66fe80c74 100644 --- a/research/cv/E-NET/README_CN.md +++ b/research/cv/E-NET/README_CN.md @@ -14,9 +14,9 @@ - [鏁版嵁闆哴(#鏁版嵁闆�) - [鑴氭湰璇存槑](#鑴氭湰璇存槑) - [璁粌涓庨獙璇乚(#璁粌涓庨獙璇�) - - [鍗曞崱璁粌](#鍗曞崱璁粌) - - [澶氬崱璁粌](#澶氬崱璁粌) - - [楠岃瘉鍗曚釜ckpt](#楠岃瘉鍗曚釜ckpt) + - [鍗曞崱璁粌](#鍗曞崱璁粌) + - [澶氬崱璁粌](#澶氬崱璁粌) + - [楠岃瘉鍗曚釜ckpt](#楠岃瘉鍗曚釜ckpt) - [妯″瀷鎻忚堪](#妯″瀷鎻忚堪) - [310鎺ㄧ悊](#310鎺ㄧ悊) @@ -45,7 +45,7 @@ E-NET涓昏鐢ㄤ簬鍥惧儚鍒嗗壊棰嗗煙锛屾槸涓€绉嶇鍒扮鐨勫垎鍓叉柟娉曘€傝涔� ## 鐜 -Ascend +Ascend锛孏PU ## 鏁版嵁闆� @@ -89,7 +89,9 @@ python src/build_mrdata.py \ 鈹� 鈹斺攢鈹€ utils.cc // utils瀹炵幇 鈹溾攢鈹€ scripts 鈹� 鈹溾攢鈹€ run_distribute_train.sh // 澶氬崱璁粌鑴氭湰 -鈹� 鈹斺攢鈹€ run_standalone_train.sh // 鍗曞崱璁粌鑴氭湰 +鈹� 鈹溾攢鈹€ run_standalone_train.sh // 鍗曞崱璁粌鑴氭湰 +鈹� 鈹溾攢鈹€ run_standalone_train_gpu.sh // 鍗曞崱璁粌鑴氭湰锛圙PU) +鈹� 鈹斺攢鈹€ run_distribute_train_gpu.sh // 鍗曞崱璁粌鑴氭湰锛圙PU) 鈹溾攢鈹€ src 鈹� 鈹溾攢鈹€ build_mrdata.py // 鐢熸垚mindrecord鏁版嵁闆� 鈹� 鈹溾攢鈹€ config.py // 閰嶇疆鍙傛暟鑴氭湰 @@ -112,12 +114,22 @@ python src/build_mrdata.py \ 濡傛灉浣犺浣跨敤鍗曞崱杩涜璁粌锛岃繘鍏ラ」鐩牴鐩綍锛岄敭鍏� +#### Ascend鍗曞崱 + ```bash nohup bash scripts/run_standalone_train.sh /home/name/cityscapes 0 & ``` 鍏朵腑/home/name/cityscapes鎸囨暟鎹泦鐨勪綅缃紝鍏跺悗鐨�0鎸囧畾device_id. +#### GPU鍗曞崱 + +```bash +nohup `bash scripts/run_standalone_train_gpu.sh 0 /home/name/cityscapes` & +``` + +鍏朵腑0鎸囧畾device_id锛屽叾鍚庣殑/home/name/cityscapes鎸囨暟鎹泦鐨勪綅缃� + 杩愯璇ヨ剼鏈細瀹屾垚瀵规ā鍨嬬殑璁粌鍜岃瘎浼颁袱涓樁娈点€� 鍏朵腑璁粌闃舵鍒嗕笁姝ワ紝鍓嶄袱姝ョ敤浜庤缁僂net妯″瀷鐨勭紪鐮佸櫒閮ㄥ垎锛岀涓夋浼氳缁冨畬鏁寸殑Enet缃戠粶銆� @@ -135,7 +147,9 @@ tail -f log_single_device/log_stage*.txt 渚嬪锛屼綘瑕佷娇鐢�4鍗¤繘琛岃缁冿紝杩涘叆椤圭洰鏍圭洰褰曪紝閿叆 -```py +#### Ascend澶氬崱 + +```bash nohup bash scripts/run_distribute_train.sh /home/name/cityscapes 4 0,1,2,3 /home/name/rank_table_4pcs.json & ``` @@ -149,21 +163,31 @@ tail -f log_multi_device/log0/log*.txt 鏄剧ず璁粌鐘舵€併€� -### 楠岃瘉鍗曚釜ckpt +#### GPU澶氬崱 + +```bash +nohup `bash scripts/run_distribute_train_gpu.sh 4 0,1,2,3 /home/name/cityscapes` & +``` +鍏朵腑4鎸噐ank_size, 鍐嶅悗鐨�0,1,2,3鍒跺畾浜嗚澶囩殑缂栧彿, /home/name/cityscapes鎸囨暟鎹泦鐨勪綅缃紝 鍦ㄩ」鐩牴鐩綍涓嬩細鐢熸垚log_distribute_device鏂囦欢澶癸紝./log_distribute_device/log_output*/1/rank.*/stdout鍗充负澶氬崱鏃ュ織鏂囦欢锛� 閿叆 -```py -python eval.py \ - --data_path /path/cityscapes \ - --run_distribute false \ - --encode false \ - --model_root_path /path/ENet/ENet.ckpt \ - --device_id 1 +```bash +tail -f log_distribute_device/log_output*/1/rank.*/stdout ``` -data_path涓烘暟鎹泦鏍圭洰褰曪紝model_root_path涓篶kpt鏂囦欢璺緞銆� +鏄剧ず璁粌鐘舵€併€� + +### 楠岃瘉鍗曚釜ckpt + +閿叆 + +```bash +bash scripts/run_eval_gpu.sh 0 /home/name/cityscapes /checkpoint/E-NET.ckpt + +``` +鍏朵腑0鍒跺畾浜嗚澶囩殑缂栧彿, /home/name/cityscapes鎸囨暟鎹泦鐨勪綅缃紝/checkpoint/E-NET.ckpt鎸嘽kpt鏂囦欢鐨勪綅缃敭鍏� 楠岃瘉瀹屾瘯鍚庯紝浼氬湪ckpt鏂囦欢鍚岀洰褰曚笅鍚庣紑metrics.txt鏂囦欢璁板綍缁撴灉銆� ```txt @@ -185,38 +209,38 @@ iou_class [0.96626199 0.75290523 0.87924483 0.43634233 0.44190292 0.50485979 ##### Cityscapes涓婅缁僂-Net -| 鍙傛暟 | Ascend | -| -------------------------- | ----------------------------------------------------------- | -| 妯″瀷鐗堟湰 | E-Net | | -| 璧勬簮 | Ascend 910锛汣PU 2.60GHz锛�192鏍革紱鍐呭瓨 755G锛涚郴缁� Euler2.8 | -| 涓婁紶鏃ユ湡 | 2021-10-09 | 2021-07-05 | -| MindSpore鐗堟湰 | 1.2.0 | -| 鏁版嵁闆� | Cityscapes | -| 璁粌鍙傛暟 | epoch=250, steps=496, batch_size = 6, lr=5e-4 | -| 浼樺寲鍣� | Adam | -| 鎹熷け鍑芥暟 | 甯︽潈閲嶇殑Softmax浜ゅ弶鐔� | -| 杈撳嚭 | 璇箟鍒嗗壊鍥� | -| 鎹熷け | 0.17356214 | -| 閫熷害 | 鍗曞崱锛�882姣/姝�; | -| 鎬绘椂闀� | 鍗曞崱锛�30h; | -| 鍙傛暟(M) | 0.34 | -| 寰皟妫€鏌ョ偣 | 4.40M (.ckpt鏂囦欢) | -| 鎺ㄧ悊妯″瀷 | 9.97M(.air鏂囦欢) | | +| 鍙傛暟 | Ascend | GPU| +| --------------------| ----------------------------------------------------- |--------| +| 妯″瀷鐗堟湰 | E-Net | E-Net | +| 璧勬簮 | Ascend 910锛汣PU 2.60GHz锛�192鏍革紱鍐呭瓨 755G锛涚郴缁� Euler2.8 | RTX3090锛汣PU 2.90GHz锛�64鏍革紱鍐呭瓨 252G锛涚郴缁� Ubuntu20.04| +| 涓婁紶鏃ユ湡 | 2021-10-09 | 2022-3-23 | +| MindSpore鐗堟湰 | 1.2.0 | 1.6.1 | +| 鏁版嵁闆� | Cityscapes | Cityscapes | +| 璁粌鍙傛暟 | epoch=250, steps=496, batch_size = 6, lr=5e-4 | epoch=250, steps=495, batch_size = 6, lr=5e-4 | +| 浼樺寲鍣� | Adam | Adam | +| 鎹熷け鍑芥暟 | 甯︽潈閲嶇殑Softmax浜ゅ弶鐔� | 甯︽潈閲嶇殑Softmax浜ゅ弶鐔� | +| 杈撳嚭 | 璇箟鍒嗗壊鍥� | 璇箟鍒嗗壊鍥� | +| 鎹熷け | 0.17356214 | 0.20114072 | +| 閫熷害 | 鍗曞崱锛�882姣/姝�; | 鍗曞崱锛�571姣/姝�; | +| 鎬绘椂闀� | 鍗曞崱锛�30h; | 鍗曞崱锛�25h; | +| 鍙傛暟(M) | 0.34 | 0.34 | +| 寰皟妫€鏌ョ偣 | 4.40M (.ckpt鏂囦欢) | 4.60M | +| 鎺ㄧ悊妯″瀷 | 9.97M(.air鏂囦欢) | 9.97M(.air鏂囦欢) | #### 璇勪及鎬ц兘 ##### Cityscapes涓婅瘎浼癊-Net -| 鍙傛暟 | Ascend | -| ------------------- | --------------------------- | -| 妯″瀷鐗堟湰 | E-Net | -| 璧勬簮 | Ascend 910锛涚郴缁� Euler2.8 | -| 涓婁紶鏃ユ湡 | 2021-10-09 | -| MindSpore 鐗堟湰 | 1.2.0 | -| 鏁版嵁闆� | Cityscapes, 500寮犲浘鍍� | -| batch_size | 6 | -| 杈撳嚭 | 璇箟鍒嗗壊鍥� | -| 鍑嗙‘鎬� | 鍗曞崱: 62.19%; | +| 鍙傛暟 | Ascend | GPU | +| ------------------- | --------------------------|---------------------------| +| 妯″瀷鐗堟湰 | E-Net | E-Net | +| 璧勬簮 | Ascend 910锛涚郴缁� Euler2.8 | RTX3090锛涚郴缁� Ubuntu20.04 | +| 涓婁紶鏃ユ湡 | 2021-10-09 | 2022-3-23 | +| MindSpore 鐗堟湰 | 1.2.0 | 1.6.1 | +| 鏁版嵁闆� | Cityscapes, 500寮犲浘鍍� | Cityscapes, 500寮犲浘鍍� | +| batch_size | 6 | 6 | +| 杈撳嚭 | 璇箟鍒嗗壊鍥� | 璇箟鍒嗗壊鍥� | +| 鍑嗙‘鎬� | 鍗曞崱: 62.19%; | 鍗曞崱: 62.22%; | ## 310鎺ㄧ悊 @@ -243,4 +267,4 @@ cp /path/to/cityscapes/leftImg8bit/val/munster/* /path/to/images/ 楠岃瘉闆嗙殑ground truth, 鍚岀悊涔熻褰掑埌/path/to/labels/涓�. 鍏朵綑鐨勫弬鏁�/path/to/enet.mindir鎸噈indir鏂囦欢鐨勮矾寰�, /path/to/result鎺ㄧ悊缁撴灉鐨勮緭鍑鸿矾寰�(涔熼渶瑕佹彁鍓嶇敓鎴愯鏂囦欢澶�), 0鎸囩殑鏄痙evice_id -鏈€缁堟帹鐞嗙粨鏋滀細杈撳嚭鍦�/res/result/鏂囦欢澶逛笅, 褰撳墠鐩綍涓嬩細鐢熸垚metric.txt, 鍏朵腑鍖呭惈绮惧害. \ No newline at end of file +鏈€缁堟帹鐞嗙粨鏋滀細杈撳嚭鍦�/res/result/鏂囦欢澶逛笅, 褰撳墠鐩綍涓嬩細鐢熸垚metric.txt, 鍏朵腑鍖呭惈绮惧害. diff --git a/research/cv/E-NET/eval.py b/research/cv/E-NET/eval.py index 20c19209b095276b07a15056bf310639f65d3eb4..a5965946f4db8a294400d9f5bce59e05e804d293 100644 --- a/research/cv/E-NET/eval.py +++ b/research/cv/E-NET/eval.py @@ -1,4 +1,4 @@ -# Copyright 2021 Huawei Technologies Co., Ltd +# Copyright 2022 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,12 +21,12 @@ from argparse import ArgumentParser import numpy as np from mindspore import context from mindspore.train.serialization import load_checkpoint, load_param_into_net - +from mindspore.common import set_seed from src.criterion import SoftmaxCrossEntropyLoss from src.dataset import getCityScapesDataLoader_GeneratorDataset from src.iou_eval import iouEval from src.model import Encoder_pred, Enet -from src.util import getBool, getCityLossWeight, seed_seed +from src.util import getBool, getCityLossWeight def IOU(network_trained, dataloader, num_class, enc): @@ -88,7 +88,8 @@ if __name__ == "__main__": parser.add_argument('--run_distribute', type=str) parser.add_argument('--encode', type=str) parser.add_argument('--model_root_path', type=str) - parser.add_argument('--device_id', type=int) + parser.add_argument('--device_id', type=int, default=0) + parser.add_argument('--device_target', type=str, default='Ascend') config = parser.parse_args() model_root_path_ = config.model_root_path @@ -96,10 +97,11 @@ if __name__ == "__main__": device_id = config.device_id CityScapesRoot = config.data_path run_distribute = getBool(config.run_distribute) + device_target = config.device_target - seed_seed() + set_seed(1) context.set_context(mode=context.GRAPH_MODE) - context.set_context(device_target="Ascend") + context.set_context(device_target=device_target) context.set_context(device_id=device_id) context.set_context(save_graphs=False) diff --git a/research/cv/E-NET/scripts/run_distribute_train_gpu.sh b/research/cv/E-NET/scripts/run_distribute_train_gpu.sh new file mode 100644 index 0000000000000000000000000000000000000000..31bb3c70fa0f6ae3bc6534693b8744f1d496bd10 --- /dev/null +++ b/research/cv/E-NET/scripts/run_distribute_train_gpu.sh @@ -0,0 +1,104 @@ +#! /bin/bash +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +if [ $# != 3 ] +then + echo "Usage: bash scripts/run_distribute_train_gpu.sh RANK_SIZE CUDA_VISIBLE_DEVICES /path/to/cityscapes" + echo "Example: bash scripts/run_distribute_train_gpu.sh 4 0,1,2,3 /home/name/cityscapes" + exit 1 +fi + +if [ ! -d $3 ] +then + echo "error: DATASET_PATH=$3 is not a directory" +exit 1 +fi + +# Get current script path +BASE_PATH=$(cd "`dirname $0`" || exit; pwd) + + + +echo "RANK_SIZE: $1" +echo "CUDA_VISIBLE_DEVICES: $2" +echo "cityscapes_path: $3" +export RANK_SIZE=$1 +export CUDA_VISIBLE_DEVICES=$2 +cityscapes_path=$3 + +mkdir ./log_distribute_device +cd ./log_distribute_device +mkdir ./checkpoint + +# 1.train + echo "start training stage1" + +mpirun -n $RANK_SIZE --output-filename log_output1 --merge-stderr-to-stdout --allow-run-as-root \ + python -u $BASE_PATH/../train.py \ + --lr 1e-3 \ + --repeat 2 \ + --run_distribute true \ + --save_path './checkpoint' \ + --mindrecord_train_data "$BASE_PATH/../data/train.mindrecord" \ + --stage 1 \ + --ckpt_path "" \ + --device_target GPU \ + > log_stage1.txt 2>&1 +wait +# 2.train + echo "start training stage2" + +mpirun -n $RANK_SIZE --output-filename log_output2 --merge-stderr-to-stdout --allow-run-as-root \ + python -u $BASE_PATH/../train.py \ + --lr 1e-3 \ + --repeat 2 \ + --run_distribute true \ + --save_path './checkpoint' \ + --mindrecord_train_data "$BASE_PATH/../data/train.mindrecord" \ + --stage 2 \ + --ckpt_path "./checkpoint/Encoder_stage1.ckpt" \ + --device_target GPU \ + > log_stage2.txt 2>&1 +wait + +# 3.train + echo "start training stage3" + +mpirun -n $RANK_SIZE --output-filename log_output3 --merge-stderr-to-stdout --allow-run-as-root \ + python -u $BASE_PATH/../train.py \ + --lr 1e-3 \ + --repeat 2 \ + --run_distribute true \ + --save_path './checkpoint' \ + --mindrecord_train_data "$BASE_PATH/../data/train.mindrecord" \ + --stage 3 \ + --ckpt_path "./checkpoint/Encoder_stage2.ckpt" \ + --device_target GPU \ + > log_stage3.txt 2>&1 +wait + +# 4.eval + echo "start evaling" + +python -u $BASE_PATH/../eval.py \ + --data_path ${cityscapes_path} \ + --run_distribute false \ + --encode false \ + --model_root_path './checkpoint/ENet_stage3.ckpt' \ + --device_id 1 \ + --device_target GPU \ + > log_eval.txt 2>&1 & + diff --git a/research/cv/E-NET/scripts/run_eval_gpu.sh b/research/cv/E-NET/scripts/run_eval_gpu.sh new file mode 100644 index 0000000000000000000000000000000000000000..b6ee05604f0bd0e5615ffa911995679eae3a3d07 --- /dev/null +++ b/research/cv/E-NET/scripts/run_eval_gpu.sh @@ -0,0 +1,53 @@ +#! /bin/bash +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +if [ $# != 3 ] +then + echo "Usage: bash scripts/run_eval_gpu.sh DEVICE_ID /path/to/cityscapes /path/checkpoint/ENet.ckpt " + echo "Example: bash scripts/run_eval_gpu.sh 4 /home/name/cityscapes /path/checkpoint/ENet.ckpt " + exit 1 +fi + +if [ ! -d $2 ] +then + echo "error: DATASET_PATH=$2 is not a directory" +exit 1 +fi + +# Get current script path +BASE_PATH=$(cd "`dirname $0`" || exit; pwd) + +mkdir ./log_eval +cd ./log_eval + +echo "DEVICE_ID: $1" +echo "cityscapes_path: $2" +echo "ckpt_path: $3" + +export DEVICE_ID=$1 +export RANK_SIZE=1 +cityscapes_path=$2 +ckpt_path=$3 + +python -u $BASE_PATH/../eval.py \ + --data_path ${cityscapes_path} \ + --run_distribute false \ + --encode false \ + --model_root_path ${ckpt_path} \ + --device_id 1 \ + --device_target GPU \ + > log_eval.txt 2>&1 & + diff --git a/research/cv/E-NET/scripts/run_standalone_train.sh b/research/cv/E-NET/scripts/run_standalone_train.sh index 1e529c7fe5bf680a9d86fee731e09270a4dea4dd..63d2627136fccb935b81931ed3325ccda0e1efee 100644 --- a/research/cv/E-NET/scripts/run_standalone_train.sh +++ b/research/cv/E-NET/scripts/run_standalone_train.sh @@ -76,4 +76,5 @@ python -u ../eval.py \ --encode false \ --model_root_path './' \ --device_id ${DEVICE_ID} \ - > log_eval.txt 2>&1 & \ No newline at end of file + > log_eval.txt 2>&1 & + diff --git a/research/cv/E-NET/scripts/run_standalone_train_gpu.sh b/research/cv/E-NET/scripts/run_standalone_train_gpu.sh new file mode 100644 index 0000000000000000000000000000000000000000..6774ad12dcd78d736f24d16ae37b47ea8ebbfafa --- /dev/null +++ b/research/cv/E-NET/scripts/run_standalone_train_gpu.sh @@ -0,0 +1,85 @@ +#! /bin/bash +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +if [ $# != 2 ] +then + echo "Usage: bash scripts/run_distribute_train_gpu.sh DEVICE_ID /path/to/cityscapes" + echo "Example: bash scripts/run_distribute_train_gpu.sh 4 /home/name/cityscapes" + exit 1 +fi + +if [ ! -d $2 ] +then + echo "error: DATASET_PATH=$2 is not a directory" +exit 1 +fi + +# Get current script path +BASE_PATH=$(cd "`dirname $0`" || exit; pwd) + + +mkdir ./log_single_device +cd ./log_single_device +mkdir ./checkpoint + +echo "DEVICE_ID: $1" +echo "cityscapes_path: $2" +export DEVICE_ID=$1 +export RANK_SIZE=1 +cityscapes_path=$2 + +python -u $BASE_PATH/../train.py \ + --lr 5e-4 \ + --repeat 1 \ + --run_distribute false \ + --save_path './checkpoint' \ + --mindrecord_train_data "../data/train.mindrecord" \ + --stage 1 \ + --ckpt_path "" \ + --device_target GPU \ + > log_stage1.txt 2>&1 + +python -u $BASE_PATH/../train.py \ + --lr 5e-4 \ + --repeat 1 \ + --run_distribute false \ + --save_path './checkpoint' \ + --mindrecord_train_data "../data/train.mindrecord" \ + --stage 2 \ + --ckpt_path "./checkpoint/Encoder_stage1.ckpt" \ + --device_target GPU \ + > log_stage2.txt 2>&1 + +python -u $BASE_PATH/../train.py \ + --lr 5e-4 \ + --repeat 1 \ + --run_distribute false \ + --save_path './' \ + --mindrecord_train_data "../data/train.mindrecord" \ + --stage 3 \ + --ckpt_path "./checkpoint/Encoder_stage2.ckpt" \ + --device_target GPU \ + > log_stage3.txt 2>&1 + +python -u $BASE_PATH/../eval.py \ + --data_path ${cityscapes_path} \ + --run_distribute false \ + --encode false \ + --model_root_path './checkpoint/ENet_stage3.ckpt' \ + --device_id 1 \ + --device_target GPU \ + > log_eval.txt 2>&1 & + diff --git a/research/cv/E-NET/src/config.py b/research/cv/E-NET/src/config.py index d80c7fb6b1d6cc8c7425e81218b724cbe739000f..0dbdb4e261dc1fd1b2a099e3cc19bfe108b4f5c4 100644 --- a/research/cv/E-NET/src/config.py +++ b/research/cv/E-NET/src/config.py @@ -1,4 +1,4 @@ -# Copyright 2021 Huawei Technologies Co., Ltd +# Copyright 2022 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -28,7 +28,7 @@ parser.add_argument('--mindrecord_train_data', type=str) parser.add_argument('--stage', type=int) parser.add_argument('--ckpt_path', type=str) parser.add_argument('--num_class', type=int, default=20) - +parser.add_argument('--device_target', type=str, default='Ascend') config = parser.parse_args() max_lr = config.lr @@ -38,10 +38,12 @@ repeat = config.repeat stage = config.stage ckpt_path = config.ckpt_path save_path = config.save_path +device_target = config.device_target context.set_context(mode=context.GRAPH_MODE) -context.set_context(device_target="Ascend") -context.set_context(device_id=int(os.environ["DEVICE_ID"])) +context.set_context(device_target=device_target) +if not run_distribute: + context.set_context(device_id=int(os.environ["DEVICE_ID"])) context.set_context(save_graphs=False) seed_seed(2) # init random seed @@ -56,7 +58,7 @@ class TrainConfig_1: self.subset = "train" self.num_class = 20 self.train_img_size = 512 - self.epoch_num_save = 10 + self.epoch_num_save = 20 self.epoch = 65 self.encode = True self.attach_decoder = False @@ -69,7 +71,7 @@ class TrainConfig_2: self.subset = "train" self.num_class = 20 self.train_img_size = 512 - self.epoch_num_save = 10 + self.epoch_num_save = 20 self.epoch = 85 self.encode = True self.attach_decoder = False @@ -82,7 +84,7 @@ class TrainConfig_3: self.subset = "train" self.num_class = 20 self.train_img_size = 512 - self.epoch_num_save = 10 + self.epoch_num_save = 20 self.epoch = 100 self.encode = False self.attach_decoder = True diff --git a/research/cv/E-NET/src/dataset.py b/research/cv/E-NET/src/dataset.py index c85179216d5205350d9620f32657cb1fa1fd4d1c..83dec45d73cc5eb93a97d8b075ee5427591b08a9 100644 --- a/research/cv/E-NET/src/dataset.py +++ b/research/cv/E-NET/src/dataset.py @@ -1,4 +1,4 @@ -# Copyright 2021 Huawei Technologies Co., Ltd +# Copyright 2022 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -22,7 +22,6 @@ import mindspore.dataset as ds EXTENSIONS = ['.jpg', '.png'] - class MyGaussianBlur(ImageFilter.Filter): """GaussianBlur""" def __init__(self, radius=2, bounds=None): @@ -167,9 +166,9 @@ class cityscapes: filename = self.filenames[index] filenameGt = self.filenamesGt[index] - with open(image_path_city(self.images_root, filename), 'rb') as f: + with open(filename, 'rb') as f: image = load_image(f).convert('RGB') - with open(image_path_city(self.labels_root, filenameGt), 'rb') as f: + with open(filenameGt, 'rb') as f: label = load_image(f).convert('P') image, label = self.transform(image, label) return image, label @@ -210,7 +209,7 @@ def getCityScapesDataLoader_GeneratorDataset(CityScapesRoot, subset, batch_size, """CityScapesGeneratorDataset""" dataset = cityscapes(CityScapesRoot, subset, enc, aug, height) dataloader = ds.GeneratorDataset(dataset, column_names=["images", "labels"], - num_parallel_workers=8, shuffle=shuffle, shard_id=rank_id, + num_parallel_workers=6, shuffle=shuffle, shard_id=rank_id, num_shards=global_size, python_multiprocessing=True) if shuffle: dataloader = dataloader.shuffle(batch_size*10) @@ -224,16 +223,16 @@ def getCityScapesDataLoader_mindrecordDataset(stage, data_path, batch_size, enc, shuffle, aug, rank_id=0, global_size=1, repeat=1): """CityScapesmindrecordDataset""" dataloader = ds.MindDataset(data_path, columns_list=["data", "label"], - num_parallel_workers=8, shuffle=shuffle, shard_id=rank_id, num_shards=global_size) + num_parallel_workers=6, shuffle=shuffle, shard_id=rank_id, num_shards=global_size) transform = MyCoTransform(stage, enc, aug, height, if_from_mindrecord=True) dataloader = dataloader.map(operations=transform, input_columns=["data", "label"], output_columns=["data", "label"], - num_parallel_workers=8, python_multiprocessing=True) + num_parallel_workers=6, python_multiprocessing=True) if shuffle: dataloader = dataloader.shuffle(batch_size*10) - dataloader = dataloader.batch(batch_size, drop_remainder=False) + dataloader = dataloader.batch(batch_size, drop_remainder=True) if repeat > 1: dataloader = dataloader.repeat(repeat) return dataloader diff --git a/research/cv/E-NET/train.py b/research/cv/E-NET/train.py index bf97007d3cbe4e6ad1f873c15059bb58af1c6a36..b8e57b119c9160a61d9fb08c671c365f9a43aebd 100644 --- a/research/cv/E-NET/train.py +++ b/research/cv/E-NET/train.py @@ -1,4 +1,4 @@ -# Copyright 2021 Huawei Technologies Co., Ltd +# Copyright 2022 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,14 +19,15 @@ from mindspore.communication.management import get_group_size, get_rank, init from mindspore.context import ParallelMode from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, TimeMonitor, LossMonitor from mindspore.train.loss_scale_manager import DynamicLossScaleManager -from mindspore.train.serialization import _update_param, load_checkpoint -from src.config import (TrainConfig_1, TrainConfig_2, TrainConfig_3, - ckpt_path, ms_train_data, num_class, repeat, run_distribute, save_path, stage, weight_init) +from mindspore.train.serialization import _update_param, load_checkpoint, save_checkpoint +from mindspore.common import set_seed +from src.config import (TrainConfig_1, TrainConfig_2, TrainConfig_3, ckpt_path, ms_train_data, num_class, repeat, run_distribute, save_path, stage, weight_init) from src.criterion import SoftmaxCrossEntropyLoss from src.dataset import getCityScapesDataLoader_mindrecordDataset from src.model import Encoder_pred, Enet from src.util import getCityLossWeight +set_seed(1) def attach(enet, encoder_pretrain): """move the params in encoder to enet""" @@ -35,7 +36,7 @@ def attach(enet, encoder_pretrain): enet_par = enet.parameters_dict() for name, param_old in encoder_trained_par.items(): if name.startswith("encoder"): - _update_param(enet_par[name], param_old) + _update_param(enet_par[name], param_old, True) def train(ckpt_path_, trainConfig_, rank_id, rank_size, stage_): """train enet""" @@ -59,6 +60,7 @@ def train(ckpt_path_, trainConfig_, rank_id, rank_size, stage_): network_enet = Enet(num_class, weight_init) attach(network_enet, network) network = network_enet + dataloader = getCityScapesDataLoader_mindrecordDataset(stage_, ms_train_data, 6, \ trainConfig_.encode, trainConfig_.train_img_size, shuffle=True, aug=True, \ rank_id=rank_id, global_size=rank_size, repeat=repeat) @@ -84,15 +86,15 @@ def train(ckpt_path_, trainConfig_, rank_id, rank_size, stage_): print("============== Starting {} Training ==============".format(save_prefix)) wrapper.train(trainConfig_.epoch, dataloader, callbacks=call_backs, dataset_sink_mode=True) + save_checkpoint(network, os.path.join(save_path, f"{save_prefix}_stage{stage_}.ckpt")) + return network if __name__ == "__main__": rank_id_ = 0 rank_size_ = 1 if run_distribute: - context.set_auto_parallel_context(parameter_broadcast=True) - context.set_auto_parallel_context(parallel_mode=\ - ParallelMode.DATA_PARALLEL, gradients_mean=False) + context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=False) init() rank_id_ = get_rank() rank_size_ = get_group_size()