diff --git a/research/cv/ras/README.md b/research/cv/ras/README.md
index 1cadd9a4cd5b7cabbee04c39a14242f0ab9d54eb..c2a18eb1bb50d10068be5ad2e3959ad082c97e2f 100644
--- a/research/cv/ras/README.md
+++ b/research/cv/ras/README.md
@@ -46,7 +46,7 @@ The overall RAS network architecture is as follows:
 
 - DUTS-Train
     - image: 10553 images
-    - ground truth: 10533 images
+    - ground truth: 10553 images
 - Note: the dataset is processed in src/dataset_train.py
 
 Test set
@@ -78,8 +78,8 @@ The overall RAS network architecture is as follows:
 
 # Environment Requirements
 
-- Hardware: Ascend processor
-    - Set up the hardware environment with an Ascend processor.
+- Hardware: Ascend processor or GPU
+    - Set up the hardware environment with an Ascend processor or a GPU.
 
 - Framework
     - [MindSpore](https://www.mindspore.cn/install)
@@ -101,6 +101,9 @@ The overall RAS network architecture is as follows:
 │   ├──run_distribute_train.sh        # shell script for 8-device training on Ascend
 │   ├──run_train.sh                   # shell script for single-device training on Ascend
 │   ├──run_eval.sh                    # shell script for single-device evaluation on Ascend
+│   ├──run_distribute_train_gpu.sh    # shell script for multi-device training on GPU
+│   ├──run_train_gpu.sh               # shell script for single-device training on GPU
+│   ├──run_eval_gpu.sh                # shell script for single-device evaluation on GPU
 ├──src
 │   ├──dataset_train.py               # builds the training dataset
 │   ├──dataset_test.py                # builds the inference dataset
@@ -136,6 +139,7 @@ The overall RAS network architecture is as follows:
 ### Usage
 
 - **Note: when building the training data path, create two folders in its last-level directory: images (training images) and labels (ground truth). In ModelArts mode no directories are needed; just place images.zip and labels.zip there.**
+- Running on Ascend
 
 ``` python
 - Run directly with python3 in a terminal:
@@ -155,6 +159,24 @@ The overall RAS network architecture is as follows:
     rank_size is the number of devices used for multi-device training
 ```
 
+- Running on GPU
+
+    Download path of the resnet50 pretrained model: [https://download.mindspore.cn/model_zoo/r1.3/resnet50_gpu_v130_imagenet_official_cv_bs32_acc0/resnet50_gpu_v130_imagenet_official_cv_bs32_acc0.ckpt](https://download.mindspore.cn/model_zoo/r1.3/resnet50_gpu_v130_imagenet_official_cv_bs32_acc0/resnet50_gpu_v130_imagenet_official_cv_bs32_acc0.ckpt)
+
+``` python
+  - Run directly with python3 in a terminal:
+    e.g.: python3 -u train.py --is_modelarts NO --distribution_flag NO --device_target GPU --device_id 5 --lr 0.00005 --data_url '' --pretrained_model '' --train_url '' > output.log 2>&1 &
+    device_id is the ID of the device in the hardware environment
+    lr is the learning rate
+    data_url is the training data path
+    pretrained_model is the path of the resnet50 pretrained model
+    train_url is the save path for output checkpoints
+  - Run with bash
+    bash script/run_train_gpu.sh device_id lr data_url pretrained_model train_url            // single-device training
+    bash script/run_distribute_train_gpu.sh rank_size data_url pretrained_model train_url    // multi-device distributed training
+    rank_size is the number of devices used for multi-device training
+```
+
 ### Results
 
 Training results are saved in the example path. Checkpoints are saved in `--train_url` by default, and the training log is redirected to `output/output.log`, for example:
@@ -193,6 +215,7 @@ The Consumption of per step is 0.136 s
 ### Usage
 
 - **Note: in the last-level directory of the inference data path, create folders images and gts and put the images and ground truth there, respectively. In ModelArts mode there is no need to create images; just store images.zip and gts.zip.**
+- Running on Ascend
 
 ``` python
 # Inference example
@@ -205,6 +228,19 @@ The Consumption of per step is 0.136 s
     bash script/run_eval.sh device_id data_url train_url model_path pre_model
 ```
 
+- Running on GPU
+
+``` python
+# Inference example
+  python3 -u eval.py --is_modelarts NO --device_target GPU --device_id 0 --data_url xxx --model_path xxx --pre_model xxx
+  device_id is the ID of the device used for inference
+  data_url is the inference data path
+  model_path is the path of the checkpoint saved during training
+  pre_model is the path of the resnet50 pretrained model
+  Run with bash
+  bash script/run_eval_gpu.sh device_id data_url train_url model_path pre_model
+```
+
 ### Results
 
 Inference results are saved in the example path. The results below can be found in `--train_url`, and the log in `output/eval_output.log`:
@@ -222,16 +258,16 @@ The Consumption of per step is 0.136 s
 
 ## Evaluation Accuracy
 
-| Parameter | RAS |
-| -------------------------- | ----------------------------- |
-| Model version | V1 |
-| Resources | Ascend 910; CPU 2.60GHz; EulerOS 2.8 |
-| Upload date | 2021-11-30 |
-| MindSpore version | 1.5 |
-| Dataset | ECSSD DUTS-Test DUT-OMRON HKU-IS |
-| batch_size | 1 |
-| Output | decimals with 3 significant figures |
-| F-measure | 0.921 0.820 0.749 0.907 |
+| Parameter | Ascend 910 | GPU |
+| -------------------------- | ----------------------------- | ------ |
+| Model version | RAS | RAS |
+| Resources | Ascend 910; CPU 2.60GHz; EulerOS 2.8 | RTX 3090 |
+| Upload date | 2021-11-30 | 2021-12-23 |
+| MindSpore version | 1.5 | 1.5 |
+| Dataset | ECSSD DUTS-Test DUT-OMRON HKU-IS | ECSSD DUTS-Test DUT-OMRON HKU-IS |
+| batch_size | 1 | 1 |
+| Output | decimals with 3 significant figures | decimals with 3 significant figures |
+| F-measure | 0.921 0.820 0.749 0.907 | 0.920 0.819 0.751 0.906 |
 
 ## Description of Randomness
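The F-measure column above is produced by eval.py from the predicted saliency maps and the ground-truth masks. For readers unfamiliar with the metric, a minimal sketch follows; the adaptive threshold and beta2 = 0.3 used here are common choices in salient object detection and are assumptions, not a transcription of eval.py:

```python
import numpy as np

def f_measure(pred, gt, beta2=0.3):
    """F-beta score of one saliency map against a binary ground-truth mask.

    pred: float array scaled to [0, 1]; gt: array of the same shape.
    The 2 * mean adaptive threshold and beta2 = 0.3 are assumptions --
    eval.py may binarize and average differently.
    """
    gt = gt > 0.5
    threshold = min(2.0 * float(pred.mean()), 1.0)   # adaptive threshold
    binary = pred >= threshold
    tp = np.logical_and(binary, gt).sum()            # true positives
    precision = tp / (binary.sum() + 1e-8)
    recall = tp / (gt.sum() + 1e-8)
    return (1 + beta2) * precision * recall / (beta2 * precision + recall + 1e-8)
```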
diff --git a/research/cv/ras/eval.py b/research/cv/ras/eval.py
index ea8405e4646c85b0392e7776b3fa72666a92b1a3..ff83ca1b96ebb10e601fa322da7b30efa8e43078 100644
--- a/research/cv/ras/eval.py
+++ b/research/cv/ras/eval.py
@@ -1,5 +1,5 @@
 """
-# Copyright 2021 Huawei Technologies Co., Ltd
+# Copyright 2022 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -174,7 +174,7 @@ if __name__ == "__main__":
 
     #calculate F-measure
     gtfiles = sorted([gtname + gt_file for gt_file in os.listdir(gtname)])
-    predictfiles = sorted([predictpath + predictfile for predictfile in os.listdir(predictpath)])
+    predictfiles = sorted([os.path.join(predictpath, predictfile) for predictfile in os.listdir(predictpath)])
 
     Fs = []
     for i in range(len(gtfiles)):
diff --git a/research/cv/ras/requirements.txt b/research/cv/ras/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..608b8822807ea1f7295c89483b82833d0eeb1b87
--- /dev/null
+++ b/research/cv/ras/requirements.txt
@@ -0,0 +1,3 @@
+numpy
+Pillow
+argparse
\ No newline at end of file
diff --git a/research/cv/ras/script/run_distribute_train_gpu.sh b/research/cv/ras/script/run_distribute_train_gpu.sh
new file mode 100644
index 0000000000000000000000000000000000000000..03cb1568e1395006c3beb4af67a7ce722aeadc59
--- /dev/null
+++ b/research/cv/ras/script/run_distribute_train_gpu.sh
@@ -0,0 +1,53 @@
+#!/usr/bin/env bash
+
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+
+
+echo "===================================================================================================="
+echo "Please run the script as:"
+echo "bash script/run_distribute_train_gpu.sh [RANK_SIZE] [data_url] [pre_model] [train_url]"
+echo "for example: bash script/run_distribute_train_gpu.sh 8 /home/data/ /home/resnet50.ckpt /home/data/models/"
+echo " *****
+      RANK_SIZE: the number of devices used for distributed training;
+      data_url: the directory where the dataset is located; it must contain two folders, images and labels;
+      pre_model: path of the pretrained model;
+      train_url: the save path of checkpoint files."
+echo "===================================================================================================="
+
+set -e
+RANK_SIZE=$1
+data_url=$2
+pre_model=$3
+train_url=$4
+
+export RANK_SIZE=${RANK_SIZE}
+export DEVICE_NUM=${RANK_SIZE}
+
+rm -rf ./train_parallel
+mkdir ./train_parallel
+cp ./*.py ./train_parallel
+cp -r ./src ./train_parallel
+cp -r ./script ./train_parallel
+cd ./train_parallel || exit
+
+mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
+nohup python3 -u train.py \
+    --is_modelarts NO --distribution_flag YES --lr 0.0002 --decay_epoch 70 --epoch 80 \
+    --data_url ${data_url} --pretrained_model ${pre_model} \
+    --train_url ${train_url} --device_target GPU > output.log 2>&1 &
+cd ..
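run_distribute_train_gpu.sh starts one train.py process per device through mpirun, so the MindSpore side only has to join the communication group that mpirun created. The real logic lives in the train.py hunk further below; the fragment here is just a distilled sketch of that data-parallel setup (the function name and structure are illustrative):

```python
import mindspore as ms
from mindspore.communication import init, get_rank, get_group_size
from mindspore.context import ParallelMode

def setup_data_parallel_gpu():
    """Join the NCCL group created by mpirun and enable data parallelism."""
    init()                                        # NCCL backend on GPU
    ms.context.set_context(mode=ms.context.GRAPH_MODE, device_target="GPU")
    ms.context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                         gradients_mean=True,
                                         device_num=get_group_size())
    return get_rank()                             # used to pick a per-rank save directory
```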
+echo "====================================================================================================" + +set -e +RANK_SIZE=$1 +data_url=$2 +pre_model=$3 +train_url=$4 + +export RANK_SIZE=${RANK_SIZE} +export DEVICE_NUM=${RANK_SIZE} + +rm -rf ./train_parallel +mkdir ./train_parallel +cp ./*.py ./train_parallel +cp -r ./src ./train_parallel +cp -r ./script ./train_parallel +cd ./train_parallel || exit + +mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ +nohup python3 -u train.py \ + --is_modelarts NO --distribution_flag YES --lr 0.0002 --decay_epoch 70 --epoch 80 \ + --data_url ${data_url} --pretrained_model ${pre_model} \ + --train_url ${train_url} --device_target GPU > output.log 2>&1 & +cd .. + diff --git a/research/cv/ras/script/run_eval.sh b/research/cv/ras/script/run_eval.sh index 6033f394127601a9619e69df0004fa8abcc88825..ec84b6cbd50c156ec8668dbeb368fb77a6ac5a4e 100644 --- a/research/cv/ras/script/run_eval.sh +++ b/research/cv/ras/script/run_eval.sh @@ -17,7 +17,7 @@ echo "====================================================================================================" echo "Please run the script as:" -echo "bash script/eval.sh [device_id] [data_url] [model_path] [pre_model]" +echo "bash script/eval.sh [device_id] [data_url] [train_url] [model_path] [pre_model]" echo "for example: bash script/eval.sh 5 /home/data/Test/ /home/data/results/ /home/data/models/RAS800.ckpt /home/data/resnet50.ckpt" echo " ***** device_id: The device id for evaluation; diff --git a/research/cv/ras/script/run_eval_gpu.sh b/research/cv/ras/script/run_eval_gpu.sh new file mode 100644 index 0000000000000000000000000000000000000000..6eca20dedb3d2c8431afda5a13c46a16dc85bb7e --- /dev/null +++ b/research/cv/ras/script/run_eval_gpu.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +echo "====================================================================================================" +echo "Please run the script as:" +echo "bash script/run_eval_gpu.sh [device_id] [data_url] [train_url] [model_path] [pre_model]" +echo "for example: bash script/eval_gpu.sh 5 /home/data/Test/ /home/data/results/ /home/data/models/RAS800.ckpt /home/data/resnet50.ckpt" +echo " ***** + device_id: The device id for evaluation; + data_url: The data_url directory is the directory where the dataset is located,and there must be two folders, images and gts, under data_url; + train_url: This is a save path of evaluation results; + model_path: the save path of checkpoint file produced by the RAS during training process; + pre_model: path of pretrained model. 
diff --git a/research/cv/ras/script/run_train_gpu.sh b/research/cv/ras/script/run_train_gpu.sh
new file mode 100644
index 0000000000000000000000000000000000000000..aa80cbcda1f5aa645888166f123526c90e33dd23
--- /dev/null
+++ b/research/cv/ras/script/run_train_gpu.sh
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+
+echo "===================================================================================================="
+echo "Please run the script as:"
+echo "bash script/run_train_gpu.sh [device_id] [lr] [data_url] [pre_model] [train_url]"
+echo "for example: bash script/run_train_gpu.sh 5 0.00005 /home/data/ /home/resnet50.ckpt /home/data/models/"
+echo " *****
+      device_id: the ID of the device used for training;
+      lr: the learning rate;
+      data_url: the directory where the dataset is located; it must contain two folders, images and labels;
+      pre_model: path of the pretrained model;
+      train_url: the save path of checkpoint files."
+echo "===================================================================================================="
+
+
+set -e
+rm -rf output
+mkdir output
+
+device_id=$1
+lr=$2
+data_url=$3
+pre_model=$4
+train_url=$5
+
+python3 -u train.py --is_modelarts NO --distribution_flag NO --device_id ${device_id} --lr ${lr} --data_url ${data_url} --pretrained_model ${pre_model} --train_url ${train_url} --device_target GPU > output/output.log 2>&1 &
diff --git a/research/cv/ras/src/loss.py b/research/cv/ras/src/loss.py
index d824d22f8ba58b6adfb340b3901f71222f18c9dc..7f8880173561662bd0c85e44bee0c1df4778c07e 100644
--- a/research/cv/ras/src/loss.py
+++ b/research/cv/ras/src/loss.py
@@ -1,5 +1,5 @@
 """
-# Copyright 2021 Huawei Technologies Co., Ltd
+# Copyright 2022 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -51,6 +51,45 @@ class LossFn(nn.Cell):
         result = (loss + iou).mean()
         return result
 
+class BceIouLoss(nn.Cell):
+    """
+    BCE + soft-IoU loss averaged over the five side outputs of RAS
+    """
+    def __init__(self, batchsize):
+        super(BceIouLoss, self).__init__()
+        self.bce = nn.BCEWithLogitsLoss()
+        self.mean = ops.ReduceMean(keep_dims=False)
+        self.batchsize = batchsize
+        self.sigmoid = nn.Sigmoid()
+        self.sum = ops.ReduceSum()
+
+    def iou(self, predict, target):
+        pred = self.sigmoid(predict)
+        inter = self.sum(pred * target, (2, 3))
+        union = self.sum(pred + target, (2, 3))
+        iou = 1 - (inter + 1) / (union - inter + 1)
+        return iou
+
+    def construct(self, predict, target):
+        iou1 = self.iou(predict[0], target)
+        bce1 = self.bce(predict[0], target)
+        loss1 = self.mean(iou1 + bce1)
+        iou2 = self.iou(predict[1], target)
+        bce2 = self.bce(predict[1], target)
+        loss2 = self.mean(iou2 + bce2)
+        iou3 = self.iou(predict[2], target)
+        bce3 = self.bce(predict[2], target)
+        loss3 = self.mean(iou3 + bce3)
+        iou4 = self.iou(predict[3], target)
+        bce4 = self.bce(predict[3], target)
+        loss4 = self.mean(iou4 + bce4)
+        iou5 = self.iou(predict[4], target)
+        bce5 = self.bce(predict[4], target)
+        loss5 = self.mean(iou5 + bce5)
+        loss_fuse = loss1 + loss2 + loss3 + loss4 + loss5
+        loss = loss_fuse / self.batchsize
+        return loss
+
 
 class BuildTrainNetwork(nn.Cell):
     """
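The new BceIouLoss expects predict to hold the five side-output logit maps of RAS (predict[0] .. predict[4]) and target to be the matching ground-truth mask. A quick, self-contained shape check is sketched below; the 352x352 resolution and the all-ones tensors are placeholders, not values taken from the repository:

```python
import mindspore as ms
import mindspore.ops as ops
from src.loss import BceIouLoss

ms.context.set_context(mode=ms.context.PYNATIVE_MODE)

batch = 4
loss_fn = BceIouLoss(batchsize=batch)
ones = ops.Ones()
logits = tuple(ones((batch, 1, 352, 352), ms.float32) for _ in range(5))  # five side outputs
target = ones((batch, 1, 352, 352), ms.float32)                           # binary GT mask
print(loss_fn(logits, target))   # a scalar Tensor
```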
diff --git a/research/cv/ras/train.py b/research/cv/ras/train.py
index 821e2ed539352e4e846a38fd76c33751b24f7053..6570cca74c0f217aa6583c859e770d51b618563c 100644
--- a/research/cv/ras/train.py
+++ b/research/cv/ras/train.py
@@ -1,5 +1,5 @@
 """
-# Copyright 2021 Huawei Technologies Co., Ltd
+# Copyright 2022 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -29,12 +29,15 @@ import mindspore.nn as nn
 from mindspore import save_checkpoint
 from mindspore.communication import init, get_rank
 from mindspore.context import ParallelMode
+from mindspore.train import Model
+from mindspore.train.callback import TimeMonitor, LossMonitor, ModelCheckpoint, CheckpointConfig
 
 from src.model import BoneModel
 from src.TrainOneStepMyself import TrainOneStep
-from src.loss import LossFn, BuildTrainNetwork
+from src.loss import LossFn, BceIouLoss, BuildTrainNetwork
 from src.dataset_train import TrainDataLoader
 
+
 sys.path.append("../")
 
 
@@ -61,6 +64,7 @@ parser.add_argument('--print_flag', type=int, default=20, help='determines wheth
 parser.add_argument('--data_url', type=str)
 parser.add_argument('--pretrained_model', type=str)
 parser.add_argument('--train_url', type=str)
+parser.add_argument('--dataset_sink_mode', type=str, default='YES')
 
 par = parser.parse_args()
 
@@ -96,57 +100,82 @@ class RASWhole:
         print("The number of the model parameters:{}".format(param_number))
         print("Data Number {}".format(traindataloader.data_number))
         loss_fn = LossFn()
+        bce_iou_loss = BceIouLoss(par.batchsize)
         train_model = BuildTrainNetwork(model, loss_fn)
-        lr = par.lr
-        opt = nn.Adam(params=model.trainable_params(), learning_rate=lr, loss_scale=1024)
-        train_net = TrainOneStep(train_model, optimizer=opt)
-        train_net.set_train()
-        total_step = traindataloader.data_number
-
-        for epoch in range(par.epoch):
-            print("-----------------This Training is epoch %d --------------" % (epoch + 1))
-            i = 1
-            if epoch+1 in par.decay_epoch:
-                lr = lr * 0.1
-                opt = nn.Adam(params=model.trainable_params(), learning_rate=lr, loss_scale=1024)
-                train_net = TrainOneStep(train_model, optimizer=opt)
-                train_net.set_train()
-
-            for data in traindataloader.dataset.create_dict_iterator():
-                image, label = data["data"], data["label"]
-                loss = train_net(image, label, par.batchsize)
-
-                if par.distribution_flag == "NO":
-                    if (epoch > 10) and (loss > 0.5):
-                        print("Please try once again")
-                        return
-
-                if i % par.print_flag == 0 or i == total_step:
-                    print("epoch:%d, learning_rate:%.8f,iter [%d/%d],Loss || "
-                          % ((epoch + 1), lr, i, total_step), end='')
-                    print(loss)
-                    present_time = time.time()
-                    mean_step_time = (present_time - train_time) / par.print_flag
-                    print("The Consumption of per step is %.3f s" % mean_step_time)
-                    train_time = present_time
-                    print("+++++++++++++++++++++++++++++++++++++++++++++++++")
-                i += 1
+        if device_target == "GPU":
             if par.distribution_flag == "YES":
                 rank_id = get_rank()
-                if (epoch + 1) % 10 == 0:
-                    if par.is_modelarts == "YES":
-                        save_path_all = os.path.join(checkpoint_out, "RAS%d" % (epoch + 1) + str(rank_id) + ".ckpt")
-                    else:
-                        save_path_all = os.path.join(save_path, "RAS%d" % (epoch + 1) + str(rank_id) + ".ckpt")
-                    save_checkpoint(train_net, save_path_all)
+                par.decay_epoch = ["".join(par.decay_epoch)]
+            else:
+                rank_id = 0
+            batch_num = traindataloader.dataset.get_dataset_size()
+            if rank_id == 0:
+                save_per_epoch = 5
+                config_ck = CheckpointConfig(save_checkpoint_steps=save_per_epoch*batch_num,
+                                             keep_checkpoint_max=30)
+                ckpoint = ModelCheckpoint(prefix="RAS", directory=os.path.join(par.train_url, str(rank_id)),
+                                          config=config_ck)
+                callbacks = [TimeMonitor(), LossMonitor(), ckpoint]
             else:
-                if (epoch + 1) % 5 == 0:
-                    if par.is_modelarts == "YES":
-                        save_path_all = os.path.join(checkpoint_out, "RAS%d.ckpt" % (epoch + 1))
-                    else:
-                        save_path_all = os.path.join(save_path, "RAS%d.ckpt" % (epoch + 1))
-                    save_checkpoint(train_net, save_path_all)
+                callbacks = [TimeMonitor(), LossMonitor()]
+            piecewise = [int(x) * batch_num for x in par.decay_epoch]
+            piecewise.append(par.epoch * batch_num)
+            lr = nn.piecewise_constant_lr(piecewise, [par.lr, par.lr * 0.1])
+            opt = nn.Adam(params=model.trainable_params(), learning_rate=lr, loss_scale=1024)
+            model_train = Model(model, loss_fn=bce_iou_loss, optimizer=opt)
+            model_train.train(epoch=par.epoch, train_dataset=traindataloader.dataset,
+                              callbacks=callbacks, dataset_sink_mode=par.dataset_sink_mode == "YES")
+        else:
+            lr = par.lr
+            opt = nn.Adam(params=model.trainable_params(), learning_rate=lr, loss_scale=1024)
+            train_net = TrainOneStep(train_model, optimizer=opt)
+            train_net.set_train()
+            total_step = traindataloader.data_number
+            for epoch in range(par.epoch):
+                print("-----------------This Training is epoch %d --------------" % (epoch + 1))
+                i = 1
+                if epoch+1 in par.decay_epoch:
+                    lr = lr * 0.1
+                    opt = nn.Adam(params=model.trainable_params(), learning_rate=lr, loss_scale=1024)
+                    train_net = TrainOneStep(train_model, optimizer=opt)
+                    train_net.set_train()
+
+                for data in traindataloader.dataset.create_dict_iterator():
+                    image, label = data["data"], data["label"]
+                    loss = train_net(image, label, par.batchsize)
+
+                    if par.distribution_flag == "NO":
+                        if (epoch > 10) and (loss > 0.5):
+                            print("Please try once again")
+                            return
+
+                    if i % par.print_flag == 0 or i == total_step:
+                        print("epoch:%d, learning_rate:%.8f,iter [%d/%d],Loss || "
+                              % ((epoch + 1), lr, i, total_step/par.batchsize), end='')
+                        print(loss)
+                        present_time = time.time()
+                        mean_step_time = (present_time - train_time) / par.print_flag
+                        print("The Consumption of per step is %.3f s" % mean_step_time)
+                        train_time = present_time
+                        print("+++++++++++++++++++++++++++++++++++++++++++++++++")
+                    i += 1
+
+                if par.distribution_flag == "YES":
+                    rank_id = get_rank()
+                    if (epoch + 1) % 10 == 0:
+                        if par.is_modelarts == "YES":
+                            save_path_all = os.path.join(checkpoint_out, "RAS%d" % (epoch + 1) + str(rank_id) + ".ckpt")
+                        else:
+                            save_path_all = os.path.join(save_path, "RAS%d" % (epoch + 1) + str(rank_id) + ".ckpt")
+                        save_checkpoint(train_net, save_path_all)
+                else:
+                    if (epoch + 1) % 5 == 0:
+                        if par.is_modelarts == "YES":
+                            save_path_all = os.path.join(checkpoint_out, "RAS%d.ckpt" % (epoch + 1))
+                        else:
+                            save_path_all = os.path.join(save_path, "RAS%d.ckpt" % (epoch + 1))
+                        save_checkpoint(train_net, save_path_all)
 
         if par.is_modelarts == "YES":
             mox.file.copy_parallel(src_url=checkpoint_out, dst_url=models_path)
@@ -156,18 +185,29 @@ if __name__ == "__main__":
     np.random.seed(100)
     if par.distribution_flag == 'YES':
         print("++++++++++++++++++++ Training with distributed style +++++++++++++++++++")
-        device_id = int(os.getenv('DEVICE_ID'))
-        device_num = int(os.getenv('RANK_SIZE'))
-        ms.context.set_context(mode=ms.context.GRAPH_MODE, device_target=device_target, device_id=device_id)
-        ms.context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
-                                             gradients_mean=True, device_num=device_num)
-        init()
+        if device_target == "Ascend":
+            device_id = int(os.getenv('DEVICE_ID'))
+            device_num = int(os.getenv('RANK_SIZE'))
+            ms.context.set_context(mode=ms.context.GRAPH_MODE, device_target=device_target, device_id=device_id)
+            ms.context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
+                                                 gradients_mean=True, device_num=device_num)
+            init()
+        else:
+            init()
+            ms.context.reset_auto_parallel_context()
+            device_num = int(os.getenv('RANK_SIZE'))
+            rank = get_rank()
+            ms.context.set_context(mode=ms.context.GRAPH_MODE, device_target=device_target)
+            ms.context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
+                                                 gradients_mean=True, device_num=device_num)
     else:
         if par.is_modelarts == "YES":
             device_id = int(os.getenv("DEVICE_ID"))
         else:
             device_id = int(par.device_id)
         ms.context.set_context(mode=ms.context.GRAPH_MODE, device_target=device_target, device_id=device_id)
+    if device_target == "GPU":
+        ms.context.set_context(enable_graph_kernel=True)
 
     if par.is_modelarts == "YES":
@@ -201,6 +241,8 @@ if __name__ == "__main__":
         pre_trained_model_path = par.pretrained_model
         save_models_path = par.train_url
         save_path = save_models_path
+        if device_target == "GPU" and par.distribution_flag == "YES":
+            save_path = os.path.join(save_path, str(rank))
         if not os.path.exists(save_path):
             os.makedirs(save_path)
         start_time = time.time()
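For reference, the GPU branch of train.py turns the decay schedule into a per-step learning-rate list with nn.piecewise_constant_lr, which expects one learning rate per milestone; that is why a single decay epoch is paired with [par.lr, par.lr * 0.1]. A standalone sketch with assumed numbers (batch_num depends on the dataset and batch size):

```python
import mindspore.nn as nn

batch_num = 660                         # steps per epoch -- an assumed value
decay_epoch, total_epoch, base_lr = [70], 80, 5e-5

milestones = [e * batch_num for e in decay_epoch] + [total_epoch * batch_num]
rates = [base_lr, base_lr * 0.1]        # one rate per milestone segment
lr_per_step = nn.piecewise_constant_lr(milestones, rates)

# One entry per training step: base_lr up to step 70 * batch_num, then the decayed rate.
print(len(lr_per_step), lr_per_step[0], lr_per_step[-1])
```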