diff --git a/research/cv/ProtoNet/README.md b/research/cv/ProtoNet/README.md index 109618ffd5e13cdd2bc9e0bf0808f960c059d74e..84d821c65bfb810493c041f428bbe3e7f00fe777 100644 --- a/research/cv/ProtoNet/README.md +++ b/research/cv/ProtoNet/README.md @@ -60,7 +60,7 @@ python train.py # [Environment Requirements](#contents) -- Hardware(Ascend) +- Hardware(Ascend&GPU) - Prepare hardware environment with Ascend. - Framework - [MindSpore](https://www.mindspore.cn/install/en) @@ -79,6 +79,12 @@ sh run_standalone_train_ascend.sh "../dataset" 1 60 500 sh run_standalone_eval_ascend.sh "../dataset" "./output/best_ck.ckpt" 1 5 # enter script dir, train ProtoNet distributed sh run_distribution_ascend.sh "./rank_table.json" "../dataset" 60 500 +# enter script dir, train ProtoNet on GPU +bash run_standalone_train_gpu.sh "../dataset" 1 60 500 +# enter script dir, evaluate ProtoNet on GPU +bash run_standalone_eval_gpu.sh "../dataset" "./standalone_output/best_ck_0.ckpt" 1 5 +# enter script dir, train ProtoNet distributed on GPU (the last argument is DEVICE_NUM) +bash run_distribution_gpu.sh "../dataset" 60 500 8 ``` ## [Script and Sample Code](#contents) @@ -89,6 +95,9 @@ sh run_distribution_ascend.sh "./rank_table.json" "../dataset" 60 500 ├── requirements.txt ├── README.md // descriptions about lenet ├── scripts + │ ├──run_standalone_train_gpu.sh // train in GPU + │ ├──run_standalone_eval_gpu.sh // evaluate in GPU + │ ├──run_distribution_gpu.sh // distribution in GPU │ ├──run_standalone_train_ascend.sh // train in ascend │ ├──run_standalone_eval_ascend.sh // evaluate in ascend │ ├──run_distribution_ascend.sh // distribution in ascend @@ -125,7 +134,8 @@ Major parameters in train.py and config.py as follows: ### Training ```bash -sh run_standalone_train_ascend.sh "../dataset" 1 60 500 +bash run_standalone_train_ascend.sh "../dataset" 1 60 500 +bash run_standalone_train_gpu.sh "../dataset" 1 60 500 ``` The model checkpoint will be saved in the current directory. 
@@ -138,11 +148,13 @@ Before running the command below, please check the checkpoint path used for eval ```bash sh run_standalone_eval_ascend.sh "../dataset" "./output/best_ck.ckpt" 1 5 +bash run_standalone_eval_gpu.sh "../dataset" "./standalone_output/best_ck_0.ckpt" 1 5 ``` ```shell -Test Acc: 0.9954400658607483 Loss: 0.02102319709956646 +Test Acc in Ascend: 0.9954400658607483 Loss: 0.02102319709956646 +Test Acc in GPU: 0.996999979019165 Loss: 0.013885765336453915 ``` # [Model Description](#contents) @@ -151,21 +163,21 @@ Test Acc: 0.9954400658607483 Loss: 0.02102319709956646 ### Evaluation Performance -| Parameters | ProtoNet | -| -------------------------- | ---------------------------------------------------------- | -| Resource | CentOs 8.2; Ascend 910 ; CPU 2.60GHz，192cores；Memory 755G | -| uploaded Date | 03/26/2021 (month/day/year) | -| MindSpore Version | 1.1.1 | -| Dataset | OMNIGLOT | -| Training Parameters | episode=500, class_num = 5, lr=0.001, classes_per_it_tr=60, num_support_tr=5, num_query_tr=5, classes_per_it_val=20, num_support_val=5, num_query_val=15 | -| Optimizer | Adam | -| Loss Function | Prototypicalloss | -| outputs | Accuracy | -| Loss | 0.002 | -| Speed | 215 ms/step | -| Total time | 3 h 23m (8p) | -| Checkpoint for Fine tuning | 440 KB (.ckpt file) | -| Scripts | <https://gitee.com/mindspore/models/tree/master/research/cv/ProtoNet> | +| Parameters | ProtoNet(Ascend) | ProtoNet(GPU) | +| -------------------------- | ---------------------------------------------------------- | ---------------------------------------------------------- | +| Resource | CentOs 8.2; Ascend 910 ; CPU 2.60GHz，192cores；Memory 755G | Ubuntu 18.04; Tesla V100 ; CPU 2.60GHz | +| uploaded Date | 03/26/2021 (month/day/year) | 08/28/2021 (month/day/year) | +| MindSpore Version | 1.1.1 | 1.3.0 | +| Dataset | OMNIGLOT | OMNIGLOT | +| Training Parameters | episode=500, class_num = 5, lr=0.001, classes_per_it_tr=60, num_support_tr=5, num_query_tr=5, classes_per_it_val=20, 
num_support_val=5, num_query_val=15 | episode=500, class_num = 5, lr=0.001, classes_per_it_tr=60, num_support_tr=5, num_query_tr=5, classes_per_it_val=20, num_support_val=5, num_query_val=15 | +| Optimizer | Adam | Adam | +| Loss Function | Prototypicalloss | Prototypicalloss | +| outputs | Accuracy | Accuracy | +| Loss | 0.002 | 0.002 | +| Speed | 215 ms/step | 144 ms/step | +| Total time | 3 h 23m (8p) | 2 h 48m (8p) | +| Checkpoint for Fine tuning | 440 KB (.ckpt file) | 441 KB (.ckpt file) | +| Scripts | <https://gitee.com/mindspore/models/tree/master/research/cv/ProtoNet> |<https://gitee.com/mindspore/models/tree/master/research/cv/ProtoNet> | # [ModelZoo Homepage](#contents) diff --git a/research/cv/ProtoNet/scripts/run_distribution_gpu.sh b/research/cv/ProtoNet/scripts/run_distribution_gpu.sh new file mode 100644 index 0000000000000000000000000000000000000000..bea9cc8b252aca3529065fd20226d94cf06a8538 --- /dev/null +++ b/research/cv/ProtoNet/scripts/run_distribution_gpu.sh @@ -0,0 +1,48 @@ +#!/bin/bash +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +if [ $# -ne 4 ] +then + echo "Usage: bash run_distribution_gpu.sh [DATA_PATH] [TRAIN_CLASS] [EPOCHS] [DEVICE_NUM]" +exit 1 +fi + +get_real_path(){ + if [ "${1:0:1}" == "/" ]; then + echo "$1" + else + realpath -m "$PWD/$1" + fi +} +export DATA_PATH=$(get_real_path "$1") # dataset path, made absolute +export TRAIN_CLASS=$2 # classes per training iteration (--classes_per_it_tr) +export EPOCHS=$3 # num of epochs +export DEVICE_NUM=$4 # number of mpirun processes; train.py reads DEVICE_NUM from the environment + +if [ ! -d "$DATA_PATH" ] +then + echo "error: DATA_PATH=$DATA_PATH is not a directory" +exit 1 +fi + +rm -rf distribute_output +mkdir distribute_output + +mpirun --allow-run-as-root -n "$DEVICE_NUM" --output-filename log_output --merge-stderr-to-stdout \ + python ../train.py --dataset_root="$DATA_PATH" \ + --device_target="GPU" \ + --classes_per_it_tr="$TRAIN_CLASS" \ + --experiment_root="./distribute_output" \ + --epochs="$EPOCHS" > distribute_log 2>&1 & diff --git a/research/cv/ProtoNet/scripts/run_standalone_eval_gpu.sh b/research/cv/ProtoNet/scripts/run_standalone_eval_gpu.sh new file mode 100644 index 0000000000000000000000000000000000000000..81cf1a623211592d875fcb3896062933c6101636 --- /dev/null +++ b/research/cv/ProtoNet/scripts/run_standalone_eval_gpu.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +# A simple usage example; more parameters can be set on the eval.py command line below. +if [ $# -ne 4 ] +then + echo "Usage: bash run_standalone_eval_gpu.sh [DATA_PATH] [CKPT_PATH] [DEVICE_ID] [EVAL_CLASS]" +exit 1 +fi + +get_real_path(){ + if [ "${1:0:1}" == "/" ]; then + echo "$1" + else + realpath -m "$PWD/$1" + fi +} + +export DATA_PATH=$(get_real_path "$1") # dataset path, made absolute +export CKPT_PATH=$(get_real_path "$2") # path of ckpt file to load, made absolute +export DEVICE_ID=$3 # device id +export EVAL_CLASS=$4 # classes per evaluation iteration (--classes_per_it_val) +if [ ! -d "$DATA_PATH" ] +then + echo "error: DATA_PATH=$DATA_PATH is not a directory" +exit 1 +fi + +if [ ! -f "$CKPT_PATH" ] +then + echo "error: CKPT_PATH=$CKPT_PATH is not a file" +exit 1 +fi + +python ../eval.py --dataset_root="$DATA_PATH" --experiment_root="$CKPT_PATH" \ + --device_id="$DEVICE_ID" --device_target="GPU" \ + --classes_per_it_val="$EVAL_CLASS" > eval_log 2>&1 & diff --git a/research/cv/ProtoNet/scripts/run_standalone_train_gpu.sh b/research/cv/ProtoNet/scripts/run_standalone_train_gpu.sh new file mode 100644 index 0000000000000000000000000000000000000000..060fa1d22299bdeb06a60590d3594a468eb37a7f --- /dev/null +++ b/research/cv/ProtoNet/scripts/run_standalone_train_gpu.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +# A simple usage example; more parameters can be set on the train.py command line below. +if [ $# -ne 4 ] +then + echo "Usage: bash run_standalone_train_gpu.sh [DATA_PATH] [DEVICE_ID] [TRAIN_CLASS] [EPOCHS]" +exit 1 +fi + +get_real_path(){ + if [ "${1:0:1}" == "/" ]; then + echo "$1" + else + realpath -m "$PWD/$1" + fi +} + +export DATA_PATH=$(get_real_path "$1") # dataset path, made absolute +export DEVICE_ID=$2 # device_id +export TRAIN_CLASS=$3 # classes per training iteration (--classes_per_it_tr) +export EPOCHS=$4 # num of epochs + +if [ ! -d "$DATA_PATH" ] +then + echo "error: DATA_PATH=$DATA_PATH is not a directory" +exit 1 +fi + +python ../train.py --dataset_root="$DATA_PATH" \ + --device_id="$DEVICE_ID" --device_target="GPU" \ + --classes_per_it_tr="$TRAIN_CLASS" \ + --experiment_root="./standalone_output" \ + --epochs="$EPOCHS" > log 2>&1 & diff --git a/research/cv/ProtoNet/src/EvalCallBack.py b/research/cv/ProtoNet/src/EvalCallBack.py index 08a297926b88709743494ebe0200d57716db07ff..08ab44565ef74355f469e572a7d2d6083b402685 100644 --- a/research/cv/ProtoNet/src/EvalCallBack.py +++ b/research/cv/ProtoNet/src/EvalCallBack.py @@ -26,7 +26,7 @@ class EvalCallBack(Callback): """ CallBack class """ - def __init__(self, options, net, eval_dataset, path): + def __init__(self, options, net, eval_dataset, path, rank_id=0): self.net = net self.eval_dataset = eval_dataset self.path = path @@ -34,6 +34,7 @@ class EvalCallBack(Callback): self.avgloss = 0 self.bestacc = 0 self.options = options + self.rank_id = rank_id def epoch_begin(self, run_context): @@ -58,12 +59,12 @@ class EvalCallBack(Callback): if self.avgacc > self.bestacc: self.bestacc = self.avgacc print('Epoch {}: Avg Accuracy: {}(best) Avg Loss:{}'.format(cur_epoch, self.avgacc, self.avgloss)) - best_path = os.path.join(self.path, 'best_ck.ckpt') + best_path = os.path.join(self.path, f'best_ck_{self.rank_id}.ckpt') save_checkpoint(cur_net, best_path) else: print('Epoch {}: Avg Accuracy: {} Avg 
Loss:{}'.format(cur_epoch, self.avgacc, self.avgloss)) - last_path = os.path.join(self.path, 'last_ck.ckpt') + last_path = os.path.join(self.path, f'last_ck_{self.rank_id}.ckpt') save_checkpoint(cur_net, last_path) print("Best Acc:", self.bestacc) print('=========EPOCH {} END========='.format(cur_epoch)) diff --git a/research/cv/ProtoNet/train.py b/research/cv/ProtoNet/train.py index 9ba1873d1a6411731733931f0ee53e3314e6f73f..b2bbe5b319838eb49a2ecbff8996b86c5388e3eb 100644 --- a/research/cv/ProtoNet/train.py +++ b/research/cv/ProtoNet/train.py @@ -22,7 +22,7 @@ from mindspore.train import Model from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, TimeMonitor from mindspore import dataset as ds import mindspore.context as context -from mindspore.communication.management import init +from mindspore.communication.management import init, get_rank from mindspore.context import ParallelMode from src.EvalCallBack import EvalCallBack from src.protonet import WithLossCell @@ -35,7 +35,7 @@ local_data_url = './cache/data' local_train_url = './cache/out' -def train(opt, tr_dataloader, net, loss_fn, eval_loss_fn, optim, path, val_dataloader=None): +def train(opt, tr_dataloader, net, loss_fn, eval_loss_fn, optim, path, rank_id, val_dataloader=None): ''' train function ''' @@ -47,11 +47,11 @@ def train(opt, tr_dataloader, net, loss_fn, eval_loss_fn, optim, path, val_datal eval_data = ds.GeneratorDataset(val_dataloader, column_names=['data', 'label', 'classes']) - eval_cb = EvalCallBack(opt, my_acc_cell, eval_data, path) + eval_cb = EvalCallBack(opt, my_acc_cell, eval_data, path, rank_id) config = CheckpointConfig(save_checkpoint_steps=10, keep_checkpoint_max=5, saved_network=net) - ckpoint_cb = ModelCheckpoint(prefix='protonet', directory=path, config=config) + ckpoint_cb = ModelCheckpoint(prefix=str(rank_id) + '_protonet', directory=path, config=config) print('==========training test==========') starttime = datetime.datetime.now() @@ -68,24 +68,29 @@ def 
main(): global local_train_url options = get_parser().parse_args() + device_num = int(os.environ.get("DEVICE_NUM", 1)) - if options.run_offline: - - device_num = int(os.environ.get("DEVICE_NUM", 1)) + # Checkpoint-name suffix: the device id on Ascend, otherwise 0 until init() assigns a rank. + rank_id = options.device_id if options.device_target == "Ascend" else 0 + if options.run_offline: if device_num > 1: - init() + if options.device_target == "GPU": + # get_rank() is only valid after init() has set up communication. + rank_id = get_rank() context.reset_auto_parallel_context() context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) - context.set_context(device_id=options.device_id) + + if options.device_target == "Ascend": + context.set_context(device_id=options.device_id) + local_data_url = options.dataset_root local_train_url = options.experiment_root - if not os.path.exists(options.experiment_root): - os.makedirs(options.experiment_root) + # exist_ok keeps this safe when several ranks start at once or the launch script already made it. + os.makedirs(options.experiment_root, exist_ok=True) else: - device_num = int(os.environ.get("DEVICE_NUM", 1)) device_id = int(os.getenv("DEVICE_ID")) import moxing as mox @@ -114,7 +119,7 @@ def main(): Net = ProtoNet() optim = nn.Adam(params=Net.trainable_params(), learning_rate=0.001) - train(options, tr_dataloader, Net, loss_fn, eval_loss_fn, optim, local_train_url, val_dataloader) + train(options, tr_dataloader, Net, loss_fn, eval_loss_fn, optim, local_train_url, rank_id, val_dataloader) if not options.run_offline: mox.file.copy_parallel(src_url='./cache/out', dst_url=options.train_url)