diff --git a/research/cv/STGAN/README.md b/research/cv/STGAN/README.md
index 9fa0cbc6a9855c4cb5d70c8fe6c48038842cb0f2..28c8896cb8a466da9d4e7821e51111d16b950360 100644
--- a/research/cv/STGAN/README.md
+++ b/research/cv/STGAN/README.md
@@ -53,8 +53,8 @@ Dataset used: [CelebA](http://mmlab.ie.cuhk.edu.hk/projects/CelebA.html)
 
 ## [Environment Requirements](#contents)
 
-- Hardware（Ascend）
-    - Prepare hardware environment with Ascend processor.
+- Hardware（Ascend/GPU）
+    - Prepare the hardware environment with an Ascend processor. A GPU processor is also supported.
 - Framework
     - [MindSpore](https://www.mindspore.cn/install/en)
 - For more information, please check the resources below：
@@ -65,14 +65,27 @@ Dataset used: [CelebA](http://mmlab.ie.cuhk.edu.hk/projects/CelebA.html)
 
 After installing MindSpore via the official website, you can start training and evaluation as follows:
 
-```python
-# enter script dir, train STGAN
-sh scripts/run_standalone_train.sh [DATA_PATH] [EXPERIMENT_NAME] [DEVICE_ID]
-# distributed training
-sh scripts/run_distribute_train.sh [RANK_TABLE_FILE] [EXPERIMENT_NAME] [DATA_PATH]
-# enter script dir, evaluate STGAN
-sh scripts/run_eval.sh [DATA_PATH] [EXPERIMENT_NAME] [DEVICE_ID] [CHECKPOINT_PATH]
-```
+- running on Ascend
+
+    ```bash
+    # train STGAN
+    sh scripts/run_standalone_train.sh [DATA_PATH] [EXPERIMENT_NAME] [DEVICE_ID]
+    # distributed training
+    sh scripts/run_distribute_train.sh [RANK_TABLE_FILE] [EXPERIMENT_NAME] [DATA_PATH]
+    # evaluate STGAN
+    sh scripts/run_eval.sh [DATA_PATH] [EXPERIMENT_NAME] [DEVICE_ID] [CHECKPOINT_PATH]
+    ```
+
+- running on GPU
+
+    ```bash
+    # train STGAN
+    sh scripts/run_standalone_train_gpu.sh [DATA_PATH] [EXPERIMENT_NAME] [DEVICE_ID]
+    # distributed training
+    sh scripts/run_distribute_train_gpu.sh [EXPERIMENT_NAME] [DATA_PATH]
+    # evaluate STGAN (to evaluate a distributed training result, run from ./train_parallel)
+    sh scripts/run_eval_gpu.sh [DATA_PATH] [EXPERIMENT_NAME] [DEVICE_ID] [CHECKPOINT_PATH]
+    ```
 
 ## [Script Description](#contents)
 
@@ -84,9 +97,13 @@
 ├── README.md // descriptions about STGAN
 ├── requirements.txt // package needed
 ├── scripts
-│   ├──run_standalone_train.sh // train in ascend
-│   ├──run_eval.sh // evaluate in ascend
-│   ├──run_distribute_train.sh // distributed train in ascend
+│   ├──docker_start.sh // start docker container
+│   ├──run_standalone_train.sh // train on Ascend
+│   ├──run_eval.sh // evaluate on Ascend
+│   ├──run_distribute_train.sh // distributed training on Ascend
+│   ├──run_standalone_train_gpu.sh // train on GPU
+│   ├──run_eval_gpu.sh // evaluate on GPU
+│   ├──run_distribute_train_gpu.sh // distributed training on GPU
 ├── src
     ├── dataset
         ├── datasets.py // creating dataset
@@ -114,7 +131,7 @@ Major parameters in train.py and utils/args.py as follows:
 
 --n_epochs: Total training epochs.
 --batch_size: Training batch size.
 --image_size: Image size used as input to the model.
---device_target: Device where the code will be implemented. Optional value is "Ascend".
+--device_target: Device where the code will be implemented. Optional values are "Ascend" and "GPU".
 ```
 
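An editorial aside on the parameter list above: these are plain command-line flags. A hypothetical sketch of how such flags could be declared with argparse (names taken from the list above; the defaults and help strings are illustrative and not copied from src/utils/args.py):

```python
# Hypothetical sketch only; the real declarations live in src/utils/args.py
# and may use different defaults and helper functions.
import argparse

parser = argparse.ArgumentParser(description='STGAN options')
parser.add_argument('--n_epochs', type=int, default=100,
                    help='total training epochs')
parser.add_argument('--batch_size', type=int, default=128,
                    help='training batch size')
parser.add_argument('--image_size', type=int, default=128,
                    help='image size used as input to the model')
parser.add_argument('--device_target', type=str, default='Ascend',
                    choices=['Ascend', 'GPU'],
                    help='device where the code will run')
args = parser.parse_args()
```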
 ### [Training Process](#contents)
 
@@ -125,12 +142,22 @@ Major parameters in train.py and utils/args.py as follows:
 
 ```bash
 python train.py --dataroot ./dataset --experiment_name 128 > log 2>&1 &
-# or enter script dir, and run the script
+# or run the script
 sh scripts/run_standalone_train.sh ./dataset 128 0
 # distributed training
 sh scripts/run_distribute_train.sh ./config/rank_table_8pcs.json 128 /data/dataset
 ```
 
+- running on GPU
+
+    ```bash
+    python train.py --dataroot ./dataset --experiment_name 128 --platform="GPU" > log 2>&1 &
+    # or run the script
+    sh scripts/run_standalone_train_gpu.sh ./dataset 128 0
+    # distributed training
+    sh scripts/run_distribute_train_gpu.sh 128 /data/dataset
+    ```
+
 After training, the loss value will be achieved as follows:
 
 ```bash
@@ -155,10 +182,18 @@ Before running the command below, please check the checkpoint path used for eval
 
 ```bash
 python eval.py --dataroot ./dataset --experiment_name 128 > eval_log.txt 2>&1 &
-# or enter script dir, and run the script
+# or run the script
 sh scripts/run_eval.sh ./dataset 128 0 ./ckpt/generator.ckpt
 ```
 
+- running on GPU
+
+    ```bash
+    python eval.py --dataroot ./dataset --experiment_name 128 --platform="GPU" > eval_log.txt 2>&1 &
+    # or run the script (to evaluate a distributed training result, run this from ./train_parallel)
+    sh scripts/run_eval_gpu.sh ./dataset 128 0 ./ckpt/generator.ckpt
+    ```
+
 You can view the results in the output directory, which contains a batch of result sample images.
 
 ### Model Export
@@ -211,22 +246,22 @@ NN inference cost average time: 9.98606 ms of infer_count 10
 
 #### Evaluation Performance
 
-| Parameters                 | Ascend                                                       |
-| -------------------------- | ------------------------------------------------------------ |
-| Model Version              | V1                                                           |
-| Resource                   | Ascend 910; CPU 2.60GHz, 192cores; Memory, 755G              |
-| uploaded Date              | 05/07/2021 (month/day/year)                                  |
-| MindSpore Version          | 1.2.0                                                        |
-| Dataset                    | CelebA                                                       |
-| Training Parameters        | epoch=100, batch_size = 128                                  |
-| Optimizer                  | Adam                                                         |
-| Loss Function              | Loss                                                         |
-| Output                     | predict class                                                |
-| Loss                       | 6.5523                                                       |
-| Speed                      | 1pc: 400 ms/step; 8pcs: 143 ms/step                          |
-| Total time                 | 1pc: 41:36:07                                                |
-| Checkpoint for Fine tuning | 170.55M(.ckpt file)                                          |
-| Scripts                    | [STGAN script](https://gitee.com/mindspore/models/tree/master/research/cv/STGAN) |
+| Parameters                 | Ascend                                          | GPU                         |
+| -------------------------- | ----------------------------------------------- | --------------------------- |
+| Model Version              | V1                                              | V1                          |
+| Resource                   | Ascend 910; CPU 2.60GHz, 192cores; Memory, 755G | RTX-3090                    |
+| uploaded Date              | 05/07/2021 (month/day/year)                     | 11/23/2021 (month/day/year) |
+| MindSpore Version          | 1.2.0                                           | 1.5.0rc1                    |
+| Dataset                    | CelebA                                          | CelebA                      |
+| Training Parameters        | epoch=100, batch_size = 128                     | epoch=100, batch_size=64    |
+| Optimizer                  | Adam                                            | Adam                        |
+| Loss Function              | Loss                                            | Loss                        |
+| Output                     | predict class                                   | image                       |
+| Loss                       | 6.5523                                          | 31.23                       |
+| Speed                      | 1pc: 400 ms/step; 8pcs: 143 ms/step             | 1pc: 369 ms/step; 8pcs: 68 ms/step |
+| Total time                 | 1pc: 41:36:07                                   | 1pc: 29:15:09               |
+| Checkpoint for Fine tuning | 170.55M(.ckpt file)                             | 283.76M(.ckpt file)         |
+| Scripts                    | [STGAN script](https://gitee.com/mindspore/models/tree/master/research/cv/STGAN) | [STGAN script](https://gitee.com/mindspore/models/tree/master/research/cv/STGAN) |
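A note on where the Speed rows above come from: the train.py change later in this diff times each epoch and appends the average step time to performance.log. A minimal sketch of that bookkeeping, assuming an `iter_per_epoch` step count and a `run_one_epoch` callable (both names illustrative):

```python
import time

def log_average_speed(iter_per_epoch, run_one_epoch):
    """Append the epoch's average ms/step, mirroring the patched train.py."""
    start_epoch_time = time.time()
    run_one_epoch()  # executes iter_per_epoch optimization steps
    avg_ms = (time.time() - start_epoch_time) * 1000 / iter_per_epoch
    with open('performance.log', 'a') as f:
        f.write('average speed: {}ms/step\n'.format(avg_ms))
```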
 
 ## [Model Description](#contents)
 
diff --git a/research/cv/STGAN/scripts/run_distribute_train_gpu.sh b/research/cv/STGAN/scripts/run_distribute_train_gpu.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ef2220d2e9a3646273b5b67839ed7be5f99dc633
--- /dev/null
+++ b/research/cv/STGAN/scripts/run_distribute_train_gpu.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+if [ $# != 2 ]
+then
+    echo "Usage: sh run_distribute_train_gpu.sh [EXPERIMENT_NAME] [DATA_PATH]"
+    exit 1
+fi
+
+export DEVICE_NUM=8
+export RANK_SIZE=8
+
+rm -rf ./train_parallel
+mkdir ./train_parallel
+cp ./*.py ./train_parallel
+cp -r ./src ./train_parallel
+cp -r ./scripts ./train_parallel
+cd ./train_parallel || exit
+
+export EXPERIMENT_NAME=$1
+export DATA_PATH=$2
+
+mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
+nohup python train.py \
+    --dataroot=$DATA_PATH \
+    --experiment_name=$EXPERIMENT_NAME \
+    --device_num ${DEVICE_NUM} \
+    --platform="GPU" > log 2>&1 &
+cd ..
diff --git a/research/cv/STGAN/scripts/run_eval_gpu.sh b/research/cv/STGAN/scripts/run_eval_gpu.sh
new file mode 100644
index 0000000000000000000000000000000000000000..d295a1232b0f28071969c346f12c0c30fd722477
--- /dev/null
+++ b/research/cv/STGAN/scripts/run_eval_gpu.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+if [ $# != 4 ]
+then
+    echo "Usage: sh run_eval_gpu.sh [DATA_PATH] [EXPERIMENT_NAME] [DEVICE_ID] [CHECKPOINT_PATH]"
+    exit 1
+fi
+
+export DATA_PATH=$1
+export EXPERIMENT_NAME=$2
+export DEVICE_ID=$3
+export CHECKPOINT_PATH=$4
+
+python eval.py --dataroot=$DATA_PATH --experiment_name=$EXPERIMENT_NAME \
+    --device_id=$DEVICE_ID --ckpt_path=$CHECKPOINT_PATH \
+    --platform="GPU" > eval_log 2>&1 &
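For context on the mpirun launch in run_distribute_train_gpu.sh above: mpirun starts RANK_SIZE identical train.py processes, and the GPU branch added to src/utils/args.py later in this diff wires them into MindSpore data parallelism. A condensed sketch of that per-process setup (device_num hard-coded to 8 here purely for illustration):

```python
# Condensed from the args.py change in this diff: what each process does
# when train.py is launched under mpirun with --platform="GPU".
from mindspore import context
from mindspore.context import ParallelMode
from mindspore.communication.management import init, get_rank

context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
init()             # join the NCCL communication group created by mpirun
rank = get_rank()  # unique id per process; rank 0 owns all file output
context.reset_auto_parallel_context()
context.set_auto_parallel_context(device_num=8,
                                  parallel_mode=ParallelMode.DATA_PARALLEL,
                                  gradients_mean=True)
```

Setting gradients_mean=True averages gradients across workers after the all-reduce instead of summing them, which keeps the effective learning rate independent of device_num.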
diff --git a/research/cv/STGAN/scripts/run_standalone_train_gpu.sh b/research/cv/STGAN/scripts/run_standalone_train_gpu.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c2836e951aa00be68c93e1ff0767bb4df3cdf373
--- /dev/null
+++ b/research/cv/STGAN/scripts/run_standalone_train_gpu.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+if [ $# != 3 ]
+then
+    echo "Usage: sh run_standalone_train_gpu.sh [DATA_PATH] [EXPERIMENT_NAME] [DEVICE_ID]"
+    exit 1
+fi
+
+export DATA_PATH=$1
+export EXPERIMENT_NAME=$2
+export DEVICE_ID=$3
+
+python train.py --dataroot=$DATA_PATH --experiment_name=$EXPERIMENT_NAME \
+    --device_id=$DEVICE_ID --platform="GPU" > log 2>&1 &
diff --git a/research/cv/STGAN/src/models/base_model.py b/research/cv/STGAN/src/models/base_model.py
index 763586eacb31637e015751db761d459bec842be6..817f111bba94659879eed3c14bbb3a25588d7b05 100644
--- a/research/cv/STGAN/src/models/base_model.py
+++ b/research/cv/STGAN/src/models/base_model.py
@@ -48,14 +48,15 @@ class BaseModel(ABC):
             ), 'Checkpoint path not found at %s' % self.save_dir
             self.current_iteration = self.args.continue_iter
         else:
-            if not os.path.exists(self.save_dir):
+            if not os.path.exists(self.save_dir) and self.args.rank == 0:
                 mkdirs(self.save_dir)
+
         # save config
         self.config_save_path = os.path.join(self.save_dir, 'config')
-        if not os.path.exists(self.config_save_path):
+        if not os.path.exists(self.config_save_path) and self.args.rank == 0:
            mkdirs(self.config_save_path)
-        if self.isTrain:
+        if self.isTrain and self.args.rank == 0:
             with open(os.path.join(self.config_save_path, 'train.conf'),
                       'w') as f:
                 f.write(json.dumps(vars(self.args)))
 
@@ -67,7 +68,7 @@ class BaseModel(ABC):
         # sample save path
         if self.isTrain:
             self.sample_save_path = os.path.join(self.save_dir, 'sample')
-            if not os.path.exists(self.sample_save_path):
+            if not os.path.exists(self.sample_save_path) and self.args.rank == 0:
                 mkdirs(self.sample_save_path)
 
         # test result save path
@@ -79,7 +80,7 @@ class BaseModel(ABC):
         # train log save path
         if self.isTrain:
             self.train_log_path = os.path.join(self.save_dir, 'logs')
-            if not os.path.exists(self.train_log_path):
+            if not os.path.exists(self.train_log_path) and self.args.rank == 0:
                 mkdirs(self.train_log_path)
 
     @abstractmethod
@@ -109,7 +110,7 @@ class BaseModel(ABC):
     def save_networks(self):
         """ saving networks """
         for name in self.model_names:
-            if isinstance(name, str):
+            if isinstance(name, str) and self.args.rank == 0:
                 save_filename = '%s_%s.ckpt' % (self.current_iteration, name)
                 save_filename_latest = 'latest_%s.ckpt' % name
                 save_path = os.path.join(self.save_dir, 'ckpt')
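The base_model.py hunks above all add the same `and self.args.rank == 0` guard: in a data-parallel run every worker executes this code, so gating filesystem side effects on rank 0 prevents racing mkdir calls and duplicate checkpoint files. A stripped-down sketch of the pattern (function name illustrative):

```python
# Illustrative rank-0 guard: every worker runs this, but only rank 0 acts.
import os

def ensure_dir_rank0(path, rank):
    """Create a directory exactly once across all data-parallel workers."""
    if rank == 0 and not os.path.exists(path):
        os.makedirs(path)
```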
"--experiment_name should be assigned in continue" - if args.device_num > 1 and args.platform != "CPU": + if args.device_num > 1 and args.platform == "Ascend": context.set_context(mode=context.GRAPH_MODE, device_target=args.platform, save_graphs=args.save_graphs, @@ -278,6 +278,12 @@ def get_args(phase): device_num=args.device_num) init() args.rank = int(os.environ["DEVICE_ID"]) + elif args.device_num > 1 and args.platform == "GPU": + init() + context.reset_auto_parallel_context() + args.rank = get_rank() + context.set_auto_parallel_context(device_num=args.device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + gradients_mean=True) else: context.set_context(mode=context.GRAPH_MODE, device_target=args.platform, diff --git a/research/cv/STGAN/train.py b/research/cv/STGAN/train.py index 99fb250505a5eaa756a852ceb73b5a515b7620a6..f93a2f0a138dcbdfd947d306843fb40817f5d0b2 100644 --- a/research/cv/STGAN/train.py +++ b/research/cv/STGAN/train.py @@ -13,6 +13,7 @@ # limitations under the License. # ============================================================================ """ STGAN TRAIN""" +import time import tqdm from mindspore.common import set_seed @@ -44,8 +45,9 @@ def train(): model = STGANModel(args) it_count = 0 - for _ in tqdm.trange(args.n_epochs, desc='Epoch Loop'): - for _ in tqdm.trange(iter_per_epoch, desc='Inner Epoch Loop'): + for _ in tqdm.trange(args.n_epochs, desc='Epoch Loop', unit='epoch'): + start_epoch_time = time.time() + for _ in tqdm.trange(iter_per_epoch, desc='Step Loop', unit='step'): if model.current_iteration > it_count: it_count += 1 continue @@ -56,11 +58,11 @@ def train(): model.optimize_parameters() # saving model - if (it_count + 1) % args.save_freq == 0: + if (it_count + 1) % args.save_freq == 0 and args.rank == 0: model.save_networks() # sampling - if (it_count + 1) % args.sample_freq == 0: + if (it_count + 1) % args.sample_freq == 0 and args.rank == 0: model.eval(data_loader) except KeyboardInterrupt: @@ -69,7 +71,9 @@ def train(): it_count += 1 model.current_iteration = it_count - + if args.rank == 0: + with open('performance.log', "a") as f: + f.write('average speed: {}ms/step\n'.format((time.time() - start_epoch_time)*1000/iter_per_epoch)) model.save_networks() print('\n\n=============== finish training ===============\n\n')