diff --git a/research/cv/STGAN/README.md b/research/cv/STGAN/README.md
index dd7d34cdbe22bdb370adfd526710c12959e074c1..a4dbbe38ce53825d6b747bef9e4de179f3ab72c3 100644
--- a/research/cv/STGAN/README.md
+++ b/research/cv/STGAN/README.md
@@ -53,8 +53,8 @@ Dataset used: [CelebA](http://mmlab.ie.cuhk.edu.hk/projects/CelebA.html)
## [Environment Requirements](#contents)
-- Hardware(Ascend)
- - Prepare hardware environment with Ascend processor.
+- Hardware(Ascend/GPU)
+    - Prepare the hardware environment with an Ascend or GPU processor.
- Framework
- [MindSpore](https://www.mindspore.cn/install/en)
- For more information, please check the resources below:
@@ -65,14 +65,27 @@ Dataset used: [CelebA](http://mmlab.ie.cuhk.edu.hk/projects/CelebA.html)
After installing MindSpore via the official website, you can start training and evaluation as follows:
-```python
-# enter script dir, train STGAN
-sh scripts/run_standalone_train.sh [DATA_PATH] [EXPERIMENT_NAME] [DEVICE_ID]
-# distributed training
-sh scripts/run_distribute_train.sh [RANK_TABLE_FILE] [EXPERIMENT_NAME] [DATA_PATH]
-# enter script dir, evaluate STGAN
-sh scripts/run_eval.sh [DATA_PATH] [EXPERIMENT_NAME] [DEVICE_ID] [CHECKPOINT_PATH]
-```
+- running on Ascend
+
+  ```bash
+ # train STGAN
+ sh scripts/run_standalone_train.sh [DATA_PATH] [EXPERIMENT_NAME] [DEVICE_ID]
+ # distributed training
+ sh scripts/run_distribute_train.sh [RANK_TABLE_FILE] [EXPERIMENT_NAME] [DATA_PATH]
+ # evaluate STGAN
+ sh scripts/run_eval.sh [DATA_PATH] [EXPERIMENT_NAME] [DEVICE_ID] [CHECKPOINT_PATH]
+ ```
+
+- running on GPU
+
+  ```bash
+ # train STGAN
+ sh scripts/run_standalone_train_gpu.sh [DATA_PATH] [EXPERIMENT_NAME] [DEVICE_ID]
+ # distributed training
+ sh scripts/run_distribute_train_gpu.sh [EXPERIMENT_NAME] [DATA_PATH]
+  # evaluate STGAN (to evaluate a distributed-training result, run this from ./train_parallel)
+ sh scripts/run_eval_gpu.sh [DATA_PATH] [EXPERIMENT_NAME] [DEVICE_ID] [CHECKPOINT_PATH]
+ ```
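+
+  Note: `run_distribute_train_gpu.sh` launches its workers with `mpirun`, so an MPI runtime such as OpenMPI is assumed to be available.
+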
## [Script Description](#contents)
@@ -84,9 +97,13 @@ sh scripts/run_eval.sh [DATA_PATH] [EXPERIMENT_NAME] [DEVICE_ID] [CHECKPOINT_PAT
├── README.md // descriptions about STGAN
├── requirements.txt // package needed
├── scripts
- │ ├──run_standalone_train.sh // train in ascend
- │ ├──run_eval.sh // evaluate in ascend
- │ ├──run_distribute_train.sh // distributed train in ascend
+ │ ├──docker_start.sh // start docker container
+ │ ├──run_standalone_train.sh // train in ascend
+ │ ├──run_eval.sh // evaluate in ascend
+ │ ├──run_distribute_train.sh // distributed train in ascend
+ │ ├──run_standalone_train_gpu.sh // train in GPU
+ │ ├──run_eval_gpu.sh // evaluate in GPU
+ │ ├──run_distribute_train_gpu.sh // distributed train in GPU
├── src
├── dataset
├── datasets.py // creating dataset
@@ -114,7 +131,7 @@ Major parameters in train.py and utils/args.py as follows:
--n_epochs: Total training epochs.
--batch_size: Training batch size.
--image_size: Image size used as input to the model.
---device_target: Device where the code will be implemented. Optional value is "Ascend".
+--device_target: Device where the code will run. Optional values are "Ascend" and "GPU".
```
### [Training Process](#contents)
@@ -125,12 +142,22 @@ Major parameters in train.py and utils/args.py as follows:
```bash
python train.py --dataroot ./dataset --experiment_name 128 > log 2>&1 &
- # or enter script dir, and run the script
+ # or run the script
sh scripts/run_standalone_train.sh ./dataset 128 0
# distributed training
sh scripts/run_distribute_train.sh ./config/rank_table_8pcs.json 128 /data/dataset
```
+- running on GPU
+
+ ```bash
+ python train.py --dataroot ./dataset --experiment_name 128 --platform="GPU" > log 2>&1 &
+ # or run the script
+ sh scripts/run_standalone_train_gpu.sh ./dataset 128 0
+ # distributed training
+ sh scripts/run_distribute_train_gpu.sh 128 /data/dataset
+ ```
+
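+  The distributed GPU script copies the code into `./train_parallel` and writes its training log to `./train_parallel/log`.
+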
After training, the loss value will be achieved as follows:
```bash
@@ -155,10 +182,18 @@ Before running the command below, please check the checkpoint path used for eval
```bash
python eval.py --dataroot ./dataset --experiment_name 128 > eval_log.txt 2>&1 &
- # or enter script dir, and run the script
+ # or run the script
sh scripts/run_eval.sh ./dataset 128 0 ./ckpt/generator.ckpt
```
+- running on GPU
+
+ ```bash
+ python eval.py --dataroot ./dataset --experiment_name 128 --platform="GPU" > eval_log.txt 2>&1 &
+  # or run the script (to evaluate a distributed-training result, enter ./train_parallel first, then run the script)
+ sh scripts/run_eval_gpu.sh ./dataset 128 0 ./ckpt/generator.ckpt
+ ```
+
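+  To evaluate a distributed-training result, run the same script from inside the parallel working directory (illustrative, using the script's placeholder arguments):
+
+  ```bash
+  cd ./train_parallel
+  sh scripts/run_eval_gpu.sh [DATA_PATH] [EXPERIMENT_NAME] [DEVICE_ID] [CHECKPOINT_PATH]
+  ```
+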
You can view the results in the output directory, which contains a batch of result sample images.
### Model Export
@@ -175,22 +210,22 @@ python export.py --ckpt_path [CHECKPOINT_PATH] --platform [PLATFORM] --file_form
#### Evaluation Performance
-| Parameters | Ascend |
-| -------------------------- | ----------------------------------------------------------- |
-| Model Version | V1 |
-| Resource | Ascend 910; CPU 2.60GHz, 192cores; Memory, 755G |
-| uploaded Date | 05/07/2021 (month/day/year) |
-| MindSpore Version | 1.2.0 |
-| Dataset | CelebA |
-| Training Parameters | epoch=100, batch_size = 128 |
-| Optimizer | Adam |
-| Loss Function | Loss |
-| Output | predict class |
-| Loss | 6.5523 |
-| Speed | 1pc: 400 ms/step; 8pcs: 143 ms/step |
-| Total time | 1pc: 41:36:07 |
-| Checkpoint for Fine tuning | 170.55M(.ckpt file) |
-| Scripts | [STGAN script](https://gitee.com/mindspore/models/tree/master/research/cv/STGAN) |
+| Parameters | Ascend | GPU |
+| -------------------------- | ----------------------------------------------------------- | --- |
+| Model Version | V1 | V1 |
+| Resource | Ascend 910; CPU 2.60GHz, 192 cores; Memory, 755G | RTX-3090 |
+| uploaded Date | 05/07/2021 (month/day/year) | 11/23/2021 (month/day/year) |
+| MindSpore Version | 1.2.0 | 1.5.0rc1 |
+| Dataset | CelebA | CelebA |
+| Training Parameters | epoch=100, batch_size = 128 | epoch=100, batch_size=64 |
+| Optimizer | Adam | Adam |
+| Loss Function | Loss | Loss |
+| Output | predict class | image |
+| Loss | 6.5523 | 31.23 |
+| Speed | 1pc: 400 ms/step; 8pcs: 143 ms/step | 1pc: 369 ms/step; 8pcs: 68 ms/step |
+| Total time | 1pc: 41:36:07 | 1pc: 29:15:09 |
+| Checkpoint for Fine tuning | 170.55M (.ckpt file) | 283.76M (.ckpt file) |
+| Scripts | [STGAN script](https://gitee.com/mindspore/models/tree/master/research/cv/STGAN) | [STGAN script](https://gitee.com/mindspore/models/tree/master/research/cv/STGAN) |
## [Model Description](#contents)
diff --git a/research/cv/STGAN/scripts/run_distribute_train_gpu.sh b/research/cv/STGAN/scripts/run_distribute_train_gpu.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ef2220d2e9a3646273b5b67839ed7be5f99dc633
--- /dev/null
+++ b/research/cv/STGAN/scripts/run_distribute_train_gpu.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+if [ $# != 2 ]
+then
+ echo "Usage: sh run_distribute_train_gpu.sh [EXPERIMENT_NAME] [DATA_PATH]"
+exit 1
+fi
+
+export DEVICE_NUM=8
+export RANK_SIZE=8
+
+rm -rf ./train_parallel
+mkdir ./train_parallel
+cp ./*.py ./train_parallel
+cp -r ./src ./train_parallel
+cp -r ./scripts ./train_parallel
+cd ./train_parallel || exit
+
+export EXPERIMENT_NAME=$1
+export DATA_PATH=$2
+
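+# launch $RANK_SIZE training processes with mpirun; each worker obtains its
+# rank via get_rank() in src/utils/args.py, and only rank 0 writes checkpoints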
+mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
+nohup python train.py \
+ --dataroot=$DATA_PATH \
+ --experiment_name=$EXPERIMENT_NAME \
+ --device_num ${DEVICE_NUM} \
+ --platform="GPU" > log 2>&1 &
+cd ..
diff --git a/research/cv/STGAN/scripts/run_eval_gpu.sh b/research/cv/STGAN/scripts/run_eval_gpu.sh
new file mode 100644
index 0000000000000000000000000000000000000000..d295a1232b0f28071969c346f12c0c30fd722477
--- /dev/null
+++ b/research/cv/STGAN/scripts/run_eval_gpu.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+if [ $# != 4 ]
+then
+ echo "Usage: sh run_eval_gpu.sh [DATA_PATH] [EXPERIMENT_NAME] [DEVICE_ID] [CHECKPOINT_PATH]"
+exit 1
+fi
+
+export DATA_PATH=$1
+export EXPERIMENT_NAME=$2
+export DEVICE_ID=$3
+export CHECKPOINT_PATH=$4
+
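+# run evaluation in the background on the selected GPU; output goes to ./eval_log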
+python eval.py --dataroot=$DATA_PATH --experiment_name=$EXPERIMENT_NAME \
+ --device_id=$DEVICE_ID --ckpt_path=$CHECKPOINT_PATH \
+ --platform="GPU" > eval_log 2>&1 &
diff --git a/research/cv/STGAN/scripts/run_standalone_train_gpu.sh b/research/cv/STGAN/scripts/run_standalone_train_gpu.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c2836e951aa00be68c93e1ff0767bb4df3cdf373
--- /dev/null
+++ b/research/cv/STGAN/scripts/run_standalone_train_gpu.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+if [ $# != 3 ]
+then
+ echo "Usage: sh run_standalone_train_gpu.sh [DATA_PATH] [EXPERIMENT_NAME] [DEVICE_ID]"
+exit 1
+fi
+
+export DATA_PATH=$1
+export EXPERIMENT_NAME=$2
+export DEVICE_ID=$3
+
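+# start single-GPU training in the background; output is redirected to ./log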
+python train.py --dataroot=$DATA_PATH --experiment_name=$EXPERIMENT_NAME \
+ --device_id=$DEVICE_ID --platform="GPU" > log 2>&1 &
diff --git a/research/cv/STGAN/src/models/base_model.py b/research/cv/STGAN/src/models/base_model.py
index 763586eacb31637e015751db761d459bec842be6..817f111bba94659879eed3c14bbb3a25588d7b05 100644
--- a/research/cv/STGAN/src/models/base_model.py
+++ b/research/cv/STGAN/src/models/base_model.py
@@ -48,14 +48,15 @@ class BaseModel(ABC):
), 'Checkpoint path not found at %s' % self.save_dir
self.current_iteration = self.args.continue_iter
else:
- if not os.path.exists(self.save_dir):
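+            # in distributed runs, only rank 0 touches the file system to avoid
+            # races when several workers try to create the same directory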
+ if not os.path.exists(self.save_dir) and self.args.rank == 0:
mkdirs(self.save_dir)
+
# save config
self.config_save_path = os.path.join(self.save_dir, 'config')
- if not os.path.exists(self.config_save_path):
+ if not os.path.exists(self.config_save_path) and self.args.rank == 0:
mkdirs(self.config_save_path)
- if self.isTrain:
+ if self.isTrain and self.args.rank == 0:
with open(os.path.join(self.config_save_path, 'train.conf'),
'w') as f:
f.write(json.dumps(vars(self.args)))
@@ -67,7 +68,7 @@ class BaseModel(ABC):
# sample save path
if self.isTrain:
self.sample_save_path = os.path.join(self.save_dir, 'sample')
- if not os.path.exists(self.sample_save_path):
+ if not os.path.exists(self.sample_save_path) and self.args.rank == 0:
mkdirs(self.sample_save_path)
# test result save path
@@ -79,7 +80,7 @@ class BaseModel(ABC):
# train log save path
if self.isTrain:
self.train_log_path = os.path.join(self.save_dir, 'logs')
- if not os.path.exists(self.train_log_path):
+ if not os.path.exists(self.train_log_path) and self.args.rank == 0:
mkdirs(self.train_log_path)
@abstractmethod
@@ -109,7 +110,7 @@ class BaseModel(ABC):
def save_networks(self):
""" saving networks """
for name in self.model_names:
- if isinstance(name, str):
+ if isinstance(name, str) and self.args.rank == 0:
save_filename = '%s_%s.ckpt' % (self.current_iteration, name)
save_filename_latest = 'latest_%s.ckpt' % name
save_path = os.path.join(self.save_dir, 'ckpt')
diff --git a/research/cv/STGAN/src/utils/args.py b/research/cv/STGAN/src/utils/args.py
index d1d8c56a2e2324c72926e49d940fecc404e1cf35..9e0bfedd4bbe75e91318889cccb02687abccb609 100644
--- a/research/cv/STGAN/src/utils/args.py
+++ b/research/cv/STGAN/src/utils/args.py
@@ -19,7 +19,7 @@ import ast
import datetime
from mindspore.context import ParallelMode
from mindspore import context
-from mindspore.communication.management import init
+from mindspore.communication.management import init, get_rank
def add_basic_parameters(parser):
""" add basic parameters """
@@ -266,7 +266,7 @@ def get_args(phase):
assert args.experiment_name != default_experiment_name, "--experiment_name should be assigned in test mode"
if args.continue_train:
assert args.experiment_name != default_experiment_name, "--experiment_name should be assigned in continue"
- if args.device_num > 1 and args.platform != "CPU":
+ if args.device_num > 1 and args.platform == "Ascend":
context.set_context(mode=context.GRAPH_MODE,
device_target=args.platform,
save_graphs=args.save_graphs,
@@ -278,6 +278,12 @@ def get_args(phase):
device_num=args.device_num)
init()
args.rank = int(os.environ["DEVICE_ID"])
+ elif args.device_num > 1 and args.platform == "GPU":
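+        # on GPU the workers are spawned by mpirun, so collective communication
+        # is initialized first and the rank is queried from it (on Ascend the
+        # rank comes from the DEVICE_ID environment variable instead)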
+ init()
+ context.reset_auto_parallel_context()
+ args.rank = get_rank()
+ context.set_auto_parallel_context(device_num=args.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
+ gradients_mean=True)
else:
context.set_context(mode=context.GRAPH_MODE,
device_target=args.platform,
diff --git a/research/cv/STGAN/train.py b/research/cv/STGAN/train.py
index 99fb250505a5eaa756a852ceb73b5a515b7620a6..f93a2f0a138dcbdfd947d306843fb40817f5d0b2 100644
--- a/research/cv/STGAN/train.py
+++ b/research/cv/STGAN/train.py
@@ -13,6 +13,7 @@
# limitations under the License.
# ============================================================================
""" STGAN TRAIN"""
+import time
import tqdm
from mindspore.common import set_seed
@@ -44,8 +45,9 @@ def train():
model = STGANModel(args)
it_count = 0
- for _ in tqdm.trange(args.n_epochs, desc='Epoch Loop'):
- for _ in tqdm.trange(iter_per_epoch, desc='Inner Epoch Loop'):
+ for _ in tqdm.trange(args.n_epochs, desc='Epoch Loop', unit='epoch'):
+ start_epoch_time = time.time()
+ for _ in tqdm.trange(iter_per_epoch, desc='Step Loop', unit='step'):
if model.current_iteration > it_count:
it_count += 1
continue
@@ -56,11 +58,11 @@ def train():
model.optimize_parameters()
# saving model
- if (it_count + 1) % args.save_freq == 0:
+ if (it_count + 1) % args.save_freq == 0 and args.rank == 0:
model.save_networks()
# sampling
- if (it_count + 1) % args.sample_freq == 0:
+ if (it_count + 1) % args.sample_freq == 0 and args.rank == 0:
model.eval(data_loader)
except KeyboardInterrupt:
@@ -69,7 +71,9 @@ def train():
it_count += 1
model.current_iteration = it_count
-
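+        # rank 0 records the average per-step time of the epoch that just finished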
+ if args.rank == 0:
+ with open('performance.log', "a") as f:
+ f.write('average speed: {}ms/step\n'.format((time.time() - start_epoch_time)*1000/iter_per_epoch))
model.save_networks()
print('\n\n=============== finish training ===============\n\n')