diff --git a/official/cv/resnext/README_CN.md b/official/cv/resnext/README_CN.md index 65bc48c3a32ab0aa5d5aecd0077ba31376c27eaf..ce817653d4cf12d1b915dd4c9966a79a5183856b 100644 --- a/official/cv/resnext/README_CN.md +++ b/official/cv/resnext/README_CN.md @@ -203,6 +203,11 @@ GPU: bash run_distribute_train_for_gpu.sh DATA_PATH # 单机训练 bash run_standalone_train_for_gpu.sh DEVICE_ID DATA_PATH +GPU_ResNext101: + # 分布式训练示例（8卡） + bash scripts/run_distribute_train_for_gpu_resnext101.sh DATA_PATH CONFIG_PATH + # 单机训练 + bash scripts/run_standalone_train_for_gpu_resnext101.sh DEVICE_ID DATA_PATH CONFIG_PATH ``` ## 迁移训练过程 @@ -232,6 +237,8 @@ python eval.py --data_path ~/imagenet/val/ --device_target Ascend --checkpoint_f 或通过shell脚本开始训练： ```shell +# GPU_ResNext101评估 +bash scripts/run_eval_for_gpu_resnext101.sh DEVICE_ID DATA_PATH CHECKPOINT_FILE_PATH CONFIG_PATH # 评估 bash scripts/run_eval.sh DEVICE_ID DATA_PATH CHECKPOINT_FILE_PATH PLATFORM ``` @@ -243,6 +250,8 @@ DEVICE_TARGET is Ascend or GPU, default is Ascend. 
```shell # 检查点评估 bash scripts/run_eval.sh 0 /opt/npu/datasets/classification/val /ResNeXt_100.ckpt Ascend +# GPU_ResNext101评估 +bash scripts/run_eval_for_gpu_resnext101.sh 0 ~/ImageNet2012/val ~/outputs_demo/best_acc.ckpt ./resnext101_config.yaml ``` #### 结果 @@ -348,6 +357,20 @@ Total data:50000, top1 accuracy:0.79858, top5 accuracy:0.94716 | 总时长 | 7.8小时 （8卡） | 21.5小时 （8卡） | | 调优检查点 | 192 M（.ckpt文件） | 192 M（.ckpt文件） | +| 参数 | ResNeXt101 | +| ------------- | ------------------------------------------------- | +| 资源 | GeForce RTX 3090；CPU：3.50GHz，64核；内存：264GB | +| 上传日期 | 2022-6-30 | +| MindSpore版本 | 1.7.0 | +| 数据集 | ImageNet | +| 训练参数 | src/config.py | +| 优化器 | Momentum | +| 损失函数 | Softmax交叉熵 | +| 损失 | 1.2427 | +| 准确率 | 78.04%(TOP1) | +| 总时长 | 50小时 （8卡） | +| 调优检查点 | 638 M（.ckpt文件） | + #### 推理性能 | 参数 |ResNeXt50 | | | @@ -360,15 +383,15 @@ Total data:50000, top1 accuracy:0.79858, top5 accuracy:0.94716 | 输出 | 概率 | 概率 | 概率 | | 准确率 | acc=78.16%(TOP1) | acc=78.05%(TOP1) | | -| 参数 | ResNeXt101 | -| ------------------- | --------------------------- | -| 资源 | Ascend 310; OS Euler2.8 | -| 上传日期 | 06/22/2021 (month/day/year) | -| MindSpore版本 | 1.2.0 | -| 数据集 | ImageNet | -| batch_size | 1 | -| 输出 | 概率 | -| 准确率 | TOP1: 79.85%, TOP5: 94.71% | +| 参数 | ResNeXt101 | | +| ------------------- | --------------------------- | --------------------------- | +| 资源 | Ascend 310; OS Euler2.8 | GeForce RTX 3090 | +| 上传日期 | 06/22/2021 (month/day/year) | 2022-6-30 | +| MindSpore版本 | 1.2.0 | 1.7.0 | +| 数据集 | ImageNet | ImageNet | +| batch_size | 1 | 1 | +| 输出 | 概率 | 概率 | +| 准确率 | TOP1: 79.85%, TOP5: 94.71% | 78.04%(TOP1) | # 随机情况说明 diff --git a/official/cv/resnext/resnext101_config.yaml b/official/cv/resnext/resnext101_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9b5ed9069c6817fe171391fead01243686e627e6 --- /dev/null +++ b/official/cv/resnext/resnext101_config.yaml @@ -0,0 
+1,79 @@ +# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing) +enable_modelarts: False +network: "resnext101" +# Url for modelarts +data_url: "" +train_url: "" +checkpoint_url: "" +# Path for local +run_distribute: False +enable_profiling: False +data_path: "/cache/data" +eval_data_path: "/cache/data" +output_path: "" +load_path: "" +device_target: "GPU" +checkpoint_path: "./checkpoint/" +checkpoint_file_path: "" +run_eval: True +eval_per_batch_size: 64 +eval_interval: 1 +eval_start_epoch: 120 +save_best_ckpt: True +ckpt_path: 'outputs_demo/' +# ============================================================================== +# Training options +image_size: [224,224] +num_classes: 1000 +batch_size: 1 + +lr: 0.05 +lr_scheduler: "cosine_annealing" +lr_epochs: [30,60,90,120] +lr_gamma: 0.1 +eta_min: 0 +T_max: 150 +max_epoch: 150 +warmup_epochs: 1 + +weight_decay: 0.0001 +momentum: 0.9 +is_dynamic_loss_scale: 0 +loss_scale: 1024 +label_smooth: 1 +label_smooth_factor: 0.1 +per_batch_size: 96 + +ckpt_interval: 5 +ckpt_save_max: 5 +is_save_on_master: 1 +rank_save_ckpt_flag: 0 +outputs_dir: "" +log_path: "./output_log" + +# Export options +device_id: 0 +width: 224 +height: 224 +file_name: "resnext101" +file_format: 'MINDIR' +result_path: "" +label_path: "" + +--- +# Help description for each configuration +enable_modelarts: "Whether training on modelarts, default: False" +data_url: "Dataset url for obs" +train_url: "Training output url for obs" +checkpoint_url: "The location of checkpoint for obs" +data_path: "Dataset path for local" +output_path: "Training output path for local" +load_path: "The location of checkpoint for obs" +device_target: "Target device type, available: [Ascend, GPU, CPU]" +enable_profiling: "Whether enable profiling while training, default: False" +num_classes: "Class for dataset" +batch_size: "Batch size for training and evaluation" +epoch_size: "Total training epochs." 
+keep_checkpoint_max: "keep the last keep_checkpoint_max checkpoint" +checkpoint_path: "The location of the checkpoint file." +checkpoint_file_path: "The location of the checkpoint file." diff --git a/official/cv/resnext/scripts/run_distribute_train_for_gpu_resnext101.sh b/official/cv/resnext/scripts/run_distribute_train_for_gpu_resnext101.sh new file mode 100755 index 0000000000000000000000000000000000000000..c3ce9d1e1d4c0e20847223b96f74c63776971606 --- /dev/null +++ b/official/cv/resnext/scripts/run_distribute_train_for_gpu_resnext101.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +if [ $# != 2 ] +then + echo "Usage: bash run_distribute_train_for_gpu_resnext101.sh [DATA_PATH] [CONFIG_PATH]." + exit 1 +fi + +get_real_path(){ + if [ "${1:0:1}" == "/" ]; then + echo "$1" + else + echo "$(realpath -m $PWD/$1)" + fi +} + +if [ ! -d "$(get_real_path $1)" ] +then + echo "error: DATA_PATH=$1 is not a directory" + echo "Usage: bash run_distribute_train_for_gpu_resnext101.sh [DATA_PATH] [CONFIG_PATH]." +exit 1 +fi + +if [ ! -f "$(get_real_path $2)" ] +then + echo "error: CONFIG_PATH=$2 is not a file" + echo "Usage: bash run_distribute_train_for_gpu_resnext101.sh [DATA_PATH] [CONFIG_PATH]." 
+exit 1
+fi
+
+DATA_PATH=$(get_real_path $1)
+CONFIG_PATH=$(get_real_path $2)
+
+TRAIN_DATA_PATH="${DATA_PATH}/train"
+VALID_DATA_PATH="${DATA_PATH}/val"
+
+rm -rf logs
+mkdir logs
+cp -r ./train.py ./resnext101_config.yaml ./src ./logs
+cd ./logs
+echo "start training"
+mpirun --allow-run-as-root -n 8 python ./train.py --data_path=${TRAIN_DATA_PATH} --eval_data_path=${VALID_DATA_PATH} --run_distribute=True --config_path=${CONFIG_PATH} > train.log 2>&1 &
diff --git a/official/cv/resnext/scripts/run_eval_for_gpu_resnext101.sh b/official/cv/resnext/scripts/run_eval_for_gpu_resnext101.sh
new file mode 100755
index 0000000000000000000000000000000000000000..424f1c42af5b43e9b597de3042ac055d51988091
--- /dev/null
+++ b/official/cv/resnext/scripts/run_eval_for_gpu_resnext101.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+# Evaluate a checkpoint on one GPU: starts eval.py in the background and writes its output to eval.log.
+# eval.py is addressed relative to the current directory, so launch this from the directory containing it.
+if [ $# != 4 ]
+then
+    echo "Usage: bash run_eval_for_gpu_resnext101.sh [DEVICE_ID] [EVAL_DATA_DIR] [CHECKPOINTPATH] [CONFIG_PATH]."
+    exit 1
+fi
+
+# Turn a relative path into an absolute one based on $PWD; absolute paths pass through unchanged.
+# Defined before first use: without it every check below sees an empty path and the script always exits 1.
+get_real_path(){
+  if [ "${1:0:1}" == "/" ]; then
+    echo "$1"
+  else
+    echo "$(realpath -m $PWD/$1)"
+  fi
+}
+
+if [ ! -d "$(get_real_path $2)" ]
+then
+    echo "error: EVAL_DATA_DIR=$2 is not a directory"
+    echo "Usage: bash run_eval_for_gpu_resnext101.sh [DEVICE_ID] [EVAL_DATA_DIR] [CHECKPOINTPATH] [CONFIG_PATH]."
+    exit 1
+fi
+
+if [ ! -f "$(get_real_path $3)" ]
+then
+    echo "error: CHECKPOINTPATH=$3 is not a file"
+    echo "Usage: bash run_eval_for_gpu_resnext101.sh [DEVICE_ID] [EVAL_DATA_DIR] [CHECKPOINTPATH] [CONFIG_PATH]."
+    exit 1
+fi
+
+if [ ! -f "$(get_real_path $4)" ]
+then
+    echo "error: CONFIG_PATH=$4 is not a file"
+    echo "Usage: bash run_eval_for_gpu_resnext101.sh [DEVICE_ID] [EVAL_DATA_DIR] [CHECKPOINTPATH] [CONFIG_PATH]."
+    exit 1
+fi
+
+export CUDA_VISIBLE_DEVICES=$1
+DATA_DIR=$(get_real_path $2)
+PATH_CHECKPOINT=$(get_real_path $3)
+CONFIG_PATH=$(get_real_path $4)
+python eval.py \
+    --checkpoint_file_path=$PATH_CHECKPOINT \
+    --data_path=$DATA_DIR \
+    --config_path=$CONFIG_PATH > eval.log 2>&1 &
+
diff --git a/official/cv/resnext/scripts/run_standalone_train_for_gpu_resnext101.sh b/official/cv/resnext/scripts/run_standalone_train_for_gpu_resnext101.sh
new file mode 100755
index 0000000000000000000000000000000000000000..fab81e872c478e2a6689f1327c02f730741ffb6f
--- /dev/null
+++ b/official/cv/resnext/scripts/run_standalone_train_for_gpu_resnext101.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+if [ $# != 3 ]
+then
+    echo "Usage: bash run_standalone_train_for_gpu_resnext101.sh [DEVICE_ID] [DATA_PATH] [CONFIG_PATH]."
+ exit 1 +fi + +get_real_path(){ + if [ "${1:0:1}" == "/" ]; then + echo "$1" + else + echo "$(realpath -m $PWD/$1)" + fi +} + +if [ ! -d "$(get_real_path $2)" ] +then + echo "error: DATA_PATH=$2 is not a directory" + echo "Usage: bash run_standalone_train_for_gpu_resnext101.sh [DEVICE_ID] [DATA_PATH] [CONFIG_PATH]." +exit 1 +fi + +if [ ! -f "$(get_real_path $3)" ] +then + echo "error: CONFIG_PATH=$3 is not a file" + echo "Usage: bash run_standalone_train_for_gpu_resnext101.sh [DEVICE_ID] [DATA_PATH] [CONFIG_PATH]." +exit 1 +fi + +export CUDA_VISIBLE_DEVICES=$1 +DATA_DIR=$(get_real_path $2) +CONFIG_PATH=$(get_real_path $3) +TRAIN_DATA_PATH="${DATA_DIR}/train" +EVAL_DATA_PATH="${DATA_DIR}/val" + +python train.py \ + --data_path=${TRAIN_DATA_PATH} \ + --eval_data_path=${EVAL_DATA_PATH} \ + --config_path=${CONFIG_PATH} > train.log 2>&1 & diff --git a/official/cv/resnext/src/backbone/resnet.py b/official/cv/resnext/src/backbone/resnet.py index cbb6c544ff0cb775c49fe23a9a731a3bd0e5ca2e..be76782ffe7b4fa40bd15991990d725e1187595d 100644 --- a/official/cv/resnext/src/backbone/resnet.py +++ b/official/cv/resnext/src/backbone/resnet.py @@ -1,4 +1,4 @@ -# Copyright 2020 Huawei Technologies Co., Ltd +# Copyright 2020-2022 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -279,4 +279,7 @@ def resnext50(platform="Ascend"):
     return ResNet(Bottleneck, [3, 4, 6, 3], width_per_group=4, groups=32, platform=platform)
 
 def resnext101(platform="Ascend"):
-    return ResNet(Bottleneck, [3, 4, 23, 3], width_per_group=4, groups=32, platform=platform)
+    """Build ResNeXt-101 with 32 groups on Ascend and 64 groups on any other platform."""
+    # NOTE(review): the group count differs per platform, so checkpoints are presumably not interchangeable - confirm.
+    groups = 32 if platform == "Ascend" else 64
+    return ResNet(Bottleneck, [3, 4, 23, 3], width_per_group=4, groups=groups, platform=platform)
diff --git a/official/cv/resnext/src/eval_callback.py b/official/cv/resnext/src/eval_callback.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1db389440e57f76515fdccde38382750c52cdea
--- /dev/null
+++ b/official/cv/resnext/src/eval_callback.py
@@ -0,0 +1,158 @@
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""Evaluation callback when training"""
+
+import os
+import stat
+import time
+from mindspore import save_checkpoint
+from mindspore import log as logger
+from mindspore.train.callback import Callback
+
+
+class EvalCallBack(Callback):
+    """
+    Evaluation callback when training.
+
+    Args:
+        eval_function (function): evaluation function, called with `eval_param_dict`; its result is the metric value.
+        eval_param_dict (dict): evaluation parameters' configure dict.
+        interval (int): run evaluation every `interval` epochs, must be >= 1, default is 1.
+        eval_start_epoch (int): evaluation start epoch, default is 1.
+        save_best_ckpt (bool): Whether to save best checkpoint, default is True.
+        best_ckpt_name (str): best checkpoint file name, created under `ckpt_directory` (default `./`); default is `best.ckpt`.
+        metrics_name (str): evaluation metrics name, default is `acc`.
+
+    Returns:
+        None
+
+    Examples:
+        >>> EvalCallBack(eval_function, eval_param_dict)
+    """
+
+    def __init__(self, eval_function, eval_param_dict, interval=1, eval_start_epoch=1, save_best_ckpt=True,
+                 ckpt_directory="./", best_ckpt_name="best.ckpt", metrics_name="acc"):
+        super(EvalCallBack, self).__init__()
+        self.eval_param_dict = eval_param_dict
+        self.eval_function = eval_function
+        self.eval_start_epoch = eval_start_epoch
+        if interval < 1:
+            raise ValueError("interval should >= 1.")
+        self.interval = interval
+        self.save_best_ckpt = save_best_ckpt
+        self.best_res = 0
+        self.best_epoch = 0
+        if not os.path.isdir(ckpt_directory):
+            os.makedirs(ckpt_directory)
+        self.bast_ckpt_path = os.path.join(ckpt_directory, best_ckpt_name)  # attribute spelling kept as-is
+        self.metrics_name = metrics_name
+
+    def remove_ckpoint_file(self, file_name):
+        """Make `file_name` writable and delete it; a failure is logged as a warning instead of being raised."""
+        try:
+            os.chmod(file_name, stat.S_IWRITE)
+            os.remove(file_name)
+        except OSError:
+            logger.warning("OSError, failed to remove the older ckpt file %s.", file_name)
+        except ValueError:
+            logger.warning("ValueError, failed to remove the older ckpt file %s.", file_name)
+
+    def epoch_end(self, run_context):
+        """Evaluate on scheduled epochs; on a result >= the best so far, record it and optionally save the checkpoint."""
+        cb_params = run_context.original_args()
+        cur_epoch = cb_params.cur_epoch_num
+        if cur_epoch >= self.eval_start_epoch and (cur_epoch - self.eval_start_epoch) % self.interval == 0:
+            eval_start = time.time()
+            res = self.eval_function(self.eval_param_dict)
+            eval_cost = time.time() - eval_start
+            print("epoch: {}, {}: {}, eval_cost:{:.2f}".format(cur_epoch, self.metrics_name, res, eval_cost),
+                  flush=True)
+            if res >= self.best_res:
+                self.best_res = res
+                self.best_epoch = cur_epoch
+                print("update best result: {}".format(res), flush=True)
+                if self.save_best_ckpt:
+                    if os.path.exists(self.bast_ckpt_path):
+                        self.remove_ckpoint_file(self.bast_ckpt_path)
+                    save_checkpoint(cb_params.train_network, self.bast_ckpt_path)
+                    print("update best checkpoint at: {}".format(self.bast_ckpt_path), flush=True)
+
+    def end(self, run_context):
+        print("End training, the best {0} is: {1}, the best {0} epoch is {2}".format(self.metrics_name,
+                                                                                     self.best_res,
+                                                                                     self.best_epoch), flush=True)
+
+class ProgressMonitor(Callback):
+    """Log per-epoch loss and mean throughput (imgs/sec) through `args.logger`, and list newly found checkpoints."""
+    def __init__(self, args):
+        super(ProgressMonitor, self).__init__()
+        self.me_epoch_start_time = 0
+        self.me_epoch_start_step_num = 0
+        self.args = args
+        self.ckpt_history = []
+
+    def begin(self, run_context):
+        self.args.logger.info('start network train...')
+
+    def epoch_begin(self, run_context):
+        self.me_epoch_start_time = time.time()  # otherwise the first epoch is timed from the initial 0
+
+    def epoch_end(self, run_context, *me_args):
+        """Log loss and mean fps for the finished epoch, then report this rank's checkpoints not seen before."""
+        cb_params = run_context.original_args()
+        me_step = cb_params.cur_step_num - 1
+
+        real_epoch = me_step // self.args.steps_per_epoch
+        time_used = time.time() - self.me_epoch_start_time
+        fps_mean = (self.args.per_batch_size * (me_step-self.me_epoch_start_step_num))
+        fps_mean = fps_mean * self.args.group_size
+        fps_mean = fps_mean / time_used
+        self.args.logger.info('epoch[{}], iter[{}], loss:{}, '
+                              'mean_fps:{:.2f}'
+                              'imgs/sec'.format(real_epoch,
+                                                me_step,
+                                                cb_params.net_outputs,
+                                                fps_mean))
+
+        if self.args.rank_save_ckpt_flag:
+            import glob
+            ckpts = glob.glob(os.path.join(self.args.outputs_dir, '*.ckpt'))
+            for ckpt in ckpts:
+                ckpt_fn = os.path.basename(ckpt)
+                if not ckpt_fn.startswith('{}-'.format(self.args.rank)):
+                    continue
+                if ckpt in self.ckpt_history:
+                    continue
+                self.ckpt_history.append(ckpt)
+                self.args.logger.info('epoch[{}], iter[{}], loss:{}, '
+                                      'ckpt:{},'
+                                      'ckpt_fn:{}'.format(real_epoch,
+                                                          me_step,
+                                                          cb_params.net_outputs,
+                                                          ckpt,
+                                                          ckpt_fn))
+
+
+        self.me_epoch_start_step_num = me_step
+        self.me_epoch_start_time = time.time()
+
+    def step_begin(self, run_context):
+        pass
+
+    def step_end(self, run_context, *me_args):
+        pass
+
+    def end(self, run_context):
+        self.args.logger.info('end network train...')
diff --git a/official/cv/resnext/train.py b/official/cv/resnext/train.py
index cb0704f108d74b6e25acb6ffc808d26a5970b2c8..bf8645064632a9dd026501c84f65fcde092d6481 100644
--- a/official/cv/resnext/train.py
+++ b/official/cv/resnext/train.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
+# Copyright 2020-2022 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -35,6 +35,7 @@ from src.utils.var_init import load_pretrain_model from src.image_classification import get_network from src.model_utils.config import config from src.model_utils.moxing_adapter import moxing_wrapper +from src.eval_callback import EvalCallBack, ProgressMonitor set_seed(1) @@ -84,13 +85,23 @@ def set_graph_kernel_context(device_target): if device_target == "GPU": context.set_context(enable_graph_kernel=True) +def apply_eval(eval_param): + eval_model = eval_param["model"] + eval_ds = eval_param["dataset"] + metrics_name = eval_param["metrics_name"] + res = eval_model.eval(eval_ds) + return res[metrics_name] + @moxing_wrapper() def train(): """training process""" set_parameters() - if os.getenv('DEVICE_ID', "not_set").isdigit(): - context.set_context(device_id=int(os.getenv('DEVICE_ID'))) - set_graph_kernel_context(config.device_target) + if config.device_target == "Ascend": + if os.getenv('DEVICE_ID', "not_set").isdigit(): + context.set_context(device_id=int(os.getenv('DEVICE_ID'))) + elif config.device_target == "GPU": + if os.getenv('CUDA_VISIBLE_DEVICES', "not_set").isdigit(): + context.set_context(device_id=0) # init distributed if config.run_distribute: @@ -101,6 +112,7 @@ def train(): de_dataset = classification_dataset(config.data_path, config.image_size, config.per_batch_size, 1, config.rank, config.group_size, num_parallel_workers=8) + de_dataset.map_model = 4 # !!!important config.steps_per_epoch = de_dataset.get_dataset_size() config.logger.save_args(config) @@ -133,11 +145,37 @@ def train(): else: loss_scale_manager = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) - model = Model(network, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale_manager, - metrics={'acc'}, amp_level="O3") - - # checkpoint save - callbacks = [TimeMonitor(data_size=config.steps_per_epoch), LossMonitor()] + if config.network == "resnext101": + model = Model(network, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale_manager, + 
metrics={'acc'}, amp_level="O2") + # checkpoint save + callbacks = [ProgressMonitor(config), LossMonitor()] + else: + model = Model(network, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale_manager, + metrics={'acc'}, amp_level="O3") + callbacks = [TimeMonitor(data_size=config.steps_per_epoch), LossMonitor()] + + if config.run_eval and config.rank_save_ckpt_flag: + if config.eval_data_path is None or (not os.path.isdir(config.eval_data_path)): + raise ValueError("{} is not a existing path.".format(config.eval_data_path)) + eval_de_dataset = classification_dataset(config.eval_data_path, + image_size=config.image_size, + per_batch_size=config.eval_per_batch_size, + max_epoch=1, + rank=config.rank, + group_size=config.group_size, + mode='eval') + eval_param_dict = {"model": model, "dataset": eval_de_dataset, "metrics_name": "acc"} + eval_callback = EvalCallBack(apply_eval, + eval_param_dict, + interval=config.eval_interval, + eval_start_epoch=config.eval_start_epoch, + save_best_ckpt=config.save_best_ckpt, + ckpt_directory=config.ckpt_path, + best_ckpt_name="best_acc.ckpt", + metrics_name="acc" + ) + callbacks.append(eval_callback) if config.rank_save_ckpt_flag: ckpt_config = CheckpointConfig(save_checkpoint_steps=config.ckpt_interval * config.steps_per_epoch, keep_checkpoint_max=config.ckpt_save_max) @@ -146,7 +184,6 @@ def train(): directory=save_ckpt_path, prefix='{}'.format(config.rank)) callbacks.append(ckpt_cb) - model.train(config.max_epoch, de_dataset, callbacks=callbacks, dataset_sink_mode=True)