diff --git a/research/cv/centernet_resnet101/README.md b/research/cv/centernet_resnet101/README.md index d9be8cbc7dfbc0ea7f3a17637b5e0fc434344d55..71d97c5e7cedbffc0c3012cc6aa954a722056547 100644 --- a/research/cv/centernet_resnet101/README.md +++ b/research/cv/centernet_resnet101/README.md @@ -96,9 +96,7 @@ Dataset used: [COCO2017](https://cocodataset.org/) ```pip pip install Cython - pip install pycocotools - pip install mmcv==0.2.14 ``` @@ -112,7 +110,6 @@ Dataset used: [COCO2017](https://cocodataset.org/) └─instance_val2017.json ├─val2017 └─train2017 - ``` 2. If your own dataset is used. **Select dataset to other when run script.** @@ -134,15 +131,18 @@ Dataset used: [COCO2017](https://cocodataset.org/) ```shell # create dataset in mindrecord format bash scripts/convert_dataset_to_mindrecord.sh [COCO_DATASET_DIR] [MINDRECORD_DATASET_DIR] - # standalone training on Ascend bash scripts/run_standalone_train_ascend.sh [DEVICE_ID] [MINDRECORD_DATASET_PATH] [LOAD_CHECKPOINT_PATH](optional) - + # standalone training on GPU + bash scripts/run_standalone_train_gpu.sh [DEVICE_ID] [MINDRECORD_DATASET_PATH] [LOAD_CHECKPOINT_PATH](optional) # distributed training on Ascend bash scripts/run_distributed_train_ascend.sh [MINDRECORD_DATASET_PATH] [RANK_TABLE_FILE] [LOAD_CHECKPOINT_PATH](optional) - + # distributed training on GPU + bash scripts/run_distributed_train_gpu.sh [MINDRECORD_DATASET_PATH] [RANK_SIZE] [LOAD_CHECKPOINT_PATH](optional) # eval on Ascend bash scripts/run_standalone_eval_ascend.sh [DEVICE_ID] [RUN_MODE] [DATA_DIR] [LOAD_CHECKPOINT_PATH] + # eval on GPU + bash scripts/run_standalone_eval_gpu.sh [DEVICE_ID] [RUN_MODE] [DATA_DIR] [LOAD_CHECKPOINT_PATH] ``` - running on ModelArts @@ -217,7 +217,7 @@ Dataset used: [COCO2017](https://cocodataset.org/) # (9) Set the "Output file path" and "Job log path" to your path on the website UI interface. # (10) Under the item "resource pool selection", select the specification of a single card. # (11) Create your job. 
- ``` + ``` # [Script Description](#contents) @@ -243,9 +243,12 @@ Dataset used: [COCO2017](https://cocodataset.org/) │ │ ├── README.md │ ├── convert_dataset_to_mindrecord.sh // shell script for converting coco type dataset to mindrecord │ ├── run_standalone_train_ascend.sh // shell script for standalone training on ascend + │ ├── run_standalone_train_gpu.sh // shell script for standalone training on gpu │ ├── run_infer_310.sh // shell script for 310 inference on ascend │ ├── run_distributed_train_ascend.sh // shell script for distributed training on ascend + │ ├── run_distributed_train_gpu.sh // shell script for distributed training on gpu │ ├── run_standalone_eval_ascend.sh // shell script for standalone evaluation on ascend + │ ├── run_standalone_eval_gpu.sh // shell script for standalone evaluation on gpu └── src ├── model_utils │ ├── config.py // parsing parameter configuration file of "*.yaml" @@ -433,6 +436,12 @@ The command above will run in the background, after converting mindrecord files bash scripts/run_distributed_train_ascend.sh /path/mindrecord_dataset /path/hccl.json /path/load_ckpt(optional) ``` +#### Running on GPU + +```shell +bash scripts/run_distributed_train_gpu.sh /path/mindrecord_dataset rank_size /path/load_ckpt(optional) +``` + The command above will run in the background, you can view training logs in LOG*/training_log.txt and LOG*/ms_log/. After training finished, you will get some checkpoint files under the LOG*/ckpt_0 folder by default. The loss value will be displayed as follows: ```text @@ -452,6 +461,8 @@ epoch time: 235430.151 ms, per step time: 514.040 ms # Evaluation base on validation dataset will be done automatically, while for test or test-dev dataset, the accuracy should be upload to the CodaLab official website(https://competitions.codalab.org). 
# On Ascend bash scripts/run_standalone_eval_ascend.sh device_id val(or test) /path/coco_dataset /path/load_ckpt +# On GPU +bash scripts/run_standalone_eval_gpu.sh device_id val(or test) /path/coco_dataset /path/load_ckpt ``` you can see the MAP result below as below: @@ -545,26 +556,26 @@ Inference result is saved in current path, you can find result like this in acc. ## [Performance](#contents) -### Training Performance On Ascend 910 - -CenterNet on 11.8K images(The annotation and data format must be the same as coco) - -| Parameters | CenterNet_ResNet101 | -| -------------------------- | ---------------------------------------------------------------| -| Resource | Ascend 910; CPU 2.60GHz, 192cores; Memory, 755G | -| uploaded Date | 16/7/2021 (month/day/year) | -| MindSpore Version | 1.2.0 | -| Dataset | COCO2017 | -| Training Parameters | 8p, epoch=330, steps=151140, batch_size = 32, lr=4.8e-4 | -| Optimizer | Adam | -| Loss Function | Focal Loss, L1 Loss, RegLoss | -| outputs | detections | -| Loss | 1.5-2.0 | -| Speed | 8p 25 img/s | -| Total time: training | 8p: 23 h | -| Total time: evaluation | keep res: test 1h, val 0.7h; fix res: test 40min, val 8min| -| Checkpoint | 591.70MB (.ckpt file) | -| Scripts | [centernet_resnet101 script](https://gitee.com/mindspore/models/tree/master/research/cv/centernet_resnet101) | +### Training Performance + +CenterNet on 118K images(The annotation and data format must be the same as coco) + +| Parameters | CenterNet_ResNet101 | CenterNet_ResNet101 | +| ---------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | +| Resource | Ascend 910; CPU 2.60GHz, 192cores; Memory, 755G | | +| uploaded Date | 16/7/2021 (day/month/year) | 22/3/2022 (day/month/year) | +| MindSpore Version | 1.2.0 | 1.5.0 | +| Dataset | COCO2017 | COCO2017 | +| Training Parameters | 8p, epoch=330, steps=151140, 
batch_size = 32, lr=4.8e-4 | +| Optimizer | Adam | Adam | +| Loss Function | Focal Loss, L1 Loss, RegLoss | Focal Loss, L1 Loss, RegLoss | +| outputs | detections | detections | +| Loss | 1.5-2.0 | 1.5-2.0 | +| Speed | 8p 25 img/s | 8p 1350ms/step | +| Total time: training | 8p: 23 h | 8p: 59h | +| Total time: evaluation | keep res: test 1h, val 0.7h; fix res: test 40min, val 8min | val 10min | +| Checkpoint | 591.70MB (.ckpt file) | 591.70MB(.ckpt file) | +| Scripts | [centernet_resnet101 script](https://gitee.com/mindspore/models/tree/master/research/cv/centernet_resnet101) | [centernet_resnet101 script](https://gitee.com/mindspore/models/tree/master/research/cv/centernet_resnet101) | ### Inference Performance On Ascend 910 diff --git a/research/cv/centernet_resnet101/default_config.yaml b/research/cv/centernet_resnet101/default_config.yaml index 3c12b0df01c6b449f08db527ef8dbd2bde8acf61..9bf22593ef38dd7c1d6673348efec28383a989b6 100644 --- a/research/cv/centernet_resnet101/default_config.yaml +++ b/research/cv/centernet_resnet101/default_config.yaml @@ -265,7 +265,7 @@ run_mode: "test or validation, default is test." enable_eval: "Whether evaluate accuracy after prediction" --- -device_target: ['Ascend'] +device_target: ['Ascend', 'GPU'] distribute: ["true", "false"] need_profiler: ["true", "false"] enable_save_ckpt: ["true", "false"] diff --git a/research/cv/centernet_resnet101/scripts/convert_dataset_to_mindrecord.sh b/research/cv/centernet_resnet101/scripts/convert_dataset_to_mindrecord.sh index b4c686bb0ba85ef655a849c2821a81b7272ca352..4e509e0b54107126b53206cb5ef296ad6a811778 100644 --- a/research/cv/centernet_resnet101/scripts/convert_dataset_to_mindrecord.sh +++ b/research/cv/centernet_resnet101/scripts/convert_dataset_to_mindrecord.sh @@ -24,8 +24,13 @@ MINDRECORD_DIR=$2 export GLOG_v=1 PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd) +LOG_DIR=$PROJECT_DIR/../logs +if [ ! 
-d $LOG_DIR ] +then + mkdir $LOG_DIR +fi python ${PROJECT_DIR}/../src/dataset.py \ --coco_data_dir=$COCO_DIR \ --mindrecord_dir=$MINDRECORD_DIR \ - --mindrecord_prefix="coco_det.train.mind" > create_dataset.log 2>&1 & \ No newline at end of file + --mindrecord_prefix="coco_det.train.mind" >${LOG_DIR}/create_dataset.log 2>&1 & diff --git a/research/cv/centernet_resnet101/scripts/run_distributed_train_gpu.sh b/research/cv/centernet_resnet101/scripts/run_distributed_train_gpu.sh new file mode 100644 index 0000000000000000000000000000000000000000..8cc5afcff57dfed3cba42986c61c1d2f6784e3a0 --- /dev/null +++ b/research/cv/centernet_resnet101/scripts/run_distributed_train_gpu.sh @@ -0,0 +1,62 @@ +#!/bin/bash +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +echo "==============================================================================================================" +echo "Please run the script as: " +echo "bash run_distributed_train_gpu.sh MINDRECORD_DIR DEVICE_NUM LOAD_CHECKPOINT_PATH" +echo "for example: bash run_distributed_train_gpu.sh /path/mindrecord_dataset 8 /path/load_ckpt" +echo "if no ckpt, just run: bash run_distributed_train_gpu.sh /path/mindrecord_dataset 8" +echo "==============================================================================================================" + +MINDRECORD_DIR=$1 +RANK_SIZE=$2 +if [ $# == 3 ]; +then + LOAD_CHECKPOINT_PATH=$3 +else + LOAD_CHECKPOINT_PATH="" +fi + +mkdir -p ms_log +PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd) +CUR_DIR=`pwd` +LOG_DIR=$PROJECT_DIR/../logs +if [ ! -d $LOG_DIR ] +then + mkdir $LOG_DIR +fi +export GLOG_log_dir=${CUR_DIR}/ms_log +export GLOG_logtostderr=0 +export RANK_SIZE=$RANK_SIZE + +mpirun -n $RANK_SIZE --allow-run-as-root python ${PROJECT_DIR}/../train.py \ + --distribute=true \ + --device_num=$RANK_SIZE \ + --device_target=GPU \ + --need_profiler=false \ + --profiler_path=./profiler \ + --enable_save_ckpt=true \ + --do_shuffle=true \ + --enable_data_sink=false \ + --data_sink_steps=-1 \ + --epoch_size=330 \ + --load_checkpoint_path=$LOAD_CHECKPOINT_PATH \ + --save_checkpoint_steps=3664 \ + --save_checkpoint_num=5 \ + --mindrecord_dir=$MINDRECORD_DIR \ + --mindrecord_prefix="coco_det.train.mind" \ + --visual_image=false \ + --save_result_dir="" >${LOG_DIR}/distributed_training_gpu_log.txt 2>&1 & diff --git a/research/cv/centernet_resnet101/scripts/run_standalone_eval_ascend.sh b/research/cv/centernet_resnet101/scripts/run_standalone_eval_ascend.sh index eee33cbda8c7ee1a03fad045556052989af66df5..7b91c1039696930efd62f435a5054fe3ef49ac05 100644 --- a/research/cv/centernet_resnet101/scripts/run_standalone_eval_ascend.sh +++ 
b/research/cv/centernet_resnet101/scripts/run_standalone_eval_ascend.sh @@ -59,3 +59,4 @@ python ${PROJECT_DIR}/../eval.py \ --visual_image=true \ --enable_eval=true \ --save_result_dir=./ > eval_log.txt 2>&1 & + diff --git a/research/cv/centernet_resnet101/scripts/run_standalone_eval_gpu.sh b/research/cv/centernet_resnet101/scripts/run_standalone_eval_gpu.sh new file mode 100644 index 0000000000000000000000000000000000000000..8a0da8f65b701a6bf0b547bdb7859b8fc61d90ce --- /dev/null +++ b/research/cv/centernet_resnet101/scripts/run_standalone_eval_gpu.sh @@ -0,0 +1,66 @@ +#!/bin/bash +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +echo "==============================================================================================================" +echo "Please run the script as: " +echo "bash run_standalone_eval_gpu.sh DEVICE_ID RUN_MODE DATA_DIR LOAD_CHECKPOINT_PATH" +echo "for example of validation: bash run_standalone_eval_gpu.sh 0 val /path/coco_dataset /path/load_ckpt" +echo "for example of test: bash run_standalone_eval_gpu.sh 0 test /path/coco_dataset /path/load_ckpt" +echo "==============================================================================================================" +DEVICE_ID=$1 +RUN_MODE=$2 +DATA_DIR=$3 +LOAD_CHECKPOINT_PATH=$4 +mkdir -p ms_log +PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd) +CUR_DIR=`pwd` +LOG_DIR=$PROJECT_DIR/../logs +if [ ! -d $LOG_DIR ] +then + mkdir $LOG_DIR +fi +export GLOG_log_dir=${CUR_DIR}/ms_log +export GLOG_logtostderr=0 +export DEVICE_ID=$DEVICE_ID + +# install nms module from third party +if python -c "import nms" > /dev/null 2>&1 +then + echo "NMS module already exits, no need reinstall." +else + if [ -f './CenterNet' ] + then + echo "NMS module was not found, but has been downloaded" + else + echo "NMS module was not found, install it now..." 
+ git clone https://github.com/xingyizhou/CenterNet.git + fi + cd CenterNet/src/lib/external/ || exit + make + python setup.py install + cd - || exit + rm -rf CenterNet +fi + +python ${PROJECT_DIR}/../eval.py \ + --device_target=GPU \ + --device_id=$DEVICE_ID \ + --load_checkpoint_path=$LOAD_CHECKPOINT_PATH \ + --data_dir=$DATA_DIR \ + --run_mode=$RUN_MODE \ + --visual_image=true \ + --enable_eval=true \ + --save_result_dir=./ > ${LOG_DIR}/eval_gpu_log.txt 2>&1 & diff --git a/research/cv/centernet_resnet101/scripts/run_standalone_train_ascend.sh b/research/cv/centernet_resnet101/scripts/run_standalone_train_ascend.sh index 31f661252bbc6b26c57981b9384e9ae83909b085..9ab6d8e5a68e0b4d076182b0db32f4218c3f1333 100644 --- a/research/cv/centernet_resnet101/scripts/run_standalone_train_ascend.sh +++ b/research/cv/centernet_resnet101/scripts/run_standalone_train_ascend.sh @@ -53,4 +53,4 @@ python ${PROJECT_DIR}/../train.py \ --mindrecord_dir=$MINDRECORD_DIR \ --mindrecord_prefix="coco_det.train.mind" \ --visual_image=false \ - --save_result_dir="" > training_log.txt 2>&1 & \ No newline at end of file + --save_result_dir="" > training_log.txt 2>&1 & diff --git a/research/cv/centernet_resnet101/scripts/run_standalone_train_gpu.sh b/research/cv/centernet_resnet101/scripts/run_standalone_train_gpu.sh new file mode 100644 index 0000000000000000000000000000000000000000..79d91f18453bc370ad8d0b1552207c987f8b008b --- /dev/null +++ b/research/cv/centernet_resnet101/scripts/run_standalone_train_gpu.sh @@ -0,0 +1,62 @@ +#!/bin/bash +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +echo "==============================================================================================================" +echo "Please run the script as: " +echo "bash run_standalone_train_gpu.sh DEVICE_ID MINDRECORD_DIR LOAD_CHECKPOINT_PATH" +echo "for example: bash run_standalone_train_gpu.sh 0 /path/mindrecord_dataset /path/load_ckpt" +echo "if no ckpt, just run: bash run_standalone_train.sh 0 /path/mindrecord_dataset" +echo "==============================================================================================================" + +DEVICE_ID=$1 +MINDRECORD_DIR=$2 +if [ $# == 3 ]; +then + LOAD_CHECKPOINT_PATH=$3 +else + LOAD_CHECKPOINT_PATH="" +fi + +mkdir -p ms_log +PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd) +CUR_DIR=`pwd` +LOG_DIR=$PROJECT_DIR/../logs +if [ ! 
-d $LOG_DIR ] +then + mkdir $LOG_DIR +fi +export GLOG_log_dir=${CUR_DIR}/ms_log +export GLOG_logtostderr=0 +export DEVICE_ID=$DEVICE_ID + +python ${PROJECT_DIR}/../train.py \ + --distribute=false \ + --device_target=GPU \ + --need_profiler=false \ + --profiler_path=./profiler \ + --device_id=$DEVICE_ID \ + --enable_save_ckpt=true \ + --do_shuffle=true \ + --enable_data_sink=true \ + --data_sink_steps=-1 \ + --epoch_size=330 \ + --load_checkpoint_path=$LOAD_CHECKPOINT_PATH \ + --save_checkpoint_steps=3664 \ + --save_checkpoint_num=1 \ + --mindrecord_dir=$MINDRECORD_DIR \ + --mindrecord_prefix="coco_det.train.mind" \ + --visual_image=false \ + --save_result_dir="" >${LOG_DIR}/training_gpu_log.txt 2>&1 & diff --git a/research/cv/centernet_resnet101/src/centernet_det.py b/research/cv/centernet_resnet101/src/centernet_det.py index 81d1975d1a2f3ccb9ac7d4d4e9177ed355164376..c82fd3dd5f75906b89b2df72067c1a63ff3e0946 100644 --- a/research/cv/centernet_resnet101/src/centernet_det.py +++ b/research/cv/centernet_resnet101/src/centernet_det.py @@ -16,17 +16,12 @@ CenterNet for training and evaluation """ - import mindspore.nn as nn import mindspore.ops as ops -from mindspore import context from mindspore import dtype as mstype from mindspore.common.tensor import Tensor -from mindspore.context import ParallelMode from mindspore.common.initializer import Constant -from mindspore.communication.management import get_group_size -from mindspore.nn.wrap.grad_reducer import DistributedGradReducer -from src.utils import Sigmoid, GradScale +from src.utils import Sigmoid from src.utils import FocalLoss, RegLoss from src.decode import DetectionDecode from src.resnet101 import Bottleneck, ResNet101, weights_init @@ -66,8 +61,7 @@ class GatherDetectionFeatureCell(nn.Cell): if net_config.reg_offset: heads.update({'reg': 2}) head_conv = net_config.head_conv - self.resnet101 = ResNet101(self.block_class, self.layers, - heads, head_conv) + self.resnet101 = ResNet101(self.block_class, self.layers, 
heads, head_conv) weights_init(self.resnet101) self.hm_fn = _generate_feature(cin=64, cout=heads['hm'], kernel_size=1, @@ -222,61 +216,16 @@ class CenterNetWithLossScaleCell(nn.Cell): Tuple of Tensors, the loss, overflow flag and scaling sens of the network. """ def __init__(self, network, optimizer, sens=1): - super(CenterNetWithLossScaleCell, self).__init__(auto_prefix=False) + super(CenterNetWithLossScaleCell, self).__init__() self.image = ImagePreProcess() - self.network = network - self.network.set_grad() - self.weights = optimizer.parameters - self.optimizer = optimizer - self.grad = ops.GradOperation(get_by_list=True, sens_param=True) - self.reducer_flag = False - self.allreduce = ops.AllReduce() - self.parallel_mode = context.get_auto_parallel_context("parallel_mode") - if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]: - self.reducer_flag = True - self.grad_reducer = ops.identity - self.degree = 1 - if self.reducer_flag: - self.degree = get_group_size() - self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree) - self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE) - self.cast = ops.Cast() - self.alloc_status = ops.NPUAllocFloatStatus() - self.get_status = ops.NPUGetFloatStatus() - self.clear_before_grad = ops.NPUClearFloatStatus() - self.reduce_sum = ops.ReduceSum(keep_dims=False) - self.base = Tensor(1, mstype.float32) - self.less_equal = ops.LessEqual() - self.grad_scale = GradScale() - self.loss_scale = sens + manager = nn.FixedLossScaleUpdateCell(loss_scale_value=sens) + self.train_on_step = nn.TrainOneStepWithLossScaleCell(network, optimizer, scale_sense=manager) @ops.add_flags(has_effect=True) def construct(self, image, hm, reg_mask, ind, wh, reg): """Defines the computation performed.""" image = self.image(image) - weights = self.weights - loss = self.network(image, hm, reg_mask, ind, wh, reg) - scaling_sens = self.cast(self.loss_scale, mstype.float32) * 2.0 / 2.0 - # 
alloc status and clear should be right before gradoperation - init = self.alloc_status() - self.clear_before_grad(init) - grads = self.grad(self.network, weights)(image, hm, reg_mask, ind, wh, reg, scaling_sens) - grads = self.grad_reducer(grads) - grads = self.grad_scale(scaling_sens * self.degree, grads) - self.get_status(init) - flag_sum = self.reduce_sum(init, (0,)) - if self.is_distributed: - flag_reduce = self.allreduce(flag_sum) - cond = self.less_equal(self.base, flag_reduce) - else: - cond = self.less_equal(self.base, flag_sum) - overflow = cond - if overflow: - succ = False - else: - succ = self.optimizer(grads) - ret = (loss, cond, scaling_sens) - return ops.depend(ret, succ) + return self.train_on_step(image, hm, reg_mask, ind, wh, reg) class CenterNetDetEval(nn.Cell): diff --git a/research/cv/centernet_resnet101/src/dataset.py b/research/cv/centernet_resnet101/src/dataset.py index bf8bee33d5e7c27057efd4a879a920bd855ce45d..adf33ca01c4bf8749db42dd672deefe5d6acabbe 100644 --- a/research/cv/centernet_resnet101/src/dataset.py +++ b/research/cv/centernet_resnet101/src/dataset.py @@ -117,7 +117,7 @@ class COCOHP(ds.Dataset): self.num_samples = len(self.images) self.keep_res = keep_res logger.info('Loaded {} {} samples'.format(self.run_mode, self.num_samples)) - + print('Loaded {} {} samples'.format(self.run_mode, self.num_samples)) def __len__(self): return self.num_samples diff --git a/research/cv/centernet_resnet101/src/visual.py b/research/cv/centernet_resnet101/src/visual.py index 315fdb62462b3ec46d5a0adc69c2aeb61fe5cae6..8e40a4a43cbe556a8e4972ee9e4ece15a6eaab6b 100644 --- a/research/cv/centernet_resnet101/src/visual.py +++ b/research/cv/centernet_resnet101/src/visual.py @@ -158,9 +158,9 @@ def visual_image(img, annos, save_path, ratio=None, height=None, width=None, nam continue txt = '{}{:.2f}'.format(name, ann["score"]) cat_size = cv2.getTextSize(txt, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)[0] - cv2.rectangle(img, (bbox[0], int(bbox[1] - cat_size[1] - 5)), + 
cv2.rectangle(img, (int(bbox[0]), int(bbox[1] - cat_size[1] - 5)), (int(bbox[0] + cat_size[0]), int(bbox[1] - 2)), c, -1) - cv2.putText(img, txt, (bbox[0], int(bbox[1] - 5)), + cv2.putText(img, txt, (int(bbox[0]), int(bbox[1] - 5)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1, lineType=cv2.LINE_AA) ct = (int((bbox[0] + bbox[2]) / 2), int((bbox[1] + bbox[3]) / 2)) diff --git a/research/cv/centernet_resnet101/train.py b/research/cv/centernet_resnet101/train.py index 6fde3191181be3b2ab97b6ce57da6c5897e0f1b3..fe98b1e95c5d0f5b900ba27f6f9aeb46c8070d07 100644 --- a/research/cv/centernet_resnet101/train.py +++ b/research/cv/centernet_resnet101/train.py @@ -28,10 +28,10 @@ from mindspore.nn.optim import Adam from mindspore import log as logger from mindspore.common import set_seed from mindspore.profiler import Profiler +import mindspore as ms from src.dataset import COCOHP -from src.centernet_det import CenterNetLossCell, CenterNetWithLossScaleCell -from src.centernet_det import CenterNetWithoutLossScaleCell +from src.centernet_det import CenterNetLossCell, CenterNetWithLossScaleCell, CenterNetWithoutLossScaleCell from src.utils import LossCallBack, CenterNetPolynomialDecayLR, CenterNetMultiEpochsDecayLR from src.model_utils.config import config, dataset_config, net_config, train_config from src.model_utils.moxing_adapter import moxing_wrapper @@ -108,23 +108,22 @@ def train(): rank = 0 device_num = 1 num_workers = 8 - if config.device_target == "Ascend": - context.set_context(device_id=get_device_id()) - if config.distribute == "true": - D.init() - device_num = get_device_num() - rank = get_rank_id() - ckpt_save_dir = config.save_checkpoint_path + 'ckpt_' + str(get_rank()) + '/' - - context.reset_auto_parallel_context() - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, - device_num=device_num) - _set_parallel_all_reduce_split() + if config.distribute == "true": + D.init() + device_num = get_device_num() + rank = 
get_rank_id() + ckpt_save_dir = config.save_checkpoint_path + 'ckpt_' + str(get_rank()) + '/' + + context.reset_auto_parallel_context() + context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, + device_num=device_num) + _set_parallel_all_reduce_split() else: - config.distribute = "false" + context.set_context(device_id=get_device_id()) config.need_profiler = "false" config.enable_data_sink = "false" + ckpt_save_dir = config.save_checkpoint_path + 'ckpt/' # Start create dataset! # mindrecord files will be generated at args_opt.mindrecord_dir such as centernet.mindrecord0, 1, ... file_num. @@ -142,7 +141,6 @@ def train(): logger.info("train steps: {}".format(config.train_steps)) optimizer = _get_optimizer(net_with_loss, dataset_size) - enable_static_time = config.device_target == "CPU" callback = [TimeMonitor(config.data_sink_steps), LossCallBack(dataset_size, enable_static_time)] if config.enable_save_ckpt == "true" and get_device_id() % min(8, device_num) == 0: @@ -158,6 +156,10 @@ def train(): if config.device_target == "Ascend": net_with_grads = CenterNetWithLossScaleCell(net_with_loss, optimizer=optimizer, sens=train_config.loss_scale_value) + elif config.device_target == "GPU": + net_with_loss = net_with_loss.to_float(ms.float16) + net_with_grads = CenterNetWithLossScaleCell(net_with_loss, optimizer=optimizer, + sens=train_config.loss_scale_value) else: net_with_grads = CenterNetWithoutLossScaleCell(net_with_loss, optimizer=optimizer)