diff --git a/research/cv/ICNet/README.md b/research/cv/ICNet/README.md
index 00d5db4beefc67c73ba6f12cd7326c88ebf42863..971fa6c5c65e37aead2bdfa36974dffbc23d01cb 100644
--- a/research/cv/ICNet/README.md
+++ b/research/cv/ICNet/README.md
@@ -149,7 +149,6 @@ keep_checkpoint_max: 10
 ### Pre-training
 
 The folder Res50V1_PRE contains the scripts for pre-training and its dataset is [image net](https://image-net.org/). More details in [GENet_Res50](https://gitee.com/mindspore/models/tree/master/research/cv/GENet_Res50)
-
- Usage:
 
 ```shell
@@ -221,18 +220,18 @@ avgtime 0.19648232793807982
 
 ## Performance
 
-### Training Performance
+### Distributed Training Performance
 
 |Parameter | ICNet |
 | ------------------- | --------------------------------------------------------- |
 |resources | Ascend 910锛汣PU 2.60GHz, 192core锛沵emory锛�755G |
 |Upload date |2021.6.1 |
 |mindspore version |mindspore1.2.0 |
-|training parameter |epoch=160,batch_size=32 |
+|training parameter |epoch=160,batch_size=4 |
 |optimizer |SGD optimizer锛宮omentum=0.9,weight_decay=0.0001 |
 |loss function |SoftmaxCrossEntropyLoss |
-|training speed | epoch time锛�285693.557 ms per step time :42.961 ms |
-|total time |about 5 hours |
+|training speed | epoch time锛�21469.152 ms(8pcs) per step time :230.851 ms(8pcs) |
+|total time |1h1m34s(8pcs) |
 |Script URL | |
 |Random number seed |set_seed = 1234 |
 
diff --git a/research/cv/ICNet/scripts/run_distribute_train8p.sh b/research/cv/ICNet/scripts/run_distribute_train8p.sh
index f1e1e4b2a87140ddd4ae539e15e9fe10f8af1ba7..9d96fca9fb24dcbdfe4f3666780def0ded3db498 100644
--- a/research/cv/ICNet/scripts/run_distribute_train8p.sh
+++ b/research/cv/ICNet/scripts/run_distribute_train8p.sh
@@ -20,7 +20,7 @@ then
     echo "Usage: bash scripts/run_distribute_train8p.sh [RANK_TABLE_FILE] [PROJECT_PATH]"
     echo "Please run the script as: "
     echo "bash scripts/run_distribute_train8p.sh [RANK_TABLE_FILE] [PROJECT_PATH]"
-    echo "for example: bash script/run_distribute_train8p.sh /absolute/path/to/RANK_TABLE_FILE /root/ICNet/"
+    echo "for example: bash scripts/run_distribute_train8p.sh /absolute/path/to/RANK_TABLE_FILE /root/ICNet/"
     echo "=============================================================================================================="
     exit 1
 fi
diff --git a/research/cv/ICNet/scripts/run_train1p.sh b/research/cv/ICNet/scripts/run_train1p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..8fdf2b1957e8b4a8275f440937b58c803f211f83
--- /dev/null
+++ b/research/cv/ICNet/scripts/run_train1p.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+if [ $# != 2 ]
+then
+    echo "=============================================================================================================="
+    echo "Usage: bash scripts/run_train1p.sh [PROJECT_PATH] [DEVICE_ID]"
+    echo "for example: bash scripts/run_train1p.sh /root/ICNet/ 0"
+    echo "=============================================================================================================="
+    exit 1
+fi
+
+get_real_path(){
+    if [ "${1:0:1}" == "/" ]; then
+        echo "$1"
+    else
+        echo "$(realpath -m $PWD/$1)"
+    fi
+}
+
+PATH1=$(get_real_path $1)
+
+
+if [ ! -d $PATH1 ]
+then
+    echo "error: PROJECT_PATH=$PATH1 is not a directory"
+    exit 1
+fi
+
+
+rm -rf LOG
+mkdir ./LOG
+export RANK_SIZE=1
+export RANK_ID=0
+export DEVICE_ID=$2
+echo "start training for rank $RANK_ID, device $DEVICE_ID"
+env > env.log
+
+python3 train.py --project_path=$PATH1 > log.txt 2>&1 &
diff --git a/research/cv/ICNet/src/models/icnet_dc.py b/research/cv/ICNet/src/models/icnet_dc.py
index e0d9441c3fea193f4bec2bed28aaeb6f21a65387..6476850fbce668371b9f88d0de8261d7deb119d5 100644
--- a/research/cv/ICNet/src/models/icnet_dc.py
+++ b/research/cv/ICNet/src/models/icnet_dc.py
@@ -28,19 +28,19 @@ context.set_context(mode=context.GRAPH_MODE, device_target='Ascend')
 class ICNetdc(nn.Cell):
     """Image Cascade Network"""
 
-    def __init__(self, nclass=19, pretrained_path="", istraining=True):
+    def __init__(self, nclass=19, pretrained_path="", istraining=True, norm_layer=nn.SyncBatchNorm):
         super(ICNetdc, self).__init__()
         self.conv_sub1 = nn.SequentialCell(
-            _ConvBNReLU(3, 32, 3, 2),
-            _ConvBNReLU(32, 32, 3, 2),
-            _ConvBNReLU(32, 64, 3, 2)
+            _ConvBNReLU(3, 32, 3, 2, norm_layer=norm_layer),
+            _ConvBNReLU(32, 32, 3, 2, norm_layer=norm_layer),
+            _ConvBNReLU(32, 64, 3, 2, norm_layer=norm_layer)
         )
         self.istraining = istraining
         self.ppm = PyramidPoolingModule()
 
         self.backbone = SegBaseModel(root=pretrained_path)
 
-        self.head = _ICHead(nclass)
+        self.head = _ICHead(nclass, norm_layer=norm_layer)
 
         self.loss = ICNetLoss()
 
@@ -73,7 +73,6 @@ class ICNetdc(nn.Cell):
             outputs = output
 
         return outputs
-
 
 class PyramidPoolingModule(nn.Cell):
     """PPM"""
diff --git a/research/cv/ICNet/train.py b/research/cv/ICNet/train.py
index 4fca39a9f674629b6d441682addfc437aca32ee1..d14eb3407af1b4662067eba9149266e9aa69e7f7 100644
--- a/research/cv/ICNet/train.py
+++ b/research/cv/ICNet/train.py
@@ -29,7 +29,8 @@ from mindspore.train.callback import ModelCheckpoint
 from mindspore.train.callback import LossMonitor
 from mindspore.train.callback import TimeMonitor
 
-device_id = int(os.getenv('RANK_ID'))
+rank_id = int(os.getenv('RANK_ID'))
+device_id = int(os.getenv('DEVICE_ID'))
 device_num = int(os.getenv('RANK_SIZE'))
 
 context.set_context(mode=context.GRAPH_MODE, device_target='Ascend')
@@ -52,12 +53,15 @@ def train_net():
     mindrecord_dir = cfg['train']["mindrecord_dir"]
     mindrecord_file = os.path.join(mindrecord_dir, prefix)
     dataset = create_icnet_dataset(mindrecord_file, batch_size=cfg['train']["train_batch_size_percard"],
-                                   device_num=device_num, rank_id=device_id)
+                                   device_num=device_num, rank_id=rank_id)
     train_data_size = dataset.get_dataset_size()
     print("data_size", train_data_size)
     epoch = cfg["train"]["epochs"]
 
-    network = ICNetdc(pretrained_path=cfg["train"]["pretrained_model_path"])  # __init__
+    if device_num > 1:
+        network = ICNetdc(pretrained_path=cfg["train"]["pretrained_model_path"])  # __init__
+    else:
+        network = ICNetdc(pretrained_path=cfg["train"]["pretrained_model_path"], norm_layer=nn.BatchNorm2d)
 
     iters_per_epoch = train_data_size
     total_train_steps = iters_per_epoch * epoch