diff --git a/research/cv/yolox/README_CN.md b/research/cv/yolox/README_CN.md index 88f574c3f19a9b8d215d6f03cc9c98225b911866..9fd6717e5d9fc4972230b009ed92511232db38bf 100644 --- a/research/cv/yolox/README_CN.md +++ b/research/cv/yolox/README_CN.md @@ -278,20 +278,12 @@ The main parameters in train.py are as follows: ```log ... - 2021-12-24 16:16:14,099:INFO:epoch: 0 step: [612/1848], loss: 11.5023, overflow: False, scale: 262144, lr: 0.000044, time: 303.65 - 2021-12-24 16:16:25,031:INFO:epoch: 0 step: [648/1848], loss: 11.4281, overflow: False, scale: 262144, lr: 0.000049, time: 303.66 - 2021-12-24 16:16:35,966:INFO:epoch: 0 step: [684/1848], loss: 11.2717, overflow: False, scale: 262144, lr: 0.000055, time: 303.72 - 2021-12-24 16:16:46,900:INFO:epoch: 0 step: [720/1848], loss: 11.4875, overflow: False, scale: 262144, lr: 0.000061, time: 303.72 - 2021-12-24 16:16:57,834:INFO:epoch: 0 step: [756/1848], loss: 11.2793, overflow: False, scale: 262144, lr: 0.000067, time: 303.73 - 2021-12-24 16:17:08,770:INFO:epoch: 0 step: [792/1848], loss: 11.4845, overflow: False, scale: 262144, lr: 0.000074, time: 303.76 - 2021-12-24 16:17:19,705:INFO:epoch: 0 step: [828/1848], loss: 11.4574, overflow: False, scale: 262144, lr: 0.000080, time: 303.74 - 2021-12-24 16:17:30,638:INFO:epoch: 0 step: [864/1848], loss: 11.7713, overflow: False, scale: 262144, lr: 0.000088, time: 303.69 - 2021-12-24 16:17:41,571:INFO:epoch: 0 step: [900/1848], loss: 11.3390, overflow: False, scale: 262144, lr: 0.000095, time: 303.70 - 2021-12-24 16:17:52,503:INFO:epoch: 0 step: [936/1848], loss: 11.4625, overflow: False, scale: 262144, lr: 0.000103, time: 303.66 - 2021-12-24 16:18:03,437:INFO:epoch: 0 step: [972/1848], loss: 11.4421, overflow: False, scale: 262144, lr: 0.000111, time: 303.72 - 2021-12-24 16:18:14,372:INFO:epoch: 0 step: [1008/1848], loss: 11.1791, overflow: False, scale: 262144, lr: 0.000119, time: 303.74 - 2021-12-24 16:18:25,304:INFO:epoch: 0 step: [1044/1848], loss: 11.3785, overflow: False, scale: 262144, lr: 0.000128, time: 303.66 - 2021-12-24 16:18:36,236:INFO:epoch: 0 step: [1080/1848], loss: 11.4149, overflow: False, scale: 262144, lr: 0.000137, time: 303.64 + 2022-10-10 11:43:34,405:INFO:epoch: [1/300] step: [150/1848], loss: 15.9977, lr: 0.000003, avg step time: 332.07 ms + 2022-10-10 11:43:37,711:INFO:epoch: [1/300] step: [160/1848], loss: 14.6404, lr: 0.000003, avg step time: 330.58 ms + 2022-10-10 11:43:41,012:INFO:epoch: [1/300] step: [170/1848], loss: 16.2315, lr: 0.000004, avg step time: 330.08 ms + 2022-10-10 11:43:44,326:INFO:epoch: [1/300] step: [180/1848], loss: 16.9418, lr: 0.000004, avg step time: 331.37 ms + 2022-10-10 11:43:47,646:INFO:epoch: [1/300] step: [190/1848], loss: 17.1101, lr: 0.000005, avg step time: 331.87 ms + 2022-10-10 11:43:50,943:INFO:epoch: [1/300] step: [200/1848], loss: 16.7288, lr: 0.000005, avg step time: 329.74 ms ...
``` @@ -317,18 +309,18 @@ bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] [BACKBONE] [BATCH_SIZE] ```log ===============================coco eval result=============================== - Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.468 - Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.664 - Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.509 - Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.288 - Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.513 - Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.612 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.358 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.575 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.612 + Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.451 + Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.646 + Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.494 + Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.281 + Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.493 + Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.577 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.350 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.566 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.610 Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.416 - Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.662 - Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.764 + Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.664 + Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.751 ``` @@ -368,18 +360,18 @@ bash run_infer_310.sh [MINDIR_PATH] [DATA_DIR] [DEVICE_ID] yolox-darknet53 =============================coco eval result================================== - Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.473 - Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.670 - Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.517 - Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.290 - Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.519 - Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.615 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.360 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.582 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.622 - Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.430 - Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.671 - Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.772 + Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.480 + Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.674 + Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.524 + Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.304 + Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.525 + Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.616 + Average Recall (AR) @[ IoU=0.50:0.95 | 
area= all | maxDets= 1 ] = 0.364 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.585 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.625 + Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.435 + Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.678 + Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.762 yolox-x =============================coco eval result================================== Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.502 diff --git a/research/cv/yolox/default_config.yaml b/research/cv/yolox/default_config.yaml deleted file mode 100644 index 47a71c75404867d1a5bb58fb7061c36b8ddcd33d..0000000000000000000000000000000000000000 --- a/research/cv/yolox/default_config.yaml +++ /dev/null @@ -1,88 +0,0 @@ -backbone: "yolox_darknet53" #option for backbone, you can choose 'yolox_darknet53' or 'yolox_x' -data_aug: True -# path for local -device_target: "Ascend" -# /home/work/user-job-dir/outputs/model/ -outputs_dir: "./" -# ====================================================== -# Train option -save_graphs: False -lr_scheduler: "yolox_warm_cos_lr" -max_epoch: 285 -total_epoch: 300 -data_dir: "/home/work/user-job-dir/inputs/data/" -# last no data aug related -yolox_no_aug_ckpt: "" -need_profiler: 0 -pretrained: '' -resume_yolox: '' -# data aug -flip_prob: 0.5 -hsv_prob: 1.0 -# ========================================================\ -# dataset related -per_batch_size: 8 - -# network configuration -depth_wise: False -max_gt: 120 -num_classes: 80 -input_size: [640, 640] -fpn_strides: [8, 16, 32] -use_l1: False -use_syc_bn: True -updates: 0.0 - -# dynamic_k -n_candidate_k: 10 - -# optimizer and lr related -lr: 0.01 # 0.04 for yolox-x -min_lr_ratio: 0.001 -warmup_epochs: 5 -weight_decay: 0.0005 -momentum: 0.9 -no_aug_epochs: 15 -# logging related -log_interval: 30 -ckpt_interval: -1 -is_save_on_master: 1 -ckpt_max_num: 60 -opt: "Momentum" - -# distributed related -is_distributed: 1 -rank: 0 -group_size: 1 -bind_cpu: True -device_num: 8 - -# modelart -is_modelArts: 0 -enable_modelarts: False - -need_modelarts_dataset_unzip: False -modelarts_dataset_unzip_name: "coco2017" - -data_url: "" -train_url: "" -checkpoint_url: "" -data_path: "/home/work/user-job-dir/inputs/data/" -output_path: "./" -load_path: "/cache/checkpoint_path" -ckpt_path: './' - -# Eval option -log_path: "val/outputs/" -val_ckpt: "0-2755_64.ckpt" -conf_thre: 0.001 -nms_thre: 0.65 -eval_interval: 10 -run_eval: False -# modelarts -is_modelart: False -result_path: '' - -# export option -file_format: 'MINDIR' -export_bs: 1 diff --git a/research/cv/yolox/eval.py b/research/cv/yolox/eval.py index a5581c714a3005f201212b5945ff914a7b87e983..2aeae443ed9e225ce7f2a93336764baf8091c280 100644 --- a/research/cv/yolox/eval.py +++ b/research/cv/yolox/eval.py @@ -20,11 +20,10 @@ import datetime from tqdm import tqdm from model_utils.config import config from mindspore.context import ParallelMode -from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore import context from src.logger import get_logger -from src.util import DetectionEngine +from src.util import DetectionEngine, load_weights from src.yolox import DetectionBlock from src.yolox_dataset import create_yolox_dataset from src.initializer import default_recurisive_init @@ -58,16 +57,7 @@ def run_test(): default_recurisive_init(network) config.logger.info(config.val_ckpt) if os.path.isfile(config.val_ckpt): - 
param_dict = load_checkpoint(config.val_ckpt) - ema_param_dict = {} - for param in param_dict: - if param.startswith("ema."): - new_name = param.split("ema.")[1] - data = param_dict[param] - data.name = new_name - ema_param_dict[new_name] = data - - load_param_into_net(network, ema_param_dict) + network = load_weights(network, config.val_ckpt) config.logger.info('load model %s success', config.val_ckpt) else: config.logger.info('%s doesn''t exist or is not a pre-trained file', config.val_ckpt) diff --git a/research/cv/yolox/export.py b/research/cv/yolox/export.py index 3e6802bd4331f9c7236c4e1701c34128e0b5bbe5..956cdc4dc6779cbf384d4147c9b4f9f75a5f0c95 100644 --- a/research/cv/yolox/export.py +++ b/research/cv/yolox/export.py @@ -20,10 +20,11 @@ import os import numpy as np import mindspore as ms -from mindspore import Tensor, load_checkpoint, load_param_into_net, export, context +from mindspore import Tensor, export, context from model_utils.config import config from src.yolox import DetectionBlock +from src.util import load_weights def run_export(): @@ -43,8 +44,7 @@ def run_export(): network = DetectionBlock(config, backbone=backbone) # default yolo-darknet53 network.set_train(False) assert config.val_ckpt is not None, "config.ckpt_file is None." - param_dict = load_checkpoint(config.val_ckpt) - load_param_into_net(network, param_dict) + network = load_weights(network, config.val_ckpt) input_arr = Tensor(np.ones([config.export_bs, 3, config.input_size[0], config.input_size[1]]), ms.float32) file_name = backbone export(network, input_arr, file_name=file_name, file_format=config.file_format) diff --git a/research/cv/yolox/model_utils/config.py b/research/cv/yolox/model_utils/config.py index e4061a12a2c32c861178dd3604e06c24336a2f95..a9e30f886e989093054b687d465dd0d569eb8312 100644 --- a/research/cv/yolox/model_utils/config.py +++ b/research/cv/yolox/model_utils/config.py @@ -115,7 +115,7 @@ def get_config(): """ parser = argparse.ArgumentParser(description="default name", add_help=False) current_dir = os.path.dirname(os.path.abspath(__file__)) - parser.add_argument("--config_path", type=str, default=os.path.join(current_dir, "../default_config.yaml"), + parser.add_argument("--config_path", type=str, default=os.path.join(current_dir, "../yolox_darknet53.yaml"), help="Config file path") path_args, _ = parser.parse_known_args() default, helper, choices = parse_yaml(path_args.config_path) diff --git a/research/cv/yolox/scripts/run_distribute_train.sh b/research/cv/yolox/scripts/run_distribute_train.sh index d67f5ec3c22cf8631dd933aa7b92bd7cfe784b46..4ce54488a55cc81b9480e7e1c917e1238a4215ff 100644 --- a/research/cv/yolox/scripts/run_distribute_train.sh +++ b/research/cv/yolox/scripts/run_distribute_train.sh @@ -14,8 +14,8 @@ # limitations under the License. 
# =========================================================================== if [[ $# -lt 3 || $# -gt 4 ]];then - echo "Usage1: bash run_distribute_train.sh [DATASET_PATH] [RANK_TABLE_FILE] [BACKBONE] for first data aug epochs" - echo "Usage2: bash run_distribute_train.sh [DATASET_PATH] [RANK_TABLE_FILE] [BACKBONE] [RESUME_CKPT] for last no data aug epochs" + echo "Usage1: bash run_distribute_train.sh [DATASET_PATH] [RANK_TABLE_FILE] [BACKBONE]" + echo "Usage2: bash run_distribute_train.sh [DATASET_PATH] [RANK_TABLE_FILE] [BACKBONE] [RESUME_CKPT] for resume" exit 1 fi @@ -90,23 +90,14 @@ then taskset -c $cmdopt python train.py \ --config_path=$CONFIG_PATH\ --data_dir=$DATASET_PATH \ - --yolox_no_aug_ckpt=$RESUME_CKPT \ --backbone=$BACKBONE \ - --data_aug=True \ - --is_distributed=1 \ - --lr=0.011 \ - --max_epoch=285 \ - --warmup_epochs=5 \ - --no_aug_epochs=15 \ - --min_lr_ratio=0.001 \ - --eval_interval=10 \ - --lr_scheduler=yolox_warm_cos_lr > log.txt 2>&1 & + --is_distributed=1 > log.txt 2>&1 & cd .. done fi if [ $# == 4 ] then - echo "Start to launch last no data augment epochs..." + echo "Start to resume train..." for((i=0; i<${DEVICE_NUM}; i++)) do start=`expr $i \* $avg` @@ -126,17 +117,9 @@ then taskset -c $cmdopt python train.py \ --config_path=$CONFIG_PATH\ --data_dir=$DATASET_PATH \ - --yolox_no_aug_ckpt=$RESUME_CKPT \ --backbone=$BACKBONE \ - --data_aug=False \ --is_distributed=1 \ - --lr=0.011 \ - --max_epoch=285 \ - --warmup_epochs=5 \ - --no_aug_epochs=15 \ - --min_lr_ratio=0.001 \ - --eval_interval=1 \ - --lr_scheduler=yolox_warm_cos_lr > log.txt 2>&1 & + --resume_yolox=$RESUME_CKPT > log.txt 2>&1 & cd .. done fi diff --git a/research/cv/yolox/scripts/run_standalone_train.sh b/research/cv/yolox/scripts/run_standalone_train.sh index f11ff6498c6307f928dfac9def5b966ff02ff2fd..830ac7799d683472645c399fa3fe07727da2a332 100644 --- a/research/cv/yolox/scripts/run_standalone_train.sh +++ b/research/cv/yolox/scripts/run_standalone_train.sh @@ -15,8 +15,8 @@ # ============================================================================ if [[ $# -lt 2 || $# -gt 3 ]];then - echo "Usage1: bash run_standalone_train.sh [DATASET_PATH] [BACKBONE] for first data aug epochs" - echo "Usage2: bash run_standalone_train.sh [DATASET_PATH] [BACKBONE] [LATEST_CKPT] for last no data aug epochs" + echo "Usage1: bash run_standalone_train.sh [DATASET_PATH] [BACKBONE]" + echo "Usage2: bash run_standalone_train.sh [DATASET_PATH] [BACKBONE] [RESUME_CKPT] for resume train" exit 1 fi @@ -45,6 +45,12 @@ then exit 1 fi +if [ $# == 3 ] +then + CKPT_FILE=$(get_real_path $3) + echo $CKPT_FILE +fi + export DEVICE_NUM=1 export DEVICE_ID=0 export RANK_ID=0 @@ -70,22 +76,17 @@ then python train.py \ --config_path=$CONFIG_PATH \ --data_dir=$DATASET_PATH \ - --data_aug=True \ --is_distributed=0 \ - --eval_interval=10 \ --backbone=$BACKBONE > log.txt 2>&1 & fi if [ $# == 3 ] then - echo "Start to launch last no data augment epochs..." - CKPT_FILE=$(get_real_path $3) - echo $CKPT_FILE + echo "Start to resume train..." 
python train.py \ + --config_path=$CONFIG_PATH \ --data_dir=$DATASET_PATH \ - --data_aug=False \ --is_distributed=0 \ - --eval_interval=1 \ --backbone=$BACKBONE \ - --yolox_no_aug_ckpt=$CKPT_FILE > log.txt 2>&1 & + --resume_yolox=$CKPT_FILE > log.txt 2>&1 & fi \ No newline at end of file diff --git a/research/cv/yolox/src/initializer.py b/research/cv/yolox/src/initializer.py index 97edf9dc8c2a8f048c8814666a69e08d22f82d5f..391707ad6ff2544ddb7c72adf6eb37a6a78edbf7 100644 --- a/research/cv/yolox/src/initializer.py +++ b/research/cv/yolox/src/initializer.py @@ -20,7 +20,7 @@ import numpy as np import mindspore.nn as nn from mindspore.common import initializer as init from mindspore.common.initializer import Initializer as MeInitializer -from src.util import load_backbone +from src.util import load_weights def calculate_gain(nonlinearity, param=None): @@ -180,9 +180,6 @@ cell.bias.set_data(init.initializer(init.Uniform(bound), cell.bias.shape, cell.bias.dtype)) - elif isinstance(cell, (nn.BatchNorm2d, nn.BatchNorm1d, nn.SyncBatchNorm)): - cell.momentum = 0.97 - cell.eps = 0.001 else: pass initialize_head_biases(custom_cell, prior_prob=0.01) @@ -198,7 +195,7 @@ def load_yolox_params(args, network): """Load yolox darknet parameter from checkpoint.""" if args.pretrained_backbone: - network = load_backbone(network, args.pretrained_backbone, args) + network = load_weights(network, args.pretrained_backbone) args.logger.info('load pre-trained backbone {} into network'.format(args.pretrained_backbone)) else: args.logger.info('Not load pre-trained backbone, please be careful') diff --git a/research/cv/yolox/src/network_blocks.py b/research/cv/yolox/src/network_blocks.py index 3b007c99c2573b31b33d8afec9c038c03f5a4fa4..e404860736bcea421847c2d5270f199ec52e97f6 100644 --- a/research/cv/yolox/src/network_blocks.py +++ b/research/cv/yolox/src/network_blocks.py @@ -61,7 +61,7 @@ class BaseConv(nn.Cell): group=groups, has_bias=bias ) - self.bn = nn.BatchNorm2d(out_channels) + self.bn = nn.BatchNorm2d(num_features=out_channels, eps=1e-3, momentum=0.97) self.act = get_activation(act) def construct(self, x): @@ -74,7 +74,7 @@ def use_syc_bn(network): for _, cell in network.cells_and_names(): if isinstance(cell, BaseConv): out_channels = cell.bn.num_features - cell.bn = nn.SyncBatchNorm(out_channels) + cell.bn = nn.SyncBatchNorm(num_features=out_channels, eps=1e-3, momentum=0.97) class DWConv(nn.Cell): @@ -154,25 +154,19 @@ class SPPBottleneck(nn.Cell): self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=activation) self.m = nn.CellList( [ - nn.MaxPool2d(kernel_size=ks, stride=1) + nn.MaxPool2d(kernel_size=ks, stride=1, pad_mode='same') for ks in kernel_sizes ] ) - self.pad0 = ops.Pad(((0, 0), (0, 0), (kernel_sizes[0] // 2, kernel_sizes[0] // 2), - (kernel_sizes[0] // 2, kernel_sizes[0] // 2))) - self.pad1 = ops.Pad(((0, 0), (0, 0), (kernel_sizes[1] // 2, kernel_sizes[1] // 2), - (kernel_sizes[1] // 2, kernel_sizes[1] // 2))) - self.pad2 = ops.Pad(((0, 0), (0, 0), (kernel_sizes[2] // 2, kernel_sizes[2] // 2), - (kernel_sizes[2] // 2, kernel_sizes[2] // 2))) conv2_channels = hidden_channels * (len(kernel_sizes) + 1) self.conv2 = BaseConv(conv2_channels, out_channels, 1, stride=1, act=activation) def construct(self, x): x = self.conv1(x) op = ops.Concat(axis=1) - x1 = self.m[0](self.pad0(x)) - x2 = self.m[1](self.pad1(x)) - x3 = self.m[2](self.pad2(x)) + x1 = self.m[0](x) + x2 =
self.m[1](x) + x3 = self.m[2](x) x = op((x, x1, x2, x3)) x = self.conv2(x) return x diff --git a/research/cv/yolox/src/transform.py b/research/cv/yolox/src/transform.py index e24d4d1bceebef3979d1767d9be8f0b5588b62f2..136397ccf900e4aa7c1af1a39a26d1bcb33742ae 100644 --- a/research/cv/yolox/src/transform.py +++ b/research/cv/yolox/src/transform.py @@ -15,7 +15,6 @@ """ image transform related """ import random import math - import cv2 import numpy as np @@ -128,21 +127,17 @@ def box_candidates(box1, box2, wh_thr=2, ar_thr=20, area_thr=0.2): return (w2 > wh_thr) & (h2 > wh_thr) & (w2 * h2 / (w1 * h1 + 1e-16) > area_thr) & (ar < ar_thr) # candidates -def augment_hsv(img, hgain=0.015, sgain=0.7, vgain=0.4): - """ hsv augment """ - r = np.random.uniform(-1, 1, 3) * [hgain, sgain, vgain] + 1 # random gains - hue, sat, val = cv2.split(cv2.cvtColor(img, cv2.COLOR_BGR2HSV)) - dtype = img.dtype +def augment_hsv(img, hgain=5, sgain=30, vgain=30): + hsv_augs = np.random.uniform(-1, 1, 3) * [hgain, sgain, vgain] # random gains + hsv_augs *= np.random.randint(0, 2, 3) # random selection of h, s, v + hsv_augs = hsv_augs.astype(np.int16) + img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV).astype(np.int16) - x = np.arange(0, 256, dtype=np.int16) - lut_hue = ((x * r[0]) % 180).astype(dtype) - lut_sat = np.clip(x * r[1], 0, 255).astype(dtype) - lut_val = np.clip(x * r[2], 0, 255).astype(dtype) + img_hsv[..., 0] = (img_hsv[..., 0] + hsv_augs[0]) % 180 + img_hsv[..., 1] = np.clip(img_hsv[..., 1] + hsv_augs[1], 0, 255) + img_hsv[..., 2] = np.clip(img_hsv[..., 2] + hsv_augs[2], 0, 255) - img_hsv = cv2.merge( - (cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val)) - ).astype(dtype) - cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR, dst=img) + cv2.cvtColor(img_hsv.astype(img.dtype), cv2.COLOR_HSV2BGR, dst=img) # no return needed def _mirror(image, boxes, prob=0.5): @@ -183,8 +178,8 @@ class TrainTransform: self.strides = config.fpn_strides self.input_size = config.input_size else: - self.hsv_prob = 1.0 - self.flip_prob = 0.5 + self.hsv_prob = hsv_prob + self.flip_prob = flip_prob self.max_labels = max_labels self.strides = [8, 16, 32] self.input_size = (640, 640) @@ -229,11 +224,11 @@ class TrainTransform: targets_t = np.hstack((labels_t, boxes_t)) padded_labels = np.zeros((self.max_labels, 5)) - true_labels = len(targets_t) padded_labels[range(len(targets_t))[: self.max_labels]] = targets_t[: self.max_labels] padded_labels = np.ascontiguousarray(padded_labels, dtype=np.float32) gt_bboxes_per_image = padded_labels[:, 1:5] + true_labels = np.sum(np.sum(padded_labels, axis=1) > 0) # is_in_boxes_all [gt_max, 8400] is_in_boxes_all, is_in_boxes_and_center = self.get_in_boxes_info(gt_bboxes_per_image, true_labels) # is_in_boxes_all [gt_max, 8400] diff --git a/research/cv/yolox/src/util.py b/research/cv/yolox/src/util.py index 55a63f0c84dd60ddc85ab9434c5d5e87279cb4ff..7bdabf2fd4ec152e44f9af733ccae1f160b371fd 100644 --- a/research/cv/yolox/src/util.py +++ b/research/cv/yolox/src/util.py @@ -23,8 +23,7 @@ from datetime import datetime from collections import Counter import numpy as np import mindspore.common.dtype as mstype -from mindspore import load_checkpoint, load_param_into_net, save_checkpoint, Tensor, Parameter -from mindspore.common.parameter import ParameterTuple +from mindspore import load_checkpoint, load_param_into_net, save_checkpoint, Parameter from mindspore.train.callback import Callback from pycocotools.coco import COCO from pycocotools.cocoeval import COCOeval @@ -100,6 +99,7 @@ def yolox_warm_cos_lr( 
steps_per_epoch, warmup_epochs, max_epoch, + start_epoch, no_aug_epochs, warmup_lr_start=0, min_lr_ratio=0.05 @@ -122,6 +122,7 @@ def yolox_warm_cos_lr( lr = min_lr + 0.5 * (base_lr - min_lr) * (1.0 + math.cos( math.pi * (i - warmup_total_iters) / (total_iters - warmup_total_iters - no_aug_iter))) lr_each_step.append(lr) + lr_each_step = lr_each_step[start_epoch * steps_per_epoch:] return np.array(lr_each_step).astype(np.float32) @@ -231,22 +232,16 @@ def get_lr(args): lr = yolox_warm_cos_lr(lr=args.lr, steps_per_epoch=args.steps_per_epoch, warmup_epochs=args.warmup_epochs, - max_epoch=args.total_epoch, + max_epoch=args.max_epoch, + start_epoch=args.start_epoch, no_aug_epochs=args.no_aug_epochs, min_lr_ratio=args.min_lr_ratio) - elif args.lr_scheduler == 'no_aug_lr': - lr = yolox_no_aug_lr( - args.lr, - args.steps_per_epoch, - args.max_epoch, - min_lr_ratio=args.min_lr_ratio - ) else: raise NotImplementedError(args.lr_scheduler) return lr -def get_param_groups(network, weight_decay): +def get_param_groups(network, weight_decay, use_group_params=True): """Param groups for optimizer.""" decay_params = [] no_decay_params = [] @@ -263,22 +258,31 @@ def get_param_groups(network, weight_decay): no_decay_params.append(x) else: decay_params.append(x) + if use_group_params: + return [{'params': no_decay_params, 'weight_decay': 0.0}, + {'params': decay_params, 'weight_decay': weight_decay}] + return network.trainable_params() - return [{'params': no_decay_params, 'weight_decay': 0.0}, {'params': decay_params, 'weight_decay': weight_decay}] - -def load_backbone(net, ckpt_path, args): - """Load darknet53 backbone checkpoint.""" +def load_weights(net, ckpt_path): + """Load network weights from a checkpoint, preferring the EMA copy when present.""" - param_dict = load_checkpoint(ckpt_path) - load_param_into_net(net, param_dict) - - param_not_load = [] - for _, param in net.parameters_and_names(): - if param.name in param_dict: - pass - else: - param_not_load.append(param.name) - args.logger.info("not loading param is :", len(param_not_load)) + checkpoint_param = load_checkpoint(ckpt_path) + ema_param_dict = dict() + param_dict = dict() + + for param in checkpoint_param: + if param.startswith("ema.network"): + new_name = param.split("ema.")[1] + ema_data = checkpoint_param[param] + ema_data.name = new_name + ema_param_dict[new_name] = ema_data + elif param.startswith('network.'): + param_dict[param] = checkpoint_param[param] + + if ema_param_dict: + load_param_into_net(net, ema_param_dict) + else: + load_param_into_net(net, param_dict) return net @@ -325,39 +329,13 @@ def keep_loss_fp32(network): cell.to_float(mstype.float32) -class EMACallBack(Callback): - - def __init__(self, network, steps_per_epoch, cur_steps=0): - self.steps_per_epoch = steps_per_epoch - self.cur_steps = cur_steps - self.network = network +class ResumeCallback(Callback): + def __init__(self, start_epoch=0): + super(ResumeCallback, self).__init__() + self.start_epoch = start_epoch def epoch_begin(self, run_context): - if self.network.ema: - if not isinstance(self.network.ema_moving_weight, list): - tmp_moving = [] - for weight in self.network.ema_moving_weight: - tmp_moving.append(weight.asnumpy()) - self.network.ema_moving_weight = tmp_moving - - def step_end(self, run_context): - if self.network.ema: - self.network.moving_parameter_update() - self.cur_steps += 1 - - if self.cur_steps % self.steps_per_epoch == 0: - if isinstance(self.network.ema_moving_weight, list): - tmp_moving = [] - moving_name = [] - idx = 0 - for key in self.network.moving_name: - moving_name.append(key) - - for weight in
self.network.ema_moving_weight: - param = Parameter(Tensor(weight), name=moving_name[idx]) - tmp_moving.append(param) - idx += 1 - self.network.ema_moving_weight = ParameterTuple(tmp_moving) + run_context.original_args().cur_epoch_num += self.start_epoch class YOLOXCB(Callback): @@ -365,22 +343,22 @@ YOLOX Callback. """ - def __init__(self, logger, step_per_epoch, lr, save_ckpt_path, is_modelart=False, per_print_times=1, - train_url=None): + def __init__(self, config, lr, is_modelart=False, per_print_times=1, train_url=None): super(YOLOXCB, self).__init__() self.train_url = train_url if not isinstance(per_print_times, int) or per_print_times < 0: raise ValueError("print_step must be int and >= 0.") self._per_print_times = per_print_times self.lr = lr + self.step_per_epoch = config.steps_per_epoch + self.logger = config.logger + self.save_ckpt_path = config.save_ckpt_dir + self.max_epoch = config.max_epoch self.is_modelarts = is_modelart - self.step_per_epoch = step_per_epoch self.current_step = 0 - self.save_ckpt_path = save_ckpt_path self.iter_time = time.time() self.epoch_start_time = time.time() self.average_loss = [] - self.logger = logger def epoch_begin(self, run_context): """ @@ -402,11 +381,10 @@ cb_params = run_context.original_args() cur_epoch = cb_params.cur_epoch_num loss = cb_params.net_outputs - loss = "loss: %.4f, overflow: %s, scale: %s" % (float(loss[0].asnumpy()), - bool(loss[1].asnumpy()), - int(loss[2].asnumpy())) + loss = "loss: %.4f" % (float(loss.asnumpy())) self.logger.info( - "epoch: %s epoch time %.2fs %s" % (cur_epoch, time.time() - self.epoch_start_time, loss)) + "epoch: [%s/%s] time %.2fs %s" % ( + cur_epoch, self.max_epoch, time.time() - self.epoch_start_time, loss)) if self.current_step % (self.step_per_epoch * 1) == 0: if self.is_modelarts: @@ -438,11 +416,10 @@ cb_params = run_context.original_args() cur_epoch = cb_params.cur_epoch_num loss = cb_params.net_outputs - loss = "loss: %.4f, overflow: %s, scale: %s" % (float(loss[0].asnumpy()), - bool(loss[1].asnumpy()), - int(loss[2].asnumpy())) - self.logger.info("epoch: %s step: [%s/%s], %s, lr: %.6f, avg step time: %.2f ms" % ( - cur_epoch, cur_epoch_step, self.step_per_epoch, loss, self.lr[self.current_step], + loss = "loss: %.4f" % (float(loss.asnumpy())) + self.logger.info("epoch: [%s/%s] step: [%s/%s], %s, lr: %.6f, avg step time: %.2f ms" % ( + cur_epoch, self.max_epoch, cur_epoch_step, self.step_per_epoch, loss, + self.lr[self.current_step], (time.time() - self.iter_time) * 1000 / self._per_print_times)) self.iter_time = time.time() self.current_step += 1 @@ -457,22 +434,25 @@ class EvalCallBack(Callback): - def __init__(self, dataset, test_net, train_net, detection, config, start_epoch=0, interval=1): + def __init__(self, dataset, test_net, detection, config, interval=1): self.dataset = dataset - self.network = train_net self.test_network = test_net self.detection = detection self.logger = config.logger - self.start_epoch = start_epoch - self.interval = interval + self.start_epoch = config.start_epoch self.max_epoch = config.max_epoch + self.use_ema = config.use_ema + self.train_epoch = config.train_epoch + self.save_ckpt_path = config.save_ckpt_dir + self.rank = config.rank + self.resume_yolox = config.resume_yolox + self.interval = interval self.best_result = 0 self.best_epoch = 0 - self.rank = config.rank - def load_ema_parameter(self): + def
load_ema_parameter(self, network): param_dict = {} - for name, param in self.network.parameters_and_names(): + for name, param in network.parameters_and_names(): if name.startswith("ema."): new_name = name.split('ema.')[-1] param_new = param.clone() @@ -480,31 +460,42 @@ param_dict[new_name] = param_new load_param_into_net(self.test_network, param_dict) - def load_network_parameter(self): + def load_network_parameter(self, network): param_dict = {} - for name, param in self.network.parameters_and_names(): + for name, param in network.parameters_and_names(): if name.startswith("network."): param_new = param.clone() param_dict[name] = param_new load_param_into_net(self.test_network, param_dict) + def begin(self, run_context): + best_ckpt_path = os.path.join(self.save_ckpt_path, 'best.ckpt') + if os.path.exists(best_ckpt_path) and self.resume_yolox: + param_dict = load_checkpoint(best_ckpt_path) + self.best_result = param_dict['best_result'].asnumpy().item() + self.best_epoch = param_dict['best_epoch'].asnumpy().item() + self.logger.info('cur best result %s at epoch %s' % (self.best_result, self.best_epoch)) + def epoch_end(self, run_context): cb_param = run_context.original_args() cur_epoch = cb_param.cur_epoch_num - if cur_epoch >= self.start_epoch: - if (cur_epoch - self.start_epoch) % self.interval == 0 or cur_epoch == self.max_epoch: - self.load_network_parameter() - self.test_network.set_train(False) - eval_print_str, results = self.inference() - if results >= self.best_result: - self.best_result = results - self.best_epoch = cur_epoch - if os.path.exists('best.ckpt'): - self.remove_ckpoint_file('best.ckpt') - save_checkpoint(cb_param.train_network, 'best.ckpt') - self.logger.info("Best result %s at %s epoch" % (self.best_result, self.best_epoch)) - self.logger.info(eval_print_str) - self.logger.info('Ending inference...') + if cur_epoch % self.interval == 0 or cur_epoch == self.start_epoch + self.train_epoch: + if self.use_ema: + self.load_ema_parameter(cb_param.train_network) + else: + self.load_network_parameter(cb_param.train_network) + self.test_network.set_train(False) + eval_print_str, results = self.inference() + if results >= self.best_result: + self.best_result = results + self.best_epoch = cur_epoch + best_ckpt = os.path.join(self.save_ckpt_path, 'best.ckpt') + if os.path.exists(best_ckpt): + self.remove_ckpoint_file(best_ckpt) + self.save_best_checkpoint(cb_param.train_network) + self.logger.info("Best result %s at %s epoch" % (self.best_result, self.best_epoch)) + self.logger.info(eval_print_str) + self.logger.info('Ending inference...') def end(self, run_context): self.logger.info("Best result %s at %s epoch" % (self.best_result, self.best_epoch)) @@ -544,6 +534,13 @@ except ValueError: self.logger.info("ValueError, failed to remove the older ckpt file %s.", file_name) + def save_best_checkpoint(self, net): + param_list = [{'name': 'best_result', 'data': Parameter(self.best_result)}, + {'name': 'best_epoch', 'data': Parameter(self.best_epoch)}] + for name, param in net.parameters_and_names(): + param_list.append({'name': name, 'data': param}) + save_checkpoint(param_list, os.path.join(self.save_ckpt_path, 'best.ckpt')) + class Redirct: def __init__(self): @@ -733,3 +730,29 @@ class DetectionEngine: cocoEval.summarize() sys.stdout = stdout return rdct.content, cocoEval.stats[0] + + +def get_specified(): + res = [ + 'network.backbone.backbone.stem.0.conv.weight', + 'network.backbone.backbone.dark2.0.conv.weight', + 'network.backbone.backbone.dark3.0.conv.weight', +
'network.backbone.backbone.dark3.8.layer2.conv.weight', + 'network.backbone.backbone.dark4.0.conv.weight', + 'network.backbone.backbone.dark4.8.layer2.conv.weight', + 'network.backbone.backbone.dark5.0.conv.weight', + 'network.backbone.backbone.dark5.7.conv1.conv.weight', + 'network.backbone.backbone.dark5.7.conv2.conv.weight', + 'network.backbone.backbone.dark5.9.conv.weight', + 'network.head_l.cls_preds.weight', + 'network.head_m.cls_preds.weight', + 'network.head_s.cls_preds.weight', + 'network.head_l.reg_preds.weight', + 'network.head_m.reg_preds.weight', + 'network.head_s.reg_preds.weight', + 'network.head_l.obj_preds.weight', + 'network.head_m.obj_preds.weight', + 'network.head_s.obj_preds.weight', + ] + + return res diff --git a/research/cv/yolox/src/yolox.py b/research/cv/yolox/src/yolox.py index 48455b148c99b81ca876e458f85cec416ef4d0d8..93d3d18cea834b45190906975b757eca43536436 100644 --- a/research/cv/yolox/src/yolox.py +++ b/research/cv/yolox/src/yolox.py @@ -19,7 +19,6 @@ import mindspore.nn as nn from mindspore import Tensor, Parameter from mindspore import ops from mindspore.common.parameter import ParameterTuple -from mindspore.ops import composite as C from mindspore.ops import functional as F from mindspore.ops import operations as P @@ -234,6 +233,8 @@ class YOLOLossCell(nn.Cell): self.grids = [(config.input_size[0] // _stride) * (config.input_size[1] // _stride) for _stride in config.fpn_strides] self.use_l1 = config.use_l1 + self.use_summary = config.use_summary + self.summary = ops.ScalarSummary() def construct(self, img, labels=None, pre_fg_mask=None, is_inbox_and_incenter=None): """ forward with loss return """ @@ -256,11 +257,11 @@ class YOLOLossCell(nn.Cell): gt_classes_ = self.one_hot(gt_classes, self.depth, self.on_value, self.off_value) gt_classes_expaned = ops.repeat_elements(self.unsqueeze(gt_classes_, 2), rep=total_num_anchors, axis=2) gt_classes_expaned = F.stop_gradient(gt_classes_expaned) - cls_preds_ = P.Sigmoid()(ops.repeat_elements(self.unsqueeze(cls_preds, 1), rep=gt_max, axis=1)) * \ P.Sigmoid()( ops.repeat_elements(self.unsqueeze(obj_preds, 1), rep=gt_max, axis=1) ) + pair_wise_cls_loss = P.ReduceSum()( P.BinaryCrossEntropy(reduction="none")(P.Sqrt()(cls_preds_), gt_classes_expaned, None), -1) pair_wise_cls_loss = pair_wise_cls_loss * pre_fg_mask @@ -333,7 +334,18 @@ class YOLOLossCell(nn.Cell): loss_cls = P.ReduceSum()(self.bce_loss(cls_preds, cls_target), -1) * obj_target loss_cls = P.ReduceSum()(loss_cls) - loss_all = (5 * loss_iou + loss_cls + loss_obj + loss_l1) / (P.ReduceSum()(obj_target) + 1e-3) + + num_fg_mask = P.ReduceSum()(obj_target) == 0 + num_fg = (num_fg_mask == 0) * P.ReduceSum()(obj_target) + 1.0 * num_fg_mask + loss_all = (5 * loss_iou + loss_cls + loss_obj + loss_l1) / num_fg + + if self.use_summary: + self.summary('num_fg', num_fg) + self.summary('loss_iou', loss_iou * 5 / num_fg) + self.summary('loss_cls', loss_cls / num_fg) + self.summary('loss_obj', loss_obj / num_fg) + self.summary('loss_l1', loss_l1 / num_fg) + return loss_all def get_l1_format_single(self, reg_target, stride, eps): @@ -389,52 +401,31 @@ class IOUloss(nn.Cell): return loss -grad_scale = C.MultitypeFuncGraph("grad_scale") -reciprocal = P.Reciprocal() - - -@grad_scale.register("Tensor", "Tensor") -def tensor_grad_scale(scale, grad): - return grad * reciprocal(scale) - - -_grad_overflow = C.MultitypeFuncGraph("_grad_overflow") -grad_overflow = P.FloatStatus() - - -@_grad_overflow.register("Tensor") -def _tensor_grad_overflow(grad): - return grad_overflow(grad) - 
- -class TrainOneStepWithEMA(nn.TrainOneStepWithLossScaleCell): +class TrainOneStepWithEMA(nn.TrainOneStepCell): """ Train one step with ema model """ - def __init__(self, network, optimizer, scale_sense, ema=True, decay=0.9998, updates=0, moving_name=None, - ema_moving_weight=None): - super(TrainOneStepWithEMA, self).__init__(network, optimizer, scale_sense) + def __init__(self, network, optimizer, dataset_size, sens=1.0, ema=True, decay=0.9998, updates=0): + super(TrainOneStepWithEMA, self).__init__(network, optimizer, sens=sens) + self.dataset_size = dataset_size self.ema = ema - self.moving_name = moving_name - self.ema_moving_weight = ema_moving_weight + self.decay = decay + self.updates = Parameter(Tensor(updates, mindspore.float32)) if self.ema: self.ema_weight = self.weights.clone("ema", init='same') - self.decay = decay - self.updates = Parameter(Tensor(updates, mindspore.float32)) + self.moving_parameter = list() + self.ema_moving_parameter = list() self.assign = ops.Assign() - self.ema_moving_parameters() + self.get_moving_parameters() - def ema_moving_parameters(self): - self.moving_name = {} - moving_list = [] - idx = 0 + def get_moving_parameters(self): for key, param in self.network.parameters_and_names(): if "moving_mean" in key or "moving_variance" in key: new_param = param.clone() new_param.name = "ema." + param.name - moving_list.append(new_param) - self.moving_name["ema." + key] = idx - idx += 1 - self.ema_moving_weight = ParameterTuple(moving_list) + self.moving_parameter.append(param) + self.ema_moving_parameter.append(new_param) + self.moving_parameter = ParameterTuple(self.moving_parameter) + self.ema_moving_parameter = ParameterTuple(self.ema_moving_parameter) def ema_update(self): """Update EMA parameters.""" @@ -445,40 +436,19 @@ class TrainOneStepWithEMA(nn.TrainOneStepWithLossScaleCell): for ema_v, weight in zip(self.ema_weight, self.weights): tep_v = ema_v * d self.assign(ema_v, (1.0 - d) * weight + tep_v) - return self.updates - # moving_parameter_update is executed inside the callback(EMACallBack) - def moving_parameter_update(self): - if self.ema: - d = (self.decay * (1 - ops.Exp()(-self.updates / 2000))).asnumpy().item() - # update moving mean and moving var - for key, param in self.network.parameters_and_names(): - if "moving_mean" in key or "moving_variance" in key: - idx = self.moving_name["ema." 
+ key] - moving_weight = param.asnumpy() - tep_v = self.ema_moving_weight[idx] * d - ema_value = (1.0 - d) * moving_weight + tep_v - self.ema_moving_weight[idx] = ema_value + for ema_moving, moving in zip(self.ema_moving_parameter, self.moving_parameter): + tep_m = ema_moving * d + self.assign(ema_moving, (1.0 - d) * moving + tep_m) + return self.updates def construct(self, *inputs): - """ Forward """ - weights = self.weights loss = self.network(*inputs) - scaling_sens = self.scale_sense - - status, scaling_sens = self.start_overflow_check(loss, scaling_sens) - - scaling_sens_filled = C.ones_like(loss) * F.cast(scaling_sens, F.dtype(loss)) - grads = self.grad(self.network, weights)(*inputs, scaling_sens_filled) - grads = self.hyper_map(F.partial(grad_scale, scaling_sens), grads) - # apply grad reducer on grads + sens = F.fill(loss.dtype, loss.shape, self.sens) + grads = self.grad(self.network, self.weights)(*inputs, sens) grads = self.grad_reducer(grads) - self.ema_update() - - # get the overflow buffer - cond = self.get_overflow_status(status, grads) - overflow = self.process_loss_scale(cond) - # if there is no overflow, do optimize - if not overflow: - loss = F.depend(loss, self.optimizer(grads)) - return loss, cond, scaling_sens + loss = F.depend(loss, self.optimizer(grads)) + if self.ema: + self.ema_update() + + return loss diff --git a/research/cv/yolox/src/yolox_dataset.py b/research/cv/yolox/src/yolox_dataset.py index aac2255ca16b54bb69ba826bb1bf6ced8be2bd26..640bb0e5b69cd73211a47d8670490e2811e5d7d8 100644 --- a/research/cv/yolox/src/yolox_dataset.py +++ b/research/cv/yolox/src/yolox_dataset.py @@ -22,7 +22,7 @@ import cv2 import mindspore.dataset as de from pycocotools.coco import COCO -from src.transform import box_candidates, random_affine, TrainTransform, ValTransform +from src.transform import random_affine, TrainTransform, ValTransform min_keypoints_per_image = 10 @@ -101,7 +101,7 @@ class COCOYoloXDataset: self.enable_mosaic = enable_mosaic self.degrees = 10.0 self.translate = 0.1 - self.scale = (0.5, 1.5) + self.scale = (0.1, 2.0) self.mixup_scale = (0.5, 1.5) self.shear = 2.0 self.perspective = 0.0 @@ -111,7 +111,7 @@ class COCOYoloXDataset: if remove_images_without_annotations: img_ids = [] for img_id in self.img_ids: - ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=None) + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) anno = self.coco.loadAnns(ann_ids) if has_valid_annotation(anno): img_ids.append(img_id) @@ -201,13 +201,13 @@ class COCOYoloXDataset: return img, label, pre_fg_mask, is_inbox_and_incenter def mixup(self, origin_img, origin_labels, input_dim): """ Mixup data augment """ jit_factor = random.uniform(*self.mixup_scale) FLIP = random.uniform(0, 1) > 0.5 - cp_labels = np.empty(0) - while not cp_labels.size: + cp_labels = [] + while not len(cp_labels): cp_index = random.randint(0, self.__len__() - 1) cp_labels, _, _ = self.load_anno_from_ids(cp_index) img, cp_labels, _, _ = self.pull_item(cp_index) if len(img.shape) == 3: @@ -222,7 +221,9 @@ interpolation=cv2.INTER_LINEAR, ) - cp_img[: int(img.shape[0] * cp_scale_ratio), : int(img.shape[1] * cp_scale_ratio)] = resized_img + cp_img[ + : int(img.shape[0] * cp_scale_ratio), : int(img.shape[1] * cp_scale_ratio) + ] = resized_img cp_img = cv2.resize( cp_img, @@ -245,13 +246,17 @@ y_offset = random.randint(0, padded_img.shape[0] - target_h - 1) if padded_img.shape[1] > target_w: x_offset = random.randint(0,
padded_img.shape[1] - target_w - 1) - padded_cropped_img = padded_img[y_offset: y_offset + target_h, x_offset: x_offset + target_w] + padded_cropped_img = padded_img[ + y_offset: y_offset + target_h, x_offset: x_offset + target_w + ] cp_bboxes_origin_np = adjust_box_anns( cp_labels[:, :4].copy(), cp_scale_ratio, 0, 0, origin_w, origin_h ) if FLIP: - cp_bboxes_origin_np[:, 0::2] = (origin_w - cp_bboxes_origin_np[:, 0::2][:, ::-1]) + cp_bboxes_origin_np[:, 0::2] = ( + origin_w - cp_bboxes_origin_np[:, 0::2][:, ::-1] + ) cp_bboxes_transformed_np = cp_bboxes_origin_np.copy() cp_bboxes_transformed_np[:, 0::2] = np.clip( cp_bboxes_transformed_np[:, 0::2] - x_offset, 0, target_w @@ -259,15 +264,13 @@ class COCOYoloXDataset: cp_bboxes_transformed_np[:, 1::2] = np.clip( cp_bboxes_transformed_np[:, 1::2] - y_offset, 0, target_h ) - keep_list = box_candidates(cp_bboxes_origin_np.T, cp_bboxes_transformed_np.T, 5) - - if keep_list.sum() >= 1.0: - cls_labels = cp_labels[keep_list, 4:5].copy() - box_labels = cp_bboxes_transformed_np[keep_list] - labels = np.hstack((box_labels, cls_labels)) - origin_labels = np.vstack((origin_labels, labels)) - origin_img = origin_img.astype(np.float32) - origin_img = 0.5 * origin_img + 0.5 * padded_cropped_img.astype(np.float32) + + cls_labels = cp_labels[:, 4:5].copy() + box_labels = cp_bboxes_transformed_np + labels = np.hstack((box_labels, cls_labels)) + origin_labels = np.vstack((origin_labels, labels)) + origin_img = origin_img.astype(np.float32) + origin_img = 0.5 * origin_img + 0.5 * padded_cropped_img.astype(np.float32) return origin_img.astype(np.uint8), origin_labels diff --git a/research/cv/yolox/train.py b/research/cv/yolox/train.py index 7f8fa73b5f3cab04fde9392cbe0a2bce18ff2925..ceb4e4aaa7653935114cbdfcf2137341f9eea98d 100644 --- a/research/cv/yolox/train.py +++ b/research/cv/yolox/train.py @@ -20,10 +20,9 @@ import argparse from mindspore.context import ParallelMode from mindspore.common import set_seed -from mindspore.common.parameter import ParameterTuple -from mindspore.train.callback import CheckpointConfig, ModelCheckpoint +from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, SummaryCollector from mindspore.communication.management import init, get_rank, get_group_size -from mindspore import context, Model, DynamicLossScaleManager, load_checkpoint, load_param_into_net +from mindspore import context, Model, load_checkpoint, load_param_into_net from mindspore.profiler.profiling import Profiler from mindspore.common.tensor import Tensor @@ -33,10 +32,12 @@ from model_utils.moxing_adapter import moxing_wrapper from src.initializer import default_recurisive_init from src.logger import get_logger from src.network_blocks import use_syc_bn -from src.util import get_param_groups, YOLOXCB, get_lr, load_backbone, EvalCallBack, DetectionEngine, EMACallBack +from src.util import get_specified, get_param_groups, YOLOXCB, get_lr, load_weights, EvalCallBack, DetectionEngine, \ + ResumeCallback from src.yolox import YOLOLossCell, TrainOneStepWithEMA, DetectionBlock from src.yolox_dataset import create_yolox_dataset + set_seed(888) @@ -49,10 +50,13 @@ def set_default(): else: config.data_root = os.path.join(config.data_dir, 'train2017') config.annFile = os.path.join(config.data_dir, 'annotations/instances_train2017.json') - outputs_dir = config.ckpt_path + outputs_dir = os.getcwd() + config.save_ckpt_dir = os.path.join(config.ckpt_dir, config.backbone) # logger config.outputs_dir = os.path.join(outputs_dir, 
datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S')) + config.max_epoch = config.aug_epochs + config.no_aug_epochs + config.train_epoch = config.aug_epochs config.logger = get_logger(config.outputs_dir, config.rank) config.logger.save_args(config) @@ -66,44 +70,42 @@ def set_graph_kernel_context(): "--enable_expand_ops=Conv2D") -def network_init(cfg): +def network_init(): """ Network init """ device_id = int(os.getenv('DEVICE_ID', '0')) context.set_context(mode=context.GRAPH_MODE, - device_target=cfg.device_target, save_graphs=cfg.save_graphs, device_id=device_id, - save_graphs_path="ir_path") + device_target=config.device_target, save_graphs=config.save_graphs, device_id=device_id, + save_graphs_path="ir_path", max_call_depth=2000) set_graph_kernel_context() profiler = None - if cfg.need_profiler: - profiling_dir = os.path.join(cfg.outputs_dir, + if config.need_profiler: + profiling_dir = os.path.join(config.outputs_dir, datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S')) profiler = Profiler(output_path=profiling_dir, is_detail=True, is_show_op_path=True) # init distributed - cfg.use_syc_bn = False - if cfg.is_distributed: - cfg.use_syc_bn = True + if config.is_distributed: init() - cfg.rank = get_rank() - cfg.group_size = get_group_size() + config.rank = get_rank() + config.group_size = get_group_size() context.reset_auto_parallel_context() context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, - device_num=cfg.group_size) + device_num=config.group_size) # select for master rank save ckpt or all rank save, compatible for model parallel - cfg.rank_save_ckpt_flag = 0 - if cfg.is_save_on_master: - if cfg.rank == 0: - cfg.rank_save_ckpt_flag = 1 + if config.is_save_on_master: + config.rank_save_ckpt_flag = 0 + if config.rank == 0: + config.rank_save_ckpt_flag = 1 else: - cfg.rank_save_ckpt_flag = 1 + config.rank_save_ckpt_flag = 1 # logger - cfg.outputs_dir = os.path.join(cfg.ckpt_path, - datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S')) - cfg.logger = get_logger(cfg.outputs_dir, cfg.rank) - cfg.logger.save_args(cfg) + config.outputs_dir = os.path.join(config.outputs_dir, + datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S')) + config.logger = get_logger(config.outputs_dir, config.rank) + config.logger.save_args(config) return profiler @@ -169,7 +173,7 @@ def modelarts_pre_process(): print("Device: {}, Finish sync unzip data from {} to {}.".format(get_device_id(), zip_file_1, save_dir_1)) - config.ckpt_path = os.path.join(config.output_path, config.ckpt_path) + config.ckpt_dir = os.path.join(config.output_path, config.ckpt_dir) def parser_init(): @@ -193,59 +197,27 @@ def get_val_dataset(): def get_optimizer(cfg, network, lr): - param_group = get_param_groups(network, cfg.weight_decay) if cfg.opt == "SGD": from mindspore.nn import SGD - opt = SGD(params=param_group, learning_rate=Tensor(lr), momentum=config.momentum, nesterov=True) + params = get_param_groups(network, cfg.weight_decay, use_group_params=False) + opt = SGD(params=params, learning_rate=Tensor(lr), momentum=config.momentum, weight_decay=config.weight_decay, + nesterov=True) cfg.logger.info("Use SGD Optimizer") else: from mindspore.nn import Momentum - opt = Momentum(params=param_group, - learning_rate=Tensor(lr), - momentum=cfg.momentum, - use_nesterov=True) + param_group = get_param_groups(network, cfg.weight_decay) + opt = Momentum(params=param_group, learning_rate=Tensor(lr),
momentum=cfg.momentum, use_nesterov=True) cfg.logger.info("Use Momentum Optimizer") return opt -def load_resume_checkpoint(cfg, network, ckpt_path): - param_dict = load_checkpoint(ckpt_path) - - ema_train_weight = [] - ema_moving_weight = [] - param_load = {} - for key, param in param_dict.items(): - if key.startswith("network.") or key.startswith("moments."): - param_load[key] = param - elif "updates" in key: - cfg.updates = param - network.updates = cfg.updates - config.logger.info("network_ema updates:%s" % network.updates.asnumpy().item()) - load_param_into_net(network, param_load) - - for key, param in network.parameters_and_names(): - if key.startswith("ema.") and "moving_mean" not in key and "moving_variance" not in key: - ema_train_weight.append(param_dict[key]) - elif key.startswith("ema.") and ("moving_mean" in key or "moving_variance" in key): - ema_moving_weight.append(param_dict[key]) - - if network.ema: - if ema_train_weight and ema_moving_weight: - network.ema_weight = ParameterTuple(ema_train_weight) - network.ema_moving_weight = ParameterTuple(ema_moving_weight) - config.logger.info("successful loading ema weights") - - @moxing_wrapper(pre_process=modelarts_pre_process) def run_train(): """ Launch Train process """ parser = parser_init() args_opt, _ = parser.parse_known_args() set_default() - if not config.data_aug: # Train the last no data augment epochs - config.use_l1 = True # Add L1 loss - config.max_epoch = config.no_aug_epochs - config.lr_scheduler = "no_aug_lr" # fix the min lr for last no data aug epochs + if config.enable_modelarts: import moxing as mox local_data_url = os.path.join(config.data_path, str(config.rank)) @@ -254,81 +226,90 @@ config.data_dir = os.path.join(config.data_path, 'coco2017') mox.file.copy_parallel(config.annFile, local_annFile) config.annFile = os.path.join(local_data_url, 'instances_train2017.json') - profiler = network_init(config) + profiler = network_init() parallel_init(config) if config.backbone == "yolox_darknet53": backbone = "yolofpn" - else: + elif config.backbone == 'yolox_x': backbone = "yolopafpn" + else: + raise ValueError('backbone only support [yolox_darknet53, yolox_x]') base_network = DetectionBlock(config, backbone=backbone) - if config.pretrained: - base_network = load_backbone(base_network, config.pretrained, config) - config.logger.info('Training backbone is: %s' % config.backbone) - if config.use_syc_bn: + + # syc bn only support distributed training in graph mode + if config.use_syc_bn and config.is_distributed and context.get_context('mode') == context.GRAPH_MODE: config.logger.info("Using Synchronized batch norm layer...") use_syc_bn(base_network) default_recurisive_init(base_network) config.logger.info("Network weights have been initialized...") + if config.pretrained: + base_network = load_weights(base_network, config.pretrained) + config.logger.info('pretrained is: %s', config.pretrained) + config.logger.info('Training backbone is: %s' % config.backbone) + network = YOLOLossCell(base_network, config) config.logger.info('Finish getting network...') - config.data_root = os.path.join(config.data_dir, 'train2017') - config.annFile = os.path.join(config.data_dir, 'annotations/instances_train2017.json') + + if config.resume_yolox: + if not os.path.isfile(config.resume_yolox): + raise TypeError('resume_yolox should be checkpoint path') + resume_param = load_checkpoint(config.resume_yolox, filter_prefix=['learning_rate', 'global_step']) + resume_epoch = config.resume_yolox.split('-')[1].split('_')[0] +
config.start_epoch = int(resume_epoch) + if config.start_epoch >= config.aug_epochs: + config.data_aug = False + config.use_l1 = True + config.run_eval = True + config.eval_interval = 1 + config.ckpt_interval = 1 + config.train_epoch = config.max_epoch - config.start_epoch + else: + config.train_epoch = config.aug_epochs - config.start_epoch + config.logger.info('resume train from epoch: %s data_aug: %s' % (resume_epoch, config.data_aug)) + ds = create_yolox_dataset(image_dir=config.data_root, anno_path=config.annFile, batch_size=config.per_batch_size, device_num=config.group_size, rank=config.rank, data_aug=config.data_aug) ds_test = get_val_dataset() config.logger.info('Finish loading training dataset! batch size:%s' % config.per_batch_size) config.steps_per_epoch = ds.get_dataset_size() config.logger.info('%s steps for one epoch.' % config.steps_per_epoch) - if config.ckpt_interval <= 0: - config.ckpt_interval = 1 + lr = get_lr(config) config.logger.info("Learning rate scheduler:%s, base_lr:%s, min lr ratio:%s" % (config.lr_scheduler, config.lr, config.min_lr_ratio)) opt = get_optimizer(config, network, lr) - loss_scale_manager = DynamicLossScaleManager(init_loss_scale=2 ** 22) - update_cell = loss_scale_manager.get_update_cell() - network_ema = TrainOneStepWithEMA(network, opt, update_cell, - ema=True, decay=0.9998, updates=config.updates).set_train() + network_ema = TrainOneStepWithEMA(network, opt, config.steps_per_epoch, ema=config.use_ema, + decay=0.9998).set_train() if config.resume_yolox: - resume_steps = config.updates.asnumpy().items() - config.resume_epoch = resume_steps // config.steps_per_epoch - lr = lr[resume_steps:] - opt = get_optimizer(config, network, lr) - network_ema = TrainOneStepWithEMA(network, opt, update_cell, - ema=True, decay=0.9998, updates=resume_steps).set_train() - load_resume_checkpoint(config, network_ema, config.resume_yolox) - if not config.data_aug: - if os.path.isfile(config.yolox_no_aug_ckpt): # Loading the resume checkpoint for the last no data aug epochs - load_resume_checkpoint(config, network_ema, config.yolox_no_aug_ckpt) - config.logger.info("Finish load the resume checkpoint, begin to train the last...") - else: - raise FileNotFoundError('{} not exist or not a pre-trained file'.format(config.yolox_no_aug_ckpt)) + load_param_into_net(network_ema, resume_param) config.logger.info("Add ema model") - model = Model(network_ema, amp_level="O0") + model = Model(network_ema) + cb = [] - save_ckpt_path = None if config.rank_save_ckpt_flag: - cb.append(EMACallBack(network_ema, config.steps_per_epoch)) + if config.use_summary: + specified = {'collect_input_data': False, 'histogram_regular': '|'.join(get_specified())} + cb.append(SummaryCollector(summary_dir="./summary_dir", collect_freq=10, collect_specified_data=specified)) ckpt_config = CheckpointConfig(save_checkpoint_steps=config.steps_per_epoch * config.ckpt_interval, keep_checkpoint_max=config.ckpt_max_num) - save_ckpt_path = os.path.join(config.outputs_dir, 'ckpt_' + str(config.rank) + '/') - cb.append(ModelCheckpoint(config=ckpt_config, directory=save_ckpt_path, prefix='{}'.format(config.backbone))) - cb.append(YOLOXCB(config.logger, config.steps_per_epoch, lr=lr, save_ckpt_path=save_ckpt_path, - is_modelart=config.enable_modelarts, - per_print_times=config.log_interval, train_url=args_opt.train_url)) + cb.append( + ModelCheckpoint(config=ckpt_config, directory=config.save_ckpt_dir, prefix='{}'.format(config.backbone))) + if config.resume_yolox: + cb.append(ResumeCallback(config.start_epoch)) + 
cb.append(YOLOXCB(config, lr=lr, is_modelart=config.enable_modelarts, per_print_times=config.log_interval, + train_url=args_opt.train_url)) if config.run_eval: test_block = DetectionBlock(config, backbone=backbone) cb.append( - EvalCallBack(ds_test, test_block, network_ema, DetectionEngine(config), config, - interval=config.eval_interval)) + EvalCallBack(ds_test, test_block, DetectionEngine(config), config, interval=config.eval_interval)) if config.need_profiler: model.train(3, ds, callbacks=cb, dataset_sink_mode=True, sink_size=config.log_interval) profiler.analyse() else: - config.logger.info("Epoch number:%s" % config.max_epoch) - config.logger.info("All steps number:%s" % (config.max_epoch * config.steps_per_epoch)) + config.logger.info("Epoch number:%s" % config.train_epoch) + config.logger.info("All steps number:%s" % (config.train_epoch * config.steps_per_epoch)) config.logger.info("==================Start Training=========================") - model.train(config.max_epoch, ds, callbacks=cb, dataset_sink_mode=False, sink_size=-1) + model.train(config.train_epoch, ds, callbacks=cb, dataset_sink_mode=True, sink_size=-1) config.logger.info("==================Training END======================") diff --git a/research/cv/yolox/yolox_darknet53.yaml b/research/cv/yolox/yolox_darknet53.yaml index a9451b9403f2689bcc53c628e37a1b7f1d42795a..00c0410ba5a64c799297aa1780b40637a597a1f4 100644 --- a/research/cv/yolox/yolox_darknet53.yaml +++ b/research/cv/yolox/yolox_darknet53.yaml @@ -8,14 +8,16 @@ outputs_dir: "./" # Train option save_graphs: False lr_scheduler: "yolox_warm_cos_lr" -max_epoch: 285 -total_epoch: 300 -data_dir: "/home/work/user-job-dir/inputs/data/" -# last no data aug related -yolox_no_aug_ckpt: "" +aug_epochs: 285 +no_aug_epochs: 15 +data_dir: "" need_profiler: 0 -pretrained: '' +pretrained: '' # use abspath resume_yolox: '' +use_summary: False +use_ema: True +start_epoch: 0 + # data aug flip_prob: 0.5 hsv_prob: 1.0 @@ -25,9 +27,9 @@ per_batch_size: 8 # network configuration depth_wise: False -max_gt: 120 +max_gt: 70 num_classes: 80 -input_size: [640, 640] +input_size: [640, 640] # [h, w] fpn_strides: [8, 16, 32] use_l1: False use_syc_bn: True @@ -37,21 +39,20 @@ updates: 0.0 # dynamic_k n_candidate_k: 10 # optimizer and lr related -lr: 0.011 # 0.04 for yolox-x -min_lr_ratio: 0.001 +lr: 0.01 +min_lr_ratio: 0.05 warmup_epochs: 5 weight_decay: 0.0005 momentum: 0.9 -no_aug_epochs: 15 # logging related -log_interval: 30 -ckpt_interval: -1 +log_interval: 10 +ckpt_interval: 1 is_save_on_master: 1 -ckpt_max_num: 60 -opt: "Momentum" +ckpt_max_num: 10 +opt: "SGD" # distributed related -is_distributed: 1 +is_distributed: 0 rank: 0 group_size: 1 bind_cpu: True device_num: 8 @@ -70,7 +71,7 @@ checkpoint_url: "" data_path: "/home/work/user-job-dir/inputs/data/" output_path: "./" load_path: "/cache/checkpoint_path" -ckpt_path: './' +ckpt_dir: '../' # Eval option log_path: "val/outputs/" @@ -78,7 +79,7 @@ val_ckpt: "0-2755_64.ckpt" conf_thre: 0.001 nms_thre: 0.65 eval_interval: 10 -run_eval: False +run_eval: True # modelarts is_modelart: False result_path: '' diff --git a/research/cv/yolox/yolox_x.yaml b/research/cv/yolox/yolox_x.yaml index 4d45728f080ddf375d76d95f99fd257e534cf528..b7729056fab6903d6d09b317c3668df1949e9f9b 100644 --- a/research/cv/yolox/yolox_x.yaml +++ b/research/cv/yolox/yolox_x.yaml @@ -8,14 +8,16 @@ outputs_dir: "./" # Train option
save_graphs: False lr_scheduler: "yolox_warm_cos_lr" -max_epoch: 285 -total_epoch: 300 -data_dir: "/home/work/user-job-dir/inputs/data/" -# last no data aug related -yolox_no_aug_ckpt: "" +aug_epochs: 285 +no_aug_epochs: 15 +data_dir: "" need_profiler: 0 -pretrained: '' +pretrained: '' # use abspath resume_yolox: '' +use_summary: False +use_ema: True +start_epoch: 0 + # data aug flip_prob: 0.5 hsv_prob: 1.0 @@ -27,7 +29,7 @@ per_batch_size: 8 depth_wise: False max_gt: 120 num_classes: 80 -input_size: [640, 640] +input_size: [640, 640] # [h, w] fpn_strides: [8, 16, 32] use_l1: False use_syc_bn: True @@ -37,21 +39,20 @@ updates: 0.0 n_candidate_k: 10 # optimizer and lr related -lr: 0.04 # 0.04 for yolox-x +lr: 0.04 min_lr_ratio: 0.001 warmup_epochs: 5 weight_decay: 0.0005 momentum: 0.9 -no_aug_epochs: 15 # logging related -log_interval: 30 -ckpt_interval: -1 +log_interval: 10 +ckpt_interval: 1 is_save_on_master: 1 -ckpt_max_num: 60 +ckpt_max_num: 10 opt: "Momentum" # distributed related -is_distributed: 1 +is_distributed: 0 rank: 0 group_size: 1 bind_cpu: True @@ -70,7 +71,7 @@ checkpoint_url: "" data_path: "/home/work/user-job-dir/inputs/data/" output_path: "./" load_path: "/cache/checkpoint_path" -ckpt_path: './' +ckpt_dir: '../' # Eval option log_path: "val/outputs/" @@ -78,7 +79,7 @@ val_ckpt: "0-2755_64.ckpt" conf_thre: 0.001 nms_thre: 0.65 eval_interval: 10 -run_eval: False +run_eval: True # modelarts is_modelart: False result_path: ''
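A note on the `BaseConv`/`use_syc_bn` change in `network_blocks.py` above: MindSpore's `BatchNorm2d` applies `momentum` as the weight kept on the running statistic, the reverse of PyTorch's convention, so `momentum=0.97` with `eps=1e-3` corresponds to the reference PyTorch YOLOX setting of momentum 0.03 and eps 1e-3 (the same values the removed `default_recurisive_init` branch used to patch in after the fact). A minimal sketch of the update rule, plain Python with illustrative values:

```python
# MindSpore convention: running = momentum * running + (1 - momentum) * batch_stat
# PyTorch convention:   running = (1 - momentum) * running + momentum * batch_stat
# hence ms_momentum = 1 - torch_momentum
def update_running_stat(running, batch_stat, ms_momentum=0.97):
    return ms_momentum * running + (1.0 - ms_momentum) * batch_stat

print(update_running_stat(0.0, 1.0))  # ~0.03, i.e. PyTorch momentum 0.03
```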
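The consolidated `load_weights` helper in `src/util.py` now serves `eval.py`, `export.py` and the pretrained-weight path in `initializer.py`: checkpoints written by `TrainOneStepWithEMA` carry every weight twice, once under the `network.` prefix (live training copy) and once under `ema.network.` (shadow copy), and the helper prefers the EMA copy whenever one exists. A framework-free sketch of that key handling, with hypothetical parameter names standing in for a real MindSpore `param_dict`:

```python
# Stand-in for mindspore.load_checkpoint(ckpt_path); names are illustrative only.
checkpoint_param = {
    "network.backbone.stem.conv.weight": "train_copy",
    "ema.network.backbone.stem.conv.weight": "ema_copy",
    "learning_rate": "opt_state",  # optimizer state, skipped by the helper
}

ema_param_dict, param_dict = {}, {}
for name, value in checkpoint_param.items():
    if name.startswith("ema.network"):
        # strip the "ema." prefix so keys match the inference network
        ema_param_dict[name.split("ema.")[1]] = value
    elif name.startswith("network."):
        param_dict[name] = value

# prefer the EMA shadow weights when the checkpoint contains them
selected = ema_param_dict if ema_param_dict else param_dict
print(selected)  # {'network.backbone.stem.conv.weight': 'ema_copy'}
```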
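On the resume path, `train.py` recovers `start_epoch` from the checkpoint filename (MindSpore's `ModelCheckpoint` names files `prefix-epoch_step.ckpt`), and `yolox_warm_cos_lr` drops the first `start_epoch * steps_per_epoch` entries so the restored run continues at the right point of the schedule. Below is a standalone numpy sketch of that slicing; the quadratic warmup and the flat minimum LR over the final no-aug epochs are assumptions carried over from the reference YOLOX schedule, since the hunk above only shows the cosine segment:

```python
import math
import numpy as np

def yolox_warm_cos_lr(lr, steps_per_epoch, warmup_epochs, max_epoch,
                      start_epoch, no_aug_epochs,
                      warmup_lr_start=0.0, min_lr_ratio=0.05):
    """Warmup + cosine schedule, sliced for resume (sketch of src/util.py)."""
    base_lr, min_lr = lr, lr * min_lr_ratio
    total_iters = steps_per_epoch * max_epoch
    warmup_total_iters = steps_per_epoch * warmup_epochs
    no_aug_iter = steps_per_epoch * no_aug_epochs
    lr_each_step = []
    for i in range(total_iters):
        if i < warmup_total_iters:
            # assumed quadratic warmup, as in the reference YOLOX schedule
            cur = (base_lr - warmup_lr_start) * (i / float(warmup_total_iters)) ** 2 + warmup_lr_start
        elif i >= total_iters - no_aug_iter:
            cur = min_lr  # assumed flat minimum lr for the final no-aug epochs
        else:
            cur = min_lr + 0.5 * (base_lr - min_lr) * (1.0 + math.cos(
                math.pi * (i - warmup_total_iters) / (total_iters - warmup_total_iters - no_aug_iter)))
        lr_each_step.append(cur)
    # resume: drop the steps consumed before the checkpoint was written
    return np.array(lr_each_step[start_epoch * steps_per_epoch:], dtype=np.float32)

ckpt = "yolox_darknet53-5_1848.ckpt"                  # hypothetical checkpoint name
start_epoch = int(ckpt.split('-')[1].split('_')[0])   # -> 5
lr = yolox_warm_cos_lr(0.01, steps_per_epoch=1848, warmup_epochs=5,
                       max_epoch=300, start_epoch=start_epoch, no_aug_epochs=15)
print(lr.size)  # (300 - 5) * 1848 remaining entries
```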