diff --git a/official/cv/faster_rcnn/default_config.yaml b/official/cv/faster_rcnn/default_config.yaml index 14ac161962ee00fe95fdefce3b1f3451a047be59..70a93068f1513a303ad4364f13f294a6c3645011 100644 --- a/official/cv/faster_rcnn/default_config.yaml +++ b/official/cv/faster_rcnn/default_config.yaml @@ -165,6 +165,8 @@ rank_id: 0 image_dir: '' anno_path: '' backbone: 'resnet_v1.5_50' +log_summary: False +grad_clip: False # eval.py FasterRcnn evaluation checkpoint_path: "/cache/train/fasterrcnn/faster_rcnn-12_7393.ckpt" diff --git a/official/cv/faster_rcnn/default_config_101.yaml b/official/cv/faster_rcnn/default_config_101.yaml index b755a4f6d86d695c2aa6a30263239510018b0716..b89854ea7b78b90ab3371c086339f6218b34a875 100644 --- a/official/cv/faster_rcnn/default_config_101.yaml +++ b/official/cv/faster_rcnn/default_config_101.yaml @@ -165,6 +165,8 @@ rank_id: 0 image_dir: '' anno_path: '' backbone: 'resnet_v1_101' +log_summary: False +grad_clip: False # eval.py FasterRcnn evaluation checkpoint_path: "/cache/train/fasterrcnn/faster_rcnn-12_7393.ckpt" diff --git a/official/cv/faster_rcnn/default_config_152.yaml b/official/cv/faster_rcnn/default_config_152.yaml index d0b7de7bf15d2b11d02341effa7e2504e5e9fb8a..0c97a00bd4bb9d3501443be67f25ac4c6fda4c27 100644 --- a/official/cv/faster_rcnn/default_config_152.yaml +++ b/official/cv/faster_rcnn/default_config_152.yaml @@ -165,6 +165,8 @@ rank_id: 0 image_dir: '' anno_path: '' backbone: 'resnet_v1_152' +log_summary: False +grad_clip: False # eval.py FasterRcnn evaluation checkpoint_path: "/cache/train/fasterrcnn/faster_rcnn-12_7393.ckpt" diff --git a/official/cv/faster_rcnn/default_config_InceptionResnetV2.yaml b/official/cv/faster_rcnn/default_config_InceptionResnetV2.yaml index af024f4188621589b2cd988c7a10d9b65f86c77b..f9dde2194f4bab45910a7223418b6a289d41290a 100644 --- a/official/cv/faster_rcnn/default_config_InceptionResnetV2.yaml +++ b/official/cv/faster_rcnn/default_config_InceptionResnetV2.yaml @@ -165,6 +165,8 @@ rank_id: 
0 image_dir: '' anno_path: '' backbone: 'inception_resnet_v2' +log_summary: False +grad_clip: False # eval.py FasterRcnn evaluation checkpoint_path: "/cache/checkpoint_path/faster_rcnn-20_7393.ckpt" diff --git a/official/cv/faster_rcnn/src/model_utils/config.py b/official/cv/faster_rcnn/src/model_utils/config.py index e5b56494f5acebf345fee79f4cc1aa5eea925dae..de7d85301adb6cb65020bf975f3358482c530e5d 100644 --- a/official/cv/faster_rcnn/src/model_utils/config.py +++ b/official/cv/faster_rcnn/src/model_utils/config.py @@ -121,13 +121,14 @@ def get_config(): default, helper, choices = parse_yaml(path_args.config_path) args = parse_cli_to_yaml(parser=parser, cfg=default, helper=helper, choices=choices, cfg_path=path_args.config_path) default = Config(merge(args, default)) - default.feature_shapes = [ - [default.img_height // 4, default.img_width // 4], - [default.img_height // 8, default.img_width // 8], - [default.img_height // 16, default.img_width // 16], - [default.img_height // 32, default.img_width // 32], - [default.img_height // 64, default.img_width // 64], - ] + if not hasattr(default, "feature_shapes"): + default.feature_shapes = [ + [default.img_height // 4, default.img_width // 4], + [default.img_height // 8, default.img_width // 8], + [default.img_height // 16, default.img_width // 16], + [default.img_height // 32, default.img_width // 32], + [default.img_height // 64, default.img_width // 64], + ] default.num_bboxes = default.num_anchors * sum([lst[0] * lst[1] for lst in default.feature_shapes]) pprint(default) print("Please check the above information for the configurations", flush=True) diff --git a/official/cv/faster_rcnn/src/network_define.py b/official/cv/faster_rcnn/src/network_define.py index 954e53cf7dfdeeaa3741217893c32a56e2403e27..d9beec5d8e56e43f9b4de258d33d7c29ffcee032 100644 --- a/official/cv/faster_rcnn/src/network_define.py +++ b/official/cv/faster_rcnn/src/network_define.py @@ -15,12 +15,10 @@ """FasterRcnn training network wrapper.""" 
import time -import mindspore.common.dtype as mstype +import mindspore as ms import mindspore.ops as ops import mindspore.nn as nn -from mindspore import ParameterTuple, Tensor from mindspore.train.callback import Callback -from mindspore.nn.wrap.grad_reducer import DistributedGradReducer time_stamp_init = False time_stamp_first = 0 @@ -114,7 +112,15 @@ class WithLossCell(nn.Cell): return self._backbone -class TrainOneStepCell(nn.Cell): +_grad_scale = ops.MultitypeFuncGraph("grad_scale") + + +@_grad_scale.register("Tensor", "Tensor") +def tensor_grad_scale(scale, grad): + return grad * ops.cast(ops.Reciprocal()(scale), ops.dtype(grad)) + + +class TrainOneStepCell(nn.TrainOneStepWithLossScaleCell): """ Network training package class. @@ -125,29 +131,33 @@ class TrainOneStepCell(nn.Cell): network (Cell): The training network. optimizer (Cell): Optimizer for updating the weights. sens (Number): The adjust parameter. Default value is 1.0. - reduce_flag (bool): The reduce flag. Default value is False. - mean (bool): Allreduce method. Default value is False. - degree (int): Device number. Default value is None. + grad_clip (bool): Whether clip gradients. Default value is False. 
""" - def __init__(self, network, optimizer, sens=1.0, reduce_flag=False, mean=True, degree=None): - super(TrainOneStepCell, self).__init__(auto_prefix=False) - self.network = network - self.network.set_grad() - self.weights = ParameterTuple(network.trainable_params()) - self.optimizer = optimizer - self.grad = ops.GradOperation(get_by_list=True, - sens_param=True) - self.sens = Tensor([sens,], mstype.float32) - self.reduce_flag = reduce_flag - if reduce_flag: - self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree) + def __init__(self, network, optimizer, scale_sense=1, grad_clip=False): + if isinstance(scale_sense, (int, float)): + scale_sense = ms.Tensor(scale_sense, ms.float32) + super(TrainOneStepCell, self).__init__(network, optimizer, scale_sense) + self.grad_clip = grad_clip def construct(self, x, img_shape, gt_bboxe, gt_label, gt_num): weights = self.weights loss = self.network(x, img_shape, gt_bboxe, gt_label, gt_num) - grads = self.grad(self.network, weights)(x, img_shape, gt_bboxe, gt_label, gt_num, self.sens) - if self.reduce_flag: - grads = self.grad_reducer(grads) - - return ops.depend(loss, self.optimizer(grads)) + scaling_sens = self.scale_sense + + status, scaling_sens = self.start_overflow_check(loss, scaling_sens) + + scaling_sens_filled = ops.ones_like(loss) * ops.cast(scaling_sens, ops.dtype(loss)) + grads = self.grad(self.network, weights)(x, img_shape, gt_bboxe, gt_label, gt_num, scaling_sens_filled) + grads = self.hyper_map(ops.partial(_grad_scale, scaling_sens), grads) + # apply grad reducer on grads + grads = self.grad_reducer(grads) + # get the overflow buffer + cond = self.get_overflow_status(status, grads) + overflow = self.process_loss_scale(cond) + if self.grad_clip: + grads = ops.clip_by_global_norm(grads) + # if there is no overflow, do optimize + if not overflow: + loss = ops.depend(loss, self.optimizer(grads)) + return loss diff --git a/official/cv/faster_rcnn/train.py b/official/cv/faster_rcnn/train.py 
index 627ad992e7312014d945b506f38e9eec360060d2..a1522027defb1e3c27ef397971931c3b1fef6637 100644 --- a/official/cv/faster_rcnn/train.py +++ b/official/cv/faster_rcnn/train.py @@ -48,7 +48,7 @@ def train_fasterrcnn_(): mindrecord_file = os.path.join(mindrecord_dir, prefix + "0") print("CHECKING MINDRECORD FILES ...") - if rank == 0 and not os.path.exists(mindrecord_file): + if rank == 0 and not os.path.exists(mindrecord_file + ".db"): if not os.path.isdir(mindrecord_dir): os.makedirs(mindrecord_dir) if config.dataset == "coco": @@ -181,25 +181,19 @@ def train_fasterrcnn(): raise ValueError("Optimize type should be 'SGD' or 'Adam'") if config.opt_type.lower() == "sgd": opt = SGD(params=net.trainable_params(), learning_rate=lr, momentum=config.momentum, - weight_decay=config.weight_decay, loss_scale=config.loss_scale) + weight_decay=config.weight_decay) else: - opt = Adam(params=net.trainable_params(), learning_rate=lr, - loss_scale=config.loss_scale, weight_decay=config.weight_decay) + opt = Adam(params=net.trainable_params(), learning_rate=lr, weight_decay=config.weight_decay) net_with_loss = WithLossCell(net, loss) print(f"[{rank}]", "\tDone!\n") - if config.run_distribute: - print(f"\n[{rank}]", "===> Run distributed training...\n") - net = TrainOneStepCell(net_with_loss, opt, sens=config.loss_scale, reduce_flag=True, - mean=True, degree=device_num) - else: - print(f"\n[{rank}]", "===> Run single GPU training...\n") - net = TrainOneStepCell(net_with_loss, opt, sens=config.loss_scale) - + net = TrainOneStepCell(net_with_loss, opt, scale_sense=config.loss_scale, grad_clip=config.grad_clip) print(f"\n[{rank}]", "===> Creating callbacks...") - summary_collector = SummaryCollector(summary_dir) time_cb = TimeMonitor(data_size=dataset_size) loss_cb = LossCallBack(per_print_times=dataset_size, rank_id=rank, lr=lr.asnumpy()) - cb = [time_cb, loss_cb, summary_collector] + cb = [time_cb, loss_cb] + if config.log_summary: + summary_collector = SummaryCollector(summary_dir) + cb.append(summary_collector) 
print(f"[{rank}]", "\tDone!\n") print(f"\n[{rank}]", "===> Configurating checkpoint saving...")