diff --git a/official/cv/faster_rcnn/default_config.yaml b/official/cv/faster_rcnn/default_config.yaml
index 14ac161962ee00fe95fdefce3b1f3451a047be59..70a93068f1513a303ad4364f13f294a6c3645011 100644
--- a/official/cv/faster_rcnn/default_config.yaml
+++ b/official/cv/faster_rcnn/default_config.yaml
@@ -165,6 +165,8 @@ rank_id: 0
image_dir: ''
anno_path: ''
backbone: 'resnet_v1.5_50'
+log_summary: False
+grad_clip: False
# eval.py FasterRcnn evaluation
checkpoint_path: "/cache/train/fasterrcnn/faster_rcnn-12_7393.ckpt"
diff --git a/official/cv/faster_rcnn/default_config_101.yaml b/official/cv/faster_rcnn/default_config_101.yaml
index b755a4f6d86d695c2aa6a30263239510018b0716..b89854ea7b78b90ab3371c086339f6218b34a875 100644
--- a/official/cv/faster_rcnn/default_config_101.yaml
+++ b/official/cv/faster_rcnn/default_config_101.yaml
@@ -165,6 +165,8 @@ rank_id: 0
image_dir: ''
anno_path: ''
backbone: 'resnet_v1_101'
+log_summary: False
+grad_clip: False
# eval.py FasterRcnn evaluation
checkpoint_path: "/cache/train/fasterrcnn/faster_rcnn-12_7393.ckpt"
diff --git a/official/cv/faster_rcnn/default_config_152.yaml b/official/cv/faster_rcnn/default_config_152.yaml
index d0b7de7bf15d2b11d02341effa7e2504e5e9fb8a..0c97a00bd4bb9d3501443be67f25ac4c6fda4c27 100644
--- a/official/cv/faster_rcnn/default_config_152.yaml
+++ b/official/cv/faster_rcnn/default_config_152.yaml
@@ -165,6 +165,8 @@ rank_id: 0
image_dir: ''
anno_path: ''
backbone: 'resnet_v1_152'
+log_summary: False
+grad_clip: False
# eval.py FasterRcnn evaluation
checkpoint_path: "/cache/train/fasterrcnn/faster_rcnn-12_7393.ckpt"
diff --git a/official/cv/faster_rcnn/default_config_InceptionResnetV2.yaml b/official/cv/faster_rcnn/default_config_InceptionResnetV2.yaml
index af024f4188621589b2cd988c7a10d9b65f86c77b..f9dde2194f4bab45910a7223418b6a289d41290a 100644
--- a/official/cv/faster_rcnn/default_config_InceptionResnetV2.yaml
+++ b/official/cv/faster_rcnn/default_config_InceptionResnetV2.yaml
@@ -165,6 +165,8 @@ rank_id: 0
image_dir: ''
anno_path: ''
backbone: 'inception_resnet_v2'
+log_summary: False
+grad_clip: False
# eval.py FasterRcnn evaluation
checkpoint_path: "/cache/checkpoint_path/faster_rcnn-20_7393.ckpt"
diff --git a/official/cv/faster_rcnn/src/model_utils/config.py b/official/cv/faster_rcnn/src/model_utils/config.py
index e5b56494f5acebf345fee79f4cc1aa5eea925dae..de7d85301adb6cb65020bf975f3358482c530e5d 100644
--- a/official/cv/faster_rcnn/src/model_utils/config.py
+++ b/official/cv/faster_rcnn/src/model_utils/config.py
@@ -121,13 +121,14 @@ def get_config():
default, helper, choices = parse_yaml(path_args.config_path)
args = parse_cli_to_yaml(parser=parser, cfg=default, helper=helper, choices=choices, cfg_path=path_args.config_path)
default = Config(merge(args, default))
- default.feature_shapes = [
- [default.img_height // 4, default.img_width // 4],
- [default.img_height // 8, default.img_width // 8],
- [default.img_height // 16, default.img_width // 16],
- [default.img_height // 32, default.img_width // 32],
- [default.img_height // 64, default.img_width // 64],
- ]
+ if not hasattr(default, "feature_shapes"):
+ default.feature_shapes = [
+ [default.img_height // 4, default.img_width // 4],
+ [default.img_height // 8, default.img_width // 8],
+ [default.img_height // 16, default.img_width // 16],
+ [default.img_height // 32, default.img_width // 32],
+ [default.img_height // 64, default.img_width // 64],
+ ]
default.num_bboxes = default.num_anchors * sum([lst[0] * lst[1] for lst in default.feature_shapes])
pprint(default)
print("Please check the above information for the configurations", flush=True)
diff --git a/official/cv/faster_rcnn/src/network_define.py b/official/cv/faster_rcnn/src/network_define.py
index 954e53cf7dfdeeaa3741217893c32a56e2403e27..d9beec5d8e56e43f9b4de258d33d7c29ffcee032 100644
--- a/official/cv/faster_rcnn/src/network_define.py
+++ b/official/cv/faster_rcnn/src/network_define.py
@@ -15,12 +15,10 @@
"""FasterRcnn training network wrapper."""
import time
-import mindspore.common.dtype as mstype
+import mindspore as ms
import mindspore.ops as ops
import mindspore.nn as nn
-from mindspore import ParameterTuple, Tensor
from mindspore.train.callback import Callback
-from mindspore.nn.wrap.grad_reducer import DistributedGradReducer
time_stamp_init = False
time_stamp_first = 0
@@ -114,7 +112,15 @@ class WithLossCell(nn.Cell):
return self._backbone
-class TrainOneStepCell(nn.Cell):
+_grad_scale = ops.MultitypeFuncGraph("grad_scale")
+
+
+@_grad_scale.register("Tensor", "Tensor")
+def tensor_grad_scale(scale, grad):
+ return grad * ops.cast(ops.Reciprocal()(scale), ops.dtype(grad))
+
+
+class TrainOneStepCell(nn.TrainOneStepWithLossScaleCell):
"""
Network training package class.
@@ -125,29 +131,33 @@ class TrainOneStepCell(nn.Cell):
network (Cell): The training network.
optimizer (Cell): Optimizer for updating the weights.
sens (Number): The adjust parameter. Default value is 1.0.
- reduce_flag (bool): The reduce flag. Default value is False.
- mean (bool): Allreduce method. Default value is False.
- degree (int): Device number. Default value is None.
+ grad_clip (bool): Whether clip gradients. Default value is False.
"""
- def __init__(self, network, optimizer, sens=1.0, reduce_flag=False, mean=True, degree=None):
- super(TrainOneStepCell, self).__init__(auto_prefix=False)
- self.network = network
- self.network.set_grad()
- self.weights = ParameterTuple(network.trainable_params())
- self.optimizer = optimizer
- self.grad = ops.GradOperation(get_by_list=True,
- sens_param=True)
- self.sens = Tensor([sens,], mstype.float32)
- self.reduce_flag = reduce_flag
- if reduce_flag:
- self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
+ def __init__(self, network, optimizer, scale_sense=1, grad_clip=False):
+ if isinstance(scale_sense, (int, float)):
+ scale_sense = ms.Tensor(scale_sense, ms.float32)
+ super(TrainOneStepCell, self).__init__(network, optimizer, scale_sense)
+ self.grad_clip = grad_clip
def construct(self, x, img_shape, gt_bboxe, gt_label, gt_num):
weights = self.weights
loss = self.network(x, img_shape, gt_bboxe, gt_label, gt_num)
- grads = self.grad(self.network, weights)(x, img_shape, gt_bboxe, gt_label, gt_num, self.sens)
- if self.reduce_flag:
- grads = self.grad_reducer(grads)
-
- return ops.depend(loss, self.optimizer(grads))
+ scaling_sens = self.scale_sense
+
+ status, scaling_sens = self.start_overflow_check(loss, scaling_sens)
+
+ scaling_sens_filled = ops.ones_like(loss) * ops.cast(scaling_sens, ops.dtype(loss))
+ grads = self.grad(self.network, weights)(x, img_shape, gt_bboxe, gt_label, gt_num, scaling_sens_filled)
+ grads = self.hyper_map(ops.partial(_grad_scale, scaling_sens), grads)
+ # apply grad reducer on grads
+ grads = self.grad_reducer(grads)
+ # get the overflow buffer
+ cond = self.get_overflow_status(status, grads)
+ overflow = self.process_loss_scale(cond)
+ if self.grad_clip:
+ grads = ops.clip_by_global_norm(grads)
+ # if there is no overflow, do optimize
+ if not overflow:
+ loss = ops.depend(loss, self.optimizer(grads))
+ return loss
diff --git a/official/cv/faster_rcnn/train.py b/official/cv/faster_rcnn/train.py
index 627ad992e7312014d945b506f38e9eec360060d2..a1522027defb1e3c27ef397971931c3b1fef6637 100644
--- a/official/cv/faster_rcnn/train.py
+++ b/official/cv/faster_rcnn/train.py
@@ -48,7 +48,7 @@ def train_fasterrcnn_():
mindrecord_file = os.path.join(mindrecord_dir, prefix + "0")
print("CHECKING MINDRECORD FILES ...")
- if rank == 0 and not os.path.exists(mindrecord_file):
+ if rank == 0 and not os.path.exists(mindrecord_file + ".db"):
if not os.path.isdir(mindrecord_dir):
os.makedirs(mindrecord_dir)
if config.dataset == "coco":
@@ -181,25 +181,19 @@ def train_fasterrcnn():
raise ValueError("Optimize type should be 'SGD' or 'Adam'")
if config.opt_type.lower() == "sgd":
opt = SGD(params=net.trainable_params(), learning_rate=lr, momentum=config.momentum,
- weight_decay=config.weight_decay, loss_scale=config.loss_scale)
+ weight_decay=config.weight_decay)
else:
- opt = Adam(params=net.trainable_params(), learning_rate=lr,
- loss_scale=config.loss_scale, weight_decay=config.weight_decay)
+ opt = Adam(params=net.trainable_params(), learning_rate=lr, weight_decay=config.weight_decay)
net_with_loss = WithLossCell(net, loss)
print(f"[{rank}]", "\tDone!\n")
- if config.run_distribute:
- print(f"\n[{rank}]", "===> Run distributed training...\n")
- net = TrainOneStepCell(net_with_loss, opt, sens=config.loss_scale, reduce_flag=True,
- mean=True, degree=device_num)
- else:
- print(f"\n[{rank}]", "===> Run single GPU training...\n")
- net = TrainOneStepCell(net_with_loss, opt, sens=config.loss_scale)
-
+ net = TrainOneStepCell(net_with_loss, opt, scale_sense=config.loss_scale)
print(f"\n[{rank}]", "===> Creating callbacks...")
- summary_collector = SummaryCollector(summary_dir)
time_cb = TimeMonitor(data_size=dataset_size)
loss_cb = LossCallBack(per_print_times=dataset_size, rank_id=rank, lr=lr.asnumpy())
- cb = [time_cb, loss_cb, summary_collector]
+ cb = [time_cb, loss_cb]
+ if config.log_summary:
+ summary_collector = SummaryCollector(summary_dir)
+        cb.append(summary_collector)
print(f"[{rank}]", "\tDone!\n")
print(f"\n[{rank}]", "===> Configurating checkpoint saving...")