Unverified commit cfb335bb, authored by i-robot, committed by Gitee

!2923 add overflow check in fasterrcnn

Merge pull request !2923 from zhaoting/faster_rcnn
parents 2b24802f e2f3b70a
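
This change replaces FasterRcnn's hand-rolled TrainOneStepCell with a subclass of nn.TrainOneStepWithLossScaleCell: each step now unscales gradients, checks for overflow, and skips the optimizer update when overflow is detected, so loss scaling moves out of the SGD/Adam optimizers. It also adds grad_clip and log_summary switches to the four backbone configs, lets a config supply its own feature_shapes, and tightens the MindRecord existence check.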
@@ -165,6 +165,8 @@ rank_id: 0
 image_dir: ''
 anno_path: ''
 backbone: 'resnet_v1.5_50'
+log_summary: False
+grad_clip: False
 
 # eval.py FasterRcnn evaluation
 checkpoint_path: "/cache/train/fasterrcnn/faster_rcnn-12_7393.ckpt"
@@ -165,6 +165,8 @@ rank_id: 0
 image_dir: ''
 anno_path: ''
 backbone: 'resnet_v1_101'
+log_summary: False
+grad_clip: False
 
 # eval.py FasterRcnn evaluation
 checkpoint_path: "/cache/train/fasterrcnn/faster_rcnn-12_7393.ckpt"
@@ -165,6 +165,8 @@ rank_id: 0
 image_dir: ''
 anno_path: ''
 backbone: 'resnet_v1_152'
+log_summary: False
+grad_clip: False
 
 # eval.py FasterRcnn evaluation
 checkpoint_path: "/cache/train/fasterrcnn/faster_rcnn-12_7393.ckpt"
@@ -165,6 +165,8 @@ rank_id: 0
 image_dir: ''
 anno_path: ''
 backbone: 'inception_resnet_v2'
+log_summary: False
+grad_clip: False
 
 # eval.py FasterRcnn evaluation
 checkpoint_path: "/cache/checkpoint_path/faster_rcnn-20_7393.ckpt"
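
Note: both new switches default to off, so all four backbone configs keep their previous behavior. Enabling the features is a one-line edit per key; a hypothetical override:

log_summary: True   # attach a SummaryCollector callback during training (see train.py below)
grad_clip: True     # clip gradients by global norm in the training wrapper (see below)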
@@ -121,13 +121,14 @@ def get_config():
     default, helper, choices = parse_yaml(path_args.config_path)
     args = parse_cli_to_yaml(parser=parser, cfg=default, helper=helper, choices=choices, cfg_path=path_args.config_path)
     default = Config(merge(args, default))
-    default.feature_shapes = [
-        [default.img_height // 4, default.img_width // 4],
-        [default.img_height // 8, default.img_width // 8],
-        [default.img_height // 16, default.img_width // 16],
-        [default.img_height // 32, default.img_width // 32],
-        [default.img_height // 64, default.img_width // 64],
-    ]
+    if not hasattr(default, "feature_shapes"):
+        default.feature_shapes = [
+            [default.img_height // 4, default.img_width // 4],
+            [default.img_height // 8, default.img_width // 8],
+            [default.img_height // 16, default.img_width // 16],
+            [default.img_height // 32, default.img_width // 32],
+            [default.img_height // 64, default.img_width // 64],
+        ]
     default.num_bboxes = default.num_anchors * sum([lst[0] * lst[1] for lst in default.feature_shapes])
     pprint(default)
     print("Please check the above information for the configurations", flush=True)
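
Note: the new hasattr guard means a yaml that defines feature_shapes itself is respected instead of being overwritten. A hypothetical entry, using the same values the fallback would derive for a 768x1280 input (img_height // 4 = 192, img_width // 4 = 320, and so on down to // 64):

img_height: 768
img_width: 1280
feature_shapes: [[192, 320], [96, 160], [48, 80], [24, 40], [12, 20]]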
@@ -15,12 +15,10 @@
 """FasterRcnn training network wrapper."""
 
 import time
-import mindspore.common.dtype as mstype
+import mindspore as ms
 import mindspore.ops as ops
 import mindspore.nn as nn
-from mindspore import ParameterTuple, Tensor
 from mindspore.train.callback import Callback
-from mindspore.nn.wrap.grad_reducer import DistributedGradReducer
 
 time_stamp_init = False
 time_stamp_first = 0
@@ -114,7 +112,15 @@ class WithLossCell(nn.Cell):
         return self._backbone
 
 
-class TrainOneStepCell(nn.Cell):
+_grad_scale = ops.MultitypeFuncGraph("grad_scale")
+
+
+@_grad_scale.register("Tensor", "Tensor")
+def tensor_grad_scale(scale, grad):
+    return grad * ops.cast(ops.Reciprocal()(scale), ops.dtype(grad))
+
+
+class TrainOneStepCell(nn.TrainOneStepWithLossScaleCell):
     """
     Network training package class.
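
Note: _grad_scale is a MultitypeFuncGraph, so tensor_grad_scale runs once per gradient when broadcast with hyper_map, multiplying each gradient by 1/scale to undo the loss scaling. A self-contained sketch of the same pattern (demo names, not part of the patch):

import mindspore as ms
import mindspore.ops as ops

_scale_demo = ops.MultitypeFuncGraph("scale_demo")

@_scale_demo.register("Tensor", "Tensor")
def _unscale(scale, grad):
    # divide one gradient by the loss scale, in the gradient's own dtype
    return grad * ops.cast(ops.Reciprocal()(scale), ops.dtype(grad))

scale = ms.Tensor(1024.0, ms.float32)
grads = (ms.Tensor([2048.0], ms.float32), ms.Tensor([512.0], ms.float32))
print(ops.HyperMap()(ops.partial(_scale_demo, scale), grads))  # ([2.0], [0.5])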
@@ -125,29 +131,33 @@ class TrainOneStepCell(nn.Cell):
         network (Cell): The training network.
         optimizer (Cell): Optimizer for updating the weights.
         sens (Number): The adjust parameter. Default value is 1.0.
-        reduce_flag (bool): The reduce flag. Default value is False.
-        mean (bool): Allreduce method. Default value is False.
-        degree (int): Device number. Default value is None.
+        grad_clip (bool): Whether clip gradients. Default value is False.
     """
 
-    def __init__(self, network, optimizer, sens=1.0, reduce_flag=False, mean=True, degree=None):
-        super(TrainOneStepCell, self).__init__(auto_prefix=False)
-        self.network = network
-        self.network.set_grad()
-        self.weights = ParameterTuple(network.trainable_params())
-        self.optimizer = optimizer
-        self.grad = ops.GradOperation(get_by_list=True,
-                                      sens_param=True)
-        self.sens = Tensor([sens,], mstype.float32)
-        self.reduce_flag = reduce_flag
-        if reduce_flag:
-            self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
+    def __init__(self, network, optimizer, scale_sense=1, grad_clip=False):
+        if isinstance(scale_sense, (int, float)):
+            scale_sense = ms.Tensor(scale_sense, ms.float32)
+        super(TrainOneStepCell, self).__init__(network, optimizer, scale_sense)
+        self.grad_clip = grad_clip
 
     def construct(self, x, img_shape, gt_bboxe, gt_label, gt_num):
         weights = self.weights
         loss = self.network(x, img_shape, gt_bboxe, gt_label, gt_num)
-        grads = self.grad(self.network, weights)(x, img_shape, gt_bboxe, gt_label, gt_num, self.sens)
-        if self.reduce_flag:
-            grads = self.grad_reducer(grads)
-        return ops.depend(loss, self.optimizer(grads))
+        scaling_sens = self.scale_sense
+        status, scaling_sens = self.start_overflow_check(loss, scaling_sens)
+        scaling_sens_filled = ops.ones_like(loss) * ops.cast(scaling_sens, ops.dtype(loss))
+        grads = self.grad(self.network, weights)(x, img_shape, gt_bboxe, gt_label, gt_num, scaling_sens_filled)
+        grads = self.hyper_map(ops.partial(_grad_scale, scaling_sens), grads)
+        # apply grad reducer on grads
+        grads = self.grad_reducer(grads)
+        # get the overflow buffer
+        cond = self.get_overflow_status(status, grads)
+        overflow = self.process_loss_scale(cond)
+        if self.grad_clip:
+            grads = ops.clip_by_global_norm(grads)
+        # if there is no overflow, do optimize
+        if not overflow:
+            loss = ops.depend(loss, self.optimizer(grads))
+        return loss
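
Note: the construct above follows nn.TrainOneStepWithLossScaleCell's overflow protocol: start_overflow_check clears the device overflow state before the backward pass, get_overflow_status reads it back once the gradients exist, and process_loss_scale updates a dynamic scale (or leaves a fixed one alone); the optimizer step is skipped whenever overflow is detected. A usage sketch (net_with_loss and opt as built in train.py below; the dynamic variant is an assumption, not used by this patch):

import mindspore.nn as nn

# Fixed loss scale: a plain number becomes a float32 Tensor in __init__.
train_net = TrainOneStepCell(net_with_loss, opt, scale_sense=config.loss_scale, grad_clip=True)

# Dynamic loss scale: the parent cell also accepts an update cell, which shrinks
# the scale on overflow and grows it again after scale_window clean steps.
manager = nn.DynamicLossScaleUpdateCell(loss_scale_value=2 ** 12, scale_factor=2, scale_window=1000)
train_net = TrainOneStepCell(net_with_loss, opt, scale_sense=manager)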
@@ -48,7 +48,7 @@ def train_fasterrcnn_():
     mindrecord_file = os.path.join(mindrecord_dir, prefix + "0")
 
     print("CHECKING MINDRECORD FILES ...")
-    if rank == 0 and not os.path.exists(mindrecord_file):
+    if rank == 0 and not os.path.exists(mindrecord_file + ".db"):
        if not os.path.isdir(mindrecord_dir):
            os.makedirs(mindrecord_dir)
        if config.dataset == "coco":
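
Note: checking for the ".db" companion is stricter than checking the data file alone: MindRecord's FileWriter produces both files, and the index is only finalized by commit(), so a conversion that crashed midway leaves no ".db" and the dataset is regenerated. A minimal writer sketch (hypothetical schema, not the repo's converter):

import numpy as np
from mindspore.mindrecord import FileWriter

writer = FileWriter(file_name="demo.mindrecord", shard_num=1)  # commit() also creates demo.mindrecord.db
writer.add_schema({"image": {"type": "bytes"},
                   "annotation": {"type": "int32", "shape": [-1, 6]}}, "demo schema")
writer.write_raw_data([{"image": b"\xff\xd8", "annotation": np.zeros((1, 6), np.int32)}])
writer.commit()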
@@ -181,25 +181,19 @@ def train_fasterrcnn():
         raise ValueError("Optimize type should be 'SGD' or 'Adam'")
 
     if config.opt_type.lower() == "sgd":
         opt = SGD(params=net.trainable_params(), learning_rate=lr, momentum=config.momentum,
-                  weight_decay=config.weight_decay, loss_scale=config.loss_scale)
+                  weight_decay=config.weight_decay)
     else:
-        opt = Adam(params=net.trainable_params(), learning_rate=lr,
-                   loss_scale=config.loss_scale, weight_decay=config.weight_decay)
+        opt = Adam(params=net.trainable_params(), learning_rate=lr, weight_decay=config.weight_decay)
 
     net_with_loss = WithLossCell(net, loss)
     print(f"[{rank}]", "\tDone!\n")
 
-    if config.run_distribute:
-        print(f"\n[{rank}]", "===> Run distributed training...\n")
-        net = TrainOneStepCell(net_with_loss, opt, sens=config.loss_scale, reduce_flag=True,
-                               mean=True, degree=device_num)
-    else:
-        print(f"\n[{rank}]", "===> Run single GPU training...\n")
-        net = TrainOneStepCell(net_with_loss, opt, sens=config.loss_scale)
+    net = TrainOneStepCell(net_with_loss, opt, scale_sense=config.loss_scale)
 
     print(f"\n[{rank}]", "===> Creating callbacks...")
-    summary_collector = SummaryCollector(summary_dir)
     time_cb = TimeMonitor(data_size=dataset_size)
     loss_cb = LossCallBack(per_print_times=dataset_size, rank_id=rank, lr=lr.asnumpy())
-    cb = [time_cb, loss_cb, summary_collector]
+    cb = [time_cb, loss_cb]
+    if config.log_summary:
+        summary_collector = SummaryCollector(summary_dir)
+        cb.append(summary_collector)
     print(f"[{rank}]", "\tDone!\n")
 
     print(f"\n[{rank}]", "===> Configurating checkpoint saving...")
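
Note: the distributed and single-device paths collapse into one call because the parent nn.TrainOneStepWithLossScaleCell sets up self.grad_reducer itself under a distributed context. With loss scaling and overflow handling now inside the wrapper, the rest of the script can stay unchanged; a sketch of the hand-off (assuming dataset, cb, and config.epoch_size as defined elsewhere in train.py):

from mindspore import Model

model = Model(net)  # net is the TrainOneStepCell built above
model.train(config.epoch_size, dataset, callbacks=cb, dataset_sink_mode=True)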