Commit e2f3b70a authored by zhaoting

add overflow check in fasterrcnn

parent 06c94563
@@ -165,6 +165,8 @@ rank_id: 0
 image_dir: ''
 anno_path: ''
 backbone: 'resnet_v1.5_50'
+log_summary: False
+grad_clip: False
 
 # eval.py FasterRcnn evaluation
 checkpoint_path: "/cache/train/fasterrcnn/faster_rcnn-12_7393.ckpt"
......
@@ -165,6 +165,8 @@ rank_id: 0
 image_dir: ''
 anno_path: ''
 backbone: 'resnet_v1_101'
+log_summary: False
+grad_clip: False
 
 # eval.py FasterRcnn evaluation
 checkpoint_path: "/cache/train/fasterrcnn/faster_rcnn-12_7393.ckpt"
......
@@ -165,6 +165,8 @@ rank_id: 0
 image_dir: ''
 anno_path: ''
 backbone: 'resnet_v1_152'
+log_summary: False
+grad_clip: False
 
 # eval.py FasterRcnn evaluation
 checkpoint_path: "/cache/train/fasterrcnn/faster_rcnn-12_7393.ckpt"
......
@@ -165,6 +165,8 @@ rank_id: 0
 image_dir: ''
 anno_path: ''
 backbone: 'inception_resnet_v2'
+log_summary: False
+grad_clip: False
 
 # eval.py FasterRcnn evaluation
 checkpoint_path: "/cache/checkpoint_path/faster_rcnn-20_7393.ckpt"
......
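Note: the same two keys are added to all four backbone configs, and both default to False, so summary logging and gradient clipping stay strictly opt-in and existing runs keep their behavior. A toy sketch of the opt-in pattern, using a hypothetical Config stand-in (the real wiring is in the train.py hunk further down):

# Toy illustration with a hypothetical Config stand-in: False-by-default
# flags preserve the previous behavior unless a user opts in.
class Config:
    log_summary = False  # collect training summaries only on request
    grad_clip = False    # clip gradients only on request

config = Config()
callbacks = ["time_cb", "loss_cb"]
if config.log_summary:
    callbacks.append("summary_collector")  # skipped by default
print(callbacks)  # ['time_cb', 'loss_cb']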
@@ -121,6 +121,7 @@ def get_config():
     default, helper, choices = parse_yaml(path_args.config_path)
     args = parse_cli_to_yaml(parser=parser, cfg=default, helper=helper, choices=choices, cfg_path=path_args.config_path)
     default = Config(merge(args, default))
-    default.feature_shapes = [
-        [default.img_height // 4, default.img_width // 4],
-        [default.img_height // 8, default.img_width // 8],
+    if not hasattr(default, "feature_shapes"):
+        default.feature_shapes = [
+            [default.img_height // 4, default.img_width // 4],
+            [default.img_height // 8, default.img_width // 8],
......
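The hasattr guard changes precedence: a feature_shapes entry supplied by the YAML config (or a CLI override) is now respected, and the stride-based default is computed only when the key is absent. A quick worked example of what that default evaluates to, using hypothetical image dimensions (only the stride-4 and stride-8 levels are visible in this hunk; the remaining pyramid levels are truncated out of the diff):

# Worked example with hypothetical sizes: img_height=768, img_width=1280
# give the per-level feature-map sizes below.
img_height, img_width = 768, 1280
print([[img_height // s, img_width // s] for s in (4, 8)])
# -> [[192, 320], [96, 160]]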
@@ -15,12 +15,10 @@
 """FasterRcnn training network wrapper."""
 import time
-import mindspore.common.dtype as mstype
+import mindspore as ms
 import mindspore.ops as ops
 import mindspore.nn as nn
-from mindspore import ParameterTuple, Tensor
 from mindspore.train.callback import Callback
-from mindspore.nn.wrap.grad_reducer import DistributedGradReducer
 
 
 time_stamp_init = False
 time_stamp_first = 0
@@ -114,7 +112,15 @@ class WithLossCell(nn.Cell):
         return self._backbone
 
 
-class TrainOneStepCell(nn.Cell):
+_grad_scale = ops.MultitypeFuncGraph("grad_scale")
+
+
+@_grad_scale.register("Tensor", "Tensor")
+def tensor_grad_scale(scale, grad):
+    return grad * ops.cast(ops.Reciprocal()(scale), ops.dtype(grad))
+
+
+class TrainOneStepCell(nn.TrainOneStepWithLossScaleCell):
     """
     Network training package class.
@@ -125,29 +131,33 @@ class TrainOneStepCell(nn.Cell):
         network (Cell): The training network.
         optimizer (Cell): Optimizer for updating the weights.
         sens (Number): The adjust parameter. Default value is 1.0.
-        reduce_flag (bool): The reduce flag. Default value is False.
-        mean (bool): Allreduce method. Default value is False.
-        degree (int): Device number. Default value is None.
+        grad_clip (bool): Whether to clip gradients. Default value is False.
     """
 
-    def __init__(self, network, optimizer, sens=1.0, reduce_flag=False, mean=True, degree=None):
-        super(TrainOneStepCell, self).__init__(auto_prefix=False)
-        self.network = network
-        self.network.set_grad()
-        self.weights = ParameterTuple(network.trainable_params())
-        self.optimizer = optimizer
-        self.grad = ops.GradOperation(get_by_list=True,
-                                      sens_param=True)
-        self.sens = Tensor([sens,], mstype.float32)
-        self.reduce_flag = reduce_flag
-        if reduce_flag:
-            self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
+    def __init__(self, network, optimizer, scale_sense=1, grad_clip=False):
+        if isinstance(scale_sense, (int, float)):
+            scale_sense = ms.Tensor(scale_sense, ms.float32)
+        super(TrainOneStepCell, self).__init__(network, optimizer, scale_sense)
+        self.grad_clip = grad_clip
 
     def construct(self, x, img_shape, gt_bboxe, gt_label, gt_num):
         weights = self.weights
         loss = self.network(x, img_shape, gt_bboxe, gt_label, gt_num)
-        grads = self.grad(self.network, weights)(x, img_shape, gt_bboxe, gt_label, gt_num, self.sens)
-        if self.reduce_flag:
-            grads = self.grad_reducer(grads)
-        return ops.depend(loss, self.optimizer(grads))
+        scaling_sens = self.scale_sense
+
+        status, scaling_sens = self.start_overflow_check(loss, scaling_sens)
+
+        scaling_sens_filled = ops.ones_like(loss) * ops.cast(scaling_sens, ops.dtype(loss))
+        grads = self.grad(self.network, weights)(x, img_shape, gt_bboxe, gt_label, gt_num, scaling_sens_filled)
+        grads = self.hyper_map(ops.partial(_grad_scale, scaling_sens), grads)
+        # apply grad reducer on grads
+        grads = self.grad_reducer(grads)
+        # get the overflow status
+        cond = self.get_overflow_status(status, grads)
+        overflow = self.process_loss_scale(cond)
+        if self.grad_clip:
+            grads = ops.clip_by_global_norm(grads)
+        # if there is no overflow, do optimize
+        if not overflow:
+            loss = ops.depend(loss, self.optimizer(grads))
+        return loss
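This is the heart of the commit: rebasing TrainOneStepCell onto nn.TrainOneStepWithLossScaleCell pulls in start_overflow_check, get_overflow_status and process_loss_scale from the parent, and the parent also builds grad_reducer itself (an identity on a single device, a DistributedGradReducer under data parallelism), which is why the old reduce_flag/mean/degree plumbing can be deleted. A hedged usage sketch; net_with_loss and opt stand for the objects built in train.py, and the dynamic variant is an assumption based on the parent cell accepting a loss-scale update cell:

import mindspore.nn as nn

# Fixed loss scale, matching the train.py hunk below: an int/float
# scale_sense is converted to a scalar float32 Tensor in __init__.
net = TrainOneStepCell(net_with_loss, opt, scale_sense=256, grad_clip=True)

# Assumed alternative: the parent cell also accepts an update cell, so a
# dynamic scale that shrinks on overflow and grows back after scale_window
# clean steps should drop in unchanged.
manager = nn.DynamicLossScaleUpdateCell(loss_scale_value=2**12,
                                        scale_factor=2,
                                        scale_window=1000)
net = TrainOneStepCell(net_with_loss, opt, scale_sense=manager)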
@@ -48,7 +48,7 @@ def train_fasterrcnn_():
     mindrecord_file = os.path.join(mindrecord_dir, prefix + "0")
     print("CHECKING MINDRECORD FILES ...")
 
-    if rank == 0 and not os.path.exists(mindrecord_file):
+    if rank == 0 and not os.path.exists(mindrecord_file + ".db"):
         if not os.path.isdir(mindrecord_dir):
             os.makedirs(mindrecord_dir)
         if config.dataset == "coco":
@@ -181,25 +181,19 @@ def train_fasterrcnn():
         raise ValueError("Optimize type should be 'SGD' or 'Adam'")
 
     if config.opt_type.lower() == "sgd":
         opt = SGD(params=net.trainable_params(), learning_rate=lr, momentum=config.momentum,
-                  weight_decay=config.weight_decay, loss_scale=config.loss_scale)
+                  weight_decay=config.weight_decay)
     else:
-        opt = Adam(params=net.trainable_params(), learning_rate=lr,
-                   loss_scale=config.loss_scale, weight_decay=config.weight_decay)
+        opt = Adam(params=net.trainable_params(), learning_rate=lr, weight_decay=config.weight_decay)
 
     net_with_loss = WithLossCell(net, loss)
     print(f"[{rank}]", "\tDone!\n")
 
-    if config.run_distribute:
-        print(f"\n[{rank}]", "===> Run distributed training...\n")
-        net = TrainOneStepCell(net_with_loss, opt, sens=config.loss_scale, reduce_flag=True,
-                               mean=True, degree=device_num)
-    else:
-        print(f"\n[{rank}]", "===> Run single GPU training...\n")
-        net = TrainOneStepCell(net_with_loss, opt, sens=config.loss_scale)
+    net = TrainOneStepCell(net_with_loss, opt, scale_sense=config.loss_scale)
 
     print(f"\n[{rank}]", "===> Creating callbacks...")
-    summary_collector = SummaryCollector(summary_dir)
     time_cb = TimeMonitor(data_size=dataset_size)
     loss_cb = LossCallBack(per_print_times=dataset_size, rank_id=rank, lr=lr.asnumpy())
-    cb = [time_cb, loss_cb, summary_collector]
+    cb = [time_cb, loss_cb]
+    if config.log_summary:
+        summary_collector = SummaryCollector(summary_dir)
+        cb.append(summary_collector)
     print(f"[{rank}]", "\tDone!\n")
 
     print(f"\n[{rank}]", "===> Configurating checkpoint saving...")
......
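Dropping loss_scale from SGD and Adam follows from the wrapper change: gradients are already divided back down by _grad_scale before the reducer runs, so an optimizer-side loss_scale would divide by the scale a second time and silently shrink every update. A toy check of the invariant, with plain numbers rather than the repo's API:

# The wrapper scales the loss up by scale_sense, backprop produces scaled
# gradients, and _grad_scale multiplies by 1/scale before the optimizer.
scale = 1024.0
raw_grad = 0.5
scaled_grad = raw_grad * scale           # what backprop yields under scaling
restored = scaled_grad * (1.0 / scale)   # what the optimizer now receives
assert restored == raw_grad              # optimizer must not divide again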