diff --git a/official/cv/centerface/src/centerface.py b/official/cv/centerface/src/centerface.py index f51aaa2eb64edb67aa721fc0330963c8f8146bd7..405cb9e5f70ed87ac459a9e5615ab1e5accefa33 100644 --- a/official/cv/centerface/src/centerface.py +++ b/official/cv/centerface/src/centerface.py @@ -233,10 +233,8 @@ class TrainingWrapper(nn.TrainOneStepWithLossScaleCell): if self.reducer_flag: grads = self.grad_reducer(grads) cond = self.get_overflow_status(status, grads) - overflow = self.process_loss_scale(cond) ret = (loss, cond, sens) - if not overflow: - self.optimizer(grads) + self.optimizer(grads) return ret diff --git a/official/cv/retinanet/src/lr_schedule.py b/official/cv/retinanet/src/lr_schedule.py index 93ce8c4b653aa7bafc19ba66cb80c27a3283803e..65a846f14b7aa93cfa99aa26978a2045498857c0 100644 --- a/official/cv/retinanet/src/lr_schedule.py +++ b/official/cv/retinanet/src/lr_schedule.py @@ -45,7 +45,7 @@ def get_lr(global_step, lr_init, lr_end, lr_max, warmup_epochs1, warmup_epochs2, warmup_steps4 = warmup_steps3 + steps_per_epoch * warmup_epochs4 warmup_steps5 = warmup_steps4 + steps_per_epoch * warmup_epochs5 step_radio = [1e-4, 1e-3, 1e-2, 0.1] - if config.finetune: + if hasattr(config, "finetune") and config.finetune: step_radio = [1e-4, 1e-2, 0.1, 1] for i in range(total_steps): if i < warmup_steps1: diff --git a/official/cv/retinanet/train.py b/official/cv/retinanet/train.py index 99a74bd4926d2544740f561a881adf92d301874b..59475849a3a808e5130865a57167103c55bda2df 100644 --- a/official/cv/retinanet/train.py +++ b/official/cv/retinanet/train.py @@ -127,7 +127,8 @@ def main(): config.lr_end_rate = ast.literal_eval(config.lr_end_rate) device_id = get_device_id() if config.device_target == "Ascend": - context.set_context(mempool_block_size="31GB") + if context.get_context("mode") == context.PYNATIVE_MODE: + context.set_context(mempool_block_size="31GB") elif config.device_target == "GPU": set_graph_kernel_context(config.device_target) elif config.device_target == 
"CPU": @@ -138,7 +139,7 @@ def main(): context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) if config.distribute: init() - device_num = config.device_num + device_num = get_device_num() rank = get_rank() context.reset_auto_parallel_context() context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) @@ -163,7 +164,7 @@ def main(): retinanet = retinanet50(backbone, config) net = retinanetWithLossCell(retinanet, config) init_net_param(net) - if config.finetune: + if hasattr(config, "finetune") and config.finetune: init_net_param(net, initialize_mode='XavierUniform') else: init_net_param(net) diff --git a/official/recommend/naml/script/run_distribute_train.sh b/official/recommend/naml/scripts/run_distribute_train.sh similarity index 100% rename from official/recommend/naml/script/run_distribute_train.sh rename to official/recommend/naml/scripts/run_distribute_train.sh diff --git a/official/recommend/naml/script/run_eval.sh b/official/recommend/naml/scripts/run_eval.sh similarity index 100% rename from official/recommend/naml/script/run_eval.sh rename to official/recommend/naml/scripts/run_eval.sh diff --git a/official/recommend/naml/script/run_infer_310.sh b/official/recommend/naml/scripts/run_infer_310.sh similarity index 100% rename from official/recommend/naml/script/run_infer_310.sh rename to official/recommend/naml/scripts/run_infer_310.sh diff --git a/official/recommend/naml/script/run_train.sh b/official/recommend/naml/scripts/run_train.sh similarity index 100% rename from official/recommend/naml/script/run_train.sh rename to official/recommend/naml/scripts/run_train.sh diff --git a/research/cv/glore_res/train.py b/research/cv/glore_res/train.py index ca95099a9029b54d566af160d46772168602f22c..7628ed81ba0908c888946e02c643063de9061a1a 100644 --- a/research/cv/glore_res/train.py +++ b/research/cv/glore_res/train.py @@ -46,7 +46,7 @@ from src.save_callback import SaveCallback if config.isModelArts: 
import moxing as mox -if config.net == 'resnet200' or config.net == 'resnet101': +if config.net == 'resnet200' or config.net == 'resnet101' or config.net == 'resnet50': if config.device_target == "GPU": config.cast_fp16 = False diff --git a/research/cv/yolov3_tiny/scripts/run_distribute_train.sh b/research/cv/yolov3_tiny/scripts/run_distribute_train.sh index 2aab7edd3b1fa4f5d9617594e21a72e53b042a8a..4c97f89015a47619475347b3a9bb065b6d48bac2 100644 --- a/research/cv/yolov3_tiny/scripts/run_distribute_train.sh +++ b/research/cv/yolov3_tiny/scripts/run_distribute_train.sh @@ -80,6 +80,7 @@ do --training_shape=640 \ --weight_decay=0.016 \ --loss_scale=1024 \ + --num_parallel_workers=32 \ --lr_scheduler=cosine_annealing > log.txt 2>&1 & cd .. done diff --git a/research/cv/yolov3_tiny/scripts/run_standalone_train.sh b/research/cv/yolov3_tiny/scripts/run_standalone_train.sh index 29adf4b1e9ece4cffdccaea59a118c1b64067e5c..8dcd366584d8a7e91014dd7badd6614b16d41dbd 100644 --- a/research/cv/yolov3_tiny/scripts/run_standalone_train.sh +++ b/research/cv/yolov3_tiny/scripts/run_standalone_train.sh @@ -67,5 +67,6 @@ python train.py \ --training_shape=640 \ --per_batch_size=32 \ --weight_decay=0.016 \ + --num_parallel_workers=32 \ --lr_scheduler=cosine_annealing > log.txt 2>&1 & cd .. 
diff --git a/research/cv/yolov3_tiny/src/yolo_dataset.py b/research/cv/yolov3_tiny/src/yolo_dataset.py index fd0a25f2825e899beb7a8f0d80e1ae6dfcaf95e2..25d6afcd1e20e88e5aad3d8ed2efaf9c7f9ab793 100644 --- a/research/cv/yolov3_tiny/src/yolo_dataset.py +++ b/research/cv/yolov3_tiny/src/yolo_dataset.py @@ -316,9 +316,9 @@ def create_yolo_dataset( CV.Normalize(mean, std), hwc_to_chw ], - num_parallel_workers=num_parallel_workers + num_parallel_workers=8 ) - ds = ds.batch(batch_size, num_parallel_workers=num_parallel_workers, drop_remainder=True) + ds = ds.batch(batch_size, num_parallel_workers=8, drop_remainder=True) else: ds = de.GeneratorDataset( yolo_dataset, @@ -331,9 +331,9 @@ def create_yolo_dataset( input_columns=["image", "img_id"], output_columns=["image", "image_shape", "img_id"], column_order=["image", "image_shape", "img_id"], - num_parallel_workers=num_parallel_workers + num_parallel_workers=8 ) - ds = ds.map(operations=hwc_to_chw, input_columns=["image"], num_parallel_workers=num_parallel_workers) + ds = ds.map(operations=hwc_to_chw, input_columns=["image"], num_parallel_workers=8) ds = ds.batch(batch_size, drop_remainder=True) ds = ds.repeat(max_epoch) return ds, len(yolo_dataset)