diff --git a/official/cv/unet/scripts/run_distribute_train.sh b/official/cv/unet/scripts/run_distribute_train.sh index 27bfa1073062925b3f92fe06c6afd973ccfeaf11..3d96d8de47b97d7b631367ef6f2fe9e143b08c35 100644 --- a/official/cv/unet/scripts/run_distribute_train.sh +++ b/official/cv/unet/scripts/run_distribute_train.sh @@ -39,6 +39,9 @@ DATASET=$(get_real_path $2) CONFIG_PATH=$(get_real_path $3) RANK_TABLE=$(get_real_path $1) export RANK_TABLE_FILE=$RANK_TABLE + +ulimit -u unlimited + for((i=0;i<RANK_SIZE;i++)) do rm -rf LOG$i diff --git a/official/cv/unet/src/data_loader.py b/official/cv/unet/src/data_loader.py index 16cd33e04c309011d06bfdc502a6668ab606b7ef..bdbb0f3865ae5739a80b8f9f75ef1296459d5f48 100644 --- a/official/cv/unet/src/data_loader.py +++ b/official/cv/unet/src/data_loader.py @@ -257,7 +257,7 @@ def create_multi_class_dataset(data_dir, img_size, repeat, batch_size, num_class mc_dataset = MultiClassDataset(data_dir, repeat, is_train, split, shuffle) dataset = ds.GeneratorDataset(mc_dataset, mc_dataset.column_names, shuffle=True, num_shards=group_size, shard_id=rank, - num_parallel_workers=num_parallel_workers, python_multiprocessing=True) + num_parallel_workers=num_parallel_workers, python_multiprocessing=is_train) compose_map_func = (lambda image, mask: preprocess_img_mask(image, mask, num_classes, tuple(img_size), augment and is_train, eval_resize)) dataset = dataset.map(operations=compose_map_func, input_columns=mc_dataset.column_names, diff --git a/official/cv/unet/src/utils.py b/official/cv/unet/src/utils.py index 0c3079a8acb867e8060a5cad977b595bf75cf6fd..d8222fe3e381c241552b273dc986d3c6d0346a66 100644 --- a/official/cv/unet/src/utils.py +++ b/official/cv/unet/src/utils.py @@ -92,10 +92,11 @@ class dice_coeff(nn.Metric): self._iou_sum = 0 self._samples_num = 0 self.img_num = 0 - self.eval_images_path = "./draw_eval" - if os.path.exists(self.eval_images_path): - shutil.rmtree(self.eval_images_path) - os.mkdir(self.eval_images_path) + if self.show_eval: + self.eval_images_path = "./draw_eval" + if os.path.exists(self.eval_images_path): + shutil.rmtree(self.eval_images_path) + os.mkdir(self.eval_images_path) def draw_img(self, gray, index): """ diff --git a/official/cv/unet/train.py b/official/cv/unet/train.py index 472c52c8da027419143e3fc5d093c28d649508bc..22543d39a9356d8d19eb1c4faf0e2fd37fddbcb7 100644 --- a/official/cv/unet/train.py +++ b/official/cv/unet/train.py @@ -114,7 +114,7 @@ def train_net(cross_valid_ind=1, amp_level=amp_level) print("============== Starting Training ==============") callbacks = [StepLossTimeMonitor(batch_size=batch_size, per_print_times=per_print_times), ckpoint_cb] - if config.run_eval: + if config.run_eval and rank == 0: eval_model = Model(UnetEval(net, need_slice=need_slice, eval_activate=config.eval_activate.lower()), loss_fn=TempLoss(), metrics={"dice_coeff": dice_coeff(False, config.show_eval)}) eval_param_dict = {"model": eval_model, "dataset": valid_dataset, "metrics_name": config.eval_metrics} diff --git a/official/recommend/naml/script/run_train.sh b/official/recommend/naml/script/run_train.sh index 6c2873d40cf6d6abbcd8b8f561b5c7db49adfa96..1855b5ad16789efbe5211cac9d7001499fada1d9 100644 --- a/official/recommend/naml/script/run_train.sh +++ b/official/recommend/naml/script/run_train.sh @@ -40,10 +40,3 @@ python ${PROJECT_DIR}/../train.py \ --save_checkpoint_path=${CHECKPOINT_PATH} \ --weight_decay=False \ --sink_mode=True - -python ${PROJECT_DIR}/../eval.py \ - --config_path=${config_path} \ - --platform=${PLATFORM} \ - --dataset=${DATASET} \ - --dataset_path=${DATASET_PATH} \ - --checkpoint_path=${CHECKPOINT_PATH}/naml_last.ckpt