diff --git a/official/recommend/ncf/README.md b/official/recommend/ncf/README.md
index 14c96bbf460b8dd1dac95b4601109a5e1beae5bb..f12d20935fc3339f19e5330d277d9859f01a0de6 100644
--- a/official/recommend/ncf/README.md
+++ b/official/recommend/ncf/README.md
@@ -100,8 +100,8 @@ bash scripts/run_train.sh
 # run training example on GPU
 bash scripts/run_train_gpu.sh
 
-# run distributed training example
-bash scripts/run_train.sh rank_table.json
+# run distributed training example on Ascend
+bash scripts/run_distribute_train.sh /path/hccl.json /path/MovieLens
 
 # run evaluation example on Ascend
 bash run_eval.sh
@@ -215,7 +215,10 @@ Parameters for both training and evaluation can be set in config.py.
   - on Ascend
 
   ```python
+  # standalone training
  bash scripts/run_train.sh
+  # distributed training
+  bash scripts/run_distribute_train.sh [RANK_TABLE_FILE] [DATA_PATH]
   ```
 
   - on GPU
diff --git a/official/recommend/ncf/scripts/run_distribute_train.sh b/official/recommend/ncf/scripts/run_distribute_train.sh
index ed6ad4df07f76caec028946f4fb76bfb41c93ac0..4c2e19135e9ca9aa3c72ef0691d1e15a444ee4fe 100644
--- a/official/recommend/ncf/scripts/run_distribute_train.sh
+++ b/official/recommend/ncf/scripts/run_distribute_train.sh
@@ -17,9 +17,8 @@
 if [ $# -lt 1 ]; then
   echo "=============================================================================================================="
   echo "Please run the script as: "
-  echo "bash run_local_train.sh RANK_TABLE_FILE [OTHER_ARGS]"
-  echo "OTHER_ARGS will be passed to the training scripts directly,"
-  echo "for example: bash run_local_train.sh /path/hccl.json /dataset_path"
+  echo "bash scripts/run_distribute_train.sh RANK_TABLE_FILE [DATA_PATH]"
+  echo "for example: bash scripts/run_distribute_train.sh /path/hccl.json /dataset_path"
   echo "It is better to use absolute path."
   echo "=============================================================================================================="
   exit 1
@@ -32,12 +31,12 @@
 python3 ${BASE_PATH}/ascend_distributed_launcher/get_distribute_pretrain_cmd.py --hccl_config_dir=$1 \
 --hccl_time_out=600 \
 --args=" --data_path=$2 \
---dataset='ml-1m' \
---train_epochs=50 \
---output_path='./output/' \
---eval_file_name='eval.log' \
---checkpoint_path='./checkpoint/' \
---device_target='Ascend'" \
+ --dataset='ml-1m' \
+ --train_epochs=50 \
+ --output_path='./output/' \
+ --eval_file_name='eval.log' \
+ --checkpoint_path='./checkpoint/' \
+ --device_target='Ascend'" \
 --cmd_file=distributed_cmd.sh
 
 bash distributed_cmd.sh
diff --git a/official/recommend/ncf/train.py b/official/recommend/ncf/train.py
index df2bcab026d3541179cbdf2587cde89f39d53406..5cbdb951b10b11c66e027482d434c910b5eed66e 100644
--- a/official/recommend/ncf/train.py
+++ b/official/recommend/ncf/train.py
@@ -51,7 +51,7 @@ def run_train():
         config.group_size = get_device_num()
         context.reset_auto_parallel_context()
         context.set_auto_parallel_context(device_num=config.group_size, parallel_mode=ParallelMode.DATA_PARALLEL,
-                                          parameter_broadcast=True, gradients_mean=True)
+                                          gradients_mean=True)
     if config.device_target == "Ascend":
         context.set_context(device_id=get_device_id())
 
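Note on the train.py hunk (not part of the patch): with `parameter_broadcast=True` dropped, the script relies on MindSpore's default broadcast behavior for `DATA_PARALLEL` mode while still averaging gradients across devices. Below is a minimal standalone sketch of the setup this patch converges on; it assumes MindSpore 1.x-style `context`/`ParallelMode` imports (matching those train.py already uses) and substitutes the stock `get_group_size()` for the repo's `get_device_num()` helper.

```python
from mindspore import context
from mindspore.context import ParallelMode
from mindspore.communication.management import init, get_group_size

# Initialize collective communication (HCCL on Ascend, NCCL on GPU).
init()

# Configure pure data parallelism: every device holds a full model replica
# and gradients are averaged across devices each step. parameter_broadcast
# is no longer passed explicitly, mirroring the change in the patch above.
context.reset_auto_parallel_context()
context.set_auto_parallel_context(device_num=get_group_size(),
                                  parallel_mode=ParallelMode.DATA_PARALLEL,
                                  gradients_mean=True)
```

The multi-process launch itself is driven by the renamed script, as in the updated README quick start: `bash scripts/run_distribute_train.sh /path/hccl.json /path/MovieLens`.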