diff --git a/official/nlp/pangu_alpha/README.md b/official/nlp/pangu_alpha/README.md
index c0c248929a8f63fc1873fe8036087fcd74b6d998..6a06e4544cce3161294988a253d97eab30592b1c 100644
--- a/official/nlp/pangu_alpha/README.md
+++ b/official/nlp/pangu_alpha/README.md
@@ -159,7 +159,7 @@ The above command involves some `args` described below:
 - RANK_START: The start of rank_id in current machines, it helps to set the rank_id for each machine in multi-machine scenario.
 - LOCAL_DEVICE_NUM: The device number of the local machine.
 
-The following command will launch he program will train 2.6B model with the following command:
+The following command will launch the program to train the 2.6B model:
 
 ```bash
 # run distributed training example in one ascend machine
@@ -180,6 +180,9 @@ For distributed training, an hccl configuration file with JSON format needs to b
 Please follow the instructions in the link below:
 https:gitee.com/mindspore/models/tree/master/utils/hccl_tools.
 
+Once you start training, the training log is redirected to device{rank_id}/log{rank_id}.log (for example,
+device0/log0.log).
+
 ### Training on GPU
 
 The script will launch the GPU training through `mpirun`, the user can run the following command on any machine to start training.
@@ -220,9 +223,9 @@ The above command involves some `args` described below:
 - LOCAL_DEVICE_NUM: The device number of the local machine.
 - EXPERT_NUM_PER_EP: Expert nums in one data parallel dim.
 
-The following command will launch he program will train 60B model using 8 NPU.
+The following command will launch the program to train the 60B model using 8 NPUs.
 Mode 2.6B only represents the model configuration is same with 2.6B model which is without MoE.
-Running 0B model using 8 NPU in one server requires that the server has at least 1T host memory.
+Training the 60B model using 8 NPUs in one server requires that the server has at least 1 TB of host memory.
 
 ```bash
 # run distributed training example in one ascend machine
@@ -232,7 +235,7 @@ bash run_distributed_train_moe_host_device.sh /path/dataset /path/hccl.json 8 fp
 
 ### Incremental Training
 
- Before we start Incremental Training, the following two steps must be done:
+Before we start Incremental Training, the following two steps must be done:
 
 1. Process the dataset using the released vocab, please refer to the [Increnmental Training in Dataset Generatiogn](#Incremental Training)
 2. Download the`checkpoint` and `strategy` file according to the [Download Checkpoint](#Download Checkpoint). Each host should own the complete checkpoint files.
@@ -291,8 +294,8 @@ ${FILE_PATH}/tokenizer/ ${FILE_PATH}/checkpoint_file filitered 2.6B fp32
 The following script will run prediction on 1 Ascend cards or 1 Nvidia GPU. The difference is the net is initialized with float16 type.
 
 ```bash
-$FILE_PATH=/home/your_path/ckpts
-$DEVICE_TARGET=Ascend # or GPU
+export FILE_PATH=/home/your_path/ckpts
+export DEVICE_TARGET=Ascend # or GPU
 bash scripts/run_standalone_predict.sh ${FILE_PATH}/strategy_load_ckpt/strategy.ckpt \
 ${FILE_PATH}/tokenizer/ ${FILE_PATH}/checkpoint_file filitered 2.6B $DEVICE_TARGET
 ```
diff --git a/official/nlp/pangu_alpha/predict.py b/official/nlp/pangu_alpha/predict.py
index b629000cdcc1eb374c2fcc81942926f2876845ff..7ad513cf2834e18282d1a6b263ef57a7361c7b08 100644
--- a/official/nlp/pangu_alpha/predict.py
+++ b/official/nlp/pangu_alpha/predict.py
@@ -101,6 +101,7 @@ def load_model(args_opt):
         eod_reset=False,
         parallel_config=parallel_config,
         load_ckpt_path=args_opt.load_ckpt_path,
+        run_type=args_opt.run_type,
         param_init_type=mstype.float32 if args_opt.param_init_type == 'fp32' else mstype.float16)
     print("===config is: ", config, flush=True)
     print("=====args_opt is: ", args_opt, flush=True)
diff --git a/official/nlp/pangu_alpha/scripts/run_standalone_export.sh b/official/nlp/pangu_alpha/scripts/run_standalone_export.sh
index 4b30aa25b46b90b07842d8f37d735a716a3ae244..6fb05642d1cd2059068654bf407f72705a1f280a 100644
--- a/official/nlp/pangu_alpha/scripts/run_standalone_export.sh
+++ b/official/nlp/pangu_alpha/scripts/run_standalone_export.sh
@@ -27,9 +27,9 @@ export CKPT_NAME='filerted'
 
 for((i=0;i<$RANK_SIZE;i++));
 do
-  rm -rf ${execute_path}/device_$i/
-  mkdir ${execute_path}/device_$i/
-  cd ${execute_path}/device_$i/ || exit
+  rm -rf ${execute_path}/device$i/
+  mkdir ${execute_path}/device$i/
+  cd ${execute_path}/device$i/ || exit
   export RANK_ID=$i
   export DEVICE_ID=$i
   python -s ${self_path}/../predict.py --strategy_load_ckpt_path=$STRATEGY --load_ckpt_path=$CKPT_PATH \
diff --git a/official/nlp/pangu_alpha/src/pangu_alpha.py b/official/nlp/pangu_alpha/src/pangu_alpha.py
index 643e4b75b12821e04cf99e8e78cbc736991968c1..f7be1fda61887a6d6c229687d74049c0a142ae3e 100644
--- a/official/nlp/pangu_alpha/src/pangu_alpha.py
+++ b/official/nlp/pangu_alpha/src/pangu_alpha.py
@@ -290,6 +290,7 @@ class PanguAlpha_Model(Cell):
 
         if config.load_ckpt_path:
             self.load_embedding_from_ckpt(config.load_ckpt_path)
+        self.run_type = config.run_type
 
     def construct(self, input_ids,
                   input_position,
@@ -300,7 +301,7 @@ class PanguAlpha_Model(Cell):
         embed, word_table = self.embedding(input_ids, input_position, init_reset, batch_valid_length)
         hidden_state = P.Cast()(embed, self.dtype)
         # the input of the incremental prediction is 3d
-        if self._phase != 'predict':
+        if self.run_type != 'predict':
             hidden_state = self.reshape_to_2d(hidden_state)
         if self.blocks is not None:
             for i in range(self.num_layers - 1):
@@ -479,10 +480,10 @@ class EvalNet(nn.Cell):
         bs, seq_length = F.shape(input_ids)
         input_position = F.tuple_to_array(F.make_range(seq_length))
         input_position = P.Tile()(input_position, (bs, 1))
-        if self.is_first_iteration:
-            attention_mask = self.get_attention_mask(input_mask)
-        else:
+        if self.is_first_iteration is False:
             attention_mask = P.Tile()(self.all_ones_attention_mask, (bs, 1, 1))
+        else:
+            attention_mask = self.get_attention_mask(input_mask)
         logits = self.backbone(input_ids, input_position, attention_mask,
                                init_reset, batch_valid_length)
         index = current_index.view(1,)
diff --git a/official/nlp/pangu_alpha/src/pangu_alpha_config.py b/official/nlp/pangu_alpha/src/pangu_alpha_config.py
index e587b8165fb14a19e2a86bdd9771db8ed9d01969..104ee350c7ee84f8f4df47aefd3df80dab3da6dc 100644
--- a/official/nlp/pangu_alpha/src/pangu_alpha_config.py
+++ b/official/nlp/pangu_alpha/src/pangu_alpha_config.py
@@ -42,6 +42,7 @@ class PanguAlphaConfig:
                  use_moe=False,
                  per_dp_dim_expert_num=4,
                  parallel_config=None,
+                 run_type='train',
                  softmax_compute_type=mstype.float16):
         self.batch_size = batch_size
         self.seq_length = seq_length
@@ -65,11 +66,12 @@ class PanguAlphaConfig:
         self.softmax_compute_type = softmax_compute_type
         self.use_moe = bool(use_moe)
         self.per_dp_dim_expert_num = per_dp_dim_expert_num
+        self.run_type = run_type
 
     def __str__(self):
-        info = "[PANGUALPHAConfig]" + '===' * 10 + '\n'
+        info = '===' * 10 + "[PANGUALPHAConfig]" + '===' * 10 + '\n'
         for k, v in self.__dict__.items():
-            var_info = "{}:{}\n".format(k, v)
+            var_info = "--{}:{}\n".format(k, v)
             info += var_info
         info += '=' * 10
         return info
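
Reviewer note, not part of the patch: the behavioural core of this change is that `PanguAlpha_Model.construct` now branches on an explicit `config.run_type` (plumbed from `args_opt.run_type` through `PanguAlphaConfig`) instead of the cell's `_phase` attribute, so incremental prediction keeps its 3-D hidden state while training/eval flattens it to 2-D before the transformer blocks. The sketch below only illustrates that branch; it is plain NumPy, and the helper name `maybe_flatten` and the shapes are invented for the example and do not exist in the repository.

```python
import numpy as np

def maybe_flatten(hidden_state, run_type):
    # Mirrors the branch edited above: anything other than incremental
    # prediction flattens the (batch, seq, hidden) embedding output to 2-D
    # before the transformer blocks; 'predict' keeps the 3-D shape.
    if run_type != 'predict':
        bs, seq, hidden = hidden_state.shape
        return hidden_state.reshape(bs * seq, hidden)
    return hidden_state

x = np.zeros((2, 8, 16), dtype=np.float16)
print(maybe_flatten(x, 'train').shape)    # (16, 16)
print(maybe_flatten(x, 'predict').shape)  # (2, 8, 16)
```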