diff --git a/official/nlp/pangu_alpha/README.md b/official/nlp/pangu_alpha/README.md
index c0c248929a8f63fc1873fe8036087fcd74b6d998..6a06e4544cce3161294988a253d97eab30592b1c 100644
--- a/official/nlp/pangu_alpha/README.md
+++ b/official/nlp/pangu_alpha/README.md
@@ -159,7 +159,7 @@ The above command involves some `args` described below:
 - RANK_START: The starting rank_id on the current machine; it is used to set the rank_id for each machine in the multi-machine scenario.
 - LOCAL_DEVICE_NUM: The device number of the local machine.
 
-The following command will launch he program will train 2.6B model with the following command:
+The following command will launch the program to train the 2.6B model:
 
 ```bash
 # run distributed training example in one ascend machine
@@ -180,6 +180,9 @@ For distributed training, an hccl configuration file with JSON format needs to b
 Please follow the instructions in the link below:
 https://gitee.com/mindspore/models/tree/master/utils/hccl_tools.
 
+Once you start training, the training log is redirected to device{rank_id}/log{rank_id}.log (for example, device0/log0.log).
+
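+For example, you can watch the rank-0 log on the local machine with a standard `tail`:
+
+```bash
+# assumes rank 0 is mapped to this machine; adjust the rank id as needed
+tail -f device0/log0.log
+```
+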
 ### Training on GPU
 
 The script will launch the GPU training through `mpirun`; the user can run the following command on any machine to start training.
@@ -220,9 +223,9 @@ The above command involves some `args` described below:
 - LOCAL_DEVICE_NUM: The device number of the local machine.
 - EXPERT_NUM_PER_EP: Expert nums in one data parallel dim.
 
-The following command will launch he program will train 60B model using 8 NPU.
+The following command will launch the program to train the 60B model using 8 NPUs.
 Mode 2.6B only means that the model configuration is the same as the 2.6B model, which does not use MoE.
-Running 0B model using 8 NPU in one server requires that the server has at least 1T host memory.
+Training the 60B model using 8 NPUs in one server requires that the server has at least 1 TB of host memory.
 
 ```bash
 # run distributed training example in one ascend machine
@@ -232,7 +235,7 @@ bash run_distributed_train_moe_host_device.sh /path/dataset /path/hccl.json 8 fp
 
 ### Incremental Training
 
- Before we start Incremental Training, the following two steps must be done:
+Before we start Incremental Training, the following two steps must be done:
 
 1. Process the dataset using the released vocab; please refer to the [Incremental Training in Dataset Generation](#Incremental Training)
 2. Download the `checkpoint` and `strategy` files according to [Download Checkpoint](#Download Checkpoint). Each host should own the complete checkpoint files.
@@ -291,8 +294,8 @@ ${FILE_PATH}/tokenizer/  ${FILE_PATH}/checkpoint_file filitered 2.6B fp32
 The following script will run prediction on 1 Ascend card or 1 Nvidia GPU. The difference is that the net is initialized with the float16 type.
 
 ```bash
-$FILE_PATH=/home/your_path/ckpts
-$DEVICE_TARGET=Ascend # or GPU
+export FILE_PATH=/home/your_path/ckpts
+export DEVICE_TARGET=Ascend # or GPU
 bash scripts/run_standalone_predict.sh ${FILE_PATH}/strategy_load_ckpt/strategy.ckpt \
 ${FILE_PATH}/tokenizer/  ${FILE_PATH}/checkpoint_file filitered 2.6B $DEVICE_TARGET
 ```
diff --git a/official/nlp/pangu_alpha/predict.py b/official/nlp/pangu_alpha/predict.py
index b629000cdcc1eb374c2fcc81942926f2876845ff..7ad513cf2834e18282d1a6b263ef57a7361c7b08 100644
--- a/official/nlp/pangu_alpha/predict.py
+++ b/official/nlp/pangu_alpha/predict.py
@@ -101,6 +101,7 @@ def load_model(args_opt):
         eod_reset=False,
         parallel_config=parallel_config,
         load_ckpt_path=args_opt.load_ckpt_path,
+        run_type=args_opt.run_type,
         param_init_type=mstype.float32 if args_opt.param_init_type == 'fp32' else mstype.float16)
     print("===config is: ", config, flush=True)
     print("=====args_opt is: ", args_opt, flush=True)
diff --git a/official/nlp/pangu_alpha/scripts/run_standalone_export.sh b/official/nlp/pangu_alpha/scripts/run_standalone_export.sh
index 4b30aa25b46b90b07842d8f37d735a716a3ae244..6fb05642d1cd2059068654bf407f72705a1f280a 100644
--- a/official/nlp/pangu_alpha/scripts/run_standalone_export.sh
+++ b/official/nlp/pangu_alpha/scripts/run_standalone_export.sh
@@ -27,9 +27,9 @@ export CKPT_NAME='filerted'
 
 for((i=0;i<$RANK_SIZE;i++));
 do
-  rm -rf ${execute_path}/device_$i/
-  mkdir ${execute_path}/device_$i/
-  cd ${execute_path}/device_$i/ || exit
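+  # give each rank a clean working directory (device$i) and run from there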
+  rm -rf ${execute_path}/device$i/
+  mkdir ${execute_path}/device$i/
+  cd ${execute_path}/device$i/ || exit
   export RANK_ID=$i
   export DEVICE_ID=$i
   python -s ${self_path}/../predict.py --strategy_load_ckpt_path=$STRATEGY --load_ckpt_path=$CKPT_PATH \
diff --git a/official/nlp/pangu_alpha/src/pangu_alpha.py b/official/nlp/pangu_alpha/src/pangu_alpha.py
index 643e4b75b12821e04cf99e8e78cbc736991968c1..f7be1fda61887a6d6c229687d74049c0a142ae3e 100644
--- a/official/nlp/pangu_alpha/src/pangu_alpha.py
+++ b/official/nlp/pangu_alpha/src/pangu_alpha.py
@@ -290,6 +290,7 @@ class PanguAlpha_Model(Cell):
 
         if config.load_ckpt_path:
             self.load_embedding_from_ckpt(config.load_ckpt_path)
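+        # keep the configured run type ('train' or 'predict'); construct() uses it to
+        # decide whether the hidden state needs to be reshaped to 2-D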
+        self.run_type = config.run_type
 
     def construct(self, input_ids,
                   input_position,
@@ -300,7 +301,7 @@ class PanguAlpha_Model(Cell):
         embed, word_table = self.embedding(input_ids, input_position, init_reset, batch_valid_length)
         hidden_state = P.Cast()(embed, self.dtype)
         # the input of the incremental prediction is 3d
-        if self._phase != 'predict':
+        if self.run_type != 'predict':
             hidden_state = self.reshape_to_2d(hidden_state)
         if self.blocks is not None:
             for i in range(self.num_layers - 1):
@@ -479,10 +480,10 @@ class EvalNet(nn.Cell):
         bs, seq_length = F.shape(input_ids)
         input_position = F.tuple_to_array(F.make_range(seq_length))
         input_position = P.Tile()(input_position, (bs, 1))
-        if self.is_first_iteration:
-            attention_mask = self.get_attention_mask(input_mask)
-        else:
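+        # after the first iteration of incremental prediction, reuse a broadcast
+        # all-ones mask; the first iteration still builds the mask from input_mask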
+        if self.is_first_iteration is False:
             attention_mask = P.Tile()(self.all_ones_attention_mask, (bs, 1, 1))
+        else:
+            attention_mask = self.get_attention_mask(input_mask)
         logits = self.backbone(input_ids, input_position, attention_mask,
                                init_reset, batch_valid_length)
         index = current_index.view(1,)
diff --git a/official/nlp/pangu_alpha/src/pangu_alpha_config.py b/official/nlp/pangu_alpha/src/pangu_alpha_config.py
index e587b8165fb14a19e2a86bdd9771db8ed9d01969..104ee350c7ee84f8f4df47aefd3df80dab3da6dc 100644
--- a/official/nlp/pangu_alpha/src/pangu_alpha_config.py
+++ b/official/nlp/pangu_alpha/src/pangu_alpha_config.py
@@ -42,6 +42,7 @@ class PanguAlphaConfig:
                  use_moe=False,
                  per_dp_dim_expert_num=4,
                  parallel_config=None,
+                 run_type='train',
                  softmax_compute_type=mstype.float16):
         self.batch_size = batch_size
         self.seq_length = seq_length
@@ -65,11 +66,12 @@ class PanguAlphaConfig:
         self.softmax_compute_type = softmax_compute_type
         self.use_moe = bool(use_moe)
         self.per_dp_dim_expert_num = per_dp_dim_expert_num
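+        # 'train' by default; predict.py forwards args_opt.run_type so downstream
+        # cells can tell training apart from prediction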
+        self.run_type = run_type
 
     def __str__(self):
-        info = "[PANGUALPHAConfig]" + '===' * 10 + '\n'
+        info = '===' * 10 + "[PANGUALPHAConfig]" + '===' * 10 + '\n'
         for k, v in self.__dict__.items():
-            var_info = "{}:{}\n".format(k, v)
+            var_info = "--{}:{}\n".format(k, v)
             info += var_info
         info += '=' * 10
         return info