Commit 2279201d authored by bichaoyang

fix issue of offload

parent 84501432
@@ -40,6 +40,7 @@ LOCAL_DEVICE_NUM=${10}
 EXPERT_NUM=${11}
 ENABLE_ALLTOALL=${12}
 EXPERT_PARALLEL_NUM=${13}
+export HCCL_EXEC_TIMEOUT=1500
 for((i=0;i<${LOCAL_DEVICE_NUM};i++));
 do
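For reference, HCCL_EXEC_TIMEOUT (in seconds) bounds how long HCCL waits on peer devices during collective execution (notify wait); exporting it before the device loop lets every spawned rank inherit the value. The sketch below mirrors that launch pattern in Python; the entry script name, device count, and per-rank environment variables are assumptions for illustration, not taken from this repository.

import os
import subprocess

# Set the HCCL execution timeout once so all child processes inherit it.
os.environ["HCCL_EXEC_TIMEOUT"] = "1500"

LOCAL_DEVICE_NUM = 8  # assumed local device count
procs = []
for i in range(LOCAL_DEVICE_NUM):
    # One training process per device; DEVICE_ID/RANK_ID are illustrative.
    env = dict(os.environ, DEVICE_ID=str(i), RANK_ID=str(i))
    procs.append(subprocess.Popen(["python", "train.py"], env=env))  # "train.py" is assumed
for p in procs:
    p.wait()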
@@ -121,8 +121,6 @@ class PanguAlphaTrainOneStepWithLossScaleCell(TrainOneStepWithLossScaleCell):
         else:
             self.clip = ClipByGlobalNorm(self.weights, config)
         self.cast = P.Cast()
-        self.sync_all_reduce = P.AllReduce()
-        self.sync_tensor = Tensor(0.0, dtype=mstype.float32)
     def construct(self, input_ids, input_position, attention_mask, layer_past=None, sens=None):
         """Defines the computation performed."""
@@ -158,12 +156,7 @@ class PanguAlphaTrainOneStepWithLossScaleCell(TrainOneStepWithLossScaleCell):
         # if not, update weights
         if not overflow:
             if self.enable_offload:
-                res = self.optimizer(grads, clip_value)
-                # For moe and enable_offload, the compile time of different devices has a large gap
-                # and causes notify wait, so a sync allreduce at the end of the last graph is needed
-                sync_tensor = F.depend(self.sync_tensor, res)
-                sync_flag = self.sync_all_reduce(sync_tensor)
-                loss = F.depend(loss, sync_flag)
+                self.optimizer(grads, clip_value)
             else:
                 self.optimizer(grads)
         return loss, cond, scaling_sens
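The six removed lines implemented a manual cross-device barrier: an AllReduce on a dummy scalar, chained with F.depend, forced every device to reach the same point in the graph before the step finished, masking the compile-time gap between devices. Below is a minimal, self-contained sketch of that idiom (a hedged reconstruction, not the repository's code), assuming MindSpore's operations/functional APIs as imported in this file; the cell name is made up for illustration. The commit drops this barrier in favor of the HCCL_EXEC_TIMEOUT setting in the launch script above.

import mindspore.nn as nn
from mindspore import Tensor
from mindspore.common import dtype as mstype
from mindspore.ops import operations as P
from mindspore.ops import functional as F

class SyncBarrierCell(nn.Cell):  # hypothetical name, for illustration only
    """Dummy-AllReduce barrier: every device must reach this point before the step ends."""
    def __init__(self):
        super().__init__()
        self.sync_all_reduce = P.AllReduce()
        self.sync_tensor = Tensor(0.0, dtype=mstype.float32)

    def construct(self, loss, update_result):
        # Order the AllReduce after the optimizer update, then tie the returned
        # loss to the AllReduce so graph pruning cannot drop the barrier.
        sync_tensor = F.depend(self.sync_tensor, update_result)
        sync_flag = self.sync_all_reduce(sync_tensor)
        return F.depend(loss, sync_flag)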