diff --git a/official/nlp/bert/scripts/run_distributed_pretrain_ascend.sh b/official/nlp/bert/scripts/run_distributed_pretrain_ascend.sh
index 28700f8901f72d1c5cd6b2fbbd83adbb8a759d14..07719312e3bec8f2a19d797b21a78c6d2d26e69e 100644
--- a/official/nlp/bert/scripts/run_distributed_pretrain_ascend.sh
+++ b/official/nlp/bert/scripts/run_distributed_pretrain_ascend.sh
@@ -23,7 +23,7 @@ echo "For hyper parameter, please note that you should customize the scripts:
          '{CUR_DIR}/scripts/ascend_distributed_launcher/hyper_parameter_config.ini' "
 echo "=============================================================================================================="
 CUR_DIR=`pwd`
-ulimit -s 102400
+ulimit -s 302400
 python ${CUR_DIR}/scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py \
     --run_script_dir=${CUR_DIR}/run_pretrain.py \
     --hyper_parameter_config_dir=${CUR_DIR}/scripts/ascend_distributed_launcher/hyper_parameter_config.ini \
diff --git a/official/nlp/bert/src/bert_for_pre_training.py b/official/nlp/bert/src/bert_for_pre_training.py
index ec27daeaf412602b8b4eecda5174c3a78072fc39..29a26b1a567951c1cb92a0fdd30c2c6631f4c04d 100644
--- a/official/nlp/bert/src/bert_for_pre_training.py
+++ b/official/nlp/bert/src/bert_for_pre_training.py
@@ -813,6 +813,18 @@ class BertNetworkMatchBucket(nn.Cell):
             bucket_list = [seq_length]
         self.bucket_list = [bucket for bucket in bucket_list if bucket <= seq_length]
 
+        if network.reducer_flag:
+            reuse_attr = 'reuse_communication_node'
+            if not network.grad_reducer.split_fusion:
+                hccl_op = network.grad_reducer.allreduce
+                network.grad_reducer.allreduce = hccl_op.add_prim_attr(reuse_attr, getattr(hccl_op, 'fusion'))
+            else:
+                new_op_list = []
+                for hccl_op in network.grad_reducer.op_list:
+                    new_op = hccl_op.add_prim_attr(reuse_attr, getattr(hccl_op, 'fusion'))
+                    new_op_list.append(new_op)
+                network.grad_reducer.op_list = new_op_list
+
     def construct(self,
                   input_ids,
                   input_mask,
diff --git a/official/nlp/bert/src/dataset.py b/official/nlp/bert/src/dataset.py
index a611c1ef5629d2d2b8f87375422474cbc9ef855d..2864d3e8c62a63ff911e0bbe540c80de9945471e 100644
--- a/official/nlp/bert/src/dataset.py
+++ b/official/nlp/bert/src/dataset.py
@@ -32,9 +32,8 @@ class BucketDatasetGenerator:
         dataset (Dataset): The training dataset.
         batch_size (Int): The training batchsize.
         bucket_list (List): List of different sentence lengths, such as [128, 256, 512]. Default: None.
-        valid_dataset_len (Int): Prevent communication failure at the end of the dataset. Default: 0.35.
     """
-    def __init__(self, dataset, batch_size, bucket_list=None, valid_dataset_len=0.35):
+    def __init__(self, dataset, batch_size, bucket_list=None):
         self.dataset = dataset
         self.batch_size = batch_size
         self.bucket_list = bucket_list
@@ -42,14 +41,12 @@ class BucketDatasetGenerator:
         self.random_list = np.random.binomial(n=(bucket_size - 1), p=0.55, size=self.__len__())
         self.random_list = (self.random_list + 2) % bucket_size
         self.random_list = [bucket_list[i] for i in self.random_list]
-        valid_dataset_len = int(valid_dataset_len * self.__len__())
-        self.random_list = self.random_list[:valid_dataset_len] + [bucket_list[-1]] * self.__len__()
         self._init_variables()
 
     def _init_variables(self):
         self.data_bucket = {bucket: [] for bucket in self.bucket_list}
         self.iter = 0
-        self.remaining_data_size = 1
+        self.remaining_data = []
         self.stage = 0
 
     def __next__(self):
@@ -68,6 +65,8 @@ class BucketDatasetGenerator:
                     self.iter += 1
                     return self._package_data(data, key)
         self.stage = 1
+        for value in self.data_bucket.values():
+            self.remaining_data += list(value)
         return self._process_remaining_data()
 
     def _package_data(self, data, key):
@@ -86,20 +85,16 @@ class BucketDatasetGenerator:
 
     def _process_remaining_data(self):
         """process remaining data."""
-        remaining_data_offset = self.remaining_data_size * self.batch_size
-        remaining_data = []
-        for value in self.data_bucket.values():
-            remaining_data += list(value)
-        if remaining_data_offset > len(remaining_data) or self.iter >= self.__len__():
+        if self.batch_size > len(self.remaining_data) or self.iter >= self.__len__():
             self._init_variables()
             raise StopIteration
-        self.remaining_data_size += 1
-        remaining_data = remaining_data[remaining_data_offset - self.batch_size : remaining_data_offset]
+        remaining_data = self.remaining_data[:self.batch_size]
+        self.remaining_data = self.remaining_data[self.batch_size:]
         self.iter += 1
         return self._package_data(remaining_data, self.bucket_list[-1])
 
     def __iter__(self):
-        self.iter = 0
+        self._init_variables()
         self.iterator = self.dataset.create_tuple_iterator(output_numpy=True)
         return self