diff --git a/official/nlp/bert/pretrain_config.yaml b/official/nlp/bert/pretrain_config.yaml
index 55603ee1565da6345f71f6d471111a84729c63d4..d4469561bd5b9732c6df31e18f382e5dba449ae2 100644
--- a/official/nlp/bert/pretrain_config.yaml
+++ b/official/nlp/bert/pretrain_config.yaml
@@ -32,7 +32,7 @@ save_checkpoint_num: 1
 data_dir: ''
 schema_dir: ''
 dataset_format: "mindrecord"
-num_samples: None # is the option which could be set by user to specify steps
+num_samples: None # is the option which could be set by user to specify steps when bert_network is base
 
 # ==============================================================================
 # pretrain related
diff --git a/official/nlp/bert/pretrain_config_Ascend_Boost.yaml b/official/nlp/bert/pretrain_config_Ascend_Boost.yaml
index 77be3be95f74e0569950e575c70c7b277095ef64..0a96802354e7f6c2321496bfa5383d500748e210 100644
--- a/official/nlp/bert/pretrain_config_Ascend_Boost.yaml
+++ b/official/nlp/bert/pretrain_config_Ascend_Boost.yaml
@@ -32,7 +32,7 @@ save_checkpoint_num: 1
 data_dir: ''
 schema_dir: ''
 dataset_format: "mindrecord"
-num_samples: None # is the option which could be set by user to specify steps
+num_samples: None # is the option which could be set by user to specify steps when bert_network is base
 
 # ==============================================================================
 # pretrain related
diff --git a/official/nlp/bert/pretrain_config_Ascend_Thor.yaml b/official/nlp/bert/pretrain_config_Ascend_Thor.yaml
index 666ab18314100311b642fa21fbb7363b4ffe92e3..31ac77f8a8c6f3d9d854865ab4e05cdb1eed940b 100644
--- a/official/nlp/bert/pretrain_config_Ascend_Thor.yaml
+++ b/official/nlp/bert/pretrain_config_Ascend_Thor.yaml
@@ -32,7 +32,7 @@ save_checkpoint_num: 5
 data_dir: ''
 schema_dir: ''
 dataset_format: "mindrecord"
-num_samples: None # is the option which could be set by user to specify steps
+num_samples: None # is the option which could be set by user to specify steps when bert_network is base
 
 # ==============================================================================
 # pretrain related
diff --git a/official/nlp/bert/scripts/run_distributed_pretrain_for_gpu.sh b/official/nlp/bert/scripts/run_distributed_pretrain_for_gpu.sh
index 770dab31195c38fd8a1e77bc93c5cc643399b20e..a03d18387346dfcbcfbd62b1c539e0145231bf97 100644
--- a/official/nlp/bert/scripts/run_distributed_pretrain_for_gpu.sh
+++ b/official/nlp/bert/scripts/run_distributed_pretrain_for_gpu.sh
@@ -16,8 +16,8 @@
 
 echo "=============================================================================================================="
 echo "Please run the script as: "
-echo "bash scripts/run_distributed_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR"
-echo "for example: bash scripts/run_distributed_pretrain.sh 8 40 /path/zh-wiki/ [/path/Schema.json](optional)"
+echo "bash scripts/run_distributed_pretrain_for_gpu.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR"
+echo "for example: bash scripts/run_distributed_pretrain_for_gpu.sh 8 40 /path/zh-wiki/ [/path/Schema.json](optional)"
 echo "It is better to use absolute path."
 echo "=============================================================================================================="
 
diff --git a/official/nlp/bert/src/dataset.py b/official/nlp/bert/src/dataset.py
index f1277646b698bbf2c83e2cb8744ce69680764a77..01f9659e6b97ae16f9c00cf65fa5a92d68e04d6b 100644
--- a/official/nlp/bert/src/dataset.py
+++ b/official/nlp/bert/src/dataset.py
@@ -126,7 +126,7 @@ def create_bert_dataset(device_num=1, rank=0, do_shuffle="true", data_dir=None,
                 (dataset_format == "mindrecord" and "mindrecord" in file_name and "mindrecord.db" not in file_name):
             data_files.append(os.path.join(data_dir, file_name))
     if dataset_format == "mindrecord":
-        if num_samples is not None:
+        if str(num_samples).lower() != "none":
             data_set = ds.MindDataset(data_files,
                                       columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
                                                     "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"],
@@ -279,10 +279,15 @@ def create_eval_dataset(batchsize=32, device_num=1, rank=0, data_dir=None, schem
     else:
         data_files.append(data_dir)
     if dataset_format == "mindrecord":
-        data_set = ds.MindDataset(data_files,
-                                  columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
-                                                "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"],
-                                  num_samples=num_samples)
+        if str(num_samples).lower() != "none":
+            data_set = ds.MindDataset(data_files,
+                                      columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
+                                                    "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"],
+                                      num_samples=num_samples)
+        else:
+            data_set = ds.MindDataset(data_files,
+                                      columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
+                                                    "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"])
     elif dataset_format == "tfrecord":
         data_set = ds.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None,
                                       columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
@@ -312,10 +317,16 @@ def create_eval_dataset(batchsize=32, device_num=1, rank=0, data_dir=None, schem
         eval_ds.use_sampler(sampler)
     else:
         if dataset_format == "mindrecord":
-            eval_ds = ds.MindDataset(data_files,
-                                     columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
-                                                   "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"],
-                                     num_shards=device_num, shard_id=rank)
+            if str(num_samples).lower() != "none":
+                eval_ds = ds.MindDataset(data_files,
+                                         columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
+                                                       "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"],
+                                         num_shards=device_num, shard_id=rank, num_samples=num_samples)
+            else:
+                eval_ds = ds.MindDataset(data_files,
+                                         columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
+                                                       "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"],
+                                         num_shards=device_num, shard_id=rank)
         elif dataset_format == "tfrecord":
             eval_ds = ds.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None,
                                          columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
diff --git a/official/nlp/bert/src/tools/parallel_tfrecord_to_mindrecord.py b/official/nlp/bert/src/tools/parallel_tfrecord_to_mindrecord.py
index c65d5f8e52bdec2bee12e65db8f937d8a545027a..20f6e2085c2d5bf927be56561d2cc9cb2afcba6e 100644
--- a/official/nlp/bert/src/tools/parallel_tfrecord_to_mindrecord.py
+++ b/official/nlp/bert/src/tools/parallel_tfrecord_to_mindrecord.py
@@ -22,7 +22,8 @@ def tf_2_mr(item):
     item_path = item
     if not os.path.exists(args.output_mindrecord_dir):
         os.makedirs(args.output_mindrecord_dir, exist_ok=True)
-    mindrecord_path = args.output_mindrecord_dir + item[item.rfind('/') + 1:item.rfind('.')] + '.mindrecord'
+    mindrecord_path = os.path.join(args.output_mindrecord_dir,
+                                   item[item.rfind('/') + 1:item.rfind('.')] + '.mindrecord')
     print("Start convert {} to {}.".format(item_path, mindrecord_path))
     writer = FileWriter(file_name=mindrecord_path, shard_num=1, overwrite=True)
     nlp_schema = {"input_ids": {"type": "int64", "shape": [-1]},