diff --git a/official/nlp/fasttext/src/load_dataset.py b/official/nlp/fasttext/src/load_dataset.py
index 708b08bffd0432bc3467e6542e9f13642908cea3..4a78187a68fb40ec678a1dd40c7a053f82fb37d3 100644
--- a/official/nlp/fasttext/src/load_dataset.py
+++ b/official/nlp/fasttext/src/load_dataset.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
+# Copyright 2020-2022 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -35,14 +35,14 @@ def load_dataset(dataset_path,
                                   shuffle=shuffle,
                                   num_shards=rank_size,
                                   shard_id=rank_id,
-                                  num_parallel_workers=4)
+                                  num_parallel_workers=32)
         ori_dataset_size = data_set.get_dataset_size()
         print(f"Dataset size: {ori_dataset_size}")
        repeat_count = epoch_count
         data_set = data_set.rename(input_columns=['src_tokens', 'src_tokens_length', 'label_idx'],
                                    output_columns=['src_token_text', 'src_tokens_text_length', 'label_idx_tag'])
-        data_set = data_set.batch(batch_size, drop_remainder=False)
+        data_set = data_set.batch(batch_size, drop_remainder=False, num_parallel_workers=32)
         data_set = data_set.repeat(repeat_count)
 
         return data_set
diff --git a/research/hpc/pafnucy/README.md b/research/hpc/pafnucy/README.md
index 06316007312e05169823bcdc9b6f345ef87dfbfe..8ed9df51dc8a5aa3080ed40f3650651831c29b01 100644
--- a/research/hpc/pafnucy/README.md
+++ b/research/hpc/pafnucy/README.md
@@ -128,6 +128,35 @@ After installing MindSpore via the official website, you can start training and
 <https://gitee.com/mindspore/models/tree/master/utils/hccl_tools>.
 
+If you want to run on ModelArts, please check the official documentation of [ModelArts](https://support.huaweicloud.com/modelarts/), and then you can start training and evaluation as follows:
+
+```python
+# run distributed training on ModelArts example
+# (1) First, perform a or b.
+#       a. Set "enable_modelarts=True" in the yaml file.
+#          Set the other parameters you need in the yaml file.
+#       b. Add "enable_modelarts=True" on the website UI interface.
+#          Add the other parameters on the website UI interface.
+# (2) Set the code directory to "/path/pafnucy" on the website UI interface.
+# (3) Set the startup file to "train.py" on the website UI interface.
+# (4) Set the "Dataset path", "Output file path" and "Job log path" to your paths on the website UI interface.
+# (5) Create your job.
+
+# run evaluation on ModelArts example
+# (1) Copy or upload your trained model to an S3 bucket.
+# (2) Perform a or b.
+#       a. Set "enable_modelarts=True" in the yaml file.
+#          Set "ckpt_file='model.ckpt'" in the yaml file.
+#          Set "checkpoint_url=/The path of checkpoint in S3/" in the yaml file.
+#       b. Add "enable_modelarts=True" on the website UI interface.
+#          Add "ckpt_file='checkpoint_file_name'" on the website UI interface.
+#          Add "checkpoint_url=/The path of checkpoint in S3/" on the website UI interface.
+# (3) Set the code directory to "/path/pafnucy" on the website UI interface.
+# (4) Set the startup file to "eval.py" on the website UI interface.
+# (5) Set the "Dataset path", "Output file path" and "Job log path" to your paths on the website UI interface.
+# (6) Create your job.
+```
+
 # [Script Description](#contents)
 
 ## [Script and Sample Code](#contents)
 
diff --git a/research/hpc/pafnucy/default_config.yaml b/research/hpc/pafnucy/default_config.yaml
index f094a0a31a194e68417e570b8a77a0e536d73dcc..9428a0dadea96a0841e40fa9beb9929af09847fb 100644
--- a/research/hpc/pafnucy/default_config.yaml
+++ b/research/hpc/pafnucy/default_config.yaml
@@ -3,9 +3,9 @@ enable_modelarts: False
 data_url: ""
 train_url: ""
 checkpoint_url: ""
-data_path: "/data/pafnucy/tests/data/dataset"
+data_path: "/cache/data"
 output_path: "/cache/train"
-load_path: "/cache/checkpoint_path"
+load_path: "/cache/data"
 mindrecord_path: ""
 device_target: Ascend
 enable_profiling: False
diff --git a/research/hpc/pafnucy/eval.py b/research/hpc/pafnucy/eval.py
index c519d5619b1cd56369c5b4b3e3de0260a6c8fda2..3deeea520b654d40a97f523aa685722fe93bea78 100644
--- a/research/hpc/pafnucy/eval.py
+++ b/research/hpc/pafnucy/eval.py
@@ -38,6 +38,8 @@ def get_data_size(csv_path):
 
 @moxing_wrapper(pre_process=modelarts_pre_process)
 def run_eval():
+    config.logger = get_logger('./', config.device_id)
+    config.logger.save_args(config)
     context.set_context(mode=context.GRAPH_MODE, device_target='Ascend', device_id=config.device_id)
     network = SBNetWork(in_chanel=[19, 64, 128],
                         out_chanle=config.conv_channels,
@@ -45,7 +47,13 @@
                         lmbda=config.lmbda, isize=config.isize, keep_prob=1.0)
     network.set_train(False)
-    param_dict = load_checkpoint(config.ckpt_file)
+    if config.enable_modelarts:
+        config.mindrecord_path = config.data_path
+        config.ckpt_path = config.load_path
+        print("----------&&&&------", config.ckpt_path, flush=True)
+        param_dict = load_checkpoint(os.path.join(config.ckpt_path, config.ckpt_file))
+    else:
+        param_dict = load_checkpoint(config.ckpt_file)
     load_param_into_net(network, param_dict)
     val_size = get_data_size(config.mindrecord_path)
     val_path = './val/validation_dataset.mindrecord'
 
@@ -66,6 +74,4 @@
 
 
 if __name__ == '__main__':
-    config.logger = get_logger('./', config.device_id)
-    config.logger.save_args(config)
     run_eval()
diff --git a/research/hpc/pafnucy/predict.py b/research/hpc/pafnucy/predict.py
index c292eaa136902cdd40c1338871ef55871a3e00aa..368b2b89e6c5be2eca1485a405bd3f06945a9f0b 100644
--- a/research/hpc/pafnucy/predict.py
+++ b/research/hpc/pafnucy/predict.py
@@ -26,6 +26,7 @@ from mindspore.train.serialization import load_checkpoint, load_param_into_net
 from src.net import SBNetWork
 from src.logger import get_logger
 from src.model_utils.config import config
+from src.model_utils.device_adapter import get_device_id
 from src.model_utils.moxing_adapter import moxing_wrapper
 from src.data import Featurizer, make_grid
 
@@ -100,16 +101,26 @@ def load_evaldata(configs, data_path):
 
 
 @moxing_wrapper(pre_process=modelarts_pre_process)
 def run_eval():
-    context.set_context(mode=context.GRAPH_MODE, device_target='Ascend', device_id=5)
+    config.logger = get_logger('./', config.device_id)
+    config.logger.save_args(config)
+    if config.enable_modelarts:
+        path = config.data_path
+        config.ckpt_path = config.load_path
+        param_dict = load_checkpoint(os.path.join(config.ckpt_path, config.ckpt_file))
+        hdf_file_path = os.path.join(path, config.hdf_file)
+    else:
+        path = config.predict_input
+        param_dict = load_checkpoint(config.ckpt_file)
+        hdf_file_path = path
+    device_id = get_device_id()
+    context.set_context(mode=context.GRAPH_MODE, device_target='Ascend', device_id=device_id)
     network = SBNetWork(in_chanel=[19, 64, 128],
                         out_chanle=config.conv_channels,
                         dense_size=config.dense_sizes,
                         lmbda=config.lmbda, isize=config.isize, keep_prob=1.0, is_training=False)
     network.set_train(False)
-    hdf_file_path = input_file(config.hdf_file)
     data_loader, names = load_evaldata(config, hdf_file_path)
-    param_dict = load_checkpoint(config.ckpt_file)
     load_param_into_net(network, param_dict)
     prediction = []
     for data in data_loader.create_dict_iterator(output_numpy=True):
@@ -129,9 +140,4 @@
 
 
 if __name__ == '__main__':
-    config.logger = get_logger('./', config.device_id)
-    config.logger.save_args(config)
-    path = config.predict_input
-    paths = input_file(path)
-    config.hdf_file = paths
     run_eval()
diff --git a/research/hpc/pafnucy/train.py b/research/hpc/pafnucy/train.py
index a8a21d6ccd38415b04f6b72fceba8a495ab34f66..f136b3f6eee345beb08436e1e588aa055ac57ca4 100644
--- a/research/hpc/pafnucy/train.py
+++ b/research/hpc/pafnucy/train.py
@@ -22,14 +22,14 @@ from mindspore import context
 from mindspore.common.tensor import Tensor
 from mindspore.common import set_seed
 from mindspore.nn import TrainOneStepCell
+from mindspore.common import dtype as mstype
 from mindspore.communication.management import init
 from mindspore.context import ParallelMode
 from mindspore.train.serialization import save_checkpoint
-
 from src.logger import get_logger
 from src.model_utils.config import config
 from src.model_utils.moxing_adapter import moxing_wrapper
-from src.model_utils.device_adapter import get_device_id, get_device_num
+from src.model_utils.device_adapter import get_device_id, get_device_num, get_rank_id
 from src.net import SBNetWork
 from src.dataloader import minddataset_loader, minddataset_loader_val
 
@@ -68,13 +68,22 @@ def run_train():
                         dense_size=config.dense_sizes,
                         lmbda=config.lmbda, isize=config.isize, keep_prob=config.keep_prob)
-    lr = Tensor(float(config.lr))
+    lr = Tensor(float(config.lr), mstype.float32)
+    if config.enable_modelarts:
+        config.mindrecord_path = config.data_path
+        config.ckpt_path = os.path.join(config.output_path, 'ckpt_' + str(get_rank_id()))
+        if not os.path.exists(config.ckpt_path):
+            os.mkdir(config.ckpt_path)
+            config.logger.info("mkdir %s", config.ckpt_path)
+    else:
+        config.ckpt_path = './ckpt/'
     optimizer = nn.Adam(params=network.trainable_params(), learning_rate=lr, weight_decay=config.weight_decay)
     train_wrapper = TrainOneStepCell(network, optimizer=optimizer)
     train_size, val_size = get_data_size(config.mindrecord_path)
     rot_path = './train_rotation/train_rotation_dataset.mindrecord'
     nrot_path = './no_rotation/train_norotation_dataset.mindrecord'
     val_path = './val/validation_dataset.mindrecord'
+    print(os.listdir(config.mindrecord_path))
     rotation_data, _ = minddataset_loader(configs=config,
                                           mindfile=os.path.join(config.mindrecord_path, rot_path),
                                           no_batch_size=train_size)
 
@@ -130,7 +139,7 @@
         if final_mse_v <= stand_mse_v:
             stand_mse_v = final_mse_v
             config.logger.info("Saving checkpoint file")
-            save_checkpoint(train_wrapper, f'./ckpt/pafnucy_{final_mse_v}_{epoch}.ckpt')
+            save_checkpoint(train_wrapper, f'{config.ckpt_path}/pafnucy_{final_mse_v}_{epoch}.ckpt')
         config.logger.info("Finish Training.....")
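The pafnucy hunks in `eval.py`, `predict.py`, and `train.py` all apply the same pattern: when `enable_modelarts` is true, dataset and checkpoint locations are remapped to the `/cache` directories declared in `default_config.yaml` before the checkpoint is resolved. Below is a minimal, self-contained sketch of that pattern for reference; the `Config` dataclass and `resolve_eval_paths` helper are illustrative names, not code from this patch.

```python
import os
from dataclasses import dataclass


@dataclass
class Config:
    """Illustrative stand-in for the yaml-backed config the scripts use."""
    enable_modelarts: bool
    data_path: str = "/cache/data"   # inputs staged here by the moxing wrapper
    load_path: str = "/cache/data"   # where the checkpoint lands on ModelArts
    mindrecord_path: str = ""
    ckpt_file: str = "pafnucy.ckpt"


def resolve_eval_paths(config: Config) -> str:
    """Return the checkpoint path, remapping inputs when running on ModelArts."""
    if config.enable_modelarts:
        # On ModelArts all inputs sit under /cache, so the mindrecord data and
        # the checkpoint are both resolved relative to the cache directories.
        config.mindrecord_path = config.data_path
        return os.path.join(config.load_path, config.ckpt_file)
    # Locally, ckpt_file is expected to be a full path given on the command line.
    return config.ckpt_file


print(resolve_eval_paths(Config(enable_modelarts=True)))   # /cache/data/pafnucy.ckpt
print(resolve_eval_paths(Config(enable_modelarts=False)))  # pafnucy.ckpt
```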