diff --git a/research/nlp/lstm_crf/README.md b/research/nlp/lstm_crf/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..75b6c0fc5d866504ec0fb9d03a29b8c198ab1e88
--- /dev/null
+++ b/research/nlp/lstm_crf/README.md
@@ -0,0 +1,140 @@
+# Contents
+
+- [Contents](#contents)
+- [LSTM-CRF Description](#lstm-crf-description)
+- [Model Architecture](#model-architecture)
+- [Dataset](#dataset)
+- [Environment Requirements](#environment-requirements)
+- [Quick Start](#quick-start)
+- [Script Description](#script-description)
+- [Model Description](#model-description)
+    - [Performance](#performance)
+        - [Training Performance](#training-performance)
+        - [Evaluation Performance](#evaluation-performance)
+- [Description of Random Situation](#description-of-random-situation)
+- [ModelZoo Homepage](#modelzoo-homepage)
+
+# [LSTM-CRF Description](#contents)
+
+This example is for LSTM-CRF model training and evaluation.
+
+[Paper](https://arxiv.org/abs/1508.01991): Zhiheng Huang, Wei Xu, Kai Yu. [Bidirectional LSTM-CRF Models for Sequence Tagging](https://arxiv.org/abs/1508.01991).
+
+# [Model Architecture](#contents)
+
+LSTM-CRF contains embedding, encoder and decoder modules. The encoder module consists of an LSTM layer, and the decoder module consists of a fully-connected layer; the output of the fully-connected layer is taken as the input of the CRF.
+
+# [Dataset](#contents)
+
+Note that you can run the scripts with the dataset mentioned in the original paper or one widely used in the relevant domain/network architecture. In the following sections, we introduce how to run the scripts using the datasets below.
+
+- CoNLL2000 for training and evaluation: [CoNLL 2000 chunking](https://www.clips.uantwerpen.be/conll2000/chunking/)
+- GloVe for vector representations of words: [GloVe: Global Vectors for Word Representation](https://nlp.stanford.edu/projects/glove/)
+
+# [Environment Requirements](#contents)
+
+- Hardware (CPU/Ascend)
+    - Prepare hardware environment with Ascend or CPU processor.
+- Framework
+    - [MindSpore](https://gitee.com/mindspore/mindspore)
+- For more information, please check the resources below:
+    - [MindSpore Tutorials](https://www.mindspore.cn/tutorials/en/master/index.html)
+    - [MindSpore Python API](https://www.mindspore.cn/docs/api/en/master/index.html)
+
+# [Quick Start](#contents)
+
+- build data
+
+    ```bash
+    # run the build-data example
+    bash run_bulid_data.sh ../data/CoNLL2000 ../data/glove
+    ```
+
+- running on Ascend
+
+    ```bash
+    # run training example
+    bash run_train_ascend.sh 0 ../data/CoNLL2000
+
+    # run evaluation example
+    bash run_eval_ascend.sh 0 ../data/CoNLL2000 lstm_crf-15_446.ckpt
+    ```
+
+- running on CPU
+
+    ```bash
+    # run training example
+    bash run_train_cpu.sh ../data/CoNLL2000
+
+    # run evaluation example
+    bash run_eval_cpu.sh ../data/CoNLL2000 lstm_crf-15_446.ckpt
+    ```
+
+# [Script Description](#contents)
+
+```shell
+.
+├── lstm_crf
+    ├── README.md                   # descriptions about LSTM-CRF
+    ├── scripts
+    │   ├── run_bulid_data.sh       # shell script for building the data
+    │   ├── run_eval_ascend.sh      # shell script for evaluation on Ascend
+    │   ├── run_eval_cpu.sh         # shell script for evaluation on CPU
+    │   ├── run_export_ascend.sh    # shell script for export on Ascend
+    │   ├── run_train_ascend.sh     # shell script for training on Ascend
+    │   └── run_train_cpu.sh        # shell script for training on CPU
+    ├── src
+    │   ├── LSTM.py                 # lstm model
+    │   ├── LSTM_CRF.py             # lstm_crf model
+    │   ├── dataset.py              # dataset preprocess
+    │   ├── imdb.py                 # CoNLL2000 dataset read script
+    │   ├── util.py                 # utils script
+    │   └── model_utils
+    │       ├── config.py           # processing configuration parameters
+    │       ├── device_adapter.py   # get cloud ID
+    │       ├── local_adapter.py    # get local ID
+    │       └── moxing_adapter.py   # parameter processing
+    ├── default_config.yaml         # training parameter profile (cpu/ascend)
+    ├── requirements.txt            # python dependencies
+    ├── eval.py                     # evaluation script on CPU and Ascend
+    ├── train.py                    # training script on CPU and Ascend
+    └── export.py                   # export script on CPU and Ascend
+```
+
+# [Model Description](#contents)
+
+## [Performance](#contents)
+
+### Training Performance
+
+| Parameters                 | LSTM_CRF (Ascend)          | LSTM_CRF (CPU)             |
+| -------------------------- | -------------------------- | -------------------------- |
+| Resource                   | Ascend 910                 | Windows 10, i7-9700, 32 GB |
+| uploaded Date              | 12/28/2021 (month/day/year)| 12/28/2021 (month/day/year)|
+| MindSpore Version          | 1.6.0                      | 1.6.0                      |
+| Dataset                    | CoNLL2000                  | CoNLL2000                  |
+| Training Parameters        | epoch=15, batch_size=20    | epoch=15, batch_size=20    |
+| Optimizer                  | AdamWeightDecay            | AdamWeightDecay            |
+| Loss Function              | CRF LOSS                   | CRF LOSS                   |
+| Checkpoint for inference   | 36.7M (.ckpt file)         | 36.7M (.ckpt file)         |
+| Scripts                    | [lstm_crf script](https://gitee.com/mindspore/models/tree/master/research/nlp/lstm_crf) | [lstm_crf script](https://gitee.com/mindspore/models/tree/master/research/nlp/lstm_crf) |
+
+### Evaluation Performance
+
+| Parameters          | LSTM_CRF (Ascend)            | LSTM_CRF (CPU)               |
+| ------------------- | ---------------------------- | ---------------------------- |
+| Resource            | Ascend 910                   | Ubuntu x86, i7-8565U, 16 GB  |
+| uploaded Date       | 12/28/2021 (month/day/year)  | 12/28/2021 (month/day/year)  |
+| MindSpore Version   | 1.6.0                        | 1.6.0                        |
+| Dataset             | CoNLL2000                    | CoNLL2000                    |
+| batch_size          | 20                           | 20                           |
+| F1                  | 92.05%                       | 92.4%                        |
+
+# [Description of Random Situation](#contents)
+
+There are two random situations:
+
+- Shuffle of the dataset.
+- Initialization of some model weights.
+
+# [ModelZoo Homepage](#contents)
+
+Please check the official [homepage](https://gitee.com/mindspore/models).
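+
+# [Export](#contents)
+
+As an addendum, a MINDIR model can be exported from a trained checkpoint. A minimal invocation, mirroring the arguments of `scripts/run_export_ascend.sh` (device id, CoNLL2000 directory, checkpoint file):
+
+```bash
+bash run_export_ascend.sh 0 ../data/CoNLL2000 lstm_crf-15_446.ckpt
+```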
diff --git a/research/nlp/lstm_crf/default_config.yaml b/research/nlp/lstm_crf/default_config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..3725d5b17cd433a7c58b209386ac6046d4fd2310
--- /dev/null
+++ b/research/nlp/lstm_crf/default_config.yaml
@@ -0,0 +1,81 @@
+# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
+enable_modelarts: False
+data_url: ""
+train_url: ""
+checkpoint_url: ""
+data_path: "./data"
+output_path: "./output/train"
+load_path: "./output/checkpoint_path"
+checkpoint_path: './checkpoint/'
+checkpoint_file: './checkpoint/lstm_crf.ckpt'
+device_target: 'CPU'
+device_id: 1
+enable_profiling: False
+ckpt_save_path: '../ckpt_lstm_crf'
+
+
+# ==============================================================================
+# LSTM_CRF CONFIG
+num_epochs: 15
+batch_size: 20
+embed_size: 300
+num_hiddens: 320
+num_layers: 320  # note: src/LSTM_CRF.py currently builds a single LSTM layer internally
+bidirectional: True
+keep_checkpoint_max: 20
+dropout: 0.5
+build_data: False
+
+
+# optimizer related
+optimizer: 'AdamWeightDecay'
+AdamWeightDecay:
+    warmup_steps: 1000
+    beta1: 0.9
+    beta2: 0.99
+    learning_rate: 0.002 # 2e-3
+    end_learning_rate: 0.0000000001 # 1e-10
+    power: 1.0
+    weight_decay: 0.00001 # 1e-5
+    decay_filter: ['layernorm', 'bias']
+    eps: 0.000001 # 1e-6
+
+
+# MindSpore LSTM_CRF Example - train.py
+preprocess: 'false'
+data_CoNLL_path: "./data/CoNLL2000"
+glove_path: "./data/glove"
+ckpt_path: 'lstm_crf-15_446.ckpt'
+pre_trained: '' # None
+device_num: 1
+distribute: "false"
+enable_graph_kernel: "true"
+
+
+# export.py
+ckpt_file: './ckpt_lstm/lstm_crf.ckpt'
+file_name: "lstm_crf"
+file_format: "MINDIR"
+
+
+---
+# Config description for each option
+enable_modelarts: 'Whether training on modelarts, default: False'
+data_url: 'Dataset url for obs'
+train_url: 'Training output url for obs'
+data_path: 'Dataset path for local'
+output_path: 'Training output path for local'
+
+preprocess: 'whether to preprocess data.'
+data_CoNLL_path: 'path where the dataset is stored.'
+glove_path: 'path where the GloVe is stored.'
+ckpt_path: 'the path to save the checkpoint file.'
+pre_trained: 'the pretrained checkpoint file path.'
+device_target: 'the target device to run, supports "Ascend", "GPU", "CPU". Default: "CPU".'
+device_num: 'Use device nums, default is 1.'
+enable_graph_kernel: 'Accelerate by graph kernel, default is true.'
+
+---
+device_target: ['Ascend', 'GPU', 'CPU']
+enable_graph_kernel: ['true', 'false']
+file_format: ['AIR', 'MINDIR']
diff --git a/research/nlp/lstm_crf/eval.py b/research/nlp/lstm_crf/eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..23ee5bc6ac78209c4ec52f6b1f82270d3f9b605b
--- /dev/null
+++ b/research/nlp/lstm_crf/eval.py
@@ -0,0 +1,114 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+#################eval lstm-crf example on CoNLL2000########################
+"""
+import os
+from copy import deepcopy
+import numpy as np
+
+from src.util import F1, get_chunks, get_label_lists
+from src.model_utils.config import config
+from src.dataset import get_data_set
+from src.LSTM_CRF import Lstm_CRF
+from src.imdb import ImdbParser
+import mindspore
+from mindspore import Tensor, Model, context
+from mindspore.train.serialization import load_checkpoint, load_param_into_net
+
+def modelarts_process():
+    config.ckpt_file = os.path.join(config.output_path, config.ckpt_file)
+
+
+def eval_lstm_crf():
+    """ eval lstm_crf """
+    print('\neval.py config: \n', config)
+
+    context.set_context(
+        mode=context.GRAPH_MODE,
+        save_graphs=False,
+        device_id=config.device_id,
+        device_target=config.device_target
+    )
+
+    parser = ImdbParser(config.data_CoNLL_path,
+                        config.glove_path,
+                        config.data_CoNLL_path,
+                        embed_size=config.embed_size
+                        )
+    embeddings, sequence_length, _, _, sequence_index, sequence_tag_index, tags_to_index_map \
+        = parser.get_datas_embeddings(seg=['test'], build_data=False)
+    embeddings_table = Tensor(embeddings, mindspore.float32)
+    ds_test = get_data_set(sequence_index, sequence_tag_index, config.batch_size)
+
+    network = Lstm_CRF(vocab_size=embeddings.shape[0],
+                       tag_to_index=tags_to_index_map,
+                       embedding_size=config.embed_size,
+                       hidden_size=config.num_hiddens,
+                       num_layers=config.num_layers,
+                       weight=embeddings_table,
+                       bidirectional=config.bidirectional,
+                       batch_size=config.batch_size,
+                       seq_length=sequence_length,
+                       is_training=False)
+
+    callback = F1(len(tags_to_index_map))
+    model = Model(network)
+
+    param_dict = load_checkpoint(os.path.join(config.ckpt_save_path, config.ckpt_path))
+    load_param_into_net(network, param_dict)
+    print("============== Starting Testing ==============")
+    rest_golds_list = list()
+    rest_preds_list = list()
+    columns_list = ["feature", "label"]
+    for data in ds_test.create_dict_iterator(num_epochs=1):
+        input_data = []
+        for i in columns_list:
+            input_data.append(data[i])
+        feature, label = input_data
+        logits = model.predict(feature, label)
+        logit_ids, label_ids = callback.update(logits, label)
+
+        rest_preds = np.array(logit_ids)
+        rest_preds = np.expand_dims(rest_preds, 0)
+
+        rest_labels = deepcopy(label_ids)
+        label_ids = np.expand_dims(label_ids, 0)
+        rest_labels = np.expand_dims(rest_labels, 0)
+
+        rest_golds, rest_preds = get_label_lists(rest_labels, rest_preds, label_ids)
+
+        rest_golds_list += rest_golds
+        rest_preds_list += rest_preds
+
+    accs = []
+    correct_preds, total_correct, total_preds = 0., 0., 0.
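+    # Chunk-level scoring over the accumulated sequences (see the loop below):
+    #   precision = correct chunks / predicted chunks
+    #   recall    = correct chunks / gold chunks
+    #   F1        = harmonic mean of precision and recall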
+    for golds, preds in zip(rest_golds_list, rest_preds_list):
+        accs += [a == b for (a, b) in zip(golds, preds)]
+        golds_chunks = set(get_chunks(golds, tags_to_index_map))
+        preds_chunks = set(get_chunks(preds, tags_to_index_map))
+        correct_preds += len(golds_chunks & preds_chunks)
+        total_preds += len(preds_chunks)
+        total_correct += len(golds_chunks)
+
+    p = correct_preds / total_preds if correct_preds > 0 else 0
+    r = correct_preds / total_correct if correct_preds > 0 else 0
+    f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0
+    acc = np.mean(accs)
+    print("acc: {:.6f}%, F1: {:.6f}% ".format(acc*100, f1*100))
+
+
+if __name__ == '__main__':
+    eval_lstm_crf()
diff --git a/research/nlp/lstm_crf/export.py b/research/nlp/lstm_crf/export.py
new file mode 100644
index 0000000000000000000000000000000000000000..5212204e160bb42438c5dfea3ff813dc2a343ebb
--- /dev/null
+++ b/research/nlp/lstm_crf/export.py
@@ -0,0 +1,73 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+##############export checkpoint file into mindir model#################
+python export.py
+"""
+import os
+import numpy as np
+
+from src.LSTM_CRF import Lstm_CRF
+from src.model_utils.config import config
+from src.model_utils.device_adapter import get_device_id
+from src.imdb import ImdbParser
+
+import mindspore
+from mindspore import Tensor, context
+from mindspore import export, load_checkpoint, load_param_into_net
+
+def modelarts_process():
+    config.ckpt_file = os.path.join(config.output_path, config.ckpt_file)
+
+def export_lstm_crf():
+    """ export lstm_crf """
+    context.set_context(
+        mode=context.GRAPH_MODE,
+        save_graphs=False,
+        device_target=config.device_target,
+        enable_graph_kernel=False,
+        device_id=get_device_id())
+
+    parser = ImdbParser(config.data_CoNLL_path,
+                        config.glove_path,
+                        config.data_CoNLL_path,
+                        embed_size=config.embed_size)
+
+    embeddings, sequence_length, _, _, _, _, tags_to_index_map \
+        = parser.get_datas_embeddings(seg=['test'], build_data=False)
+
+    embeddings_table = Tensor(embeddings, mindspore.float32)
+
+    network = Lstm_CRF(vocab_size=embeddings_table.shape[0],
+                       tag_to_index=tags_to_index_map,
+                       embedding_size=config.embed_size,
+                       hidden_size=config.num_hiddens,
+                       num_layers=config.num_layers,
+                       weight=embeddings_table,
+                       bidirectional=config.bidirectional,
+                       batch_size=config.batch_size,
+                       seq_length=sequence_length,
+                       is_training=True)
+
+    param_dict = load_checkpoint(os.path.join(config.ckpt_save_path, config.ckpt_path))
+    load_param_into_net(network, param_dict)
+
+    # random in-range token/tag ids, used only to trace the graph for export
+    input_arr_features = Tensor(np.random.randint(0, embeddings.shape[0], size=[config.batch_size, sequence_length]).astype(np.int32))
+    input_arr_labels = Tensor(np.random.randint(0, len(tags_to_index_map), size=[config.batch_size, sequence_length]).astype(np.int32))
+    export(network, input_arr_features, input_arr_labels, file_name=config.file_name, file_format=config.file_format)
+
+
+if __name__ == '__main__':
+    export_lstm_crf()
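For reference, a minimal sketch (not part of this patch) of running the exported MINDIR with MindSpore's `load`/`nn.GraphCell`. The file name and batch size are taken from `default_config.yaml` (`file_name: "lstm_crf"`, `batch_size: 20`); `SEQ_LEN` is a placeholder for the padded sequence length written by the build-data step:

```python
import numpy as np
import mindspore as ms
from mindspore import Tensor, nn

SEQ_LEN = 113  # placeholder: use the real padded length from max_length.txt

graph = ms.load("lstm_crf.mindir")       # parse the exported MINDIR graph
net = nn.GraphCell(graph)                # wrap the graph as a callable cell
features = Tensor(np.zeros((20, SEQ_LEN), np.int32))  # token ids
labels = Tensor(np.zeros((20, SEQ_LEN), np.int32))    # the graph was exported with two inputs
print(net(features, labels))
```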
diff --git a/research/nlp/lstm_crf/requirements.txt b/research/nlp/lstm_crf/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..f26d4a3657859f13f7a3bdd813f7b874702764ac --- /dev/null +++ b/research/nlp/lstm_crf/requirements.txt @@ -0,0 +1,3 @@ +gensim +numpy +pyyaml \ No newline at end of file diff --git a/research/nlp/lstm_crf/scripts/run_bulid_data.sh b/research/nlp/lstm_crf/scripts/run_bulid_data.sh new file mode 100644 index 0000000000000000000000000000000000000000..07d595ca870f1a2bc0b7b2d6d075d01068bd2888 --- /dev/null +++ b/research/nlp/lstm_crf/scripts/run_bulid_data.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +echo "==============================================================================================================" +echo "Please run the script as: " +echo "bash run_bulid_data.sh CoNLL2000_DIR GLOVE_DIR" +echo "for example: bash run_bulid_data.sh ../data/CoNLL2000 ../data/glove" +echo "==============================================================================================================" + +CoNLL2000_DIR=$1 +GLOVE_DIR=$2 + +mkdir -p ms_log +CUR_DIR=`pwd` +export GLOG_log_dir=${CUR_DIR}/ms_log +export GLOG_logtostderr=0 + +BASE_PATH=$(cd ./"`dirname $0`" || exit; pwd) +CONFIG_FILE="${BASE_PATH}/../default_config.yaml" + +python ../train.py \ + --config_path=$CONFIG_FILE \ + --data_CoNLL_path=${CoNLL2000_DIR}\ + --glove_path=${GLOVE_DIR}\ + --device_target="CPU" \ + --build_data=True \ + --preprocess=true \ + --preprocess_path=./preprocess > log_build_data.txt 2>&1 & diff --git a/research/nlp/lstm_crf/scripts/run_eval_ascend.sh b/research/nlp/lstm_crf/scripts/run_eval_ascend.sh new file mode 100644 index 0000000000000000000000000000000000000000..fd07dd0e4940183778e51c29c091f8d591556028 --- /dev/null +++ b/research/nlp/lstm_crf/scripts/run_eval_ascend.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================
+
+echo "=============================================================================================================="
+echo "Please run the script as: "
+echo "bash run_eval_ascend.sh DEVICE_ID CoNLL2000_DIR CKPT_FILE"
+echo "for example: bash run_eval_ascend.sh 0 ../data/CoNLL2000 lstm_crf-15_446.ckpt"
+echo "=============================================================================================================="
+
+DEVICE_ID=$1
+CoNLL2000_DIR=$2
+CKPT_FILE=$3
+
+mkdir -p ms_log
+CUR_DIR=`pwd`
+export GLOG_log_dir=${CUR_DIR}/ms_log
+export GLOG_logtostderr=0
+
+BASE_PATH=$(cd ./"`dirname $0`" || exit; pwd)
+CONFIG_FILE="${BASE_PATH}/../default_config.yaml"
+
+
+python ../eval.py \
+    --config_path=$CONFIG_FILE \
+    --device_target="Ascend" \
+    --device_id=${DEVICE_ID}\
+    --data_CoNLL_path=${CoNLL2000_DIR}\
+    --ckpt_path=${CKPT_FILE}\
+    --build_data=False \
+    --preprocess=true \
+    --preprocess_path=./preprocess > log_eval_ascend.txt 2>&1 &
\ No newline at end of file
diff --git a/research/nlp/lstm_crf/scripts/run_eval_cpu.sh b/research/nlp/lstm_crf/scripts/run_eval_cpu.sh
new file mode 100644
index 0000000000000000000000000000000000000000..727901ab4072ae2e6f0588fb0447e3363aeb191b
--- /dev/null
+++ b/research/nlp/lstm_crf/scripts/run_eval_cpu.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+echo "=============================================================================================================="
+echo "Please run the script as: "
+echo "bash run_eval_cpu.sh CoNLL2000_DIR CKPT_FILE"
+echo "for example: bash run_eval_cpu.sh ../data/CoNLL2000 lstm_crf-15_446.ckpt"
+echo "=============================================================================================================="
+
+CoNLL2000_DIR=$1
+CKPT_FILE=$2
+
+mkdir -p ms_log
+CUR_DIR=`pwd`
+export GLOG_log_dir=${CUR_DIR}/ms_log
+export GLOG_logtostderr=0
+
+BASE_PATH=$(cd ./"`dirname $0`" || exit; pwd)
+CONFIG_FILE="${BASE_PATH}/../default_config.yaml"
+
+
+python ../eval.py \
+    --config_path=$CONFIG_FILE \
+    --device_target="CPU" \
+    --data_CoNLL_path=${CoNLL2000_DIR}\
+    --ckpt_path=${CKPT_FILE}\
+    --build_data=False \
+    --preprocess=true \
+    --preprocess_path=./preprocess > log_eval_cpu.txt 2>&1 &
\ No newline at end of file
diff --git a/research/nlp/lstm_crf/scripts/run_export_ascend.sh b/research/nlp/lstm_crf/scripts/run_export_ascend.sh
new file mode 100644
index 0000000000000000000000000000000000000000..cc2fd348946c920504e5b33d5e62901a66c966e2
--- /dev/null
+++ b/research/nlp/lstm_crf/scripts/run_export_ascend.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+echo "=============================================================================================================="
+echo "Please run the script as: "
+echo "bash run_export_ascend.sh DEVICE_ID CoNLL2000_DIR CKPT_FILE"
+echo "for example: bash run_export_ascend.sh 0 ../data/CoNLL2000 lstm_crf-15_446.ckpt"
+echo "=============================================================================================================="
+
+DEVICE_ID=$1
+CoNLL2000_DIR=$2
+CKPT_FILE=$3
+
+mkdir -p ms_log
+CUR_DIR=`pwd`
+export GLOG_log_dir=${CUR_DIR}/ms_log
+export GLOG_logtostderr=0
+
+BASE_PATH=$(cd ./"`dirname $0`" || exit; pwd)
+CONFIG_FILE="${BASE_PATH}/../default_config.yaml"
+
+
+python ../export.py \
+    --config_path=$CONFIG_FILE \
+    --device_target="Ascend" \
+    --device_id=${DEVICE_ID}\
+    --data_CoNLL_path=${CoNLL2000_DIR}\
+    --ckpt_path=${CKPT_FILE}\
+    --build_data=False \
+    --preprocess=true \
+    --preprocess_path=./preprocess > log_export.txt 2>&1 &
diff --git a/research/nlp/lstm_crf/scripts/run_train_ascend.sh b/research/nlp/lstm_crf/scripts/run_train_ascend.sh
new file mode 100644
index 0000000000000000000000000000000000000000..089496c7634bcbd49ad76b38b13326360917f3a8
--- /dev/null
+++ b/research/nlp/lstm_crf/scripts/run_train_ascend.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+echo "=============================================================================================================="
+echo "Please run the script as: "
+echo "bash run_train_ascend.sh DEVICE_ID CoNLL2000_DIR"
+echo "for example: bash run_train_ascend.sh 0 ../data/CoNLL2000"
+echo "=============================================================================================================="
+
+DEVICE_ID=$1
+CoNLL2000_DIR=$2
+
+mkdir -p ms_log
+CUR_DIR=`pwd`
+export GLOG_log_dir=${CUR_DIR}/ms_log
+export GLOG_logtostderr=0
+
+BASE_PATH=$(cd ./"`dirname $0`" || exit; pwd)
+CONFIG_FILE="${BASE_PATH}/../default_config.yaml"
+
+
+python ../train.py \
+    --config_path=$CONFIG_FILE \
+    --device_target="Ascend" \
+    --device_id=${DEVICE_ID}\
+    --data_CoNLL_path=${CoNLL2000_DIR}\
+    --build_data=False \
+    --preprocess=true \
+    --preprocess_path=./preprocess > log_train_ascend.txt 2>&1 &
diff --git a/research/nlp/lstm_crf/scripts/run_train_cpu.sh b/research/nlp/lstm_crf/scripts/run_train_cpu.sh
new file mode 100644
index 0000000000000000000000000000000000000000..10f7b48c630f190c869e530e177677a3c96d938d
--- /dev/null
+++ b/research/nlp/lstm_crf/scripts/run_train_cpu.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+echo "=============================================================================================================="
+echo "Please run the script as: "
+echo "bash run_train_cpu.sh CoNLL2000_DIR"
+echo "for example: bash run_train_cpu.sh ../data/CoNLL2000"
+echo "=============================================================================================================="
+
+CoNLL2000_DIR=$1
+
+mkdir -p ms_log
+CUR_DIR=`pwd`
+export GLOG_log_dir=${CUR_DIR}/ms_log
+export GLOG_logtostderr=0
+
+BASE_PATH=$(cd ./"`dirname $0`" || exit; pwd)
+CONFIG_FILE="${BASE_PATH}/../default_config.yaml"
+
+
+python ../train.py \
+    --config_path=$CONFIG_FILE \
+    --device_target="CPU" \
+    --data_CoNLL_path=${CoNLL2000_DIR}\
+    --build_data=False \
+    --preprocess=true \
+    --preprocess_path=./preprocess > log_train_cpu.txt 2>&1 &
diff --git a/research/nlp/lstm_crf/src/LSTM.py b/research/nlp/lstm_crf/src/LSTM.py
new file mode 100644
index 0000000000000000000000000000000000000000..a24381706792b47806a218407c23421398f9019b
--- /dev/null
+++ b/research/nlp/lstm_crf/src/LSTM.py
@@ -0,0 +1,336 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""LSTM."""
+import math
+from mindspore import Tensor, nn, context, Parameter, ParameterTuple
+from mindspore.common.initializer import initializer
+from mindspore.ops import operations as P
+import mindspore.ops.functional as F
+import mindspore.common.dtype as mstype
+import numpy as np
+
+STACK_LSTM_DEVICE = ["CPU"]
+
+
+# Initialize short-term memory (h) and long-term memory (c) to 0
+def lstm_default_state(batch_size, hidden_size, num_layers, bidirectional):
+    """init default input."""
+    num_directions = 2 if bidirectional else 1
+    h = Tensor(np.zeros((num_layers * num_directions, batch_size, hidden_size)).astype(np.float32))
+    c = Tensor(np.zeros((num_layers * num_directions, batch_size, hidden_size)).astype(np.float32))
+    return h, c
+
+def stack_lstm_default_state(batch_size, hidden_size, num_layers, bidirectional):
+    """init default input."""
+    num_directions = 2 if bidirectional else 1
+
+    # use two separate lists so the h and c states do not alias each other
+    h_list, c_list = [], []
+    for _ in range(num_layers):
+        h_list.append(Tensor(np.zeros((num_directions, batch_size, hidden_size)).astype(np.float32)))
+        c_list.append(Tensor(np.zeros((num_directions, batch_size, hidden_size)).astype(np.float32)))
+    h, c = tuple(h_list), tuple(c_list)
+    return h, c

+def stack_lstm_default_state_ascend(batch_size, hidden_size, num_layers, bidirectional):
+    """init default input."""
+
+    # use two separate lists so the h and c states do not alias each other
+    h_list, c_list = [], []
+    for _ in range(num_layers):
+        h_fw = Tensor(np.zeros((1, batch_size, hidden_size)).astype(np.float16))
+        c_fw = Tensor(np.zeros((1, batch_size, hidden_size)).astype(np.float16))
+        h_i = [h_fw]
+        c_i = [c_fw]
+
+        if bidirectional:
+            h_bw = Tensor(np.zeros((1, batch_size, hidden_size)).astype(np.float16))
+            c_bw = Tensor(np.zeros((1, batch_size, hidden_size)).astype(np.float16))
+            h_i.append(h_bw)
+            c_i.append(c_bw)
+
+        h_list.append(h_i)
+        c_list.append(c_i)
+
+    h, c = tuple(h_list), tuple(c_list)
+    return h, c
+
+
+class StackLSTM(nn.Cell):
+    """
+    Stack multi-layers LSTM together.
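+
+    Each layer is an ``nn.LSTMCell`` driven explicitly in ``construct``, with one
+    flattened weight Parameter per layer holding its gate weights (plus bias, if any).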
+ """ + + def __init__(self, + input_size, + hidden_size, + num_layers=1, + has_bias=True, + batch_first=False, + dropout=0.0, + bidirectional=False): + super(StackLSTM, self).__init__() + self.num_layers = num_layers + self.batch_first = batch_first + self.transpose = P.Transpose() + + # direction number + num_directions = 2 if bidirectional else 1 + + # input_size list + input_size_list = [input_size] + for i in range(num_layers - 1): + input_size_list.append(hidden_size * num_directions) + + # layers + layers = [] + for i in range(num_layers): + layers.append(nn.LSTMCell(input_size=input_size_list[i], + hidden_size=hidden_size, + has_bias=has_bias, + batch_first=batch_first, + bidirectional=bidirectional, + dropout=dropout)) + + # weights + weights = [] + for i in range(num_layers): + # weight size + weight_size = (input_size_list[i] + hidden_size) * num_directions * hidden_size * 4 + if has_bias: + bias_size = num_directions * hidden_size * 4 + weight_size = weight_size + bias_size + + # numpy weight + stdv = 1 / math.sqrt(hidden_size) + w_np = np.random.uniform(-stdv, stdv, (weight_size, 1, 1)).astype(np.float32) + + # lstm weight + weights.append(Parameter(initializer(Tensor(w_np), w_np.shape), name="weight" + str(i))) + + # + self.lstms = layers + self.weight = ParameterTuple(tuple(weights)) + + def construct(self, x, hx): + """construct""" + if self.batch_first: + x = self.transpose(x, (1, 0, 2)) + # stack lstm + h, c = hx + hn = cn = None + for i in range(self.num_layers): + x, hn, cn, _, _ = self.lstms[i](x, h[i], c[i], self.weight[i]) + if self.batch_first: + x = self.transpose(x, (1, 0, 2)) + return x, (hn, cn) + + +class LSTM_Ascend(nn.Cell): + """ LSTM in Ascend. """ + + def __init__(self, bidirectional=False): + super(LSTM_Ascend, self).__init__() + self.bidirectional = bidirectional + self.dynamic_rnn = P.DynamicRNN(forget_bias=0.0) + self.reverseV2 = P.ReverseV2(axis=[0]) + self.concat = P.Concat(2) + + def construct(self, x, h, c, w_f, b_f, w_b=None, b_b=None): + """construct""" + x = F.cast(x, mstype.float16) + if self.bidirectional: + y1, h1, c1, _, _, _, _, _ = self.dynamic_rnn(x, w_f, b_f, None, h[0], c[0]) + r_x = self.reverseV2(x) + y2, h2, c2, _, _, _, _, _ = self.dynamic_rnn(r_x, w_b, b_b, None, h[1], c[1]) + y2 = self.reverseV2(y2) + + output = self.concat((y1, y2)) + hn = self.concat((h1, h2)) + cn = self.concat((c1, c2)) + return output, (hn, cn) + + y1, h1, c1, _, _, _, _, _ = self.dynamic_rnn(x, w_f, b_f, None, h[0], c[0]) + return y1, (h1, c1) + + +class StackLSTMAscend(nn.Cell): + """ Stack multi-layers LSTM together. 
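+
+    Runs on Ascend through ``P.DynamicRNN`` in float16. For the bidirectional case,
+    the backward direction reverses the input sequence, runs a second DynamicRNN,
+    and reverses its output back before concatenation.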
""" + + def __init__(self, + input_size, + hidden_size, + num_layers=1, + has_bias=True, + batch_first=False, + dropout=0.0, + bidirectional=False): + super(StackLSTMAscend, self).__init__() + self.num_layers = num_layers + self.batch_first = batch_first + self.bidirectional = bidirectional + self.transpose = P.Transpose() + + # input_size list + input_size_list = [input_size] + for i in range(num_layers - 1): + input_size_list.append(hidden_size * 2) + + # weights, bias and layers init + weights_fw = [] + weights_bw = [] + bias_fw = [] + bias_bw = [] + + stdv = 1 / math.sqrt(hidden_size) + for i in range(num_layers): + # forward weight init + w_np_fw = np.random.uniform(-stdv, + stdv, + (input_size_list[i] + hidden_size, hidden_size * 4)).astype(np.float32) + w_fw = Parameter(initializer(Tensor(w_np_fw), w_np_fw.shape), name="w_fw_layer" + str(i)) + weights_fw.append(w_fw) + # forward bias init + if has_bias: + b_fw = np.random.uniform(-stdv, stdv, (hidden_size * 4)).astype(np.float32) + b_fw = Parameter(initializer(Tensor(b_fw), b_fw.shape), name="b_fw_layer" + str(i)) + else: + b_fw = np.zeros((hidden_size * 4)).astype(np.float32) + b_fw = Parameter(initializer(Tensor(b_fw), b_fw.shape), name="b_fw_layer" + str(i)) + bias_fw.append(b_fw) + + if bidirectional: + # backward weight init + w_np_bw = np.random.uniform(-stdv, + stdv, + (input_size_list[i] + hidden_size, hidden_size * 4)).astype(np.float32) + w_bw = Parameter(initializer(Tensor(w_np_bw), w_np_bw.shape), name="w_bw_layer" + str(i)) + weights_bw.append(w_bw) + + # backward bias init + if has_bias: + b_bw = np.random.uniform(-stdv, stdv, (hidden_size * 4)).astype(np.float32) + b_bw = Parameter(initializer(Tensor(b_bw), b_bw.shape), name="b_bw_layer" + str(i)) + else: + b_bw = np.zeros((hidden_size * 4)).astype(np.float32) + b_bw = Parameter(initializer(Tensor(b_bw), b_bw.shape), name="b_bw_layer" + str(i)) + bias_bw.append(b_bw) + + # layer init + self.lstm = LSTM_Ascend(bidirectional=bidirectional).to_float(mstype.float16) + + self.weight_fw = ParameterTuple(tuple(weights_fw)) + self.weight_bw = ParameterTuple(tuple(weights_bw)) + self.bias_fw = ParameterTuple(tuple(bias_fw)) + self.bias_bw = ParameterTuple(tuple(bias_bw)) + + def construct(self, x, hx): + """construct""" + x = F.cast(x, mstype.float16) + if self.batch_first: + x = self.transpose(x, (1, 0, 2)) + # stack lstm + h, c = hx + hn = cn = None + for i in range(self.num_layers): + if self.bidirectional: + x, (hn, cn) = self.lstm(x, + h[i], + c[i], + self.weight_fw[i], + self.bias_fw[i], + self.weight_bw[i], + self.bias_bw[i]) + else: + x, (hn, cn) = self.lstm(x, h[i], c[i], self.weight_fw[i], self.bias_fw[i]) + if self.batch_first: + x = self.transpose(x, (1, 0, 2)) + x = F.cast(x, mstype.float32) + hn = F.cast(x, mstype.float32) + cn = F.cast(x, mstype.float32) + return x, (hn, cn) + + +class Lstm(nn.Cell): + """ + Stack multi-layers LSTM together. 
+    """
+
+    def __init__(self,
+                 vocab_size,
+                 embedding_size,
+                 hidden_size,
+                 out_size,
+                 weight,
+                 num_layers=1,
+                 batch_size=1,
+                 dropout=0.0,
+                 bidirectional=False):
+        super(Lstm, self).__init__()
+
+        self.embedding = nn.Embedding(vocab_size,
+                                      embedding_size,
+                                      use_one_hot=False,
+                                      embedding_table=weight)
+        self.embedding.embedding_table.requires_grad = False
+
+        self.perm = (1, 0, 2)
+        self.trans = P.Transpose()
+        self.concat = P.Concat(1)
+        self.squeeze = P.Squeeze(axis=0)
+
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+        self.batch_size = batch_size
+        self.bidirectional = bidirectional
+
+        if context.get_context("device_target") in STACK_LSTM_DEVICE:
+            # stack lstm by user
+            self.lstm = StackLSTM(input_size=embedding_size,
+                                  hidden_size=hidden_size,
+                                  num_layers=num_layers,
+                                  has_bias=True,
+                                  bidirectional=bidirectional,
+                                  dropout=dropout)
+            self.h, self.c = stack_lstm_default_state(batch_size,
+                                                      hidden_size,
+                                                      num_layers,
+                                                      bidirectional)
+        else:
+            self.lstm = StackLSTMAscend(input_size=embedding_size,
+                                        hidden_size=hidden_size,
+                                        num_layers=num_layers,
+                                        has_bias=True,
+                                        dropout=dropout,
+                                        bidirectional=bidirectional)
+            self.h, self.c = stack_lstm_default_state_ascend(self.batch_size,
+                                                             self.hidden_size,
+                                                             self.num_layers,
+                                                             self.bidirectional)
+
+        if bidirectional:
+            self.fc = nn.Dense(hidden_size * 2, out_size)
+        else:
+            self.fc = nn.Dense(hidden_size, out_size)
+
+    def construct(self, sequence_tensor):
+        embeddings = self.embedding(sequence_tensor)
+        embeddings = self.trans(embeddings, self.perm)
+        lstm_out, _ = self.lstm(embeddings, (self.h, self.c))
+        if self.bidirectional:
+            lstm_out = lstm_out.view(embeddings.shape[0], embeddings.shape[1], self.hidden_size*2)
+        else:
+            lstm_out = lstm_out.view(embeddings.shape[0], embeddings.shape[1], self.hidden_size)
+        lstm_feats = self.fc(lstm_out)
+        return lstm_feats
diff --git a/research/nlp/lstm_crf/src/LSTM_CRF.py b/research/nlp/lstm_crf/src/LSTM_CRF.py
new file mode 100644
index 0000000000000000000000000000000000000000..db19c6057c8a9897cc393d6f2da24d7956f1338e
--- /dev/null
+++ b/research/nlp/lstm_crf/src/LSTM_CRF.py
@@ -0,0 +1,207 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""LSTM_CRF."""
+import numpy as np
+
+from mindspore import Tensor, nn, Parameter
+from mindspore.ops import operations as P
+from mindspore.common import dtype as mstype
+from .LSTM import Lstm
+
+
+STACK_LSTM_CRF_DEVICE = ["CPU"]
+
+
+class Lstm_CRF(nn.Cell):
+    """Lstm_CRF network structure"""
+    def __init__(self,
+                 vocab_size,
+                 tag_to_index,
+                 embedding_size,
+                 hidden_size,
+                 num_layers,
+                 weight=None,
+                 bidirectional=False,
+                 batch_size=1,
+                 seq_length=1,
+                 dropout=0.0,
+                 is_training=True
+                 ):
+        super(Lstm_CRF, self).__init__()
+        self.vocab_size = vocab_size
+        self.tag_to_index = tag_to_index
+        self.embedding_size = embedding_size
+        self.num_hiddens = hidden_size
+        self.num_layers = num_layers
+        self.bidirectional = bidirectional
+        self.transpose = P.Transpose()
+
+        self.is_training = is_training
+        self.batch_size = batch_size
+        self.seq_length = seq_length
+        self.START_TAG = "<START>"
+        self.STOP_TAG = "<STOP>"
+        self.tag_to_index[self.START_TAG] = len(self.tag_to_index)
+        self.tag_to_index[self.STOP_TAG] = len(self.tag_to_index)
+        self.out_size = len(self.tag_to_index)
+        self.START_VALUE = Tensor(self.tag_to_index[self.START_TAG], dtype=mstype.int32)
+        self.STOP_VALUE = Tensor(self.tag_to_index[self.STOP_TAG], dtype=mstype.int32)
+
+        # Matrix of transition parameters.
+        transitions = np.random.normal(size=(self.out_size, self.out_size)).astype(np.float32)
+        transitions[self.tag_to_index[self.START_TAG], :] = -10000
+        transitions[:, self.tag_to_index[self.STOP_TAG]] = -10000
+        self.transitions = Parameter(Tensor(transitions))
+
+        self.lstm = Lstm(vocab_size,
+                         embedding_size,
+                         hidden_size,
+                         out_size=self.out_size,
+                         weight=weight,
+                         num_layers=1,  # a single LSTM layer is built regardless of num_layers
+                         batch_size=batch_size,
+                         dropout=dropout,
+                         bidirectional=bidirectional)
+
+        self.cat = P.Concat(axis=-1)
+        self.argmax = P.ArgMaxWithValue(axis=-1)
+        self.log = P.Log()
+        self.exp = P.Exp()
+        self.sum = P.ReduceSum()
+        self.tile = P.Tile()
+        self.reduce_sum = P.ReduceSum(keep_dims=True)
+        self.reshape = P.Reshape()
+        self.expand = P.ExpandDims()
+        self.mean = P.ReduceMean()
+        init_alphas = np.ones(shape=(self.batch_size, self.out_size)) * -10000.0
+        init_alphas[:, self.tag_to_index[self.START_TAG]] = 0.
+        self.init_alphas = Tensor(init_alphas, dtype=mstype.float32)
+        self.cast = P.Cast()
+        self.reduce_max = P.ReduceMax(keep_dims=True)
+        self.on_value = Tensor(1.0, dtype=mstype.float32)
+        self.off_value = Tensor(0.0, dtype=mstype.float32)
+        self.onehot = P.OneHot()
+
+    def _realpath_score(self, features, label):
+        '''
+        Compute the emission and transition score for the real path.
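+        For a tag path y over sentence x, the real-path score is
+        score(x, y) = sum_t emit(x_t, y_t) + sum_t trans(y_{t-1}, y_t),
+        computed here with one-hot masks over the emission and transition tensors.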
+ ''' + label = label * 1 + concat_A = self.tile(self.reshape(self.START_VALUE, (1,)), (self.batch_size,)) + concat_A = self.reshape(concat_A, (self.batch_size, 1)) + labels = self.cat((concat_A, label)) + onehot_label = self.onehot(label, self.out_size, self.on_value, self.off_value) + emits = features * onehot_label + labels = self.onehot(labels, self.out_size, self.on_value, self.off_value) + label1 = labels[:, 1:, :] + label2 = labels[:, :self.seq_length, :] + label1 = self.expand(label1, 3) + label2 = self.expand(label2, 2) + label_trans = label1 * label2 + transitions = self.expand(self.expand(self.transitions, 0), 0) + trans = transitions * label_trans + score = self.sum(emits, (1, 2)) + self.sum(trans, (1, 2, 3)) + stop_value_index = labels[:, (self.seq_length-1):self.seq_length, :] + stop_value = self.transitions[(self.out_size-1):self.out_size, :] + stop_score = stop_value * self.reshape(stop_value_index, (self.batch_size, self.out_size)) + score = score + self.sum(stop_score, 1) + score = self.reshape(score, (self.batch_size, -1)) + return score + + def _normalization_factor(self, features): + ''' + Compute the total score for all the paths. + ''' + forward_var = self.init_alphas + forward_var = self.expand(forward_var, 1) + for idx in range(self.seq_length): + feat = features[:, idx:(idx+1), :] + emit_score = self.reshape(feat, (self.batch_size, self.out_size, 1)) + next_tag_var = emit_score + self.transitions + forward_var + forward_var = self.log_sum_exp(next_tag_var) + forward_var = self.reshape(forward_var, (self.batch_size, 1, self.out_size)) + terminal_var = forward_var + self.reshape(self.transitions[(self.out_size-1):self.out_size, :], (1, -1)) + alpha = self.log_sum_exp(terminal_var) + alpha = self.reshape(alpha, (self.batch_size, -1)) + return alpha + + def _decoder(self, features): + ''' + Viterbi decode for evaluation. + ''' + backpointers = () + forward_var = self.init_alphas + for idx in range(self.seq_length): + feat = features[:, idx:(idx+1), :] + feat = self.reshape(feat, (self.batch_size, self.out_size)) + bptrs_t = () + + next_tag_var = self.expand(forward_var, 1) + self.transitions + best_tag_id, best_tag_value = self.argmax(next_tag_var) + bptrs_t += (best_tag_id,) + forward_var = best_tag_value + feat + + backpointers += (bptrs_t,) + terminal_var = forward_var + self.reshape(self.transitions[(self.out_size-1):self.out_size, :], (1, -1)) + best_tag_id, _ = self.argmax(terminal_var) + return backpointers, best_tag_id + + def log_sum_exp(self, logits): + ''' + Compute the log_sum_exp score for Normalization factor. 
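+        Uses the max-shift identity for numerical stability:
+        log(sum(exp(x))) = max(x) + log(sum(exp(x - max(x)))).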
+        '''
+        max_score = self.reduce_max(logits, -1)
+        score = self.log(self.reduce_sum(self.exp(logits - max_score), -1))
+        score = max_score + score
+        return score
+
+    def construct(self, inputs, label):
+        """
+        Get the emission scores from the BiLSTM, then return the CRF loss in
+        training or the Viterbi decode result (backpointers, best last tag) in
+        evaluation.
+        """
+        emission = self.lstm(inputs)
+        emission = self.transpose(emission, (1, 0, 2))
+        if self.is_training:
+            forward_score = self._normalization_factor(emission)
+            gold_score = self._realpath_score(emission, label)
+            return_value = self.mean(forward_score - gold_score)
+        else:
+            path_list, tag = self._decoder(emission)
+            return_value = path_list, tag
+        return return_value
+
+
+# CRF postprocess
+def postprocess(backpointers, best_tag_id):
+    '''
+    Recover the best tag path for each batch element from the Viterbi backpointers.
+    '''
+    best_tag_id = best_tag_id.asnumpy()
+    batch_size = len(best_tag_id)
+    best_path = []
+    for i in range(batch_size):
+        best_path.append([])
+        best_local_id = best_tag_id[i]
+        best_path[-1].append(best_local_id)
+        for bptrs_t in reversed(backpointers):
+            bptrs_t = bptrs_t[0].asnumpy()
+            local_idx = bptrs_t[i]
+            best_local_id = local_idx[best_local_id]
+            best_path[-1].append(best_local_id)
+        # Pop off the start tag (we don't want to return that to the caller)
+        best_path[-1].pop()
+        best_path[-1].reverse()
+    return best_path
diff --git a/research/nlp/lstm_crf/src/__init__.py b/research/nlp/lstm_crf/src/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f46a7b5eb59e77b55b25865c1701f25593c0087c
--- /dev/null
+++ b/research/nlp/lstm_crf/src/__init__.py
@@ -0,0 +1,23 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+__init__.py
+"""
+
+from . import imdb
+from . import dataset
+from . import LSTM
+from . import LSTM_CRF
+from . import util
diff --git a/research/nlp/lstm_crf/src/dataset.py b/research/nlp/lstm_crf/src/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1b9c2586d61e01ccfb82fe253768f944eb8228b
--- /dev/null
+++ b/research/nlp/lstm_crf/src/dataset.py
@@ -0,0 +1,34 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+Data operations used in train.py and eval.py
+"""
+import numpy as np
+import mindspore.dataset as ds
+
+
+def get_data_set(word_index, tag_index, batch_size):
+    """get the data for train and eval"""
+    def generator_func():
+        for i in range(len(word_index)):
+            yield (np.array([j for j in word_index[i]]).astype(np.int32),
+                   np.array([value for value in tag_index[i]]).astype(np.int32))
+
+    data_set = ds.GeneratorDataset(generator_func, ["feature", "label"])
+    data_set = data_set.shuffle(buffer_size=data_set.get_dataset_size())
+    data_set = data_set.batch(batch_size=batch_size, drop_remainder=True)
+    data_set = data_set.repeat(count=1)
+
+    return data_set
diff --git a/research/nlp/lstm_crf/src/imdb.py b/research/nlp/lstm_crf/src/imdb.py
new file mode 100644
index 0000000000000000000000000000000000000000..99880a67976c1966f4f2b7fb4331eb9d2d795bf6
--- /dev/null
+++ b/research/nlp/lstm_crf/src/imdb.py
@@ -0,0 +1,223 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+CoNLL2000 dataset parser.
+"""
+import os
+import numpy as np
+from .model_utils.config import config
+
+
+UNK = "<UNK>"
+PAD = "<PAD>"
+NUM = "<NUM>"
+
+
+def modelarts_pre_process():
+    config.ckpt_path = os.path.join(config.output_path, config.ckpt_path)
+
+
+def is_number(num):
+    num = num.replace(".", "")
+    num = num.replace(",", "")
+    num = num.replace(":", "")
+    return num.isdigit()
+
+
+class MyIOError(Exception):
+    def __init__(self, filename):
+        # custom error message
+        message = """
+        ERROR: Unable to locate file {}.
+        FIX: Have you tried running run_bulid_data.sh first?
+        This will build the vocab file from your train and test sets and
+        your word vectors.
+        """.format(filename)
+        super(MyIOError, self).__init__(message)
+
+
+class ImdbParser():
+    """
+    parse data to features and labels.
+    sentence->tokenized->encoded->padding->features
+    """
+
+    def __init__(self, imdb_path, glove_path, words_path, embed_size=300):
+        self.__imdb_path = imdb_path
+        self.__glove_dim = embed_size
+        self.__glove_file = os.path.join(glove_path, 'glove.6B.'
+ str(self.__glove_dim) + 'd.txt') + self.__glove_vectors_path = os.path.join(words_path, 'glove.6B.'+ str(self.__glove_dim) + 'trimmed.npz') + self.__words_path = os.path.join(words_path, 'words.txt') + self.__tags_path = os.path.join(words_path, 'tags.txt') + self.__max_length_path = os.path.join(words_path, 'max_length.txt') + # properties + self.words = [] + self.tags = [] + self.glove_vocab = set() + self.vocab_words = set() + self.vocab_tags = set() + self.vocab_dict = dict() + self.words_to_index_map = {} + self.tags_to_index_map = {} + self.words_index = [] + self.tags_index = [] + self.sequence_pad = [] + self.sequence_tag_pad = [] + + def __get_words_tags(self, seg='train', build_data=True): + '''load data from txt''' + + if build_data: + segs = ['train', 'test'] + else: + segs = seg + print('segs:', segs) + for i in segs: + sentence_dir = os.path.join(self.__imdb_path, i) + '.txt' + print('load....', sentence_dir, 'data') + with open(sentence_dir, mode='r', encoding='utf-8') as f: + word_list = [] + tag_list = [] + for line in f: + if line != '\n': + word, _, tag = line.strip('\n').split() + word = word.lower() + if is_number(word): + word = NUM + word_list.append(word) + tag_list.append(tag) + else: + self.words.append(word_list) + self.tags.append(tag_list) + word_list = [] + tag_list = [] + self.max_length = max([len(self.words[i]) for i in range(len(self.words))]) + + def __write_max_sequence_length(self): + with open(self.__max_length_path, 'w') as f: + f.write(str(self.max_length)) + + def __get_vocabs(self): + for i in range(len(self.words)): + self.vocab_words.update(self.words[i]) + self.vocab_tags.update(self.tags[i]) + + def __get_glove_vocab(self): + with open(self.__glove_file, mode='r', encoding='utf-8') as f: + for line in f: + word = line.strip().split(' ')[0] + self.glove_vocab.add(word) + + def __write_vocab(self, path, data): + print("Writing vocab......") + with open(path, "w") as f: + for i, word in enumerate(data): + if i != len(data)-1: + f.write('{}\n'.format(word)) + else: + f.write(word) + print("- done. 
{} tokens".format(len(data))) + + def __get_glove_vectors(self): + """embedding""" + embeddings = np.zeros([len(self.vocab), self.__glove_dim]) + with open(self.__glove_file, mode='r', encoding='utf-8') as f: + for line in f: + line = line.strip().split(' ') + word = line[0] + embedding = [float(x) for x in line[1:]] + if word in self.words_to_index_map: + word_index = self.words_to_index_map[word] + embeddings[word_index] = np.asarray(embedding) + np.savez_compressed(self.__glove_vectors_path, embeddings=embeddings) + + def __get_pretrain_glove_vectors(self): + try: + with np.load(self.__glove_vectors_path) as data: + return data["embeddings"] + except IOError: + raise MyIOError(self.__glove_vectors_path) + + def __load_vocab_map(self, path, datas_var): + vocab_map = datas_var + try: + with open(path, encoding='utf-8') as f: + for index, word in enumerate(f): + word = word.strip() + vocab_map[word] = index + + except IOError: + raise MyIOError(path) + + def __get_sequence_length(self): + with open(self.__max_length_path, encoding='utf-8') as f: + for word in f: + self.sequence_max_length = int(word) + + def __get_vocab_index(self, data_list, path_map, path_index): + """glove vector""" + for words in data_list: + vocab_index = [] + for word in words: + if word in path_map: + vocab_index.append(path_map[word]) + else: + vocab_index.append(path_map[UNK]) + path_index.append(vocab_index) + + def __get_sequence_same_length(self): + for vocab_index in self.words_index: + vocab_index_ = vocab_index[:self.sequence_max_length] + \ + [0]*max((self.sequence_max_length-len(vocab_index)), 0) + self.sequence_pad += [vocab_index_] + + def __get_sequence_tag_same_length(self): + for vocab_index in self.tags_index: + vocab_index_ = vocab_index[:self.sequence_max_length] + \ + [0]*max((self.sequence_max_length-len(vocab_index)), 0) + self.sequence_tag_pad += [vocab_index_] + + def build_datas(self, seg, build_data): + """build the vocab and embedding""" + self.__get_words_tags(seg, build_data) + self.__get_vocabs() + self.__get_glove_vocab() + self.vocab = self.vocab_words & self.glove_vocab + self.vocab.add(UNK) + self.vocab.add(NUM) + self.vocab.add('<START>') + self.vocab.add('<STOP>') + self.vocab = list(self.vocab) + self.vocab.insert(0, PAD) + self.__write_vocab(self.__words_path, self.vocab) + self.__write_vocab(self.__tags_path, self.vocab_tags) + self.__write_max_sequence_length() + self.__load_vocab_map(self.__words_path, self.words_to_index_map) + self.__load_vocab_map(self.__tags_path, self.tags_to_index_map) + self.__get_glove_vectors() + + def get_datas_embeddings(self, seg, build_data): + """read the CoNLL2000 data embedding""" + self.__get_words_tags(seg, build_data) + embeddings = self.__get_pretrain_glove_vectors() + self.__load_vocab_map(self.__words_path, self.words_to_index_map) + self.__load_vocab_map(self.__tags_path, self.tags_to_index_map) + self.__get_vocab_index(self.words, self.words_to_index_map, self.words_index) + self.__get_vocab_index(self.tags, self.tags_to_index_map, self.tags_index) + self.__get_sequence_length() + self.__get_sequence_same_length() + self.__get_sequence_tag_same_length() + return embeddings, self.sequence_max_length, self.words, self.tags, self.sequence_pad, \ + self.sequence_tag_pad, self.tags_to_index_map diff --git a/research/nlp/lstm_crf/src/model_utils/__init__.py b/research/nlp/lstm_crf/src/model_utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a574651abfc26bda5fa1a369d53aad71a8b3f93e --- /dev/null +++ 
b/research/nlp/lstm_crf/src/model_utils/__init__.py @@ -0,0 +1,21 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +""" +__init__.py +""" +from . import config +from . import device_adapter +from . import local_adapter +from . import moxing_adapter diff --git a/research/nlp/lstm_crf/src/model_utils/config.py b/research/nlp/lstm_crf/src/model_utils/config.py new file mode 100644 index 0000000000000000000000000000000000000000..b48c894e720cb28f272dda71aa0b686f3b4762e5 --- /dev/null +++ b/research/nlp/lstm_crf/src/model_utils/config.py @@ -0,0 +1,128 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Parse arguments""" + +import os +import ast +import argparse +from pprint import pformat +import yaml + +class Config: + """ + Configuration namespace. Convert dictionary to members. + """ + def __init__(self, cfg_dict): + for k, v in cfg_dict.items(): + if isinstance(v, (list, tuple)): + setattr(self, k, [Config(x) if isinstance(x, dict) else x for x in v]) + else: + setattr(self, k, Config(v) if isinstance(v, dict) else v) + + def __str__(self): + return pformat(self.__dict__) + + def __repr__(self): + return self.__str__() + + +def parse_cli_to_yaml(parser, cfg, helper=None, choices=None, cfg_path="default_config.yaml"): + """ + Parse command line arguments to the configuration according to the default yaml. + + Args: + parser: Parent parser. + cfg: Base configuration. + helper: Helper description. + cfg_path: Path to the default yaml config. 
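+        choices: Allowed values for each argument.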
+ """ + parser = argparse.ArgumentParser(description="[REPLACE THIS at config.py]", + parents=[parser]) + helper = {} if helper is None else helper + choices = {} if choices is None else choices + for item in cfg: + if not isinstance(cfg[item], list) and not isinstance(cfg[item], dict): + help_description = helper[item] if item in helper else "Please reference to {}".format(cfg_path) + choice = choices[item] if item in choices else None + if isinstance(cfg[item], bool): + parser.add_argument("--" + item, type=ast.literal_eval, default=cfg[item], choices=choice, + help=help_description) + else: + parser.add_argument("--" + item, type=type(cfg[item]), default=cfg[item], choices=choice, + help=help_description) + args = parser.parse_args() + return args + + +def parse_yaml(yaml_path): + """ + Parse the yaml config file. + + Args: + yaml_path: Path to the yaml config. + """ + with open(yaml_path, 'r') as fin: + try: + cfgs = yaml.load_all(fin.read(), Loader=yaml.FullLoader) + cfgs = [x for x in cfgs] + if len(cfgs) == 1: + cfg_helper = {} + cfg = cfgs[0] + cfg_choices = {} + elif len(cfgs) == 2: + cfg, cfg_helper = cfgs + cfg_choices = {} + elif len(cfgs) == 3: + cfg, cfg_helper, cfg_choices = cfgs + else: + raise ValueError("At most 3 docs (config, description for help, choices) are supported in config yaml") + # print(cfg_helper) + except: + raise ValueError("Failed to parse yaml") + return cfg, cfg_helper, cfg_choices + + +def merge(args, cfg): + """ + Merge the base config from yaml file and command line arguments. + + Args: + args: Command line arguments. + cfg: Base configuration. + """ + args_var = vars(args) + for item in args_var: + cfg[item] = args_var[item] + return cfg + + +def get_config(): + """ + Get Config according to the yaml file and cli arguments. + """ + parser = argparse.ArgumentParser(description="default name", add_help=False) + current_dir = os.path.dirname(os.path.abspath(__file__)) + parser.add_argument("--config_path", type=str, default=os.path.join(current_dir, "../../default_config.yaml"), + help="Config file path") + path_args, _ = parser.parse_known_args() + default, helper, choices = parse_yaml(path_args.config_path) + args = parse_cli_to_yaml(parser=parser, cfg=default, helper=helper, choices=choices, cfg_path=path_args.config_path) + final_config = merge(args, default) + # pprint(final_config) + print("Please check the above information for the configurations", flush=True) + return Config(final_config) + +config = get_config() diff --git a/research/nlp/lstm_crf/src/model_utils/device_adapter.py b/research/nlp/lstm_crf/src/model_utils/device_adapter.py new file mode 100644 index 0000000000000000000000000000000000000000..7c5d7f837ddaa8f53cf8dc5573cac0e36881e7b1 --- /dev/null +++ b/research/nlp/lstm_crf/src/model_utils/device_adapter.py @@ -0,0 +1,27 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/research/nlp/lstm_crf/src/model_utils/device_adapter.py b/research/nlp/lstm_crf/src/model_utils/device_adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c5d7f837ddaa8f53cf8dc5573cac0e36881e7b1
--- /dev/null
+++ b/research/nlp/lstm_crf/src/model_utils/device_adapter.py
@@ -0,0 +1,27 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""Device adapter for ModelArts"""
+
+from .config import config
+
+if config.enable_modelarts:
+    from .moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
+else:
+    from .local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
+
+__all__ = [
+    "get_device_id", "get_device_num", "get_rank_id", "get_job_id"
+]
diff --git a/research/nlp/lstm_crf/src/model_utils/local_adapter.py b/research/nlp/lstm_crf/src/model_utils/local_adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..769fa6dc78e59eb66dbc8e6773accdc1d08b649e
--- /dev/null
+++ b/research/nlp/lstm_crf/src/model_utils/local_adapter.py
@@ -0,0 +1,36 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""Local adapter"""
+
+import os
+
+def get_device_id():
+    device_id = os.getenv('DEVICE_ID', '0')
+    return int(device_id)
+
+
+def get_device_num():
+    device_num = os.getenv('RANK_SIZE', '1')
+    return int(device_num)
+
+
+def get_rank_id():
+    global_rank_id = os.getenv('RANK_ID', '0')
+    return int(global_rank_id)
+
+
+def get_job_id():
+    return "Local Job"
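+
+# Illustrative behaviour of the local adapter (assuming none of the DEVICE_ID,
+# RANK_SIZE or RANK_ID environment variables are set):
+#
+#     get_device_id()   # -> 0
+#     get_device_num()  # -> 1
+#     get_rank_id()     # -> 0
+#     get_job_id()      # -> "Local Job"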
diff --git a/research/nlp/lstm_crf/src/model_utils/moxing_adapter.py b/research/nlp/lstm_crf/src/model_utils/moxing_adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..830d19a6fc99de8d602703971d5ac5b24e060d11
--- /dev/null
+++ b/research/nlp/lstm_crf/src/model_utils/moxing_adapter.py
@@ -0,0 +1,122 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""Moxing adapter for ModelArts"""
+
+import os
+import functools
+from mindspore import context
+from mindspore.profiler import Profiler
+from .config import config
+
+_global_sync_count = 0
+
+def get_device_id():
+    device_id = os.getenv('DEVICE_ID', '0')
+    return int(device_id)
+
+
+def get_device_num():
+    device_num = os.getenv('RANK_SIZE', '1')
+    return int(device_num)
+
+
+def get_rank_id():
+    global_rank_id = os.getenv('RANK_ID', '0')
+    return int(global_rank_id)
+
+
+def get_job_id():
+    job_id = os.getenv('JOB_ID', '')
+    job_id = job_id if job_id else "default"
+    return job_id
+
+def sync_data(from_path, to_path):
+    """
+    Download data from a remote OBS url to a local directory when from_path is remote and to_path is local;
+    upload from a local directory to remote OBS when the directions are reversed.
+    """
+    import moxing as mox
+    import time
+    global _global_sync_count
+    sync_lock = "/tmp/copy_sync.lock" + str(_global_sync_count)
+    _global_sync_count += 1
+
+    # Each server contains 8 devices at most.
+    if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
+        print("from path: ", from_path)
+        print("to path: ", to_path)
+        mox.file.copy_parallel(from_path, to_path)
+        print("===finish data synchronization===")
+        try:
+            os.mknod(sync_lock)
+        except IOError:
+            pass
+        print("===save flag===")
+
+    while True:
+        if os.path.exists(sync_lock):
+            break
+        time.sleep(1)
+
+    print("Finish sync data from {} to {}.".format(from_path, to_path))
+
+
+def moxing_wrapper(pre_process=None, post_process=None):
+    """
+    Moxing wrapper to download dataset and upload outputs.
+    """
+    def wrapper(run_func):
+        @functools.wraps(run_func)
+        def wrapped_func(*args, **kwargs):
+            # Download data from data_url
+            if config.enable_modelarts:
+                if config.data_url:
+                    sync_data(config.data_url, config.data_path)
+                    print("Dataset downloaded: ", os.listdir(config.data_path))
+                if config.checkpoint_url:
+                    sync_data(config.checkpoint_url, config.load_path)
+                    print("Preload downloaded: ", os.listdir(config.load_path))
+                if config.train_url:
+                    sync_data(config.train_url, config.output_path)
+                    print("Workspace downloaded: ", os.listdir(config.output_path))
+
+                context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
+                config.device_num = get_device_num()
+                config.device_id = get_device_id()
+                if not os.path.exists(config.output_path):
+                    os.makedirs(config.output_path)
+
+                if pre_process:
+                    pre_process()
+
+            if config.enable_profiling:
+                profiler = Profiler()
+
+            run_func(*args, **kwargs)
+
+            if config.enable_profiling:
+                profiler.analyse()
+
+            # Upload data to train_url
+            if config.enable_modelarts:
+                if post_process:
+                    post_process()
+
+                if config.train_url:
+                    print("Start to copy output directory")
+                    sync_data(config.output_path, config.train_url)
+        return wrapped_func
+    return wrapper
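+
+# Sketch of how moxing_wrapper is typically applied in ModelZoo scripts (the
+# decorated run_train function is hypothetical; train.py defines the matching
+# modelarts_pre_process hook):
+#
+#     @moxing_wrapper(pre_process=modelarts_pre_process)
+#     def run_train():
+#         ...  # training entry point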
diff --git a/research/nlp/lstm_crf/src/util.py b/research/nlp/lstm_crf/src/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6b7759373f7ee14b9425cc4b636a56985943562
--- /dev/null
+++ b/research/nlp/lstm_crf/src/util.py
@@ -0,0 +1,372 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+utils for lstm-crf.
+"""
+import math
+import numpy as np
+
+from mindspore import Tensor
+import mindspore.nn as nn
+import mindspore.ops.functional as F
+import mindspore.common.dtype as mstype
+from mindspore.ops import operations as P
+from mindspore.ops import composite as C
+from mindspore import context
+from mindspore.context import ParallelMode
+from mindspore.common.parameter import Parameter
+from mindspore.nn.metrics import ConfusionMatrixMetric
+from mindspore.train.callback import Callback
+from mindspore.nn.learning_rate_schedule import LearningRateSchedule, PolynomialDecayLR, WarmUpLR
+from .LSTM_CRF import postprocess
+
+
+grad_scale = C.MultitypeFuncGraph("grad_scale")
+reciprocal = P.Reciprocal()
+GRADIENT_CLIP_TYPE = 1
+GRADIENT_CLIP_VALUE = 1.0
+NONE = "O"
+
+
+@grad_scale.register("Tensor", "Tensor")
+def tensor_grad_scale(scale, grad):
+    return grad * reciprocal(scale)
+
+
+clip_grad = C.MultitypeFuncGraph("clip_grad")
+ """ + if clip_type not in (0, 1): + return grad + dt = F.dtype(grad) + if clip_type == 0: + new_grad = C.clip_by_value(grad, F.cast(F.tuple_to_array((-clip_value,)), dt), + F.cast(F.tuple_to_array((clip_value,)), dt)) + else: + new_grad = nn.ClipByNorm()(grad, F.cast(F.tuple_to_array((clip_value,)), dt)) + return new_grad + + +_grad_overflow = C.MultitypeFuncGraph("_grad_overflow") +grad_overflow = P.FloatStatus() + + +@_grad_overflow.register("Tensor") +def _tensor_grad_overflow(grad): + return grad_overflow(grad) + + +class Lstm_CRF_Cell_CPU(nn.Cell): + """LSTM_CRF model""" + def __init__(self, network, optimizer, scale_update_cell=None): + super(Lstm_CRF_Cell_CPU, self).__init__(auto_prefix=False) + self.network = network + self.network.add_flags(defer_inline=True) + self.network.set_grad() + self.weights = optimizer.parameters + self.optimizer = optimizer + self.grad = C.GradOperation(get_by_list=True, + sens_param=True) + self.grad_reducer = None + self.reducer_flag = False + self.parallel_mode = context.get_auto_parallel_context("parallel_mode") + if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]: + self.reducer_flag = True + if self.reducer_flag: + mean = context.get_auto_parallel_context("gradients_mean") + degree = context.get_auto_parallel_context("device_num") + self.grad_reducer = nn.DistributedGradReducer(optimizer.parameters, mean, degree) + + self.base = Tensor(1, mstype.float32) + self.float_status = P.FloatStatus() + self.addn = P.AddN() + self.reshape = P.Reshape() + self.less_equal = P.LessEqual() + self.hyper_map = C.HyperMap() + self.cast = P.Cast() + self.loss_scaling_manager = scale_update_cell + self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32)) + + def construct(self, features, label): + """LSTM-CRF Finetune cpu""" + weights = self.weights + loss = self.network(features, label) + scaling_sens = self.loss_scale + grads = self.grad(self.network, weights)(features, + label, + self.cast(scaling_sens, mstype.float32)) + grads = self.hyper_map(F.partial(grad_scale, scaling_sens), grads) + grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads) + self.optimizer(grads) + return loss + + +class Lstm_CRF_Cell_Ascend(nn.Cell): + """add gradient to net""" + def __init__(self, network, optimizer, scale_update_cell=None): + super(Lstm_CRF_Cell_Ascend, self).__init__(auto_prefix=False) + self.network = network + self.network.set_grad() + self.weights = optimizer.parameters + self.optimizer = optimizer + self.grad = C.GradOperation(get_by_list=True, + sens_param=True) + self.allreduce = P.AllReduce() + self.grad_reducer = None + self.cast = P.Cast() + self.gpu_target = False + self.alloc_status = P.NPUAllocFloatStatus() + self.get_status = P.NPUGetFloatStatus() + self.clear_status = P.NPUClearFloatStatus() + self.reduce_sum = P.ReduceSum(keep_dims=False) + self.base = Tensor(1, mstype.float32) + self.less_equal = P.LessEqual() + self.hyper_map = C.HyperMap() + self.loss_scale = None + self.loss_scaling_manager = scale_update_cell + if scale_update_cell: + self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32)) + + def construct(self, + features, + labels, + sens=None): + """LSTM-CRF Finetune""" + + weights = self.weights + init = False + loss = self.network(features, + labels) + if sens is None: + scaling_sens = self.loss_scale + else: + scaling_sens = sens + + init = self.alloc_status() + init = F.depend(init, loss) + 
+
+
+class Lstm_CRF_Cell_Ascend(nn.Cell):
+    """Training wrapper cell that adds gradient computation, loss scaling and overflow detection on Ascend"""
+    def __init__(self, network, optimizer, scale_update_cell=None):
+        super(Lstm_CRF_Cell_Ascend, self).__init__(auto_prefix=False)
+        self.network = network
+        self.network.set_grad()
+        self.weights = optimizer.parameters
+        self.optimizer = optimizer
+        self.grad = C.GradOperation(get_by_list=True,
+                                    sens_param=True)
+        self.allreduce = P.AllReduce()
+        self.grad_reducer = None
+        self.cast = P.Cast()
+        self.gpu_target = False
+        self.alloc_status = P.NPUAllocFloatStatus()
+        self.get_status = P.NPUGetFloatStatus()
+        self.clear_status = P.NPUClearFloatStatus()
+        self.reduce_sum = P.ReduceSum(keep_dims=False)
+        self.base = Tensor(1, mstype.float32)
+        self.less_equal = P.LessEqual()
+        self.hyper_map = C.HyperMap()
+        self.loss_scale = None
+        self.loss_scaling_manager = scale_update_cell
+        if scale_update_cell:
+            self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32))
+
+    def construct(self,
+                  features,
+                  labels,
+                  sens=None):
+        """One training step of LSTM-CRF on Ascend, skipping the update on overflow"""
+        weights = self.weights
+        init = False
+        loss = self.network(features,
+                            labels)
+        if sens is None:
+            scaling_sens = self.loss_scale
+        else:
+            scaling_sens = sens
+
+        init = self.alloc_status()
+        init = F.depend(init, loss)
+        clear_status = self.clear_status(init)
+        scaling_sens = F.depend(scaling_sens, clear_status)
+        grads = self.grad(self.network, weights)(features,
+                                                 labels,
+                                                 self.cast(scaling_sens,
+                                                           mstype.float32))
+        grads = self.hyper_map(F.partial(grad_scale, scaling_sens), grads)
+        grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
+        init = F.depend(init, grads)
+        get_status = self.get_status(init)
+        init = F.depend(init, get_status)
+        flag_sum = self.reduce_sum(init, (0,))
+        cond = self.less_equal(self.base, flag_sum)
+        overflow = cond
+        if sens is None:
+            overflow = self.loss_scaling_manager(self.loss_scale, cond)
+        if not overflow:
+            self.optimizer(grads)
+        return (loss, cond)
+
+
+class LossCallBack(Callback):
+    """
+    Monitor the loss during training.
+    If the loss is NAN or INF, training should be terminated.
+    Note:
+        If dataset_size is not positive, the loss is printed with the global step count only.
+    Args:
+        dataset_size (int): Number of batches in one epoch. Default: -1.
+        target_device (str): Device target, 'CPU' or 'Ascend'. Default: 'CPU'.
+    """
+    def __init__(self, dataset_size=-1, target_device='CPU'):
+        super(LossCallBack, self).__init__()
+        self._dataset_size = dataset_size
+        self.target_device = target_device
+
+    def step_end(self, run_context):
+        """
+        Print loss after each step
+        """
+        cb_params = run_context.original_args()
+        if self._dataset_size > 0:
+            percent, epoch_num = math.modf(cb_params.cur_step_num / self._dataset_size)
+            if percent == 0:
+                percent = 1
+                epoch_num -= 1
+            if self.target_device == 'CPU':
+                print("epoch: {}, current epoch percent: {}, step: {}, loss is {}"
+                      .format(int(epoch_num), "%.3f" % percent, cb_params.cur_step_num, \
+                              str(cb_params.net_outputs)), flush=True)
+            else:
+                print("epoch: {}, current epoch percent: {}, step: {}, loss is {}"
+                      .format(int(epoch_num), "%.3f" % percent, cb_params.cur_step_num, \
+                              str(cb_params.net_outputs[0])), flush=True)
+        else:
+            if self.target_device == 'CPU':
+                print("epoch: {}, step: {}, loss is {}".format(cb_params.cur_epoch_num, cb_params.cur_step_num,
+                                                               str(cb_params.net_outputs)), flush=True)
+            else:
+                print("epoch: {}, step: {}, loss is {}".format(cb_params.cur_epoch_num, cb_params.cur_step_num,
+                                                               str(cb_params.net_outputs[0])), flush=True)
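+
+# Illustrative sketch (not part of the callback itself): LossCallBack is meant
+# to be passed to Model.train() together with the other callbacks, as train.py
+# does:
+#
+#     model.train(config.num_epochs, ds_train,
+#                 callbacks=[LossCallBack(ds_train.get_dataset_size(), config.device_target)])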
+
+
+class F1:
+    '''
+    Calculate the F1 score, in either binary or multi-label mode.
+    '''
+    def __init__(self, num_labels=2, mode="binary"):
+        self.TP = 0
+        self.FP = 0
+        self.FN = 0
+        self.num_labels = num_labels
+        self.mode = mode
+        if self.mode.lower() not in ("binary", "multilabel"):
+            raise ValueError("Assessment mode not supported, support: [Binary, MultiLabel]")
+        if self.mode.lower() != "binary":
+            self.metric = ConfusionMatrixMetric(skip_channel=False, metric_name=("f1 score"),
+                                                calculation_method=False, decrease="mean")
+
+    def update(self, logits, labels):
+        """update F1 score"""
+        labels = labels.asnumpy()
+        labels = np.reshape(labels, -1)
+
+        backpointers, best_tag_id = logits
+        best_path = postprocess(backpointers, best_tag_id)
+        logit_id = []
+        for ele in best_path:
+            logit_id.extend(ele)
+
+        if self.mode.lower() == "binary":
+            pos_eva = np.isin(logit_id, [i for i in range(1, self.num_labels)])
+            pos_label = np.isin(labels, [i for i in range(1, self.num_labels)])
+            self.TP += np.sum(pos_eva & pos_label)
+            self.FP += np.sum(pos_eva & (~pos_label))
+            self.FN += np.sum((~pos_eva) & pos_label)
+        else:
+            target = np.zeros((len(labels), self.num_labels), dtype=np.int32)
+            pred = np.zeros((len(logit_id), self.num_labels), dtype=np.int32)
+            for i, label in enumerate(labels):
+                target[i][label] = 1
+            for i, label in enumerate(logit_id):
+                pred[i][label] = 1
+            self.metric.update(pred, target)
+        return logit_id, labels
+
+    def eval(self):
+        if self.mode.lower() == "binary":
+            precision = self.TP / (self.TP + self.FP) if (self.TP + self.FP) > 0 else 0.
+            recall = self.TP / (self.TP + self.FN) if (self.TP + self.FN) > 0 else 0.
+            return 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.
+        return self.metric.eval()
+
+
+class LSTMCRFLearningRate(LearningRateSchedule):
+    """
+    Warmup-decay learning rate schedule for the LSTM-CRF network.
+    """
+    def __init__(self, learning_rate, end_learning_rate, warmup_steps, decay_steps, power):
+        super(LSTMCRFLearningRate, self).__init__()
+        self.warmup_flag = False
+        if warmup_steps > 0:
+            self.warmup_flag = True
+            self.warmup_lr = WarmUpLR(learning_rate, warmup_steps)
+        self.decay_lr = PolynomialDecayLR(learning_rate, end_learning_rate, decay_steps, power)
+        self.warmup_steps = Tensor(np.array([warmup_steps]).astype(np.float32))
+
+        self.greater = P.Greater()
+        self.one = Tensor(np.array([1.0]).astype(np.float32))
+        self.cast = P.Cast()
+
+    def construct(self, global_step):
+        decay_lr = self.decay_lr(global_step)
+        if self.warmup_flag:
+            is_warmup = self.cast(self.greater(self.warmup_steps, global_step), mstype.float32)
+            warmup_lr = self.warmup_lr(global_step)
+            lr = (self.one - is_warmup) * decay_lr + is_warmup * warmup_lr
+        else:
+            lr = decay_lr
+        return lr
+
+
+def get_chunk_type(tok, idx_to_tag):
+    """
+    Args:
+        tok: id of a token, e.g. 4
+        idx_to_tag: dictionary {4: "B-PER", ...}
+    Returns:
+        tuple of the tag class (e.g. "B") and the tag type (e.g. "PER")
+    """
+    tag_name = idx_to_tag[tok]
+    tag_class = tag_name.split('-')[0]
+    tag_type = tag_name.split('-')[-1]
+    return tag_class, tag_type
+
+
+def get_chunks(sequence_index, tags_index_map):
+    """
+    Decode a label-id sequence into (chunk_type, start, end) tuples.
+
+    Args:
+        sequence_index: sequence of label ids, e.g. [4, 4, 0, 0, ...]
+        tags_index_map: tag-to-id map, e.g. tags_index_map["O"] == 4
+    """
+    default = tags_index_map[NONE]
+    idx_to_tag = {idx: tag for tag, idx in tags_index_map.items()}
+    chunks = []
+    chunk_type, chunk_start = None, None
+    for i, tokens in enumerate(sequence_index):
+        # An "O" label ends the current chunk.
+        if tokens == default and chunk_type is not None:
+            chunk = (chunk_type, chunk_start, i)
+            chunks.append(chunk)
+            chunk_type, chunk_start = None, None
+
+        # A non-"O" label may end the current chunk and start a new one.
+        elif tokens != default:
+            tok_chunk_class, tok_chunk_type = get_chunk_type(tokens, idx_to_tag)
+            if chunk_type is None:
+                chunk_type, chunk_start = tok_chunk_type, i
+            elif tok_chunk_type != chunk_type or tok_chunk_class == "B":
+                chunk = (chunk_type, chunk_start, i)
+                chunks.append(chunk)
+                chunk_type, chunk_start = tok_chunk_type, i
+
+    # A chunk may extend to the end of the sequence.
+    if chunk_type is not None:
+        chunk = (chunk_type, chunk_start, len(sequence_index))
+        chunks.append(chunk)
+
+    return chunks
+
+
+def get_label_lists(gold_lists, pred_lists, mask_lists=None):
+    """Drop padded positions according to the mask and return the valid gold/pred label sequences"""
+    if mask_lists is None:
+        # Without a mask, treat every position as valid.
+        mask_lists = [[1] * len(gold) for gold in gold_lists]
+    preds = list()
+    golds = list()
+    for gold, pred, mask in zip(gold_lists, pred_lists, mask_lists):
+        temp_preds = list()
+        temp_golds = list()
+        for g, p, m in zip(gold, pred, mask):
+            if m == 0:
+                continue
+            temp_preds.append(p)
+            temp_golds.append(g)
+        preds.append(temp_preds)
+        golds.append(temp_golds)
+    return golds, preds
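+
+# Worked example for get_chunks (illustration only): with the toy tag map
+# {"O": 0, "B-NP": 1, "I-NP": 2}, the label sequence [1, 2, 0, 1] contains
+# two NP chunks:
+#
+#     get_chunks([1, 2, 0, 1], {"O": 0, "B-NP": 1, "I-NP": 2})
+#     # -> [("NP", 0, 2), ("NP", 3, 4)]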
diff --git a/research/nlp/lstm_crf/train.py b/research/nlp/lstm_crf/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..052adb77be16c1a6b41a2e0ce60789b9338f6de9
--- /dev/null
+++ b/research/nlp/lstm_crf/train.py
@@ -0,0 +1,195 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+#################train lstm-crf example on CoNLL2000########################
+"""
+
+import os
+from copy import deepcopy
+import numpy as np
+
+from src import util
+from src.util import get_chunks, get_label_lists, F1, LSTMCRFLearningRate
+from src.model_utils.config import config
+from src.dataset import get_data_set
+from src.imdb import ImdbParser
+from src.LSTM_CRF import Lstm_CRF
+
+import mindspore
+from mindspore.common import set_seed
+from mindspore.nn.optim import AdamWeightDecay
+from mindspore import Tensor, Model, context
+from mindspore.nn import DynamicLossScaleUpdateCell
+from mindspore.train.callback import Callback, CheckpointConfig, ModelCheckpoint, TimeMonitor
+from mindspore.train.serialization import load_param_into_net, load_checkpoint
+
+set_seed(1000)
+
+
+def modelarts_pre_process():
+    config.ckpt_path = os.path.join(config.output_path, config.ckpt_path)
+
+
+def create_filter_fun(keywords):
+    return lambda x: not (True in [key in x.name.lower() for key in keywords])
+
+
+class EvalCallBack(Callback):
+    """Evaluate on the test set at the end of every epoch and track the best F1."""
+    def __init__(self, network):
+        super(EvalCallBack, self).__init__()
+        self.parser = ImdbParser(config.data_CoNLL_path,
+                                 config.glove_path,
+                                 config.data_CoNLL_path,
+                                 embed_size=config.embed_size)
+
+        _, _, _, _, self.sequence_index, self.sequence_tag_index, self.tags_to_index_map \
+            = self.parser.get_datas_embeddings(seg=['test'], build_data=config.build_data)
+
+        self.ds_val = get_data_set(self.sequence_index, self.sequence_tag_index, config.batch_size)
+        self.network = network
+        self.callback = F1(len(self.tags_to_index_map))
+        self._best_val_F1 = 0
+
+    def epoch_end(self, run_context):
+        """Run evaluation and report chunk-level accuracy and F1"""
+        self.network.is_training = False
+        self.model = Model(self.network)
+        columns_list = ["feature", "label"]
+        rest_golds_list = list()
+        rest_preds_list = list()
+        for data in self.ds_val.create_dict_iterator(num_epochs=1):
+            input_data = []
+            for i in columns_list:
+                input_data.append(data[i])
+            feature, label = input_data
+            logits = self.model.predict_network(feature, label)
+            logit_ids, label_ids = self.callback.update(logits, label)
+
+            rest_preds = np.array(logit_ids)
+            rest_preds = np.expand_dims(rest_preds, 0)
+
+            rest_labels = deepcopy(label_ids)
+            label_ids = np.expand_dims(label_ids, 0)
+            rest_labels = np.expand_dims(rest_labels, 0)
+
+            rest_golds, rest_preds = get_label_lists(rest_labels, rest_preds, label_ids)
+
+            rest_golds_list += rest_golds
+            rest_preds_list += rest_preds
+        accs = []
+        correct_preds, total_correct, total_preds = 0., 0., 0.
+        for golds, preds in zip(rest_golds_list, rest_preds_list):
+            accs += [a == b for (a, b) in zip(golds, preds)]
+            golds_chunks = set(get_chunks(golds, self.tags_to_index_map))
+            preds_chunks = set(get_chunks(preds, self.tags_to_index_map))
+            correct_preds += len(golds_chunks & preds_chunks)
+            total_preds += len(preds_chunks)
+            total_correct += len(golds_chunks)
+
+        p = correct_preds / total_preds if correct_preds > 0 else 0
+        r = correct_preds / total_correct if correct_preds > 0 else 0
+        f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0
+        acc = np.mean(accs)
+
+        val_current_F1 = f1
+        if self._best_val_F1 <= val_current_F1:
+            self._best_val_F1 = val_current_F1
+        print("current ACC {:.6f}%, current F1 {:.6f}%, best F1 {:.6f}%"
+              .format(acc*100, val_current_F1*100, self._best_val_F1*100))
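+
+# The chunk-level F1 computed in epoch_end above follows the standard
+# definition; schematically (with golds_chunks and preds_chunks as sets of
+# (type, start, end) tuples):
+#
+#     correct = len(golds_chunks & preds_chunks)   # true positives
+#     p = correct / len(preds_chunks)              # precision
+#     r = correct / len(golds_chunks)              # recall
+#     f1 = 2 * p * r / (p + r)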
+
+
+def train_lstm_crf():
+    """train lstm_crf"""
+    print('\ntrain.py config: \n', config)
+
+    context.set_context(
+        mode=context.GRAPH_MODE,
+        save_graphs=False,
+        device_id=config.device_id,
+        enable_graph_kernel=False,
+        device_target=config.device_target)
+
+    parser = ImdbParser(config.data_CoNLL_path,
+                        config.glove_path,
+                        config.data_CoNLL_path,
+                        embed_size=config.embed_size)
+    # Only build the vocabulary and embedding files, then exit.
+    if config.build_data:
+        parser.build_datas(seg='train', build_data=config.build_data)
+        return
+
+    embeddings, sequence_length, _, _, sequence_index, sequence_tag_index, tags_to_index_map \
+        = parser.get_datas_embeddings(seg=['train'], build_data=config.build_data)
+
+    embeddings_table = Tensor(embeddings, mindspore.float32)
+
+    ds_train = get_data_set(sequence_index, sequence_tag_index, config.batch_size)
+
+    network = Lstm_CRF(vocab_size=embeddings_table.shape[0],
+                       tag_to_index=tags_to_index_map,
+                       embedding_size=config.embed_size,
+                       hidden_size=config.num_hiddens,
+                       num_layers=config.num_layers,
+                       weight=embeddings_table,
+                       bidirectional=config.bidirectional,
+                       batch_size=config.batch_size,
+                       seq_length=sequence_length,
+                       dropout=config.dropout,
+                       is_training=True)
+
+    # Create the optimizer with a warmup-decay learning rate schedule.
+    steps_per_epoch = ds_train.get_dataset_size()
+    lr_schedule = LSTMCRFLearningRate(learning_rate=config.AdamWeightDecay.learning_rate,
+                                      end_learning_rate=config.AdamWeightDecay.end_learning_rate,
+                                      warmup_steps=int(steps_per_epoch * config.num_epochs * 0.02),
+                                      decay_steps=steps_per_epoch * config.num_epochs,
+                                      power=config.AdamWeightDecay.power)
+    params = network.trainable_params()
+    decay_params = list(filter(create_filter_fun(config.AdamWeightDecay.decay_filter), params))
+    other_params = list(filter(lambda x: not create_filter_fun(config.AdamWeightDecay.decay_filter)(x), params))
+    group_params = [{'params': decay_params, 'weight_decay': config.AdamWeightDecay.weight_decay},
+                    {'params': other_params, 'weight_decay': 0.0}]
+    opt = AdamWeightDecay(params=group_params, learning_rate=lr_schedule, eps=config.AdamWeightDecay.eps)
+
+    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**16, scale_factor=2, scale_window=100)
+    if config.device_target == 'CPU':
+        netwithgrads = util.Lstm_CRF_Cell_CPU(network, optimizer=opt, scale_update_cell=update_cell)
+    else:
+        netwithgrads = util.Lstm_CRF_Cell_Ascend(network, optimizer=opt, scale_update_cell=update_cell)
+    model = Model(netwithgrads)
+
+    if config.pre_trained:
+        param_dict = load_checkpoint(config.pre_trained)
+        load_param_into_net(network, param_dict)
+
+    print("============== Starting Training ==============")
+    config_ck = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(),
+                                 keep_checkpoint_max=config.keep_checkpoint_max)
+    ckpoint_cb = ModelCheckpoint(prefix="lstm_crf", directory=config.ckpt_save_path, config=config_ck)
+    eval_cb = EvalCallBack(network)
+    callbacks = [TimeMonitor(ds_train.get_dataset_size()),
+                 util.LossCallBack(ds_train.get_dataset_size(), config.device_target),
+                 eval_cb, ckpoint_cb]
+    if config.device_target == "CPU":
+        model.train(config.num_epochs, ds_train, callbacks=callbacks, dataset_sink_mode=False)
+    else:
+        model.train(config.num_epochs, ds_train, callbacks=callbacks)
+    print("============== Training Success ==============")
+
+
+if __name__ == '__main__':
+    train_lstm_crf()
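+
+# Typical invocations (a sketch, assuming the dataset layout described in the
+# README; flags map to keys in default_config.yaml):
+#
+#     python train.py --device_target=CPU --build_data=True   # build vocab/embeddings first
+#     python train.py --device_target=CPU                     # then train
+#
+# or through the wrapper scripts:
+#
+#     bash script/run_train_cpu.sh ../data/CoNLL2000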