diff --git a/research/nlp/soft_masked_bert/README.md b/research/nlp/soft_masked_bert/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b8d653850c3623f44059de0fbebc3cdb16dc2cb1
--- /dev/null
+++ b/research/nlp/soft_masked_bert/README.md
@@ -0,0 +1,288 @@
+# Contents
+
+[View Chinese](./README_CN.md)
+
+- [Contents](#contents)
+- [Soft-Masked BERT](#soft-masked-bert)
+- [Model Architecture](#model-architecture)
+- [Dataset](#dataset)
+- [Environment Requirements](#environment-requirements)
+- [Quick Start](#quick-start)
+- [Script Description](#script-description)
+    - [Script Parameters](#script-parameters)
+    - [Training Process](#training-process)
+        - [Standalone Training](#standalone-training)
+        - [Distributed Training](#distributed-training)
+    - [Inference](#inference)
+        - [Inference Process](#inference-process)
+- [Model Description](#model-description)
+    - [Performance](#performance)
+        - [Training Performance](#training-performance)
+        - [Inference Performance](#inference-performance)
+- [Contribution Guide](#contribution-guide)
+    - [Contributors](#contributors)
+- [ModelZoo Homepage](#modelzoo-homepage)
+
+<TOC>
+
+# Soft-Masked BERT
+
+[Paper](https://arxiv.org/pdf/2005.07421v1.pdf): Zhang S, Huang H, Liu J, et al. Spelling error correction with soft-masked BERT[J]. arXiv preprint arXiv:2005.07421, 2020.
+
+## Model Architecture
+
+> Soft-Masked BERT consists of a detection network based on a Bi-GRU and a correction network based on BERT. The detection network predicts, for each character, the probability that it is erroneous, and the correction network predicts the correct character. The detection network passes its prediction to the correction network through soft masking: the embedding fed to BERT is a weighted sum of the [MASK] embedding and the original character embedding, weighted by the predicted error probability.
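+
+The soft-masking step can be sketched in a few lines. This is an illustration only; the names `soft_mask`, `e_input`, `e_mask` and `p_err` are placeholders and not the identifiers used in src/soft_masked_bert.py:
+
+```python
+import numpy as np
+
+def soft_mask(e_input, e_mask, p_err):
+    """Blend the original character embeddings with the [MASK] embedding.
+
+    e_input: (seq_len, hidden) original character embeddings
+    e_mask:  (hidden,)         embedding of the [MASK] token
+    p_err:   (seq_len, 1)      per-character error probability from the Bi-GRU detector
+    """
+    return p_err * e_mask + (1.0 - p_err) * e_input
+
+# Toy example: 4 characters, hidden size 8.
+e_input = np.random.randn(4, 8).astype(np.float32)
+e_mask = np.random.randn(8).astype(np.float32)
+p_err = np.array([[0.05], [0.9], [0.1], [0.02]], dtype=np.float32)  # detector output
+soft_masked = soft_mask(e_input, e_mask, p_err)  # fed to the BERT correction network
+```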
+
+## Dataset
+
+1. Download the [SIGHAN dataset](http://nlp.ee.ncu.edu.tw/resource/csc.html).
+2. Unpack the dataset and copy all the ".sgml" files in the folders to the datasets/csc/ directory.
+3. Copy "SIGHAN15_CSC_TestInput.txt" and "SIGHAN15_CSC_TestTruth.txt" to the datasets/csc/ directory.
+4. Download [train.sgml](https://github.com/wdimmy/Automatic-Corpus-Generation/blob/master/corpus/train.sgml) to the datasets/csc/ directory.
+5. Ensure that the following files are in datasets/csc:
+
+```text
+train.sgml
+B1_training.sgml
+C1_training.sgml
+SIGHAN15_CSC_A2_Training.sgml
+SIGHAN15_CSC_B2_Training.sgml
+SIGHAN15_CSC_TestInput.txt
+SIGHAN15_CSC_TestTruth.txt
+```
+
+6. Preprocess the data (see requirement.txt for the dependency packages required to run the script):
+
+```bash
+python preprocess_dataset.py
+```
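+
+The script writes datasets/csc/train.json and datasets/csc/dev.json. For reference, each record has the structure sketched below; the field names come from preprocess_dataset.py, while the values are placeholders rather than real dataset content:
+
+```python
+# One record from datasets/csc/dev.json (or train.json) as dumped by preprocess_dataset.py.
+record = {
+    "id": "...",             # passage id from the SIGHAN sgml ("-"/"--" for Confusionset items)
+    "original_text": "...",  # sentence that may contain spelling errors
+    "wrong_ids": [3, 7],     # 0-based positions of the erroneous characters
+    "correct_text": "...",   # corrected sentence of the same length as original_text
+}
+```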
+
+# [Environment Requirements](#contents)
+
+- Hardware (Ascend)
+    - Prepare the hardware environment with an Ascend processor.
+- Framework
+    - [MindSpore](https://www.mindspore.cn/install/en)
+- For more information, please check the resources below:
+    - [MindSpore Tutorials](https://www.mindspore.cn/tutorials/en/master/index.html)
+    - [MindSpore Python API](https://www.mindspore.cn/docs/api/en/master/index.html)
+- Dependencies
+    - Install the required dependencies with
+      > pip install -r requirement.txt
+- Version issues
+    - If installation fails because the GLIBC version is too old, install an earlier version of OpenCC (e.g. 1.1.0).
+
+## Quick Start
+
+1. Store the preprocessed data in the datasets directory.
+2. Download [bert-base-chinese-vocab.txt](https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt) and put it in the src/ directory.
+3. Download the [pre-trained model](https://download.mindspore.cn/models/r1.3/bertbase_ascend_v130_cnnews128_official_nlp_loss1.5.ckpt) and put it in the weight/ directory.
+4. Execute the training script.
+
+- Training on an offline server
+
+```bash
+# Distributed training
+bash scripts/run_distribute_train.sh [RANK_SIZE] [RANK_START_ID] [RANK_TABLE_FILE] [BERT_CKPT]
+# BERT_CKPT: pre-trained BERT file name (for example bert_base.ckpt)
+
+# Standalone training
+bash scripts/run_standalone_train.sh [BERT_CKPT] [DEVICE_ID] [PYNATIVE]
+# BERT_CKPT: pre-trained BERT file name (for example bert_base.ckpt)
+# DEVICE_ID: ID of the device to run on
+# PYNATIVE: whether to run in PyNative mode (default False)
+```
+
+- Training on ModelArts (if you want to run on ModelArts, please refer to the [ModelArts documentation](https://support.huaweicloud.com/modelarts/))
+
+```text
+# (1) Go to the [code repository](https://git.openi.org.cn/OpenModelZoo/SoftMaskedBert) and create a training task.
+# (2) Set "enable_modelarts=True; bert_ckpt=bert_base.ckpt" on the web page.
+# (3) If running in PyNative mode, set "pynative=True".
+# (4) Set the dataset "softmask.zip" on the web page.
+# (5) Set the startup file to "train.py".
+# (6) Run the training task.
+```
+
+5. Execute the evaluation script.
+
+After training, follow these steps to start the evaluation:
+
+```bash
+# evaluation
+bash scripts/run_eval.sh [BERT_CKPT_NAME] [CKPT_DIR]
+```
+
+## Script Description
+
+```text
+├── model_zoo
+    ├── README.md                      // All model related instructions
+    ├── soft_masked_bert
+        ├── README.md                  // Soft-Masked BERT related instructions
+        ├── README_CN.md               // Soft-Masked BERT related instructions in Chinese
+        ├── ascend310_infer            // Ascend 310 inference source code
+        ├── scripts
+        │   ├── run_distribute_train.sh    // Shell script for distributed training on Ascend
+        │   ├── run_standalone_train.sh    // Shell script for standalone training on Ascend
+        │   ├── run_eval.sh                // Shell script for evaluation on Ascend
+        │   ├── run_infer_310.sh           // Shell script for Ascend 310 inference
+        │   ├── run_preprocess.sh          // Shell script for data preprocessing
+        ├── src
+        │   ├── soft_masked_bert.py        // Soft-Masked BERT architecture
+        │   ├── bert_model.py              // BERT architecture
+        │   ├── dataset.py                 // Dataset processing
+        │   ├── finetune_config.py         // Model hyperparameters
+        │   ├── gru.py                     // GRU architecture
+        │   ├── tokenization.py            // Tokenizer
+        │   ├── utils.py                   // Utilities
+        ├── train.py                   // Training script
+        ├── eval.py                    // Evaluation script
+        ├── postprocess.py             // Ascend 310 inference postprocessing script
+        ├── export.py                  // Checkpoint export script
+        ├── preprocess.py              // Preprocess data into bin files for Ascend 310 inference
+        ├── preprocess_dataset.py      // Data preprocessing
+```
+
+### Script Parameters
+
+```python
+'batch_size': 36             # batch size
+'epoch': 100                 # total number of training epochs
+'learning rate': 0.0001      # initial learning rate
+'loss function': 'BCELoss'   # loss function used for training
+'optimizer': AdamWeightDecay # optimizer used for training
+```
+
+## Training Process
+
+### Standalone Training
+
+- Running in the Ascend processor environment
+
+```bash
+bash scripts/run_standalone_train.sh [BERT_CKPT] [DEVICE_ID] [PYNATIVE]
+```
+
+After training, you can find the checkpoint files in the default scripts folder. The training log looks as follows:
+
+```text
+epoch: 1 step: 152, loss is 3.3235654830932617
+epoch: 1 step: 153, loss is 3.6958463191986084
+epoch: 1 step: 154, loss is 3.585498571395874
+epoch: 1 step: 155, loss is 3.276094913482666
+```
+
+### Distributed Training
+
+- Running in the Ascend processor environment
+
+```bash
+bash scripts/run_distribute_train.sh [RANK_SIZE] [RANK_START_ID] [RANK_TABLE_FILE] [BERT_CKPT]
+```
+
+The shell script above runs the distributed training in the background.
+
+```text
+epoch: 1 step: 12, loss is 7.957302093505859
+epoch: 1 step: 13, loss is 7.886098861694336
+epoch: 1 step: 14, loss is 7.781495094299316
+epoch: 1 step: 15, loss is 7.755488395690918
+```
+
+### [Inference](#contents)
+
+#### Inference Process
+
+Before performing inference, the MINDIR file must be exported with export.py. Input files must be in bin format.
+
+```bash
+# Export the MINDIR file
+python export.py --bert_ckpt [BERT_CKPT] --ckpt_dir [CKPT_DIR]
+# Ascend 310 inference
+bash scripts/run_infer_310.sh [MINDIR_PATH] [DATA_FILE_PATH] [NEED_PREPROCESS] [DEVICE_ID]
+```
+
+`BERT_CKPT` is the pre-trained BERT file name (e.g. bert_base.ckpt).
+`CKPT_DIR` is the path of the trained checkpoint file (e.g. ./checkpoint/SoftMaskedBert-100_874.ckpt).
+`MINDIR_PATH` is the directory of the model file.
+`DATA_FILE_PATH` is the directory of the input data.
+`NEED_PREPROCESS` indicates whether the data needs to be preprocessed into bin format; its value is 'y' or 'n'.
+`DEVICE_ID` is optional; the default value is 0.
+
+#### Result
+
+The inference result is saved in the project's main path; you can find the final accuracy in the acc.log file.
+
+```text
+1 The detection result is precision=0.6733436055469953, recall=0.6181046676096181 and F1=0.6445427728613569
+2 The correction result is precision=0.8260869565217391, recall=0.7234468937875751 and F1=0.7713675213675213
+3 Sentence Level: acc:0.606364, precision:0.650970, recall:0.433579, f1:0.520487
+```
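+
+The character-level numbers above are ordinary precision/recall/F1 scores over the (source, target, prediction) triples collected during evaluation. The sketch below shows how the detection metrics can be computed; it is a simplified illustration, not necessarily the exact implementation of compute_corrector_prf in src/utils.py:
+
+```python
+def detection_prf(results):
+    """results: list of (src, tgt, pred) token-id lists of equal length."""
+    tp = fp = fn = 0
+    for src, tgt, pred in results:
+        gold = {i for i, (s, t) in enumerate(zip(src, tgt)) if s != t}  # true error positions
+        hyp = {i for i, (s, p) in enumerate(zip(src, pred)) if s != p}  # predicted error positions
+        tp += len(gold & hyp)
+        fp += len(hyp - gold)
+        fn += len(gold - hyp)
+    precision = tp / (tp + fp) if tp + fp else 0.0
+    recall = tp / (tp + fn) if tp + fn else 0.0
+    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
+    return precision, recall, f1
+```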
+
+# Model Description
+
+## Performance
+
+### Training Performance
+
+| Parameters                 | Ascend                                                       |
+| -------------------------- | ------------------------------------------------------------ |
+| Model Version              | BERT-base                                                    |
+| Resource                   | Ascend 910; CPU 2.60GHz, 192 cores; Memory 755G; OS Euler2.8 |
+| Uploaded Date              | 2022-06-28                                                   |
+| MindSpore Version          | 1.6.0                                                        |
+| Dataset                    | SIGHAN                                                       |
+| Training Parameters        | epoch=100, steps=6994, batch_size=36, lr=0.0001              |
+| Optimizer                  | AdamWeightDecay                                              |
+| Loss Function              | BCELoss                                                      |
+| Loss                       | 0.0016                                                       |
+| Speed                      | 1p: 349.7 ms/step; 8p: 314.7 ms/step                         |
+| Total time                 | 1p: 4076 min; 8p: 458 min                                    |
+| Checkpoint for Fine tuning | 459M (.ckpt file)                                            |
+| Scripts                    | [link](https://gitee.com/rafeal8830/soft-maksed-bert/edit/master/README_TEMPLATE_CN.md) |
+
+### Inference Performance
+
+> Provide the details of inference performance, including latency, accuracy and so on.
+
+For example, you can reference the following template:
+
+| Parameters          | Ascend                      |
+| ------------------- | --------------------------- |
+| Model Version       | ResNet18                    |
+| Resource            | Ascend 910; OS Euler2.8     |
+| Uploaded Date       | 02/25/2021 (month/day/year) |
+| MindSpore Version   | 1.7.0                       |
+| Dataset             | CIFAR-10                    |
+| batch_size          | 32                          |
+| outputs             | probability                 |
+| Accuracy            | 94.02%                      |
+| Model for inference | 43M (.air file)             |
+
+## Contribution Guide
+
+If you want to contribute, please review the [contribution guidelines](https://gitee.com/mindspore/models/blob/master/CONTRIBUTING.md) and [how_to_contribute](https://gitee.com/mindspore/models/tree/master/how_to_contribute).
+
+### Contributors
+
+* [c34](https://gitee.com/c_34) (Huawei)
+
+## ModelZoo Homepage
+
+Please check the official [homepage](https://gitee.com/mindspore/models).
\ No newline at end of file
diff --git a/research/nlp/soft_masked_bert/README_CN.md b/research/nlp/soft_masked_bert/README_CN.md
new file mode 100644
index 0000000000000000000000000000000000000000..6ddc39909b924366ff2e741951c813c2c4a246df
--- /dev/null
+++ b/research/nlp/soft_masked_bert/README_CN.md
@@ -0,0 +1,275 @@
+# Contents
+
+[View English](./README.md)
+
+- [Contents](#contents)
+- [Soft-Masked BERT](#soft-masked-bert)
+- [Model Architecture](#model-architecture)
+- [Dataset](#dataset)
+- [Environment Requirements](#environment-requirements)
+- [Quick Start](#quick-start)
+- [Script Description](#script-description)
+    - [Script Parameters](#script-parameters)
+    - [Training Process](#training-process)
+        - [Standalone Training](#standalone-training)
+        - [Distributed Training](#distributed-training)
+    - [Inference Process](#inference-process)
+        - [Inference](#inference)
+- [Model Description](#model-description)
+    - [Performance](#performance)
+        - [Training Performance](#training-performance)
+        - [Inference Performance](#inference-performance)
+- [Contribution Guide](#contribution-guide)
+    - [Contributors](#contributors)
+- [ModelZoo Homepage](#modelzoo-homepage)
+
+<TOC>
+
+# Soft-Masked BERT
+
+[Paper](https://arxiv.org/pdf/2005.07421v1.pdf): Zhang S, Huang H, Liu J, et al. Spelling error correction with soft-masked BERT[J]. arXiv preprint arXiv:2005.07421, 2020.
+
+# Model Architecture
+
+Soft-Masked BERT consists of a detection network based on a Bi-GRU and a correction network based on BERT. The detection network predicts the probability that each character is erroneous, the correction network predicts the probability of the correct character, and the detection network passes its prediction to the correction network through soft masking.
+
+# Dataset
+
+1. Download the [SIGHAN dataset](http://nlp.ee.ncu.edu.tw/resource/csc.html).
+2. Unpack the dataset and copy all the ".sgml" files in the folders to the datasets/csc/ directory.
+3. Copy "SIGHAN15_CSC_TestInput.txt" and "SIGHAN15_CSC_TestTruth.txt" to the datasets/csc/ directory.
+4. Download [train.sgml](https://github.com/wdimmy/Automatic-Corpus-Generation/blob/master/corpus/train.sgml) to the datasets/csc/ directory.
+5. Ensure that the following files are in datasets/csc:
+
+```text
+train.sgml
+B1_training.sgml
+C1_training.sgml
+SIGHAN15_CSC_A2_Training.sgml
+SIGHAN15_CSC_B2_Training.sgml
+SIGHAN15_CSC_TestInput.txt
+SIGHAN15_CSC_TestTruth.txt
+```
+
+6. Preprocess the data (see requirement.txt for the dependency packages required to run the script):
+
+```bash
+python preprocess_dataset.py
+```
+
+# Environment Requirements
+
+- Hardware (Ascend)
+    - Prepare the hardware environment with an Ascend processor.
+- Framework
+    - [MindSpore](https://www.mindspore.cn/install/en)
+- For more information, please check the resources below:
+    - [MindSpore Tutorials](https://www.mindspore.cn/tutorials/zh-CN/master/index.html)
+    - [MindSpore Python API](https://www.mindspore.cn/docs/api/zh-CN/master/index.html)
+- Dependencies
+    - Install the required dependencies with pip install -r requirement.txt
+- Version issues
+    - If installation fails because the GLIBC version is too old, install an earlier version of OpenCC (e.g. 1.1.0).
+
+# Quick Start
+
+1. Store the preprocessed data in the datasets directory.
+2. Download [bert-base-chinese-vocab.txt](https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt) and put it in the src/ directory.
+3. Download the [pre-trained model](https://download.mindspore.cn/models/r1.3/bertbase_ascend_v130_cnnews128_official_nlp_loss1.5.ckpt) and put it in the weight/ directory.
+4. Execute the training script.
+
+- Training on an offline server
+
+```bash
+# Distributed training
+bash scripts/run_distribute_train.sh [RANK_SIZE] [RANK_START_ID] [RANK_TABLE_FILE] [BERT_CKPT]
+# BERT_CKPT: pre-trained BERT file name (for example bert_base.ckpt)
+
+# Standalone training
+bash scripts/run_standalone_train.sh [BERT_CKPT] [DEVICE_ID] [PYNATIVE]
+# BERT_CKPT: pre-trained BERT file name (for example bert_base.ckpt)
+# DEVICE_ID: ID of the device to run on
+# PYNATIVE: whether to run in PyNative mode (default False)
+```
+
+- Training on OpenI
+
+```text
+# (1) Go to the [code repository](https://git.openi.org.cn/OpenModelZoo/SoftMaskedBert) and create a training task.
+# (2) Set "enable_modelarts=True; bert_ckpt=bert_base.ckpt" on the web page.
+# (3) If running in PyNative mode, set "pynative=True" on the web page.
+# (4) Set the dataset "SoftMask.zip" on the web page.
+# (5) Set the startup file to "train.py".
+# (6) Run the training task.
+```
+
+5. Execute the evaluation script.
+
+After training, follow these steps to start the evaluation:
+
+```bash
+# evaluation
+bash scripts/run_eval.sh [BERT_CKPT_NAME] [CKPT_DIR]
+```
+
+# Script Description
+
+```text
+├── model_zoo
+    ├── README.md                      // All model related instructions
+    ├── soft_masked_bert
+        ├── README.md                  // Soft-Masked BERT related instructions
+        ├── README_CN.md               // Soft-Masked BERT related instructions in Chinese
+        ├── ascend310_infer            // Ascend 310 inference source code
+        ├── scripts
+        │   ├── run_distribute_train.sh    // Shell script for distributed training on Ascend
+        │   ├── run_standalone_train.sh    // Shell script for standalone training on Ascend
+        │   ├── run_eval.sh                // Shell script for evaluation on Ascend
+        │   ├── run_infer_310.sh           // Shell script for Ascend 310 inference
+        │   ├── run_preprocess.sh          // Shell script for data preprocessing
+        ├── src
+        │   ├── soft_masked_bert.py        // Soft-Masked BERT architecture
+        │   ├── bert_model.py              // BERT architecture
+        │   ├── dataset.py                 // Dataset processing
+        │   ├── finetune_config.py         // Model hyperparameters
+        │   ├── gru.py                     // GRU architecture
+        │   ├── tokenization.py            // Tokenizer
+        │   ├── utils.py                   // Utilities
+        ├── train.py                   // Training script
+        ├── eval.py                    // Evaluation script
+        ├── postprocess.py             // Ascend 310 inference postprocessing script
+        ├── export.py                  // Checkpoint export script
+        ├── preprocess.py              // Preprocess data into bin files for Ascend 310 inference
+        ├── preprocess_dataset.py      // Data preprocessing
+```
+
+## Script Parameters
+
+```python
+'batch_size': 36             # batch size
+'epoch': 100                 # total number of training epochs
+'learning rate': 0.0001      # initial learning rate
+'loss function': 'BCELoss'   # loss function used for training
+'optimizer': AdamWeightDecay # optimizer used for training
+```
+
+## Training Process
+
+### Standalone Training
+
+- Running in the Ascend processor environment
+
+```bash
+bash scripts/run_standalone_train.sh [BERT_CKPT] [DEVICE_ID] [PYNATIVE]
+```
+
+After training, you can find the checkpoint files in the default scripts folder. The training log looks as follows:
+
+```text
+epoch: 1 step: 152, loss is 3.3235654830932617
+epoch: 1 step: 153, loss is 3.6958463191986084
+epoch: 1 step: 154, loss is 3.585498571395874
+epoch: 1 step: 155, loss is 3.276094913482666
+...
+```
+ ``` + +### 鍒嗗竷寮忚缁� + +- Ascend澶勭悊鍣ㄧ幆澧冭繍琛� + + ```python + bash run_distribute_train.sh [RANK_SIZE] [RANK_START_ID] [RANK_TABLE_FILE] [BERT_CKPT] + ``` + + 涓婅堪shell鑴氭湰灏嗗湪鍚庡彴杩愯鍒嗗竷璁粌銆� + + ```bash + epoch: 1 step: 12, loss is 7.957302093505859 + epoch: 1 step: 13, loss is 7.886098861694336 + epoch: 1 step: 14, loss is 7.781495094299316 + epoch: 1 step: 15, loss is 7.755488395690918 + ... + ... + ``` + +## 鎺ㄧ悊 + +### 鎺ㄧ悊杩囩▼ + +鍦ㄦ墽琛屾帹鐞嗕箣鍓嶏紝闇€瑕侀€氳繃export.py瀵煎嚭mindir鏂囦欢銆傝緭鍏ユ暟鎹枃浠朵负bin鏍煎紡銆� + +```python +# 瀵煎嚭mindir鏂囦欢 + +python export.py --bert_ckpt [BERT_CKPT] --ckpt_dir [CKPT_DIR] + +# Ascend310 鎺ㄧ悊 + +bash scripts/run_infer_310.sh [MINDIR_PATH] [DATA_FILE_PATH] [NEED_PREPROCESS] [DEVICE_ID] +``` + +`BERT_CKPT`涓哄繀閫夐」, 棰勮缁傿ERT鏂囦欢鍚嶏紙渚嬪bert_base.ckpt锛� +`CKPT_DIR`涓哄繀閫夐」, 璁粌濂絚kpt鐨勮矾寰� (渚嬪./checkpoint/SoftMaskedBert-100_874.ckpt) +`MINDIR_PATH` 涓哄繀閫夐」, 琛ㄧず妯″瀷鏂囦欢鐨勭洰褰曘€� +`DATA_FILE_PATH` 涓哄繀閫夐」, 琛ㄧず杈撳叆鏁版嵁鐨勭洰褰曘€� +`NEED_PREPROCESS` 涓哄繀閫夐」, 鍦╗y|n]涓彇鍊硷紝琛ㄧず鏁版嵁鏄惁棰勫鐞嗕负bin鏍煎紡銆� +`DEVICE_ID` 鍙€夛紝榛樿鍊间负 0銆� + +### 鎺ㄧ悊缁撴灉 + +鎺ㄧ悊缁撴灉淇濆瓨鍦ㄩ」鐩富鐩綍涓嬶紝鍙湪acc.log涓湅鍒版渶缁堢簿搴︾粨鏋溿€� + +```eval log +1 The detection result is precision=0.6733436055469953, recall=0.6181046676096181 and F1=0.6445427728613569 +2 The correction result is precision=0.8260869565217391, recall=0.7234468937875751 and F1=0.7713675213675213 +3 Sentence Level: acc:0.606364, precision:0.650970, recall:0.433579, f1:0.520487 +``` + +# 妯″瀷鎻忚堪 + +## 鎬ц兘 + +### 璁粌鎬ц兘 + +| 鍙傛暟 | Ascend | +| -------------------------- | ----------------------------------------------------------- | +| 妯″瀷鐗堟湰 | BERT-base | +| 璧勬簮 | Ascend 910锛汣PU 2.60GHz锛�192鏍革紱鍐呭瓨 755G锛涚郴缁� Euler2.8 | +| 涓婁紶鏃ユ湡 | 2022-06-28 | +| MindSpore鐗堟湰 | 1.6.0 | +| 鏁版嵁闆� | SIGHAN | +| 璁粌鍙傛暟 | epoch=100, steps=6994, batch_size = 36, lr=0.0001 | +| 浼樺寲鍣� | AdamWeightDecay | +| 鎹熷け鍑芥暟 | BCELoss | +| 鎹熷け | 0.0016 | +| 閫熷害 | 鍗曞崱锛�349.7姣/姝�; 8鍗★細314.7姣/姝� | +| 鎬绘椂闀� | 鍗曞崱锛�4076鍒嗛挓; 8鍗★細458鍒嗛挓 | +| 寰皟妫€鏌ョ偣 | 459M (.ckpt鏂囦欢) | +| 鑴氭湰 | [Soft-Masked BERT鑴氭湰](https://gitee.com/rafeal8830/soft-maksed-bert/edit/master/README_TEMPLATE_CN.md) | + +### 鎺ㄧ悊鎬ц兘 + +> 鎻愪緵鎺ㄧ悊鎬ц兘鐨勮缁嗘弿杩帮紝鍖呮嫭鑰楁椂锛岀簿搴︾瓑 + +浣犲彲浠ュ弬鐓у涓嬫ā鏉� + +| Parameters | Ascend | +| ------------------- | --------------------------- | +| Model Version | ResNet18 | +| Resource | Ascend 910; OS Euler2.8 | +| Uploaded Date | 02/25/2021 (month/day/year) | +| MindSpore Version | 1.7.0 | +| Dataset | CIFAR-10 | +| batch_size | 32 | +| outputs | probability | +| Accuracy | 94.02% | +| Model for inference | 43M (.air file) | + +# 璐$尞鎸囧崡 + +濡傛灉浣犳兂鍙備笌璐$尞鏄囨€濈殑宸ヤ綔褰撲腑锛岃闃呰[鏄囨€濊础鐚寚鍗梋(https://gitee.com/mindspore/models/blob/master/CONTRIBUTING_CN.md)鍜孾how_to_contribute](https://gitee.com/mindspore/models/tree/master/how_to_contribute) + +## 璐$尞鑰� + +* [c34](https://gitee.com/c_34) (Huawei) + +# ModelZoo 涓婚〉 + +璇锋祻瑙堝畼鏂筟涓婚〉](https://gitee.com/mindspore/models)銆� \ No newline at end of file diff --git a/research/nlp/soft_masked_bert/ascend310_infer/CMakeLists.txt b/research/nlp/soft_masked_bert/ascend310_infer/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..9faab190177e25d7fabb6b224380c78651aec9f5 --- /dev/null +++ b/research/nlp/soft_masked_bert/ascend310_infer/CMakeLists.txt @@ -0,0 +1,13 @@ +cmake_minimum_required(VERSION 3.14.1) +project(softmasedbert[CXX]) +add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=0) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O0 -g -std=c++17 -Werror -Wall -fPIE -Wl,--allow-shlib-undefined") +set(PROJECT_SRC_ROOT ${CMAKE_CURRENT_LIST_DIR}/) +option(MINDSPORE_PATH "mindspore install path" "") 
+include_directories(${MINDSPORE_PATH}) +include_directories(${MINDSPORE_PATH}/include) +include_directories(${PROJECT_SRC_ROOT}) +find_library(MS_LIB libmindspore.so ${MINDSPORE_PATH}/lib) +file(GLOB_RECURSE MD_LIB ${MINDSPORE_PATH}/_c_dataengine*) +add_executable(softmaskedbert src/main.cc src/utils.cc) +target_link_libraries(softmaskedbert ${MS_LIB} ${MD_LIB} gflags) diff --git a/research/nlp/soft_masked_bert/ascend310_infer/build.sh b/research/nlp/soft_masked_bert/ascend310_infer/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..1a910fac327121991d9eded00c99f565ff43da36 --- /dev/null +++ b/research/nlp/soft_masked_bert/ascend310_infer/build.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +if [ -d out ]; then + rm -rf out +fi + +mkdir out +cd out || exit + +if [ -f "Makefile" ]; then + make clean +fi + +cmake . \ + -DMINDSPORE_PATH="`pip show mindspore-ascend | grep Location | awk '{print $2"/mindspore"}' | xargs realpath`" +make diff --git a/research/nlp/soft_masked_bert/ascend310_infer/inc/utils.h b/research/nlp/soft_masked_bert/ascend310_infer/inc/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..f5fe77fdfc087a3dbde54fe7a14b873b7923d873 --- /dev/null +++ b/research/nlp/soft_masked_bert/ascend310_infer/inc/utils.h @@ -0,0 +1,32 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_INFERENCE_UTILS_H_ +#define MINDSPORE_INFERENCE_UTILS_H_ + +#include <sys/stat.h> +#include <dirent.h> +#include <vector> +#include <string> +#include <memory> +#include "include/api/types.h" + +std::vector<std::string> GetAllFiles(std::string_view dirName); +DIR *OpenDir(std::string_view dirName); +std::string RealPath(std::string_view path); +mindspore::MSTensor ReadFileToTensor(const std::string &file); +int WriteResult(const std::string& textFile, const std::vector<mindspore::MSTensor> &outputs); +#endif diff --git a/research/nlp/soft_masked_bert/ascend310_infer/src/main.cc b/research/nlp/soft_masked_bert/ascend310_infer/src/main.cc new file mode 100644 index 0000000000000000000000000000000000000000..760094e0c45855052132eb4d735cc01e3ebef6da --- /dev/null +++ b/research/nlp/soft_masked_bert/ascend310_infer/src/main.cc @@ -0,0 +1,151 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include <sys/time.h> +#include <gflags/gflags.h> +#include <dirent.h> +#include <iostream> +#include <string> +#include <algorithm> +#include <iosfwd> +#include <vector> +#include <fstream> +#include <sstream> +#include "include/api/model.h" +#include "include/api/context.h" +#include "include/api/types.h" +#include "include/api/serialization.h" +#include "include/dataset/execute.h" +#include "include/dataset/vision.h" +#include "inc/utils.h" + +using mindspore::Context; +using mindspore::Serialization; +using mindspore::Model; +using mindspore::Status; +using mindspore::MSTensor; +using mindspore::dataset::Execute; +using mindspore::ModelType; +using mindspore::GraphCell; +using mindspore::kSuccess; + +DEFINE_string(mindir_path, "", "mindir path"); +DEFINE_string(input0_path, ".", "input0 path"); +DEFINE_string(input1_path, ".", "input1 path"); +DEFINE_string(input2_path, ".", "input2 path"); +DEFINE_string(input3_path, ".", "input3 path"); +DEFINE_string(input4_path, ".", "input4 path"); +DEFINE_string(input5_path, ".", "input5 path"); +DEFINE_string(input6_path, ".", "input6 path"); +DEFINE_int32(device_id, 0, "device id"); + +int main(int argc, char **argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + if (RealPath(FLAGS_mindir_path).empty()) { + std::cout << "Invalid mindir" << std::endl; + return 1; + } + auto context = std::make_shared<Context>(); + auto ascend310 = std::make_shared<mindspore::Ascend310DeviceInfo>(); + ascend310->SetDeviceID(FLAGS_device_id); + context->MutableDeviceInfo().push_back(ascend310); + mindspore::Graph graph; + Status ret = Serialization::Load(FLAGS_mindir_path, ModelType::kMindIR, &graph); + if (ret != kSuccess) { + std::cout << "ERROR: Load failed." << std::endl; + return 1; + } + Model model; + ret = model.Build(GraphCell(graph), context); + if (ret != kSuccess) { + std::cout << "ERROR: Build failed." << std::endl; + return 1; + } + std::vector<MSTensor> model_inputs = model.GetInputs(); + if (model_inputs.empty()) { + std::cout << "Invalid model, inputs is empty." 
<< std::endl; + return 1;} + auto input0_files = GetAllFiles(FLAGS_input0_path); + auto input1_files = GetAllFiles(FLAGS_input1_path); + auto input2_files = GetAllFiles(FLAGS_input2_path); + auto input3_files = GetAllFiles(FLAGS_input3_path); + auto input4_files = GetAllFiles(FLAGS_input4_path); + auto input5_files = GetAllFiles(FLAGS_input5_path); + auto input6_files = GetAllFiles(FLAGS_input6_path); + if (input0_files.empty() || input1_files.empty() || input2_files.empty() || input3_files.empty() \ + || input4_files.empty() || input5_files.empty() || input6_files.empty()) { + std::cout << "ERROR: input data empty." << std::endl; + return 1;} + std::map<double, double> costTime_map; + size_t size = input0_files.size(); + for (size_t i = 0; i < size; ++i) { + struct timeval start = {0}; + struct timeval end = {0}; + double startTimeMs; + double endTimeMs; + std::vector<MSTensor> inputs; + std::vector<MSTensor> outputs; + std::cout << "Start predict input files:" << input0_files[i] << std::endl; + auto input0 = ReadFileToTensor(input0_files[i]); + auto input1 = ReadFileToTensor(input1_files[i]); + auto input2 = ReadFileToTensor(input2_files[i]); + auto input3 = ReadFileToTensor(input3_files[i]); + auto input4 = ReadFileToTensor(input4_files[i]); + auto input5 = ReadFileToTensor(input5_files[i]); + auto input6 = ReadFileToTensor(input6_files[i]); + inputs.emplace_back(model_inputs[0].Name(), model_inputs[0].DataType(), model_inputs[0].Shape(), + input0.Data().get(), input0.DataSize()); + inputs.emplace_back(model_inputs[1].Name(), model_inputs[1].DataType(), model_inputs[1].Shape(), + input1.Data().get(), input1.DataSize()); + inputs.emplace_back(model_inputs[2].Name(), model_inputs[2].DataType(), model_inputs[2].Shape(), + input2.Data().get(), input2.DataSize()); + inputs.emplace_back(model_inputs[3].Name(), model_inputs[3].DataType(), model_inputs[3].Shape(), + input3.Data().get(), input3.DataSize()); + inputs.emplace_back(model_inputs[4].Name(), model_inputs[4].DataType(), model_inputs[4].Shape(), + input4.Data().get(), input4.DataSize()); + inputs.emplace_back(model_inputs[5].Name(), model_inputs[5].DataType(), model_inputs[5].Shape(), + input5.Data().get(), input5.DataSize()); + inputs.emplace_back(model_inputs[6].Name(), model_inputs[6].DataType(), model_inputs[6].Shape(), + input6.Data().get(), input6.DataSize()); + gettimeofday(&start, nullptr); + ret = model.Predict(inputs, &outputs); + gettimeofday(&end, nullptr); + if (ret != kSuccess) { + std::cout << "Predict " << input0_files[i] << " failed." 
<< std::endl; + return 1; + } + startTimeMs = (1.0 * start.tv_sec * 1000000 + start.tv_usec) / 1000; + endTimeMs = (1.0 * end.tv_sec * 1000000 + end.tv_usec) / 1000; + costTime_map.insert(std::pair<double, double>(startTimeMs, endTimeMs)); + WriteResult(input0_files[i], outputs); + } + double average = 0.0; + int inferCount = 0; + for (auto iter = costTime_map.begin(); iter != costTime_map.end(); iter++) { + double diff = 0.0; + diff = iter->second - iter->first; + average += diff; + inferCount++;} + average = average / inferCount; + std::stringstream timeCost; + timeCost << "NN inference cost average time: "<< average << " ms of infer_count " << inferCount << std::endl; + std::cout << "NN inference cost average time: "<< average << "ms of infer_count " << inferCount << std::endl; + std::string fileName = "./time_result" + std::string("/test_perform_static.txt"); + std::ofstream fileStream(fileName.c_str(), std::ios::trunc); + fileStream << timeCost.str(); + fileStream.close(); + costTime_map.clear(); + return 0; +} diff --git a/research/nlp/soft_masked_bert/ascend310_infer/src/utils.cc b/research/nlp/soft_masked_bert/ascend310_infer/src/utils.cc new file mode 100644 index 0000000000000000000000000000000000000000..2261cf87efc680a4ea4d400feeacba7b41a3aafd --- /dev/null +++ b/research/nlp/soft_masked_bert/ascend310_infer/src/utils.cc @@ -0,0 +1,134 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <fstream> +#include <algorithm> +#include <iostream> +#include "inc/utils.h" + +using mindspore::MSTensor; +using mindspore::DataType; + +std::vector<std::string> GetAllFiles(std::string_view dirName) { + struct dirent *filename; + DIR *dir = OpenDir(dirName); + if (dir == nullptr) { + return {}; + } + std::vector<std::string> res; + while ((filename = readdir(dir)) != nullptr) { + std::string dName = std::string(filename->d_name); + if (dName == "." || dName == ".." 
|| filename->d_type != DT_REG) { + continue; + } + res.emplace_back(std::string(dirName) + "/" + filename->d_name); + } + std::sort(res.begin(), res.end()); + for (auto &f : res) { + std::cout << "Text file: " << f << std::endl; + } + return res; +} + +int WriteResult(const std::string& textFile, const std::vector<MSTensor> &outputs) { + std::vector<std::string> homePath; + homePath.push_back("./result_files/result_00"); + homePath.push_back("./result_files/result_01"); + homePath.push_back("./result_files/result_02"); + homePath.push_back("./result_files/result_03"); + homePath.push_back("./result_files/result_04"); + homePath.push_back("./result_files/result_05"); + for (size_t i = 0; i < outputs.size(); ++i) { + size_t outputSize; + std::shared_ptr<const void> netOutput; + netOutput = outputs[i].Data(); + outputSize = outputs[i].DataSize(); + int pos = textFile.rfind('/'); + std::string fileName(textFile, pos + 1); + std::string outFileName = homePath[i] + "/" + fileName; + FILE * outputFile = fopen(outFileName.c_str(), "wb"); + fwrite(netOutput.get(), outputSize, sizeof(char), outputFile); + fclose(outputFile); + outputFile = nullptr; + } + return 0; +} + +mindspore::MSTensor ReadFileToTensor(const std::string &file) { + if (file.empty()) { + std::cout << "Pointer file is nullptr" << std::endl; + return mindspore::MSTensor(); + } + + std::ifstream ifs(file); + if (!ifs.good()) { + std::cout << "File: " << file << " is not exist" << std::endl; + return mindspore::MSTensor(); + } + + if (!ifs.is_open()) { + std::cout << "File: " << file << "open failed" << std::endl; + return mindspore::MSTensor(); + } + + ifs.seekg(0, std::ios::end); + size_t size = ifs.tellg(); + mindspore::MSTensor buffer(file, mindspore::DataType::kNumberTypeUInt8, {static_cast<int64_t>(size)}, nullptr, size); + + ifs.seekg(0, std::ios::beg); + ifs.read(reinterpret_cast<char *>(buffer.MutableData()), size); + ifs.close(); + + return buffer; +} + + +DIR *OpenDir(std::string_view dirName) { + if (dirName.empty()) { + std::cout << " dirName is null ! " << std::endl; + return nullptr; + } + std::string realPath = RealPath(dirName); + struct stat s; + lstat(realPath.c_str(), &s); + if (!S_ISDIR(s.st_mode)) { + std::cout << "dirName is not a valid directory !" << std::endl; + return nullptr; + } + DIR *dir; + dir = opendir(realPath.c_str()); + if (dir == nullptr) { + std::cout << "Can not open dir " << dirName << std::endl; + return nullptr; + } + std::cout << "Successfully opened the dir " << dirName << std::endl; + return dir; +} + +std::string RealPath(std::string_view path) { + char realPathMem[PATH_MAX] = {0}; + char *realPathRet = nullptr; + realPathRet = realpath(path.data(), realPathMem); + + if (realPathRet == nullptr) { + std::cout << "File: " << path << " is not exist."; + return ""; + } + + std::string realPath(realPathMem); + std::cout << path << " realpath is: " << realPath << std::endl; + return realPath; +} diff --git a/research/nlp/soft_masked_bert/eval.py b/research/nlp/soft_masked_bert/eval.py new file mode 100644 index 0000000000000000000000000000000000000000..74fd7c9812282724b75acb33573105bf2c04c8cf --- /dev/null +++ b/research/nlp/soft_masked_bert/eval.py @@ -0,0 +1,155 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""do eval""" +import os +import operator +import argparse +from mindspore import context, Model +from mindspore.nn.optim import AdamWeightDecay +from mindspore.train.serialization import load_checkpoint, load_param_into_net +from mindspore.communication.management import init, get_group_size +from src.tokenization import CscTokenizer +from src.soft_masked_bert import SoftMaskedBertCLS +from src.finetune_config import optimizer_cfg, bert_cfg +from src.utils import compute_corrector_prf +from tqdm import tqdm + +def do_eval(dataset, network, profile=None, trained_ckpt_path="", epoch_num=1): + network.set_train(False) + para_dict = load_checkpoint(trained_ckpt_path) + load_param_into_net(network, para_dict) + if optimizer_cfg.optimizer == 'AdamWeightDecay': + params = network.trainable_params() + optimizer = AdamWeightDecay(params) + model = Model(network, optimizer=optimizer, amp_level="O3") + results = [] + cor_acc_labels = [] + det_acc_labels = [] + results = [] + det_acc_labels = [] + cor_acc_labels = [] + print("come in prediction...") + for _, data in tqdm(enumerate(dataset.create_dict_iterator())): + original_tokens, cor_y, cor_y_hat, det_y_hat, det_labels, batch_seq_len = model.predict(data['wrong_ids'], \ + data['original_tokens'], data['original_tokens_mask'], data['correct_tokens'], data['correct_tokens_mask'], \ + data['original_token_type_ids'], data['correct_token_type_ids']) + for src, tgt, predict, det_predict, det_label, seq_len in \ + zip(original_tokens, cor_y, cor_y_hat, det_y_hat, det_labels, batch_seq_len): + seq_len_ = int((seq_len[0] - 2).asnumpy().tolist()) + _src = src[1: seq_len_ + 1].asnumpy().tolist() + _tgt = tgt[1: seq_len_ + 1].asnumpy().tolist() + _predict = predict[1: seq_len_ + 1].asnumpy().tolist() + _det_predict = det_predict[1:seq_len_ + 1].asnumpy().tolist() + _det_label = det_label[1:seq_len_ + 1].asnumpy().tolist() + cor_acc_labels.append(1 if operator.eq(_tgt, _predict) else 0) + det_acc_labels.append(1 if operator.eq(_det_predict, _det_label) else 0) + results.append((_src, _tgt, _predict)) + compute_corrector_prf(results) + +def run_csc(): + """run csc task""" + parser = argparse.ArgumentParser(description="run csc") + parser.add_argument("--bert_ckpt", type=str, default="bert_base.ckpt") + parser.add_argument("--device_target", type=str, default="Ascend") + parser.add_argument("--name", type=str, default="SoftMaskedBertModel") + parser.add_argument("--device_id", type=int, default=0) + parser.add_argument("--hyper_params", type=float, default=0.8) + parser.add_argument("--eval_dataset", type=str, default="./datasets/csc/dev.json") + parser.add_argument("--baselr", type=float, default=0.00001) + parser.add_argument("--bias_lr_factor", type=int, default=2) + parser.add_argument("--weight_decay", type=float, default=5e-8) + parser.add_argument("--batch_size", type=int, default=36) + parser.add_argument("--max_epochs", type=int, default=100) + parser.add_argument("--accumulate_grad_batches", type=int, default=2) + parser.add_argument("--max_seq_len", type=int, 
default=bert_cfg.seq_length) #512 + parser.add_argument("--ckpt_dir", type=str, required=True) + parser.add_argument("--train_url", type=str, default="./datasets/csc") + parser.add_argument("--data_url", type=str, default="./datasets/csc/dev.json") # direction of the training dataset, such as s3://open-data/attachment/ + parser.add_argument("--enable_modelarts", type=bool, default=False) + parser.add_argument("--pynative", type=bool, default=False) + args_opt = parser.parse_args() + local_ckpt_dir = './weight/' + args_opt.bert_ckpt + + if args_opt.enable_modelarts: + import moxing as mox + # show paths + print("data_url") + print(args_opt.data_url) + print("train_url") + print(args_opt.train_url) + if mox.file.exists('/cache/dataset'): + ret = mox.file.list_directory('/cache/dataset', recursive=True) + print('/cache/dataseet: (recursive)') + print(ret) + cloud_data_url = args_opt.data_url + local_root_dir = '/home/work/user-job-dir' + local_data_dir = os.path.join(local_root_dir, "data") + local_dev_file_dir = os.path.join(local_data_dir, "SoftMask_test", "dev.json") + local_ckpt_dir = os.path.join(local_data_dir, "SoftMask_test", args_opt.bert_ckpt) + local_vocab_dir = os.path.join(local_data_dir, "SoftMask_test", "bert-base-chinese-vocab.txt") + local_model_dir = os.path.join(local_root_dir, "model") + if mox.file.exists(local_data_dir) is False: + mox.file.make_dirs(local_data_dir) + if mox.file.exists(local_model_dir) is False: + mox.file.make_dirs(local_model_dir) + mox.file.copy_parallel(cloud_data_url, local_data_dir) + print(local_data_dir + ":") + ret = mox.file.list_directory(local_data_dir, recursive=True) + print(ret) + ckpt_name = args_opt.ckpt_dir.split('/')[-1] + trained_ckpt_path = os.path.join(local_data_dir, "SoftMask_test", ckpt_name) + else: + local_ckpt_dir = './weight/' + args_opt.bert_ckpt + local_model_dir = './checkpoint' + local_data_dir = args_opt.data_url + local_dev_file_dir = local_data_dir + trained_ckpt_path = args_opt.ckpt_dir + # context setting + if args_opt.device_target == "Ascend": + if args_opt.enable_modelarts: + rank_id = int(os.environ["DEVICE_ID"]) + if args_opt.pynative: + context.set_context(mode=context.PYNATIVE_MODE, device_target="Ascend", device_id=rank_id) + args_opt.batch_size = 16 + else: + context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=rank_id) + init() + device_num = get_group_size() + context.set_auto_parallel_context(device_num=device_num, + gradients_mean=True, + parallel_mode=context.ParallelMode.DATA_PARALLEL) + else: + device_id = args_opt.device_id + device_num = 1 + if args_opt.pynative: + context.set_context(mode=context.PYNATIVE_MODE, device_target="Ascend", device_id=device_id) + args_opt.batch_size = 16 + else: + context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=device_id) + else: + raise Exception("Only support on Ascend currently.") + netwithloss = SoftMaskedBertCLS(args_opt.batch_size, is_training=False, load_checkpoint_path=local_ckpt_dir) + netwithloss.set_train(False) + if args_opt.enable_modelarts: + tokenizer = CscTokenizer(fp=local_dev_file_dir, device_num=device_num, rank_id=rank_id, \ + max_seq_len=args_opt.max_seq_len, vocab_path=local_vocab_dir) + else: + tokenizer = CscTokenizer(fp=local_dev_file_dir, device_num=device_num, rank_id=device_id, \ + max_seq_len=args_opt.max_seq_len) + ds_eval = tokenizer.get_token_ids(args_opt.batch_size) + do_eval(ds_eval, netwithloss, trained_ckpt_path=trained_ckpt_path) + +if __name__ == "__main__": + run_csc() diff 
--git a/research/nlp/soft_masked_bert/export.py b/research/nlp/soft_masked_bert/export.py new file mode 100644 index 0000000000000000000000000000000000000000..8ae3e5eb750925c09920cda70c71ac883046a208 --- /dev/null +++ b/research/nlp/soft_masked_bert/export.py @@ -0,0 +1,49 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""export mindir file""" +import argparse +import numpy as np +import mindspore as ms +from mindspore import context +from mindspore import export +from mindspore.train.serialization import load_checkpoint +from mindspore.train.serialization import load_param_into_net +from src.soft_masked_bert import SoftMaskedBertCLS + +def run_csc(): + """run csc task""" + parser = argparse.ArgumentParser(description="run csc") + parser.add_argument("--device", type=str, default="Ascend") + parser.add_argument("--device_id", type=int, default=0) + parser.add_argument("--batch_size", type=int, default=2) + parser.add_argument("--bert_ckpt", type=str, required=True) + parser.add_argument("--ckpt_dir", type=str, required=True) + parser.add_argument("--O3", default=False, action='store_true') + args_opt = parser.parse_args() + # context setting + if args_opt.device == "Ascend": + context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args_opt.device_id) + else: + raise Exception("Only support on Ascend currently.") + ckpt_path = './weight/' + args_opt.bert_ckpt + netwithloss = SoftMaskedBertCLS(args_opt.batch_size, is_training=False, \ + if_O3=args_opt.O3, load_checkpoint_path=ckpt_path) + param_dict = load_checkpoint(args_opt.ckpt_dir) + load_param_into_net(netwithloss, param_dict) + t = ms.Tensor(np.ones([2, 512]).astype(np.int32)) + input1 = [t, t, t, t, t, t, t] + export(netwithloss, *input1, file_name='smb', file_format='MINDIR') +if __name__ == "__main__": + run_csc() diff --git a/research/nlp/soft_masked_bert/postprocess.py b/research/nlp/soft_masked_bert/postprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..60d3411e2ccabb3de0c0923c53498c3c71ff4280 --- /dev/null +++ b/research/nlp/soft_masked_bert/postprocess.py @@ -0,0 +1,79 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +''' +postprocess script. 
+''' + +import os +import argparse +import numpy as np +from src.utils import compute_corrector_prf, compute_sentence_level_prf + +parser = argparse.ArgumentParser(description="postprocess") +parser.add_argument("--batch_size", type=int, default=2, help="Eval batch size, default is 2") +parser.add_argument("--result_dir_00", type=str, default="./result_files/result_00", help="infer result Files") +parser.add_argument("--result_dir_01", type=str, default="./result_files/result_01", help="infer result Files") +parser.add_argument("--result_dir_02", type=str, default="./result_files/result_02", help="infer result Files") +parser.add_argument("--result_dir_03", type=str, default="./result_files/result_03", help="infer result Files") +parser.add_argument("--result_dir_04", type=str, default="./result_files/result_04", help="infer result Files") +parser.add_argument("--result_dir_05", type=str, default="./result_files/result_05", help="infer result Files") + +args, _ = parser.parse_known_args() + +if __name__ == "__main__": + file_names = os.listdir(args.result_dir_00) + results = [] + max_seq_len = 512 + min_seq_len = 1 + for f in file_names: + f_name_00 = os.path.join(args.result_dir_00, f) + f_name_01 = os.path.join(args.result_dir_01, f) + f_name_02 = os.path.join(args.result_dir_02, f) + f_name_03 = os.path.join(args.result_dir_03, f) + f_name_04 = os.path.join(args.result_dir_04, f) + f_name_05 = os.path.join(args.result_dir_05, f) + + result_00 = np.fromfile(f_name_00, np.float32) + result_01 = np.fromfile(f_name_01, np.float32) + result_02 = np.fromfile(f_name_02, np.float32) + result_03 = np.fromfile(f_name_03, np.float32) + result_04 = np.fromfile(f_name_04, np.float32) + result_05 = np.fromfile(f_name_05, np.float32) + # (2, 512) + result_00 = result_00.reshape(args.batch_size, max_seq_len) + result_01 = result_01.reshape(args.batch_size, max_seq_len) + result_02 = result_02.reshape(args.batch_size, max_seq_len) + result_03 = result_03.reshape(args.batch_size, max_seq_len) + result_04 = result_04.reshape(args.batch_size, max_seq_len) + result_05 = result_05.reshape(args.batch_size, min_seq_len) + for i in range(args.batch_size): + original_tokens = result_00[i].reshape(1, result_00[i].size) + cor_y = result_01[i].reshape(1, result_01[i].size) + cor_y_hat = result_02[i].reshape(1, result_02[i].size) + det_y_hat = result_03[i].reshape(1, result_03[i].size) + det_labels = result_04[i].reshape(1, result_04[i].size) + batch_seq_len = result_05[i].reshape(1, result_05[i].size) + for src, tgt, predict, det_predict, det_label, seq_len in zip(original_tokens, cor_y, cor_y_hat, det_y_hat, + det_labels, batch_seq_len): + # src: incorrect original, tgt: correct article word segmentation, ids predict: model predicted word segmentation, ids det_predict: predicted, det det_label: DET label + seq_len_ = int(seq_len[0] - 2) + _src = src[1: seq_len_ + 1].tolist() + _tgt = tgt[1: seq_len_ + 1].tolist() + _predict = predict[1: seq_len_ + 1].tolist() + results.append((_src, _tgt, _predict,)) + + compute_corrector_prf(results) + compute_sentence_level_prf(results) diff --git a/research/nlp/soft_masked_bert/preprocess.py b/research/nlp/soft_masked_bert/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..cab6081822bb1dc5d73c35a6728c96e49ee1129f --- /dev/null +++ b/research/nlp/soft_masked_bert/preprocess.py @@ -0,0 +1,85 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in 
compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +''' +Ernie preprocess script. +''' + +import os +import argparse +from src.tokenization import CscTokenizer + +def parse_args(): + """set and check parameters.""" + parser = argparse.ArgumentParser(description="soft-masked bert preprocess") + parser.add_argument("--eval_data_shuffle", type=str, default="false", choices=["true", "false"], + help="Enable eval data shuffle, default is false") + parser.add_argument("--eval_batch_size", type=int, default=2, help="Eval batch size, default is 2") + parser.add_argument("--eval_data_file_path", type=str, default="./dataset/dev.json", + help="Data path, it is better to use absolute path") + parser.add_argument('--result_path', type=str, default='./preprocess_result/', help='result path') + parser.add_argument('--device_num', type=int, default=1, help='device num') + parser.add_argument('--rank_id', type=int, default=0, help='rank id') + parser.add_argument('--vocab_path', type=str, default='./src/bert-base-chinese-vocab.txt', help='vocab path') + args_opt = parser.parse_args() + return args_opt + + +if __name__ == "__main__": + args = parse_args() + tokenizer = CscTokenizer(fp=args.eval_data_file_path, device_num=args.device_num, rank_id=args.rank_id, \ + max_seq_len=512, vocab_path=args.vocab_path) + ds = tokenizer.get_token_ids(args.eval_batch_size) + print(ds.dataset_size) + wrong_ids_path = os.path.join(args.result_path, "00_data") + original_tokens_path = os.path.join(args.result_path, "01_data") + original_tokens_mask_path = os.path.join(args.result_path, "02_data") + correct_tokens_path = os.path.join(args.result_path, "03_data") + correct_tokens_mask_path = os.path.join(args.result_path, "04_data") + original_token_type_ids_path = os.path.join(args.result_path, "05_data") + correct_token_type_ids_path = os.path.join(args.result_path, "06_data") + os.makedirs(wrong_ids_path) + os.makedirs(original_tokens_path) + os.makedirs(original_tokens_mask_path) + os.makedirs(correct_tokens_path) + os.makedirs(correct_tokens_mask_path) + os.makedirs(original_token_type_ids_path) + os.makedirs(correct_token_type_ids_path) + + for idx, data in enumerate(ds.create_dict_iterator(output_numpy=True, num_epochs=1)): + wrong_ids = data["wrong_ids"] + original_tokens = data["original_tokens"] + original_tokens_mask = data["original_tokens_mask"] + correct_tokens = data["correct_tokens"] + correct_tokens_mask = data["correct_tokens_mask"] + original_token_type_ids = data["original_token_type_ids"] + correct_token_type_ids = data["correct_token_type_ids"] + + file_name = "batch_" + str(args.eval_batch_size) + "_" + str(idx) + ".bin" + wrong_ids_file_path = os.path.join(wrong_ids_path, file_name) + wrong_ids.tofile(wrong_ids_file_path) + original_tokens_file_path = os.path.join(original_tokens_path, file_name) + original_tokens.tofile(original_tokens_file_path) + original_tokens_mask_file_path = os.path.join(original_tokens_mask_path, file_name) + original_tokens_mask.tofile(original_tokens_mask_file_path) + correct_tokens_file_path = 
os.path.join(correct_tokens_path, file_name) + correct_tokens.tofile(correct_tokens_file_path) + correct_tokens_mask_file_path = os.path.join(correct_tokens_mask_path, file_name) + correct_tokens_mask.tofile(correct_tokens_mask_file_path) + original_token_type_ids_file_path = os.path.join(original_token_type_ids_path, file_name) + original_token_type_ids.tofile(original_token_type_ids_file_path) + correct_token_type_ids_file_path = os.path.join(correct_token_type_ids_path, file_name) + correct_token_type_ids.tofile(correct_token_type_ids_file_path) + print("=" * 20, "export bin files finished", "=" * 20) diff --git a/research/nlp/soft_masked_bert/preprocess_dataset.py b/research/nlp/soft_masked_bert/preprocess_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..a4720e2aaecffec36aeb3da132d1814d17ec0feb --- /dev/null +++ b/research/nlp/soft_masked_bert/preprocess_dataset.py @@ -0,0 +1,220 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +import json +import gc +import os +import random +import opencc +from lxml import etree +from tqdm import tqdm + + +def get_main_dir(): + return os.path.join(os.path.dirname(__file__)) + +def get_abs_path(*name): + fn = os.path.join(*name) + if os.path.isabs(fn): + return fn + return os.path.abspath(os.path.join(get_main_dir(), fn)) + +def dump_json(obj, fp): + fp = os.path.abspath(fp) + if not os.path.exists(os.path.dirname(fp)): + os.makedirs(os.path.dirname(fp)) + with open(fp, 'w', encoding='utf8') as f: + json.dump(obj, f, ensure_ascii=False, indent=4, separators=(',', ':')) + print(f'file is saved successfully, {fp}') + return True + + +def proc_item(item, converter): + root = etree.XML(item) + passages = dict() + mistakes = [] + for passage in root.xpath('/ESSAY/TEXT/PASSAGE'): + passages[passage.get('id')] = converter.convert(passage.text) + for mistake in root.xpath('/ESSAY/MISTAKE'): + mistakes.append({'id': mistake.get('id'), + 'location': int(mistake.get('location')) - 1, + 'wrong': converter.convert(mistake.xpath('./WRONG/text()')[0].strip()), + 'correction': converter.convert(mistake.xpath('./CORRECTION/text()')[0].strip())}) + + rst_items = dict() + + def get_passages_by_id(pgs, _id): + p = pgs.get(_id) + if p: + return p + _id = _id[:-1] + str(int(_id[-1]) + 1) + p = pgs.get(_id) + if p: + return p + raise ValueError(f'passage not found by {_id}') + + for mistake in mistakes: + if mistake['id'] not in rst_items.keys(): + rst_items[mistake['id']] = {'original_text': get_passages_by_id(passages, mistake['id']), + 'wrong_ids': [], + 'correct_text': get_passages_by_id(passages, mistake['id'])} + ori_text = rst_items[mistake['id']]['original_text'] + cor_text = rst_items[mistake['id']]['correct_text'] + if len(ori_text) == len(cor_text): + if ori_text[mistake['location']] in mistake['wrong']: + rst_items[mistake['id']]['wrong_ids'].append(mistake['location']) + wrong_char_idx = 
mistake['wrong'].index(ori_text[mistake['location']]) + start = mistake['location'] - wrong_char_idx + end = start + len(mistake['wrong']) + rst_items[mistake['id']][ + 'correct_text'] = f'{cor_text[:start]}{mistake["correction"]}{cor_text[end:]}' + else: + print(f'{mistake["id"]}\n{ori_text}\n{cor_text}') + rst = [] + for k in rst_items: + if len(rst_items[k]['correct_text']) == len(rst_items[k]['original_text']): + rst.append({'id': k, **rst_items[k]}) + else: + text = rst_items[k]['correct_text'] + rst.append({'id': k, 'correct_text': text, 'original_text': text, 'wrong_ids': []}) + return rst + +def proc_test_set(fp, converter): + """ + Generate the SIGHAN15 test set + Args: + fp: + converter: + Returns: + """ + inputs = dict() + with open(os.path.join(fp, 'SIGHAN15_CSC_TestInput.txt'), 'r', encoding='utf8') as f: + for line in f: + pid = line[5:14] + text = line[16:].strip() + inputs[pid] = text + rst = [] + with open(os.path.join(fp, 'SIGHAN15_CSC_TestTruth.txt'), 'r', encoding='utf8') as f: + for line in f: + pid = line[0:9] + mistakes = line[11:].strip().split(', ') + if len(mistakes) <= 1: + text = converter.convert(inputs[pid]) + rst.append({'id': pid, + 'original_text': text, + 'wrong_ids': [], + 'correct_text': text}) + else: + wrong_ids = [] + original_text = inputs[pid] + cor_text = inputs[pid] + for i in range(len(mistakes) // 2): + idx = int(mistakes[2 * i]) - 1 + cor_char = mistakes[2 * i + 1] + wrong_ids.append(idx) + cor_text = f'{cor_text[:idx]}{cor_char}{cor_text[idx + 1:]}' + original_text = converter.convert(original_text) + cor_text = converter.convert(cor_text) + if len(original_text) != len(cor_text): + print(pid) + print(original_text) + print(cor_text) + continue + rst.append({'id': pid, + 'original_text': original_text, + 'wrong_ids': wrong_ids, + 'correct_text': cor_text}) + return rst + +def read_data(fp): + for fn in os.listdir(fp): + if fn.endswith('ing.sgml'): + with open(os.path.join(fp, fn), 'r', encoding='utf-8', errors='ignore') as f: + item = [] + for line in f: + if line.strip().startswith('<ESSAY') and item: + yield ''.join(item) + item = [line.strip()] + elif line.strip().startswith('<'): + item.append(line.strip()) + + +def read_confusion_data(fp): + fn = os.path.join(fp, 'train.sgml') + with open(fn, 'r', encoding='utf8') as f: + item = [] + for line in tqdm(f): + if line.strip().startswith('<SENT') and item: + yield ''.join(item) + item = [line.strip()] + elif line.strip().startswith('<'): + item.append(line.strip()) + + +def proc_confusion_item(item): + """ + Process the Confusionset dataset + Args: + item: + Returns: + """ + root = etree.XML(item) + text = root.xpath('/SENTENCE/TEXT/text()')[0] + mistakes = [] + for mistake in root.xpath('/SENTENCE/MISTAKE'): + mistakes.append({'location': int(mistake.xpath('./LOCATION/text()')[0]) - 1, + 'wrong': mistake.xpath('./WRONG/text()')[0].strip(), + 'correction': mistake.xpath('./CORRECTION/text()')[0].strip()}) + + cor_text = text + wrong_ids = [] + for mis in mistakes: + cor_text = f'{cor_text[:mis["location"]]}{mis["correction"]}{cor_text[mis["location"] + 1:]}' + wrong_ids.append(mis['location']) + rst = [{ + 'id': '-', + 'original_text': text, + 'wrong_ids': wrong_ids, + 'correct_text': cor_text + }] + if len(text) != len(cor_text): + return [{'id': '--', + 'original_text': cor_text, + 'wrong_ids': [], + 'correct_text': cor_text}] + if random.random() < 0.01: + rst.append({'id': '--', + 'original_text': cor_text, + 'wrong_ids': [], + 'correct_text': cor_text}) + return rst + +def preproc(): + 
rst_items = [] + converter = opencc.OpenCC('tw2sp.json') + for item in read_data(get_abs_path('datasets', 'csc')): + rst_items += proc_item(item, converter) + for item in read_confusion_data(get_abs_path('datasets', 'csc')): + rst_items += proc_confusion_item(item) + # Split train and test + dev_set_len = len(rst_items) // 10 + print(len(rst_items)) + random.seed(666) + random.shuffle(rst_items) + dump_json(rst_items[:dev_set_len], get_abs_path('datasets', 'csc', 'dev.json')) + dump_json(rst_items[dev_set_len:], get_abs_path('datasets', 'csc', 'train.json')) + gc.collect() + +if __name__ == '__main__': + preproc() diff --git a/research/nlp/soft_masked_bert/requirement.txt b/research/nlp/soft_masked_bert/requirement.txt new file mode 100644 index 0000000000000000000000000000000000000000..e7ba69d707cc5cc8717fc1f3e14a0ed53edb80a8 --- /dev/null +++ b/research/nlp/soft_masked_bert/requirement.txt @@ -0,0 +1,33 @@ +asttokens==2.0.5 +attrs==21.4.0 +certifi==2022.5.18.1 +charset-normalizer==2.0.12 +decorator==5.1.1 +easydict==1.9 +filelock==3.7.1 +huggingface-hub==0.7.0 +idna==3.3 +importlib-metadata==4.11.4 +mpmath==1.2.1 +numpy==1.21.6 +packaging==21.3 +Pillow==9.1.1 +pkg_resources==0.0.0 +protobuf==4.21.1 +psutil==5.9.1 +pyparsing==3.0.9 +PyYAML==6.0 +regex==2022.6.2 +requests==2.28.0 +scipy==1.7.3 +six==1.16.0 +sympy==1.10.1 +tokenizers==0.12.1 +torch==1.11.0 +tqdm==4.64.0 +transformers==4.19.4 +typing_extensions==4.2.0 +urllib3==1.26.9 +zipp==3.8.0 +opencc +lxml diff --git a/research/nlp/soft_masked_bert/scripts/run_distribute_train.sh b/research/nlp/soft_masked_bert/scripts/run_distribute_train.sh new file mode 100644 index 0000000000000000000000000000000000000000..be0c54b60bc81d25de18903a3a836aeb6262e95b --- /dev/null +++ b/research/nlp/soft_masked_bert/scripts/run_distribute_train.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +DIR="$(cd "$(dirname "$0")" && pwd)" + +# help message +if [ $# != 4 ]; then + echo "Usage: bash scripts/run_distribute_train.sh [rank_size] [rank_start_id] [rank_table_file] [bert_ckpt]" + exit 1 +fi + +ulimit -c unlimited +ulimit -n 65530 +export SLOG_PRINT_TO_STDOUT=0 +export RANK_SIZE=$1 +export RANK_START_ID=$2 +export RANK_TABLE_FILE=$3 +export BERT_CKPT=$4 + +rm -rf $DIR/output_distribute +mkdir $DIR/output_distribute + +for ((i = 0; i <= $RANK_SIZE - 1; i++)); do + export RANK_ID=${i} + export DEVICE_ID=$((i + RANK_START_ID)) + echo 'start rank='${i}', device id='${DEVICE_ID}'...' 
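+    # One training process is launched per RANK_ID; its device is RANK_START_ID + i and its
+    # output is redirected to output_distribute/device${DEVICE_ID}_log.txt below.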
+ if [ -d $DIR/output_distribute/device${DEVICE_ID} ]; then + rm -rf $DIR/output_distribute/device${DEVICE_ID} + fi + mkdir $DIR/output_distribute/device${DEVICE_ID} + + nohup python train.py \ + --device_id ${DEVICE_ID} --bert_ckpt ${BERT_CKPT} --rank_size ${RANK_SIZE} >$DIR/output_distribute/device${DEVICE_ID}_log.txt 2>&1 & +done diff --git a/research/nlp/soft_masked_bert/scripts/run_eval.sh b/research/nlp/soft_masked_bert/scripts/run_eval.sh new file mode 100644 index 0000000000000000000000000000000000000000..0591eede8eb41c43a0b766b34a70c5bee02e931d --- /dev/null +++ b/research/nlp/soft_masked_bert/scripts/run_eval.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +DIR="$(cd "$(dirname "$0")" && pwd)" + +# help message +if [ $# != 2 ]; then + echo "Usage: bash scripts/run_eval.sh [bert_ckpt] [ckpt_dir]" + exit 1 +fi + +rm -rf $DIR/output_eval +mkdir $DIR/output_eval + +export BERT_CKPT=$1 +export CKPT_DIR=$2 + +nohup python eval.py --bert_ckpt ${BERT_CKPT} --ckpt_dir ${CKPT_DIR} >$DIR/output_eval/eval_log.txt 2>&1 & diff --git a/research/nlp/soft_masked_bert/scripts/run_infer_310.sh b/research/nlp/soft_masked_bert/scripts/run_infer_310.sh new file mode 100644 index 0000000000000000000000000000000000000000..67c91a88c47b82cbc1b3aaff7ba2e45bcd0e2582 --- /dev/null +++ b/research/nlp/soft_masked_bert/scripts/run_infer_310.sh @@ -0,0 +1,138 @@ +#!/bin/bash +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +if [[ $# -lt 3 || $# -gt 4 ]]; then + echo "Usage: bash scripts/run_infer_310.sh [MINDIR_PATH] [DATA_FILE_PATH] [NEED_PREPROCESS] [DEVICE_ID] + MINDIR_PATH is necessary which means the directory of the model file. + DATA_FILE_PATH is necessary which means the directory of the input data. + NEED_PREPROCESS is necessary which means weather need preprocess or not, it's value is 'y' or 'n'. + DEVICE_ID is optional, it can be set by environment variable device_id, otherwise the value is zero. 
+ Example: bash scripts/run_infer_310.sh ./checkpoint/smb.mindir ./dataset/dev_json y 0" +exit 1 +fi + +get_real_path(){ + if [ "${1:0:1}" == "/" ]; then + echo "$1" + else + echo "$(realpath -m $PWD/$1)" + fi +} +model=$(get_real_path $1) +eval_data_file_path=$(get_real_path $2) + +if [ "$3" == "y" ] || [ "$3" == "n" ];then + need_preprocess=$3 +else + echo "weather need preprocess or not, it's value must be in [y, n]" + exit 1 +fi + +device_id=0 +if [ $# == 4 ]; then + device_id=$4 +fi + + +echo "mindir path: "$model +echo "eval_data_file_path: "$eval_data_file_path +echo "need preprocess: "$need_preprocess +echo "device id: "$device_id + + +export ASCEND_HOME=/usr/local/Ascend/ +if [ -d ${ASCEND_HOME}/ascend-toolkit ]; then + export PATH=$ASCEND_HOME/fwkacllib/bin:$ASCEND_HOME/fwkacllib/ccec_compiler/bin:$ASCEND_HOME/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin:$ASCEND_HOME/ascend-toolkit/latest/atc/bin:$PATH + export LD_LIBRARY_PATH=$ASCEND_HOME/fwkacllib/lib64:/usr/local/lib:$ASCEND_HOME/ascend-toolkit/latest/atc/lib64:$ASCEND_HOME/ascend-toolkit/latest/fwkacllib/lib64:$ASCEND_HOME/driver/lib64:$ASCEND_HOME/add-ons:$LD_LIBRARY_PATH + export TBE_IMPL_PATH=$ASCEND_HOME/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe + export PYTHONPATH=$ASCEND_HOME/fwkacllib/python/site-packages:${TBE_IMPL_PATH}:$ASCEND_HOME/ascend-toolkit/latest/fwkacllib/python/site-packages:$PYTHONPATH + export ASCEND_OPP_PATH=$ASCEND_HOME/ascend-toolkit/latest/opp +else + export ASCEND_HOME=/usr/local/Ascend/latest/ + export PATH=$ASCEND_HOME/fwkacllib/bin:$ASCEND_HOME/fwkacllib/ccec_compiler/bin:$ASCEND_HOME/atc/ccec_compiler/bin:$ASCEND_HOME/atc/bin:$PATH + export LD_LIBRARY_PATH=$ASCEND_HOME/fwkacllib/lib64:/usr/local/lib:$ASCEND_HOME/atc/lib64:$ASCEND_HOME/acllib/lib64:$ASCEND_HOME/driver/lib64:$ASCEND_HOME/add-ons:$LD_LIBRARY_PATH + export PYTHONPATH=$ASCEND_HOME/fwkacllib/python/site-packages:$ASCEND_HOME/atc/python/site-packages:$PYTHONPATH + export ASCEND_OPP_PATH=$ASCEND_HOME/opp +fi + +function preprocess_data() +{ + if [ -d preprocess_result ]; then + rm -rf ./preprocess_result + fi + mkdir preprocess_result + python preprocess.py --eval_data_file_path=$eval_data_file_path --result_path=./preprocess_result/ +} + +function compile_app() +{ + cd ./ascend310_infer || exit + if [ -f "Makefile" ]; then + make clean + fi + bash build.sh &> build.log +} + +function infer() +{ + cd .. || exit + if [ -d result_files ]; then + rm -rf ./result_files + fi + if [ -d time_result ]; then + rm -rf ./time_result + fi + mkdir result_files + mkdir time_result + mkdir ./result_files/result_00 + mkdir ./result_files/result_01 + mkdir ./result_files/result_02 + mkdir ./result_files/result_03 + mkdir ./result_files/result_04 + mkdir ./result_files/result_05 + + ./ascend310_infer/softmaskedbert --mindir_path=$model --input0_path=./preprocess_result/00_data --input1_path=./preprocess_result/01_data --input2_path=./preprocess_result/02_data --input3_path=./preprocess_result/03_data --input4_path=./preprocess_result/04_data --input5_path=./preprocess_result/05_data --input6_path=./preprocess_result/06_data --device_id=$device_id &> infer.log + +} + +function cal_acc() +{ + python ./postprocess.py --result_dir_00=./result_files/result_00 --result_dir_01=./result_files/result_01 --result_dir_02=./result_files/result_02 --result_dir_03=./result_files/result_03 --result_dir_04=./result_files/result_04 --result_dir_05=./result_files/result_05 &> acc.log + +} + +if [ $need_preprocess == "y" ]; then + preprocess_data + if [ $? 
-ne 0 ]; then + echo "preprocess dataset failed" + exit 1 + fi +fi +compile_app +if [ $? -ne 0 ]; then + echo "compile app code failed" + exit 1 +fi +infer +if [ $? -ne 0 ]; then + echo " execute inference failed" + exit 1 +fi +cal_acc +if [ $? -ne 0 ]; then + echo "calculate accuracy failed" + exit 1 +fi diff --git a/research/nlp/soft_masked_bert/scripts/run_standalone_train.sh b/research/nlp/soft_masked_bert/scripts/run_standalone_train.sh new file mode 100644 index 0000000000000000000000000000000000000000..b4ff80521178505c170de81bef6db91f70dbb558 --- /dev/null +++ b/research/nlp/soft_masked_bert/scripts/run_standalone_train.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +if [ $# != 3 ] +then + echo "Usage: bash scripts/run_standalone_train.sh [BERT_CKPT] [DEVICE_ID] [PYNATIVE]" +exit 1 +fi + +DIR="$(cd "$(dirname "$0")" && pwd)" + +rm -rf $DIR/output_standalone +mkdir $DIR/output_standalone + +nohup python train.py --bert_ckpt $1 --device_id $2 --pynative $3 >>$DIR/output_standalone/device_log.txt 2>&1 & diff --git a/research/nlp/soft_masked_bert/src/__init__.py b/research/nlp/soft_masked_bert/src/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..602527cd720c8d268599dbaef190ba1cf1eb6f2b --- /dev/null +++ b/research/nlp/soft_masked_bert/src/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ diff --git a/research/nlp/soft_masked_bert/src/bert-base-chinese-config.json b/research/nlp/soft_masked_bert/src/bert-base-chinese-config.json new file mode 100644 index 0000000000000000000000000000000000000000..a521dc2845bdddbe822864290c6b928396fc5ee8 --- /dev/null +++ b/research/nlp/soft_masked_bert/src/bert-base-chinese-config.json @@ -0,0 +1,25 @@ +{ + "architectures": [ + "BertForMaskedLM" + ], + "attention_probs_dropout_prob": 0.1, + "directionality": "bidi", + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "model_type": "bert", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 0, + "pooler_fc_size": 768, + "pooler_num_attention_heads": 12, + "pooler_num_fc_layers": 3, + "pooler_size_per_head": 128, + "pooler_type": "first_token_transform", + "type_vocab_size": 2, + "vocab_size": 21128 +} diff --git a/research/nlp/soft_masked_bert/src/bert_model.py b/research/nlp/soft_masked_bert/src/bert_model.py new file mode 100644 index 0000000000000000000000000000000000000000..b7a3e39bfe9bb209af131f303bdda6baecd08cfc --- /dev/null +++ b/research/nlp/soft_masked_bert/src/bert_model.py @@ -0,0 +1,929 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Bert model.""" + +import math +import copy +import numpy as np +import mindspore.common.dtype as mstype +import mindspore.nn as nn +import mindspore.ops.functional as F +from mindspore.common.initializer import TruncatedNormal, initializer +from mindspore.ops import operations as P +from mindspore.ops import composite as C +from mindspore.common.tensor import Tensor +from mindspore.common.parameter import Parameter +import mindspore.ops as ops +from mindspore.train.serialization import load_param_into_net + +class BertOnlyMLMHead(nn.Cell): + def __init__(self, config, param_dict, pretrained): + super().__init__() + predictions = BertLMPredictionHead(config) + if pretrained is True: + load_param_into_net(predictions, param_dict) # load parameters except cls2.dense.weight and cls2.dense.bias + self.predictions = predictions + + def construct(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + +class CLS2(nn.Cell): + def __init__(self, config): + super(CLS2, self).__init__() + self.decoder = nn.Dense(config.hidden_size, config.vocab_size, bias_init="zeros", has_bias=False) + def construct(self, hidden_states): + return self.decoder(hidden_states) + +class BertLMPredictionHead(nn.Cell): + def __init__(self, config): + super().__init__() + self.cls1 = BertPredictionHeadTransform(config) + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
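+        # Note: CLS2.decoder below is created with has_bias=False, so despite the comment
+        # above no per-token output bias parameter is actually kept in this port.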
+ self.cls2 = CLS2(config) + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + + def construct(self, hidden_states): + hidden_states = self.cls1(hidden_states) + hidden_states = self.cls2(hidden_states) + return hidden_states + +class BertPredictionHeadTransform(nn.Cell): + def __init__(self, config): + super().__init__() + self.dense = nn.Dense(config.hidden_size, config.hidden_size) + self.gule = nn.GELU() + self.layernorm = nn.LayerNorm([config.hidden_size], epsilon=config.layer_norm_eps) + + def construct(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.gule(hidden_states) + hidden_states = self.layernorm(hidden_states) + return hidden_states + + + +class BertConfig: + """ + Configuration for `BertModel`. + + Args: + seq_length (int): Length of input sequence. Default: 128. + vocab_size (int): The shape of each embedding vector. Default: 32000. + hidden_size (int): Size of the bert encoder layers. Default: 768. + num_hidden_layers (int): Number of hidden layers in the BertTransformer encoder + cell. Default: 12. + num_attention_heads (int): Number of attention heads in the BertTransformer + encoder cell. Default: 12. + intermediate_size (int): Size of intermediate layer in the BertTransformer + encoder cell. Default: 3072. + hidden_act (str): Activation function used in the BertTransformer encoder + cell. Default: "gelu". + hidden_dropout_prob (float): The dropout probability for BertOutput. Default: 0.1. + attention_probs_dropout_prob (float): The dropout probability for + BertAttention. Default: 0.1. + max_position_embeddings (int): Maximum length of sequences used in this + model. Default: 512. + type_vocab_size (int): Size of token type vocab. Default: 16. + initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. + use_relative_positions (bool): Specifies whether to use relative positions. Default: False. + dtype (:class:`mindspore.dtype`): Data type of the input. Default: mstype.float32. + compute_type (:class:`mindspore.dtype`): Compute type in BertTransformer. Default: mstype.float32. + """ + def __init__(self, + seq_length=128, + vocab_size=32000, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + initializer_range=0.02, + use_relative_positions=False, + dtype=mstype.float32, + compute_type=mstype.float32, + pad_token_id=0, + layer_norm_eps=1e-12): + self.seq_length = seq_length + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.use_relative_positions = use_relative_positions + self.dtype = dtype + self.compute_type = compute_type + self.pad_token_id = pad_token_id + self.layer_norm_eps = layer_norm_eps + + +class EmbeddingLookup(nn.Cell): + """ + A embeddings lookup table with a fixed dictionary and size. + + Args: + vocab_size (int): Size of the dictionary of embeddings. + embedding_size (int): The size of each embedding vector. 
+ embedding_shape (list): [batch_size, seq_length, embedding_size], the shape of + each embedding vector. + use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. + initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. + """ + def __init__(self, + vocab_size, + embedding_size, + embedding_shape, + use_one_hot_embeddings=False, + initializer_range=0.02): + super(EmbeddingLookup, self).__init__() + self.vocab_size = vocab_size + self.use_one_hot_embeddings = use_one_hot_embeddings + self.embedding_table = Parameter(initializer + (TruncatedNormal(initializer_range), + [vocab_size, embedding_size]), + name='embedding_table') + self.expand = P.ExpandDims() + self.shape_flat = (-1,) + # self.gather = P.GatherV2() + self.gather = P.Gather() + self.one_hot = P.OneHot() + self.on_value = Tensor(1.0, mstype.float32) + self.off_value = Tensor(0.0, mstype.float32) + self.array_mul = P.MatMul() + self.reshape = P.Reshape() + self.shape = tuple(embedding_shape) + + def construct(self, input_ids): + """Get output and embeddings lookup table""" + extended_ids = self.expand(input_ids, -1) + flat_ids = self.reshape(extended_ids, self.shape_flat) + if self.use_one_hot_embeddings: + one_hot_ids = self.one_hot(flat_ids, self.vocab_size, self.on_value, self.off_value) + output_for_reshape = self.array_mul( + one_hot_ids, self.embedding_table) + else: + output_for_reshape = self.gather(self.embedding_table, flat_ids, 0) + output = self.reshape(output_for_reshape, self.shape) + return output, self.embedding_table + +class EmbeddingPostprocessor(nn.Cell): + """ + Postprocessors apply positional and token type embeddings to word embeddings. + + Args: + embedding_size (int): The size of each embedding vector. + embedding_shape (list): [batch_size, seq_length, embedding_size], the shape of + each embedding vector. + use_token_type (bool): Specifies whether to use token type embeddings. Default: False. + token_type_vocab_size (int): Size of token type vocab. Default: 16. + use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. + initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. + max_position_embeddings (int): Maximum length of sequences used in this + model. Default: 512. + dropout_prob (float): The dropout probability. Default: 0.1. 
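+
+    Note:
+        Position ids are the fixed range [0, seq_length) derived from `embedding_shape`;
+        token type embeddings and (when `use_relative_positions` is False) position
+        embeddings are added to the word embeddings, followed by LayerNorm and dropout.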
+ """ + def __init__(self, + embedding_size, + embedding_shape, + use_relative_positions=False, + use_token_type=False, + token_type_vocab_size=16, + use_one_hot_embeddings=False, + initializer_range=0.02, + max_position_embeddings=512, + dropout_prob=0.1): + super(EmbeddingPostprocessor, self).__init__() + self.use_token_type = use_token_type + self.token_type_vocab_size = token_type_vocab_size + self.use_one_hot_embeddings = use_one_hot_embeddings + self.max_position_embeddings = max_position_embeddings + self.token_type_embedding = nn.Embedding( + vocab_size=token_type_vocab_size, + embedding_size=embedding_size, + use_one_hot=use_one_hot_embeddings) + self.shape_flat = (-1,) + self.one_hot = P.OneHot() + self.on_value = Tensor(1.0, mstype.float32) + self.off_value = Tensor(0.1, mstype.float32) + self.array_mul = P.MatMul() + self.reshape = P.Reshape() + self.shape = tuple(embedding_shape) + self.dropout = nn.Dropout(1 - dropout_prob) + self.gather = P.Gather() + self.use_relative_positions = use_relative_positions + self.slice = P.StridedSlice() + _, seq, _ = self.shape + self.full_position_embedding = nn.Embedding( + vocab_size=max_position_embeddings, + embedding_size=embedding_size, + use_one_hot=False) + self.layernorm = nn.LayerNorm((embedding_size,)) + self.position_ids = Tensor(np.arange(seq).reshape(-1, seq).astype(np.int32)) + self.add = P.Add() + + def construct(self, token_type_ids, word_embeddings): + """Postprocessors apply positional and token type embeddings to word embeddings.""" + output = word_embeddings + if self.use_token_type: + token_type_embeddings = self.token_type_embedding(token_type_ids) + output = self.add(output, token_type_embeddings) + if not self.use_relative_positions: + position_embeddings = self.full_position_embedding(self.position_ids) + output = self.add(output, position_embeddings) + output = self.layernorm(output) + output = self.dropout(output) + return output + +class BertOutput(nn.Cell): + """ + Apply a linear computation to hidden status and a residual computation to input. + + Args: + in_channels (int): Input channels. + out_channels (int): Output channels. + initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. + dropout_prob (float): The dropout probability. Default: 0.1. + compute_type (:class:`mindspore.dtype`): Compute type in BertTransformer. Default: mstype.float32. + """ + def __init__(self, + in_channels, + out_channels, + initializer_range=0.02, + dropout_prob=0.1, + compute_type=mstype.float32): + super(BertOutput, self).__init__() + self.dense = nn.Dense(in_channels, out_channels, + weight_init=TruncatedNormal(initializer_range)).to_float(compute_type) + self.dropout = nn.Dropout(1 - dropout_prob) + self.dropout_prob = dropout_prob + # self.add = P.TensorAdd() + self.add = P.Add() + self.layernorm = nn.LayerNorm((out_channels,)).to_float(compute_type) + self.cast = P.Cast() + + def construct(self, hidden_status, input_tensor): + output = self.dense(hidden_status) + output = self.dropout(output) + output = self.add(input_tensor, output) + output = self.layernorm(output) + return output + + +class RelaPosMatrixGenerator(nn.Cell): + """ + Generates matrix of relative positions between inputs. + + Args: + length (int): Length of one dim for the matrix to be generated. + max_relative_position (int): Max value of relative position. 
+ """ + def __init__(self, length, max_relative_position): + super(RelaPosMatrixGenerator, self).__init__() + self._length = length + self._max_relative_position = max_relative_position + self._min_relative_position = -max_relative_position + self.range_length = -length + 1 + + self.tile = P.Tile() + self.range_mat = P.Reshape() + self.sub = P.Sub() + self.expanddims = P.ExpandDims() + self.cast = P.Cast() + + def construct(self): + """Generates matrix of relative positions between inputs.""" + range_vec_row_out = self.cast(F.tuple_to_array(F.make_range(self._length)), mstype.int32) + range_vec_col_out = self.range_mat(range_vec_row_out, (self._length, -1)) + tile_row_out = self.tile(range_vec_row_out, (self._length,)) + tile_col_out = self.tile(range_vec_col_out, (1, self._length)) + range_mat_out = self.range_mat(tile_row_out, (self._length, self._length)) + transpose_out = self.range_mat(tile_col_out, (self._length, self._length)) + distance_mat = self.sub(range_mat_out, transpose_out) + + distance_mat_clipped = C.clip_by_value(distance_mat, + self._min_relative_position, + self._max_relative_position) + + # Shift values to be >=0. Each integer still uniquely identifies a + # relative position difference. + final_mat = distance_mat_clipped + self._max_relative_position + return final_mat + + +class RelaPosEmbeddingsGenerator(nn.Cell): + """ + Generates tensor of size [length, length, depth]. + + Args: + length (int): Length of one dim for the matrix to be generated. + depth (int): Size of each attention head. + max_relative_position (int): Maxmum value of relative position. + initializer_range (float): Initialization value of TruncatedNormal. + use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. + """ + def __init__(self, + length, + depth, + max_relative_position, + initializer_range, + use_one_hot_embeddings=False): + super(RelaPosEmbeddingsGenerator, self).__init__() + self.depth = depth + self.vocab_size = max_relative_position * 2 + 1 + self.use_one_hot_embeddings = use_one_hot_embeddings + + self.embeddings_table = Parameter( + initializer(TruncatedNormal(initializer_range), + [self.vocab_size, self.depth]), + name='embeddings_for_position') + + self.relative_positions_matrix = RelaPosMatrixGenerator(length=length, + max_relative_position=max_relative_position) + self.reshape = P.Reshape() + self.one_hot = nn.OneHot(depth=self.vocab_size) + self.shape = P.Shape() + self.gather = P.Gather() + self.matmul = P.BatchMatMul() + + def construct(self): + """Generate embedding for each relative position of dimension depth.""" + relative_positions_matrix_out = self.relative_positions_matrix() + + if self.use_one_hot_embeddings: + flat_relative_positions_matrix = self.reshape(relative_positions_matrix_out, (-1,)) + one_hot_relative_positions_matrix = self.one_hot( + flat_relative_positions_matrix) + embeddings = self.matmul(one_hot_relative_positions_matrix, self.embeddings_table) + my_shape = self.shape(relative_positions_matrix_out) + (self.depth,) + embeddings = self.reshape(embeddings, my_shape) + else: + embeddings = self.gather(self.embeddings_table, + relative_positions_matrix_out, 0) + return embeddings + + +class SaturateCast(nn.Cell): + """ + Performs a safe saturating cast. This operation applies proper clamping before casting to prevent + the danger that the value will overflow or underflow. + + Args: + src_type (:class:`mindspore.dtype`): The type of the elements of the input tensor. Default: mstype.float32. 
+ dst_type (:class:`mindspore.dtype`): The type of the elements of the output tensor. Default: mstype.float32. + """ + def __init__(self, src_type=mstype.float32, dst_type=mstype.float32): + super(SaturateCast, self).__init__() + np_type = mstype.dtype_to_nptype(dst_type) + + self.tensor_min_type = float(np.finfo(np_type).min) + self.tensor_max_type = float(np.finfo(np_type).max) + + self.min_op = P.Minimum() + self.max_op = P.Maximum() + self.cast = P.Cast() + self.dst_type = dst_type + + def construct(self, x): + out = self.max_op(x, self.tensor_min_type) + out = self.min_op(out, self.tensor_max_type) + return self.cast(out, self.dst_type) + + +class BertAttention(nn.Cell): + """ + Apply multi-headed attention from "from_tensor" to "to_tensor". + + Args: + from_tensor_width (int): Size of last dim of from_tensor. + to_tensor_width (int): Size of last dim of to_tensor. + from_seq_length (int): Length of from_tensor sequence. + to_seq_length (int): Length of to_tensor sequence. + num_attention_heads (int): Number of attention heads. Default: 1. + size_per_head (int): Size of each attention head. Default: 512. + query_act (str): Activation function for the query transform. Default: None. + key_act (str): Activation function for the key transform. Default: None. + value_act (str): Activation function for the value transform. Default: None. + has_attention_mask (bool): Specifies whether to use attention mask. Default: False. + attention_probs_dropout_prob (float): The dropout probability for + BertAttention. Default: 0.0. + use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. + initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. + do_return_2d_tensor (bool): True for return 2d tensor. False for return 3d + tensor. Default: False. + use_relative_positions (bool): Specifies whether to use relative positions. Default: False. + compute_type (:class:`mindspore.dtype`): Compute type in BertAttention. Default: mstype.float32. 
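+
+    Note:
+        When `has_attention_mask` is True, masked positions are suppressed by adding a large
+        negative bias (-10000.0 * (1 - mask)) to the attention scores before the softmax.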
+ """ + def __init__(self, + from_tensor_width, + to_tensor_width, + from_seq_length, + to_seq_length, + num_attention_heads=1, + size_per_head=512, + query_act=None, + key_act=None, + value_act=None, + has_attention_mask=False, + attention_probs_dropout_prob=0.0, + use_one_hot_embeddings=False, + initializer_range=0.02, + do_return_2d_tensor=False, + use_relative_positions=False, + compute_type=mstype.float32): + + super(BertAttention, self).__init__() + self.from_seq_length = from_seq_length + self.to_seq_length = to_seq_length + self.num_attention_heads = num_attention_heads + self.size_per_head = size_per_head + self.has_attention_mask = has_attention_mask + self.use_relative_positions = use_relative_positions + + self.scores_mul = 1.0 / math.sqrt(float(self.size_per_head)) + self.reshape = P.Reshape() + self.shape_from_2d = (-1, from_tensor_width) + self.shape_to_2d = (-1, to_tensor_width) + weight = TruncatedNormal(initializer_range) + units = num_attention_heads * size_per_head + self.query_layer = nn.Dense(from_tensor_width, + units, + activation=query_act, + weight_init=weight).to_float(compute_type) + self.key_layer = nn.Dense(to_tensor_width, + units, + activation=key_act, + weight_init=weight).to_float(compute_type) + self.value_layer = nn.Dense(to_tensor_width, + units, + activation=value_act, + weight_init=weight).to_float(compute_type) + + self.shape_from = (-1, from_seq_length, num_attention_heads, size_per_head) + self.shape_to = (-1, to_seq_length, num_attention_heads, size_per_head) + + self.matmul_trans_b = P.BatchMatMul(transpose_b=True) + self.multiply = P.Mul() + self.transpose = P.Transpose() + self.trans_shape = (0, 2, 1, 3) + self.trans_shape_relative = (2, 0, 1, 3) + self.trans_shape_position = (1, 2, 0, 3) + self.multiply_data = -10000.0 + self.matmul = P.BatchMatMul() + + self.softmax = nn.Softmax() + self.dropout = nn.Dropout(1 - attention_probs_dropout_prob) + + if self.has_attention_mask: + self.expand_dims = P.ExpandDims() + self.sub = P.Sub() + self.add = P.Add() + self.cast = P.Cast() + self.get_dtype = P.DType() + if do_return_2d_tensor: + self.shape_return = (-1, num_attention_heads * size_per_head) + else: + self.shape_return = (-1, from_seq_length, num_attention_heads * size_per_head) + + self.cast_compute_type = SaturateCast(dst_type=compute_type) + if self.use_relative_positions: + self._generate_relative_positions_embeddings = \ + RelaPosEmbeddingsGenerator(length=to_seq_length, + depth=size_per_head, + max_relative_position=16, + initializer_range=initializer_range, + use_one_hot_embeddings=use_one_hot_embeddings) + + def construct(self, from_tensor, to_tensor, attention_mask): + """reshape 2d/3d input tensors to 2d""" + from_tensor_2d = self.reshape(from_tensor, self.shape_from_2d) + to_tensor_2d = self.reshape(to_tensor, self.shape_to_2d) + query_out = self.query_layer(from_tensor_2d) + key_out = self.key_layer(to_tensor_2d) + value_out = self.value_layer(to_tensor_2d) + + query_layer = self.reshape(query_out, self.shape_from) + query_layer = self.transpose(query_layer, self.trans_shape) + key_layer = self.reshape(key_out, self.shape_to) + key_layer = self.transpose(key_layer, self.trans_shape) + + attention_scores = self.matmul_trans_b(query_layer, key_layer) + + # use_relative_position, supplementary logic + if self.use_relative_positions: + # relations_keys is [F|T, F|T, H] + relations_keys = self._generate_relative_positions_embeddings() + relations_keys = self.cast_compute_type(relations_keys) + # query_layer_t is [F, B, N, H] + 
query_layer_t = self.transpose(query_layer, self.trans_shape_relative) + # query_layer_r is [F, B * N, H] + query_layer_r = self.reshape(query_layer_t, + (self.from_seq_length, + -1, + self.size_per_head)) + # key_position_scores is [F, B * N, F|T] + key_position_scores = self.matmul_trans_b(query_layer_r, relations_keys) + # key_position_scores_r is [F, B, N, F|T] + key_position_scores_r = self.reshape(key_position_scores, + (self.from_seq_length, + -1, + self.num_attention_heads, + self.from_seq_length)) + # key_position_scores_r_t is [B, N, F, F|T] + key_position_scores_r_t = self.transpose(key_position_scores_r, + self.trans_shape_position) + attention_scores = attention_scores + key_position_scores_r_t + + attention_scores = self.multiply(self.scores_mul, attention_scores) + + if self.has_attention_mask: + attention_mask = self.expand_dims(attention_mask, 1) + multiply_out = self.sub(self.cast(F.tuple_to_array((1.0,)), self.get_dtype(attention_scores)), + self.cast(attention_mask, self.get_dtype(attention_scores))) + + adder = self.multiply(multiply_out, self.multiply_data) + attention_scores = self.add(adder, attention_scores) + + attention_probs = self.softmax(attention_scores) + attention_probs = self.dropout(attention_probs) + + value_layer = self.reshape(value_out, self.shape_to) + value_layer = self.transpose(value_layer, self.trans_shape) + context_layer = self.matmul(attention_probs, value_layer) + + # use_relative_position, supplementary logic + if self.use_relative_positions: + # relations_values is [F|T, F|T, H] + relations_values = self._generate_relative_positions_embeddings() + relations_values = self.cast_compute_type(relations_values) + # attention_probs_t is [F, B, N, T] + attention_probs_t = self.transpose(attention_probs, self.trans_shape_relative) + # attention_probs_r is [F, B * N, T] + attention_probs_r = self.reshape( + attention_probs_t, + (self.from_seq_length, + -1, + self.to_seq_length)) + # value_position_scores is [F, B * N, H] + value_position_scores = self.matmul(attention_probs_r, relations_values) + + # value_position_scores_r is [F, B, N, H] + value_position_scores_r = self.reshape(value_position_scores, + (self.from_seq_length, + -1, + self.num_attention_heads, + self.size_per_head)) + # value_position_scores_r_t is [B, N, F, H] + value_position_scores_r_t = self.transpose(value_position_scores_r, + self.trans_shape_position) + context_layer = context_layer + value_position_scores_r_t + + context_layer = self.transpose(context_layer, self.trans_shape) + context_layer = self.reshape(context_layer, self.shape_return) + + return context_layer + + +class BertSelfAttention(nn.Cell): + """ + Apply self-attention. + + Args: + seq_length (int): Length of input sequence. + hidden_size (int): Size of the bert encoder layers. + num_attention_heads (int): Number of attention heads. Default: 12. + attention_probs_dropout_prob (float): The dropout probability for + BertAttention. Default: 0.1. + use_one_hot_embeddings (bool): Specifies whether to use one_hot encoding form. Default: False. + initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. + hidden_dropout_prob (float): The dropout probability for BertOutput. Default: 0.1. + use_relative_positions (bool): Specifies whether to use relative positions. Default: False. + compute_type (:class:`mindspore.dtype`): Compute type in BertSelfAttention. Default: mstype.float32. 
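+
+    Note:
+        `hidden_size` must be divisible by `num_attention_heads`; each head uses
+        hidden_size / num_attention_heads units, and BertOutput applies the residual
+        connection plus LayerNorm after the attention block.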
+ """ + def __init__(self, + seq_length, + hidden_size, + num_attention_heads=12, + attention_probs_dropout_prob=0.1, + use_one_hot_embeddings=False, + initializer_range=0.02, + hidden_dropout_prob=0.1, + use_relative_positions=False, + compute_type=mstype.float32): + super(BertSelfAttention, self).__init__() + if hidden_size % num_attention_heads != 0: + raise ValueError("The hidden size (%d) is not a multiple of the number " + "of attention heads (%d)" % (hidden_size, num_attention_heads)) + + self.size_per_head = int(hidden_size / num_attention_heads) + + self.attention = BertAttention( + from_tensor_width=hidden_size, + to_tensor_width=hidden_size, + from_seq_length=seq_length, + to_seq_length=seq_length, + num_attention_heads=num_attention_heads, + size_per_head=self.size_per_head, + attention_probs_dropout_prob=attention_probs_dropout_prob, + use_one_hot_embeddings=use_one_hot_embeddings, + initializer_range=initializer_range, + use_relative_positions=use_relative_positions, + has_attention_mask=True, + do_return_2d_tensor=True, + compute_type=compute_type) + + self.output = BertOutput(in_channels=hidden_size, + out_channels=hidden_size, + initializer_range=initializer_range, + dropout_prob=hidden_dropout_prob, + compute_type=compute_type) + self.reshape = P.Reshape() + self.shape = (-1, hidden_size) + + def construct(self, input_tensor, attention_mask): + input_tensor = self.reshape(input_tensor, self.shape) + attention_output = self.attention(input_tensor, input_tensor, attention_mask) + output = self.output(attention_output, input_tensor) + return output + + +class BertEncoderCell(nn.Cell): + """ + Encoder cells used in BertTransformer. + + Args: + hidden_size (int): Size of the bert encoder layers. Default: 768. + seq_length (int): Length of input sequence. Default: 512. + num_attention_heads (int): Number of attention heads. Default: 12. + intermediate_size (int): Size of intermediate layer. Default: 3072. + attention_probs_dropout_prob (float): The dropout probability for + BertAttention. Default: 0.02. + use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. + initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. + hidden_dropout_prob (float): The dropout probability for BertOutput. Default: 0.1. + use_relative_positions (bool): Specifies whether to use relative positions. Default: False. + hidden_act (str): Activation function. Default: "gelu". + compute_type (:class:`mindspore.dtype`): Compute type in attention. Default: mstype.float32. 
+ """ + def __init__(self, + hidden_size=768, + seq_length=512, + num_attention_heads=12, + intermediate_size=3072, + attention_probs_dropout_prob=0.02, + use_one_hot_embeddings=False, + initializer_range=0.02, + hidden_dropout_prob=0.1, + use_relative_positions=False, + hidden_act="gelu", + compute_type=mstype.float32): + super(BertEncoderCell, self).__init__() + self.attention = BertSelfAttention( + hidden_size=hidden_size, + seq_length=seq_length, + num_attention_heads=num_attention_heads, + attention_probs_dropout_prob=attention_probs_dropout_prob, + use_one_hot_embeddings=use_one_hot_embeddings, + initializer_range=initializer_range, + hidden_dropout_prob=hidden_dropout_prob, + use_relative_positions=use_relative_positions, + compute_type=compute_type) + self.intermediate = nn.Dense(in_channels=hidden_size, + out_channels=intermediate_size, + activation=hidden_act, + weight_init=TruncatedNormal(initializer_range)).to_float(compute_type) + self.output = BertOutput(in_channels=intermediate_size, + out_channels=hidden_size, + initializer_range=initializer_range, + dropout_prob=hidden_dropout_prob, + compute_type=compute_type) + + def construct(self, hidden_states, attention_mask): + # self-attention + attention_output = self.attention(hidden_states, attention_mask) + # feed construct + intermediate_output = self.intermediate(attention_output) + # add and normalize + output = self.output(intermediate_output, attention_output) + return output + + +class BertTransformer(nn.Cell): + """ + Multi-layer bert transformer. + + Args: + hidden_size (int): Size of the encoder layers. + seq_length (int): Length of input sequence. + num_hidden_layers (int): Number of hidden layers in encoder cells. + num_attention_heads (int): Number of attention heads in encoder cells. Default: 12. + intermediate_size (int): Size of intermediate layer in encoder cells. Default: 3072. + attention_probs_dropout_prob (float): The dropout probability for + BertAttention. Default: 0.1. + use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. + initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. + hidden_dropout_prob (float): The dropout probability for BertOutput. Default: 0.1. + use_relative_positions (bool): Specifies whether to use relative positions. Default: False. + hidden_act (str): Activation function used in the encoder cells. Default: "gelu". + compute_type (:class:`mindspore.dtype`): Compute type in BertTransformer. Default: mstype.float32. + return_all_encoders (bool): Specifies whether to return all encoders. Default: False. 
+ """ + def __init__(self, + hidden_size, + seq_length, + num_hidden_layers, + num_attention_heads=12, + intermediate_size=3072, + attention_probs_dropout_prob=0.1, + use_one_hot_embeddings=False, + initializer_range=0.02, + hidden_dropout_prob=0.1, + use_relative_positions=False, + hidden_act="gelu", + compute_type=mstype.float32, + return_all_encoders=False): + super(BertTransformer, self).__init__() + self.return_all_encoders = return_all_encoders + + layers = [] + for _ in range(num_hidden_layers): + layer = BertEncoderCell(hidden_size=hidden_size, + seq_length=seq_length, + num_attention_heads=num_attention_heads, + intermediate_size=intermediate_size, + attention_probs_dropout_prob=attention_probs_dropout_prob, + use_one_hot_embeddings=use_one_hot_embeddings, + initializer_range=initializer_range, + hidden_dropout_prob=hidden_dropout_prob, + use_relative_positions=use_relative_positions, + hidden_act=hidden_act, + compute_type=compute_type) + layers.append(layer) + + self.layers = nn.CellList(layers) + + self.reshape = P.Reshape() + self.shape = (-1, hidden_size) + self.out_shape = (-1, seq_length, hidden_size) + + def construct(self, input_tensor, attention_mask): + """Multi-layer bert transformer.""" + prev_output = self.reshape(input_tensor, self.shape) + + all_encoder_layers = () + for layer_module in self.layers: + layer_output = layer_module(prev_output, attention_mask) + prev_output = layer_output + + if self.return_all_encoders: + layer_output = self.reshape(layer_output, self.out_shape) + all_encoder_layers = all_encoder_layers + (layer_output,) + + if not self.return_all_encoders: + prev_output = self.reshape(prev_output, self.out_shape) + all_encoder_layers = all_encoder_layers + (prev_output,) + return all_encoder_layers + + +class CreateAttentionMaskFromInputMask(nn.Cell): + """ + Create attention mask according to input mask. + + Args: + config (Class): Configuration for BertModel. + """ + def __init__(self, config, batch_size): + super(CreateAttentionMaskFromInputMask, self).__init__() + self.input_mask = None + self.seq_length = config.seq_length + self.batch_size = batch_size + self.reshape = P.Reshape() + + def construct(self, input_mask): + cast = ops.Cast() + shape = (self.batch_size, 1, self.seq_length) + attention_mask = cast(self.reshape(input_mask, shape), mstype.float32) + + return attention_mask + + +class BertModel(nn.Cell): + """ + Bidirectional Encoder Representations from Transformers. + + Args: + config (Class): Configuration for BertModel. + is_training (bool): True for training mode. False for eval mode. + use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. 
+ """ + def __init__(self, + config, + is_training, + use_one_hot_embeddings=False): + super(BertModel, self).__init__() + config = copy.deepcopy(config) + if not is_training: + config.hidden_dropout_prob = 0.0 + config.attention_probs_dropout_prob = 0.0 + + self.seq_length = config.seq_length + self.hidden_size = config.hidden_size + self.num_hidden_layers = config.num_hidden_layers + self.embedding_size = config.hidden_size + self.token_type_ids = None + + self.last_idx = self.num_hidden_layers - 1 + output_embedding_shape = [-1, self.seq_length, self.embedding_size] + + self.bert_embedding_lookup = EmbeddingLookup( + vocab_size=config.vocab_size, + embedding_size=self.embedding_size, + embedding_shape=output_embedding_shape, + use_one_hot_embeddings=use_one_hot_embeddings, + initializer_range=config.initializer_range) + + self.bert_embedding_postprocessor = EmbeddingPostprocessor( + embedding_size=self.embedding_size, + embedding_shape=output_embedding_shape, + use_relative_positions=config.use_relative_positions, + use_token_type=True, + token_type_vocab_size=config.type_vocab_size, + use_one_hot_embeddings=use_one_hot_embeddings, + initializer_range=0.02, + max_position_embeddings=config.max_position_embeddings, + dropout_prob=config.hidden_dropout_prob) + + self.bert_encoder = BertTransformer( + hidden_size=self.hidden_size, + seq_length=self.seq_length, + num_attention_heads=config.num_attention_heads, + num_hidden_layers=self.num_hidden_layers, + intermediate_size=config.intermediate_size, + attention_probs_dropout_prob=config.attention_probs_dropout_prob, + use_one_hot_embeddings=use_one_hot_embeddings, + initializer_range=config.initializer_range, + hidden_dropout_prob=config.hidden_dropout_prob, + use_relative_positions=config.use_relative_positions, + hidden_act=config.hidden_act, + compute_type=config.compute_type, + return_all_encoders=True) + + self.dtype = config.dtype + self.cast_compute_type = SaturateCast(dst_type=config.compute_type) + self.slice = P.StridedSlice() + + self.squeeze_1 = P.Squeeze(axis=1) + self.dense = nn.Dense(self.hidden_size, self.hidden_size, + activation="tanh", + weight_init=TruncatedNormal(config.initializer_range)).to_float(config.compute_type) + # self._create_attention_mask_from_input_mask = CreateAttentionMaskFromInputMask(config) + + def construct(self, input_ids, token_type_ids, input_mask): + """Bidirectional Encoder Representations from Transformers.""" + # embedding + word_embeddings, embedding_tables = self.bert_embedding_lookup(input_ids) + embedding_output = self.bert_embedding_postprocessor(token_type_ids, + word_embeddings) + + # attention mask [batch_size, seq_length, seq_length] + attention_mask = self._create_attention_mask_from_input_mask(input_mask) + + # bert encoder + encoder_output = self.bert_encoder(self.cast_compute_type(embedding_output), + attention_mask) + + sequence_output = self.cast(encoder_output[self.last_idx], self.dtype) + + # pooler + batch_size = P.Shape()(input_ids)[0] + sequence_slice = self.slice(sequence_output, + (0, 0, 0), + (batch_size, 1, self.hidden_size), + (1, 1, 1)) + first_token = self.squeeze_1(sequence_slice) + pooled_output = self.dense(first_token) + pooled_output = self.cast(pooled_output, self.dtype) + + return sequence_output, pooled_output, embedding_tables diff --git a/research/nlp/soft_masked_bert/src/dataset.py b/research/nlp/soft_masked_bert/src/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..243e38be758b313c9271b91274d6941602298e3b --- /dev/null +++ 
b/research/nlp/soft_masked_bert/src/dataset.py @@ -0,0 +1,91 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +from src.utils import get_abs_path, load_json +import mindspore.dataset as ds +import mindspore.dataset.text as text +from tqdm import tqdm + +def get_dataset(fp, vocab_path='./src/bert-base-chinese-vocab.txt', max_seq_len=512, shffle=True, \ + workers_num=1, device_num=1, rank_id=0): + data = load_json(fp) + original_text_list = [] + original_tokens_list = [] + wrong_ids_list = [] + correct_text_list = [] + correct_tokens_list = [] + det_label_list = [] + # tokenizer + vocab = text.Vocab.from_file(vocab_path) + tokenizer_op = text.BertTokenizer(vocab=vocab) + for item in tqdm(data): + original_text_list.append(item['original_text']) + original_tokens_list.append(item['original_text']) + wrong_ids_list.append(str(item['wrong_ids'])) + encoded_text = tokenizer_op(item['correct_text']) + det_label = [0 for i in range(max_seq_len)] + for idx in item['wrong_ids']: + margins = [] + for word in encoded_text[:idx]: + if word == '[UNK]': + break + if word.startswith('##'): + margins.append(len(word) - 3) + else: + margins.append(len(word) - 1) + margin = sum(margins) + move = 0 + while (abs(move) < margin) or (idx + move >= len(encoded_text)) or encoded_text[idx + move].startswith( + '##'): + move -= 1 + det_label[idx + move + 1] = 1 + det_label_list.append(det_label) + correct_text_list.append(item['correct_text']) + correct_tokens_list.append(item['correct_text']) + if device_num > 1: + dataset = ds.NumpySlicesDataset(data=(original_text_list, det_label_list, correct_text_list), + column_names=['original_tokens', 'wrong_ids', 'correct_tokens'], + num_shards=device_num, shard_id=rank_id) + else: + dataset = ds.NumpySlicesDataset(data=(original_text_list, det_label_list, correct_text_list), + column_names=['original_tokens', 'wrong_ids', 'correct_tokens']) + return dataset + +def make_datasets(cfg, get_loader_fn, tokenizer, **kwargs): + if cfg.DATASETS.TRAIN == '': + train_dataset = None + else: + train_dataset = get_loader_fn(get_abs_path(cfg.DATASETS.TRAIN), \ + batch_size=cfg.SOLVER.BATCH_SIZE, \ + shuffle=True, \ + num_workers=cfg.DATALOADER.NUM_WORKERS, \ + tokenizer=tokenizer, **kwargs) + if cfg.DATASETS.VALID == '': + valid_dataset = None + else: + valid_dataset = get_loader_fn(get_abs_path(cfg.DATASETS.VALID), \ + batch_size=cfg.TEST.BATCH_SIZE, \ + shuffle=False, \ + num_workers=cfg.DATALOADER.NUM_WORKERS, \ + tokenizer=tokenizer, **kwargs) + if cfg.DATASETS.TEST == '': + test_dataset = None + else: + test_dataset = get_loader_fn(get_abs_path(cfg.DATASETS.TEST), \ + batch_size=cfg.TEST.BATCH_SIZE, \ + shuffle=False, \ + num_workers=cfg.DATALOADER.NUM_WORKERS, \ + tokenizer=tokenizer, **kwargs) + return train_dataset, valid_dataset, test_dataset diff --git a/research/nlp/soft_masked_bert/src/finetune_config.py 
b/research/nlp/soft_masked_bert/src/finetune_config.py new file mode 100644 index 0000000000000000000000000000000000000000..6ac563c417528bd7cccc44658d298341815589c8 --- /dev/null +++ b/research/nlp/soft_masked_bert/src/finetune_config.py @@ -0,0 +1,100 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""hyper-parameters.""" + +from easydict import EasyDict as edict +import mindspore as ms + +SEQ_LEN = 512 + +soft_masked_bert_cfg = edict({ + 'model': edict({ + 'bert_ckpt': 'bert-base-chinese', + 'device': 'Ascend', + 'name': 'SoftMaskedBertModel', + 'gpu_ids': [0], + 'hyper_params': [0.8] + }), + 'dataset': edict({ + 'train': 'datasets/csc/train.json', + 'valid': 'datasets/csc/dev.json', + 'test': 'datasets/csc/test.json' + }), + 'solver': edict({ + 'base_lr': 0.0001, + 'weight_decay': 5e-8, + 'batch_size': 4, + 'max_epoch': 10, + 'accumulate_grad_batches': 4 + }), + 'test': edict({ + 'batch_size': 16 + }), + 'task': edict({ + 'name': 'csc' + }), + 'output_dir': 'checkpoints/SoftMaskedBert' +}) + +optimizer_cfg = edict({ + 'batch_size': 2, + 'optimizer': 'AdamWeightDecay', + 'AdamWeightDecay': edict({ + 'learning_rate': 2e-5, + 'end_learning_rate': 1e-7, + 'power': 1.0, + 'weight_decay': 1e-5, + 'decay_filter': lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower(), + 'eps': 1e-6, + }), + 'Lamb': edict({ + 'learning_rate': 2e-5, + 'end_learning_rate': 1e-7, + 'power': 1.0, + 'weight_decay': 0.01, + 'decay_filter': lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower(), + }), + 'Momentum': edict({ + 'learning_rate': 2e-5, + 'momentum': 0.9, + }), +}) + +bert_cfg = edict({ + 'seq_length': SEQ_LEN, #128 + 'vocab_size': 21128, + 'hidden_size': 768, + 'num_hidden_layers': 12, + 'num_attention_heads': 12, + 'intermediate_size': 3072, + 'hidden_act': "gelu", + 'hidden_dropout_prob': 0.1, + 'attention_probs_dropout_prob': 0.1, + 'max_position_embeddings': SEQ_LEN, + 'type_vocab_size': 2, + 'initializer_range': 0.02, + 'use_relative_positions': False, + 'dtype': ms.float32, + 'compute_type': ms.float32, + 'pad_token_id': 0, + 'layer_norm_eps': 1e-12 + }) + +gru_cfg = edict({ + 'encoder_embedding_size': 768, + 'hidden_size': 384, + 'max_length': SEQ_LEN, + 'is_training': True +}) diff --git a/research/nlp/soft_masked_bert/src/generator.py b/research/nlp/soft_masked_bert/src/generator.py new file mode 100644 index 0000000000000000000000000000000000000000..e136c68cc13137537b36f1160b05a838104d8810 --- /dev/null +++ b/research/nlp/soft_masked_bert/src/generator.py @@ -0,0 +1,30 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +from utils import load_json +import numpy as np + +class CscDatasetGenerator: + def __init__(self, fp): + self.data = load_json(fp) + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + original_text = self.data[index]['original_text'] + original_text_np = np.array(original_text) + res = (original_text_np) + return res diff --git a/research/nlp/soft_masked_bert/src/gru.py b/research/nlp/soft_masked_bert/src/gru.py new file mode 100644 index 0000000000000000000000000000000000000000..d46bbf2aa84f7f26b660808d9d95ddc91a7598eb --- /dev/null +++ b/research/nlp/soft_masked_bert/src/gru.py @@ -0,0 +1,102 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""GRU cell""" +from mindspore import nn +from mindspore.ops import operations as P +from mindspore.common import dtype as mstype +from src.weight_init import gru_default_state, gru_default_state_bw + +class BidirectionGRU(nn.Cell): + ''' + BidirectionGRU model + + Args: + config: config of network + ''' + def __init__(self, config, batch_size): + super(BidirectionGRU, self).__init__() + self.batch_size = batch_size + self.embedding_size = config.encoder_embedding_size + self.hidden_size = config.hidden_size + self.weight_i, self.weight_h, self.bias_i, self.bias_h, self.init_h = gru_default_state(self.batch_size, + self.embedding_size, + self.hidden_size) + self.weight_bw_i, self.weight_bw_h, self.bias_bw_i, self.bias_bw_h, self.init_bw_h = \ + gru_default_state_bw(self.batch_size, self.embedding_size, self.hidden_size) + self.reverse = P.ReverseV2(axis=[0]) + self.concat = P.Concat(axis=2) + self.squeeze = P.Squeeze(axis=0) + self.rnn = P.DynamicGRUV2() + self.text_len = config.max_length + self.cast = P.Cast() + + def construct(self, x): + ''' + BidirectionGRU construction + + Args: + x(Tensor): BidirectionGRU input + + Returns: + output(Tensor): rnn output + hidden(Tensor): hidden state + ''' + x = self.cast(x, mstype.float16) + y1, _, _, _, _, _ = self.rnn(x, self.weight_i, self.weight_h, self.bias_i, self.bias_h, None, self.init_h) + bw_x = self.reverse(x) + y1_bw, _, _, _, _, _ = self.rnn(bw_x, self.weight_bw_i, + self.weight_bw_h, self.bias_bw_i, self.bias_bw_h, None, self.init_bw_h) + y1_bw = self.reverse(y1_bw) + output1 = self.concat((y1, y1_bw)) + hidden = self.concat((y1[self.text_len-1:self.text_len:1, ::, ::], + y1_bw[self.text_len-1:self.text_len:1, ::, ::])) + hidden = self.squeeze(hidden) 
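+        # output1: (seq_length, batch_size, 2*hidden_size); hidden: (batch_size, 2*hidden_size),
+        # i.e. the forward and backward outputs at the last sequence position, concatenated.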
+ return output1, hidden + +class GRU(nn.Cell): + ''' + GRU model + + Args: + config: config of network + ''' + def __init__(self, config, is_training=True): + super(GRU, self).__init__() + if is_training: + self.batch_size = config.batch_size + else: + self.batch_size = config.eval_batch_size + self.embedding_size = config.encoder_embedding_size + self.hidden_size = config.hidden_size + self.weight_i, self.weight_h, self.bias_i, self.bias_h, self.init_h = \ + gru_default_state(self.batch_size, self.embedding_size + self.hidden_size*2, self.hidden_size) + self.rnn = P.DynamicGRUV2() + self.cast = P.Cast() + + def construct(self, x): + ''' + GRU construction + + Args: + x(Tensor): GRU input + + Returns: + output(Tensor): rnn output + hidden(Tensor): hidden state + ''' + x = self.cast(x, mstype.float16) + y1, h1, _, _, _, _ = self.rnn(x, self.weight_i, self.weight_h, self.bias_i, self.bias_h, None, self.init_h) + return y1, h1 diff --git a/research/nlp/soft_masked_bert/src/soft_masked_bert.py b/research/nlp/soft_masked_bert/src/soft_masked_bert.py new file mode 100644 index 0000000000000000000000000000000000000000..3c0f68f1cbce89dda9d7ec75ff618c3520d58a2c --- /dev/null +++ b/research/nlp/soft_masked_bert/src/soft_masked_bert.py @@ -0,0 +1,202 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================
+"""Soft-Masked BERT"""
+from mindspore.train.serialization import load_checkpoint, load_param_into_net
+import mindspore as ms
+from mindspore import nn
+from mindspore.ops import operations as P
+from mindspore.common import dtype as mstype
+from src.bert_model import BertTransformer, SaturateCast, BertOnlyMLMHead, EmbeddingLookup, EmbeddingPostprocessor, CreateAttentionMaskFromInputMask
+from src.finetune_config import bert_cfg, gru_cfg
+from src.gru import BidirectionGRU
+
+class DetectionNetwork(nn.Cell):
+    """Bi-GRU detection network that predicts a per-token error probability used for soft masking."""
+    def __init__(self, config, batch_size, is_training, if_O3):
+        super().__init__()
+        self.config = config
+        self.rnn = BidirectionGRU(gru_cfg, batch_size)
+        self.sigmoid = nn.Sigmoid()
+        self.linear = nn.Dense(self.config.hidden_size, 1)
+        self.cast = P.Cast()
+        self.reshape = P.Reshape()
+        self.transpose = P.Transpose()
+        self.O3 = if_O3
+
+    def construct(self, hidden_states):
+        if self.O3 is False:
+            hidden_states = self.cast(hidden_states, ms.float16)  # if not O3
+        hidden_states = self.transpose(hidden_states, (1, 0, 2))
+        out, _ = self.rnn(hidden_states)
+        out = self.transpose(out, (1, 0, 2))
+        if self.O3:
+            prob = self.linear(out)  # if O3
+        else:
+            prob = self.linear(out.astype("float32"))  # if not O3
+        prob = self.sigmoid(prob)
+        return prob
+
+class BertEmbedding(nn.Cell):
+    """BERT input embedding: token embedding lookup followed by position/token-type post-processing."""
+    def __init__(self, config, load_checkpoint_path):
+        super(BertEmbedding, self).__init__()
+        self.config = config
+        self.bert_embedding_lookup = EmbeddingLookup(
+            vocab_size=self.config.vocab_size,
+            embedding_size=self.config.hidden_size,
+            embedding_shape=[-1, self.config.seq_length, self.config.hidden_size],
+            use_one_hot_embeddings=False,
+            initializer_range=self.config.initializer_range)
+
+        self.bert_embedding_postprocessor = EmbeddingPostprocessor(
+            embedding_size=self.config.hidden_size,
+            embedding_shape=[-1, self.config.seq_length, self.config.hidden_size],
+            use_relative_positions=self.config.use_relative_positions,
+            use_token_type=True,
+            token_type_vocab_size=self.config.type_vocab_size,
+            use_one_hot_embeddings=False,
+            initializer_range=0.02,
+            max_position_embeddings=self.config.seq_length,
+            dropout_prob=self.config.hidden_dropout_prob)
+
+    def construct(self, sentence_tokens, token_type_ids):
+        word_embeddings, _ = self.bert_embedding_lookup(sentence_tokens)
+        embed = self.bert_embedding_postprocessor(token_type_ids, word_embeddings)
+        return embed
+
+
+class BertCorrectionModel(nn.Cell):
+    """BERT-based correction network: encodes the soft-masked embeddings and predicts the corrected tokens."""
+    def __init__(self, config, batch_size, embbedding, param_dict, pretrained):
+        super().__init__()
+        self.config = config
+        self.vocab_size = config.vocab_size
+        corrector = BertTransformer(hidden_size=self.config.hidden_size,
+                                    seq_length=self.config.seq_length,
+                                    num_hidden_layers=self.config.num_hidden_layers,
+                                    hidden_dropout_prob=self.config.hidden_dropout_prob,
+                                    attention_probs_dropout_prob=self.config.attention_probs_dropout_prob)
+        if pretrained is True:
+            load_param_into_net(corrector, param_dict)
+        self.corrector = corrector
+        self.mask_token_id = 103  # id of the [MASK] token
+        self.batch_size = batch_size
+        self._create_attention_mask_from_input_mask = CreateAttentionMaskFromInputMask(config, self.batch_size)
+        self.cast_compute_type = SaturateCast(dst_type=config.compute_type)
+        # TODO: verify whether the cls head needs the pretrained parameters.
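+        # The MLM head maps the corrector's hidden states to vocabulary logits,
+        # which are later reshaped to (-1, vocab_size) for the correction loss.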
+ cls = BertOnlyMLMHead(self.config, param_dict, pretrained) + self.cls = cls + self.cast = P.Cast() + self.oneslike = P.OnesLike() + self.squeeze = P.Squeeze(-1) + self.reduce_sum = P.ReduceSum(keep_dims=False) + self.loss_fct = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='none') + self.reshape = P.Reshape() + + def construct(self, prob, embed, mask_embed, cor_labels, original_tokens_mask, original_token_type_ids, \ + batch_max_len, original_batch_len, total_seq_len, batch_size, residual_connection=True): + cor_embed = prob * mask_embed + (1 - prob) * embed + # get attention mask + # use 'original_tokens_mask' + attention_mask = self._create_attention_mask_from_input_mask(original_tokens_mask) + encoder_outputs = self.corrector(self.cast_compute_type(cor_embed), attention_mask) + sequence_output = encoder_outputs[0] + sequence_output = sequence_output + embed if residual_connection else sequence_output + prediction_scores = self.cls(sequence_output) + out = (prediction_scores, sequence_output) + if cor_labels is not None: + cor_labels[cor_labels == 0] = -100 + prediction_scores_rp = self.reshape(prediction_scores, (-1, self.vocab_size)) + cor_labels_rp = self.reshape(cor_labels, (-1,)) + cor_loss = self.loss_fct(prediction_scores_rp, cor_labels_rp) + cor_loss = cor_loss * original_tokens_mask.view(-1) + cor_loss = self.reduce_sum(cor_loss) + cor_loss = cor_loss / total_seq_len + out = (cor_loss,) + out + return out + + +class SoftMaskedBertCLS(nn.Cell): + def __init__(self, batch_size, is_training=True, if_O3=True, \ + load_checkpoint_path="./weight/bert_base.ckpt", pretrained=False): + super(SoftMaskedBertCLS, self).__init__() + self.batch_size = batch_size + self.config = bert_cfg + self.detector = DetectionNetwork(self.config, batch_size, is_training=True, if_O3=if_O3) + self.mask_token_id = 103 # id of the [MASK] token + self.oneslike = P.OnesLike() + self.squeeze2 = P.Squeeze(-1) + embedding = BertEmbedding(self.config, load_checkpoint_path) + param_dict = load_checkpoint(load_checkpoint_path) + if pretrained is True: + load_param_into_net(embedding, param_dict) + self.embedding = embedding + # correction + self.corrector = BertCorrectionModel(self.config, self.batch_size, \ + self.embedding, param_dict, pretrained=pretrained) + self.reshape = P.Reshape() + self.maskedselect = P.MaskedSelect() + self.expand_dims = P.ExpandDims() + self.cast = P.Cast() + self.reduce_sum1 = P.ReduceSum(keep_dims=True) + self.reduce_sum2 = P.ReduceSum(keep_dims=False) + self.loss = nn.BCELoss(reduction='none') + self.squeeze = P.Squeeze(2) + self.linear = nn.Dense(512, 512) # for debug + self.w = 0.8 + self.linear_debug = nn.Dense(768, 1) # for debug + self.linear_debug2 = nn.Dense(512, 512) # for debug + self.select = P.Select() + self.is_training = is_training + self.print = P.Print() # for debug + self.argmax = P.Argmax() # for debug + # ['wrong_ids', 'original_tokens', 'original_tokens_mask', 'correct_tokens', 'correct_tokens_mask', + # 'original_token_type_ids', 'correct_token_type_ids'] + def construct(self, *inputs): + det_labels = inputs[0] + original_tokens = inputs[1] + original_tokens_mask = inputs[2] + correct_tokens = inputs[3] + original_token_type_ids = inputs[5] + input_shape = original_token_type_ids.shape + embed = self.embedding(original_tokens, original_token_type_ids) # 3 matmul + prob = self.detector(embed) + mask_embed = self.embedding(self.cast(self.oneslike(self.squeeze2(prob)), mstype.int32) * self.mask_token_id, + original_token_type_ids) + active_loss = 
self.reshape(original_tokens_mask, (-1, prob.shape[1]))
+        batch_seq_len = self.reduce_sum1(active_loss.astype("float32"), 1)
+        batch_max_len = batch_seq_len.max()
+        original_batch_len = input_shape[1]
+        total_seq_len = self.reduce_sum2(batch_seq_len)
+        cor_out = self.corrector(prob, embed, mask_embed, correct_tokens, original_tokens_mask, \
+                                 original_token_type_ids, batch_max_len, original_batch_len, total_seq_len, self.batch_size, \
+                                 residual_connection=False)
+        # Detection loss: BCE between the predicted error probability and the wrong-position labels,
+        # masked to the real (non-padding) tokens and normalized by the total sequence length.
+        prob_ = self.reshape(prob, (-1, prob.shape[1]))
+        prob = prob_.astype(mstype.float32)
+        det_labels = det_labels.astype(mstype.float32)
+        det_loss = self.loss(prob, det_labels)
+        det_loss = det_loss * original_tokens_mask
+        det_loss = self.reduce_sum2(det_loss)
+        det_loss = det_loss / total_seq_len
+        outputs = (det_loss, cor_out[0], prob) + cor_out[1:]
+        # Total loss: weighted sum of the correction loss and the detection loss (w = 0.8).
+        loss = self.w * outputs[1] + (1 - self.w) * outputs[0]
+        if self.is_training:
+            res = loss
+        else:
+            det_y_hat = (outputs[2] > 0.5).astype("int32")
+            cor_y_hat = self.argmax(outputs[3])
+            cor_y = correct_tokens
+            cor_y_hat *= original_tokens_mask
+            res = (original_tokens, cor_y, cor_y_hat, det_y_hat, det_labels, batch_seq_len)
+        return res
diff --git a/research/nlp/soft_masked_bert/src/tokenization.py b/research/nlp/soft_masked_bert/src/tokenization.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c8525a580e57dbc8a9317c4decba7bd881dba1d
--- /dev/null
+++ b/research/nlp/soft_masked_bert/src/tokenization.py
@@ -0,0 +1,142 @@
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================ +"""tokenizer""" +import sys +from src.dataset import get_dataset +from mindspore.dataset import text +import numpy as np + +print(sys.path[0]) + +class CscTokenizer: + def __init__(self, device_num=1, rank_id=0, fp="", max_seq_len=0, vocab_path='./src/bert-base-chinese-vocab.txt'): + self.vocab_path = vocab_path + self.tokenizer_op = self.set_tokenizer() + self.fp = fp + self.vocab = self.set_vocab() + self.max_seq_len = max_seq_len + self.device_num = device_num + self.rank_id = rank_id + + def set_vocab(self): + fr = open(self.vocab_path, mode='rb') + vocab_byte_list = fr.readlines() + vocab_byte_dict = {} + count = 0 + for item in vocab_byte_list: + if b'\n' in item: + item = item[:-1] + vocab_byte_dict[item] = count + count = count + 1 + return vocab_byte_dict + + def set_tokenizer(self): + vocab = text.Vocab.from_file(self.vocab_path) + tokenizer_op = text.BertTokenizer(vocab=vocab) + return tokenizer_op + + def convert2id(self, tokens_ndarray): + tokens_list = list(tokens_ndarray) + ids_list = [] + ids_list.append(101) #[CLS] + for token in tokens_list: + if token in self.vocab: + ids_list.append(self.vocab[token]) + else: + ids_list.append(100) #[UNK] + ids_list.append(102) #[SEP] + # pad to max_seq_len + input_mask = [1 for i in range(len(ids_list))] + while len(ids_list) < self.max_seq_len: + ids_list.append(0) + input_mask.append(0) + assert len(ids_list) == self.max_seq_len + assert len(input_mask) == self.max_seq_len + token_type_ids = [0 for i in range(self.max_seq_len)] + ids_ndarray = np.array(ids_list, dtype=np.int32) + input_mask_ndarray = np.array(input_mask, dtype=np.int32) + token_type_ids_ndarray = np.array(token_type_ids, dtype=np.int32) + return ids_ndarray, input_mask_ndarray, token_type_ids_ndarray + + def turn2int32(self, num_ndarray): + return np.array(num_ndarray, dtype=np.int32) + + def get_worng_ids_ndarray(self, wrong_ids_byte_ndarray): + i = 0 + wrong_ids_list = [] + wrong_ids_str = text.to_str(wrong_ids_byte_ndarray).tolist() + while wrong_ids_str[i] != ']': + if wrong_ids_str[i].isdigit(): + wrong_ids_list.append(int(wrong_ids_str[i])) + i = i + 1 + wrong_ids_ndarray = np.array(wrong_ids_list) + return wrong_ids_ndarray + + def convert2det_labels(self, wrong_ids_ndarray, tokens_ndarray): + wrong_ids_list = wrong_ids_ndarray.tolist() + tokens_list_byte = list(tokens_ndarray) + # turn to string list + tokens_list_str = [] + for token_byte in tokens_list_byte: + tokens_list_str.append(text.to_str(token_byte)) + # create det_labels_list + det_labels_list = [0 for i in range(len(tokens_list_str))] + for idx in wrong_ids_list: + margins = [] + for word in tokens_list_str[:idx]: + if word == '[UNK]': + break + if word.startswith('##'): + margins.append(len(word) - 3) + else: + margins.append(len(word) - 1) + margin = sum(margins) + move = 0 + while (abs(move) < margin) or (idx + move >= len(tokens_list_str)) or \ + tokens_list_str[idx + move].startswith('##'): + move -= 1 + det_labels_list[idx + move + 1] = 1 + # pad to max_seq_len + while len(det_labels_list) < self.max_seq_len: + det_labels_list.append(0) + assert len(det_labels_list) == self.max_seq_len + det_labels_ndarray = np.array(det_labels_list) + return det_labels_ndarray + + def get_token_ids(self, batch_size): + dataset = get_dataset(self.fp, vocab_path=self.vocab_path, device_num=self.device_num, rank_id=self.rank_id) + dataset = dataset.map(operations=self.tokenizer_op, input_columns=['original_tokens']) + dataset = 
dataset.map(operations=self.convert2id, input_columns=['original_tokens'], \
+                              output_columns=['original_tokens', 'original_tokens_mask', 'original_token_type_ids'], \
+                              column_order=['wrong_ids', 'original_tokens', 'original_tokens_mask', 'correct_tokens', \
+                                            'original_token_type_ids'])
+        dataset = dataset.map(operations=self.tokenizer_op, input_columns=['correct_tokens'])
+        dataset = dataset.map(operations=self.convert2id, input_columns=['correct_tokens'], \
+                              output_columns=['correct_tokens', 'correct_tokens_mask', 'correct_token_type_ids'], \
+                              column_order=['wrong_ids', 'original_tokens', 'original_tokens_mask', 'correct_tokens', \
+                                            'correct_tokens_mask', 'original_token_type_ids', 'correct_token_type_ids'])
+        dataset = dataset.map(operations=self.turn2int32, input_columns=['wrong_ids'])
+        dataset = dataset.batch(batch_size=batch_size, drop_remainder=True)
+        return dataset
+
+if __name__ == '__main__':
+    # Smoke test: tokenize the dev set and iterate over a few small batches.
+    fpath = '../../dataset/csc/dev.json'
+    demo = CscTokenizer(fp=fpath, max_seq_len=512)
+    dataset1 = demo.get_token_ids(batch_size=2)
+    count1 = 0
+    for data in dataset1.create_dict_iterator(num_epochs=1, output_numpy=True):
+        count1 = count1 + 1
+        if count1 > 3:
+            break
diff --git a/research/nlp/soft_masked_bert/src/utils.py b/research/nlp/soft_masked_bert/src/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..e65b9a194dba6654cb061ce3d6f68b6b98bb84a0
--- /dev/null
+++ b/research/nlp/soft_masked_bert/src/utils.py
@@ -0,0 +1,147 @@
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================ +"""utils""" + +import os +import json +import sys + +def compute_correction_prf(results2, all_predict_true_index, all_gold_index): + TP = 0 + FP = 0 + FN = 0 + for i in range(len(all_predict_true_index)): + # we only detect those correctly detected location, which is a different from the common metrics since + # we wanna to see the precision improve by using the confusionset + if all_predict_true_index[i]: + predict_words = [] + for j in all_predict_true_index[i]: + predict_words.append(results2[i][2][j]) + if results2[i][1][j] == results2[i][2][j]: + TP += 1 + else: + FP += 1 + for j in all_gold_index[i]: + if results2[i][1][j] in predict_words: + continue + else: + FN += 1 + # For the correction Precision, Recall and F1 + correction_precision = TP / (TP + FP) if (TP + FP) > 0 else 0 + correction_recall = TP / (TP + FN) if (TP + FN) > 0 else 0 + if correction_precision + correction_recall == 0: + correction_f1 = 0 + else: + correction_f1 = 2 * (correction_precision * correction_recall) / (correction_precision + correction_recall) + print("The correction result is precision={}, recall={} and F1={}".format(correction_precision, \ + correction_recall, correction_f1)) + return correction_f1 + +def compute_detection_prf(results1): + TP = 0 + FP = 0 + FN = 0 + all_predict_true_index = [] + all_gold_index = [] + for item in results1: + src, tgt, predict = item + gold_index = [] + each_true_index = [] + for i in range(len(list(src))): + if src[i] == tgt[i]: + continue + else: + gold_index.append(i) + all_gold_index.append(gold_index) + predict_index = [] + for i in range(len(list(src))): + if src[i] == predict[i]: + continue + else: + predict_index.append(i) + + for i in predict_index: + if i in gold_index: + TP += 1 + each_true_index.append(i) + else: + FP += 1 + for i in gold_index: + if i in predict_index: + continue + else: + FN += 1 + all_predict_true_index.append(each_true_index) + + # For the detection Precision, Recall and F1 + detection_precision = TP / (TP + FP) if (TP + FP) > 0 else 0 + detection_recall = TP / (TP + FN) if (TP + FN) > 0 else 0 + if detection_precision + detection_recall == 0: + detection_f1 = 0 + else: + detection_f1 = 2 * (detection_precision * detection_recall) / (detection_precision + detection_recall) + print("The detection result is precision={}, recall={} and F1={}".format(detection_precision, \ + detection_recall, detection_f1)) + return all_predict_true_index, all_gold_index, detection_f1 + +def compute_corrector_prf(results): + all_predict_true_index_, all_gold_index_, detection_f1_ = compute_detection_prf(results) + correction_f1_ = compute_correction_prf(results, all_predict_true_index_, all_gold_index_) + return detection_f1_, correction_f1_ + +def compute_sentence_level_prf(results): + TP = 0.0 + FP = 0.0 + FN = 0.0 + TN = 0.0 + total_num = len(results) + + for item in results: + src, tgt, predict = item + if src == tgt: + if tgt == predict: + TN += 1 + else: + FP += 1 + else: + if tgt == predict: + TP += 1 + else: + FN += 1 + + acc = (TP + TN) / total_num + precision = TP / (TP + FP) if TP > 0 else 0.0 + recall = TP / (TP + FN) if TP > 0 else 0.0 + f1 = 2 * precision * recall / (precision + recall) if precision + recall != 0 else 0 + + print(f'Sentence Level: acc:{acc:.6f}, precision:{precision:.6f}, recall:{recall:.6f}, f1:{f1:.6f}') + return acc, precision, recall, f1 + +def get_main_dir(): + if hasattr(sys, 'frozen'): + return os.path.join(os.path.dirname(sys.executable)) 
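+    # Not running as a frozen executable: resolve relative to this file, two directories above src/.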
+ return os.path.join(os.path.dirname(__file__), '..', '..') + +def get_abs_path(*name): + fn = os.path.join(*name) + if os.path.isabs(fn): + return fn + return os.path.abspath(os.path.join(get_main_dir(), fn)) + +def load_json(fp): + if not os.path.exists(fp): + return dict() + with open(fp, 'r', encoding='utf8') as f: + return json.load(f) diff --git a/research/nlp/soft_masked_bert/src/weight_init.py b/research/nlp/soft_masked_bert/src/weight_init.py new file mode 100644 index 0000000000000000000000000000000000000000..9085d04dc51ea8756ad339cbcd1bbe5b921d64f4 --- /dev/null +++ b/research/nlp/soft_masked_bert/src/weight_init.py @@ -0,0 +1,53 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""weight init""" +import math +import numpy as np +from mindspore import Tensor, Parameter + +def gru_default_state(batch_size, input_size, hidden_size, num_layers=1, bidirectional=False): + '''Weight init for gru cell''' + stdv = 1 / math.sqrt(hidden_size) + weight_i = Parameter(Tensor( + np.random.uniform(-stdv, stdv, (input_size, 3*hidden_size)).astype(np.float16)), name='weight_i') + weight_h = Parameter(Tensor( + np.random.uniform(-stdv, stdv, (hidden_size, 3*hidden_size)).astype(np.float16)), name='weight_h') + bias_i = Parameter(Tensor( + np.random.uniform(-stdv, stdv, (3*hidden_size)).astype(np.float16)), name='bias_i') + bias_h = Parameter(Tensor( + np.random.uniform(-stdv, stdv, (3*hidden_size)).astype(np.float16)), name='bias_h') + init_h = Tensor(np.zeros((batch_size, hidden_size)).astype(np.float16)) + return weight_i, weight_h, bias_i, bias_h, init_h + +def gru_default_state_bw(batch_size, input_size, hidden_size, num_layers=1, bidirectional=False): + '''Weight init for gru cell''' + stdv = 1 / math.sqrt(hidden_size) + weight_bw_i = Parameter(Tensor( + np.random.uniform(-stdv, stdv, (input_size, 3*hidden_size)).astype(np.float16)), name='weight_bw_i') + weight_bw_h = Parameter(Tensor( + np.random.uniform(-stdv, stdv, (hidden_size, 3*hidden_size)).astype(np.float16)), name='weight_bw_h') + bias_bw_i = Parameter(Tensor( + np.random.uniform(-stdv, stdv, (3*hidden_size)).astype(np.float16)), name='bias_bw_i') + bias_bw_h = Parameter(Tensor( + np.random.uniform(-stdv, stdv, (3*hidden_size)).astype(np.float16)), name='bias_bw_h') + init_bw_h = Tensor(np.zeros((batch_size, hidden_size)).astype(np.float16)) + return weight_bw_i, weight_bw_h, bias_bw_i, bias_bw_h, init_bw_h + +def dense_default_state(in_channel, out_channel): + '''Weight init for dense cell''' + stdv = 1 / math.sqrt(in_channel) + weight = Tensor(np.random.uniform(-stdv, stdv, (out_channel, in_channel)).astype(np.float32)) + bias = Tensor(np.random.uniform(-stdv, stdv, (out_channel)).astype(np.float32)) + return weight, bias diff --git a/research/nlp/soft_masked_bert/train.py b/research/nlp/soft_masked_bert/train.py new file mode 100644 index 
0000000000000000000000000000000000000000..f734941a944537805b17e4b680aa0c85084aaaab --- /dev/null +++ b/research/nlp/soft_masked_bert/train.py @@ -0,0 +1,132 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""train""" +import os +import argparse +from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, TimeMonitor, LossMonitor +from mindspore import context, Model, DynamicLossScaleManager +from mindspore.communication.management import init, get_group_size +from mindspore.nn.optim import AdamWeightDecay +from src.finetune_config import optimizer_cfg, bert_cfg +from src.tokenization import CscTokenizer +from src.soft_masked_bert import SoftMaskedBertCLS + +def do_train(dataset, network, cfg, profile=None, save_ckpt_path='./checkpoint', epoch_num=1): + """ do train """ + max_epoch = 100 + steps_per_epoch = dataset.get_dataset_size() + # network.to_float(mstype.float16) + if optimizer_cfg.optimizer == 'AdamWeightDecay': + params = network.trainable_params() + bias_params = list(filter(lambda x: 'bias' in x.name, params)) + no_bias_params = list(filter(lambda x: 'bias' not in x.name, params)) + group_params = [{'params': bias_params, 'weight_decay': 0, 'lr': cfg.baselr * cfg.bias_lr_factor}, + {'params': no_bias_params, 'weight_decay': cfg.weight_decay, 'lr': cfg.baselr}] + optimizer = AdamWeightDecay(group_params, learning_rate=cfg.baselr) + if cfg.enable_modelarts: + config = CheckpointConfig(saved_network=network, save_checkpoint_steps=steps_per_epoch * max_epoch) + else: + config = CheckpointConfig(saved_network=network, save_checkpoint_steps=steps_per_epoch * 10) + ckpoint_cb = ModelCheckpoint(prefix='SoftMaskedBert', + directory=save_ckpt_path, + config=config) + time_cb = TimeMonitor(data_size=steps_per_epoch) + loss_scale_manager = DynamicLossScaleManager(init_loss_scale=2**24) + model = Model(network, loss_scale_manager=loss_scale_manager, optimizer=optimizer, amp_level="O3") + model.train(epoch=max_epoch, train_dataset=dataset, callbacks=[LossMonitor(), \ + ckpoint_cb, time_cb], dataset_sink_mode=False) + if cfg.enable_modelarts: + import moxing as mox + mox.file.copy_parallel(save_ckpt_path, cfg.train_url) + +def run_csc(): + """run csc task""" + parser = argparse.ArgumentParser(description="run csc") + parser.add_argument("--bert_ckpt", type=str, \ + default="bert_base.ckpt") + parser.add_argument("--device_target", type=str, default="Ascend") + parser.add_argument("--name", type=str, default="SoftMaskedBertModel") + parser.add_argument("--hyper_params", type=float, default=0.8) + parser.add_argument("--baselr", type=float, default=0.00001) # 0.0001 + parser.add_argument("--bias_lr_factor", type=int, default=2) + parser.add_argument("--weight_decay", type=float, default=5e-8) + parser.add_argument("--batch_size", type=int, default=36) + parser.add_argument("--max_epochs", type=int, default=100) + 
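+    # Note: do_train() hard-codes max_epoch = 100, so --max_epochs and --accumulate_grad_batches are parsed but not applied there.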
parser.add_argument("--accumulate_grad_batches", type=int, default=2) + parser.add_argument("--max_seq_len", type=int, default=bert_cfg.seq_length) #512 + parser.add_argument("--train_url", type=str, default="./datasets/csc") # output direction, such as s3://open-data/job/openizxche2022062222t062300200037543/output/V0012/ + parser.add_argument("--data_url", type=str, default="./datasets/csc/train.json") # direction of the training dataset, such as s3://open-data/attachment/1/4/1493e5f0-4601-408e-bc7f-b51ef8b3785c1493e5f0-4601-408e-bc7f-b51ef8b3785c/ + def str2bool(input_str): + return bool(input_str) + parser.add_argument("--enable_modelarts", type=str2bool, default='False') + parser.add_argument("--pynative", type=str2bool, default='False') + parser.add_argument("--device_id", type=int, default=0) + parser.add_argument("--rank_size", type=int, default=1) + args_opt = parser.parse_args() + if args_opt.enable_modelarts: + import moxing as mox + if mox.file.exists('/cache/dataset'): + ret = mox.file.list_directory('/cache/dataset', recursive=True) + print('/cache/dataseet: (recursive)') + print(ret) + cloud_data_url = args_opt.data_url + local_root_dir = '/home/work/user-job-dir' + local_data_dir = os.path.join(local_root_dir, "data") + local_train_file_dir = os.path.join(local_data_dir, "SoftMask", "train.json") + local_ckpt_dir = os.path.join(local_data_dir, "SoftMask", args_opt.bert_ckpt) + local_vocab_dir = os.path.join(local_data_dir, "SoftMask", "bert-base-chinese-vocab.txt") + local_model_dir = os.path.join(local_root_dir, "model") + if mox.file.exists(local_data_dir) is False: + mox.file.make_dirs(local_data_dir) + if mox.file.exists(local_model_dir) is False: + mox.file.make_dirs(local_model_dir) + mox.file.copy_parallel(cloud_data_url, local_data_dir) + print(local_data_dir + ":") + ret = mox.file.list_directory(local_data_dir, recursive=True) + print(ret) + else: + local_ckpt_dir = './weight/' + args_opt.bert_ckpt + local_model_dir = './checkpoint' + local_data_dir = args_opt.data_url + local_train_file_dir = local_data_dir + # context setting + if args_opt.device_target != "Ascend": + raise Exception("Only support on Ascend currently.") + run_mode = context.GRAPH_MODE + if args_opt.pynative: + run_mode = context.PYNATIVE_MODE + args_opt.batch_size = 16 + device_id = args_opt.device_id + device_num = args_opt.rank_size + if args_opt.enable_modelarts: + device_id = int(os.environ["DEVICE_ID"]) + init() + device_num = get_group_size() + context.set_context(mode=run_mode, device_target="Ascend", device_id=device_id) + if args_opt.enable_modelarts: + context.set_auto_parallel_context(device_num=device_num, \ + gradients_mean=True, parallel_mode=context.ParallelMode.DATA_PARALLEL) + netwithloss = SoftMaskedBertCLS(args_opt.batch_size, is_training=True, load_checkpoint_path=local_ckpt_dir) + netwithloss.set_train(True) + if args_opt.enable_modelarts: + tokenizer = CscTokenizer(fp=local_train_file_dir, device_num=device_num, rank_id=device_id, \ + max_seq_len=args_opt.max_seq_len, vocab_path=local_vocab_dir) + else: + tokenizer = CscTokenizer(fp=local_train_file_dir, device_num=device_num, rank_id=device_id, \ + max_seq_len=args_opt.max_seq_len) + ds_train = tokenizer.get_token_ids(args_opt.batch_size) + do_train(ds_train, netwithloss, cfg=args_opt, save_ckpt_path=local_model_dir) +if __name__ == "__main__": + run_csc()