diff --git a/research/nlp/albert/infer_squad/convert/convert.sh b/research/nlp/albert/infer_squad/convert/convert.sh new file mode 100644 index 0000000000000000000000000000000000000000..3d2c9ea30d64efe840948995b49e46e1f854f34f --- /dev/null +++ b/research/nlp/albert/infer_squad/convert/convert.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +air_path=$1 +om_path=$2 + +echo "Input AIR file path: ${air_path}" +echo "Output OM file path: ${om_path}" + +atc --framework=1 --model="${air_path}" \ + --output="${om_path}" \ + --soc_version=Ascend310 \ + --op_select_implmode="high_precision" \ No newline at end of file diff --git a/research/nlp/albert/infer_squad/data/config/albert_base.pipeline b/research/nlp/albert/infer_squad/data/config/albert_base.pipeline new file mode 100644 index 0000000000000000000000000000000000000000..031c478c66e8e1d71731c19970803b368f128ed0 --- /dev/null +++ b/research/nlp/albert/infer_squad/data/config/albert_base.pipeline @@ -0,0 +1,46 @@ +{ + "im_albertbase": { + "stream_config": { + "deviceId": "0" + }, + "appsrc0": { + "props": { + "blocksize": "409600" + }, + "factory": "appsrc", + "next": "mxpi_tensorinfer0:0" + }, + "appsrc1": { + "props": { + "blocksize": "409600" + }, + "factory": "appsrc", + "next": "mxpi_tensorinfer0:1" + }, + "appsrc2": { + "props": { + "blocksize": "409600" + }, + "factory": "appsrc", + "next": "mxpi_tensorinfer0:2" + }, + "mxpi_tensorinfer0": { + "props": { + "dataSource": "appsrc0,appsrc1,appsrc2", + "modelPath": "../data/model/albert_squad.om" + }, + "factory": "mxpi_tensorinfer", + "next": "mxpi_dataserialize0" + }, + "mxpi_dataserialize0": { + "props": { + "outputDataKeys": "mxpi_tensorinfer0" + }, + "factory": "mxpi_dataserialize", + "next": "appsink0" + }, + "appsink0": { + "factory": "appsink" + } + } +} \ No newline at end of file diff --git a/research/nlp/albert/infer_squad/docker_start_infer.sh b/research/nlp/albert/infer_squad/docker_start_infer.sh new file mode 100644 index 0000000000000000000000000000000000000000..2678ff3f94b2b0be1bb20af554f3787f58b70aef --- /dev/null +++ b/research/nlp/albert/infer_squad/docker_start_infer.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
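
A note on the two files above, with a minimal sketch in Python. convert.sh takes the exported AIR model and an output prefix (bash convert.sh <air_path> <om_path>); atc writes <om_path>.om, which is the modelPath referenced by mxpi_tensorinfer0 in albert_base.pipeline. The three appsrc elements carry input_ids, input_mask and segment_ids in the order listed by dataSource, each a flat int32 record of the 384-token sequence length used throughout this patch. A small inspection sketch, assuming that layout (the file path is illustrative):

    import numpy as np

    max_seq_length = 384  # matches MAX_LENGTH in the mxbase sources below
    ids = np.fromfile("../data/input/00_data/squad_0.bin", dtype=np.int32)  # illustrative path
    assert ids.shape == (max_seq_length,)  # one flat [384] int32 vector per input file
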
+ +docker_image=$1 +model_dir=$2 + + +function show_help() { + echo "Usage: docker_start.sh docker_image model_dir data_dir" +} + +function param_check() { + if [ -z "${docker_image}" ]; then + echo "please input docker_image" + show_help + exit 1 + fi + + if [ -z "${model_dir}" ]; then + echo "please input model_dir" + show_help + exit 1 + fi +} + +param_check + +docker run -it -u root \ + --device=/dev/davinci0 \ + --device=/dev/davinci_manager \ + --device=/dev/devmm_svm \ + --device=/dev/hisi_hdc \ + -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \ + -v ${model_dir}:${model_dir} \ + ${docker_image} \ + /bin/bash diff --git a/research/nlp/albert/infer_squad/mxbase/CMakeLists.txt b/research/nlp/albert/infer_squad/mxbase/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..7dbbf45891611b96a637339c131019429cfa38ff --- /dev/null +++ b/research/nlp/albert/infer_squad/mxbase/CMakeLists.txt @@ -0,0 +1,51 @@ +cmake_minimum_required(VERSION 3.10.0) +project(albert) + +set(TARGET albert) + +add_definitions(-DENABLE_DVPP_INTERFACE) +add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) +add_definitions(-Dgoogle=mindxsdk_private) +add_compile_options(-std=c++11 -fPIE -fstack-protector-all -fPIC -Wall) +add_link_options(-Wl,-z,relro,-z,now,-z,noexecstack -s -pie) + +# Check environment variable +if(NOT DEFINED ENV{ASCEND_HOME}) + message(FATAL_ERROR "please define environment variable:ASCEND_HOME") +endif() +if(NOT DEFINED ENV{ASCEND_VERSION}) + message(WARNING "please define environment variable:ASCEND_VERSION") +endif() +if(NOT DEFINED ENV{ARCH_PATTERN}) + message(WARNING "please define environment variable:ARCH_PATTERN") +endif() +set(ACL_INC_DIR $ENV{ASCEND_HOME}/$ENV{ASCEND_VERSION}/$ENV{ARCH_PATTERN}/acllib/include) +set(ACL_LIB_DIR $ENV{ASCEND_HOME}/$ENV{ASCEND_VERSION}/$ENV{ARCH_PATTERN}/acllib/lib64) + +set(MXBASE_ROOT_DIR $ENV{MX_SDK_HOME}) +set(MXBASE_INC ${MXBASE_ROOT_DIR}/include) +set(MXBASE_LIB_DIR ${MXBASE_ROOT_DIR}/lib) +set(MXBASE_POST_LIB_DIR ${MXBASE_ROOT_DIR}/lib/modelpostprocessors) +set(MXBASE_POST_PROCESS_DIR ${MXBASE_ROOT_DIR}/include/MxBase/postprocess/include) +if(DEFINED ENV{MXSDK_OPENSOURCE_DIR}) + set(OPENSOURCE_DIR $ENV{MXSDK_OPENSOURCE_DIR}) +else() + set(OPENSOURCE_DIR ${MXBASE_ROOT_DIR}/opensource) +endif() + +include_directories(${ACL_INC_DIR}) +include_directories(${OPENSOURCE_DIR}/include) +include_directories(${OPENSOURCE_DIR}/include/opencv4) + +include_directories(${MXBASE_INC}) +include_directories(${MXBASE_POST_PROCESS_DIR}) + +link_directories(${ACL_LIB_DIR}) +link_directories(${OPENSOURCE_DIR}/lib) +link_directories(${MXBASE_LIB_DIR}) +link_directories(${MXBASE_POST_LIB_DIR}) + +add_executable(${TARGET} src/main.cpp src/AlbertBase.cpp) +target_link_libraries(${TARGET} glog cpprest mxbase opencv_world stdc++fs) + +install(TARGETS ${TARGET} RUNTIME DESTINATION ${PROJECT_SOURCE_DIR}/) diff --git a/research/nlp/albert/infer_squad/mxbase/build.sh b/research/nlp/albert/infer_squad/mxbase/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..f13d90333e5add793ad21cc6ae7ce2b246b330a6 --- /dev/null +++ b/research/nlp/albert/infer_squad/mxbase/build.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +path_cur=$(dirname $0) + +function check_env() +{ + # set ASCEND_VERSION to ascend-toolkit/latest when it was not specified by user + if [ ! "${ASCEND_VERSION}" ]; then + export ASCEND_VERSION=ascend-toolkit/latest + echo "Set ASCEND_VERSION to the default value: ${ASCEND_VERSION}" + else + echo "ASCEND_VERSION is set to ${ASCEND_VERSION} by user" + fi + + if [ ! "${ARCH_PATTERN}" ]; then + # set ARCH_PATTERN to ./ when it was not specified by user + export ARCH_PATTERN=./ + echo "ARCH_PATTERN is set to the default value: ${ARCH_PATTERN}" + else + echo "ARCH_PATTERN is set to ${ARCH_PATTERN} by user" + fi +} + +function build_albert() +{ + cd $path_cur + rm -rf build + mkdir -p build + cd build + cmake .. + make + ret=$? + if [ ${ret} -ne 0 ]; then + echo "Failed to build albert." + exit ${ret} + fi + make install +} + +check_env +build_albert \ No newline at end of file diff --git a/research/nlp/albert/infer_squad/mxbase/postprocess.py b/research/nlp/albert/infer_squad/mxbase/postprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..bd935304a43d2727d58bf460626035e6eda6eb99 --- /dev/null +++ b/research/nlp/albert/infer_squad/mxbase/postprocess.py @@ -0,0 +1,496 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +'''postprocess''' +import argparse +import collections +import glob +import json +import math +import os +import pickle +import re +import string +import sys +import numpy as np +sys.path.append("../utils") + + +class SquadExample: + """A single training/test example for simple sequence classification. + + For examples without an answer, the start and end position are -1. 
+ """ + + def __init__(self, + qas_id, + question_text, + paragraph_text, + orig_answer_text=None, + start_position=None, + end_position=None, + is_impossible=False): + self.qas_id = qas_id + self.question_text = question_text + self.paragraph_text = paragraph_text + self.orig_answer_text = orig_answer_text + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + + +_PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name + "PrelimPrediction", + ["feature_index", "start_index", "end_index", + "start_log_prob", "end_log_prob"]) + +_NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name + "NbestPrediction", ["text", "start_log_prob", "end_log_prob"]) + +RawResult = collections.namedtuple("RawResult", + ["unique_id", + "start_log_prob", + "end_log_prob"]) + + +def parse_args(): + """set and check parameters.""" + parser = argparse.ArgumentParser(description="albert process") + parser.add_argument("--data_dir", type=str, default="", + help="Dataset contain input_ids, input_mask, segment_ids, label_ids") + parser.add_argument("--eval_json_path", type=str, + default="", help="label ids to name") + parser.add_argument("--eval_data_file_path", type=str, default="", + help="Data path, it is better to use absolute path") + args_opt = parser.parse_args() + return args_opt + + +def f1_score(prediction, ground_truth): + """calculate f1 score""" + prediction_tokens = normalize_answer(prediction).split() + ground_truth_tokens = normalize_answer(ground_truth).split() + common = collections.Counter( + prediction_tokens) & collections.Counter(ground_truth_tokens) + num_same = sum(common.values()) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(prediction_tokens) + recall = 1.0 * num_same / len(ground_truth_tokens) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + +def post_process(dataset_file, all_predictions, output_metrics="output.json"): + """ + process the result of infer tensor to Visualization results. + Args: + args: param of config. + file_name: label file name. + infer_result: get logit from infer result + max_seq_length: sentence input length default is 128. 
+ """ + # print the infer result + with open(dataset_file) as ds: + print('==========') + dataset_json = json.load(ds) + dataset = dataset_json['data'] + print(dataset) + print('success') + re_json = evaluate(dataset, all_predictions) + print(json.dumps(re_json)) + with open(output_metrics, 'w') as wr: + wr.write(json.dumps(re_json)) + + +def normalize_answer(s): + """Lower text and remove punctuation, articles and extra whitespace.""" + def remove_articles(text): + return re.sub(r'\b(a|an|the)\b', ' ', text) + + def white_space_fix(text): + return ' '.join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return ''.join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def exact_match_score(prediction, ground_truth): + return normalize_answer(prediction) == normalize_answer(ground_truth) + + +def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): + scores_for_ground_truths = [] + for ground_truth in ground_truths: + score = metric_fn(prediction, ground_truth) + scores_for_ground_truths.append(score) + return max(scores_for_ground_truths) + + +def evaluate(dataset, predictions): + """do evaluation""" + f1 = exact_match = total = 0 + for article in dataset: + for paragraph in article['paragraphs']: + for qa in paragraph['qas']: + total += 1 + if qa['id'] not in predictions: + message = 'Unanswered question ' + qa['id'] + \ + ' will receive score 0.' + print(message, file=sys.stderr) + continue + ground_truths = list(map(lambda x: x['text'], qa['answers'])) + if not ground_truths: + continue + prediction = predictions[qa['id']] + exact_match += metric_max_over_ground_truths( + exact_match_score, prediction, ground_truths) + f1 += metric_max_over_ground_truths( + f1_score, prediction, ground_truths) + + exact_match = 100.0 * exact_match / total + f1 = 100.0 * f1 / total + print(exact_match) + print(f1) + return {'exact_match': exact_match, 'f1': f1} + + +def get_infer_logits(args, file_name): + """ + get the result of model output. + Args: + infer_result: get logit from infer result + max_seq_length: sentence input length default is 384. 
+ """ + infer_logits_path = os.path.realpath( + os.path.join(args.data_dir, "11_data", file_name)) + data_0 = [] + data_1 = [] + with open(infer_logits_path, "r") as f: + for line in f: + data_0.append(float(line.strip('\n'))) + + for i in range(384): + data_1.append([data_0[i], data_0[384 + i]]) + res = np.array(data_1) + start_logits = [float(x) for x in res[:, 0].flat] + end_logits = [float(x) for x in res[:, 1].flat] + + return start_logits, end_logits + + +def _compute_softmax(scores): + """Compute softmax probability over raw logits.""" + if not scores: + return [] + + max_score = None + for score in scores: + if max_score is None or score > max_score: + max_score = score + + exp_scores = [] + total_sum = 0.0 + for score in scores: + x = math.exp(score - max_score) + exp_scores.append(x) + total_sum += x + + probs = [] + for score in exp_scores: + probs.append(score / total_sum) + return probs + + +def write_predictions_v1(result_dict, all_examples, all_features, + all_results, n_best_size, max_answer_length): + """Write final predictions to the json file and log-odds of null if needed.""" + + example_index_to_features = collections.defaultdict(list) + for feature in all_features: + example_index_to_features[feature.example_index].append(feature) + + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = result + + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + + for (example_index, example) in enumerate(all_examples): + features = example_index_to_features[example_index] + + prelim_predictions = [] + # keep track of the minimum score of null start+end of position 0 + for (feature_index, feature) in enumerate(features): + for ((start_idx, end_idx), logprobs) in \ + result_dict[example_index][feature.unique_id].items(): + start_log_prob = 0 + end_log_prob = 0 + for logprob in logprobs: + start_log_prob += logprob[0] + end_log_prob += logprob[1] + prelim_predictions.append( + _PrelimPrediction( + feature_index=feature_index, + start_index=start_idx, + end_index=end_idx, + start_log_prob=start_log_prob / len(logprobs), + end_log_prob=end_log_prob / len(logprobs))) + + prelim_predictions = sorted( + prelim_predictions, + key=lambda x: (x.start_log_prob + x.end_log_prob), + reverse=True) + + seen_predictions = {} + nbest = [] + for pred in prelim_predictions: + if len(nbest) >= n_best_size: + break + feature = features[pred.feature_index] + if pred.start_index >= 0: # this is a non-null prediction + tok_start_to_orig_index = feature.tok_start_to_orig_index + tok_end_to_orig_index = feature.tok_end_to_orig_index + start_orig_pos = tok_start_to_orig_index[pred.start_index] + end_orig_pos = tok_end_to_orig_index[pred.end_index] + + paragraph_text = example.paragraph_text + final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip() + if final_text in seen_predictions: + continue + + seen_predictions[final_text] = True + else: + final_text = "" + seen_predictions[final_text] = True + + nbest.append( + _NbestPrediction( + text=final_text, + start_log_prob=pred.start_log_prob, + end_log_prob=pred.end_log_prob)) + + # In very rare edge cases we could have no valid predictions. So we + # just create a nonce prediction in this case to avoid failure. 
+ if not nbest: + nbest.append( + _NbestPrediction(text="empty", start_log_prob=0.0, end_log_prob=0.0)) + + assert len(nbest) >= 1 + + total_scores = [] + best_non_null_entry = None + for entry in nbest: + total_scores.append(entry.start_log_prob + entry.end_log_prob) + if not best_non_null_entry: + if entry.text: + best_non_null_entry = entry + + probs = _compute_softmax(total_scores) + + nbest_json = [] + for (i, entry) in enumerate(nbest): + output = collections.OrderedDict() + output["text"] = entry.text + output["probability"] = probs[i] + output["start_log_prob"] = entry.start_log_prob + output["end_log_prob"] = entry.end_log_prob + nbest_json.append(output) + + assert len(nbest_json) >= 1 + + all_predictions[example.qas_id] = nbest_json[0]["text"] + all_nbest_json[example.qas_id] = nbest_json + + return all_predictions, all_nbest_json + + +def _get_best_indexes(logits, n_best_size): + """Get the n-best logits from a list.""" + index_and_score = sorted( + enumerate(logits), key=lambda x: x[1], reverse=True) + + best_indexes = [] + for i in range(len(index_and_score)): + if i >= n_best_size: + break + best_indexes.append(index_and_score[i][0]) + return best_indexes + + +def accumulate_predictions_v1(result_dict, all_examples, all_features, + all_results, n_best_size, max_answer_length): + """accumulate predictions for each positions in a dictionary.""" + example_index_to_features = collections.defaultdict(list) + for feature in all_features: + example_index_to_features[feature.example_index].append(feature) + + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = result + + for (example_index, _) in enumerate(all_examples): + if example_index not in result_dict: + result_dict[example_index] = {} + features = example_index_to_features[example_index] + + for (_, feature) in enumerate(features): + if feature.unique_id not in result_dict[example_index]: + result_dict[example_index][feature.unique_id] = {} + result = unique_id_to_result[feature.unique_id] + start_indexes = _get_best_indexes( + result.start_log_prob, n_best_size) + end_indexes = _get_best_indexes(result.end_log_prob, n_best_size) + for start_index in start_indexes: + for end_index in end_indexes: + doc_offset = feature.tokens.index("[SEP]") + 1 + # We could hypothetically create invalid predictions, e.g., predict + # that the start of the span is in the question. We throw out all + # invalid predictions. 
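+                    # doc_offset is the position of the first paragraph token (just past
+                    # the first [SEP], which closes the question in the
+                    # [CLS] question [SEP] paragraph packing), so start_index - doc_offset
+                    # and end_index - doc_offset are paragraph-relative indices into
+                    # tok_start_to_orig_index / tok_end_to_orig_index. The checks below
+                    # drop spans that fall outside the paragraph, start outside the
+                    # feature's max-context window, run backwards, or exceed
+                    # max_answer_length.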
+ if start_index - doc_offset >= len(feature.tok_start_to_orig_index): + continue + if end_index - doc_offset >= len(feature.tok_end_to_orig_index): + continue + if not feature.token_is_max_context.get(start_index, False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > max_answer_length: + continue + start_log_prob = result.start_log_prob[start_index] + end_log_prob = result.end_log_prob[end_index] + start_idx = start_index - doc_offset + end_idx = end_index - doc_offset + if (start_idx, end_idx) not in result_dict[example_index][feature.unique_id]: + result_dict[example_index][feature.unique_id][( + start_idx, end_idx)] = [] + result_dict[example_index][feature.unique_id][(start_idx, end_idx)].append( + (start_log_prob, end_log_prob)) + return result_dict + + +def get_result(result, eval_examples, eval_features): + """Evaluate the checkpoint on SQuAD 1.0.""" + + result_dict = {} + accumulate_predictions_v1( + result_dict, eval_examples, eval_features, + result, 20, 30) + all_predictions, all_nbest_json = write_predictions_v1( + result_dict, eval_examples, eval_features, result, 20, 30) + return all_predictions, all_nbest_json + + +def read_squad_examples(input_file, is_training): + """Read a SQuAD json file into a list of SquadExample.""" + with open(input_file, "r") as reader: + input_data = json.load(reader)["data"] + + examples = [] + for entry in input_data: + for paragraph in entry["paragraphs"]: + paragraph_text = paragraph["context"] + + for qa in paragraph["qas"]: + qas_id = qa["id"] + question_text = qa["question"] + start_position = None + orig_answer_text = None + is_impossible = False + + if is_training: + is_impossible = qa.get("is_impossible", False) + if (len(qa["answers"]) != 1) and (not is_impossible): + raise ValueError( + "For training, each question should have exactly 1 answer.") + if not is_impossible: + answer = qa["answers"][0] + orig_answer_text = answer["text"] + start_position = answer["answer_start"] + else: + start_position = -1 + orig_answer_text = "" + + example = SquadExample( + qas_id=qas_id, + question_text=question_text, + paragraph_text=paragraph_text, + orig_answer_text=orig_answer_text, + start_position=start_position, + is_impossible=is_impossible) + examples.append(example) + + return examples + + +def run(): + """ + read pipeline and do infer + """ + args = parse_args() + # input_ids file list, every file content a tensor[1,128] + file_list = glob.glob(os.path.join( + os.path.realpath(args.data_dir), "11_data", "*.txt")) + cwq_lists = [] + for i in range(len(file_list)): + b = os.path.split(file_list[i]) + cwq_lists.append(b) + + def take_second(elem): + return elem[1] + + cwq_lists.sort(key=take_second) + yms_lists = [] + for i in range(len(cwq_lists)): + c = cwq_lists[i][0] + '/' + cwq_lists[i][1] + yms_lists.append(c) + + eval_examples = read_squad_examples(args.eval_json_path, False) + with open(args.eval_data_file_path, "rb") as fin: + eval_features = pickle.load(fin) + file_list = yms_lists + all_predictions = collections.OrderedDict() + outputs = [] + for input_ids in file_list: + file_name = input_ids.split('/')[-1].split('.')[0] + '.bin' + start_log_prob, end_log_prob = get_infer_logits( + args, input_ids.split('/')[-1]) + unique_id_name = os.path.join(args.data_dir, "03_data", file_name) + unique_id = np.fromfile(unique_id_name, np.int32) + unique_id = int(unique_id[0]) + outputs.append(RawResult( + unique_id=unique_id, + start_log_prob=start_log_prob, + end_log_prob=end_log_prob)) 
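+    # Each file under 11_data is written by the mxbase PostProcess step and holds
+    # 2 * 384 plain-text floats: the first 384 lines are start logits and the next
+    # 384 lines are end logits. get_infer_logits above re-pairs them position by
+    # position, and the loop above wraps them, together with the unique_id read
+    # from 03_data, into RawResult tuples for get_result().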
+ + all_predictions, _ = get_result(outputs, eval_examples, eval_features) + js = json.dumps(all_predictions) + file = open('infer_result.txt', 'w') + file.write(js) + file.close() + post_process(args.eval_json_path, all_predictions, + output_metrics="output.json") + + +if __name__ == '__main__': + run() diff --git a/research/nlp/albert/infer_squad/mxbase/run.sh b/research/nlp/albert/infer_squad/mxbase/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..aa1830f7a297fdef6010cfb9d8953238e42baa7c --- /dev/null +++ b/research/nlp/albert/infer_squad/mxbase/run.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +data_dir=$1 +eval_json_path=$2 +eval_data_file_path=$3 + +set -e + +# Simple log helper functions +info() { echo -e "\033[1;34m[INFO ][MxStream] $1\033[1;37m" ; } +warn() { echo >&2 -e "\033[1;31m[WARN ][MxStream] $1\033[1;37m" ; } + +export LD_LIBRARY_PATH=${MX_SDK_HOME}/lib:${MX_SDK_HOME}/opensource/lib:${MX_SDK_HOME}/opensource/lib64:/usr/local/Ascend/ascend-toolkit/latest/acllib/lib64:${LD_LIBRARY_PATH} +export GST_PLUGIN_SCANNER=${MX_SDK_HOME}/opensource/libexec/gstreamer-1.0/gst-plugin-scanner +export GST_PLUGIN_PATH=${MX_SDK_HOME}/opensource/lib/gstreamer-1.0:${MX_SDK_HOME}/lib/plugins + +#to set PYTHONPATH, import the StreamManagerApi.py +export PYTHONPATH=$PYTHONPATH:${MX_SDK_HOME}/python + +./albert ../data/input/ 0 +python3 postprocess.py --data_dir=$data_dir --eval_json_path=$eval_json_path --eval_data_file_path=$eval_data_file_path +exit 0 diff --git a/research/nlp/albert/infer_squad/mxbase/src/AlbertBase.cpp b/research/nlp/albert/infer_squad/mxbase/src/AlbertBase.cpp new file mode 100644 index 0000000000000000000000000000000000000000..71e5c88a3f0918ecd6eec624df959372ad8a5fad --- /dev/null +++ b/research/nlp/albert/infer_squad/mxbase/src/AlbertBase.cpp @@ -0,0 +1,304 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "AlbertBase.h" +#include <unistd.h> +#include <sys/stat.h> +#include <map> +#include <fstream> +#include "MxBase/DeviceManager/DeviceManager.h" +#include "MxBase/Log/Log.h" + +const uint32_t EACH_LABEL_LENGTH = 4; +const uint32_t MAX_LENGTH = 384; +const uint32_t CLASS_NUM = 41; + + +APP_ERROR AlbertBase::Init(const InitParam &initParam) { + deviceId_ = initParam.deviceId; + APP_ERROR ret = MxBase::DeviceManager::GetInstance()->InitDevices(); + if (ret != APP_ERR_OK) { + LogError << "Init devices failed, ret=" << ret << "."; + return ret; + } + ret = MxBase::TensorContext::GetInstance()->SetContext(initParam.deviceId); + if (ret != APP_ERR_OK) { + LogError << "Set context failed, ret=" << ret << "."; + return ret; + } + dvppWrapper_ = std::make_shared<MxBase::DvppWrapper>(); + ret = dvppWrapper_->Init(); + if (ret != APP_ERR_OK) { + LogError << "DvppWrapper init failed, ret=" << ret << "."; + return ret; + } + model_ = std::make_shared<MxBase::ModelInferenceProcessor>(); + ret = model_->Init(initParam.modelPath, modelDesc_); + if (ret != APP_ERR_OK) { + LogError << "ModelInferenceProcessor init failed, ret=" << ret << "."; + return ret; + } + + return APP_ERR_OK; +} + +APP_ERROR AlbertBase::DeInit() { + dvppWrapper_->DeInit(); + model_->DeInit(); + MxBase::DeviceManager::GetInstance()->DestroyDevices(); + return APP_ERR_OK; +} + +APP_ERROR AlbertBase::ReadTensorFromFile(const std::string &file, uint32_t *data, uint32_t size) { + if (data == NULL || size < MAX_LENGTH) { + LogError << "input data is invalid."; + return APP_ERR_COMM_INVALID_POINTER; + } + std::ifstream infile; + // open label file + infile.open(file, std::ios_base::in | std::ios_base::binary); + // check label file validity + if (infile.fail()) { + LogError << "Failed to open label file: " << file << "."; + return APP_ERR_COMM_OPEN_FAIL; + } + infile.read(reinterpret_cast<char*>(data), sizeof(uint32_t) * MAX_LENGTH); + infile.close(); + return APP_ERR_OK; +} + +APP_ERROR AlbertBase::ReadInputTensor(const std::string &fileName, uint32_t index, + std::vector<MxBase::TensorBase> *inputs) { + uint32_t data[MAX_LENGTH] = {0}; + APP_ERROR ret = ReadTensorFromFile(fileName, data, MAX_LENGTH); + if (ret != APP_ERR_OK) { + LogError << "ReadTensorFromFile failed."; + return ret; + } + + const uint32_t dataSize = modelDesc_.inputTensors[index].tensorSize; + MxBase::MemoryData memoryDataDst(dataSize, MxBase::MemoryData::MEMORY_DEVICE, deviceId_); + MxBase::MemoryData memoryDataSrc(reinterpret_cast<void*>(data), dataSize, MxBase::MemoryData::MEMORY_HOST_MALLOC); + ret = MxBase::MemoryHelper::MxbsMallocAndCopy(memoryDataDst, memoryDataSrc); + if (ret != APP_ERR_OK) { + LogError << GetError(ret) << "Memory malloc and copy failed."; + return ret; + } + + std::vector<uint32_t> shape = {1, MAX_LENGTH}; + inputs->push_back(MxBase::TensorBase(memoryDataDst, false, shape, MxBase::TENSOR_DTYPE_UINT32)); + return APP_ERR_OK; +} + +APP_ERROR AlbertBase::Inference(const std::vector<MxBase::TensorBase> &inputs, + std::vector<MxBase::TensorBase> *outputs) { + auto dtypes = model_->GetOutputDataType(); + for (size_t i = 0; i < modelDesc_.outputTensors.size(); ++i) { + std::vector<uint32_t> shape = {}; + for (size_t j = 0; j < modelDesc_.outputTensors[i].tensorDims.size(); ++j) { + shape.push_back((uint32_t)modelDesc_.outputTensors[i].tensorDims[j]); + } + MxBase::TensorBase tensor(shape, dtypes[i], MxBase::MemoryData::MemoryType::MEMORY_DEVICE, deviceId_); + APP_ERROR ret = MxBase::TensorBase::TensorBaseMalloc(tensor); + if (ret != 
APP_ERR_OK) { + LogError << "TensorBaseMalloc failed, ret=" << ret << "."; + return ret; + } + outputs->push_back(tensor); + } + + MxBase::DynamicInfo dynamicInfo = {}; + dynamicInfo.dynamicType = MxBase::DynamicType::STATIC_BATCH; + auto startTime = std::chrono::high_resolution_clock::now(); + APP_ERROR ret = model_->ModelInference(inputs, *outputs, dynamicInfo); + auto endTime = std::chrono::high_resolution_clock::now(); + double costMs = std::chrono::duration<double, std::milli>(endTime - startTime).count(); + g_inferCost.push_back(costMs); + if (ret != APP_ERR_OK) { + LogError << "ModelInference failed, ret=" << ret << "."; + return ret; + } + return APP_ERR_OK; +} + +APP_ERROR AlbertBase::PostProcess(std::vector<MxBase::TensorBase> *outputs, std::vector<uint32_t> *argmax, \ + const std::string &fileName) { + MxBase::TensorBase &tensor = outputs->at(0); + APP_ERROR ret = tensor.ToHost(); + if (ret != APP_ERR_OK) { + LogError << GetError(ret) << "Tensor deploy to host failed."; + return ret; + } + // check tensor is available + auto outputShape = tensor.GetShape(); + uint32_t length = outputShape[1]; + uint32_t classNum = outputShape[2]; + LogInfo << "output shape is: " << outputShape[1] << " " << outputShape[2] << std::endl; + + void* data = tensor.GetBuffer(); + std::vector<float> result_start = {}; + std::vector<float> result_end = {}; + for (uint32_t i = 0; i < length; i++) { + for (uint32_t j = 0; j < classNum; j++) { + if (j == 0) { + float value_start = *(reinterpret_cast<float*>(data) + i * classNum + j); + result_start.push_back(value_start); + } else if (j == 1) { + float value_end = *(reinterpret_cast<float*>(data) + i * classNum + j); + result_end.push_back(value_end); + } + } + } + int result_start_size = result_start.size(); + + std::string resultPathName = "../data/input/11_data"; + // create result directory when it does not exit + if (access(resultPathName.c_str(), 0) != 0) { + ret = mkdir(resultPathName.c_str(), S_IRUSR | S_IWUSR | S_IXUSR); + if (ret != 0) { + LogError << "Failed to create result directory: " << resultPathName << ", ret = " << ret; + return APP_ERR_COMM_OPEN_FAIL; + } + } + // create result file under result directory + std::string nobin_fileName = fileName.substr(0, fileName.length() - 4); + resultPathName = resultPathName + "/" + nobin_fileName + ".txt"; + std::ofstream tfile(resultPathName, std::ofstream::app); + if (tfile.fail()) { + LogError << "Failed to open result file: " << resultPathName; + return APP_ERR_COMM_OPEN_FAIL; + } + // write inference result into file + LogInfo << "=============================================================="; + for (int t = 0; t < result_start_size; t++) { + tfile << result_start[t] << std::endl; + } + for (int j = 0; j < result_start_size; j++) { + tfile << result_end[j] << std::endl; + } + + LogInfo << "=============================================================="; + tfile.close(); + + return APP_ERR_OK; +} + +APP_ERROR AlbertBase::CountPredictResult(const std::string &labelFile, const std::vector<uint32_t> &argmax) { + uint32_t data[MAX_LENGTH] = {0}; + APP_ERROR ret = ReadTensorFromFile(labelFile, data, MAX_LENGTH); + if (ret != APP_ERR_OK) { + LogError << "ReadTensorFromFile failed."; + return ret; + } + uint32_t target[CLASS_NUM][MAX_LENGTH] = {0}; + uint32_t pred[CLASS_NUM][MAX_LENGTH] = {0}; + for (uint32_t i = 0; i < MAX_LENGTH; i++) { + if (data[i] > 0) { + target[data[i]][i] = 1; + } + if (argmax[i] > 0) { + pred[argmax[i]][i] = 1; + } + } + for (uint32_t i = 0; i < CLASS_NUM; i++) { + for 
(uint32_t j = 0; j < MAX_LENGTH; j++) { + // count True Positive and False Positive + if (pred[i][j] == 1) { + if (target[i][j] == 1) { + g_TP += 1; + } else { + g_FP += 1; + } + } + // count False Negative + if (target[i][j] == 1 && pred[i][j] != 1) { + g_FN += 1; + } + } + } + LogInfo << "TP: " << g_TP << ", FP: " << g_FP << ", FN: " << g_FN; + return APP_ERR_OK; +} + +APP_ERROR AlbertBase::WriteResult(const std::string &fileName, const std::vector<uint32_t> &argmax) { + std::string resultPathName = "result"; + // create result directory when it does not exit + if (access(resultPathName.c_str(), 0) != 0) { + int ret = mkdir(resultPathName.c_str(), S_IRUSR | S_IWUSR | S_IXUSR); + if (ret != 0) { + LogError << "Failed to create result directory: " << resultPathName << ", ret = " << ret; + return APP_ERR_COMM_OPEN_FAIL; + } + } + // create result file under result directory + resultPathName = resultPathName + "/result.txt"; + std::ofstream tfile(resultPathName, std::ofstream::app); + if (tfile.fail()) { + LogError << "Failed to open result file: " << resultPathName; + return APP_ERR_COMM_OPEN_FAIL; + } + // write inference result into file + LogInfo << "=============================================================="; + LogInfo << "infer result of " << fileName << " is: "; + tfile << "file name is: " << fileName << std::endl; + LogInfo << "=============================================================="; + tfile.close(); + return APP_ERR_OK; +} + +APP_ERROR AlbertBase::Process(const std::string &inferPath, const std::string &fileName, bool eval) { + std::vector<MxBase::TensorBase> inputs = {}; + std::string inputIdsFile = inferPath + "00_data/" + fileName; + APP_ERROR ret = ReadInputTensor(inputIdsFile, INPUT_IDS, &inputs); + if (ret != APP_ERR_OK) { + LogError << "Read input ids failed, ret=" << ret << "."; + return ret; + } + std::string inputMaskFile = inferPath + "01_data/" + fileName; + ret = ReadInputTensor(inputMaskFile, INPUT_MASK, &inputs); + if (ret != APP_ERR_OK) { + LogError << "Read input mask file failed, ret=" << ret << "."; + return ret; + } + std::string tokenTypeIdFile = inferPath + "02_data/" + fileName; + ret = ReadInputTensor(tokenTypeIdFile, TOKEN_TYPE, &inputs); + if (ret != APP_ERR_OK) { + LogError << "Read token typeId file failed, ret=" << ret << "."; + return ret; + } + + std::vector<MxBase::TensorBase> outputs = {}; + ret = Inference(inputs, &outputs); + if (ret != APP_ERR_OK) { + LogError << "Inference failed, ret=" << ret << "."; + return ret; + } + + std::vector<uint32_t> argmax; + ret = PostProcess(&outputs, &argmax, fileName); + if (ret != APP_ERR_OK) { + LogError << "PostProcess failed, ret=" << ret << "."; + return ret; + } + if (eval) { + std::string labelFile = inferPath + "03_data/" + fileName; + ret = CountPredictResult(labelFile, argmax); + if (ret != APP_ERR_OK) { + LogError << "CalcF1Score read label failed, ret=" << ret << "."; + return ret; + } + } + return APP_ERR_OK; +} diff --git a/research/nlp/albert/infer_squad/mxbase/src/AlbertBase.h b/research/nlp/albert/infer_squad/mxbase/src/AlbertBase.h new file mode 100644 index 0000000000000000000000000000000000000000..4aa4f0bed0e1cd88ee8d48eb277343620ed93b7e --- /dev/null +++ b/research/nlp/albert/infer_squad/mxbase/src/AlbertBase.h @@ -0,0 +1,67 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MXBASE_AlbertBase_H +#define MXBASE_AlbertBase_H + +#include <memory> +#include <utility> +#include <vector> +#include <string> +#include <map> +#include <opencv2/opencv.hpp> +#include "MxBase/DvppWrapper/DvppWrapper.h" +#include "MxBase/ModelInfer/ModelInferenceProcessor.h" +#include "MxBase/Tensor/TensorContext/TensorContext.h" + +extern std::vector<double> g_inferCost; +extern uint32_t g_TP; +extern uint32_t g_FP; +extern uint32_t g_FN; + +struct InitParam { + uint32_t deviceId; + std::string modelPath; +}; + +enum DataIndex { + INPUT_IDS = 0, + INPUT_MASK = 1, + TOKEN_TYPE = 2, +}; + +class AlbertBase { + public: + APP_ERROR Init(const InitParam &initParam); + APP_ERROR DeInit(); + APP_ERROR Inference(const std::vector<MxBase::TensorBase> &inputs, std::vector<MxBase::TensorBase> *outputs); + APP_ERROR Process(const std::string &inferPath, const std::string &fileName, bool eval); + APP_ERROR PostProcess(std::vector<MxBase::TensorBase> *outputs, \ + std::vector<uint32_t> *argmax, const std::string &fileName); + protected: + APP_ERROR ReadTensorFromFile(const std::string &file, uint32_t *data, uint32_t size); + APP_ERROR ReadInputTensor(const std::string &fileName, uint32_t index, std::vector<MxBase::TensorBase> *inputs); + APP_ERROR LoadLabels(const std::string &labelPath, std::vector<std::string> *labelMap); + APP_ERROR ReadInputTensor(const std::string &fileName, const std::vector<uint32_t> &argmax); + APP_ERROR WriteResult(const std::string &fileName, const std::vector<uint32_t> &argmax); + APP_ERROR CountPredictResult(const std::string &labelFile, const std::vector<uint32_t> &argmax); + private: + std::shared_ptr<MxBase::DvppWrapper> dvppWrapper_; + std::shared_ptr<MxBase::ModelInferenceProcessor> model_; + MxBase::ModelDesc modelDesc_ = {}; + uint32_t deviceId_ = 0; +}; +#endif diff --git a/research/nlp/albert/infer_squad/mxbase/src/main.cpp b/research/nlp/albert/infer_squad/mxbase/src/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c652e72969cb20d02d2b41526b62288db697b36c --- /dev/null +++ b/research/nlp/albert/infer_squad/mxbase/src/main.cpp @@ -0,0 +1,107 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <unistd.h> +#include <dirent.h> +#include <iostream> +#include <fstream> +#include <vector> +#include "AlbertBase.h" +#include "MxBase/Log/Log.h" + +std::vector<double> g_inferCost; +uint32_t g_TP = 0; +uint32_t g_FP = 0; +uint32_t g_FN = 0; + +void InitAlbertParam(InitParam* initParam) { + initParam->deviceId = 0; + initParam->modelPath = "../data/model/albert_squad.om"; +} + +APP_ERROR ReadFilesFromPath(const std::string &path, std::vector<std::string> *files) { + DIR *dir = NULL; + struct dirent *ptr = NULL; + + if ((dir = opendir(path.c_str())) == NULL) { + LogError << "Open dir error: " << path; + return APP_ERR_COMM_OPEN_FAIL; + } + + while ((ptr = readdir(dir)) != NULL) { + // d_type == 8 is file + if (ptr->d_type == 8) { + files->push_back(ptr->d_name); + } + } + closedir(dir); + // sort ascending order + sort(files->begin(), files->end()); + return APP_ERR_OK; +} + +int main(int argc, char* argv[]) { + if (argc <= 1) { + LogWarn << "Please input image path, such as './albert /input/data 0'."; + return APP_ERR_OK; + } + + InitParam initParam; + InitAlbertParam(&initParam); + auto albertBase = std::make_shared<AlbertBase>(); + APP_ERROR ret = albertBase->Init(initParam); + if (ret != APP_ERR_OK) { + LogError << "AlbertBase init failed, ret=" << ret << "."; + return ret; + } + + std::string inferPath = argv[1]; + std::vector<std::string> files; + ret = ReadFilesFromPath(inferPath + "00_data", &files); + if (ret != APP_ERR_OK) { + LogError << "Read files from path failed, ret=" << ret << "."; + return ret; + } + // do eval and calc the f1 score + bool eval = atoi(argv[2]); + for (uint32_t i = 0; i < files.size(); i++) { + LogInfo << "read file name: " << files[i]; + ret = albertBase->Process(inferPath, files[i], eval); + if (ret != APP_ERR_OK) { + LogError << "AlbertBase process failed, ret=" << ret << "."; + albertBase->DeInit(); + return ret; + } + } + + if (eval) { + LogInfo << "=============================================================="; + float precision = g_TP * 1.0 / (g_TP + g_FP); + LogInfo << "Precision: " << precision; + float recall = g_TP * 1.0 / (g_TP + g_FN); + LogInfo << "recall: " << recall; + LogInfo << "F1 Score: " << 2 * precision * recall / (precision + recall); + LogInfo << "=============================================================="; + } + albertBase->DeInit(); + double costSum = 0; + for (uint32_t i = 0; i < g_inferCost.size(); i++) { + costSum += g_inferCost[i]; + } + LogInfo << "Infer question sum " << g_inferCost.size() << ", cost total time: " << costSum << " ms."; + LogInfo << "The throughput: " << g_inferCost.size() * 1000 / costSum << " bin/sec."; + return APP_ERR_OK; +} diff --git a/research/nlp/albert/infer_squad/sdk/main.py b/research/nlp/albert/infer_squad/sdk/main.py new file mode 100644 index 0000000000000000000000000000000000000000..be076092cc8d8fe26250b60d4d2eb65eb7050c52 --- /dev/null +++ b/research/nlp/albert/infer_squad/sdk/main.py @@ -0,0 +1,617 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
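
A quick note on the eval branch in main.cpp above: precision, recall and F1 are derived from the global TP/FP/FN counters filled in by CountPredictResult. The same arithmetic as a Python sketch, with illustrative counts:

    tp, fp, fn = 90, 10, 20                               # illustrative counts
    precision = tp / (tp + fp)                            # 0.90
    recall = tp / (tp + fn)                               # ~0.82
    f1 = 2 * precision * recall / (precision + recall)    # ~0.86
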
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +""" +sample script of CLUE infer using SDK run in docker +""" +import string +import re +import json +import sys +import math +import pickle +import collections + +import argparse +import glob +import os +import time + +import MxpiDataType_pb2 as MxpiDataType +import numpy as np +from StreamManagerApi import StreamManagerApi, MxDataInput, InProtobufVector, \ + MxProtobufIn, StringVector + +sys.path.append("../utils") + + +class SquadExample: + """A single training/test example for simple sequence classification. + + For examples without an answer, the start and end position are -1. + """ + + def __init__(self, + qas_id, + question_text, + paragraph_text, + orig_answer_text=None, + start_position=None, + end_position=None, + is_impossible=False): + self.qas_id = qas_id + self.question_text = question_text + self.paragraph_text = paragraph_text + self.orig_answer_text = orig_answer_text + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + + +TP = 0 +FP = 0 +FN = 0 + + +def parse_args(): + """set and check parameters.""" + parser = argparse.ArgumentParser(description="albert process") + parser.add_argument("--pipeline", type=str, default="", + help="SDK infer pipeline") + parser.add_argument("--data_dir", type=str, default="", + help="Dataset contain input_ids, input_mask, segment_ids, label_ids") + parser.add_argument("--eval_json_path", type=str, + default="", help="label ids to name") + parser.add_argument("--eval_data_file_path", type=str, default="", + help="Data path, it is better to use absolute path") + args_opt = parser.parse_args() + return args_opt + + +def send_source_data(appsrc_id, filename, stream_name, stream_manager): + """ + Construct the input of the stream, + send inputs data to a specified stream based on streamName. + + Returns: + bool: send data success or not + """ + tensor = np.fromfile(filename, dtype=np.int32) + tensor = np.expand_dims(tensor, 0) + tensor_package_list = MxpiDataType.MxpiTensorPackageList() + tensor_package = tensor_package_list.tensorPackageVec.add() + array_bytes = tensor.tobytes() + data_input = MxDataInput() + data_input.data = array_bytes + tensor_vec = tensor_package.tensorVec.add() + tensor_vec.deviceId = 0 + tensor_vec.memType = 0 + for i in tensor.shape: + tensor_vec.tensorShape.append(i) + tensor_vec.dataStr = data_input.data + tensor_vec.tensorDataSize = len(array_bytes) + + key = "appsrc{}".format(appsrc_id).encode('utf-8') + protobuf_vec = InProtobufVector() + protobuf = MxProtobufIn() + protobuf.key = key + protobuf.type = b'MxTools.MxpiTensorPackageList' + protobuf.protobuf = tensor_package_list.SerializeToString() + protobuf_vec.push_back(protobuf) + + ret = stream_manager.SendProtobuf(stream_name, appsrc_id, protobuf_vec) + if ret < 0: + print("Failed to send data to stream.") + return False + return True + + +def send_appsrc_data(args, file_name, stream_name, stream_manager): + """ + send three stream to infer model, include input ids, input mask and token type_id. 
+ + Returns: + bool: send data success or not + """ + input_ids = os.path.realpath(os.path.join( + args.data_dir, "00_data", file_name)) + if not send_source_data(0, input_ids, stream_name, stream_manager): + return False + input_mask = os.path.realpath(os.path.join( + args.data_dir, "01_data", file_name)) + if not send_source_data(1, input_mask, stream_name, stream_manager): + return False + token_type_id = os.path.realpath( + os.path.join(args.data_dir, "02_data", file_name)) + if not send_source_data(2, token_type_id, stream_name, stream_manager): + return False + return True + + +def f1_score(prediction, ground_truth): + """calculate f1 score""" + prediction_tokens = normalize_answer(prediction).split() + ground_truth_tokens = normalize_answer(ground_truth).split() + common = collections.Counter( + prediction_tokens) & collections.Counter(ground_truth_tokens) + num_same = sum(common.values()) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(prediction_tokens) + recall = 1.0 * num_same / len(ground_truth_tokens) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + +def post_process(dataset_file, all_predictions, output_metrics="output.json"): + """ + process the result of infer tensor to Visualization results. + Args: + args: param of config. + file_name: label file name. + infer_result: get logit from infer result + max_seq_length: sentence input length default is 128. + """ + # print the infer result + with open(dataset_file) as ds: + print('==========') + dataset_json = json.load(ds) + dataset = dataset_json['data'] + # print(dataset) + print('success') + re_json = evaluate(dataset, all_predictions) + print(json.dumps(re_json)) + with open(output_metrics, 'w') as wr: + wr.write(json.dumps(re_json)) + + +def normalize_answer(s): + """Lower text and remove punctuation, articles and extra whitespace.""" + def remove_articles(text): + return re.sub(r'\b(a|an|the)\b', ' ', text) + + def white_space_fix(text): + return ' '.join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return ''.join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def exact_match_score(prediction, ground_truth): + return normalize_answer(prediction) == normalize_answer(ground_truth) + + +def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): + scores_for_ground_truths = [] + for ground_truth in ground_truths: + score = metric_fn(prediction, ground_truth) + scores_for_ground_truths.append(score) + return max(scores_for_ground_truths) + + +def evaluate(dataset, predictions): + """do evaluation""" + f1 = exact_match = total = 0 + for article in dataset: + for paragraph in article['paragraphs']: + for qa in paragraph['qas']: + total += 1 + if qa['id'] not in predictions: + message = 'Unanswered question ' + qa['id'] + \ + ' will receive score 0.' 
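+                    # (total has already been incremented, so a question missing from
+                    # `predictions` simply scores 0 for both metrics.) For answered
+                    # questions, f1_score above compares normalized token bags: e.g. a
+                    # hypothetical prediction "in the 10th and 11th centuries" against a
+                    # gold answer "10th and 11th centuries" gives exact match 0 but
+                    # F1 = 2*(4/5)*(4/4)/((4/5)+(4/4)) ~= 0.89, since normalize_answer
+                    # drops the article "the" before comparison.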
+ print(message, file=sys.stderr) + continue + ground_truths = list(map(lambda x: x['text'], qa['answers'])) + if not ground_truths: + continue + prediction = predictions[qa['id']] + exact_match += metric_max_over_ground_truths( + exact_match_score, prediction, ground_truths) + f1 += metric_max_over_ground_truths( + f1_score, prediction, ground_truths) + + exact_match = 100.0 * exact_match / total + f1 = 100.0 * f1 / total + print(exact_match) + print(f1) + return {'exact_match': exact_match, 'f1': f1} + + +def get_infer_logits(args, file_name, infer_result, max_seq_length=384, num_class=2): + """ + get the result of model output. + Args: + infer_result: get logit from infer result + max_seq_length: sentence input length default is 384. + """ + result = MxpiDataType.MxpiTensorPackageList() + result.ParseFromString(infer_result[0].messageBuf) + + res = np.frombuffer( + result.tensorPackageVec[0].tensorVec[0].dataStr, dtype='<f4') + + input_mask_file = os.path.realpath( + os.path.join(args.data_dir, "01_data", file_name)) + input_mask = np.fromfile( + input_mask_file, np.float32).reshape(max_seq_length) + + res = res.reshape(max_seq_length, num_class) + #print("output tensor is: ", res.shape) + start_logits = np.squeeze(res[:, 0:1], axis=-1) + start_logits = start_logits + 100 * input_mask + end_logits = np.squeeze(res[:, 1:2], axis=-1) + end_logits = end_logits + 100 * input_mask + + start_logits = [float(x) for x in start_logits.flat] + end_logits = [float(x) for x in end_logits.flat] + + return start_logits, end_logits + + +_PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name + "PrelimPrediction", + ["feature_index", "start_index", "end_index", + "start_log_prob", "end_log_prob"]) + +_NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name + "NbestPrediction", ["text", "start_log_prob", "end_log_prob"]) + +RawResult = collections.namedtuple("RawResult", + ["unique_id", + "start_log_prob", + "end_log_prob"]) + + +def _compute_softmax(scores): + """Compute softmax probability over raw logits.""" + if not scores: + return [] + + max_score = None + for score in scores: + if max_score is None or score > max_score: + max_score = score + + exp_scores = [] + total_sum = 0.0 + for score in scores: + x = math.exp(score - max_score) + exp_scores.append(x) + total_sum += x + + probs = [] + for score in exp_scores: + probs.append(score / total_sum) + return probs + + +def write_predictions_v1(result_dict, all_examples, all_features, + all_results, n_best_size, max_answer_length): + """Write final predictions to the json file and log-odds of null if needed.""" + + example_index_to_features = collections.defaultdict(list) + for feature in all_features: + example_index_to_features[feature.example_index].append(feature) + + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = result + + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + + for (example_index, example) in enumerate(all_examples): + features = example_index_to_features[example_index] + + prelim_predictions = [] + # keep track of the minimum score of null start+end of position 0 + for (feature_index, feature) in enumerate(features): + for ((start_idx, end_idx), logprobs) in \ + result_dict[example_index][feature.unique_id].items(): + start_log_prob = 0 + end_log_prob = 0 + for logprob in logprobs: + start_log_prob += logprob[0] + end_log_prob += logprob[1] + prelim_predictions.append( + _PrelimPrediction( + 
feature_index=feature_index, + start_index=start_idx, + end_index=end_idx, + start_log_prob=start_log_prob / len(logprobs), + end_log_prob=end_log_prob / len(logprobs))) + + prelim_predictions = sorted( + prelim_predictions, + key=lambda x: (x.start_log_prob + x.end_log_prob), + reverse=True) + + seen_predictions = {} + nbest = [] + for pred in prelim_predictions: + if len(nbest) >= n_best_size: + break + feature = features[pred.feature_index] + if pred.start_index >= 0: # this is a non-null prediction + tok_start_to_orig_index = feature.tok_start_to_orig_index + tok_end_to_orig_index = feature.tok_end_to_orig_index + start_orig_pos = tok_start_to_orig_index[pred.start_index] + end_orig_pos = tok_end_to_orig_index[pred.end_index] + + paragraph_text = example.paragraph_text + final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip() + if final_text in seen_predictions: + continue + + seen_predictions[final_text] = True + else: + final_text = "" + seen_predictions[final_text] = True + + nbest.append( + _NbestPrediction( + text=final_text, + start_log_prob=pred.start_log_prob, + end_log_prob=pred.end_log_prob)) + + # In very rare edge cases we could have no valid predictions. So we + # just create a nonce prediction in this case to avoid failure. + if not nbest: + nbest.append( + _NbestPrediction(text="empty", start_log_prob=0.0, end_log_prob=0.0)) + + assert len(nbest) >= 1 + + total_scores = [] + best_non_null_entry = None + for entry in nbest: + total_scores.append(entry.start_log_prob + entry.end_log_prob) + if not best_non_null_entry: + if entry.text: + best_non_null_entry = entry + + probs = _compute_softmax(total_scores) + + nbest_json = [] + for (i, entry) in enumerate(nbest): + output = collections.OrderedDict() + output["text"] = entry.text + output["probability"] = probs[i] + output["start_log_prob"] = entry.start_log_prob + output["end_log_prob"] = entry.end_log_prob + nbest_json.append(output) + + assert len(nbest_json) >= 1 + + all_predictions[example.qas_id] = nbest_json[0]["text"] + all_nbest_json[example.qas_id] = nbest_json + + return all_predictions, all_nbest_json + + +def _get_best_indexes(logits, n_best_size): + """Get the n-best logits from a list.""" + index_and_score = sorted( + enumerate(logits), key=lambda x: x[1], reverse=True) + + best_indexes = [] + for i in range(len(index_and_score)): + if i >= n_best_size: + break + best_indexes.append(index_and_score[i][0]) + return best_indexes + + +def accumulate_predictions_v1(result_dict, all_examples, all_features, + all_results, n_best_size, max_answer_length): + """accumulate predictions for each positions in a dictionary.""" + example_index_to_features = collections.defaultdict(list) + for feature in all_features: + example_index_to_features[feature.example_index].append(feature) + + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = result + + for (example_index, _) in enumerate(all_examples): + if example_index not in result_dict: + result_dict[example_index] = {} + features = example_index_to_features[example_index] + + for (_, feature) in enumerate(features): + if feature.unique_id not in result_dict[example_index]: + result_dict[example_index][feature.unique_id] = {} + result = unique_id_to_result[feature.unique_id] + start_indexes = _get_best_indexes( + result.start_log_prob, n_best_size) + end_indexes = _get_best_indexes(result.end_log_prob, n_best_size) + for start_index in start_indexes: + for end_index in end_indexes: + doc_offset = 
feature.tokens.index("[SEP]") + 1 + # We could hypothetically create invalid predictions, e.g., predict + # that the start of the span is in the question. We throw out all + # invalid predictions. + if start_index - doc_offset >= len(feature.tok_start_to_orig_index): + continue + if end_index - doc_offset >= len(feature.tok_end_to_orig_index): + continue + if not feature.token_is_max_context.get(start_index, False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > max_answer_length: + continue + start_log_prob = result.start_log_prob[start_index] + end_log_prob = result.end_log_prob[end_index] + start_idx = start_index - doc_offset + end_idx = end_index - doc_offset + if (start_idx, end_idx) not in result_dict[example_index][feature.unique_id]: + result_dict[example_index][feature.unique_id][( + start_idx, end_idx)] = [] + result_dict[example_index][feature.unique_id][(start_idx, end_idx)].append( + (start_log_prob, end_log_prob)) + return result_dict + + +def read_squad_examples(input_file, is_training): + """Read a SQuAD json file into a list of SquadExample.""" + with open(input_file, "r") as reader: + input_data = json.load(reader)["data"] + + examples = [] + for entry in input_data: + for paragraph in entry["paragraphs"]: + paragraph_text = paragraph["context"] + + for qa in paragraph["qas"]: + qas_id = qa["id"] + question_text = qa["question"] + start_position = None + orig_answer_text = None + is_impossible = False + + if is_training: + is_impossible = qa.get("is_impossible", False) + if (len(qa["answers"]) != 1) and (not is_impossible): + raise ValueError( + "For training, each question should have exactly 1 answer.") + if not is_impossible: + answer = qa["answers"][0] + orig_answer_text = answer["text"] + start_position = answer["answer_start"] + else: + start_position = -1 + orig_answer_text = "" + + example = SquadExample( + qas_id=qas_id, + question_text=question_text, + paragraph_text=paragraph_text, + orig_answer_text=orig_answer_text, + start_position=start_position, + is_impossible=is_impossible) + examples.append(example) + + return examples + + +def get_result(result, eval_examples, eval_features): + """Evaluate the checkpoint on SQuAD 1.0.""" + + result_dict = {} + accumulate_predictions_v1( + result_dict, eval_examples, eval_features, + result, 20, 30) + all_predictions, all_nbest_json = write_predictions_v1( + result_dict, eval_examples, eval_features, result, 20, 30) + return all_predictions, all_nbest_json + + +def takeSecond(elem): + return elem[1] + + +def run(): + """ + read pipeline and do infer + """ + args = parse_args() + # init stream manager + stream_manager_api = StreamManagerApi() + ret = stream_manager_api.InitManager() + if ret != 0: + print("Failed to init Stream manager, ret=%s" % str(ret)) + return + + # create streams by pipeline config file + with open(os.path.realpath(args.pipeline), 'rb') as f: + pipeline_str = f.read() + ret = stream_manager_api.CreateMultipleStreams(pipeline_str) + if ret != 0: + print("Failed to create Stream, ret=%s" % str(ret)) + return + + stream_name = b'im_albertbase' + infer_total_time = 0 + # input_ids file list, every file content a tensor[1,128] + file_list = glob.glob(os.path.join( + os.path.realpath(args.data_dir), "00_data", "*.bin")) + cwq_lists = [] + for i in range(len(file_list)): + b = os.path.split(file_list[i]) + cwq_lists.append(b) + + cwq_lists.sort(key=takeSecond) + yms_lists = [] + for i in range(len(cwq_lists)): + c = 
cwq_lists[i][0]+'/'+cwq_lists[i][1] + yms_lists.append(c) + file_list = yms_lists + + eval_examples = read_squad_examples( + args.eval_json_path, False) + with open(args.eval_data_file_path, "rb") as fin: + eval_features = pickle.load(fin) + + outputs = [] + for input_ids in file_list: + file_name = input_ids.split('/')[-1] + if not send_appsrc_data(args, file_name, stream_name, stream_manager_api): + return + # Obtain the inference result by specifying streamName and uniqueId. + key_vec = StringVector() + key_vec.push_back(b'mxpi_tensorinfer0') + start_time = time.time() + infer_result = stream_manager_api.GetProtobuf(stream_name, 0, key_vec) + infer_total_time += time.time() - start_time + if infer_result.size() == 0: + print("inferResult is null") + return + if infer_result[0].errorCode != 0: + print("GetProtobuf error. errorCode=%d" % + (infer_result[0].errorCode)) + return + start_log_prob, end_log_prob = get_infer_logits( + args, file_name, infer_result) + + unique_id_name = os.path.join(args.data_dir, "03_data", file_name) + unique_id = np.fromfile(unique_id_name, np.int32) + unique_id = int(unique_id[0]) + outputs.append(RawResult( + unique_id=unique_id, + start_log_prob=start_log_prob, + end_log_prob=end_log_prob)) + + all_predictions, _ = get_result(outputs, eval_examples, eval_features) + + js = json.dumps(all_predictions) + file = open('infer_result.txt', 'w') + file.write(js) + file.close() + print(all_predictions) + print('done') + post_process(args.eval_json_path, all_predictions, + output_metrics="output.json") + + +if __name__ == '__main__': + run() diff --git a/research/nlp/albert/infer_squad/sdk/run.sh b/research/nlp/albert/infer_squad/sdk/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..961cf424ab41e1c3f6614d40e6cafb8ae33abe48 --- /dev/null +++ b/research/nlp/albert/infer_squad/sdk/run.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +pipeline=$1 +data_dir=$2 +eval_json_path=$3 +eval_data_file_path=$4 + +set -e + +# Simple log helper functions +info() { echo -e "\033[1;34m[INFO ][MxStream] $1\033[1;37m" ; } +warn() { echo >&2 -e "\033[1;31m[WARN ][MxStream] $1\033[1;37m" ; } + +#export MX_SDK_HOME=/home/work/mxVision +export LD_LIBRARY_PATH=${MX_SDK_HOME}/lib:${MX_SDK_HOME}/opensource/lib:${MX_SDK_HOME}/opensource/lib64:/usr/local/Ascend/ascend-toolkit/latest/acllib/lib64:${LD_LIBRARY_PATH} +export GST_PLUGIN_SCANNER=${MX_SDK_HOME}/opensource/libexec/gstreamer-1.0/gst-plugin-scanner +export GST_PLUGIN_PATH=${MX_SDK_HOME}/opensource/lib/gstreamer-1.0:${MX_SDK_HOME}/lib/plugins + +#to set PYTHONPATH, import the StreamManagerApi.py +export PYTHONPATH=$PYTHONPATH:${MX_SDK_HOME}/python + +python3 main.py --pipeline=$pipeline --data_dir=$data_dir --eval_json_path=$eval_json_path --eval_data_file_path=$eval_data_file_path +exit 0 diff --git a/research/nlp/albert/infer_squad/utils/__init__.py b/research/nlp/albert/infer_squad/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..602527cd720c8d268599dbaef190ba1cf1eb6f2b --- /dev/null +++ b/research/nlp/albert/infer_squad/utils/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ diff --git a/research/nlp/albert/infer_squad/utils/create_squad_data.py b/research/nlp/albert/infer_squad/utils/create_squad_data.py new file mode 100644 index 0000000000000000000000000000000000000000..3b1a04677d2d18df23f8df04c0aa8448d98da75e --- /dev/null +++ b/research/nlp/albert/infer_squad/utils/create_squad_data.py @@ -0,0 +1,551 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""create squad data""" + +import collections +import json +import six +from six.moves import map, range +import tokenization +import numpy as np +from mindspore.log import logging + + +class SquadExample: + """A single training/test example for simple sequence classification. + + For examples without an answer, the start and end position are -1. 
+ """ + + def __init__(self, + qas_id, + question_text, + paragraph_text, + orig_answer_text=None, + start_position=None, + end_position=None, + is_impossible=False): + self.qas_id = qas_id + self.question_text = question_text + self.paragraph_text = paragraph_text + self.orig_answer_text = orig_answer_text + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + + def __str__(self): + return self.__repr__() + + def __repr__(self): + s = "" + s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) + s += ", question_text: %s" % ( + tokenization.printable_text(self.question_text)) + s += ", paragraph_text: [%s]" % (" ".join(self.paragraph_text)) + if self.start_position: + s += ", start_position: %d" % (self.start_position) + if self.start_position: + s += ", end_position: %d" % (self.end_position) + if self.start_position: + s += ", is_impossible: %r" % (self.is_impossible) + return s + + +class InputFeatures: + """A single set of features of data.""" + + def __init__(self, + unique_id, + example_index, + doc_span_index, + tok_start_to_orig_index, + tok_end_to_orig_index, + token_is_max_context, + tokens, + input_ids, + input_mask, + segment_ids, + paragraph_len, + p_mask=None, + start_position=None, + end_position=None, + is_impossible=None): + self.unique_id = unique_id + self.example_index = example_index + self.doc_span_index = doc_span_index + self.tok_start_to_orig_index = tok_start_to_orig_index + self.tok_end_to_orig_index = tok_end_to_orig_index + self.token_is_max_context = token_is_max_context + self.tokens = tokens + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.paragraph_len = paragraph_len + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + self.p_mask = p_mask + + +def read_squad_examples(input_file, is_training): + """Read a SQuAD json file into a list of SquadExample.""" + with open(input_file, "r") as reader: + input_data = json.load(reader)["data"] + + examples = [] + for entry in input_data: + for paragraph in entry["paragraphs"]: + paragraph_text = paragraph["context"] + + for qa in paragraph["qas"]: + qas_id = qa["id"] + question_text = qa["question"] + start_position = None + orig_answer_text = None + is_impossible = False + + if is_training: + is_impossible = qa.get("is_impossible", False) + if (len(qa["answers"]) != 1) and (not is_impossible): + raise ValueError( + "For training, each question should have exactly 1 answer.") + if not is_impossible: + answer = qa["answers"][0] + orig_answer_text = answer["text"] + start_position = answer["answer_start"] + else: + start_position = -1 + orig_answer_text = "" + + example = SquadExample( + qas_id=qas_id, + question_text=question_text, + paragraph_text=paragraph_text, + orig_answer_text=orig_answer_text, + start_position=start_position, + is_impossible=is_impossible) + examples.append(example) + + return examples + + +def _check_is_max_context(doc_spans, cur_span_index, position): + """Check if this is the 'max context' doc span for the token.""" + + # Because of the sliding window approach taken to scoring documents, a single + # token can appear in multiple documents. E.g. + # Doc: the man went to the store and bought a gallon of milk + # Span A: the man went to the + # Span B: to the store and bought + # Span C: and bought a gallon of + # ... + # + # Now the word 'bought' will have two scores from spans B and C. 
We only + # want to consider the score with "maximum context", which we define as + # the *minimum* of its left and right context (the *sum* of left and + # right context will always be the same, of course). + # + # In the example the maximum context for 'bought' would be span C since + # it has 1 left context and 3 right context, while span B has 4 left context + # and 0 right context. + best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - doc_span.start + num_right_context = end - position + score = min(num_left_context, num_right_context) + \ + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index + + +def _convert_index(index, pos, m=None, is_start=True): + """Converts index.""" + if index[pos] is not None: + return index[pos] + n = len(index) + rear = pos + while rear < n - 1 and index[rear] is None: + rear += 1 + front = pos + while front > 0 and index[front] is None: + front -= 1 + assert index[front] is not None or index[rear] is not None + if index[front] is None: + if index[rear] >= 1: + if is_start: + idx = 0 + else: + idx = index[rear] - 1 + return idx + return index[rear] + if index[rear] is None: + if m is not None and index[front] < m - 1: + if is_start: + idx = index[front] + 1 + else: + idx = m - 1 + return idx + return index[front] + if is_start: + if index[rear] > index[front] + 1: + idx = index[front] + 1 + else: + idx = index[rear] + else: + if index[rear] > index[front] + 1: + idx = index[rear] - 1 + else: + idx = index[front] + return idx + + +def _lcs_match(max_dist, n, m, f, g, do_lower_case, paragraph_text, tok_cat_text): + """Longest-common-substring algorithm.""" + f.fill(0) + g.clear() + + # longest common sub sequence + # f[i, j] = max(f[i - 1, j], f[i, j - 1], f[i - 1, j - 1] + match(i, j)) + for i in range(n): + # note(zhiliny): + # unlike standard LCS, this is specifically optimized for the setting + # because the mismatch between sentence pieces and original text will + # be small + for j in range(i - max_dist, i + max_dist): + if j >= m or j < 0: + continue + + if i > 0: + g[(i, j)] = 0 + f[i, j] = f[i - 1, j] + + if j > 0 and f[i, j - 1] > f[i, j]: + g[(i, j)] = 1 + f[i, j] = f[i, j - 1] + + f_prev = f[i - 1, j - 1] if i > 0 and j > 0 else 0 + if (tokenization.preprocess_text( + paragraph_text[i], do_lower_case=do_lower_case, + remove_space=False) == tok_cat_text[j] + and f_prev + 1 > f[i, j]): + g[(i, j)] = 2 + f[i, j] = f_prev + 1 + return f, g + + +def convert_examples_to_features(examples, tokenizer, max_seq_length, + doc_stride, max_query_length, is_training, + output_fn, do_lower_case, is_gen_data=False): + """Loads a data file into a list of `InputBatch`s.""" + + cnt_pos, cnt_neg = 0, 0 + unique_id = 1000000000 + max_n, max_m = 1024, 1024 + f = np.zeros((max_n, max_m), dtype=np.float32) + output = [] + for (example_index, example) in enumerate(examples): + + if example_index % 100 == 0: + logging.info("Converting {}/{} pos {} neg {}".format( + example_index, len(examples), cnt_pos, cnt_neg)) + + query_tokens = tokenization.encode_ids( + tokenizer.sp_model, + tokenization.preprocess_text( + example.question_text, do_lower_case=do_lower_case)) + + if len(query_tokens) > max_query_length: + query_tokens = query_tokens[0:max_query_length] + + 
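+ # The block below aligns the sentencepiece tokens of the paragraph with the
+ # raw paragraph text: it builds char<->token index maps and then runs an
+ # LCS-style match (see _lcs_match) between the normalized token string and
+ # the original characters, so that a predicted token span can later be
+ # mapped back to character offsets in example.paragraph_text.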
paragraph_text = example.paragraph_text + para_tokens = tokenization.encode_pieces( + tokenizer.sp_model, + tokenization.preprocess_text( + example.paragraph_text, do_lower_case=do_lower_case), + return_unicode=False) + + chartok_to_tok_index = [] + tok_start_to_chartok_index = [] + tok_end_to_chartok_index = [] + char_cnt = 0 + para_tokens = [six.ensure_text(token, "utf-8") + for token in para_tokens] + for i, token in enumerate(para_tokens): + new_token = six.ensure_text(token).replace( + tokenization.SPIECE_UNDERLINE, " ") + chartok_to_tok_index.extend([i] * len(new_token)) + tok_start_to_chartok_index.append(char_cnt) + char_cnt += len(new_token) + tok_end_to_chartok_index.append(char_cnt - 1) + + tok_cat_text = "".join(para_tokens).replace( + tokenization.SPIECE_UNDERLINE, " ") + n, m = len(paragraph_text), len(tok_cat_text) + + if n > max_n or m > max_m: + max_n = max(n, max_n) + max_m = max(m, max_m) + f = np.zeros((max_n, max_m), dtype=np.float32) + g = {} + + max_dist = abs(n - m) + 5 + for _ in range(2): + f, g = _lcs_match(max_dist, n, m, f, g, + do_lower_case, paragraph_text, tok_cat_text) + if f[n - 1, m - 1] > 0.8 * n: + break + max_dist *= 2 + + orig_to_chartok_index = [None] * n + chartok_to_orig_index = [None] * m + i, j = n - 1, m - 1 + while i >= 0 and j >= 0: + if (i, j) not in g: + break + if g[(i, j)] == 2: + orig_to_chartok_index[i] = j + chartok_to_orig_index[j] = i + i, j = i - 1, j - 1 + elif g[(i, j)] == 1: + j = j - 1 + else: + i = i - 1 + + if (all(v is None for v in orig_to_chartok_index) or + f[n - 1, m - 1] < 0.8 * n): + logging.info("MISMATCH DETECTED!") + continue + + tok_start_to_orig_index = [] + tok_end_to_orig_index = [] + for i in range(len(para_tokens)): + start_chartok_pos = tok_start_to_chartok_index[i] + end_chartok_pos = tok_end_to_chartok_index[i] + start_orig_pos = _convert_index(chartok_to_orig_index, start_chartok_pos, + n, is_start=True) + end_orig_pos = _convert_index(chartok_to_orig_index, end_chartok_pos, + n, is_start=False) + + tok_start_to_orig_index.append(start_orig_pos) + tok_end_to_orig_index.append(end_orig_pos) + + if not is_training: + tok_start_position = tok_end_position = None + + if is_training and example.is_impossible: + tok_start_position = 0 + tok_end_position = 0 + + if is_training and not example.is_impossible: + start_position = example.start_position + end_position = start_position + len(example.orig_answer_text) - 1 + + start_chartok_pos = _convert_index(orig_to_chartok_index, start_position, + is_start=True) + tok_start_position = chartok_to_tok_index[start_chartok_pos] + + end_chartok_pos = _convert_index(orig_to_chartok_index, end_position, + is_start=False) + tok_end_position = chartok_to_tok_index[end_chartok_pos] + assert tok_start_position <= tok_end_position + + def _piece_to_id(x): + if six.PY2 and isinstance(x, six.text_type): + x = six.ensure_binary(x, "utf-8") + return tokenizer.sp_model.PieceToId(x) + + all_doc_tokens = list(map(_piece_to_id, para_tokens)) + + # The -3 accounts for [CLS], [SEP] and [SEP] + max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 + + # We can have documents that are longer than the maximum sequence length. + # To deal with this we do a sliding window approach, where we take chunks + # of the up to our max length with a stride of `doc_stride`. 
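+ # A worked example of the windowing below (illustrative values only):
+ # assuming max_seq_length=384 and a 64-token query, max_tokens_for_doc is
+ # 384 - 64 - 3 = 317. For a 450-token paragraph with doc_stride=128 the loop
+ # produces spans (start=0, length=317), (128, 317) and (256, 194); tokens
+ # that appear in several overlapping spans are later scored only in their
+ # "max context" span (see _check_is_max_context).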
+ _DocSpan = collections.namedtuple( # pylint: disable=invalid-name + "DocSpan", ["start", "length"]) + doc_spans = [] + start_offset = 0 + while start_offset < len(all_doc_tokens): + length = len(all_doc_tokens) - start_offset + if length > max_tokens_for_doc: + length = max_tokens_for_doc + doc_spans.append(_DocSpan(start=start_offset, length=length)) + if start_offset + length == len(all_doc_tokens): + break + start_offset += min(length, doc_stride) + + for (doc_span_index, doc_span) in enumerate(doc_spans): + tokens = [] + token_is_max_context = {} + segment_ids = [] + p_mask = [] + + cur_tok_start_to_orig_index = [] + cur_tok_end_to_orig_index = [] + + tokens.append(tokenizer.sp_model.PieceToId("[CLS]")) + segment_ids.append(0) + p_mask.append(0) + for token in query_tokens: + tokens.append(token) + segment_ids.append(0) + p_mask.append(1) + tokens.append(tokenizer.sp_model.PieceToId("[SEP]")) + segment_ids.append(0) + p_mask.append(1) + + for i in range(doc_span.length): + split_token_index = doc_span.start + i + + cur_tok_start_to_orig_index.append( + tok_start_to_orig_index[split_token_index]) + cur_tok_end_to_orig_index.append( + tok_end_to_orig_index[split_token_index]) + + is_max_context = _check_is_max_context(doc_spans, doc_span_index, + split_token_index) + token_is_max_context[len(tokens)] = is_max_context + tokens.append(all_doc_tokens[split_token_index]) + segment_ids.append(1) + p_mask.append(0) + tokens.append(tokenizer.sp_model.PieceToId("[SEP]")) + segment_ids.append(1) + p_mask.append(1) + + paragraph_len = len(tokens) + input_ids = tokens + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + p_mask.append(1) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + span_is_impossible = example.is_impossible + start_position = None + end_position = None + if is_training and not span_is_impossible: + # For training, if our document chunk does not contain an annotation + # we throw it out, since there is nothing to predict. + doc_start = doc_span.start + doc_end = doc_span.start + doc_span.length - 1 + out_of_span = False + if not (tok_start_position >= doc_start and + tok_end_position <= doc_end): + out_of_span = True + if out_of_span: + # continue + start_position = 0 + end_position = 0 + span_is_impossible = True + else: + doc_offset = len(query_tokens) + 2 + start_position = tok_start_position - doc_start + doc_offset + end_position = tok_end_position - doc_start + doc_offset + + if is_training and span_is_impossible: + start_position = 0 + end_position = 0 + + if is_training and not span_is_impossible: + pieces = [tokenizer.sp_model.IdToPiece(token) for token in + tokens[start_position: (end_position + 1)]] + answer_text = tokenizer.sp_model.DecodePieces(pieces) + logging.info("start_position: %d" % (start_position)) + logging.info("end_position: %d" % (end_position)) + logging.info( + "answer: %s" % (tokenization.printable_text(answer_text))) + + # note(zhiliny): With multi processing, + # the example_index is actually the index within the current process + # therefore we use example_index=None to avoid being used in the future. + # The current code does not use example_index of training data. 
+ if is_training: + feat_example_index = None + else: + feat_example_index = example_index + if is_gen_data: + Record = collections.namedtuple( + 'Record', + ['unique_id', 'example_index', 'doc_span_index', 'tok_start_to_orig_index', 'tok_end_to_orig_index', + 'token_is_max_context', 'tokens', 'input_ids', 'input_mask', 'segment_ids', + 'paragraph_len', 'start_position', 'end_position', 'is_impossible', 'p_mask']) + + record = Record( + unique_id=unique_id, + example_index=feat_example_index, + doc_span_index=doc_span_index, + tok_start_to_orig_index=cur_tok_start_to_orig_index, + tok_end_to_orig_index=cur_tok_end_to_orig_index, + token_is_max_context=token_is_max_context, + tokens=[tokenizer.sp_model.IdToPiece(x) for x in tokens], + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + paragraph_len=paragraph_len, + start_position=start_position, + end_position=end_position, + is_impossible=span_is_impossible, + p_mask=p_mask) + + # Run callback + output_fn(record) + + feature = InputFeatures( + unique_id=unique_id, + example_index=feat_example_index, + doc_span_index=doc_span_index, + tok_start_to_orig_index=cur_tok_start_to_orig_index, + tok_end_to_orig_index=cur_tok_end_to_orig_index, + token_is_max_context=token_is_max_context, + tokens=[tokenizer.sp_model.IdToPiece(x) for x in tokens], + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + paragraph_len=paragraph_len, + start_position=start_position, + end_position=end_position, + is_impossible=span_is_impossible, + p_mask=p_mask) + output.append(feature) + unique_id += 1 + if span_is_impossible: + cnt_neg += 1 + else: + cnt_pos += 1 + + logging.info("Total number of instances: {} = pos {} neg {}".format( + cnt_pos + cnt_neg, cnt_pos, cnt_neg)) + return output diff --git a/research/nlp/albert/infer_squad/utils/data_precess_squad.py b/research/nlp/albert/infer_squad/utils/data_precess_squad.py new file mode 100644 index 0000000000000000000000000000000000000000..e1623e3ccde42e10c2db069e09112bbe6e682b6a --- /dev/null +++ b/research/nlp/albert/infer_squad/utils/data_precess_squad.py @@ -0,0 +1,106 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +'''squad data precess''' +import argparse +import os +import pickle +from create_squad_data import read_squad_examples, convert_examples_to_features +import numpy as np +import tokenization + + +def parse_args(): + """set parameters.""" + parser = argparse.ArgumentParser(description="bert preprocess") + parser.add_argument("--vocab_path", type=str, + default="../data/config/vocab.txt") + parser.add_argument("--spm_model_file", type=str, + default="../data/input", help="the path of convert dataset.") + parser.add_argument("--dev_path", type=str, default="../data/dev.json") + parser.add_argument("--max_seq_len", type=int, default=128, + help="sentence length, default is 128.") + parser.add_argument("--output_path", type=str, + default="../data/input", help="the path of convert dataset.") + parser.add_argument("--eval_data_file_path", type=str, default="", + help="Data path, it is better to use absolute path") + + args = parser.parse_args() + return args + + +def get_all_path(output_path): + """ + Args: + output_path: save path of convert dataset + Returns: + the path of ids, mask, token, label + """ + ids_path = os.path.join(output_path, "00_data") # input_ids + mask_path = os.path.join(output_path, "01_data") # input_mask + token_path = os.path.join(output_path, "02_data") # segment_ids + label_path = os.path.join(output_path, "03_data") # unique_id + + for path in [ids_path, mask_path, token_path, label_path]: + os.makedirs(path, 0o755, exist_ok=True) + + return ids_path, mask_path, token_path, label_path + + +def run(): + '''main function''' + args = parse_args() + input_ids, input_mask, segment_ids, unique_id = get_all_path( + args.output_path) + tokenizer = tokenization.FullTokenizer( + vocab_file=args.vocab_path, do_lower_case=True, spm_model_file=args.spm_model_file) + eval_examples = read_squad_examples(args.dev_path, False) + if not os.path.exists(args.eval_data_file_path): + eval_features = convert_examples_to_features( + examples=eval_examples, + tokenizer=tokenizer, + max_seq_length=384, + doc_stride=128, + max_query_length=64, + is_training=False, + output_fn=None, + do_lower_case=True) + with open(args.eval_data_file_path, "wb") as fout: + pickle.dump(eval_features, fout) + else: + with open(args.eval_data_file_path, "rb") as fin: + eval_features = pickle.load(fin) + + for i in range(len(eval_features)): + file_name = "squadv1" + "_batch_1_" + str(i) + ".bin" + ids_file_path = os.path.join(input_ids, file_name) + np.array(eval_features[i].input_ids, + dtype=np.int32).tofile(ids_file_path) + + input_mask_path = os.path.join(input_mask, file_name) + np.array(eval_features[i].input_mask, + dtype=np.int32).tofile(input_mask_path) + + segment_ids_path = os.path.join(segment_ids, file_name) + np.array(eval_features[i].segment_ids, + dtype=np.int32).tofile(segment_ids_path) + + unique_id_path = os.path.join(unique_id, file_name) + np.array(eval_features[i].unique_id, + dtype=np.int32).tofile(unique_id_path) + + +if __name__ == "__main__": + run() diff --git a/research/nlp/albert/infer_squad/utils/tokenization.py b/research/nlp/albert/infer_squad/utils/tokenization.py new file mode 100644 index 0000000000000000000000000000000000000000..1d0ab7f2d69d3e6aec05c69d431c9fef67294350 --- /dev/null +++ b/research/nlp/albert/infer_squad/utils/tokenization.py @@ -0,0 +1,488 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this 
file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +""" +Tokenization. +""" + +import unicodedata +import collections +import six +import sentencepiece as spm + +SPIECE_UNDERLINE = u"â–" + + +def preprocess_text(inputs, remove_space=True, do_lower_case=True): + """preprocess text""" + if remove_space: + outputs = ' '.join(inputs.strip().split()) + else: + outputs = inputs + outputs = outputs.replace("``", '"').replace("''", '"') + if six.PY2 and isinstance(outputs, str): + outputs = outputs.decode('utf-8') + outputs = unicodedata.normalize("NFKD", outputs) + outputs = "".join([c for c in outputs if not unicodedata.combining(c)]) + if do_lower_case: + outputs = outputs.lower() + return outputs + + +def encode_pieces(sp_model, text, return_unicode=True, sample=False): + """turn sentences into word pieces.""" + text = preprocess_text(text,) + if six.PY2 and isinstance(text, unicode): + text = text.encode('utf-8') + if not sample: + pieces = sp_model.EncodeAsPieces(text) + else: + pieces = sp_model.SampleEncodeAsPieces(text, 64, 0.1) + new_pieces = [] + for piece in pieces: + if len(piece) > 1 and piece[-1] == ',' and piece[-2].isdigit(): + cur_pieces = sp_model.EncodeAsPieces( + piece[:-1].replace(SPIECE_UNDERLINE, '')) + if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE: + if len(cur_pieces[0]) == 1: + cur_pieces = cur_pieces[1:] + else: + cur_pieces[0] = cur_pieces[0][1:] + cur_pieces.append(piece[-1]) + new_pieces.extend(cur_pieces) + else: + new_pieces.append(piece) + + # note(zhiliny): convert back to unicode for py2 + if six.PY2 and return_unicode: + ret_pieces = [] + for piece in new_pieces: + if isinstance(piece, str): + piece = piece.decode(piece, "utf-8") + ret_pieces.append(piece) + new_pieces = ret_pieces + + return new_pieces + + +def encode_ids(sp_model, text, sample=False): + pieces = encode_pieces(sp_model, text, return_unicode=False, sample=sample) + ids = [sp_model.PieceToId(piece) for piece in pieces] + return ids + + +def convert_to_unicode(text): + """ + Convert text into unicode type. + Args: + text: input str. + + Returns: + input str in unicode. 
+ """ + ret = text + if isinstance(text, str): + ret = text + elif isinstance(text, bytes): + ret = text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + return ret + + +def vocab_to_dict_key_token(vocab_file): + """Loads a vocab file into a dict, key is token.""" + vocab = collections.OrderedDict() + index = 0 + with open(vocab_file, "r") as reader: + while True: + token = convert_to_unicode(reader.readline()) + if not token: + break + token = token.strip() + vocab[token] = index + index += 1 + return vocab + + +def vocab_to_dict_key_id(vocab_file): + """Loads a vocab file into a dict, key is id.""" + vocab = collections.OrderedDict() + index = 0 + with open(vocab_file, "r") as reader: + while True: + token = convert_to_unicode(reader.readline()) + if not token: + break + token = token.strip() + vocab[index] = token + index += 1 + return vocab + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +def convert_tokens_to_ids(vocab_file, tokens): + """ + Convert tokens to ids. + Args: + vocab_file: path to vocab.txt. + tokens: list of tokens. + + Returns: + list of ids. + """ + vocab_dict = vocab_to_dict_key_token(vocab_file) + output = [] + for token in tokens: + output.append(vocab_dict[token]) + return output + + +def convert_ids_to_tokens(vocab_file, ids): + """ + Convert ids to tokens. + Args: + vocab_file: path to vocab.txt. + ids: list of ids. + + Returns: + list of tokens. + """ + vocab_dict = vocab_to_dict_key_id(vocab_file) + output = [] + for _id in ids: + output.append(vocab_dict[_id]) + return output + + +class FullTokenizer(): + """ + Full tokenizer + """ + + def __init__(self, vocab_file, do_lower_case=True, spm_model_file=None): + self.vocab_dict = None + self.sp_model = None + if spm_model_file: + self.sp_model = spm.SentencePieceProcessor() + self.sp_model.Load(spm_model_file) + # # Note(mingdachen): For the purpose of consistent API, we are + # # generating a vocabulary for the sentence piece tokenizer. + self.vocab_dict = {self.sp_model.IdToPiece(i): i for i + in range(self.sp_model.GetPieceSize())} + else: + self.vocab_dict = vocab_to_dict_key_token(vocab_file) + self.do_lower_case = do_lower_case + self.basic_tokenize = BasicTokenizer(do_lower_case) + self.wordpiece_tokenize = WordpieceTokenizer(self.vocab_dict) + + def tokenize(self, text): + """ + Do full tokenization. + Args: + text: str of text. + + Returns: + list of tokens. 
+ """ + if self.sp_model: + tokens_ret = encode_pieces( + self.sp_model, text, return_unicode=False) + else: + tokens_ret = [] + text = convert_to_unicode(text) # + for tokens in self.basic_tokenize.tokenize(text): + wordpiece_tokens = self.wordpiece_tokenize.tokenize(tokens) + tokens_ret.extend(wordpiece_tokens) + return tokens_ret + + def convert_tokens_to_ids(self, tokens): + if self.sp_model: + output = [self.sp_model.PieceToId(token) for token in tokens] + else: + # vocab_dict = vocab_to_dict_key_token(self.vocab_dict) + output = [] + for token in tokens: + output.append(self.vocab_dict[token]) + return output + + def convert_ids_to_tokens(self, ids): + if self.sp_model: + output = [self.sp_model.IdToPiece(id_) for id_ in ids] + else: + # vocab_dict = vocab_to_dict_key_id(self.vocab_dict) + output = [] + for _id in ids: + output.append(self.vocab_dict[_id]) + return output + + +class BasicTokenizer(): # --- + """ + Basic tokenizer + """ + + def __init__(self, do_lower_case=True): + self.do_lower_case = do_lower_case + + def tokenize(self, text): + """ + Do basic tokenization. + Args: + text: text in unicode. + + Returns: + a list of tokens split from text + """ + text = self._clean_text(text) + text = self._tokenize_chinese_chars(text) + + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case: + token = token.lower() + token = self._run_strip_accents(token) + aaa = self._run_split_on_punc(token) + split_tokens.extend(aaa) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + i = 0 + start_new_word = True + output = [] + for char in text: + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + return ["".join(x) for x in output] + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. 
+ if ((0x4E00 <= cp <= 0x9FFF) or + (0x3400 <= cp <= 0x4DBF) or + (0x20000 <= cp <= 0x2A6DF) or + (0x2A700 <= cp <= 0x2B73F) or + (0x2B740 <= cp <= 0x2B81F) or + (0x2B820 <= cp <= 0x2CEAF) or + (0xF900 <= cp <= 0xFAFF) or + (0x2F800 <= cp <= 0x2FA1F)): + return True + + return False + + +class WordpieceTokenizer(): + """ + Wordpiece tokenizer + """ + + def __init__(self, vocab): + self.vocab_dict = vocab + + def tokenize(self, tokens): + """ + Do word-piece tokenization + Args: + tokens: a word. + + Returns: + a list of tokens that can be found in vocab dict. + """ + output_tokens = [] + tokens = convert_to_unicode(tokens) + for token in whitespace_tokenize(tokens): + chars = list(token) + len_chars = len(chars) + start = 0 + end = len_chars + while start < len_chars: + while start < end: + substr = "".join(token[start:end]) + if start != 0: + substr = "##" + substr + if substr in self.vocab_dict: + output_tokens.append(substr) + start = end + end = len_chars + else: + end = end - 1 + if start == end and start != len_chars: + output_tokens.append("[UNK]") + break + return output_tokens + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically control characters but we treat them + # as whitespace since they are generally considered as such. + whitespace_char = [" ", "\t", "\n", "\r"] + if char in whitespace_char: + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + control_char = ["\t", "\n", "\r"] + if char in control_char: + return False + cat = unicodedata.category(char) + if cat in ("Cc", "Cf"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. + if ((33 <= cp <= 47) or (58 <= cp <= 64) or + (91 <= cp <= 96) or (123 <= cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False + + +def printable_text(text): + """Returns text encoded in a way suitable for print or `tf.logging`.""" + if isinstance(text, str): + t = text + elif isinstance(text, bytes): + t = text.encode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + return t + + +def tokenize_chinese_chars(text): + """Adds whitespace around any CJK character.""" + + def _is_chinese_char(cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. 
+ if ((0x4E00 <= cp <= 0x9FFF) or # + (0x3400 <= cp <= 0x4DBF) or # + (0x20000 <= cp <= 0x2A6DF) or # + (0x2A700 <= cp <= 0x2B73F) or # + (0x2B740 <= cp <= 0x2B81F) or # + (0x2B820 <= cp <= 0x2CEAF) or + (0xF900 <= cp <= 0xFAFF) or # + (0x2F800 <= cp <= 0x2FA1F)): # + return True + + return False + + def is_whitespace(c): + if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: + return True + return False + + output = [] + buff = "" + for char in text: + cp = ord(char) + if _is_chinese_char(cp) or is_whitespace(char): + if buff != "": + output.append(buff) + buff = "" + output.append(char) + else: + buff += char + + if buff != "": + output.append(buff) + + return output diff --git a/research/nlp/albert/modelarts/train_modelarts.py b/research/nlp/albert/modelarts/train_modelarts.py new file mode 100644 index 0000000000000000000000000000000000000000..8dc895c94943026edaea8c2250a5b166593e8ae5 --- /dev/null +++ b/research/nlp/albert/modelarts/train_modelarts.py @@ -0,0 +1,332 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +''' +Alert finetune and evaluation script. +''' +import collections +import os +import six + +from src.model_utils.device_adapter import get_device_id +from src.model_utils.moxing_adapter import moxing_wrapper +from src.model_utils.config import config as args_opt, optimizer_cfg, albert_net_cfg +from src.utils import make_directory, LossCallBack, LoadNewestCkpt, AlbertLearningRate +from src.dataset import create_squad_dataset +from src.albert_for_finetune import AlbertSquadCell, AlbertSquad +from src.Albert_Callback import albert_callback +from src.finetune_eval_model import AlbertCLSModel, AlbertSquadModel + +from mindspore.communication.management import init, get_rank +from mindspore.common import set_seed +from mindspore.context import ParallelMode +from mindspore.train.serialization import load_checkpoint, load_param_into_net +from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, TimeMonitor +from mindspore.train.model import Model +from mindspore.common.tensor import Tensor +from mindspore.nn.optim import AdamWeightDecay, Lamb, Momentum +from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell +from mindspore import context, export +import mindspore.common.dtype as mstype + +import numpy as np + +os.system('pip install sentencepiece') + + +if six.PY2: + import six.moves.cPickle as pickle +else: + import pickle + +_cur_dir = os.getcwd() + + +def do_export(args, load_finetune_checkpoint_path): + '''export function''' + context.set_context(mode=context.GRAPH_MODE, + device_target=args.device_target) + if args.device_target == "Ascend": + context.set_context(device_id=args.device_id) + + if args.description == "run_classifier": + net = AlbertCLSModel(albert_net_cfg, False, num_labels=args.num_class) + elif args.description == "run_squad_v1": + net = AlbertSquadModel(albert_net_cfg, False) + else: + 
raise ValueError("unsupported downstream task") + + print("load_finetune_checkpoint_path ", load_finetune_checkpoint_path) + print("export_file_name ", args.export_file_name) + print("os.path.realpath(file_name) ", + os.path.realpath(args.export_file_name)) + load_checkpoint(load_finetune_checkpoint_path, net=net) + net.set_train(False) + + input_ids = Tensor( + np.zeros([args.export_batch_size, albert_net_cfg.seq_length]), mstype.int32) + input_mask = Tensor( + np.zeros([args.export_batch_size, albert_net_cfg.seq_length]), mstype.int32) + token_type_id = Tensor( + np.zeros([args.export_batch_size, albert_net_cfg.seq_length]), mstype.int32) + + input_data = [input_ids, input_mask, token_type_id] + + print("remove ", os.path.realpath(args.export_file_name)) + print("remove ", args.export_file_name + "." + args.file_format) + print("remove ", f"{args.export_file_name}.{args.file_format.lower()}") + if os.path.exists(f"{args.export_file_name}.{args.file_format.lower()}"): + os.remove(f"{args.export_file_name}.{args.file_format.lower()}") + + export(net, *input_data, file_name=args.export_file_name, + file_format=args.file_format) + + +def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoint_path="", epoch_num=1, args=None): + """ do train """ + if load_checkpoint_path == "": + raise ValueError( + "Pretrain model missed, finetune task must load pretrain model!") + steps_per_epoch = dataset.get_dataset_size() + print("steps_per_epoch: ", steps_per_epoch) + # optimizer + if optimizer_cfg.optimizer == 'AdamWeightDecay': + lr_schedule = AlbertLearningRate(learning_rate=optimizer_cfg.AdamWeightDecay.learning_rate, + end_learning_rate=optimizer_cfg.AdamWeightDecay.end_learning_rate, + warmup_steps=optimizer_cfg.AdamWeightDecay.warmup_steps, + decay_steps=steps_per_epoch * epoch_num, + power=optimizer_cfg.AdamWeightDecay.power) + params = network.trainable_params() + decay_params = list( + filter(optimizer_cfg.AdamWeightDecay.decay_filter, params)) + other_params = list( + filter(lambda x: not optimizer_cfg.AdamWeightDecay.decay_filter(x), params)) + group_params = [{'params': decay_params, 'weight_decay': optimizer_cfg.AdamWeightDecay.weight_decay}, + {'params': other_params, 'weight_decay': 0.0}] + + optimizer = AdamWeightDecay( + group_params, lr_schedule, eps=optimizer_cfg.AdamWeightDecay.eps) + elif optimizer_cfg.optimizer == 'Lamb': + lr_schedule = AlbertLearningRate(learning_rate=optimizer_cfg.Lamb.learning_rate, + end_learning_rate=optimizer_cfg.Lamb.end_learning_rate, + warmup_steps=int( + steps_per_epoch * epoch_num * 0.1), + decay_steps=steps_per_epoch * epoch_num, + power=optimizer_cfg.Lamb.power) + optimizer = Lamb(network.trainable_params(), learning_rate=lr_schedule) + elif optimizer_cfg.optimizer == 'Momentum': + optimizer = Momentum(network.trainable_params(), learning_rate=optimizer_cfg.Momentum.learning_rate, + momentum=optimizer_cfg.Momentum.momentum) + else: + raise Exception( + "Optimizer not supported. 
support: [AdamWeightDecay, Lamb, Momentum]") + + # load checkpoint into network + ckpt_config = CheckpointConfig( + save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=1) + ckpoint_cb = ModelCheckpoint(prefix="squad", + directory=None if save_checkpoint_path == "" else save_checkpoint_path, + config=ckpt_config) + + param_dict = load_checkpoint(load_checkpoint_path) + load_param_into_net(network, param_dict) + update_cell = DynamicLossScaleUpdateCell( + loss_scale_value=2 ** 32, scale_factor=2, scale_window=1000) + netwithgrads = AlbertSquadCell( + network, optimizer=optimizer, scale_update_cell=update_cell) + model = Model(netwithgrads) + eval_callback = albert_callback( + netwithgrads, args, steps_per_epoch, save_checkpoint_path) + model.train(epoch_num, dataset, callbacks=[TimeMonitor(dataset.get_dataset_size()), eval_callback, + LossCallBack(dataset.get_dataset_size()), ckpoint_cb]) + + +def do_eval(dataset=None, load_checkpoint_path="", eval_batch_size=1): + """ do eval """ + if load_checkpoint_path == "": + raise ValueError( + "Finetune model missed, evaluation task must load finetune model!") + net = AlbertSquad(albert_net_cfg, False, 2) + net.set_train(False) + param_dict = load_checkpoint(load_checkpoint_path) + load_param_into_net(net, param_dict) + model = Model(net) + output = [] + RawResult = collections.namedtuple( + "RawResult", ["unique_id", "start_log_prob", "end_log_prob"]) + columns_list = ["input_ids", "input_mask", "segment_ids", "unique_ids"] + for data in dataset.create_dict_iterator(num_epochs=1): + input_data = [] + for i in columns_list: + input_data.append(data[i]) + input_ids, input_mask, segment_ids, unique_ids = input_data + start_positions = Tensor([1], mstype.float32) + end_positions = Tensor([1], mstype.float32) + is_impossible = Tensor([1], mstype.float32) + logits = model.predict(input_ids, input_mask, segment_ids, start_positions, + end_positions, unique_ids, is_impossible) + ids = logits[0].asnumpy() + start = logits[1].asnumpy() + end = logits[2].asnumpy() + + for i in range(eval_batch_size): + unique_id = int(ids[i]) + start_logits = [float(x) for x in start[i].flat] + end_logits = [float(x) for x in end[i].flat] + output.append(RawResult( + unique_id=unique_id, + start_log_prob=start_logits, + end_log_prob=end_logits)) + return output + + +def modelarts_pre_process(): + '''modelarts pre process function.''' + args_opt.device_id = get_device_id() + args_opt.load_pretrain_checkpoint_path = os.path.join( + args_opt.data_path, args_opt.load_pretrain_checkpoint_path) + args_opt.load_finetune_checkpoint_path = os.path.join( + args_opt.output_path, args_opt.load_finetune_checkpoint_path) + args_opt.save_finetune_checkpoint_path = os.path.join( + args_opt.output_path, args_opt.save_finetune_checkpoint_path) + args_opt.export_file_name = os.path.join( + args_opt.output_path, args_opt.export_file_name) + if args_opt.schema_file_path: + args_opt.schema_file_path = os.path.join( + args_opt.data_path, args_opt.schema_file_path) + args_opt.train_data_file_path = os.path.join( + args_opt.data_path, args_opt.train_data_file_path) + args_opt.eval_json_path = os.path.join( + args_opt.data_path, args_opt.eval_json_path) + args_opt.vocab_file_path = os.path.join( + args_opt.data_path, args_opt.vocab_file_path) + args_opt.spm_model_file = os.path.join( + args_opt.data_path, args_opt.spm_model_file) + if os.path.exists(args_opt.predict_feature_left_file): + args_opt.predict_feature_left_file = os.path.join( + args_opt.data_path, args_opt.predict_feature_left_file) 
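+# Note: run_squad below is wrapped by moxing_wrapper which, as used in this
+# repository's model_utils, is expected to invoke modelarts_pre_process first
+# when running on ModelArts, rewriting the relative dataset/checkpoint paths
+# above onto the mounted data_path/output_path before training starts.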
+ + +@moxing_wrapper(pre_process=modelarts_pre_process) +def run_squad(): + """run squad task""" + set_seed(323) + epoch_num = args_opt.epoch_num + load_pretrain_checkpoint_path = args_opt.load_pretrain_checkpoint_path + save_finetune_checkpoint_path = args_opt.save_finetune_checkpoint_path + load_finetune_checkpoint_path = args_opt.load_finetune_checkpoint_path + + if args_opt.do_train.lower() == "false" and args_opt.do_eval.lower() == "false": + raise ValueError( + "At least one of 'do_train' or 'do_eval' must be true") + if args_opt.do_train.lower() == "true" and args_opt.train_data_file_path == "": + raise ValueError( + "'train_data_file_path' must be set when do finetune task") + if args_opt.do_eval.lower() == "true": + if args_opt.vocab_file_path == "": + raise ValueError( + "'vocab_file_path' must be set when do evaluation task") + if args_opt.eval_json_path == "": + raise ValueError( + "'tokenization_file_path' must be set when do evaluation task") + + if args_opt.device_target == "Ascend": + context.set_context(mode=context.GRAPH_MODE, + device_target=args_opt.device_target, device_id=args_opt.device_id) + if args_opt.distribute == 'true': + device_num = args_opt.device_num + print(device_num) + context.reset_auto_parallel_context() + context.set_auto_parallel_context(device_num=device_num, + parallel_mode=ParallelMode.DATA_PARALLEL, + gradients_mean=True) + init() + rank = get_rank() + save_ckpt_path = os.path.join( + args_opt.save_finetune_checkpoint_path, 'ckpt_' + str(get_rank()) + '/') + else: + rank = 0 + device_num = 1 + save_ckpt_path = os.path.join( + args_opt.save_finetune_checkpoint_path, 'ckpt_0/') + + context.set_context(reserve_class_name_in_scope=False) + + make_directory(save_ckpt_path) + + netwithloss = AlbertSquad(albert_net_cfg, True, 2, dropout_prob=0.1) + if args_opt.do_train.lower() == "true": + ds = create_squad_dataset(batch_size=args_opt.train_batch_size, repeat_count=1, + data_file_path=args_opt.train_data_file_path, + schema_file_path=args_opt.schema_file_path, + do_shuffle=( + args_opt.train_data_shuffle.lower() == "true"), + rank_size=args_opt.device_num, + rank_id=rank) + do_train(ds, netwithloss, load_pretrain_checkpoint_path, + save_ckpt_path, epoch_num, args_opt) + + if save_finetune_checkpoint_path == "": + load_finetune_checkpoint_dir = _cur_dir + else: + load_finetune_checkpoint_dir = make_directory(save_ckpt_path) + load_finetune_checkpoint_path = LoadNewestCkpt(load_finetune_checkpoint_dir, + ds.get_dataset_size(), epoch_num, "squad") + if args_opt.do_eval.lower() == "true": + from src import tokenization + from src.squad_utils import read_squad_examples, convert_examples_to_features + from src.squad_get_predictions import get_result + from src.squad_postprocess import SQuad_postprocess + tokenizer = tokenization.FullTokenizer(vocab_file=args_opt.vocab_file_path, + do_lower_case=True, + spm_model_file=args_opt.spm_model_file) + eval_examples = read_squad_examples(args_opt.eval_json_path, False) + if args_opt.enable_modelarts: + args_opt.predict_feature_left_file = os.path.join( + args_opt.data_path, args_opt.predict_feature_left_file) + if not os.path.exists(args_opt.predict_feature_left_file): + eval_features = convert_examples_to_features( + examples=eval_examples, + tokenizer=tokenizer, + max_seq_length=albert_net_cfg.seq_length, + doc_stride=128, + max_query_length=64, + is_training=False, + output_fn=None, + do_lower_case=True) + with open(args_opt.predict_feature_left_file, "wb") as fout: + pickle.dump(eval_features, fout) + else: + with 
open(args_opt.predict_feature_left_file, "rb") as fin: + eval_features = pickle.load(fin) + + ds = create_squad_dataset(batch_size=args_opt.eval_batch_size, repeat_count=1, + data_file_path=eval_features, + schema_file_path=args_opt.schema_file_path, is_training=False, + do_shuffle=(args_opt.eval_data_shuffle.lower() == "true")) + + outputs = do_eval(ds, load_finetune_checkpoint_path, + args_opt.eval_batch_size) + all_predictions, _ = get_result(outputs, eval_examples, eval_features) + SQuad_postprocess(args_opt.eval_json_path, + all_predictions, output_metrics="output.json") + + do_export(args_opt, load_finetune_checkpoint_path) + + +if __name__ == "__main__": + args_opt.enable_modelarts = True + args_opt.do_train = "true" + run_squad() diff --git a/research/nlp/albert/scripts/docker_start.sh b/research/nlp/albert/scripts/docker_start.sh new file mode 100644 index 0000000000000000000000000000000000000000..af0ddabbe8927a9ab7a4893e60d6235b92a5443a --- /dev/null +++ b/research/nlp/albert/scripts/docker_start.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +docker_image=$1 +data_dir=$2 +model_dir=$3 + +docker run -it -u root --ipc=host \ + --device=/dev/davinci0 \ + --device=/dev/davinci1 \ + --device=/dev/davinci2 \ + --device=/dev/davinci3 \ + --device=/dev/davinci4 \ + --device=/dev/davinci5 \ + --device=/dev/davinci6 \ + --device=/dev/davinci7 \ + --device=/dev/davinci_manager \ + --device=/dev/devmm_svm \ + --device=/dev/hisi_hdc \ + --privileged \ + -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \ + -v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons \ + -v ${data_dir}:${data_dir} \ + -v ${model_dir}:${model_dir} \ + -v /root/ascend/log:/root/ascend/log ${docker_image} /bin/bash \ No newline at end of file