diff --git a/official/nlp/emotect/Dockerfile b/official/nlp/emotect/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..a48571932465017fa78b87c57d378c7c267e589a
--- /dev/null
+++ b/official/nlp/emotect/Dockerfile
@@ -0,0 +1,25 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+ARG FROM_IMAGE_NAME
+FROM ${FROM_IMAGE_NAME}
+
+RUN ln -s /usr/local/python3.7.5/bin/python3.7 /usr/bin/python
+
+RUN apt-get update && \
+    { apt-get install libglib2.0-dev -y || \
+      { rm -rf /var/lib/dpkg/info && \
+        mkdir /var/lib/dpkg/info && \
+        apt-get install libglib2.0-dev -y; }; } && \
+    pip install pytest-runner==5.3.0
diff --git a/official/nlp/emotect/docker_start.sh b/official/nlp/emotect/docker_start.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b5405cfe9616e266f625a0f0a11277535a64aa0b
--- /dev/null
+++ b/official/nlp/emotect/docker_start.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+docker_image=$1
+data_dir=$2
+model_dir=$3
+
+docker run -it --ipc=host \
+               --device=/dev/davinci3 \
+               --device=/dev/davinci4 \
+               --device=/dev/davinci5 \
+               --device=/dev/davinci6 \
+               --device=/dev/davinci7 \
+               --device=/dev/davinci_manager \
+               --device=/dev/devmm_svm --device=/dev/hisi_hdc \
+               -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
+               -v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ \
+               -v ${model_dir}:${model_dir} \
+               -v ${data_dir}:${data_dir} \
+               -v ~/ascend/log/npu/conf/slog/slog.conf:/var/log/npu/conf/slog/slog.conf \
+               -v ~/ascend/log/npu/slog/:/var/log/npu/slog -v ~/ascend/log/npu/profiling/:/var/log/npu/profiling \
+               -v ~/ascend/log/npu/dump/:/var/log/npu/dump -v ~/ascend/log/npu/:/usr/slog ${docker_image} \
+               /bin/bash
diff --git a/official/nlp/emotect/infer/convert/convert.sh b/official/nlp/emotect/infer/convert/convert.sh
new file mode 100644
index 0000000000000000000000000000000000000000..9d2afbfe71305b727d0f84daba292a919da30b69
--- /dev/null
+++ b/official/nlp/emotect/infer/convert/convert.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+air_path=$1
+om_path=$2
+
+echo "Input AIR file path: ${air_path}"
+echo "Output OM file path: ${om_path}"
+
+atc --framework=1 --model="${air_path}" \
+    --output="${om_path}" \
+    --soc_version=Ascend310 \
+    --op_select_implmode="high_precision"
\ No newline at end of file
diff --git a/official/nlp/emotect/infer/data/config/emotect.pipeline b/official/nlp/emotect/infer/data/config/emotect.pipeline
new file mode 100644
index 0000000000000000000000000000000000000000..ab2564ab522010e5e9afe27d4e4390a1be5e8ff6
--- /dev/null
+++ b/official/nlp/emotect/infer/data/config/emotect.pipeline
@@ -0,0 +1,46 @@
+{
+    "im_emotect": {
+        "stream_config": {
+            "deviceId": "0"
+        },
+        "appsrc0": {
+            "props": {
+                "blocksize": "409600"
+            },
+            "factory": "appsrc",
+            "next": "mxpi_tensorinfer0:0"
+        },
+        "appsrc1": {
+            "props": {
+                "blocksize": "409600"
+            },
+            "factory": "appsrc",
+            "next": "mxpi_tensorinfer0:1"
+        },
+        "appsrc2": {
+            "props": {
+                "blocksize": "409600"
+            },
+            "factory": "appsrc",
+            "next": "mxpi_tensorinfer0:2"
+        },
+        "mxpi_tensorinfer0": {
+            "props": {
+                "dataSource": "appsrc0,appsrc1,appsrc2",
+                "modelPath": "../data/model/emotect.om"
+            },
+            "factory": "mxpi_tensorinfer",
+            "next": "mxpi_dataserialize0"
+        },
+        "mxpi_dataserialize0": {
+            "props": {
+                "outputDataKeys": "mxpi_tensorinfer0"
+            },
+            "factory": "mxpi_dataserialize",
+            "next": "appsink0"
+        },
+        "appsink0": {
+            "factory": "appsink"
+        }
+    }
+}
\ No newline at end of file
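The pipeline above wires three appsrc elements into mxpi_tensorinfer0; the order of the "dataSource" list must match the ":0/:1/:2" input ports, or the model will silently receive the ids/mask/segment tensors in the wrong slots. A minimal sketch that checks this wiring before a stream is created (standard library only; the pipeline path is the example location used in this repo):

    import json

    with open("../data/config/emotect.pipeline", "r") as f:
        pipeline = json.load(f)["im_emotect"]

    sources = pipeline["mxpi_tensorinfer0"]["props"]["dataSource"].split(",")
    for port, name in enumerate(sources):
        # each dataSource entry must be an appsrc feeding the matching port
        assert pipeline[name]["factory"] == "appsrc"
        assert pipeline[name]["next"] == "mxpi_tensorinfer0:{}".format(port)
    print("pipeline wiring OK:", sources)
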
diff --git a/official/nlp/emotect/infer/docker_start_infer.sh b/official/nlp/emotect/infer/docker_start_infer.sh
new file mode 100644
index 0000000000000000000000000000000000000000..072b0819ae7edbe63d62c98a039c43469157f792
--- /dev/null
+++ b/official/nlp/emotect/infer/docker_start_infer.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+docker_image=$1
+data_dir=$2
+
+function show_help() {
+    echo "Usage: docker_start_infer.sh docker_image data_dir"
+}
+
+function param_check() {
+    if [ -z "${docker_image}" ]; then
+        echo "please input docker_image"
+        show_help
+        exit 1
+    fi
+
+    if [ -z "${data_dir}" ]; then
+        echo "please input data_dir"
+        show_help
+        exit 1
+    fi
+}
+
+param_check
+
+docker run -it \
+  --device=/dev/davinci0 \
+  --device=/dev/davinci_manager \
+  --device=/dev/devmm_svm \
+  --device=/dev/hisi_hdc \
+  -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
+  -v ${data_dir}:${data_dir} \
+  ${docker_image} \
+  /bin/bash
diff --git a/official/nlp/emotect/infer/mxbase/CMakeLists.txt b/official/nlp/emotect/infer/mxbase/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c50c162cb874b008928a019196b217a34c2c2a66
--- /dev/null
+++ b/official/nlp/emotect/infer/mxbase/CMakeLists.txt
@@ -0,0 +1,51 @@
+cmake_minimum_required(VERSION 3.10.0)
+project(emotect)
+
+set(TARGET emotect)
+
+add_definitions(-DENABLE_DVPP_INTERFACE)
+add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0)
+add_definitions(-Dgoogle=mindxsdk_private)
+add_compile_options(-std=c++11 -fPIE -fstack-protector-all -fPIC -Wall)
+add_link_options(-Wl,-z,relro,-z,now,-z,noexecstack -s -pie)
+
+# Check environment variables
+if(NOT DEFINED ENV{ASCEND_HOME})
+    message(FATAL_ERROR "please define environment variable:ASCEND_HOME")
+endif()
+if(NOT DEFINED ENV{ASCEND_VERSION})
+    message(WARNING "please define environment variable:ASCEND_VERSION")
+endif()
+if(NOT DEFINED ENV{ARCH_PATTERN})
+    message(WARNING "please define environment variable:ARCH_PATTERN")
+endif()
+set(ACL_INC_DIR $ENV{ASCEND_HOME}/$ENV{ASCEND_VERSION}/$ENV{ARCH_PATTERN}/acllib/include)
+set(ACL_LIB_DIR $ENV{ASCEND_HOME}/$ENV{ASCEND_VERSION}/$ENV{ARCH_PATTERN}/acllib/lib64)
+
+set(MXBASE_ROOT_DIR $ENV{MX_SDK_HOME})
+set(MXBASE_INC ${MXBASE_ROOT_DIR}/include)
+set(MXBASE_LIB_DIR ${MXBASE_ROOT_DIR}/lib)
+set(MXBASE_POST_LIB_DIR ${MXBASE_ROOT_DIR}/lib/modelpostprocessors)
+set(MXBASE_POST_PROCESS_DIR ${MXBASE_ROOT_DIR}/include/MxBase/postprocess/include)
+if(DEFINED ENV{MXSDK_OPENSOURCE_DIR})
+    set(OPENSOURCE_DIR $ENV{MXSDK_OPENSOURCE_DIR})
+else()
+    set(OPENSOURCE_DIR ${MXBASE_ROOT_DIR}/opensource)
+endif()
+
+include_directories(${ACL_INC_DIR})
+include_directories(${OPENSOURCE_DIR}/include)
+include_directories(${OPENSOURCE_DIR}/include/opencv4)
+
+include_directories(${MXBASE_INC})
+include_directories(${MXBASE_POST_PROCESS_DIR})
+
+link_directories(${ACL_LIB_DIR})
+link_directories(${OPENSOURCE_DIR}/lib)
+link_directories(${MXBASE_LIB_DIR})
+link_directories(${MXBASE_POST_LIB_DIR})
+
+add_executable(${TARGET} src/main.cpp src/EmotectBase.cpp)
+target_link_libraries(${TARGET} glog cpprest mxbase opencv_world stdc++fs)
+
+install(TARGETS ${TARGET} RUNTIME DESTINATION ${PROJECT_SOURCE_DIR}/)
diff --git a/official/nlp/emotect/infer/mxbase/build.sh b/official/nlp/emotect/infer/mxbase/build.sh
new file mode 100644
index 0000000000000000000000000000000000000000..33717531b2330480ef9f23f95c93a6ac203ee48e
--- /dev/null
+++ b/official/nlp/emotect/infer/mxbase/build.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+path_cur=$(dirname $0)
+
+function check_env()
+{
+    # set ASCEND_VERSION to ascend-toolkit/latest when it was not specified by user
+    if [ ! "${ASCEND_VERSION}" ]; then
+        export ASCEND_VERSION=ascend-toolkit/latest
+        echo "Set ASCEND_VERSION to the default value: ${ASCEND_VERSION}"
+    else
+        echo "ASCEND_VERSION is set to ${ASCEND_VERSION} by user"
+    fi
+
+    if [ ! "${ARCH_PATTERN}" ]; then
+        # set ARCH_PATTERN to ./ when it was not specified by user
+        export ARCH_PATTERN=./
+        echo "ARCH_PATTERN is set to the default value: ${ARCH_PATTERN}"
+    else
+        echo "ARCH_PATTERN is set to ${ARCH_PATTERN} by user"
+    fi
+}
+
+function build_emotect()
+{
+    cd $path_cur
+    rm -rf build
+    mkdir -p build
+    cd build
+    cmake ..
+    make
+    ret=$?
+    if [ ${ret} -ne 0 ]; then
+        echo "Failed to build emotect."
+        exit ${ret}
+    fi
+    make install
+}
+
+check_env
+build_emotect
\ No newline at end of file
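CMakeLists.txt resolves the ACL headers from ASCEND_HOME/ASCEND_VERSION/ARCH_PATTERN, and build.sh only fills in defaults for the last two. A hedged pre-flight sketch to run before build.sh (the /usr/local/Ascend fallback is an assumption, not something build.sh sets):

    import os

    # mirror check_env's defaults; ASCEND_HOME itself must be exported by the user
    ascend_home = os.environ.get("ASCEND_HOME", "/usr/local/Ascend")  # assumed default
    ascend_version = os.environ.get("ASCEND_VERSION", "ascend-toolkit/latest")
    arch_pattern = os.environ.get("ARCH_PATTERN", "./")
    acl_inc = os.path.join(ascend_home, ascend_version, arch_pattern, "acllib/include")
    print("ACL include dir:", acl_inc, "->",
          "found" if os.path.isdir(acl_inc) else "MISSING")
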
diff --git a/official/nlp/emotect/infer/mxbase/src/EmotectBase.cpp b/official/nlp/emotect/infer/mxbase/src/EmotectBase.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e8240aeb687de19eb1786f372c3cbf74cff0aee9
--- /dev/null
+++ b/official/nlp/emotect/infer/mxbase/src/EmotectBase.cpp
@@ -0,0 +1,253 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "EmotectBase.h"
+#include <unistd.h>
+#include <sys/stat.h>
+#include <algorithm>
+#include <map>
+#include <fstream>
+#include "MxBase/DeviceManager/DeviceManager.h"
+#include "MxBase/Log/Log.h"
+
+const uint32_t MAX_LENGTH = 64;
+const uint32_t CLASS_NUM = 3;
+
+APP_ERROR EmotectBase::Init(const InitParam &initParam) {
+    deviceId_ = initParam.deviceId;
+    APP_ERROR ret = MxBase::DeviceManager::GetInstance()->InitDevices();
+    if (ret != APP_ERR_OK) {
+        LogError << "Init devices failed, ret=" << ret << ".";
+        return ret;
+    }
+    ret = MxBase::TensorContext::GetInstance()->SetContext(initParam.deviceId);
+    if (ret != APP_ERR_OK) {
+        LogError << "Set context failed, ret=" << ret << ".";
+        return ret;
+    }
+    model_ = std::make_shared<MxBase::ModelInferenceProcessor>();
+    ret = model_->Init(initParam.modelPath, modelDesc_);
+    if (ret != APP_ERR_OK) {
+        LogError << "ModelInferenceProcessor init failed, ret=" << ret << ".";
+        return ret;
+    }
+    return APP_ERR_OK;
+}
+
+APP_ERROR EmotectBase::DeInit() {
+    model_->DeInit();
+    MxBase::DeviceManager::GetInstance()->DestroyDevices();
+    return APP_ERR_OK;
+}
+
+APP_ERROR EmotectBase::ReadTensorFromFile(const std::string &file, uint32_t *data) {
+    if (data == NULL) {
+        LogError << "input data is invalid.";
+        return APP_ERR_COMM_INVALID_POINTER;
+    }
+    std::ifstream infile;
+    // open label file
+    infile.open(file, std::ios_base::in | std::ios_base::binary);
+    // check label file validity
+    if (infile.fail()) {
+        LogError << "Failed to open label file: " << file << ".";
+        return APP_ERR_COMM_OPEN_FAIL;
+    }
+    infile.read(reinterpret_cast<char*>(data), sizeof(uint32_t) * MAX_LENGTH);
+    infile.close();
+    return APP_ERR_OK;
+}
+
+APP_ERROR EmotectBase::ReadInputTensor(const std::string &fileName, uint32_t index,
+                                       std::vector<MxBase::TensorBase> *inputs) {
+    uint32_t data[MAX_LENGTH] = {0};
+    APP_ERROR ret = ReadTensorFromFile(fileName, data);
+    if (ret != APP_ERR_OK) {
+        LogError << "ReadTensorFromFile failed.";
+        return ret;
+    }
+
+    const uint32_t dataSize = modelDesc_.inputTensors[index].tensorSize;
+    MxBase::MemoryData memoryDataDst(dataSize, MxBase::MemoryData::MEMORY_DEVICE, deviceId_);
+    MxBase::MemoryData memoryDataSrc(reinterpret_cast<void*>(data), dataSize, MxBase::MemoryData::MEMORY_HOST_MALLOC);
+    ret = MxBase::MemoryHelper::MxbsMallocAndCopy(memoryDataDst, memoryDataSrc);
+    if (ret != APP_ERR_OK) {
+        LogError << GetError(ret) << "Memory malloc and copy failed.";
+        return ret;
+    }
+
+    std::vector<uint32_t> shape = {1, MAX_LENGTH};
+    inputs->push_back(MxBase::TensorBase(memoryDataDst, false, shape, MxBase::TENSOR_DTYPE_UINT32));
+    return APP_ERR_OK;
+}
+
+APP_ERROR EmotectBase::Inference(const std::vector<MxBase::TensorBase> &inputs,
+                                 std::vector<MxBase::TensorBase> *outputs) {
+    auto dtypes = model_->GetOutputDataType();
+    for (size_t i = 0; i < modelDesc_.outputTensors.size(); ++i) {
+        std::vector<uint32_t> shape = {};
+        for (size_t j = 0; j < modelDesc_.outputTensors[i].tensorDims.size(); ++j) {
+            shape.push_back((uint32_t)modelDesc_.outputTensors[i].tensorDims[j]);
+        }
+        MxBase::TensorBase tensor(shape, dtypes[i], MxBase::MemoryData::MemoryType::MEMORY_DEVICE, deviceId_);
+        APP_ERROR ret = MxBase::TensorBase::TensorBaseMalloc(tensor);
+        if (ret != APP_ERR_OK) {
+            LogError << "TensorBaseMalloc failed, ret=" << ret << ".";
+            return ret;
+        }
+        outputs->push_back(tensor);
+    }
+
+    MxBase::DynamicInfo dynamicInfo = {};
+    dynamicInfo.dynamicType = MxBase::DynamicType::STATIC_BATCH;
+    auto startTime = std::chrono::high_resolution_clock::now();
+    APP_ERROR ret = model_->ModelInference(inputs, *outputs, dynamicInfo);
+    auto endTime = std::chrono::high_resolution_clock::now();
+    double costMs = std::chrono::duration<double, std::milli>(endTime - startTime).count();
+    g_infer_cost.push_back(costMs);
+    if (ret != APP_ERR_OK) {
+        LogError << "ModelInference failed, ret=" << ret << ".";
+        return ret;
+    }
+    return APP_ERR_OK;
+}
+
+APP_ERROR EmotectBase::PostProcess(std::vector<MxBase::TensorBase> *outputs, std::vector<uint32_t> *argmax) {
+    MxBase::TensorBase &tensor = outputs->at(0);
+    APP_ERROR ret = tensor.ToHost();
+    if (ret != APP_ERR_OK) {
+        LogError << GetError(ret) << "Tensor deploy to host failed.";
+        return ret;
+    }
+    // check tensor is available
+    auto outputShape = tensor.GetShape();
+    uint32_t length = outputShape[0];
+    uint32_t classNum = outputShape[1];
+    LogInfo << "output shape is: [" << outputShape[0] << ", " << outputShape[1] << "]" << std::endl;
+
+    void* data = tensor.GetBuffer();
+    for (uint32_t i = 0; i < length; i++) {
+        std::vector<float> result = {};
+        for (uint32_t j = 0; j < classNum; j++) {
+            float value = *(reinterpret_cast<float*>(data) + i * classNum + j);
+            result.push_back(value);
+        }
+        // argmax and get the class id
+        std::vector<float>::iterator maxElement = std::max_element(std::begin(result), std::end(result));
+        uint32_t argmaxIndex = maxElement - std::begin(result);
+        argmax->push_back(argmaxIndex);
+    }
+
+    return APP_ERR_OK;
+}
+
+APP_ERROR EmotectBase::CountPredictResult(const std::string &labelFile, const std::vector<uint32_t> &argmax) {
+    // the label file holds a single uint32; ReadTensorFromFile's read stops at EOF
+    uint32_t data[1] = {0};
+    APP_ERROR ret = ReadTensorFromFile(labelFile, data);
+    if (ret != APP_ERR_OK) {
+        LogError << "ReadTensorFromFile failed.";
+        return ret;
+    }
+    if (data[0] == argmax[0]) {
+        g_total_acc += 1;
+    }
+    g_total += 1;
+    return APP_ERR_OK;
+}
+
+APP_ERROR EmotectBase::WriteResult(const std::string &fileName, const std::vector<uint32_t> &argmax) {
+    std::string resultPathName = "result";
+    // create result directory when it does not exist
+    if (access(resultPathName.c_str(), 0) != 0) {
+        int ret = mkdir(resultPathName.c_str(), S_IRUSR | S_IWUSR | S_IXUSR);
+        if (ret != 0) {
+            LogError << "Failed to create result directory: " << resultPathName << ", ret = " << ret;
+            return APP_ERR_COMM_OPEN_FAIL;
+        }
+    }
+    // create result file under result directory
+    resultPathName = resultPathName + "/result.txt";
+    std::ofstream tfile(resultPathName, std::ofstream::app);
+    if (tfile.fail()) {
+        LogError << "Failed to open result file: " << resultPathName;
+        return APP_ERR_COMM_OPEN_FAIL;
+    }
+    // write inference result into file
+    LogInfo << "==============================================================";
+    LogInfo << "infer result of " << fileName << " is: ";
+    tfile << "file name is: " << fileName << std::endl;
+
+    for (auto &item : argmax) {
+        LogInfo << item;
+        tfile << item << std::endl;
+    }
+    LogInfo << "==============================================================";
+    tfile.close();
+    return APP_ERR_OK;
+}
+
+APP_ERROR EmotectBase::Process(const std::string &inferPath, const std::string &fileName, bool eval) {
+    std::vector<MxBase::TensorBase> inputs = {};
+    std::string inputIdsFile = inferPath + "00_data/" + fileName;
+    APP_ERROR ret = ReadInputTensor(inputIdsFile, INPUT_IDS, &inputs);
+    if (ret != APP_ERR_OK) {
+        LogError << "Read input ids failed, ret=" << ret << ".";
+        return ret;
+    }
+    std::string inputMaskFile = inferPath + "01_data/" + fileName;
+    ret = ReadInputTensor(inputMaskFile, INPUT_MASK, &inputs);
+    if (ret != APP_ERR_OK) {
+        LogError << "Read input mask file failed, ret=" << ret << ".";
+        return ret;
+    }
+    std::string tokenTypeIdFile = inferPath + "02_data/" + fileName;
+    ret = ReadInputTensor(tokenTypeIdFile, TOKEN_TYPE, &inputs);
+    if (ret != APP_ERR_OK) {
+        LogError << "Read token typeId file failed, ret=" << ret << ".";
+        return ret;
+    }
+
+    std::vector<MxBase::TensorBase> outputs = {};
+    ret = Inference(inputs, &outputs);
+    if (ret != APP_ERR_OK) {
+        LogError << "Inference failed, ret=" << ret << ".";
+        return ret;
+    }
+
+    std::vector<uint32_t> argmax;
+    ret = PostProcess(&outputs, &argmax);
+    if (ret != APP_ERR_OK) {
+        LogError << "PostProcess failed, ret=" << ret << ".";
+        return ret;
+    }
+
+    ret = WriteResult(fileName, argmax);
+    if (ret != APP_ERR_OK) {
+        LogError << "Save result failed, ret=" << ret << ".";
+        return ret;
+    }
+
+    if (eval) {
+        std::string labelFile = inferPath + "03_data/" + fileName;
+        ret = CountPredictResult(labelFile, argmax);
+        if (ret != APP_ERR_OK) {
+            LogError << "CountPredictResult read label failed, ret=" << ret << ".";
+            return ret;
+        }
+    }
+
+    return APP_ERR_OK;
+}
diff --git a/official/nlp/emotect/infer/mxbase/src/EmotectBase.h b/official/nlp/emotect/infer/mxbase/src/EmotectBase.h
new file mode 100644
index 0000000000000000000000000000000000000000..7ca47c8c1351031cef5e381bc0bec18b3b793412
--- /dev/null
+++ b/official/nlp/emotect/infer/mxbase/src/EmotectBase.h
@@ -0,0 +1,66 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MXBASE_EMOTECTBASE_H
+#define MXBASE_EMOTECTBASE_H
+
+#include <memory>
+#include <utility>
+#include <vector>
+#include <string>
+#include <map>
+#include <opencv2/opencv.hpp>
+#include "MxBase/DvppWrapper/DvppWrapper.h"
+#include "MxBase/ModelInfer/ModelInferenceProcessor.h"
+#include "MxBase/Tensor/TensorContext/TensorContext.h"
+
+extern std::vector<double> g_infer_cost;
+extern uint32_t g_total;
+extern uint32_t g_total_acc;
+
+struct InitParam {
+    uint32_t deviceId;
+    std::string labelPath;
+    std::string modelPath;
+    uint32_t classNum;
+};
+
+enum DataIndex {
+    INPUT_IDS = 0,
+    INPUT_MASK = 1,
+    TOKEN_TYPE = 2,
+};
+
+class EmotectBase {
+ public:
+    APP_ERROR Init(const InitParam &initParam);
+    APP_ERROR DeInit();
+    APP_ERROR Inference(const std::vector<MxBase::TensorBase> &inputs, std::vector<MxBase::TensorBase> *outputs);
+    APP_ERROR Process(const std::string &inferPath, const std::string &fileName, bool eval);
+    APP_ERROR PostProcess(std::vector<MxBase::TensorBase> *outputs, std::vector<uint32_t> *argmax);
+ protected:
+    APP_ERROR ReadTensorFromFile(const std::string &file, uint32_t *data);
+    APP_ERROR ReadInputTensor(const std::string &fileName, uint32_t index, std::vector<MxBase::TensorBase> *inputs);
+    APP_ERROR WriteResult(const std::string &fileName, const std::vector<uint32_t> &argmax);
+    APP_ERROR CountPredictResult(const std::string &labelFile, const std::vector<uint32_t> &argmax);
+ private:
+    std::shared_ptr<MxBase::ModelInferenceProcessor> model_;
+    MxBase::ModelDesc modelDesc_ = {};
+    std::vector<std::string> labelMap_ = {};
+    uint32_t deviceId_ = 0;
+    uint32_t classNum_ = 0;
+};
+#endif
diff --git a/official/nlp/emotect/infer/mxbase/src/main.cpp b/official/nlp/emotect/infer/mxbase/src/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..11d6c902e1d6d7d4a5f9c971f67bf29baf76a5c6
--- /dev/null
+++ b/official/nlp/emotect/infer/mxbase/src/main.cpp
@@ -0,0 +1,103 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <unistd.h>
+#include <dirent.h>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include "EmotectBase.h"
+#include "MxBase/Log/Log.h"
+
+std::vector<double> g_infer_cost;
+uint32_t g_total = 0;
+uint32_t g_total_acc = 0;
+
+void InitEmotectParam(InitParam* initParam) {
+    initParam->deviceId = 0;
+    initParam->modelPath = "../data/model/emotect.om";
+}
+
+APP_ERROR ReadFilesFromPath(const std::string &path, std::vector<std::string> *files) {
+    DIR *dir = NULL;
+    struct dirent *ptr = NULL;
+    if ((dir = opendir(path.c_str())) == NULL) {
+        LogError << "Open dir error: " << path;
+        return APP_ERR_COMM_OPEN_FAIL;
+    }
+
+    while ((ptr = readdir(dir)) != NULL) {
+        // DT_REG marks a regular file
+        if (ptr->d_type == DT_REG) {
+            files->push_back(ptr->d_name);
+        }
+    }
+    closedir(dir);
+    // sort ascending order
+    sort(files->begin(), files->end());
+    return APP_ERR_OK;
+}
+
+int main(int argc, char* argv[]) {
+    if (argc <= 2) {
+        LogWarn << "Please input data path and eval flag, such as './emotect ../data/input/ 0'.";
+        return APP_ERR_OK;
+    }
+
+    InitParam initParam;
+    InitEmotectParam(&initParam);
+    auto emotectBase = std::make_shared<EmotectBase>();
+    APP_ERROR ret = emotectBase->Init(initParam);
+    if (ret != APP_ERR_OK) {
+        LogError << "EmotectBase init failed, ret=" << ret << ".";
+        return ret;
+    }
+
+    std::string inferPath = argv[1];
+    std::vector<std::string> files;
+    ret = ReadFilesFromPath(inferPath + "00_data", &files);
+    if (ret != APP_ERR_OK) {
+        LogError << "Read files from path failed, ret=" << ret << ".";
+        return ret;
+    }
+    // when eval is non-zero, compare predictions against the 03_data labels
+    bool eval = atoi(argv[2]);
+    for (uint32_t i = 0; i < files.size(); i++) {
+        LogInfo << "read file name: " << files[i];
+        ret = emotectBase->Process(inferPath, files[i], eval);
+        if (ret != APP_ERR_OK) {
+            LogError << "EmotectBase process failed, ret=" << ret << ".";
+            emotectBase->DeInit();
+            return ret;
+        }
+    }
+
+    if (eval) {
+        LogInfo << "==============================================================";
+        float acc = (g_total_acc * 1.0) / (g_total * 1.0);
+        LogInfo << "Acc: " << acc;
+        LogInfo << "==============================================================";
+    }
+    emotectBase->DeInit();
+    double costSum = 0;
+    for (uint32_t i = 0; i < g_infer_cost.size(); i++) {
+        costSum += g_infer_cost[i];
+    }
+    double scale = 1000;
+    LogInfo << "Infer items sum " << g_infer_cost.size() << ", cost total time: " << costSum << " ms.";
+    LogInfo << "The throughput: " << g_infer_cost.size() * scale / costSum << " bin/sec.";
+    return APP_ERR_OK;
+}
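main.cpp prints accuracy as g_total_acc / g_total and throughput as items * 1000 / total_cost_ms. The same accuracy can be recomputed offline from the result/result.txt written by WriteResult plus the 03_data label files; a hedged sketch (it assumes every sample produced exactly one prediction line, which holds for the [1, 3] output above):

    import os
    import numpy as np

    def load_results(path="result/result.txt"):
        preds = {}
        with open(path) as f:
            lines = [line.strip() for line in f if line.strip()]
        # lines alternate: "file name is: emotect_0.bin", then one class id
        for name_line, pred_line in zip(lines[::2], lines[1::2]):
            preds[name_line.split(": ")[-1]] = int(pred_line)
        return preds

    def accuracy(preds, label_dir="../data/input/03_data"):
        hits = sum(int(np.fromfile(os.path.join(label_dir, name), np.uint32)[0]) == p
                   for name, p in preds.items())
        return hits / len(preds)
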
diff --git a/official/nlp/emotect/infer/sdk/build.sh b/official/nlp/emotect/infer/sdk/build.sh
new file mode 100644
index 0000000000000000000000000000000000000000..582965ed2b9f4aa027ea11eae744e50ba0fd7a53
--- /dev/null
+++ b/official/nlp/emotect/infer/sdk/build.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+if [ $# -ne 1 ]
+then
+    echo "=============================================================================================================="
+    echo "Please run the script as: "
+    echo "bash build.sh [TASK_TYPE]"
+    echo "for example: bash build.sh test"
+    echo "TASK_TYPE including [test, infer]"
+    echo "=============================================================================================================="
+exit 1
+fi
+
+set -e
+
+# Simple log helper functions
+info() { echo -e "\033[1;34m[INFO ][MxStream] $1\033[1;37m" ; }
+warn() { echo >&2 -e "\033[1;31m[WARN ][MxStream] $1\033[1;37m" ; }
+
+export LD_LIBRARY_PATH=${MX_SDK_HOME}/lib:${MX_SDK_HOME}/opensource/lib:${MX_SDK_HOME}/opensource/lib64:/usr/local/Ascend/ascend-toolkit/latest/acllib/lib64:${LD_LIBRARY_PATH}
+export GST_PLUGIN_SCANNER=${MX_SDK_HOME}/opensource/libexec/gstreamer-1.0/gst-plugin-scanner
+export GST_PLUGIN_PATH=${MX_SDK_HOME}/opensource/lib/gstreamer-1.0:${MX_SDK_HOME}/lib/plugins
+
+# set PYTHONPATH so that StreamManagerApi.py can be imported
+export PYTHONPATH=$PYTHONPATH:${MX_SDK_HOME}/python
+
+if [ "$1" == "test" ];then
+    python3.7 main.py --pipeline=../data/config/emotect.pipeline --data_dir=../data/input/ --output_file=./emotect_output.txt --do_eval=True --task_name=emotect
+else
+    python3.7 main.py --pipeline=../data/config/emotect.pipeline --data_dir=../data/infer/ --output_file=./emotect_infer_output.txt --do_eval=False --task_name=emotect
+fi
+
+exit 0
diff --git a/official/nlp/emotect/infer/sdk/main.py b/official/nlp/emotect/infer/sdk/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..f362967654072fef2837b8b93edbfa1259387cff
--- /dev/null
+++ b/official/nlp/emotect/infer/sdk/main.py
@@ -0,0 +1,204 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""
+Sample script of emotect inference using the SDK, run in docker.
+"""
+
+import argparse
+import glob
+import os
+import time
+
+import MxpiDataType_pb2 as MxpiDataType
+import numpy as np
+from StreamManagerApi import StreamManagerApi, MxDataInput, InProtobufVector, \
+    MxProtobufIn, StringVector
+
+class Accuracy():
+    '''
+    calculate accuracy
+    '''
+    def __init__(self):
+        self.acc_num = 0
+        self.total_num = 0
+
+    def update(self, logits, labels):
+        self.acc_num += np.sum(labels == logits)
+        self.total_num += len(labels)
+
+
+def parse_args():
+    """set and check parameters."""
+    parser = argparse.ArgumentParser(description="emotect process")
+    parser.add_argument("--pipeline", type=str, default="", help="SDK infer pipeline")
+    parser.add_argument("--data_dir", type=str, default="",
+                        help="Dataset contains input_ids, input_mask, segment_ids, label_ids")
+    parser.add_argument("--output_file", type=str, default="", help="save result to file")
+    parser.add_argument("--task_name", type=str, default="emotect", help="file-name prefix of the converted .bin files")
+    parser.add_argument("--do_eval", type=str, default="True", help="eval the accuracy of the model")
+    args_opt = parser.parse_args()
+    return args_opt
+
+
+def send_source_data(appsrc_id, filename, stream_name, stream_manager):
+    """
+    Construct the input of the stream and
+    send the input data to the specified stream based on stream_name.
+
+    Returns:
+        bool: send data success or not
+    """
+    tensor = np.fromfile(filename, dtype=np.int32)
+    tensor = np.expand_dims(tensor, 0)
+    tensor_package_list = MxpiDataType.MxpiTensorPackageList()
+    tensor_package = tensor_package_list.tensorPackageVec.add()
+    array_bytes = tensor.tobytes()
+    data_input = MxDataInput()
+    data_input.data = array_bytes
+    tensor_vec = tensor_package.tensorVec.add()
+    tensor_vec.deviceId = 0
+    tensor_vec.memType = 0
+    for i in tensor.shape:
+        tensor_vec.tensorShape.append(i)
+    tensor_vec.dataStr = data_input.data
+    tensor_vec.tensorDataSize = len(array_bytes)
+
+    key = "appsrc{}".format(appsrc_id).encode('utf-8')
+    protobuf_vec = InProtobufVector()
+    protobuf = MxProtobufIn()
+    protobuf.key = key
+    protobuf.type = b'MxTools.MxpiTensorPackageList'
+    protobuf.protobuf = tensor_package_list.SerializeToString()
+    protobuf_vec.push_back(protobuf)
+
+    ret = stream_manager.SendProtobuf(stream_name, appsrc_id, protobuf_vec)
+    if ret < 0:
+        print("Failed to send data to stream.")
+        return False
+    return True
+
+
+def send_appsrc_data(args_opt, file_name, stream_name, stream_manager):
+    """
+    send three streams to the infer model: input ids, input mask and token type_id.
+
+    Returns:
+        bool: send data success or not
+    """
+    input_ids = os.path.realpath(os.path.join(args_opt.data_dir, "00_data", file_name))
+    if not send_source_data(0, input_ids, stream_name, stream_manager):
+        return False
+    input_mask = os.path.realpath(os.path.join(args_opt.data_dir, "01_data", file_name))
+    if not send_source_data(1, input_mask, stream_name, stream_manager):
+        return False
+    token_type_id = os.path.realpath(os.path.join(args_opt.data_dir, "02_data", file_name))
+    if not send_source_data(2, token_type_id, stream_name, stream_manager):
+        return False
+    return True
+
+
+def post_process(args_opt, file_name, infer_result):
+    """
+    Convert the inference output tensor into a predicted label id.
+    Args:
+        args_opt: config parameters.
+        file_name: input sample file name.
+        infer_result: raw inference result holding the logits.
+    """
+    # print the infer result
+    print("==============================================================")
+    result = MxpiDataType.MxpiTensorPackageList()
+    result.ParseFromString(infer_result[0].messageBuf)
+    logit_id = np.frombuffer(result.tensorPackageVec[0].tensorVec[0].dataStr, dtype='<f4')
+    print("output tensor is: ", logit_id.shape)
+    print("post_process:")
+    print(logit_id)
+    logit_id = np.argmax(logit_id, axis=-1)
+
+    # output to file
+    result_label = str(logit_id)
+    print(result_label)
+    with open(args_opt.output_file, "a") as output_file:
+        output_file.write("{}: {}\n".format(file_name, str(result_label)))
+    return logit_id
+
+
+def run():
+    """
+    read pipeline and do infer
+    """
+    # init stream manager
+    stream_manager_api = StreamManagerApi()
+    ret = stream_manager_api.InitManager()
+    if ret != 0:
+        print("Failed to init Stream manager, ret=%s" % str(ret))
+        return
+
+    # create streams by pipeline config file
+    with open(os.path.realpath(args.pipeline), 'rb') as f:
+        pipeline_str = f.read()
+    ret = stream_manager_api.CreateMultipleStreams(pipeline_str)
+    if ret != 0:
+        print("Failed to create Stream, ret=%s" % str(ret))
+        return
+
+    stream_name = b'im_emotect'
+    infer_total_time = 0
+    # input_ids file list
+    file_list = glob.glob(os.path.join(os.path.realpath(args.data_dir), "00_data", "*.bin"))
+    data_prefix_len = len(args.task_name) + 1
+    file_num = len(file_list)
+    for i in range(file_num):
+        file_list[i] = file_list[i].split('/')[-1]
+    file_list = sorted(file_list, key=lambda name: int(name[data_prefix_len:-4]))
+    for file_name in file_list:
+        if not send_appsrc_data(args, file_name, stream_name, stream_manager_api):
+            return
+        # Obtain the inference result by specifying streamName and uniqueId.
+        key_vec = StringVector()
+        key_vec.push_back(b'mxpi_tensorinfer0')
+        start_time = time.time()
+        infer_result = stream_manager_api.GetProtobuf(stream_name, 0, key_vec)
+        infer_total_time += time.time() - start_time
+        if infer_result.size() == 0:
+            print("inferResult is null")
+            return
+        if infer_result[0].errorCode != 0:
+            print("GetProtobuf error. errorCode=%d" % (infer_result[0].errorCode))
+            return
+
+        logit_id = post_process(args, file_name, infer_result)
+        if args.do_eval.lower() == 'true':
+            label_file = os.path.realpath(os.path.join(args.data_dir, "03_data", file_name))
+            label_id = np.fromfile(label_file, np.int32)
+            callback.update(logit_id, label_id)
+
+    if args.do_eval.lower() == 'true':
+        print("==============================================================")
+        print("acc_num {} , total_num {}, accuracy {:.6f}".format(callback.acc_num, callback.total_num,
+                                                                  callback.acc_num / callback.total_num))
+        print("==============================================================")
+    scale = 1000.0
+    print("Infer items sum:", file_num, "infer_total_time:", infer_total_time * scale, "ms")
+    print("throughput:", file_num / infer_total_time, "bin/sec")
+    # destroy streams
+    stream_manager_api.DestroyAllStreams()
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    callback = Accuracy()
+    run()
diff --git a/official/nlp/emotect/infer/util/data_processor_seq.py b/official/nlp/emotect/infer/util/data_processor_seq.py
new file mode 100644
index 0000000000000000000000000000000000000000..a38537b8049c9cceedec5dc7c371233be72aae9c
--- /dev/null
+++ b/official/nlp/emotect/infer/util/data_processor_seq.py
@@ -0,0 +1,305 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+'''
+Dataset reader for preprocessing and converting dataset into bin.
+'''
+
+import io
+import os
+import argparse
+import collections
+import json
+import six
+import numpy as np
+from tokenizer import FullTokenizer
+
+
+def csv_reader(fd, delimiter='\t'):
+    """
+    load csv file
+    """
+    def gen():
+        for i in fd:
+            slots = i.rstrip('\n').split(delimiter)
+            if len(slots) == 1:
+                yield (slots,)
+            else:
+                yield slots
+    return gen()
+
+
+def convert_to_unicode(text):
+    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
+    if six.PY3:
+        if isinstance(text, str):
+            text = text
+        elif isinstance(text, bytes):
+            text = text.decode("utf-8", "ignore")
+        else:
+            raise ValueError("Unsupported string type: %s" % (type(text)))
+    elif six.PY2:
+        if isinstance(text, str):
+            text = text.decode("utf-8", "ignore")
+        elif isinstance(text, unicode):
+            text = text
+        else:
+            raise ValueError("Unsupported string type: %s" % (type(text)))
+    else:
+        raise ValueError("Not running on Python2 or Python 3?")
+    return text
+
+
+class BaseReader:
+    """BaseReader for classify and sequence labeling task"""
+
+    def __init__(self,
+                 vocab_path,
+                 label_map_config=None,
+                 max_seq_len=512,
+                 do_lower_case=True,
+                 in_tokens=False,
+                 random_seed=None):
+        self.max_seq_len = max_seq_len
+        self.tokenizer = FullTokenizer(
+            vocab_file=vocab_path, do_lower_case=do_lower_case)
+        self.vocab = self.tokenizer.vocab
+        self.pad_id = self.vocab["[PAD]"]
+        self.cls_id = self.vocab["[CLS]"]
+        self.sep_id = self.vocab["[SEP]"]
+        self.in_tokens = in_tokens
+
+        np.random.seed(random_seed)
+
+        self.current_example = 0
+        self.current_epoch = 0
+        self.num_examples = 0
+
+        if label_map_config:
+            # load the label mapping from its JSON config when one is provided
+            with io.open(label_map_config, "r", encoding="utf8") as f:
+                self.label_map = json.load(f)
+        else:
+            self.label_map = None
+
+    def _read_tsv(self, input_file):
+        """Reads a tab separated value file."""
+        with io.open(input_file, "r", encoding="utf8") as f:
+            reader = csv_reader(f, delimiter="\t")
+            headers = next(reader)
+            Example = collections.namedtuple('Example', headers)
+
+            examples = []
+            for line in reader:
+                example = Example(*line)
+                examples.append(example)
+            return examples
+
+    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
+        """Truncates a sequence pair in place to the maximum length."""
+
+        # This is a simple heuristic which will always truncate the longer sequence
+        # one token at a time. This makes more sense than truncating an equal percent
+        # of tokens from each, since if one sequence is very short then each token
+        # that's truncated likely contains more information than a longer sequence.
+        while True:
+            total_length = len(tokens_a) + len(tokens_b)
+            if total_length <= max_length:
+                break
+            if len(tokens_a) > len(tokens_b):
+                tokens_a.pop()
+            else:
+                tokens_b.pop()
+
+    def _convert_example_to_record(self, example, max_seq_length, tokenizer):
+        """Converts a single `Example` into a single `Record`."""
+
+        text_a = convert_to_unicode(example.text_a)
+        tokens_a = tokenizer.tokenize(text_a)
+        tokens_b = None
+        if "text_b" in example._fields:
+            text_b = convert_to_unicode(example.text_b)
+            tokens_b = tokenizer.tokenize(text_b)
+
+        if tokens_b:
+            # Modifies `tokens_a` and `tokens_b` in place so that the total
+            # length is less than the specified length.
+            # Account for [CLS], [SEP], [SEP] with "- 3"
+            self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
+        else:
+            # Account for [CLS] and [SEP] with "- 2"
+            if len(tokens_a) > max_seq_length - 2:
+                tokens_a = tokens_a[0:(max_seq_length - 2)]
+
+        # The convention in BERT/ERNIE is:
+        # (a) For sequence pairs:
+        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
+        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
+        # (b) For single sequences:
+        #  tokens:   [CLS] the dog is hairy . [SEP]
+        #  type_ids: 0     0   0   0  0     0 0
+        #
+        # Where "type_ids" are used to indicate whether this is the first
+        # sequence or the second sequence. The embedding vectors for `type=0` and
+        # `type=1` were learned during pre-training and are added to the wordpiece
+        # embedding vector (and position vector). This is not *strictly* necessary
+        # since the [SEP] token unambiguously separates the sequences, but it makes
+        # it easier for the model to learn the concept of sequences.
+        #
+        # For classification tasks, the first vector (corresponding to [CLS]) is
+        # used as the "sentence vector". Note that this only makes sense because
+        # the entire model is fine-tuned.
+        tokens = []
+        segment_ids = []
+        tokens.append("[CLS]")
+        segment_ids.append(0)
+        for token in tokens_a:
+            tokens.append(token)
+            segment_ids.append(0)
+        tokens.append("[SEP]")
+        segment_ids.append(0)
+
+        if tokens_b:
+            for token in tokens_b:
+                tokens.append(token)
+                segment_ids.append(1)
+            tokens.append("[SEP]")
+            segment_ids.append(1)
+
+        input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+        input_mask = [1] * len(input_ids)
+
+        while len(input_ids) < max_seq_length:
+            input_ids.append(0)
+            input_mask.append(0)
+            segment_ids.append(0)
+
+        if self.label_map:
+            label_id = self.label_map[example.label]
+        else:
+            label_id = example.label
+
+        Record = collections.namedtuple(
+            'Record',
+            ['input_ids', 'input_mask', 'segment_ids', 'label_id'])
+
+        record = Record(
+            input_ids=input_ids,
+            input_mask=input_mask,
+            segment_ids=segment_ids,
+            label_id=label_id)
+        return record
+
+    def get_num_examples(self, input_file):
+        """return total number of examples"""
+        examples = self._read_tsv(input_file)
+        return len(examples)
+
+    def get_examples(self, input_file):
+        examples = self._read_tsv(input_file)
+        return examples
+
+    def get_all_path(self, output_path):
+        """
+        Args:
+            output_path: save path of convert dataset
+        Returns:
+            the path of ids, mask, token, label
+        """
+        ids_path = os.path.join(output_path, "00_data")
+        mask_path = os.path.join(output_path, "01_data")
+        token_path = os.path.join(output_path, "02_data")
+        label_path = os.path.join(output_path, "03_data")
+        for path in [ids_path, mask_path, token_path, label_path]:
+            os.makedirs(path, 0o755, exist_ok=True)
+
+        return ids_path, mask_path, token_path, label_path
+
+    def file_based_convert_examples_to_features(self, input_file, output_file):
+        """Convert a set of `InputExample`s to a MindDataset file."""
+        examples = self._read_tsv(input_file)
+        output_ids, output_mask, output_token, output_label = self.get_all_path(output_file)
+        example_count = 0
+        for _, example in enumerate(examples):
+            record = self._convert_example_to_record(example, self.max_seq_len, self.tokenizer)
+            file_name = "emotect" + "_" + str(example_count) + ".bin"
+            ids_file_path = os.path.join(output_ids, file_name)
+            np.array(record.input_ids, dtype=np.int32).tofile(ids_file_path)
+
+            mask_file_path = os.path.join(output_mask, file_name)
+            np.array(record.input_mask, dtype=np.int32).tofile(mask_file_path)
+
+            token_file_path = os.path.join(output_token, file_name)
+            np.array(record.segment_ids, dtype=np.int32).tofile(token_file_path)
+
+            label_file_path = os.path.join(output_label, file_name)
+            np.array(record.label_id, dtype=np.int32).tofile(label_file_path)
+            example_count += 1
+            if example_count % 3000 == 0:
+                print(example_count)
+        print("total example:", example_count)
+
+
+class ClassifyReader(BaseReader):
+    """ClassifyReader"""
+
+    def _read_tsv(self, input_file):
+        """Reads a tab separated value file."""
+        with io.open(input_file, "r", encoding="utf8") as f:
+            reader = csv_reader(f, delimiter="\t")
+            headers = next(reader)
+            text_indices = [
+                index for index, h in enumerate(headers) if h != "label"
+            ]
+            Example = collections.namedtuple('Example', headers)
+
+            examples = []
+            for line in reader:
+                for index, text in enumerate(line):
+                    if index in text_indices:
+                        line[index] = text.replace(' ', '')
+                example = Example(*line)
+                examples.append(example)
+            return examples
+
+
+def main():
+    parser = argparse.ArgumentParser(description="read dataset and save it to bin")
+    parser.add_argument("--vocab_file", type=str, default="", help="vocab file")
+    parser.add_argument("--label_map_config", type=str, default=None, help="label mapping config file")
+    parser.add_argument("--max_seq_len", type=int, default=64,
+                        help="The maximum total input sequence length after WordPiece tokenization. "
+                             "Sequences longer than this will be truncated, and sequences shorter "
+                             "than this will be padded.")
+    parser.add_argument("--do_lower_case", type=bool, default=True,
+                        help="Whether to lower case the input text. "
+                             "Should be True for uncased models and False for cased models.")
+    parser.add_argument("--random_seed", type=int, default=1, help="random seed number")
+
+    parser.add_argument("--data_path", type=str, default="text.tsv", help="the format of infer file is tsv.")
+    parser.add_argument("--output_path", type=str, default="./data", help="the path of convert dataset.")
+
+    args_opt = parser.parse_args()
+    reader = ClassifyReader(
+        vocab_path=args_opt.vocab_file,
+        label_map_config=args_opt.label_map_config,
+        max_seq_len=args_opt.max_seq_len,
+        do_lower_case=args_opt.do_lower_case,
+        random_seed=args_opt.random_seed
+    )
+    reader.file_based_convert_examples_to_features(input_file=args_opt.data_path, output_file=args_opt.output_path)
+
+if __name__ == "__main__":
+    main()
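The record layout produced by _convert_example_to_record is easiest to see on a toy example. The sketch below substitutes a hand-written id table for FullTokenizer (which needs the real vocab.txt) but follows the same [CLS]/[SEP], mask and padding rules:

    toy_ids = {"[PAD]": 0, "[CLS]": 1, "[SEP]": 2, "good": 3, "mood": 4}
    max_seq_len = 8

    tokens = ["[CLS]", "good", "mood", "[SEP]"]
    input_ids = [toy_ids[t] for t in tokens]
    input_mask = [1] * len(input_ids)
    segment_ids = [0] * len(input_ids)
    while len(input_ids) < max_seq_len:  # pad, mirroring the loop above
        input_ids.append(toy_ids["[PAD]"])
        input_mask.append(0)
        segment_ids.append(0)
    print(input_ids)    # [1, 3, 4, 2, 0, 0, 0, 0]
    print(input_mask)   # [1, 1, 1, 1, 0, 0, 0, 0]
    print(segment_ids)  # [0, 0, 0, 0, 0, 0, 0, 0]
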
diff --git a/official/nlp/emotect/infer/util/run_dataconvert_emotect.sh b/official/nlp/emotect/infer/util/run_dataconvert_emotect.sh
new file mode 100644
index 0000000000000000000000000000000000000000..196aad2ba24c437295c4bdb52ce3def2ec436ef8
--- /dev/null
+++ b/official/nlp/emotect/infer/util/run_dataconvert_emotect.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+
+# Simple log helper functions
+info() { echo -e "\033[1;34m[INFO ][MxStream] $1\033[1;37m" ; }
+warn() { echo >&2 -e "\033[1;31m[WARN ][MxStream] $1\033[1;37m" ; }
+
+export LD_LIBRARY_PATH=${MX_SDK_HOME}/lib:${MX_SDK_HOME}/opensource/lib:${MX_SDK_HOME}/opensource/lib64:/usr/local/Ascend/ascend-toolkit/latest/acllib/lib64:${LD_LIBRARY_PATH}
+export GST_PLUGIN_SCANNER=${MX_SDK_HOME}/opensource/libexec/gstreamer-1.0/gst-plugin-scanner
+export GST_PLUGIN_PATH=${MX_SDK_HOME}/opensource/lib/gstreamer-1.0:${MX_SDK_HOME}/lib/plugins
+
+# set PYTHONPATH so that StreamManagerApi.py can be imported
+export PYTHONPATH=$PYTHONPATH:${MX_SDK_HOME}/python
+
+cp ../../src/tokenizer.py ./
+# test
+python3.7 data_processor_seq.py --vocab_file=../data/config/vocab.txt --max_seq_len=64 --data_path=../data/rawdata/test.tsv --output_path=../data/input/
+# infer
+python3.7 data_processor_seq.py --vocab_file=../data/config/vocab.txt --max_seq_len=64 --data_path=../data/rawdata/infer.tsv --output_path=../data/infer/
+exit 0
diff --git a/official/nlp/emotect/modelart/start.py b/official/nlp/emotect/modelart/start.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e08cb09fe477b10f7362af7fb1c58d29e924587
--- /dev/null
+++ b/official/nlp/emotect/modelart/start.py
@@ -0,0 +1,238 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+'''
+Ernie finetune and evaluation script.
+'''
+
+import os
+import time
+import argparse
+import numpy as np
+
+import mindspore.common.dtype as mstype
+from mindspore import Tensor, context, export
+from mindspore import log as logger
+from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell
+from mindspore.nn.optim import Adam, AdamWeightDecay, Adagrad
+from mindspore.train.model import Model
+from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, TimeMonitor
+from mindspore.train.serialization import load_checkpoint, load_param_into_net
+
+from src.ernie_for_finetune import ErnieFinetuneCell, ErnieCLS
+from src.finetune_eval_config import optimizer_cfg, ernie_net_cfg
+from src.dataset import create_classification_dataset
+from src.assessment_method import Accuracy
+from src.utils import make_directory, LossCallBack, LoadNewestCkpt, ErnieLearningRate
+
+_cur_dir = os.getcwd()
+CACHE_TRAINING_URL = "/cache/training/"
+
+if not os.path.isdir(CACHE_TRAINING_URL):
+    os.makedirs(CACHE_TRAINING_URL)
+
+def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoint_path="", epoch_num=1):
+    """ do train """
+    if load_checkpoint_path == "":
+        raise ValueError("Pretrained model is missing; the finetune task must load a pretrained model!")
+    steps_per_epoch = 500
+    # optimizer
+    if optimizer_cfg.optimizer == 'AdamWeightDecay':
+        lr_schedule = ErnieLearningRate(learning_rate=optimizer_cfg.AdamWeightDecay.learning_rate,
+                                        end_learning_rate=optimizer_cfg.AdamWeightDecay.end_learning_rate,
+                                        warmup_steps=int(steps_per_epoch * epoch_num * 0.1),
+                                        decay_steps=steps_per_epoch * epoch_num,
+                                        power=optimizer_cfg.AdamWeightDecay.power)
+        params = network.trainable_params()
+        decay_params = list(filter(optimizer_cfg.AdamWeightDecay.decay_filter, params))
+        other_params = list(filter(lambda x: not optimizer_cfg.AdamWeightDecay.decay_filter(x), params))
+        group_params = [{'params': decay_params, 'weight_decay': optimizer_cfg.AdamWeightDecay.weight_decay},
+                        {'params': other_params, 'weight_decay': 0.0}]
+
+        optimizer = AdamWeightDecay(group_params, lr_schedule, eps=optimizer_cfg.AdamWeightDecay.eps)
+    elif optimizer_cfg.optimizer == 'Adam':
+        optimizer = Adam(network.trainable_params(), learning_rate=optimizer_cfg.Adam.learning_rate)
+    elif optimizer_cfg.optimizer == 'Adagrad':
+        optimizer = Adagrad(network.trainable_params(), learning_rate=optimizer_cfg.Adagrad.learning_rate)
+    # load checkpoint into network
+    ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=10)
+    ckpoint_cb = ModelCheckpoint(prefix="classifier",
+                                 directory=None if save_checkpoint_path == "" else save_checkpoint_path,
+                                 config=ckpt_config)
+    param_dict = load_checkpoint(load_checkpoint_path)
+    unloaded_params = load_param_into_net(network, param_dict)
+    if len(unloaded_params) > 2:
+        print(unloaded_params)
+        logger.warning('Loading ernie model failed, please check the checkpoint file.')
+
+    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2 ** 32, scale_factor=2, scale_window=1000)
+    netwithgrads = ErnieFinetuneCell(network, optimizer=optimizer, scale_update_cell=update_cell)
+    model = Model(netwithgrads)
+    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(dataset.get_dataset_size()), ckpoint_cb]
+    model.train(epoch_num, dataset, callbacks=callbacks)
+
+
+def do_eval(dataset=None, network=None, num_class=2, load_checkpoint_path=""):
+    """ do eval """
+    if load_checkpoint_path == "":
+        raise ValueError("Finetuned model is missing; the evaluation task must load a finetuned model!")
+    net_for_pretraining = network(ernie_net_cfg, False, num_class)
+    net_for_pretraining.set_train(False)
+    param_dict = load_checkpoint(load_checkpoint_path)
+    load_param_into_net(net_for_pretraining, param_dict)
+
+    callback = Accuracy()
+
+    evaluate_times = []
+    columns_list = ["input_ids", "input_mask", "segment_ids", "label_ids"]
+    for data in dataset.create_dict_iterator(num_epochs=1):
+        input_data = []
+        for i in columns_list:
+            input_data.append(data[i])
+        input_ids, input_mask, token_type_id, label_ids = input_data
+        time_begin = time.time()
+        logits = net_for_pretraining(input_ids, input_mask, token_type_id, label_ids)
+        time_end = time.time()
+        evaluate_times.append(time_end - time_begin)
+        callback.update(logits, label_ids)
+    print("==============================================================")
+    print("acc_num {} , total_num {}, accuracy {:.6f}".format(callback.acc_num, callback.total_num,
+                                                              callback.acc_num / callback.total_num))
+    print("(w/o first and last) elapsed time: {}, per step time : {}".format(
+        sum(evaluate_times[1:-1]), sum(evaluate_times[1:-1])/(len(evaluate_times) - 2)))
+    print("==============================================================")
+
+
+def run_classifier():
+    """run classifier task"""
+    parser = argparse.ArgumentParser(description="run classifier")
+    parser.add_argument("--device_target", type=str, default="Ascend", choices=["Ascend", "GPU"],
+                        help="Device type, default is Ascend")
+    parser.add_argument("--do_train", type=str, default="false", choices=["true", "false"],
+                        help="Enable train, default is false")
+    parser.add_argument("--do_eval", type=str, default="false", choices=["true", "false"],
+                        help="Enable eval, default is false")
+    parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.")
+    parser.add_argument("--epoch_num", type=int, default=3, help="Epoch number, default is 3.")
+    parser.add_argument("--num_class", type=int, default=3, help="The number of class, default is 3.")
+    parser.add_argument("--train_data_shuffle", type=str, default="true", choices=["true", "false"],
+                        help="Enable train data shuffle, default is true")
+    parser.add_argument("--eval_data_shuffle", type=str, default="false", choices=["true", "false"],
+                        help="Enable eval data shuffle, default is false")
+    parser.add_argument("--train_batch_size", type=int, default=32, help="Train batch size, default is 32")
+    parser.add_argument("--eval_batch_size", type=int, default=1, help="Eval batch size, default is 1")
+    parser.add_argument("--save_finetune_checkpoint_path", type=str, default="", help="Save checkpoint path")
+    parser.add_argument("--load_pretrain_checkpoint_path", type=str, default="", help="Load checkpoint file path")
+    parser.add_argument("--local_pretrain_checkpoint_path", type=str, default="",
+                        help="Local pretrain checkpoint file path")
+    parser.add_argument("--load_finetune_checkpoint_path", type=str, default="", help="Load checkpoint file path")
+    parser.add_argument("--train_data_file_path", type=str, default="",
+                        help="Data path, it is better to use absolute path")
+    parser.add_argument("--eval_data_file_path", type=str, default="",
+                        help="Data path, it is better to use absolute path")
+    parser.add_argument("--schema_file_path", type=str, default="",
+                        help="Schema path, it is better to use absolute path")
+    parser.add_argument('--data_url', type=str, default=None, help='Dataset path for ModelArts')
+    parser.add_argument('--train_url', type=str, default=None, help='Train output path for ModelArts')
+    parser.add_argument('--modelarts', type=str, default='false',
+                        help='train on modelarts or not, default is false')
+    args_opt = parser.parse_args()
+
+    epoch_num = args_opt.epoch_num
+    load_pretrain_checkpoint_path = args_opt.load_pretrain_checkpoint_path
+    save_finetune_checkpoint_path = args_opt.save_finetune_checkpoint_path
+    load_finetune_checkpoint_path = args_opt.load_finetune_checkpoint_path
+
+    if args_opt.modelarts.lower() == 'true':
+        import moxing as mox
+        mox.file.copy_parallel(args_opt.data_url, '/cache/data')
+        mox.file.copy_parallel(args_opt.load_pretrain_checkpoint_path, args_opt.local_pretrain_checkpoint_path)
+        load_pretrain_checkpoint_path = args_opt.local_pretrain_checkpoint_path
+        if args_opt.do_train.lower() == "false" and args_opt.do_eval.lower() == "true":
+            mox.file.copy_parallel(args_opt.save_finetune_checkpoint_path, args_opt.load_finetune_checkpoint_path)
+
+    if args_opt.do_train.lower() == "false" and args_opt.do_eval.lower() == "false":
+        raise ValueError("At least one of 'do_train' or 'do_eval' must be true")
+    if args_opt.do_train.lower() == "true" and args_opt.train_data_file_path == "":
+        raise ValueError("'train_data_file_path' must be set when do finetune task")
+    if args_opt.do_eval.lower() == "true" and args_opt.eval_data_file_path == "":
+        raise ValueError("'eval_data_file_path' must be set when do evaluation task")
+
+    target = args_opt.device_target
+    if target == "Ascend":
+        context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args_opt.device_id)
+    elif target == "GPU":
+        context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
+        if ernie_net_cfg.compute_type != mstype.float32:
+            logger.warning('GPU only support fp32 temporarily, run with fp32.')
+            ernie_net_cfg.compute_type = mstype.float32
+    else:
+        raise Exception("Target error, GPU or Ascend is supported.")
+
+    netwithloss = ErnieCLS(ernie_net_cfg, True, num_labels=args_opt.num_class, dropout_prob=0.1)
+
+    if args_opt.do_train.lower() == "true":
+        ds = create_classification_dataset(batch_size=args_opt.train_batch_size, repeat_count=1,
+                                           data_file_path=args_opt.train_data_file_path,
+                                           schema_file_path=args_opt.schema_file_path,
+                                           do_shuffle=(args_opt.train_data_shuffle.lower() == "true"))
+        do_train(ds, netwithloss, load_pretrain_checkpoint_path, save_finetune_checkpoint_path, epoch_num)
+
+        if save_finetune_checkpoint_path == "":
+            load_finetune_checkpoint_dir = _cur_dir
+        else:
+            load_finetune_checkpoint_dir = make_directory(save_finetune_checkpoint_path)
+        load_finetune_checkpoint_path = LoadNewestCkpt(load_finetune_checkpoint_dir,
+                                                       ds.get_dataset_size(), epoch_num, "classifier")
+        # frozen_to_air
+        ckpt_model = load_finetune_checkpoint_path
+        frozen_to_air_args = {'ckpt_file': ckpt_model,
+                              'batch_size': 1,
+                              'file_name': CACHE_TRAINING_URL + 'emotect.air',
+                              'file_format': 'AIR'}
+        net = ErnieCLS(ernie_net_cfg, False, num_labels=args_opt.num_class)
+        frozen_to_air(net, frozen_to_air_args)
+
+        if args_opt.modelarts.lower() == 'true':
+            mox.file.copy_parallel(CACHE_TRAINING_URL, args_opt.train_url)
+
+    if args_opt.do_eval.lower() == "true":
+        ds = create_classification_dataset(batch_size=args_opt.eval_batch_size, repeat_count=1,
+                                           data_file_path=args_opt.eval_data_file_path,
+                                           schema_file_path=args_opt.schema_file_path,
+                                           do_shuffle=(args_opt.eval_data_shuffle.lower() == "true"),
+                                           drop_remainder=False)
+        do_eval(ds, ErnieCLS, args_opt.num_class, load_finetune_checkpoint_path)
+
+    if args_opt.modelarts.lower() == 'true' and args_opt.do_train.lower() == "true":
+        mox.file.copy_parallel(load_finetune_checkpoint_path,
+                               args_opt.train_url + load_finetune_checkpoint_path.split('/')[-1])
+
+
+def frozen_to_air(net, args):
+    """frozen model to air"""
+    load_checkpoint(args.get("ckpt_file"), net=net)
+    net.set_train(False)
+
+    batch_size = args.get("batch_size")
+    input_ids = Tensor(np.zeros([batch_size, ernie_net_cfg.seq_length]), mstype.int32)
+    input_mask = Tensor(np.zeros([batch_size, ernie_net_cfg.seq_length]), mstype.int32)
+    token_type_id = Tensor(np.zeros([batch_size, ernie_net_cfg.seq_length]), mstype.int32)
+
+    input_data = [input_ids, input_mask, token_type_id]
+    export(net.ernie, *input_data, file_name=args.get("file_name"), file_format=args.get("file_format"))
+
+if __name__ == "__main__":
+    run_classifier()
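do_train configures ErnieLearningRate with a linear warmup over the first 10% of the total steps, then a polynomial decay to end_learning_rate. A hedged sketch of that schedule (the formula follows the usual warmup/poly-decay pattern; ErnieLearningRate itself lives in src/utils.py and may differ in detail):

    def lr_at(step, base_lr, end_lr, total_steps, warmup_ratio=0.1, power=1.0):
        warmup_steps = int(total_steps * warmup_ratio)
        if step < warmup_steps:
            # linear warmup from 0 up to base_lr
            return base_lr * (step + 1) / warmup_steps
        # polynomial decay from base_lr down to end_lr
        frac = (step - warmup_steps) / (total_steps - warmup_steps)
        return end_lr + (base_lr - end_lr) * (1 - frac) ** power

    total = 500 * 3  # steps_per_epoch * epoch_num, as hard-coded in do_train
    print([round(lr_at(s, 5e-5, 0.0, total), 8) for s in (0, 75, 149, 150, 750, 1499)])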