diff --git a/research/cv/MGN/README.md b/research/cv/MGN/README.md index e407392ceb7037bf6ef66f6055b9fa1c6914d505..d7496183e435cca2c355d358f0dca9737acb1c37 100644 --- a/research/cv/MGN/README.md +++ b/research/cv/MGN/README.md @@ -7,6 +7,7 @@ - [Environment Requirements](#environment-requirements) - [Quick Start](#quick-start) - [Running scripts](#running-scripts) + - [Run on Modelarts](#run-on-modelarts) - [Script Description](#script-description) - [Script and Sample Code](#script-and-sample-code) - [Script Parameters](#script-parameters) @@ -17,6 +18,7 @@ - [Evaluation Process](#evaluation-process) - [Inference Process](#inference-process) - [Export MindIR](#export-mindir) + - [Infer on Ascend310](#infer-on-ascend310) - [result](#result) - [Model Description](#model-description) - [Performance](#performance) @@ -86,7 +88,55 @@ bash scripts/run_standalone_train_gpu.sh 0 /path/to/market1501/ /path/to/output/ bash scripts/run_distribute_train_gpu.sh 8 /path/to/market1501/ /path/to/output/ /path/to/pretrined_resnet50.pth # run evaluation example -bash scripts/run_eval_gpu.sh /your/path/checkpoint_file +bash scripts/run_eval.sh /your/path/dataset /your/path/checkpoint_file GPU +``` + +### [Run on Modelarts](#contents) + +To run on ModelArts, please check the official ModelArts documentation first; you can then start training and evaluation as follows: + +```text +# Train 8p on ModelArts +# (1) Perform a or b. +# a. Set "enable_modelarts=True" on default_config.yaml file. +# Set "pre_trained_backbone='/cache/data/resnet50.ckpt'" on default_config.yaml file. +# Set "ckpt_path='/cache/train'" on default_config.yaml file if load pretrain. +# Set "train_log_path='/cache/data/logs/'" on default_config.yaml file if load pretrain. +# Set "log_path='/cache/data/logs/'" on default_config.yaml file if load pretrain. +# Set other parameters on default_config.yaml file you need. +# b. Add "enable_modelarts=True" on the website UI interface. 
+# Add "pre_trained_backbone='/cache/data/resnet50.ckpt'" on the website UI interface. +# Add "ckpt_path='/cache/train'" on the website UI interface if load pretrain. +# Add "train_log_path='/cache/data/logs/'" on the website UI interface if load pretrain. +# Add "log_path='/cache/data/logs/'" on the website UI interface if load pretrain. +# Add other parameters on the website UI interface. +# (2) (option) Upload or copy your pretrained model to S3 bucket if load pretrain. +# (3) Upload a zip dataset to S3 bucket. (you could also upload the origin dataset, but it can be so slow.) +# (4) Set the code directory to "/path/MGN" on the website UI interface. +# (5) Set the startup file to "train.py" on the website UI interface. +# (6) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface. +# (7) Create your job. +# +# Train 1p on ModelArts +# (1) Perform a or b. +# a. Set "enable_modelarts=True" on default_config.yaml file. +# Set "pre_trained_backbone='/cache/data/resnet50.ckpt'" on default_config.yaml file. +# Set "ckpt_path='/cache/train'" on default_config.yaml file if load pretrain. +# Set "train_log_path='/cache/data/logs/'" on default_config.yaml file if load pretrain. +# Set "log_path='/cache/data/logs/'" on default_config.yaml file if load pretrain. +# Set other parameters on default_config.yaml file you need. +# b. Add "enable_modelarts=True" on the website UI interface. +# Add "pre_trained_backbone='/cache/data/resnet50.ckpt'" on the website UI interface. +# Add "ckpt_path='/cache/train'" on the website UI interface if load pretrain. +# Add "train_log_path='/cache/data/logs/'" on the website UI interface if load pretrain. +# Add "log_path='/cache/data/logs/'" on the website UI interface if load pretrain. +# Add other parameters on the website UI interface. +# (2) (option) Upload or copy your pretrained model to S3 bucket if load pretrain. +# (3) Upload a zip dataset to S3 bucket. 
(you could also upload the origin dataset, but it can be so slow.) +# (4) Set the code directory to "/path/MGN" on the website UI interface. +# (5) Set the startup file to "train.py" on the website UI interface. +# (6) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface. +# (7) Create your job. ``` ## [Script Description](#contents) @@ -108,9 +158,12 @@ MGN │ ├── local_adapter.py # Environment variables parser │ └── moxing_adapter.py # Moxing adapter for ModelArts ├── scripts +│ ├── run_distribute_train_ascend.sh # Use the Market1501 data set to start Ascend distributed training (8 cards) │ ├── run_distribute_train_gpu.sh # Use the Market1501 data set to start GPU distributed training (8 cards) -│ ├── run_eval_gpu.sh # Use the Market1501 data set to start single GPU evaluation -│ └── run_standalone_train_gpu.sh # Use the Market1501 data set to start single GPU training +│ ├── run_eval.sh # Use the Market1501 data set to start single GPU or Ascend evaluation +│ ├── run_infer_310.sh # Ascend 310 infer +│ ├── run_standalone_train_gpu.sh # Use the Market1501 data set to start single GPU training +│ └── run_standalone_train_ascend.sh # Use the Market1501 data set to start single Ascend training ├── src │ ├── __init__.py │ ├── callbacks.py # Logging to file callbacks @@ -118,11 +171,14 @@ MGN │ ├── loss.py # MGN loss definition │ ├── lr_schedule.py # Learning rate scheduler │ ├── mgn.py # MGN network structure +│ ├── MGN_Callback.py # MGN network callback files │ ├── resnet.py # ResNet 50 network structure │ ├── sampler.py # Sampler definition │ └── triplet_loss.py # Triplet loss definition ├── eval.py # Evaluate the network ├── export.py # Export the network +├── postprocess.py # Ascend 310 infer postprocess +├── preprocess.py # Ascend 310 infer preprocess ├── train.py # Train the network └── README.md @@ -189,26 +245,38 @@ Parameters for learning rate: - Set options in `market1501_config.yaml`, including paths, 
learning rate and network hyperparameters. -- Run `run_standalone_train_gpu.sh` for non-distributed training of MGN model. +- Run `run_standalone_train_gpu.sh` for non-distributed training of MGN model in GPU. ```bash bash scripts/run_standalone_train_gpu.sh DEVICE_ID DATA_DIR OUTPUT_PATH PRETRAINED_RESNET50 ``` -- Run `run_distribute_train_gpu.sh` for distributed training of MGN model. +- Run `run_distribute_train_gpu.sh` for distributed training of MGN model in GPU. ```bash bash scripts/run_distribute_train_gpu.sh DEVICE_NUM DATA_DIR OUTPUT_PATH PRETRAINED_RESNET50 ``` +- Run `run_standalone_train_ascend.sh` for non-distributed training of MGN model in Ascend. + + ```bash + bash scripts/run_standalone_train_ascend.sh DEVICE_ID DATA_DIR OUTPUT_PATH PRETRAINED_RESNET50 + ``` + +- Run `run_distribute_train_ascend.sh` for distributed training of MGN model in Ascend. + + ```bash + bash scripts/run_distribute_train_ascend.sh DEVICE_NUM DATA_DIR OUTPUT_PATH PRETRAINED_RESNET50 HCCL_JSON + ``` + ## [Evaluation Process](#contents) - Set options in `market1501_config.yaml`. -- Run `bash scripts/run_eval_gpu.sh` for evaluation of MGN model. +- Run `bash scripts/run_eval.sh` for evaluation of MGN model. ```bash - bash scripts/run_eval_gpu.sh CKPT_PATH + bash scripts/run_eval.sh DATA_DIR CKPT_PATH DEVICE_TARGET ``` ## Inference Process @@ -222,12 +290,21 @@ options: --config_path path to .yml config file --ckpt_file checkpoint file --file_name output file name - --file_format output file format, choices in ['MINDIR'] + --file_format output file format, choices in ['MINDIR', 'AIR'] ``` The ckpt_file and config_path parameters are required, `FILE_FORMAT` should be in "MINDIR" +### Infer on Ascend310 + +Before performing inference, the mindir file must be exported by export.py script. We only provide an example of inference using MINDIR model. 
+ +```text +# Ascend310 inference +bash run_infer_310.sh [MINDIR_PATH] [DATA_PATH] [DEVICE_ID] +``` + ### result Inference result will be shown in the terminal @@ -238,32 +315,32 @@ Inference result will be shown in the terminal #### Training Performance -| Parameters | GPU | -| -------------------------- | -------------------------------------------------------------- | -| Resource | Tesla V100-PCIE 32G | -| uploaded Date | 12/17/2021 (month/day/year) | -| MindSpore Version | 1.5.0 | -| Dataset | Market1501 | -| Training Parameters | max_epoch=400, ids_per_batch=12, decay_epochs='320,380', lr_init=0.00015 | -| Optimizer | Adam | -| Loss Function | ReIDLoss | -| Speed | 401ms/step (1pcs), 368ms/step (8pcs) | -| Loss | 0.456 | -| Params (M) | 69 | -| Checkpoint for inference | 846Mb (.ckpt file) | -| Scripts | [MGN scripts](scripts) | +| Parameters | GPU | Ascend | +| -------------------------- | -------------------------------------------------------------- | -------------------------------------------------------------- | +| Resource | Tesla V100-PCIE 32G | Ascend 910; CPU 2.60GHz, 192 cores; Memory 755G; OS Euler2.8 | +| uploaded Date | 12/17/2021 (month/day/year) | 4/5/2022 (month/day/year) | +| MindSpore Version | 1.5.0 | 1.6.1 | +| Dataset | Market1501 | Market1501 | +| Training Parameters | max_epoch=400, ids_per_batch=12, decay_epochs='320,380', lr_init=0.00015 | max_epoch=400, ids_per_batch=12, decay_epochs='320,380', lr_init=0.00015 | +| Optimizer | Adam | Adam | +| Loss Function | ReIDLoss | ReIDLoss | +| Speed | 401ms/step (1pcs), 368ms/step (8pcs) | 346ms/step (1pcs), 304ms/step (8pcs) | +| Loss | 0.456 | 0.492 | +| Params (M) | 69 | 69 | +| Checkpoint for inference | 846Mb (.ckpt file) | 807Mb (.ckpt file) | +| Scripts | [MGN scripts](scripts) | [MGN scripts](scripts) | #### Evaluation Performance -| Parameters | GPU | -| ------------------- | --------------------------- | -| Resource | Tesla V100-PCIE 32G | -| Uploaded Date | 12/17/2021 (month/day/year) | -| 
MindSpore Version | 1.5.0 | -| Dataset | Market1501 | -| batch_size | 32 | -| outputs | mAP, Rank-1 | -| Accuracy | mAP: 93.78%, rank-1: 95.31%.| +| Parameters | GPU | Ascend | +| ------------------- | --------------------------- | --------------------------- | +| Resource | Tesla V100-PCIE 32G | Ascend 910; CPU 2.60GHz, 192 cores; Memory 755G; OS Euler2.8 | +| Uploaded Date | 12/17/2021 (month/day/year) | 4/5/2022 (month/day/year) | +| MindSpore Version | 1.5.0 | 1.6.1 | +| Dataset | Market1501 | Market1501 | +| batch_size | 32 | 32 | +| outputs | mAP, Rank-1 | mAP, Rank-1 | +| Accuracy | mAP: 93.78%, rank-1: 95.31%.| mAP: 90.60%, rank-1: 93.76%.| ## [Description of Random Situation](#contents) diff --git a/research/cv/MGN/ascend310_infer/CMakeLists.txt b/research/cv/MGN/ascend310_infer/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..d1ea8e790b8cc70f0733ada8f2bf8e88910613dc --- /dev/null +++ b/research/cv/MGN/ascend310_infer/CMakeLists.txt @@ -0,0 +1,16 @@ +cmake_minimum_required(VERSION 3.14.1) +project(Ascend310Infer) +add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=0) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O0 -g -std=c++17 -Werror -Wall -fPIE -Wl,--allow-shlib-undefined") +set(PROJECT_SRC_ROOT ${CMAKE_CURRENT_LIST_DIR}/) +option(MINDSPORE_PATH "mindspore install path" "") +include_directories(${MINDSPORE_PATH}) +include_directories(${MINDSPORE_PATH}/include) +include_directories(${PROJECT_SRC_ROOT}) + +find_library(MS_LIB libmindspore.so ${MINDSPORE_PATH}/lib) +file(GLOB_RECURSE MD_LIB ${MINDSPORE_PATH}/_c_dataengine*) +find_package(gflags REQUIRED) + +add_executable(main src/main.cc src/utils.cc) +target_link_libraries(main ${MS_LIB} ${MD_LIB} gflags) diff --git a/research/cv/MGN/ascend310_infer/build.sh b/research/cv/MGN/ascend310_infer/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..8f9dc18582e53b4a9739be60f9ff67b203dcd8bc --- /dev/null +++ b/research/cv/MGN/ascend310_infer/build.sh @@ -0,0 +1,23 @@ 
+#!/bin/bash +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +if [ ! -d out ]; then + mkdir out +fi +cd out || exit +cmake .. \ + -DMINDSPORE_PATH="`pip show mindspore-ascend | grep Location | awk '{print $2"/mindspore"}' | xargs realpath`" +make diff --git a/research/cv/MGN/ascend310_infer/inc/utils.h b/research/cv/MGN/ascend310_infer/inc/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..b084b86831b2ca802bac2ca26a6f2b34ff5c68a1 --- /dev/null +++ b/research/cv/MGN/ascend310_infer/inc/utils.h @@ -0,0 +1,34 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_INFERENCE_UTILS_H_ +#define MINDSPORE_INFERENCE_UTILS_H_ + +#include <sys/stat.h> +#include <dirent.h> +#include <vector> +#include <string> +#include <memory> +#include <map> +#include "include/api/types.h" + +std::vector<std::string> GetAllFiles(std::string_view dirName); +DIR *OpenDir(std::string_view dirName); +std::string RealPath(std::string_view path); +mindspore::MSTensor ReadFileToTensor(const std::string &file); +int WriteResult(const std::string& imageFile, const std::vector<mindspore::MSTensor> &outputs); +void cal_time(std::map<double, double> costTime_map); +#endif diff --git a/research/cv/MGN/ascend310_infer/src/main.cc b/research/cv/MGN/ascend310_infer/src/main.cc new file mode 100644 index 0000000000000000000000000000000000000000..a75d3c9397e5be52bff5119a8d6bee01f83b494a --- /dev/null +++ b/research/cv/MGN/ascend310_infer/src/main.cc @@ -0,0 +1,178 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include <sys/time.h>
+#include <gflags/gflags.h>
+#include <dirent.h>
+#include <iostream>
+#include <string>
+#include <algorithm>
+#include <iosfwd>
+#include <map>
+#include <memory>
+#include <vector>
+#include <fstream>
+#include <sstream>
+
+#include "../inc/utils.h"
+#include "include/dataset/execute.h"
+#include "include/dataset/transforms.h"
+#include "include/dataset/vision.h"
+#include "include/dataset/vision_ascend.h"
+#include "include/api/types.h"
+#include "include/api/model.h"
+#include "include/api/serialization.h"
+#include "include/api/context.h"
+
+
+using mindspore::Context;
+using mindspore::Serialization;
+using mindspore::Model;
+using mindspore::Status;
+using mindspore::dataset::Execute;
+using mindspore::MSTensor;
+using mindspore::ModelType;
+using mindspore::GraphCell;
+using mindspore::kSuccess;
+using mindspore::Graph;
+using mindspore::dataset::Execute;
+using mindspore::dataset::TensorTransform;
+using mindspore::dataset::vision::Decode;
+using mindspore::dataset::vision::Resize;
+using mindspore::dataset::vision::Normalize;
+using mindspore::dataset::vision::HWC2CHW;
+
+// NOTE(review): the default model file name looks copied from another model's tool;
+// callers are presumably expected to pass --model_path explicitly - confirm.
+DEFINE_string(model_path, "../mcnn.mindir", "model path");
+DEFINE_string(test_path, "../test_data/preprocess_data", "test dataset path");
+DEFINE_string(query_path, "../test_data/preprocess_data", "query dataset path");
+DEFINE_int32(input_width, 960, "input width");
+DEFINE_int32(input_height, 576, "inputheight");
+DEFINE_int32(device_id, 0, "device id");
+
+
+// Runs the model on every regular file found in dataDir.
+//
+// Each file is read as raw bytes, wrapped in a tensor carrying the name, data type and
+// shape of the model's first input, passed to Model::Predict, and the outputs are stored
+// with WriteResult. Wall-clock time around Predict is collected per file and handed to
+// cal_time once the directory is finished. cal_time truncates its report file on every
+// call, so the report on disk reflects only the most recently processed directory.
+//
+// Returns 0 on success, 1 when the directory yields no files, a file cannot be read or
+// Predict fails, and the WriteResult error code when storing an output fails.
+static int InferDirectory(Model *model, const std::vector<MSTensor> &modelInputs, const std::string &dataDir) {
+  auto all_files = GetAllFiles(dataDir);
+  if (all_files.empty()) {
+    std::cout << "ERROR: no input data." << std::endl;
+    return 1;
+  }
+
+  // start time (ms) -> end time (ms) of every Predict call
+  std::map<double, double> costTime_map;
+
+  for (const auto &file : all_files) {
+    struct timeval start;
+    struct timeval end;
+    std::vector<MSTensor> inputs;
+    std::vector<MSTensor> outputs;
+
+    std::cout << "Start predict input files:" << file << std::endl;
+
+    MSTensor image = ReadFileToTensor(file);
+    // ReadFileToTensor returns a default-constructed tensor on failure; do not feed it to the model.
+    if (image.DataSize() == 0) {
+      std::cout << "ERROR: read input file " << file << " failed." << std::endl;
+      return 1;
+    }
+
+    inputs.emplace_back(modelInputs[0].Name(), modelInputs[0].DataType(), modelInputs[0].Shape(),
+                        image.Data().get(), image.DataSize());
+
+    gettimeofday(&start, NULL);
+    Status ret = model->Predict(inputs, &outputs);
+    gettimeofday(&end, NULL);
+    if (ret != kSuccess) {
+      std::cout << "Predict " << file << " failed." << std::endl;
+      return 1;
+    }
+    // Convert seconds + microseconds to milliseconds.
+    double startTime_ms = (1.0 * start.tv_sec * 1000000 + start.tv_usec) / 1000;
+    double endTime_ms = (1.0 * end.tv_sec * 1000000 + end.tv_usec) / 1000;
+    costTime_map.insert(std::pair<double, double>(startTime_ms, endTime_ms));
+
+    int rst = WriteResult(file, outputs);
+    if (rst != 0) {
+      std::cout << "write result failed." << std::endl;
+      return rst;
+    }
+  }
+
+  cal_time(costTime_map);
+  return 0;
+}
+
+
+// Entry point: loads the MindIR model given by --model_path onto the Ascend 310 device
+// --device_id, then runs inference over --test_path followed by --query_path.
+// Returns 0 on success and a non-zero code on the first failure.
+int main(int argc, char **argv) {
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+  if (RealPath(FLAGS_model_path).empty()) {
+    std::cout << "Invalid mindir" << std::endl;
+    return 1;
+  }
+
+  auto context = std::make_shared<Context>();
+  auto ascend310_info = std::make_shared<mindspore::Ascend310DeviceInfo>();
+  ascend310_info->SetDeviceID(FLAGS_device_id);
+  context->MutableDeviceInfo().push_back(ascend310_info);
+
+  Graph graph;
+  Status ret = Serialization::Load(FLAGS_model_path, ModelType::kMindIR, &graph);
+  if (ret != kSuccess) {
+    std::cout << "Load model failed." << std::endl;
+    return 1;
+  }
+
+  Model model;
+  ret = model.Build(GraphCell(graph), context);
+  if (ret != kSuccess) {
+    std::cout << "ERROR: Build failed." << std::endl;
+    return 1;
+  }
+
+  std::vector<MSTensor> modelInputs = model.GetInputs();
+  // The inference loop indexes modelInputs[0]; refuse to continue without it.
+  if (modelInputs.empty()) {
+    std::cout << "ERROR: model has no inputs." << std::endl;
+    return 1;
+  }
+
+  int rst = InferDirectory(&model, modelInputs, FLAGS_test_path);
+  if (rst != 0) {
+    return rst;
+  }
+
+  return InferDirectory(&model, modelInputs, FLAGS_query_path);
+}
diff --git a/research/cv/MGN/ascend310_infer/src/utils.cc b/research/cv/MGN/ascend310_infer/src/utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d5d5e8e7954e066662901b4194fc002fe2c1f321
--- /dev/null
+++ b/research/cv/MGN/ascend310_infer/src/utils.cc
@@ -0,0 +1,164 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "inc/utils.h"
+
+#include <climits>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <algorithm>
+#include <iostream>
+#include <sstream>
+
+using mindspore::MSTensor;
+using mindspore::DataType;
+
+
+// Returns the sorted paths ("<dirName>/<entry>") of all regular files directly inside
+// dirName. Returns an empty vector when the directory cannot be opened.
+std::vector<std::string> GetAllFiles(std::string_view dirName) {
+  DIR *dir = OpenDir(dirName);
+  if (dir == nullptr) {
+    return {};
+  }
+  std::vector<std::string> res;
+  struct dirent *filename = nullptr;
+  while ((filename = readdir(dir)) != nullptr) {
+    std::string dName = std::string(filename->d_name);
+    if (dName == "." || dName == ".." || filename->d_type != DT_REG) {
+      continue;
+    }
+    res.emplace_back(std::string(dirName) + "/" + filename->d_name);
+  }
+  // The handle returned by OpenDir is owned here; release it once the listing is complete.
+  closedir(dir);
+  std::sort(res.begin(), res.end());
+  for (auto &f : res) {
+    std::cout << "image file: " << f << std::endl;
+  }
+  return res;
+}
+
+// Writes every output tensor to "./result_Files/<stem>_<index>.bin", where <stem> is the
+// base name of imageFile cut at its first '.' (the whole base name when it has no '.').
+// Returns 0 on success, -1 when an output file cannot be opened and -2 on a short write.
+int WriteResult(const std::string& imageFile, const std::vector<MSTensor> &outputs) {
+  const std::string homePath = "./result_Files";
+  const int INVALID_POINTER = -1;
+  const int ERROR = -2;
+
+  const size_t slashPos = imageFile.rfind('/');
+  const std::string baseName = (slashPos == std::string::npos) ? imageFile : imageFile.substr(slashPos + 1);
+  // substr(0, npos) keeps the whole name, so a name without an extension no longer throws.
+  const std::string stem = baseName.substr(0, baseName.find('.'));
+
+  for (size_t i = 0; i < outputs.size(); ++i) {
+    std::shared_ptr<const void> netOutput = outputs[i].Data();
+    const size_t outputSize = outputs[i].DataSize();
+    const std::string outFileName = homePath + "/" + stem + '_' + std::to_string(i) + ".bin";
+    FILE *outputFile = fopen(outFileName.c_str(), "wb");
+    if (outputFile == nullptr) {
+      std::cout << "open result file " << outFileName << " failed" << std::endl;
+      return INVALID_POINTER;
+    }
+    // Skip fwrite for an empty output so a null data pointer is never handed to it.
+    const size_t size = (outputSize == 0) ? 0 : fwrite(netOutput.get(), sizeof(char), outputSize, outputFile);
+    fclose(outputFile);
+    if (size != outputSize) {
+      std::cout << "write result file " << outFileName << " failed, write size[" << size <<
+        "] is smaller than output size[" << outputSize << "], maybe the disk is full." << std::endl;
+      return ERROR;
+    }
+  }
+  return 0;
+}
+
+// Reads the whole file as raw bytes into a one-dimensional uint8 tensor named after the
+// file. Returns a default-constructed (empty) tensor when the file cannot be read.
+mindspore::MSTensor ReadFileToTensor(const std::string &file) {
+  if (file.empty()) {
+    std::cout << "Pointer file is nullptr" << std::endl;
+    return mindspore::MSTensor();
+  }
+
+  // Binary mode: the content is copied byte-for-byte into the tensor.
+  std::ifstream ifs(file, std::ios::in | std::ios::binary);
+  if (!ifs.good()) {
+    std::cout << "File: " << file << " is not exist" << std::endl;
+    return mindspore::MSTensor();
+  }
+
+  if (!ifs.is_open()) {
+    std::cout << "File: " << file << "open failed" << std::endl;
+    return mindspore::MSTensor();
+  }
+
+  ifs.seekg(0, std::ios::end);
+  const std::streamoff length = ifs.tellg();
+  if (length < 0) {
+    // tellg() reports failure as -1; converting that to size_t would request a huge buffer.
+    std::cout << "File: " << file << " get size failed" << std::endl;
+    return mindspore::MSTensor();
+  }
+  const size_t size = static_cast<size_t>(length);
+  mindspore::MSTensor buffer(file, mindspore::DataType::kNumberTypeUInt8, {static_cast<int64_t>(size)}, nullptr, size);
+
+  if (size > 0) {
+    void *dst = buffer.MutableData();
+    if (dst == nullptr) {
+      std::cout << "File: " << file << " alloc tensor buffer failed" << std::endl;
+      return mindspore::MSTensor();
+    }
+    ifs.seekg(0, std::ios::beg);
+    ifs.read(reinterpret_cast<char *>(dst), size);
+  }
+  ifs.close();
+
+  return buffer;
+}
+
+
+// Resolves dirName with RealPath and opens it. Returns nullptr when the name is empty,
+// does not resolve to a directory, or cannot be opened. The caller must closedir() the
+// returned handle.
+DIR *OpenDir(std::string_view dirName) {
+  if (dirName.empty()) {
+    std::cout << " dirName is null ! " << std::endl;
+    return nullptr;
+  }
+  std::string realPath = RealPath(dirName);
+  struct stat s;
+  // Check lstat's result: on failure 's' is left uninitialised and must not be inspected.
+  if (lstat(realPath.c_str(), &s) != 0 || !S_ISDIR(s.st_mode)) {
+    std::cout << "dirName is not a valid directory !" << std::endl;
+    return nullptr;
+  }
+  DIR *dir = opendir(realPath.c_str());
+  if (dir == nullptr) {
+    std::cout << "Can not open dir " << dirName << std::endl;
+    return nullptr;
+  }
+  std::cout << "Successfully opened the dir " << dirName << std::endl;
+  return dir;
+}
+
+// Returns the canonical absolute form of path as produced by realpath(), or an empty
+// string when realpath() fails.
+std::string RealPath(std::string_view path) {
+  char realPathMem[PATH_MAX] = {0};
+  // string_view::data() is not guaranteed to be NUL-terminated; realpath() needs a C string.
+  const std::string pathStr(path);
+  char *realPathRet = realpath(pathStr.c_str(), realPathMem);
+  if (realPathRet == nullptr) {
+    std::cout << "File: " << path << " is not exist." << std::endl;
+    return "";
+  }
+
+  std::string realPath(realPathMem);
+  std::cout << path << " realpath is: " << realPath << std::endl;
+  return realPath;
+}
+
+// Prints the average of (end - start) over all entries of costTime_map (start ms -> end ms)
+// and writes the same report to "./time_Result/test_perform_static.txt", truncating any
+// previous content of that file.
+void cal_time(std::map<double, double> costTime_map) {
+  if (costTime_map.empty()) {
+    // Nothing was timed; avoid dividing by zero and reporting NaN.
+    std::cout << "NN inference cost average time: no inference was timed" << std::endl;
+    return;
+  }
+
+  double total = 0.0;
+  int infer_cnt = 0;
+  for (auto iter = costTime_map.begin(); iter != costTime_map.end(); iter++) {
+    total += iter->second - iter->first;
+    infer_cnt++;
+  }
+  const double average = total / infer_cnt;
+
+  std::stringstream timeCost;
+  timeCost << "NN inference cost average time: "<< average << " ms of infer_count " << infer_cnt << std::endl;
+  std::cout << "NN inference cost average time: "<< average << "ms of infer_count " << infer_cnt << std::endl;
+  std::string file_name = "./time_Result" + std::string("/test_perform_static.txt");
+  std::ofstream file_stream(file_name.c_str(), std::ios::trunc);
+  if (!file_stream.is_open()) {
+    std::cout << "open time result file " << file_name << " failed" << std::endl;
+    return;
+  }
+  file_stream << timeCost.str();
+  file_stream.close();
+}
diff --git a/research/cv/MGN/configs/market1501_config.yml b/research/cv/MGN/configs/market1501_config.yml
index 112dac06338b65754adcb8af86480e40fe2807da..46a2c6d6368ef100602e18bb9128d73fc490c674 100644
--- a/research/cv/MGN/configs/market1501_config.yml
+++ b/research/cv/MGN/configs/market1501_config.yml
@@ -8,7 +8,7 @@ checkpoint_url: ""
 data_path: "/cache/data"
 output_path: "/cache/train"
 load_path: "/cache/checkpoint_path"
-device_target: "GPU"
+device_target: "Ascend"
 need_modelarts_dataset_unzip: False
modelarts_dataset_unzip_name: "market1501" @@ -16,6 +16,9 @@ modelarts_dataset_unzip_name: "market1501" # options lr_init: 0.00015 # 2e-4 decay_epochs: '320,380' +optimizer: "adam" +use_map: False +run_eval: False ims_per_id: 4 ids_per_batch: 12 max_epoch: 400 diff --git a/research/cv/MGN/eval.py b/research/cv/MGN/eval.py index c103626bfdcb1b706bc192efc67429cd1ef6da6f..41cbc677eb1a33c3ed68b7724f43c961c96d91e5 100644 --- a/research/cv/MGN/eval.py +++ b/research/cv/MGN/eval.py @@ -14,22 +14,18 @@ # ============================================================================ """ Evaluation script """ -import os -import time - import numpy as np from mindspore import Tensor from mindspore import context from mindspore import numpy as mnp from mindspore.common import set_seed -from mindspore.communication.management import init, get_rank, get_group_size from mindspore.train.serialization import load_param_into_net, load_checkpoint from scipy.spatial.distance import cdist from metric_utils.functions import cmc, mean_ap from metric_utils.re_ranking import re_ranking from model_utils.config import get_config -from model_utils.device_adapter import get_device_id, get_device_num +from model_utils.device_adapter import get_device_id from model_utils.moxing_adapter import moxing_wrapper from src.dataset import create_dataset from src.mgn import MGN @@ -38,71 +34,6 @@ set_seed(1) config = get_config() -def modelarts_pre_process(): - """ Modelarts pre process function """ - def unzip(zip_file, save_dir): - import zipfile - s_time = time.time() - if not os.path.exists(os.path.join(save_dir, config.modelarts_dataset_unzip_name)): - zip_isexist = zipfile.is_zipfile(zip_file) - if zip_isexist: - fz = zipfile.ZipFile(zip_file, 'r') - data_num = len(fz.namelist()) - print("Extract Start...") - print("unzip file num: {}".format(data_num)) - data_print = int(data_num / 100) if data_num > 100 else 1 - i = 0 - for file in fz.namelist(): - if i % data_print == 0: - print("unzip percent: 
{}%".format(int(i * 100 / data_num)), flush=True) - i += 1 - fz.extract(file, save_dir) - print("cost time: {}min:{}s.".format(int((time.time() - s_time) / 60), - int(int(time.time() - s_time) % 60))) - print("Extract Done.") - else: - print("This is not zip.") - else: - print("Zip has been extracted.") - - if config.need_modelarts_dataset_unzip: - zip_file_1 = os.path.join(config.data_path, config.modelarts_dataset_unzip_name + ".zip") - save_dir_1 = os.path.join(config.data_path) - - sync_lock = "/tmp/unzip_sync.lock" - - # Each server contains 8 devices as most. - if config.device_target == "GPU": - init() - device_id = get_rank() - device_num = get_group_size() - elif config.device_target == "Ascend": - device_id = get_device_id() - device_num = get_device_num() - else: - raise ValueError("Not support device_target.") - - # Each server contains 8 devices as most. - if device_id % min(device_num, 8) == 0 and not os.path.exists(sync_lock): - print("Zip file path: ", zip_file_1) - print("Unzip file save dir: ", save_dir_1) - unzip(zip_file_1, save_dir_1) - print("===Finish extract data synchronization===") - try: - os.mknod(sync_lock) - except IOError: - pass - - while True: - if os.path.exists(sync_lock): - break - time.sleep(1) - - print("Device: {}, Finish sync unzip data from {} to {}.".format(device_id, zip_file_1, save_dir_1)) - - config.log_path = os.path.join(config.output_path, config.log_path) - - def extract_feature(model, dataset): """ Extract dataset features from model """ def fliphor(tensor): @@ -133,7 +64,7 @@ def extract_feature(model, dataset): return np.concatenate(features, axis=0) -@moxing_wrapper(pre_process=modelarts_pre_process) +@moxing_wrapper() def run_eval(): """ Run evaluation """ re_rank = True @@ -142,10 +73,8 @@ def run_eval(): config.image_mean = list(map(float, config.image_mean.split(','))) config.image_std = list(map(float, config.image_std.split(','))) - _enable_graph_kernel = False context.set_context( mode=context.GRAPH_MODE, 
- enable_graph_kernel=_enable_graph_kernel, device_target=config.device_target, ) diff --git a/research/cv/MGN/export.py b/research/cv/MGN/export.py index e76780d23db6ce2918551c697b5d199a3246dfce..d19ed3651aa91e79fc1dfd8c332dc6ccc2a3c1b7 100644 --- a/research/cv/MGN/export.py +++ b/research/cv/MGN/export.py @@ -16,7 +16,6 @@ import numpy as np from mindspore import Tensor -from mindspore import context from mindspore.train.serialization import load_param_into_net, load_checkpoint, export from model_utils.config import get_config @@ -26,18 +25,9 @@ from src.mgn import MGN config = get_config() -def modelarts_pre_process(): - """model arts pre process""" - - -@moxing_wrapper(pre_process=modelarts_pre_process) +@moxing_wrapper() def export_network(): """ Export network """ - context.set_context( - mode=context.GRAPH_MODE, - device_target=config.device_target, - ) - network = MGN(num_classes=config.n_classes) config.image_size = list(map(int, config.image_size.split(','))) diff --git a/research/cv/MGN/model_utils/moxing_adapter.py b/research/cv/MGN/model_utils/moxing_adapter.py index ead56bf525d40866eebdb33048112f0e04ff24b7..c8e240c467b628a146a095cc12b384c6d3eb70a9 100644 --- a/research/cv/MGN/model_utils/moxing_adapter.py +++ b/research/cv/MGN/model_utils/moxing_adapter.py @@ -51,7 +51,34 @@ def get_job_id(): return job_id -def sync_data(from_path, to_path): +def unzip(zip_file, save_dir): + import zipfile + import time + s_time = time.time() + if not os.path.exists(os.path.join(save_dir, config.modelarts_dataset_unzip_name)): + zip_isexist = zipfile.is_zipfile(zip_file) + if zip_isexist: + fz = zipfile.ZipFile(zip_file, 'r') + data_num = len(fz.namelist()) + print("Extract Start...") + print("unzip file num: {}".format(data_num)) + data_print = int(data_num / 100) if data_num > 100 else 1 + i = 0 + for file in fz.namelist(): + if i % data_print == 0: + print("unzip percent: {}%".format(int(i * 100 / data_num)), flush=True) + i += 1 + fz.extract(file, save_dir) + 
print("cost time: {}min:{}s.".format(int((time.time() - s_time) / 60), + int(int(time.time() - s_time) % 60))) + print("Extract Done.") + else: + print("This is not zip.") + else: + print("Zip has been extracted.") + + +def sync_data(from_path, to_path, need_unzip=False, zip_file=""): """ Download data from remote obs to local directory if the first url is remote url and the second one is local path Upload data from local directory to remote obs in contrast. @@ -67,6 +94,8 @@ def sync_data(from_path, to_path): print("from path: ", from_path) print("to path: ", to_path) mox.file.copy_parallel(from_path, to_path) + if need_unzip: + unzip(os.path.join(to_path, zip_file), to_path) print("===finish data synchronization===") try: os.mknod(sync_lock) @@ -92,8 +121,17 @@ def moxing_wrapper(pre_process=None, post_process=None): # Download data from data_url if config.enable_modelarts: if config.data_url: - sync_data(config.data_url, config.data_path) - print("Dataset downloaded: ", os.listdir(config.data_path)) + sync_data(config.data_url, config.data_dir, + need_unzip=config.need_modelarts_dataset_unzip, + zip_file=config.modelarts_dataset_unzip_name + ".zip") + if config.pre_trained_backbone: + config.pre_trained_backbone = os.path.join(config.data_dir, + os.path.basename(config.pre_trained_backbone)) + if not os.path.exists(config.pre_trained_backbone): + print("!!! 
Fail to get pre_trained_backbone checkpoint", config.pre_trained_backbone) + config.pre_trained_backbone = "" + print("Pretrain ckpt:", config.pre_trained_backbone) + print("Dataset downloaded: ", config.data_dir, os.listdir(config.data_dir)) if config.checkpoint_url: sync_data(config.checkpoint_url, config.load_path) print("Preload downloaded: ", os.listdir(config.load_path)) diff --git a/research/cv/MGN/postprocess.py b/research/cv/MGN/postprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..1ab7a31fc043bb46207dcee2a29271b750f17c7b --- /dev/null +++ b/research/cv/MGN/postprocess.py @@ -0,0 +1,133 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def txt2list(filename):
    """Read a text file holding one integer per line.

    Args:
        filename: path of a UTF-8 text file.

    Returns:
        list of int, in file order. Whitespace-only lines are ignored.

    Raises:
        ValueError: if a non-blank line is not a valid integer.
    """
    with open(filename, "r", encoding='utf-8') as f_read:
        return [int(line) for line in f_read if line.strip()]


def _load_features(result_dir, label_path, split, batch_size, feat_dim=2048):
    """Rebuild the normalized feature matrix of one split from the inference outputs.

    For every batch index recorded in ``label_path`` the outputs for the plain
    and the horizontally flipped input (``..._0_0.bin`` and ``..._1_0.bin``)
    are summed and each row is divided by its L2 norm, the same normalization
    ``extract_feature`` in src/MGN_Callback.py applies during training-time
    evaluation.

    Args:
        result_dir: directory holding the ``.bin`` result files.
        label_path: ``.npy`` file with one entry per exported batch.
        split: ``"test"`` or ``"query"``; used to build the file names.
        batch_size: number of rows stored in every ``.bin`` file.
        feat_dim: length of one feature vector.

    Returns:
        numpy array with ``len(labels) * batch_size`` rows and ``feat_dim`` columns.
    """
    labels = np.load(label_path, allow_pickle=True)
    features = []
    for idx in range(len(labels)):
        ff = np.zeros((batch_size, feat_dim))
        for flip in range(2):
            file_name = "market1501_{}_bs{}_{}_{}_0.bin".format(split, batch_size, idx, flip)
            f = np.fromfile(os.path.join(result_dir, file_name), np.float32)
            ff = ff + f.reshape(batch_size, feat_dim)

        # L2 norm: square root of the sum of squares. The previous code summed
        # the square roots of the squares, which is the L1 norm.
        fnorm = np.sqrt(np.sum(np.square(ff), axis=1, keepdims=True))
        features.append(ff / fnorm)
    return np.concatenate(features, axis=0)


if __name__ == '__main__':
    output_path = args.preprocess_result_dir
    output_test_path = os.path.join(output_path, "test")
    output_query_path = os.path.join(output_path, "query")
    test_img_path = args.result_dir

    gf = _load_features(test_img_path,
                        os.path.join(output_test_path, "market1501_label_ids.npy"),
                        "test", config.per_batch_size)
    qf = _load_features(test_img_path,
                        os.path.join(output_query_path, "market1501_label_ids.npy"),
                        "query", config.per_batch_size)

    t_cams = txt2list(os.path.join(output_test_path, 't_cams.txt'))
    t_ids = txt2list(os.path.join(output_test_path, 't_ids.txt'))
    q_cams = txt2list(os.path.join(output_query_path, 'q_cams.txt'))
    q_ids = txt2list(os.path.join(output_query_path, 'q_ids.txt'))

    re_rank = True
    if re_rank:
        q_g_dist = np.dot(qf, np.transpose(gf))
        q_q_dist = np.dot(qf, np.transpose(qf))
        g_g_dist = np.dot(gf, np.transpose(gf))
        dist = re_ranking(q_g_dist, q_q_dist, g_g_dist)
    else:
        # Imported here because this branch is the only user of cdist.
        from scipy.spatial.distance import cdist
        dist = cdist(qf, gf)
    r = cmc(dist, q_ids, t_ids, q_cams, t_cams,
            separate_camera_set=False,
            single_gallery_shot=False,
            first_match_break=True)
    m_ap = mean_ap(dist, q_ids, t_ids, q_cams, t_cams)

    print(
        '[INFO] mAP: {:.4f} rank1: {:.4f} rank3: {:.4f} rank5: {:.4f} rank10: {:.4f}'.format(
            m_ap,
            r[0], r[2], r[4], r[9],
        )
    )
def text_save(filename, data):
    """Write every element of ``data`` to ``filename``, one element per line.

    Each element is converted with ``str`` and the characters ``[``, ``]``,
    ``'`` and ``,`` are removed, so a list element ``[2, 3]`` is written as
    ``2 3``.

    Args:
        filename: path of the text file to (over)write.
        data: iterable of values to store.
    """
    strip_table = str.maketrans('', '', "[]',")
    # The context manager closes the file even if a write fails.
    with open(filename, 'w', encoding='utf-8') as file:
        file.writelines(str(item).translate(strip_table) + '\n' for item in data)
    print("save successfully")


def fliphor(tensor):
    """Return a contiguous copy of ``tensor`` reversed along its last axis."""
    return tensor[..., ::-1].copy()


def _export_split(dataset, split, split_dir, batch_size):
    """Dump every full batch of ``dataset`` as raw ``.bin`` files.

    For each batch holding exactly ``batch_size`` images two files are written
    under ``<split_dir>/dataset``: the batch itself (``..._0.bin``) and its
    horizontally flipped copy (``..._1.bin``). Batches of any other size are
    skipped. The labels of the exported batches are stored in
    ``<split_dir>/market1501_label_ids.npy``.

    Returns:
        Number of exported batches.
    """
    img_dir = os.path.join(split_dir, "dataset")
    os.makedirs(img_dir, exist_ok=True)
    label_list = []
    for data in dataset.create_dict_iterator(output_numpy=True):
        images = data["image"]
        if images.shape[0] != batch_size:
            continue
        # Number files by exported batch so the indices stay dense.
        prefix = "market1501_{}_bs{}_{}".format(split, batch_size, len(label_list))
        images.tofile(os.path.join(img_dir, prefix + "_0.bin"))
        fliphor(images).tofile(os.path.join(img_dir, prefix + "_1.bin"))
        label_list.append(data["label"])

    np.save(os.path.join(split_dir, "market1501_label_ids.npy"), label_list)
    print("=" * 20, "export bin files finished", "=" * 20)
    return len(label_list)


def save_data_to_bin(data_path, output_path):
    """Export the Market-1501 test and query splits for Ascend 310 inference.

    Writes the image batches, the batch labels and the camera / identity ids
    of the exported images below ``<output_path>/test`` and
    ``<output_path>/query``.

    Args:
        data_path: dataset root passed to ``create_dataset``.
        output_path: directory that receives the exported files.
    """
    config.image_size = list(map(int, config.image_size.split(',')))
    config.image_mean = list(map(float, config.image_mean.split(',')))
    config.image_std = list(map(float, config.image_std.split(',')))

    _enable_graph_kernel = False
    context.set_context(
        mode=context.GRAPH_MODE,
        enable_graph_kernel=_enable_graph_kernel,
        device_target=config.device_target,
    )

    config.rank = 0
    config.device_id = get_device_id()
    config.group_size = 1

    dataset_kwargs = dict(
        ims_per_id=4,
        ids_per_batch=12,
        mean=config.image_mean,
        std=config.image_std,
        resize_h_w=config.image_size,
        batch_size=config.per_batch_size,
        rank=config.rank,
        group_size=config.group_size,
    )
    t_dataset, t_cams, t_ids = create_dataset(data_path, data_part='test', **dataset_kwargs)
    q_dataset, q_cams, q_ids = create_dataset(data_path, data_part='query', **dataset_kwargs)

    output_test_path = os.path.join(output_path, "test")
    output_query_path = os.path.join(output_path, "query")

    # Keep exactly as many cams/ids as images were exported. Using the last
    # loop index instead loses one whole batch whenever the final batch is full.
    kept = _export_split(t_dataset, "test", output_test_path, config.per_batch_size) * config.per_batch_size
    text_save(os.path.join(output_test_path, 't_cams.txt'), t_cams[:kept])
    text_save(os.path.join(output_test_path, 't_ids.txt'), t_ids[:kept])

    kept = _export_split(q_dataset, "query", output_query_path, config.per_batch_size) * config.per_batch_size
    text_save(os.path.join(output_query_path, 'q_cams.txt'), q_cams[:kept])
    text_save(os.path.join(output_query_path, 'q_ids.txt'), q_ids[:kept])


if __name__ == '__main__':
    save_data_to_bin(config.data_dir, config.output_path)
# Launch one background MGN training process per Ascend device.
# Every process runs in its own ./train_parallel<i> directory and writes
# its output to train.log there.

if [ $# != 5 ] ; then
echo "=============================================================================================================="
echo "Please run the script as: "
echo "bash scripts/run_distribute_train_ascend.sh DEVICE_NUM DATA_DIR OUTPUT_PATH PRETRAINED_RESNET50 HCCL_JSON"
echo "bash scripts/run_distribute_train_ascend.sh 4 /data/dataset/Market-1501-v15.09.15/ output resnet50.ckpt /data/MGN_4/hccl_4p_0123_127.0.0.1.json"
echo "bash scripts/run_distribute_train_ascend.sh 8 /data/dataset/Market-1501-v15.09.15/ output resnet50.ckpt /data/MGN_COPY/rank_table_8pcs.json"
echo "for example: bash scripts/run_distribute_train_ascend.sh 8 /path/to/market1501/ /path/to/output/ /path/to/pretrained_resnet50.pth HCCL_JSON"
echo "=============================================================================================================="
exit 1;
fi


# Print "$1" unchanged when it is absolute, otherwise resolve it against $PWD.
get_real_path(){
  if [ "${1:0:1}" == "/" ]; then
    echo "$1"
  else
    # Quoted so that paths containing whitespace stay a single argument.
    echo "$(realpath -m "$PWD/$1")"
  fi
}
config_path=$(get_real_path "./configs/market1501_config.yml")

export RANK_SIZE=$1
echo $RANK_SIZE
DATA_DIR=$(get_real_path "$2")
OUTPUT_PATH=$(get_real_path "$3")
PRE_TRAINED_PATH=$(get_real_path "$4")
HCCL_JSON=$(get_real_path "$5")
export RANK_TABLE_FILE=$HCCL_JSON
BASE_PATH=$(cd "$(dirname "$0")" || exit; pwd)
rank_start=0

for((i=0; i<${RANK_SIZE}; i++))
do
    export DEVICE_ID=$i
    export RANK_ID=$((rank_start + i))
    rm -rf "./train_parallel$i"
    mkdir "./train_parallel$i"
    cd "./train_parallel$i" || exit
    echo "start training for rank $RANK_ID, device $DEVICE_ID"
    env > env.log
    python "$BASE_PATH/../train.py"  \
    --config_path="$config_path" \
    --data_dir="$DATA_DIR" \
    --ckpt_path="$OUTPUT_PATH" \
    --train_log_path="$OUTPUT_PATH" \
    --pre_trained_backbone="$PRE_TRAINED_PATH" \
    --device_target=Ascend \
    --lr_init=0.00025 \
    --ids_per_batch=6 \
    --optimizer="adamw" \
    --use_map=True \
    --run_eval=True \
    --decay_epochs="640,760" \
    --max_epoch=800 \
    --is_distributed=1 > train.log 2>&1 &
    cd ..
done
echo "==============================================================================================================" exit 1; @@ -31,12 +31,16 @@ get_real_path(){ echo "$(realpath -m $PWD/$1)" fi } + config_path=$(get_real_path "./configs/market1501_config.yml") -PATH1=$(get_real_path $1) +DATA_DIR=$(get_real_path $1) +PATH1=$(get_real_path $2) +DEVICE_TARGET=$3 echo "$PATH1" python eval.py \ --config_path="$config_path" \ - --device_target="GPU" \ + --data_dir="$DATA_DIR" \ + --device_target=$DEVICE_TARGET \ --eval_model="$PATH1" > output.eval.log 2>&1 & diff --git a/research/cv/MGN/scripts/run_infer_310.sh b/research/cv/MGN/scripts/run_infer_310.sh new file mode 100644 index 0000000000000000000000000000000000000000..75d3f445d03e58a633cb5ab6006e3c43b4ae4e0a --- /dev/null +++ b/research/cv/MGN/scripts/run_infer_310.sh @@ -0,0 +1,117 @@ +#!/bin/bash +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# Ascend 310 inference pipeline: export the dataset to .bin files, build the
# C++ inference app, run it, then compute mAP / rank-k from its outputs.
# Uses bash-only syntax ([[ ]], ${1:0:1}, &>), so it must be run with bash.

if [[ $# -lt 2 || $# -gt 3 ]]; then
    echo "Usage: bash run_infer_310.sh [MINDIR_PATH] [DATA_PATH] [DEVICE_ID]
    DEVICE_ID is optional, it can be set by environment variable device_id, otherwise the value is zero"
exit 1
fi

# Print "$1" unchanged when it is absolute, otherwise resolve it against $PWD.
get_real_path(){
  if [ "${1:0:1}" == "/" ]; then
    echo "$1"
  else
    echo "$(realpath -m "$PWD/$1")"
  fi
}
config_path=$(get_real_path "../configs/market1501_config.yml")
model=$(get_real_path "$1")
data_path=$(get_real_path "$2")
# Precedence: third argument, then the device_id environment variable, then 0.
device_id=${3:-${device_id:-0}}

echo "$model"
echo "$data_path"
echo "$device_id"

export ASCEND_HOME=/usr/local/Ascend/
if [ -d ${ASCEND_HOME}/ascend-toolkit ]; then
    export PATH=$ASCEND_HOME/fwkacllib/bin:$ASCEND_HOME/fwkacllib/ccec_compiler/bin:$ASCEND_HOME/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin:$ASCEND_HOME/ascend-toolkit/latest/atc/bin:$PATH
    export LD_LIBRARY_PATH=$ASCEND_HOME/fwkacllib/lib64:/usr/local/lib:$ASCEND_HOME/ascend-toolkit/latest/atc/lib64:$ASCEND_HOME/ascend-toolkit/latest/fwkacllib/lib64:$ASCEND_HOME/driver/lib64:$ASCEND_HOME/add-ons:$LD_LIBRARY_PATH
    export TBE_IMPL_PATH=$ASCEND_HOME/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe
    export PYTHONPATH=$ASCEND_HOME/fwkacllib/python/site-packages:${TBE_IMPL_PATH}:$ASCEND_HOME/ascend-toolkit/latest/fwkacllib/python/site-packages:$PYTHONPATH
    export ASCEND_OPP_PATH=$ASCEND_HOME/ascend-toolkit/latest/opp
else
    export ASCEND_HOME=/usr/local/Ascend/latest/
    export PATH=$ASCEND_HOME/fwkacllib/bin:$ASCEND_HOME/fwkacllib/ccec_compiler/bin:$ASCEND_HOME/atc/ccec_compiler/bin:$ASCEND_HOME/atc/bin:$PATH
    export LD_LIBRARY_PATH=$ASCEND_HOME/fwkacllib/lib64:/usr/local/lib:$ASCEND_HOME/atc/lib64:$ASCEND_HOME/acllib/lib64:$ASCEND_HOME/driver/lib64:$ASCEND_HOME/add-ons:$LD_LIBRARY_PATH
    export PYTHONPATH=$ASCEND_HOME/fwkacllib/python/site-packages:$ASCEND_HOME/atc/python/site-packages:$PYTHONPATH
    export ASCEND_OPP_PATH=$ASCEND_HOME/opp
fi

# Export the test/query images as .bin files into a fresh ./preprocess_Result.
function preprocess_data()
{
    if [ -d preprocess_Result ]; then
        rm -rf ./preprocess_Result
    fi
    mkdir preprocess_Result
    # Stop here on failure: the later stages cannot work without the exported data.
    if ! python ../preprocess.py --output_path=./preprocess_Result --config_path="$config_path" --data_dir="$data_path" &> preprocess.log; then
        echo "preprocess data failed"
        exit 1
    fi
    test_path=./preprocess_Result/test/dataset/
    query_path=./preprocess_Result/query/dataset/
}

# Build the inference binary in ../ascend310_infer.
function compile_app()
{
    cd ../ascend310_infer || exit
    if [ -f "Makefile" ]; then
        make clean
    fi
    if ! sh build.sh &> build.log; then
        echo "compile app code failed"
        exit 1
    fi
    cd - || exit
}

# Run the compiled app; outputs go to fresh ./result_Files and ./time_Result.
function infer()
{
    if [ -d result_Files ]; then
        rm -rf ./result_Files
    fi
    if [ -d time_Result ]; then
        rm -rf ./time_Result
    fi
    mkdir result_Files
    mkdir time_Result
    if ! ../ascend310_infer/out/main --model_path="$model" --test_path="$test_path" --query_path="$query_path" --device_id="$device_id" &> infer.log; then
        echo "execute inference failed"
        exit 1
    fi
}

# Compute the accuracy metrics from the inference outputs.
function cal_acc()
{
    # Run in the foreground: a backgrounded command always leaves $? at 0,
    # so a failure of postprocess.py would go unnoticed.
    if ! python ../postprocess.py --config_path="$config_path" &> acc.log; then
        echo "calculate accuracy failed"
        exit 1
    fi
}
preprocess_data
compile_app
infer
cal_acc
# Launch a single-device MGN training run on Ascend in the background.
# The process runs inside ./train and writes its output to train.log there.

if [ $# != 4 ] ; then
echo "=============================================================================================================="
echo "Please run the script as: "
echo "bash scripts/run_standalone_train_ascend.sh DEVICE_ID DATA_DIR OUTPUT_PATH PRETRAINED_RESNET50"
echo "bash scripts/run_standalone_train_ascend.sh 1 /data/dataset/Market-1501-v15.09.15/ output resnet50.ckpt "
echo "for example: bash scripts/run_standalone_train_ascend.sh 0 /path/to/market1501/ /path/to/output/ /path/to/pretrained_resnet50.ckpt"
echo "=============================================================================================================="
exit 1;
fi


# Print "$1" unchanged when it is absolute, otherwise resolve it against $PWD.
get_real_path(){
  if [ "${1:0:1}" == "/" ]; then
    echo "$1"
  else
    # Quoted so that paths containing whitespace stay a single argument.
    echo "$(realpath -m "$PWD/$1")"
  fi
}

config_path=$(get_real_path "./configs/market1501_config.yml")

export RANK_SIZE=1
echo $RANK_SIZE
DATA_DIR=$(get_real_path "$2")
OUTPUT_PATH=$(get_real_path "$3")
PRE_TRAINED_PATH=$(get_real_path "$4")
BASE_PATH=$(cd "$(dirname "$0")" || exit; pwd)

export DEVICE_ID=$1
export RANK_ID=$1
rm -rf ./train
mkdir ./train
cd ./train || exit
echo "start training for rank $RANK_ID, device $DEVICE_ID"
env > env.log
python "$BASE_PATH/../train.py"  \
--config_path="$config_path" \
--data_dir="$DATA_DIR" \
--ckpt_path="$OUTPUT_PATH" \
--train_log_path="$OUTPUT_PATH" \
--pre_trained_backbone="$PRE_TRAINED_PATH" \
--device_target=Ascend \
--lr_init=0.00015 \
--ids_per_batch=12 \
--optimizer="adamw" \
--use_map=True \
--run_eval=True \
--decay_epochs="320,380" \
--max_epoch=400 \
--is_distributed=0 > train.log 2>&1 &
cd ..
def extract_feature(model, dataset):
    """Extract L2-normalized features for every image in ``dataset``.

    Each batch is passed through ``model`` twice, once as is and once flipped
    along its last axis. The first output of both passes is summed and every
    row is divided by its L2 norm.

    Args:
        model: callable network; element 0 of its output is used as the feature.
        dataset: dataset whose dict iterator yields an ``"image"`` numpy array.

    Returns:
        numpy array with the features of all batches stacked along axis 0.
    """
    def fliphor(tensor):
        """ Flip tensor """
        return tensor[..., ::-1].copy()

    data_loader = dataset.create_dict_iterator(output_numpy=True, num_epochs=1)

    features = []

    for data in data_loader:
        images_ = data["image"]

        ff = mnp.zeros((images_.shape[0], 2048))
        for i in range(2):
            if i == 1:
                images_ = fliphor(images_)
            images = Tensor.from_numpy(images_)
            outputs = model(images)
            f = outputs[0]
            ff = ff + f

        fnorm = mnp.sqrt((ff ** 2).sum(axis=1, keepdims=True))
        ff = ff / fnorm.expand_as(ff)

        features.append(ff.asnumpy())

    return np.concatenate(features, axis=0)


def _as_list(value, cast):
    """Parse a comma-separated string, or convert an iterable, into a list of ``cast`` values."""
    if isinstance(value, str):
        value = value.split(',')
    return [cast(item) for item in value]


class mgn_callback(Callback):
    """Training callback that evaluates the network on Market-1501 and keeps the best checkpoint.

    Evaluation runs at the end of every 10th epoch and of every epoch in the
    last 20% of training. Whenever mAP improves, the network is saved as
    ``best.ckpt`` in ``config.ckpt_path``.
    """

    def __init__(self, network):
        super().__init__()
        self.net = network
        self.rank1 = 0.1
        self.best_map = 0
        # The module-level config is mutated in place, so accept values that
        # an earlier instance has already converted to lists.
        config.image_size = _as_list(config.image_size, int)
        config.image_mean = _as_list(config.image_mean, float)
        config.image_std = _as_list(config.image_std, float)

        _enable_graph_kernel = False
        context.set_context(
            mode=context.GRAPH_MODE,
            enable_graph_kernel=_enable_graph_kernel,
            device_target=config.device_target,
        )

        # The evaluation datasets are built unsharded (rank 0 of a group of 1).
        config.rank = 0
        config.device_id = get_device_id()
        config.group_size = 1

        dataset_kwargs = dict(
            ims_per_id=4,
            ids_per_batch=12,
            mean=config.image_mean,
            std=config.image_std,
            resize_h_w=config.image_size,
            batch_size=config.per_batch_size,
            rank=config.rank,
            group_size=config.group_size,
        )
        self.t_dataset, self.t_cams, self.t_ids = create_dataset(
            config.data_dir, data_part='test', **dataset_kwargs)
        self.q_dataset, self.q_cams, self.q_ids = create_dataset(
            config.data_dir, data_part='query', **dataset_kwargs)

    def epoch_end(self, run_context):
        """Evaluate the network when due and save it if mAP improved.

        Args:
            run_context: MindSpore run context; only ``cur_epoch_num`` of its
                original args is read.
        """
        cb_param = run_context.original_args()
        cur_epoch = cb_param.cur_epoch_num
        if cur_epoch % 10 == 0 or cur_epoch > config.max_epoch*0.8:
            self.net.set_train(False)

            re_rank = True
            network = self.net
            gf = extract_feature(network, self.t_dataset)
            print('Got gallery features')
            qf = extract_feature(network, self.q_dataset)
            print('Got query features')

            if re_rank:
                q_g_dist = np.dot(qf, np.transpose(gf))
                q_q_dist = np.dot(qf, np.transpose(qf))
                g_g_dist = np.dot(gf, np.transpose(gf))
                dist = re_ranking(q_g_dist, q_q_dist, g_g_dist)
            else:
                dist = cdist(qf, gf)
            r = cmc(dist, self.q_ids, self.t_ids, self.q_cams, self.t_cams,
                    separate_camera_set=False,
                    single_gallery_shot=False,
                    first_match_break=True)
            m_ap = mean_ap(dist, self.q_ids, self.t_ids, self.q_cams, self.t_cams)
            map_score = np.float32(m_ap)

            print(
                '[INFO] mAP: {:.4f} rank1: {:.4f} rank3: {:.4f} rank5: {:.4f} rank10: {:.4f}'.format(
                    m_ap,
                    r[0], r[2], r[4], r[9],
                )
            )

            if self.best_map < map_score:
                save_checkpoint(self.net, os.path.join(config.ckpt_path, 'best.ckpt'))
                self.best_map = map_score
            # Switch back to training mode before the next epoch starts.
            self.net.set_train(True)
        if cur_epoch == config.max_epoch:
            print("best score: mAP:", self.best_map)
src.MGN_Callback import mgn_callback from src.callbacks import SavingLossMonitor, SavingTimeMonitor from src.dataset import create_dataset from src.loss import MGNLoss @@ -110,23 +110,15 @@ def _prepare_configuration(): config.image_mean = list(map(float, config.image_mean.split(','))) config.image_std = list(map(float, config.image_std.split(','))) - _enable_graph_kernel = False context.set_context( mode=context.GRAPH_MODE, - enable_graph_kernel=_enable_graph_kernel, device_target=config.device_target, ) config.device_id = get_device_id() if config.is_distributed: - if config.device_target == "GPU": - if not config.enable_modelarts: - init() - else: - if not config.need_modelarts_dataset_unzip: - init() - + init() config.group_size = get_group_size() config.rank = get_rank() @@ -158,7 +150,7 @@ def _prepare_configuration(): config.rank_print_ckpt_flag = 1 -@moxing_wrapper(pre_process=modelarts_pre_process) +@moxing_wrapper() def run_train(): """ Run train """ _prepare_configuration() @@ -198,7 +190,13 @@ def run_train(): decay_epochs=config.decay_epochs, ) lr = Tensor(lr) - opt = nn.Adam(network.trainable_params(), learning_rate=lr, weight_decay=config.weight_decay) + if config.optimizer == "adamw": + opt = nn.AdamWeightDecay(network.trainable_params(), learning_rate=lr, + weight_decay=config.weight_decay) + elif config.optimizer == "adam": + opt = nn.Adam(network.trainable_params(), learning_rate=lr, weight_decay=config.weight_decay) + else: + raise ValueError(f'Unsupport optimizer {config.optimizer}') timestamp = time.strftime("%Y%m%d_%H%M%S") + '_' + str(config.rank) @@ -234,15 +232,18 @@ def run_train(): prefix='{}'.format(config.rank), ) callbacks.append(ckpt_cb) + if config.use_map: + loss_scale = DynamicLossScaleManager(2**20) + model = Model(network, loss_fn=reid_loss, optimizer=opt, + amp_level="O3", loss_scale_manager=loss_scale) + else: + model = Model(network, loss_fn=reid_loss, optimizer=opt) - model = Model( - network, - loss_fn=reid_loss, - 
optimizer=opt, - ) - - model.train(config.max_epoch, dataset, callbacks=callbacks, dataset_sink_mode=False) + if config.run_eval: + eval_callback = mgn_callback(network) + callbacks.append(eval_callback) + model.train(config.max_epoch, dataset, callbacks=callbacks, dataset_sink_mode=True) if __name__ == '__main__': run_train()