diff --git a/official/cv/ADNet/README_CN.md b/official/cv/ADNet/README_CN.md
new file mode 100644
index 0000000000000000000000000000000000000000..31dd5d95c89877bf47ef14c023a9a448b590df1a
--- /dev/null
+++ b/official/cv/ADNet/README_CN.md
@@ -0,0 +1,269 @@
+# 目录
+
+<!-- TOC -->
+
+- [目录](#目录)
+- [ADNet描述](#ADNet描述)
+- [模型架构](#模型架构)
+- [数据集](#数据集)
+- [环境要求](#环境要求)
+- [快速入门](#快速入门)
+- [脚本说明](#脚本说明)
+    - [脚本及样例代码](#脚本及样例代码)
+    - [脚本参数](#脚本参数)
+    - [训练过程](#训练过程)
+        - [训练](#训练)
+    - [评估过程](#评估过程)
+        - [评估](#评估)
+    - [导出mindir模型](#导出mindir模型)
+    - [推理过程](#推理过程)
+        - [用法](#用法)
+        - [结果](#结果)
+- [模型描述](#模型描述)
+    - [性能](#性能)
+        - [评估性能](#评估性能)
+- [随机情况说明](#随机情况说明)
+- [ModelZoo主页](#modelzoo主页)
+
+<!-- /TOC -->
+
+## ADNet描述
+
+ADNet是2017年提出的视频目标跟踪算法,论文发表于CVPR 2017。相比传统的视频目标跟踪算法,ADNet速度更快:模型以监督学习作为主要训练方式,并结合强化学习进行finetune,平均可提升约2个点的精度。
+
+[论文](http://openaccess.thecvf.com/content_cvpr_2017/papers/Yun_Action-Decision_Networks_for_CVPR_2017_paper.pdf):Sangdoo Yun (Seoul National University, South Korea). "Action-Decision Networks for Visual Tracking with Deep Reinforcement Learning". *Presented at CVPR 2017*.
+
+## 模型架构
+
+ADNet模型由VGG-M提取视频帧特征,以满足模型轻量化需求;在此基础上结合历史动作信息,动态预测下一个动作以及当前动作的置信度。
+
+## 数据集
+
+使用的数据集:[VOT2013]、[VOT2014]、[VOT2015]、[OTB100] <br/>
+百度网盘链接 <br/>
+VOT13~15:https://pan.baidu.com/s/1co6NSk3imqhLWfq3H1ek1A
+提取码:bs6g <br/>
+OTB100:https://pan.baidu.com/s/1mGrfVPXs7yz16vCBRwDurA
+提取码:faqb
+
+## 环境要求
+
+- 硬件(Ascend/ModelArts)
+    - 准备Ascend或ModelArts处理器搭建硬件环境
+- 框架
+    - [MindSpore](https://www.mindspore.cn/install)
+- 如需查看详情,请参见如下资源:
+    - [MindSpore教程](https://www.mindspore.cn/tutorial/training/zh-CN/master/index.html)
+    - [MindSpore Python API](https://www.mindspore.cn/docs/api/zh-CN/master/index.html)
+
+## 快速入门
+
+通过官方网站安装MindSpore后,您可以按照如下步骤进行训练和评估:<br/>
+训练(train)时注意dataset_path是包含vot13~15的根目录;测试(test)时dataset_path是包含Basketball等序列的OTB目录 <br/>
+训练开始前需获取vggm的预训练模型(ckpt),百度网盘链接: https://pan.baidu.com/s/11TvgwR6spTpd_B9Nnf28KQ <br/>
+密码: wrq2 <br/>
+vggm.pth 预训练vggm参数 (https://data.lip6.fr/cadene/pretrainedmodels/vggm-786f2434.pth) <br/>
+vggm.pth需要先转换成vggm.ckpt,并将转换得到的vggm.ckpt放在src/models/下
+
+```python
+# 转换vggm.pth脚本,会在运行目录下生成一个vggm.ckpt
+python pth2ckpt.py --pth_path /path/pth
+# 进入脚本目录,需要指定device_id,该步骤默认会继续进行Reinforcement Learning的微调,可在SL训练结束后手动中断
+python train.py --target_device device_id --dataset_path /path/dataset/
+# 进入脚本目录,根据权重文件生成预测框文件,需要指定训练ckpt和device_id
+python eval.py --dataset_path /path/otb --weight_file /path/to/weight_file --target_device device_id
+# 进入脚本目录,根据生成的预测文件进行最终精度评估,bboxes_folder为上一条命令生成的预测文件夹名,一般为results_on_test_images_part2/weight_file
+python create_plots.py --bboxes_folder results_on_test_images_part2/weight_file
+# Ascend多卡训练
+bash scripts/run_distributed_train.sh RANK_TABLE_FILE RANK_SIZE 0 /path/dataset
+# Ascend单卡训练
+bash scripts/run_standalone_train.sh /path/dataset DEVICE_ID
+# Ascend多卡测试,需要指定weight_file
+bash scripts/run_distributed_test.sh RANK_TABLE_FILE RANK_SIZE 0 weight_file /path/dataset
+```
+
+Ascend训练:生成[RANK_TABLE_FILE](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools)
+
+## 脚本说明
+
+### 脚本及样例代码
+
+```text
+├── ADNet
+    ├── scripts
+    │ ├──run_distributed_train.sh // 在Ascend中多卡训练
+    │ ├──run_distributed_test.sh // 在Ascend中多卡测试
+    │ ├──run_standalone_train.sh // 在Ascend中单卡训练
+    ├── src //源码
+    │ │ ├── datasets
+    │ │ │ ├──get_train_dbs.py
+    │ │ │ ├──online_adaptation_dataset.py
+    │ │ │ ├──rl_dataset.py
+    │ │ │ ├──sl_dataset.py
+    │ │ │ ├── data
+    │ │ │ │ ├──otb //OTB100数据集
+    │ │ │ │ ├──vot13 //vot2013数据集
+    │ │ │ │ ├──vot14 //vot2014数据集
+    │ │ │ │ ├──vot15 //vot2015数据集
+    │ │ ├── models
+    │ │ │ ├──ADNet.py //ADNet主干网络模型
+    │ │ │ ├──CustomizedCell.py //自定义网络结构
+    │ │ │ ├──vggm.py //vggm网络模型
+    │ │ │ ├──vggm.ckpt //vggm预训练网络模型
+    │ │ ├── options
+    │ │ │ ├──general.py //模型相关配置
+    │ │ ├── trainers
+    │ │ │ ├──adnet_test.py //测试主文件
+    │ │ │ ├──adnet_train_rl.py //强化学习主文件
+    │ │ │ ├──adnet_train_sl.py //监督学习主文件
+    │ │ │ ├──RL_tools.py //强化学习环境
+    │ │ ├── utils
+    │ │ │ ├──augmentations.py
+    │ │ │ ├──display.py
+    │ │ │ ├──do_action.py
+    │ │ │ ├──draw_box_from_npy.py
+    │ │ │ ├──gen_action_labels.py
+    │ │ │ ├──gen_samples.py
+    │ │ │ ├──get_action_history_onehot.py
+    │ │ │ ├──get_benchmark_info.py
+    │ │ │ ├──get_benchmark_path.py //数据集位置描述文件
+    │ │ │ ├──get_train_videos.py
+    │ │ │ ├──get_video_infos.py
+    │ │ │ ├──my_math.py
+    │ │ │ ├──overlap_ratio.py
+    │ │ │ ├──precision_plot.py
+    │ │ │ ├── videolist //定义追踪文件夹
+    │ │ │ │ ├── vot13-otb.txt
+    │ │ │ │ ├── vot14-otb.txt
+    │ │ │ │ ├── vot15-otb.txt
+    ├── README_CN.md // ADNet相关说明
+    ├── train.py // 训练入口
+    ├── eval.py // 评估入口
+    ├── create_plots.py // 精度生成脚本
+
+```
+
+### 脚本参数
+
+```text
+train.py
+--data_url: obs桶数据集位置,vot13,vot14,vot15
+--train_url: 输出文件路径
+--start_iter: 起始iteration数
+--save_folder: 权重文件保存的相对路径
+--device_target: 实现代码的设备,值为'Ascend'
+--target_device: 使用的物理卡号
+--resume: 恢复训练保存ckpt的路径
+--run_supervised: 是否启用SL,或直接启用RL,需传入resume的ckpt路径
+--distributed: 多卡运行
+--run_online: ModelArts上运行,默认为False
+eval.py
+--weight_file: 权重文件路径
+--save_result_npy: 保存所有预测文件的相对路径的根目录
+--device_target: 实现代码的设备,值为'Ascend'
+--target_device: 使用的物理卡号
+--data_url: obs桶数据集位置
+--train_url: 输出文件路径
+create_plots.py
+--bboxes_folder: 运行eval.py所指定的save_result_npy中对应权重文件目录
+```
+
+### 训练过程
+
+#### 训练
+
+- Ascend处理器环境运行
+
+    ```python
+    python train.py --target_device device_id --dataset_path /path/dataset/
+    # 或进入脚本目录执行脚本。单卡训练时间过长,不建议使用单卡训练;8卡监督训练大约需要80h(30个epoch)。强化学习部分可不再训练,精度也可达标
+    bash scripts/run_standalone_train.sh /path/dataset DEVICE_ID
+    ```
+
+    经过训练后,损失值如下:
+
+    ```text
+    # grep "Loss:" log
+    iter 970 || Loss: 2.4038 || Timer: 2.5797 sec.
+    iter 980 || Loss: 2.2499 || Timer: 2.4897 sec.
+    iter 990 || Loss: 2.4569 || Timer: 2.4808 sec.
+    iter 1000 || Loss: 2.5012 || Timer: 2.4311 sec.
+    iter 1010 || Loss: 2.3282 || Timer: 2.5438 sec.
+    iter 1020 || Loss: 2.0806 || Timer: 2.4931 sec.
+    iter 1030 || Loss: 2.3262 || Timer: 2.6490 sec.
+    iter 1040 || Loss: 2.2101 || Timer: 2.4939 sec.
+    iter 1050 || Loss: 2.3560 || Timer: 2.4301 sec.
+    iter 1060 || Loss: 0.8712 || Timer: 2.5953 sec.
+    iter 1070 || Loss: 2.3375 || Timer: 2.4974 sec.
+    iter 1080 || Loss: 1.3731 || Timer: 2.4519 sec.
+    ...
+    ```
+
+    模型检查点保存在weights目录下,多卡训练时仅rank为0的卡保存检查点
+
+### 评估过程
+
+#### 评估
+
+在运行以下命令之前,请检查用于评估的检查点路径
+
+- Ascend处理器环境运行
+
+    ```python
+    # 进入脚本目录,根据OTB100数据集进行online finetune and test并生成预测文件,该步骤单卡情况下大约需要执行17个小时
+    python eval.py --weight_file /path/weight_file
+    # 进入脚本目录,根据生成的预测文件绘制distance等metrics图,该步骤执行后会输出对应的精度
+    python create_plots.py --bboxes_folder /path
+    ```
+
+- 测试数据集的准确率如下:
+  作者原仓库精度为75.3%、70.7%、69.0%、68.7%、75.5%、69.4%,avg precision=71.3%
+  实际测试的SL精度约为73.6%
+
+# 推理
+
+本模型支持导出静态mindir,但静态推理效果无法接受,故暂不提供推理流程
+
+## [导出mindir模型](#contents)
+
+```python
+python export_model.py --target_device [DEVICE_ID] --weight_file [CKPT_PATH]
+```
+
+## [推理过程](#contents)
+
+### 用法
+
+mindir文件必须由export_model.py导出,输入文件必须为bin格式
+
+### 结果
+
+## 模型描述
+
+### 性能
+
+#### 评估性能
+
+| 参数 | ModelArts
+| -------------------------- | -----------------------------------------------------------
+| 资源 | Ascend 910;CPU 2.60GHz, 192核;内存:755G
+| 上传日期 | 2021-08-12
+| MindSpore版本 | 1.3.0
+| 数据集 | VOT2013,VOT2014,VOT2015
+| 训练参数 | epoch=100, batch_size=8, lr=0.0001
+| 优化器 | SGD
+| 损失函数 | SoftmaxCrossEntropyWithLogits
+| 损失 | 0.03
+| 速度 | 200毫秒/步
+| 总时间 | 10分钟
+| 微调检查点 | 大约40M (.ckpt文件)
+
+## 随机情况说明
+
+train.py中设置了随机种子
+
+## ModelZoo主页
+
+请浏览官网[主页](https://gitee.com/mindspore/models)
\ No newline at end of file
diff --git a/official/cv/ADNet/create_plots.py b/official/cv/ADNet/create_plots.py
new file mode 100644
index 0000000000000000000000000000000000000000..68516442f2b799c32f998036013933827a035ffd
--- /dev/null
+++ b/official/cv/ADNet/create_plots.py
@@ -0,0 +1,87 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================ + +import os +import argparse +import glob +import ast + +import numpy as np + +from src.utils.precision_plot import iou_precision_plot, distance_precision_plot + + +parser = argparse.ArgumentParser( + description='ADNet create plot') +parser.add_argument('--bboxes_folder', default='', type=str, help='location where bboxes files are saved') +parser.add_argument('--save_plot_folder', default=None, type=str, help='save plots folder') +parser.add_argument('--show_plot', default=False, type=ast.literal_eval, help='show plot or not') + +args = parser.parse_args() + +bboxes_files = glob.glob(os.path.join(args.bboxes_folder, '*-bboxes.npy')) +bboxes_files.sort(key=str.lower) + +all_bboxes = [] +all_gt = [] +all_dataset_name = [] + +for bboxes_file in bboxes_files: + dataset_name = os.path.basename(bboxes_file)[:-11] + gt_file = os.path.join(args.bboxes_folder, dataset_name + '-ground_truth.npy') + + bboxes = np.load(bboxes_file) + gt = np.load(gt_file) + + all_bboxes.append(bboxes) + all_gt.append(gt) + all_dataset_name.append(dataset_name) + +for idx, bboxes in enumerate(all_bboxes): + if args.save_plot_folder is not None: + save_plot_file = os.path.join(args.save_plot_folder, all_dataset_name[idx]) + else: + save_plot_file = None + + iou_precisions = iou_precision_plot(bboxes, all_gt[idx], all_dataset_name[idx], show=args.show_plot, + save_plot=save_plot_file) + + distance_precisions = distance_precision_plot(bboxes, all_gt[idx], all_dataset_name[idx], show=args.show_plot, + save_plot=save_plot_file) + +# all dataset plot precision +if args.save_plot_folder is not None: + save_plot_file = os.path.join(args.save_plot_folder, 'ALL') +else: + save_plot_file = None + +all_bboxes_merge = [] +for bboxes in all_bboxes: + all_bboxes_merge.extend(bboxes) + +all_gt_merge = [] +for gt in all_gt: + all_gt_merge.extend(gt) + +all_bboxes_merge = np.array(all_bboxes_merge) +all_gt_merge = np.array(all_gt_merge) + +iou_precisions = iou_precision_plot(all_bboxes_merge, all_gt_merge, 'ALL', show=args.show_plot, + save_plot=save_plot_file) + +distance_precisions = distance_precision_plot(all_bboxes_merge, all_gt_merge, 'ALL', show=args.show_plot, + save_plot=save_plot_file) + +print('distance_precision in 20px: ', distance_precisions[21]) diff --git a/official/cv/ADNet/eval.py b/official/cv/ADNet/eval.py new file mode 100644 index 0000000000000000000000000000000000000000..7aa1d3b21b3eb13dee74714874aae938e1fb1d3d --- /dev/null +++ b/official/cv/ADNet/eval.py @@ -0,0 +1,148 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +import argparse +import os +import ast + +from src.options.general import opts +from src.models.ADNet import adnet +from src.trainers.adnet_test import adnet_test + +from mindspore import context +from mindspore.communication.management import init +from mindspore.context import ParallelMode + +parser = argparse.ArgumentParser( + description='ADNet test') +parser.add_argument('--weight_file', default='weights/ADNet_RL_.pth', type=str, help='The pretrained weight file') +parser.add_argument('--num_workers', default=1, type=int, help='Number of workers used in dataloading') +parser.add_argument('--visualize', default=False, type=ast.literal_eval, help='Use tensorboardx to for visualization') +parser.add_argument('--send_images_to_visualization', type=ast.literal_eval, + default=False, help='visdom after augmentations') +parser.add_argument('--display_images', default=False, type=ast.literal_eval, help='Whether to display images or not') +parser.add_argument('--save_result_images', default='', type=str, help='save results folder') +parser.add_argument('--save_result_npy', default='../results_on_test_images_part2', + type=str, help='save results folder') +parser.add_argument('--initial_samples', default=3000, type=int, help='Num of training samples for the first frame.') +parser.add_argument('--online_samples', default=250, type=int, help='Num of training samples for the other frames.') +parser.add_argument('--redetection_samples', default=256, type=int, help='Num of samples for redetection.') +parser.add_argument('--initial_iteration', default=300, type=int, help='Number of iteration in initial training. T_I') +parser.add_argument('--online_iteration', default=30, type=int, help='Number of iteration in online training. T_O') +parser.add_argument('--online_adaptation_every_I_frames', default=10, type=int, help='Frequency of online training. I') + +parser.add_argument('--believe_score_result', default=0, type=int, help='Believe score result after n training') + +parser.add_argument('--pos_samples_ratio', default='0.5', type=float, + help='''The ratio of positive in all samples for online adaptation. + Rest of it will be negative samples. Default: 0.5''') +parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU', 'CPU']) +parser.add_argument('--target_device', type=int, default=0) +parser.add_argument('--distributed', type=ast.literal_eval, default=False) +parser.add_argument('--multidomain', type=ast.literal_eval, default=True) +parser.add_argument('--run_online', type=str, default='False') +parser.add_argument('--data_url', type=str) +parser.add_argument('--train_url', type=str) +parser.add_argument('--dataset_path', type=str, default='') + +args = parser.parse_args() +if args.distributed: + context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=int(os.environ["DEVICE_ID"])) + init() + context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL, gradients_mean=True) +else: + context.set_context(device_target=args.device_target, mode=context.GRAPH_MODE, device_id=args.target_device) +assert 0 < args.pos_samples_ratio <= 1, "the pos_samples_ratio valid range is (0, 1]" + +# set opts based on the args.. especially the number of samples etc. 
+opts['nPos_init'] = int(args.initial_samples * args.pos_samples_ratio) +opts['nNeg_init'] = int(args.initial_samples - opts['nPos_init']) +opts['nPos_online'] = int(args.online_samples * args.pos_samples_ratio) +opts['nNeg_online'] = int(args.online_samples - opts['nPos_online']) + +# just to make sure if one of nNeg is zero, the other nNeg is zero (kinda small hack...) +if opts['nNeg_init'] == 0: + opts['nNeg_online'] = 0 + opts['nPos_online'] = args.online_samples + +elif opts['nNeg_online'] == 0: + opts['nNeg_init'] = 0 + opts['nPos_init'] = args.initial_samples + +opts['finetune_iters'] = args.initial_iteration +opts['finetune_iters_online'] = args.online_iteration +opts['redet_samples'] = args.redetection_samples + +if args.run_online == 'True': + local_result = '/cache/result' + args.save_result_npy = os.path.join(local_result, args.save_result_npy, + os.path.basename(args.weight_file)[:-4] + '-' + + str(args.pos_samples_ratio)) + import moxing + local_data_url = "/cache/data" + args.dataset_path = local_data_url + local_weight_url = "/cache/weight/" + args.weight_file.split('/')[-1] + # moving dataset from obs to container + moxing.file.copy_parallel(args.data_url, local_data_url) + # moving weight_file from obs to container + moxing.file.copy_parallel(args.weight_file, local_weight_url) + args.weight_file = local_weight_url + '/' + args.weight_file.split('/')[-1][:-4] +else: + local_result = '' + args.save_result_npy = os.path.join(args.save_result_npy, os.path.basename(args.weight_file)[:-4] + '-' + + str(args.pos_samples_ratio)) +if args.save_result_images is not None: + args.save_result_images = os.path.join(local_result, args.save_result_images, + os.path.basename(args.weight_file)[:-4] + '-' + str(args.pos_samples_ratio)) + if not os.path.exists(args.save_result_images): + os.makedirs(args.save_result_images) + +if not os.path.exists(args.save_result_npy): + os.makedirs(args.save_result_npy) + +if args.run_online == 'True': + dataset_root = '/cache/data' +else: + dataset_root = os.path.join(args.dataset_path) +vid_folders = [] + +for filename in os.listdir(dataset_root): + if os.path.isdir(os.path.join(dataset_root, filename)): + vid_folders.append(filename) +vid_folders.sort(key=str.lower) + +save_root = args.save_result_images +save_root_npy = args.save_result_npy + +for vid_folder in vid_folders: + print('Loading {}...'.format(args.weight_file)) + opts['num_videos'] = 1 + net, domain_nets = adnet(opts, + trained_file=args.weight_file, + random_initialize_domain_specific=True, + multidomain=False, distributed=args.distributed) + net.set_train() + + if args.save_result_images is not None: + args.save_result_images = os.path.join(save_root, vid_folder) + if not os.path.exists(args.save_result_images): + os.makedirs(args.save_result_images) + + args.save_result_npy = os.path.join(save_root_npy, vid_folder) + + vid_path = os.path.join(dataset_root, vid_folder) + + # load ADNetDomainSpecific + net.load_domain_specific(domain_nets[0]) + bboxes, t_sum = adnet_test(net, vid_path, opts, args) diff --git a/official/cv/ADNet/export_model.py b/official/cv/ADNet/export_model.py new file mode 100644 index 0000000000000000000000000000000000000000..83e3deec67e89aa7e42ab0f38a20a3246b563ad9 --- /dev/null +++ b/official/cv/ADNet/export_model.py @@ -0,0 +1,35 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +import argparse +import numpy as np + +from src.options.general import opts +from src.models.ADNet import adnet + +from mindspore import Tensor, export, context + +parser = argparse.ArgumentParser( + description='ADNet test') +parser.add_argument('--weight_file', default='', type=str, help='The pretrained weight file') +parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU', 'CPU']) +parser.add_argument('--target_device', type=int, default=0) +args = parser.parse_args() +context.set_context(device_target=args.device_target, mode=context.PYNATIVE_MODE, device_id=args.target_device) +opts['num_videos'] = 1 +net, domain_specific_nets = adnet(opts, trained_file=args.weight_file) + +input_ = np.random.uniform(0.0, 1.0, size=[128, 3, 112, 112]).astype(np.float32) +export(net, Tensor(input_), file_name='ADNet', file_format='MINDIR') +print('export finished') diff --git a/official/cv/ADNet/pth2ckpt.py b/official/cv/ADNet/pth2ckpt.py new file mode 100644 index 0000000000000000000000000000000000000000..1d4b94c792b5f17ce36f600d11a9929f1d5af216 --- /dev/null +++ b/official/cv/ADNet/pth2ckpt.py @@ -0,0 +1,45 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +import argparse +import torch + +from mindspore.train.serialization import save_checkpoint +from mindspore import Tensor, context + + +def pytorch2mindspore(pth_path): + par_dict = torch.load(pth_path) + new_params_list = [] + + for name in par_dict: + param_dict = {} + parameter = par_dict[name] + param_dict['name'] = name + param_dict['data'] = Tensor(parameter.numpy()) + new_params_list.append(param_dict) + save_checkpoint(new_params_list, 'vggm.ckpt') + print('convert pth to ckpt finished') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='pth2ckpt') + parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU', 'CPU']) + parser.add_argument('--target_device', type=int, default=0) + parser.add_argument('--pth_path', type=str, default='') + args = parser.parse_args() + context.set_context(device_target=args.device_target, mode=context.PYNATIVE_MODE, device_id=args.target_device) + pytorch2mindspore(args.pth_path) diff --git a/official/cv/ADNet/scripts/run_distributed_test.sh b/official/cv/ADNet/scripts/run_distributed_test.sh new file mode 100644 index 0000000000000000000000000000000000000000..ff4b83e4d16421ab92fa4efe0c5e3e9faf8429ba --- /dev/null +++ b/official/cv/ADNet/scripts/run_distributed_test.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +echo "==============================================================================================================" +echo "Please run the script as: " +echo "bash run.sh RANK_TABLE_FILE RANK_SIZE RANK_START /path/weight_file /path/OTB" +echo "For example: bash run_distributed_test.sh /path/rank_table.json 16 0 weight_file /data/OTB" +echo "It is better to use the absolute path." +echo "==============================================================================================================" +execute_path=$(pwd) +echo ${execute_path} +script_self=$(readlink -f "$0") +self_path=$(dirname "${script_self}") +echo ${self_path} + +export RANK_TABLE_FILE=$1 +export RANK_SIZE=$2 +DEVICE_START=$3 +WEIGHT_FILE=$4 + +for((i=0;i<$RANK_SIZE;i++)); +do + export RANK_ID=$i + export DEVICE_ID=$((DEVICE_START + i)) + echo "Start test for rank $RANK_ID, device $DEVICE_ID." + if [ -d ${execute_path}/eval_device${DEVICE_ID} ]; then + rm -rf ${execute_path}/eval_device${DEVICE_ID} + fi + mkdir ${execute_path}/eval_device${DEVICE_ID} + cp -f eval.py ${execute_path}/eval_device${DEVICE_ID} + cp -rf src ${execute_path}/eval_device${DEVICE_ID} + cd ${execute_path}/eval_device${DEVICE_ID} || exit + python3.7 -u eval.py --distributed 'True' --weight_file ${WEIGHT_FILE} --dataset_path $5> eval_log$i 2>&1 & + cd .. +done +wait +filename=`echo ${WEIGHT_FILE##*/} |awk -F. 
'{print $1}'` +bboxes_folder="results_on_test_images_part2/${filename}.-0.5" +python3 create_plots.py --bboxes_folder ${execute_path}/${bboxes_folder} > eval_result.txt diff --git a/official/cv/ADNet/scripts/run_distributed_train.sh b/official/cv/ADNet/scripts/run_distributed_train.sh new file mode 100644 index 0000000000000000000000000000000000000000..bc03535b72bbdafe90e7c3dc6e844159c459939e --- /dev/null +++ b/official/cv/ADNet/scripts/run_distributed_train.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +echo "==============================================================================================================" +echo "Please run the script as: " +echo "bash run.sh RANK_TABLE_FILE RANK_SIZE RANK_START /path/dataset" +echo "For example: bash run.sh /path/rank_table.json 8 0 /path/dataset" +echo "It is better to use the absolute path." +echo "==============================================================================================================" +execute_path=$(pwd) +echo ${execute_path} + +export RANK_TABLE_FILE=$1 +export RANK_SIZE=$2 +DEVICE_START=$3 +DATASET_PATH=$4 +for((i=0;i<$RANK_SIZE;i++)); +do + export RANK_ID=$i + export DEVICE_ID=$((DEVICE_START + i)) + echo "Start training for rank $i, device $DEVICE_ID." + if [ -d ${execute_path}/device${DEVICE_ID} ]; then + rm -rf ${execute_path}/device${DEVICE_ID} + fi + mkdir ${execute_path}/device${DEVICE_ID} + cp -f train.py ${execute_path}/device${DEVICE_ID} + cp -rf src ${execute_path}/device${DEVICE_ID} + cd ${execute_path}/device${DEVICE_ID} || exit + python3.7 -u train.py --distributed 'True' --dataset_path ${DATASET_PATH} > log$i 2>&1 & + cd .. +done diff --git a/official/cv/ADNet/scripts/run_standalone_train.sh b/official/cv/ADNet/scripts/run_standalone_train.sh new file mode 100644 index 0000000000000000000000000000000000000000..06ab29c2856610f8b267dd1aa4309fc54ba8a01f --- /dev/null +++ b/official/cv/ADNet/scripts/run_standalone_train.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +# applicable to Ascend + +echo "==============================================================================================================" +echo "Please run the script as: " +echo "bash run.sh /path/dataset DEVICE_ID" +echo "For example: bash run.sh dataset_path 3" +echo "It is better to use the absolute path." +echo "==============================================================================================================" +execute_path=$(pwd) +echo ${execute_path} +DATASET=$1 +DEVICE_ID=$2 +echo "Start training for device $DEVICE_ID." +python3.7 -u train.py --dataset_path ${DATASET} --target_device ${DEVICE_ID} > log${DEVICE_ID} 2>&1 & diff --git a/official/cv/ADNet/src/datasets/get_train_dbs.py b/official/cv/ADNet/src/datasets/get_train_dbs.py new file mode 100644 index 0000000000000000000000000000000000000000..44e1d983dd82340a8c18b0403effc1cb1f94acda --- /dev/null +++ b/official/cv/ADNet/src/datasets/get_train_dbs.py @@ -0,0 +1,104 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# matlab code: +# https://github.com/hellbell/ADNet/blob/3a7955587b5d395401ebc94a5ab067759340680d/train/get_train_dbs.m +import cv2 +import numpy as np + +from src.utils.gen_samples import gen_samples +from src.utils.overlap_ratio import overlap_ratio +from src.utils.gen_action_labels import gen_action_labels + + +def get_train_dbs(vid_info, opts): + img = cv2.imread(vid_info['img_files'][0]) + + opts['scale_factor'] = 1.05 + opts['imgSize'] = list(img.shape) + gt_skip = opts['train']['gt_skip'] + + if vid_info['db_name'] == 'alov300': + train_sequences = vid_info['gt_use'] == 1 + else: + train_sequences = list(range(0, vid_info['nframes'], gt_skip)) + + train_db_pos = [] + train_db_neg = [] + + for train_i in range(len(train_sequences)): + train_db_pos_ = { + 'img_path': [], + 'bboxes': [], + 'labels': [], + 'score_labels': [] + } + train_db_neg_ = { + 'img_path': [], + 'bboxes': [], + 'labels': [], + 'score_labels': [] + } + + img_idx = train_sequences[train_i] + gt_bbox = vid_info['gt'][img_idx] + + if not gt_bbox: + continue + + pos_examples = [] + while len(pos_examples) < opts['nPos_train']: + pos = gen_samples('gaussian', gt_bbox, opts['nPos_train']*5, opts, 0.1, 5) + r = overlap_ratio(pos, np.tile(gt_bbox, (len(pos), 1))) + pos = pos[np.array(r) > opts['posThre_train']] + if pos.shape[0] == 0: + continue + pos = pos[np.random.randint(low=0, high=len(pos), + size=min(len(pos), opts['nPos_train']-len(pos_examples))), :] + pos_examples.extend(pos) + + neg_examples = [] + while len(neg_examples) < opts['nNeg_train']: + # in original code, this 1 line below use opts['nPos_train'] instead of opts['nNeg_train'] + neg = gen_samples('gaussian', gt_bbox, opts['nNeg_train']*5, opts, 2, 10) + r = overlap_ratio(neg, np.tile(gt_bbox, (len(neg), 1))) + neg = neg[np.array(r) < opts['negThre_train']] + if 
neg.shape[0] == 0: + continue + neg = neg[np.random.randint(low=0, high=len(neg), + size=min(len(neg), opts['nNeg_train']-len(neg_examples))), :] + neg_examples.extend(neg) + + action_labels_pos = gen_action_labels(opts['num_actions'], opts, np.array(pos_examples), gt_bbox) + action_labels_neg = np.full((opts['num_actions'], len(neg_examples)), fill_value=-1) + + action_labels_pos = np.transpose(action_labels_pos).tolist() + action_labels_neg = np.transpose(action_labels_neg).tolist() + + train_db_pos_['img_path'] = np.full(len(pos_examples), vid_info['img_files'][img_idx]) + train_db_pos_['bboxes'] = pos_examples + train_db_pos_['labels'] = action_labels_pos + # score labels: 1 is positive. 0 is negative + train_db_pos_['score_labels'] = list(np.ones(len(pos_examples), dtype=int)) + + train_db_neg_['img_path'] = np.full(len(neg_examples), vid_info['img_files'][img_idx]) + train_db_neg_['bboxes'] = neg_examples + train_db_neg_['labels'] = action_labels_neg + # score labels: 1 is positive. 0 is negative + train_db_neg_['score_labels'] = list(np.zeros(len(neg_examples), dtype=int)) + + train_db_pos.append(train_db_pos_) + train_db_neg.append(train_db_neg_) + + return train_db_pos, train_db_neg diff --git a/official/cv/ADNet/src/datasets/online_adaptation_dataset.py b/official/cv/ADNet/src/datasets/online_adaptation_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..c182630e6f31c1f147855301a64bde903e618880 --- /dev/null +++ b/official/cv/ADNet/src/datasets/online_adaptation_dataset.py @@ -0,0 +1,157 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +# pytorch dataset for SL learning +# matlab code (line 26-33): +# https://github.com/hellbell/ADNet/blob/master/train/adnet_train_SL.m +# reference: +# https://github.com/amdegroot/ssd.pytorch/blob/master/data/voc0712.py + + +import numpy as np + +from src.utils.gen_samples import gen_samples +from src.utils.overlap_ratio import overlap_ratio +from src.utils.gen_action_labels import gen_action_labels +from src.utils.augmentations import ADNet_Augmentation + + +class OnlineAdaptationDatasetStorage(): + def __init__(self, initial_frame, first_box, opts, args, positive=True): + self.opts = opts + self.positive = positive + + if positive: + self.max_num_past_frames = opts['nFrames_long'] + else: + self.max_num_past_frames = opts['nFrames_short'] + + self.transform = ADNet_Augmentation(opts) + + self.train_db = [] + + self.add_frame_then_generate_samples(initial_frame, first_box) + + def get_item(self, index): # __getitem__ + # find out which train_db's index is the index + remaining_idx = index + + train_db_idx = 0 + for train_db_idx, train_db_ in enumerate(self.train_db): + if remaining_idx < len(train_db_['bboxes']): + break + remaining_idx -= len(train_db_['bboxes']) + + # get the data + im = self.train_db[train_db_idx]['past_frame'] + bbox = self.train_db[train_db_idx]['bboxes'][remaining_idx] + action_label = np.array(self.train_db[train_db_idx]['labels'][remaining_idx], dtype=np.float32) + score_label = self.train_db[train_db_idx]['score_labels'][remaining_idx] + + if self.transform is not None: + im, bbox, action_label, score_label = self.transform(im, bbox, action_label, score_label) + return im, bbox, action_label, score_label + + def get_len(self): # __len__ + number_samples = 0 + for train_db_ in self.train_db: + number_samples += len(train_db_['bboxes']) + return number_samples + + # add past frame... + def add_frame_then_generate_samples(self, frame, curr_box): + init = not self.train_db + + train_db_ = { + 'past_frame': frame, + 'bboxes': [], + 'labels': [], + 'score_labels': [] + } + + self.opts['imgSize'] = frame.shape[:2] + + bboxes, labels, score_labels = self.generate_samples(curr_box, positive=self.positive, init=init) + train_db_['bboxes'] = bboxes + train_db_['labels'] = labels + train_db_['score_labels'] = score_labels + + self.train_db.append(train_db_) + + # delete old frames if the history is full + while len(self.train_db) > self.max_num_past_frames: # saver with while instead of if + del self.train_db[0] + + # generate samples from past frames, called if tracking success... 
+ # generate pos/neg samples + # private class + def generate_samples(self, curr_bbox, positive, init=False): + if init: + if positive: + n = self.opts['nPos_init'] + thre = self.opts['posThre_init'] + else: + n = self.opts['nNeg_init'] + thre = self.opts['negThre_init'] + else: + if positive: + n = self.opts['nPos_online'] + thre = self.opts['posThre_online'] + else: + n = self.opts['nNeg_online'] + thre = self.opts['negThre_online'] + + assert n > 0, "if n = 0, don't initialize this class" + + if positive: + examples = gen_samples('gaussian', curr_bbox, n * 2, self.opts, + self.opts['finetune_trans'], self.opts['finetune_scale_factor']) + r = overlap_ratio(examples, np.tile(curr_bbox, (len(examples), 1))) + examples = examples[np.array(r) > thre] + examples = examples[np.random.randint(low=0, high=len(examples), + size=min(len(examples), n)), :] + + action_labels = gen_action_labels(self.opts['num_actions'], self.opts, np.array(examples), + curr_bbox) + # score labels: 1 is positive. 0 is negative + score_labels = list(np.ones(len(examples), dtype=int)) + + else: + examples = gen_samples('uniform', curr_bbox, n * 2, self.opts, 2, 5) + r = overlap_ratio(examples, np.tile(curr_bbox, (len(examples), 1))) + examples = examples[np.array(r) < thre] + examples = examples[np.random.randint(low=0, high=len(examples), + size=min(len(examples), n)), :] + + action_labels = np.full((self.opts['num_actions'], len(examples)), fill_value=-1) + # score labels: 1 is positive. 0 is negative + score_labels = list(np.zeros(len(examples), dtype=int)) + + action_labels = np.transpose(action_labels).tolist() + bboxes = examples + labels = action_labels + + return bboxes, labels, score_labels + + +# should be initialized again whenever the dataset_storage has changed +class OnlineAdaptationDataset: + def __init__(self, dataset_storage): + self.dataset_storage = dataset_storage + + def __getitem__(self, index): + return self.dataset_storage.get_item(index) + + def __len__(self): + return self.dataset_storage.get_len() diff --git a/official/cv/ADNet/src/datasets/rl_dataset.py b/official/cv/ADNet/src/datasets/rl_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..3412cccc1299dab55d7fe8034e71ccad3753bf7a --- /dev/null +++ b/official/cv/ADNet/src/datasets/rl_dataset.py @@ -0,0 +1,163 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +# pytorch dataset for SL learning +# matlab code (line 26-33): +# https://github.com/hellbell/ADNet/blob/master/train/adnet_train_SL.m +# reference: +# https://github.com/amdegroot/ssd.pytorch/blob/master/data/voc0712.py + +import time +import cv2 +import numpy as np + +from src.trainers.RL_tools import TrackingEnvironment +from src.utils.augmentations import ADNet_Augmentation +from src.utils.display import display_result, draw_box + +from mindspore import Tensor, ops +from mindspore import dtype as mstype + + +class RLDataset: + + def __init__(self, net, domain_specific_nets, train_videos, opts, args): + self.env = None + + # these lists won't include the ground truth + self.action_list = [] # a_t,l # argmax of self.action_prob_list + self.action_prob_list = [] # output of network (fc6_out) + self.log_probs_list = [] # log probs from each self.action_prob_list member + self.reward_list = [] # tracking score + self.patch_list = [] # input of network + self.action_dynamic_list = [] # action_dynamic used for inference (means before updating the action_dynamic) + self.result_box_list = [] + self.vid_idx_list = [] + + self.reset(net, domain_specific_nets, train_videos, opts, args) + + def __getitem__(self, index): + index = index % len(self.log_probs_list) + return np.array(self.log_probs_list[index]), \ + np.array(self.reward_list[index]), \ + np.array(self.vid_idx_list[index]), \ + np.array(self.patch_list[index]) + + def __len__(self): + return len(self.log_probs_list) + + def reset(self, net, domain_specific_nets, train_videos, opts, args): + self.action_list = [] # a_t,l # argmax of self.action_prob_list + self.action_prob_list = [] # output of network (fc6_out) + self.log_probs_list = [] # log probs from each self.action_prob_list member + self.reward_list = [] # tracking score + self.patch_list = [] # input of network + self.action_dynamic_list = [] # action_dynamic used for inference (means before updating the action_dynamic) + self.result_box_list = [] + self.vid_idx_list = [] + + print('generating reinforcement learning dataset') + transform = ADNet_Augmentation(opts) + + self.env = TrackingEnvironment(train_videos, opts, transform=transform, args=args) + clip_idx = 0 + while True: # for every clip (l) + + num_step_history = [] # T_l + + num_frame = 1 # the first frame won't be tracked.. + t = 0 + box_history_clip = [] # for checking oscillation in a clip + net.reset_action_dynamic() # action dynamic should be in a clip (what makes sense...) 
+ + while True: # for every frame in a clip (t) + tic = time.time() + + if args.display_images: + im_with_bb = display_result(self.env.get_current_img(), self.env.get_state()) + cv2.imshow('patch', self.env.get_current_patch_unprocessed()) + cv2.waitKey(1) + else: + im_with_bb = draw_box(self.env.get_current_img(), self.env.get_state()) + + if args.save_result_images: + cv2.imwrite('images/' + str(clip_idx) + '-' + str(t) + '.jpg', im_with_bb) + + curr_patch = self.env.get_current_patch() + self.patch_list.append(curr_patch) + curr_patch = Tensor(np.expand_dims(curr_patch, 0), mstype.float32).transpose(0, 3, 1, 2) + + # load ADNetDomainSpecific with video index + if args.multidomain: + vid_idx = self.env.get_current_train_vid_idx() + else: + vid_idx = 0 + net.load_domain_specific(domain_specific_nets[vid_idx]) + + fc6_out, _ = net(curr_patch, -1, True) + net.update_action_dynamic(net.action_history) + + action = np.argmax(fc6_out.asnumpy()) + log_prob = ops.Log()(Tensor(fc6_out[0][Tensor(action, mstype.int32)].asnumpy(), mstype.float32)) + + self.log_probs_list.append(np.asscalar(log_prob.asnumpy())) + if args.multidomain: + self.vid_idx_list.append(np.asscalar(vid_idx)) + else: + self.vid_idx_list.append(0) + + self.action_list.append(action) + + new_state, reward, done, info = self.env.step(action) + if done and info['finish_epoch']: + pass + # check oscillating + elif any((np.array(new_state).round() == x).all() for x in np.array(box_history_clip).round()): + action = opts['stop_action'] + reward, done, finish_epoch = self.env.go_to_next_frame() + info['finish_epoch'] = finish_epoch + + # check if number of action is already too much + if t > opts['num_action_step_max']: + action = opts['stop_action'] + reward, done, finish_epoch = self.env.go_to_next_frame() + info['finish_epoch'] = finish_epoch + + box_history_clip.append(list(new_state)) + + t += 1 + + if action == opts['stop_action']: + num_frame += 1 + num_step_history.append(t) + t = 0 + + toc = time.time() - tic + print('forward time (clip ' + str(clip_idx) + " - frame " + str(num_frame) + " - t " + str(t) + ") = " + + str(toc) + " s") + + if done: # if finish the clip + break + + tracking_scores_size = np.array(num_step_history).sum() + tracking_scores = np.full(tracking_scores_size, reward) # seems no discount factor whatsoever + + self.reward_list.extend(tracking_scores) + + clip_idx += 1 + + if info['finish_epoch']: + break + + print('generating reinforcement learning dataset finish') diff --git a/official/cv/ADNet/src/datasets/sl_dataset.py b/official/cv/ADNet/src/datasets/sl_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..b8634cce309c8280a5d89d20d17161aba69f1016 --- /dev/null +++ b/official/cv/ADNet/src/datasets/sl_dataset.py @@ -0,0 +1,132 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +# pytorch dataset for SL learning +# matlab code (line 26-33): +# https://github.com/hellbell/ADNet/blob/master/train/adnet_train_SL.m +# reference: +# https://github.com/amdegroot/ssd.pytorch/blob/master/data/voc0712.py + +import cv2 +import numpy as np +from src.datasets.get_train_dbs import get_train_dbs +from src.utils.get_video_infos import get_video_infos + + +class SLDataset: + def __init__(self, train_db, transform=None): + self.transform = transform + self.train_db = train_db + + def __getitem__(self, index): + index = index % len(self.train_db['img_path']) + im = cv2.imread(self.train_db['img_path'][index]) + bbox = self.train_db['bboxes'][index] + action_label = np.array(self.train_db['labels'][index], dtype=np.float32) + score_label = self.train_db['score_labels'][index] + vid_idx = self.train_db['vid_idx'][index] + + if self.transform is not None: + im, bbox, action_label, score_label = self.transform(im, bbox, action_label, score_label) + + return im, bbox, action_label, score_label, vid_idx + + def __len__(self): + return len(self.train_db['img_path']) + + ######################################################### + # ADDITIONAL FUNCTIONS + + def pull_image(self, index): + im = cv2.imread(self.train_db['img_path'][index]) + return im + + def pull_anno(self, index): + action_label = self.train_db['labels'][index] + score_label = self.train_db['score_labels'][index] + return action_label, score_label + + +def initialize_pos_neg_dataset(train_videos, opts, transform=None, multidomain=True): + """ + Return list of pos and list of neg dataset for each domain. + Args: + train_videos: + opts: + transform: + multidomain: + Returns: + datasets_pos: (list of SLDataset) List length: if multidomain, #videos (or domain). Else: 1 + datasets_neg: (list of SLDataset) List length: if multidomain, #videos (or domain). Else: 1 + """ + num_videos = len(train_videos['video_names']) + + datasets_pos = [] + datasets_neg = [] + + for vid_idx in range(num_videos): + train_db_pos = { + 'img_path': [], # list of string + 'bboxes': [], # list of ndarray left top coordinate [left top width height] + 'labels': [], # list of ndarray #action elements. One hot vector + 'score_labels': [], # list of scalar 0 (negative) or 1 (positive) + 'vid_idx': [] # list of int. Each video (or domain) index + } + train_db_neg = { + 'img_path': [], # list of string + 'bboxes': [], # list of ndarray left top coordinate [left top width height] + 'labels': [], # list of ndarray #action elements. One hot vector + 'score_labels': [], # list of scalar 0 (negative) or 1 (positive) + 'vid_idx': [] # list of int. 
Each video (or domain) index + } + + print("generating dataset from video " + str(vid_idx + 1) + "/" + str(num_videos) + + "(current total data (pos-neg): " + str(len(train_db_pos['labels'])) + + "-" + str(len(train_db_neg['labels'])) + ")") + + bench_name = train_videos['bench_names'][vid_idx] + video_name = train_videos['video_names'][vid_idx] + video_path = train_videos['video_paths'][vid_idx] + vid_info = get_video_infos(bench_name, video_path, video_name) + train_db_pos_, train_db_neg_ = get_train_dbs(vid_info, opts) + # separate for each bboxes sample + for sample_idx in range(len(train_db_pos_)): + train_db_pos['img_path'].extend(train_db_pos_[sample_idx]['img_path']) + train_db_pos['bboxes'].extend(train_db_pos_[sample_idx]['bboxes']) + train_db_pos['labels'].extend(train_db_pos_[sample_idx]['labels']) + train_db_pos['score_labels'].extend(train_db_pos_[sample_idx]['score_labels']) + train_db_pos['vid_idx'].extend(np.repeat(vid_idx, len(train_db_pos_[sample_idx]['img_path']))) + + print("Finish generating positive dataset... (current total data: " + str(len(train_db_pos['labels'])) + ")") + + for sample_idx in range(len(train_db_neg_)): + train_db_neg['img_path'].extend(train_db_neg_[sample_idx]['img_path']) + train_db_neg['bboxes'].extend(train_db_neg_[sample_idx]['bboxes']) + train_db_neg['labels'].extend(train_db_neg_[sample_idx]['labels']) + train_db_neg['score_labels'].extend(train_db_neg_[sample_idx]['score_labels']) + train_db_neg['vid_idx'].extend(np.repeat(vid_idx, len(train_db_neg_[sample_idx]['img_path']))) + + print("Finish generating negative dataset... (current total data: " + str(len(train_db_neg['labels'])) + ")") + + dataset_pos = SLDataset(train_db_pos, transform=transform) + dataset_neg = SLDataset(train_db_neg, transform=transform) + + if multidomain: + datasets_pos.append(dataset_pos) + datasets_neg.append(dataset_neg) + else: + datasets_pos.extend(dataset_pos) + datasets_neg.extend(dataset_neg) + + return datasets_pos, datasets_neg diff --git a/official/cv/ADNet/src/models/ADNet.py b/official/cv/ADNet/src/models/ADNet.py new file mode 100644 index 0000000000000000000000000000000000000000..7a1a7b8cb50b7683ddf9444fa600221043126847 --- /dev/null +++ b/official/cv/ADNet/src/models/ADNet.py @@ -0,0 +1,389 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +import os + +from src.utils.get_action_history_onehot import get_action_history_onehot +from src.models.vggm import vggm + +from mindspore import dtype as mstype +from mindspore import Tensor, Parameter +from mindspore import nn, ops +from mindspore.common.initializer import initializer +from mindspore import numpy as nps +from mindspore.train.serialization import load_checkpoint, load_param_into_net + +pretrained_settings = { + 'adnet': { + 'input_space': 'BGR', + 'input_size': [3, 112, 112], + 'input_range': [0, 255], + 'mean': [123.68, 116.779, 103.939], + 'std': [1, 1, 1], + 'num_classes': 11 + } +} + + +class ADNetDomainSpecific(nn.Cell): + """ + This module purpose is only for saving the state_dict's domain-specific layers of each domain. + Put this module to CPU + """ + def __init__(self, num_classes, num_history): + super(ADNetDomainSpecific, self).__init__() + action_dynamic_size = num_classes * num_history + self.fc6 = nn.Dense(512 + action_dynamic_size, num_classes) + self.fc7 = nn.Dense(512 + action_dynamic_size, 2) + + def load_weights(self, base_file, video_index, run_online=False): + """ + Load weights from file + Args: + base_file: (string) + video_index: (int) + """ + #/cache/weight/ADNet_SL_epoch29.ckpt + other, ext = os.path.splitext(base_file) + if ext == '.ckpt': + print('Loading ADNetDomainSpecific ' + str(video_index) + ' weights') + + if len(other.split('_')) > 3: + filename_ = other.split('_')[2] + '_' + other.split('_')[3] + else: + filename_ = other.split('_')[2] + if run_online == 'True': + filename_ = os.path.join('/cache/weight', 'domain_weights', filename_ + '_') + else: + filename_ = os.path.join('weights', 'domain_weights', filename_ + '_') + checkpoint = load_checkpoint(filename_ + str(video_index) + '.ckpt') + load_param_into_net(self, checkpoint) + print('Finished!') + else: + print('Sorry only .pth and .pkl files supported.') + + def load_weights_from_adnet(self, adnet_net): + """ + Load weights from ADNet. Use it after updating adnet to update the weights in this module + Args: + adnet_net: (ADNet) the updated ADNet whose fc6 and fc7 + """ + # parameters_dict() + adnet_state_dict = adnet_net.parameters_dict() + model_dict = self.parameters_dict() + + # 1. filter out unnecessary keys + pretrained_dict = {k: v for k, v in adnet_state_dict.items() if k in model_dict} + # 2. 
overwrite entries in the existing state dict + for key in pretrained_dict: + update = nn.ParameterUpdate(model_dict[key]) + update.phase = "update_param" + update(Tensor(pretrained_dict[key])) + + +class ADNet(nn.Cell): + + def __init__(self, base_network, opts, num_classes=11, phase='train', num_history=10): + super(ADNet, self).__init__() + + self.num_classes = num_classes + self.phase = phase + self.opts = opts + + self.base_network = base_network + self.fc4_5 = nn.SequentialCell([ + nn.Dense(18432, 512), + nn.ReLU(), + nn.Dropout(0.5), + nn.Dense(512, 512), # [3] + nn.ReLU(), + nn.Dropout(0.5) + ]) + + # -1 to differentiate between action '0' and haven't been explored + self.action_history = Parameter(nps.full((num_history,), -1)) + + self.action_dynamic_size = num_classes * num_history + self.action_dynamic = Parameter(nps.zeros((self.action_dynamic_size,))) + + self.fc6 = nn.Dense(512 + self.action_dynamic_size, self.num_classes) + self.fc7 = nn.Dense(512 + self.action_dynamic_size, 2) + self.ops_concat = ops.Concat(1) + self.ops_softmax = ops.Softmax() + self.expand = ops.ExpandDims() + # update_action_dynamic: history of action. We don't update the action_dynamic in SL learning. + def construct(self, x, action_d=None, update_action_dynamic=False): + """ + Args: + x: (Tensor) the input of network + action_dynamic: (Tensor) the previous state action dynamic. + If None, use the self.action_dynamic in this Module + update_action_dynamic: (bool) Whether to update the action_dynamic with the result. + We don't update the action_dynamic in SL learning. + """ + x = self.base_network(x) + x = x.view(x.shape[0], -1) + x = self.fc4_5(x) + + if action_d is None or action_d == -1: + ac_d = ops.ExpandDims()(self.action_dynamic, 0) + ac_d = nps.tile(ac_d, (x.shape[0], 1)) + x = self.ops_concat((x, ac_d)) + else: + x = self.ops_concat((x, action_d)) + fc6_out = self.fc6(x) + fc7_out = self.fc7(x) + + if self.phase == 'test': + fc6_out = self.ops_softmax(fc6_out) + fc7_out = self.ops_softmax(fc7_out) + + if update_action_dynamic: + selected_action = ops.Argmax(1, mstype.int32)(fc6_out) + self.action_history[1:] = self.action_history[0:-1] + self.action_history[0] = selected_action + + return fc6_out, fc7_out + + def load_domain_specific(self, adnet_domain_specific): + """ + Load existing domain_specific weight to this model (i.e. fc6 and fc7). Do it before updating this model to + update the weight to the specific domain + Args: + adnet_domain_specific: (ADNetDomainSpecific) the domain's ADNetDomainSpecific module. + """ + domain_specific_state_dict = adnet_domain_specific.parameters_dict() + model_dict = self.parameters_dict() + + # 1. filter out unnecessary keys + pretrained_dict = {k: v for k, v in domain_specific_state_dict.items() if k in model_dict} + # 2. overwrite entries in the existing state dict + for key in pretrained_dict: + update = nn.ParameterUpdate(model_dict[key]) + update.phase = "update_param" + update(Tensor(pretrained_dict[key].asnumpy())) + + + def load_weights(self, base_file, load_domain_specific=None): + """ + Args: + base_file: (string) checkpoint filename + load_domain_specific: (None or int) None if not loading. + Fill it with int of the video idx to load the specific domain weight + """ + _, ext = os.path.splitext(base_file) + if ext == '.ckpt': + print('Loading weights into state dict...') + + pretrained_dict = load_checkpoint(base_file) + + # load adnet + + model_dict = self.parameters_dict() + + # create new OrderedDict that does not contain `module.` + + # 1. 
filter out unnecessary keys + pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict} + # 2. overwrite entries in the existing state dict + for key in pretrained_dict: + update = nn.ParameterUpdate(model_dict[key]) + update.phase = "update_param" + update(Tensor(pretrained_dict[key].asnumpy())) + + print('Finished!') + else: + print('Sorry only .ckpt files supported.') + + def update_action_dynamic(self, action_history): + onehot_action = get_action_history_onehot(action_history, self.opts) + + self.action_dynamic.set_data(onehot_action) + return True + + def reset_action_dynamic(self): + self.action_dynamic.set_data(nps.zeros((self.action_dynamic_size,))) + return True + + def get_action_dynamic(self): + return self.action_dynamic + + def set_phase(self, phase): + self.phase = phase + + +def adnet(opts, base_network='vggm', trained_file=None, random_initialize_domain_specific=False, + multidomain=True, distributed=False, run_online='False'): + """ + Args: + base_network: (string) + trained_file: (None or string) saved filename + random_initialize_domain_specific: (bool) if there is trained file, whether to use the weight in the file (True) + or just random initialize (False). Won't matter if the trained_file is None (always False) + multidomain: (bool) whether to have separate weight for each video or not. Default True: separate + Returns: + adnet_model: (ADNet) + domain_nets: (list of ADNetDomainSpecific) length: #videos + """ + assert base_network in ['vggm'], "Base network variant is unavailable" + + num_classes = opts['num_actions'] + num_history = opts['num_action_history'] + + assert num_classes in [11], "num classes is not exist" + + settings = pretrained_settings['adnet'] + + if base_network == 'vggm': + base_network = vggm() # by default, load vggm's weights too + base_network = base_network.features[0:10] + + else: # change this part if adding more base network variant + base_network = vggm() + base_network = base_network.features[0:10] + + if trained_file: + assert num_classes == settings['num_classes'], \ + "num_classes should be {}, but is {}".format(settings['num_classes'], num_classes) + + print('Resuming training, loading {}...'.format(trained_file)) + + adnet_model = ADNet(base_network=base_network, opts=opts, num_classes=num_classes, num_history=num_history) + + adnet_model.load_weights(trained_file) + + adnet_model.input_space = settings['input_space'] + adnet_model.input_size = settings['input_size'] + adnet_model.input_range = settings['input_range'] + adnet_model.mean = settings['mean'] + adnet_model.std = settings['std'] + else: + adnet_model = ADNet(base_network=base_network, opts=opts, num_classes=num_classes) + + # initialize domain-specific network + domain_nets = [] + if multidomain: + num_videos = opts['num_videos'] + else: + num_videos = 1 + + for idx in range(num_videos): + domain_nets.append(ADNetDomainSpecific(num_classes=num_classes, num_history=num_history)) + + scal = Tensor([0.01], mstype.float32) + + if trained_file and not random_initialize_domain_specific: + domain_nets[idx].load_weights(trained_file, idx, run_online) + else: + if distributed: + domain_nets[idx].init_parameters_data(auto_parallel_mode=True) + else: + domain_nets[idx].init_parameters_data(auto_parallel_mode=False) + # fc 6 + domain_nets[idx].fc6.weight.set_data( + initializer('Normal', domain_nets[idx].fc6.weight.shape, mstype.float32)) + domain_nets[idx].fc6.weight.set_data( + domain_nets[idx].fc6.weight.data * 
scal.expand_as(domain_nets[idx].fc6.weight.data)) + domain_nets[idx].fc6.bias.set_data(nps.full(shape=domain_nets[idx].fc6.bias.shape, fill_value=0.)) + # fc 7 + domain_nets[idx].fc7.weight.set_data( + initializer('Normal', domain_nets[idx].fc7.weight.shape, mstype.float32)) + domain_nets[idx].fc7.weight.set_data( + domain_nets[idx].fc7.weight.data * scal.expand_as(domain_nets[idx].fc7.weight.data)) + domain_nets[idx].fc7.bias.set_data(nps.full(shape=domain_nets[idx].fc7.bias.shape, fill_value=0.)) + + return adnet_model, domain_nets + +class WithLossCell_ADNET(nn.Cell): + r""" + Cell with loss function. + + Wraps the network with loss function. This Cell accepts data and label as inputs and + the computed loss will be returned. + + Args: + backbone (Cell): The target network to wrap. + loss_fn (Cell): The loss function used to compute loss. + + Inputs: + - **data** (Tensor) - Tensor of shape :math:`(N, \ldots)`. + - **label** (Tensor) - Tensor of shape :math:`(N, \ldots)`. + + Outputs: + Tensor, a tensor means the loss value, the shape of which is usually :math:`()`. + + Raises: + TypeError: If dtype of `data` or `label` is neither float16 nor float32. + + Supported Platforms: + ``Ascend`` ``GPU`` ``CPU`` + """ + + def __init__(self, backbone, loss_fn, phase): + super(WithLossCell_ADNET, self).__init__(auto_prefix=False) + self._backbone = backbone + self._loss_fn = loss_fn + self.phase = phase + + def construct(self, data, label): + fc6_out, fc7_out = self._backbone(data) + if self.phase == 'score': + return self._loss_fn(fc7_out, label) + return self._loss_fn(fc6_out, label) + + @property + def backbone_network(self): + """ + Get the backbone network. + + Returns: + Cell, the backbone network. + """ + return self._backbone + + +class SoftmaxCrossEntropyExpand(nn.Cell): + ''' + used to train in distributed training + ''' + def __init__(self, sparse=False): + super(SoftmaxCrossEntropyExpand, self).__init__() + self.exp = ops.Exp() + self.sum = ops.ReduceSum(keep_dims=True) + self.onehot = ops.OneHot() + self.on_value = Tensor(1.0, mstype.float32) + self.off_value = Tensor(0.0, mstype.float32) + self.div = ops.RealDiv() + self.log = ops.Log() + self.sum_cross_entropy = ops.ReduceSum(keep_dims=False) + self.mul = ops.Mul() + self.mul2 = ops.Mul() + self.mean = ops.ReduceMean(keep_dims=False) + self.sparse = sparse + self.max = ops.ReduceMax(keep_dims=True) + self.sub = ops.Sub() + + def construct(self, logit, label): + logit_max = self.max(logit, -1) + exp = self.exp(self.sub(logit, logit_max)) + exp_sum = self.sum(exp, -1) + softmax_result = self.div(exp, exp_sum) + if self.sparse: + label = self.onehot(label, ops.shape(logit)[1], self.on_value, self.off_value) + softmax_result_log = self.log(softmax_result) + loss = self.sum_cross_entropy((self.mul(softmax_result_log, label)), -1) + loss = self.mul2(ops.scalar_to_array(-1.0), loss) + loss = self.mean(loss, -1) + + return loss diff --git a/official/cv/ADNet/src/models/CustomizedCell.py b/official/cv/ADNet/src/models/CustomizedCell.py new file mode 100644 index 0000000000000000000000000000000000000000..7022bd50e50f7929ff0feaceba1e70793beb1bbd --- /dev/null +++ b/official/cv/ADNet/src/models/CustomizedCell.py @@ -0,0 +1,78 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""cell define""" +from mindspore import nn +import mindspore.ops.operations as P +import mindspore.ops.functional as F +import mindspore.ops.composite as C +from mindspore.parallel._utils import _get_device_num, _get_parallel_mode, _get_gradients_mean +from mindspore.context import ParallelMode +from mindspore.nn.wrap.grad_reducer import DistributedGradReducer +from mindspore import ops +import mindspore + +class WithLossCell(nn.Cell): + """GenWithLossCell""" + def __init__(self, net, criterion, auto_prefix=True): + super(WithLossCell, self).__init__(auto_prefix=auto_prefix) + self.net = net + self.loss_fn = criterion + + def construct(self, patch, reward): + """adnet construct""" + fc6_out_, _ = self.net(patch, -1, False) + # loss + action = ops.Argmax(1, mindspore.dtype.int32)(fc6_out_) + log_prob = ops.Log()(fc6_out_[:, action]) + loss = self.loss_fn(log_prob, reward) + return loss + +class TrainOneStepCell(nn.Cell): + """define TrainOneStepCell""" + def __init__(self, net, optimizer, sens=1.0, auto_prefix=True): + + super(TrainOneStepCell, self).__init__(auto_prefix=auto_prefix) + self.net = net + self.net.set_grad() + self.net.add_flags(defer_inline=True) + + self.weights = optimizer.parameters + self.optimizer = optimizer + + self.grad = C.GradOperation(get_by_list=True, sens_param=True) + + self.sens = sens + self.reducer_flag = False + self.grad_reducer = F.identity + self.parallel_mode = _get_parallel_mode() + if self.parallel_mode in (ParallelMode.DATA_PARALLEL, + ParallelMode.HYBRID_PARALLEL): + self.reducer_flag = True + if self.reducer_flag: + mean = _get_gradients_mean() + degree = _get_device_num() + self.grad_reducer = DistributedGradReducer( + self.weights, mean, degree) + + def construct(self, patch, reward): + """construct""" + loss = self.net(patch, reward) + weights = self.weights + sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens) + grads = self.grad(self.net, weights)(patch, reward, sens) + if self.reducer_flag: + # apply grad reducer on grads + grads = self.grad_reducer(grads) + return F.depend(loss, self.optimizer(grads)) diff --git a/official/cv/ADNet/src/models/vggm.py b/official/cv/ADNet/src/models/vggm.py new file mode 100644 index 0000000000000000000000000000000000000000..32352991fd38cc97753cacab48dc28fa6046aa67 --- /dev/null +++ b/official/cv/ADNet/src/models/vggm.py @@ -0,0 +1,103 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +from __future__ import print_function, division, absolute_import + +from mindspore import nn, ops +from mindspore import load_checkpoint, load_param_into_net +# source: https://github.com/Cadene/pretrained-models.pytorch/blob/master/pretrainedmodels/models/vggm.py + +__all__ = ['vggm'] + +pretrained_settings = { + 'vggm': { + 'imagenet': { + 'url': 'http://data.lip6.fr/cadene/pretrainedmodels/vggm-786f2434.pth', + 'input_space': 'BGR', + 'input_size': [3, 221, 221], + 'input_range': [0, 255], + 'mean': [123.68, 116.779, 103.939], + 'std': [1, 1, 1], + 'num_classes': 1000 + } + } +} + +class VGGM(nn.Cell): + + def __init__(self, num_classes=1000): + super(VGGM, self).__init__() + self.num_classes = num_classes + self.ops_LRN = wrapper_LRN() + self.features = nn.SequentialCell([ + nn.Conv2d(3, 96, (7, 7), (2, 2)), # conv1 + nn.ReLU(), + self.ops_LRN, + nn.MaxPool2d((3, 3), (2, 2), 'valid'), + nn.Conv2d(96, 256, (5, 5), (2, 2), 'pad', 1), # conv2 + nn.ReLU(), + self.ops_LRN, + nn.MaxPool2d((3, 3), (2, 2), 'valid'), + nn.Conv2d(256, 512, (3, 3), (1, 1), 'pad', 1), # conv3 + nn.ReLU(), + nn.Conv2d(512, 512, (3, 3), (1, 1), 'pad', 1), # conv4 + nn.ReLU(), + nn.Conv2d(512, 512, (3, 3), (1, 1), 'pad', 1), # conv5 + nn.ReLU(), + nn.MaxPool2d((3, 3), (2, 2), 'valid') + ]) + self.classifier = nn.SequentialCell([ + nn.Dense(18432, 4096), + nn.ReLU(), + nn.Dropout(0.5), + nn.Dense(4096, 4096), + nn.ReLU(), + nn.Dropout(0.5), + nn.Dense(4096, num_classes) + ]) + + def construct(self, x): + x = self.features(x) + # x = x.(x.size(0), -1) + x = x.view(x.shape[0], -1) + x = self.classifier(x) + return x + + +def vggm(num_classes=1000, pretrained='imagenet'): + if pretrained: + settings = pretrained_settings['vggm'][pretrained] + assert num_classes == settings['num_classes'], \ + "num_classes should be {}, but is {}".format(settings['num_classes'], num_classes) + import os + model = VGGM(num_classes=num_classes) + load_param_into_net(model, (load_checkpoint(os.path.dirname(__file__)+'/vggm.ckpt'))) + + model.input_space = settings['input_space'] + model.input_size = settings['input_size'] + model.input_range = settings['input_range'] + model.mean = settings['mean'] + model.std = settings['std'] + else: + model = VGGM(num_classes=num_classes) + return model + + +class wrapper_LRN(nn.Cell): + def __init__(self, depth_=5, bias=2., alpha=5e-4, beta=0.75): + super(wrapper_LRN, self).__init__() + self.lrn = ops.LRN(depth_radius=depth_, bias=bias, alpha=alpha, beta=beta) + + def construct(self, x): + return self.lrn(x) diff --git a/official/cv/ADNet/src/options/general.py b/official/cv/ADNet/src/options/general.py new file mode 100644 index 0000000000000000000000000000000000000000..d45e7531ec4349fa5ed71a6878e4e263e7f6a7ec --- /dev/null +++ b/official/cv/ADNet/src/options/general.py @@ -0,0 +1,119 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +# source: https://github.com/hellbell/ADNet/blob/3a7955587b5d395401ebc94a5ab067759340680d/utils/init_params.m + +#parameter settings +show_visualization = 0 +record_video = 0 +GT_anno_interval = 1 + + +# ============================ +# NETWORK PARAMETERS +# ============================ +opts = { + 'imgSize': [112, 112, 3], + 'train_dbs': ['vot15', 'vot14', 'vot13'], + 'test_db': 'otb', + 'train': { + 'weightDecay': 0.0005, + 'momentum': 0.9, + 'learningRate': 10e-5, + 'conserveMemory': True, + 'gt_skip': 1, + 'rl_num_batches': 5, + 'RL_steps': 10 + }, + 'minibatch_size': 32, + 'numEpoch': 30, + 'numInnerEpoch': 3, + 'continueTrain': False, + 'samplePerFrame_large': 40, + 'samplePerFrame_small': 10, + 'inputSize': [112, 112, 3], + 'stopIou': 0.93, + 'meta': { + 'inputSize': [112, 112, 3] + }, + 'use_finetune': True, + 'scale_factor': 1.05, + + # test + 'finetune_iters': 20, + 'finetune_iters_online': 10, + 'finetune_interval': 30, + 'posThre_init': 0.7, + 'negThre_init': 0.3, + 'posThre_online': 0.7, + 'negThre_online': 0.5, + 'nPos_init': 200, + 'nNeg_init': 150, + 'nPos_online': 30, + 'nNeg_online': 15, + 'finetune_scale_factor': 3.0, + 'redet_scale_factor': 3.0, + 'finetune_trans': 0.10, + 'redet_samples': 256, + + 'successThre': 0.5, + 'failedThre': 0.5, + + 'nFrames_long': 100, # long-term period (in matlab code, for positive samples... while current implementation just with history for now...) + 'nFrames_short': 20, # short-term period (for negative samples) + + 'nPos_train': 150, + 'nNeg_train': 50, + 'posThre_train': 0.5, + 'negThre_train': 0.3, + + 'random_perturb': { + 'x': 0.15, + 'y': 0.15, + 'w': 0.03, + 'h': 0.03 + }, + + 'action_move': { + 'x': 0.03, + 'y': 0.03, + 'w': 0.03, + 'h': 0.03, + 'deltas': [ + [-1, 0, 0, 0], # left + [-2, 0, 0, 0], # left x2 + [+1, 0, 0, 0], # right + [+2, 0, 0, 0], # right x2 + [0, -1, 0, 0], # up + [0, -2, 0, 0], # up x2 + [0, +1, 0, 0], # down + [0, +2, 0, 0], # down x2 + [0, 0, 0, 0], # stop + [0, 0, -1, -1], # smaller + [0, 0, +1, +1] # bigger + ] + }, + + 'num_actions': 11, + 'stop_action': 8, + 'num_show_actions': 20, + 'num_action_step_max': 20, + 'num_action_history': 10, + + 'visualize': True, + 'printscreen': True, + + 'means': [104, 117, 123] # https://github.com/amdegroot/ssd.pytorch/blob/8dd38657a3b1df98df26cf18be9671647905c2a0/data/config.py + +} diff --git a/official/cv/ADNet/src/trainers/RL_tools.py b/official/cv/ADNet/src/trainers/RL_tools.py new file mode 100644 index 0000000000000000000000000000000000000000..20f4fa304b382cb667e05f6fcec36fafee489bc5 --- /dev/null +++ b/official/cv/ADNet/src/trainers/RL_tools.py @@ -0,0 +1,221 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +import numpy as np +import cv2 + +from src.utils.get_video_infos import get_video_infos +from src.utils.do_action import do_action +from src.utils.overlap_ratio import overlap_ratio +from src.utils.augmentations import CropRegion + +from mindspore import ops, nn +import mindspore + + +class TrackingPolicyLoss(nn.Cell): + def __init__(self): + super(TrackingPolicyLoss, self).__init__() + self.cast = ops.Cast() + # https://github.com/pytorch/examples/blob/master/reinforcement_learning/reinforce.py#L68 + + def construct(self, saved_log_probs, rewards): + rewards = self.cast(rewards, mindspore.dtype.float32) + saved_log_probs = self.cast(saved_log_probs, mindspore.dtype.float32) + policy_loss = ops.ReduceSum(False)(-saved_log_probs * rewards) + return policy_loss + + +# TrackingEnvironment for all of the videos in one epoch +# Number of steps can be set in opts['train']['RL_steps'] before initialize this environment +class TrackingEnvironment: + def __init__(self, train_videos, opts, transform, args): + self.videos = [] # list of clips dict + + self.opts = opts + self.transform = transform + self.args = args + + self.RL_steps = self.opts['train']['RL_steps'] # clip length + + video_names = train_videos['video_names'] + video_paths = train_videos['video_paths'] + bench_names = train_videos['bench_names'] + + vid_idxs = np.random.permutation(len(video_names)) + + for vid_idx in vid_idxs: + # dict consist of set of clips in ONE video + clips = { + 'img_path': [], + 'frame_start': [], + 'frame_end': [], + 'init_bbox': [], + 'end_bbox': [], + 'vid_idx': [], + } + # Load current training video info + video_name = video_names[vid_idx] + video_path = video_paths[vid_idx] + bench_name = bench_names[vid_idx] + + vid_info = get_video_infos(bench_name, video_path, video_name) + + if self.RL_steps is None: + self.RL_steps = len(vid_info['gt'])-1 + vid_clip_starts = [0] + vid_clip_ends = [len(vid_info['gt'])-1] + else: + vid_clip_starts = np.array(range(len(vid_info['gt']) - self.RL_steps)) + vid_clip_starts = np.random.permutation(vid_clip_starts) + vid_clip_ends = vid_clip_starts + self.RL_steps + + # number of clips in one video + num_train_clips = min(opts['train']['rl_num_batches'], len(vid_clip_starts)) + + print("num_train_clips of vid " + str(vid_idx) + ": ", str(num_train_clips)) + + for clipIdx in range(num_train_clips): + frameStart = vid_clip_starts[clipIdx] + frameEnd = vid_clip_ends[clipIdx] + + clips['img_path'].append(vid_info['img_files'][frameStart:frameEnd]) + clips['frame_start'].append(frameStart) + clips['frame_end'].append(frameEnd) + clips['init_bbox'].append(vid_info['gt'][frameStart]) + clips['end_bbox'].append(vid_info['gt'][frameEnd]) + clips['vid_idx'].append(vid_idx) + + if num_train_clips > 0: # small hack + self.videos.append(clips) + + self.clip_idx = -1 # hack for reset function + self.vid_idx = 0 + + self.state = None # current bbox + self.gt = None # end bbox + self.current_img = None # current image frame + self.current_patch = None # current patch (transformed) + self.current_img_idx = 0 + + self.reset() + + # return state, reward, done, info. Also update the curr_patch based on the new bounding box + # state: next bounding box + # reward: the reward + # done: True if finishing one clip. 
+ # info: a dictionary + def step(self, action): + info = { + 'finish_epoch': False + } + + # do action + self.state = do_action(self.state, self.opts, action, self.current_img.shape) + self.current_patch, _, _, _ = self.transform(self.current_img, self.state) + + if action == self.opts['stop_action']: + reward, done, finish_epoch = self.go_to_next_frame() + + info['finish_epoch'] = finish_epoch + + else: # just go to the next patch (still same frame/current_img) + reward = 0 + done = False + self.current_patch, _, _, _ = self.transform(self.current_img, self.state) + + return self.state, reward, done, info + + # reset environment to new clip. + # Return finish_epoch status: False if finish the epoch. True if still have clips remain + def reset(self): + while True: + self.clip_idx += 1 + + # if the clips in a video are finished... go to the next video + if self.clip_idx >= len(self.videos[self.vid_idx]['frame_start']): + self.vid_idx += 1 + self.clip_idx = 0 + if self.vid_idx >= len(self.videos): + self.vid_idx = 0 + # one epoch finish... need to reinitialize the class to use this again randomly + return True + + # initialize state, gt, current_img_idx, current_img, and current_patch with new clip + self.state = self.videos[self.vid_idx]['init_bbox'][self.clip_idx] + self.gt = self.videos[self.vid_idx]['end_bbox'][self.clip_idx] + + # frameStart = self.videos[self.vid_idx]['frame_start'][self.clip_idx] + self.current_img_idx = 1 # self.current_img_idx = frameStart + 1 + self.current_img = cv2.imread(self.videos[self.vid_idx]['img_path'][self.clip_idx][self.current_img_idx]) + self.current_patch, _, _, _ = self.transform(self.current_img, np.array(self.state)) + + if self.gt != '': # small hack + break + + return False + + def get_current_patch(self): + return self.current_patch + + def get_current_train_vid_idx(self): + return self.videos[self.vid_idx]['vid_idx'][0] + + def get_current_patch_unprocessed(self): + crop = CropRegion() + state_int = [int(x) for x in self.state] + current_patch_unprocessed, _, _, _ = crop(self.current_img, state_int) + return current_patch_unprocessed.astype(np.uint8) + + def get_state(self): + return self.state + + def get_current_img(self): + return self.current_img + + def go_to_next_frame(self): + self.current_img_idx += 1 + finish_epoch = False + + # if already in the end of a clip... 
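+        # (the clip-level reward is computed once below by reward_original: +1 when the final
+        # predicted box overlaps the clip's end_bbox (self.gt) with IoU > 0.7, and -1 otherwise)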
+ if self.current_img_idx >= len(self.videos[self.vid_idx]['img_path'][self.clip_idx]): + # calculate reward before reset + reward = reward_original(np.array(self.gt), np.array(self.state)) + + print("reward=" + str(reward)) + + # reset (reset state, gt, current_img_idx, current_img and current_img_patch) + finish_epoch = self.reset() # go to the next clip (or video) + + done = True # done means one clip is finished + + # just go to the next frame (means new patch and new image) + else: + reward = 0 + done = False + # note: reset already read the current_img and current_img_patch + self.current_img = cv2.imread(self.videos[self.vid_idx]['img_path'][self.clip_idx][self.current_img_idx]) + self.current_patch, _, _, _ = self.transform(self.current_img, self.state) + + return reward, done, finish_epoch + + +def reward_original(gt, box): + iou = overlap_ratio(gt, box) + if iou > 0.7: + reward = 1 + else: + reward = -1 + + return reward diff --git a/official/cv/ADNet/src/trainers/adnet_test.py b/official/cv/ADNet/src/trainers/adnet_test.py new file mode 100644 index 0000000000000000000000000000000000000000..389288093c28774b5ca38730c2a38ca0f3ae464d --- /dev/null +++ b/official/cv/ADNet/src/trainers/adnet_test.py @@ -0,0 +1,290 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +# ADNet/adnet_test.m +import os +from random import shuffle +import time + +import glob +import cv2 +import numpy as np + +from src.models.ADNet import WithLossCell_ADNET +from src.datasets.online_adaptation_dataset import OnlineAdaptationDataset, OnlineAdaptationDatasetStorage +from src.utils.augmentations import ADNet_Augmentation +from src.utils.do_action import do_action +from src.utils.display import display_result, draw_box +from src.utils.gen_samples import gen_samples +from src.utils.get_wrapper_utils import get_groundtruth, get_dataLoader + +from mindspore import dtype as mstype +from mindspore import Tensor, nn, ops +from mindspore.nn import TrainOneStepCell +from mindspore.communication.management import get_rank, get_group_size + + +def adnet_test(net, vid_path, opts, args): + print('Testing sequences in ' + str(vid_path) + '...') + t_sum = 0 + vid_info = {'gt': [], 'img_files': glob.glob(os.path.join(vid_path, 'img', '*.jpg')), 'nframes': 0} + vid_info['img_files'].sort(key=str.lower) + gt_path = os.path.join(vid_path, 'groundtruth_rect.txt') + gt = get_groundtruth(gt_path) + vid_info['gt'] = gt + if vid_info['gt'][-1] == '': # small hack + vid_info['gt'] = vid_info['gt'][:-1] + vid_info['nframes'] = min(len(vid_info['img_files']), len(vid_info['gt'])) + # catch the first box + curr_bbox = vid_info['gt'][0] + bboxes = np.zeros(np.array(vid_info['gt']).shape) # tracking result containers init containers + ntraining = 0 + # setup training + net_action, net_score = get_optim(net, ops.SparseSoftmaxCrossEntropyWithLogits(), opts) + + dataset_storage_pos = None + dataset_storage_neg = None + is_negative = False # is_negative = True if the tracking failed + target_score = 0 + all_iteration = 0 + t = 0 + + for idx in range(vid_info['nframes']): + frame_idx = idx + frame_path = vid_info['img_files'][idx] + t0_wholetracking = time.time() + frame = cv2.imread(frame_path) + # draw box or with display, then save + im_with_bb = display_result(frame, curr_bbox) if args.display_images else draw_box(frame, curr_bbox) + save_img(args, os.path.join(args.save_result_images, str(frame_idx) + '-' + str(t) + '.jpg'), im_with_bb) + curr_bbox_old = curr_bbox + cont_negatives = 0 + + if frame_idx > 0: + curr_score, cont_negatives = detection(net, opts, args, frame, curr_bbox, + cont_negatives, frame_idx, ntraining) + print('final curr_score: %.4f' % curr_score) + + # redetection when confidence < threshold 0.5. But when fc7 is already reliable. 
Else, just trust the ADNet prediction
+            if ntraining > args.believe_score_result:
+                if curr_score < 0.5:
+                    is_negative = True
+                    redetection(net, curr_bbox_old, opts, cont_negatives, frame, args, frame_idx)
+                else:
+                    is_negative = False
+            else:
+                is_negative = False
+            save_img(args, os.path.join(args.save_result_images, 'final-' + str(frame_idx) + '.jpg'), im_with_bb)
+
+        # record the curr_bbox result
+        bboxes[frame_idx] = curr_bbox
+
+        # create or update storage + set iteration_range for training
+        if frame_idx == 0:
+            dataset_storage_pos = OnlineAdaptationDatasetStorage(
+                initial_frame=frame, first_box=curr_bbox, opts=opts, args=args, positive=True)
+            # (thanks to small hack in adnet_test) the nNeg_online is also 0
+            dataset_storage_neg = OnlineAdaptationDatasetStorage(
+                initial_frame=frame, first_box=curr_bbox, opts=opts, args=args, positive=False)
+
+            iteration_range = range(opts['finetune_iters'])
+        else:
+            assert dataset_storage_pos is not None
+            # (thanks to small hack in adnet_test) the nNeg_online is also 0
+            assert dataset_storage_neg is not None
+            # generate new samples while still in the always-generate phase, or once tracking is confident
+            always_generate_samples = (ntraining < args.believe_score_result)
+
+            if always_generate_samples or (not is_negative or target_score > opts['successThre']):
+                dataset_storage_pos.add_frame_then_generate_samples(frame, curr_bbox)
+
+            iteration_range = range(opts['finetune_iters_online'])
+
+        # run online adaptation only every args.online_adaptation_every_I_frames frames; otherwise skip training
+        if frame_idx % args.online_adaptation_every_I_frames == 0:
+            ntraining += 1
+            # generate dataset just before training
+            dataset_pos = OnlineAdaptationDataset(dataset_storage_pos)
+            dataloader_pos = get_dataLoader(dataset_pos, opts, args, ["im", "bbox", "action_label", "score_label"])
+            batch_iterator_pos = iter(dataloader_pos)
+
+            # (thanks to small hack in adnet_test) the nNeg_online is also 0
+            dataset_neg = OnlineAdaptationDataset(dataset_storage_neg)
+            dataloader_neg = get_dataLoader(dataset_neg, opts, args, ["im", "bbox", "action_label", "score_label"])
+            batch_iterator_neg = iter(dataloader_neg)
+            # else:
+            #     dataset_neg = []
+
+            epoch_size_pos = len(dataset_pos) // opts['minibatch_size']
+            epoch_size_neg = len(dataset_neg) // opts['minibatch_size']
+            if args.distributed:
+                rank_id = get_rank()
+                rank_size = get_group_size()
+                epoch_size_pos = epoch_size_pos // rank_size
+                epoch_size_neg = epoch_size_neg // rank_size
+            epoch_size = epoch_size_pos + epoch_size_neg  # 1 epoch, how many iterations
+
+            which_dataset = list(np.full(epoch_size_pos, fill_value=1))
+            which_dataset.extend(np.zeros(epoch_size_neg, dtype=int))
+            shuffle(which_dataset)
+            print("1 epoch = " + str(epoch_size) + " iterations")
+            train(net, net_action, net_score, iteration_range,
+                  which_dataset, batch_iterator_pos, batch_iterator_neg, all_iteration,
+                  dataloader_pos, dataloader_neg)
+
+        t1_wholetracking = time.time()
+        t_sum += t1_wholetracking - t0_wholetracking
+        print('whole tracking time = %.4f sec.'
% (t1_wholetracking - t0_wholetracking)) + + # evaluate the precision + if not args.distributed or rank_id == 0: + bboxes = np.array(bboxes) + vid_info['gt'] = np.array(vid_info['gt']) + if args.run_online == 'True': + import moxing + np.save(args.save_result_npy + '-bboxes.npy', bboxes) + np.save(args.save_result_npy + '-ground_truth.npy', vid_info['gt']) + moxing.file.copy_parallel('/cache/result', args.train_url) + else: + np.save(args.save_result_npy + '-bboxes.npy', bboxes) + np.save(args.save_result_npy + '-ground_truth.npy', vid_info['gt']) + return bboxes, t_sum + + +def redetection(net, curr_bbox_old, opts, cont_negatives, frame, args, frame_idx): + print('redetection') + transform = ADNet_Augmentation(opts) + # redetection process + redet_samples = gen_samples('gaussian', curr_bbox_old, opts['redet_samples'], opts, + min(1.5, 0.6 * 1.15 ** cont_negatives), opts['redet_scale_factor']) + score_samples = [] + + for redet_sample in redet_samples: + temp_patch, _, _, _ = transform(frame, redet_sample, None, None) + temp_patch = Tensor(np.expand_dims(temp_patch, 0), mstype.float32).transpose(0, 3, 1, 2) + + # 1 batch input [1, curr_patch.shape] + _, fc7_out_temp = net.construct(temp_patch, -1, False) + score_samples.append(fc7_out_temp.asnumpy()[0][1]) + + score_samples = np.array(score_samples) + max_score_samples_idx = np.argmax(score_samples) + + # replace the curr_box with the samples with maximum score + curr_bbox = redet_samples[max_score_samples_idx] + + # update the final result image + im_with_bb = display_result(frame, curr_bbox) if args.display_images else draw_box(frame, curr_bbox) + + save_img(args, os.path.join(args.save_result_images, str(frame_idx) + '-redet.jpg'), im_with_bb) + + +def train(net, net_action, net_score, iteration_range, which_dataset, + batch_iterator_pos, batch_iterator_neg, all_iteration, + dataloader_pos, dataloader_neg): + net.set_phase('train') + + # training loop + for iteration in iteration_range: + all_iteration += 1 # use this for update the visualization + # load train data + if which_dataset[iteration % len(which_dataset)]: # if positive + try: + images, _, action_label, score_label = next(batch_iterator_pos) + except StopIteration: + batch_iterator_pos = iter(dataloader_pos) + images, _, action_label, score_label = next(batch_iterator_pos) + else: + try: + images, _, action_label, score_label = next(batch_iterator_neg) + except StopIteration: + batch_iterator_neg = iter(dataloader_neg) + images, _, action_label, score_label = next(batch_iterator_neg) + + images = images.transpose(0, 3, 1, 2) + # forward + t0 = time.time() + if which_dataset[iteration % len(which_dataset)]: # if positive + action_l = net_action(images, ops.Argmax(1, output_type=mstype.int32)(action_label)) + else: + action_l = Tensor([0]) + score_l = net_score(images, score_label) + loss = action_l + score_l + t1 = time.time() + + if all_iteration % 10 == 0: + print('Timer: %.4f sec.' 
% (t1 - t0)) + print('iter ' + repr(all_iteration) + ' || Loss: %.4f ||' % (loss.asnumpy()), end=' ') + + +def detection(net, opts, args, frame, curr_bbox, cont_negatives, frame_idx, ntraining): + net.set_phase('test') + transform = ADNet_Augmentation(opts) + t = 0 + while True: + curr_patch, curr_bbox, _, _ = transform(frame, curr_bbox, None, None) + curr_patch = Tensor(np.expand_dims(curr_patch, 0), mstype.float32).transpose(0, 3, 1, 2) + fc6_out, fc7_out = net.construct(curr_patch) + + curr_score = fc7_out.asnumpy()[0][1] + + if ntraining > args.believe_score_result: + if curr_score < opts['failedThre']: + cont_negatives += 1 + + action = np.argmax(fc6_out.asnumpy()) + + # do action + curr_bbox = do_action(curr_bbox, opts, action, frame.shape) + + # bound the curr_bbox size + if curr_bbox[2] < 10: + curr_bbox[0] = min(0, curr_bbox[0] + curr_bbox[2] / 2 - 10 / 2) + curr_bbox[2] = 10 + if curr_bbox[3] < 10: + curr_bbox[1] = min(0, curr_bbox[1] + curr_bbox[3] / 2 - 10 / 2) + curr_bbox[3] = 10 + + t += 1 + + # draw box or with display, then save + if args.display_images: + im_with_bb = display_result(frame, curr_bbox) # draw box and display + else: + im_with_bb = draw_box(frame, curr_bbox) + save_img(args, os.path.join(args.save_result_images, str(frame_idx) + '-' + str(t) + '.jpg'), im_with_bb) + + if action == opts['stop_action'] or t >= opts['num_action_step_max']: + break + return curr_score, cont_negatives + + +def save_img(args, filename, img): + if args.save_result_images: + cv2.imwrite(filename, img) + + +def get_optim(net, loss_fn, opts): + optimizer = nn.SGD([{'params': net.base_network.trainable_params(), 'lr': 0}, + {'params': net.fc4_5.trainable_params()}, + {'params': net.fc6.trainable_params()}, + {'params': net.fc7.trainable_params(), 'lr': 1e-3}], + learning_rate=1e-3, momentum=opts['train']['momentum'], + weight_decay=opts['train']['weightDecay']) + + net_action_with_criterion = WithLossCell_ADNET(net, loss_fn, 'action') + net_score_with_criterion = WithLossCell_ADNET(net, loss_fn, 'score') + net_action = TrainOneStepCell(net_action_with_criterion, optimizer) + net_score = TrainOneStepCell(net_score_with_criterion, optimizer) + return net_action, net_score diff --git a/official/cv/ADNet/src/trainers/adnet_train_rl.py b/official/cv/ADNet/src/trainers/adnet_train_rl.py new file mode 100644 index 0000000000000000000000000000000000000000..942a45ea658ddfd877dc5c1dea45bff20acd392b --- /dev/null +++ b/official/cv/ADNet/src/trainers/adnet_train_rl.py @@ -0,0 +1,112 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +# matlab code: https://github.com/hellbell/ADNet/blob/master/train/adnet_train_RL.m +# policy gradient in pytorch: https://medium.com/@ts1829/policy-gradient-reinforcement-learning-in-pytorch-df1383ea0baf +import os +import time +import copy + +import numpy as np + +from src.trainers.RL_tools import TrackingPolicyLoss +from src.datasets.rl_dataset import RLDataset +from src.models.CustomizedCell import WithLossCell, TrainOneStepCell +from src.utils.save_ckpt import save_ckpt +from src.utils.get_wrapper_utils import get_dataLoader + +from mindspore import nn, ops +from mindspore.communication.management import get_rank, get_group_size + + +def adnet_train_rl(net, domain_specific_nets, train_videos, opts, args): + if args.run_online == 'True': + save_path = '/cache/train_out' + else: + save_path = '' + if not os.path.exists(os.path.join(save_path, args.save_folder, args.save_domain_dir)): + os.makedirs(os.path.join(save_path, args.save_folder, args.save_domain_dir)) + + net.set_phase('test') + + optimizer = nn.SGD([{'params': net.base_network.trainable_params(), 'lr': 1e-4}, + {'params': net.fc4_5.trainable_params()}, + {'params': net.fc6.trainable_params()}, + {'params': net.fc7.trainable_params(), 'lr': 0}], + learning_rate=1e-3, momentum=opts['train']['momentum'], + weight_decay=opts['train']['weightDecay']) + criterion = TrackingPolicyLoss() + clip_idx_epoch = 0 + prev_net = copy.deepcopy(net) + dataset = RLDataset(prev_net, domain_specific_nets, train_videos, opts, args) + rlnet_with_criterion = WithLossCell(net, criterion) + net_rl = TrainOneStepCell(rlnet_with_criterion, optimizer) + for epoch in range(args.start_epoch, opts['numEpoch']): + if epoch != args.start_epoch: + dataset.reset(prev_net, domain_specific_nets, train_videos, opts, args) + data_loader = get_dataLoader(dataset, opts, args, + ["log_probs_list", "reward_list", "vid_idx_list", 'patch']) + # create batch iterator + batch_iterator = iter(data_loader) + + epoch_size = len(dataset) // opts['minibatch_size'] # 1 epoch, how many iterations + if args.distributed: + rank_id = get_rank() + rank_size = get_group_size() + epoch_size = epoch_size // rank_size + + for iteration in range(epoch_size): + # load train data + # action, action_prob, log_probs, reward, patch, action_dynamic, result_box = next(batch_iterator) + _, reward, vid_idx, patch = next(batch_iterator) + + # train + tic = time.time() + patch = patch.transpose(0, 3, 1, 2) + # find out the unique value in vid_idx + # separate the batch with each video idx + if args.multidomain: + vid_idx_unique = ops.Unique()(vid_idx)[0] + for i in range(len(vid_idx_unique)): + choice_list = (vid_idx_unique[i] == vid_idx).asnumpy().nonzero()[0].tolist() + if len(choice_list) == 1: + continue + tmp_patch = patch[choice_list] + tmp_reward = reward[choice_list] + net_rl(tmp_patch, tmp_reward) + # save the ADNetDomainSpecific back to their module + idx = np.asscalar(vid_idx_unique[i].asnumpy()) + domain_specific_nets[idx].load_weights_from_adnet(net) + else: + net_rl(patch, reward) + + toc = time.time() - tic + print('epoch ' + str(epoch) + ' - iteration ' + str(iteration) + ' - train time: ' + str(toc) + " s") + + if iteration % 1000 == 0: + if not args.distributed or rank_id == 0: + save_ckpt(net, domain_specific_nets, save_path, args, iteration, epoch, 1) + + clip_idx_epoch += 1 + + if not args.distributed or rank_id == 0: + save_ckpt(net, domain_specific_nets, save_path, args, iteration, epoch, 2) + + 
if not args.distributed or rank_id == 0: + save_ckpt(net, domain_specific_nets, save_path, args, iteration, epoch, 3) + + if args.run_online == 'True': + import moxing + moxing.file.copy_parallel('/cache/train_out/weights', args.train_url) + return net diff --git a/official/cv/ADNet/src/trainers/adnet_train_sl.py b/official/cv/ADNet/src/trainers/adnet_train_sl.py new file mode 100644 index 0000000000000000000000000000000000000000..302ccd01a81616677ad64098866eb70ecd6851f1 --- /dev/null +++ b/official/cv/ADNet/src/trainers/adnet_train_sl.py @@ -0,0 +1,214 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# matlab code: +# https://github.com/hellbell/ADNet/blob/master/train/adnet_train_SL.m +# reference: https://github.com/amdegroot/ssd.pytorch/blob/master/train.py + +import os +import time +from random import shuffle + +import numpy as np + +from src.models.ADNet import adnet, WithLossCell_ADNET +from src.utils.get_train_videos import get_train_videos +from src.datasets.sl_dataset import initialize_pos_neg_dataset +from src.utils.augmentations import ADNet_Augmentation +from src.utils.get_wrapper_utils import get_dataLoader + +from mindspore.communication.management import get_rank, get_group_size +from mindspore import nn, Tensor +from mindspore import ops +from mindspore import save_checkpoint +from mindspore.common.initializer import initializer +from mindspore import dtype as mstype +from mindspore.nn import TrainOneStepCell +import mindspore.numpy as nps + + +def adnet_train_sl(args, opts): + + train_videos = get_train_videos(opts, args) + opts['num_videos'] = len(train_videos['video_names']) + + net, domain_specific_nets = adnet(opts=opts, trained_file=args.resume, multidomain=args.multidomain, + distributed=args.distributed, run_online=args.run_online) + + optimizer = nn.SGD([{'params': net.base_network.trainable_params(), 'lr': 1e-4}, + {'params': net.fc4_5.trainable_params()}, + {'params': net.fc6.trainable_params()}, + {'params': net.fc7.trainable_params()}], + learning_rate=1e-3, + momentum=opts['train']['momentum'], weight_decay=opts['train']['weightDecay']) + net.set_train() + + if not args.resume: + print('Initializing weights...') + scal = Tensor([0.01], mstype.float32) + init_net(net, scal, args) + + criterion = ops.SparseSoftmaxCrossEntropyWithLogits() + net_action_with_criterion = WithLossCell_ADNET(net, criterion, 'action') + net_score_with_criterion = WithLossCell_ADNET(net, criterion, 'score') + net_action = TrainOneStepCell(net_action_with_criterion, optimizer) + net_score = TrainOneStepCell(net_score_with_criterion, optimizer) + print('generating Supervised Learning dataset..') + + datasets_pos, datasets_neg = initialize_pos_neg_dataset(train_videos, opts, transform=ADNet_Augmentation(opts)) + number_domain = opts['num_videos'] + + # calculating number of data + len_dataset_pos = 0 + len_dataset_neg = 0 + for dataset_pos in datasets_pos: + 
len_dataset_pos += len(dataset_pos) + for dataset_neg in datasets_neg: + len_dataset_neg += len(dataset_neg) + + epoch_size_pos = len_dataset_pos // opts['minibatch_size'] + epoch_size_neg = len_dataset_neg // opts['minibatch_size'] + if args.distributed: + rank_id = get_rank() + rank_size = get_group_size() + epoch_size_pos = epoch_size_pos // rank_size + epoch_size_neg = epoch_size_neg // rank_size + else: + rank_id = 0 + epoch_size = epoch_size_pos + epoch_size_neg # 1 epoch, how many iterations + + print("1 epoch = " + str(epoch_size) + " iterations") + + max_iter = opts['numEpoch'] * epoch_size + print("maximum iteration = " + str(max_iter)) + batch_iterators_pos, batch_iterators_neg = [], [] + dataloder_pos, dataloder_neg = get_dataLoader((datasets_pos, datasets_neg), opts, args, + ["im", "bbox", "action_label", "score_label", "vid_idx"]) + for data_pos in dataloder_pos: + batch_iterators_pos.append(iter(data_pos)) + for data_neg in dataloder_neg: + batch_iterators_neg.append(iter(data_neg)) + print('initial dataloader finished') + epoch = args.start_epoch + if epoch != 0 and args.start_iter == 0: + start_iter = epoch * epoch_size + else: + start_iter = args.start_iter + + which_dataset = list(np.full(epoch_size_pos, fill_value=1)) + which_dataset.extend(np.zeros(epoch_size_neg, dtype=int)) + shuffle(which_dataset) + + which_domain = np.random.permutation(number_domain) + + # training loop + print('start training') + for iteration in range(start_iter, max_iter): + if args.multidomain: + curr_domain = which_domain[iteration % len(which_domain)] + else: + curr_domain = 0 + # if new epoch (not including the very first iteration) + if (iteration != start_iter) and (iteration % epoch_size == 0): + epoch += 1 + shuffle(which_dataset) + np.random.shuffle(which_domain) + + if rank_id == 0: + print('Saving state, epoch:', epoch) + save_checkpoint(net, os.path.join(args.save_path, args.save_folder, args.save_file) + + 'epoch' + repr(epoch) + '.ckpt') + + # save domain_specific + + for curr_domain, domain_specific_net in enumerate(domain_specific_nets): + save_checkpoint(domain_specific_net, + os.path.join(args.save_path, args.save_folder, args.save_domain_dir, + 'epoch' + repr(epoch) + '_' + str(curr_domain) + '.ckpt')) + train(net, domain_specific_nets, curr_domain, which_dataset, + batch_iterators_pos, batch_iterators_neg, dataloder_pos, dataloder_neg, iteration, + net_action, net_score) + # final save + if rank_id == 0: + save_checkpoint(net, os.path.join(args.save_path, args.save_folder, args.save_file) + 'final.ckpt') + + for curr_domain, domain_specific_net in enumerate(domain_specific_nets): + save_checkpoint(domain_specific_net, + os.path.join(args.save_path, args.save_folder, args.save_domain_dir, + 'final' + '_' + str(curr_domain) + '.ckpt')) + if args.run_online == 'True': + import moxing as mox + mox.file.copy_parallel('/cache/train_out/weights', args.train_url) + return net, domain_specific_nets, train_videos + + +def train(net, domain_specific_nets, curr_domain, which_dataset, batch_iterators_pos, batch_iterators_neg, + dataloder_pos, dataloder_neg, iteration, net_action, net_score): + net.load_domain_specific(domain_specific_nets[curr_domain]) + # load train data + flag_pos = which_dataset[iteration % len(which_dataset)] + if flag_pos: # if positive + try: + images, _, action_label, score_label, _ = next(batch_iterators_pos[curr_domain]) + except StopIteration: + batch_iterators_pos[curr_domain] = iter(dataloder_pos[curr_domain]) + images, _, action_label, score_label, _ = 
next(batch_iterators_pos[curr_domain]) + else: + try: + images, _, action_label, score_label, _ = next(batch_iterators_neg[curr_domain]) + except StopIteration: + batch_iterators_neg[curr_domain] = iter(dataloder_neg[curr_domain]) + images, _, action_label, score_label, _ = next(batch_iterators_neg[curr_domain]) + images = Tensor(images).transpose(0, 3, 1, 2) + action_label = Tensor(action_label, dtype=mstype.float32) + score_label = Tensor(score_label, dtype=mstype.int32) + if flag_pos: + action_l = net_action(images, ops.Argmax(1, output_type=mstype.int32)(action_label)) + else: + action_l = Tensor([0]) + t0 = time.time() + # load ADNetDomainSpecific with video index + score_l = net_score(images, score_label) + loss = action_l + score_l + + domain_specific_nets[curr_domain].load_weights_from_adnet(net) + + t1 = time.time() + + if iteration % 10 == 0: + print('Timer: %.4f sec.' % (t1 - t0)) + print('iter ' + repr(iteration) + ' || Loss: %.4f ||' % (loss.asnumpy()), end=' ') + + +def init_net(net, scal, args): + if args.distributed: + net.init_parameters_data(auto_parallel_mode=True) + else: + net.init_parameters_data(auto_parallel_mode=False) + # fc 4 + net.fc4_5[0].weight.set_data(initializer('Normal', net.fc4_5[0].weight.shape, mstype.float32)) + net.fc4_5[0].weight.data.set_data(net.fc4_5[0].weight.data * scal.expand_as(net.fc4_5[0].weight.data)) + net.fc4_5[0].bias.set_data(nps.full(shape=net.fc4_5[0].bias.shape, fill_value=0.1)) + # fc 5 + net.fc4_5[3].weight.set_data(initializer('Normal', net.fc4_5[3].weight.shape, mstype.float32)) + net.fc4_5[3].weight.set_data(net.fc4_5[3].weight.data * scal.expand_as(net.fc4_5[3].weight.data)) + net.fc4_5[3].bias.set_data(nps.full(shape=net.fc4_5[3].bias.shape, fill_value=0.1)) + # fc 6 + net.fc6.weight.set_data(initializer('Normal', net.fc6.weight.shape, mstype.float32)) + net.fc6.weight.set_data(net.fc6.weight.data * scal.expand_as(net.fc6.weight.data)) + net.fc6.bias.set_data(nps.full(shape=net.fc6.bias.shape, fill_value=0)) + # fc 7 + net.fc7.weight.set_data(initializer('Normal', net.fc7.weight.shape, mstype.float32)) + net.fc7.weight.set_data(net.fc7.weight.data * scal.expand_as(net.fc7.weight.data)) + net.fc7.bias.set_data(nps.full(shape=net.fc7.bias.shape, fill_value=0)) diff --git a/official/cv/ADNet/src/utils/augmentations.py b/official/cv/ADNet/src/utils/augmentations.py new file mode 100644 index 0000000000000000000000000000000000000000..c3d851a1dcc8577412b9a527562c32560864696c --- /dev/null +++ b/official/cv/ADNet/src/utils/augmentations.py @@ -0,0 +1,131 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +# matlab code: +# https://github.com/hellbell/ADNet/blob/3a7955587b5d395401ebc94a5ab067759340680d/utils/get_extract_regions.m +# other reference: https://github.com/amdegroot/ssd.pytorch/blob/master/utils/augmentations.py + +import numpy as np +import cv2 + + +class ToTensor: + def __call__(self, cvimage, box=None, action_label=None, conf_label=None): + return cvimage.astype(np.float32), box, action_label, conf_label + + +class SubtractMeans: + def __init__(self, mean): + self.mean = np.array(mean, dtype=np.float32) + + def __call__(self, image, box=None, action_label=None, conf_label=None): + image = image.astype(np.float32) + image -= self.mean + return image.astype(np.float32), box, action_label, conf_label + + +class CropRegion: + def __call__(self, image, box, action_label=None, conf_label=None): + image = np.array(image) + box = np.array(box) + if box is not None: + center = box[0:2] + 0.5 * box[2:4] + wh = box[2:4] * 1.4 # multiplication = 1.4 + box_lefttop = center - 0.5 * wh + box_rightbottom = center + 0.5 * wh + box_ = [ + max(0, box_lefttop[0]), + max(0, box_lefttop[1]), + min(box_rightbottom[0], image.shape[1]), + min(box_rightbottom[1], image.shape[0]) + ] + + im = image[int(box_[1]):int(box_[3]), int(box_[0]):int(box_[2]), :] + else: + im = image[:, :, :] + + return im.astype(np.float32), box, action_label, conf_label + + +# crop "multiplication" times of the box width and height +class CropRegion_withContext: + def __init__(self, multiplication=None): + if multiplication is None: + multiplication = 1.4 # same with default CropRegion + assert multiplication >= 1, "multiplication should more than 1 so the object itself is not cropped" + self.multiplication = multiplication + + def __call__(self, image, box, action_label=None, conf_label=None): + image = np.array(image) + box = np.array(box) + if box is not None: + center = box[0:2] + 0.5 * box[2:4] + wh = box[2:4] * self.multiplication + box_lefttop = center - 0.5 * wh + box_rightbottom = center + 0.5 * wh + box_ = [ + max(0, box_lefttop[0]), + max(0, box_lefttop[1]), + min(box_rightbottom[0], image.shape[1]), + min(box_rightbottom[1], image.shape[0]) + ] + + im = image[int(box_[1]):int(box_[3]), int(box_[0]):int(box_[2]), :] + else: + im = image[:, :, :] + + return im.astype(np.float32), box, action_label, conf_label + + +class ResizeImage: + def __init__(self, inputSize): + self.inputSize = inputSize # network's input size (which is the output size of this function) + + def __call__(self, image, box, action_label=None, conf_label=None): + im = cv2.resize(image, dsize=tuple(self.inputSize[:2])) + return im.astype(np.float32), box, action_label, conf_label + + +class Compose(): + """Composes several augmentations together. + Args: + transforms (List[Transform]): list of transforms to compose. 
+    Example:
+        # >>> augmentations.Compose([
+        # >>>     transforms.CenterCrop(10),
+        # >>>     transforms.ToTensor(),
+        # >>> ])
+    """
+
+    def __init__(self, transforms):
+        self.transforms = transforms
+
+    def __call__(self, img, box=None, action_label=None, conf_label=None):
+        for t in self.transforms:
+            img, box, action_label, conf_label = t(img, box, action_label, conf_label)
+        return img, box, action_label, conf_label
+
+
+class ADNet_Augmentation:
+    def __init__(self, opts):
+        self.augment = Compose([
+            SubtractMeans(opts['means']),
+            CropRegion(),
+            ResizeImage(opts['inputSize']),
+            # ToTensor here only casts to float32; the caller converts to a MindSpore Tensor later
+            ToTensor()
+        ])
+
+    def __call__(self, img, box, action_label=None, conf_label=None):
+        return self.augment(img, box, action_label, conf_label)
diff --git a/official/cv/ADNet/src/utils/display.py b/official/cv/ADNet/src/utils/display.py
new file mode 100644
index 0000000000000000000000000000000000000000..668f150b1abb52530762703d84b837ef7bfa26aa
--- /dev/null
+++ b/official/cv/ADNet/src/utils/display.py
@@ -0,0 +1,30 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+import cv2
+
+
+def draw_box(image, box):
+    im_with_bb = image.copy()
+    cv2.rectangle(im_with_bb, (int(box[0]), int(box[1])), (int(box[0] + box[2]), int(box[1] + box[3])), (0, 0, 255))
+    return im_with_bb
+
+
+def display_result(image, box):
+    im_with_bb = draw_box(image, box)
+    cv2.imshow("result", im_with_bb)
+    cv2.waitKey(1)
+
+    return im_with_bb
diff --git a/official/cv/ADNet/src/utils/do_action.py b/official/cv/ADNet/src/utils/do_action.py
new file mode 100644
index 0000000000000000000000000000000000000000..0deda45b4198481e5651eed91a682068abdec048
--- /dev/null
+++ b/official/cv/ADNet/src/utils/do_action.py
@@ -0,0 +1,53 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================ +import numpy as np + +def do_action(bbox, opts, act, imSize): + m = opts['action_move'] + + # action + bbox[0] = bbox[0] + 0.5 * bbox[2] + bbox[1] = bbox[1] + 0.5 * bbox[3] + + deltas = [m['x'] * bbox[2], + m['y'] * bbox[3], + m['w'] * bbox[2], + m['h'] * bbox[3]] + + deltas = np.maximum(deltas, 1) + + ar = bbox[2]/bbox[3] + + if bbox[2] > bbox[3]: + deltas[3] = deltas[2] / ar + + else: + deltas[2] = deltas[3] * ar + + action_delta = np.multiply(np.array(m['deltas'])[act, :], deltas) + bbox_next = bbox + action_delta + bbox_next[0] = bbox_next[0] - 0.5 * bbox_next[2] + bbox_next[1] = bbox_next[1] - 0.5 * bbox_next[3] + bbox_next[0] = np.maximum(bbox_next[0], 1) + bbox_next[0] = np.minimum(bbox_next[0], imSize[1] - bbox_next[2]) + bbox_next[1] = np.maximum(bbox_next[1], 1) + bbox_next[1] = np.minimum(bbox_next[1], imSize[0] - bbox_next[3]) + bbox_next[2] = np.maximum(5, np.minimum(imSize[1], bbox_next[2])) + bbox_next[3] = np.maximum(5, np.minimum(imSize[0], bbox_next[3])) + + bbox[0] = bbox[0] - 0.5 * bbox[2] + bbox[1] = bbox[1] - 0.5 * bbox[3] + + return bbox_next diff --git a/official/cv/ADNet/src/utils/draw_box_from_npy.py b/official/cv/ADNet/src/utils/draw_box_from_npy.py new file mode 100644 index 0000000000000000000000000000000000000000..8bd5557c84c7ac8ddb123ae79616b93dc16cdb7f --- /dev/null +++ b/official/cv/ADNet/src/utils/draw_box_from_npy.py @@ -0,0 +1,38 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +import os + +import numpy as np +import cv2 + +from src.utils.display import draw_box + + +def draw_box_from_npy(video_path, npy_file, save_path): + if not os.path.exists(save_path): + os.mkdir(save_path) + + bboxes = np.load(npy_file) + + frames_files = os.listdir(video_path) + frames_files.sort(key=str.lower) + + for frame_idx, frame_file in enumerate(frames_files): + frame = cv2.imread(os.path.join(video_path, frame_file)) + curr_bbox = bboxes[frame_idx] + im_with_bb = draw_box(frame, curr_bbox) + + filename = os.path.join(save_path, str(frame_idx) + '.jpg') + cv2.imwrite(filename, im_with_bb) diff --git a/official/cv/ADNet/src/utils/gen_action_labels.py b/official/cv/ADNet/src/utils/gen_action_labels.py new file mode 100644 index 0000000000000000000000000000000000000000..1f6539e5435cb4e54593c4c0efc490deed4c471a --- /dev/null +++ b/official/cv/ADNet/src/utils/gen_action_labels.py @@ -0,0 +1,68 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# generate action labels for training the network +# matlab code: +# https://github.com/hellbell/ADNet/blob/master/utils/gen_action_labels.m + +import numpy as np + +from src.utils.overlap_ratio import overlap_ratio + + +def gen_action_labels(num_actions, opts, bb_samples, gt_bbox): + num_samples = len(bb_samples) + + action_labels = np.zeros([num_actions, num_samples]) + m = opts['action_move'] + + for j in range(len(bb_samples)): + bbox = bb_samples[j, :] + + bbox[0] = bbox[0] + 0.5*bbox[2] + bbox[1] = bbox[1] + 0.5*bbox[3] + + deltas = [m['x'] * bbox[2], m['y'] * bbox[3], m['w'] * bbox[2], m['h'] * bbox[3]] + ar = bbox[2]/bbox[3] + if bbox[2] > bbox[3]: + deltas[3] = deltas[2] / ar + else: + deltas[2] = deltas[3] * ar + + deltas = np.tile(deltas, (num_actions, 1)) + action_deltas = np.multiply(m['deltas'], deltas) + action_boxes = np.tile(bbox, (num_actions, 1)) + action_boxes = action_boxes + action_deltas + action_boxes[:, 0] = action_boxes[:, 0] - 0.5 * action_boxes[:, 2] + action_boxes[:, 1] = action_boxes[:, 1] - 0.5 * action_boxes[:, 3] + + overs = overlap_ratio(action_boxes, np.tile(gt_bbox, (num_actions, 1))) + max_action = np.argmax(overs[:-2]) # translation overlap + max_value = overs[max_action] + + if overs[opts['stop_action']] > opts['stopIou']: + max_action = opts['stop_action'] + + if max_value == overs[opts['stop_action']]: + max_action = np.argmax(overs[:]) # (trans + scale) action + + action = np.zeros(num_actions) + action[max_action] = 1 + action_labels[:, j] = action + + # return bbox back + bbox[0] = bbox[0] - 0.5 * bbox[2] + bbox[1] = bbox[1] - 0.5 * bbox[3] + + return action_labels # in real matlab code, they also return overs diff --git a/official/cv/ADNet/src/utils/gen_samples.py b/official/cv/ADNet/src/utils/gen_samples.py new file mode 100644 index 0000000000000000000000000000000000000000..d30ab0485b4d09807d80e5b759e363a553b95aef --- /dev/null +++ b/official/cv/ADNet/src/utils/gen_samples.py @@ -0,0 +1,98 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Generate sample bounding boxes. 
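+# Example call (hypothetical numbers): gen_samples('gaussian', [50, 60, 100, 80], 10, opts, 0.1, 1.0)
+# draws 10 candidate boxes around the given [left, top, width, height] box and returns them as an
+# (n, 4) float32 array in the same format; the 'uniform' and 'whole' modes are used for negative samples.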
+# matlab code: +# https://github.com/hellbell/ADNet/blob/master/utils/gen_samples.m +import numpy as np + +from src.utils.my_math import normal_round + + +def gen_samples(dst_type, bb, n, opts, trans_f, scale_f): + # type => sampling method + # 'gaussian' generate samples from a Gaussian distribution centered at bb + # -> positive samples, target candidates + # 'uniform' generate samples from a uniform distribution around bb (same aspect ratio) + # -> negative samples + # 'uniform_aspect' generate samples from a uniform distribution around bb with varying aspect ratios + # -> training samples for bbox regression + # 'whole' generate samples from the whole image + # -> negative samples at the initial frame + assert dst_type in ['gaussian', 'uniform', 'uniform_aspect', 'whole'], "type of sampling method is unavailable" + + h = opts['imgSize'][0] + w = opts['imgSize'][1] + + # [center_x center_y width height] + sample = [bb[0] + bb[2] / 2, bb[1] + bb[3] / 2, bb[2], bb[3]] + samples = np.tile(sample, (n, 1)) + if dst_type == 'gaussian': + samples[:, 0:2] = samples[:, 0:2] + trans_f * normal_round(np.mean(bb[2:4])) * \ + np.maximum(-1, np.minimum(1, 0.5 * np.random.randn(n, 2))) + samples[:, 2:4] = np.multiply(samples[:, 2:4], + np.power(opts['scale_factor'], scale_f * + np.maximum(-1, np.minimum(1, 0.5 * np.random.randn(n, 1))))) + elif dst_type == 'uniform': + samples[:, 0:2] = samples[:, 0:2] + trans_f * normal_round(np.mean(bb[2:4])) * (np.random.rand(n, 2) * 2 - 1) + samples[:, 2:4] = np.multiply(samples[:, 2:4], + np.power(opts['scale_factor'], scale_f * (np.random.rand(n, 1) * 2 - 1))) + elif dst_type == 'uniform_aspect': + samples[:, 0:2] = samples[:, 0:2] + trans_f * np.multiply(bb[2:4], np.random.rand(n, 2) * 2 - 1) + samples[:, 2:4] = np.multiply(samples[:, 2:4], np.power(opts['scale_factor'], np.random.rand(n, 2) * 4 - 2)) + samples[:, 2:4] = np.multiply(samples[:, 2:4], np.power(opts['scale_factor'], scale_f * np.random.rand(n, 1))) + else: # elif type == 'whole' + # TODO: I am not very sure if this is correct or not... + range_ = np.array(normal_round([bb[2] / 2, bb[3] / 2, w - bb[2] / 2, h - bb[3] / 2])).astype(int) + stride = np.array(normal_round([bb[2] / 5, bb[3] / 5])).astype(int) + dx, dy, ds = np.meshgrid(range(range_[0], range_[2] + stride[0], stride[0]), + range(range_[1], range_[3] + stride[1], stride[1]), + range(-5, 6)) + windows = [dx, dy, bb[2] * np.power(opts['scale_factor'], ds), bb[3] * np.power(opts['scale_factor'], ds)] + + samples = [] + while len(samples) < n: + # windows[0] = x-axis + # windows[1] = y-axis + # windows[2] = w + # windows[3] = h + # random to get x, y, w, h. Each has 34 * 51 * 11 choices (in grid). 
Random the grid coordinate + random_idx = [np.random.randint(1, np.array(windows[0]).shape[0], 4), + np.random.randint(1, np.array(windows[0]).shape[1], 4), + np.random.randint(1, np.array(windows[0]).shape[2], 4)] + sample = [windows[0][random_idx[0][0]][random_idx[1][0]][random_idx[2][0]], + windows[1][random_idx[0][1]][random_idx[1][1]][random_idx[2][1]], + windows[2][random_idx[0][2]][random_idx[1][2]][random_idx[2][2]], + windows[3][random_idx[0][3]][random_idx[1][3]][random_idx[2][3]]] + samples.append(sample) + samples = np.array(samples) + + # bound the width and height + samples[:, 2] = np.maximum(10, np.minimum(w - 10, samples[:, 2])) + samples[:, 3] = np.maximum(10, np.minimum(h - 10, samples[:, 3])) + + # [left top width height] + bb_samples = np.array([samples[:, 0] - samples[:, 2] / 2, + samples[:, 1] - samples[:, 3] / 2, + samples[:, 2], + samples[:, 3]]).transpose() + + bb_samples[:, 0] = np.maximum(1 - bb_samples[:, 2], + np.minimum(w - bb_samples[:, 2] / 2, bb_samples[:, 0])) + bb_samples[:, 1] = np.maximum(1 - bb_samples[:, 3], + np.minimum(h - bb_samples[:, 3] / 2, bb_samples[:, 1])) + bb_samples = normal_round(bb_samples) + bb_samples = bb_samples.astype(np.float32) + + return bb_samples diff --git a/official/cv/ADNet/src/utils/get_action_history_onehot.py b/official/cv/ADNet/src/utils/get_action_history_onehot.py new file mode 100644 index 0000000000000000000000000000000000000000..494935d54029e425636bb895b354d05dfd9bf75e --- /dev/null +++ b/official/cv/ADNet/src/utils/get_action_history_onehot.py @@ -0,0 +1,25 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# returns action history as one-hot form +# https://github.com/hellbell/ADNet/blob/3a7955587b5d395401ebc94a5ab067759340680d/utils/get_action_history_onehot.m +import mindspore.numpy as nps + +def get_action_history_onehot(action_history, opts): + onehot = nps.zeros((opts['num_actions'] * len(action_history),)) + for i in range(len(action_history)): + start_idx = i * opts['num_actions'] + if action_history[i] >= 0 and action_history[i] < opts['num_actions']: + onehot[start_idx + action_history[i]] = 1. + return onehot diff --git a/official/cv/ADNet/src/utils/get_benchmark_info.py b/official/cv/ADNet/src/utils/get_benchmark_info.py new file mode 100644 index 0000000000000000000000000000000000000000..58b859965991ee0542f661d6eb9f6fbc73a68de9 --- /dev/null +++ b/official/cv/ADNet/src/utils/get_benchmark_info.py @@ -0,0 +1,32 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# matlab code: +# https://github.com/hellbell/ADNet/blob/master/utils/get_benchmark_info.m + +import os + +ROOT_PATH = os.path.dirname(__file__) + + +def get_benchmark_info(bench_name=None): + if bench_name is None: + bench_name = 'otb-vot15' + + bench_path = os.path.join(ROOT_PATH, '../utils/videolist', bench_name + '.txt') + bench_file = open(bench_path, "r") + video_names = bench_file.read().split('\n') + bench_file.close() + + return video_names diff --git a/official/cv/ADNet/src/utils/get_benchmark_path.py b/official/cv/ADNet/src/utils/get_benchmark_path.py new file mode 100644 index 0000000000000000000000000000000000000000..76c510989e29cef3c1b2cb30cc2d6bbd198722f6 --- /dev/null +++ b/official/cv/ADNet/src/utils/get_benchmark_path.py @@ -0,0 +1,30 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# matlab source: +# https://github.com/hellbell/ADNet/blob/master/utils/get_benchmark_path.m +import os +import glob + + +def get_benchmark_path(bench_name, args): + assert bench_name in ['vot15', 'vot14', 'vot13'] + if bench_name == 'vot15': + video_path = glob.glob(os.path.join(args.dataset_path, '*15'))[0] + elif bench_name == 'vot14': + video_path = glob.glob(os.path.join(args.dataset_path, '*14'))[0] + else: # elif bench_name == 'vot13' + video_path = glob.glob(os.path.join(args.dataset_path, '*13'))[0] + + return video_path diff --git a/official/cv/ADNet/src/utils/get_train_videos.py b/official/cv/ADNet/src/utils/get_train_videos.py new file mode 100644 index 0000000000000000000000000000000000000000..f01349868e6457e0582c7d0d91e3e5669f7e8783 --- /dev/null +++ b/official/cv/ADNet/src/utils/get_train_videos.py @@ -0,0 +1,47 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================
+# matlab code:
+# https://github.com/hellbell/ADNet/blob/3a7955587b5d395401ebc94a5ab067759340680d/utils/get_train_videos.m

+import numpy as np
+
+from src.utils.get_benchmark_path import get_benchmark_path
+from src.utils.get_benchmark_info import get_benchmark_info
+
+
+def get_train_videos(opts, args):
+    train_db_names = opts['train_dbs']
+    test_db_names = opts['test_db']
+
+    video_names = []
+    video_paths = []
+    bench_names = []
+
+    for dbidx in range(len(train_db_names)):
+        bench_name = train_db_names[dbidx]
+        path_ = get_benchmark_path(bench_name, args)
+        video_names_ = get_benchmark_info(train_db_names[dbidx] + '-' + test_db_names)
+        video_paths_ = np.tile(path_, (1, len(video_names_)))
+        video_names.extend(video_names_)
+        video_paths.extend(list(video_paths_[0]))
+        # repeat the benchmark name once per video of this benchmark (np.tile gives shape (1, N))
+        bench_names.extend(list(np.tile(bench_name, (1, len(video_names_)))[0]))
+
+    train_db = {
+        'video_names': video_names,
+        'video_paths': video_paths,
+        'bench_names': bench_names
+    }
+    return train_db
diff --git a/official/cv/ADNet/src/utils/get_video_infos.py b/official/cv/ADNet/src/utils/get_video_infos.py
new file mode 100644
index 0000000000000000000000000000000000000000..8264ebf277c44a4ddd85e8438d101636790ab67c
--- /dev/null
+++ b/official/cv/ADNet/src/utils/get_video_infos.py
@@ -0,0 +1,85 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================ +# Get video information (image paths and ground truths) +# matlab code: +# https://github.com/hellbell/ADNet/blob/3a7955587b5d395401ebc94a5ab067759340680d/utils/get_video_infos.m + +import os +import glob + +ROOT_PATH = os.path.dirname(__file__) +def get_video_infos(bench_name, video_path, video_name): + assert bench_name in ['vot13', 'vot14', 'vot15'] + + if bench_name in ['vot13', 'vot14', 'vot15']: + # path to VOT dataset + video_info = { + 'gt': [], + 'img_files': [], + 'name': video_name, + 'db_name': bench_name, + 'nframes': 0 + } + benchmarkSeqHome = video_path + # img path + imgDir = os.path.join(benchmarkSeqHome, video_name) + if not os.path.exists(imgDir): + print(imgDir + ' does not exist!') + raise FileNotFoundError + if bench_name == 'vot15': + img_files = glob.glob(os.path.join(imgDir, 'color/*.jpg')) + else: + img_files = glob.glob(os.path.join(imgDir, '*.jpg')) + img_files.sort(key=str.lower) + + for i in range(len(img_files)): + img_path = os.path.join(img_files[i]) + video_info['img_files'].append(img_path) + + # gt path + gtPath = os.path.join(benchmarkSeqHome, video_name, 'groundtruth.txt') + if not os.path.exists(gtPath): + print(gtPath + ' does not exist!') + raise FileNotFoundError + + # parse gt + gtFile = open(gtPath, 'r') + gt = gtFile.read().split('\n') + for i in range(len(gt)): + if gt[i] == '' or gt[i] is None: + continue + gt[i] = gt[i].split(',') + gt[i] = list(map(float, gt[i])) + gtFile.close() + + if len(gt[0]) >= 6: + for gtidx in range(len(gt)): + if gt[gtidx] == "": + continue + x = gt[gtidx][0:len(gt[gtidx]):2] + y = gt[gtidx][1:len(gt[gtidx]):2] + gt[gtidx] = [min(x), + min(y), + max(x) - min(x), + max(y) - min(y)] + + video_info['gt'] = gt + + video_info['nframes'] = min(len(video_info['img_files']), len(video_info['gt'])) + video_info['img_files'] = video_info['img_files'][:video_info['nframes']] + video_info['gt'] = video_info['gt'][:video_info['nframes']] + + return video_info + return {} diff --git a/official/cv/ADNet/src/utils/get_wrapper_utils.py b/official/cv/ADNet/src/utils/get_wrapper_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..23d33235975b56858506cc00736575e9ec781ea3 --- /dev/null +++ b/official/cv/ADNet/src/utils/get_wrapper_utils.py @@ -0,0 +1,105 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +import os + +from mindspore import dataset as ds +from mindspore.communication.management import get_rank, get_group_size + + +def get_dataLoader(source, opts, args, column_names): + if args.distributed: + rank_id = get_rank() + rank_size = get_group_size() + if isinstance(source, tuple): + data_loaders_pos = [] + data_loaders_neg = [] + datasets_pos, datasets_neg = source + if not args.distributed: + for dataset_pos in datasets_pos: + dataset = ds.GeneratorDataset(source=dataset_pos, + column_names=column_names, + num_parallel_workers=args.num_workers, shuffle=True) + dataset = dataset.batch(batch_size=opts['minibatch_size']) + data_loaders_pos.append(dataset) + for dataset_neg in datasets_neg: + dataset = ds.GeneratorDataset(source=dataset_neg, + column_names=column_names, + num_parallel_workers=args.num_workers, shuffle=True) + dataset = dataset.batch(batch_size=opts['minibatch_size']) + data_loaders_neg.append(dataset) + else: + for dataset_pos in datasets_pos: + dataset = ds.GeneratorDataset(source=dataset_pos, + column_names=column_names, + num_parallel_workers=args.num_workers, shuffle=True, num_shards=rank_size, + shard_id=rank_id) + dataset = dataset.batch(batch_size=opts['minibatch_size']) + data_loaders_pos.append(dataset) + for dataset_neg in datasets_neg: + dataset = ds.GeneratorDataset(source=dataset_neg, + column_names=["im", "bbox", "action_label", "score_label", "vid_idx"], + num_parallel_workers=args.num_workers, shuffle=True, num_shards=rank_size, + shard_id=rank_id) + dataset = dataset.batch(batch_size=opts['minibatch_size']) + data_loaders_neg.append(dataset) + return data_loaders_pos, data_loaders_neg + if args.distributed: + dataset = ds.GeneratorDataset(source=source, + column_names=column_names, + num_parallel_workers=args.num_workers, shuffle=True, num_shards=rank_size, + shard_id=rank_id) + dataset = dataset.batch(batch_size=opts['minibatch_size']) + else: + dataset = ds.GeneratorDataset(source=source, + column_names=column_names, + num_parallel_workers=args.num_workers, shuffle=True) + dataset = dataset.batch(batch_size=opts['minibatch_size']) + return dataset + + +def get_groundtruth(gt_path): + if not os.path.exists(gt_path): + bboxes = [] + t_sum = 0 + return bboxes, t_sum + + # parse gt + gtFile = open(gt_path, 'r') + gt = gtFile.read().split('\n') + for i in range(len(gt)): + if gt[i] == '' or gt[i] is None: + continue + if ',' in gt[i]: + separator = ',' + elif '\t' in gt[i]: + separator = '\t' + elif ' ' in gt[i]: + separator = ' ' + else: + separator = ',' + + gt[i] = gt[i].split(separator) + gt[i] = list(map(float, gt[i])) + gtFile.close() + + if len(gt[0]) >= 6: + for gtidx in range(len(gt)): + if gt[gtidx] == "": + continue + x = gt[gtidx][0:len(gt[gtidx]):2] + y = gt[gtidx][1:len(gt[gtidx]):2] + gt[gtidx] = [min(x), min(y), max(x) - min(x), max(y) - min(y)] + + return gt diff --git a/official/cv/ADNet/src/utils/my_math.py b/official/cv/ADNet/src/utils/my_math.py new file mode 100644 index 0000000000000000000000000000000000000000..1a257102000c0feaf314f3db6fb38dbc662a0609 --- /dev/null +++ b/official/cv/ADNet/src/utils/my_math.py @@ -0,0 +1,34 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+import math
+import numpy as np
+
+
+# round half up (Python's built-in round() uses banker's rounding):
+# https://stackoverflow.com/a/41206290/3839572
+def normal_round(n):
+    if isinstance(n, (list, np.ndarray)):
+        if isinstance(n, list):
+            temp = np.array(n)
+        else:
+            temp = n
+
+        for idx, value in np.ndenumerate(temp):
+            if value - math.floor(value) < 0.5:
+                temp[idx] = math.floor(value)
+            else:
+                temp[idx] = math.ceil(value)
+        return temp
+    if n - math.floor(n) < 0.5:
+        return math.floor(n)
+    return math.ceil(n)
diff --git a/official/cv/ADNet/src/utils/overlap_ratio.py b/official/cv/ADNet/src/utils/overlap_ratio.py
new file mode 100644
index 0000000000000000000000000000000000000000..4679b0dbce06aae398fa03b72f6343fe2c883223
--- /dev/null
+++ b/official/cv/ADNet/src/utils/overlap_ratio.py
@@ -0,0 +1,86 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================ +# matlab source: +# https://github.com/hellbell/ADNet/blob/3a7955587b5d395401ebc94a5ab067759340680d/utils/overlap_ratio.m + +import numpy as np + + +# https://www.pyimagesearch.com/2016/11/07/intersection-over-union-iou-for-object-detection/ +def overlap_ratio(rect1, rect2): + assert isinstance(rect1, (list, np.ndarray)) and isinstance(rect2, (list, np.ndarray)) + + if len(np.array(rect1).shape) == 2 and len(np.array(rect2).shape) == 2: + + iou = [] + + for _rect1, _rect2 in zip(rect1, rect2): + + boxA = [_rect1[0], _rect1[1], _rect1[0] + _rect1[2], _rect1[1] + _rect1[3]] + boxB = [_rect2[0], _rect2[1], _rect2[0] + _rect2[2], _rect2[1] + _rect2[3]] + + # determine the (x, y)-coordinates of the intersection rectangle + xA = max(boxA[0], boxB[0]) + yA = max(boxA[1], boxB[1]) + xB = min(boxA[2], boxB[2]) + yB = min(boxA[3], boxB[3]) + + # compute the area of intersection rectangle + interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1) + + # compute the area of both the prediction and ground-truth + # rectangles + boxAArea = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1) + boxBArea = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1) + + # compute the intersection over union by taking the intersection + # area and dividing it by the sum of prediction + ground-truth + # areas - the intersection area + _iou = interArea / float(boxAArea + boxBArea - interArea) + + if _iou < 0: + _iou = 0 + + iou.append(_iou) + else: + assert len(np.array(rect1).shape) == len(np.array(rect2).shape) + + boxA = [rect1[0], rect1[1], rect1[0] + rect1[2], rect1[1] + rect1[3]] + boxB = [rect2[0], rect2[1], rect2[0] + rect2[2], rect2[1] + rect2[3]] + + # determine the (x, y)-coordinates of the intersection rectangle + xA = max(boxA[0], boxB[0]) + yA = max(boxA[1], boxB[1]) + xB = min(boxA[2], boxB[2]) + yB = min(boxA[3], boxB[3]) + + # compute the area of intersection rectangle + interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1) + + # compute the area of both the prediction and ground-truth + # rectangles + boxAArea = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1) + boxBArea = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1) + + # compute the intersection over union by taking the intersection + # area and dividing it by the sum of prediction + ground-truth + # areas - the intersection area + iou = interArea / float(boxAArea + boxBArea - interArea) + + if iou < 0: + iou = 0 + + # return the intersection over union value + return iou diff --git a/official/cv/ADNet/src/utils/precision_plot.py b/official/cv/ADNet/src/utils/precision_plot.py new file mode 100644 index 0000000000000000000000000000000000000000..831fa34d1ba29f04e0ddb0a332450fbc8c6ccbfe --- /dev/null +++ b/official/cv/ADNet/src/utils/precision_plot.py @@ -0,0 +1,105 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +import numpy as np +from matplotlib import pyplot as plt +from src.utils.overlap_ratio import overlap_ratio + + +def plot_result(Z, title, show=True, save_plot=None, xlabel=None, ylabel=None) -> None: + plt.plot(Z) + if xlabel is not None: + plt.xlabel(xlabel) + if ylabel is not None: + plt.ylabel(ylabel) + plt.title(title) + plt.ylim([0, 1]) + if save_plot: + plt.savefig(save_plot) + if show: + plt.show() + + plt.clf() + + +def distance_precision_plot(bboxes, ground_truth, title, show=True, save_plot=None): + # PRECISION_PLOT + # Calculates precision for a series of distance thresholds (percentage of frames where the distance to the ground + # truth is within the threshold). The results are shown in a new figure if SHOW is true. + + # Accepts positions and ground truth as Nx2 matrices(for N frames), and a title string. + # matlab code credit: + # Joao F.Henriques, 2014 + # http: // www.isr.uc.pt / ~henriques / + + positions = bboxes[:, [1, 0]] + bboxes[:, [3, 2]] / 2 + ground_truth = ground_truth[:, [1, 0]] + ground_truth[:, [3, 2]] / 2 + + max_threshold = 50 # used for graphs in the paper + + precisions = np.zeros([max_threshold, 1]) + + if len(positions) != len(ground_truth): + print("WARNING: the size of positions and ground_truth are not same") + # just ignore any extra frames, in either results or ground truth + n = min(len(positions), len(ground_truth)) + positions = positions[:n] + ground_truth = ground_truth[:n] + + # calculate distances to ground truth over all frames + distances = np.sqrt( + np.square(positions[:, 0] - ground_truth[:, 0]) + np.square(positions[:, 1] - ground_truth[:, 1])) + + distances = distances[~np.isnan(distances)] + + # compute precision + precisions = [] + for p in range(max_threshold): + precisions.append(len(distances[distances <= p]) / len(distances)) + + # plot + if show or save_plot: + if save_plot is not None: + save_plot += '-distance' + plot_result(precisions, title, show=show, save_plot=save_plot, xlabel='distance threshold', ylabel='precision') + + return precisions + + +def iou_precision_plot(bboxes, ground_truth, title, show=True, save_plot=None): + max_threshold = 100 # used for graphs in the paper + + # precisions = np.zeros([max_threshold, 1]) + + if len(bboxes) != len(ground_truth): + print("WARNING: the size of iou and ground_truth are not same") + # just ignore any extra frames, in either results or ground truth + n = min(len(bboxes), len(ground_truth)) + ground_truth = ground_truth[:n] + + iou = overlap_ratio(bboxes, ground_truth) + iou = np.array(iou) + + # compute precision + precisions = [] + for p in range(max_threshold): + precisions.append(len(iou[iou >= p/100.0]) / len(iou)) + + # plot + if show or save_plot: + if save_plot is not None: + save_plot += '-iou' + plot_result(precisions, title, + show=show, save_plot=save_plot, xlabel='iou threshold (x0.01)', ylabel='precision') + return precisions diff --git a/official/cv/ADNet/src/utils/save_ckpt.py b/official/cv/ADNet/src/utils/save_ckpt.py new file mode 100644 index 0000000000000000000000000000000000000000..202b1ecf36b5ad7dd9ab877fdf8d5afb1737ca55 --- /dev/null +++ b/official/cv/ADNet/src/utils/save_ckpt.py @@ -0,0 +1,44 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +import os + +from mindspore import save_checkpoint + + +def save_ckpt(net, domain_specific_nets, save_path, args, iteration, epoch, pattern): + if pattern == 1: + save_checkpoint(net, os.path.join(save_path, args.save_folder, args.save_file_RL) + + '_epoch' + repr(epoch) + '_iter' + repr(iteration) + '.ckpt') + + for curr_domain, domain_specific_net in enumerate(domain_specific_nets): + save_checkpoint(domain_specific_net, + os.path.join(save_path, args.save_folder, args.save_domain_dir, + 'RL_epoch' + repr(epoch) + '_iter' + repr( + iteration) + '_' + str(curr_domain) + '.ckpt')) + elif pattern == 2: + save_checkpoint(net, os.path.join(save_path, + args.save_folder, args.save_file_RL) + 'epoch' + repr(epoch) + '.ckpt') + + for curr_domain, domain_specific_net in enumerate(domain_specific_nets): + save_checkpoint(domain_specific_net, + os.path.join(save_path, args.save_folder, args.save_domain_dir, + 'RL_epoch' + repr(epoch) + '_' + str(curr_domain) + '.ckpt')) + else: + save_checkpoint(net, os.path.join(os.path.join(save_path, args.save_folder, args.save_file_RL) + '.ckpt')) + if args.multidomain: + for curr_domain, domain_specific_net in enumerate(domain_specific_nets): + save_checkpoint(domain_specific_net, + os.path.join(save_path, args.save_folder, args.save_domain_dir, + '_' + str(curr_domain) + '.ckpt')) diff --git a/official/cv/ADNet/src/utils/videolist/vot13-otb.txt b/official/cv/ADNet/src/utils/videolist/vot13-otb.txt new file mode 100644 index 0000000000000000000000000000000000000000..9a895c4646db260bcd85d9ce86323a98cdb7e6fa --- /dev/null +++ b/official/cv/ADNet/src/utils/videolist/vot13-otb.txt @@ -0,0 +1,3 @@ +cup +iceskater +juice \ No newline at end of file diff --git a/official/cv/ADNet/src/utils/videolist/vot14-otb.txt b/official/cv/ADNet/src/utils/videolist/vot14-otb.txt new file mode 100644 index 0000000000000000000000000000000000000000..4b172c3ec8cf6f3ab0f3827d8db24bbe2e694014 --- /dev/null +++ b/official/cv/ADNet/src/utils/videolist/vot14-otb.txt @@ -0,0 +1,11 @@ +ball +bicycle +drunk +fish1 +hand1 +polarbear +sphere +sunshade +surfing +torus +tunnel \ No newline at end of file diff --git a/official/cv/ADNet/src/utils/videolist/vot15-otb.txt b/official/cv/ADNet/src/utils/videolist/vot15-otb.txt new file mode 100644 index 0000000000000000000000000000000000000000..e6c430f5231fb3690e908295ce1c3a7fd8b04d3d --- /dev/null +++ b/official/cv/ADNet/src/utils/videolist/vot15-otb.txt @@ -0,0 +1,44 @@ +bag +ball1 +ball2 +birds1 +birds2 +blanket +bmx +book +butterfly +crossing +dinosaur +fernando +fish1 +fish2 +fish3 +fish4 +glove +godfather +graduate +gymnastics1 +gymnastics2 +gymnastics3 +gymnastics4 +hand +handball1 +handball2 +helicopter +iceskater1 +leaves +marching +motocross2 +nature +octopus +rabbit +racing +road +sheep +singer3 +soccer2 +soldier +sphere +traffic +tunnel +wiper \ No newline at end of file diff --git a/official/cv/ADNet/train.py b/official/cv/ADNet/train.py new file mode 100644 index 0000000000000000000000000000000000000000..ba323a512a3d5a74eb273990726df3d5a3bd0474 --- 
/dev/null
+++ b/official/cv/ADNet/train.py
@@ -0,0 +1,153 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+import os
+import argparse
+import ast
+
+from src.trainers.adnet_train_sl import adnet_train_sl
+from src.options.general import opts
+from src.models.ADNet import adnet
+from src.utils.get_train_videos import get_train_videos
+from src.trainers.adnet_train_rl import adnet_train_rl
+
+
+from mindspore import context
+from mindspore.communication.management import init
+from mindspore.context import ParallelMode
+
+
+
+parser = argparse.ArgumentParser(
+    description='ADNet training')
+parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU', 'CPU'])
+parser.add_argument('--target_device', type=int, default=0)
+parser.add_argument('--resume', default=None, type=str, help='Resume from checkpoint')
+parser.add_argument('--num_workers', default=1, type=int, help='Number of workers used in dataloading')
+parser.add_argument('--start_iter', default=0, type=int,
+                    help='Begin counting iterations starting from this value (should be used with resume)')
+parser.add_argument('--gamma', default=0.1, type=float, help='Gamma update for SGD')
+parser.add_argument('--visualize', default=False, type=ast.literal_eval,
+                    help='Use tensorboardX for loss visualization')
+parser.add_argument('--send_images_to_visualization', type=ast.literal_eval, default=False,
+                    help='Sample a random image from each 10th batch, send it to visdom after augmentations step')
+parser.add_argument('--save_folder', default='../weights', help='Location to save checkpoint models')
+
+parser.add_argument('--save_file', default='ADNet_SL_', type=str, help='file name prefix for SL checkpoints')
+parser.add_argument('--save_domain_dir', default='domain_weights', type=str, help='directory for per-domain ckpts')
+parser.add_argument('--save_file_RL', default='ADNet_RL_', type=str, help='file name prefix for RL checkpoints')
+parser.add_argument('--start_epoch', default=0, type=int, help='Begin counting epochs starting from this value')
+
+parser.add_argument('--run_supervised', default=True,
+                    type=ast.literal_eval, help='Whether to run supervised learning or not')
+
+parser.add_argument('--multidomain', default=True, type=ast.literal_eval,
+                    help='Separate weights for each video (default) or not')
+
+parser.add_argument('--save_result_images', default=True, type=ast.literal_eval,
+                    help='Whether to save the results or not. 
Save folder: images/') +parser.add_argument('--display_images', default=False, type=ast.literal_eval, help='Whether to display images or not') +parser.add_argument('--distributed', type=ast.literal_eval, default=False) +parser.add_argument('--run_online', type=str, default='False') +parser.add_argument('--data_url', type=str) +parser.add_argument('--train_url', type=str) +parser.add_argument('--save_path', type=str, default='') +parser.add_argument('--dataset_path', type=str, default='') + +args = parser.parse_args() +if args.run_online == 'True': + import moxing as mox + local_data_url = "/cache/data" + args.dataset_path = local_data_url + # move dataset path + mox.file.copy_parallel(args.data_url, local_data_url) + args.save_path = '/cache/train_out' + args.save_folder = 'weights' + +if args.distributed: + context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=int(os.environ["DEVICE_ID"])) + init() + context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL, gradients_mean=True) +else: + context.set_context(device_target=args.device_target, mode=context.GRAPH_MODE, device_id=args.target_device) +# Supervised Learning part +if args.run_supervised: + opts['minibatch_size'] = 128 + # train with supervised learning + if args.run_online == 'True': + save_path = '/cache/train_out' + if args.resume is not None: + import moxing + local_weight = '/cache/weight/' + args.resume.split('/')[-1] + #moving ckpt + moxing.file.copy_parallel(args.resume, local_weight) + #moving multidomain + if not os.path.exists("/cache/weight/domain_weights/"): + os.makedirs("/cache/weight/domain_weights/") + moxing.file.copy_parallel(args.resume[:args.resume.rfind('/')] + + '/domain_weights/', "/cache/weight/domain_weights/") + args.resume = local_weight + else: + save_path = '' + dir_path = os.path.join(args.save_path, args.save_folder, args.save_domain_dir) + if not os.path.exists(dir_path): + os.makedirs(dir_path) + _, _, train_videos = adnet_train_sl(args, opts) + + args.resume = os.path.join(save_path, args.save_folder, args.save_file) + 'final.ckpt' + + + # reinitialize the network with network from SL + net, domain_specific_nets = adnet(opts, trained_file=args.resume, + random_initialize_domain_specific=True, + multidomain=args.multidomain, + distributed=args.distributed, + run_online=args.run_online) + + args.start_epoch = 0 + args.start_iter = 0 + +else: + assert args.resume is not None, \ + "Please put result of supervised learning or reinforcement learning with --resume (filename)" + if args.run_online == 'True': + import moxing + local_data_url = "/cache/data" + # move dataset path + args.dataset_path = local_data_url + moxing.file.copy_parallel(args.data_url, local_data_url) + local_weight_url = "/cache/weight/" + args.resume.split('/')[-1] + # moving ckpt + moxing.file.copy_parallel(args.resume, local_weight_url) + args.resume = local_weight_url + train_videos = get_train_videos(opts, args) + opts['num_videos'] = len(train_videos['video_names']) + + if args.start_iter == 0: # means the weight came from the SL + net, domain_specific_nets = adnet(opts, trained_file=args.resume, + random_initialize_domain_specific=True, + multidomain=args.multidomain, + distributed=args.distributed, + run_online=args.run_online) + else: # resume the adnet + net, domain_specific_nets = adnet(opts, trained_file=args.resume, + random_initialize_domain_specific=False, + multidomain=args.multidomain, + distributed=args.distributed, + run_online=args.run_online) + +# Reinforcement Learning 
part +opts['minibatch_size'] = 32 + +net = adnet_train_rl(net, domain_specific_nets, train_videos, opts, args)
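+
+# Minimal usage sketch (paths below are placeholders, not values shipped with this
+# script): to skip the supervised stage and run only this RL fine-tuning from an
+# existing SL checkpoint, an invocation along these lines is expected to work with
+# the argument definitions above:
+#     python train.py --run_supervised False --resume /path/to/ADNet_SL_final.ckpt \
+#         --dataset_path /path/to/vot_root --target_device 0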