diff --git a/research/audio/jasper/README-CN.md b/research/audio/jasper/README-CN.md
new file mode 100644
index 0000000000000000000000000000000000000000..467684eb6880d4f0a8c1246bf6b8edac0a879767
--- /dev/null
+++ b/research/audio/jasper/README-CN.md
@@ -0,0 +1,299 @@
+# 目录
+
+[View English](./README.md)
+
+<!-- TOC -->
+
+- [目录](#目录)
+    - [Jasper介绍](#jasper介绍)
+    - [网络模型结构](#网络模型结构)
+    - [数据集](#数据集)
+    - [环境要求](#环境要求)
+    - [文件说明和运行说明](#文件说明和运行说明)
+        - [代码目录结构说明](#代码目录结构说明)
+        - [模型参数](#模型参数)
+        - [训练和推理过程](#训练和推理过程)
+        - [Export](#export)
+    - [性能](#性能)
+        - [训练性能](#训练性能)
+        - [推理性能](#推理性能)
+    - [ModelZoo主页](#modelzoo主页)
+
+## [Jasper介绍](#contents)
+
+Jasper是一个使用CTC损失训练的端到端语音识别模型。Jasper模型仅使用1D convolutions、batch normalization、ReLU、dropout和residual connections这些模块。训练和验证支持CPU和GPU。
+
+[论文](https://arxiv.org/pdf/1904.03288v3.pdf): Jason Li, et al. Jasper: An End-to-End Convolutional Neural Acoustic Model.
+
+## [网络模型结构](#contents)
+
+Jasper是一种基于卷积的端到端神经声学模型。在音频处理阶段,每一帧被转换为梅尔尺度谱图特征,声学模型将其作为输入,并输出每一帧在词汇表上的概率分布。声学模型具有模块化的块结构,可以相应地进行参数化:Jasper BxR模型有B个块,每个块由R个重复子块组成。
+每一个子块依次应用下面这些操作:1D-Convolution、Batch Normalization、ReLU activation和Dropout。
+每个块的输入通过残差连接直接连接到所有后续块的最后一个子块,论文称之为dense residual。各个块的内核大小和过滤器数量不同,并且从底层到顶层逐渐增大。不管具体的块配置参数B和R如何,每个Jasper模型都有四个额外的卷积块:一个紧跟在输入层之后,三个在B个块的末尾。
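+
+下面给出单个子块算子顺序的最小示意(并非本仓库`src/model.py`的完整实现;参照`pt2mind.py`中的参数命名,假设将[N, C, T]特征视作[N, C, 1, T],用2D算子模拟1D卷积):
+
+```python
+import mindspore.nn as nn
+
+class JasperSubBlock(nn.Cell):
+    """子块示意:Conv -> BatchNorm -> ReLU -> Dropout。"""
+    def __init__(self, in_ch, out_ch, kernel_size, dropout=0.2):
+        super().__init__()
+        # 用(1, k)的2D卷积核在时间维上完成1D卷积
+        self.conv1 = nn.Conv2d(in_ch, out_ch, (1, kernel_size), pad_mode='pad',
+                               padding=(0, 0, kernel_size // 2, kernel_size // 2))
+        self.batchnorm = nn.BatchNorm2d(out_ch)
+        self.relu = nn.ReLU()
+        self.dropout = nn.Dropout(keep_prob=1.0 - dropout)
+
+    def construct(self, x):  # x: [N, C, 1, T]
+        return self.dropout(self.relu(self.batchnorm(self.conv1(x))))
+```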
+
+## [数据集](#contents)
+
+可以基于论文中提到的数据集或在相关领域/网络架构中广泛使用的数据集运行脚本。在下面的部分中,我们将介绍如何使用下面的相关数据集运行脚本。
+
+使用的数据集为:[LibriSpeech](<http://www.openslr.org/12>)
+
+训练集:
+train-clean-100.tar.gz [6.3G] (100小时的无噪音演讲训练集)
+train-clean-360.tar.gz [23G] (360小时的无噪音演讲训练集)
+train-other-500.tar.gz [30G] (500小时的有噪音演讲训练集)
+验证集:
+dev-clean.tar.gz [337M] (无噪音)
+dev-other.tar.gz [314M] (有噪音)
+测试集:
+test-clean.tar.gz [346M] (测试集,无噪音)
+test-other.tar.gz [328M] (测试集,有噪音)
+数据格式:wav 和 txt 文件
+
+## [环境要求](#contents)
+
+硬件(GPU)
+    GPU处理器
+框架
+    [MindSpore](https://www.mindspore.cn/install/en)
+通过下面网址可以获得更多信息:
+    [MindSpore tutorials](https://www.mindspore.cn/tutorials/en/master/index.html)
+    [MindSpore Python API](https://www.mindspore.cn/docs/api/zh-CN/master/index.html)
+
+## [文件说明和运行说明](#contents)
+
+### [代码目录结构说明](#contents)
+
+```path
+.
+└─audio
+    └─jasper
+        │  eval.py                     //推理文件
+        │  labels.json                 //需要用到的字符
+        │  pt2mind.py                  //pth转化ckpt文件
+        │  create_mindrecord.py        //将数据集转化为mindrecord
+        │  README-CN.md                //中文readme
+        │  README.md                   //英文readme
+        │  requirements.txt            //需要的库文件
+        │  train.py                    //训练文件
+        │
+        ├─scripts
+        │      download_librispeech.sh        //下载数据集的脚本
+        │      preprocess_librispeech.sh      //处理数据集的脚本
+        │      run_distribute_train_gpu.sh    //GPU8卡训练
+        │      run_eval_cpu.sh                //CPU推理
+        │      run_eval_gpu.sh                //GPU推理
+        │      run_standalone_train_cpu.sh    //CPU单卡训练
+        │      run_standalone_train_gpu.sh    //GPU单卡训练
+        │
+        ├─src
+        │      audio.py                       //数据处理相关代码
+        │      callback.py                    //回调以监控训练
+        │      cleaners.py                    //数据清理
+        │      config.py                      //jasper配置文件
+        │      dataset.py                     //数据处理
+        │      decoder.py                     //来自第三方的解码器
+        │      eval_callback.py               //推理的数据回调
+        │      greedydecoder.py               //修改Mindspore代码的greedydecoder
+        │      jasper10x5dr_speca.yaml        //jasper网络结构配置
+        │      lr_generator.py                //产生学习率
+        │      model.py                       //训练模型
+        │      model_test.py                  //推理模型
+        │      number.py                      //数据处理
+        │      text.py                        //数据处理
+        │      __init__.py
+        │
+        └─utils
+               convert_librispeech.py         //转化数据集
+               download_librispeech.py        //下载数据集
+               download_utils.py              //下载工具
+               inference_librispeech.csv      //推理数据集链接
+               librispeech.csv                //全部数据集链接
+               preprocessing_utils.py         //预处理工具
+               __init__.py
+```
+
+### [模型参数](#contents)
+
+训练和推理的相关参数在`config.py`文件中。
+
+```text
+训练相关参数
+    epochs                  训练的epoch数量,默认为440
+```
+
+```text
+数据处理相关参数
+    train_manifest          用于训练的数据文件路径,默认为 'data/libri_train_manifest.json'
+    val_manifest            用于测试的数据文件路径,默认为 'data/libri_val_manifest.json'
+    batch_size              批处理大小,默认为64
+    labels_path             模型输出的token json路径,默认为 "./labels.json"
+    sample_rate             数据特征的采样率,默认为16000
+    window_size             频谱图生成的窗口大小(秒),默认为0.02
+    window_stride           频谱图生成的窗口步长(秒),默认为0.01
+    window                  频谱图生成的窗口类型,默认为 'hamming'
+    speed_volume_perturb    使用随机速度和增益扰动,默认为False,当前模型中未使用
+    spec_augment            在梅尔谱图上使用简单的频谱增强,默认为False,当前模型中未使用
+    noise_dir               注入噪声的音频目录,默认为'',即不注入噪声,当前模型中未使用
+    noise_prob              每个样本加噪声的概率,默认为0.4,当前模型中未使用
+    noise_min               样本的最小噪声水平(1.0意味着全是噪声,没有原始信号),默认为0.0,当前模型中未使用
+    noise_max               样本的最大噪声水平,最大值为1.0,默认值为0.5,当前模型中未使用
+```
+
+```text
+优化器相关参数
+    learning_rate           初始学习率,默认为3e-4
+    learning_anneal         每个epoch之后对学习率进行退火的系数,默认为1.1
+    weight_decay            权重衰减,默认为1e-5
+    momentum                动量,默认为0.9
+    eps                     Adam eps,默认为1e-8
+    betas                   Adam betas,默认为(0.9, 0.999)
+    loss_scale              损失缩放,默认为1024
+```
+
+```text
+checkpoint相关参数
+    ckpt_file_name_prefix   ckpt文件的名称前缀,默认为'Jasper'
+    ckpt_path               ckpt文件的保存路径,默认为'checkpoints'
+    keep_checkpoint_max     ckpt文件的最大保留数量,超出则删除旧的检查点,默认为10
+```
+
+## [训练和推理过程](#contents)
+
+### 训练
+
+```text
+运行: train.py [--use_pretrained USE_PRETRAINED]
+               [--pre_trained_model_path PRE_TRAINED_MODEL_PATH]
+               [--is_distributed IS_DISTRIBUTED]
+               [--bidirectional BIDIRECTIONAL]
+               [--device_target DEVICE_TARGET]
+参数:
+    --pre_trained_model_path    预先训练的模型文件路径,默认为''
+    --is_distributed            多卡训练,默认为False
+    --device_target             运行代码的设备:"GPU" | "CPU",默认为"GPU"
+```
+
+### 推理
+
+```text
+运行: eval.py [--bidirectional BIDIRECTIONAL]
+              [--pretrain_ckpt PRETRAIN_CKPT]
+              [--device_target DEVICE_TARGET]
+
+参数:
+    --pretrain_ckpt             checkpoint的文件路径,默认为''
+    --device_target             运行代码的设备:"GPU" | "CPU",默认为"GPU"
+```
+
+在训练之前,应该下载、处理数据集。
+
+```bash
+bash scripts/download_librispeech.sh
+bash scripts/preprocess_librispeech.sh
+python create_mindrecord.py  # 将数据集转成mindrecord格式
+```
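+
+数据集转换完成后,可以用下面的最小示例快速校验生成的MindRecord分片(仅为示意代码,假设沿用`src/config.py`中默认的分片路径`/data/jasper_tr{i}.md`):
+
+```python
+import mindspore.dataset as ds
+
+# 读取create_mindrecord.py生成的8个分片,检查样本数与字段形状
+files = [f"/data/jasper_tr{i}.md" for i in range(8)]
+data = ds.MindDataset(files, columns_list=["batch_spect", "batch_script"])
+print("样本总数:", data.get_dataset_size())
+for item in data.create_dict_iterator(output_numpy=True, num_epochs=1):
+    print(item["batch_spect"].shape, item["batch_script"].shape)  # 频谱特征与文本标签
+    break
+```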
+
+流程结束后,数据目录结构如下:
+
+```path
+.
+|--LibriSpeech
+│  |--train-clean-100-wav
+│  │--train-clean-360-wav
+│  │--train-other-500-wav
+│  |--dev-clean-wav
+│  |--dev-other-wav
+│  |--test-clean-wav
+│  |--test-other-wav
+|--librispeech-train-clean-100-wav.json,librispeech-train-clean-360-wav.json,librispeech-train-other-500-wav.json,librispeech-dev-clean-wav.json,librispeech-dev-other-wav.json,librispeech-test-clean-wav.json,librispeech-test-other-wav.json
+```
+
+在`src/config.py`中设置数据集的位置。
+
+```shell
+...
+训练配置
+"Data_dir": '/data/dataset',
+"train_manifest": ['/data/dataset/librispeech-train-clean-100-wav.json',
+                   '/data/dataset/librispeech-train-clean-360-wav.json',
+                   '/data/dataset/librispeech-train-other-500-wav.json'],
+"mindrecord_format": "/data/jasper_tr{}.md",
+"mindrecord_files": [f"/data/jasper_tr{i}.md" for i in range(8)]
+
+评估配置
+"DataConfig":{
+    "Data_dir": '/data/inference_datasets',
+    "test_manifest": ['/data/inference_datasets/librispeech-dev-clean-wav.json'],
+}
+
+```
+
+训练之前,需要安装`librosa`和`Levenshtein`。
+通过官网安装MindSpore并完成数据集处理后,可以按如下方式开始训练:
+
+```shell
+
+# gpu单卡训练
+bash ./scripts/run_standalone_train_gpu.sh [DEVICE_ID]
+
+# cpu单卡训练
+bash ./scripts/run_standalone_train_cpu.sh
+
+# gpu多卡训练
+bash ./scripts/run_distribute_train_gpu.sh
+
+```
+
+推理:
+
+```shell
+
+# cpu评估
+bash ./scripts/run_eval_cpu.sh [PATH_CHECKPOINT]
+
+# gpu评估
+bash ./scripts/run_eval_gpu.sh [DEVICE_ID] [PATH_CHECKPOINT]
+
+```
+
+## [性能](#contents)
+
+### [训练和测试性能分析](#contents)
+
+#### 训练性能
+
+| 参数 | Jasper |
+| -------------------------- | ---------------------------------------------------------------|
+| 资源 | NV SMX2 V100-32G |
+| 更新日期 | 2/7/2022 (month/day/year) |
+| MindSpore版本 | 1.8.0 |
+| 数据集 | LibriSpeech |
+| 训练参数 | 8p, epoch=440, steps=1088 * epoch, batch_size = 64, lr=3e-4 |
+| 优化器 | Adam |
+| 损失函数 | CTCLoss |
+| 输出 | 概率值 |
+| 损失值 | 0.2-0.7 |
+| 运行速度 | 8p 2.7s/step |
+| 训练总时间 | 8p: 约194小时 |
+| Checkpoint文件大小 | 991M (.ckpt文件) |
+| 代码 | [Jasper script](https://gitee.com/mindspore/models/tree/master/research/audio/jasper) |
+
+#### 推理性能
+
+| 参数 | Jasper |
+| -------------------------- | ----------------------------------------------------------------|
+| 资源 | NV SMX2 V100-32G |
+| 更新日期 | 2/7/2022 (month/day/year) |
+| MindSpore版本 | 1.8.0 |
+| 数据集 | LibriSpeech |
+| 批处理大小 | 64 |
+| 输出 | 概率值 |
+| 精确度(无噪声) | 8p: WER: 5.754 CER: 2.151 |
+| 精确度(有噪声) | 8p: WER: 19.213 CER: 9.393 |
+| 模型大小 | 330M (.mindir文件) |
+
+## [ModelZoo主页](#contents)
+
+[ModelZoo主页](https://gitee.com/mindspore/models)。
diff --git a/research/audio/jasper/README.md b/research/audio/jasper/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..948c0e70608ef3ca6f30296e2f03c1bad4bb50c4
--- /dev/null
+++ b/research/audio/jasper/README.md
@@ -0,0 +1,300 @@
+# Contents
+
+- [Jasper Description](#jasper-description)
+    - [Model Architecture](#model-architecture)
+    - [Dataset](#dataset)
+    - [Environment Requirements](#environment-requirements)
+    - [Script Description](#script-description)
+        - [Script and Sample Code](#script-and-sample-code)
+        - [Script Parameters](#script-parameters)
+        - [Training and Eval Process](#training-and-eval-process)
+        - [Export](#export)
+    - [Performance](#performance)
+        - [Training Performance](#training-performance)
+        - [Inference Performance](#inference-performance)
+    - [ModelZoo Homepage](#modelzoo-homepage)
+
+## [Jasper Description](#contents)
+
+Jasper is an end-to-end speech recognition model trained with CTC loss.
+Jasper uses only 1D convolutions, batch normalization, ReLU, dropout, and residual connections. We support training and evaluation on CPU and GPU.
+
+[Paper](https://arxiv.org/pdf/1904.03288v3.pdf): Jason Li, et al. Jasper: An End-to-End Convolutional Neural Acoustic Model.
+
+## [Model Architecture](#contents)
+
+Jasper is an end-to-end neural acoustic model based on convolutions. In the audio processing stage, each frame is transformed into mel-scale spectrogram features, which the acoustic model takes as input and outputs a probability distribution over the vocabulary for each frame. The acoustic model has a modular block structure and can be parametrized accordingly: a Jasper BxR model has B blocks, each consisting of R repeating sub-blocks.
+Each sub-block applies the following operations in sequence: 1D-Convolution, Batch Normalization, ReLU activation, and Dropout.
+Each block input is connected directly to the last sub-block of all following blocks via a residual connection, which is referred to as dense residual in the paper. Every block differs in kernel size and number of filters, both increasing in size from the bottom to the top layers. Irrespective of the exact block configuration parameters B and R, every Jasper model has four additional convolutional blocks: one immediately succeeding the input layer (Prologue) and three at the end of the B blocks (Epilogue).
+
+## [Dataset](#contents)
+
+Note that you can run the scripts based on the dataset mentioned in the original paper or one widely used in the relevant domain/network architecture. In the following sections, we will introduce how to run the scripts using the related dataset below.
+
+Dataset used: [LibriSpeech](<http://www.openslr.org/12>)
+
+Train Data:
+train-clean-100.tar.gz [6.3G] (training set of 100 hours "clean" speech)
+train-clean-360.tar.gz [23G] (training set of 360 hours "clean" speech)
+train-other-500.tar.gz [30G] (training set of 500 hours "other" speech)
+Val Data:
+dev-clean.tar.gz [337M] (development set, "clean" speech)
+dev-other.tar.gz [314M] (development set, "other", more challenging, speech)
+Test Data:
+test-clean.tar.gz [346M] (test set, "clean" speech)
+test-other.tar.gz [328M] (test set, "other" speech)
+Data format: wav and txt files
+
+## [Environment Requirements](#contents)
+
+Hardware (GPU)
+    Prepare hardware environment with GPU processor.
+Framework
+    [MindSpore](https://www.mindspore.cn/install/en)
+For more information, please check the resources below:
+    [MindSpore tutorials](https://www.mindspore.cn/tutorials/en/master/index.html)
+    [MindSpore Python API](https://www.mindspore.cn/docs/api/en/master/index.html)
+
+## [Script Description](#contents)
+
+### [Script and Sample Code](#contents)
+
+```path
+.
+└─audio
+    └─jasper
+        │  eval.py                     //inference file
+        │  labels.json                 //label file
+        │  pt2mind.py                  //pth transform to ckpt file
+        │  create_mindrecord.py        //transform data to mindrecord
+        │  README-CN.md                //Chinese readme
+        │  README.md                   //English readme
+        │  requirements.txt            //required library file
+        │  train.py                    //train file
+        │
+        ├─scripts
+        │      download_librispeech.sh        //download data
+        │      preprocess_librispeech.sh      //preprocess data
+        │      run_distribute_train_gpu.sh    //8 GPU cards train
+        │      run_eval_cpu.sh                //CPU evaluate
+        │      run_eval_gpu.sh                //GPU evaluate
+        │      run_standalone_train_cpu.sh    //one CPU train
+        │      run_standalone_train_gpu.sh    //one GPU train
+        │
+        ├─src
+        │      audio.py                       //preprocess data
+        │      callback.py                    //callback
+        │      cleaners.py                    //preprocess data
+        │      config.py                      //jasper config
+        │      dataset.py                     //preprocess data
+        │      decoder.py                     //third-party decoders
+        │      eval_callback.py               //evaluate callback
+        │      greedydecoder.py               //refactored greedydecoder
+        │      jasper10x5dr_speca.yaml        //jasper model's config
+        │      lr_generator.py                //learning rate
+        │      model.py                       //training model
+        │      model_test.py                  //inference model
+        │      number.py                      //preprocess data
+        │      text.py                        //preprocess data
+        │      __init__.py
+        │
+        └─utils
+               convert_librispeech.py         //convert data
+               download_librispeech.py        //download data
+               download_utils.py              //download utils
+               inference_librispeech.csv      //links to inference data
+               librispeech.csv                //links to all data
+               preprocessing_utils.py         //preprocessing utils
+               __init__.py
+
+```
+
+### [Script Parameters](#contents)
+
+#### Training
+
+```text
+usage: train.py [--use_pretrained USE_PRETRAINED]
+                [--pre_trained_model_path PRE_TRAINED_MODEL_PATH]
+                [--is_distributed IS_DISTRIBUTED]
+                [--bidirectional BIDIRECTIONAL]
+                [--device_target DEVICE_TARGET]
+options:
+    --pre_trained_model_path    pretrained checkpoint path, default is ''
+    --is_distributed            distributed training, default is False
+    --bidirectional             whether to use bidirectional RNN, default is True. Currently, only the bidirectional model is implemented
+    --device_target             device where the code will be implemented: "GPU" | "CPU", default is "GPU"
+```
+
+#### Evaluation
+
+```text
+usage: eval.py [--bidirectional BIDIRECTIONAL]
+               [--pretrain_ckpt PRETRAIN_CKPT]
+               [--device_target DEVICE_TARGET]
+
+options:
+    --bidirectional             whether to use bidirectional RNN, default is True. Currently, only the bidirectional model is implemented
+    --pretrain_ckpt             saved checkpoint path, default is ''
+    --device_target             device where the code will be implemented: "GPU" | "CPU", default is "GPU"
+```
+
+#### Options and Parameters
+
+Parameters for training and evaluation can be set in file `config.py`.
+
+```text
+config for training.
+    epochs                  number of training epochs, default is 440
+```
+
+```text
+config for dataloader.
+    train_manifest          train manifest path, default is 'data/libri_train_manifest.json'
+    val_manifest            dev manifest path, default is 'data/libri_val_manifest.json'
+    batch_size              batch size for training, default is 64
+    labels_path             tokens json path for model output, default is "./labels.json"
+    sample_rate             sample rate for the data/model features, default is 16000
+    window_size             window size for spectrogram generation (seconds), default is 0.02
+    window_stride           window stride for spectrogram generation (seconds), default is 0.01
+    window                  window type for spectrogram generation, default is 'hamming'
+    speed_volume_perturb    use random tempo and gain perturbations, default is False, not used in current model
+    spec_augment            use simple spectral augmentation on mel spectrograms, default is False, not used in current model
+    noise_dir               directory to inject noise into audio; if kept at the default '', no noise is injected, not used in current model
+    noise_prob              probability of noise being added per sample, default is 0.4, not used in current model
+    noise_min               minimum noise level to sample from (1.0 means all noise, no original signal), default is 0.0, not used in current model
+    noise_max               maximum noise level to sample from, maximum 1.0, default is 0.5, not used in current model
+```
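+
+As a quick, hypothetical illustration of how `sample_rate`, `window_size`, and `window_stride` translate into spectrogram framing (mirroring the librosa-style parameters used in `src/audio.py`):
+
+```python
+sample_rate = 16000        # Hz, from config
+window_size = 0.02         # 20 ms analysis window
+window_stride = 0.01       # 10 ms hop between frames
+
+n_fft = int(sample_rate * window_size)         # 320 samples per window
+hop_length = int(sample_rate * window_stride)  # 160 samples per hop
+
+# frames produced for a 5-second utterance with a centered STFT
+num_frames = 1 + (5 * sample_rate) // hop_length
+print(n_fft, hop_length, num_frames)  # 320 160 501
+```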
+
+```text
+config for optimizer.
+    learning_rate           initial learning rate, default is 3e-4
+    learning_anneal         annealing applied to learning rate after each epoch, default is 1.1
+    weight_decay            weight decay, default is 1e-5
+    momentum                momentum, default is 0.9
+    eps                     Adam eps, default is 1e-8
+    betas                   Adam betas, default is (0.9, 0.999)
+    loss_scale              loss scale, default is 1024
+```
+
+```text
+config for checkpoint.
+    ckpt_file_name_prefix   ckpt file name prefix, default is 'Jasper'
+    ckpt_path               path to save ckpt, default is 'checkpoints'
+    keep_checkpoint_max     max number of checkpoints to save, delete older checkpoints, default is 10
+```
+
+## [Training and Eval Process](#contents)
+
+Before training, the dataset should be processed.
+
+```bash
+bash scripts/download_librispeech.sh
+bash scripts/preprocess_librispeech.sh
+python create_mindrecord.py  # transform data to mindrecord
+```
+
+The dataset directory structure is as follows:
+
+```path
+.
+|--LibriSpeech
+│  |--train-clean-100-wav
+│  │--train-clean-360-wav
+│  │--train-other-500-wav
+│  |--dev-clean-wav
+│  |--dev-other-wav
+│  |--test-clean-wav
+│  |--test-other-wav
+|--librispeech-train-clean-100-wav.json,librispeech-train-clean-360-wav.json,librispeech-train-other-500-wav.json,librispeech-dev-clean-wav.json,librispeech-dev-other-wav.json,librispeech-test-clean-wav.json,librispeech-test-other-wav.json
+```
+
+These *.json files store the absolute paths of the corresponding data. After obtaining them, you should modify the configurations in `src/config.py`: for the training config, `train_manifest` should be set to the three training manifests, and for the eval config, `test_manifest` should be set to the manifest of whichever dev/test set is evaluated.
+
+```shell
+...
+train config
+"Data_dir": '/data/dataset',
+"train_manifest": ['/data/dataset/librispeech-train-clean-100-wav.json',
+                   '/data/dataset/librispeech-train-clean-360-wav.json',
+                   '/data/dataset/librispeech-train-other-500-wav.json'],
+"mindrecord_format": "/data/jasper_tr{}.md",
+"mindrecord_files": [f"/data/jasper_tr{i}.md" for i in range(8)]
+
+eval config
+"DataConfig":{
+    "Data_dir": '/data/inference_datasets',
+    "test_manifest": ['/data/inference_datasets/librispeech-dev-clean-wav.json'],
+}
+
+```
+
+Before training, some requirements should be installed, including `librosa` and `Levenshtein`.
+After installing MindSpore via the official website and finishing dataset processing, you can start training as follows:
+
+```shell
+
+# standalone training gpu
+bash ./scripts/run_standalone_train_gpu.sh [DEVICE_ID]
+
+# standalone training cpu
+bash ./scripts/run_standalone_train_cpu.sh
+
+# distributed training gpu
+bash ./scripts/run_distribute_train_gpu.sh
+
+```
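+
+The evaluation below reports WER and CER. The repo computes them in `src/decoder.py`; the following standalone sketch (hypothetical, not the repo's exact code) shows how both reduce to edit distance via the `Levenshtein` dependency:
+
+```python
+import Levenshtein
+
+def cer(hyp, ref):
+    # character-level edit distance over reference length, spaces removed
+    hyp, ref = hyp.replace(' ', ''), ref.replace(' ', '')
+    return Levenshtein.distance(hyp, ref) / len(ref)
+
+def wer(hyp, ref):
+    # word-level edit distance: map each word to a unique char first
+    vocab = {w: chr(i) for i, w in enumerate(set(hyp.split() + ref.split()))}
+    h = ''.join(vocab[w] for w in hyp.split())
+    r = ''.join(vocab[w] for w in ref.split())
+    return Levenshtein.distance(h, r) / len(ref.split())
+
+print(wer("the cat sat", "the cat sat down"))  # 0.25
+print(cer("the cat sat", "the cat sat down"))  # 4/13 ~= 0.308
+```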
+The following scripts are used to evaluate the model. Note that only the greedy decoder is supported for now:
+
+```shell
+
+# eval on cpu
+bash ./scripts/run_eval_cpu.sh [PATH_CHECKPOINT]
+
+# eval on gpu
+bash ./scripts/run_eval_gpu.sh [DEVICE_ID] [PATH_CHECKPOINT]
+
+```
+
+## [Model Description](#contents)
+
+### [Performance](#contents)
+
+#### Training Performance
+
+| Parameters | Jasper |
+| -------------------- | ------------------------------------------------------------ |
+| Resource | NV SMX2 V100-32G |
+| uploaded Date | 2/7/2022 (month/day/year) |
+| MindSpore Version | 1.8.0 |
+| Dataset | LibriSpeech |
+| Training Parameters | 8p, epoch=440, steps=1088 * epoch, batch_size = 64, lr=3e-4 |
+| Optimizer | Adam |
+| Loss Function | CTCLoss |
+| outputs | probability |
+| Loss | 0.2-0.7 |
+| Speed | 8p 2.7s/step |
+| Total time: training | 8p: around 194 h |
+| Checkpoint | 991M (.ckpt file) |
+| Scripts | [Jasper script](https://gitee.com/mindspore/models/tree/master/research/audio/jasper) |
+
+#### Inference Performance
+
+| Parameters | Jasper |
+| ------------------- | -------------------------- |
+| Resource | NV SMX2 V100-32G |
+| uploaded Date | 2/7/2022 (month/day/year) |
+| MindSpore Version | 1.8.0 |
+| Dataset | LibriSpeech |
+| batch_size | 64 |
+| outputs | probability |
+| Accuracy(dev-clean) | 8p: WER: 5.754 CER: 2.151 |
+| Accuracy(dev-other) | 8p: WER: 19.213 CER: 9.393 |
+| Model for inference | 330M (.mindir file) |
+
+## [ModelZoo Homepage](#contents)
+
+Please check the official [homepage](https://gitee.com/mindspore/models).
diff --git a/research/audio/jasper/create_mindrecord.py b/research/audio/jasper/create_mindrecord.py
new file mode 100644
index 0000000000000000000000000000000000000000..b25edeee9612e736fc5639b6fafb81466c4589fb
--- /dev/null
+++ b/research/audio/jasper/create_mindrecord.py
@@ -0,0 +1,72 @@
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+import os
+from multiprocessing import Pool
+import mindspore.dataset.engine as de
+from mindspore.mindrecord import FileWriter
+from src.dataset import ASRDataset
+from src.config import train_config, symbols
+
+
+def _exec_task(task_id):
+    """
+    Execute task with specified task id
+    """
+    print("exec task {}...".format(task_id))
+    # write this shard's samples into its own MindRecord file
+    writer = FileWriter(mindrecord_file.format(task_id), 1)
+    writer.set_page_size(1 << 25)
+    jasper_json = {
+        "batch_spect": {"type": "float32", "shape": [1, 64, -1]},
+        "batch_script": {"type": "int32", "shape": [-1,]}
+    }
+    writer.add_schema(jasper_json, "jasper_json")
+    output_columns = ["batch_spect", "batch_script"]
+    dataset = ASRDataset(data_dir=train_config.DataConfig.Data_dir,
+                         manifest_fpaths=train_config.DataConfig.train_manifest,
+                         labels=symbols,
+                         batch_size=1,
+                         train_mode=True)
+    ds = de.GeneratorDataset(dataset, output_columns,
+                             num_shards=num_tasks, shard_id=task_id)
+    dataset_size = ds.get_dataset_size()
+    for c, item in enumerate(ds.create_dict_iterator(output_numpy=True)):
+        row = {"batch_spect": item["batch_spect"],
+               "batch_script": item["batch_script"]}
+        writer.write_raw_data([row])
+        print(f"{c}/{dataset_size}", flush=True)
+    writer.commit()
+
+
+if __name__ == "__main__":
+    mindrecord_file = train_config.DataConfig.mindrecord_format
+    mindrecord_dir = os.path.dirname(mindrecord_file)
+    if not os.path.isdir(mindrecord_dir):
+        os.makedirs(mindrecord_dir)
+    num_tasks = 8
+
+    print("Write mindrecord ...")
+
+    task_list = list(range(num_tasks))
+
+    if os.name == 'nt':
+        for window_task_id in task_list:
+            _exec_task(window_task_id)
+    elif num_tasks > 1:
+        with Pool(num_tasks) as p:
+            p.map(_exec_task, task_list)
+    else:
+        _exec_task(0)
diff --git a/research/audio/jasper/eval.py b/research/audio/jasper/eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..21503542afd47f241ec0f9dd46c3439cdca0ea30
--- /dev/null
+++ b/research/audio/jasper/eval.py
@@ -0,0 +1,110 @@
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""
+Eval for Jasper
+"""
+import argparse
+import json
+import pickle
+import numpy as np
+from src.config import eval_config, symbols, encoder_kw, decoder_kw
+from src.model_test import Jasper, PredictWithSoftmax
+from src.dataset import create_eval_dataset
+from src.decoder import GreedyDecoder
+from mindspore import context
+from mindspore.train.serialization import load_checkpoint, load_param_into_net
+
+parser = argparse.ArgumentParser(description='jasper evaluation')
+parser.add_argument('--pretrain_ckpt', type=str,
+                    default='./checkpoint/ckpt_0/jasper10.ckpt', help='Pretrained checkpoint path')
+parser.add_argument('--device_target', type=str, default="GPU", choices=("GPU", "CPU"),
+                    help='Device target, support GPU and CPU, Default: GPU')
+args = parser.parse_args()
+
+if __name__ == '__main__':
+    context.set_context(mode=context.GRAPH_MODE,
+                        device_target=args.device_target, save_graphs=False)
+    config = eval_config
+    with open(config.DataConfig.labels_path) as label_file:
+        labels = json.load(label_file)
+
+    model = PredictWithSoftmax(
+        Jasper(encoder_kw=encoder_kw, decoder_kw=decoder_kw))
+
+    ds_eval = create_eval_dataset(data_dir=config.DataConfig.Data_dir,
+                                  manifest_filepath=config.DataConfig.test_manifest,
+                                  labels=symbols, batch_size=config.DataConfig.batch_size, train_mode=False)
+
+    param_dict = load_checkpoint(args.pretrain_ckpt)
+    load_param_into_net(model, param_dict)
+    print('Successfully loading the pre-trained model')
+
+    if config.LMConfig.decoder_type == 'greedy':
+        decoder = GreedyDecoder(labels=symbols, blank_index=len(symbols)-1)
+    else:
+        raise NotImplementedError("Only greedy decoder is supported now")
+    target_decoder = GreedyDecoder(symbols, blank_index=len(symbols)-1)
+
+    model.set_train(False)
+    total_cer, total_wer, num_tokens, num_chars = 0, 0, 0, 0
+    output_data = []
+    for data in ds_eval.create_dict_iterator():
+        inputs, input_length, target_indices, targets = data['inputs'], data['input_length'], \
+            data['target_indices'], data['targets']
+
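+        # Note (added comment): the dataset flattens all transcripts in the
+        # batch into a single 1-D `targets` array; `target_indices[:, 0]`
+        # holds the utterance id of every token. The loop below splits the
+        # flat array back into one token list per utterance so the reference
+        # strings can be rebuilt.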
+        split_targets = []
+        start, count, last_id = 0, 0, 0
+        target_indices, targets = target_indices.asnumpy(), targets.asnumpy()
+        for i in range(np.shape(targets)[0]):
+            if target_indices[i, 0] == last_id:
+                count += 1
+            else:
+                split_targets.append(list(targets[start:count]))
+                last_id += 1
+                start = count
+                count += 1
+        split_targets.append(list(targets[start:]))
+
+        out, output_sizes = model(inputs, input_length)
+        decoded_output, _ = decoder.decode(out, output_sizes)
+        target_strings = target_decoder.convert_to_strings(split_targets)
+
+        if config.save_output is not None:
+            output_data.append(
+                (out.asnumpy(), output_sizes.asnumpy(), target_strings))
+        for doutput, toutput in zip(decoded_output, target_strings):
+            transcript, reference = doutput[0], toutput[0]
+            wer_inst = decoder.wer(transcript, reference)
+            cer_inst = decoder.cer(transcript, reference)
+            total_wer += wer_inst
+            total_cer += cer_inst
+            num_tokens += len(reference.split())
+            num_chars += len(reference.replace(' ', ''))
+            if config.verbose:
+                print("Ref:", reference.lower())
+                print("Hyp:", transcript.lower())
+                print("WER:", float(wer_inst) / len(reference.split()),
+                      "CER:", float(cer_inst) / len(reference.replace(' ', '')), "\n")
+    wer = float(total_wer) / num_tokens
+    cer = float(total_cer) / num_chars
+
+    print('Test Summary \t'
+          'Average WER {wer:.3f}\t'
+          'Average CER {cer:.3f}\t'.format(wer=wer * 100, cer=cer * 100))
+
+    if config.save_output is not None:
+        with open(config.save_output + '.bin', 'wb') as output:
+            pickle.dump(output_data, output)
diff --git a/research/audio/jasper/export.py b/research/audio/jasper/export.py
new file mode 100644
index 0000000000000000000000000000000000000000..58e6d9ae1d7ac3943ffed654fc5cfd7e2d4f6c0e
--- /dev/null
+++ b/research/audio/jasper/export.py
@@ -0,0 +1,55 @@
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""
+export checkpoint file to mindir model
+"""
+import json
+import argparse
+import numpy as np
+import mindspore as ms
+from mindspore import context, Tensor
+from mindspore.train.serialization import load_checkpoint, load_param_into_net, export
+from src.config import train_config, encoder_kw, decoder_kw
+from src.model import Jasper
+
+parser = argparse.ArgumentParser(
+    description='Export Jasper model to MINDIR')
+parser.add_argument('--pre_trained_model_path', type=str,
+                    default='', help='existing checkpoint path')
+parser.add_argument('--device_target', type=str, default="GPU", choices=("GPU", "CPU"),
+                    help='Device target, support GPU and CPU, Default: GPU')
+args = parser.parse_args()
+
+if __name__ == '__main__':
+    config = train_config
+    context.set_context(mode=context.GRAPH_MODE,
+                        device_target=args.device_target, save_graphs=False)
+    with open(config.DataConfig.labels_path) as label_file:
+        labels = json.load(label_file)
+
+    jasper_net = Jasper(encoder_kw=encoder_kw,
+                        decoder_kw=decoder_kw).to_float(ms.float16)
+
+    param_dict = load_checkpoint(args.pre_trained_model_path)
+    load_param_into_net(jasper_net, param_dict)
+    print('Successfully loading the pre-trained model')
+    # 3500 is the max length in the evaluation dataset (LibriSpeech). This is consistent with dataset.py.
+    # The length is fixed to this value because MindSpore does not support dynamic shapes currently.
+    input_np = np.random.uniform(
+        0.0, 1.0, size=[1, 64, 3500]).astype(np.float32)
+    length = np.array([100], dtype=np.int32)
+    export(jasper_net, Tensor(input_np), Tensor(length),
+           file_name="jasper.mindir", file_format='MINDIR')
diff --git a/research/audio/jasper/labels.json b/research/audio/jasper/labels.json
new file mode 100644
index 0000000000000000000000000000000000000000..7cd70315b998e92c729269910efebfd3827457ac
--- /dev/null
+++ b/research/audio/jasper/labels.json
@@ -0,0 +1,31 @@
+[
+    "'",
+    "A",
+    "B",
+    "C",
+    "D",
+    "E",
+    "F",
+    "G",
+    "H",
+    "I",
+    "J",
+    "K",
+    "L",
+    "M",
+    "N",
+    "O",
+    "P",
+    "Q",
+    "R",
+    "S",
+    "T",
+    "U",
+    "V",
+    "W",
+    "X",
+    "Y",
+    "Z",
+    " ",
+    "_"
+]
\ No newline at end of file
diff --git a/research/audio/jasper/pt2mind.py b/research/audio/jasper/pt2mind.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2e9847478000fb99bea7dc4e94b2132a6e7dd87
--- /dev/null
+++ b/research/audio/jasper/pt2mind.py
@@ -0,0 +1,91 @@
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+
+import argparse
+import re
+
+import numpy as np
+import torch
+from mindspore.train.serialization import save_checkpoint
+from mindspore import Tensor
+
+parser = argparse.ArgumentParser(description='pth translate to ckpt')
+parser.add_argument('--pth', type=str,
+                    default='/data/Jasper_epoch10_checkpoint.pt', help='path of pth')
+
+args = parser.parse_args()
+
+
+def convert_v1_state_dict(state_dict):
+    rules = [
+        ('^jasper_encoder.encoder.', 'encoder.layers.'),
+        ('^jasper_decoder.decoder_layers.', 'decoder.layers.'),
+    ]
+    ret = {}
+    for k, v in state_dict.items():
+        if k.startswith('acoustic_model.'):
+            continue
+        if k.startswith('audio_preprocessor.'):
+            continue
+        for pattern, to in rules:
+            k = re.sub(pattern, to, k)
+        ret[k] = v
+
+    return ret
+
+
+checkpoint = torch.load(args.pth, map_location="cpu")
+
+state_dic = convert_v1_state_dict(checkpoint['state_dict'])
+
+
+mydict = state_dic
+newparams_list = []
+names = [item for item in mydict if 'num_batches_tracked' not in item]
+i = 0
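+
+# Note (added comment): the PyTorch checkpoint stores each sub-block's
+# parameters in groups of five -- conv weight, BN gamma, BN beta, BN
+# running_mean, BN running_var -- so the loop below renames them by position
+# (i % 5) to the MindSpore conv1/batchnorm names. Indices 540/541 are the
+# final decoder conv, which keeps plain weight/bias names, and the swap at
+# i % 5 == 4 moves the BN moving statistics ahead of gamma/beta to match
+# MindSpore's parameter order.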
+for name in names:
+    parameter = mydict[name].numpy()
+    param_dict = {}
+
+    if i % 5 == 0:
+        name = name.replace('weight', 'conv1.weight')
+        parameter = np.expand_dims(parameter, axis=2)
+    elif i % 5 == 1:
+        name = name.replace('weight', 'batchnorm.gamma')
+    elif i % 5 == 2:
+        name = name.replace('bias', 'batchnorm.beta')
+    elif i % 5 == 3:
+        name = name.replace('running_mean', 'batchnorm.moving_mean')
+    else:
+        name = name.replace('running_var', 'batchnorm.moving_variance')
+
+    if i == 540:
+        name = name.replace('0.conv1.weight', 'weight')
+    if i == 541:
+        name = name.replace('0.bias', 'bias')
+
+    param_dict['name'] = name
+    param_dict['data'] = Tensor(parameter)
+
+    newparams_list.append(param_dict)
+    if i % 5 == 4:
+        newparams_list[i-3], newparams_list[i-2], newparams_list[i-1], newparams_list[i] = \
+            newparams_list[i-1], newparams_list[i], newparams_list[i-3], newparams_list[i-2]
+
+    i += 1
+
+save_checkpoint(newparams_list, './jasper_mindspore_10.ckpt')
+print("end")
diff --git a/research/audio/jasper/requirements.txt b/research/audio/jasper/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2854fd222c13d33d52e94b0166bcb4644e1ba41f
--- /dev/null
+++ b/research/audio/jasper/requirements.txt
@@ -0,0 +1,15 @@
+ctcdecode==1.0.2
+easydict==1.9
+inflect==5.4.0
+librosa==0.8.0
+mindspore==1.8.0
+numpy==1.20.1
+pandas==1.2.4
+python_Levenshtein==0.12.2
+PyYAML==6.0
+requests==2.25.1
+six==1.15.0
+SoundFile==0.10.3.post1
+sox==1.4.1
+tqdm==4.59.0
+Unidecode==1.3.4
diff --git a/research/audio/jasper/scripts/download_librispeech.sh b/research/audio/jasper/scripts/download_librispeech.sh
new file mode 100644
index 0000000000000000000000000000000000000000..7331a1c1a43414041b6365d2cdceb04f98af95ee
--- /dev/null
+++ b/research/audio/jasper/scripts/download_librispeech.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+
+
+DATA_SET=$1
+DATA_ROOT_DIR=$2
+DATA_DIR="${DATA_ROOT_DIR}/${DATA_SET}"
+
+if [ ! -d "$DATA_DIR" ]
+then
+    mkdir --mode 755 $DATA_DIR
+
+    python utils/download_librispeech.py \
+        utils/inference_librispeech.csv \
+        $DATA_DIR \
+        -e ${DATA_ROOT_DIR}/
+else
+    echo "Directory $DATA_DIR already exists."
+fi
diff --git a/research/audio/jasper/scripts/preprocess_librispeech.sh b/research/audio/jasper/scripts/preprocess_librispeech.sh
new file mode 100644
index 0000000000000000000000000000000000000000..592374982c3eddcce12fb66bcfdda00cec9e52f3
--- /dev/null
+++ b/research/audio/jasper/scripts/preprocess_librispeech.sh
@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+SPEEDS=$1
+[ -n "$SPEEDS" ] && SPEED_FLAG="--speed $SPEEDS"
+
+python ./utils/convert_librispeech.py \
+    --input_dir /datasets/LibriSpeech/train-clean-100 \
+    --dest_dir /datasets/LibriSpeech/train-clean-100-wav \
+    --output_json /datasets/LibriSpeech/librispeech-train-clean-100-wav.json \
+    $SPEED_FLAG
+python ./utils/convert_librispeech.py \
+    --input_dir /datasets/LibriSpeech/train-clean-360 \
+    --dest_dir /datasets/LibriSpeech/train-clean-360-wav \
+    --output_json /datasets/LibriSpeech/librispeech-train-clean-360-wav.json \
+    $SPEED_FLAG
+python ./utils/convert_librispeech.py \
+    --input_dir /datasets/LibriSpeech/train-other-500 \
+    --dest_dir /datasets/LibriSpeech/train-other-500-wav \
+    --output_json /datasets/LibriSpeech/librispeech-train-other-500-wav.json \
+    $SPEED_FLAG
+python ./utils/convert_librispeech.py \
+    --input_dir /datasets/LibriSpeech/dev-clean \
+    --dest_dir /datasets/LibriSpeech/dev-clean-wav \
+    --output_json /datasets/LibriSpeech/librispeech-dev-clean-wav.json
+python ./utils/convert_librispeech.py \
+    --input_dir /datasets/LibriSpeech/dev-other \
+    --dest_dir /datasets/LibriSpeech/dev-other-wav \
+    --output_json /datasets/LibriSpeech/librispeech-dev-other-wav.json
+python ./utils/convert_librispeech.py \
+    --input_dir /datasets/LibriSpeech/test-clean \
+    --dest_dir /datasets/LibriSpeech/test-clean-wav \
+    --output_json /datasets/LibriSpeech/librispeech-test-clean-wav.json
+python ./utils/convert_librispeech.py \
+    --input_dir /datasets/LibriSpeech/test-other \
+    --dest_dir /datasets/LibriSpeech/test-other-wav \
+    --output_json /datasets/LibriSpeech/librispeech-test-other-wav.json
diff --git a/research/audio/jasper/scripts/run_distribute_train_gpu.sh b/research/audio/jasper/scripts/run_distribute_train_gpu.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c82a142f151dcc37b9ec25d86905a2bb7c3033ed
--- /dev/null
+++ b/research/audio/jasper/scripts/run_distribute_train_gpu.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+mpirun --allow-run-as-root -n 8 --output-filename log_output --merge-stderr-to-stdout \
+python ./train.py --is_distributed --device_target 'GPU' > train_8p.log 2>&1 &
diff --git a/research/audio/jasper/scripts/run_eval_cpu.sh b/research/audio/jasper/scripts/run_eval_cpu.sh
new file mode 100644
index 0000000000000000000000000000000000000000..661ccfd80de9285e7612c2e6b0972d5236befdfc
--- /dev/null
+++ b/research/audio/jasper/scripts/run_eval_cpu.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+PATH_CHECKPOINT=$1
+python ./eval.py --pretrain_ckpt $PATH_CHECKPOINT --device_target 'CPU' > eval.log 2>&1 &
diff --git a/research/audio/jasper/scripts/run_eval_gpu.sh b/research/audio/jasper/scripts/run_eval_gpu.sh
new file mode 100644
index 0000000000000000000000000000000000000000..fc4edfe349893db8f941623b06ec048b894b9b29
--- /dev/null
+++ b/research/audio/jasper/scripts/run_eval_gpu.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+DEVICE_ID=$1
+PATH_CHECKPOINT=$2
+export CUDA_VISIBLE_DEVICES=$DEVICE_ID
+python ./eval.py --pretrain_ckpt $PATH_CHECKPOINT \
+--device_target 'GPU' > eval.log 2>&1 &
\ No newline at end of file
diff --git a/research/audio/jasper/scripts/run_standalone_train_cpu.sh b/research/audio/jasper/scripts/run_standalone_train_cpu.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ac83a8ad060e22eb8ec3a94ee15cc122278bad02
--- /dev/null
+++ b/research/audio/jasper/scripts/run_standalone_train_cpu.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+python ./train.py --device_target 'CPU' > train.log 2>&1 &
+
diff --git a/research/audio/jasper/scripts/run_standalone_train_gpu.sh b/research/audio/jasper/scripts/run_standalone_train_gpu.sh
new file mode 100644
index 0000000000000000000000000000000000000000..4782ab3513504ec09456dffc51d16ad6e317af4d
--- /dev/null
+++ b/research/audio/jasper/scripts/run_standalone_train_gpu.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+DEVICE_ID=$1
+CUDA_VISIBLE_DEVICES=$DEVICE_ID python ./train.py --device_target 'GPU' > train.log 2>&1 &
+
diff --git a/research/audio/jasper/src/__init__.py b/research/audio/jasper/src/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..602527cd720c8d268599dbaef190ba1cf1eb6f2b
--- /dev/null
+++ b/research/audio/jasper/src/__init__.py
@@ -0,0 +1,14 @@
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
diff --git a/research/audio/jasper/src/audio.py b/research/audio/jasper/src/audio.py
new file mode 100644
index 0000000000000000000000000000000000000000..453c41292ab781f6662e006621d1d8a66274e415
--- /dev/null
+++ b/research/audio/jasper/src/audio.py
@@ -0,0 +1,216 @@
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+import random
+import soundfile as sf
+
+import librosa
+
+import numpy as np
+
+import sox
+
+
+def audio_from_file(file_path, offset=0, duration=0, trim=False, target_sr=16000):
+    audio = AudioSegment(file_path, target_sr=target_sr, int_values=False,
+                         offset=offset, duration=duration, trim=trim)
+
+    samples = audio.samples
+    num_samples = samples.shape[0]
+    return (np.expand_dims(samples, 0), np.expand_dims(num_samples, 0))
+
+
+class AudioSegment:
+    """Monaural audio segment abstraction.
+
+    :param samples: Audio samples [num_samples x num_channels].
+    :type samples: ndarray.float32
+    :param sample_rate: Audio sample rate.
+    :type sample_rate: int
+    :raises TypeError: If the sample data type is not float or int.
+    """
+
+    def __init__(self, filename, target_sr=None, int_values=False, offset=0,
+                 duration=0, trim=False, trim_db=60):
+        """Create audio segment from samples.
+
+        Samples are converted to float32 internally, with ints scaled to [-1, 1].
+        Load a file supported by librosa and return it as an AudioSegment.
+        :param filename: path of file to load
+        :param target_sr: the desired sample rate
+        :param int_values: if true, load samples as 32-bit integers
+        :param offset: offset in seconds when loading audio
+        :param duration: duration in seconds when loading audio
+        :return: numpy array of samples
+        """
+        with sf.SoundFile(filename, 'r') as f:
+            dtype = 'int32' if int_values else 'float32'
+            sample_rate = f.samplerate
+            if offset > 0:
+                f.seek(int(offset * sample_rate))
+            if duration > 0:
+                samples = f.read(int(duration * sample_rate), dtype=dtype)
+            else:
+                samples = f.read(dtype=dtype)
+            samples = samples.transpose()
+
+        samples = self._convert_samples_to_float32(samples)
+
+        if target_sr is not None and target_sr != sample_rate:
+            samples = librosa.core.resample(samples, sample_rate, target_sr)
+            sample_rate = target_sr
+        if trim:
+            samples, _ = librosa.effects.trim(samples, trim_db)
+
+        self._samples = samples
+        self._sample_rate = sample_rate
+        if self._samples.ndim >= 2:
+            self._samples = np.mean(self._samples, 1)
+
+    def __eq__(self, other):
+        """Return whether two objects are equal."""
+        if type(other) is not type(self):
+            return False
+        if self._sample_rate != other._sample_rate:  # pylint: disable=W0212
+            return False
+        if self._samples.shape != other._samples.shape:  # pylint: disable=W0212
+            return False
+        if np.any(self.samples != other._samples):  # pylint: disable=W0212
+            return False
+        return True
+
+    def __ne__(self, other):
+        """Return whether two objects are unequal."""
+        return not self.__eq__(other)
+
+    def __str__(self):
+        """Return human-readable representation of segment."""
+        return ("%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, "
+                "rms=%.2fdB" % (type(self), self.num_samples, self.sample_rate,
+                                self.duration, self.rms_db))
+
+    @staticmethod
+    def _convert_samples_to_float32(samples):
+        """Convert sample type to float32.
+
+        Audio sample type is usually integer or float-point.
+        Integers will be scaled to [-1, 1] in float32.
+        """
+        float32_samples = samples.astype('float32')
+        if samples.dtype in np.sctypes['int']:
+            bits = np.iinfo(samples.dtype).bits
+            float32_samples *= (1. / 2 ** (bits - 1))
+        elif samples.dtype in np.sctypes['float']:
+            pass
+        else:
+            raise TypeError("Unsupported sample type: %s." % samples.dtype)
+        return float32_samples
+
+    @property
+    def samples(self):
+        return self._samples.copy()
+
+    @property
+    def sample_rate(self):
+        return self._sample_rate
+
+    @property
+    def num_samples(self):
+        return self._samples.shape[0]
+
+    @property
+    def duration(self):
+        return self._samples.shape[0] / float(self._sample_rate)
+
+    @property
+    def rms_db(self):
+        mean_square = np.mean(self._samples ** 2)
+        return 10 * np.log10(mean_square)
+
+    def gain_db(self, gain):
+        self._samples *= 10. ** (gain / 20.)
+
+    def pad(self, pad_size, symmetric=False):
+        """Add zero padding to the sample.
+
+        The pad size is given in number of samples. If symmetric=True,
+        `pad_size` will be added to both sides. If false, `pad_size` zeros
+        will be added only to the end.
+        """
+        self._samples = np.pad(self._samples,
+                               (pad_size if symmetric else 0, pad_size),
+                               mode='constant')
+
+    def subsegment(self, start_time=None, end_time=None):
+        """Cut the AudioSegment between given boundaries.
+
+        Note that this is an in-place transformation.
+        :param start_time: Beginning of subsegment in seconds.
+        :type start_time: float
+        :param end_time: End of subsegment in seconds.
+        :type end_time: float
+        :raise ValueError: If start_time or end_time is incorrectly set, e.g. out of bounds in time.
+        """
+        start_time = 0.0 if start_time is None else start_time
+        end_time = self.duration if end_time is None else end_time
+        if start_time < 0.0:
+            start_time = self.duration + start_time
+        if end_time < 0.0:
+            end_time = self.duration + end_time
+        if start_time < 0.0:
+            raise ValueError("The slice start position (%f s) is out of "
+                             "bounds." % start_time)
+        if end_time < 0.0:
+            raise ValueError("The slice end position (%f s) is out of bounds." %
+                             end_time)
+        if start_time > end_time:
+            raise ValueError("The slice start position (%f s) is later than "
+                             "the end position (%f s)." % (start_time, end_time))
+        if end_time > self.duration:
+            raise ValueError("The slice end position (%f s) is out of bounds "
+                             "(> %f s)" % (end_time, self.duration))
+        start_sample = int(round(start_time * self._sample_rate))
+        end_sample = int(round(end_time * self._sample_rate))
+        self._samples = self._samples[start_sample:end_sample]
+
+
+class Perturbation:
+    def __init__(self, p=0.1, rng=None):
+        self.p = p
+        self._rng = random.Random() if rng is None else rng
+
+    def maybe_apply(self, segment, sample_rate=None):
+        if self._rng.random() < self.p:
+            self(segment, sample_rate)  # pylint: disable=E1102
+
+
+class SpeedPerturbation(Perturbation):
+    def __init__(self, min_rate=0.85, max_rate=1.15, discrete=False, p=0.1, rng=None):
+        super(SpeedPerturbation, self).__init__(p, rng)
+        assert 0 < min_rate < max_rate
+        self.min_rate = min_rate
+        self.max_rate = max_rate
+        self.discrete = discrete
+
+    def __call__(self, data, sample_rate):
+        if self.discrete:
+            rate = np.random.choice([self.min_rate, None, self.max_rate])
+        else:
+            rate = self._rng.uniform(self.min_rate, self.max_rate)
+
+        if rate is not None:
+            data._samples = sox.Transformer().speed(factor=rate).build_array(  # pylint: disable=W0212
+                input_array=data._samples, sample_rate_in=sample_rate)  # pylint: disable=W0212
diff --git a/research/audio/jasper/src/callback.py b/research/audio/jasper/src/callback.py
new file mode 100644
index 0000000000000000000000000000000000000000..70b9c911de0d0caffe9edba75fbcc3214b651106
--- /dev/null
+++ b/research/audio/jasper/src/callback.py
@@ -0,0 +1,113 @@
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""
+Defined callback for jasper.
+"""
+import time
+import math
+import numpy as np
+from mindspore.train.callback import Callback
+from mindspore import Tensor
+
+
+class TimeMonitor(Callback):
+    """
+    Time monitor for calculating cost of each epoch.
+
+    Args:
+        data_size (int): step size of an epoch.
+ """ + + def __init__(self, data_size): + super(TimeMonitor, self).__init__() + self.data_size = data_size + + def epoch_begin(self, run_context): + self.epoch_time = time.time() + + def epoch_end(self, run_context): + epoch_mseconds = (time.time() - self.epoch_time) * 1000 + per_step_mseconds = epoch_mseconds / self.data_size + print("epoch time: {0}, per step time: {1}".format( + epoch_mseconds, per_step_mseconds), flush=True) + + def step_begin(self, run_context): + self.step_time = time.time() + + def step_end(self, run_context): + step_mseconds = (time.time() - self.step_time) * 1000 + print(f"step time {step_mseconds}", flush=True) + + +class Monitor(Callback): + """ + Monitor loss and time. + + Args: + lr_init (numpy array): train lr + + Returns: + None + """ + + def __init__(self, lr_init=None): + super(Monitor, self).__init__() + self.lr_init = lr_init + self.lr_init_len = len(lr_init) + + def epoch_begin(self, run_context): + self.losses = [] + self.step_now = 0 + self.step_nan = 0 + self.epoch_time = time.time() + + def epoch_end(self, run_context): + cb_params = run_context.original_args() + + epoch_mseconds = (time.time() - self.epoch_time) + per_step_mseconds = epoch_mseconds / cb_params.batch_num + print("epoch time: {:5.3f}, per step time: {:5.3f}, avg loss: {:5.3f}".format(epoch_mseconds, + per_step_mseconds, + np.mean(self.losses))) + + def step_begin(self, run_context): + self.step_time = time.time() + + def step_end(self, run_context): + """ + Args: + run_context: + + Returns: + """ + cb_params = run_context.original_args() + step_mseconds = (time.time() - self.step_time) + step_loss = cb_params.net_outputs + + if isinstance(step_loss, (tuple, list)) and isinstance(step_loss[0], Tensor): + step_loss = step_loss[0] + if isinstance(step_loss, Tensor): + step_loss = np.mean(step_loss.asnumpy()) + + if math.isnan(step_loss) is not True and math.isinf(step_loss) is not True: + + self.losses.append(step_loss) + + cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + + print("epoch: [{:3d}/{:3d}], step:[{:5d}/{:5d}], loss:[{:5.3f}/{:5.3f}], time:[{:5.3f}], lr:[{:.9f}]".format( + cb_params.cur_epoch_num - + 1, cb_params.epoch_num, cur_step_in_epoch, cb_params.batch_num, step_loss, + np.mean(self.losses), step_mseconds, self.lr_init[cb_params.cur_step_num - 1].asnumpy())) diff --git a/research/audio/jasper/src/cleaners.py b/research/audio/jasper/src/cleaners.py new file mode 100644 index 0000000000000000000000000000000000000000..e64beab067bc290856c855fa908b829ebb44d824 --- /dev/null +++ b/research/audio/jasper/src/cleaners.py @@ -0,0 +1,90 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/research/audio/jasper/src/cleaners.py b/research/audio/jasper/src/cleaners.py
new file mode 100644
index 0000000000000000000000000000000000000000..e64beab067bc290856c855fa908b829ebb44d824
--- /dev/null
+++ b/research/audio/jasper/src/cleaners.py
@@ -0,0 +1,90 @@
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+import re
+from unidecode import unidecode
+from .number import normalize_numbers
+
+# Regular expression matching whitespace:
+_whitespace_re = re.compile(r'\s+')
+
+# List of (regular expression, replacement) pairs for abbreviations:
+_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
+    ('mrs', 'misess'),
+    ('mr', 'mister'),
+    ('dr', 'doctor'),
+    ('st', 'saint'),
+    ('co', 'company'),
+    ('jr', 'junior'),
+    ('maj', 'major'),
+    ('gen', 'general'),
+    ('drs', 'doctors'),
+    ('rev', 'reverend'),
+    ('lt', 'lieutenant'),
+    ('hon', 'honorable'),
+    ('sgt', 'sergeant'),
+    ('capt', 'captain'),
+    ('esq', 'esquire'),
+    ('ltd', 'limited'),
+    ('col', 'colonel'),
+    ('ft', 'fort'),
+]]
+
+def expand_abbreviations(text):
+    for regex, replacement in _abbreviations:
+        text = re.sub(regex, replacement, text)
+    return text
+
+def expand_numbers(text):
+    return normalize_numbers(text)
+
+def lowercase(text):
+    return text.lower()
+
+def collapse_whitespace(text):
+    return re.sub(_whitespace_re, ' ', text)
+
+def convert_to_ascii(text):
+    return unidecode(text)
+
+def remove_punctuation(text, table):
+    text = text.translate(table)
+    text = re.sub(r'&', " and ", text)
+    text = re.sub(r'\+', " plus ", text)
+    return text
+
+def basic_cleaners(text):
+    '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
+    text = lowercase(text)
+    text = collapse_whitespace(text)
+    return text
+
+def transliteration_cleaners(text):
+    '''Pipeline for non-English text that transliterates to ASCII.'''
+    text = convert_to_ascii(text)
+    text = lowercase(text)
+    text = collapse_whitespace(text)
+    return text
+
+def english_cleaners(text, table=None):
+    '''Pipeline for English text, including number and abbreviation expansion.'''
+    text = convert_to_ascii(text)
+    text = lowercase(text)
+    text = expand_numbers(text)
+    text = expand_abbreviations(text)
+    if table is not None:
+        text = remove_punctuation(text, table)
+    text = collapse_whitespace(text)
+    return text
diff --git a/research/audio/jasper/src/config.py b/research/audio/jasper/src/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..05985f92953481482bdf0fc0a94319786f6ec7f8
--- /dev/null
+++ b/research/audio/jasper/src/config.py
@@ -0,0 +1,178 @@
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =========================================================================== +""" +network config setting, will be used in train.py and eval.py +""" +import inspect + +import yaml +from easydict import EasyDict as ed + +from src.model import JasperBlock, JasperDecoderForCTC, JasperEncoder + +train_config = ed({ + + + "TrainingConfig": { + "epochs": 440, + "loss_scale": 128.0 + }, + + "DataConfig": { + "Data_dir": '/data/train_datasets', + "train_manifest": ['/data/train_datasets/librispeech-train-clean-100-wav.json', + '/data/train_datasets/librispeech-train-clean-360-wav.json', + '/data/train_datasets/librispeech-train-other-500-wav.json'], + "mindrecord_format": "/data/jasper_tr{}.md", + "mindrecord_files": [f"/data/jasper_tr{i}.md" for i in range(8)], + "batch_size": 64, + "accumulation_step": 2, + "labels_path": "labels.json", + + "SpectConfig": { + "sample_rate": 16000, + "window_size": 0.02, + "window_stride": 0.01, + "window": "hamming" + }, + + "AugmentationConfig": { + "speed_volume_perturb": False, + "spec_augment": False, + "noise_dir": '', + "noise_prob": 0.4, + "noise_min": 0.0, + "noise_max": 0.5, + } + }, + + "OptimConfig": { + "learning_rate": 0.01, + "learning_anneal": 1.1, + "weight_decay": 1e-5, + "momentum": 0.9, + "eps": 1e-8, + "betas": (0.9, 0.999), + "loss_scale": 1024, + "epsilon": 0.00001 + }, + + "CheckpointConfig": { + "ckpt_file_name_prefix": 'Jasper', + "ckpt_path": './checkpoint', + "keep_checkpoint_max": 10 + } +}) + +eval_config = ed({ + + "save_output": 'librispeech_val_output', + "verbose": True, + + "DataConfig": { + + "Data_dir": '/data/inference_datasets', + + "test_manifest": ['/data/inference_datasets/librispeech-dev-clean-wav.json'], + + + "batch_size": 32, + "labels_path": "labels.json", + + "SpectConfig": { + "sample_rate": 16000, + "window_size": 0.02, + "window_stride": 0.01, + "window": "hanning" + }, + }, + "LMConfig": { + "decoder_type": "greedy", + "lm_path": './3-gram.pruned.3e-7.arpa', + "top_paths": 1, + "alpha": 1.818182, + "beta": 0, + "cutoff_top_n": 40, + "cutoff_prob": 1.0, + "beam_width": 1024, + "lm_workers": 4 + }, + +}) + + +def default_args(klass): + sig = inspect.signature(klass.__init__) + return {k: v.default for k, v in sig.parameters.items() if k != 'self'} + + +def load(fpath): + if fpath.endswith('.toml'): + raise ValueError('.toml config format has been changed to .yaml') + + cfg = yaml.safe_load(open(fpath, 'r')) + + # Reload to deep copy shallow copies, which were made with yaml anchors + yaml.Dumper.ignore_aliases = lambda *args: True + cfg = yaml.dump(cfg) + cfg = yaml.safe_load(cfg) + return cfg + + +def validate_and_fill(klass, user_conf, ignore_unk=None, optional=None): + conf = default_args(klass) + if ignore_unk is None: + ignore_unk = [] + if optional is None: + optional = [] + for k, v in user_conf.items(): + conf[k] = v + + # Keep only mandatory or optional-nonempty + conf = {k: v for k, v in conf.items() + if k not in optional or v is not inspect.Parameter.empty} + + # Validate + for k, v in conf.items(): + assert v is not inspect.Parameter.empty, \ + f'Value for {k} not specified for {klass}' + return conf + + +def encoder(conf): + """Validate config for JasperEncoder and subsequent JasperBlocks""" + + # Validate, but don't overwrite with defaults + for blk in conf['jasper']['encoder']['blocks']: + validate_and_fill(JasperBlock, blk, optional=['infilters'], + ignore_unk=['residual_dense']) + + return validate_and_fill(JasperEncoder, conf['jasper']['encoder']) + + +def decoder(conf, n_classes): + deco_kw = 
{'n_classes': n_classes, **conf['jasper']['decoder']} + return validate_and_fill(JasperDecoderForCTC, deco_kw) + + +def add_ctc_blank(sym): + return sym + ['_'] + + +cfgs = load('./src/jasper10x5dr_speca.yaml') + +symbols = add_ctc_blank(cfgs['labels']) +encoder_kw = encoder(cfgs) +decoder_kw = decoder(cfgs, n_classes=len(symbols)) diff --git a/research/audio/jasper/src/dataset.py b/research/audio/jasper/src/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..21719b70dc067f6e2c8a1ec04fb52c433738591c --- /dev/null +++ b/research/audio/jasper/src/dataset.py @@ -0,0 +1,466 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the License); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# httpwww.apache.orglicensesLICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +import math +import json +from pathlib import Path +import librosa +import numpy as np +import mindspore.dataset.engine as de + +from src.audio import AudioSegment, SpeedPerturbation +from src.text import _clean_text, punctuation_map + +TRAIN_INPUT_PAD_LENGTH = 1300 +TRAIN_LABEL_PAD_LENGTH = 360 +TEST_INPUT_PAD_LENGTH = 3500 + + +class BaseFeatures(): + """Base class for GPU accelerated audio preprocessing.""" + __constants__ = ["pad_align", "pad_to_max_duration", "max_len"] + + def __init__(self, pad_align, pad_to_max_duration, max_duration, + sample_rate, window_size, window_stride): + super(BaseFeatures, self).__init__() + + self.pad_align = pad_align + self.pad_to_max_duration = pad_to_max_duration + self.win_length = int(sample_rate * window_size) # frame size + self.hop_length = int(sample_rate * window_stride) + + # Calculate maximum sequence length (# frames) + if pad_to_max_duration: + self.max_len = 1 + math.ceil( + (max_duration * sample_rate - self.win_length) / self.hop_length + ) + + def calculate_features(self, audio, audio_lens): + return audio, audio_lens + + def __call__(self, audio, audio_lens): + dtype = audio.dtype + audio = audio + feat, feat_lens = self.calculate_features(audio, audio_lens) + feat = self.apply_padding(feat) + feat = feat.astype(dtype) + return feat, feat_lens + + def apply_padding(self, x): + if self.pad_to_max_duration: + x_size = max(x.shape[-1], self.max_len) + else: + x_size = x.shape[-1] + if self.pad_align > 0: + pad_amt = x_size % self.pad_align + else: + pad_amt = 0 + + padded_len = x_size + (self.pad_align - pad_amt if pad_amt > 0 else 0) + return np.pad(x, ((0, 0), (0, 0), (0, padded_len - x.shape[-1]))) + + +def normalize_string(s, labels, punct_map): + """Normalizes string. + + Example: + 'call me at 8:00 pm!' -> 'call me at eight zero pm' + """ + labels = set(labels) + try: + text = _clean_text(s, ["english_cleaners"], punct_map).strip() + return ''.join([tok for tok in text if all(t in labels for t in tok)]) + except ValueError: + print(f"WARNING: Normalizing failed: {s}") + return None + + +class SpecAugment(): + """Spec augment. 
refer to https://arxiv.org/abs/1904.08779 + """ + + def __init__(self, freq_masks=2, min_freq=0, max_freq=20, time_masks=2, + min_time=0, max_time=75): + super(SpecAugment, self).__init__() + assert 0 <= min_freq <= max_freq + assert 0 <= min_time <= max_time + + self.freq_masks = freq_masks + self.min_freq = min_freq + self.max_freq = max_freq + + self.time_masks = time_masks + self.min_time = min_time + self.max_time = max_time + + def run(self, x): + sh = x.shape + mask = np.ones(x.shape, dtype=np.bool) + + for idx in range(sh[0]): + for _ in range(self.freq_masks): + w = np.random.randint( + self.min_freq, self.max_freq + 1, size=(1,)).item() + f0 = np.random.randint(0, max(1, sh[1] - w), size=(1,)).item() + mask[idx, f0:f0 + w] = 0 + + for _ in range(self.time_masks): + w = np.random.randint( + self.min_time, self.max_time + 1, size=(1,)).item() + t0 = np.random.randint(0, max(1, sh[2] - w), size=(1,)).item() + mask[idx, :, t0:t0 + w] = 0 + x = x * mask + return x + + +def normalize_batch(x): + x_mean = np.zeros((x.shape[0], x.shape[1]), dtype=x.dtype) + x_std = np.zeros((x.shape[0], x.shape[1]), dtype=x.dtype) + for i in range(x.shape[0]): + x_mean[i, :] = x[i, :, :].mean(axis=1) + x_std[i, :] = x[i, :, :].std(axis=1) + # make sure x_std is not zero + x_std += 1e-5 + return (x - np.expand_dims(x_mean, 2)) / np.expand_dims(x_std, 2) + + +class FilterbankFeatures(BaseFeatures): + """ + parse audio and transcript + """ + + def __init__(self, sample_rate=16000, window_size=0.02, window_stride=0.01, + window="hann", normalize="per_feature", n_fft=512, + preemph=0.97, n_filt=64, lowfreq=0, highfreq=None, log=True, + dither=1e-5, pad_align=16, pad_to_max_duration=False, + max_duration=16.7, frame_splicing=1): + super(FilterbankFeatures, self).__init__( + pad_align=pad_align, pad_to_max_duration=pad_to_max_duration, + max_duration=max_duration, sample_rate=sample_rate, + window_size=window_size, window_stride=window_stride) + + self.n_fft = n_fft or 2 ** math.ceil(math.log2(self.win_length)) + self.sample_rate = sample_rate + self.window = window + self.normalize = normalize + self.log = log + self.dither = dither + self.frame_splicing = frame_splicing + self.n_filt = n_filt + self.preemph = preemph + highfreq = highfreq or sample_rate / 2 + self.lowfreq = lowfreq + self.highfreq = highfreq + + def get_seq_len(self, seq_len): + return np.ceil(seq_len / self.hop_length) + + def calculate_features(self, x): + dtype = x.dtype + seq_len = self.get_seq_len(np.asarray([x.shape[0]])) + # dither + if self.dither > 0: + x += self.dither * np.random.randn(x.shape[0]) + + # do preemphasis + if self.preemph is not None: + x = np.concatenate( + (np.expand_dims(x[0], 0), x[1:] - self.preemph * x[:-1]), axis=0) + + tmp = librosa.stft(y=x, n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, + window=self.window, pad_mode='reflect', dtype=np.complex64) + + rel = np.real(tmp) + img = np.imag(tmp) + + x = np.power(rel, 2) + np.power(img, 2) + + filterbank = np.array( + librosa.filters.mel(sr=self.sample_rate, n_fft=self.n_fft, n_mels=self.n_filt, fmin=self.lowfreq, + fmax=self.highfreq), dtype=np.float32) + filterbanks = np.expand_dims(filterbank, 0) + x = np.matmul(filterbanks, x) + # frame splicing if required + if self.frame_splicing > 1: + raise ValueError('Frame splicing not supported') + + x = np.log(x + 1e-20) + x = normalize_batch(x) + max_len = x.shape[-1] + mask = np.arange(max_len, dtype=np.int32) + mask = np.expand_dims(mask, 0) + mask = mask < np.expand_dims(seq_len, 1) + + x 
= x * np.expand_dims(mask, 1) + x = self.apply_padding(x) + return x.astype(dtype) + + +class ASRDataset(): + """ + create ASRDataset + Args: + data_dir: Dataset path + manifest_filepath (str): manifest_file path. + labels (list): List containing all the possible characters to map to + normalize: Apply standard mean and deviation Normalization to audio tensor + batch_size (int): Dataset batch size (default=32) + """ + + def __init__(self, data_dir, manifest_fpaths, labels, batch_size=64, train_mode=True, + sample_rate=16000, min_duration=0.1, max_duration=16.7, + pad_to_max_duration=False, max_utts=0, normalize_transcripts=True, + sort_by_duration=False, trim_silence=True, + ignore_offline_speed_perturbation=True): + self.data_dir = data_dir + self.labels = labels + self.labels_map = {labels[i]: i for i in range(len(labels))} + self.punctuation_map = punctuation_map(labels) + self.blank_index = len(labels) - 1 + self.pad_to_max_duration = pad_to_max_duration + self.sort_by_duration = sort_by_duration + self.max_utts = max_utts + self.normalize_transcripts = normalize_transcripts + self.ignore_offline_speed_perturbation = ignore_offline_speed_perturbation + self.min_duration = min_duration + self.max_duration = max_duration + if not train_mode: + self.max_duration = float("inf") + self.ignore_offline_speed_perturbation = False + else: + batch_size = 1 + self.trim_silence = trim_silence + self.sample_rate = sample_rate + perturbations = [] + perturbations.append(SpeedPerturbation()) + self.perturbations = perturbations + self.max_duration = max_duration + + self.samples = [] + self.duration = 0.0 + self.duration_filtered = 0.0 + + for fpath in manifest_fpaths: + self._load_json_manifest(fpath) + if sort_by_duration: + self.samples = sorted(self.samples, key=lambda s: s['duration']) + + ids = self.samples + self.bins = [ids[i:i + batch_size] + for i in range(0, len(ids), batch_size)] + if len(ids) % batch_size != 0: + self.bins = self.bins[:-1] + self.bins.append(ids[-batch_size:]) + self.size = len(self.bins) + self.batch_size = batch_size + self.train_feat_proc = FilterbankFeatures() + self.mask_length = 0 + self.train_mode = train_mode + + def __getitem__(self, index): + batch_idx = self.bins[index] + if self.train_mode: + s = batch_idx[0] + rn_indx = np.random.randint(len(s['audio_filepath'])) + duration = s['audio_duration'][rn_indx] if 'audio_duration' in s else 0 + offset = s.get('offset', 0) + segment = AudioSegment( + s['audio_filepath'][rn_indx], target_sr=self.sample_rate, + offset=offset, duration=duration, trim=self.trim_silence) + for p in self.perturbations: + p.maybe_apply(segment, self.sample_rate) + segment = segment.samples + inputs = self.train_feat_proc.calculate_features(segment) + transcript = np.array(s["transcript"], np.int32) + return np.array(inputs, np.float32), transcript + batch_spect = [] + batch_script = [] + for data in batch_idx: + s = data + rn_indx = np.random.randint(len(s['audio_filepath'])) + duration = s['audio_duration'][rn_indx] if 'audio_duration' in s else 0 + offset = s.get('offset', 0) + segment = AudioSegment( + s['audio_filepath'][rn_indx], target_sr=self.sample_rate, + offset=offset, duration=duration, trim=self.trim_silence) + segment = segment.samples + inputs = self.train_feat_proc.calculate_features(segment) + inputs = np.squeeze(inputs, 0) + batch_spect.append(inputs) + batch_script.append(np.array(s["transcript"], np.int32)) + batch_size = len(batch_idx) + input_length = np.zeros(batch_size, np.float32) + target_indices = [] + frez = 
inputs.shape[0] + inputs = np.zeros( + (batch_size, frez, TEST_INPUT_PAD_LENGTH), dtype=np.float32) + targets = [] + for k, spect_, scripts_ in zip(range(batch_size), batch_spect, batch_script): + seq_length = np.shape(spect_)[1] + input_length[k] = seq_length + targets.extend(scripts_) + for m in range(len(scripts_)): + target_indices.append([k, m]) + inputs[k, :, 0:seq_length] = spect_ + return inputs, input_length, np.array(target_indices, dtype=np.int64), np.array(targets, dtype=np.int32) + + def __len__(self): + return self.size + + def _load_json_manifest(self, fpath): + for s in json.load(open(fpath, "r", encoding="utf-8")): + + if self.pad_to_max_duration and not self.ignore_offline_speed_perturbation: + # require all perturbed samples to be < self.max_duration + s_max_duration = max(f['duration'] for f in s['files']) + else: + # otherwise we allow perturbances to be > self.max_duration + s_max_duration = s['original_duration'] + + s['duration'] = s.pop('original_duration') + if not self.min_duration <= s_max_duration <= self.max_duration: + self.duration_filtered += s['duration'] + continue + + # Prune and normalize according to transcript + tr = (s.get('transcript', None) + or self.load_transcript(s['text_filepath'])) + + if not isinstance(tr, str): + print(f'WARNING: Skipped sample (transcript not a str): {tr}.') + self.duration_filtered += s['duration'] + continue + + if self.normalize_transcripts: + tr = normalize_string(tr, self.labels, self.punctuation_map) + s["transcript"] = self.to_vocab_inds(tr) + + files = s.pop('files') + if self.ignore_offline_speed_perturbation: + files = [f for f in files if f['speed'] == 1.0] + + s['audio_duration'] = [f['duration'] for f in files] + s['audio_filepath'] = [str(Path(self.data_dir, f['fname'])) + for f in files] + self.samples.append(s) + self.duration += s['duration'] + + if self.max_utts > 0 and len(self.samples) >= self.max_utts: + print( + f'Reached max_utts={self.max_utts}. Finished parsing {fpath}.') + break + + def load_transcript(self, transcript_path): + with open(transcript_path, 'r', encoding="utf-8") as transcript_file: + transcript = transcript_file.read().replace('\n', '') + return transcript + + def to_vocab_inds(self, transcript): + chars = [self.labels_map.get(x, self.blank_index) + for x in list(transcript)] + transcript = list(filter(lambda x: x != self.blank_index, chars)) + return transcript + + +def preprocess(batch_spect, batch_script, blank_index): + specAugment = SpecAugment() + x = specAugment.run(batch_spect) + batch_spect = np.squeeze(x, 0) + frez = batch_spect.shape[0] + # 1501 is the max length in train dataset(LibriSpeech). + # The length is fixed to this value because Mindspore does not support dynamic shape currently + inputs = np.zeros((frez, TRAIN_INPUT_PAD_LENGTH), dtype=np.float32) + # The target length is fixed to this value because Mindspore does not support dynamic shape currently + # 350 may be greater than the max length of labels in train dataset(LibriSpeech). 
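+    # In this file TRAIN_INPUT_PAD_LENGTH is 1300 and TRAIN_LABEL_PAD_LENGTH is 360:
+    # labels shorter than TRAIN_LABEL_PAD_LENGTH are padded with the blank index below,
+    # spectrograms shorter than TRAIN_INPUT_PAD_LENGTH are zero-padded, and longer
+    # ones are randomly cropped to fit.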
+    targets = np.ones((TRAIN_LABEL_PAD_LENGTH), dtype=np.int32) * blank_index
+    seq_length = batch_spect.shape[1]
+    script_length = batch_script.shape[0]
+    targets[:script_length] = batch_script
+    if seq_length <= TRAIN_INPUT_PAD_LENGTH:
+        input_length = seq_length
+        inputs[:, :seq_length] = batch_spect
+    else:
+        maxstart = seq_length - TRAIN_INPUT_PAD_LENGTH
+        start = np.random.randint(maxstart)
+        input_length = TRAIN_INPUT_PAD_LENGTH
+        inputs[:, :] = batch_spect[:, start:start + TRAIN_INPUT_PAD_LENGTH]
+    return inputs, np.array(input_length, dtype=np.float32), np.array(targets, dtype=np.int32)
+
+
+def postprocess(inputs, input_length, targets):
+    batch_size = inputs.shape[0]
+    target_indices = []
+    for b in range(batch_size):
+        for m in range(TRAIN_LABEL_PAD_LENGTH):
+            target_indices.append([b, m])
+    targets = np.reshape(targets, (-1,))
+    return inputs, input_length, np.array(target_indices, dtype=np.int64), targets
+
+
+def create_train_dataset(mindrecord_files, labels, batch_size, train_mode, rank=None, group_size=None):
+    """
+    create train dataset
+
+    Args:
+        mindrecord_files (list): A list of mindrecord files.
+        labels (list): List containing all the possible characters to map to.
+        batch_size (int): Dataset batch size.
+        train_mode (bool): Whether the dataset is used for training or evaluation (default=True).
+        rank (int): The shard ID within num_shards (default=None).
+        group_size (int): Number of shards that the dataset should be divided into (default=None).
+
+    Returns:
+        Dataset.
+    """
+
+    output_columns = ["batch_spect", "batch_script"]
+    ds = de.MindDataset(mindrecord_files, columns_list=output_columns, num_shards=group_size, shard_id=rank,
+                        num_parallel_workers=4, shuffle=train_mode)
+
+    compose_map_func = (lambda batch_spect, batch_script: preprocess(
+        batch_spect, batch_script, len(labels) - 1))
+    ds = ds.map(operations=compose_map_func, input_columns=["batch_spect", "batch_script"],
+                output_columns=["inputs", "input_length", "targets"],
+                column_order=["inputs", "input_length", "targets"],
+                num_parallel_workers=8)
+    ds = ds.batch(batch_size, drop_remainder=True)
+    ds = ds.map(operations=postprocess, input_columns=["inputs", "input_length", "targets"],
+                output_columns=["inputs", "input_length",
+                                "target_indices", "targets"],
+                column_order=["inputs", "input_length", "target_indices", "targets"])
+    return ds
+
+
+def create_eval_dataset(data_dir, manifest_filepath, labels, batch_size, train_mode):
+    """
+    create eval dataset
+
+    Args:
+        data_dir (str): Dataset path.
+        manifest_filepath (str): Manifest file path.
+        labels (list): List containing all the possible characters to map to.
+        batch_size (int): Dataset batch size.
+        train_mode (bool): Whether the dataset is used for training or evaluation (default=True).
+
+    Returns:
+        Dataset.
+ """ + dataset = ASRDataset(data_dir=data_dir, manifest_fpaths=manifest_filepath, labels=labels, + batch_size=batch_size, train_mode=train_mode) + ds = de.GeneratorDataset( + dataset, ["inputs", "input_length", "target_indices", "targets"]) + return ds diff --git a/research/audio/jasper/src/decoder.py b/research/audio/jasper/src/decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..541095512293ec71a94a1624d786532b8434d7db --- /dev/null +++ b/research/audio/jasper/src/decoder.py @@ -0,0 +1,129 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +from six.moves import xrange +import Levenshtein as Lev +import numpy as np + + +class Decoder(): + """ + Basic decoder class from which all other decoders inherit. Implements several + helper functions. Subclasses should implement the decode() method. + Arguments: + labels (list): mapping from integers to characters. + """ + + def __init__(self, labels, blank_index=0): + self.labels = labels + self.int_to_char = {i: c for (i, c) in enumerate(labels)} + self.blank_index = blank_index + # To prevent errors in decode, we add an out of bounds index for the space + space_index = len(labels) + if ' ' in labels: + space_index = labels.index(' ') + self.space_index = space_index + + def wer(self, s1, s2): + """ + Computes the Word Error Rate, defined as the edit distance between the + two provided sentences after tokenizing to words. + Arguments: + s1 (string): space-separated sentence + s2 (string): space-separated sentence + """ + + # build mapping of words to integers + b = set(s1.split() + s2.split()) + word2char = dict(zip(b, range(len(b)))) + + # map the words to a char array (Levenshtein packages only accepts + # strings) + w1 = [chr(word2char[w]) for w in s1.split()] + w2 = [chr(word2char[w]) for w in s2.split()] + + return Lev.distance(''.join(w1), ''.join(w2)) + + def cer(self, s1, s2): + """ + Computes the Character Error Rate, defined as the edit distance. 
+        Arguments:
+            s1 (string): reference sentence (spaces are removed before comparison)
+            s2 (string): hypothesis sentence
+        """
+        s1, s2 = s1.replace(' ', ''), s2.replace(' ', '')
+        return Lev.distance(s1, s2)
+
+    def decode(self, probs, sizes=None):
+        """
+        Given a matrix of character probabilities, returns the decoder's
+        best guess of the transcription.
+        Arguments:
+            probs: Tensor of character probabilities, where probs[c,t]
+                is the probability of character c at time t
+            sizes(optional): Size of each sequence in the mini-batch
+        Returns:
+            string: sequence of the model's best guess for the transcription
+        """
+        raise NotImplementedError
+
+
+class GreedyDecoder(Decoder):
+    def convert_to_strings(self,
+                           sequences,
+                           sizes=None,
+                           remove_repetitions=False,
+                           return_offsets=False):
+        """Given a list of numeric sequences, returns the corresponding strings"""
+        strings = []
+        offsets = [] if return_offsets else None
+        for x in xrange(len(sequences)):
+            seq_len = sizes[x] if sizes is not None else len(sequences[x])
+            string, string_offsets = self.process_string(
+                sequences[x], seq_len, remove_repetitions)
+            strings.append([string])  # We only return one path
+            if return_offsets:
+                offsets.append([string_offsets])
+        if return_offsets:
+            return strings, offsets
+        return strings
+
+    def process_string(self, sequence, size, remove_repetitions=False):
+        """
+        process string
+        """
+        string = ''
+        offsets = []
+        for i in range(size):
+            char = self.int_to_char[sequence[i].item()]
+            if char != self.int_to_char[self.blank_index]:
+                if remove_repetitions and i != 0 and char == self.int_to_char[sequence[i - 1].item()]:
+                    pass
+                elif char == self.labels[self.space_index]:
+                    string += ' '
+                    offsets.append(i)
+                else:
+                    string = string + char
+                    offsets.append(i)
+        return string, offsets
+
+    def decode(self, probs, sizes=None):
+        probs = probs.asnumpy()
+        sizes = sizes.asnumpy()
+
+        max_probs = np.argmax(probs, axis=-1)
+        strings, offsets = self.convert_to_strings(
+            max_probs, sizes, remove_repetitions=True, return_offsets=True)
+        return strings, offsets
diff --git a/research/audio/jasper/src/eval_callback.py b/research/audio/jasper/src/eval_callback.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c796c373b8ca176a800c97e854ce0e6bdcb048f
--- /dev/null
+++ b/research/audio/jasper/src/eval_callback.py
@@ -0,0 +1,64 @@
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
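`wer` and `cer` above return raw edit distances; evaluation code divides them by the reference length to get a rate. An illustrative sketch (the `labels` list here is a toy stand-in for `labels.json`):

```python
from src.decoder import GreedyDecoder

labels = [" ", "a", "b", "c", "_"]  # toy label set; '_' is the CTC blank
decoder = GreedyDecoder(labels, blank_index=len(labels) - 1)

ref, hyp = "a b c", "a b b"
wer = decoder.wer(ref, hyp) / len(ref.split())           # 1 wrong word / 3 words
cer = decoder.cer(ref, hyp) / len(ref.replace(' ', ''))  # 1 wrong char / 3 chars
```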
+# ============================================================================ +"""CallBack of jasper""" + +import os +import logging +from mindspore import save_checkpoint +from mindspore.train.callback import Callback + + +class SaveCallback(Callback): + """ + EvalCallback body + """ + + def __init__(self, path): + + super(SaveCallback, self).__init__() + self.logger = logging.getLogger(__name__) + self.init_logger() + self.interval = 10 + self.store_start_epoch = 10 + self.path = path + + def epoch_end(self, run_context): + """ + select ckpt after some epoch + """ + cb_params = run_context.original_args() + cur_epoch = cb_params.cur_epoch_num + + if cur_epoch >= self.store_start_epoch and (cur_epoch - self.store_start_epoch) % self.interval == 0: + message = '------------Epoch {} :start eval------------'.format( + cur_epoch) + self.logger.info(message) + if not os.path.exists(self.path): + os.makedirs(self.path) + filename = os.path.join( + self.path, 'jasper' + '_' + str(cur_epoch) + '.ckpt') + save_checkpoint(save_obj=cb_params.train_network, + ckpt_file_name=filename) + message = '------------Epoch {} :training ckpt saved------------'.format( + cur_epoch) + self.logger.info(message) + + def init_logger(self): + self.logger.setLevel(level=logging.INFO) + handler = logging.FileHandler('eval_callback.log') + formatter = logging.Formatter( + '%(asctime)s - %(name)s - %(levelname)s - %(message)s') + handler.setFormatter(formatter) + self.logger.addHandler(handler) diff --git a/research/audio/jasper/src/greedydecoder.py b/research/audio/jasper/src/greedydecoder.py new file mode 100644 index 0000000000000000000000000000000000000000..3393d97898d93c34b3f58fc74733c0da851fa559 --- /dev/null +++ b/research/audio/jasper/src/greedydecoder.py @@ -0,0 +1,52 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
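`SaveCallback` above starts saving at epoch 10 and then every 10th epoch, writing `jasper_<epoch>.ckpt` under the given path. Hooking it into the training callbacks is one line:

```python
from src.eval_callback import SaveCallback

# callback_list as assembled in train.py
callback_list.append(SaveCallback('./checkpoints'))
```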
+# ============================================================================ +""" +modify GreedyDecoder to adapt to MindSpore +""" + +import numpy as np +from src.decoder import GreedyDecoder + +class MSGreedyDecoder(GreedyDecoder): + """ + GreedyDecoder used for MindSpore + """ + + def process_string(self, sequence, size, remove_repetitions=False): + """ + process string + """ + string = '' + offsets = [] + for i in range(size): + char = self.int_to_char[sequence[i].item()] + if char != self.int_to_char[self.blank_index]: + if remove_repetitions and i != 0 and char == self.int_to_char[sequence[i - 1].item()]: + pass + elif char == self.labels[self.space_index]: + string += ' ' + offsets.append(i) + else: + string = string + char + offsets.append(i) + return string, offsets + + def decode(self, probs, sizes=None): + probs = probs.asnumpy() + sizes = sizes.asnumpy() + + max_probs = np.argmax(probs, axis=-1) + strings, offsets = self.convert_to_strings(max_probs, sizes, remove_repetitions=True, return_offsets=True) + return strings, offsets diff --git a/research/audio/jasper/src/jasper10x5dr_speca.yaml b/research/audio/jasper/src/jasper10x5dr_speca.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d0b3563f9919c325cc6c30b71b06bfeafbb19981 --- /dev/null +++ b/research/audio/jasper/src/jasper10x5dr_speca.yaml @@ -0,0 +1,141 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
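`MSGreedyDecoder.decode` takes the batch-major probabilities produced by `PredictWithSoftmax` (shape `(batch, time, n_classes)`) together with the per-utterance output lengths, both as MindSpore tensors. A sketch, assuming `probs` and `output_sizes` come from a forward pass:

```python
from src.config import symbols
from src.greedydecoder import MSGreedyDecoder

decoder = MSGreedyDecoder(labels=symbols, blank_index=len(symbols) - 1)
transcripts, offsets = decoder.decode(probs, output_sizes)
print(transcripts[0][0])  # best (and only) path for the first utterance
```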
+# ============================================================================ + + +name: "Jasper" +labels: [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", + "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"] + +input_val: + audio_dataset: &val_dataset + sample_rate: &sample_rate 16000 + trim_silence: true + normalize_transcripts: true + + filterbank_features: &val_features + normalize: per_feature + sample_rate: *sample_rate + window_size: 0.02 + window_stride: 0.01 + window: hann + n_filt: &n_filt 64 + n_fft: 512 + frame_splicing: &frame_splicing 1 + dither: 0.00001 + pad_align: 16 + +# For training we keep samples < 16.7s and apply augmentation +input_train: + audio_dataset: + <<: *val_dataset + max_duration: 16.7 + ignore_offline_speed_perturbation: true + + filterbank_features: + <<: *val_features + max_duration: 16.7 + + spec_augment: + freq_masks: 2 + max_freq: 20 + time_masks: 2 + max_time: 75 + +jasper: + encoder: + weight_init: xavier_uniform + in_feats: *n_filt + frame_splicing: *frame_splicing + activation: relu + use_conv_masks: true + blocks: + - &Conv1 + filters: 256 + repeat: 1 + kernel_size: 11 + stride: 2 + dilation: 1 + dropout: 0.2 + residual: false + - &B1 + filters: 256 + repeat: 5 + kernel_size: 11 + stride: 1 + dilation: 1 + dropout: 0.2 + residual: true + residual_dense: true + - *B1 + - &B2 + filters: 384 + repeat: 5 + kernel_size: 13 + stride: 1 + dilation: 1 + dropout: 0.2 + residual: true + residual_dense: true + - *B2 + - &B3 + filters: 512 + repeat: 5 + kernel_size: 17 + stride: 1 + dilation: 1 + dropout: 0.2 + residual: true + residual_dense: true + - *B3 + - &B4 + filters: 640 + repeat: 5 + kernel_size: 21 + stride: 1 + dilation: 1 + dropout: 0.3 + residual: true + residual_dense: true + - *B4 + - &B5 + filters: 768 + repeat: 5 + kernel_size: 25 + stride: 1 + dilation: 1 + dropout: 0.3 + residual: true + residual_dense: true + - *B5 + - &Conv2 + filters: 896 + repeat: 1 + kernel_size: 29 + stride: 1 + dilation: 2 + dropout: 0.4 + residual: false + - &Conv3 + filters: &enc_feats 1024 + repeat: 1 + kernel_size: 1 + stride: 1 + dilation: 1 + dropout: 0.4 + residual: false + + decoder: + in_feats: *enc_feats + init: xavier_uniform diff --git a/research/audio/jasper/src/lr_generator.py b/research/audio/jasper/src/lr_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..878b41ae1ee7b18786dcd6efd3bd0d21077105ee --- /dev/null +++ b/research/audio/jasper/src/lr_generator.py @@ -0,0 +1,51 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
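This YAML is what `src/config.py` parses at import time; the anchors (`&B1`, `*B1`, ...) instantiate each of the five block types twice, giving the 10x5 layout (B=10 blocks of R=5 sub-blocks) bracketed by the standalone `Conv1`-`Conv3` blocks. Loading it by hand:

```python
from src.config import load, encoder, decoder, add_ctc_blank

cfg = load('./src/jasper10x5dr_speca.yaml')
symbols = add_ctc_blank(cfg['labels'])             # 28 label characters + CTC blank '_'
encoder_kw = encoder(cfg)                          # validated kwargs for JasperEncoder
decoder_kw = decoder(cfg, n_classes=len(symbols))  # validated kwargs for JasperDecoderForCTC
```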
+# ============================================================================ + +"""learning rate generator""" +import numpy as np + + +def get_lr(lr_init, total_epochs, steps_per_epoch): + """ + generate learning rate array + + Args: + lr_init(float): init learning rate + total_epochs(int): total epoch of training + steps_per_epoch(int): steps of one epoch + + Returns: + np.array, learning rate array + """ + warmup_epoch = 2 + hold_epoch = 140 + warmup_step = warmup_epoch * steps_per_epoch + hold_step = hold_epoch * steps_per_epoch + total_step = total_epochs * steps_per_epoch + + lr_each_step = [] + for i in range(total_step): + if i < warmup_step: + a = (i+1)/(warmup_step+1) + elif i < warmup_step + hold_step: + a = 1.0 + else: + epoch = int(i / steps_per_epoch) + 1 + a = 0.981 ** (epoch - hold_epoch - warmup_epoch) + lr = max(a * lr_init, 0.00001) + lr_each_step.append(lr) + + learning_rate = np.array(lr_each_step).astype(np.float32) + return learning_rate diff --git a/research/audio/jasper/src/model.py b/research/audio/jasper/src/model.py new file mode 100644 index 0000000000000000000000000000000000000000..2767c968f35e01d2d84de8aec7443cb9dd6cbf37 --- /dev/null +++ b/research/audio/jasper/src/model.py @@ -0,0 +1,389 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
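The schedule above is linear warm-up (2 epochs), a long hold (140 epochs), then per-epoch exponential decay floored at 1e-5. For example:

```python
from src.lr_generator import get_lr

# steps_per_epoch=100 is illustrative
lr = get_lr(lr_init=0.01, total_epochs=440, steps_per_epoch=100)
print(lr[:200].max())  # ramps up towards 0.01 over the 2 warm-up epochs
print(lr[14200])       # first decayed epoch: 0.01 * 0.981
```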
+# ============================================================================ + + +import mindspore +import mindspore.nn as nn +import mindspore.ops as P +from mindspore.common import initializer as ini +from mindspore.boost import GradientAccumulation +from mindspore.ops import functional as F +from mindspore.common import RowTensor +import mindspore.common.dtype as mstype +import mindspore.numpy as np +import numpy as np + +activations = { + "relu": nn.ReLU, + "elu": nn.ELU, +} + +TRAIN_INPUT_PAD_LENGTH = 1300 +TRAIN_LABEL_PAD_LENGTH = 350 +TEST_INPUT_PAD_LENGTH = 3500 + +_grad_scale = P.composite.MultitypeFuncGraph("grad_scale") +reciprocal = P.Reciprocal() + + +@_grad_scale.register("Tensor", "Tensor") +def tensor_grad_scale(scale, grad): + return grad * F.cast(reciprocal(scale), F.dtype(grad)) + + +@_grad_scale.register("Tensor", "RowTensor") +def tensor_grad_scale_row_tensor(scale, grad): + return RowTensor(grad.indices, + grad.values * F.cast(reciprocal(scale), + F.dtype(grad.values)), + grad.dense_shape) + + +def get_same_padding(kernel_size, stride, dilation): + if stride > 1 and dilation > 1: + raise ValueError("Only stride OR dilation may be greater than 1") + return (kernel_size // 2) * dilation + + +class MaskedConv1d(nn.Cell): + """1D convolution with sequence masking + """ + + def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, group=1, + padding=0, pad_mode='pad', weight_init='xavier_uniform', + has_bias=False, masked=True): + super(MaskedConv1d, self).__init__() + + self.conv1 = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, + kernel_size=kernel_size, stride=stride, + padding=padding, pad_mode=pad_mode, weight_init=weight_init, + dilation=dilation, group=group, has_bias=has_bias) + + self.padding = padding + self.dilation = dilation + self.kernel_size = kernel_size + self.stride = stride + self.masked = masked + self.indices_max = mindspore.Tensor(np.arange(TRAIN_INPUT_PAD_LENGTH)) + self.expand_dims = mindspore.ops.ExpandDims() + self.unsqueeze = mindspore.ops.ExpandDims() + + def get_seq_len(self, lens): + return ((lens + 2 * self.padding - self.dilation + * (self.kernel_size - 1) - 1) // self.stride + 1) + + def construct(self, x, x_lens): + + if self.masked: + x_shape = P.shape(x) + max_length = x_shape[2] + indices = self.indices_max[:max_length] + indices = self.expand_dims(indices, 0) + mask = indices < self.unsqueeze(x_lens, 1) + x = x * self.unsqueeze(mask, 1) + x_lens = self.get_seq_len(x_lens) + x = self.conv1(x) + + return x, x_lens + + +class Mbatchnorm(nn.Cell): + def __init__(self, num_features, eps, momentum): + super(Mbatchnorm, self).__init__() + + self.batchnorm = nn.BatchNorm2d( + num_features=num_features, eps=eps, momentum=momentum) + + def construct(self, x): + shape = P.shape(x) + x = x.reshape((shape[0], shape[1], shape[2], -1)) + x = self.batchnorm(x) + out = x.reshape((shape[0], shape[1], shape[2])) + + return out + + +class JasperBlock(nn.Cell): + __constants__ = ["use_conv_masks"] + + """Jasper Block. 
See https://arxiv.org/pdf/1904.03288.pdf + """ + + def __init__(self, infilters, filters, repeat, kernel_size, stride, + dilation, pad_mode='pad', dropout=0.2, activation=None, + residual=True, residual_panes=None, use_conv_masks=False): + super(JasperBlock, self).__init__() + if residual_panes is None: + residual_panes = [] + padding_val = get_same_padding(kernel_size, stride, dilation) + self.use_conv_masks = use_conv_masks + self.conv = nn.CellList() + for i in range(repeat): + self.conv.extend(self._conv_bn(infilters if i == 0 else filters, + filters, + kernel_size=kernel_size, + stride=stride, + dilation=dilation, + padding=padding_val, + pad_mode=pad_mode)) + + if i < repeat - 1: + self.conv.extend(self._act_dropout(dropout, activation)) + + self.res = nn.CellList() if residual else None + res_panes = residual_panes.copy() + self.dense_residual = residual + if residual: + if residual_panes is None: + res_panes = [infilters] + self.dense_residual = False + for ip in res_panes: + self.res.append(nn.CellList( + self._conv_bn(ip, filters, kernel_size=1))) + self.out = nn.SequentialCell(*self._act_dropout(dropout, activation)) + + def _conv_bn(self, in_channels, out_channels, **kw): + return [MaskedConv1d(in_channels, out_channels, + masked=self.use_conv_masks, **kw), + Mbatchnorm(num_features=out_channels, eps=1e-3, momentum=0.9)] + + def _act_dropout(self, dropout=0.2, activation=None): + return [activation, + nn.Dropout(keep_prob=1-dropout)] + + def construct(self, xs, xs_lens=None): + if not self.use_conv_masks: + xs_lens = 0 + out = xs[-1] + lens = xs_lens + for i, l in enumerate(self.conv): + if i % 4 == 0: + out, lens = l(out, lens) + else: + out = l(out) + # residuals + if self.res is not None: + for i, layer in enumerate(self.res): + res_out = xs[i] + for j, res_layer in enumerate(layer): + if j == 0: + res_out, lens = res_layer(res_out, xs_lens) + else: + res_out = res_layer(res_out) + out += res_out + # output + out = self.out(out) + if self.res is not None and self.dense_residual: + out = xs + [out] + else: + out = [out] + if self.use_conv_masks: + return out, lens + return out, None + + +class JasperEncoder(nn.Cell): + __constants__ = ["use_conv_masks"] + + def __init__(self, in_feats=10, frame_splicing=1, activation="relu", + weight_init='xavier_uniform', use_conv_masks=False, blocks=None): + super(JasperEncoder, self).__init__() + if blocks is None: + blocks = [] + self.use_conv_masks = use_conv_masks + self.layers = nn.CellList() + in_feats *= frame_splicing + all_residual_panes = [] + for _, blk in enumerate(blocks): + blk['activation'] = activations[activation]() + has_residual_dense = blk.pop('residual_dense', False) + if has_residual_dense: + all_residual_panes += [in_feats] + blk['residual_panes'] = all_residual_panes + else: + blk['residual_panes'] = [] + self.layers.append( + JasperBlock(infilters=in_feats, use_conv_masks=use_conv_masks, **blk)) + in_feats = blk['filters'] + + def construct(self, x, x_lens=None): + out, out_lens = [x], x_lens + for l in self.layers: + out, out_lens = l(out, out_lens) + return out, out_lens + + +class JasperDecoderForCTC(nn.Cell): + def __init__(self, in_feats=10, n_classes=3, init='xavier_uniform'): + super(JasperDecoderForCTC, self).__init__() + + self.layers = nn.Conv1d(in_channels=in_feats, out_channels=n_classes, + kernel_size=1, has_bias=True, weight_init='xavier_uniform') + + self.transpose = P.Transpose() + self.logsoftmax = nn.LogSoftmax() + + def construct(self, enc_out): + out = self.layers(enc_out[-1]) + out = 
self.transpose(out, (0, 2, 1)) + out_2d = mindspore.ops.reshape(out, (-1, out.shape[2])) + out_2d = self.logsoftmax(out_2d) + out = self.transpose(mindspore.ops.reshape( + out_2d, out.shape), (1, 0, 2)) + return out + + +class GreedyCTCDecoder(nn.Cell): + + def __init__(self): + super().__init__() + self.cast = mindspore.ops.Cast() + self.fill = mindspore.ops.Fill() + self.select = mindspore.ops.Select() + self.argmax = np.argmax() + + def construct(self, log_probs, log_prob_lens=None): + + if log_prob_lens is not None: + max_len = log_probs.size(1) + idxs = np.arange(max_len, dtype=log_prob_lens.dtype) + idxs = mindspore.Tensor(idxs) + mask = np.expand_dims(idxs, 0) >= np.expand_dims(log_prob_lens, 1) + mask = self.cast(mask, mstype.bool_) + masked_value = self.fill(mindspore.float16, log_probs.shape, 0) + log_probs = self.select(mask, masked_value, log_probs) + out = self.argmax(log_probs, axis=-1) + return out.astype("int") + + +class Jasper(nn.Cell): + def __init__(self, encoder_kw=None, decoder_kw=None): + super(Jasper, self).__init__() + if encoder_kw is None: + encoder_kw = {} + if decoder_kw is None: + decoder_kw = {} + self.encoder = JasperEncoder(**encoder_kw) + self.decoder = JasperDecoderForCTC(**decoder_kw) + + def construct(self, x, x_lens=None): + enc, enc_lens = self.encoder(x, x_lens) + out = self.decoder(enc) + return out, enc_lens + + +class NetWithLossClass(nn.Cell): + """ + NetWithLossClass definition + """ + + def __init__(self, network): + super(NetWithLossClass, self).__init__(auto_prefix=False) + self.loss = P.CTCLoss(ctc_merge_repeated=True) + self.network = network + self.ReduceMean_false = P.ReduceMean(keep_dims=False) + self.squeeze_op = P.Squeeze(0) + self.cast_op = P.Cast() + + def construct(self, inputs, input_length, target_indices, label_values): + predict, output_length = self.network(inputs, input_length) + predict = self.cast_op(predict, mstype.float32) + loss = self.loss(predict, target_indices, label_values, + self.cast_op(output_length, mstype.int32)) + return self.ReduceMean_false(loss[0]) + + +class TrainGradAccumulationStepsCell(nn.TrainOneStepWithLossScaleCell): + """construct train accu step cell""" + + def __init__(self, network, optimizer, scale_sense, max_accumulation_step=2): + super(TrainGradAccumulationStepsCell, self).__init__( + network, optimizer, scale_sense) + self.max_accumulation_step = max_accumulation_step + self.grad_accumulation = GradientAccumulation( + self.max_accumulation_step, self.optimizer) + + def construct(self, *inputs): + + weights = self.weights + loss = self.network(*inputs) + scaling_sens = self.scale_sense + + status, scaling_sens = self.start_overflow_check(loss, scaling_sens) + + scaling_sens_filled = P.composite.ones_like( + loss) * P.functional.cast(scaling_sens, P.functional.dtype(loss)) + grads = self.grad(self.network, weights)(*inputs, scaling_sens_filled) + grads = self.hyper_map(P.functional.partial( + _grad_scale, scaling_sens), grads) + # apply grad reducer on grads + grads = self.grad_reducer(grads) + cond = self.get_overflow_status(status, grads) + overflow = self.process_loss_scale(cond) + # if there is no overflow, do optimize + if not overflow: + loss = self.grad_accumulation(loss, grads) + return loss + + +class PredictWithSoftmax(nn.Cell): + """ + PredictWithSoftmax + """ + + def __init__(self, network): + super(PredictWithSoftmax, self).__init__(auto_prefix=False) + self.network = network + self.inference_softmax = P.Softmax(axis=-1) + self.transpose_op = P.Transpose() + self.cast_op = P.Cast() 
+ + def construct(self, inputs, input_length): + x, output_sizes = self.network( + inputs, self.cast_op(input_length, mstype.int32)) + x = self.inference_softmax(x) + x = self.transpose_op(x, (1, 0, 2)) + return x, output_sizes + + +def init_weights(net, init_type='xavier', init_gain=1.0): + """ + Initialize network weights. + Parameters: + net (Cell): Network to be initialized + init_type (str): The name of an initialization method: normal | xavier. + init_gain (float): Gain factor for normal and xavier. + """ + for _, cell in net.cells_and_names(): + if isinstance(cell, (nn.Conv1d, nn.Conv1dTranspose)): + if init_type == 'normal': + cell.weight.set_data(ini.initializer( + ini.Normal(init_gain), cell.weight.shape)) + elif init_type == 'xavier': + cell.weight.set_data(ini.initializer( + ini.XavierUniform(init_gain), cell.weight.shape)) + elif init_type == 'constant': + cell.weight.set_data( + ini.initializer(0.001, cell.weight.shape)) + else: + raise NotImplementedError( + 'initialization method [%s] is not implemented' % init_type) + elif isinstance(cell, (nn.BatchNorm1d, nn.BatchNorm2d)): + cell.gamma.set_data(ini.initializer('ones', cell.gamma.shape)) + cell.beta.set_data(ini.initializer('zeros', cell.beta.shape)) + cell.moving_mean.set_data(ini.initializer( + 'zeros', cell.moving_mean.shape)) + cell.moving_variance.set_data( + ini.initializer('ones', cell.moving_variance.shape)) diff --git a/research/audio/jasper/src/model_test.py b/research/audio/jasper/src/model_test.py new file mode 100644 index 0000000000000000000000000000000000000000..2ee3c95b02b0770e39b8db77baf39988c4685bbc --- /dev/null +++ b/research/audio/jasper/src/model_test.py @@ -0,0 +1,338 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
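Putting the pieces together mirrors `train.py`: build `Jasper` from the validated config kwargs, then wrap it for inference. A shape-level sketch (the dummy input assumes the 64-filterbank, 1300-frame training layout; not a full inference script):

```python
import numpy as onp
import mindspore as ms
from src.config import encoder_kw, decoder_kw
from src.model import Jasper, PredictWithSoftmax

jasper_net = Jasper(encoder_kw=encoder_kw, decoder_kw=decoder_kw).to_float(ms.float16)
inference_net = PredictWithSoftmax(jasper_net)

feats = ms.Tensor(onp.zeros((1, 64, 1300), onp.float32))  # (batch, n_filt, frames)
feat_lens = ms.Tensor(onp.array([1300], onp.float32))
probs, out_lens = inference_net(feats, feat_lens)         # probs: (batch, time, n_classes)
```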
+# ============================================================================ + + +import mindspore +import mindspore.nn as nn +import mindspore.ops as P +from mindspore.common import initializer as ini +import mindspore.common.dtype as mstype +import mindspore.numpy as np +import numpy as np + +activations = { + "relu": nn.ReLU, + "elu": nn.ELU, +} + +TRAIN_INPUT_PAD_LENGTH = 1300 +TRAIN_LABEL_PAD_LENGTH = 350 +TEST_INPUT_PAD_LENGTH = 3500 + + +def get_same_padding(kernel_size, stride, dilation): + if stride > 1 and dilation > 1: + raise ValueError("Only stride OR dilation may be greater than 1") + return (kernel_size // 2) * dilation + + +class MaskedConv1d(nn.Cell): + """1D convolution with sequence masking + """ + + def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, group=1, + padding=0, pad_mode='pad', weight_init='xavier_uniform', + has_bias=False, masked=True): + super(MaskedConv1d, self).__init__() + + self.conv1 = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, + kernel_size=kernel_size, stride=stride, + padding=padding, pad_mode=pad_mode, weight_init=weight_init, + dilation=dilation, group=group, has_bias=has_bias) + + self.padding = padding + self.dilation = dilation + self.kernel_size = kernel_size + self.stride = stride + self.masked = masked + self.indices_max = mindspore.Tensor(np.arange(TEST_INPUT_PAD_LENGTH)) + self.expand_dims = mindspore.ops.ExpandDims() + self.unsqueeze = mindspore.ops.ExpandDims() + + def get_seq_len(self, lens): + return ((lens + 2 * self.padding - self.dilation + * (self.kernel_size - 1) - 1) // self.stride + 1) + + def construct(self, x, x_lens): + + if self.masked: + x_shape = P.shape(x) + max_length = x_shape[2] + indices = self.indices_max[:max_length] + indices = self.expand_dims(indices, 0) + mask = indices < self.unsqueeze(x_lens, 1) + x = x * self.unsqueeze(mask, 1) + x_lens = self.get_seq_len(x_lens) + x = self.conv1(x) + + return x, x_lens + + +class Mbatchnorm(nn.Cell): + def __init__(self, num_features, eps, momentum): + super(Mbatchnorm, self).__init__() + + self.batchnorm = nn.BatchNorm2d( + num_features=num_features, eps=eps, momentum=momentum) + + def construct(self, x): + shape = P.shape(x) + x = x.reshape((shape[0], shape[1], shape[2], -1)) + x = self.batchnorm(x) + out = x.reshape((shape[0], shape[1], shape[2])) + + return out + + +class JasperBlock(nn.Cell): + __constants__ = ["use_conv_masks"] + + """Jasper Block. 
See https://arxiv.org/pdf/1904.03288.pdf + """ + + def __init__(self, infilters, filters, repeat, kernel_size, stride, + dilation, pad_mode='pad', dropout=0.2, activation=None, + residual=True, residual_panes=None, use_conv_masks=False): + super(JasperBlock, self).__init__() + if residual_panes is None: + residual_panes = [] + padding_val = get_same_padding(kernel_size, stride, dilation) + self.use_conv_masks = use_conv_masks + self.conv = nn.CellList() + for i in range(repeat): + self.conv.extend(self._conv_bn(infilters if i == 0 else filters, + filters, + kernel_size=kernel_size, + stride=stride, + dilation=dilation, + padding=padding_val, + pad_mode=pad_mode)) + + if i < repeat - 1: + self.conv.extend(self._act_dropout(dropout, activation)) + + self.res = nn.CellList() if residual else None + res_panes = residual_panes.copy() + self.dense_residual = residual + if residual: + if residual_panes is None: + res_panes = [infilters] + self.dense_residual = False + for ip in res_panes: + self.res.append(nn.CellList( + self._conv_bn(ip, filters, kernel_size=1))) + self.out = nn.SequentialCell(*self._act_dropout(dropout, activation)) + + def _conv_bn(self, in_channels, out_channels, **kw): + return [MaskedConv1d(in_channels, out_channels, + masked=self.use_conv_masks, **kw), + Mbatchnorm(num_features=out_channels, eps=1e-3, momentum=0.9)] + + def _act_dropout(self, dropout=0.2, activation=None): + return [activation, + nn.Dropout(keep_prob=1-dropout)] + + def construct(self, xs, xs_lens=None): + if not self.use_conv_masks: + xs_lens = 0 + out = xs[-1] + lens = xs_lens + for i, l in enumerate(self.conv): + if i % 4 == 0: + out, lens = l(out, lens) + else: + out = l(out) + # residuals + if self.res is not None: + for i, layer in enumerate(self.res): + res_out = xs[i] + for j, res_layer in enumerate(layer): + if j == 0: + res_out, lens = res_layer(res_out, xs_lens) + else: + res_out = res_layer(res_out) + out += res_out + # output + out = self.out(out) + if self.res is not None and self.dense_residual: + out = xs + [out] + else: + out = [out] + if self.use_conv_masks: + return out, lens + return out, None + + +class JasperEncoder(nn.Cell): + __constants__ = ["use_conv_masks"] + + def __init__(self, in_feats=10, frame_splicing=1, activation="relu", + weight_init='xavier_uniform', use_conv_masks=False, blocks=None): + super(JasperEncoder, self).__init__() + + if blocks is None: + blocks = [] + self.use_conv_masks = use_conv_masks + self.layers = nn.CellList() + in_feats *= frame_splicing + all_residual_panes = [] + for _, blk in enumerate(blocks): + blk['activation'] = activations[activation]() + has_residual_dense = blk.pop('residual_dense', False) + if has_residual_dense: + all_residual_panes += [in_feats] + blk['residual_panes'] = all_residual_panes + else: + blk['residual_panes'] = [] + self.layers.append( + JasperBlock(infilters=in_feats, use_conv_masks=use_conv_masks, **blk)) + in_feats = blk['filters'] + + def construct(self, x, x_lens=None): + out, out_lens = [x], x_lens + for l in self.layers: + out, out_lens = l(out, out_lens) + return out, out_lens + + +class JasperDecoderForCTC(nn.Cell): + def __init__(self, in_feats=10, n_classes=3, init='xavier_uniform'): + super(JasperDecoderForCTC, self).__init__() + + self.layers = nn.Conv1d(in_channels=in_feats, out_channels=n_classes, + kernel_size=1, has_bias=True, weight_init='xavier_uniform') + + self.transpose = P.Transpose() + self.logsoftmax = nn.LogSoftmax() + + def construct(self, enc_out): + out = self.layers(enc_out[-1]) + out = 
self.transpose(out, (0, 2, 1)) + out_2d = mindspore.ops.reshape(out, (-1, out.shape[2])) + out_2d = self.logsoftmax(out_2d) + out = self.transpose(mindspore.ops.reshape( + out_2d, out.shape), (1, 0, 2)) + return out + + +class GreedyCTCDecoder(nn.Cell): + + def __init__(self): + super().__init__() + self.cast = mindspore.ops.Cast() + self.fill = mindspore.ops.Fill() + self.select = mindspore.ops.Select() + self.argmax = np.argmax() + + def construct(self, log_probs, log_prob_lens=None): + + if log_prob_lens is not None: + max_len = log_probs.size(1) + idxs = np.arange(max_len, dtype=log_prob_lens.dtype) + idxs = mindspore.Tensor(idxs) + mask = np.expand_dims(idxs, 0) >= np.expand_dims(log_prob_lens, 1) + mask = self.cast(mask, mstype.bool_) + masked_value = self.fill(mindspore.float16, log_probs.shape, 0) + log_probs = self.select(mask, masked_value, log_probs) + out = self.argmax(log_probs, axis=-1) + return out.astype("int") + + +class Jasper(nn.Cell): + def __init__(self, encoder_kw=None, decoder_kw=None): + super(Jasper, self).__init__() + if encoder_kw is None: + encoder_kw = {} + if decoder_kw is None: + decoder_kw = {} + self.encoder = JasperEncoder(**encoder_kw) + self.decoder = JasperDecoderForCTC(**decoder_kw) + + def construct(self, x, x_lens=None): + enc, enc_lens = self.encoder(x, x_lens) + out = self.decoder(enc) + return out, enc_lens + + +class NetWithLossClass(nn.Cell): + """ + NetWithLossClass definition + """ + + def __init__(self, network): + super(NetWithLossClass, self).__init__(auto_prefix=False) + self.loss = P.CTCLoss(ctc_merge_repeated=True) + self.network = network + self.ReduceMean_false = P.ReduceMean(keep_dims=False) + self.squeeze_op = P.Squeeze(0) + self.cast_op = P.Cast() + + def construct(self, inputs, input_length, target_indices, label_values): + predict, output_length = self.network(inputs, input_length) + predict = self.cast_op(predict, mstype.float32) + loss = self.loss(predict, target_indices, label_values, + self.cast_op(output_length, mstype.int32)) + return self.ReduceMean_false(loss[0]) + + +class PredictWithSoftmax(nn.Cell): + """ + PredictWithSoftmax + """ + + def __init__(self, network): + super(PredictWithSoftmax, self).__init__(auto_prefix=False) + self.network = network + self.inference_softmax = P.Softmax(axis=-1) + self.transpose_op = P.Transpose() + self.cast_op = P.Cast() + + def construct(self, inputs, input_length): + x, output_sizes = self.network( + inputs, self.cast_op(input_length, mstype.int32)) + x = self.inference_softmax(x) + x = self.transpose_op(x, (1, 0, 2)) + return x, output_sizes + + +def init_weights(net, init_type='xavier', init_gain=1.0): + """ + Initialize network weights. + Parameters: + net (Cell): Network to be initialized + init_type (str): The name of an initialization method: normal | xavier. + init_gain (float): Gain factor for normal and xavier. 
+ """ + for _, cell in net.cells_and_names(): + if isinstance(cell, (nn.Conv1d, nn.Conv1dTranspose)): + if init_type == 'normal': + cell.weight.set_data(ini.initializer( + ini.Normal(init_gain), cell.weight.shape)) + elif init_type == 'xavier': + cell.weight.set_data(ini.initializer( + ini.XavierUniform(init_gain), cell.weight.shape)) + elif init_type == 'constant': + cell.weight.set_data( + ini.initializer(0.001, cell.weight.shape)) + else: + raise NotImplementedError( + 'initialization method [%s] is not implemented' % init_type) + elif isinstance(cell, (nn.BatchNorm1d, nn.BatchNorm2d)): + cell.gamma.set_data(ini.initializer('ones', cell.gamma.shape)) + cell.beta.set_data(ini.initializer('zeros', cell.beta.shape)) + cell.moving_mean.set_data(ini.initializer( + 'zeros', cell.moving_mean.shape)) + cell.moving_variance.set_data( + ini.initializer('ones', cell.moving_variance.shape)) diff --git a/research/audio/jasper/src/number.py b/research/audio/jasper/src/number.py new file mode 100644 index 0000000000000000000000000000000000000000..c61492701698ea158e9503161fef4f1ee4a9b342 --- /dev/null +++ b/research/audio/jasper/src/number.py @@ -0,0 +1,94 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +import re +import inflect + +_inflect = inflect.engine() +_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') +_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') +_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') +_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') +_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') +_number_re = re.compile(r'[0-9]+') +_time_re = re.compile(r'([0-9]{1,2}):([0-9]{2})') + + +def _remove_commas(m): + return m.group(1).replace(',', '') + + +def _expand_decimal_point(m): + return m.group(1).replace('.', ' point ') + + +def _expand_dollars(m): + match = m.group(1) + parts = match.split('.') + if len(parts) > 2: + return match + ' dollars' # Unexpected format + dollars = int(parts[0]) if parts[0] else 0 + cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 + if dollars and cents: + dollar_unit = 'dollar' if dollars == 1 else 'dollars' + cent_unit = 'cent' if cents == 1 else 'cents' + return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) + if dollars: + dollar_unit = 'dollar' if dollars == 1 else 'dollars' + return '%s %s' % (dollars, dollar_unit) + if cents: + cent_unit = 'cent' if cents == 1 else 'cents' + return '%s %s' % (cents, cent_unit) + return 'zero dollars' + + +def _expand_ordinal(m): + return _inflect.number_to_words(m.group(0)) + + +def _expand_number(m): + if int(m.group(0)[0]) == 0: + return _inflect.number_to_words(m.group(0), andword='', group=1) + num = int(m.group(0)) + if 1000 < num < 3000: + if num == 2000: + return 'two thousand' + if 2000 < num < 2010: + return 'two thousand ' + _inflect.number_to_words(num % 100) + if num % 100 == 0: + return _inflect.number_to_words(num // 100) + ' hundred' + return 
diff --git a/research/audio/jasper/src/number.py b/research/audio/jasper/src/number.py
new file mode 100644
index 0000000000000000000000000000000000000000..c61492701698ea158e9503161fef4f1ee4a9b342
--- /dev/null
+++ b/research/audio/jasper/src/number.py
@@ -0,0 +1,94 @@
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+import re
+import inflect
+
+_inflect = inflect.engine()
+_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
+_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
+_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
+_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
+_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
+_number_re = re.compile(r'[0-9]+')
+_time_re = re.compile(r'([0-9]{1,2}):([0-9]{2})')
+
+
+def _remove_commas(m):
+    return m.group(1).replace(',', '')
+
+
+def _expand_decimal_point(m):
+    return m.group(1).replace('.', ' point ')
+
+
+def _expand_dollars(m):
+    match = m.group(1)
+    parts = match.split('.')
+    if len(parts) > 2:
+        return match + ' dollars'  # Unexpected format
+    dollars = int(parts[0]) if parts[0] else 0
+    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
+    if dollars and cents:
+        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
+        cent_unit = 'cent' if cents == 1 else 'cents'
+        return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
+    if dollars:
+        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
+        return '%s %s' % (dollars, dollar_unit)
+    if cents:
+        cent_unit = 'cent' if cents == 1 else 'cents'
+        return '%s %s' % (cents, cent_unit)
+    return 'zero dollars'
+
+
+def _expand_ordinal(m):
+    return _inflect.number_to_words(m.group(0))
+
+
+def _expand_number(m):
+    # numbers with a leading zero are read digit by digit
+    if int(m.group(0)[0]) == 0:
+        return _inflect.number_to_words(m.group(0), andword='', group=1)
+    num = int(m.group(0))
+    # year-like numbers between 1000 and 3000 are read in pairs, e.g. "nineteen oh four"
+    if 1000 < num < 3000:
+        if num == 2000:
+            return 'two thousand'
+        if 2000 < num < 2010:
+            return 'two thousand ' + _inflect.number_to_words(num % 100)
+        if num % 100 == 0:
+            return _inflect.number_to_words(num // 100) + ' hundred'
+        return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
+    # phone-number-like and other large numbers are read in digit groups
+    if num > 1000000000 and num % 10000 != 0:
+        return _inflect.number_to_words(num, andword='', group=1)
+    return _inflect.number_to_words(num, andword='')
+
+
+def _expand_time(m):
+    mins = int(m.group(2))
+    if mins == 0:
+        return _inflect.number_to_words(m.group(1))
+    return " ".join([_inflect.number_to_words(m.group(1)), _inflect.number_to_words(m.group(2))])
+
+
+def normalize_numbers(text):
+    text = re.sub(_comma_number_re, _remove_commas, text)
+    text = re.sub(_pounds_re, r'\1 pounds', text)
+    text = re.sub(_dollars_re, _expand_dollars, text)
+    text = re.sub(_decimal_number_re, _expand_decimal_point, text)
+    text = re.sub(_ordinal_re, _expand_ordinal, text)
+    # times must be expanded before bare numbers, otherwise _number_re
+    # consumes the digits first and _time_re can never match
+    text = re.sub(_time_re, _expand_time, text)
+    text = re.sub(_number_re, _expand_number, text)
+    return text
diff --git a/research/audio/jasper/src/text.py b/research/audio/jasper/src/text.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9123d5b205a1e607280e70c61cd40f36b3f8586
--- /dev/null
+++ b/research/audio/jasper/src/text.py
@@ -0,0 +1,45 @@
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+import string
+import src.cleaners as cleaners
+
+
+def _clean_text(text, cleaner_names, *args):
+    for name in cleaner_names:
+        # use a None default so an unknown cleaner raises our error, not AttributeError
+        cleaner = getattr(cleaners, name, None)
+        if not cleaner:
+            raise Exception('Unknown cleaner: %s' % name)
+        text = cleaner(text, *args)
+    return text
+
+
+def punctuation_map(labels):
+    # Punctuation to remove
+    punctuation = string.punctuation
+    punctuation = punctuation.replace("+", "")
+    punctuation = punctuation.replace("&", "")
+    # TODO We might also want to consider:
+    # @ -> at
+    # # -> number, pound, hashtag
+    # ~ -> tilde
+    # _ -> underscore
+    # % -> percent
+    # If a punctuation symbol is inside our vocab, we do not remove it from the text
+    for l in labels:
+        punctuation = punctuation.replace(l, "")
+    # Turn all remaining punctuation into whitespace
+    table = str.maketrans(punctuation, " " * len(punctuation))
+    return table
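+
+# Minimal usage sketch for punctuation_map (illustration only; the label list
+# normally comes from labels.json):
+#     table = punctuation_map(["'", 'a', 'b', 'c'])
+#     'hello, world!'.translate(table)   # -> 'hello  world '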
diff --git a/research/audio/jasper/train.py b/research/audio/jasper/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fd8daa509f62678adc99b95be91882dfbfe715d
--- /dev/null
+++ b/research/audio/jasper/train.py
@@ -0,0 +1,117 @@
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""Train Jasper on LibriSpeech."""
+import argparse
+import json
+import os
+
+from mindspore import context, Tensor, ParameterTuple
+from mindspore.communication.management import init, get_rank, get_group_size
+from mindspore.context import ParallelMode
+from mindspore.nn.optim import AdamWeightDecay
+from mindspore.train import Model
+from mindspore.train.callback import ModelCheckpoint, CheckpointConfig
+from mindspore.train.serialization import load_checkpoint, load_param_into_net
+from mindspore.train.loss_scale_manager import FixedLossScaleManager
+import mindspore as ms
+
+from src.callback import TimeMonitor, Monitor
+from src.config import train_config, symbols, encoder_kw, decoder_kw
+from src.dataset import create_train_dataset
+from src.model import Jasper, NetWithLossClass, init_weights
+from src.eval_callback import SaveCallback
+from src.lr_generator import get_lr
+
+parser = argparse.ArgumentParser(description='Jasper training')
+parser.add_argument('--pre_trained_model_path', type=str,
+                    default='', help='Pretrained checkpoint path')
+parser.add_argument('--is_distributed', action="store_true",
+                    default=False, help='Distributed training')
+parser.add_argument('--device_target', type=str, default="GPU", choices=("GPU", "CPU"),
+                    help='Device target, support GPU and CPU, Default: GPU')
+args = parser.parse_args()
+
+ms.set_seed(1)
+
+if __name__ == '__main__':
+
+    rank_id = 0
+    group_size = 1
+    config = train_config
+    data_sink = False
+    context.set_context(mode=context.GRAPH_MODE,
+                        device_target=args.device_target, save_graphs=False)
+    if args.device_target == "GPU":
+        context.set_context(enable_graph_kernel=False)
+    if args.is_distributed:
+        init()
+        rank_id = get_rank()
+        group_size = get_group_size()
+        context.reset_auto_parallel_context()
+        context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL,
+                                          gradients_mean=True)
+
+    # labels.json holds the CTC vocabulary; the dataset pipeline uses the
+    # symbols list from src/config.py
+    with open(config.DataConfig.labels_path) as label_file:
+        labels = json.load(label_file)
+    bs = config.DataConfig.batch_size
+    ds_train = create_train_dataset(mindrecord_files=config.DataConfig.mindrecord_files,
+                                    labels=symbols, batch_size=bs, train_mode=True,
+                                    rank=rank_id, group_size=group_size)
+    steps_size = ds_train.get_dataset_size()
+
+    lr = get_lr(lr_init=config.OptimConfig.learning_rate, total_epochs=config.TrainingConfig.epochs,
+                steps_per_epoch=steps_size)
+    lr = Tensor(lr)
+
+    # run the network in float16; the loss itself is computed in float32
+    # (see NetWithLossClass in src/model.py)
+    jasper_net = Jasper(encoder_kw=encoder_kw,
+                        decoder_kw=decoder_kw).to_float(ms.float16)
+
+    loss_net = NetWithLossClass(jasper_net)
+    init_weights(loss_net)
+    weights = ParameterTuple(jasper_net.trainable_params())
+    optimizer = AdamWeightDecay(weights, learning_rate=lr, eps=config.OptimConfig.epsilon, weight_decay=1e-3)
+    loss_net.set_train(True)
+    if args.pre_trained_model_path != '':
+        param_dict = load_checkpoint(args.pre_trained_model_path)
+        load_param_into_net(loss_net, param_dict)
+        print('Successfully loaded the pre-trained model')
+
+    loss_scale_manager = FixedLossScaleManager(128.0, drop_overflow_update=True)
+    model = Model(loss_net, optimizer=optimizer, loss_scale_manager=loss_scale_manager)
+
+    callback_list = [TimeMonitor(steps_size), Monitor(lr)]
+
+    if args.is_distributed:
+        print('Distributed training.')
+        # each rank checkpoints into its own sub-directory; only rank 0 runs SaveCallback
+        config.CheckpointConfig.ckpt_path = os.path.join(config.CheckpointConfig.ckpt_path,
+                                                         'ckpt_' + str(get_rank()) + '/')
+        if rank_id == 0:
+            callback_update = SaveCallback(config.CheckpointConfig.ckpt_path)
+            callback_list += [callback_update]
+    else:
+        print('Standalone training.')
+        config_ck = CheckpointConfig(save_checkpoint_steps=1000,
+                                     keep_checkpoint_max=config.CheckpointConfig.keep_checkpoint_max)
+        ckpt_cb = ModelCheckpoint(prefix=config.CheckpointConfig.ckpt_file_name_prefix,
+                                  directory=config.CheckpointConfig.ckpt_path, config=config_ck)
+        callback_list.append(ckpt_cb)
+    model.train(config.TrainingConfig.epochs, ds_train,
+                callbacks=callback_list, dataset_sink_mode=data_sink)
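+
+# Example invocations (illustration only; the checkpoint name is a placeholder
+# and the distributed launch assumes an MPI environment):
+#     python train.py --device_target GPU
+#     python train.py --device_target GPU --pre_trained_model_path jasper.ckpt
+#     mpirun -n 8 python train.py --is_distributed --device_target GPU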
diff --git a/research/audio/jasper/utils/__init__.py b/research/audio/jasper/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..602527cd720c8d268599dbaef190ba1cf1eb6f2b
--- /dev/null
+++ b/research/audio/jasper/utils/__init__.py
@@ -0,0 +1,14 @@
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
diff --git a/research/audio/jasper/utils/convert_librispeech.py b/research/audio/jasper/utils/convert_librispeech.py
new file mode 100644
index 0000000000000000000000000000000000000000..39a16bdb0d3b456759bdd91b0804313cabb8152c
--- /dev/null
+++ b/research/audio/jasper/utils/convert_librispeech.py
@@ -0,0 +1,83 @@
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+
+#!/usr/bin/env python
+import argparse
+import os
+import glob
+import multiprocessing
+import json
+
+import pandas as pd
+
+from preprocessing_utils import parallel_preprocess
+
+parser = argparse.ArgumentParser(description='Preprocess LibriSpeech.')
+parser.add_argument('--input_dir', type=str, required=True,
+                    help='LibriSpeech collection input dir')
+parser.add_argument('--dest_dir', type=str, required=True,
+                    help='Output dir')
+parser.add_argument('--output_json', type=str, default='./',
+                    help='Name of the output JSON file.')
+parser.add_argument('-s', '--speed', type=float, nargs='*',
+                    help='Speed perturbation ratio')
+parser.add_argument('--target_sr', type=int, default=None,
+                    help='Target sample rate. '
+                         'Defaults to the input sample rate')
+parser.add_argument('--overwrite', action='store_true',
+                    help='Overwrite file if exists')
+parser.add_argument('--parallel', type=int, default=multiprocessing.cpu_count(),
+                    help='Number of threads to use when processing audio files')
+args = parser.parse_args()
+
+args.input_dir = args.input_dir.rstrip('/')
+args.dest_dir = args.dest_dir.rstrip('/')
+
+
+def build_input_arr(input_dir):
+    # every *.trans.txt lists "<utterance-id> <transcript>" pairs for the
+    # .flac files that sit next to it
+    txt_files = glob.glob(os.path.join(input_dir, '**', '*.trans.txt'),
+                          recursive=True)
+    input_data = []
+    for txt_file in txt_files:
+        rel_path = os.path.relpath(txt_file, input_dir)
+        with open(txt_file) as fp:
+            for line in fp:
+                fname, _, transcript = line.partition(' ')
+                input_data.append(dict(input_relpath=os.path.dirname(rel_path),
+                                       input_fname=fname+'.flac',
+                                       transcript=transcript))
+    return input_data
+
+
+print("[%s] Scanning input dir..." % args.output_json)
+dataset = build_input_arr(input_dir=args.input_dir)
+
+print("[%s] Converting audio files..." % args.output_json)
+dataset = parallel_preprocess(dataset=dataset,
+                              input_dir=args.input_dir,
+                              dest_dir=args.dest_dir,
+                              target_sr=args.target_sr,
+                              speed=args.speed,
+                              overwrite=args.overwrite,
+                              parallel=args.parallel)
+
+print("[%s] Generating json..." % args.output_json)
+df = pd.DataFrame(dataset, dtype=object)
+
+# Save json with python. df.to_json() produces backslashes in file paths
+dataset = df.to_dict(orient='records')
+with open(args.output_json, 'w') as fp1:
+    json.dump(dataset, fp1, indent=2)
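+
+# Example invocation (illustration only; paths are placeholders):
+#     python utils/convert_librispeech.py \
+#         --input_dir /data/LibriSpeech/train-clean-100 \
+#         --dest_dir /data/wav/train-clean-100 \
+#         --output_json /data/librispeech-train-clean-100.json \
+#         --speed 0.9 1.1 --target_sr 16000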
diff --git a/research/audio/jasper/utils/download_librispeech.py b/research/audio/jasper/utils/download_librispeech.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f28a244a7153e4549da8032754eb96e47f71afc
--- /dev/null
+++ b/research/audio/jasper/utils/download_librispeech.py
@@ -0,0 +1,74 @@
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+
+#!/usr/bin/env python
+
+import os
+import argparse
+import pandas as pd
+
+from download_utils import download_file, md5_checksum, extract
+
+parser = argparse.ArgumentParser(description='Download, verify and extract dataset files')
+parser.add_argument('csv', type=str,
+                    help='CSV file with urls and checksums to download.')
+parser.add_argument('dest', type=str,
+                    help='Download destination folder.')
+parser.add_argument('-e', type=str, default=None,
+                    help='Extraction destination folder. '
+                         'Defaults to download folder if not provided')
+parser.add_argument('--skip_download', action='store_true',
+                    help='Skip downloading the files')
+parser.add_argument('--skip_checksum', action='store_true',
+                    help='Skip checksum')
+parser.add_argument('--skip_extract', action='store_true',
+                    help='Skip extracting files')
+args = parser.parse_args()
+args.e = args.e or args.dest
+
+
+df = pd.read_csv(args.csv, delimiter=',')
+
+
+if not args.skip_download:
+    for url in df.url:
+        fname = url.split('/')[-1]
+        print("Downloading %s:" % fname)
+        download_file(url=url, dest_folder=args.dest, fname=fname)
+else:
+    print("Skipping file download")
+
+
+if not args.skip_checksum:
+    for _, row in df.iterrows():
+        url = row['url']
+        md5 = row['md5']
+        fname = url.split('/')[-1]
+        fpath = os.path.join(args.dest, fname)
+        print("Verifying %s: " % fname, end='')
+        ret = md5_checksum(fpath=fpath, target_hash=md5)
+        print("Passed" if ret else "Failed")
+else:
+    print("Skipping checksum")
+
+
+if not args.skip_extract:
+    for url in df.url:
+        fname = url.split('/')[-1]
+        fpath = os.path.join(args.dest, fname)
+        print("Decompressing %s:" % fpath)
+        extract(fpath=fpath, dest_folder=args.e)
+else:
+    print("Skipping file extraction")
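+
+# Example invocation (illustration only; paths are placeholders):
+#     python utils/download_librispeech.py utils/librispeech.csv \
+#         /data/librispeech_raw -e /data/librispeech_extracted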
diff --git a/research/audio/jasper/utils/download_utils.py b/research/audio/jasper/utils/download_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..124654c2da3fba9f6666bed32a49023334662e04
--- /dev/null
+++ b/research/audio/jasper/utils/download_utils.py
@@ -0,0 +1,74 @@
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+
+#!/usr/bin/env python
+
+import os
+import tarfile
+import hashlib
+import requests
+import tqdm
+
+
+def download_file(url, dest_folder, fname, overwrite=False):
+    fpath = os.path.join(dest_folder, fname)
+    if os.path.isfile(fpath):
+        if overwrite:
+            print("Overwriting existing file")
+        else:
+            print("File exists, skipping download.")
+            return
+
+    # download to a .tmp file and rename on success, so an interrupted
+    # download is never mistaken for a complete one
+    tmp_fpath = fpath + '.tmp'
+
+    if not os.path.exists(os.path.dirname(tmp_fpath)):
+        os.makedirs(os.path.dirname(tmp_fpath))
+
+    r = requests.get(url, stream=True)
+    file_size = int(r.headers['Content-Length'])
+    chunk_size = 1024 * 1024  # 1MB
+    total_chunks = int(file_size / chunk_size)
+
+    with open(tmp_fpath, 'wb') as fp:
+        content_iterator = r.iter_content(chunk_size=chunk_size)
+        chunks = tqdm.tqdm(content_iterator, total=total_chunks,
+                           unit='MB', desc=fpath, leave=True)
+        for chunk in chunks:
+            fp.write(chunk)
+
+    os.rename(tmp_fpath, fpath)
+
+
+def md5_checksum(fpath, target_hash):
+    # hash in 1MB chunks so large archives do not have to fit in memory
+    file_hash = hashlib.md5()
+    with open(fpath, "rb") as fp:
+        for chunk in iter(lambda: fp.read(1024*1024), b""):
+            file_hash.update(chunk)
+    return file_hash.hexdigest() == target_hash
+
+
+def extract(fpath, dest_folder):
+    if fpath.endswith('.tar.gz'):
+        mode = 'r:gz'
+    elif fpath.endswith('.tar'):
+        mode = 'r:'
+    else:
+        raise IOError('fpath has unknown extension: %s' % fpath)
+
+    with tarfile.open(fpath, mode) as tar:
+        members = tar.getmembers()
+        for member in tqdm.tqdm(iterable=members, total=len(members), leave=True):
+            tar.extract(path=dest_folder, member=member)
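+
+# Minimal usage sketch (illustration only; the archive path is a placeholder,
+# the md5 is the dev-clean checksum from librispeech.csv):
+#     ok = md5_checksum('dev-clean.tar.gz', '42e2234ba48799c1f50f24a7926300a1')
+#     extract('dev-clean.tar.gz', dest_folder='./LibriSpeech')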
diff --git a/research/audio/jasper/utils/inference_librispeech.csv b/research/audio/jasper/utils/inference_librispeech.csv
new file mode 100644
index 0000000000000000000000000000000000000000..40dac4e0e613e9e44bbe5e04522dac77a226e66f
--- /dev/null
+++ b/research/audio/jasper/utils/inference_librispeech.csv
@@ -0,0 +1,5 @@
+url,md5
+http://www.openslr.org/resources/12/dev-clean.tar.gz,42e2234ba48799c1f50f24a7926300a1
+http://www.openslr.org/resources/12/dev-other.tar.gz,c8d0bcc9cca99d4f8b62fcc847357931
+http://www.openslr.org/resources/12/test-clean.tar.gz,32fa31d27d2e1cad72775fee3f4849a9
+http://www.openslr.org/resources/12/test-other.tar.gz,fb5a50374b501bb3bac4815ee91d3135
diff --git a/research/audio/jasper/utils/librispeech.csv b/research/audio/jasper/utils/librispeech.csv
new file mode 100644
index 0000000000000000000000000000000000000000..d48a9f8db72f237153d1268c1ebb068db311181d
--- /dev/null
+++ b/research/audio/jasper/utils/librispeech.csv
@@ -0,0 +1,8 @@
+url,md5
+http://www.openslr.org/resources/12/dev-clean.tar.gz,42e2234ba48799c1f50f24a7926300a1
+http://www.openslr.org/resources/12/dev-other.tar.gz,c8d0bcc9cca99d4f8b62fcc847357931
+http://www.openslr.org/resources/12/test-clean.tar.gz,32fa31d27d2e1cad72775fee3f4849a9
+http://www.openslr.org/resources/12/test-other.tar.gz,fb5a50374b501bb3bac4815ee91d3135
+http://www.openslr.org/resources/12/train-clean-100.tar.gz,2a93770f6d5c6c964bc36631d331a522
+http://www.openslr.org/resources/12/train-clean-360.tar.gz,c0e676e450a7ff2f54aeade5171606fa
+http://www.openslr.org/resources/12/train-other-500.tar.gz,d1a0fd59409feb2c614ce4d30c387708
diff --git a/research/audio/jasper/utils/preprocessing_utils.py b/research/audio/jasper/utils/preprocessing_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ec9a90a4a53e662f340f81d51f5574e3ca51b94
--- /dev/null
+++ b/research/audio/jasper/utils/preprocessing_utils.py
@@ -0,0 +1,78 @@
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+
+#!/usr/bin/env python
+import os
+import multiprocessing
+import functools
+
+import sox
+
+from tqdm import tqdm
+
+
+def preprocess(data, input_dir, dest_dir, target_sr=None, speed=None,
+               overwrite=True):
+    # always keep the unperturbed (speed 1.0) copy, and avoid mutating the
+    # caller's speed list
+    speed = list(set((speed or []) + [1]))
+
+    input_fname = os.path.join(input_dir,
+                               data['input_relpath'],
+                               data['input_fname'])
+    input_sr = sox.file_info.sample_rate(input_fname)
+    target_sr = target_sr or input_sr
+
+    os.makedirs(os.path.join(dest_dir, data['input_relpath']), exist_ok=True)
+
+    output_dict = {}
+    output_dict['transcript'] = data['transcript'].lower().strip()
+    output_dict['files'] = []
+
+    fname = os.path.splitext(data['input_fname'])[0]
+    for s in speed:
+        output_fname = fname + \
+            '{}.wav'.format('' if s == 1 else '-{}'.format(s))
+        output_fpath = os.path.join(dest_dir,
+                                    data['input_relpath'],
+                                    output_fname)
+
+        if not os.path.exists(output_fpath) or overwrite:
+            # resample and speed-perturb the flac input into a wav output
+            cbn = sox.Transformer().speed(factor=s).convert(target_sr)
+            cbn.build(input_fname, output_fpath)
+
+        file_info = sox.file_info.info(output_fpath)
+        file_info['fname'] = os.path.join(os.path.basename(dest_dir),
+                                          data['input_relpath'],
+                                          output_fname)
+        file_info['speed'] = s
+        output_dict['files'].append(file_info)
+
+        if s == 1:
+            output_dict['original_duration'] = file_info['duration']
+            output_dict['original_num_samples'] = file_info['num_samples']
+
+    return output_dict
+
+
+def parallel_preprocess(dataset, input_dir, dest_dir, target_sr, speed, overwrite, parallel):
+    with multiprocessing.Pool(parallel) as p:
+        func = functools.partial(preprocess,
+                                 input_dir=input_dir, dest_dir=dest_dir,
+                                 target_sr=target_sr, speed=speed, overwrite=overwrite)
+        dataset = list(tqdm(p.imap(func, dataset), total=len(dataset)))
+    return dataset
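+
+# Minimal usage sketch for parallel_preprocess (illustration only; the sample
+# entry and all paths are placeholders -- real entries are produced by
+# build_input_arr() in utils/convert_librispeech.py):
+#     dataset = [{'input_relpath': '84/121123',
+#                 'input_fname': '84-121123-0000.flac',
+#                 'transcript': 'SOME TRANSCRIPT TEXT'}]
+#     out = parallel_preprocess(dataset, input_dir='/data/LibriSpeech',
+#                               dest_dir='/data/wav', target_sr=16000,
+#                               speed=None, overwrite=True, parallel=4)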