diff --git a/official/recommend/fibinet/README.md b/official/recommend/fibinet/README.md index d9dfcabaf37e2a0e9d1f140a96e9cf4e8b6b9720..5afba7dfb3f1f51d8c12eefea2738b25a92070c5 100644 --- a/official/recommend/fibinet/README.md +++ b/official/recommend/fibinet/README.md @@ -79,10 +79,10 @@ Once the dataset is ready, the model can be trained and evaluated by the command ```bash # Python command -python train.py --data_path=./data/mindrecord --dataset_type=mindrecord --device_target=GPU --eval_while_train=True +python train.py --data_path=./data/mindrecord --device_target=GPU --eval_while_train=True # Shell command -bash ./script/run_train_gpu.sh --data_path=./data/mindrecord --device_target=GPU --eval_while_train=True +bash ./script/run_train_gpu.sh './data/mindrecord/' 1 GPU True ``` To evaluate the model, command as follows: @@ -92,7 +92,7 @@ To evaluate the model, command as follows: python eval.py --data_path=./data/mindrecord --dataset_type=mindrecord --device_target=GPU # Shell command -bash ./script/run_eval_gpu.sh --data_path=./data/mindrecord --device_target=GPU +bash ./script/run_eval_gpu.sh './data/mindrecord/' 1 GPU ``` # [Script Description](#contents) @@ -105,15 +105,13 @@ bash ./script/run_eval_gpu.sh --data_path=./data/mindrecord --device_target=GPU ├── requirements.txt # python environment ├── script │ ├── common.sh - │ ├── run_train_and eval_gpu.sh # Shell script of GPU singledevice training and evaluation (quick start) - │ ├── run_train_gpu.sh # Shell script of GPU singledevice training - │ └── run_eval_gpu.sh # Shell script of GPU singledevice evaluation + │ ├── run_train_gpu.sh # Shell script of GPU single-device training + │ └── run_eval_gpu.sh # Shell script of GPU single-device evaluation ├──src │ ├── callbacks.py │ ├── datasets.py # Create dataset │ ├── generate_synthetic_data.py # Generate synthetic data │ ├── __init__.py - │ ├── model_builder.py │ ├── metrics.py # Script of metrics │ ├── preprocess_data.py # Preprocess on dataset │ ├── 
process_data.py @@ -142,7 +140,7 @@ Arguments: --device_target Device where the code will be implemented, only support GPU currently. (Default:GPU) --data_path Where the preprocessed data is put in - --epochs Total train epochs. (Default:100) + --epochs Total train epochs. (Default:10) --full_batch Enable loading the full batch. (Default:False) --batch_size Training batch size.(Default:1000) --eval_batch_size Eval batch size.(Default:1000) @@ -164,7 +162,7 @@ Arguments: --loss_file_name Loss output file.(Default:loss.log) --dataset_type The data type of the training files, chosen from [tfrecord, mindrecord, hd5].(Default:mindrecord) --vocab_cache_size Enable cache mode.(Default:0) - --eval_while_train Whether to evaluate after each epoch + --eval_while_train Whether to evaluate after training each epoch ``` ### [Preprocess Script Parameters](#contents) @@ -194,7 +192,7 @@ usage: preprocess_data.py --threshold Word frequency below this value will be regarded as OOV. It aims to reduce the vocab size.(default: 100) --train_line_count The number of examples in your dataset. --skip_id_convert 0 or 1. If set 1, the code will skip the id convert, regarding the original id as the final id.(default: 0) - --eval_size The percent of eval samples in the whole dataset. + --eval_size The percent of eval samples in the whole dataset.(default: 0.1) --line_per_sample The number of sample per line, must be divisible by batch_size. 
``` @@ -235,30 +233,16 @@ python src/generate_synthetic_data.py --output_file=syn_data/origin_data/train.t python src/preprocess_data.py --data_path=./syn_data/ --dense_dim=13 --slot_dim=51 --threshold=0 --train_line_count=40000000 --skip_id_convert=1 ``` -## [Training Process](#contents) +## [Train Process](#contents) -### [SingleDevice](#contents) - -To train and evaluate the model, command as follows: - -```bash -python train.py --data_path=./data/mindrecord --dataset_type=mindrecord --device_target=GPU - -# Or - -bash ./script/run_train_gpu.sh --data_path=./data/mindrecord --device_target=GPU -``` - -### [SingleDevice For Cache Mode](#contents) - -To train and evaluate the model, command as follows: +To train the model, command as follows: ```bash python train.py --data_path=./data/mindrecord --dataset_type=mindrecord --device_target=GPU # Or -bash ./script/run_train_gpu.sh --data_path=./data/mindrecord --device_target=GPU +bash ./script/run_train_gpu.sh './data/mindrecord/' 1 GPU False ``` ## [Evaluation Process](#contents) @@ -266,11 +250,11 @@ bash ./script/run_train_gpu.sh --data_path=./data/mindrecord --device_target=GPU To evaluate the model, command as follows: ```bash -python eval.py --data_path=./data/mindrecord --dataset_type=mindrecord --device_target=GPU --ckpt_path=./ckpt/fibinet_train-15_2582.ckpt +python eval.py --data_path=./data/mindrecord --dataset_type=mindrecord --device_target=GPU --ckpt_path=./ckpt/fibinet_train-10_41265.ckpt # Or -bash ./script/run_eval_gpu.sh --data_path=./data/mindrecord --device_target=GPU +bash ./script/run_eval_gpu.sh './data/mindrecord/' 1 GPU ``` ## Inference Process @@ -289,7 +273,7 @@ The ckpt_file parameter is required, Inference result is saved in current path, you can find result like this in acc.log file. 
```markdown -auc : 0.7607724041538813 +auc : 0.7814143582416716 ``` # [Model Description](#contents) @@ -302,17 +286,17 @@ auc : 0.7607724041538813 | ----------------- | --------------------------- | | Resource |A100-SXM4-40GB | | Uploaded Date | 07/29/2022 | -| MindSpore Version | 1.7.1 | +| MindSpore Version | 1.9 | | Dataset | [1] | | Batch Size | 1000 | -| Epoch | 100 | +| Epoch | 10 | | Learning rate | 0.0001 | | Optimizer | FTRL,Adam | | Loss Function | Sigmoid cross entropy | -| Loss | 0.4269 | -| Speed | 12.837 ms/step | +| Loss | 0.4702615 | +| Speed | 15.588 ms/step | | Outputs | AUC | -| Accuracy | AUC=0.76077 | +| Accuracy | AUC= 0.7814143582416716 | # [Description of Random Situation](#contents) diff --git a/official/recommend/fibinet/README_CN.md b/official/recommend/fibinet/README_CN.md index 6ece99f41cc4a255b6e044080e6267af6d528c22..f868c40a1b98e3ad0699794bad0404b32c413694 100644 --- a/official/recommend/fibinet/README_CN.md +++ b/official/recommend/fibinet/README_CN.md @@ -81,10 +81,10 @@ python src/preprocess_data.py --data_path=./data/ --dense_dim=13 --slot_dim=26 ```bash # 执行Python脚本 -python train.py --data_path=./data/mindrecord --dataset_type=mindrecord --device_target=GPU --eval_while_train=True +python train.py --data_path=./data/mindrecord --device_target=GPU --eval_while_train=True # 执行Shell脚本 -bash ./script/run_train_gpu.sh --data_path=./data/mindrecord --device_target=GPU --eval_while_train=True +bash ./script/run_train_gpu.sh './data/mindrecord/' 1 GPU True ``` 按如下操作单独评估模型: @@ -94,7 +94,7 @@ bash ./script/run_train_gpu.sh --data_path=./data/mindrecord --device_target=GPU python eval.py --data_path=./data/mindrecord --dataset_type=mindrecord --device_target=GPU # 执行Shell脚本 -bash ./script/run_eval_gpu.sh --data_path=./data/mindrecord --device_target=GPU +bash ./script/run_eval_gpu.sh './data/mindrecord/' 1 GPU ``` ## 脚本说明 @@ -107,7 +107,6 @@ bash ./script/run_eval_gpu.sh --data_path=./data/mindrecord --device_target=GPU ├── 
requirements.txt # python环境 ├── script │ ├── common.sh - │ ├── run_train_and eval_gpu.sh # GPU处理器单卡训练与评估shell脚本 (快速开始) │ ├── run_train_gpu.sh # GPU处理器单卡训练shell脚本 │ └── run_eval_gpu.sh # GPU处理器单卡评估shell脚本 ├──src @@ -116,7 +115,6 @@ bash ./script/run_eval_gpu.sh --data_path=./data/mindrecord --device_target=GPU │ ├── generate_synthetic_data.py # 生成虚拟数据 │ ├── __init__.py │ ├── metrics.py # 模型表现评价指标脚本 - │ ├── model_builder.py │ ├── preprocess_data.py # 数据预处理 │ ├── process_data.py │ ├── fibinet.py # FiBiNET主体架构 @@ -142,7 +140,7 @@ Arguments: --device_target Device where the code will be implemented, only support GPU currently. (Default:GPU) --data_path Where the preprocessed data is put in - --epochs Total train epochs. (Default:100) + --epochs Total train epochs. (Default:10) --full_batch Enable loading the full batch. (Default:False) --batch_size Training batch size.(Default:1000) --eval_batch_size Eval batch size.(Default:1000) @@ -164,7 +162,7 @@ Arguments: --loss_file_name Loss output file.(Default:loss.log) --dataset_type The data type of the training files, chosen from [tfrecord, mindrecord, hd5].(Default:mindrecord) --vocab_cache_size Enable cache mode.(Default:0) - --eval_while_train Whether to evaluate after each epoch + --eval_while_train Whether to evaluate after training each epoch ``` ### 预处理脚本参数 @@ -246,7 +244,7 @@ python train.py --data_path=./data/mindrecord --dataset_type=mindrecord --device # Or -bash ./script/run_train_gpu.sh --data_path=./data/mindrecord --device_target=GPU +bash ./script/run_train_gpu.sh './data/mindrecord/' 1 GPU False ``` ## 评估过程 @@ -254,11 +252,11 @@ bash ./script/run_train_gpu.sh --data_path=./data/mindrecord --device_target=GPU 运行如下命令单独评估模型: ```bash -python eval.py --data_path=./data/mindrecord --dataset_type=mindrecord --device_target=GPU --ckpt_path=./ckpt/fibinet_train-100_45830.ckpt +python eval.py --data_path=./data/mindrecord --dataset_type=mindrecord --device_target=GPU --ckpt_path=./ckpt/fibinet_train-10_41265.ckpt # 
Or -bash ./script/run_eval_gpu.sh --data_path=./data/mindrecord --device_target=GPU +bash ./script/run_eval_gpu.sh './data/mindrecord/' 1 GPU ``` ## 推理过程 @@ -269,7 +267,7 @@ bash ./script/run_eval_gpu.sh --data_path=./data/mindrecord --device_target=GPU python export.py --ckpt_file [CKPT_PATH] --file_name [FILE_NAME] --device_target [DEVICE_TARGET] --file_format [FILE_FORMAT] ``` -参数ckpt_file为必填项,默认值:"./ckpt/fibinet_train-100_45830.ckpt"; +参数ckpt_file为必填项,默认值:"./ckpt/fibinet_train-10_41265.ckpt"; `FILE_FORMAT` 必须在 ["AIR", "MINDIR"]中选择,默认值:"MINDIR"。 @@ -278,7 +276,7 @@ python export.py --ckpt_file [CKPT_PATH] --file_name [FILE_NAME] --device_target 推理结果保存在脚本执行的当前路径,在acc.log中可以看到以下精度计算结果。 ```markdown -auc : 0.7607724041538813 +auc : 0.7814143582416716 ``` # 模型描述 @@ -292,18 +290,18 @@ auc : 0.7607724041538813 | 处理器 | GPU | | 资源 | A100-SXM4-40GB | | 上传日期 | 2022-07-29 | -| MindSpore版本 | 1.7.1 | +| MindSpore版本 | 1.9 | | 数据集 | [1](#数据集) | -| 训练参数 | Epoch=100,<br />batch_size=1000,<br />lr=0.0001 | +| 训练参数 | Epoch=10,<br />batch_size=1000,<br />lr=0.0001 | | 优化器 | FTRL,Adam | | 损失函数 | Sigmoid交叉熵 | -| AUC分数 | 0.76077 | -| 速度 | 12.837毫秒/步 | -| 损失 | 0.4269 | +| AUC分数 | 0.7814143582416716 | +| 速度 | 15.588毫秒/步 | +| 损失 | 0.4702615 | | 参数(M) | 30 | -| 推理检查点 | 179.56MB(.ckpt文件) | +| 推理检查点 | 180MB(.ckpt文件) | -所有可执行脚本参见[此处](https://gitee.com/mindspore/models/tree/master/official/recommend/FiBiNet/script)。 +所有可执行脚本参见[此处](https://gitee.com/mindspore/models/tree/master/official/recommend/fibinet/script)。 # 随机情况说明 diff --git a/official/recommend/fibinet/default_config.yaml b/official/recommend/fibinet/default_config.yaml index 7de64eed8df5209d0f7170558630a754b32c36e4..8b4b7ff79c97bb3d34602e9e170988855ceb2eef 100644 --- a/official/recommend/fibinet/default_config.yaml +++ b/official/recommend/fibinet/default_config.yaml @@ -1,4 +1,4 @@ -# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing) +# Builtin Configurations(DO NOT CHANGE THESE 
CONFIGURATIONS unless you know exactly the meaning of each parameter) enable_modelarts: False data_url: "" train_url: "" @@ -12,7 +12,7 @@ enable_profiling: False # ============================================================================== # argparse_init 'fibinet' -epochs: 100 +epochs: 10 full_batch: False batch_size: 1000 eval_batch_size: 1000 @@ -31,7 +31,7 @@ loss_file_name: "loss.log" dataset_type: "mindrecord" field_slice: False sparse: False -eval_while_train: True +eval_while_train: False deep_table_slice_mode: "column_slice" seed: 1024 weight_bias_init: ['normal', 'normal'] @@ -45,7 +45,7 @@ bilinear_type: "all" #bilinear_type: ['all', 'each', 'interaction'] # ============================================================================== # fibinet export device_id: 0 -ckpt_file: "./ckpt/fibinet_train-100_45830.ckpt" +ckpt_file: "./ckpt/fibinet_train-10_41265.ckpt" file_name: "fibinet" file_format: "MINDIR" @@ -67,7 +67,7 @@ threshold: 100 train_line_count: 45840617 skip_id_convert: 0 line_per_sample: 10 -eval_size: 0.2 +eval_size: 0.1 # src/generate_synthetic_data.py 'Generate Synthetic Data' output_file: "./train.txt" diff --git a/official/recommend/fibinet/export.py b/official/recommend/fibinet/export.py index 8308779491d83876143042557490e8e038b005f0..f92c693899b597e62fbfc47293881b113616578a 100644 --- a/official/recommend/fibinet/export.py +++ b/official/recommend/fibinet/export.py @@ -23,7 +23,9 @@ from src.model_utils.config import config from src.model_utils.moxing_adapter import moxing_wrapper -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) +context.set_context(mode=context.GRAPH_MODE, + device_target=config.device_target, + max_call_depth=10000) def modelarts_pre_process(): pass diff --git a/official/recommend/fibinet/script/run_eval_gpu.sh b/official/recommend/fibinet/script/run_eval_gpu.sh index 55b1e280df8bef62fce86482abaf5747c479680c..ebcae2e84bef81a03f4864238098e839107268d1 100644 --- 
a/official/recommend/fibinet/script/run_eval_gpu.sh +++ b/official/recommend/fibinet/script/run_eval_gpu.sh @@ -17,24 +17,29 @@ echo "==============================================================================================================" echo "Please run the script as: " echo "bash run_eval_gpu.sh" -echo " " +echo "______________________________________________________________________________________________________________" echo "If your data_path or device_id or device_target is different from those in default_config.yaml, " echo "please run the script as: " echo "bash run_eval_gpu.sh DATA_PATH DEVICE_ID DEVICE_TARGET" echo "for example: bash ./script/run_eval_gpu.sh './data/mindrecord/' 1 GPU " echo " " echo "**** FYI: only DEVICE_TARGET=GPU is supported currently. ****" -echo " " +echo "______________________________________________________________________________________________________________" echo "Then you can find detailed log and results in files: eval_output.log, eval.log and loss.log. " echo " " echo "If you want to set up more parameters by yourself, " echo "you are suggested to check the file default_config.yaml and change parameter values there. 
" echo "==============================================================================================================" +if [ $# != 3 ] +then + echo "Usage: bash run_eval_gpu.sh [DATA_PATH] [DEVICE_ID] [DEVICE_TARGET] " +exit 1 +fi + DATA_PATH=$1 DEVICE_ID=$2 -DEVICE_TARGET=GPU - +DEVICE_TARGET=$3 export CUDA_VISIBLE_DEVICES=$DEVICE_ID python ./eval.py --data_path=$DATA_PATH --device_target=$DEVICE_TARGET > eval_output.log 2>&1 & diff --git a/official/recommend/fibinet/script/run_train_gpu.sh b/official/recommend/fibinet/script/run_train_gpu.sh index 1e46e3928430e655c06ff46bbf389e22c02aa207..51083a032caadca9a196f6baeee21eab4613ae14 100644 --- a/official/recommend/fibinet/script/run_train_gpu.sh +++ b/official/recommend/fibinet/script/run_train_gpu.sh @@ -17,24 +17,30 @@ echo "==============================================================================================================" echo "Please run the script as: " echo "bash run_train_gpu.sh" -echo " " +echo "______________________________________________________________________________________________________________" echo "If your data_path or device_id or device_target is different from those in default_config.yaml, " echo "please run the script as: " echo "bash run_train_gpu.sh DATA_PATH DEVICE_ID DEVICE_TARGET EVAL_WHILE_TRAIN " echo "for example: bash ./script/run_train_gpu.sh './data/mindrecord/' 1 GPU True" echo " " echo "**** FYI: only DEVICE_TARGET=GPU is supported currently. ****" -echo " " +echo "______________________________________________________________________________________________________________" echo "Then you can find detailed log and results in files: train.log, eval.log and loss.log. " echo " " echo "If you want to set up more parameters by yourself, " echo "you are suggested to check the file default_config.yaml and change parameter values there. 
" echo "==============================================================================================================" +if [ $# != 4 ] +then + echo "Usage: bash run_train_gpu.sh [DATA_PATH] [DEVICE_ID] [DEVICE_TARGET] [EVAL_WHILE_TRAIN] " +exit 1 +fi + DATA_PATH=$1 DEVICE_ID=$2 -DEVICE_TARGET=GPU -EVAL_WHILE_TRAIN=$3 +DEVICE_TARGET=$3 +EVAL_WHILE_TRAIN=$4 export CUDA_VISIBLE_DEVICES=$DEVICE_ID python ./train.py --data_path=$DATA_PATH --device_target=$DEVICE_TARGET --eval_while_train=$EVAL_WHILE_TRAIN > train.log 2>&1 & diff --git a/official/recommend/fibinet/src/fibinet.py b/official/recommend/fibinet/src/fibinet.py index 49930027d7d9b3ebd55f6fe8054b81bd438b15b1..559079aead87a913698fe02ad00049bdf50e9559 100644 --- a/official/recommend/fibinet/src/fibinet.py +++ b/official/recommend/fibinet/src/fibinet.py @@ -343,8 +343,7 @@ class NetWithLossClass(nn.Cell): sparse = config.sparse parallel_mode = context.get_auto_parallel_context("parallel_mode") is_auto_parallel = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL) - if sparse: - self.no_l2loss = True + self.no_l2loss = sparse self.network = network self.l2_coef = config.l2_coef self.loss = ops.SigmoidCrossEntropyWithLogits() diff --git a/official/recommend/fibinet/train.py b/official/recommend/fibinet/train.py index 8d8f43eafb009a8cb9dc533d1172d2424e529838..03c7a377063ca9e511d8f95f15dbcbe28b541888 100644 --- a/official/recommend/fibinet/train.py +++ b/official/recommend/fibinet/train.py @@ -30,6 +30,7 @@ def train_eval_fibinet(config): data_path = config.data_path batch_size = config.batch_size epochs = config.epochs + sparse = config.sparse if config.dataset_type == "tfrecord": dataset_type = DataType.TFRECORD elif config.dataset_type == "mindrecord": @@ -57,7 +58,6 @@ def train_eval_fibinet(config): auc_metric = AUCMetric() model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric}) eval_callback = EvalCallBack(model, ds_eval, auc_metric, config) - sparse = 
config.sparse out = model.eval(ds_eval, dataset_sink_mode=(not sparse)) print("=====" * 5 + "model.eval() initialized: {}".format(out)) model.train(epochs, ds_train,