diff --git a/official/recommend/fibinet/README.md b/official/recommend/fibinet/README.md index d9dfcabaf37e2a0e9d1f140a96e9cf4e8b6b9720..5afba7dfb3f1f51d8c12eefea2738b25a92070c5 100644 --- a/official/recommend/fibinet/README.md +++ b/official/recommend/fibinet/README.md @@ -79,10 +79,10 @@ Once the dataset is ready, the model can be trained and evaluated by the command ```bash # Python command -python train.py --data_path=./data/mindrecord --dataset_type=mindrecord --device_target=GPU --eval_while_train=True +python train.py --data_path=./data/mindrecord --device_target=GPU --eval_while_train=True # Shell command -bash ./script/run_train_gpu.sh --data_path=./data/mindrecord --device_target=GPU --eval_while_train=True +bash ./script/run_train_gpu.sh './data/mindrecord/' 1 GPU True ``` To evaluate the model, command as follows: @@ -92,7 +92,7 @@ To evaluate the model, command as follows: python eval.py --data_path=./data/mindrecord --dataset_type=mindrecord --device_target=GPU # Shell command -bash ./script/run_eval_gpu.sh --data_path=./data/mindrecord --device_target=GPU +bash ./script/run_eval_gpu.sh './data/mindrecord/' 1 GPU ``` # [Script Description](#contents) @@ -105,15 +105,13 @@ bash ./script/run_eval_gpu.sh --data_path=./data/mindrecord --device_target=GPU ├── requirements.txt # python environment ├── script │ ├── common.sh - │ ├── run_train_and eval_gpu.sh # Shell script of GPU singledevice training and evaluation (quick start) - │ ├── run_train_gpu.sh # Shell script of GPU singledevice training - │ └── run_eval_gpu.sh # Shell script of GPU singledevice evaluation + │ ├── run_train_gpu.sh # Shell script of GPU single-device training + │ └── run_eval_gpu.sh # Shell script of GPU single-device evaluation ├──src │ ├── callbacks.py │ ├── datasets.py # Create dataset │ ├── generate_synthetic_data.py # Generate synthetic data │ ├── __init__.py - │ ├── model_builder.py │ ├── metrics.py # Script of metrics │ ├── 
preprocess_data.py # Preprocess on dataset │ ├── process_data.py @@ -142,7 +140,7 @@ Arguments: --device_target Device where the code will be implemented, only support GPU currently. (Default:GPU) --data_path Where the preprocessed data is put in - --epochs Total train epochs. (Default:100) + --epochs Total train epochs. (Default:10) --full_batch Enable loading the full batch. (Default:False) --batch_size Training batch size.(Default:1000) --eval_batch_size Eval batch size.(Default:1000) @@ -164,7 +162,7 @@ Arguments: --loss_file_name Loss output file.(Default:loss.log) --dataset_type The data type of the training files, chosen from [tfrecord, mindrecord, hd5].(Default:mindrecord) --vocab_cache_size Enable cache mode.(Default:0) - --eval_while_train Whether to evaluate after each epoch + --eval_while_train Whether to evaluate after training each epoch ``` ### [Preprocess Script Parameters](#contents) @@ -194,7 +192,7 @@ usage: preprocess_data.py --threshold Word frequency below this value will be regarded as OOV. It aims to reduce the vocab size.(default: 100) --train_line_count The number of examples in your dataset. --skip_id_convert 0 or 1. If set 1, the code will skip the id convert, regarding the original id as the final id.(default: 0) - --eval_size The percent of eval samples in the whole dataset. + --eval_size The fraction of eval samples in the whole dataset.(default: 0.1) --line_per_sample The number of sample per line, must be divisible by batch_size. 
``` @@ -235,30 +233,16 @@ python src/generate_synthetic_data.py --output_file=syn_data/origin_data/train.t python src/preprocess_data.py --data_path=./syn_data/ --dense_dim=13 --slot_dim=51 --threshold=0 --train_line_count=40000000 --skip_id_convert=1 ``` -## [Training Process](#contents) +## [Train Process](#contents) -### [SingleDevice](#contents) - -To train and evaluate the model, command as follows: - -```bash -python train.py --data_path=./data/mindrecord --dataset_type=mindrecord --device_target=GPU - -# Or - -bash ./script/run_train_gpu.sh --data_path=./data/mindrecord --device_target=GPU -``` - -### [SingleDevice For Cache Mode](#contents) - -To train and evaluate the model, command as follows: +To train the model, command as follows: ```bash python train.py --data_path=./data/mindrecord --dataset_type=mindrecord --device_target=GPU # Or -bash ./script/run_train_gpu.sh --data_path=./data/mindrecord --device_target=GPU +bash ./script/run_train_gpu.sh './data/mindrecord/' 1 GPU False ``` ## [Evaluation Process](#contents) @@ -266,11 +250,11 @@ bash ./script/run_train_gpu.sh --data_path=./data/mindrecord --device_target=GPU To evaluate the model, command as follows: ```bash -python eval.py --data_path=./data/mindrecord --dataset_type=mindrecord --device_target=GPU --ckpt_path=./ckpt/fibinet_train-15_2582.ckpt +python eval.py --data_path=./data/mindrecord --dataset_type=mindrecord --device_target=GPU --ckpt_path=./ckpt/fibinet_train-10_41265.ckpt # Or -bash ./script/run_eval_gpu.sh --data_path=./data/mindrecord --device_target=GPU +bash ./script/run_eval_gpu.sh './data/mindrecord/' 1 GPU ``` ## Inference Process @@ -289,7 +273,7 @@ The ckpt_file parameter is required, Inference result is saved in current path, you can find result like this in acc.log file. 
```markdown -auc : 0.7607724041538813 +auc : 0.7814143582416716 ``` # [Model Description](#contents) @@ -302,17 +286,17 @@ auc : 0.7607724041538813 | ----------------- | --------------------------- | | Resource |A100-SXM4-40GB | | Uploaded Date | 07/29/2022 | -| MindSpore Version | 1.7.1 | +| MindSpore Version | 1.9 | | Dataset | [1] | | Batch Size | 1000 | -| Epoch | 100 | +| Epoch | 10 | | Learning rate | 0.0001 | | Optimizer | FTRL,Adam | | Loss Function | Sigmoid cross entropy | -| Loss | 0.4269 | -| Speed | 12.837 ms/step | +| Loss | 0.4702615 | +| Speed | 15.588 ms/step | | Outputs | AUC | -| Accuracy | AUC=0.76077 | +| Accuracy | AUC= 0.7814143582416716 | # [Description of Random Situation](#contents) diff --git a/official/recommend/fibinet/README_CN.md b/official/recommend/fibinet/README_CN.md index 6ece99f41cc4a255b6e044080e6267af6d528c22..f868c40a1b98e3ad0699794bad0404b32c413694 100644 --- a/official/recommend/fibinet/README_CN.md +++ b/official/recommend/fibinet/README_CN.md @@ -81,10 +81,10 @@ python src/preprocess_data.py --data_path=./data/ --dense_dim=13 --slot_dim=26 ```bash # 鎵цPython鑴氭湰 -python train.py --data_path=./data/mindrecord --dataset_type=mindrecord --device_target=GPU --eval_while_train=True +python train.py --data_path=./data/mindrecord --device_target=GPU --eval_while_train=True # 鎵цShell鑴氭湰 -bash ./script/run_train_gpu.sh --data_path=./data/mindrecord --device_target=GPU --eval_while_train=True +bash ./script/run_train_gpu.sh './data/mindrecord/' 1 GPU True ``` 鎸夊涓嬫搷浣滃崟鐙瘎浼版ā鍨嬶細 @@ -94,7 +94,7 @@ bash ./script/run_train_gpu.sh --data_path=./data/mindrecord --device_target=GPU python eval.py --data_path=./data/mindrecord --dataset_type=mindrecord --device_target=GPU # 鎵цShell鑴氭湰 -bash ./script/run_eval_gpu.sh --data_path=./data/mindrecord --device_target=GPU +bash ./script/run_eval_gpu.sh './data/mindrecord/' 1 GPU ``` ## 鑴氭湰璇存槑 @@ -107,7 +107,6 @@ bash ./script/run_eval_gpu.sh --data_path=./data/mindrecord --device_target=GPU 鈹溾攢鈹€ 
requirements.txt # python鐜 鈹溾攢鈹€ script 鈹� 鈹溾攢鈹€ common.sh - 鈹� 鈹溾攢鈹€ run_train_and eval_gpu.sh # GPU澶勭悊鍣ㄥ崟鍗¤缁冧笌璇勪及shell鑴氭湰 锛堝揩閫熷紑濮嬶級 鈹� 鈹溾攢鈹€ run_train_gpu.sh # GPU澶勭悊鍣ㄥ崟鍗¤缁僺hell鑴氭湰 鈹� 鈹斺攢鈹€ run_eval_gpu.sh # GPU澶勭悊鍣ㄥ崟鍗¤瘎浼皊hell鑴氭湰 鈹溾攢鈹€src @@ -116,7 +115,6 @@ bash ./script/run_eval_gpu.sh --data_path=./data/mindrecord --device_target=GPU 鈹� 鈹溾攢鈹€ generate_synthetic_data.py # 鐢熸垚铏氭嫙鏁版嵁 鈹� 鈹溾攢鈹€ __init__.py 鈹� 鈹溾攢鈹€ metrics.py # 妯″瀷琛ㄧ幇璇勪环鎸囨爣鑴氭湰 - 鈹� 鈹溾攢鈹€ model_builder.py 鈹� 鈹溾攢鈹€ preprocess_data.py # 鏁版嵁棰勫鐞� 鈹� 鈹溾攢鈹€ process_data.py 鈹� 鈹溾攢鈹€ fibinet.py # FiBiNET涓讳綋鏋舵瀯 @@ -142,7 +140,7 @@ Arguments: --device_target Device where the code will be implemented, only support GPU currently. (Default:GPU) --data_path Where the preprocessed data is put in - --epochs Total train epochs. (Default:100) + --epochs Total train epochs. (Default:10) --full_batch Enable loading the full batch. (Default:False) --batch_size Training batch size.(Default:1000) --eval_batch_size Eval batch size.(Default:1000) @@ -164,7 +162,7 @@ Arguments: --loss_file_name Loss output file.(Default:loss.log) --dataset_type The data type of the training files, chosen from [tfrecord, mindrecord, hd5].(Default:mindrecord) --vocab_cache_size Enable cache mode.(Default:0) - --eval_while_train Whether to evaluate after each epoch + --eval_while_train Whether to evaluate after training each epoch ``` ### 棰勫鐞嗚剼鏈弬鏁� @@ -246,7 +244,7 @@ python train.py --data_path=./data/mindrecord --dataset_type=mindrecord --device # Or -bash ./script/run_train_gpu.sh --data_path=./data/mindrecord --device_target=GPU +bash ./script/run_train_gpu.sh './data/mindrecord/' 1 GPU False ``` ## 璇勪及杩囩▼ @@ -254,11 +252,11 @@ bash ./script/run_train_gpu.sh --data_path=./data/mindrecord --device_target=GPU 杩愯濡備笅鍛戒护鍗曠嫭璇勪及妯″瀷锛� ```bash -python eval.py --data_path=./data/mindrecord --dataset_type=mindrecord --device_target=GPU --ckpt_path=./ckpt/fibinet_train-100_45830.ckpt +python eval.py --data_path=./data/mindrecord 
--dataset_type=mindrecord --device_target=GPU --ckpt_path=./ckpt/fibinet_train-10_41265.ckpt # Or -bash ./script/run_eval_gpu.sh --data_path=./data/mindrecord --device_target=GPU +bash ./script/run_eval_gpu.sh './data/mindrecord/' 1 GPU ``` ## 鎺ㄧ悊杩囩▼ @@ -269,7 +267,7 @@ bash ./script/run_eval_gpu.sh --data_path=./data/mindrecord --device_target=GPU python export.py --ckpt_file [CKPT_PATH] --file_name [FILE_NAME] --device_target [DEVICE_TARGET] --file_format [FILE_FORMAT] ``` -鍙傛暟ckpt_file涓哄繀濉」锛岄粯璁ゅ€硷細"./ckpt/fibinet_train-100_45830.ckpt"锛� +鍙傛暟ckpt_file涓哄繀濉」锛岄粯璁ゅ€硷細"./ckpt/fibinet_train-10_41265.ckpt"锛� `FILE_FORMAT` 蹇呴』鍦� ["AIR", "MINDIR"]涓€夋嫨锛岄粯璁ゅ€硷細"MINDIR"銆� @@ -278,7 +276,7 @@ python export.py --ckpt_file [CKPT_PATH] --file_name [FILE_NAME] --device_target 鎺ㄧ悊缁撴灉淇濆瓨鍦ㄨ剼鏈墽琛岀殑褰撳墠璺緞锛屽湪acc.log涓彲浠ョ湅鍒颁互涓嬬簿搴﹁绠楃粨鏋溿€� ```markdown -auc : 0.7607724041538813 +auc : 0.7814143582416716 ``` # 妯″瀷鎻忚堪 @@ -292,18 +290,18 @@ auc : 0.7607724041538813 | 澶勭悊鍣� | GPU | | 璧勬簮 | A100-SXM4-40GB | | 涓婁紶鏃ユ湡 | 2022-07-29 | -| MindSpore鐗堟湰 | 1.7.1 | +| MindSpore鐗堟湰 | 1.9 | | 鏁版嵁闆� | [1](#鏁版嵁闆�) | -| 璁粌鍙傛暟 | Epoch=100,<br />batch_size=1000,<br />lr=0.0001 | +| 璁粌鍙傛暟 | Epoch=10,<br />batch_size=1000,<br />lr=0.0001 | | 浼樺寲鍣� | FTRL,Adam | | 鎹熷け鍑芥暟 | Sigmoid浜ゅ弶鐔� | -| AUC鍒嗘暟 | 0.76077 | -| 閫熷害 | 12.837姣/姝� | -| 鎹熷け | 0.4269 | +| AUC鍒嗘暟 | 0.7814143582416716 | +| 閫熷害 | 15.588姣/姝� | +| 鎹熷け | 0.4702615 | | 鍙傛暟(M) | 30 | -| 鎺ㄧ悊妫€鏌ョ偣 | 179.56MB锛�.ckpt鏂囦欢锛� | +| 鎺ㄧ悊妫€鏌ョ偣 | 180MB锛�.ckpt鏂囦欢锛� | -鎵€鏈夊彲鎵ц鑴氭湰鍙傝[姝ゅ](https://gitee.com/mindspore/models/tree/master/official/recommend/FiBiNet/script)銆� +鎵€鏈夊彲鎵ц鑴氭湰鍙傝[姝ゅ](https://gitee.com/mindspore/models/tree/master/official/recommend/fibinet/script)銆� # 闅忔満鎯呭喌璇存槑 diff --git a/official/recommend/fibinet/default_config.yaml b/official/recommend/fibinet/default_config.yaml index 7de64eed8df5209d0f7170558630a754b32c36e4..8b4b7ff79c97bb3d34602e9e170988855ceb2eef 100644 --- a/official/recommend/fibinet/default_config.yaml +++ 
b/official/recommend/fibinet/default_config.yaml @@ -1,4 +1,4 @@ -# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing) +# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly the meaning of each parameter) enable_modelarts: False data_url: "" train_url: "" @@ -12,7 +12,7 @@ enable_profiling: False # ============================================================================== # argparse_init 'fibinet' -epochs: 100 +epochs: 10 full_batch: False batch_size: 1000 eval_batch_size: 1000 @@ -31,7 +31,7 @@ loss_file_name: "loss.log" dataset_type: "mindrecord" field_slice: False sparse: False -eval_while_train: True +eval_while_train: False deep_table_slice_mode: "column_slice" seed: 1024 weight_bias_init: ['normal', 'normal'] @@ -45,7 +45,7 @@ bilinear_type: "all" #bilinear_type: ['all', 'each', 'interaction'] # ============================================================================== # fibinet export device_id: 0 -ckpt_file: "./ckpt/fibinet_train-100_45830.ckpt" +ckpt_file: "./ckpt/fibinet_train-10_41265.ckpt" file_name: "fibinet" file_format: "MINDIR" @@ -67,7 +67,7 @@ threshold: 100 train_line_count: 45840617 skip_id_convert: 0 line_per_sample: 10 -eval_size: 0.2 +eval_size: 0.1 # src/generate_synthetic_data.py 'Generate Synthetic Data' output_file: "./train.txt" diff --git a/official/recommend/fibinet/export.py b/official/recommend/fibinet/export.py index 8308779491d83876143042557490e8e038b005f0..f92c693899b597e62fbfc47293881b113616578a 100644 --- a/official/recommend/fibinet/export.py +++ b/official/recommend/fibinet/export.py @@ -23,7 +23,9 @@ from src.model_utils.config import config from src.model_utils.moxing_adapter import moxing_wrapper -context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) +context.set_context(mode=context.GRAPH_MODE, + device_target=config.device_target, + max_call_depth=10000) def modelarts_pre_process(): pass diff --git 
a/official/recommend/fibinet/script/run_eval_gpu.sh b/official/recommend/fibinet/script/run_eval_gpu.sh index 55b1e280df8bef62fce86482abaf5747c479680c..ebcae2e84bef81a03f4864238098e839107268d1 100644 --- a/official/recommend/fibinet/script/run_eval_gpu.sh +++ b/official/recommend/fibinet/script/run_eval_gpu.sh @@ -17,24 +17,29 @@ echo "==============================================================================================================" echo "Please run the script as: " echo "bash run_eval_gpu.sh" -echo " " +echo "______________________________________________________________________________________________________________" echo "If your data_path or device_id or device_target is different from those in default_config.yaml, " echo "please run the script as: " echo "bash run_eval_gpu.sh DATA_PATH DEVICE_ID DEVICE_TARGET" echo "for example: bash ./script/run_eval_gpu.sh './data/mindrecord/' 1 GPU " echo " " echo "**** FYI: only DEVICE_TARGET=GPU is supported currently. ****" -echo " " +echo "______________________________________________________________________________________________________________" echo "Then you can find detailed log and results in files: eval_output.log, eval.log and loss.log. " echo " " echo "If you want to set up more parameters by yourself, " echo "you are suggested to check the file default_config.yaml and change parameter values there. 
" echo "==============================================================================================================" +if [ $# != 3 ] +then + echo "Usage: bash run_eval_gpu.sh [DATA_PATH] [DEVICE_ID] [DEVICE_TARGET] " +exit 1 +fi + DATA_PATH=$1 DEVICE_ID=$2 -DEVICE_TARGET=GPU - +DEVICE_TARGET=$3 export CUDA_VISIBLE_DEVICES=$DEVICE_ID python ./eval.py --data_path=$DATA_PATH --device_target=$DEVICE_TARGET > eval_output.log 2>&1 & diff --git a/official/recommend/fibinet/script/run_train_gpu.sh b/official/recommend/fibinet/script/run_train_gpu.sh index 1e46e3928430e655c06ff46bbf389e22c02aa207..51083a032caadca9a196f6baeee21eab4613ae14 100644 --- a/official/recommend/fibinet/script/run_train_gpu.sh +++ b/official/recommend/fibinet/script/run_train_gpu.sh @@ -17,24 +17,30 @@ echo "==============================================================================================================" echo "Please run the script as: " echo "bash run_train_gpu.sh" -echo " " +echo "______________________________________________________________________________________________________________" echo "If your data_path or device_id or device_target is different from those in default_config.yaml, " echo "please run the script as: " echo "bash run_train_gpu.sh DATA_PATH DEVICE_ID DEVICE_TARGET EVAL_WHILE_TRAIN " echo "for example: bash ./script/run_train_gpu.sh './data/mindrecord/' 1 GPU True" echo " " echo "**** FYI: only DEVICE_TARGET=GPU is supported currently. ****" -echo " " +echo "______________________________________________________________________________________________________________" echo "Then you can find detailed log and results in files: train.log, eval.log and loss.log. " echo " " echo "If you want to set up more parameters by yourself, " echo "you are suggested to check the file default_config.yaml and change parameter values there. 
" echo "==============================================================================================================" +if [ $# != 4 ] +then + echo "Usage: bash run_train_gpu.sh [DATA_PATH] [DEVICE_ID] [DEVICE_TARGET] [EVAL_WHILE_TRAIN] " +exit 1 +fi + DATA_PATH=$1 DEVICE_ID=$2 -DEVICE_TARGET=GPU -EVAL_WHILE_TRAIN=$3 +DEVICE_TARGET=$3 +EVAL_WHILE_TRAIN=$4 export CUDA_VISIBLE_DEVICES=$DEVICE_ID python ./train.py --data_path=$DATA_PATH --device_target=$DEVICE_TARGET --eval_while_train=$EVAL_WHILE_TRAIN > train.log 2>&1 & diff --git a/official/recommend/fibinet/src/fibinet.py b/official/recommend/fibinet/src/fibinet.py index 49930027d7d9b3ebd55f6fe8054b81bd438b15b1..559079aead87a913698fe02ad00049bdf50e9559 100644 --- a/official/recommend/fibinet/src/fibinet.py +++ b/official/recommend/fibinet/src/fibinet.py @@ -343,8 +343,7 @@ class NetWithLossClass(nn.Cell): sparse = config.sparse parallel_mode = context.get_auto_parallel_context("parallel_mode") is_auto_parallel = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL) - if sparse: - self.no_l2loss = True + self.no_l2loss = sparse self.network = network self.l2_coef = config.l2_coef self.loss = ops.SigmoidCrossEntropyWithLogits() diff --git a/official/recommend/fibinet/train.py b/official/recommend/fibinet/train.py index 8d8f43eafb009a8cb9dc533d1172d2424e529838..03c7a377063ca9e511d8f95f15dbcbe28b541888 100644 --- a/official/recommend/fibinet/train.py +++ b/official/recommend/fibinet/train.py @@ -30,6 +30,7 @@ def train_eval_fibinet(config): data_path = config.data_path batch_size = config.batch_size epochs = config.epochs + sparse = config.sparse if config.dataset_type == "tfrecord": dataset_type = DataType.TFRECORD elif config.dataset_type == "mindrecord": @@ -57,7 +58,6 @@ def train_eval_fibinet(config): auc_metric = AUCMetric() model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric}) eval_callback = EvalCallBack(model, ds_eval, auc_metric, config) - sparse = 
config.sparse out = model.eval(ds_eval, dataset_sink_mode=(not sparse)) print("=====" * 5 + "model.eval() initialized: {}".format(out)) model.train(epochs, ds_train,