NCF Description

NCF is a general framework for collaborative filtering in recommendation, in which a neural network architecture is used to model user-item interactions. Unlike traditional models, NCF does not resort to Matrix Factorization (MF) with an inner product on the latent features of users and items; instead, it replaces the inner product with a multi-layer perceptron that can learn an arbitrary function from data.

Paper: He X, Liao L, Zhang H, et al. Neural collaborative filtering[C]//Proceedings of the 26th international conference on world wide web. 2017: 173-182.

Model Architecture

Two instantiations of NCF are Generalized Matrix Factorization (GMF) and the Multi-Layer Perceptron (MLP). GMF applies a linear kernel to model the latent feature interactions, and MLP uses a nonlinear kernel to learn the interaction function from data. NeuMF fuses GMF and MLP to better model complex user-item interactions, unifying the linearity of MF and the non-linearity of MLP for modeling the user-item latent structures. NeuMF allows GMF and MLP to learn separate embeddings, and combines the two models by concatenating their last hidden layer. neumf_model.py defines the architecture details.
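
For intuition, the fusion can be sketched in a few lines of MindSpore. The class below is illustrative only (layer sizes and names are hypothetical defaults), not the repo's actual implementation:

import mindspore.nn as nn
import mindspore.ops as ops

class NeuMF(nn.Cell):
    """Illustrative NeuMF: a GMF branch fused with an MLP branch."""
    def __init__(self, num_users, num_items, mf_dim=16, mlp_dims=(64, 32, 16)):
        super().__init__()
        # GMF and MLP learn separate embeddings, as described above
        self.gmf_user = nn.Embedding(num_users, mf_dim)
        self.gmf_item = nn.Embedding(num_items, mf_dim)
        self.mlp_user = nn.Embedding(num_users, mlp_dims[0] // 2)
        self.mlp_item = nn.Embedding(num_items, mlp_dims[0] // 2)
        self.mlp = nn.SequentialCell(
            [nn.Dense(i, o, activation='relu') for i, o in zip(mlp_dims[:-1], mlp_dims[1:])])
        self.concat = ops.Concat(axis=1)
        # the two branches are combined by concatenating their last hidden layers
        self.predict = nn.Dense(mf_dim + mlp_dims[-1], 1)

    def construct(self, user, item):
        gmf_out = self.gmf_user(user) * self.gmf_item(item)  # linear (GMF) kernel
        mlp_out = self.mlp(self.concat((self.mlp_user(user), self.mlp_item(item))))
        return self.predict(self.concat((gmf_out, mlp_out)))  # prediction logit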

Dataset

The MovieLens datasets are used for model training and evaluation. Specifically, we use two datasets: ml-1m (short for MovieLens 1 million) and ml-20m (short for MovieLens 20 million).

ml-1m

The ml-1m dataset contains 1,000,209 anonymous ratings of approximately 3,706 movies made by 6,040 users who joined MovieLens in 2000. All ratings are contained in the file "ratings.dat", without a header row, in the following format:

  UserID::MovieID::Rating::Timestamp
  • UserIDs range between 1 and 6040.
  • MovieIDs range between 1 and 3952.
  • Ratings are made on a 5-star scale (whole-star ratings only).

ml-20m

The ml-20m dataset contains 20,000,263 ratings of 26,744 movies by 138,493 users. All ratings are contained in the file "ratings.csv". Each line of this file after the header row represents one rating of one movie by one user, in the following format:

userId,movieId,rating,timestamp
  • The lines within this file are ordered first by userId, then, within user, by movieId.
  • Ratings are made on a 5-star scale, with half-star increments (0.5 stars - 5.0 stars).

In both datasets, the timestamp is represented in seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970. Each user has at least 20 ratings.
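
As an illustration only, both formats can be read with a few lines of Python (the repo's own loading logic lives in src/movielens.py and src/dataset.py):

import csv

# ml-1m: "::"-separated fields, no header row (the file is latin-1 encoded)
with open("ratings.dat", encoding="latin-1") as f:
    for line in f:
        user_id, movie_id, rating, timestamp = line.strip().split("::")

# ml-20m: standard CSV with a header row
with open("ratings.csv", newline="") as f:
    for row in csv.DictReader(f):
        user_id, movie_id = row["userId"], row["movieId"]
        rating, timestamp = float(row["rating"]), int(row["timestamp"])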

Features

Mixed Precision

Mixed precision training accelerates the training of deep neural networks by using both the single-precision (FP32) and half-precision (FP16) data formats, while maintaining the accuracy achieved with single-precision training. It speeds up computation, reduces memory usage, and enables larger models or batch sizes to be trained on specific hardware. For FP16 operators, if the input data type is FP32, the MindSpore backend automatically handles it with reduced precision. Users can check the reduced-precision operators by enabling the INFO log and searching for 'reduce precision'.
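
A minimal sketch of how mixed precision is typically enabled through MindSpore's Model wrapper (the network, loss, and optimizer below are placeholders; the same amp_level="O2" pattern appears in the continue-training example at the end of this document):

import mindspore.nn as nn
from mindspore.train.model import Model

# Placeholder network/loss/optimizer, just to show where amp_level plugs in.
net = nn.Dense(16, 2)
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
opt = nn.Momentum(net.trainable_params(), learning_rate=0.01, momentum=0.9)

# amp_level="O2" keeps master weights in FP32 while running most operators
# in FP16, which is the mixed-precision behavior described above.
model = Model(net, loss_fn=loss, optimizer=opt, amp_level="O2")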

Environment Requirements

Quick Start

After installing MindSpore via the official website, you can start training and evaluation as follows:

# download and process the dataset
bash scripts/run_download_dataset.sh

# run training example on Ascend
bash scripts/run_train.sh

# run training example on GPU
bash scripts/run_train_gpu.sh

# run distributed training example on Ascend
bash scripts/run_distribute_train.sh /path/hccl.json /path/MovieLens

# run evaluation example on Ascend
bash scripts/run_eval.sh

# run evaluation example on GPU
bash scripts/run_eval_gpu.sh

If you want to run on ModelArts, please check its official documentation. You can start training and evaluation as follows:

# run distributed training on modelarts example
# (1) Perform a or b.
#       a. Set "enable_modelarts=True" on default_config.yaml file.
#          Set other parameters on default_config.yaml file you need.
#       b. Add "enable_modelarts=True" on the website UI interface.
#          Add other parameters on the website UI interface.
# (2) Set the code directory to "/path/ncf" on the website UI interface.
# (3) Set the startup file to "train.py" on the website UI interface.
# (4) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
# (5) Create your job.

# run evaluation on modelarts example
# (1) Copy or upload your trained model to S3 bucket.
# (2) Perform a or b.
#       a. Set "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on default_config.yaml file.
#          Set "checkpoint_url=/The path of checkpoint in S3/" on default_config.yaml file.
#       b. Add "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on the website UI interface.
#          Add "checkpoint_url=/The path of checkpoint in S3/" on the website UI interface.
# (3) Set the code directory to "/path/ncf" on the website UI interface.
# (4) Set the startup file to "eval.py" on the website UI interface.
# (5) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
# (6) Create your job.

# run export on modelarts example
# (1) Copy or upload your trained model to S3 bucket.
# (2) Perform a or b.
#       a. Set "file_name='ncf'" on default_config.yaml file.
#          Set "file_format='MINDIR'" on default_config.yaml file.
#          Set "ckpt_file='/cache/checkpoint_path/model.ckpt'" on default_config.yaml file.
#          Set "checkpoint_url=/The path of checkpoint in S3/" on default_config.yaml file.
#       b. Add "file_name='ncf'" on the website UI interface.
#          Add "file_format='MINDIR'" on the website UI interface.
#          Add "ckpt_file='/cache/checkpoint_path/model.ckpt'" on the website UI interface.
#          Set "checkpoint_url=/The path of checkpoint in S3/" on the website UI interface.
# (3) Set the code directory to "/path/ncf" on the website UI interface.
# (4) Set the startup file to "export.py" on the website UI interface.
# (5) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
# (6) Create your job.

Script Description

Script and Sample Code

├── ModelZoo_NCF_ME
    ├── README.md                          // descriptions about NCF
    ├── scripts
    │   ├── ascend_distributed_launcher
    │   │   ├── __init__.py                    // init file
    │   │   ├── get_distribute_pretrain_cmd.py // creates the distributed training shell script
    │   ├── run_train.sh                   // shell script for training on Ascend
    │   ├── run_distribute_train.sh        // shell script for distributed training
    │   ├── run_eval.sh                    // shell script for evaluation on Ascend
    │   ├── run_train_gpu.sh               // shell script for training on GPU
    │   ├── run_eval_gpu.sh                // shell script for evaluation on GPU
    │   ├── run_download_dataset.sh        // shell script for dataset download and processing
    │   ├── run_transfer_ckpt_to_air.sh    // shell script for converting a checkpoint to AIR
    ├── src
    │   ├── dataset.py                     // dataset creation
    │   ├── ncf.py                         // NCF architecture
    │   ├── config.py                      // parameter parsing
    │   ├── device_adapter.py              // device adapter
    │   ├── local_adapter.py               // local adapter
    │   ├── moxing_adapter.py              // moxing adapter
    │   ├── movielens.py                   // data download
    │   ├── callbacks.py                   // model loss and eval callbacks
    │   ├── constants.py                   // model constants
    │   ├── export.py                      // exports checkpoint files into AIR/MINDIR
    │   ├── metrics.py                     // AUC computation
    │   ├── stat_utils.py                  // data processing helper functions
    ├── default_config.yaml                // parameter configuration
    ├── train.py                           // training script
    ├── eval.py                            // evaluation script

Script Parameters

Parameters for both training and evaluation can be set in config.py.

  • config for NCF, ml-1m dataset

    * `--data_path`: This should be set to the same directory given to the data download script's data_dir argument.
    * `--dataset`: The dataset name to be downloaded and preprocessed. By default, it is ml-1m.
    * `--train_epochs`: Total train epochs.
    * `--batch_size`: Training batch size.
    * `--eval_batch_size`: Eval batch size.
    * `--num_neg`: The number of negative instances to pair with a positive instance.
    * `--layers`: The sizes of the hidden layers for the MLP.
    * `--num_factors`: The embedding size of the MF model.
    * `--output_path`: The location of the output files.
    * `--eval_file_name`: Eval output file name.
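
For illustration, these flags could be combined in a single hypothetical launch (values are examples only; actual defaults live in default_config.yaml):

import subprocess

# Hypothetical invocation of train.py with a few of the flags listed above.
subprocess.run(
    ["python", "train.py",
     "--data_path", "./data",        # same directory used by the download step
     "--dataset", "ml-1m",
     "--train_epochs", "25",
     "--batch_size", "256"],
    check=True)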

Training Process

Training

  • on Ascend

    # train single
    bash scripts/run_train.sh
    # train distribute
    bash scripts/run_distribute_train.sh [RANK_TABLE_FILE] [DATA_PATH]
  • on GPU

    bash scripts/run_train_gpu.sh

    The command above runs in the background; you can view the results in the file train.log. After training, you will get some checkpoint files under the script folder by default. The loss values will look as follows:

    # grep "loss is " train.log
    ds_train.size: 95
    epoch: 1 step: 95, loss is 0.25074288
    epoch: 2 step: 95, loss is 0.23324402
    epoch: 3 step: 95, loss is 0.18286772
    ...  

    The model checkpoint will be saved in the current directory.

Evaluation Process

Evaluation

  • evaluation on ml-1m dataset when running on Ascend

    Before running the command below, please check the checkpoint path used for evaluation, and set it to the absolute full path, e.g., "checkpoint/ncf-125_390.ckpt".

    bash scripts/run_eval.sh

    The above command will run in the background. You can view the results through the file "eval.log". The accuracy on the test dataset will be as follows (a sketch of how HR and NDCG are computed follows this list):

    # grep "accuracy: " eval.log
    HR:0.6846,NDCG:0.410
  • evaluation on ml-1m dataset when running on GPU

    For details, see the evaluation on the ml-1m dataset when running on Ascend above.

    bash scripts/run_eval_gpu.sh
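
In the outputs above, HR is the hit ratio and NDCG the normalized discounted cumulative gain, computed under the leave-one-out protocol of the NCF paper (each user's held-out positive item is ranked against sampled negatives, with a cutoff of 10). A minimal per-user sketch, for illustration only (the repo's implementation is in src/metrics.py):

import numpy as np

def hr_ndcg_at_k(scores, pos_index, k=10):
    """scores holds the predictions for one positive item (at pos_index)
    plus sampled negatives; returns (HR@k, NDCG@k) for this user."""
    rank = list(np.argsort(-scores)).index(pos_index)  # 0-based rank of the positive
    if rank >= k:
        return 0.0, 0.0
    return 1.0, 1.0 / np.log2(rank + 2)  # NDCG discounts lower ranks

# Example: the positive item (index 0) ranks first among 4 negatives.
print(hr_ndcg_at_k(np.array([0.9, 0.2, 0.8, 0.1, 0.3]), pos_index=0))  # (1.0, 1.0)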

Inference Process

Export MindIR

python export.py --ckpt_file [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT]

The ckpt_file parameter is required; FILE_FORMAT must be either "AIR" or "MINDIR".
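
For reference, export.py presumably follows MindSpore's standard export flow; a minimal sketch (the NeuMF class from the architecture sketch above, the checkpoint path, and the batch shape are all assumptions):

import numpy as np
from mindspore import Tensor
from mindspore.train.serialization import export, load_checkpoint, load_param_into_net

net = NeuMF(num_users=6040, num_items=3706)            # illustrative network, as sketched earlier
load_param_into_net(net, load_checkpoint("ncf.ckpt"))  # hypothetical checkpoint path
users = Tensor(np.zeros((256,), np.int32))             # dummy inputs fix the exported input shape
items = Tensor(np.zeros((256,), np.int32))
export(net, users, items, file_name="ncf", file_format="MINDIR")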

Infer on Ascend310

Before performing inference, the MINDIR file must be exported by the export.py script. We only provide an example of inference using the MINDIR model.

# Ascend310 inference
bash run_infer_310.sh [MINDIR_PATH] [NEED_PREPROCESS] [DEVICE_ID]
  • NEED_PREPROCESS indicates whether preprocessing is needed; its value is 'y' or 'n'.
  • DEVICE_ID is optional; the default value is 0.

Result

The inference result is saved in the current path; you can find results like the following in the acc.log file.

  HR:0.6846,NDCG:0.410

Model Description

Performance

Evaluation Performance

Parameters          | Ascend                                                       | GPU
------------------- | ------------------------------------------------------------ | ----------------
Model Version       | NCF                                                          | NCF
Resource            | Ascend 910; CPU 2.60GHz, 56 cores; memory 314G; OS Euler2.8  | NV SMX2 V100-32G
Uploaded Date       | 10/23/2020 (month/day/year)                                  | 08/28/2021 (month/day/year)
MindSpore Version   | 1.0.0                                                        | 1.4.0
Dataset             | ml-1m                                                        | ml-1m
Training Parameters | epoch=25, steps=19418, batch_size=256, lr=0.00382059         | epoch=25, steps=19418, batch_size=256, lr=0.00382059
Optimizer           | GradOperation                                                | GradOperation
Loss Function       | Softmax Cross Entropy                                        | Softmax Cross Entropy
Outputs             | probability                                                  | probability
Speed               | 1pc: 0.575 ms/step                                           | 1pc: 2.5 ms/step
Total Time          | 1pc: 5 mins                                                  | 1pc: 25 mins

Inference Performance

Parameters        | Ascend
----------------- | ---------------------------
Model Version     | NCF
Resource          | Ascend 910; OS Euler2.8
Uploaded Date     | 10/23/2020 (month/day/year)
MindSpore Version | 1.0.0
Dataset           | ml-1m
batch_size        | 256
Outputs           | probability
Accuracy          | HR: 0.6846, NDCG: 0.410

How to Use

Inference

If you need to use the trained model to perform inference on multiple hardware platforms, such as Ascend 910 or Ascend 310, you can refer to this link:

https://www.mindspore.cn/tutorials/experts/en/master/infer/inference.html

The following is a simple example:

# NOTE: generic ModelZoo example (it uses GoogleNet rather than NCF); GoogleNet,
# cfg, and the dataset module are assumed to come from the corresponding repo.
import mindspore.nn as nn
from mindspore.train.model import Model
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.nn.optim import Momentum

# Load unseen dataset for inference
dataset = dataset.create_dataset(cfg.data_path, 1, False)

# Define model
net = GoogleNet(num_classes=cfg.num_classes)
opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), 0.01,
               cfg.momentum, weight_decay=cfg.weight_decay)
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})

# Load pre-trained model
param_dict = load_checkpoint(cfg.checkpoint_path)
load_param_into_net(net, param_dict)
net.set_train(False)

# Make predictions on the unseen dataset
acc = model.eval(dataset)
print("accuracy: ", acc)

Continue Training on the Pretrained Model

# NOTE: generic continue-training example; create_dataset, GoogleNet, lr_steps,
# and cfg are assumed repo-specific helpers.
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.train.model import Model
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.nn.optim import Momentum

# Load dataset
dataset = create_dataset(cfg.data_path, cfg.epoch_size)
batch_num = dataset.get_dataset_size()

# Define model
net = GoogleNet(num_classes=cfg.num_classes)
# Continue training if set pre_trained to be True
if cfg.pre_trained:
    param_dict = load_checkpoint(cfg.checkpoint_path)
    load_param_into_net(net, param_dict)
lr = lr_steps(0, lr_max=cfg.lr_init, total_epochs=cfg.epoch_size,
              steps_per_epoch=batch_num)
opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
               Tensor(lr), cfg.momentum, weight_decay=cfg.weight_decay)
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'},
              amp_level="O2", keep_batchnorm_fp32=False, loss_scale_manager=None)

# Set callbacks
config_ck = CheckpointConfig(save_checkpoint_steps=batch_num * 5,
                             keep_checkpoint_max=cfg.keep_checkpoint_max)
time_cb = TimeMonitor(data_size=batch_num)
ckpoint_cb = ModelCheckpoint(prefix="train_googlenet_cifar10", directory="./",
                             config=config_ck)
loss_cb = LossMonitor()

# Start training
model.train(cfg.epoch_size, dataset, callbacks=[time_cb, ckpoint_cb, loss_cb])
print("train success")

Description of Random Situation

In dataset.py, we set the seed inside the "create_dataset" function. We also use a random seed in train.py.
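
For reference, a minimal sketch of this kind of seeding (an assumed pattern, not the exact lines from the scripts):

from mindspore.common import set_seed
import mindspore.dataset as ds

set_seed(1)            # fixes MindSpore's global seed (parameter init, random ops)
ds.config.set_seed(1)  # fixes the dataset pipeline's shuffle seed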

ModelZoo Homepage

Please check the official homepage.