Skip to content
Snippets Groups Projects
Commit 585e0096 authored by Yi Zhu, committed by Jinhui Yuan
Browse files

feat: refine vgg protos (#924)

* refine vgg proto && add report

* refine proto

* add proto for 2 machine

* add proto for 2 machine 4 GPU and measurement of reduce time for 2 machine 8 GPU

* refine protos

* remove comment
parent c4f303df
No related branches found
No related tags found
No related merge requests found
Showing
with 401 additions and 7 deletions
# Job description: names the net, resource, placement and other-settings files used for one run.
# NOTE(review): the placement file is the 1-GPU one while the resource file is the 4-GPU one —
# presumably the resource file is shared across the single-machine runs; confirm.
net: "./train.net"
resource: "./1_machine_4_gpu.resource"
placement: "./1_machine_1_gpu.placement"
other: "./piece_size_16.other"
# Placement description with two groups: the "decode" op on a CPU device of machine "first",
# and every remaining listed op on GPU 0 of machine "first". Both groups use kDataParallel.
# NOTE(review): presumably this is 1_machine_1_gpu.placement (file name not visible here) — confirm.
placement_group {
op_set {
op_name: "decode"
}
parallel_conf {
policy: kDataParallel
device_name: "first:cpu:16"
}
}
# Group 2: transpose, the conv/pool stack, the fc/drop layers and the loss, all on one GPU.
placement_group {
op_set {
op_name: "transpose"
op_name: "conv1_1"
op_name: "conv1_2"
op_name: "pool1"
op_name: "conv2_1"
op_name: "conv2_2"
op_name: "pool2"
op_name: "conv3_1"
op_name: "conv3_2"
op_name: "conv3_3"
op_name: "pool3"
op_name: "conv4_1"
op_name: "conv4_2"
op_name: "conv4_3"
op_name: "pool4"
op_name: "conv5_1"
op_name: "conv5_2"
op_name: "conv5_3"
op_name: "pool5"
op_name: "fc6"
op_name: "drop6"
op_name: "fc7"
op_name: "drop7"
op_name: "fc8"
op_name: "softmax_loss"
}
parallel_conf {
policy: kDataParallel
device_name: "first:gpu:0"
}
}
# Job description: names the net, resource, placement and other-settings files used for one run.
# Differs from the 1-GPU job only in the placement file it references.
net: "./train.net"
resource: "./1_machine_4_gpu.resource"
placement: "./1_machine_2_gpu.placement"
other: "./piece_size_16.other"
# Placement description with two groups: the "decode" op on a CPU device of machine "first",
# and every remaining listed op on the GPU range "0-1" of machine "first". Both groups use kDataParallel.
# NOTE(review): presumably this is 1_machine_2_gpu.placement (file name not visible here) — confirm.
placement_group {
op_set {
op_name: "decode"
}
parallel_conf {
policy: kDataParallel
device_name: "first:cpu:16"
}
}
# Group 2: transpose, the conv/pool stack, the fc/drop layers and the loss.
placement_group {
op_set {
op_name: "transpose"
op_name: "conv1_1"
op_name: "conv1_2"
op_name: "pool1"
op_name: "conv2_1"
op_name: "conv2_2"
op_name: "pool2"
op_name: "conv3_1"
op_name: "conv3_2"
op_name: "conv3_3"
op_name: "pool3"
op_name: "conv4_1"
op_name: "conv4_2"
op_name: "conv4_3"
op_name: "pool4"
op_name: "conv5_1"
op_name: "conv5_2"
op_name: "conv5_3"
op_name: "pool5"
op_name: "fc6"
op_name: "drop6"
op_name: "fc7"
op_name: "drop7"
op_name: "fc8"
op_name: "softmax_loss"
}
parallel_conf {
policy: kDataParallel
device_name: "first:gpu:0-1"
}
}
# Job description: names the net, resource, placement and other-settings files used for one run.
# NOTE(review): "./piece_size_64.other" is not among the files visible in this view — confirm it exists.
net: "./train.net"
resource: "./1_machine_4_gpu.resource"
placement: "./1_machine_4_gpu.placement"
other: "./piece_size_64.other"
# Placement description with two groups: the "decode" op on a CPU device of machine "first",
# and every remaining listed op on the GPU range "0-3" of machine "first". Both groups use kDataParallel.
# NOTE(review): presumably this is 1_machine_4_gpu.placement (file name not visible here) — confirm.
placement_group {
op_set {
op_name: "decode"
}
parallel_conf {
policy: kDataParallel
device_name: "first:cpu:32"
}
}
# Group 2: transpose, the conv/pool stack, the fc/drop layers and the loss.
placement_group {
op_set {
op_name: "transpose"
op_name: "conv1_1"
op_name: "conv1_2"
op_name: "pool1"
op_name: "conv2_1"
op_name: "conv2_2"
op_name: "pool2"
op_name: "conv3_1"
op_name: "conv3_2"
op_name: "conv3_3"
op_name: "pool3"
op_name: "conv4_1"
op_name: "conv4_2"
op_name: "conv4_3"
op_name: "pool4"
op_name: "conv5_1"
op_name: "conv5_2"
op_name: "conv5_3"
op_name: "pool5"
op_name: "fc6"
op_name: "drop6"
op_name: "fc7"
op_name: "drop7"
op_name: "fc8"
op_name: "softmax_loss"
}
parallel_conf {
policy: kDataParallel
device_name: "first:gpu:0-3"
}
}
# Resource description: a single machine entry named "first" plus gpu_device_num: 4.
# The name "first" is the machine prefix used by device_name entries in the placement files.
# NOTE(review): presumably this is 1_machine_4_gpu.resource (file name not visible here) — confirm.
machine {
addr: "192.168.1.12"
port: 7776
name: "first"
}
gpu_device_num: 4
# Job description: names the net, resource, placement and other-settings files used for one run.
# This job uses the HDFS-prefixed other-settings file rather than the plain piece_size_* one.
net: "./train.net"
resource: "./2_machine_4_gpu.resource"
placement: "./2_machine_4_gpu.placement"
other: "./hdfs_piece_size_64.other"
......@@ -4,7 +4,8 @@ placement_group {
}
parallel_conf {
policy: kDataParallel
device_name: "first:cpu:30"
device_name: "first:cpu:16"
device_name: "second:cpu:16"
}
}
......@@ -34,6 +35,7 @@ placement_group {
parallel_conf {
policy: kDataParallel
device_name: "first:gpu:0-1"
device_name: "second:gpu:0-1"
}
}
......@@ -46,7 +48,8 @@ placement_group {
op_name: "fc8"
}
parallel_conf {
policy: kModelParallel
policy: kDataParallel
device_name: "first:gpu:0-1"
device_name: "second:gpu:0-1"
}
}
machine {
addr: "192.168.1.15"
port: 7776
port: 7767
name: "first"
}
cpu_device_num: 24
machine {
addr: "192.168.1.16"
port: 7766
name: "second"
}
gpu_device_num: 2
persistence_worker_num: 36
comm_net_worker_num: 4
# Job description: names the net, resource, placement and other-settings files used for one run.
net: "./train.net"
resource: "./2_machine_8_gpu.resource"
placement: "./2_machine_8_gpu.placement"
other: "./hdfs_piece_size_128.other"
# Placement description with three groups, all kDataParallel:
#   1. "decode" on a CPU device of machine "first";
#   2. transpose, the conv/pool stack and softmax_loss on GPU range "0-3" of "first" and "second";
#   3. the fc/drop layers (fc6..fc8) on the same GPU ranges, kept as a separate group.
# NOTE(review): presumably this is 2_machine_8_gpu.placement (file name not visible here) — confirm.
# NOTE(review): "decode" is placed only on machine "first" while the GPU groups span
# "first" and "second" — confirm this is intended.
placement_group {
op_set {
op_name: "decode"
}
parallel_conf {
policy: kDataParallel
device_name: "first:cpu:32"
}
}
placement_group {
op_set {
op_name: "transpose"
op_name: "conv1_1"
op_name: "conv1_2"
op_name: "pool1"
op_name: "conv2_1"
op_name: "conv2_2"
op_name: "pool2"
op_name: "conv3_1"
op_name: "conv3_2"
op_name: "conv3_3"
op_name: "pool3"
op_name: "conv4_1"
op_name: "conv4_2"
op_name: "conv4_3"
op_name: "pool4"
op_name: "conv5_1"
op_name: "conv5_2"
op_name: "conv5_3"
op_name: "pool5"
op_name: "softmax_loss"
}
parallel_conf {
policy: kDataParallel
device_name: "first:gpu:0-3"
device_name: "second:gpu:0-3"
}
}
# Fully-connected layers: same policy and devices as the group above, but declared separately.
placement_group {
op_set {
op_name: "fc6"
op_name: "drop6"
op_name: "fc7"
op_name: "drop7"
op_name: "fc8"
}
parallel_conf {
policy: kDataParallel
device_name: "first:gpu:0-3"
device_name: "second:gpu:0-3"
}
}
# Resource description: two machine entries, "first" and "second", plus gpu_device_num: 4.
# The two machines use different addresses and different ports (7766 and 7767).
# NOTE(review): presumably this is 2_machine_8_gpu.resource (file name not visible here) — confirm.
machine {
addr: "192.168.1.12"
port: 7766
name: "first"
}
machine {
addr: "192.168.1.14"
port: 7767
name: "second"
}
gpu_device_num: 4
# Job description: names the net, resource, placement and other-settings files used for one run.
# Reuses the 2-machine 8-GPU resource file with a separate "reduce" placement and the piece_size_8 settings.
net: "./train.net"
resource: "./2_machine_8_gpu.resource"
placement: "./2_machine_8_gpu_reduce.placement"
other: "./hdfs_piece_size_8.other"
# Placement description with two groups, both kDataParallel: "decode" on a CPU device of
# machine "first", and every remaining listed op (including fc6..fc8) in a single group on
# GPU range "0-3" of both "first" and "second".
# NOTE(review): presumably this is 2_machine_8_gpu_reduce.placement (file name not visible here) — confirm.
placement_group {
op_set {
op_name: "decode"
}
parallel_conf {
policy: kDataParallel
device_name: "first:cpu:8"
}
}
# Group 2: transpose, the conv/pool stack, the fc/drop layers and the loss.
placement_group {
op_set {
op_name: "transpose"
op_name: "conv1_1"
op_name: "conv1_2"
op_name: "pool1"
op_name: "conv2_1"
op_name: "conv2_2"
op_name: "pool2"
op_name: "conv3_1"
op_name: "conv3_2"
op_name: "conv3_3"
op_name: "pool3"
op_name: "conv4_1"
op_name: "conv4_2"
op_name: "conv4_3"
op_name: "pool4"
op_name: "conv5_1"
op_name: "conv5_2"
op_name: "conv5_3"
op_name: "pool5"
op_name: "fc6"
op_name: "drop6"
op_name: "fc7"
op_name: "drop7"
op_name: "fc8"
op_name: "softmax_loss"
}
parallel_conf {
policy: kDataParallel
device_name: "first:gpu:0-3"
device_name: "second:gpu:0-3"
}
}
use_rdma: false
use_rdma: true
globalfs_conf {
hdfs_conf {
namenode: "hdfs://192.168.1.11:9000"
}
}
piece_size: 30
data_part_num: 30
piece_size: 128
data_part_num: 32
max_data_id_length: 0
train_conf {
model_save_snapshots_path: "/zhuyi/vgg_snapshots_buf"
num_of_batches_in_snapshot: 1000
staleness: 0
total_batch_num: 200000
total_batch_num: 50
default_initializer_conf {
msra_conf {
variance_norm: kFanOut
}
}
batch_size: 240
batch_size: 256
l2: 0.0005
model_update_conf {
momentum_conf {
......
# Other (runtime/training) settings: RDMA enabled, HDFS as the global filesystem,
# piece_size 32 with data_part_num 32.
# NOTE(review): file name is not visible in this view; presumably hdfs_piece_size_32.other — confirm.
use_rdma: true
globalfs_conf {
hdfs_conf {
namenode: "hdfs://192.168.1.11:9000"
}
}
piece_size: 32
data_part_num: 32
max_data_id_length: 0
# Training settings: 50 total batches of batch_size 256, msra initializer (kFanOut),
# L2 coefficient 0.0005 and momentum update with beta 0.9.
train_conf {
model_save_snapshots_path: "/zhuyi/vgg_snapshots_buf"
num_of_batches_in_snapshot: 1000
staleness: 0
total_batch_num: 50
default_initializer_conf {
msra_conf {
variance_norm: kFanOut
}
}
batch_size: 256
l2: 0.0005
model_update_conf {
momentum_conf {
beta: 0.9
}
}
}
# Other (runtime/training) settings: RDMA enabled, HDFS as the global filesystem,
# piece_size 64 with data_part_num 32.
# NOTE(review): file name is not visible in this view; presumably hdfs_piece_size_64.other — confirm.
use_rdma: true
globalfs_conf {
hdfs_conf {
namenode: "hdfs://192.168.1.11:9000"
}
}
piece_size: 64
data_part_num: 32
max_data_id_length: 0
# Training settings: 50 total batches of batch_size 256, msra initializer (kFanOut),
# L2 coefficient 0.0005 and momentum update with beta 0.9.
train_conf {
model_save_snapshots_path: "/zhuyi/vgg_snapshots_buf"
num_of_batches_in_snapshot: 1000
staleness: 0
total_batch_num: 50
default_initializer_conf {
msra_conf {
variance_norm: kFanOut
}
}
batch_size: 256
l2: 0.0005
model_update_conf {
momentum_conf {
beta: 0.9
}
}
}
# Other (runtime/training) settings: RDMA enabled, HDFS as the global filesystem,
# piece_size 8 with data_part_num 8.
# NOTE(review): file name is not visible in this view; presumably hdfs_piece_size_8.other — confirm.
use_rdma: true
globalfs_conf {
hdfs_conf {
namenode: "hdfs://192.168.1.11:9000"
}
}
piece_size: 8
data_part_num: 8
max_data_id_length: 0
# Training settings: 50 total batches of batch_size 8 (equal to piece_size here),
# msra initializer (kFanOut), L2 coefficient 0.0005 and momentum update with beta 0.9.
train_conf {
model_save_snapshots_path: "/zhuyi/vgg_snapshots_buf"
num_of_batches_in_snapshot: 1000
staleness: 0
total_batch_num: 50
default_initializer_conf {
msra_conf {
variance_norm: kFanOut
}
}
batch_size: 8
l2: 0.0005
model_update_conf {
momentum_conf {
beta: 0.9
}
}
}
# Other (runtime/training) settings: RDMA disabled, local filesystem as the global filesystem
# (empty localfs_conf), piece_size 16 with data_part_num 16.
# NOTE(review): file name is not visible in this view; presumably piece_size_16.other — confirm.
use_rdma: false
globalfs_conf {
localfs_conf {
}
}
piece_size: 16
data_part_num: 16
max_data_id_length: 0
# Training settings: 50 total batches of batch_size 256, msra initializer (kFanOut),
# L2 coefficient 0.0005 and momentum update with beta 0.9.
# The snapshot path is under /home/zhuyi here, unlike the HDFS variants which use /zhuyi.
train_conf {
model_save_snapshots_path: "/home/zhuyi/vgg_snapshots_buf"
num_of_batches_in_snapshot: 1000
staleness: 0
total_batch_num: 50
default_initializer_conf {
msra_conf {
variance_norm: kFanOut
}
}
batch_size: 256
l2: 0.0005
model_update_conf {
momentum_conf {
beta: 0.9
}
}
}
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment