diff --git a/oneflow/core/actor/actor.cpp b/oneflow/core/actor/actor.cpp
index a1b16b0a7bebb973f793b9c7a6baea14fd84d9f5..4f1899d6bebc6e5edd3e575725e1c84f9369389b 100644
--- a/oneflow/core/actor/actor.cpp
+++ b/oneflow/core/actor/actor.cpp
@@ -99,7 +99,7 @@ void Actor::InitDeviceCtx(const ThreadCtx& thread_ctx) {
     case DeviceType::kGPU: {
       CudaStreamHandle* cuda_handle = nullptr;
       if (GetLocalWorkStreamId() == 0) {
-        cuda_handle = thread_ctx.compute_cuda_stream.get();
+        cuda_handle = thread_ctx.g_cuda_stream.get();
       } else {
         CHECK(Global<IDMgr>::Get()->IsIndependentLocalWorkStreamId(GetLocalWorkStreamId()));
         cuda_handle = &cuda_handle_;
diff --git a/oneflow/core/actor/copy_hd_actor.cpp b/oneflow/core/actor/copy_hd_actor.cpp
index 472cca94c31ff4669a27b91b853d52289f1bec32..b51ee9728eb1fbd9bcdb6390235cfdb177190416 100644
--- a/oneflow/core/actor/copy_hd_actor.cpp
+++ b/oneflow/core/actor/copy_hd_actor.cpp
@@ -8,22 +8,6 @@ void CopyHdActor::VirtualActorInit(const TaskProto& task_proto) {
   OF_SET_MSG_HANDLER(&CopyHdActor::HandlerNormal);
 }
 
-void CopyHdActor::InitDeviceCtx(const ThreadCtx& thread_ctx) {
-  CHECK_EQ(exec_kernel_vec().size(), 1);
-  const OperatorConf& op_conf = exec_kernel_vec().begin()->kernel->op_conf();
-  CHECK(op_conf.has_copy_hd_conf());
-  CudaStreamHandle* cuda_stream = nullptr;
-  if (op_conf.copy_hd_conf().type() == CopyHdOpConf::H2D) {
-    cuda_stream = thread_ctx.copy_h2d_cuda_stream.get();
-  } else if (op_conf.copy_hd_conf().type() == CopyHdOpConf::D2H) {
-    cuda_stream = thread_ctx.copy_d2h_cuda_stream.get();
-  } else {
-    UNIMPLEMENTED();
-  }
-  CHECK_NOTNULL(cuda_stream);
-  mut_device_ctx().reset(new CudaDeviceCtx(nullptr, 0, cuda_stream));
-}
-
 void CopyHdActor::Act() {
   Regst* in_regst = GetNaiveSoleCurReadable();
   AsyncLaunchKernel(GenDefaultKernelCtx());
diff --git a/oneflow/core/actor/copy_hd_actor.h b/oneflow/core/actor/copy_hd_actor.h
index be3808e8639bbccf17aafeef2ad296580926f34a..16bcba5d1dd145932a522d9b1969a4c2d54c5ffe 100644
--- a/oneflow/core/actor/copy_hd_actor.h
+++ b/oneflow/core/actor/copy_hd_actor.h
@@ -15,7 +15,6 @@ class CopyHdActor final : public Actor {
 
  private:
   void VirtualActorInit(const TaskProto&) override;
-  void InitDeviceCtx(const ThreadCtx&) override;
   void Act() override;
   std::pair<bool, std::vector<std::string>> GetNaiveConsumedRegstDescName() override {
     return {true, {}};
diff --git a/oneflow/core/graph/copy_task_node.cpp b/oneflow/core/graph/copy_task_node.cpp
index f21674c0991c60bf50ef7c5359c326c193664926..007e835690c391b5959003b9c4258f73bd19ce86 100644
--- a/oneflow/core/graph/copy_task_node.cpp
+++ b/oneflow/core/graph/copy_task_node.cpp
@@ -21,17 +21,13 @@ void CopyTaskNode::BuildExecGphAndRegst() {
   node->BindBnWithRegst(node->op()->SoleObn(), out_regst);
 }
 
-void CopyHdTaskNode::Init(int64_t machine_id, int64_t thrd_id, CopyHdOpConf::Type copy_type) {
+void CopyHdTaskNode::Init(CopyHdOpConf::Type copy_type, int64_t machine_id, int64_t dev_phy_id) {
   copy_type_ = copy_type;
   set_machine_id(machine_id);
-  set_thrd_id(thrd_id);
-}
-
-int64_t CopyHdTaskNode::AllocateLocalWorkStreamId() {
-  if (copy_type_ == CopyHdOpConf::H2D) {
-    return 1;
-  } else if (copy_type_ == CopyHdOpConf::D2H) {
-    return 2;
+  if (copy_type == CopyHdOpConf::H2D) {
+    set_thrd_id(Global<IDMgr>::Get()->GetGpuH2DThrdId(dev_phy_id));
+  } else if (copy_type == CopyHdOpConf::D2H) {
+    set_thrd_id(Global<IDMgr>::Get()->GetGpuD2HThrdId(dev_phy_id));
   } else {
     UNIMPLEMENTED();
   }
diff --git a/oneflow/core/graph/copy_task_node.h b/oneflow/core/graph/copy_task_node.h
index 3fe311f10cba163533d09b560f15b219e7f681b6..369bdfbec8590876dee2196daa458a569f8e166b 100644
--- a/oneflow/core/graph/copy_task_node.h
+++ b/oneflow/core/graph/copy_task_node.h
@@ -29,12 +29,20 @@ class CopyHdTaskNode final : public CopyTaskNode {
 
   TaskType GetTaskType() const override { return TaskType::kCopyHd; }
 
-  void Init(int64_t machine_id, int64_t thrd_id, CopyHdOpConf::Type);
+  void Init(CopyHdOpConf::Type, int64_t machine_id, int64_t dev_phy_id);
 
   CopyHdOpConf::Type copy_type() const { return copy_type_; }
+  int64_t MemZoneId121() const override {
+    if (copy_type_ == CopyHdOpConf::H2D) {
+      return TaskNode::MemZoneId121();
+    } else if (copy_type_ == CopyHdOpConf::D2H) {
+      return Global<IDMgr>::Get()->CpuMemZoneId();
+    } else {
+      UNIMPLEMENTED();
+    }
+  }
 
  private:
-  int64_t AllocateLocalWorkStreamId() override;
   void InitProducedRegstMemCase(MemoryCase*) override;
   OperatorConf NewCopyOpConf() override;
 
diff --git a/oneflow/core/graph/logical_node.cpp b/oneflow/core/graph/logical_node.cpp
index 747ea7d51284f62f6e98da5ccc4e384a859507b1..7db45f92c2e54625c7206241f07a4fdff0a0e8e2 100644
--- a/oneflow/core/graph/logical_node.cpp
+++ b/oneflow/core/graph/logical_node.cpp
@@ -192,7 +192,11 @@ void LogicalNode::GenSortedCompTaskNodes(std::function<int64_t(const TaskNode*)>
       CompTaskNode* comp_task_node = NewCompTaskNode();
       comp_task_node->set_machine_id(machine_id);
       if (parallel_desc_->device_type() == DeviceType::kGPU) {
-        comp_task_node->set_thrd_id(Global<IDMgr>::Get()->GetGpuDeviceThrdId(dev_phy_id));
+        if (comp_task_node->NeedIndependentWorkStream()) {
+          comp_task_node->set_thrd_id(Global<IDMgr>::Get()->GetGpuIndependentThrdId(dev_phy_id));
+        } else {
+          comp_task_node->set_thrd_id(Global<IDMgr>::Get()->GetGpuComputeThrdId(dev_phy_id));
+        }
       } else if (parallel_desc_->device_type() == DeviceType::kCPU) {
         comp_task_node->set_thrd_id(AllocateCpuThrdId(comp_task_node));
       } else {
diff --git a/oneflow/core/graph/reduce_scatter_compute_task_node.cpp b/oneflow/core/graph/reduce_scatter_compute_task_node.cpp
index fbcd2ca3a026aedc1dead32d3ded5963e657a071..70782f1cc45a81900c51ffcc4ad5f2f518426c4c 100644
--- a/oneflow/core/graph/reduce_scatter_compute_task_node.cpp
+++ b/oneflow/core/graph/reduce_scatter_compute_task_node.cpp
@@ -18,7 +18,7 @@ void ReduceScatterCompTaskNode::ProduceAllRegstsAndBindEdges() {
       MemoryCase* mem_case = out_regst.get()->mut_mem_case();
       mem_case->Clear();
       mem_case->mutable_device_cuda_mem()->set_device_id(
-          Global<IDMgr>::Get()->GetGpuDevPhyIdFromThrdId(thrd_id()));
+          Global<IDMgr>::Get()->GetGpuPhyIdFromThrdId(thrd_id()));
     }
   }
 }
diff --git a/oneflow/core/graph/task_graph.cpp b/oneflow/core/graph/task_graph.cpp
index e332704bba325e391a30f09600b377c29e613200..12138e22da663c70350f42e82e8320f14463c542 100644
--- a/oneflow/core/graph/task_graph.cpp
+++ b/oneflow/core/graph/task_graph.cpp
@@ -86,7 +86,7 @@ DEFINE_BLD_SUB_TASK_GRAPH_METHOD(BldSubTskGphByOneToOne) {
     CompTaskNode* src = sorted_src_comp_tasks[i];
     CompTaskNode* dst = sorted_dst_comp_tasks[i];
     Connect<TaskNode>(
-        Build121BufTo(src, dst->machine_id(), dst->MemZoneId(),
+        Build121BufTo(src, dst->machine_id(), dst->MemZoneId121(),
                       [&](int64_t machine_id, int32_t mem_zone_id) {
                         return *Mut121BufTask(src, machine_id, mem_zone_id);
                       },
@@ -179,7 +179,7 @@ TaskNode* TaskGraph::Build121BufTo(
       return Build121BufTo(dst_cpu, dst_machine_id, dst_mem_zone_id, Get121BufTask, Set121BufTask);
     }
   } else {
-    if (src->MemZoneId() == dst_mem_zone_id) {
+    if (src->MemZoneId121() == dst_mem_zone_id) {
       return Set121BufTask(dst_machine_id, dst_mem_zone_id, src);
     } else {
       if (dst_mem_zone_id == cpu_mem_zone_id) {
@@ -188,9 +188,8 @@ TaskNode* TaskGraph::Build121BufTo(
         TaskNode* src_cpu =
             Build121BufTo(src, dst_machine_id, cpu_mem_zone_id, Get121BufTask, Set121BufTask);
         CopyHdTaskNode* src_h2d = NewNode<CopyHdTaskNode>();
-        src_h2d->Init(dst_machine_id,
-                      Global<IDMgr>::Get()->GetThrdIdFromGpuMemZoneId(dst_mem_zone_id),
-                      CopyHdOpConf::H2D);
+        src_h2d->Init(CopyHdOpConf::H2D, dst_machine_id,
+                      Global<IDMgr>::Get()->GetGpuPhyIdFromMemZoneId(dst_mem_zone_id));
         Connect<TaskNode>(src_cpu, NewEdge(), src_h2d);
         return Set121BufTask(dst_machine_id, dst_mem_zone_id, src_h2d);
       }
@@ -201,7 +200,7 @@ TaskNode* TaskGraph::Build121BufTo(
 TaskNode* TaskGraph::AddCopyH2DTaskIfNotCpu(TaskNode* task) {
   if (task->device_type() == DeviceType::kCPU) { return task; }
   CopyHdTaskNode* copy_task = NewNode<CopyHdTaskNode>();
-  copy_task->Init(task->machine_id(), task->thrd_id(), CopyHdOpConf::H2D);
+  copy_task->Init(CopyHdOpConf::H2D, task->machine_id(), task->GpuPhyId());
   Connect<TaskNode>(copy_task, NewEdge(), task);
   return copy_task;
 }
@@ -209,7 +208,7 @@ TaskNode* TaskGraph::AddCopyH2DTaskIfNotCpu(TaskNode* task) {
 TaskNode* TaskGraph::AddCopyD2HTaskIfNotCpu(TaskNode* task) {
   if (task->device_type() == DeviceType::kCPU) { return task; }
   CopyHdTaskNode* copy_task = NewNode<CopyHdTaskNode>();
-  copy_task->Init(task->machine_id(), task->thrd_id(), CopyHdOpConf::D2H);
+  copy_task->Init(CopyHdOpConf::D2H, task->machine_id(), task->GpuPhyId());
   Connect<TaskNode>(task, NewEdge(), copy_task);
   return copy_task;
 }
diff --git a/oneflow/core/graph/task_node.cpp b/oneflow/core/graph/task_node.cpp
index 34bc9eacc1adc2d06c0a938fdc91f0437f77ae04..d9152ebc33d08094cf3e368023c0ca8714e79d87 100644
--- a/oneflow/core/graph/task_node.cpp
+++ b/oneflow/core/graph/task_node.cpp
@@ -110,6 +110,15 @@ void TaskNode::ToProto(TaskProto* task_proto) {
   }
 }
 
+int64_t TaskNode::MemZoneId121() const {
+  const IDMgr* id_mgr = Global<IDMgr>::Get();
+  if (device_type() == DeviceType::kCPU) {
+    return id_mgr->CpuMemZoneId();
+  } else {
+    return id_mgr->GpuMemZoneId(id_mgr->GetGpuPhyIdFromThrdId(thrd_id_));
+  }
+}
+
 void TaskNode::BindEdgeWithProducedRegst(TaskEdge* edge, const std::string& name) {
   edge->AddRegst(name, GetProducedRegst(name));
 }
@@ -138,7 +147,7 @@ void TaskNode::InitProducedRegstMemCase(MemoryCase* mem_case) {
     mem_case->mutable_host_mem();
   } else if (device_type() == DeviceType::kGPU) {
     mem_case->mutable_device_cuda_mem()->set_device_id(
-        Global<IDMgr>::Get()->GetGpuDevPhyIdFromThrdId(thrd_id_));
+        Global<IDMgr>::Get()->GetGpuPhyIdFromThrdId(thrd_id_));
   } else {
     UNIMPLEMENTED();
   }
diff --git a/oneflow/core/graph/task_node.h b/oneflow/core/graph/task_node.h
index 809c302e8393c07b6ef9aa9807048dc2076794fe..1df33d28915d439b7624b98c009c7dc5015d189f 100644
--- a/oneflow/core/graph/task_node.h
+++ b/oneflow/core/graph/task_node.h
@@ -30,8 +30,8 @@ class TaskNode : public Node<TaskNode, TaskEdge> {
   std::shared_ptr<RegstDesc> GetSoleConsumedRegst(const std::string& name);
   DeviceType device_type() const;
   virtual const ParallelContext* parallel_ctx() const { return nullptr; }
-  int64_t MemZoneId() const { return Global<IDMgr>::Get()->GetMemZoneIdFromThrdId(thrd_id_); }
   int64_t LocalWorkStreamId() const;
+  int64_t GpuPhyId() const { return Global<IDMgr>::Get()->GetGpuPhyIdFromThrdId(thrd_id_); }
 
   // Setters
   void set_machine_id(int64_t val);
@@ -52,6 +52,8 @@ class TaskNode : public Node<TaskNode, TaskEdge> {
   virtual void ToProto(TaskProto*);
   virtual bool IsPersistence() const { return false; }
   void BindEdgeWithProducedRegst(TaskEdge*, const std::string& name);
+  virtual bool NeedIndependentWorkStream() { return false; }
+  virtual int64_t MemZoneId121() const;  // TODO: there is bug for reduce task node
 
  protected:
   std::shared_ptr<RegstDesc> ProduceRegst(const std::string& name);
@@ -72,8 +74,7 @@ class TaskNode : public Node<TaskNode, TaskEdge> {
   virtual void LockRegsts();
   virtual void FixRegisterNumRange();
 
-  virtual int64_t AllocateLocalWorkStreamId();
-  virtual bool NeedIndependentWorkStream() { return false; }
+  int64_t AllocateLocalWorkStreamId();
 
  private:
   void ClearOutOfDateConsumedRegst();
diff --git a/oneflow/core/job/compiler.cpp b/oneflow/core/job/compiler.cpp
index 0baaac79c465ed5899040b8edb945831929112da..5b4ec35a29b52af14c06e489c74e8cb75e02ee61 100644
--- a/oneflow/core/job/compiler.cpp
+++ b/oneflow/core/job/compiler.cpp
@@ -66,7 +66,7 @@ Plan Compiler::DoCompile() {
   plan.set_total_mbn_num(total_mbn_num);
   FOR_RANGE(int64_t, machine_id, 0, job_desc->TotalMachineNum()) {
     plan.mutable_buf_info()->Add()->mutable_buf_size()->Resize(
-        job_desc->GpuDeviceNum() + job_desc->CpuDeviceNum(), 0);
+        job_desc->GpuDeviceNum() * 4 + job_desc->CpuDeviceNum(), 0);
   }
   task_gph->ForEachNode([&](TaskNode* task_node) {
     if (task_node->IsMeaningLess()) { return; }
diff --git a/oneflow/core/job/id_manager.cpp b/oneflow/core/job/id_manager.cpp
index d158a62bb20d968e8531eddb24112202bed60b7c..11a1fb45b6b5d841e21f4632f9a7c0a577aac9af 100644
--- a/oneflow/core/job/id_manager.cpp
+++ b/oneflow/core/job/id_manager.cpp
@@ -7,6 +7,26 @@ int64_t IDMgr::MachineID4MachineName(const std::string& machine_name) const {
   CHECK(it != machine_name2machine_id_.end()) << "Undefined machine name: " << machine_name;
   return it->second;
 }
+const std::string& IDMgr::MachineName4MachineId(int64_t machine_id) const {
+  return machine_id2machine_name_.at(machine_id);
+}
+
+int64_t IDMgr::GetGpuH2DThrdId(int64_t dev_phy_id) const { return gpu_device_num_ + dev_phy_id; }
+int64_t IDMgr::GetGpuD2HThrdId(int64_t dev_phy_id) const {
+  return gpu_device_num_ * 2 + dev_phy_id;
+}
+int64_t IDMgr::GetGpuIndependentThrdId(int64_t dev_phy_id) const {
+  return gpu_device_num_ * 3 + dev_phy_id;
+}
+int64_t IDMgr::GetCpuDeviceThrdId(int64_t dev_phy_id) const {
+  return gpu_device_num_ * 4 + dev_phy_id;
+}
+int64_t IDMgr::GetPersistenceThrdId(int64_t offset) const {
+  return gpu_device_num_ * 4 + cpu_device_num_ + offset;
+}
+int64_t IDMgr::CommNetThrdId() const {
+  return gpu_device_num_ * 4 + cpu_device_num_ + Global<JobDesc>::Get()->PersistenceWorkerNum();
+}
 
 int64_t IDMgr::NewTaskId(int64_t machine_id, int64_t thrd_id, int64_t local_work_stream_id) {
   int64_t machine_thrd_id = GetMachineThrdId(machine_id, thrd_id);
@@ -18,20 +38,16 @@ int64_t IDMgr::NewTaskId(int64_t machine_id, int64_t thrd_id, int64_t local_work
 }
 
 DeviceType IDMgr::GetDeviceTypeFromThrdId(int64_t thrd_id) const {
-  if (thrd_id < gpu_device_num_) {
+  if (thrd_id < 4 * gpu_device_num_) {
     return DeviceType::kGPU;
   } else {
     return DeviceType::kCPU;
   }
 }
 
-int64_t IDMgr::GetGpuDevPhyIdFromThrdId(int64_t thrd_id) const {
-  CHECK_LT(thrd_id, gpu_device_num_);
-  return thrd_id;
-}
-
-int64_t IDMgr::GetMemZoneIdFromThrdId(int64_t thrd_id) const {
-  return std::min(thrd_id, gpu_device_num_);
+int64_t IDMgr::GetGpuPhyIdFromThrdId(int64_t thrd_id) const {
+  CHECK_LT(thrd_id, 4 * gpu_device_num_);
+  return thrd_id % gpu_device_num_;
 }
 
 DeviceType IDMgr::GetDeviceTypeFromActorId(int64_t actor_id) const {
diff --git a/oneflow/core/job/id_manager.h b/oneflow/core/job/id_manager.h
index caac182c82c0afae6bcbbc20b03a934495208d7a..cc1d338ab9e03bc9dc2f8dc3ec8b0e72e9cb4aa8 100644
--- a/oneflow/core/job/id_manager.h
+++ b/oneflow/core/job/id_manager.h
@@ -14,35 +14,31 @@ class IDMgr final {
 
   // machine_name <-> machine_id
   int64_t MachineID4MachineName(const std::string& machine_name) const;
-  const std::string& MachineName4MachineId(int64_t machine_id) const {
-    return machine_id2machine_name_.at(machine_id);
-  }
+  const std::string& MachineName4MachineId(int64_t machine_id) const;
 
   // Get ThrdId, TaskId, RegstDescId
-  int64_t GetGpuDeviceThrdId(int64_t dev_phy_id) const { return dev_phy_id; }
-  int64_t GetCpuDeviceThrdId(int64_t dev_phy_id) const { return gpu_device_num_ + dev_phy_id; }
-  int64_t GetPersistenceThrdId(int64_t offset) const {
-    return gpu_device_num_ + cpu_device_num_ + offset;
-  }
-  int64_t CommNetThrdId() const {
-    return gpu_device_num_ + cpu_device_num_ + Global<JobDesc>::Get()->PersistenceWorkerNum();
-  }
+  int64_t GetGpuComputeThrdId(int64_t dev_phy_id) const { return dev_phy_id; }
+  int64_t GetGpuH2DThrdId(int64_t dev_phy_id) const;
+  int64_t GetGpuD2HThrdId(int64_t dev_phy_id) const;
+  int64_t GetGpuIndependentThrdId(int64_t dev_phy_id) const;
+  int64_t GetCpuDeviceThrdId(int64_t dev_phy_id) const;
+  int64_t GetPersistenceThrdId(int64_t offset) const;
+  int64_t CommNetThrdId() const;
+
   int64_t NewTaskId(int64_t machine_id, int64_t thrd_id, int64_t local_work_stream_id);
   int64_t NewRegstDescId() { return regst_desc_id_count_++; }
 
-  // Get MemZoneId
+  // MemZoneId
   int64_t CpuMemZoneId() const { return Global<JobDesc>::Get()->GpuDeviceNum(); }
-  int64_t GpuMemZoneId(int64_t dev_phy_id) { return dev_phy_id; }
+  int64_t GpuMemZoneId(int64_t dev_phy_id) const { return dev_phy_id; }
+  int64_t GetGpuPhyIdFromMemZoneId(int64_t mem_zone_id) const {
+    CHECK_LT(mem_zone_id, gpu_device_num_);
+    return mem_zone_id;
+  }
 
   // GetFromThrdId
   DeviceType GetDeviceTypeFromThrdId(int64_t thrd_id) const;
-  int64_t GetGpuDevPhyIdFromThrdId(int64_t thrd_id) const;
-  int64_t GetMemZoneIdFromThrdId(int64_t thrd_id) const;
-
-  int64_t GetThrdIdFromGpuMemZoneId(int64_t mem_zone_id) {
-    CHECK_NE(mem_zone_id, CpuMemZoneId());
-    return mem_zone_id;
-  }
+  int64_t GetGpuPhyIdFromThrdId(int64_t thrd_id) const;
 
   // Runtime
   DeviceType GetDeviceTypeFromActorId(int64_t actor_id) const;
@@ -53,9 +49,7 @@ class IDMgr final {
   // for cpu:
   //   0: the actor thread
   // for gpu:
-  //   0: the compute cuda stream
-  //   1: the copy h2d cuda stream
-  //   2: the copy d2h cuda stream
+  //   0: the global cuda stream
   //   other: start from 100
   int64_t AllocateLocalWorkStreamId(int64_t machine_id, int64_t thrd_id);
   int64_t LocalWorkStreamId4TaskId(int64_t task_id) const;
diff --git a/oneflow/core/thread/gpu_thread.cpp b/oneflow/core/thread/gpu_thread.cpp
index e49381a45046a614acae4a4231a38dd41c439d66..de9fd1fce47ac7988196d9a742459dbce5d27df8 100644
--- a/oneflow/core/thread/gpu_thread.cpp
+++ b/oneflow/core/thread/gpu_thread.cpp
@@ -15,9 +15,7 @@ GpuThread::GpuThread(int64_t thrd_id, int64_t dev_id, size_t buf_size) {
       ThreadCtx ctx;
       ctx.buf_ptr = buf_ptr;
       ctx.buf_size = buf_size;
-      ctx.compute_cuda_stream.reset(new CudaStreamHandle);
-      ctx.copy_h2d_cuda_stream.reset(new CudaStreamHandle);
-      ctx.copy_d2h_cuda_stream.reset(new CudaStreamHandle);
+      ctx.g_cuda_stream.reset(new CudaStreamHandle);
       PollMsgChannel(ctx);
     }
     if (buf_ptr) { CudaCheck(cudaFree(buf_ptr)); }
diff --git a/oneflow/core/thread/thread_context.h b/oneflow/core/thread/thread_context.h
index a4bbadbe482f8d6efb9080cde0d48bce2e1c49df..d9bd82c12c9e042d6c79f13bc1925718a3d0bf8c 100644
--- a/oneflow/core/thread/thread_context.h
+++ b/oneflow/core/thread/thread_context.h
@@ -9,9 +9,7 @@ struct ThreadCtx {
   void* buf_ptr;
   size_t buf_size;
 #ifdef WITH_CUDA
-  std::unique_ptr<CudaStreamHandle> compute_cuda_stream;
-  std::unique_ptr<CudaStreamHandle> copy_h2d_cuda_stream;
-  std::unique_ptr<CudaStreamHandle> copy_d2h_cuda_stream;
+  std::unique_ptr<CudaStreamHandle> g_cuda_stream;
 #endif
 };
 
diff --git a/oneflow/core/thread/thread_manager.cpp b/oneflow/core/thread/thread_manager.cpp
index 1c093f65d921186bcb048ad4d8cfbcb8f262056c..3cb5e5421491ebf8a44a4c3b9edcb862e1d2827c 100644
--- a/oneflow/core/thread/thread_manager.cpp
+++ b/oneflow/core/thread/thread_manager.cpp
@@ -23,9 +23,11 @@ ThreadMgr::ThreadMgr(const Plan& plan) {
   const OneMachineBufInfo& info = plan.buf_info().Get(Global<MachineCtx>::Get()->this_machine_id());
 
 #ifdef WITH_CUDA
-  FOR_RANGE(int64_t, i, 0, job_desc->GpuDeviceNum()) {
-    threads_.push_back(new GpuThread(thrd_id, i, info.buf_size(thrd_id)));
-    thrd_id += 1;
+  FOR_RANGE(int64_t, i, 0, 4) {
+    FOR_RANGE(int64_t, dev_phy_id, 0, job_desc->GpuDeviceNum()) {
+      threads_.push_back(new GpuThread(thrd_id, dev_phy_id, info.buf_size(thrd_id)));
+      thrd_id += 1;
+    }
   }
 #endif
   FOR_RANGE(int64_t, i, 0, job_desc->CpuDeviceNum()) {
diff --git a/oneflow/of_submit b/oneflow/of_submit
index b8a39b4c460c8f9ccaf3a865def14e69cab1d0ab..8e83c2d326188373024da7606a4c97dc0c7e8cc1 100755
--- a/oneflow/of_submit
+++ b/oneflow/of_submit
@@ -2,7 +2,6 @@
 
 import os
 import sys
-import paramiko
 import getpass
 import google.protobuf.text_format as pbtxt
 from tempfile import NamedTemporaryFile