diff --git a/oneflow/core/actor/actor.cpp b/oneflow/core/actor/actor.cpp index a1b16b0a7bebb973f793b9c7a6baea14fd84d9f5..4f1899d6bebc6e5edd3e575725e1c84f9369389b 100644 --- a/oneflow/core/actor/actor.cpp +++ b/oneflow/core/actor/actor.cpp @@ -99,7 +99,7 @@ void Actor::InitDeviceCtx(const ThreadCtx& thread_ctx) { case DeviceType::kGPU: { CudaStreamHandle* cuda_handle = nullptr; if (GetLocalWorkStreamId() == 0) { - cuda_handle = thread_ctx.compute_cuda_stream.get(); + cuda_handle = thread_ctx.g_cuda_stream.get(); } else { CHECK(Global<IDMgr>::Get()->IsIndependentLocalWorkStreamId(GetLocalWorkStreamId())); cuda_handle = &cuda_handle_; diff --git a/oneflow/core/actor/copy_hd_actor.cpp b/oneflow/core/actor/copy_hd_actor.cpp index 472cca94c31ff4669a27b91b853d52289f1bec32..b51ee9728eb1fbd9bcdb6390235cfdb177190416 100644 --- a/oneflow/core/actor/copy_hd_actor.cpp +++ b/oneflow/core/actor/copy_hd_actor.cpp @@ -8,22 +8,6 @@ void CopyHdActor::VirtualActorInit(const TaskProto& task_proto) { OF_SET_MSG_HANDLER(&CopyHdActor::HandlerNormal); } -void CopyHdActor::InitDeviceCtx(const ThreadCtx& thread_ctx) { - CHECK_EQ(exec_kernel_vec().size(), 1); - const OperatorConf& op_conf = exec_kernel_vec().begin()->kernel->op_conf(); - CHECK(op_conf.has_copy_hd_conf()); - CudaStreamHandle* cuda_stream = nullptr; - if (op_conf.copy_hd_conf().type() == CopyHdOpConf::H2D) { - cuda_stream = thread_ctx.copy_h2d_cuda_stream.get(); - } else if (op_conf.copy_hd_conf().type() == CopyHdOpConf::D2H) { - cuda_stream = thread_ctx.copy_d2h_cuda_stream.get(); - } else { - UNIMPLEMENTED(); - } - CHECK_NOTNULL(cuda_stream); - mut_device_ctx().reset(new CudaDeviceCtx(nullptr, 0, cuda_stream)); -} - void CopyHdActor::Act() { Regst* in_regst = GetNaiveSoleCurReadable(); AsyncLaunchKernel(GenDefaultKernelCtx()); diff --git a/oneflow/core/actor/copy_hd_actor.h b/oneflow/core/actor/copy_hd_actor.h index be3808e8639bbccf17aafeef2ad296580926f34a..16bcba5d1dd145932a522d9b1969a4c2d54c5ffe 100644 --- a/oneflow/core/actor/copy_hd_actor.h +++ b/oneflow/core/actor/copy_hd_actor.h @@ -15,7 +15,6 @@ class CopyHdActor final : public Actor { private: void VirtualActorInit(const TaskProto&) override; - void InitDeviceCtx(const ThreadCtx&) override; void Act() override; std::pair<bool, std::vector<std::string>> GetNaiveConsumedRegstDescName() override { return {true, {}}; diff --git a/oneflow/core/graph/copy_task_node.cpp b/oneflow/core/graph/copy_task_node.cpp index f21674c0991c60bf50ef7c5359c326c193664926..007e835690c391b5959003b9c4258f73bd19ce86 100644 --- a/oneflow/core/graph/copy_task_node.cpp +++ b/oneflow/core/graph/copy_task_node.cpp @@ -21,17 +21,13 @@ void CopyTaskNode::BuildExecGphAndRegst() { node->BindBnWithRegst(node->op()->SoleObn(), out_regst); } -void CopyHdTaskNode::Init(int64_t machine_id, int64_t thrd_id, CopyHdOpConf::Type copy_type) { +void CopyHdTaskNode::Init(CopyHdOpConf::Type copy_type, int64_t machine_id, int64_t dev_phy_id) { copy_type_ = copy_type; set_machine_id(machine_id); - set_thrd_id(thrd_id); -} - -int64_t CopyHdTaskNode::AllocateLocalWorkStreamId() { - if (copy_type_ == CopyHdOpConf::H2D) { - return 1; - } else if (copy_type_ == CopyHdOpConf::D2H) { - return 2; + if (copy_type == CopyHdOpConf::H2D) { + set_thrd_id(Global<IDMgr>::Get()->GetGpuH2DThrdId(dev_phy_id)); + } else if (copy_type == CopyHdOpConf::D2H) { + set_thrd_id(Global<IDMgr>::Get()->GetGpuD2HThrdId(dev_phy_id)); } else { UNIMPLEMENTED(); } diff --git a/oneflow/core/graph/copy_task_node.h b/oneflow/core/graph/copy_task_node.h index 3fe311f10cba163533d09b560f15b219e7f681b6..369bdfbec8590876dee2196daa458a569f8e166b 100644 --- a/oneflow/core/graph/copy_task_node.h +++ b/oneflow/core/graph/copy_task_node.h @@ -29,12 +29,20 @@ class CopyHdTaskNode final : public CopyTaskNode { TaskType GetTaskType() const override { return TaskType::kCopyHd; } - void Init(int64_t machine_id, int64_t thrd_id, CopyHdOpConf::Type); + void Init(CopyHdOpConf::Type, int64_t machine_id, int64_t dev_phy_id); CopyHdOpConf::Type copy_type() const { return copy_type_; } + int64_t MemZoneId121() const override { + if (copy_type_ == CopyHdOpConf::H2D) { + return TaskNode::MemZoneId121(); + } else if (copy_type_ == CopyHdOpConf::D2H) { + return Global<IDMgr>::Get()->CpuMemZoneId(); + } else { + UNIMPLEMENTED(); + } + } private: - int64_t AllocateLocalWorkStreamId() override; void InitProducedRegstMemCase(MemoryCase*) override; OperatorConf NewCopyOpConf() override; diff --git a/oneflow/core/graph/logical_node.cpp b/oneflow/core/graph/logical_node.cpp index 747ea7d51284f62f6e98da5ccc4e384a859507b1..7db45f92c2e54625c7206241f07a4fdff0a0e8e2 100644 --- a/oneflow/core/graph/logical_node.cpp +++ b/oneflow/core/graph/logical_node.cpp @@ -192,7 +192,11 @@ void LogicalNode::GenSortedCompTaskNodes(std::function<int64_t(const TaskNode*)> CompTaskNode* comp_task_node = NewCompTaskNode(); comp_task_node->set_machine_id(machine_id); if (parallel_desc_->device_type() == DeviceType::kGPU) { - comp_task_node->set_thrd_id(Global<IDMgr>::Get()->GetGpuDeviceThrdId(dev_phy_id)); + if (comp_task_node->NeedIndependentWorkStream()) { + comp_task_node->set_thrd_id(Global<IDMgr>::Get()->GetGpuIndependentThrdId(dev_phy_id)); + } else { + comp_task_node->set_thrd_id(Global<IDMgr>::Get()->GetGpuComputeThrdId(dev_phy_id)); + } } else if (parallel_desc_->device_type() == DeviceType::kCPU) { comp_task_node->set_thrd_id(AllocateCpuThrdId(comp_task_node)); } else { diff --git a/oneflow/core/graph/reduce_scatter_compute_task_node.cpp b/oneflow/core/graph/reduce_scatter_compute_task_node.cpp index fbcd2ca3a026aedc1dead32d3ded5963e657a071..70782f1cc45a81900c51ffcc4ad5f2f518426c4c 100644 --- a/oneflow/core/graph/reduce_scatter_compute_task_node.cpp +++ b/oneflow/core/graph/reduce_scatter_compute_task_node.cpp @@ -18,7 +18,7 @@ void ReduceScatterCompTaskNode::ProduceAllRegstsAndBindEdges() { MemoryCase* mem_case = out_regst.get()->mut_mem_case(); mem_case->Clear(); mem_case->mutable_device_cuda_mem()->set_device_id( - Global<IDMgr>::Get()->GetGpuDevPhyIdFromThrdId(thrd_id())); + Global<IDMgr>::Get()->GetGpuPhyIdFromThrdId(thrd_id())); } } } diff --git a/oneflow/core/graph/task_graph.cpp b/oneflow/core/graph/task_graph.cpp index e332704bba325e391a30f09600b377c29e613200..12138e22da663c70350f42e82e8320f14463c542 100644 --- a/oneflow/core/graph/task_graph.cpp +++ b/oneflow/core/graph/task_graph.cpp @@ -86,7 +86,7 @@ DEFINE_BLD_SUB_TASK_GRAPH_METHOD(BldSubTskGphByOneToOne) { CompTaskNode* src = sorted_src_comp_tasks[i]; CompTaskNode* dst = sorted_dst_comp_tasks[i]; Connect<TaskNode>( - Build121BufTo(src, dst->machine_id(), dst->MemZoneId(), + Build121BufTo(src, dst->machine_id(), dst->MemZoneId121(), [&](int64_t machine_id, int32_t mem_zone_id) { return *Mut121BufTask(src, machine_id, mem_zone_id); }, @@ -179,7 +179,7 @@ TaskNode* TaskGraph::Build121BufTo( return Build121BufTo(dst_cpu, dst_machine_id, dst_mem_zone_id, Get121BufTask, Set121BufTask); } } else { - if (src->MemZoneId() == dst_mem_zone_id) { + if (src->MemZoneId121() == dst_mem_zone_id) { return Set121BufTask(dst_machine_id, dst_mem_zone_id, src); } else { if (dst_mem_zone_id == cpu_mem_zone_id) { @@ -188,9 +188,8 @@ TaskNode* TaskGraph::Build121BufTo( TaskNode* src_cpu = Build121BufTo(src, dst_machine_id, cpu_mem_zone_id, Get121BufTask, Set121BufTask); CopyHdTaskNode* src_h2d = NewNode<CopyHdTaskNode>(); - src_h2d->Init(dst_machine_id, - Global<IDMgr>::Get()->GetThrdIdFromGpuMemZoneId(dst_mem_zone_id), - CopyHdOpConf::H2D); + src_h2d->Init(CopyHdOpConf::H2D, dst_machine_id, + Global<IDMgr>::Get()->GetGpuPhyIdFromMemZoneId(dst_mem_zone_id)); Connect<TaskNode>(src_cpu, NewEdge(), src_h2d); return Set121BufTask(dst_machine_id, dst_mem_zone_id, src_h2d); } @@ -201,7 +200,7 @@ TaskNode* TaskGraph::Build121BufTo( TaskNode* TaskGraph::AddCopyH2DTaskIfNotCpu(TaskNode* task) { if (task->device_type() == DeviceType::kCPU) { return task; } CopyHdTaskNode* copy_task = NewNode<CopyHdTaskNode>(); - copy_task->Init(task->machine_id(), task->thrd_id(), CopyHdOpConf::H2D); + copy_task->Init(CopyHdOpConf::H2D, task->machine_id(), task->GpuPhyId()); Connect<TaskNode>(copy_task, NewEdge(), task); return copy_task; } @@ -209,7 +208,7 @@ TaskNode* TaskGraph::AddCopyH2DTaskIfNotCpu(TaskNode* task) { TaskNode* TaskGraph::AddCopyD2HTaskIfNotCpu(TaskNode* task) { if (task->device_type() == DeviceType::kCPU) { return task; } CopyHdTaskNode* copy_task = NewNode<CopyHdTaskNode>(); - copy_task->Init(task->machine_id(), task->thrd_id(), CopyHdOpConf::D2H); + copy_task->Init(CopyHdOpConf::D2H, task->machine_id(), task->GpuPhyId()); Connect<TaskNode>(task, NewEdge(), copy_task); return copy_task; } diff --git a/oneflow/core/graph/task_node.cpp b/oneflow/core/graph/task_node.cpp index 34bc9eacc1adc2d06c0a938fdc91f0437f77ae04..d9152ebc33d08094cf3e368023c0ca8714e79d87 100644 --- a/oneflow/core/graph/task_node.cpp +++ b/oneflow/core/graph/task_node.cpp @@ -110,6 +110,15 @@ void TaskNode::ToProto(TaskProto* task_proto) { } } +int64_t TaskNode::MemZoneId121() const { + const IDMgr* id_mgr = Global<IDMgr>::Get(); + if (device_type() == DeviceType::kCPU) { + return id_mgr->CpuMemZoneId(); + } else { + return id_mgr->GpuMemZoneId(id_mgr->GetGpuPhyIdFromThrdId(thrd_id_)); + } +} + void TaskNode::BindEdgeWithProducedRegst(TaskEdge* edge, const std::string& name) { edge->AddRegst(name, GetProducedRegst(name)); } @@ -138,7 +147,7 @@ void TaskNode::InitProducedRegstMemCase(MemoryCase* mem_case) { mem_case->mutable_host_mem(); } else if (device_type() == DeviceType::kGPU) { mem_case->mutable_device_cuda_mem()->set_device_id( - Global<IDMgr>::Get()->GetGpuDevPhyIdFromThrdId(thrd_id_)); + Global<IDMgr>::Get()->GetGpuPhyIdFromThrdId(thrd_id_)); } else { UNIMPLEMENTED(); } diff --git a/oneflow/core/graph/task_node.h b/oneflow/core/graph/task_node.h index 809c302e8393c07b6ef9aa9807048dc2076794fe..1df33d28915d439b7624b98c009c7dc5015d189f 100644 --- a/oneflow/core/graph/task_node.h +++ b/oneflow/core/graph/task_node.h @@ -30,8 +30,8 @@ class TaskNode : public Node<TaskNode, TaskEdge> { std::shared_ptr<RegstDesc> GetSoleConsumedRegst(const std::string& name); DeviceType device_type() const; virtual const ParallelContext* parallel_ctx() const { return nullptr; } - int64_t MemZoneId() const { return Global<IDMgr>::Get()->GetMemZoneIdFromThrdId(thrd_id_); } int64_t LocalWorkStreamId() const; + int64_t GpuPhyId() const { return Global<IDMgr>::Get()->GetGpuPhyIdFromThrdId(thrd_id_); } // Setters void set_machine_id(int64_t val); @@ -52,6 +52,8 @@ class TaskNode : public Node<TaskNode, TaskEdge> { virtual void ToProto(TaskProto*); virtual bool IsPersistence() const { return false; } void BindEdgeWithProducedRegst(TaskEdge*, const std::string& name); + virtual bool NeedIndependentWorkStream() { return false; } + virtual int64_t MemZoneId121() const; // TODO: there is bug for reduce task node protected: std::shared_ptr<RegstDesc> ProduceRegst(const std::string& name); @@ -72,8 +74,7 @@ class TaskNode : public Node<TaskNode, TaskEdge> { virtual void LockRegsts(); virtual void FixRegisterNumRange(); - virtual int64_t AllocateLocalWorkStreamId(); - virtual bool NeedIndependentWorkStream() { return false; } + int64_t AllocateLocalWorkStreamId(); private: void ClearOutOfDateConsumedRegst(); diff --git a/oneflow/core/job/compiler.cpp b/oneflow/core/job/compiler.cpp index 0baaac79c465ed5899040b8edb945831929112da..5b4ec35a29b52af14c06e489c74e8cb75e02ee61 100644 --- a/oneflow/core/job/compiler.cpp +++ b/oneflow/core/job/compiler.cpp @@ -66,7 +66,7 @@ Plan Compiler::DoCompile() { plan.set_total_mbn_num(total_mbn_num); FOR_RANGE(int64_t, machine_id, 0, job_desc->TotalMachineNum()) { plan.mutable_buf_info()->Add()->mutable_buf_size()->Resize( - job_desc->GpuDeviceNum() + job_desc->CpuDeviceNum(), 0); + job_desc->GpuDeviceNum() * 4 + job_desc->CpuDeviceNum(), 0); } task_gph->ForEachNode([&](TaskNode* task_node) { if (task_node->IsMeaningLess()) { return; } diff --git a/oneflow/core/job/id_manager.cpp b/oneflow/core/job/id_manager.cpp index d158a62bb20d968e8531eddb24112202bed60b7c..11a1fb45b6b5d841e21f4632f9a7c0a577aac9af 100644 --- a/oneflow/core/job/id_manager.cpp +++ b/oneflow/core/job/id_manager.cpp @@ -7,6 +7,26 @@ int64_t IDMgr::MachineID4MachineName(const std::string& machine_name) const { CHECK(it != machine_name2machine_id_.end()) << "Undefined machine name: " << machine_name; return it->second; } +const std::string& IDMgr::MachineName4MachineId(int64_t machine_id) const { + return machine_id2machine_name_.at(machine_id); +} + +int64_t IDMgr::GetGpuH2DThrdId(int64_t dev_phy_id) const { return gpu_device_num_ + dev_phy_id; } +int64_t IDMgr::GetGpuD2HThrdId(int64_t dev_phy_id) const { + return gpu_device_num_ * 2 + dev_phy_id; +} +int64_t IDMgr::GetGpuIndependentThrdId(int64_t dev_phy_id) const { + return gpu_device_num_ * 3 + dev_phy_id; +} +int64_t IDMgr::GetCpuDeviceThrdId(int64_t dev_phy_id) const { + return gpu_device_num_ * 4 + dev_phy_id; +} +int64_t IDMgr::GetPersistenceThrdId(int64_t offset) const { + return gpu_device_num_ * 4 + cpu_device_num_ + offset; +} +int64_t IDMgr::CommNetThrdId() const { + return gpu_device_num_ * 4 + cpu_device_num_ + Global<JobDesc>::Get()->PersistenceWorkerNum(); +} int64_t IDMgr::NewTaskId(int64_t machine_id, int64_t thrd_id, int64_t local_work_stream_id) { int64_t machine_thrd_id = GetMachineThrdId(machine_id, thrd_id); @@ -18,20 +38,16 @@ int64_t IDMgr::NewTaskId(int64_t machine_id, int64_t thrd_id, int64_t local_work } DeviceType IDMgr::GetDeviceTypeFromThrdId(int64_t thrd_id) const { - if (thrd_id < gpu_device_num_) { + if (thrd_id < 4 * gpu_device_num_) { return DeviceType::kGPU; } else { return DeviceType::kCPU; } } -int64_t IDMgr::GetGpuDevPhyIdFromThrdId(int64_t thrd_id) const { - CHECK_LT(thrd_id, gpu_device_num_); - return thrd_id; -} - -int64_t IDMgr::GetMemZoneIdFromThrdId(int64_t thrd_id) const { - return std::min(thrd_id, gpu_device_num_); +int64_t IDMgr::GetGpuPhyIdFromThrdId(int64_t thrd_id) const { + CHECK_LT(thrd_id, 4 * gpu_device_num_); + return thrd_id % gpu_device_num_; } DeviceType IDMgr::GetDeviceTypeFromActorId(int64_t actor_id) const { diff --git a/oneflow/core/job/id_manager.h b/oneflow/core/job/id_manager.h index caac182c82c0afae6bcbbc20b03a934495208d7a..cc1d338ab9e03bc9dc2f8dc3ec8b0e72e9cb4aa8 100644 --- a/oneflow/core/job/id_manager.h +++ b/oneflow/core/job/id_manager.h @@ -14,35 +14,31 @@ class IDMgr final { // machine_name <-> machine_id int64_t MachineID4MachineName(const std::string& machine_name) const; - const std::string& MachineName4MachineId(int64_t machine_id) const { - return machine_id2machine_name_.at(machine_id); - } + const std::string& MachineName4MachineId(int64_t machine_id) const; // Get ThrdId, TaskId, RegstDescId - int64_t GetGpuDeviceThrdId(int64_t dev_phy_id) const { return dev_phy_id; } - int64_t GetCpuDeviceThrdId(int64_t dev_phy_id) const { return gpu_device_num_ + dev_phy_id; } - int64_t GetPersistenceThrdId(int64_t offset) const { - return gpu_device_num_ + cpu_device_num_ + offset; - } - int64_t CommNetThrdId() const { - return gpu_device_num_ + cpu_device_num_ + Global<JobDesc>::Get()->PersistenceWorkerNum(); - } + int64_t GetGpuComputeThrdId(int64_t dev_phy_id) const { return dev_phy_id; } + int64_t GetGpuH2DThrdId(int64_t dev_phy_id) const; + int64_t GetGpuD2HThrdId(int64_t dev_phy_id) const; + int64_t GetGpuIndependentThrdId(int64_t dev_phy_id) const; + int64_t GetCpuDeviceThrdId(int64_t dev_phy_id) const; + int64_t GetPersistenceThrdId(int64_t offset) const; + int64_t CommNetThrdId() const; + int64_t NewTaskId(int64_t machine_id, int64_t thrd_id, int64_t local_work_stream_id); int64_t NewRegstDescId() { return regst_desc_id_count_++; } - // Get MemZoneId + // MemZoneId int64_t CpuMemZoneId() const { return Global<JobDesc>::Get()->GpuDeviceNum(); } - int64_t GpuMemZoneId(int64_t dev_phy_id) { return dev_phy_id; } + int64_t GpuMemZoneId(int64_t dev_phy_id) const { return dev_phy_id; } + int64_t GetGpuPhyIdFromMemZoneId(int64_t mem_zone_id) const { + CHECK_LT(mem_zone_id, gpu_device_num_); + return mem_zone_id; + } // GetFromThrdId DeviceType GetDeviceTypeFromThrdId(int64_t thrd_id) const; - int64_t GetGpuDevPhyIdFromThrdId(int64_t thrd_id) const; - int64_t GetMemZoneIdFromThrdId(int64_t thrd_id) const; - - int64_t GetThrdIdFromGpuMemZoneId(int64_t mem_zone_id) { - CHECK_NE(mem_zone_id, CpuMemZoneId()); - return mem_zone_id; - } + int64_t GetGpuPhyIdFromThrdId(int64_t thrd_id) const; // Runtime DeviceType GetDeviceTypeFromActorId(int64_t actor_id) const; @@ -53,9 +49,7 @@ class IDMgr final { // for cpu: // 0: the actor thread // for gpu: - // 0: the compute cuda stream - // 1: the copy h2d cuda stream - // 2: the copy d2h cuda stream + // 0: the global cuda stream // other: start from 100 int64_t AllocateLocalWorkStreamId(int64_t machine_id, int64_t thrd_id); int64_t LocalWorkStreamId4TaskId(int64_t task_id) const; diff --git a/oneflow/core/thread/gpu_thread.cpp b/oneflow/core/thread/gpu_thread.cpp index e49381a45046a614acae4a4231a38dd41c439d66..de9fd1fce47ac7988196d9a742459dbce5d27df8 100644 --- a/oneflow/core/thread/gpu_thread.cpp +++ b/oneflow/core/thread/gpu_thread.cpp @@ -15,9 +15,7 @@ GpuThread::GpuThread(int64_t thrd_id, int64_t dev_id, size_t buf_size) { ThreadCtx ctx; ctx.buf_ptr = buf_ptr; ctx.buf_size = buf_size; - ctx.compute_cuda_stream.reset(new CudaStreamHandle); - ctx.copy_h2d_cuda_stream.reset(new CudaStreamHandle); - ctx.copy_d2h_cuda_stream.reset(new CudaStreamHandle); + ctx.g_cuda_stream.reset(new CudaStreamHandle); PollMsgChannel(ctx); } if (buf_ptr) { CudaCheck(cudaFree(buf_ptr)); } diff --git a/oneflow/core/thread/thread_context.h b/oneflow/core/thread/thread_context.h index a4bbadbe482f8d6efb9080cde0d48bce2e1c49df..d9bd82c12c9e042d6c79f13bc1925718a3d0bf8c 100644 --- a/oneflow/core/thread/thread_context.h +++ b/oneflow/core/thread/thread_context.h @@ -9,9 +9,7 @@ struct ThreadCtx { void* buf_ptr; size_t buf_size; #ifdef WITH_CUDA - std::unique_ptr<CudaStreamHandle> compute_cuda_stream; - std::unique_ptr<CudaStreamHandle> copy_h2d_cuda_stream; - std::unique_ptr<CudaStreamHandle> copy_d2h_cuda_stream; + std::unique_ptr<CudaStreamHandle> g_cuda_stream; #endif }; diff --git a/oneflow/core/thread/thread_manager.cpp b/oneflow/core/thread/thread_manager.cpp index 1c093f65d921186bcb048ad4d8cfbcb8f262056c..3cb5e5421491ebf8a44a4c3b9edcb862e1d2827c 100644 --- a/oneflow/core/thread/thread_manager.cpp +++ b/oneflow/core/thread/thread_manager.cpp @@ -23,9 +23,11 @@ ThreadMgr::ThreadMgr(const Plan& plan) { const OneMachineBufInfo& info = plan.buf_info().Get(Global<MachineCtx>::Get()->this_machine_id()); #ifdef WITH_CUDA - FOR_RANGE(int64_t, i, 0, job_desc->GpuDeviceNum()) { - threads_.push_back(new GpuThread(thrd_id, i, info.buf_size(thrd_id))); - thrd_id += 1; + FOR_RANGE(int64_t, i, 0, 4) { + FOR_RANGE(int64_t, dev_phy_id, 0, job_desc->GpuDeviceNum()) { + threads_.push_back(new GpuThread(thrd_id, dev_phy_id, info.buf_size(thrd_id))); + thrd_id += 1; + } } #endif FOR_RANGE(int64_t, i, 0, job_desc->CpuDeviceNum()) { diff --git a/oneflow/of_submit b/oneflow/of_submit index b8a39b4c460c8f9ccaf3a865def14e69cab1d0ab..8e83c2d326188373024da7606a4c97dc0c7e8cc1 100755 --- a/oneflow/of_submit +++ b/oneflow/of_submit @@ -2,7 +2,6 @@ import os import sys -import paramiko import getpass import google.protobuf.text_format as pbtxt from tempfile import NamedTemporaryFile