From a2bfb1557b50e442b707a0156a6f651327d0c41a Mon Sep 17 00:00:00 2001
From: cheng cheng <472491134@qq.com>
Date: Sat, 17 Jul 2021 01:10:40 +0800
Subject: [PATCH] GlobalMultiClientEnv and refine EagerExecution (#5523)

* core: GlobalMultiClientEnv; refine EagerExecution

* move IsMultiClient to EnvDesc

Co-authored-by: oneflow-ci-bot <69100618+oneflow-ci-bot@users.noreply.github.com>
---
 oneflow/core/eager/eager_oneflow.cpp                 |  4 ++--
 oneflow/core/framework/instructions_builder.cpp      |  4 ++--
 .../core/framework/multi_client_session_context.cpp  |  3 +--
 oneflow/core/job/env_desc.cpp                        |  7 +++++++
 oneflow/core/job/env_desc.h                          |  2 ++
 oneflow/core/job/job_build_and_infer_ctx_mgr.cpp     | 12 +++++++++---
 oneflow/core/vm/id_generator.cpp                     |  6 +++---
 oneflow/python/test/graph/test_graph.py              |  4 ----
 oneflow/python/test/graph/test_input_op_expr.py      |  4 ----
 oneflow/python/test/graph/test_variable_op_expr.py   |  4 ----
 10 files changed, 26 insertions(+), 24 deletions(-)

diff --git a/oneflow/core/eager/eager_oneflow.cpp b/oneflow/core/eager/eager_oneflow.cpp
index 491fd2f1a..975153da1 100644
--- a/oneflow/core/eager/eager_oneflow.cpp
+++ b/oneflow/core/eager/eager_oneflow.cpp
@@ -24,6 +24,7 @@ limitations under the License.
 #include "oneflow/core/vm/symbol_storage.h"
 #include "oneflow/core/vm/string_symbol.h"
 #include "oneflow/core/eager/eager_symbol.cfg.h"
+#include "oneflow/core/job/env_desc.h"
 #include "oneflow/core/job/job_desc.h"
 #include "oneflow/core/job/scope.h"
 #include "oneflow/core/job/cluster_instruction.h"
@@ -34,7 +35,6 @@ limitations under the License.
 #include "oneflow/core/operator/op_conf_symbol.h"
 #include "oneflow/core/common/protobuf.h"
 #include "oneflow/core/common/util.h"
-#include "oneflow/api/python/env/env.h"
 
 namespace oneflow {
 namespace vm {
@@ -94,7 +94,7 @@ Maybe<void> EagerOneflow::RunPhysicalInstruction(vm::InstructionMsgList* instruc
 
 Maybe<void> EagerOneflow::RunLogicalInstruction(vm::InstructionMsgList* instruction_list,
                                                 const vm::cfg::EagerSymbolList& eager_symbol_list) {
-  if (JUST(IsMultiClient())) {
+  if (JUST(GlobalMultiClientEnv())) {
     // NOTE(chengcheng): in Multi-Client LogicalRun will degenerate directly to PhysicalRun,
     //   because each rank will process instructions ONLY from itself, NOT the master.
     return RunPhysicalInstruction(instruction_list, eager_symbol_list);
diff --git a/oneflow/core/framework/instructions_builder.cpp b/oneflow/core/framework/instructions_builder.cpp
index 81a0ebd5d..2901ec2ef 100644
--- a/oneflow/core/framework/instructions_builder.cpp
+++ b/oneflow/core/framework/instructions_builder.cpp
@@ -39,7 +39,7 @@ limitations under the License.
 #include "oneflow/core/framework/tensor.h"
 #include "oneflow/core/framework/device.h"
 #include "oneflow/core/framework/instruction_replay.h"
-#include "oneflow/api/python/env/env.h"
+#include "oneflow/core/job/env_desc.h"
 
 namespace oneflow {
 
@@ -1578,7 +1578,7 @@ InstructionsBuilder::GetMut2OperandBlobObjects(
 }
 
 Maybe<void> LogicalRun(const std::function<Maybe<void>(InstructionsBuilder*)>& Build) {
-  if (JUST(IsMultiClient())) {
+  if (JUST(GlobalMultiClientEnv())) {
     // NOTE(chengcheng): in Multi-Client LogicalRun will degenerate directly to PhysicalRun,
     //   because each rank will process instructions ONLY from itself, NOT the master.
     return PhysicalRun(Build);
diff --git a/oneflow/core/framework/multi_client_session_context.cpp b/oneflow/core/framework/multi_client_session_context.cpp
index 8ebca1ebf..422735276 100644
--- a/oneflow/core/framework/multi_client_session_context.cpp
+++ b/oneflow/core/framework/multi_client_session_context.cpp
@@ -23,7 +23,6 @@ limitations under the License.
 #include "oneflow/core/job/job_build_and_infer_ctx_mgr.h"
 #include "oneflow/core/common/buffer_manager.h"
 #include "oneflow/core/rpc/include/global_process_ctx.h"
-#include "oneflow/api/python/env/env.h"
 #ifdef WITH_CUDA
 #include <cuda.h>
 #endif  // WITH_CUDA
@@ -64,7 +63,7 @@ MultiClientSessionContext::~MultiClientSessionContext() {
 
 Maybe<void> MultiClientSessionContext::TryInit(const ConfigProto& config_proto) {
   if (!is_inited_) {
-    CHECK_OR_RETURN(JUST(IsMultiClient()));
+    CHECK_OR_RETURN(JUST(GlobalMultiClientEnv()));
     DumpVersionInfo();
 
     Resource resource = config_proto.resource();
diff --git a/oneflow/core/job/env_desc.cpp b/oneflow/core/job/env_desc.cpp
index ecafa5c72..064e4e13c 100644
--- a/oneflow/core/job/env_desc.cpp
+++ b/oneflow/core/job/env_desc.cpp
@@ -14,6 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 #include "oneflow/core/job/env_desc.h"
+#include "oneflow/core/job/global_for.h"
 
 namespace oneflow {
 
@@ -49,4 +50,10 @@ int64_t EnvDesc::GetMachineId(const std::string& addr) const {
   return machine_id;
 }
 
+Maybe<bool> GlobalMultiClientEnv() {
+  Maybe<bool>* is_multi_client = Global<Maybe<bool>, MultiClient>::Get();
+  CHECK_NOTNULL_OR_RETURN(is_multi_client);
+  return *is_multi_client;
+}
+
 }  // namespace oneflow
diff --git a/oneflow/core/job/env_desc.h b/oneflow/core/job/env_desc.h
index 4733202aa..9389471c4 100644
--- a/oneflow/core/job/env_desc.h
+++ b/oneflow/core/job/env_desc.h
@@ -44,6 +44,8 @@ class EnvDesc final {
   EnvProto env_proto_;
 };
 
+Maybe<bool> GlobalMultiClientEnv();
+
 }  // namespace oneflow
 
 #endif  // ONEFLOW_CORE_JOB_CLUSTER_DESC_H_
diff --git a/oneflow/core/job/job_build_and_infer_ctx_mgr.cpp b/oneflow/core/job/job_build_and_infer_ctx_mgr.cpp
index 9133c381c..ca6fbce2d 100644
--- a/oneflow/core/job/job_build_and_infer_ctx_mgr.cpp
+++ b/oneflow/core/job/job_build_and_infer_ctx_mgr.cpp
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "oneflow/core/job/global_for.h"
 #include "oneflow/core/job/lazy_mode.h"
+#include "oneflow/core/job/env_desc.h"
 #include "oneflow/core/common/util.h"
 #include <json.hpp>
 
@@ -105,10 +106,15 @@ Maybe<void> EagerJobBuildAndInferCtxMgr::VirtualCloseJob() {
 bool EagerExecutionEnabled() { return *Global<bool, EagerExecution>::Get(); }
 
 Maybe<JobBuildAndInferCtxMgr*> GlobalJobBuildAndInferCtxMgr() {
-  if (EagerExecutionEnabled() && !LazyMode::is_enabled()) {
-    return JUST(GlobalMaybe<EagerJobBuildAndInferCtxMgr>());
-  } else {
+  if (JUST(GlobalMultiClientEnv())) {
     return JUST(GlobalMaybe<LazyJobBuildAndInferCtxMgr>());
+  } else {
+    // single-client
+    if (EagerExecutionEnabled()) {
+      return JUST(GlobalMaybe<EagerJobBuildAndInferCtxMgr>());
+    } else {
+      return JUST(GlobalMaybe<LazyJobBuildAndInferCtxMgr>());
+    }
   }
 }
 
diff --git a/oneflow/core/vm/id_generator.cpp b/oneflow/core/vm/id_generator.cpp
index 731f818b0..ca8a9c1e8 100644
--- a/oneflow/core/vm/id_generator.cpp
+++ b/oneflow/core/vm/id_generator.cpp
@@ -16,13 +16,13 @@ limitations under the License.
 #include "oneflow/core/vm/id_generator.h"
 #include "oneflow/core/vm/id_util.h"
 #include "oneflow/core/control/global_process_ctx.h"
-#include "oneflow/api/python/env/env.h"
+#include "oneflow/core/job/env_desc.h"
 
 namespace oneflow {
 namespace vm {
 
 Maybe<int64_t> LogicalIdGenerator::NewSymbolId() {
-  if (JUST(IsMultiClient())) {
+  if (JUST(GlobalMultiClientEnv())) {
     // NOTE(chengcheng): in Multi-Client LogicalIdGenerator will degenerate directly to
     //   PhysicalIdGenerator, because each rank will generate id ONLY from itself, NOT the master.
     return IdUtil::NewPhysicalSymbolId(GlobalProcessCtx::Rank());
@@ -32,7 +32,7 @@ Maybe<int64_t> LogicalIdGenerator::NewSymbolId() {
 }
 
 Maybe<int64_t> LogicalIdGenerator::NewObjectId() {
-  if (JUST(IsMultiClient())) {
+  if (JUST(GlobalMultiClientEnv())) {
     // NOTE(chengcheng): in Multi-Client LogicalIdGenerator will degenerate directly to
     //   PhysicalIdGenerator, because each rank will generate id ONLY from itself, NOT the master.
     return IdUtil::NewPhysicalObjectId(GlobalProcessCtx::Rank());
diff --git a/oneflow/python/test/graph/test_graph.py b/oneflow/python/test/graph/test_graph.py
index e64633037..816ab199c 100644
--- a/oneflow/python/test/graph/test_graph.py
+++ b/oneflow/python/test/graph/test_graph.py
@@ -59,10 +59,6 @@ class CustomModule(flow.nn.Module):
 
 
 @flow.unittest.skip_unless_1n1d()
-@unittest.skipIf(
-    not flow.unittest.env.eager_execution_enabled(),
-    ".numpy() doesn't work in lazy mode",
-)
 class TestGraph(flow.unittest.TestCase):
     def test_add_nested_module(test_case):
         x = flow.Tensor(1, 1, 10, 10)
diff --git a/oneflow/python/test/graph/test_input_op_expr.py b/oneflow/python/test/graph/test_input_op_expr.py
index 22011b046..befbb0624 100644
--- a/oneflow/python/test/graph/test_input_op_expr.py
+++ b/oneflow/python/test/graph/test_input_op_expr.py
@@ -33,10 +33,6 @@ import oneflow.python.framework.c_api_util as c_api_util
 
 
 @flow.unittest.skip_unless_1n1d()
-@unittest.skipIf(
-    not flow.unittest.env.eager_execution_enabled(),
-    "default use eager mode to test this case",
-)
 class TestFeedInputTensor(unittest.TestCase):
     def test_feed_input_tensor(test_case):
         test_case.assertTrue(oneflow.distributed.is_multi_client())
diff --git a/oneflow/python/test/graph/test_variable_op_expr.py b/oneflow/python/test/graph/test_variable_op_expr.py
index 952666628..8e9fcd9e6 100644
--- a/oneflow/python/test/graph/test_variable_op_expr.py
+++ b/oneflow/python/test/graph/test_variable_op_expr.py
@@ -33,10 +33,6 @@ import oneflow.python.framework.c_api_util as c_api_util
 
 
 @flow.unittest.skip_unless_1n1d()
-@unittest.skipIf(
-    not flow.unittest.env.eager_execution_enabled(),
-    "default use eager mode to test this case",
-)
 class TestFeedVariableTensor(unittest.TestCase):
     def test_feed_var_tensor(test_case):
         test_case.assertTrue(oneflow.distributed.is_multi_client())
-- 
GitLab