GlobalMultiClientEnv and refine EagerExecution (#5523)

* core: GlobalMultiClientEnv; refine EagerExecution * move IsMultiClient to EnvDesc Co-authored-by: oneflow-ci-bot <69100618+oneflow-ci-bot@users.noreply.github.com>

GlobalMultiClientEnv and refine EagerExecution (#5523)
* core: GlobalMultiClientEnv; refine EagerExecution * move IsMultiClient to EnvDesc Co-authored-by: oneflow-ci-bot <69100618+oneflow-ci-bot@users.noreply.github.com>
a2bfb155 · cheng cheng · GitHub · 1eae63b9 · a2bfb155 · a2bfb155
Unverified Commit a2bfb155 authored 3 years ago by cheng cheng Committed by GitHub 3 years ago
--- a/oneflow/core/eager/eager_oneflow.cpp
+++ b/oneflow/core/eager/eager_oneflow.cpp
@@ -24,6 +24,7 @@ limitations under the License.
 #include "oneflow/core/vm/symbol_storage.h"
 #include "oneflow/core/vm/string_symbol.h"
 #include "oneflow/core/eager/eager_symbol.cfg.h"
+#include "oneflow/core/job/env_desc.h"
 #include "oneflow/core/job/job_desc.h"
 #include "oneflow/core/job/scope.h"
 #include "oneflow/core/job/cluster_instruction.h"
@@ -34,7 +35,6 @@ limitations under the License.
 #include "oneflow/core/operator/op_conf_symbol.h"
 #include "oneflow/core/common/protobuf.h"
 #include "oneflow/core/common/util.h"
-#include "oneflow/api/python/env/env.h"

 namespace oneflow {
 namespace vm {
@@ -94,7 +94,7 @@ Maybe<void> EagerOneflow::RunPhysicalInstruction(vm::InstructionMsgList* instruc

 Maybe<void> EagerOneflow::RunLogicalInstruction(vm::InstructionMsgList* instruction_list,
                                                const vm::cfg::EagerSymbolList& eager_symbol_list) {
-  if (JUST(IsMultiClient())) {
+  if (JUST(GlobalMultiClientEnv())) {
    // NOTE(chengcheng): in Multi-Client LogicalRun will degenerate directly to PhysicalRun,
    //   because each rank will process instructions ONLY from itself, NOT the master.
    return RunPhysicalInstruction(instruction_list, eager_symbol_list);

--- a/oneflow/core/framework/instructions_builder.cpp
+++ b/oneflow/core/framework/instructions_builder.cpp
@@ -39,7 +39,7 @@ limitations under the License.
 #include "oneflow/core/framework/tensor.h"
 #include "oneflow/core/framework/device.h"
 #include "oneflow/core/framework/instruction_replay.h"
-#include "oneflow/api/python/env/env.h"
+#include "oneflow/core/job/env_desc.h"

 namespace oneflow {

@@ -1578,7 +1578,7 @@ InstructionsBuilder::GetMut2OperandBlobObjects(
 }

 Maybe<void> LogicalRun(const std::function<Maybe<void>(InstructionsBuilder*)>& Build) {
-  if (JUST(IsMultiClient())) {
+  if (JUST(GlobalMultiClientEnv())) {
    // NOTE(chengcheng): in Multi-Client LogicalRun will degenerate directly to PhysicalRun,
    //   because each rank will process instructions ONLY from itself, NOT the master.
    return PhysicalRun(Build);

--- a/oneflow/core/framework/multi_client_session_context.cpp
+++ b/oneflow/core/framework/multi_client_session_context.cpp
@@ -23,7 +23,6 @@ limitations under the License.
 #include "oneflow/core/job/job_build_and_infer_ctx_mgr.h"
 #include "oneflow/core/common/buffer_manager.h"
 #include "oneflow/core/rpc/include/global_process_ctx.h"
-#include "oneflow/api/python/env/env.h"
 #ifdef WITH_CUDA
 #include <cuda.h>
 #endif  // WITH_CUDA
@@ -64,7 +63,7 @@ MultiClientSessionContext::~MultiClientSessionContext() {

 Maybe<void> MultiClientSessionContext::TryInit(const ConfigProto& config_proto) {
  if (!is_inited_) {
-    CHECK_OR_RETURN(JUST(IsMultiClient()));
+    CHECK_OR_RETURN(JUST(GlobalMultiClientEnv()));
    DumpVersionInfo();

    Resource resource = config_proto.resource();

--- a/oneflow/core/job/env_desc.cpp
+++ b/oneflow/core/job/env_desc.cpp
@@ -14,6 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 #include "oneflow/core/job/env_desc.h"
+#include "oneflow/core/job/global_for.h"

 namespace oneflow {

@@ -49,4 +50,10 @@ int64_t EnvDesc::GetMachineId(const std::string& addr) const {
  return machine_id;
 }

+Maybe<bool> GlobalMultiClientEnv() {
+  Maybe<bool>* is_multi_client = Global<Maybe<bool>, MultiClient>::Get();
+  CHECK_NOTNULL_OR_RETURN(is_multi_client);
+  return *is_multi_client;
+}
+
 }  // namespace oneflow
--- a/oneflow/core/job/env_desc.h
+++ b/oneflow/core/job/env_desc.h
@@ -44,6 +44,8 @@ class EnvDesc final {
  EnvProto env_proto_;
 };

+Maybe<bool> GlobalMultiClientEnv();
+
 }  // namespace oneflow

 #endif  // ONEFLOW_CORE_JOB_CLUSTER_DESC_H_
--- a/oneflow/core/job/job_build_and_infer_ctx_mgr.cpp
+++ b/oneflow/core/job/job_build_and_infer_ctx_mgr.cpp
@@ -17,6 +17,7 @@ limitations under the License.

 #include "oneflow/core/job/global_for.h"
 #include "oneflow/core/job/lazy_mode.h"
+#include "oneflow/core/job/env_desc.h"
 #include "oneflow/core/common/util.h"
 #include <json.hpp>

@@ -105,10 +106,15 @@ Maybe<void> EagerJobBuildAndInferCtxMgr::VirtualCloseJob() {
 bool EagerExecutionEnabled() { return *Global<bool, EagerExecution>::Get(); }

 Maybe<JobBuildAndInferCtxMgr*> GlobalJobBuildAndInferCtxMgr() {
-  if (EagerExecutionEnabled() && !LazyMode::is_enabled()) {
-    return JUST(GlobalMaybe<EagerJobBuildAndInferCtxMgr>());
-  } else {
+  if (JUST(GlobalMultiClientEnv())) {
    return JUST(GlobalMaybe<LazyJobBuildAndInferCtxMgr>());
+  } else {
+    // single-client
+    if (EagerExecutionEnabled()) {
+      return JUST(GlobalMaybe<EagerJobBuildAndInferCtxMgr>());
+    } else {
+      return JUST(GlobalMaybe<LazyJobBuildAndInferCtxMgr>());
+    }
  }
 }


--- a/oneflow/core/vm/id_generator.cpp
+++ b/oneflow/core/vm/id_generator.cpp
@@ -16,13 +16,13 @@ limitations under the License.
 #include "oneflow/core/vm/id_generator.h"
 #include "oneflow/core/vm/id_util.h"
 #include "oneflow/core/control/global_process_ctx.h"
-#include "oneflow/api/python/env/env.h"
+#include "oneflow/core/job/env_desc.h"

 namespace oneflow {
 namespace vm {

 Maybe<int64_t> LogicalIdGenerator::NewSymbolId() {
-  if (JUST(IsMultiClient())) {
+  if (JUST(GlobalMultiClientEnv())) {
    // NOTE(chengcheng): in Multi-Client LogicalIdGenerator will degenerate directly to
    //   PhysicalIdGenerator, because each rank will generate id ONLY from itself, NOT the master.
    return IdUtil::NewPhysicalSymbolId(GlobalProcessCtx::Rank());
@@ -32,7 +32,7 @@ Maybe<int64_t> LogicalIdGenerator::NewSymbolId() {
 }

 Maybe<int64_t> LogicalIdGenerator::NewObjectId() {
-  if (JUST(IsMultiClient())) {
+  if (JUST(GlobalMultiClientEnv())) {
    // NOTE(chengcheng): in Multi-Client LogicalIdGenerator will degenerate directly to
    //   PhysicalIdGenerator, because each rank will generate id ONLY from itself, NOT the master.
    return IdUtil::NewPhysicalObjectId(GlobalProcessCtx::Rank());

--- a/oneflow/python/test/graph/test_graph.py
+++ b/oneflow/python/test/graph/test_graph.py
@@ -59,10 +59,6 @@ class CustomModule(flow.nn.Module):


 @flow.unittest.skip_unless_1n1d()
-@unittest.skipIf(
-    not flow.unittest.env.eager_execution_enabled(),
-    ".numpy() doesn't work in lazy mode",
-)
 class TestGraph(flow.unittest.TestCase):
    def test_add_nested_module(test_case):
        x = flow.Tensor(1, 1, 10, 10)

--- a/oneflow/python/test/graph/test_input_op_expr.py
+++ b/oneflow/python/test/graph/test_input_op_expr.py
@@ -33,10 +33,6 @@ import oneflow.python.framework.c_api_util as c_api_util


 @flow.unittest.skip_unless_1n1d()
-@unittest.skipIf(
-    not flow.unittest.env.eager_execution_enabled(),
-    "default use eager mode to test this case",
-)
 class TestFeedInputTensor(unittest.TestCase):
    def test_feed_input_tensor(test_case):
        test_case.assertTrue(oneflow.distributed.is_multi_client())

--- a/oneflow/python/test/graph/test_variable_op_expr.py
+++ b/oneflow/python/test/graph/test_variable_op_expr.py
@@ -33,10 +33,6 @@ import oneflow.python.framework.c_api_util as c_api_util


 @flow.unittest.skip_unless_1n1d()
-@unittest.skipIf(
-    not flow.unittest.env.eager_execution_enabled(),
-    "default use eager mode to test this case",
-)
 class TestFeedVariableTensor(unittest.TestCase):
    def test_feed_var_tensor(test_case):
        test_case.assertTrue(oneflow.distributed.is_multi_client())