Skip to content
Snippets Groups Projects
Unverified commit a2bfb155, authored by cheng cheng, committed by GitHub
Browse files

GlobalMultiClientEnv and refine EagerExecution (#5523)


* core: GlobalMultiClientEnv; refine EagerExecution

* move IsMultiClient to EnvDesc

Co-authored-by: oneflow-ci-bot <69100618+oneflow-ci-bot@users.noreply.github.com>
parent 1eae63b9
No related branches found
No related tags found
No related merge requests found
......@@ -24,6 +24,7 @@ limitations under the License.
#include "oneflow/core/vm/symbol_storage.h"
#include "oneflow/core/vm/string_symbol.h"
#include "oneflow/core/eager/eager_symbol.cfg.h"
#include "oneflow/core/job/env_desc.h"
#include "oneflow/core/job/job_desc.h"
#include "oneflow/core/job/scope.h"
#include "oneflow/core/job/cluster_instruction.h"
......@@ -34,7 +35,6 @@ limitations under the License.
#include "oneflow/core/operator/op_conf_symbol.h"
#include "oneflow/core/common/protobuf.h"
#include "oneflow/core/common/util.h"
#include "oneflow/api/python/env/env.h"
namespace oneflow {
namespace vm {
......@@ -94,7 +94,7 @@ Maybe<void> EagerOneflow::RunPhysicalInstruction(vm::InstructionMsgList* instruc
Maybe<void> EagerOneflow::RunLogicalInstruction(vm::InstructionMsgList* instruction_list,
const vm::cfg::EagerSymbolList& eager_symbol_list) {
if (JUST(IsMultiClient())) {
if (JUST(GlobalMultiClientEnv())) {
// NOTE(chengcheng): in Multi-Client LogicalRun will degenerate directly to PhysicalRun,
// because each rank will process instructions ONLY from itself, NOT the master.
return RunPhysicalInstruction(instruction_list, eager_symbol_list);
......
......@@ -39,7 +39,7 @@ limitations under the License.
#include "oneflow/core/framework/tensor.h"
#include "oneflow/core/framework/device.h"
#include "oneflow/core/framework/instruction_replay.h"
#include "oneflow/api/python/env/env.h"
#include "oneflow/core/job/env_desc.h"
namespace oneflow {
......@@ -1578,7 +1578,7 @@ InstructionsBuilder::GetMut2OperandBlobObjects(
}
Maybe<void> LogicalRun(const std::function<Maybe<void>(InstructionsBuilder*)>& Build) {
if (JUST(IsMultiClient())) {
if (JUST(GlobalMultiClientEnv())) {
// NOTE(chengcheng): in Multi-Client LogicalRun will degenerate directly to PhysicalRun,
// because each rank will process instructions ONLY from itself, NOT the master.
return PhysicalRun(Build);
......
......@@ -23,7 +23,6 @@ limitations under the License.
#include "oneflow/core/job/job_build_and_infer_ctx_mgr.h"
#include "oneflow/core/common/buffer_manager.h"
#include "oneflow/core/rpc/include/global_process_ctx.h"
#include "oneflow/api/python/env/env.h"
#ifdef WITH_CUDA
#include <cuda.h>
#endif // WITH_CUDA
......@@ -64,7 +63,7 @@ MultiClientSessionContext::~MultiClientSessionContext() {
Maybe<void> MultiClientSessionContext::TryInit(const ConfigProto& config_proto) {
if (!is_inited_) {
CHECK_OR_RETURN(JUST(IsMultiClient()));
CHECK_OR_RETURN(JUST(GlobalMultiClientEnv()));
DumpVersionInfo();
Resource resource = config_proto.resource();
......
......@@ -14,6 +14,7 @@ See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/job/env_desc.h"
#include "oneflow/core/job/global_for.h"
namespace oneflow {
......@@ -49,4 +50,10 @@ int64_t EnvDesc::GetMachineId(const std::string& addr) const {
return machine_id;
}
// Looks up the value registered under Global<Maybe<bool>, MultiClient> and
// hands it back to the caller unchanged.
// If nothing has been registered (Get() yields a null pointer), the
// CHECK_NOTNULL_OR_RETURN macro takes over instead of dereferencing it
// (presumably by returning an error Maybe -- confirm against the macro).
Maybe<bool> GlobalMultiClientEnv() {
  auto* registered_flag = Global<Maybe<bool>, MultiClient>::Get();
  CHECK_NOTNULL_OR_RETURN(registered_flag);
  return *registered_flag;
}
} // namespace oneflow
......@@ -44,6 +44,8 @@ class EnvDesc final {
EnvProto env_proto_;
};
Maybe<bool> GlobalMultiClientEnv();
} // namespace oneflow
#endif // ONEFLOW_CORE_JOB_CLUSTER_DESC_H_
......@@ -17,6 +17,7 @@ limitations under the License.
#include "oneflow/core/job/global_for.h"
#include "oneflow/core/job/lazy_mode.h"
#include "oneflow/core/job/env_desc.h"
#include "oneflow/core/common/util.h"
#include <json.hpp>
......@@ -105,10 +106,15 @@ Maybe<void> EagerJobBuildAndInferCtxMgr::VirtualCloseJob() {
// Returns the bool currently stored under Global<bool, EagerExecution>.
// NOTE(review): the pointer from Get() is dereferenced without a null check,
// so this assumes the flag has been registered before the first call.
bool EagerExecutionEnabled() {
  const bool eager_enabled = *Global<bool, EagerExecution>::Get();
  return eager_enabled;
}
Maybe<JobBuildAndInferCtxMgr*> GlobalJobBuildAndInferCtxMgr() {
if (EagerExecutionEnabled() && !LazyMode::is_enabled()) {
return JUST(GlobalMaybe<EagerJobBuildAndInferCtxMgr>());
} else {
if (JUST(GlobalMultiClientEnv())) {
return JUST(GlobalMaybe<LazyJobBuildAndInferCtxMgr>());
} else {
// single-client
if (EagerExecutionEnabled()) {
return JUST(GlobalMaybe<EagerJobBuildAndInferCtxMgr>());
} else {
return JUST(GlobalMaybe<LazyJobBuildAndInferCtxMgr>());
}
}
}
......
......@@ -16,13 +16,13 @@ limitations under the License.
#include "oneflow/core/vm/id_generator.h"
#include "oneflow/core/vm/id_util.h"
#include "oneflow/core/control/global_process_ctx.h"
#include "oneflow/api/python/env/env.h"
#include "oneflow/core/job/env_desc.h"
namespace oneflow {
namespace vm {
Maybe<int64_t> LogicalIdGenerator::NewSymbolId() {
if (JUST(IsMultiClient())) {
if (JUST(GlobalMultiClientEnv())) {
// NOTE(chengcheng): in Multi-Client LogicalIdGenerator will degenerate directly to
// PhysicalIdGenerator, because each rank will generate id ONLY from itself, NOT the master.
return IdUtil::NewPhysicalSymbolId(GlobalProcessCtx::Rank());
......@@ -32,7 +32,7 @@ Maybe<int64_t> LogicalIdGenerator::NewSymbolId() {
}
Maybe<int64_t> LogicalIdGenerator::NewObjectId() {
if (JUST(IsMultiClient())) {
if (JUST(GlobalMultiClientEnv())) {
// NOTE(chengcheng): in Multi-Client LogicalIdGenerator will degenerate directly to
// PhysicalIdGenerator, because each rank will generate id ONLY from itself, NOT the master.
return IdUtil::NewPhysicalObjectId(GlobalProcessCtx::Rank());
......
......@@ -59,10 +59,6 @@ class CustomModule(flow.nn.Module):
@flow.unittest.skip_unless_1n1d()
@unittest.skipIf(
not flow.unittest.env.eager_execution_enabled(),
".numpy() doesn't work in lazy mode",
)
class TestGraph(flow.unittest.TestCase):
def test_add_nested_module(test_case):
x = flow.Tensor(1, 1, 10, 10)
......
......@@ -33,10 +33,6 @@ import oneflow.python.framework.c_api_util as c_api_util
@flow.unittest.skip_unless_1n1d()
@unittest.skipIf(
not flow.unittest.env.eager_execution_enabled(),
"default use eager mode to test this case",
)
class TestFeedInputTensor(unittest.TestCase):
def test_feed_input_tensor(test_case):
test_case.assertTrue(oneflow.distributed.is_multi_client())
......
......@@ -33,10 +33,6 @@ import oneflow.python.framework.c_api_util as c_api_util
@flow.unittest.skip_unless_1n1d()
@unittest.skipIf(
not flow.unittest.env.eager_execution_enabled(),
"default use eager mode to test this case",
)
class TestFeedVariableTensor(unittest.TestCase):
def test_feed_var_tensor(test_case):
test_case.assertTrue(oneflow.distributed.is_multi_client())
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment