diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3f51e67f580ed201de6593ecea7199f52e6b9d32..c1933f0d070caa69392652df250ac873460fde3d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -120,7 +120,7 @@ if(WIN32)
   #set(CMAKE_EXE_LINKER_FLAGS_DEBUG "${CMAKE_EXE_LINKER_FLAGS} /DEBUG:FASTLINK")
   set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /D_ITERATOR_DEBUG_LEVEL=0")
 else()
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Wno-sign-compare -Wno-unused-function -fPIC -Werror=return-type")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Wno-sign-compare -Wno-unused-function -fPIC")
 endif()
 
 if (THIRD_PARTY)
diff --git a/oneflow/core/common/error.proto b/oneflow/core/common/error.proto
index b7d2471add4d3851da208ab552f501386f7195a3..7040f3344cb8de619533e22fe20dd164816e15a9 100644
--- a/oneflow/core/common/error.proto
+++ b/oneflow/core/common/error.proto
@@ -57,7 +57,7 @@ enum JobBuildAndInferError {
   kLogicalBlobNameInvalid = 402;
   
   kOpNameExist = 450;
-  kOpConfDeviceTypeNoSet = 460;
+  kOpConfDeviceTagNoSet = 460;
   kPlacementError = 470;
   kBlobSplitAxisInferError = 480;
   kUnknownJobBuildAndInferError = 500;
diff --git a/oneflow/core/framework/to_string.cpp b/oneflow/core/framework/to_string.cpp
index 3ea283be969a470e0f99fa6946903b7de35d257d..ddc75b751e45aced86909a43bc1d2b057ba786c4 100644
--- a/oneflow/core/framework/to_string.cpp
+++ b/oneflow/core/framework/to_string.cpp
@@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
+#include "oneflow/core/common/util.h"
 #include "oneflow/core/framework/to_string.h"
 
 namespace oneflow {
@@ -20,7 +21,7 @@ namespace oneflow {
 Maybe<const char*> DeviceTag4DeviceType(DeviceType device_type) {
   if (device_type == kCPU) { return "cpu"; }
   if (device_type == kGPU) { return "gpu"; }
-  return Error::DeviceTagNotFound() << "invalid";
+  return Error::DeviceTagNotFound() << "invalid_device";
 }
 
 Maybe<DeviceType> DeviceType4DeviceTag(const std::string& device_tag) {
diff --git a/oneflow/core/framework/user_op_hob.cpp b/oneflow/core/framework/user_op_hob.cpp
index a89bc28e75c293a3f45ea44384b8cc6cfe26c641..36f70a2db3dd0c1fcc258ff12409fcb2de0bbe2c 100644
--- a/oneflow/core/framework/user_op_hob.cpp
+++ b/oneflow/core/framework/user_op_hob.cpp
@@ -40,13 +40,6 @@ hob::BoolFunctorPtr<KernelRegContext> HobFalse() {
   return krbf_ptr;
 }
 
-hob::HobContextGetter<KernelRegContext, DeviceType> HobDeviceType() {
-  std::ostringstream string_stream;
-  string_stream << "device_type";
-  return hob::HobContextGetter<KernelRegContext, DeviceType>(
-      string_stream.str(), [](const KernelRegContext& ctx) { return ctx.device_type(); });
-}
-
 hob::HobContextGetter<KernelRegContext, DataType> HobDataType(const std::string& tensor_name,
                                                               int tensor_idx) {
   std::ostringstream string_stream;
@@ -58,6 +51,14 @@ hob::HobContextGetter<KernelRegContext, DataType> HobDataType(const std::string&
       });
 }
 
+HobStringContextGetter<KernelRegContext> HobDeviceTag() {
+  std::ostringstream string_stream;
+  string_stream << "device_tag";
+  return HobStringContextGetter<KernelRegContext>(
+      string_stream.str(),
+      [](const KernelRegContext& ctx) -> const std::string& { return ctx.device_tag(); });
+}
+
 }  // namespace user_op
 
 }  // namespace oneflow
diff --git a/oneflow/core/framework/user_op_hob.h b/oneflow/core/framework/user_op_hob.h
index 3ec831d614da851fa8f853b9c4fbe8df7859b848..a1a161d2697109c58a0bf6c488e1231fe0d53214 100644
--- a/oneflow/core/framework/user_op_hob.h
+++ b/oneflow/core/framework/user_op_hob.h
@@ -16,8 +16,9 @@ limitations under the License.
 #ifndef ONEFLOW_CORE_FRAMEWORK_USER_OP_HOB_H_
 #define ONEFLOW_CORE_FRAMEWORK_USER_OP_HOB_H_
 
-#include "oneflow/core/common/high_order_bool.h"
 #include "oneflow/core/common/data_type.h"
+#include "oneflow/core/common/high_order_bool.h"
+#include "oneflow/core/framework/to_string.h"
 #include "oneflow/core/framework/user_op_registry_manager.h"
 
 namespace oneflow {
@@ -28,8 +29,6 @@ hob::BoolFunctorPtr<KernelRegContext> HobTrue();
 
 hob::BoolFunctorPtr<KernelRegContext> HobFalse();
 
-hob::HobContextGetter<KernelRegContext, DeviceType> HobDeviceType();
-
 hob::HobContextGetter<KernelRegContext, DataType> HobDataType(const std::string& tensor_name,
                                                               int tensor_idx);
 
@@ -47,6 +46,48 @@ hob::HobContextGetter<user_op::KernelRegContext, T> HobAttr(const std::string& a
   });
 }
 
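+// Holds a debug string plus a string-valued getter on a registration context (e.g. device_tag).
+// operator== builds a hob::BoolFunctor, so registrations can write e.g. (HobDeviceTag() == "gpu").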
+template<typename ContextT>
+class HobStringContextGetter final {
+ public:
+  HobStringContextGetter(const DeviceType& device_type) {
+    std::string str = ToString(device_type);
+    debug_str_ = str;
+    context_getter_ = [str](const ContextT&) -> const std::string& { return str; };
+  }
+  HobStringContextGetter(const char* const_value) {
+    std::string str(const_value);
+    debug_str_ = str;
+    context_getter_ = [str](const ContextT&) -> const std::string& { return str; };
+  }
+  HobStringContextGetter(const std::string& const_value)
+      : debug_str_(const_value),
+        context_getter_(
+            [const_value](const ContextT&) -> const std::string& { return const_value; }) {}
+  HobStringContextGetter(const std::string& debug_str,
+                         const std::function<const std::string&(const ContextT&)>& context_getter)
+      : debug_str_(debug_str), context_getter_(context_getter) {}
+
+  hob::BoolFunctorPtr<ContextT> operator==(const HobStringContextGetter& other) const {
+    std::ostringstream string_stream;
+    string_stream << debug_str_ << " == " << other.debug_str_;
+    std::function<std::string(const ContextT&)> l_fn = this->context_getter_;
+    std::function<std::string(const ContextT&)> r_fn = other.context_getter_;
+    std::shared_ptr<const hob::BoolFunctor<ContextT>> krbf_ptr =
+        std::make_shared<const hob::HighOrderBoolFunctor<ContextT>>(
+            string_stream.str(),
+            [l_fn, r_fn](const ContextT& ctx) { return l_fn(ctx) == r_fn(ctx); });
+    return krbf_ptr;
+  }
+
+ private:
+  std::string debug_str_;
+  std::function<const std::string&(const ContextT&)> context_getter_;
+};
+
+HobStringContextGetter<KernelRegContext> HobDeviceTag();
+
 }  // namespace user_op
 
 }  // namespace oneflow
diff --git a/oneflow/core/framework/user_op_kernel_registry.h b/oneflow/core/framework/user_op_kernel_registry.h
index 7265ff53f9a4275510e9627b1d4803dad61cbf20..412e0e2981771c65000c6d73e061d0d68ead9e00 100644
--- a/oneflow/core/framework/user_op_kernel_registry.h
+++ b/oneflow/core/framework/user_op_kernel_registry.h
@@ -37,6 +37,7 @@ class KernelRegContext {
   virtual ~KernelRegContext() = default;
 
   virtual DeviceType device_type() const = 0;
+  virtual const std::string& device_tag() const = 0;
   virtual const ParallelContext& parallel_ctx() const = 0;
   virtual const TensorDesc* TensorDesc4ArgNameAndIndex(const std::string&, int32_t) const = 0;
 
diff --git a/oneflow/core/graph/boxing/collective_boxing_sub_task_graph_builder.cpp b/oneflow/core/graph/boxing/collective_boxing_sub_task_graph_builder.cpp
index 6624e6827d0989b2e3b113091b852d37bd8b0c95..280a167d9be60143a90905781a0ffdaa0a8cb446 100644
--- a/oneflow/core/graph/boxing/collective_boxing_sub_task_graph_builder.cpp
+++ b/oneflow/core/graph/boxing/collective_boxing_sub_task_graph_builder.cpp
@@ -13,10 +13,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
+#include "oneflow/core/framework/to_string.h"
+#include "oneflow/core/graph/boxing/chain_sub_task_graph_builder.h"
 #include "oneflow/core/graph/boxing/collective_boxing_sub_task_graph_builder.h"
 #include "oneflow/core/graph/boxing/sub_task_graph_builder_util.h"
 #include "oneflow/core/graph/collective_boxing_task_node.h"
-#include "oneflow/core/graph/boxing/chain_sub_task_graph_builder.h"
 #include "oneflow/core/graph/slice_boxing_task_node.h"
 
 namespace oneflow {
@@ -31,7 +32,7 @@ void NcclInitCollectiveNode(CollectiveBoxingGenericTaskNode* node,
                             const BlobDesc& logical_blob_desc, OpType op_type, int64_t root) {
   OperatorConf op_conf;
   op_conf.set_name(name);
-  op_conf.set_device_type(DeviceType::kGPU);
+  op_conf.set_device_tag(CHECK_JUST(DeviceTag4DeviceType(DeviceType::kGPU)));
   CollectiveBoxingGenericOpConf* conf = op_conf.mutable_collective_boxing_generic_conf();
   *conf->mutable_lbi() = lbi;
   RankDesc* rank_desc = conf->mutable_rank_desc();
diff --git a/oneflow/core/graph/boxing_identity_compute_task_node.cpp b/oneflow/core/graph/boxing_identity_compute_task_node.cpp
index ea4a7e7d4fd79609060ac0fad5bf034e11da7f03..035700f09d395ad97abd3fd497d1892f321fd3c6 100644
--- a/oneflow/core/graph/boxing_identity_compute_task_node.cpp
+++ b/oneflow/core/graph/boxing_identity_compute_task_node.cpp
@@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
+#include "oneflow/core/framework/to_string.h"
 #include "oneflow/core/graph/boxing_identity_compute_task_node.h"
 #include "oneflow/core/graph/logical_node.h"
 
@@ -41,7 +42,7 @@ void BoxingIdentityCompTaskNode::BuildExecGphAndRegst() {
   ExecNode* node = mut_exec_gph().NewNode();
   OperatorConf op_conf;
   op_conf.set_name("System-Boxing-Identity-" + NewUniqueId());
-  op_conf.set_device_type(this->device_type());
+  op_conf.set_device_tag(CHECK_JUST(DeviceTag4DeviceType(this->device_type())));
   *op_conf.mutable_boxing_identity_conf()->mutable_lbi() = lbi_;
   std::shared_ptr<Operator> sole_op = ConstructOp(op_conf, &GlobalJobDesc());
   node->mut_op() = sole_op;
diff --git a/oneflow/core/graph/chain_graph.cpp b/oneflow/core/graph/chain_graph.cpp
index 1bcc6c8988994d621726967aa7ca6ab35dd27fc3..9a1c898de2cab605e2a50d6f8977cba7f3642cb4 100644
--- a/oneflow/core/graph/chain_graph.cpp
+++ b/oneflow/core/graph/chain_graph.cpp
@@ -20,6 +20,7 @@ limitations under the License.
 #include "oneflow/core/thread/thread_pool.h"
 #include "oneflow/core/common/blocking_counter.h"
 #include "oneflow/core/framework/config_def.h"
+#include "oneflow/core/framework/to_string.h"
 #include "oneflow/core/job/global_for.h"
 
 namespace oneflow {
@@ -180,10 +181,8 @@ void CollectIgnoreTaskEdgesInFirstMergedChains(const std::vector<std::vector<Tas
       if (fw_node == nullptr) { continue; }
       if (fw_node->logical_node()->op_vec().size() != 1) { continue; }
       const auto& src_op = *fw_node->logical_node()->SoleOp();
-      if (src_op.op_conf().has_variable_conf()
-          && src_op.op_conf().device_type() == DeviceType::kGPU) {
-        return true;
-      }
+      DeviceType device_type = CHECK_JUST(DeviceType4DeviceTag(src_op.op_conf().device_tag()));
+      if (src_op.op_conf().has_variable_conf() && device_type == DeviceType::kGPU) { return true; }
     }
     return false;
   };
diff --git a/oneflow/core/graph/copy_task_node.cpp b/oneflow/core/graph/copy_task_node.cpp
index 70a658ba0a1caf64b6f68f70f0fadb766a842c74..39ec0da62807b7a5984cb809c7f95ca238e85aa7 100644
--- a/oneflow/core/graph/copy_task_node.cpp
+++ b/oneflow/core/graph/copy_task_node.cpp
@@ -13,9 +13,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
+#include "oneflow/core/framework/to_string.h"
 #include "oneflow/core/graph/copy_task_node.h"
-#include "oneflow/core/operator/operator.h"
 #include "oneflow/core/job/thrd_id_generator.h"
+#include "oneflow/core/operator/operator.h"
 
 namespace oneflow {
 
@@ -79,7 +80,7 @@ void CopyHdTaskNode::InitProducedRegstMemCase(MemoryCase* mem_case) {
 OperatorConf CopyHdTaskNode::NewCopyOpConf() {
   OperatorConf conf;
   conf.set_name("copy_hd_" + NewUniqueId());
-  conf.set_device_type(device_type());
+  conf.set_device_tag(CHECK_JUST(DeviceTag4DeviceType(device_type())));
   conf.mutable_copy_hd_conf()->set_type(copy_type_);
   auto in_regst = GetSoleConsumedRegst("copy_in");
   if (in_regst->NumOfLbi() == 1) {
@@ -141,7 +142,7 @@ void CopyCommNetTaskNode::PinConsumedRegstMemCase(MemoryCase* mem_case) {
 OperatorConf CopyCommNetTaskNode::NewCopyOpConf() {
   OperatorConf conf;
   conf.set_name("copy_comm_net_" + NewUniqueId());
-  conf.set_device_type(device_type());
+  conf.set_device_tag(CHECK_JUST(DeviceTag4DeviceType(device_type())));
   conf.mutable_copy_comm_net_conf();
   return conf;
 }
diff --git a/oneflow/core/graph/logical_graph.cpp b/oneflow/core/graph/logical_graph.cpp
index 7142edab5ec83cbd37204fb3a957fe47ab6dfe82..ede44cc8532230954daf391ddeae615cb8aebb38 100644
--- a/oneflow/core/graph/logical_graph.cpp
+++ b/oneflow/core/graph/logical_graph.cpp
@@ -13,12 +13,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
+#include "oneflow/core/common/balanced_splitter.h"
+#include "oneflow/core/framework/to_string.h"
 #include "oneflow/core/graph/logical_graph.h"
 #include "oneflow/core/graph/op_graph.h"
+#include "oneflow/core/job/global_for.h"
 #include "oneflow/core/operator/operator.h"
 #include "oneflow/core/operator/op_conf_util.h"
-#include "oneflow/core/common/balanced_splitter.h"
-#include "oneflow/core/job/global_for.h"
 
 namespace oneflow {
 
@@ -63,7 +64,7 @@ void LogicalGraph::NaiveBuildFwStruct(
     auto parallel_desc_ptr_it = name2parallel_desc.find(cur_op_conf.name());
     CHECK(parallel_desc_ptr_it != name2parallel_desc.end());
     const std::shared_ptr<ParallelDesc>& parallel_desc_ptr = parallel_desc_ptr_it->second;
-    cur_op_conf.set_device_type(parallel_desc_ptr->device_type());
+    cur_op_conf.set_device_tag(CHECK_JUST(DeviceTag4DeviceType(parallel_desc_ptr->device_type())));
     std::shared_ptr<Operator> cur_op = ConstructOp(cur_op_conf, &GlobalJobDesc());
     LogicalNode* cur_node = cur_op->NewProperLogicalNode();
     AddAllocatedNode(cur_node);
diff --git a/oneflow/core/graph/slice_boxing_task_node.cpp b/oneflow/core/graph/slice_boxing_task_node.cpp
index 2362705cb96e64eda8b11727b1b41e7e3d4d808f..50428f43b2707fd420ab1523bae717921a129e5b 100644
--- a/oneflow/core/graph/slice_boxing_task_node.cpp
+++ b/oneflow/core/graph/slice_boxing_task_node.cpp
@@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
+#include "oneflow/core/framework/to_string.h"
 #include "oneflow/core/graph/slice_boxing_task_node.h"
 
 namespace oneflow {
@@ -102,7 +103,7 @@ void SliceBoxingTaskNode::SetOutShape(const Shape& shape) { out_shape_ = shape;
 
 OperatorConf SliceBoxingTaskNode::GetBoxingOpConf() {
   OperatorConf op_conf{};
-  op_conf.set_device_type(device_type());
+  op_conf.set_device_tag(CHECK_JUST(DeviceTag4DeviceType(device_type())));
   SliceBoxingConf boxing_conf{};
   *boxing_conf.mutable_lbi() = lbi_;
   out_slice_.ToProto(boxing_conf.mutable_out_slice());
diff --git a/oneflow/core/job/job_build_and_infer_ctx.cpp b/oneflow/core/job/job_build_and_infer_ctx.cpp
index 682d95b0f693f8a435cb2c3065a4834f388be8ba..d4fdc6b101a5678a0f4956144d8b063a563c298b 100644
--- a/oneflow/core/job/job_build_and_infer_ctx.cpp
+++ b/oneflow/core/job/job_build_and_infer_ctx.cpp
@@ -13,17 +13,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
-#include "oneflow/core/job/job_build_and_infer_ctx.h"
-#include "oneflow/core/job_rewriter/op_graph_pass.h"
-#include "oneflow/core/job_rewriter/autograd.h"
-#include "oneflow/core/framework/config_def.h"
 #include "oneflow/core/common/protobuf.h"
-#include "oneflow/core/job/mirrored_sig_infer_hint.h"
-#include "oneflow/core/job/foreign_callback.h"
 #include "oneflow/core/eager/eager_symbol_storage.h"
+#include "oneflow/core/framework/config_def.h"
+#include "oneflow/core/framework/to_string.h"
+#include "oneflow/core/job/foreign_callback.h"
+#include "oneflow/core/job/job_build_and_infer_ctx.h"
+#include "oneflow/core/job/mirrored_sig_infer_hint.h"
 #include "oneflow/core/job/scope.h"
-#include <google/protobuf/text_format.h>
+#include "oneflow/core/job_rewriter/autograd.h"
+#include "oneflow/core/job_rewriter/op_graph_pass.h"
 #include "oneflow/user/summary/summary_converter.h"
+
+#include <google/protobuf/text_format.h>
 #include <json.hpp>
 
 namespace oneflow {
@@ -505,9 +507,9 @@ Maybe<OpAttribute> JobBuildAndInferCtx::AddAndInferOp(const OperatorConf& op_con
   CHECK_OR_RETURN(op_name2op_.find(op_name) == op_name2op_.end())
       << JobBuildAndInferError::kOpNameExist << "op_name: " << op_name
       << " already exist in job: " << job_->job_conf().job_name();
-  CHECK_NE_OR_RETURN(op_conf.device_type(), DeviceType::kInvalidDevice)
-      << JobBuildAndInferError::kOpConfDeviceTypeNoSet << "op_name: " << op_name
-      << " not set device type";
+  CHECK_NE_OR_RETURN(op_conf.device_tag(), "invalid_device")
+      << JobBuildAndInferError::kOpConfDeviceTagNoSet << "op_name: " << op_name
+      << " not set device tag";
 
   op_name2op_.emplace(op_name, ConstructOp(op_conf, job_desc));
   Operator* op = op_name2op_.at(op_name).get();
@@ -836,7 +838,7 @@ Maybe<LogicalBlobId> LazyJobBuildAndInferCtx::FindOrCreateMirroredLbiFromCompati
     lbi_vec->push_back(sub_lbi);
   };
   OperatorConf op_conf;
-  op_conf.set_device_type(parallel_desc.device_type());
+  op_conf.set_device_tag(CHECK_JUST(DeviceTag4DeviceType(parallel_desc.device_type())));
   if (sbp.has_broadcast_parallel()) {
     op_conf.set_name(kAutoMirroredBlobNamePrefix + "-DistributeClone-" + NewUniqueId());
     auto* distribute_clone = op_conf.mutable_distribute_clone_conf();
@@ -890,7 +892,7 @@ Maybe<LogicalBlobId> EagerJobBuildAndInferCtx::FindOrCreateMirroredLbiFromCompat
     CHECK_OR_RETURN(producer_op_conf.has_scope_symbol_id());
     op_conf.set_scope_symbol_id(producer_op_conf.scope_symbol_id());
   }
-  op_conf.set_device_type(parallel_desc.device_type());
+  op_conf.set_device_tag(JUST(DeviceTag4DeviceType(parallel_desc.device_type())));
   op_conf.set_name(kAutoMirroredBlobNamePrefix + "-CastToMirrored-" + NewUniqueId());
   auto* cast_to_mirrored_conf = op_conf.mutable_cast_to_mirrored_conf();
   cast_to_mirrored_conf->set_in(lbn);
diff --git a/oneflow/core/job/parallel_desc.cpp b/oneflow/core/job/parallel_desc.cpp
index 016924b01d76d67108fa1cb298332db4194c8236..d829b8ad7191ccd0e790e190c296b8f678b667a9 100644
--- a/oneflow/core/job/parallel_desc.cpp
+++ b/oneflow/core/job/parallel_desc.cpp
@@ -157,10 +157,9 @@ Maybe<void> ParallelDesc::CheckWithResourceDesc(const ResourceDesc& resource_des
 
 ParallelConf ParallelDesc::GetParallelIdOnlyParallelConf(int64_t parallel_id) const {
   ParallelConf parallel_conf;
-  const char* device_tag = CHECK_JUST(DeviceTag4DeviceType(device_type()));
   std::string machine_id = std::to_string(MachineIdForParallelId(parallel_id));
   std::string device_id = std::to_string(DeviceIdForParallelId(parallel_id));
-  parallel_conf.set_device_tag(device_tag);
+  parallel_conf.set_device_tag(CHECK_JUST(DeviceTag4DeviceType(device_type())));
   parallel_conf.add_device_name(machine_id + ":" + device_id);
   return parallel_conf;
 }
diff --git a/oneflow/core/job/scope.cpp b/oneflow/core/job/scope.cpp
index cb0c39146a0dd82f6ffcee25800bc8052d3375b8..def4e6cf09fa212fefc35d2946d7d587aef7069e 100644
--- a/oneflow/core/job/scope.cpp
+++ b/oneflow/core/job/scope.cpp
@@ -13,9 +13,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
+#include "oneflow/core/framework/to_string.h"
 #include "oneflow/core/job/scope.h"
-#include "oneflow/core/vm/symbol_storage.h"
 #include "oneflow/core/operator/operator.h"
+#include "oneflow/core/vm/symbol_storage.h"
 
 namespace oneflow {
 
@@ -42,7 +43,7 @@ Maybe<const JobDesc*> Scope::job_desc() const {
 }
 
 Maybe<int64_t> Scope::GetParallelDescSymbolId(const OperatorConf& op_conf) const {
-  if (op_conf.device_type() == DeviceType::kCPU || IsCpuOnly(op_conf)) {
+  if (op_conf.device_tag() == "cpu" || IsCpuOnly(op_conf)) {
     return scope_proto_.host_parallel_desc_symbol_id();
   } else {
     return scope_proto_.device_parallel_desc_symbol_id();
@@ -50,7 +51,7 @@ Maybe<int64_t> Scope::GetParallelDescSymbolId(const OperatorConf& op_conf) const
 }
 
 Maybe<const ParallelDesc*> Scope::GetParallelDesc(const OperatorConf& op_conf) const {
-  if (op_conf.device_type() == DeviceType::kCPU || IsCpuOnly(op_conf)) {
+  if (op_conf.device_tag() == "cpu" || IsCpuOnly(op_conf)) {
     return host_parallel_desc_.get();
   } else {
     return device_parallel_desc_.get();
diff --git a/oneflow/core/job_rewriter/dump_variable_info_pass.cpp b/oneflow/core/job_rewriter/dump_variable_info_pass.cpp
index 12a2bfe47d2acc1d41edc660b24c61398d5b4a6a..91946ea724529f3cf1d84171844003ae21959084 100644
--- a/oneflow/core/job_rewriter/dump_variable_info_pass.cpp
+++ b/oneflow/core/job_rewriter/dump_variable_info_pass.cpp
@@ -13,8 +13,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
-#include "oneflow/core/job_rewriter/op_graph_pass.h"
+#include "oneflow/core/framework/to_string.h"
 #include "oneflow/core/job/global_for.h"
+#include "oneflow/core/job_rewriter/op_graph_pass.h"
 
 namespace oneflow {
 
@@ -35,19 +36,19 @@ Maybe<void> DumpVariableInfoPass::Apply(const OpGraph& op_graph, JobBuilder* job
   const std::string sep = "\t";
   auto log_stream =
       TeePersistentLogStream::Create("variable_table_" + std::to_string(GlobalJobDesc().job_id()));
-  (*log_stream) << "id" << sep << "name" << sep << "device_type" << sep << "parallel_num" << sep
+  (*log_stream) << "id" << sep << "name" << sep << "device_tag" << sep << "parallel_num" << sep
                 << "distribute" << sep << "data_type" << sep << "shape" << sep << "elem_cnt" << sep
                 << "size"
                 << "\n";
-  op_graph.TopoForEachNode([&](const OpNode* node) {
+  JUST(op_graph.TopoForEachNodeWithErrorCaptured([&](const OpNode* node) -> Maybe<void> {
     const OperatorConf& op_conf = node->op().op_conf();
-    if (!op_conf.has_variable_conf()) { return; }
+    if (!op_conf.has_variable_conf()) { return Maybe<void>::Ok(); }
     const VariableOpConf& conf = op_conf.variable_conf();
     (*log_stream) << std::to_string(cnt);
     (*log_stream) << sep;
     (*log_stream) << op_conf.name();
     (*log_stream) << sep;
-    (*log_stream) << DeviceType_Name(op_conf.device_type());
+    (*log_stream) << op_conf.device_tag();
     (*log_stream) << sep;
     (*log_stream) << std::to_string(node->parallel_desc().parallel_num());
     (*log_stream) << sep;
@@ -67,7 +68,8 @@ Maybe<void> DumpVariableInfoPass::Apply(const OpGraph& op_graph, JobBuilder* job
     (*log_stream) << std::to_string(shape.elem_cnt() * GetSizeOfDataType(conf.data_type()));
     (*log_stream) << "\n";
     cnt += 1;
-  });
+    return Maybe<void>::Ok();
+  }));
   return Maybe<void>::Ok();
 }
 
diff --git a/oneflow/core/kernel/arg_where_kernel.cpp b/oneflow/core/kernel/arg_where_kernel.cpp
index 36766485ccbf284483ff85364f935742e3236455..f81262214ff56547842af9e074520e382c5c3d43 100644
--- a/oneflow/core/kernel/arg_where_kernel.cpp
+++ b/oneflow/core/kernel/arg_where_kernel.cpp
@@ -13,9 +13,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
-#include "oneflow/core/kernel/kernel.h"
-#include "oneflow/core/kernel/arg_where_kernel_util.h"
 #include "oneflow/core/common/nd_index_offset_helper.h"
+#include "oneflow/core/framework/to_string.h"
+#include "oneflow/core/kernel/arg_where_kernel_util.h"
+#include "oneflow/core/kernel/kernel.h"
 
 namespace oneflow {
 
@@ -40,14 +41,14 @@ class ArgWhereKernel : public KernelIf<DeviceType::kCPU> {
   }
 };
 
-#define REGISTER_ARG_WHERE_KERNEL(device_type_v, dtype, itype, ndims)                 \
-  NEW_REGISTER_KERNEL(OperatorConf::kArgWhereConf,                                    \
-                      ArgWhereKernel<device_type_v, dtype, itype, ndims>)             \
-      .SetIsMatchedPred([](const KernelConf& conf) {                                  \
-        return (device_type_v == conf.op_attribute().op_conf().device_type())         \
-               && (GetDataType<itype>::value == conf.data_type())                     \
-               && (GetDataType<dtype>::value == conf.arg_where_conf().in_data_type()) \
-               && (ndims == conf.arg_where_conf().num_axes());                        \
+#define REGISTER_ARG_WHERE_KERNEL(device_type_v, dtype, itype, ndims)                  \
+  NEW_REGISTER_KERNEL(OperatorConf::kArgWhereConf,                                     \
+                      ArgWhereKernel<device_type_v, dtype, itype, ndims>)              \
+      .SetIsMatchedPred([](const KernelConf& conf) -> bool {                           \
+        return (conf.op_attribute().op_conf().device_tag() == ToString(device_type_v)) \
+               && (GetDataType<itype>::value == conf.data_type())                      \
+               && (GetDataType<dtype>::value == conf.arg_where_conf().in_data_type())  \
+               && (ndims == conf.arg_where_conf().num_axes());                         \
       });
 
 #define REGISTER_ARG_WHERE_KERNELS_AT_NDIMS(device_type_v, dtype, itype) \
diff --git a/oneflow/core/kernel/indexed_slices_lazy_adam_model_update_kernel.cpp b/oneflow/core/kernel/indexed_slices_lazy_adam_model_update_kernel.cpp
index 34bef5af046ee05635b510d44a9b02964ffda3c7..a0ea46db6d446d47346bc4ee25e0271ab4a83d41 100644
--- a/oneflow/core/kernel/indexed_slices_lazy_adam_model_update_kernel.cpp
+++ b/oneflow/core/kernel/indexed_slices_lazy_adam_model_update_kernel.cpp
@@ -13,10 +13,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
+#include "oneflow/core/framework/to_string.h"
 #include "oneflow/core/kernel/kernel.h"
 #include "oneflow/core/kernel/kernel_context.h"
-#include "oneflow/core/kernel/indexed_slices_reduce_sum_kernel_util.h"
 #include "oneflow/core/kernel/indexed_slices_lazy_adam_model_update_kernel_util.h"
+#include "oneflow/core/kernel/indexed_slices_reduce_sum_kernel_util.h"
 
 namespace oneflow {
 
@@ -80,7 +81,7 @@ void IndexedSlicesLazyAdamMdUpdateKernel<device_type, T, K>::ForwardDataContent(
                                           OF_PP_PAIR_FIRST(indices_type_pair)>)                    \
       .SetIsMatchedPred([](const KernelConf& kernel_conf) -> bool {                                \
         return (                                                                                   \
-            (kernel_conf.op_attribute().op_conf().device_type() == device_type_v)                  \
+            (kernel_conf.op_attribute().op_conf().device_tag() == ToString(device_type_v))         \
             && ((OF_PP_PAIR_SECOND(data_type_pair)) == kernel_conf.data_type())                    \
             && (OF_PP_PAIR_SECOND(indices_type_pair)                                               \
                 == kernel_conf.indexed_slices_lazy_adam_model_update_conf().indices_data_type())); \
diff --git a/oneflow/core/kernel/indexed_slices_momentum_model_update_kernel.cpp b/oneflow/core/kernel/indexed_slices_momentum_model_update_kernel.cpp
index aa70b5cd33506f69c62425c6df4ad680c1fbf84f..771df11cfa17814d6ae77d3dcbaf3f04989108e2 100644
--- a/oneflow/core/kernel/indexed_slices_momentum_model_update_kernel.cpp
+++ b/oneflow/core/kernel/indexed_slices_momentum_model_update_kernel.cpp
@@ -13,10 +13,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
+#include "oneflow/core/framework/to_string.h"
 #include "oneflow/core/kernel/kernel.h"
 #include "oneflow/core/kernel/kernel_context.h"
-#include "oneflow/core/kernel/indexed_slices_reduce_sum_kernel_util.h"
 #include "oneflow/core/kernel/indexed_slices_momentum_model_update_kernel_util.h"
+#include "oneflow/core/kernel/indexed_slices_reduce_sum_kernel_util.h"
 
 namespace oneflow {
 
@@ -75,7 +76,7 @@ void IndexedSlicesMomentumMdUpdateKernel<device_type, T, K>::ForwardDataContent(
                                           OF_PP_PAIR_FIRST(indices_type_pair)>)                   \
       .SetIsMatchedPred([](const KernelConf& kernel_conf) -> bool {                               \
         return (                                                                                  \
-            (kernel_conf.op_attribute().op_conf().device_type() == device_type_v)                 \
+            (kernel_conf.op_attribute().op_conf().device_tag() == ToString(device_type_v))        \
             && ((OF_PP_PAIR_SECOND(data_type_pair)) == kernel_conf.data_type())                   \
             && (OF_PP_PAIR_SECOND(indices_type_pair)                                              \
                 == kernel_conf.indexed_slices_momentum_model_update_conf().indices_data_type())); \
diff --git a/oneflow/core/kernel/indexed_slices_naive_model_update_kernel.cpp b/oneflow/core/kernel/indexed_slices_naive_model_update_kernel.cpp
index 4f2a6ac519734e55a12163c385b1d32c2ab7b53f..434c6c4d800fdb087af397fcabc5255bd0854379 100644
--- a/oneflow/core/kernel/indexed_slices_naive_model_update_kernel.cpp
+++ b/oneflow/core/kernel/indexed_slices_naive_model_update_kernel.cpp
@@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
+#include "oneflow/core/framework/to_string.h"
 #include "oneflow/core/kernel/kernel.h"
 #include "oneflow/core/kernel/indexed_slices_naive_model_update_kernel_util.h"
 
@@ -59,7 +60,7 @@ void IndexedSlicesNaiveMdUpdateKernel<device_type, T, K>::ForwardDataContent(
       IndexedSlicesNaiveMdUpdateKernel<device_type_v, OF_PP_PAIR_FIRST(data_type_pair),            \
                                        OF_PP_PAIR_FIRST(indices_type_pair)>)                       \
       .SetIsMatchedPred([](const KernelConf& kernel_conf) -> bool {                                \
-        return ((kernel_conf.op_attribute().op_conf().device_type() == device_type_v)              \
+        return ((kernel_conf.op_attribute().op_conf().device_tag() == ToString(device_type_v))     \
                 && ((OF_PP_PAIR_SECOND(data_type_pair)) == kernel_conf.data_type())                \
                 && (OF_PP_PAIR_SECOND(indices_type_pair)                                           \
                     == kernel_conf.indexed_slices_naive_model_update_conf().indices_data_type())); \
diff --git a/oneflow/core/kernel/indexed_slices_reduce_sum_kernel.cpp b/oneflow/core/kernel/indexed_slices_reduce_sum_kernel.cpp
index fe6a5a1c9b9d629bcd3fa9c652a39ddff0717407..251df60a24b07328a20da697cde2c3c8edc41fbe 100644
--- a/oneflow/core/kernel/indexed_slices_reduce_sum_kernel.cpp
+++ b/oneflow/core/kernel/indexed_slices_reduce_sum_kernel.cpp
@@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
+#include "oneflow/core/framework/to_string.h"
 #include "oneflow/core/kernel/kernel.h"
 #include "oneflow/core/kernel/kernel_context.h"
 #include "oneflow/core/kernel/indexed_slices_reduce_sum_kernel_util.h"
@@ -54,17 +55,17 @@ void IndexedSlicesReduceSumKernel<device_type, T, K>::ForwardDataContent(
       workspace_ptr, workspace_size_in_bytes);
 }
 
-#define MAKE_INDEXED_SLICES_REDUCE_SUM_KERNEL_ENTRY(device_type_v, data_type_pair,         \
-                                                    indices_type_pair)                     \
-  NEW_REGISTER_KERNEL(                                                                     \
-      OperatorConf::kIndexedSlicesReduceSumConf,                                           \
-      IndexedSlicesReduceSumKernel<device_type_v, OF_PP_PAIR_FIRST(data_type_pair),        \
-                                   OF_PP_PAIR_FIRST(indices_type_pair)>)                   \
-      .SetIsMatchedPred([](const KernelConf& kernel_conf) -> bool {                        \
-        return ((kernel_conf.op_attribute().op_conf().device_type() == device_type_v)      \
-                && ((OF_PP_PAIR_SECOND(data_type_pair)) == kernel_conf.data_type())        \
-                && (OF_PP_PAIR_SECOND(indices_type_pair)                                   \
-                    == kernel_conf.indexed_slices_reduce_sum_conf().indices_data_type())); \
+#define MAKE_INDEXED_SLICES_REDUCE_SUM_KERNEL_ENTRY(device_type_v, data_type_pair,             \
+                                                    indices_type_pair)                         \
+  NEW_REGISTER_KERNEL(                                                                         \
+      OperatorConf::kIndexedSlicesReduceSumConf,                                               \
+      IndexedSlicesReduceSumKernel<device_type_v, OF_PP_PAIR_FIRST(data_type_pair),            \
+                                   OF_PP_PAIR_FIRST(indices_type_pair)>)                       \
+      .SetIsMatchedPred([](const KernelConf& kernel_conf) -> bool {                            \
+        return ((kernel_conf.op_attribute().op_conf().device_tag() == ToString(device_type_v)) \
+                && ((OF_PP_PAIR_SECOND(data_type_pair)) == kernel_conf.data_type())            \
+                && (OF_PP_PAIR_SECOND(indices_type_pair)                                       \
+                    == kernel_conf.indexed_slices_reduce_sum_conf().indices_data_type()));     \
       });
 
 OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_INDEXED_SLICES_REDUCE_SUM_KERNEL_ENTRY, DEVICE_TYPE_SEQ,
diff --git a/oneflow/core/kernel/kernel.h b/oneflow/core/kernel/kernel.h
index aaa3c49b7719570b6d48c921601933ce8c3f63b4..d90d7ddfe2fc520fcd12376fbf6becd8909a7124 100644
--- a/oneflow/core/kernel/kernel.h
+++ b/oneflow/core/kernel/kernel.h
@@ -16,16 +16,17 @@ limitations under the License.
 #ifndef ONEFLOW_CORE_KERNEL_KERNEL_H_
 #define ONEFLOW_CORE_KERNEL_KERNEL_H_
 
+#include "oneflow/core/common/protobuf.h"
+#include "oneflow/core/framework/to_string.h"
 #include "oneflow/core/job/job.pb.h"
 #include "oneflow/core/job/resource.pb.h"
 #include "oneflow/core/kernel/kernel.pb.h"
+#include "oneflow/core/kernel/kernel_registration.h"
 #include "oneflow/core/kernel/kernel_util.h"
 #include "oneflow/core/operator/operator.h"
+#include "oneflow/core/operator/op_conf_util.h"
 #include "oneflow/core/persistence/snapshot.h"
 #include "oneflow/core/register/blob.h"
-#include "oneflow/core/common/protobuf.h"
-#include "oneflow/core/operator/op_conf_util.h"
-#include "oneflow/core/kernel/kernel_registration.h"
 
 namespace oneflow {
 
@@ -210,18 +211,19 @@ std::unique_ptr<const Kernel> ConstructKernel(const JobDesc* job_desc, const Ker
   {GetHashKey(device_type, OF_PP_PAIR_SECOND(data_type_pair)),               \
    []() { return new kernel_class<device_type, OF_PP_PAIR_FIRST(data_type_pair)>(); }},
 
-#define ADD_DEFAULT_KERNEL_CREATOR(op_type_case, kernel_class, data_type_seq)         \
-  namespace {                                                                         \
-                                                                                      \
-  Kernel* OF_PP_CAT(CreateKernel, __LINE__)(const KernelConf& kernel_conf) {          \
-    static const HashMap<std::string, std::function<Kernel*()>> creators = {          \
-        OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_KERNEL_CREATOR_ENTRY, (kernel_class),   \
-                                         DEVICE_TYPE_SEQ, data_type_seq)};            \
-    return creators.at(GetHashKey(kernel_conf.op_attribute().op_conf().device_type(), \
-                                  kernel_conf.data_type()))();                        \
-  }                                                                                   \
-                                                                                      \
-  REGISTER_KERNEL_CREATOR(op_type_case, OF_PP_CAT(CreateKernel, __LINE__));           \
+#define ADD_DEFAULT_KERNEL_CREATOR(op_type_case, kernel_class, data_type_seq)                \
+  namespace {                                                                                \
+                                                                                             \
+  Kernel* OF_PP_CAT(CreateKernel, __LINE__)(const KernelConf& kernel_conf) {                 \
+    static const HashMap<std::string, std::function<Kernel*()>> creators = {                 \
+        OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_KERNEL_CREATOR_ENTRY, (kernel_class),          \
+                                         DEVICE_TYPE_SEQ, data_type_seq)};                   \
+    DeviceType device_type =                                                                 \
+        CHECK_JUST(DeviceType4DeviceTag(kernel_conf.op_attribute().op_conf().device_tag())); \
+    return creators.at(GetHashKey(device_type, kernel_conf.data_type()))();                  \
+  }                                                                                          \
+                                                                                             \
+  REGISTER_KERNEL_CREATOR(op_type_case, OF_PP_CAT(CreateKernel, __LINE__));                  \
   }
 
 #define MAKE_DEVICE_TYPE_KERNEL_CREATOR_ENTRY(kernel_class, device_type) \
@@ -234,7 +236,9 @@ std::unique_ptr<const Kernel> ConstructKernel(const JobDesc* job_desc, const Ker
     static const HashMap<int, std::function<Kernel*()>> creators = {                            \
         OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_DEVICE_TYPE_KERNEL_CREATOR_ENTRY, (kernel_class), \
                                          DEVICE_TYPE_SEQ)};                                     \
-    return creators.at(kernel_conf.op_attribute().op_conf().device_type())();                   \
+    DeviceType device_type =                                                                    \
+        CHECK_JUST(DeviceType4DeviceTag(kernel_conf.op_attribute().op_conf().device_tag()));    \
+    return creators.at(device_type)();                                                          \
   }                                                                                             \
                                                                                                 \
   REGISTER_KERNEL_CREATOR(op_type_case, OF_PP_CAT(CreateKernel, __LINE__));                     \
@@ -257,20 +261,21 @@ std::unique_ptr<const Kernel> ConstructKernel(const JobDesc* job_desc, const Ker
   REGISTER_KERNEL_CREATOR(op_type_case, CreateKernel);                                  \
   }
 
-#define ADD_DEFAULT_KERNEL_CREATOR_WITH_GPU_HALF(op_type_case, kernel_class, data_type_seq) \
-  namespace {                                                                               \
-                                                                                            \
-  Kernel* OF_PP_CAT(CreateKernel, __LINE__)(const KernelConf& kernel_conf) {                \
-    static const HashMap<std::string, std::function<Kernel*()>> creators = {                \
-        OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_KERNEL_CREATOR_ENTRY, (kernel_class),         \
-                                         DEVICE_TYPE_SEQ, data_type_seq)                    \
-            MAKE_KERNEL_CREATOR_ENTRY(kernel_class, DeviceType::kGPU,                       \
-                                      (float16, DataType::kFloat16))};                      \
-    return creators.at(GetHashKey(kernel_conf.op_attribute().op_conf().device_type(),       \
-                                  kernel_conf.data_type()))();                              \
-  }                                                                                         \
-                                                                                            \
-  REGISTER_KERNEL_CREATOR(op_type_case, OF_PP_CAT(CreateKernel, __LINE__));                 \
+#define ADD_DEFAULT_KERNEL_CREATOR_WITH_GPU_HALF(op_type_case, kernel_class, data_type_seq)  \
+  namespace {                                                                                \
+                                                                                             \
+  Kernel* OF_PP_CAT(CreateKernel, __LINE__)(const KernelConf& kernel_conf) {                 \
+    static const HashMap<std::string, std::function<Kernel*()>> creators = {                 \
+        OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_KERNEL_CREATOR_ENTRY, (kernel_class),          \
+                                         DEVICE_TYPE_SEQ, data_type_seq)                     \
+            MAKE_KERNEL_CREATOR_ENTRY(kernel_class, DeviceType::kGPU,                        \
+                                      (float16, DataType::kFloat16))};                       \
+    DeviceType device_type =                                                                 \
+        CHECK_JUST(DeviceType4DeviceTag(kernel_conf.op_attribute().op_conf().device_tag())); \
+    return creators.at(GetHashKey(device_type, kernel_conf.data_type()))();                  \
+  }                                                                                          \
+                                                                                             \
+  REGISTER_KERNEL_CREATOR(op_type_case, OF_PP_CAT(CreateKernel, __LINE__));                  \
   }
 
 #endif  // ONEFLOW_CORE_KERNEL_KERNEL_H_
diff --git a/oneflow/core/kernel/kernel_registration.h b/oneflow/core/kernel/kernel_registration.h
index 8a19ed98dac78bf1fb20f78a2566eb15c1b9e06b..811c77e13ace363653c54cb5556eac1d3f332fbf 100644
--- a/oneflow/core/kernel/kernel_registration.h
+++ b/oneflow/core/kernel/kernel_registration.h
@@ -16,13 +16,14 @@ limitations under the License.
 #ifndef ONEFLOW_CORE_KERNEL_KERNEL_REGISTRATION_H_
 #define ONEFLOW_CORE_KERNEL_KERNEL_REGISTRATION_H_
 
+#include "oneflow/core/common/data_type.h"
 #include "oneflow/core/common/data_type.pb.h"
 #include "oneflow/core/common/device_type.pb.h"
-#include "oneflow/core/common/util.h"
 #include "oneflow/core/common/str_util.h"
-#include "oneflow/core/common/data_type.h"
-#include "oneflow/core/operator/op_conf_util.h"
+#include "oneflow/core/common/util.h"
+#include "oneflow/core/framework/to_string.h"
 #include "oneflow/core/kernel/kernel.pb.h"
+#include "oneflow/core/operator/op_conf_util.h"
 
 namespace oneflow {
 
@@ -79,20 +80,20 @@ Kernel* CreateKernel(const KernelConf& kernel_conf);
       kernel_registration::KernelRegistrarBuilder(op_type).SetCreateFn(             \
           []() { return new __VA_ARGS__(); })
 
-#define REGISTER_KERNEL_WITH_NOTHING(op_type, ...)                                   \
-  NEW_REGISTER_KERNEL(op_type, __VA_ARGS__).SetIsMatchedPred([](const KernelConf&) { \
-    return true;                                                                     \
+#define REGISTER_KERNEL_WITH_NOTHING(op_type, ...)                                           \
+  NEW_REGISTER_KERNEL(op_type, __VA_ARGS__).SetIsMatchedPred([](const KernelConf&) -> bool { \
+    return true;                                                                             \
   });
 
-#define REGISTER_KERNEL_WITH_DEVICE_AND_DTYPE(op_type, device, dtype, ...)                \
-  NEW_REGISTER_KERNEL(op_type, __VA_ARGS__).SetIsMatchedPred([](const KernelConf& conf) { \
-    return (device == conf.op_attribute().op_conf().device_type())                        \
-           && (GetDataType<dtype>::value == conf.data_type());                            \
+#define REGISTER_KERNEL_WITH_DEVICE_AND_DTYPE(op_type, device, dtype, ...)                        \
+  NEW_REGISTER_KERNEL(op_type, __VA_ARGS__).SetIsMatchedPred([](const KernelConf& conf) -> bool { \
+    return (ToString(device) == conf.op_attribute().op_conf().device_tag())                       \
+           && (GetDataType<dtype>::value == conf.data_type());                                    \
   });
 
-#define REGISTER_KERNEL_WITH_DEVICE(op_type, device, ...)                                 \
-  NEW_REGISTER_KERNEL(op_type, __VA_ARGS__).SetIsMatchedPred([](const KernelConf& conf) { \
-    return (device == conf.op_attribute().op_conf().device_type());                       \
+#define REGISTER_KERNEL_WITH_DEVICE(op_type, device, ...)                                         \
+  NEW_REGISTER_KERNEL(op_type, __VA_ARGS__).SetIsMatchedPred([](const KernelConf& conf) -> bool { \
+    return (ToString(device) == conf.op_attribute().op_conf().device_tag());                      \
   });
 
 #define REGISTER_KERNEL_HELPER_CPU_FLOATING(op_type, kernel)               \
diff --git a/oneflow/core/kernel/normal_model_update_kernel.h b/oneflow/core/kernel/normal_model_update_kernel.h
index 40b8f30a622a4db9665f540745ad70bbec1f3f79..d6108418f36c538050978d2a7cf8dc4e17f3267a 100644
--- a/oneflow/core/kernel/normal_model_update_kernel.h
+++ b/oneflow/core/kernel/normal_model_update_kernel.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef ONEFLOW_CORE_KERNEL_NORMAL_MODEL_UPDATE_KERNEL_H_
 #define ONEFLOW_CORE_KERNEL_NORMAL_MODEL_UPDATE_KERNEL_H_
 
+#include "oneflow/core/framework/to_string.h"
 #include "oneflow/core/kernel/kernel.h"
 
 namespace oneflow {
@@ -49,13 +50,14 @@ class NormalMdUpdateKernel : public KernelIf<device_type> {
 
 #define DECLARE_MDUPDT_KERNEL_CREATOR(x) Kernel* Create##x##MdUpdtKernel(const KernelConf&);
 
-#define DEFINE_MDUPDT_KERNEL_CREATOR(x)                                                  \
-  Kernel* Create##x##MdUpdtKernel(const KernelConf& kernel_conf) {                       \
-    static const HashMap<std::string, std::function<Kernel*()>> creators = {             \
-        OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_KERNEL_CREATOR_ENTRY, (x##MdUpdateKernel), \
-                                         DEVICE_TYPE_SEQ, FLOATING_DATA_TYPE_SEQ)};      \
-    return creators.at(GetHashKey(kernel_conf.op_attribute().op_conf().device_type(),    \
-                                  kernel_conf.data_type()))();                           \
+#define DEFINE_MDUPDT_KERNEL_CREATOR(x)                                                      \
+  Kernel* Create##x##MdUpdtKernel(const KernelConf& kernel_conf) {                           \
+    static const HashMap<std::string, std::function<Kernel*()>> creators = {                 \
+        OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_KERNEL_CREATOR_ENTRY, (x##MdUpdateKernel),     \
+                                         DEVICE_TYPE_SEQ, FLOATING_DATA_TYPE_SEQ)};          \
+    DeviceType device_type =                                                                 \
+        CHECK_JUST(DeviceType4DeviceTag(kernel_conf.op_attribute().op_conf().device_tag())); \
+    return creators.at(GetHashKey(device_type, kernel_conf.data_type()))();                  \
   }
 
 }  // namespace oneflow
diff --git a/oneflow/core/kernel/sigmoid_cross_entropy_kernel.cu b/oneflow/core/kernel/sigmoid_cross_entropy_kernel.cu
index a79a3c6f22a3f315e805c4f2a26c146f328cc1a0..98b07f23c19c39304016ee1feee3b5424de3157b 100644
--- a/oneflow/core/kernel/sigmoid_cross_entropy_kernel.cu
+++ b/oneflow/core/kernel/sigmoid_cross_entropy_kernel.cu
@@ -86,17 +86,17 @@ class SigmoidCrossEntropyGradGpuKernel final : public KernelIf<DeviceType::kGPU>
 #define REGISTER_SIGMOID_CROSS_ENTROPY_GPU_KERNEL(dtype, ltype)                                    \
   NEW_REGISTER_KERNEL(OperatorConf::kSigmoidCrossEntropyConf,                                      \
                       SigmoidCrossEntropyGpuKernel<dtype, ltype>)                                  \
-      .SetIsMatchedPred([](const KernelConf& conf) {                                               \
-        return ((conf.op_attribute().op_conf().device_type() == DeviceType::kGPU)                  \
+      .SetIsMatchedPred([](const KernelConf& conf) -> bool {                                       \
+        return ((conf.op_attribute().op_conf().device_tag() == "gpu")                              \
                 && (conf.data_type() == GetDataType<dtype>::value)                                 \
                 && (GetDataType<ltype>::value                                                      \
                     == conf.op_attribute().op_conf().sigmoid_cross_entropy_conf().label_type()));  \
       });                                                                                          \
   NEW_REGISTER_KERNEL(OperatorConf::kSigmoidCrossEntropyGradConf,                                  \
                       SigmoidCrossEntropyGradGpuKernel<dtype, ltype>)                              \
-      .SetIsMatchedPred([](const KernelConf& conf) {                                               \
+      .SetIsMatchedPred([](const KernelConf& conf) -> bool {                                       \
         return (                                                                                   \
-            (conf.op_attribute().op_conf().device_type() == DeviceType::kGPU)                      \
+            (conf.op_attribute().op_conf().device_tag() == "gpu")                                  \
             && (conf.data_type() == GetDataType<dtype>::value)                                     \
             && (GetDataType<ltype>::value                                                          \
                 == conf.op_attribute().op_conf().sigmoid_cross_entropy_grad_conf().label_type())); \
diff --git a/oneflow/core/kernel/sync_dynamic_resize_kernel.cpp b/oneflow/core/kernel/sync_dynamic_resize_kernel.cpp
index 0ae4b53521ead0e7277ff08b4af1389bd038ff1b..b568a28890fb5b618e712096b02af9874f6b9f22 100644
--- a/oneflow/core/kernel/sync_dynamic_resize_kernel.cpp
+++ b/oneflow/core/kernel/sync_dynamic_resize_kernel.cpp
@@ -13,15 +13,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
+#include "oneflow/core/common/util.h"
+#include "oneflow/core/device/cuda_util.h"
+#include "oneflow/core/framework/to_string.h"
+#include "oneflow/core/kernel/kernel.h"
+#include "oneflow/core/register/register_desc.h"
+
 #include <cstddef>
 #include <cstdint>
 #include <memory>
 #include <mutex>
 #include <queue>
-#include "oneflow/core/common/util.h"
-#include "oneflow/core/device/cuda_util.h"
-#include "oneflow/core/kernel/kernel.h"
-#include "oneflow/core/register/register_desc.h"
 
 namespace oneflow {
 
@@ -101,7 +103,7 @@ class SyncDynamicResizeGPUKernel final : public KernelIf<DeviceType::kGPU> {
 #define REGISTER_SYNC_DYNAMIC_RESIZE_GPU_KERNEL(stype)                                         \
   NEW_REGISTER_KERNEL(OperatorConf::kSyncDynamicResizeConf, SyncDynamicResizeGPUKernel<stype>) \
       .SetIsMatchedPred([](const KernelConf& kernel_conf) {                                    \
-        return (kernel_conf.op_attribute().op_conf().device_type() == DeviceType::kGPU         \
+        return (kernel_conf.op_attribute().op_conf().device_tag() == "gpu"                     \
                 && GetDataType<stype>::value                                                   \
                        == kernel_conf.sync_dynamic_resize_conf().size_data_type());            \
       })
@@ -139,7 +141,7 @@ class SyncDynamicResizeCPUKernel final : public KernelIf<DeviceType::kCPU> {
 #define REGISTER_SYNC_DYNAMIC_RESIZE_CPU_KERNEL(stype)                                         \
   NEW_REGISTER_KERNEL(OperatorConf::kSyncDynamicResizeConf, SyncDynamicResizeCPUKernel<stype>) \
       .SetIsMatchedPred([](const KernelConf& kernel_conf) {                                    \
-        return (kernel_conf.op_attribute().op_conf().device_type() == DeviceType::kCPU         \
+        return (kernel_conf.op_attribute().op_conf().device_tag() == "cpu"                     \
                 && GetDataType<stype>::value                                                   \
                        == kernel_conf.sync_dynamic_resize_conf().size_data_type());            \
       })
diff --git a/oneflow/core/kernel/tensor_buffer_to_tensor_list_kernel.cpp b/oneflow/core/kernel/tensor_buffer_to_tensor_list_kernel.cpp
index 7b065fb3819067bfe39cc51b6aa4bcfc3ed3bede..da16f4b655691d02da8e28f81589376231f09d86 100644
--- a/oneflow/core/kernel/tensor_buffer_to_tensor_list_kernel.cpp
+++ b/oneflow/core/kernel/tensor_buffer_to_tensor_list_kernel.cpp
@@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
+#include "oneflow/core/framework/to_string.h"
 #include "oneflow/core/kernel/kernel.h"
 #include "oneflow/core/kernel/kernel_util.h"
 #include "oneflow/core/common/tensor_buffer.h"
@@ -81,12 +82,12 @@ void TensorBufferToTensorListKernel<T>::ForwardDataContent(
   CHECK_EQ(out_blob->total_num_of_tensors(), in_blob->shape().elem_cnt());
 }
 
-#define REGISTER_TENSOR_BUFFER_TO_TENSOR_LIST_KERNEL(dtype)                      \
-  NEW_REGISTER_KERNEL(OperatorConf::kTensorBufferToTensorListConf,               \
-                      TensorBufferToTensorListKernel<dtype>)                     \
-      .SetIsMatchedPred([](const KernelConf& conf) {                             \
-        return (conf.op_attribute().op_conf().device_type() == DeviceType::kCPU) \
-               && (conf.data_type() == GetDataType<dtype>::value);               \
+#define REGISTER_TENSOR_BUFFER_TO_TENSOR_LIST_KERNEL(dtype)          \
+  NEW_REGISTER_KERNEL(OperatorConf::kTensorBufferToTensorListConf,   \
+                      TensorBufferToTensorListKernel<dtype>)         \
+      .SetIsMatchedPred([](const KernelConf& conf) {                 \
+        return (conf.op_attribute().op_conf().device_tag() == "cpu") \
+               && (conf.data_type() == GetDataType<dtype>::value);   \
       });
 
 REGISTER_TENSOR_BUFFER_TO_TENSOR_LIST_KERNEL(char)
diff --git a/oneflow/core/kernel/tensor_list_to_tensor_buffer_kernel.cpp b/oneflow/core/kernel/tensor_list_to_tensor_buffer_kernel.cpp
index 0b63ec43259f5a09d7cb99a54cbe99dc4a9f777c..b617ff4e1dcd7c76f7cc91c743796459b0cbcb7d 100644
--- a/oneflow/core/kernel/tensor_list_to_tensor_buffer_kernel.cpp
+++ b/oneflow/core/kernel/tensor_list_to_tensor_buffer_kernel.cpp
@@ -13,8 +13,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
-#include "oneflow/core/kernel/kernel.h"
 #include "oneflow/core/common/tensor_buffer.h"
+#include "oneflow/core/framework/to_string.h"
+#include "oneflow/core/kernel/kernel.h"
 
 namespace oneflow {
 
@@ -59,7 +60,7 @@ void TensorListToTensorBufferKernel::ForwardHeader(
 
 NEW_REGISTER_KERNEL(OperatorConf::kTensorListToTensorBufferConf, TensorListToTensorBufferKernel)
     .SetIsMatchedPred([](const KernelConf& conf) {
-      return (conf.op_attribute().op_conf().device_type() == DeviceType::kCPU)
+      return (conf.op_attribute().op_conf().device_tag() == "cpu")
              && (conf.data_type() == DataType::kTensorBuffer);
     });
 
diff --git a/oneflow/core/kernel/unique_with_counts_kernel.cpp b/oneflow/core/kernel/unique_with_counts_kernel.cpp
index a5173fb71d86619fe4c970b4983e9031fcabc94f..0af8a190d4335a91aabe884ae29ee9d11c531b9c 100644
--- a/oneflow/core/kernel/unique_with_counts_kernel.cpp
+++ b/oneflow/core/kernel/unique_with_counts_kernel.cpp
@@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
+#include "oneflow/core/framework/to_string.h"
 #include "oneflow/core/kernel/kernel.h"
 #include "oneflow/core/kernel/kernel_context.h"
 #include "oneflow/core/kernel/unique_kernel_util.h"
@@ -57,7 +58,7 @@ void UniqueWithCountsKernel<device_type, T, K>::ForwardDataContent(
                       UniqueWithCountsKernel<device_type_v, OF_PP_PAIR_FIRST(data_type_pair),  \
                                              OF_PP_PAIR_FIRST(indices_type_pair)>)             \
       .SetIsMatchedPred([](const KernelConf& kernel_conf) -> bool {                            \
-        return ((kernel_conf.op_attribute().op_conf().device_type() == device_type_v)          \
+        return ((kernel_conf.op_attribute().op_conf().device_tag() == ToString(device_type_v)) \
                 && ((OF_PP_PAIR_SECOND(data_type_pair)) == kernel_conf.data_type())            \
                 && (OF_PP_PAIR_SECOND(indices_type_pair)                                       \
                     == kernel_conf.unique_with_counts_conf().indices_data_type()));            \
diff --git a/oneflow/core/kernel/user_kernel.cpp b/oneflow/core/kernel/user_kernel.cpp
index f2e73da9bb57fdb99fa9955bd92f4efe536d47f0..ed9193c2cfebbbea3088013b0adde3e76f61ff3b 100644
--- a/oneflow/core/kernel/user_kernel.cpp
+++ b/oneflow/core/kernel/user_kernel.cpp
@@ -13,14 +13,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
-#include "oneflow/core/kernel/kernel.h"
-#include "oneflow/core/kernel/eager_kernel.h"
+#include "oneflow/core/framework/infer_util.h"
 #include "oneflow/core/framework/op_kernel.h"
 #include "oneflow/core/framework/op_kernel_infer_cache.h"
-#include "oneflow/core/framework/user_op_registry_manager.h"
 #include "oneflow/core/framework/tensor.h"
+#include "oneflow/core/framework/to_string.h"
 #include "oneflow/core/framework/user_op_conf.h"
-#include "oneflow/core/framework/infer_util.h"
+#include "oneflow/core/framework/user_op_registry_manager.h"
+#include "oneflow/core/kernel/eager_kernel.h"
 #include "oneflow/core/kernel/kernel.h"
 
 namespace oneflow {
@@ -61,8 +61,8 @@ class UserKernelBaseContext {
     };
     InitInOrOut(kernel_conf.op_attribute().op_conf().user_conf().input(), &inputs_);
     InitInOrOut(kernel_conf.op_attribute().op_conf().user_conf().output(), &outputs_);
-
-    device_type_ = kernel_conf.op_attribute().op_conf().device_type();
+    device_tag_ = kernel_conf.op_attribute().op_conf().device_tag();
+    device_type_ = CHECK_JUST(DeviceType4DeviceTag(device_tag_));
     parallel_ctx_ = kernel_conf.user_conf().parallel_ctx();
     for (const auto& pair : kernel_conf.user_conf().bn_in_op2blob_desc()) {
       arg2tensor_desc_.emplace(GenUnRepeatedBn(pair.first), user_op::TensorDesc(pair.second));
@@ -71,6 +71,7 @@ class UserKernelBaseContext {
   ~UserKernelBaseContext() = default;
 
   DeviceType device_type() const { return device_type_; }
+  const std::string& device_tag() const { return device_tag_; }
   const ParallelContext& parallel_ctx() const { return parallel_ctx_; }
   const JobDesc& job_desc() const { return job_desc_; }
   const user_op::TensorDesc* TensorDesc4ArgNameAndIndex(const std::string& arg_name,
@@ -87,6 +88,7 @@ class UserKernelBaseContext {
   ArgVec inputs_;
   ArgVec outputs_;
   DeviceType device_type_;
+  std::string device_tag_;
   ParallelContext parallel_ctx_;
   HashMap<std::pair<std::string, int32_t>, user_op::TensorDesc> arg2tensor_desc_;
   const JobDesc& job_desc_;
@@ -378,6 +380,7 @@ class UserKernelRegContext final : public user_op::KernelRegContext {
   ~UserKernelRegContext() = default;
 
   DeviceType device_type() const override { return base_ctx_.device_type(); }
+  const std::string& device_tag() const override { return base_ctx_.device_tag(); }
   const ParallelContext& parallel_ctx() const override { return base_ctx_.parallel_ctx(); }
   const user_op::TensorDesc* TensorDesc4ArgNameAndIndex(const std::string& arg_name,
                                                         int32_t index) const override {
diff --git a/oneflow/core/operator/op_conf.proto b/oneflow/core/operator/op_conf.proto
index d91b23df15c555fd33efdb8dda0080eec5f048be..a011031e674ae70cc00a5ba51524fd7690532991 100644
--- a/oneflow/core/operator/op_conf.proto
+++ b/oneflow/core/operator/op_conf.proto
@@ -976,7 +976,7 @@ message CastToStaticShapeOpConf {
 message OperatorConf {
   required string name = 1;
   optional bool trainable = 3 [default = true];
-  optional DeviceType device_type = 4 [default = kInvalidDevice];
+  optional string device_tag = 4 [default = "invalid_device"];
   optional bool enable_cudnn = 5;
   optional int64 cudnn_buf_limit_mbyte = 6;
   repeated string ctrl_in_op_name = 7;
diff --git a/oneflow/core/operator/operator.cpp b/oneflow/core/operator/operator.cpp
index 362668402bb0fb6777181f59741130b311aa8c04..4bff087e029346c9788709240c6cb3aed3f74154 100644
--- a/oneflow/core/operator/operator.cpp
+++ b/oneflow/core/operator/operator.cpp
@@ -13,14 +13,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
-#include "oneflow/core/operator/operator.h"
-#include "oneflow/core/graph/logical_node.h"
 #include "oneflow/core/common/balanced_splitter.h"
+#include "oneflow/core/eager/eager_symbol_storage.h"
+#include "oneflow/core/framework/to_string.h"
 #include "oneflow/core/framework/user_op_registry_manager.h"
-#include "oneflow/core/job/sbp_signature_builder.h"
+#include "oneflow/core/graph/logical_node.h"
 #include "oneflow/core/job/mirrored_sig_infer_hint.h"
-#include "oneflow/core/eager/eager_symbol_storage.h"
+#include "oneflow/core/job/sbp_signature_builder.h"
 #include "oneflow/core/job/scope.h"
+#include "oneflow/core/operator/operator.h"
 
 namespace oneflow {
 
@@ -39,7 +40,8 @@ DataType GetDataTypeFromBnInOpVec(
 std::shared_ptr<Operator> CheckAndConstructOp(const OperatorConf& op_conf,
                                               const JobDesc* job_desc) {
   Operator* rptr = NewObj<Operator>(op_conf.op_type_case(), op_conf);
-  if (IsCpuOnly(op_conf)) { CHECK_EQ(op_conf.device_type(), DeviceType::kCPU); }
+  DeviceType device_type = CHECK_JUST(DeviceType4DeviceTag(op_conf.device_tag()));
+  if (IsCpuOnly(op_conf)) { CHECK_EQ(device_type, DeviceType::kCPU); }
   rptr->Init(op_conf, job_desc);
   return std::shared_ptr<Operator>(rptr);
 }
@@ -72,6 +74,11 @@ LogicalBlobId* Operator::MutBnInOp2Lbi(const std::string& bn_in_op) {
   }
 }
 
+DeviceType Operator::device_type() const {
+  DeviceType device_type = CHECK_JUST(DeviceType4DeviceTag(op_attribute_.op_conf().device_tag()));
+  return device_type;
+}
+
 const std::string& Operator::SoleIbn() const {
   CHECK_EQ(input_bns().size(), 1);
   return input_bns().Get(0);
@@ -636,7 +643,7 @@ bool IsCpuOnly(const OperatorConf& op_conf) {
 std::shared_ptr<Operator> ConstructOp(const OperatorConf& op_conf, DeviceType device_type,
                                       const JobDesc* job_desc) {
   OperatorConf dev_op_conf = op_conf;
-  dev_op_conf.set_device_type(device_type);
+  dev_op_conf.set_device_tag(CHECK_JUST(DeviceTag4DeviceType(device_type)));
   return CheckAndConstructOp(dev_op_conf, job_desc);
 }
 
diff --git a/oneflow/core/operator/operator.h b/oneflow/core/operator/operator.h
index fcf198ad3d7c4ae2e7c206fd98a0d28ab4c5251f..6c8892edadc09c24189a065b2801f6b1df658003 100644
--- a/oneflow/core/operator/operator.h
+++ b/oneflow/core/operator/operator.h
@@ -59,7 +59,7 @@ class Operator {
 
   // Getters
   const std::string& op_name() const { return op_conf().name(); }
-  DeviceType device_type() const { return op_attribute_.op_conf().device_type(); }
+  DeviceType device_type() const;
   bool EnableCudnn() const { return op_conf().enable_cudnn(); }
   bool DevIsGpuAndEnableCudnn() const { return device_type() == DeviceType::kGPU && EnableCudnn(); }
   const OperatorConf& op_conf() const { return op_attribute_.op_conf(); }
diff --git a/oneflow/core/operator/user_op.cpp b/oneflow/core/operator/user_op.cpp
index a648c1ac8efce173fed955e95f47b842c53f5b6e..8858ae8d8e7c00f40ec5e2754fbecd862dac369f 100644
--- a/oneflow/core/operator/user_op.cpp
+++ b/oneflow/core/operator/user_op.cpp
@@ -13,12 +13,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
-#include "oneflow/core/operator/user_op.h"
-#include "oneflow/core/operator/user_op_util.h"
-#include "oneflow/core/framework/tensor_desc.h"
+#include "oneflow/core/framework/batch_axis_context.h"
 #include "oneflow/core/framework/infer_util.h"
 #include "oneflow/core/framework/sbp_context.h"
-#include "oneflow/core/framework/batch_axis_context.h"
+#include "oneflow/core/framework/tensor_desc.h"
+#include "oneflow/core/framework/to_string.h"
+#include "oneflow/core/operator/user_op.h"
+#include "oneflow/core/operator/user_op_util.h"
 
 namespace oneflow {
 
@@ -53,7 +54,8 @@ class UserOpKernelRegContext final : public user_op::KernelRegContext {
     const auto& op_conf = user_op->op_conf();
     CHECK(op_conf.has_user_conf());
 
-    device_type_ = op_conf.device_type();
+    device_tag_ = op_conf.device_tag();
+    device_type_ = CHECK_JUST(DeviceType4DeviceTag(device_tag_));
     parallel_ctx_ = parallel_ctx;
 
     auto InitInOrOut = [&](const PbMap<std::string, UserOpConf::ListString>& arg_map,
@@ -85,6 +87,7 @@ class UserOpKernelRegContext final : public user_op::KernelRegContext {
   ~UserOpKernelRegContext() = default;
 
   DeviceType device_type() const override { return device_type_; }
+  const std::string& device_tag() const override { return device_tag_; }
   const ParallelContext& parallel_ctx() const override { return *parallel_ctx_; }
   const user_op::TensorDesc* TensorDesc4ArgNameAndIndex(const std::string& arg_name,
                                                         int32_t index) const override {
@@ -99,6 +102,7 @@ class UserOpKernelRegContext final : public user_op::KernelRegContext {
   ArgVec inputs_;
   ArgVec outputs_;
   DeviceType device_type_;
+  std::string device_tag_;
   const ParallelContext* parallel_ctx_;
   HashMap<std::pair<std::string, int32_t>, user_op::TensorDesc> arg2tensor_desc_;
 };
diff --git a/oneflow/python/eager/boxing_util.py b/oneflow/python/eager/boxing_util.py
index bce9af4b9c682a7b9343e2e704d4565b365c8173..fb79e4e88cf5e86ff81f2bcc3cacfbd6995d00f0 100644
--- a/oneflow/python/eager/boxing_util.py
+++ b/oneflow/python/eager/boxing_util.py
@@ -505,6 +505,7 @@ def ConstructNaiveBoxingOpConf(
 ):
     op_conf = op_conf_pb.OperatorConf()
     op_conf.name = "undefined_boxing_op_name"
+    op_conf.device_tag = "cpu"
     op_conf.boxing_conf.lbi.op_name = "undefined_boxing_op_name"
     op_conf.boxing_conf.lbi.blob_name = "undefined_boxing_blob_name"
     op_conf.boxing_conf.in_num = in_parallel_num
@@ -623,7 +624,7 @@ def BuildCopyHdInstruction(builder, produced_blob_object, to_device_tag):
 def _MakeCopyHdOpConfAndRetLbi():
     op_conf = op_conf_pb.OperatorConf()
     op_conf.name = "copy_hd"
-    op_conf.device_type = c_api_util.DeviceType4DeviceTag("gpu")
+    op_conf.device_tag = "gpu"
     setattr(op_conf.copy_conf, "in", "%s/in" % op_conf.name)
     op_conf.copy_conf.out = "out"
     lbi = logical_blob_id_util.LogicalBlobId()
@@ -669,6 +670,8 @@ def _AssignOpConf():
     op_conf.name = "assign"
     op_conf.assign_conf.ref = "assign/ref"
     op_conf.assign_conf.value = "assign/value"
+    device_tag = oneflow.current_scope().device_parallel_desc_symbol.device_tag
+    op_conf.device_tag = device_tag
     return op_conf
 
 
@@ -722,7 +725,7 @@ def ReplaceDeviceTag(parallel_desc_symbol, device_tag, builder=None):
 
 def _GetEagerNcclAllReduce(parallel_conf, ibn2blob_object):
     op_conf = op_conf_pb.OperatorConf()
-    op_conf.device_type = c_api_util.DeviceType4DeviceTag("gpu")
+    op_conf.device_tag = "gpu"
     op_conf.name = "eager_nccl_all_reduce"
     op_conf.user_conf.op_type_name = "eager_nccl_all_reduce"
     op_conf.user_conf.input["in"].s.append("eager_nccl_all_reduce/in_0")
diff --git a/oneflow/python/eager/op_executor.py b/oneflow/python/eager/op_executor.py
index 751dd41cf389601f34446c4aba6f2e2e7ef2c64d..f2f757d74c6a543ae2ca881b3db3c90a84a3ec43 100644
--- a/oneflow/python/eager/op_executor.py
+++ b/oneflow/python/eager/op_executor.py
@@ -21,7 +21,6 @@ import oneflow.core.operator.op_conf_pb2 as op_conf_util
 import oneflow.python.eager.vm_util as vm_util
 import oneflow.python.eager.boxing_util as boxing_util
 import oneflow.python.eager.symbol_storage as symbol_storage
-import oneflow.python.framework.device_util as device_util
 import oneflow.python.framework.c_api_util as c_api_util
 import oneflow.python.framework.remote_blob as remote_blob_util
 import oneflow.python.framework.op_arg_util as op_arg_util
@@ -363,7 +362,7 @@ def _GenModelInitOpConfAndRetLbi(var_op_conf):
     variable_op_conf.CopyFrom(var_op_conf.variable_conf)
     op_conf = op_conf_util.OperatorConf()
     op_conf.name = "model_init"
-    op_conf.device_type = device_util.DeviceType4DeviceTag("cpu")
+    op_conf.device_tag = "cpu"
     op_conf.model_init_conf.out.append("out_0")
     op_conf.model_init_conf.variable_op_name.append(var_op_conf.name)
     op_conf.model_init_conf.original_variable_conf.append(variable_op_conf)
@@ -379,7 +378,7 @@ def _GenModelLoadOpConfAndRetLbi(var_op_conf, path_lbi):
 
     op_conf = op_conf_util.OperatorConf()
     op_conf.name = "model_load"
-    op_conf.device_type = device_util.DeviceType4DeviceTag("cpu")
+    op_conf.device_tag = "cpu"
     op_conf.model_load_conf.path = "{}/{}".format(path_lbi.op_name, path_lbi.blob_name)
     op_conf.model_load_conf.out.append("out_0")
     op_conf.model_load_conf.variable_op_name.append(var_op_conf.name)
@@ -394,7 +393,7 @@ def _GenModelLoadOpConfAndRetLbi(var_op_conf, path_lbi):
 def _GenModelIOPathInputOpConfAndRetLbi():
     op_conf = op_conf_util.OperatorConf()
     op_conf.name = "model_io_path_input"
-    op_conf.device_type = device_util.DeviceType4DeviceTag("cpu")
+    op_conf.device_tag = "cpu"
     op_conf.input_conf.out = "out"
 
     blob_conf = op_conf_util.InterfaceBlobConf()
@@ -413,7 +412,7 @@ def _GenModelIOPathInputOpConfAndRetLbi():
 def _GenModelSaveOpConf(var_blobs, path_lbi):
     op_conf = op_conf_util.OperatorConf()
     op_conf.name = "model_save"
-    op_conf.device_type = device_util.DeviceType4DeviceTag("cpu")
+    op_conf.device_tag = "cpu"
     op_conf.model_save_conf.path = "{}/{}".format(path_lbi.op_name, path_lbi.blob_name)
     for blob in var_blobs:
         getattr(op_conf.model_save_conf, "in").append(blob.logical_blob_name)
diff --git a/oneflow/python/framework/c_api_util.py b/oneflow/python/framework/c_api_util.py
index eccd887f92ff8782610ce63e34cc8305c59e2ee3..a03d0fb277cb0f250fc42767e739ffae2c72b1ba 100644
--- a/oneflow/python/framework/c_api_util.py
+++ b/oneflow/python/framework/c_api_util.py
@@ -554,15 +554,6 @@ def GetMachine2DeviceIdListOFRecordFromParallelConf(parallel_conf):
     return text_format.Parse(ofrecord, record_util.OFRecord())
 
 
-def DeviceType4DeviceTag(device_tag):
-    device_tag = str(device_tag)
-    device_type, error_str = oneflow_internal.DeviceType4DeviceTag(device_tag)
-    error = text_format.Parse(error_str, error_util.ErrorProto())
-    if error.HasField("error_type"):
-        raise JobBuildAndInferError(error)
-    return device_type
-
-
 def GetFunctionConfigDef():
     func_config_def, error_str = oneflow_internal.GetFunctionConfigDef()
     error = text_format.Parse(error_str, error_util.ErrorProto())
diff --git a/oneflow/python/framework/compile_context.py b/oneflow/python/framework/compile_context.py
index fbb49e5e8165e6cd1a76ee3d80e5bf1801ba3084..27e518c671bf3e3a7b484da31b3a5518fffffcd2 100644
--- a/oneflow/python/framework/compile_context.py
+++ b/oneflow/python/framework/compile_context.py
@@ -61,9 +61,9 @@ def CurJobAddConsistentOp(op_conf, scope_symbol=None):
     if scope_symbol is None:
         scope_symbol = oneflow.current_scope()
     op_conf.scope_symbol_id = scope_symbol.symbol_id
-    if not op_conf.HasField("device_type"):
+    if not op_conf.HasField("device_tag"):
         device_tag = scope_symbol.device_parallel_desc_symbol.device_tag
-        op_conf.device_type = c_api_util.DeviceType4DeviceTag(device_tag)
+        op_conf.device_tag = device_tag
     return c_api_util.CurJobBuildAndInferCtx_AddAndInferConsistentOp(op_conf)
 
 
@@ -72,7 +72,7 @@ def CurJobAddMirroredOp(op_conf, scope_symbol=None):
     if scope_symbol is None:
         scope_symbol = oneflow.current_scope()
     op_conf.scope_symbol_id = scope_symbol.symbol_id
-    if not op_conf.HasField("device_type"):
+    if not op_conf.HasField("device_tag"):
         device_tag = scope_symbol.device_parallel_desc_symbol.device_tag
-        op_conf.device_type = c_api_util.DeviceType4DeviceTag(device_tag)
+        op_conf.device_tag = device_tag
     return c_api_util.CurJobBuildAndInferCtx_AddAndInferMirroredOp(op_conf)
diff --git a/oneflow/python/framework/device_util.py b/oneflow/python/framework/device_util.py
deleted file mode 100644
index 65c4abd73ecf8e664f1570e7ceccc83c9ca8c85a..0000000000000000000000000000000000000000
--- a/oneflow/python/framework/device_util.py
+++ /dev/null
@@ -1,30 +0,0 @@
-"""
-Copyright 2020 The OneFlow Authors. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-from __future__ import absolute_import
-
-import oneflow.python.framework.c_api_util as c_api_util
-
-
-def DeviceType4DeviceTag(device_tag):
-    global _device_tag2device_type
-    if device_tag not in _device_tag2device_type:
-        _device_tag2device_type[device_tag] = c_api_util.DeviceType4DeviceTag(
-            device_tag
-        )
-    return _device_tag2device_type[device_tag]
-
-
-_device_tag2device_type = {}
diff --git a/oneflow/python/framework/placement_context.py b/oneflow/python/framework/placement_context.py
index 086e09d2ff52404a0492cabc37d38cf7b70299e9..5f170801070156536372f0eb3b3966ba8c77edd8 100644
--- a/oneflow/python/framework/placement_context.py
+++ b/oneflow/python/framework/placement_context.py
@@ -20,7 +20,6 @@ import re
 
 import oneflow.core.job.placement_pb2 as placement_pb
 import oneflow.python.framework.c_api_util as c_api_util
-import oneflow.python.framework.device_util as device_util
 import oneflow.python.framework.op_util as op_util
 import oneflow.python.framework.session_context as session_ctx
 import oneflow.python.framework.scope_util as scope_util
@@ -77,9 +76,6 @@ class PlacementScope(object):
             self.GetDeviceTag4OpConf(op_conf), self.machine_device_ids_
         )
 
-    def GetDeviceType4OpConf(self, op_conf):
-        return device_util.DeviceType4DeviceTag(self.GetDeviceTag4OpConf(op_conf))
-
     def GetDeviceTag4OpConf(self, op_conf):
         return self.default_device_tag
 
@@ -109,15 +105,6 @@ def PlacementScopeStackTop():
     return session_ctx.GetDefaultSession().placement_scope_stack[0]
 
 
-def CurPlacementGroupGetDeviceType(op_conf):
-    assert len(session_ctx.GetDefaultSession().placement_scope_stack) > 0
-    return (
-        session_ctx.GetDefaultSession()
-        .placement_scope_stack[0]
-        .GetDeviceType4OpConf(op_conf)
-    )
-
-
 def ParallelConf4OpConf(op_conf):
     assert len(session_ctx.GetDefaultSession().placement_scope_stack) > 0
     return (
diff --git a/oneflow/python/framework/placement_util.py b/oneflow/python/framework/placement_util.py
index 22b950dd04d3f1f071a6d976a3d43e7aab5d08b5..10f71a88cffc78129d904a85325c529753271767 100644
--- a/oneflow/python/framework/placement_util.py
+++ b/oneflow/python/framework/placement_util.py
@@ -66,6 +66,10 @@ def deprecated_placement(*args, **kwargs):
 def api_placement(
     device_tag: str, machine_device_ids: str
 ) -> placement_ctx.PlacementScope:
+    from oneflow.python.compatibility import with_cuda
+
+    if not with_cuda:
+        device_tag = "cpu"
     func = enable_if.unique([GetPlacementScope, GetNormalModePlacementScope])
     return func(device_tag, machine_device_ids)
 
diff --git a/oneflow/python/oneflow_internal.h b/oneflow/python/oneflow_internal.h
index b301fcff6f34f9d2655f33bc012845995582c08f..9331b77223cb749bb140be2bae1cd42bac34cfc3 100644
--- a/oneflow/python/oneflow_internal.h
+++ b/oneflow/python/oneflow_internal.h
@@ -108,12 +108,6 @@ void LaunchJob(const std::shared_ptr<oneflow::ForeignJobInstance>& cb, std::stri
   return oneflow::LaunchJob(cb).GetDataAndSerializedErrorProto(error_str);
 }
 
-long DeviceType4DeviceTag(const std::string& device_tag, std::string* error_str) {
-  return oneflow::GetDeviceType4DeviceTag(device_tag)
-      .GetDataAndSerializedErrorProto(error_str,
-                                      static_cast<long>(oneflow::DeviceType::kInvalidDevice));
-}
-
 std::string GetMachine2DeviceIdListOFRecordFromParallelConf(const std::string& parallel_conf,
                                                             std::string* error_str) {
   return oneflow::GetSerializedMachineId2DeviceIdListOFRecord(parallel_conf)
diff --git a/oneflow/python/oneflow_internal_helper.h b/oneflow/python/oneflow_internal_helper.h
index 7f97d97d10af77400621a5bfbbccdf575f9b14af..893c2315719bc2bd94d0192f7eea7866cbc78eb3 100644
--- a/oneflow/python/oneflow_internal_helper.h
+++ b/oneflow/python/oneflow_internal_helper.h
@@ -209,10 +209,6 @@ Maybe<void> LaunchJob(const std::shared_ptr<oneflow::ForeignJobInstance>& cb) {
   return Maybe<void>::Ok();
 }
 
-Maybe<long long> GetDeviceType4DeviceTag(const std::string& device_tag) {
-  return JUST(DeviceType4DeviceTag(device_tag));
-}
-
 Maybe<std::string> GetSerializedMachineId2DeviceIdListOFRecord(
     const std::string& parallel_conf_str) {
   ParallelConf parallel_conf;
diff --git a/oneflow/python/ops/__init__.py b/oneflow/python/ops/__init__.py
index acfbfea1721edc59b3c95c63c524fd4e4f6176ae..c958e19d6f9912b283bcc3928ad3adc484388ac4 100644
--- a/oneflow/python/ops/__init__.py
+++ b/oneflow/python/ops/__init__.py
@@ -95,7 +95,7 @@ def _GetReturnOpConfAndOutLbiAndScope(remote_blob, allow_cpu_return_op=True):
     setattr(op_conf.return_conf, "in", remote_blob.unique_name)
     op_conf.return_conf.out = "out"
     if allow_cpu_return_op:
-        op_conf.device_type = c_api_util.DeviceType4DeviceTag("cpu")
+        op_conf.device_tag = "cpu"
 
     lbi = logical_blob_id_util.LogicalBlobId()
     lbi.op_name = op_conf.name
diff --git a/oneflow/python/ops/user_op_builder.py b/oneflow/python/ops/user_op_builder.py
index 92e3da2c378755d0843d19afc0e74e4791719cc6..31ea5342f1f0b4c7288f2e235a721dad0949eae1 100644
--- a/oneflow/python/ops/user_op_builder.py
+++ b/oneflow/python/ops/user_op_builder.py
@@ -50,6 +50,8 @@ class UserOp(object):
         self.op_conf_.name = op_name
         if op_type_name is not None:
             self.op_conf_.user_conf.op_type_name = op_type_name
+        device_tag = oneflow.current_scope().device_parallel_desc_symbol.device_tag
+        self.op_conf_.device_tag = device_tag
         self.output_arg_key_list_ = []
 
     @property
diff --git a/oneflow/user/kernels/add_n_kernel.cpp b/oneflow/user/kernels/add_n_kernel.cpp
index c7d5d941e49ce1c3b27eb131894e30051f37d375..3e9ba9294cba21b2369dd9a2ecf0765808549290 100644
--- a/oneflow/user/kernels/add_n_kernel.cpp
+++ b/oneflow/user/kernels/add_n_kernel.cpp
@@ -57,7 +57,7 @@ class CpuAddNKernel : public user_op::OpKernel {
 #define REGISTER_CPU_ADDN_KERNEL(cpp_type, dtype)                                               \
   REGISTER_USER_KERNEL("add_n")                                                                 \
       .SetCreateFn<CpuAddNKernel<cpp_type>>()                                                   \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)                           \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")                                       \
                        & (user_op::HobDataType("in", 0) == dtype))                              \
       .SetInplaceProposalFn([](const user_op::InferContext&,                                    \
                                user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> { \
diff --git a/oneflow/user/kernels/add_n_kernel.cu b/oneflow/user/kernels/add_n_kernel.cu
index 41d2223df24ef75878d25b57318935aeac382b49..0ae89377395663898e3054493049c3f8e61daa3f 100644
--- a/oneflow/user/kernels/add_n_kernel.cu
+++ b/oneflow/user/kernels/add_n_kernel.cu
@@ -116,7 +116,7 @@ class GpuAddNKernel : public user_op::OpKernel {
 #define REGISTER_GPU_ADDN_KERNEL(cpp_type, dtype)                                               \
   REGISTER_USER_KERNEL("add_n")                                                                 \
       .SetCreateFn<GpuAddNKernel<cpp_type>>()                                                   \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                           \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                                       \
                        & (user_op::HobDataType("in", 0) == dtype))                              \
       .SetInplaceProposalFn([](const user_op::InferContext&,                                    \
                                user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> { \
@@ -189,7 +189,7 @@ class GpuAddNHalfKernel : public user_op::OpKernel {
 
 REGISTER_USER_KERNEL("add_n")
     .SetCreateFn<GpuAddNHalfKernel>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")
                      & (user_op::HobDataType("in", 0) == DataType::kFloat16))
     .SetInplaceProposalFn([](const user_op::InferContext&,
                              user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> {
diff --git a/oneflow/user/kernels/arg_sort_kernel.cpp b/oneflow/user/kernels/arg_sort_kernel.cpp
index 4bd36041fe6db85537ffc3f4a4296d178f9b78e8..91a6e728af8a449a41f5d5faac77d2ad2210ad74 100644
--- a/oneflow/user/kernels/arg_sort_kernel.cpp
+++ b/oneflow/user/kernels/arg_sort_kernel.cpp
@@ -59,10 +59,10 @@ class CpuArgSortKernel final : public user_op::OpKernel {
   bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
 };
 
-#define REGISTER_CPU_ARG_SORT_KERNEL(dtype)                           \
-  REGISTER_USER_KERNEL("arg_sort")                                    \
-      .SetCreateFn<CpuArgSortKernel<dtype>>()                         \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU) \
+#define REGISTER_CPU_ARG_SORT_KERNEL(dtype)               \
+  REGISTER_USER_KERNEL("arg_sort")                        \
+      .SetCreateFn<CpuArgSortKernel<dtype>>()             \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu") \
                        & (user_op::HobDataType("in", 0) == GetDataType<dtype>::value));
 
 REGISTER_CPU_ARG_SORT_KERNEL(float)
diff --git a/oneflow/user/kernels/arg_sort_kernel.cu b/oneflow/user/kernels/arg_sort_kernel.cu
index 5e4b67317d7ba860615a8378cbb99fc0c29d27a5..9e3a3090df153c2e018cad338e62c971d34d4bcf 100644
--- a/oneflow/user/kernels/arg_sort_kernel.cu
+++ b/oneflow/user/kernels/arg_sort_kernel.cu
@@ -108,7 +108,7 @@ class GpuArgSortKernel final : public user_op::OpKernel {
 #define REGISTER_GPU_ARG_SORT_KERNEL(dtype)                                                        \
   REGISTER_USER_KERNEL("arg_sort")                                                                 \
       .SetCreateFn<GpuArgSortKernel<dtype>>()                                                      \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                              \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                                          \
                        & (user_op::HobDataType("in", 0) == GetDataType<dtype>::value))             \
       .SetInferTmpSizeFn([](user_op::InferContext* ctx) {                                          \
         const Shape* in_shape = ctx->Shape4ArgNameAndIndex("in", 0);                               \
diff --git a/oneflow/user/kernels/argmax_kernel.cpp b/oneflow/user/kernels/argmax_kernel.cpp
index 38fad3cf152854db1cedb784552ce44621be45d6..24697c2a18988e82cb39e88686e99d690d972876 100644
--- a/oneflow/user/kernels/argmax_kernel.cpp
+++ b/oneflow/user/kernels/argmax_kernel.cpp
@@ -55,7 +55,7 @@ class CpuArgMaxKernel final : public user_op::OpKernel {
 
 #define REGISTER_CPU_ARGMAX_KERNEL(dtype)                                               \
   REGISTER_USER_KERNEL("argmax").SetCreateFn<CpuArgMaxKernel<dtype>>().SetIsMatchedHob( \
-      (user_op::HobDeviceType() == DeviceType::kCPU)                                    \
+      (user_op::HobDeviceTag() == "cpu")                                                \
       & (user_op::HobDataType("in", 0) == GetDataType<dtype>::value));
 
 REGISTER_CPU_ARGMAX_KERNEL(float)
diff --git a/oneflow/user/kernels/argmax_kernel.cu b/oneflow/user/kernels/argmax_kernel.cu
index a4bed87a594de36f1732ae8539f2850741524e07..15633d39760d5fba1cb1ebfa91793bcf8c91104c 100644
--- a/oneflow/user/kernels/argmax_kernel.cu
+++ b/oneflow/user/kernels/argmax_kernel.cu
@@ -148,7 +148,7 @@ class GpuArgMaxKernel final : public user_op::OpKernel {
 #define REGISTER_GPU_ARGMAX_KERNEL(dtype)                                                          \
   REGISTER_USER_KERNEL("argmax")                                                                   \
       .SetCreateFn<GpuArgMaxKernel<dtype>>()                                                       \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                              \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                                          \
                        & (user_op::HobDataType("in", 0) == GetDataType<dtype>::value))             \
       .SetInferTmpSizeFn([](user_op::InferContext* ctx) {                                          \
         const Shape* in_shape = ctx->Shape4ArgNameAndIndex("in", 0);                               \
diff --git a/oneflow/user/kernels/batch_gather_kernel.cpp b/oneflow/user/kernels/batch_gather_kernel.cpp
index ec2ed34c8db8b7d07cee04ecaded8a06eed7f347..df311c895bb673b9f84f36a8bf20bc60f651fede 100644
--- a/oneflow/user/kernels/batch_gather_kernel.cpp
+++ b/oneflow/user/kernels/batch_gather_kernel.cpp
@@ -46,7 +46,7 @@ class BatchGatherKernel final : public user_op::OpKernel {
       .SetCreateFn<BatchGatherKernel<device, OF_PP_PAIR_FIRST(out_dtype),    \
                                      OF_PP_PAIR_FIRST(indices_dtype)>>()     \
       .SetIsMatchedHob(                                                      \
-          (user_op::HobDeviceType() == device)                               \
+          (user_op::HobDeviceTag() == device)                                \
           & (user_op::HobDataType("out", 0) == OF_PP_PAIR_SECOND(out_dtype)) \
           & (user_op::HobDataType("indices", 0) == OF_PP_PAIR_SECOND(indices_dtype)));
 
diff --git a/oneflow/user/kernels/bernoulli_kernel.cpp b/oneflow/user/kernels/bernoulli_kernel.cpp
index c6b129f5255df396ebdbdd406be96f6fe6387de8..222aa058f939a1a76533468f61ca7effc8da34cd 100644
--- a/oneflow/user/kernels/bernoulli_kernel.cpp
+++ b/oneflow/user/kernels/bernoulli_kernel.cpp
@@ -55,7 +55,7 @@ class BernoulliKerenl final : public user_op::OpKernel {
   REGISTER_USER_KERNEL("bernoulli")                                                             \
       .SetCreateFn<                                                                             \
           BernoulliKerenl<OF_PP_PAIR_FIRST(in_dtype_pair), OF_PP_PAIR_FIRST(out_dtype_pair)>>() \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)                           \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")                                       \
                        & (user_op::HobDataType("in", 0) == OF_PP_PAIR_SECOND(in_dtype_pair))    \
                        & (user_op::HobDataType("out", 0) == OF_PP_PAIR_SECOND(out_dtype_pair)));
 
diff --git a/oneflow/user/kernels/bias_add_kernel.h b/oneflow/user/kernels/bias_add_kernel.h
index 25cee29ff5597fa5c1b63a3e182b41281bedf136..ad657720616072a5156ef976e219b4feb9f53373 100644
--- a/oneflow/user/kernels/bias_add_kernel.h
+++ b/oneflow/user/kernels/bias_add_kernel.h
@@ -59,7 +59,7 @@ class BiasAddUserKernel final : public user_op::OpKernel {
 #define REGISTER_BIAS_ADD_USER_KERNEL(op_device_type, dtype)                                    \
   REGISTER_USER_KERNEL("bias_add")                                                              \
       .SetCreateFn<BiasAddUserKernel<DeviceType::k##op_device_type, dtype>>()                   \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::k##op_device_type)              \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == DeviceType::k##op_device_type)               \
                        & (user_op::HobDataType("out", 0) == GetDataType<dtype>::value))         \
       .SetInplaceProposalFn([](const user_op::InferContext&,                                    \
                                user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> { \
diff --git a/oneflow/user/kernels/broadcast_div_grad_kernel.cpp b/oneflow/user/kernels/broadcast_div_grad_kernel.cpp
index e97b61f1450aafaa180b1c58298058df364baf53..dfbb58355258a65c5b31f51dae20597fe575b168 100644
--- a/oneflow/user/kernels/broadcast_div_grad_kernel.cpp
+++ b/oneflow/user/kernels/broadcast_div_grad_kernel.cpp
@@ -60,7 +60,7 @@ class BroadcastDivGradKernel final : public user_op::OpKernel {
 #define REGISTER_BROADCAST_DIV_GRAD_KERNEL(device, dtype_pair)                            \
   REGISTER_USER_KERNEL("broadcast_div_grad")                                              \
       .SetCreateFn<BroadcastDivGradKernel<device, OF_PP_PAIR_FIRST(dtype_pair)>>()        \
-      .SetIsMatchedHob((user_op::HobDeviceType() == device)                               \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == device)                                \
                        & (user_op::HobDataType("y", 0) == OF_PP_PAIR_SECOND(dtype_pair))) \
       .SetInferTmpSizeFn([](oneflow::user_op::InferContext* ctx) {                        \
         user_op::TensorDesc* z = ctx->TensorDesc4ArgNameAndIndex("z", 0);                 \
diff --git a/oneflow/user/kernels/broadcast_like_kernel.cpp b/oneflow/user/kernels/broadcast_like_kernel.cpp
index 539e3bae330fbdb2e0599b647035237db603cf02..95a14a84109a28067778bdc3de232831992bd4c2 100644
--- a/oneflow/user/kernels/broadcast_like_kernel.cpp
+++ b/oneflow/user/kernels/broadcast_like_kernel.cpp
@@ -44,10 +44,10 @@ class BroadcastLikeKernel final : public user_op::OpKernel {
 
 }  // namespace
 
-#define REGISTER_BROADCAST_LIKE_XPU_KERNEL(device, dtype)   \
-  REGISTER_USER_KERNEL("broadcast_like")                    \
-      .SetCreateFn<BroadcastLikeKernel<device, dtype>>()    \
-      .SetIsMatchedHob((user_op::HobDeviceType() == device) \
+#define REGISTER_BROADCAST_LIKE_XPU_KERNEL(device, dtype)  \
+  REGISTER_USER_KERNEL("broadcast_like")                   \
+      .SetCreateFn<BroadcastLikeKernel<device, dtype>>()   \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == device) \
                        & (user_op::HobDataType("y", 0) == GetDataType<dtype>::value));
 
 #ifdef WITH_CUDA
diff --git a/oneflow/user/kernels/cast_kernel.cpp b/oneflow/user/kernels/cast_kernel.cpp
index ba729c649be92b0e3117962ed121dcaf42cf2153..6e8e564daecd3467dce54dcfe482d3cbcfef4599 100644
--- a/oneflow/user/kernels/cast_kernel.cpp
+++ b/oneflow/user/kernels/cast_kernel.cpp
@@ -86,10 +86,10 @@ class CastKernel final : public OpKernel {
 
 #define REGISTER_CAST_KERNEL(device)                                              \
   REGISTER_USER_KERNEL("cast").SetCreateFn<CastKernel<device>>().SetIsMatchedHob( \
-      user_op::HobDeviceType() == device);                                        \
+      user_op::HobDeviceTag() == device);                                         \
   REGISTER_USER_KERNEL("cast_like")                                               \
       .SetCreateFn<CastKernel<device>>()                                          \
-      .SetIsMatchedHob(user_op::HobDeviceType() == device);
+      .SetIsMatchedHob(user_op::HobDeviceTag() == device);
 
 REGISTER_CAST_KERNEL(DeviceType::kCPU)
 REGISTER_CAST_KERNEL(DeviceType::kGPU)
diff --git a/oneflow/user/kernels/categorical_ordinal_encode_kernel.cpp b/oneflow/user/kernels/categorical_ordinal_encode_kernel.cpp
index 575cf06e5a2f02251e525d3e97400de959846ade..ae28e68b77797422364b78ead0ff9615e8c1523f 100644
--- a/oneflow/user/kernels/categorical_ordinal_encode_kernel.cpp
+++ b/oneflow/user/kernels/categorical_ordinal_encode_kernel.cpp
@@ -45,7 +45,7 @@ class CategoricalOrdinalEncodeKernel final : public user_op::OpKernel {
 #define REGISTER_CATEGORICAL_ORDINAL_ENCODE_KERNEL(device, proto_type, cpp_type) \
   REGISTER_USER_KERNEL("CategoricalOrdinalEncode")                               \
       .SetCreateFn<CategoricalOrdinalEncodeKernel<device, cpp_type>>()           \
-      .SetIsMatchedHob((user_op::HobDeviceType() == device)                      \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == device)                       \
                        & (user_op::HobDataType("in", 0) == proto_type));
 
 REGISTER_CATEGORICAL_ORDINAL_ENCODE_KERNEL(DeviceType::kCPU, DataType::kInt32, int32_t);
diff --git a/oneflow/user/kernels/clip_by_value_kernel.cpp b/oneflow/user/kernels/clip_by_value_kernel.cpp
index 969e26760636f4b6de80363680eaea7b68ffaaa5..62ff5d549365cd3651a92019a6f5e19955422c3e 100644
--- a/oneflow/user/kernels/clip_by_value_kernel.cpp
+++ b/oneflow/user/kernels/clip_by_value_kernel.cpp
@@ -190,7 +190,7 @@ class ClipByScalarMaxGradKernel final : public user_op::OpKernel {
 #define REGISTER_CLIP_KERNEL(op_type_name, kernel_name, device_type_v, dtype)                   \
   REGISTER_USER_KERNEL(#op_type_name)                                                           \
       .SetCreateFn<kernel_name##Kernel<device_type_v, dtype>>()                                 \
-      .SetIsMatchedHob((user_op::HobDeviceType() == device_type_v)                              \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == device_type_v)                               \
                        & (user_op::HobDataType("y", 0) == GetDataType<dtype>::value))           \
       .SetInplaceProposalFn([](const user_op::InferContext&,                                    \
                                user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> { \
@@ -201,7 +201,7 @@ class ClipByScalarMaxGradKernel final : public user_op::OpKernel {
 #define REGISTER_CLIP_GRAD_KERNEL(op_type_name, kernel_name, device_type_v, dtype)              \
   REGISTER_USER_KERNEL(#op_type_name)                                                           \
       .SetCreateFn<kernel_name##GradKernel<device_type_v, dtype>>()                             \
-      .SetIsMatchedHob((user_op::HobDeviceType() == device_type_v)                              \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == device_type_v)                               \
                        & (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value))          \
       .SetInplaceProposalFn([](const user_op::InferContext&,                                    \
                                user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> { \
diff --git a/oneflow/user/kernels/coco_reader_kernel.cpp b/oneflow/user/kernels/coco_reader_kernel.cpp
index 4e900533d21e8c27651eb7ee85c210c94c8c77d1..fc43d60c5cc7e631d51d624ea1935b83adf12cd7 100644
--- a/oneflow/user/kernels/coco_reader_kernel.cpp
+++ b/oneflow/user/kernels/coco_reader_kernel.cpp
@@ -54,7 +54,7 @@ class COCOReaderKernel final : public user_op::OpKernel {
 
 REGISTER_USER_KERNEL("COCOReader")
     .SetCreateFn<COCOReaderKernel>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")
                      & (user_op::HobDataType("image", 0) == DataType::kTensorBuffer)
                      & (user_op::HobDataType("image_id", 0) == DataType::kInt64)
                      & (user_op::HobDataType("image_size", 0) == DataType::kInt32)
diff --git a/oneflow/user/kernels/concat_kernel.cpp b/oneflow/user/kernels/concat_kernel.cpp
index a713b45bd0cbcdec04fbbfa28fa9f075cde9308f..3619dba8cbe9913613c0bf90862c3170081f8b78 100644
--- a/oneflow/user/kernels/concat_kernel.cpp
+++ b/oneflow/user/kernels/concat_kernel.cpp
@@ -78,7 +78,7 @@ class ConcatKernel final : public user_op::OpKernel {
 
 #define REGISTER_CONCAT_KERNEL(device, dtype)                                                \
   REGISTER_USER_KERNEL("concat").SetCreateFn<ConcatKernel<device, dtype>>().SetIsMatchedHob( \
-      (user_op::HobDeviceType() == device)                                                   \
+      (user_op::HobDeviceTag() == device)                                                    \
       & (user_op::HobDataType("out", 0) == GetDataType<dtype>::value));
 
 #define REGISTER_CONCAT_KERNEL_WITH_DEVICE(device) \
diff --git a/oneflow/user/kernels/constant_kernel.cpp b/oneflow/user/kernels/constant_kernel.cpp
index bb226e3f94b68be0b506a2e65d0020f9925de8f4..0a654bea3eff30758350ca79517a31a793c97632 100644
--- a/oneflow/user/kernels/constant_kernel.cpp
+++ b/oneflow/user/kernels/constant_kernel.cpp
@@ -60,10 +60,10 @@ class ConstantKernel final : public OpKernel {
   bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
 };
 
-#define REGISTER_CONSTANT_XPU_KERNEL(device, dtype)         \
-  REGISTER_USER_KERNEL("constant")                          \
-      .SetCreateFn<ConstantKernel<device, dtype>>()         \
-      .SetIsMatchedHob((user_op::HobDeviceType() == device) \
+#define REGISTER_CONSTANT_XPU_KERNEL(device, dtype)        \
+  REGISTER_USER_KERNEL("constant")                         \
+      .SetCreateFn<ConstantKernel<device, dtype>>()        \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == device) \
                        & (user_op::HobAttr<DataType>("dtype") == GetDataType<dtype>::value));
 
 #define REGISTER_CONSTANT_KERNEL(device, dtype_pair) \
diff --git a/oneflow/user/kernels/conv_cudnn_kernels.cpp b/oneflow/user/kernels/conv_cudnn_kernels.cpp
index 77f33fdf52b05f90aaaa6d0b1713caec37e1c1e9..31892511a8e344b07c87fe68383b808542207c1f 100644
--- a/oneflow/user/kernels/conv_cudnn_kernels.cpp
+++ b/oneflow/user/kernels/conv_cudnn_kernels.cpp
@@ -190,7 +190,7 @@ class ConvGpuKernel final : public user_op::OpKernel {
 #define REGISTER_CONV_KERNEL(op_name, dtype, ndims)                                    \
   REGISTER_USER_KERNEL(#op_name)                                                       \
       .SetCreateFn<ConvGpuKernel<dtype, ndims>>()                                      \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                              \
                        & (user_op::HobDataType("in", 0) == GetDataType<dtype>::value)) \
       .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t {                    \
         const JobDesc& job_desc = ctx->job_desc();                                     \
@@ -248,7 +248,7 @@ class ConvDataGradGpuKernel final : public user_op::OpKernel {
 #define REGISTER_CONV_DATA_GRAD_FLOATING_KERNEL(dtype)                                 \
   REGISTER_USER_KERNEL("conv_data_grad")                                               \
       .SetCreateFn<ConvDataGradGpuKernel<dtype>>()                                     \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                              \
                        & (user_op::HobDataType("dy", 0) == GetDataType<dtype>::value)) \
       .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t {                    \
         const JobDesc& job_desc = ctx->job_desc();                                     \
@@ -300,7 +300,7 @@ class ConvFilterGradGpuKernel final : public user_op::OpKernel {
 #define REGISTER_CONV_FILTER_GRAD_FLOATING_KERNEL(dtype)                               \
   REGISTER_USER_KERNEL("conv_filter_grad")                                             \
       .SetCreateFn<ConvFilterGradGpuKernel<dtype>>()                                   \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                              \
                        & (user_op::HobDataType("dy", 0) == GetDataType<dtype>::value)) \
       .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t {                    \
         const JobDesc& job_desc = ctx->job_desc();                                     \
@@ -371,10 +371,10 @@ class ConvBiasGradGpuKernel final : public user_op::OpKernel {
   }
 };
 
-#define REGISTER_CONV_BIAS_GRAD_FLOATING_KERNEL(dtype)                \
-  REGISTER_USER_KERNEL("conv_bias_grad")                              \
-      .SetCreateFn<ConvBiasGradGpuKernel<dtype>>()                    \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU) \
+#define REGISTER_CONV_BIAS_GRAD_FLOATING_KERNEL(dtype)    \
+  REGISTER_USER_KERNEL("conv_bias_grad")                  \
+      .SetCreateFn<ConvBiasGradGpuKernel<dtype>>()        \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu") \
                        & (user_op::HobDataType("dy", 0) == GetDataType<dtype>::value));
 
 REGISTER_CONV_BIAS_GRAD_FLOATING_KERNEL(float);
diff --git a/oneflow/user/kernels/conv_kernels.cpp b/oneflow/user/kernels/conv_kernels.cpp
index 5af68f5104f9ad8e0137eb95fe7eb99a54404ff8..922b7d0666594ecd9d50eac42ba7baa15c61c873 100644
--- a/oneflow/user/kernels/conv_kernels.cpp
+++ b/oneflow/user/kernels/conv_kernels.cpp
@@ -457,7 +457,7 @@ class ConvCpuKernel final : public user_op::OpKernel {
 #define REGISTER_CONV_KERNEL(op_name, dtype, ndims)                                         \
   REGISTER_USER_KERNEL(#op_name)                                                            \
       .SetCreateFn<ConvCpuKernel<dtype, ndims>>()                                           \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)                       \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")                                   \
                        & (user_op::HobAttr<int32_t>("groups") == 1)                         \
                        & (user_op::HobDataType("in", 0) == GetDataType<dtype>::value))      \
       .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t {                         \
@@ -536,7 +536,7 @@ class ConvDataGradCpuKernel final : public user_op::OpKernel {
 #define REGISTER_CONV_DATA_GRAD_KERNEL(op_name, dtype)                                     \
   REGISTER_USER_KERNEL(#op_name)                                                           \
       .SetCreateFn<ConvDataGradCpuKernel<dtype>>()                                         \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)                      \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")                                  \
                        & (user_op::HobAttr<int32_t>("groups") == 1)                        \
                        & (user_op::HobDataType("dy", 0) == GetDataType<dtype>::value))     \
       .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t {                        \
@@ -604,7 +604,7 @@ class ConvFilterGradCpuKernel final : public user_op::OpKernel {
 #define REGISTER_CONV_FILTER_GRAD_KERNEL(op_name, dtype)                                        \
   REGISTER_USER_KERNEL(#op_name)                                                                \
       .SetCreateFn<ConvFilterGradCpuKernel<dtype>>()                                            \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)                           \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")                                       \
                        & (user_op::HobAttr<int32_t>("groups") == 1)                             \
                        & (user_op::HobDataType("dy", 0) == GetDataType<dtype>::value))          \
       .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t {                             \
@@ -672,7 +672,7 @@ class ConvBiasGradCpuKernel final : public user_op::OpKernel {
 #define REGISTER_CONV_BIAS_GRAD_KERNEL(op_name, dtype)                                         \
   REGISTER_USER_KERNEL(#op_name)                                                               \
       .SetCreateFn<ConvBiasGradCpuKernel<dtype>>()                                             \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)                          \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")                                      \
                        & (user_op::HobDataType("dy", 0) == GetDataType<dtype>::value))         \
       .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t {                            \
         const auto& out_diff_shape = ctx->TensorDesc4ArgNameAndIndex("dy", 0)->shape();        \
diff --git a/oneflow/user/kernels/deconv_cudnn_kernel.cpp b/oneflow/user/kernels/deconv_cudnn_kernel.cpp
index fd24256cc67434f25f5886dedb7835dccf598a01..a6785e29eabd586d9f5fad761863f3871df38a76 100644
--- a/oneflow/user/kernels/deconv_cudnn_kernel.cpp
+++ b/oneflow/user/kernels/deconv_cudnn_kernel.cpp
@@ -131,7 +131,7 @@ class DeConvGpuKernel final : public user_op::OpKernel {
 #define REGISTER_DECONV_KERNEL(op_name, dtype, ndims)                                  \
   REGISTER_USER_KERNEL(#op_name)                                                       \
       .SetCreateFn<DeConvGpuKernel<dtype, ndims>>()                                    \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                              \
                        & (user_op::HobDataType("in", 0) == GetDataType<dtype>::value)) \
       .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t {                    \
         const JobDesc& job_desc = ctx->job_desc();                                     \
diff --git a/oneflow/user/kernels/dropout_kernel.cpp b/oneflow/user/kernels/dropout_kernel.cpp
index 4605e458df97c0be150052068833b239dece565c..2c7b6e6dd771f775a1d014b013188456b5526d9b 100644
--- a/oneflow/user/kernels/dropout_kernel.cpp
+++ b/oneflow/user/kernels/dropout_kernel.cpp
@@ -48,7 +48,7 @@ class DropoutKernelCPU final : public user_op::OpKernel {
 #define REGISTER_DROPOUT_KERNEL_CPU(dtype)                                                      \
   REGISTER_USER_KERNEL("dropout")                                                               \
       .SetCreateFn<DropoutKernelCPU<dtype>>()                                                   \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)                           \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")                                       \
                        & (user_op::HobDataType("out", 0) == GetDataType<dtype>::value))         \
       .SetInplaceProposalFn([](const user_op::InferContext&,                                    \
                                user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> { \
@@ -80,7 +80,7 @@ class DropoutGradKernelCPU final : public user_op::OpKernel {
 #define REGISTER_DROPOUT_GRAD_KERNEL_CPU(dtype)                                                 \
   REGISTER_USER_KERNEL("dropout_grad")                                                          \
       .SetCreateFn<DropoutGradKernelCPU<dtype>>()                                               \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)                           \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")                                       \
                        & (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value))          \
       .SetInplaceProposalFn([](const user_op::InferContext&,                                    \
                                user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> { \
@@ -121,7 +121,7 @@ class RandomMaskLikeKernel final : public user_op::OpKernel {
 #define REGISTER_RANDOM_MASK_LIKE_KERNEL(device)   \
   REGISTER_USER_KERNEL("random_mask_like")         \
       .SetCreateFn<RandomMaskLikeKernel<device>>() \
-      .SetIsMatchedHob(user_op::HobDeviceType() == device);
+      .SetIsMatchedHob(user_op::HobDeviceTag() == device);
 
 REGISTER_RANDOM_MASK_LIKE_KERNEL(DeviceType::kCPU)
 #ifdef WITH_CUDA
diff --git a/oneflow/user/kernels/dropout_kernel.cu b/oneflow/user/kernels/dropout_kernel.cu
index 283d5562e8bca8b6c7feb47a96e62d88ec52eab5..e1d217c9b9cf6f09dfd544a280aba9b512e1473b 100644
--- a/oneflow/user/kernels/dropout_kernel.cu
+++ b/oneflow/user/kernels/dropout_kernel.cu
@@ -85,7 +85,7 @@ class DropoutKernelGPU final : public user_op::OpKernel {
 #define REGISTER_DROPOUT_KERNEL_GPU(dtype)                                                      \
   REGISTER_USER_KERNEL("dropout")                                                               \
       .SetCreateFn<DropoutKernelGPU<dtype>>()                                                   \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                           \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                                       \
                        & (user_op::HobDataType("out", 0) == GetDataType<dtype>::value))         \
       .SetInplaceProposalFn([](const user_op::InferContext&,                                    \
                                user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> { \
@@ -118,7 +118,7 @@ class DropoutGradKernelGPU final : public user_op::OpKernel {
 #define REGISTER_DROPOUT_GRAD_KERNEL_GPU(dtype)                                                 \
   REGISTER_USER_KERNEL("dropout_grad")                                                          \
       .SetCreateFn<DropoutGradKernelGPU<dtype>>()                                               \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                           \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                                       \
                        & (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value))          \
       .SetInplaceProposalFn([](const user_op::InferContext&,                                    \
                                user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> { \
diff --git a/oneflow/user/kernels/eager_nccl_kernels.cu b/oneflow/user/kernels/eager_nccl_kernels.cu
index 7ffb19edd6546b36109df7adb51350d40c149c6a..c448f753c51ea6e11776a391324e4cc053db0c94 100644
--- a/oneflow/user/kernels/eager_nccl_kernels.cu
+++ b/oneflow/user/kernels/eager_nccl_kernels.cu
@@ -50,6 +50,6 @@ class EagerNcclAllReduceKernel final : public user_op::OpKernel {
 
 REGISTER_USER_KERNEL("eager_nccl_all_reduce")
     .SetCreateFn<EagerNcclAllReduceKernel>()
-    .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kGPU);
+    .SetIsMatchedHob(user_op::HobDeviceTag() == "gpu");
 
 }  // namespace oneflow
diff --git a/oneflow/user/kernels/expand_dims_kernel.cpp b/oneflow/user/kernels/expand_dims_kernel.cpp
index ffe221771af780f76efd4d147f8a5a4bed9bc35b..508b57dd75309ad3f5ab50666b96be518aa50e1f 100644
--- a/oneflow/user/kernels/expand_dims_kernel.cpp
+++ b/oneflow/user/kernels/expand_dims_kernel.cpp
@@ -21,7 +21,7 @@ namespace oneflow {
 #define REGISTER_EXPAND_DIMS_KERNEL(D)                                                          \
   REGISTER_USER_KERNEL("expand_dims")                                                           \
       .SetCreateFn<CopyDataContentKernel<DeviceType::D>>()                                      \
-      .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::D)                               \
+      .SetIsMatchedHob(user_op::HobDeviceTag() == DeviceType::D)                                \
       .SetInplaceProposalFn([](const user_op::InferContext&,                                    \
                                user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> { \
         OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, false));                      \
diff --git a/oneflow/user/kernels/gather_kernel.cpp b/oneflow/user/kernels/gather_kernel.cpp
index 4bea3f61352f8a3ff061b2f92ef5e7387c474b6f..ac141cbe54016d3e770f5a393c1d2d2c09dc269f 100644
--- a/oneflow/user/kernels/gather_kernel.cpp
+++ b/oneflow/user/kernels/gather_kernel.cpp
@@ -93,7 +93,7 @@ class GatherKernel final : public user_op::OpKernel {
   REGISTER_USER_KERNEL("gather")                                                             \
       .SetCreateFn<                                                                          \
           GatherKernel<device, OF_PP_PAIR_FIRST(in_type), OF_PP_PAIR_FIRST(indices_type)>>() \
-      .SetIsMatchedHob((user_op::HobDeviceType() == device)                                  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == device)                                   \
                        & (user_op::HobDataType("in", 0) == OF_PP_PAIR_SECOND(in_type))       \
                        & (user_op::HobDataType("indices", 0) == OF_PP_PAIR_SECOND(indices_type)));
 
diff --git a/oneflow/user/kernels/gelu_kernel.cpp b/oneflow/user/kernels/gelu_kernel.cpp
index b9c7efa81dd5add328e3218613141509edb4e1ca..ac387442dddf2a78876e5807cc739bac38012936 100644
--- a/oneflow/user/kernels/gelu_kernel.cpp
+++ b/oneflow/user/kernels/gelu_kernel.cpp
@@ -41,7 +41,7 @@ class CpuGeluKernel final : public user_op::OpKernel {
 
 #define REGISTER_CPU_GELU_KERNEL(dtype)                                             \
   REGISTER_USER_KERNEL("gelu").SetCreateFn<CpuGeluKernel<dtype>>().SetIsMatchedHob( \
-      (user_op::HobDeviceType() == DeviceType::kCPU)                                \
+      (user_op::HobDeviceTag() == "cpu")                                            \
       & (user_op::HobDataType("out", 0) == GetDataType<dtype>::value));
 
 REGISTER_CPU_GELU_KERNEL(float)
@@ -75,10 +75,10 @@ class CpuGeluGradKernel final : public user_op::OpKernel {
   bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
 };
 
-#define REGISTER_CPU_GELU_GRAD_KERNEL(dtype)                          \
-  REGISTER_USER_KERNEL("gelu_grad")                                   \
-      .SetCreateFn<CpuGeluGradKernel<dtype>>()                        \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU) \
+#define REGISTER_CPU_GELU_GRAD_KERNEL(dtype)              \
+  REGISTER_USER_KERNEL("gelu_grad")                       \
+      .SetCreateFn<CpuGeluGradKernel<dtype>>()            \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu") \
                        & (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));
 
 REGISTER_CPU_GELU_GRAD_KERNEL(float)
diff --git a/oneflow/user/kernels/gelu_kernel.cu b/oneflow/user/kernels/gelu_kernel.cu
index 3f88ba1de621e0261c286b54fa0944954474bdf9..a998044d724eeda311ce5fe954f2af1ca3474a90 100644
--- a/oneflow/user/kernels/gelu_kernel.cu
+++ b/oneflow/user/kernels/gelu_kernel.cu
@@ -123,7 +123,7 @@ class GpuGeluKernel<float16> final : public user_op::OpKernel {
 
 #define REGISTER_GPU_GELU_KERNEL(dtype)                                             \
   REGISTER_USER_KERNEL("gelu").SetCreateFn<GpuGeluKernel<dtype>>().SetIsMatchedHob( \
-      (user_op::HobDeviceType() == DeviceType::kGPU)                                \
+      (user_op::HobDeviceTag() == "gpu")                                            \
       & (user_op::HobDataType("out", 0) == GetDataType<dtype>::value));
 
 REGISTER_GPU_GELU_KERNEL(float)
@@ -174,10 +174,10 @@ class GpuGeluGradKernel<float16> final : public user_op::OpKernel {
   bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
 };
 
-#define REGISTER_GPU_GELU_GRAD_KERNEL(dtype)                          \
-  REGISTER_USER_KERNEL("gelu_grad")                                   \
-      .SetCreateFn<GpuGeluGradKernel<dtype>>()                        \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU) \
+#define REGISTER_GPU_GELU_GRAD_KERNEL(dtype)              \
+  REGISTER_USER_KERNEL("gelu_grad")                       \
+      .SetCreateFn<GpuGeluGradKernel<dtype>>()            \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu") \
                        & (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));
 
 REGISTER_GPU_GELU_GRAD_KERNEL(float)
diff --git a/oneflow/user/kernels/generate_random_batch_permutation_indices_kernel.cpp b/oneflow/user/kernels/generate_random_batch_permutation_indices_kernel.cpp
index bba320f106a658cd1a9f0604a27a9e8494191490..73ca49245ff62774ee448f2ccb2a5973bf018a42 100644
--- a/oneflow/user/kernels/generate_random_batch_permutation_indices_kernel.cpp
+++ b/oneflow/user/kernels/generate_random_batch_permutation_indices_kernel.cpp
@@ -44,6 +44,6 @@ class GenerateRandomBatchPermutationIndicesCPUKernel final : public user_op::OpK
 
 REGISTER_USER_KERNEL("generate_random_batch_permutation_indices")
     .SetCreateFn<GenerateRandomBatchPermutationIndicesCPUKernel>()
-    .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCPU);
+    .SetIsMatchedHob(user_op::HobDeviceTag() == "cpu");
 
 }  // namespace oneflow
diff --git a/oneflow/user/kernels/generate_random_batch_permutation_indices_kernel.cu b/oneflow/user/kernels/generate_random_batch_permutation_indices_kernel.cu
index 96398345d5d8ff5639ff27e46130b660eee8fa13..db063686d94724c02ba93c687333098f4d645440 100644
--- a/oneflow/user/kernels/generate_random_batch_permutation_indices_kernel.cu
+++ b/oneflow/user/kernels/generate_random_batch_permutation_indices_kernel.cu
@@ -115,7 +115,7 @@ class GenerateRandomBatchPermutationIndicesGPUKernel final : public user_op::OpK
 
 REGISTER_USER_KERNEL("generate_random_batch_permutation_indices")
     .SetCreateFn<GenerateRandomBatchPermutationIndicesGPUKernel>()
-    .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kGPU)
+    .SetIsMatchedHob(user_op::HobDeviceTag() == "gpu")
     .SetInferTmpSizeFn([](oneflow::user_op::InferContext* ctx) {
       const Shape* y_shape = ctx->Shape4ArgNameAndIndex("y", 0);
       const int32_t batch_size = y_shape->At(0);
diff --git a/oneflow/user/kernels/heap_selection_top_k_kernel.cu b/oneflow/user/kernels/heap_selection_top_k_kernel.cu
index 9e434dce1e035dbba14906dac742b8b30fccb51d..3e5df2bec786b588bfb464f0d4b8d8566592aa2e 100644
--- a/oneflow/user/kernels/heap_selection_top_k_kernel.cu
+++ b/oneflow/user/kernels/heap_selection_top_k_kernel.cu
@@ -216,7 +216,7 @@ class GpuHeapSelectionTopKKernel final : public user_op::OpKernel {
 
 #define REGISTER_GPU_HEAP_SELECTION_TOP_K_KERNEL(dtype)                                           \
   REGISTER_USER_KERNEL("top_k").SetCreateFn<GpuHeapSelectionTopKKernel<dtype>>().SetIsMatchedHob( \
-      (user_op::HobDeviceType() == DeviceType::kGPU) & (user_op::HobAttr<int32_t>("k") <= 128)    \
+      (user_op::HobDeviceTag() == "gpu") & (user_op::HobAttr<int32_t>("k") <= 128)                \
       & (user_op::HobDataType("in", 0) == GetDataType<dtype>::value));
 
 REGISTER_GPU_HEAP_SELECTION_TOP_K_KERNEL(float)
diff --git a/oneflow/user/kernels/identity_kernel.cpp b/oneflow/user/kernels/identity_kernel.cpp
index b7e5261ae8b4c07a095a2669eef1ae789ae6a23f..969360a33172442ff00fc25a4cb304115091542f 100644
--- a/oneflow/user/kernels/identity_kernel.cpp
+++ b/oneflow/user/kernels/identity_kernel.cpp
@@ -43,7 +43,7 @@ class IdentityKernel final : public user_op::OpKernel {
 #define REGISTER_IDENTITY_KERNEL(device)                                                        \
   REGISTER_USER_KERNEL("identity")                                                              \
       .SetCreateFn<IdentityKernel<device>>()                                                    \
-      .SetIsMatchedHob(user_op::HobDeviceType() == device)                                      \
+      .SetIsMatchedHob(user_op::HobDeviceTag() == device)                                       \
       .SetInplaceProposalFn([](const user_op::InferContext&,                                    \
                                user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> { \
         OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, false));                      \
diff --git a/oneflow/user/kernels/image_batch_align_kernel.cpp b/oneflow/user/kernels/image_batch_align_kernel.cpp
index f001a73ea829a752a38ceffd1aef7d5f63b1712f..735029f0537b06a1db5a9042970b7df25a571040 100644
--- a/oneflow/user/kernels/image_batch_align_kernel.cpp
+++ b/oneflow/user/kernels/image_batch_align_kernel.cpp
@@ -102,7 +102,7 @@ class ImageBatchAlignKernel final : public user_op::OpKernel {
 #define REGISTER_IMAGE_BATCH_ALIGN_KERNEL(dtype)                                    \
   REGISTER_USER_KERNEL("image_batch_align")                                         \
       .SetCreateFn<ImageBatchAlignKernel<dtype>>()                                  \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)               \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")                           \
                        & (user_op::HobDataType("in", 0) == DataType::kTensorBuffer) \
                        & (user_op::HobDataType("out", 0) == GetDataType<dtype>::value));
 
diff --git a/oneflow/user/kernels/image_decode_kernel.cpp b/oneflow/user/kernels/image_decode_kernel.cpp
index 312cf340944d9e17857233b95b3b7bc0ebd97aca..4627a9498559f6ac42381d3ea70eb238fa7b6386 100644
--- a/oneflow/user/kernels/image_decode_kernel.cpp
+++ b/oneflow/user/kernels/image_decode_kernel.cpp
@@ -86,7 +86,7 @@ class ImageDecodeKernel final : public user_op::OpKernel {
 
 REGISTER_USER_KERNEL("image_decode")
     .SetCreateFn<ImageDecodeKernel>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")
                      & (user_op::HobDataType("in", 0) == DataType::kTensorBuffer)
                      & (user_op::HobDataType("out", 0) == DataType::kTensorBuffer));
 ;
diff --git a/oneflow/user/kernels/image_object_preprocess_kernels.cpp b/oneflow/user/kernels/image_object_preprocess_kernels.cpp
index 0e3e2d0ffdbe89163841115b4d9f0efc62d14f83..216101cb39b56422961ea32f75cbecf08053d140 100644
--- a/oneflow/user/kernels/image_object_preprocess_kernels.cpp
+++ b/oneflow/user/kernels/image_object_preprocess_kernels.cpp
@@ -441,7 +441,7 @@ MakeInplaceProposalFn(const std::string& input_arg_name) {
 
 REGISTER_USER_KERNEL("image_flip")
     .SetCreateFn<ImageFlipKernel>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")
                      & (user_op::HobDataType("in", 0) == DataType::kTensorBuffer)
                      & (user_op::HobDataType("flip_code", 0) == DataType::kInt8)
                      & (user_op::HobDataType("out", 0) == DataType::kTensorBuffer))
@@ -449,7 +449,7 @@ REGISTER_USER_KERNEL("image_flip")
 
 REGISTER_USER_KERNEL("object_bbox_flip")
     .SetCreateFn<ObjectBboxFlipKernel>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")
                      & (user_op::HobDataType("bbox", 0) == DataType::kTensorBuffer)
                      & (user_op::HobDataType("image_size", 0) == DataType::kInt32)
                      & (user_op::HobDataType("flip_code", 0) == DataType::kInt8)
@@ -458,7 +458,7 @@ REGISTER_USER_KERNEL("object_bbox_flip")
 
 REGISTER_USER_KERNEL("object_bbox_scale")
     .SetCreateFn<ObjectBboxScaleKernel>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")
                      & (user_op::HobDataType("bbox", 0) == DataType::kTensorBuffer)
                      & (user_op::HobDataType("scale", 0) == DataType::kFloat)
                      & (user_op::HobDataType("out", 0) == DataType::kTensorBuffer))
@@ -466,7 +466,7 @@ REGISTER_USER_KERNEL("object_bbox_scale")
 
 REGISTER_USER_KERNEL("object_segmentation_polygon_flip")
     .SetCreateFn<ObjectSegmentationPolygonFlipKernel>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")
                      & (user_op::HobDataType("poly", 0) == DataType::kTensorBuffer)
                      & (user_op::HobDataType("image_size", 0) == DataType::kInt32)
                      & (user_op::HobDataType("flip_code", 0) == DataType::kInt8)
@@ -475,7 +475,7 @@ REGISTER_USER_KERNEL("object_segmentation_polygon_flip")
 
 REGISTER_USER_KERNEL("object_segmentation_polygon_scale")
     .SetCreateFn<ObjectSegmentationPolygonScaleKernel>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")
                      & (user_op::HobDataType("poly", 0) == DataType::kTensorBuffer)
                      & (user_op::HobDataType("scale", 0) == DataType::kFloat)
                      & (user_op::HobDataType("out", 0) == DataType::kTensorBuffer))
@@ -483,14 +483,14 @@ REGISTER_USER_KERNEL("object_segmentation_polygon_scale")
 
 REGISTER_USER_KERNEL("image_normalize")
     .SetCreateFn<ImageNormalize>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")
                      & (user_op::HobDataType("in", 0) == DataType::kTensorBuffer)
                      & (user_op::HobDataType("out", 0) == DataType::kTensorBuffer))
     .SetInplaceProposalFn(MakeInplaceProposalFn("in"));
 
 REGISTER_USER_KERNEL("object_segmentation_polygon_to_mask")
     .SetCreateFn<ObjectSegmentationPolygonToMask>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")
                      & (user_op::HobDataType("poly", 0) == DataType::kTensorBuffer)
                      & (user_op::HobDataType("poly_index", 0) == DataType::kTensorBuffer)
                      & (user_op::HobDataType("image_size", 0) == DataType::kInt32)
diff --git a/oneflow/user/kernels/image_preprocess_kernels.cpp b/oneflow/user/kernels/image_preprocess_kernels.cpp
index 27fc4969758543c718e29990d03d4e504c130af4..ca1c884993392f35d1ac27a4b38e196fa0e55424 100644
--- a/oneflow/user/kernels/image_preprocess_kernels.cpp
+++ b/oneflow/user/kernels/image_preprocess_kernels.cpp
@@ -85,7 +85,7 @@ class ResizeToStaticShapeKernel final : public user_op::OpKernel {
 
 REGISTER_USER_KERNEL("image_resize")
     .SetCreateFn<ResizeToStaticShapeKernel>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")
                      & (user_op::HobDataType("in", 0) == DataType::kTensorBuffer)
                      & (user_op::HobDataType("out", 0) == DataType::kUInt8));
 
@@ -137,7 +137,7 @@ class ResizeShorterToTensorBufferKernel final : public user_op::OpKernel {
 
 REGISTER_USER_KERNEL("image_resize")
     .SetCreateFn<ResizeShorterToTensorBufferKernel>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")
                      & (user_op::HobDataType("in", 0) == DataType::kTensorBuffer)
                      & (user_op::HobDataType("out", 0) == DataType::kTensorBuffer));
 
@@ -314,7 +314,7 @@ class CropMirrorNormalizeFromStaticShapeToFloatKernel final : public user_op::Op
 
 REGISTER_USER_KERNEL("crop_mirror_normalize_from_uint8")
     .SetCreateFn<CropMirrorNormalizeFromStaticShapeToFloatKernel>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")
                      & (user_op::HobDataType("in", 0) == DataType::kUInt8)
                      & (user_op::HobDataType("out", 0) == DataType::kFloat));
 
@@ -404,7 +404,7 @@ class CropMirrorNormalizeFromTensorBufferToFloatKernel final : public user_op::O
 
 REGISTER_USER_KERNEL("crop_mirror_normalize_from_tensorbuffer")
     .SetCreateFn<CropMirrorNormalizeFromTensorBufferToFloatKernel>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")
                      & (user_op::HobDataType("in", 0) == DataType::kTensorBuffer)
                      & (user_op::HobDataType("out", 0) == DataType::kFloat));
 
@@ -451,7 +451,7 @@ class CoinFlipKernel final : public user_op::OpKernel {
 
 REGISTER_USER_KERNEL("coin_flip")
     .SetCreateFn<CoinFlipKernel>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")
                      & (user_op::HobDataType("out", 0) == DataType::kInt8));
 
 namespace {
@@ -516,7 +516,7 @@ class ImageRandomCropKernel final : public user_op::OpKernel {
 
 REGISTER_USER_KERNEL("image_random_crop")
     .SetCreateFn<ImageRandomCropKernel>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")
                      & (user_op::HobDataType("in", 0) == DataType::kTensorBuffer)
                      & (user_op::HobDataType("out", 0) == DataType::kTensorBuffer));
 
diff --git a/oneflow/user/kernels/image_preprocess_kernels.cu b/oneflow/user/kernels/image_preprocess_kernels.cu
index 6b0f10a4727d98140fcc0b9a339f70bd9ec4f35e..47aed7af5d86c0584d6e89ad5dc34a155252a3c4 100644
--- a/oneflow/user/kernels/image_preprocess_kernels.cu
+++ b/oneflow/user/kernels/image_preprocess_kernels.cu
@@ -188,7 +188,7 @@ class CropMirrorNormalizeGpuKernel final : public user_op::OpKernel {
 
 REGISTER_USER_KERNEL("crop_mirror_normalize_from_uint8")
     .SetCreateFn<CropMirrorNormalizeGpuKernel>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")
                      & (user_op::HobDataType("in", 0) == DataType::kUInt8)
                      & (user_op::HobDataType("out", 0) == DataType::kFloat));
 
diff --git a/oneflow/user/kernels/image_target_resize_kernel.cpp b/oneflow/user/kernels/image_target_resize_kernel.cpp
index c9f40d621d8eb7dc9d367f11e034a43bec5a7d5f..38bba2e1ec37854ca080f61184fe2c4857e0d411 100644
--- a/oneflow/user/kernels/image_target_resize_kernel.cpp
+++ b/oneflow/user/kernels/image_target_resize_kernel.cpp
@@ -121,7 +121,7 @@ class ImageTargetResizeKernel final : public user_op::OpKernel {
 
 REGISTER_USER_KERNEL("image_target_resize")
     .SetCreateFn<ImageTargetResizeKernel>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")
                      & (user_op::HobDataType("in", 0) == DataType::kTensorBuffer)
                      & (user_op::HobDataType("out", 0) == DataType::kTensorBuffer)
                      & (user_op::HobDataType("size", 0) == DataType::kInt32)
diff --git a/oneflow/user/kernels/l2_normalize_kernel.cpp b/oneflow/user/kernels/l2_normalize_kernel.cpp
index 5ae86c0b1a589e76bb40a32970bbb0c39986c6e1..0cd819735bbd86fba4e8cf15b48a93c4422f5aad 100644
--- a/oneflow/user/kernels/l2_normalize_kernel.cpp
+++ b/oneflow/user/kernels/l2_normalize_kernel.cpp
@@ -91,10 +91,10 @@ class CpuL2NormalizeKernel final : public user_op::OpKernel {
   bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
 };
 
-#define REGISTER_CPU_L2_NORMALIZE_KERNEL(dtype)                       \
-  REGISTER_USER_KERNEL("l2_normalize")                                \
-      .SetCreateFn<CpuL2NormalizeKernel<dtype>>()                     \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU) \
+#define REGISTER_CPU_L2_NORMALIZE_KERNEL(dtype)           \
+  REGISTER_USER_KERNEL("l2_normalize")                    \
+      .SetCreateFn<CpuL2NormalizeKernel<dtype>>()         \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu") \
                        & (user_op::HobDataType("y", 0) == GetDataType<dtype>::value));
 
 REGISTER_CPU_L2_NORMALIZE_KERNEL(float)
@@ -122,10 +122,10 @@ class CpuL2NormalizeGradKernel final : public user_op::OpKernel {
   bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
 };
 
-#define REGISTER_CPU_L2_NORMALIZE_GRAD_KERNEL(dtype)                  \
-  REGISTER_USER_KERNEL("l2_normalize_grad")                           \
-      .SetCreateFn<CpuL2NormalizeGradKernel<dtype>>()                 \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU) \
+#define REGISTER_CPU_L2_NORMALIZE_GRAD_KERNEL(dtype)      \
+  REGISTER_USER_KERNEL("l2_normalize_grad")               \
+      .SetCreateFn<CpuL2NormalizeGradKernel<dtype>>()     \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu") \
                        & (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));
 
 REGISTER_CPU_L2_NORMALIZE_GRAD_KERNEL(float)
diff --git a/oneflow/user/kernels/l2_normalize_kernel.cu b/oneflow/user/kernels/l2_normalize_kernel.cu
index e3d11f3e585bc8f0425bc205f132563f22f9ea0c..6133bd53df9e973a356865215b993db4f8f01e1b 100644
--- a/oneflow/user/kernels/l2_normalize_kernel.cu
+++ b/oneflow/user/kernels/l2_normalize_kernel.cu
@@ -104,10 +104,10 @@ class GpuL2NormalizeKernel final : public user_op::OpKernel {
   bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
 };
 
-#define REGISTER_GPU_L2_NORMALIZE_KERNEL(dtype)                       \
-  REGISTER_USER_KERNEL("l2_normalize")                                \
-      .SetCreateFn<GpuL2NormalizeKernel<dtype>>()                     \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU) \
+#define REGISTER_GPU_L2_NORMALIZE_KERNEL(dtype)           \
+  REGISTER_USER_KERNEL("l2_normalize")                    \
+      .SetCreateFn<GpuL2NormalizeKernel<dtype>>()         \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu") \
                        & (user_op::HobDataType("y", 0) == GetDataType<dtype>::value));
 
 REGISTER_GPU_L2_NORMALIZE_KERNEL(float)
@@ -136,10 +136,10 @@ class GpuL2NormalizeGradKernel final : public user_op::OpKernel {
   bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
 };
 
-#define REGISTER_GPU_L2_NORMALIZE_GRAD_KERNEL(dtype)                  \
-  REGISTER_USER_KERNEL("l2_normalize_grad")                           \
-      .SetCreateFn<GpuL2NormalizeGradKernel<dtype>>()                 \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU) \
+#define REGISTER_GPU_L2_NORMALIZE_GRAD_KERNEL(dtype)      \
+  REGISTER_USER_KERNEL("l2_normalize_grad")               \
+      .SetCreateFn<GpuL2NormalizeGradKernel<dtype>>()     \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu") \
                        & (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));
 
 REGISTER_GPU_L2_NORMALIZE_GRAD_KERNEL(float)
diff --git a/oneflow/user/kernels/layer_norm_cpu_kernel.cpp b/oneflow/user/kernels/layer_norm_cpu_kernel.cpp
index 7b79ea77d65e22435e0b4ff2644b6d34b1d73dca..0000279202c079af5906bbf0babb3e4005388e22 100644
--- a/oneflow/user/kernels/layer_norm_cpu_kernel.cpp
+++ b/oneflow/user/kernels/layer_norm_cpu_kernel.cpp
@@ -28,10 +28,10 @@ class LayerNormCpuKernel final : public user_op::OpKernel {
   void Compute(user_op::KernelComputeContext* ctx) const override { TODO(); };
 };
 
-#define REGISTER_LAYER_NORM_CPU_KERNEL(dtype)                         \
-  REGISTER_USER_KERNEL("layer_norm")                                  \
-      .SetCreateFn<LayerNormCpuKernel<dtype>>()                       \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU) \
+#define REGISTER_LAYER_NORM_CPU_KERNEL(dtype)             \
+  REGISTER_USER_KERNEL("layer_norm")                      \
+      .SetCreateFn<LayerNormCpuKernel<dtype>>()           \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu") \
                        & (user_op::HobDataType("x", 0) == GetDataType<dtype>::value));
 
 REGISTER_LAYER_NORM_CPU_KERNEL(float)
@@ -48,10 +48,10 @@ class LayerNormGradCpuKernel final : public user_op::OpKernel {
   void Compute(user_op::KernelComputeContext* ctx) const override { TODO(); };
 };
 
-#define REGISTER_LAYER_NORM_GRAD_CPU_KERNEL(dtype)                    \
-  REGISTER_USER_KERNEL("layer_norm_grad")                             \
-      .SetCreateFn<LayerNormGradCpuKernel<dtype>>()                   \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU) \
+#define REGISTER_LAYER_NORM_GRAD_CPU_KERNEL(dtype)        \
+  REGISTER_USER_KERNEL("layer_norm_grad")                 \
+      .SetCreateFn<LayerNormGradCpuKernel<dtype>>()       \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu") \
                        & (user_op::HobDataType("dy", 0) == GetDataType<dtype>::value));
 
 REGISTER_LAYER_NORM_GRAD_CPU_KERNEL(float)
@@ -68,10 +68,10 @@ class LayerNormParamGradCpuKernel final : public user_op::OpKernel {
   void Compute(user_op::KernelComputeContext* ctx) const override { TODO(); };
 };
 
-#define REGISTER_LAYER_NORM_PARAM_GRAD_CPU_KERNEL(dtype)              \
-  REGISTER_USER_KERNEL("layer_norm_param_grad")                       \
-      .SetCreateFn<LayerNormParamGradCpuKernel<dtype>>()              \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU) \
+#define REGISTER_LAYER_NORM_PARAM_GRAD_CPU_KERNEL(dtype)  \
+  REGISTER_USER_KERNEL("layer_norm_param_grad")           \
+      .SetCreateFn<LayerNormParamGradCpuKernel<dtype>>()  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu") \
                        & (user_op::HobDataType("dy", 0) == GetDataType<dtype>::value));
 
 REGISTER_LAYER_NORM_PARAM_GRAD_CPU_KERNEL(float)
diff --git a/oneflow/user/kernels/layer_norm_gpu_kernel.cpp b/oneflow/user/kernels/layer_norm_gpu_kernel.cpp
index 42d472f7901e62e0e30080570812baedcab00107..71abf77556176d9b0b97927bbbfbfc2fb16c569f 100644
--- a/oneflow/user/kernels/layer_norm_gpu_kernel.cpp
+++ b/oneflow/user/kernels/layer_norm_gpu_kernel.cpp
@@ -121,7 +121,7 @@ class LayerNormGpuKernel final : public user_op::OpKernel {
 #define REGISTER_LAYER_NORM_GPU_KERNEL(dtype, bn_param_dtype)                         \
   REGISTER_USER_KERNEL("layer_norm")                                                  \
       .SetCreateFn<LayerNormGpuKernel<dtype, bn_param_dtype>>()                       \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                 \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                             \
                        & (user_op::HobDataType("x", 0) == GetDataType<dtype>::value)) \
       .SetInferTmpSizeFn([](oneflow::user_op::InferContext* ctx) {                    \
         user_op::TensorDesc* mean = ctx->TensorDesc4ArgNameAndIndex("mean", 0);       \
@@ -174,7 +174,7 @@ class LayerNormGradGpuKernel final : public user_op::OpKernel {
 #define REGISTER_LAYER_NORM_GRAD_GPU_KERNEL(dtype, bn_param_dtype)                     \
   REGISTER_USER_KERNEL("layer_norm_grad")                                              \
       .SetCreateFn<LayerNormGradGpuKernel<dtype, bn_param_dtype>>()                    \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                              \
                        & (user_op::HobDataType("dy", 0) == GetDataType<dtype>::value)) \
       .SetInferTmpSizeFn([](oneflow::user_op::InferContext* ctx) {                     \
         user_op::TensorDesc* mean = ctx->TensorDesc4ArgNameAndIndex("mean", 0);        \
@@ -243,10 +243,10 @@ class LayerNormParamGradGpuKernel final : public user_op::OpKernel {
   };
 };
 
-#define REGISTER_LAYER_NORM_PARAM_GRAD_GPU_KERNEL(dtype)              \
-  REGISTER_USER_KERNEL("layer_norm_param_grad")                       \
-      .SetCreateFn<LayerNormParamGradGpuKernel<dtype>>()              \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU) \
+#define REGISTER_LAYER_NORM_PARAM_GRAD_GPU_KERNEL(dtype)  \
+  REGISTER_USER_KERNEL("layer_norm_param_grad")           \
+      .SetCreateFn<LayerNormParamGradGpuKernel<dtype>>()  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu") \
                        & (user_op::HobDataType("dy", 0) == GetDataType<dtype>::value));
 
 REGISTER_LAYER_NORM_PARAM_GRAD_GPU_KERNEL(float)
diff --git a/oneflow/user/kernels/leaky_relu_kernel.cpp b/oneflow/user/kernels/leaky_relu_kernel.cpp
index 4bbf6f61a9045506118985abb0c54919a7916da4..18b971fd1d45e283951f3e170791c75fa11574d3 100644
--- a/oneflow/user/kernels/leaky_relu_kernel.cpp
+++ b/oneflow/user/kernels/leaky_relu_kernel.cpp
@@ -36,10 +36,10 @@ class CpuLeakyReluKernel final : public user_op::OpKernel {
   bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
 };
 
-#define REGISTER_CPU_LEAKY_RELU_KERNEL(dtype)                         \
-  REGISTER_USER_KERNEL("leaky_relu")                                  \
-      .SetCreateFn<CpuLeakyReluKernel<dtype>>()                       \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU) \
+#define REGISTER_CPU_LEAKY_RELU_KERNEL(dtype)             \
+  REGISTER_USER_KERNEL("leaky_relu")                      \
+      .SetCreateFn<CpuLeakyReluKernel<dtype>>()           \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu") \
                        & (user_op::HobDataType("y", 0) == GetDataType<dtype>::value));
 
 REGISTER_CPU_LEAKY_RELU_KERNEL(float)
@@ -66,10 +66,10 @@ class CpuLeakyReluGradKernel final : public user_op::OpKernel {
   bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
 };
 
-#define REGISTER_CPU_LEAKY_RELU_GRAD_KERNEL(dtype)                    \
-  REGISTER_USER_KERNEL("leaky_relu_grad")                             \
-      .SetCreateFn<CpuLeakyReluGradKernel<dtype>>()                   \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU) \
+#define REGISTER_CPU_LEAKY_RELU_GRAD_KERNEL(dtype)        \
+  REGISTER_USER_KERNEL("leaky_relu_grad")                 \
+      .SetCreateFn<CpuLeakyReluGradKernel<dtype>>()       \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu") \
                        & (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));
 
 REGISTER_CPU_LEAKY_RELU_GRAD_KERNEL(float)
diff --git a/oneflow/user/kernels/leaky_relu_kernel.cu b/oneflow/user/kernels/leaky_relu_kernel.cu
index fce60b156e63d840f039d0d435b5e7be51c88318..8ada7a0aa0474e1418eb77344a1fb5a79d2f34b1 100644
--- a/oneflow/user/kernels/leaky_relu_kernel.cu
+++ b/oneflow/user/kernels/leaky_relu_kernel.cu
@@ -50,10 +50,10 @@ class GpuLeakyReluKernel final : public user_op::OpKernel {
   bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
 };
 
-#define REGISTER_GPU_LEAKY_RELU_KERNEL(dtype)                         \
-  REGISTER_USER_KERNEL("leaky_relu")                                  \
-      .SetCreateFn<GpuLeakyReluKernel<dtype>>()                       \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU) \
+#define REGISTER_GPU_LEAKY_RELU_KERNEL(dtype)             \
+  REGISTER_USER_KERNEL("leaky_relu")                      \
+      .SetCreateFn<GpuLeakyReluKernel<dtype>>()           \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu") \
                        & (user_op::HobDataType("y", 0) == GetDataType<dtype>::value));
 
 REGISTER_GPU_LEAKY_RELU_KERNEL(float)
@@ -78,10 +78,10 @@ class GpuLeakyReluGradKernel final : public user_op::OpKernel {
   bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
 };
 
-#define REGISTER_GPU_LEAKY_RELU_GRAD_KERNEL(dtype)                    \
-  REGISTER_USER_KERNEL("leaky_relu_grad")                             \
-      .SetCreateFn<GpuLeakyReluGradKernel<dtype>>()                   \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU) \
+#define REGISTER_GPU_LEAKY_RELU_GRAD_KERNEL(dtype)        \
+  REGISTER_USER_KERNEL("leaky_relu_grad")                 \
+      .SetCreateFn<GpuLeakyReluGradKernel<dtype>>()       \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu") \
                        & (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));
 
 REGISTER_GPU_LEAKY_RELU_GRAD_KERNEL(float)
diff --git a/oneflow/user/kernels/math_binary_broadcast_kernels.cpp b/oneflow/user/kernels/math_binary_broadcast_kernels.cpp
index 068dbf9e2f5ab96ef2063976c13761d0af35285e..f6a320a3c0632a01e078fc9336fdc70aeacb1dec 100644
--- a/oneflow/user/kernels/math_binary_broadcast_kernels.cpp
+++ b/oneflow/user/kernels/math_binary_broadcast_kernels.cpp
@@ -52,7 +52,7 @@ class MathBinaryBroadcastKernel final : public user_op::OpKernel {
           device, OF_PP_PAIR_FIRST(data_type_pair), OF_PP_PAIR_FIRST(data_type_pair), \
           &NdarrayUtil<device, OF_PP_PAIR_FIRST(data_type_pair)>::OF_PP_CAT(          \
               Broadcast, OF_PP_PAIR_SECOND(math_type_pair))>>()                       \
-      .SetIsMatchedHob((user_op::HobDeviceType() == device)                           \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == device)                            \
                        & (user_op::HobDataType("z", 0) == OF_PP_PAIR_SECOND(data_type_pair)));
 
 OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MATH_BINARY_BROADCAST_KERNEL,
@@ -71,7 +71,7 @@ OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MATH_BINARY_BROADCAST_KERNEL,
           device, OF_PP_PAIR_FIRST(data_type_pair), int8_t,                                   \
           &NdarrayUtil<device, OF_PP_PAIR_FIRST(data_type_pair)>::OF_PP_CAT(                  \
               Broadcast, OF_PP_PAIR_SECOND(math_type_pair))>>()                               \
-      .SetIsMatchedHob((user_op::HobDeviceType() == device)                                   \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == device)                                    \
                        & (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(data_type_pair))  \
                        & (user_op::HobDataType("z", 0) == DataType::kInt8));
 
diff --git a/oneflow/user/kernels/math_binary_elementwise_kernel.cpp b/oneflow/user/kernels/math_binary_elementwise_kernel.cpp
index be0b74f7d8980f3c7997095c3bf7165d62213b4d..c615c3d661f4c10e4801324066010f92affb5b8d 100644
--- a/oneflow/user/kernels/math_binary_elementwise_kernel.cpp
+++ b/oneflow/user/kernels/math_binary_elementwise_kernel.cpp
@@ -92,20 +92,20 @@ class MathBinaryElementwiseYGradCpuKernel final : public user_op::OpKernel {
       .SetCreateFn<                                                                             \
           MathBinaryElementwiseCpuKernel<OF_PP_CAT(OF_PP_PAIR_SECOND(math_type_pair), Functor), \
                                          OF_PP_PAIR_FIRST(data_type_pair)>>()                   \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)                           \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")                                       \
                        & (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(data_type_pair)));  \
                                                                                                 \
   REGISTER_USER_KERNEL((std::string("") + OF_PP_PAIR_FIRST(math_type_pair) + "_x_grad"))        \
       .SetCreateFn<MathBinaryElementwiseXGradCpuKernel<                                         \
           OF_PP_CAT(OF_PP_PAIR_SECOND(math_type_pair), Functor),                                \
           OF_PP_PAIR_FIRST(data_type_pair)>>()                                                  \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)                           \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")                                       \
                        & (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(data_type_pair)));  \
   REGISTER_USER_KERNEL((std::string("") + OF_PP_PAIR_FIRST(math_type_pair) + "_y_grad"))        \
       .SetCreateFn<MathBinaryElementwiseYGradCpuKernel<                                         \
           OF_PP_CAT(OF_PP_PAIR_SECOND(math_type_pair), Functor),                                \
           OF_PP_PAIR_FIRST(data_type_pair)>>()                                                  \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)                           \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")                                       \
                        & (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(data_type_pair)));
 
 OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MATH_BINARY_ELEMENTWISE_CPU_KERNEL_AND_GRAD,
diff --git a/oneflow/user/kernels/math_binary_elementwise_kernel.cu b/oneflow/user/kernels/math_binary_elementwise_kernel.cu
index 7df6126b3994ec87c82daee74b07c706df9467ac..6dc1e8cea97737b20682e68c72edb45449d0afd8 100644
--- a/oneflow/user/kernels/math_binary_elementwise_kernel.cu
+++ b/oneflow/user/kernels/math_binary_elementwise_kernel.cu
@@ -108,20 +108,20 @@ class MathBinaryElementwiseYGradGpuKernel final : public user_op::OpKernel {
       .SetCreateFn<                                                                             \
           MathBinaryElementwiseGpuKernel<OF_PP_CAT(OF_PP_PAIR_SECOND(math_type_pair), Functor), \
                                          OF_PP_PAIR_FIRST(data_type_pair)>>()                   \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                           \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                                       \
                        & (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(data_type_pair)));  \
                                                                                                 \
   REGISTER_USER_KERNEL((std::string("") + OF_PP_PAIR_FIRST(math_type_pair) + "_x_grad"))        \
       .SetCreateFn<MathBinaryElementwiseXGradGpuKernel<                                         \
           OF_PP_CAT(OF_PP_PAIR_SECOND(math_type_pair), Functor),                                \
           OF_PP_PAIR_FIRST(data_type_pair)>>()                                                  \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                           \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                                       \
                        & (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(data_type_pair)));  \
   REGISTER_USER_KERNEL((std::string("") + OF_PP_PAIR_FIRST(math_type_pair) + "_y_grad"))        \
       .SetCreateFn<MathBinaryElementwiseYGradGpuKernel<                                         \
           OF_PP_CAT(OF_PP_PAIR_SECOND(math_type_pair), Functor),                                \
           OF_PP_PAIR_FIRST(data_type_pair)>>()                                                  \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                           \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                                       \
                        & (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(data_type_pair)));
 
 OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MATH_BINARY_ELEMENTWISE_GPU_KERNEL_AND_GRAD,
@@ -205,18 +205,18 @@ class MathBinaryElementwiseYGradGpuHalfKernel final : public user_op::OpKernel {
 #define REGISTER_MATH_BINARY_ELEMENTWISE_GPU_HALF_KERNEL_AND_GRAD(math_type_str, math_func_prefix) \
   REGISTER_USER_KERNEL(math_type_str)                                                              \
       .SetCreateFn<MathBinaryElementwiseGpuHalfKernel<OF_PP_CAT(math_func_prefix, Functor)>>()     \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                              \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                                          \
                        & (user_op::HobDataType("x", 0) == DataType::kFloat16));                    \
                                                                                                    \
   REGISTER_USER_KERNEL((std::string("") + math_type_str + "_x_grad"))                              \
       .SetCreateFn<                                                                                \
           MathBinaryElementwiseXGradGpuHalfKernel<OF_PP_CAT(math_func_prefix, Functor)>>()         \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                              \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                                          \
                        & (user_op::HobDataType("x", 0) == DataType::kFloat16));                    \
   REGISTER_USER_KERNEL((std::string("") + math_type_str + "_y_grad"))                              \
       .SetCreateFn<                                                                                \
           MathBinaryElementwiseYGradGpuHalfKernel<OF_PP_CAT(math_func_prefix, Functor)>>()         \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                              \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                                          \
                        & (user_op::HobDataType("x", 0) == DataType::kFloat16));
 
 OF_PP_FOR_EACH_TUPLE(REGISTER_MATH_BINARY_ELEMENTWISE_GPU_HALF_KERNEL_AND_GRAD,
diff --git a/oneflow/user/kernels/math_unary_elementwise_kernel.cpp b/oneflow/user/kernels/math_unary_elementwise_kernel.cpp
index 6bc379c6c44eeb1e6cb4414c48398f097eaca0cc..8c2a2dee289baa9ebd8e06042fb9c2f373c70e57 100644
--- a/oneflow/user/kernels/math_unary_elementwise_kernel.cpp
+++ b/oneflow/user/kernels/math_unary_elementwise_kernel.cpp
@@ -64,7 +64,7 @@ class MathUnaryElementwiseGradCpuKernel final : public user_op::OpKernel {
       .SetCreateFn<                                                                                \
           MathUnaryElementwiseCpuKernel<OF_PP_CAT(OF_PP_PAIR_SECOND(math_type_pair), Functor),     \
                                         OF_PP_PAIR_FIRST(data_type_pair)>>()                       \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)                              \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")                                          \
                        & (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(data_type_pair))       \
                        & (user_op::HobDataType("y", 0) == OF_PP_PAIR_SECOND(data_type_pair)));     \
                                                                                                    \
@@ -72,7 +72,7 @@ class MathUnaryElementwiseGradCpuKernel final : public user_op::OpKernel {
       .SetCreateFn<                                                                                \
           MathUnaryElementwiseGradCpuKernel<OF_PP_CAT(OF_PP_PAIR_SECOND(math_type_pair), Functor), \
                                             OF_PP_PAIR_FIRST(data_type_pair)>>()                   \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)                              \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")                                          \
                        & (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(data_type_pair)));
 
 OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MATH_UNARY_ELEMENTWISE_CPU_KERNEL_AND_GRAD,
diff --git a/oneflow/user/kernels/math_unary_elementwise_kernel.cu b/oneflow/user/kernels/math_unary_elementwise_kernel.cu
index 13ccafee4bf3f5e61cc463f6c423fece725d6fb3..32144bffe3279f749b2b0fbbb22fba4300853f64 100644
--- a/oneflow/user/kernels/math_unary_elementwise_kernel.cu
+++ b/oneflow/user/kernels/math_unary_elementwise_kernel.cu
@@ -82,7 +82,7 @@ class MathUnaryElementwiseGradGpuKernel final : public user_op::OpKernel {
       .SetCreateFn<                                                                                \
           MathUnaryElementwiseGpuKernel<OF_PP_CAT(OF_PP_PAIR_SECOND(math_type_pair), Functor),     \
                                         OF_PP_PAIR_FIRST(data_type_pair)>>()                       \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                              \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                                          \
                        & (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(data_type_pair))       \
                        & (user_op::HobDataType("y", 0) == OF_PP_PAIR_SECOND(data_type_pair)));     \
                                                                                                    \
@@ -90,7 +90,7 @@ class MathUnaryElementwiseGradGpuKernel final : public user_op::OpKernel {
       .SetCreateFn<                                                                                \
           MathUnaryElementwiseGradGpuKernel<OF_PP_CAT(OF_PP_PAIR_SECOND(math_type_pair), Functor), \
                                             OF_PP_PAIR_FIRST(data_type_pair)>>()                   \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                              \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                                          \
                        & (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(data_type_pair)));
 
 OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MATH_UNARY_ELEMENTWISE_GPU_KERNEL_AND_GRAD,
@@ -144,13 +144,13 @@ class MathUnaryElementwiseGradGpuHalfKernel final : public user_op::OpKernel {
 #define REGISTER_MATH_UNARY_ELEMENTWISE_GPU_HALF_KERNEL_AND_GRAD(math_type_str, math_func_prefix) \
   REGISTER_USER_KERNEL(math_type_str)                                                             \
       .SetCreateFn<MathUnaryElementwiseGpuHalfKernel<OF_PP_CAT(math_func_prefix, Functor)>>()     \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                             \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                                         \
                        & (user_op::HobDataType("x", 0) == DataType::kFloat16)                     \
                        & (user_op::HobDataType("y", 0) == DataType::kFloat16));                   \
                                                                                                   \
   REGISTER_USER_KERNEL((std::string("") + math_type_str + "_grad"))                               \
       .SetCreateFn<MathUnaryElementwiseGradGpuHalfKernel<OF_PP_CAT(math_func_prefix, Functor)>>() \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                             \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                                         \
                        & (user_op::HobDataType("x", 0) == DataType::kFloat16));
 
 OF_PP_FOR_EACH_TUPLE(REGISTER_MATH_UNARY_ELEMENTWISE_GPU_HALF_KERNEL_AND_GRAD,
diff --git a/oneflow/user/kernels/matmul_kernels.cpp b/oneflow/user/kernels/matmul_kernels.cpp
index f81b044961de7ee5b726626a8bdba04b6f0e81b5..669d6ae0ddf4049010d0b5b004cdadd65c9e5e4c 100644
--- a/oneflow/user/kernels/matmul_kernels.cpp
+++ b/oneflow/user/kernels/matmul_kernels.cpp
@@ -62,10 +62,10 @@ class MatmulFloatingKernel final : public user_op::OpKernel {
   }
 };
 
-#define REGISTER_MATMUL_KERNEL(device, dtype)               \
-  REGISTER_USER_KERNEL("matmul")                            \
-      .SetCreateFn<MatmulFloatingKernel<device, dtype>>()   \
-      .SetIsMatchedHob((user_op::HobDeviceType() == device) \
+#define REGISTER_MATMUL_KERNEL(device, dtype)              \
+  REGISTER_USER_KERNEL("matmul")                           \
+      .SetCreateFn<MatmulFloatingKernel<device, dtype>>()  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == device) \
                        & (user_op::HobDataType("a", 0) == GetDataType<dtype>::value));
 
 REGISTER_MATMUL_KERNEL(DeviceType::kCPU, float);
@@ -110,8 +110,7 @@ class MatmulGpuHalfKernel final : public user_op::OpKernel {
 
 #ifdef WITH_CUDA
 REGISTER_USER_KERNEL("matmul").SetCreateFn<MatmulGpuHalfKernel>().SetIsMatchedHob(
-    (user_op::HobDeviceType() == DeviceType::kGPU)
-    & (user_op::HobDataType("a", 0) == DataType::kFloat16));
+    (user_op::HobDeviceTag() == "gpu") & (user_op::HobDataType("a", 0) == DataType::kFloat16));
 #endif
 
 template<DeviceType device_type, typename T>
@@ -147,7 +146,7 @@ class BatchMatmulFloatingKernel final : public user_op::OpKernel {
 #define REGISTER_BATCH_MATMUL_KERNEL(device, dtype)                                   \
   REGISTER_USER_KERNEL("batch_matmul")                                                \
       .SetCreateFn<BatchMatmulFloatingKernel<device, dtype>>()                        \
-      .SetIsMatchedHob((user_op::HobDeviceType() == device)                           \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == device)                            \
                        & (user_op::HobDataType("a", 0) == GetDataType<dtype>::value)) \
       .SetInferTmpSizeFn([](user_op::InferContext* ctx) {                             \
         user_op::TensorDesc* a = ctx->TensorDesc4ArgNameAndIndex("a", 0);             \
@@ -203,7 +202,7 @@ class BatchMatmulGpuHalfKernel final : public user_op::OpKernel {
 
 REGISTER_USER_KERNEL("batch_matmul")
     .SetCreateFn<BatchMatmulGpuHalfKernel>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")
                      & (user_op::HobDataType("a", 0) == DataType::kFloat16))
     .SetInferTmpSizeFn([](user_op::InferContext* ctx) {
       user_op::TensorDesc* a = ctx->TensorDesc4ArgNameAndIndex("a", 0);
diff --git a/oneflow/user/kernels/multiply_kernel.cpp b/oneflow/user/kernels/multiply_kernel.cpp
index 392cc4dd3f3b6022e0ca7c294a9c3d64a5bc2ce6..435601096b2c17d85da24a46dbc49ac8f84d7a6d 100644
--- a/oneflow/user/kernels/multiply_kernel.cpp
+++ b/oneflow/user/kernels/multiply_kernel.cpp
@@ -46,7 +46,7 @@ class MultiplyKernel final : public user_op::OpKernel {
 #define REGISTER_MULTIPLY_KERNEL(device, dtype_pair)                                            \
   REGISTER_USER_KERNEL("multiply")                                                              \
       .SetCreateFn<MultiplyKernel<device, OF_PP_PAIR_FIRST(dtype_pair)>>()                      \
-      .SetIsMatchedHob((user_op::HobDeviceType() == device)                                     \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == device)                                      \
                        & (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(dtype_pair)))       \
       .SetInplaceProposalFn([](const user_op::InferContext&,                                    \
                                user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> { \
diff --git a/oneflow/user/kernels/nd_index_slice_kernels.h b/oneflow/user/kernels/nd_index_slice_kernels.h
index 77412f1abb7829e1a88b578546c1688ee83f1844..8c879d5a69018eedd31da2bfb1bb93c9fcb01c35 100644
--- a/oneflow/user/kernels/nd_index_slice_kernels.h
+++ b/oneflow/user/kernels/nd_index_slice_kernels.h
@@ -125,7 +125,7 @@ void TensorScatterNdAddKernel<device_type, T, I>::Compute(
   REGISTER_USER_KERNEL(#op_type_name)                                                              \
       .SetCreateFn<                                                                                \
           op##Kernel<device_type_v, OF_PP_PAIR_FIRST(dtype_pair), OF_PP_PAIR_FIRST(itype_pair)>>() \
-      .SetIsMatchedHob((user_op::HobDeviceType() == device_type_v)                                 \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == device_type_v)                                  \
                        & (user_op::HobDataType("indices", 0) == OF_PP_PAIR_SECOND(itype_pair))     \
                        & (user_op::HobDataType("out", 0) == OF_PP_PAIR_SECOND(dtype_pair)));
 
@@ -134,7 +134,7 @@ void TensorScatterNdAddKernel<device_type, T, I>::Compute(
   REGISTER_USER_KERNEL(#op_type_name)                                                           \
       .SetCreateFn<TensorScatterNd##opt##Kernel<device_type_v, OF_PP_PAIR_FIRST(dtype_pair),    \
                                                 OF_PP_PAIR_FIRST(itype_pair)>>()                \
-      .SetIsMatchedHob((user_op::HobDeviceType() == device_type_v)                              \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == device_type_v)                               \
                        & (user_op::HobDataType("indices", 0) == OF_PP_PAIR_SECOND(itype_pair))  \
                        & (user_op::HobDataType("out", 0) == OF_PP_PAIR_SECOND(dtype_pair)))     \
       .SetInplaceProposalFn([](const user_op::InferContext&,                                    \
diff --git a/oneflow/user/kernels/normalization_kernel.cpp b/oneflow/user/kernels/normalization_kernel.cpp
index e6301b1b9aac94a0a13104e3345f25e096b8994f..c7f06a24c80dca1e556d9e7f7667efc795027b09 100644
--- a/oneflow/user/kernels/normalization_kernel.cpp
+++ b/oneflow/user/kernels/normalization_kernel.cpp
@@ -185,7 +185,7 @@ class NormalizationInferenceKernel final : public user_op::OpKernel {
 #define REGISTER_BN_INFERENCE_KERNEL(dtype)                                          \
   REGISTER_USER_KERNEL("normalization")                                              \
       .SetCreateFn<NormalizationInferenceKernel<dtype>>()                            \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                            \
                        & (user_op::HobDataType("y", 0) == GetDataType<dtype>::value) \
                        & (user_op::HobAttr<bool>("training") == false));
 
@@ -353,7 +353,7 @@ class NormalizationGradUserKernel final : public user_op::OpKernel {
 #define REGISTER_BN_TRAIN_KERNEL(dtype)                                              \
   REGISTER_USER_KERNEL("normalization")                                              \
       .SetCreateFn<NormalizationTrainKernel<dtype>>()                                \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                            \
                        & (user_op::HobDataType("y", 0) == GetDataType<dtype>::value) \
                        & (user_op::HobAttr<bool>("training") == true))               \
       .SetInferTmpSizeFn(InferTrainTmpSize);
@@ -361,7 +361,7 @@ class NormalizationGradUserKernel final : public user_op::OpKernel {
 #define REGISTER_BN_GRAD_KERNEL(dtype)                                                 \
   REGISTER_USER_KERNEL("normalization_grad")                                           \
       .SetCreateFn<NormalizationGradUserKernel<dtype>>()                               \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                              \
                        & (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value)) \
       .SetInferTmpSizeFn(InferGradTmpSize);
 
diff --git a/oneflow/user/kernels/ofrecord_decoder_kernels.cpp b/oneflow/user/kernels/ofrecord_decoder_kernels.cpp
index b52235b49f2b11edb54f0472aac542716f11361b..b070e72f7078b35c1fe0e433f41c14e9eb87312c 100644
--- a/oneflow/user/kernels/ofrecord_decoder_kernels.cpp
+++ b/oneflow/user/kernels/ofrecord_decoder_kernels.cpp
@@ -106,7 +106,7 @@ class OFRecordRawDecoderKernel final : public user_op::OpKernel {
 #define REGISTER_RAW_DECODER_KERNEL(dtype)                                      \
   REGISTER_USER_KERNEL("ofrecord_raw_decoder")                                  \
       .SetCreateFn<OFRecordRawDecoderKernel<dtype>>()                           \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)           \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")                       \
                        & (user_op::HobDataType("in", 0) == DataType::kOFRecord) \
                        & (user_op::HobDataType("out", 0) == GetDataType<dtype>::value));
 
@@ -211,7 +211,7 @@ class OFRecordImageDecoderRandomCropKernel final : public user_op::OpKernel {
 
 REGISTER_USER_KERNEL("ofrecord_image_decoder_random_crop")
     .SetCreateFn<OFRecordImageDecoderRandomCropKernel>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")
                      & (user_op::HobDataType("in", 0) == DataType::kOFRecord)
                      & (user_op::HobDataType("out", 0) == DataType::kTensorBuffer));
 
@@ -243,7 +243,7 @@ class OFRecordImageDecoderKernel final : public user_op::OpKernel {
 
 REGISTER_USER_KERNEL("ofrecord_image_decoder")
     .SetCreateFn<OFRecordImageDecoderKernel>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")
                      & (user_op::HobDataType("in", 0) == DataType::kOFRecord)
                      & (user_op::HobDataType("out", 0) == DataType::kTensorBuffer));
 
diff --git a/oneflow/user/kernels/ofrecord_image_classification_reader_kernel.cpp b/oneflow/user/kernels/ofrecord_image_classification_reader_kernel.cpp
index 89723519ad29364d7ace277771ae579a1a6c36e9..17dd2d5fb2ec4941b5efa502648358c7e229ec89 100644
--- a/oneflow/user/kernels/ofrecord_image_classification_reader_kernel.cpp
+++ b/oneflow/user/kernels/ofrecord_image_classification_reader_kernel.cpp
@@ -55,7 +55,7 @@ class OFRecordImageClassificationReaderKernel final : public user_op::OpKernel {
 
 REGISTER_USER_KERNEL("ofrecord_image_classification_reader")
     .SetCreateFn<OFRecordImageClassificationReaderKernel>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")
                      & (user_op::HobDataType("image", 0) == DataType::kTensorBuffer)
                      & (user_op::HobDataType("label", 0) == DataType::kTensorBuffer));
 
diff --git a/oneflow/user/kernels/ofrecord_reader_kernel.cpp b/oneflow/user/kernels/ofrecord_reader_kernel.cpp
index c2d838b9ad6f73eea4e610bbbb4462b7f654f37b..5d329707d227859ca9d2f45ba422cb1927196fdc 100644
--- a/oneflow/user/kernels/ofrecord_reader_kernel.cpp
+++ b/oneflow/user/kernels/ofrecord_reader_kernel.cpp
@@ -54,7 +54,7 @@ class OFRecordReaderKernel final : public user_op::OpKernel {
 
 REGISTER_USER_KERNEL("OFRecordReader")
     .SetCreateFn<OFRecordReaderKernel>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")
                      & (user_op::HobDataType("out", 0) == DataType::kOFRecord));
 
 }  // namespace oneflow
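Throughout this patch the kernel registrations stop matching on the DeviceType enum and instead match on the device tag string ("cpu" / "gpu"). The stand-alone sketch below is only a simplified analogue of that mechanism, not the OneFlow implementation: FakeKernelCtx and TagGetter are invented names, and the point is merely that a named string getter plus an overloaded operator== yields a predicate over the registration context.

// Simplified analogue (not OneFlow's real types) of a string-valued matcher:
// a named getter plus operator== producing a predicate over the context.
#include <functional>
#include <iostream>
#include <string>

struct FakeKernelCtx {     // stand-in for the kernel registration context
  std::string device_tag;  // e.g. "cpu" or "gpu"
};

struct TagGetter {
  std::string debug_name;
  std::function<const std::string&(const FakeKernelCtx&)> get;
};

// "getter == literal" becomes a callable predicate over the context.
inline std::function<bool(const FakeKernelCtx&)> operator==(const TagGetter& getter,
                                                            const std::string& expected) {
  return [getter, expected](const FakeKernelCtx& ctx) { return getter.get(ctx) == expected; };
}

int main() {
  TagGetter device_tag{"device_tag", [](const FakeKernelCtx& ctx) -> const std::string& {
                         return ctx.device_tag;
                       }};
  auto is_gpu = (device_tag == "gpu");
  std::cout << std::boolalpha << is_gpu(FakeKernelCtx{"gpu"}) << "\n";  // prints true
  std::cout << std::boolalpha << is_gpu(FakeKernelCtx{"cpu"}) << "\n";  // prints false
  return 0;
}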
diff --git a/oneflow/user/kernels/one_hot_kernel.cpp b/oneflow/user/kernels/one_hot_kernel.cpp
index f04c07e6c1c3edc4a9478d24ce2fd39627e38a09..37eba3f18c226b8da6a09d0a88db7ad166bc467a 100644
--- a/oneflow/user/kernels/one_hot_kernel.cpp
+++ b/oneflow/user/kernels/one_hot_kernel.cpp
@@ -54,7 +54,7 @@ class CpuOneHotKernel final : public user_op::OpKernel {
 
 #define REGISTER_CPU_ONE_HOT_KERNEL(dtype, itype)                                               \
   REGISTER_USER_KERNEL("one_hot").SetCreateFn<CpuOneHotKernel<dtype, itype>>().SetIsMatchedHob( \
-      (user_op::HobDeviceType() == DeviceType::kCPU)                                            \
+      (user_op::HobDeviceTag() == "cpu")                                                        \
       & (user_op::HobDataType("indices", 0) == GetDataType<itype>::value)                       \
       & (user_op::HobDataType("out", 0) == GetDataType<dtype>::value));
 
diff --git a/oneflow/user/kernels/one_hot_kernel.cu b/oneflow/user/kernels/one_hot_kernel.cu
index 0221b2a97b5cdf9e3c69f247a41760f0d671f743..5fbe02c863ffad86362373c7e7b820a7a1d2f515 100644
--- a/oneflow/user/kernels/one_hot_kernel.cu
+++ b/oneflow/user/kernels/one_hot_kernel.cu
@@ -61,7 +61,7 @@ class GpuOneHotKernel final : public user_op::OpKernel {
 
 #define REGISTER_GPU_ONE_HOT_KERNEL(dtype, itype)                                               \
   REGISTER_USER_KERNEL("one_hot").SetCreateFn<GpuOneHotKernel<dtype, itype>>().SetIsMatchedHob( \
-      (user_op::HobDeviceType() == DeviceType::kGPU)                                            \
+      (user_op::HobDeviceTag() == "gpu")                                                        \
       & (user_op::HobDataType("indices", 0) == GetDataType<itype>::value)                       \
       & (user_op::HobDataType("out", 0) == GetDataType<dtype>::value));
 
diff --git a/oneflow/user/kernels/pad_kernel.cpp b/oneflow/user/kernels/pad_kernel.cpp
index 51fb2c32c8c1a3b9a4304f278d7c7e29d49f3516..f059c5101d98af7c185c260ec26bd081eacebcfc 100644
--- a/oneflow/user/kernels/pad_kernel.cpp
+++ b/oneflow/user/kernels/pad_kernel.cpp
@@ -108,7 +108,7 @@ class PadKernel final : public user_op::OpKernel {
 
 #define REGISTER_PAD_KERNEL(dev, dtype)                                             \
   REGISTER_USER_KERNEL("pad").SetCreateFn<PadKernel<dev, dtype>>().SetIsMatchedHob( \
-      (user_op::HobDeviceType() == dev)                                             \
+      (user_op::HobDeviceTag() == dev)                                              \
       & (user_op::HobDataType("y", 0) == GetDataType<dtype>::value));
 
 #ifdef WITH_CUDA
@@ -164,10 +164,10 @@ class PadGradKernel final : public user_op::OpKernel {
   bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
 };
 
-#define REGISTER_PAD_GRAD_KERNEL(dev, dtype)             \
-  REGISTER_USER_KERNEL("pad_grad")                       \
-      .SetCreateFn<PadGradKernel<dev, dtype>>()          \
-      .SetIsMatchedHob((user_op::HobDeviceType() == dev) \
+#define REGISTER_PAD_GRAD_KERNEL(dev, dtype)            \
+  REGISTER_USER_KERNEL("pad_grad")                      \
+      .SetCreateFn<PadGradKernel<dev, dtype>>()         \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == dev) \
                        & (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));
 
 #ifdef WITH_CUDA
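In macros such as REGISTER_PAD_KERNEL and REGISTER_PAD_GRAD_KERNEL the parameter dev is still a DeviceType value, and the tag matcher is compared against it directly; the same pattern recurs below with device, device_type_v, and DeviceType::D. Presumably the matcher therefore also accepts a DeviceType on the right-hand side and reduces it to its tag string, although this diff alone does not show that overload. The sketch below illustrates one way such an overload could look; DeviceType here is a local illustrative enum and DeviceTypeToTag an invented helper, not OneFlow's.

// Sketch only: a second operator== that lets the tag matcher be compared
// against a DeviceType value by converting the enum to its tag string first.
#include <functional>
#include <iostream>
#include <string>

enum class DeviceType { kCPU, kGPU };  // illustrative, not the real enum

inline std::string DeviceTypeToTag(DeviceType d) {  // invented helper
  return d == DeviceType::kCPU ? "cpu" : "gpu";
}

struct Ctx { std::string device_tag; };
using TagPredicate = std::function<bool(const Ctx&)>;

struct TagGetter {
  std::function<const std::string&(const Ctx&)> get;
};

inline TagPredicate operator==(const TagGetter& g, const std::string& expected) {
  return [g, expected](const Ctx& ctx) { return g.get(ctx) == expected; };
}

// The extra overload: an enum right-hand side is reduced to the string case.
inline TagPredicate operator==(const TagGetter& g, DeviceType d) {
  return g == DeviceTypeToTag(d);
}

int main() {
  TagGetter tag{[](const Ctx& c) -> const std::string& { return c.device_tag; }};
  std::cout << std::boolalpha << (tag == DeviceType::kGPU)(Ctx{"gpu"}) << "\n";  // true
  std::cout << std::boolalpha << (tag == "cpu")(Ctx{"gpu"}) << "\n";             // false
  return 0;
}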
diff --git a/oneflow/user/kernels/pool_cpu_kernel.cpp b/oneflow/user/kernels/pool_cpu_kernel.cpp
index 25776758668531ecac27e498930b627dfe87bd4b..34cbcf1635defecca3b0f3898839f65d96674891 100644
--- a/oneflow/user/kernels/pool_cpu_kernel.cpp
+++ b/oneflow/user/kernels/pool_cpu_kernel.cpp
@@ -580,51 +580,51 @@ class MaxPool3DGradCpuKernel final : public user_op::OpKernel {
 #define REGISTER_POOL_CPU_KERNEL(dtype)                                                \
   REGISTER_USER_KERNEL("avg_pool_1d")                                                  \
       .SetCreateFn<AvgPool1DCpuKernel<dtype>>()                                        \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)                  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")                              \
                        & (user_op::HobDataType("x", 0) == GetDataType<dtype>::value)); \
   REGISTER_USER_KERNEL("avg_pool_1d_grad")                                             \
       .SetCreateFn<AvgPool1DGradCpuKernel<dtype>>()                                    \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)                  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")                              \
                        & (user_op::HobDataType("x", 0) == GetDataType<dtype>::value)); \
   REGISTER_USER_KERNEL("avg_pool_2d")                                                  \
       .SetCreateFn<AvgPool2DCpuKernel<dtype>>()                                        \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)                  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")                              \
                        & (user_op::HobDataType("x", 0) == GetDataType<dtype>::value)); \
   REGISTER_USER_KERNEL("avg_pool_2d_grad")                                             \
       .SetCreateFn<AvgPool2DGradCpuKernel<dtype>>()                                    \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)                  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")                              \
                        & (user_op::HobDataType("x", 0) == GetDataType<dtype>::value)); \
   REGISTER_USER_KERNEL("avg_pool_3d")                                                  \
       .SetCreateFn<AvgPool3DCpuKernel<dtype>>()                                        \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)                  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")                              \
                        & (user_op::HobDataType("x", 0) == GetDataType<dtype>::value)); \
   REGISTER_USER_KERNEL("avg_pool_3d_grad")                                             \
       .SetCreateFn<AvgPool3DGradCpuKernel<dtype>>()                                    \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)                  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")                              \
                        & (user_op::HobDataType("x", 0) == GetDataType<dtype>::value)); \
   REGISTER_USER_KERNEL("max_pool_1d")                                                  \
       .SetCreateFn<MaxPool1DCpuKernel<dtype>>()                                        \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)                  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")                              \
                        & (user_op::HobDataType("x", 0) == GetDataType<dtype>::value)); \
   REGISTER_USER_KERNEL("max_pool_1d_grad")                                             \
       .SetCreateFn<MaxPool1DGradCpuKernel<dtype>>()                                    \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)                  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")                              \
                        & (user_op::HobDataType("x", 0) == GetDataType<dtype>::value)); \
   REGISTER_USER_KERNEL("max_pool_2d")                                                  \
       .SetCreateFn<MaxPool2DCpuKernel<dtype>>()                                        \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)                  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")                              \
                        & (user_op::HobDataType("x", 0) == GetDataType<dtype>::value)); \
   REGISTER_USER_KERNEL("max_pool_2d_grad")                                             \
       .SetCreateFn<MaxPool2DGradCpuKernel<dtype>>()                                    \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)                  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")                              \
                        & (user_op::HobDataType("x", 0) == GetDataType<dtype>::value)); \
   REGISTER_USER_KERNEL("max_pool_3d")                                                  \
       .SetCreateFn<MaxPool3DCpuKernel<dtype>>()                                        \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)                  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")                              \
                        & (user_op::HobDataType("x", 0) == GetDataType<dtype>::value)); \
   REGISTER_USER_KERNEL("max_pool_3d_grad")                                             \
       .SetCreateFn<MaxPool3DGradCpuKernel<dtype>>()                                    \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)                  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")                              \
                        & (user_op::HobDataType("x", 0) == GetDataType<dtype>::value));
 
 REGISTER_POOL_CPU_KERNEL(float)
diff --git a/oneflow/user/kernels/pool_gpu_kernel.cpp b/oneflow/user/kernels/pool_gpu_kernel.cpp
index 916d458c403938439ebc88ffac0253b339f0a929..3734d4befccf8cd0540b71de291be0d498bb1144 100644
--- a/oneflow/user/kernels/pool_gpu_kernel.cpp
+++ b/oneflow/user/kernels/pool_gpu_kernel.cpp
@@ -389,51 +389,51 @@ class MaxPool3DGradGpuKernel final : public user_op::OpKernel {
 #define REGISTER_POOL_GPU_KERNEL(dtype)                                                \
   REGISTER_USER_KERNEL("avg_pool_1d")                                                  \
       .SetCreateFn<AvgPool1DGpuKernel<dtype>>()                                        \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                              \
                        & (user_op::HobDataType("x", 0) == GetDataType<dtype>::value)); \
   REGISTER_USER_KERNEL("avg_pool_1d_grad")                                             \
       .SetCreateFn<AvgPool1DGradGpuKernel<dtype>>()                                    \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                              \
                        & (user_op::HobDataType("x", 0) == GetDataType<dtype>::value)); \
   REGISTER_USER_KERNEL("avg_pool_2d")                                                  \
       .SetCreateFn<AvgPool2DGpuKernel<dtype>>()                                        \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                              \
                        & (user_op::HobDataType("x", 0) == GetDataType<dtype>::value)); \
   REGISTER_USER_KERNEL("avg_pool_2d_grad")                                             \
       .SetCreateFn<AvgPool2DGradGpuKernel<dtype>>()                                    \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                              \
                        & (user_op::HobDataType("x", 0) == GetDataType<dtype>::value)); \
   REGISTER_USER_KERNEL("avg_pool_3d")                                                  \
       .SetCreateFn<AvgPool3DGpuKernel<dtype>>()                                        \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                              \
                        & (user_op::HobDataType("x", 0) == GetDataType<dtype>::value)); \
   REGISTER_USER_KERNEL("avg_pool_3d_grad")                                             \
       .SetCreateFn<AvgPool3DGradGpuKernel<dtype>>()                                    \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                              \
                        & (user_op::HobDataType("x", 0) == GetDataType<dtype>::value)); \
   REGISTER_USER_KERNEL("max_pool_1d")                                                  \
       .SetCreateFn<MaxPool1DGpuKernel<dtype>>()                                        \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                              \
                        & (user_op::HobDataType("x", 0) == GetDataType<dtype>::value)); \
   REGISTER_USER_KERNEL("max_pool_1d_grad")                                             \
       .SetCreateFn<MaxPool1DGradGpuKernel<dtype>>()                                    \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                              \
                        & (user_op::HobDataType("x", 0) == GetDataType<dtype>::value)); \
   REGISTER_USER_KERNEL("max_pool_2d")                                                  \
       .SetCreateFn<MaxPool2DGpuKernel<dtype>>()                                        \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                              \
                        & (user_op::HobDataType("x", 0) == GetDataType<dtype>::value)); \
   REGISTER_USER_KERNEL("max_pool_2d_grad")                                             \
       .SetCreateFn<MaxPool2DGradGpuKernel<dtype>>()                                    \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                              \
                        & (user_op::HobDataType("x", 0) == GetDataType<dtype>::value)); \
   REGISTER_USER_KERNEL("max_pool_3d")                                                  \
       .SetCreateFn<MaxPool3DGpuKernel<dtype>>()                                        \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                              \
                        & (user_op::HobDataType("x", 0) == GetDataType<dtype>::value)); \
   REGISTER_USER_KERNEL("max_pool_3d_grad")                                             \
       .SetCreateFn<MaxPool3DGradGpuKernel<dtype>>()                                    \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                              \
                        & (user_op::HobDataType("x", 0) == GetDataType<dtype>::value));
 
 REGISTER_POOL_GPU_KERNEL(float)
diff --git a/oneflow/user/kernels/prelu_kernel.cpp b/oneflow/user/kernels/prelu_kernel.cpp
index 89f4582446a5c90fcf79f981fd4d50fb6c2ad647..21092e6d5c5d9776eb03463d646d01bdc761ca27 100644
--- a/oneflow/user/kernels/prelu_kernel.cpp
+++ b/oneflow/user/kernels/prelu_kernel.cpp
@@ -49,7 +49,7 @@ class CpuPReluKernel final : public user_op::OpKernel {
 #define REGISTER_CPU_PRELU_KERNEL(dtype)                                              \
   REGISTER_USER_KERNEL("prelu")                                                       \
       .SetCreateFn<CpuPReluKernel<dtype>>()                                           \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)                 \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")                             \
                        & (user_op::HobDataType("y", 0) == GetDataType<dtype>::value)) \
       .SetInferTmpSizeFn([](user_op::InferContext* ctx) {                             \
         const Shape* in_shape = ctx->Shape4ArgNameAndIndex("x", 0);                   \
@@ -92,7 +92,7 @@ class CpuPReluXGradKernel final : public user_op::OpKernel {
 #define REGISTER_CPU_PRELU_X_GRAD_KERNEL(dtype)                                        \
   REGISTER_USER_KERNEL("prelu_x_grad")                                                 \
       .SetCreateFn<CpuPReluXGradKernel<dtype>>()                                       \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)                  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")                              \
                        & (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value)) \
       .SetInferTmpSizeFn([](user_op::InferContext* ctx) {                              \
         const Shape* in_shape = ctx->Shape4ArgNameAndIndex("x", 0);                    \
@@ -137,7 +137,7 @@ class CpuPReluAlphaGradKernel final : public user_op::OpKernel {
 #define REGISTER_CPU_PRELU_ALPHA_GRAD_KERNEL(dtype)                                            \
   REGISTER_USER_KERNEL("prelu_alpha_grad")                                                     \
       .SetCreateFn<CpuPReluAlphaGradKernel<dtype>>()                                           \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)                          \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")                                      \
                        & (user_op::HobDataType("alpha_diff", 0) == GetDataType<dtype>::value)) \
       .SetInferTmpSizeFn([](user_op::InferContext* ctx) {                                      \
         const Shape* in_shape = ctx->Shape4ArgNameAndIndex("x", 0);                            \
diff --git a/oneflow/user/kernels/prelu_kernel.cu b/oneflow/user/kernels/prelu_kernel.cu
index 9179a44611462ef5c22c2bdf67fedaa2580b8856..51a2a769a5e0a17b2963777258babbaa695f2f44 100644
--- a/oneflow/user/kernels/prelu_kernel.cu
+++ b/oneflow/user/kernels/prelu_kernel.cu
@@ -67,7 +67,7 @@ class GpuPReluKernel final : public user_op::OpKernel {
 #define REGISTER_GPU_PRELU_KERNEL(dtype)                                              \
   REGISTER_USER_KERNEL("prelu")                                                       \
       .SetCreateFn<GpuPReluKernel<dtype>>()                                           \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                 \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                             \
                        & (user_op::HobDataType("y", 0) == GetDataType<dtype>::value)) \
       .SetInferTmpSizeFn([](user_op::InferContext* ctx) {                             \
         const Shape* in_shape = ctx->Shape4ArgNameAndIndex("x", 0);                   \
@@ -106,7 +106,7 @@ class GpuPReluXGradKernel final : public user_op::OpKernel {
 #define REGISTER_GPU_PRELU_X_GRAD_KERNEL(dtype)                                        \
   REGISTER_USER_KERNEL("prelu_x_grad")                                                 \
       .SetCreateFn<GpuPReluXGradKernel<dtype>>()                                       \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                              \
                        & (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value)) \
       .SetInferTmpSizeFn([](user_op::InferContext* ctx) {                              \
         const Shape* in_shape = ctx->Shape4ArgNameAndIndex("x", 0);                    \
@@ -148,7 +148,7 @@ class GpuPReluAlphaGradKernel final : public user_op::OpKernel {
 #define REGISTER_GPU_PRELU_ALPHA_GRAD_KERNEL(dtype)                                            \
   REGISTER_USER_KERNEL("prelu_alpha_grad")                                                     \
       .SetCreateFn<GpuPReluAlphaGradKernel<dtype>>()                                           \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                          \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                                      \
                        & (user_op::HobDataType("alpha_diff", 0) == GetDataType<dtype>::value)) \
       .SetInferTmpSizeFn([](user_op::InferContext* ctx) {                                      \
         const Shape* in_shape = ctx->Shape4ArgNameAndIndex("x", 0);                            \
diff --git a/oneflow/user/kernels/radix_sort_top_k_kernel.cu b/oneflow/user/kernels/radix_sort_top_k_kernel.cu
index 9daa98fbeb696e0e4be95ef2517f2935451fa439..bb50baa3b9ba11aeca6170a4d2abace3211fbc02 100644
--- a/oneflow/user/kernels/radix_sort_top_k_kernel.cu
+++ b/oneflow/user/kernels/radix_sort_top_k_kernel.cu
@@ -107,30 +107,29 @@ class GpuRadixSortTopKKernel final : public user_op::OpKernel {
   bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
 };
 
-#define REGISTER_GPU_RADIX_SORT_TOP_K_KERNEL(dtype)                                              \
-  REGISTER_USER_KERNEL("top_k")                                                                  \
-      .SetCreateFn<GpuRadixSortTopKKernel<dtype>>()                                              \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                            \
-                       & (user_op::HobAttr<int32_t>("k") > 128)                                  \
-                       & (user_op::HobDataType("in", 0) == GetDataType<dtype>::value))           \
-      .SetInferTmpSizeFn([](user_op::InferContext* ctx) {                                        \
-        const Shape* in_shape = ctx->Shape4ArgNameAndIndex("in", 0);                             \
-        const int32_t elem_cnt = in_shape->elem_cnt();                                           \
-        const int32_t instance_size = in_shape->dim_vec().back();                                \
-        const int32_t instance_num = elem_cnt / instance_size;                                   \
-                                                                                                 \
-        /* Sorted In*/                                                                           \
-        const int32_t sorted_in_aligned_bytes = GetCudaAlignedSize(elem_cnt * sizeof(dtype));    \
-        /* Indices */                                                                            \
-        const int32_t indices_aligned_bytes = GetCudaAlignedSize(elem_cnt * sizeof(int32_t));    \
-        /* Sorted Indices */                                                                     \
-        const int32_t sorted_indices_aligned_bytes = indices_aligned_bytes;                      \
-        /* CUB Temp Storage */                                                                   \
-        int32_t temp_storage_bytes =                                                             \
-            InferTempStorageForSortPairsDescending<dtype, int32_t>(instance_num, instance_size); \
-                                                                                                 \
-        return sorted_in_aligned_bytes + indices_aligned_bytes + sorted_indices_aligned_bytes    \
-               + temp_storage_bytes;                                                             \
+#define REGISTER_GPU_RADIX_SORT_TOP_K_KERNEL(dtype)                                                \
+  REGISTER_USER_KERNEL("top_k")                                                                    \
+      .SetCreateFn<GpuRadixSortTopKKernel<dtype>>()                                                \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu") & (user_op::HobAttr<int32_t>("k") > 128) \
+                       & (user_op::HobDataType("in", 0) == GetDataType<dtype>::value))             \
+      .SetInferTmpSizeFn([](user_op::InferContext* ctx) {                                          \
+        const Shape* in_shape = ctx->Shape4ArgNameAndIndex("in", 0);                               \
+        const int32_t elem_cnt = in_shape->elem_cnt();                                             \
+        const int32_t instance_size = in_shape->dim_vec().back();                                  \
+        const int32_t instance_num = elem_cnt / instance_size;                                     \
+                                                                                                   \
+        /* Sorted In*/                                                                             \
+        const int32_t sorted_in_aligned_bytes = GetCudaAlignedSize(elem_cnt * sizeof(dtype));      \
+        /* Indices */                                                                              \
+        const int32_t indices_aligned_bytes = GetCudaAlignedSize(elem_cnt * sizeof(int32_t));      \
+        /* Sorted Indices */                                                                       \
+        const int32_t sorted_indices_aligned_bytes = indices_aligned_bytes;                        \
+        /* CUB Temp Storage */                                                                     \
+        int32_t temp_storage_bytes =                                                               \
+            InferTempStorageForSortPairsDescending<dtype, int32_t>(instance_num, instance_size);   \
+                                                                                                   \
+        return sorted_in_aligned_bytes + indices_aligned_bytes + sorted_indices_aligned_bytes      \
+               + temp_storage_bytes;                                                               \
       });
 
 REGISTER_GPU_RADIX_SORT_TOP_K_KERNEL(float)
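The top_k registration above combines three conditions (device tag, the k attribute, and the input data type) with '&'. As a rough illustration of that composition, again a sketch with invented names rather than the actual hob machinery, '&' over predicate objects can simply produce a new predicate that requires both operands to hold:

// Sketch only: '&' composing two match conditions into one, as in
// SetIsMatchedHob((... == "gpu") & (... > 128) & (... == dtype)).
#include <functional>
#include <iostream>
#include <string>

struct Ctx {                // illustrative stand-in for the registration context
  std::string device_tag;
  int k_attr;
};

struct Predicate {
  std::function<bool(const Ctx&)> fn;
  bool operator()(const Ctx& ctx) const { return fn(ctx); }
};

// Conjunction: true only if both sub-conditions hold for the same context.
inline Predicate operator&(const Predicate& lhs, const Predicate& rhs) {
  return Predicate{[lhs, rhs](const Ctx& ctx) { return lhs(ctx) && rhs(ctx); }};
}

int main() {
  Predicate is_gpu{[](const Ctx& c) { return c.device_tag == "gpu"; }};
  Predicate big_k{[](const Ctx& c) { return c.k_attr > 128; }};
  Predicate matched = is_gpu & big_k;
  std::cout << std::boolalpha << matched(Ctx{"gpu", 256}) << "\n";  // true
  std::cout << std::boolalpha << matched(Ctx{"gpu", 64}) << "\n";   // false
  return 0;
}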
diff --git a/oneflow/user/kernels/reduce_kernel.cpp b/oneflow/user/kernels/reduce_kernel.cpp
index 2036c98c1dd5179b85f7b757cd925b74d2bab037..3360dd3bb0c4d612ea04e5f3049c08f41cb0f38a 100644
--- a/oneflow/user/kernels/reduce_kernel.cpp
+++ b/oneflow/user/kernels/reduce_kernel.cpp
@@ -48,7 +48,7 @@ class ReduceKernel final : public user_op::OpKernel {
 #define REGISTER_REDUCE_XPU_KERNEL(op_name, binary_func, device, dtype)                           \
   REGISTER_USER_KERNEL(op_name)                                                                   \
       .SetCreateFn<ReduceKernel<binary_func, device, dtype>>()                                    \
-      .SetIsMatchedHob((user_op::HobDeviceType() == device)                                       \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == device)                                        \
                        & (user_op::HobDataType("output_tensor", 0) == GetDataType<dtype>::value)) \
       .SetInferTmpSizeFn([](user_op::InferContext* ctx) {                                         \
         const Shape* in_shape = ctx->Shape4ArgNameAndIndex("input_tensor", 0);                    \
diff --git a/oneflow/user/kernels/reduce_like_kernels.cpp b/oneflow/user/kernels/reduce_like_kernels.cpp
index 8a00ac2896ecbc53bf47be82909de2f8b0c8e9e7..85fb9658f38c4e9ca41d06c9871cb46d4e0e5a98 100644
--- a/oneflow/user/kernels/reduce_like_kernels.cpp
+++ b/oneflow/user/kernels/reduce_like_kernels.cpp
@@ -61,7 +61,7 @@ class ReduceSumLikeOpKernel final : public user_op::OpKernel {
 #define REGISTER_REDUCE_SUM_LIKE_KERNEL(device, data_type_pair)                               \
   REGISTER_USER_KERNEL("reduce_sum_like")                                                     \
       .SetCreateFn<ReduceSumLikeOpKernel<device, OF_PP_PAIR_FIRST(data_type_pair)>>()         \
-      .SetIsMatchedHob((user_op::HobDeviceType() == device)                                   \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == device)                                    \
                        & (user_op::HobDataType("y", 0) == OF_PP_PAIR_SECOND(data_type_pair))) \
       .SetInferTmpSizeFn(ReduceSumLikeInferTmpSize);
 
diff --git a/oneflow/user/kernels/relu_kernel.cpp b/oneflow/user/kernels/relu_kernel.cpp
index 53dfdb9b5540482ca5592beb44989335da5753f6..759bf4b39bba429f0db4af3e0ac04509c2df6744 100644
--- a/oneflow/user/kernels/relu_kernel.cpp
+++ b/oneflow/user/kernels/relu_kernel.cpp
@@ -39,7 +39,7 @@ class ReluKernel final : public user_op::OpKernel {
 #define REGISTER_RELU_KERNEL(device, dtype)                                                     \
   REGISTER_USER_KERNEL("relu")                                                                  \
       .SetCreateFn<ReluKernel<device, dtype>>()                                                 \
-      .SetIsMatchedHob((user_op::HobDeviceType() == device)                                     \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == device)                                      \
                        & (user_op::HobDataType("out", 0) == GetDataType<dtype>::value))         \
       .SetInplaceProposalFn([](const user_op::InferContext&,                                    \
                                user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> { \
@@ -76,7 +76,7 @@ class ReluGradKernel final : public user_op::OpKernel {
 #define REGISTER_RELU_GRAD_KERNEL(device, dtype)                                                \
   REGISTER_USER_KERNEL("relu_grad")                                                             \
       .SetCreateFn<ReluGradKernel<device, dtype>>()                                             \
-      .SetIsMatchedHob((user_op::HobDeviceType() == device)                                     \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == device)                                      \
                        & (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value))          \
       .SetInplaceProposalFn([](const user_op::InferContext&,                                    \
                                user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> { \
diff --git a/oneflow/user/kernels/reshape_kernel.cpp b/oneflow/user/kernels/reshape_kernel.cpp
index e854ac5332dbf3a18aeb59e9274892d8024a533f..7a1f771ed7a1361a0c915f44f8b80615a5455676 100644
--- a/oneflow/user/kernels/reshape_kernel.cpp
+++ b/oneflow/user/kernels/reshape_kernel.cpp
@@ -21,7 +21,7 @@ namespace oneflow {
 #define REGISTER_RESHAPE_KERNEL(device)                                                         \
   REGISTER_USER_KERNEL("reshape")                                                               \
       .SetCreateFn<CopyDataContentKernel<device>>()                                             \
-      .SetIsMatchedHob(user_op::HobDeviceType() == device)                                      \
+      .SetIsMatchedHob(user_op::HobDeviceTag() == device)                                       \
       .SetInplaceProposalFn([](const user_op::InferContext&,                                    \
                                user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> { \
         OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, false));                      \
diff --git a/oneflow/user/kernels/reshape_like_kernel.cpp b/oneflow/user/kernels/reshape_like_kernel.cpp
index 4f5dc945519235bf156b079e1439085e1b3120b1..cf22daa70d57abdd0bc54a60bf2be0d0be9f101e 100644
--- a/oneflow/user/kernels/reshape_like_kernel.cpp
+++ b/oneflow/user/kernels/reshape_like_kernel.cpp
@@ -21,7 +21,7 @@ namespace oneflow {
 #define REGISTER_RESHAPE_LIKE_KERNEL(D)                                                         \
   REGISTER_USER_KERNEL("reshape_like")                                                          \
       .SetCreateFn<CopyDataContentKernel<DeviceType::D>>()                                      \
-      .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::D)                               \
+      .SetIsMatchedHob(user_op::HobDeviceTag() == DeviceType::D)                                \
       .SetInplaceProposalFn([](const user_op::InferContext&,                                    \
                                user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> { \
         OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, false));                      \
diff --git a/oneflow/user/kernels/same_padding_kernel.cpp b/oneflow/user/kernels/same_padding_kernel.cpp
index 9929d0df0fdbed439f626439e2de520db972c8f7..138fe30d3c520e05e5680c39ef8ccf8368a0a2ba 100644
--- a/oneflow/user/kernels/same_padding_kernel.cpp
+++ b/oneflow/user/kernels/same_padding_kernel.cpp
@@ -81,10 +81,10 @@ class SamePaddingKernel final : public user_op::OpKernel {
   bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
 };
 
-#define REGISTER_SAME_PADDING_KERNEL(dev, dtype)         \
-  REGISTER_USER_KERNEL("same_padding")                   \
-      .SetCreateFn<SamePaddingKernel<dev, dtype>>()      \
-      .SetIsMatchedHob((user_op::HobDeviceType() == dev) \
+#define REGISTER_SAME_PADDING_KERNEL(dev, dtype)        \
+  REGISTER_USER_KERNEL("same_padding")                  \
+      .SetCreateFn<SamePaddingKernel<dev, dtype>>()     \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == dev) \
                        & (user_op::HobDataType("y", 0) == GetDataType<dtype>::value));
 
 #ifdef WITH_CUDA
@@ -159,10 +159,10 @@ class SamePaddingGradKernel final : public user_op::OpKernel {
   bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
 };
 
-#define REGISTER_SAME_PADDING_GRAD_KERNEL(dev, dtype)    \
-  REGISTER_USER_KERNEL("same_padding_grad")              \
-      .SetCreateFn<SamePaddingGradKernel<dev, dtype>>()  \
-      .SetIsMatchedHob((user_op::HobDeviceType() == dev) \
+#define REGISTER_SAME_PADDING_GRAD_KERNEL(dev, dtype)   \
+  REGISTER_USER_KERNEL("same_padding_grad")             \
+      .SetCreateFn<SamePaddingGradKernel<dev, dtype>>() \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == dev) \
                        & (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));
 
 #ifdef WITH_CUDA
diff --git a/oneflow/user/kernels/scalar_add_kernel.cpp b/oneflow/user/kernels/scalar_add_kernel.cpp
index 8a17a70af7a0620cbe90f4a8475e14a4b6fb33e7..08544d2c897644bac7c3bcac5e4f683f77d85a75 100644
--- a/oneflow/user/kernels/scalar_add_kernel.cpp
+++ b/oneflow/user/kernels/scalar_add_kernel.cpp
@@ -48,7 +48,7 @@ class ScalarAddUserKernel final : public user_op::OpKernel {
 #define REGISTER_KERNEL(kernel_device_type, dtype)                                              \
   REGISTER_USER_KERNEL("scalar_add")                                                            \
       .SetCreateFn<ScalarAddUserKernel<DeviceType::k##kernel_device_type, dtype>>()             \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::k##kernel_device_type)          \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == DeviceType::k##kernel_device_type)           \
                        & (user_op::HobDataType("out", 0) == GetDataType<dtype>::value))         \
       .SetInplaceProposalFn([](const user_op::InferContext&,                                    \
                                user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> { \
diff --git a/oneflow/user/kernels/scalar_by_tensor_kernel.cpp b/oneflow/user/kernels/scalar_by_tensor_kernel.cpp
index efd4fae0ff3706f6a4d3d85e96d0ad0c238214a6..016844ca8b7798514e4b715408a4dfb80ebedffe 100644
--- a/oneflow/user/kernels/scalar_by_tensor_kernel.cpp
+++ b/oneflow/user/kernels/scalar_by_tensor_kernel.cpp
@@ -84,7 +84,7 @@ class ScalarAddByTensorKernel final : public user_op::OpKernel {
   REGISTER_USER_KERNEL(OF_PP_PAIR_FIRST(scalar_by_tensor_pair))                                 \
       .SetCreateFn<ScalarAddByTensorKernel<OF_PP_PAIR_SECOND(scalar_by_tensor_pair), device,    \
                                            OF_PP_PAIR_FIRST(dtype_pair)>>()                     \
-      .SetIsMatchedHob((user_op::HobDeviceType() == device)                                     \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == device)                                      \
                        & (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(dtype_pair)))       \
       .SetInplaceProposalFn([](const user_op::InferContext&,                                    \
                                user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> { \
diff --git a/oneflow/user/kernels/scalar_mul_kernel.cpp b/oneflow/user/kernels/scalar_mul_kernel.cpp
index f4bb2a7b2a578f8632f990bee339bfdf2e2ca93a..db19ae498acf6071c2f85d41177629efbb884454 100644
--- a/oneflow/user/kernels/scalar_mul_kernel.cpp
+++ b/oneflow/user/kernels/scalar_mul_kernel.cpp
@@ -49,7 +49,7 @@ class ScalarMulUserKernel final : public user_op::OpKernel {
 #define REGISTER_KERNEL(kernel_device_type, dtype)                                              \
   REGISTER_USER_KERNEL("scalar_mul")                                                            \
       .SetCreateFn<ScalarMulUserKernel<DeviceType::k##kernel_device_type, dtype>>()             \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::k##kernel_device_type)          \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == DeviceType::k##kernel_device_type)           \
                        & (user_op::HobDataType("out", 0) == GetDataType<dtype>::value))         \
       .SetInplaceProposalFn([](const user_op::InferContext&,                                    \
                                user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> { \
diff --git a/oneflow/user/kernels/sigmoid_kernel.cpp b/oneflow/user/kernels/sigmoid_kernel.cpp
index 9f34f0070a26cf6bbf39c4116bbafa9f4169565f..65c8b4cd692cd5f3ebf49b33211f4f1348e2e2a3 100644
--- a/oneflow/user/kernels/sigmoid_kernel.cpp
+++ b/oneflow/user/kernels/sigmoid_kernel.cpp
@@ -39,7 +39,7 @@ class SigmoidKernel final : public user_op::OpKernel {
 #define REGISTER_SIGMOID_KERNEL(device, dtype)                                                  \
   REGISTER_USER_KERNEL("sigmoid")                                                               \
       .SetCreateFn<SigmoidKernel<device, dtype>>()                                              \
-      .SetIsMatchedHob((user_op::HobDeviceType() == device)                                     \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == device)                                      \
                        & (user_op::HobDataType("out", 0) == GetDataType<dtype>::value))         \
       .SetInplaceProposalFn([](const user_op::InferContext&,                                    \
                                user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> { \
@@ -76,7 +76,7 @@ class SigmoidGradKernel final : public user_op::OpKernel {
 #define REGISTER_SIGMOID_GRAD_KERNEL(device, dtype)                                             \
   REGISTER_USER_KERNEL("sigmoid_grad")                                                          \
       .SetCreateFn<SigmoidGradKernel<device, dtype>>()                                          \
-      .SetIsMatchedHob((user_op::HobDeviceType() == device)                                     \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == device)                                      \
                        & (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value))          \
       .SetInplaceProposalFn([](const user_op::InferContext&,                                    \
                                user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> { \
diff --git a/oneflow/user/kernels/slice_kernel.cu b/oneflow/user/kernels/slice_kernel.cu
index 45668f5b266fdb836e5c5a625f932777bff15785..7c8cc019341dcecc6d64fabb7f34820b78498d3b 100644
--- a/oneflow/user/kernels/slice_kernel.cu
+++ b/oneflow/user/kernels/slice_kernel.cu
@@ -182,11 +182,11 @@ class SliceGradGpuKernel final : public user_op::OpKernel {
 #define REGISTER_SLICE_GPU_KERNEL(dtype)                                               \
   REGISTER_USER_KERNEL("slice_v2")                                                     \
       .SetCreateFn<SliceGpuKernel<dtype>>()                                            \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                              \
                        & (user_op::HobDataType("y", 0) == GetDataType<dtype>::value)); \
   REGISTER_USER_KERNEL("slice_grad_v2")                                                \
       .SetCreateFn<SliceGradGpuKernel<dtype>>()                                        \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                  \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                              \
                        & (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));
 
 REGISTER_SLICE_GPU_KERNEL(float)
diff --git a/oneflow/user/kernels/smooth_l1_loss_kernel.cpp b/oneflow/user/kernels/smooth_l1_loss_kernel.cpp
index dc71a79352d2a06cdd65f2985bd7e54c853c48a1..967e494519e08e1be048b61d70f436324df83ce6 100644
--- a/oneflow/user/kernels/smooth_l1_loss_kernel.cpp
+++ b/oneflow/user/kernels/smooth_l1_loss_kernel.cpp
@@ -43,10 +43,10 @@ class SmoothL1LossCPUKernel final : public user_op::OpKernel {
   bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
 };
 
-#define REGISTER_SMOOTH_L1_LOSS_CPU_KERNEL(dtype)                     \
-  REGISTER_USER_KERNEL("smooth_l1_loss")                              \
-      .SetCreateFn<SmoothL1LossCPUKernel<dtype>>()                    \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU) \
+#define REGISTER_SMOOTH_L1_LOSS_CPU_KERNEL(dtype)         \
+  REGISTER_USER_KERNEL("smooth_l1_loss")                  \
+      .SetCreateFn<SmoothL1LossCPUKernel<dtype>>()        \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu") \
                        & (user_op::HobDataType("loss", 0) == GetDataType<dtype>::value));
 
 REGISTER_SMOOTH_L1_LOSS_CPU_KERNEL(float)
@@ -81,11 +81,11 @@ class SmoothL1LossGradCpuKernel final : public user_op::OpKernel {
   bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
 };
 
-#define REGISTER_SMOOTH_L1_LOSS_GRAD_CPU_KERNEL(dtype)   \
-  REGISTER_USER_KERNEL("smooth_l1_loss_grad")            \
-      .SetCreateFn<SmoothL1LossGradCpuKernel<dtype>>()   \
-      .SetIsMatchedHob(                                  \
-          (user_op::HobDeviceType() == DeviceType::kCPU) \
+#define REGISTER_SMOOTH_L1_LOSS_GRAD_CPU_KERNEL(dtype) \
+  REGISTER_USER_KERNEL("smooth_l1_loss_grad")          \
+      .SetCreateFn<SmoothL1LossGradCpuKernel<dtype>>() \
+      .SetIsMatchedHob(                                \
+          (user_op::HobDeviceTag() == "cpu")           \
           & (user_op::HobDataType("prediction_grad", 0) == GetDataType<dtype>::value));
 
 REGISTER_SMOOTH_L1_LOSS_GRAD_CPU_KERNEL(float)
diff --git a/oneflow/user/kernels/smooth_l1_loss_kernel.cu b/oneflow/user/kernels/smooth_l1_loss_kernel.cu
index 0fbc890eaeaeec1f73cbf0a261875bad06e855b1..64cccc99f579f10b850c7d31202993adb4d63887 100644
--- a/oneflow/user/kernels/smooth_l1_loss_kernel.cu
+++ b/oneflow/user/kernels/smooth_l1_loss_kernel.cu
@@ -72,10 +72,10 @@ class SmoothL1LossGPUKernel final : public user_op::OpKernel {
   bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
 };
 
-#define REGISTER_SMOOTH_L1_LOSS_GPU_KERNEL(dtype)                     \
-  REGISTER_USER_KERNEL("smooth_l1_loss")                              \
-      .SetCreateFn<SmoothL1LossGPUKernel<dtype>>()                    \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU) \
+#define REGISTER_SMOOTH_L1_LOSS_GPU_KERNEL(dtype)         \
+  REGISTER_USER_KERNEL("smooth_l1_loss")                  \
+      .SetCreateFn<SmoothL1LossGPUKernel<dtype>>()        \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu") \
                        & (user_op::HobDataType("loss", 0) == GetDataType<dtype>::value));
 
 REGISTER_SMOOTH_L1_LOSS_GPU_KERNEL(float)
@@ -103,11 +103,11 @@ class SmoothL1LossGradGpuKernel final : public user_op::OpKernel {
   bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
 };
 
-#define REGISTER_SMOOTH_L1_LOSS_GRAD_GPU_KERNEL(dtype)   \
-  REGISTER_USER_KERNEL("smooth_l1_loss_grad")            \
-      .SetCreateFn<SmoothL1LossGradGpuKernel<dtype>>()   \
-      .SetIsMatchedHob(                                  \
-          (user_op::HobDeviceType() == DeviceType::kGPU) \
+#define REGISTER_SMOOTH_L1_LOSS_GRAD_GPU_KERNEL(dtype) \
+  REGISTER_USER_KERNEL("smooth_l1_loss_grad")          \
+      .SetCreateFn<SmoothL1LossGradGpuKernel<dtype>>() \
+      .SetIsMatchedHob(                                \
+          (user_op::HobDeviceTag() == "gpu")           \
           & (user_op::HobDataType("prediction_grad", 0) == GetDataType<dtype>::value));
 
 REGISTER_SMOOTH_L1_LOSS_GRAD_GPU_KERNEL(float)
diff --git a/oneflow/user/kernels/softmax_cross_entropy_kernel.h b/oneflow/user/kernels/softmax_cross_entropy_kernel.h
index 16411e4612d855362597a39ffbd76adb98db3318..e599a15d5820af1170bcf7737f60c265fa6a399c 100644
--- a/oneflow/user/kernels/softmax_cross_entropy_kernel.h
+++ b/oneflow/user/kernels/softmax_cross_entropy_kernel.h
@@ -57,7 +57,7 @@ class SoftmaxCrossEntropyKernel final : public user_op::OpKernel {
 #define REGISTER_SOFTMAX_CROSS_ENTROPY_KERNEL(device_type_v, dtype_pair)                     \
   REGISTER_USER_KERNEL("softmax_cross_entropy")                                              \
       .SetCreateFn<SoftmaxCrossEntropyKernel<device_type_v, OF_PP_PAIR_FIRST(dtype_pair)>>() \
-      .SetIsMatchedHob((user_op::HobDeviceType() == device_type_v)                           \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == device_type_v)                            \
                        & (user_op::HobDataType("label", 0) == OF_PP_PAIR_SECOND(dtype_pair)) \
                        & (user_op::HobDataType("out", 0) == OF_PP_PAIR_SECOND(dtype_pair)))  \
       .SetInferTmpSizeFn([](user_op::InferContext* ctx) {                                    \
@@ -92,7 +92,7 @@ class SoftmaxCrossEntropyGradKernel final : public user_op::OpKernel {
   REGISTER_USER_KERNEL("softmax_cross_entropy_grad")                                             \
       .SetCreateFn<SoftmaxCrossEntropyGradKernel<device_type_v, OF_PP_PAIR_FIRST(dtype_pair)>>() \
       .SetIsMatchedHob(                                                                          \
-          (user_op::HobDeviceType() == device_type_v)                                            \
+          (user_op::HobDeviceTag() == device_type_v)                                             \
           & (user_op::HobDataType("label", 0) == OF_PP_PAIR_SECOND(dtype_pair))                  \
           & (user_op::HobDataType("prediction_diff", 0) == OF_PP_PAIR_SECOND(dtype_pair)))       \
       .SetInplaceProposalFn([](const user_op::InferContext&,                                     \
diff --git a/oneflow/user/kernels/softmax_kernel.cpp b/oneflow/user/kernels/softmax_kernel.cpp
index 9189e6c4d248466a221d54df77bda39ce637d78c..b7f69bd9012351e85561d085439399d1847b435e 100644
--- a/oneflow/user/kernels/softmax_kernel.cpp
+++ b/oneflow/user/kernels/softmax_kernel.cpp
@@ -61,7 +61,7 @@ user_op::InferTmpSizeFn GenInferTmpSizeFn(const std::string& bn) {
 #define REGISTER_SOFTMAX_KERNEL(device, dtype)                                          \
   REGISTER_USER_KERNEL("softmax")                                                       \
       .SetCreateFn<SoftmaxKernel<device, dtype>>()                                      \
-      .SetIsMatchedHob((user_op::HobDeviceType() == device)                             \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == device)                              \
                        & (user_op::HobDataType("out", 0) == GetDataType<dtype>::value)) \
       .SetInferTmpSizeFn(GenInferTmpSizeFn<dtype>("in"));
 
@@ -104,7 +104,7 @@ class SoftmaxGradKernel final : public user_op::OpKernel {
 #define REGISTER_SOFTMAX_GRAD_KERNEL(device, dtype)                                    \
   REGISTER_USER_KERNEL("softmax_grad")                                                 \
       .SetCreateFn<SoftmaxGradKernel<device, dtype>>()                                 \
-      .SetIsMatchedHob((user_op::HobDeviceType() == device)                            \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == device)                             \
                        & (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value)) \
       .SetInferTmpSizeFn(GenInferTmpSizeFn<dtype>("dx"));
 
diff --git a/oneflow/user/kernels/sort_kernel.cpp b/oneflow/user/kernels/sort_kernel.cpp
index 5bedb432112c52d64b8a7d7022210bfe5b86b199..57b30308b5d8ecaeca1168f741ab8ac3291d0591 100644
--- a/oneflow/user/kernels/sort_kernel.cpp
+++ b/oneflow/user/kernels/sort_kernel.cpp
@@ -52,7 +52,7 @@ class CpuSortKernel final : public user_op::OpKernel {
 
 #define REGISTER_CPU_SORT_KERNEL(dtype)                                             \
   REGISTER_USER_KERNEL("sort").SetCreateFn<CpuSortKernel<dtype>>().SetIsMatchedHob( \
-      (user_op::HobDeviceType() == DeviceType::kCPU)                                \
+      (user_op::HobDeviceTag() == "cpu")                                            \
       & (user_op::HobDataType("out", 0) == GetDataType<dtype>::value));
 
 REGISTER_CPU_SORT_KERNEL(float)
diff --git a/oneflow/user/kernels/sort_kernel.cu b/oneflow/user/kernels/sort_kernel.cu
index 8bd07555c6b54fe119c10b1a2920c345058fff6c..1e695c39a8b691776740f133950f2503a6c3ce16 100644
--- a/oneflow/user/kernels/sort_kernel.cu
+++ b/oneflow/user/kernels/sort_kernel.cu
@@ -54,7 +54,7 @@ class GpuSortKernel final : public user_op::OpKernel {
 #define REGISTER_GPU_SORT_KERNEL(dtype)                                                     \
   REGISTER_USER_KERNEL("sort")                                                              \
       .SetCreateFn<GpuSortKernel<dtype>>()                                                  \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)                       \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")                                   \
                        & (user_op::HobDataType("out", 0) == GetDataType<dtype>::value))     \
       .SetInferTmpSizeFn([](user_op::InferContext* ctx) {                                   \
         const Shape* in_shape = ctx->Shape4ArgNameAndIndex("in", 0);                        \
diff --git a/oneflow/user/kernels/sparse_cross_entropy_kernel.cpp b/oneflow/user/kernels/sparse_cross_entropy_kernel.cpp
index f70639080d66a53b85e75caa18be1ffee81968ca..f0c6b5a69d44ad92e7c80e935ddd3f13d62e2fde 100644
--- a/oneflow/user/kernels/sparse_cross_entropy_kernel.cpp
+++ b/oneflow/user/kernels/sparse_cross_entropy_kernel.cpp
@@ -77,7 +77,7 @@ class SparseCrossEntropyMsKernel final : public user_op::OpKernel {
   REGISTER_USER_KERNEL(kernel_name)                                                                \
       .SetCreateFn<kernel_class<device_type_v, OF_PP_PAIR_FIRST(dtype_pair),                       \
                                 OF_PP_PAIR_FIRST(ltype_pair)>>()                                   \
-      .SetIsMatchedHob((user_op::HobDeviceType() == device_type_v)                                 \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == device_type_v)                                  \
                        & (user_op::HobDataType("label", 0) == OF_PP_PAIR_SECOND(ltype_pair))       \
                        & (user_op::HobDataType("out", 0) == OF_PP_PAIR_SECOND(dtype_pair)));
 
@@ -166,7 +166,7 @@ class SparseCrossEntropyMsGradKernel final : public user_op::OpKernel {
       .SetCreateFn<kernel_class<device_type_v, OF_PP_PAIR_FIRST(dtype_pair),                \
                                 OF_PP_PAIR_FIRST(ltype_pair)>>()                            \
       .SetIsMatchedHob(                                                                     \
-          (user_op::HobDeviceType() == device_type_v)                                       \
+          (user_op::HobDeviceTag() == device_type_v)                                        \
           & (user_op::HobDataType("label", 0) == OF_PP_PAIR_SECOND(ltype_pair))             \
           & (user_op::HobDataType("prediction_diff", 0) == OF_PP_PAIR_SECOND(dtype_pair)));
 
diff --git a/oneflow/user/kernels/sparse_softmax_cross_entropy_kernel.cpp b/oneflow/user/kernels/sparse_softmax_cross_entropy_kernel.cpp
index b56543eb945c6badaafb1276dad8b489e43afdaa..5c1164966a9ab4d6618963d247fcc4bbaf342c25 100644
--- a/oneflow/user/kernels/sparse_softmax_cross_entropy_kernel.cpp
+++ b/oneflow/user/kernels/sparse_softmax_cross_entropy_kernel.cpp
@@ -67,7 +67,7 @@ class SparseSoftmaxCrossEntropyMsKernel final : public user_op::OpKernel {
   REGISTER_USER_KERNEL(kernel_name)                                                            \
       .SetCreateFn<kernel_class<device_type_v, OF_PP_PAIR_FIRST(dtype_pair),                   \
                                 OF_PP_PAIR_FIRST(ltype_pair)>>()                               \
-      .SetIsMatchedHob((user_op::HobDeviceType() == device_type_v)                             \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == device_type_v)                              \
                        & (user_op::HobDataType("label", 0) == OF_PP_PAIR_SECOND(ltype_pair))   \
                        & (user_op::HobDataType("out", 0) == OF_PP_PAIR_SECOND(dtype_pair)))    \
       .SetInferTmpSizeFn([](user_op::InferContext* ctx) {                                      \
@@ -158,7 +158,7 @@ class SparseSoftmaxCrossEntropyMsGradKernel final : public user_op::OpKernel {
       .SetCreateFn<kernel_class<device_type_v, OF_PP_PAIR_FIRST(dtype_pair),                     \
                                 OF_PP_PAIR_FIRST(ltype_pair)>>()                                 \
       .SetIsMatchedHob(                                                                          \
-          (user_op::HobDeviceType() == device_type_v)                                            \
+          (user_op::HobDeviceTag() == device_type_v)                                             \
           & (user_op::HobDataType("label", 0) == OF_PP_PAIR_SECOND(ltype_pair))                  \
           & (user_op::HobDataType("prediction_diff", 0) == OF_PP_PAIR_SECOND(dtype_pair)))       \
       .SetInplaceProposalFn([](const user_op::InferContext&,                                     \
diff --git a/oneflow/user/kernels/split_like_kernel.cpp b/oneflow/user/kernels/split_like_kernel.cpp
index 641370d864e27f54e60fa26f1a609a5b4f17f3cd..4bcd10268de204da6c66cd43b6a1721006a93d8d 100644
--- a/oneflow/user/kernels/split_like_kernel.cpp
+++ b/oneflow/user/kernels/split_like_kernel.cpp
@@ -77,10 +77,10 @@ class SplitLikeKernel final : public user_op::OpKernel {
 
 }  // namespace
 
-#define REGISTER_SPLIT_LIKE_KERNEL(device, dtype)           \
-  REGISTER_USER_KERNEL("split_like")                        \
-      .SetCreateFn<SplitLikeKernel<device, dtype>>()        \
-      .SetIsMatchedHob((user_op::HobDeviceType() == device) \
+#define REGISTER_SPLIT_LIKE_KERNEL(device, dtype)          \
+  REGISTER_USER_KERNEL("split_like")                       \
+      .SetCreateFn<SplitLikeKernel<device, dtype>>()       \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == device) \
                        & (user_op::HobDataType("out", 0) == GetDataType<dtype>::value));
 
 #define REGISTER_SPLIT_LIKE_KERNEL_WITH_DEVICE(device) \
diff --git a/oneflow/user/kernels/squeeze_kernel.cpp b/oneflow/user/kernels/squeeze_kernel.cpp
index 9e4502e877bf289c3388580687db064bc7782501..8c74c848290e1f19aaf4cfe36357b2764e0c60fe 100644
--- a/oneflow/user/kernels/squeeze_kernel.cpp
+++ b/oneflow/user/kernels/squeeze_kernel.cpp
@@ -21,7 +21,7 @@ namespace oneflow {
 #define REGISTER_SQUEEZE_KERNEL(D)                                                              \
   REGISTER_USER_KERNEL("squeeze")                                                               \
       .SetCreateFn<CopyDataContentKernel<DeviceType::D>>()                                      \
-      .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::D)                               \
+      .SetIsMatchedHob(user_op::HobDeviceTag() == DeviceType::D)                                \
       .SetInplaceProposalFn([](const user_op::InferContext&,                                    \
                                user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> { \
         OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, false));                      \
diff --git a/oneflow/user/kernels/summary_kernels.cpp b/oneflow/user/kernels/summary_kernels.cpp
index 1e6ec552032bd0ecbe7ac8b21e6cfc1b0761bdfe..5c762eb957e4c645b6b0d8118bac2acbd8fcd374 100644
--- a/oneflow/user/kernels/summary_kernels.cpp
+++ b/oneflow/user/kernels/summary_kernels.cpp
@@ -51,10 +51,10 @@ class SummaryWriteScalar final : public user_op::OpKernel {
   bool AlwaysComputeWhenAllOutputsEmpty() const override { return true; }
 };
 
-#define REGISTER_SCALAR_USER_KERNEL(dtype)                            \
-  REGISTER_USER_KERNEL("summary_write_scalar")                        \
-      .SetCreateFn<SummaryWriteScalar<dtype>>()                       \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU) \
+#define REGISTER_SCALAR_USER_KERNEL(dtype)                \
+  REGISTER_USER_KERNEL("summary_write_scalar")            \
+      .SetCreateFn<SummaryWriteScalar<dtype>>()           \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu") \
                        & (user_op::HobDataType("in", 0) == GetDataType<dtype>::value));
 
 REGISTER_SCALAR_USER_KERNEL(double)
@@ -77,7 +77,7 @@ class CreateSummaryWriter final : public user_op::OpKernel {
 
 REGISTER_USER_KERNEL("create_summary_writer")
     .SetCreateFn<CreateSummaryWriter>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU));
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu"));
 
 class FlushSummaryWriter final : public user_op::OpKernel {
  public:
@@ -93,7 +93,7 @@ class FlushSummaryWriter final : public user_op::OpKernel {
 
 REGISTER_USER_KERNEL("flush_summary_writer")
     .SetCreateFn<FlushSummaryWriter>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU));
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu"));
 
 template<typename T>
 class SummaryWriteHistogram final : public user_op::OpKernel {
@@ -117,10 +117,10 @@ class SummaryWriteHistogram final : public user_op::OpKernel {
   bool AlwaysComputeWhenAllOutputsEmpty() const override { return true; }
 };
 
-#define REGISTER_HISTOGRAM_USER_KERNEL(dtype)                         \
-  REGISTER_USER_KERNEL("summary_write_histogram")                     \
-      .SetCreateFn<SummaryWriteHistogram<dtype>>()                    \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU) \
+#define REGISTER_HISTOGRAM_USER_KERNEL(dtype)             \
+  REGISTER_USER_KERNEL("summary_write_histogram")         \
+      .SetCreateFn<SummaryWriteHistogram<dtype>>()        \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu") \
                        & (user_op::HobDataType("in", 0) == GetDataType<dtype>::value));
 
 REGISTER_HISTOGRAM_USER_KERNEL(double)
@@ -152,7 +152,7 @@ class SummaryWritePb final : public user_op::OpKernel {
 
 REGISTER_USER_KERNEL("summary_write_pb")
     .SetCreateFn<SummaryWritePb<int8_t>>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")
                      & (user_op::HobDataType("in", 0) == GetDataType<int8_t>::value));
 
 template<typename T>
@@ -179,7 +179,7 @@ class SummaryWriteImage final : public user_op::OpKernel {
 
 REGISTER_USER_KERNEL("summary_write_image")
     .SetCreateFn<SummaryWriteImage<uint8_t>>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")
                      & (user_op::HobDataType("in", 0) == GetDataType<uint8_t>::value));
 
 }  // namespace summary
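
[Editor's note — illustrative only, not part of the patch] The summary kernels above now match on the device tag string ("cpu"), while most registration macros in this patch still compare user_op::HobDeviceTag() against a DeviceType value such as DeviceType::kCPU or a `device` macro parameter. Both forms can coexist if the string-valued getter accepts either operand and routes the enum through its tag. The sketch below shows one way such a dual comparison could look; the names here (Ctx, StringGetter, the free DeviceTag4DeviceType helper) are simplified stand-ins for illustration, not the real definitions in user_op_hob.h.

    #include <functional>
    #include <string>
    #include <utility>

    enum DeviceType { kCPU, kGPU };          // stand-in for oneflow::DeviceType
    struct Ctx { std::string device_tag; };  // stand-in for user_op::KernelRegContext

    inline std::string DeviceTag4DeviceType(DeviceType t) { return t == kCPU ? "cpu" : "gpu"; }

    // String-valued context getter comparable against a tag string or a DeviceType.
    class StringGetter {
     public:
      explicit StringGetter(std::function<std::string(const Ctx&)> fn) : fn_(std::move(fn)) {}
      std::function<bool(const Ctx&)> operator==(const std::string& tag) const {
        auto fn = fn_;
        return [fn, tag](const Ctx& ctx) { return fn(ctx) == tag; };
      }
      std::function<bool(const Ctx&)> operator==(DeviceType dev) const {
        return *this == DeviceTag4DeviceType(dev);  // route the enum through its tag string
      }
     private:
      std::function<std::string(const Ctx&)> fn_;
    };

    // Usage sketch: both matchers accept a context whose device_tag is "cpu".
    //   StringGetter HobDeviceTag() { return StringGetter([](const Ctx& c) { return c.device_tag; }); }
    //   auto by_tag  = (HobDeviceTag() == "cpu");
    //   auto by_enum = (HobDeviceTag() == DeviceType::kCPU);
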
diff --git a/oneflow/user/kernels/tanh_kernel.cpp b/oneflow/user/kernels/tanh_kernel.cpp
index bac1e5cfd43a642b5d971180401b7f7978964b74..abf0af1f2e8be6560effcdd7178a3764f1bc3cbb 100644
--- a/oneflow/user/kernels/tanh_kernel.cpp
+++ b/oneflow/user/kernels/tanh_kernel.cpp
@@ -39,7 +39,7 @@ class TanHKernel final : public user_op::OpKernel {
 #define REGISTER_TANH_KERNEL(device, dtype)                                                     \
   REGISTER_USER_KERNEL("tanh")                                                                  \
       .SetCreateFn<TanHKernel<device, dtype>>()                                                 \
-      .SetIsMatchedHob((user_op::HobDeviceType() == device)                                     \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == device)                                      \
                        & (user_op::HobDataType("out", 0) == GetDataType<dtype>::value))         \
       .SetInplaceProposalFn([](const user_op::InferContext&,                                    \
                                user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> { \
@@ -76,7 +76,7 @@ class TanHGradKernel final : public user_op::OpKernel {
 #define REGISTER_TANH_GRAD_KERNEL(device, dtype)                                                \
   REGISTER_USER_KERNEL("tanh_grad")                                                             \
       .SetCreateFn<TanHGradKernel<device, dtype>>()                                             \
-      .SetIsMatchedHob((user_op::HobDeviceType() == device)                                     \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == device)                                      \
                        & (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value))          \
       .SetInplaceProposalFn([](const user_op::InferContext&,                                    \
                                user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> { \
diff --git a/oneflow/user/kernels/tensor_buffer_kernels.cpp b/oneflow/user/kernels/tensor_buffer_kernels.cpp
index b0c8b0d5b0cdbc4cc2f4d0266379e081e2195703..d0255e0e378d3e06c6a4beb6af66c1369f7212f2 100644
--- a/oneflow/user/kernels/tensor_buffer_kernels.cpp
+++ b/oneflow/user/kernels/tensor_buffer_kernels.cpp
@@ -59,7 +59,7 @@ class TensorBufferToTensorKernel final : public user_op::OpKernel {
 
 REGISTER_USER_KERNEL("tensor_buffer_to_tensor")
     .SetCreateFn<TensorBufferToTensorKernel>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")
                      & (user_op::HobDataType("in", 0) == DataType::kTensorBuffer));
 
 class TensorToTensorBufferKernel final : public user_op::OpKernel {
@@ -101,7 +101,7 @@ class TensorToTensorBufferKernel final : public user_op::OpKernel {
 
 REGISTER_USER_KERNEL("tensor_to_tensor_buffer")
     .SetCreateFn<TensorToTensorBufferKernel>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")
                      & (user_op::HobDataType("out", 0) == DataType::kTensorBuffer));
 
 }  // namespace
diff --git a/oneflow/user/kernels/test_kernels.cpp b/oneflow/user/kernels/test_kernels.cpp
index 0c3ce7ba31c6af19e6c30ade0d38af2230da056b..583decec11bc212a39c703417e5e49cecf52850c 100644
--- a/oneflow/user/kernels/test_kernels.cpp
+++ b/oneflow/user/kernels/test_kernels.cpp
@@ -127,7 +127,7 @@ class TestSourceGpuKernel final : public user_op::OpKernel {
 
 REGISTER_USER_KERNEL("TestSource")
     .SetCreateFn<TestSourceGpuKernel>()
-    .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kGPU)
+    .SetIsMatchedHob(user_op::HobDeviceTag() == "gpu")
     .SetInferTmpSizeFn([](user_op::InferContext*) { return 0; });
 
 class TestMultiOutputOrderKernel final : public user_op::OpKernel {
@@ -150,7 +150,7 @@ class TestMultiOutputOrderKernel final : public user_op::OpKernel {
 
 REGISTER_USER_KERNEL("TestMultiOutputOrder")
     .SetCreateFn<TestMultiOutputOrderKernel>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")
                      & (user_op::HobDataType("in", 0) == DataType::kFloat));
 
 class TestMultiInputFwKernel final : public user_op::OpKernel {
@@ -170,7 +170,7 @@ class TestMultiInputFwKernel final : public user_op::OpKernel {
 
 REGISTER_USER_KERNEL("TestMultiInput")
     .SetCreateFn<TestMultiInputFwKernel>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")
                      & (user_op::HobDataType("x1", 0) == DataType::kFloat));
 
 class TestMultiInputBwKernel final : public user_op::OpKernel {
@@ -192,7 +192,7 @@ class TestMultiInputBwKernel final : public user_op::OpKernel {
 
 REGISTER_USER_KERNEL("TestMultiInputGrad")
     .SetCreateFn<TestMultiInputBwKernel>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU)
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "gpu")
                      & (user_op::HobDataType("x1", 0) == DataType::kFloat));
 
 #endif
@@ -233,7 +233,7 @@ class TestSourceKernel final : public user_op::OpKernel {
 
 REGISTER_USER_KERNEL("TestSource")
     .SetCreateFn<TestSourceKernel>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")
                      & (user_op::HobDataType("out", 0) == DataType::kFloat))
     .SetInferTmpSizeFn([](user_op::InferContext*) { return 0; });
 
@@ -254,7 +254,7 @@ class TestSourceMultiGpuFixedOutNumKernel final : public user_op::OpKernel {
 
 REGISTER_USER_KERNEL("TestSourceMultiGpuFixedOutNum")
     .SetCreateFn<TestSourceMultiGpuFixedOutNumKernel>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")
                      & (user_op::HobDataType("out", 0) == DataType::kFloat));
 
 class TestDynamicSourceKernel final : public user_op::OpKernel {
@@ -273,7 +273,7 @@ class TestDynamicSourceKernel final : public user_op::OpKernel {
 
 REGISTER_USER_KERNEL("TestDynamicSource")
     .SetCreateFn<TestDynamicSourceKernel>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")
                      & (user_op::HobDataType("out", 0) == DataType::kFloat));
 
 class TestRandomSourceKernel final : public user_op::OpKernel {
@@ -301,7 +301,7 @@ class TestRandomSourceKernel final : public user_op::OpKernel {
 
 REGISTER_USER_KERNEL("TestRandomSource")
     .SetCreateFn<TestRandomSourceKernel>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)
+    .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")
                      & (user_op::HobDataType("out", 0) == DataType::kFloat));
 
 class TestDataTypeAttrKernel final : public user_op::OpKernel {
diff --git a/oneflow/user/kernels/top_k_kernel.cpp b/oneflow/user/kernels/top_k_kernel.cpp
index 819ece8441ac109549ea8978e7007f9ffb624858..fab1b6324b511af9edc7141818aef71035ec754d 100644
--- a/oneflow/user/kernels/top_k_kernel.cpp
+++ b/oneflow/user/kernels/top_k_kernel.cpp
@@ -99,7 +99,7 @@ class TopKCpuKernel final : public user_op::OpKernel {
 #define REGISTER_CPU_TOP_K_KERNEL(dtype)                                                 \
   REGISTER_USER_KERNEL("top_k")                                                          \
       .SetCreateFn<TopKCpuKernel<dtype>>()                                               \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)                    \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == "cpu")                                \
                        & (user_op::HobDataType("in", 0) == GetDataType<dtype>::value))   \
       .SetInferTmpSizeFn([](user_op::InferContext* ctx) {                                \
         const Shape* in_shape = ctx->Shape4ArgNameAndIndex("in", 0);                     \
diff --git a/oneflow/user/kernels/transpose_kernel.cpp b/oneflow/user/kernels/transpose_kernel.cpp
index b433f3a0050f7392c164929096794c3fd4a4455e..5a4dbfc91d2db203cd308958c96c4ebd3d0e1c92 100644
--- a/oneflow/user/kernels/transpose_kernel.cpp
+++ b/oneflow/user/kernels/transpose_kernel.cpp
@@ -43,7 +43,7 @@ class TransposeKernel final : public OpKernel {
 #define REGISTER_TRANSPOSE_KERNEL(device, dtype)                                         \
   REGISTER_USER_KERNEL("transpose")                                                      \
       .SetCreateFn<TransposeKernel<device, dtype>>()                                     \
-      .SetIsMatchedHob((user_op::HobDeviceType() == device)                              \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == device)                               \
                        & (user_op::HobDataType("input", 0) == GetDataType<dtype>::value) \
                        & (user_op::HobDataType("output", 0) == GetDataType<dtype>::value));
 
diff --git a/oneflow/user/kernels/two_stage_reduce_kernel.cpp b/oneflow/user/kernels/two_stage_reduce_kernel.cpp
index d9cecaf63989d956300511271c16f0dd2de0c2d9..5bd1ecad471573cc2ac464d5a339dee00ff07239 100644
--- a/oneflow/user/kernels/two_stage_reduce_kernel.cpp
+++ b/oneflow/user/kernels/two_stage_reduce_kernel.cpp
@@ -97,7 +97,7 @@ user_op::InferTmpSizeFn GenDeviceStageInferTmpSizeFn() {
 #define REGISTER_REDUCE_DEVICE_STAGE_KERNEL(op_name, binary_func, device, dtype_pair)            \
   REGISTER_USER_KERNEL(op_name)                                                                  \
       .SetCreateFn<ReduceDeviceStageKernel<binary_func, device, OF_PP_PAIR_FIRST(dtype_pair)>>() \
-      .SetIsMatchedHob((user_op::HobDeviceType() == device)                                      \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == device)                                       \
                        & (user_op::HobDataType("out", 0) == OF_PP_PAIR_SECOND(dtype_pair)))      \
       .SetInferTmpSizeFn(GenDeviceStageInferTmpSizeFn<OF_PP_PAIR_FIRST(dtype_pair)>());
 
@@ -155,7 +155,7 @@ user_op::InferTmpSizeFn GenDeviceStageGradInferTmpSizeFn() {
 #define REGISTER_REDUCE_DEVICE_STAGE_GRAD_KERNEL(op_name, device, dtype_pair)                   \
   REGISTER_USER_KERNEL(op_name)                                                                 \
       .SetCreateFn<ReduceDeviceStageGradKernel<device, OF_PP_PAIR_FIRST(dtype_pair)>>()         \
-      .SetIsMatchedHob((user_op::HobDeviceType() == device)                                     \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == device)                                      \
                        & (user_op::HobDataType("in_diff", 0) == OF_PP_PAIR_SECOND(dtype_pair))) \
       .SetInferTmpSizeFn(GenDeviceStageGradInferTmpSizeFn<OF_PP_PAIR_FIRST(dtype_pair)>());
 
@@ -196,7 +196,7 @@ class ReduceGlobalStageKernel final : public OpKernel {
 #define REGISTER_REDUCE_GLOBAL_STAGE_KERNEL(op_name, binary_func, device, dtype_pair)            \
   REGISTER_USER_KERNEL(op_name)                                                                  \
       .SetCreateFn<ReduceGlobalStageKernel<binary_func, device, OF_PP_PAIR_FIRST(dtype_pair)>>() \
-      .SetIsMatchedHob((user_op::HobDeviceType() == device)                                      \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == device)                                       \
                        & (user_op::HobDataType("out", 0) == OF_PP_PAIR_SECOND(dtype_pair)))      \
       .SetInferTmpSizeFn([](InferContext* ctx) {                                                 \
         const Shape* in_shape = ctx->Shape4ArgNameAndIndex("in", 0);                             \
@@ -296,7 +296,7 @@ user_op::InferTmpSizeFn GenGlobalStageGradInferTmpSizeFn() {
 #define REGISTER_REDUCE_GLOBAL_STAGE_GRAD_KERNEL(op_name, device, dtype_pair)                   \
   REGISTER_USER_KERNEL(op_name)                                                                 \
       .SetCreateFn<ReduceGlobalStageGradKernel<device, OF_PP_PAIR_FIRST(dtype_pair)>>()         \
-      .SetIsMatchedHob((user_op::HobDeviceType() == device)                                     \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == device)                                      \
                        & (user_op::HobDataType("in_diff", 0) == OF_PP_PAIR_SECOND(dtype_pair))) \
       .SetInferTmpSizeFn(GenGlobalStageGradInferTmpSizeFn<OF_PP_PAIR_FIRST(dtype_pair)>());
 
diff --git a/oneflow/user/kernels/unsorted_batch_segment_sum_kernel.cpp b/oneflow/user/kernels/unsorted_batch_segment_sum_kernel.cpp
index 191c63b3e268f186dbe3341c0bca5e8c88cd1e7c..198f22d5cb1564f5ec5dc6c3f4e155fb19873d7b 100644
--- a/oneflow/user/kernels/unsorted_batch_segment_sum_kernel.cpp
+++ b/oneflow/user/kernels/unsorted_batch_segment_sum_kernel.cpp
@@ -58,7 +58,7 @@ class UnsortedBatchSegmentSumKernel final : public user_op::OpKernel {
       .SetCreateFn<UnsortedBatchSegmentSumKernel<device, OF_PP_PAIR_FIRST(out_dtype),        \
                                                  OF_PP_PAIR_FIRST(segment_ids_dtype)>>()     \
       .SetIsMatchedHob(                                                                      \
-          (user_op::HobDeviceType() == device)                                               \
+          (user_op::HobDeviceTag() == device)                                                \
           & (user_op::HobDataType("segment_ids", 0) == OF_PP_PAIR_SECOND(segment_ids_dtype)) \
           & (user_op::HobDataType("out", 0) == OF_PP_PAIR_SECOND(out_dtype)));
 
diff --git a/oneflow/user/kernels/unsorted_segment_sum_kernel.cpp b/oneflow/user/kernels/unsorted_segment_sum_kernel.cpp
index 83f65c4c2a40666494b6afe86b95681d78438228..4b7aea8c40eb4a41de86832f17bb5511d2ea271f 100644
--- a/oneflow/user/kernels/unsorted_segment_sum_kernel.cpp
+++ b/oneflow/user/kernels/unsorted_segment_sum_kernel.cpp
@@ -95,7 +95,7 @@ class UnsortedSegmentSumKernel final : public user_op::OpKernel {
       .SetCreateFn<UnsortedSegmentSumKernel<device, OF_PP_PAIR_FIRST(out_type),               \
                                             OF_PP_PAIR_FIRST(segment_ids_type)>>()            \
       .SetIsMatchedHob(                                                                       \
-          (user_op::HobDeviceType() == device)                                                \
+          (user_op::HobDeviceTag() == device)                                                 \
           & (user_op::HobDataType("segment_ids", 0) == OF_PP_PAIR_SECOND(segment_ids_type))   \
           & (user_op::HobDataType("out", 0) == OF_PP_PAIR_SECOND(out_type)));
 
diff --git a/oneflow/user/kernels/upsample_kernel.cu b/oneflow/user/kernels/upsample_kernel.cu
index 4e973ecb5828940692ffb5b9d2ea98c631ce0690..78387c493820b1f2c319ffaa8efc6a725725fea6 100644
--- a/oneflow/user/kernels/upsample_kernel.cu
+++ b/oneflow/user/kernels/upsample_kernel.cu
@@ -190,13 +190,13 @@ class UpsampleNearestGradGPUKernel final : public user_op::OpKernel {
   REGISTER_USER_KERNEL("upsample")                                                       \
       .SetCreateFn<UpsampleNearestGPUKernel<dtype>>()                                    \
       .SetIsMatchedHob(                                                                  \
-          (user_op::HobDeviceType() == DeviceType::kGPU)                                 \
+          (user_op::HobDeviceTag() == "gpu")                                             \
           & (user_op::HobDataType("y", 0) == GetDataType<dtype>::value)                  \
           & (user_op::HobAttr<std::string>("interpolation") == std::string("nearest"))); \
   REGISTER_USER_KERNEL("upsample_grad")                                                  \
       .SetCreateFn<UpsampleNearestGradGPUKernel<dtype>>()                                \
       .SetIsMatchedHob(                                                                  \
-          (user_op::HobDeviceType() == DeviceType::kGPU)                                 \
+          (user_op::HobDeviceTag() == "gpu")                                             \
           & (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value)                 \
           & (user_op::HobAttr<std::string>("interpolation") == std::string("nearest")));
 
@@ -262,13 +262,13 @@ class UpsampleBilinearGradGPUKernel final : public user_op::OpKernel {
   REGISTER_USER_KERNEL("upsample")                                                        \
       .SetCreateFn<UpsampleBilinearGPUKernel<dtype>>()                                    \
       .SetIsMatchedHob(                                                                   \
-          (user_op::HobDeviceType() == DeviceType::kGPU)                                  \
+          (user_op::HobDeviceTag() == "gpu")                                              \
           & (user_op::HobDataType("y", 0) == GetDataType<dtype>::value)                   \
           & (user_op::HobAttr<std::string>("interpolation") == std::string("bilinear"))); \
   REGISTER_USER_KERNEL("upsample_grad")                                                   \
       .SetCreateFn<UpsampleBilinearGradGPUKernel<dtype>>()                                \
       .SetIsMatchedHob(                                                                   \
-          (user_op::HobDeviceType() == DeviceType::kGPU)                                  \
+          (user_op::HobDeviceTag() == "gpu")                                              \
           & (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value)                  \
           & (user_op::HobAttr<std::string>("interpolation") == std::string("bilinear")));
 
diff --git a/oneflow/user/kernels/where_kernel.cpp b/oneflow/user/kernels/where_kernel.cpp
index 7f07807051d5d5cf5e3a657dcbcb84a44e27ab2d..ea10e1db15166e61c73930e43d7f8b6402095a88 100644
--- a/oneflow/user/kernels/where_kernel.cpp
+++ b/oneflow/user/kernels/where_kernel.cpp
@@ -41,7 +41,7 @@ class WhereKernel final : public user_op::OpKernel {
   REGISTER_USER_KERNEL("where")                                                                  \
       .SetCreateFn<WhereKernel<device_type_v, OF_PP_PAIR_FIRST(dtype_pair),                      \
                                OF_PP_PAIR_FIRST(ctype_pair)>>()                                  \
-      .SetIsMatchedHob((user_op::HobDeviceType() == device_type_v)                               \
+      .SetIsMatchedHob((user_op::HobDeviceTag() == device_type_v)                                \
                        & (user_op::HobDataType("condition", 0) == OF_PP_PAIR_SECOND(ctype_pair)) \
                        & (user_op::HobDataType("out", 0) == OF_PP_PAIR_SECOND(dtype_pair)));
 
diff --git a/oneflow/user/kernels/zero_like_kernel.cpp b/oneflow/user/kernels/zero_like_kernel.cpp
index 7895296d66dba8e0bcc000403774987872571a38..d2f821f6e0d771363c32cb950e86f2b90eace781 100644
--- a/oneflow/user/kernels/zero_like_kernel.cpp
+++ b/oneflow/user/kernels/zero_like_kernel.cpp
@@ -36,7 +36,7 @@ class ZeroLikeKernel final : public user_op::OpKernel {
 #define REGISTER_ZERO_LIKE_KERNEL(device_type_v)    \
   REGISTER_USER_KERNEL("zero_like")                 \
       .SetCreateFn<ZeroLikeKernel<device_type_v>>() \
-      .SetIsMatchedHob(user_op::HobDeviceType() == device_type_v);
+      .SetIsMatchedHob(user_op::HobDeviceTag() == device_type_v);
 
 REGISTER_ZERO_LIKE_KERNEL(DeviceType::kCPU)
 #ifdef WITH_CUDA
diff --git a/oneflow/xrt/api.cpp b/oneflow/xrt/api.cpp
index 88cd1509f519135c75ca828f2c60553d7ad33629..99fd5886a7a1df33902b6f02953fd16de5878694 100644
--- a/oneflow/xrt/api.cpp
+++ b/oneflow/xrt/api.cpp
@@ -13,15 +13,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
+#include "oneflow/core/operator/operator.h"  // GenLogicalBlobName, GenLogicalBlobId
+#include "oneflow/core/framework/to_string.h"
 #include "oneflow/xrt/api.h"
+#include "oneflow/xrt/build_graph.h"
+#include "oneflow/xrt/utility/env.h"
 
 #include "absl/strings/str_cat.h"
 #include "glog/logging.h"
 
-#include "oneflow/core/operator/operator.h"  // GenLogicalBlobName, GenLogicalBlobId
-#include "oneflow/xrt/build_graph.h"
-#include "oneflow/xrt/utility/env.h"
-
 #include <fstream>
 #include <mutex>
 
@@ -122,6 +122,11 @@ std::string ExtractOpTypeAsString(const OperatorConf &conf) {
   }
 }
 
+XrtDevice DeviceTagToXrtDevice(const std::string &device_tag) {
+  DeviceType device_type = CHECK_JUST(DeviceType4DeviceTag(device_tag));
+  return DeviceTypeToXrtDevice(device_type);
+}
+
 XrtDevice DeviceTypeToXrtDevice(const DeviceType &device_type) {
   switch (device_type) {
     case DeviceType::kGPU: return XrtDevice::GPU_CUDA;
diff --git a/oneflow/xrt/api.h b/oneflow/xrt/api.h
index 28c77d7b0b6d70ba0169bdbacd4a9f326c54fbf0..f4b44a379dc89ba0cc24a4286fc545d02db873b9 100644
--- a/oneflow/xrt/api.h
+++ b/oneflow/xrt/api.h
@@ -33,6 +33,8 @@ std::string ExtractOpTypeAsString(const OperatorConf &conf);
 
 XrtDevice DeviceTypeToXrtDevice(const DeviceType &device_type);
 
+XrtDevice DeviceTagToXrtDevice(const std::string &device_tag);
+
 DeviceType XrtDeviceToDeviceType(const XrtDevice &device);
 
 XrtEngine StringToXrtEngine(const std::string &engine);
diff --git a/oneflow/xrt/build_graph.h b/oneflow/xrt/build_graph.h
index 19a524fc9948305844fbfcfd994ad3da32fa90e7..465e8b2ae966b60fda26a03638b86e903bca77d2 100644
--- a/oneflow/xrt/build_graph.h
+++ b/oneflow/xrt/build_graph.h
@@ -53,7 +53,7 @@ class GraphBuilder {
   void SetupXrtNode(XrtNode *node, const OperatorConf &node_conf) const {
     node->set_name(node_conf.name());
     node->set_type(ExtractOpTypeAsString(node_conf));
-    node->set_device(DeviceTypeToXrtDevice(node_conf.device_type()));
+    node->set_device(DeviceTagToXrtDevice(node_conf.device_tag()));
   }
 
   void SetupXrtNode(XrtNode *node, const XrtLaunchOpConf::Argument &arg_conf) const {
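
[Editor's note — illustrative only, not part of the patch] With the oneflow/xrt changes above, the XRT graph builder derives the backend from the operator's device_tag string instead of its device_type enum: the tag is first resolved to a DeviceType (DeviceType4DeviceTag) and then mapped to an XrtDevice (DeviceTypeToXrtDevice), which is exactly what the new DeviceTagToXrtDevice helper composes. The sketch below mirrors that two-step mapping with simplified stand-in enums and plain error handling in place of Maybe/CHECK_JUST; it is a paraphrase for illustration, not the real oneflow/xrt/api.cpp code.

    #include <stdexcept>
    #include <string>

    enum class DeviceType { kCPU, kGPU };        // stand-in for oneflow::DeviceType
    enum class XrtDevice { CPU_X86, GPU_CUDA };  // stand-in for xrt::XrtDevice

    // "cpu"/"gpu" tag -> DeviceType (the real helper returns Maybe<DeviceType>).
    DeviceType DeviceType4DeviceTag(const std::string& tag) {
      if (tag == "cpu") { return DeviceType::kCPU; }
      if (tag == "gpu") { return DeviceType::kGPU; }
      throw std::invalid_argument("unknown device tag: " + tag);
    }

    // DeviceType -> XrtDevice, as in the existing DeviceTypeToXrtDevice switch.
    XrtDevice DeviceTypeToXrtDevice(DeviceType type) {
      return type == DeviceType::kGPU ? XrtDevice::GPU_CUDA : XrtDevice::CPU_X86;
    }

    // Composition mirroring the new DeviceTagToXrtDevice in oneflow/xrt/api.cpp.
    XrtDevice DeviceTagToXrtDevice(const std::string& tag) {
      return DeviceTypeToXrtDevice(DeviceType4DeviceTag(tag));
    }

On the way back out of XRT (rebuild_job_pass.cpp below), the node's XrtDevice is converted to a DeviceType with XrtDeviceToDeviceType and then written into the rebuilt OperatorConf as a tag via DeviceTag4DeviceType, so the round trip stays tag-based at the graph boundary.
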
diff --git a/oneflow/xrt/launch_op.cpp b/oneflow/xrt/launch_op.cpp
index 8bc43ca9eaa489c00766ac7a9283570ce4695f0d..068f98c8ae4417b99de88360288f456bdb41e960 100644
--- a/oneflow/xrt/launch_op.cpp
+++ b/oneflow/xrt/launch_op.cpp
@@ -13,13 +13,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
-#include "absl/strings/str_cat.h"
-#include "absl/strings/str_split.h"
-
+#include "oneflow/core/framework/to_string.h"
 #include "oneflow/core/job/sbp_signature_builder.h"
 #include "oneflow/xrt/api.h"
 #include "oneflow/xrt/launch_op.h"
 
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_split.h"
+
 namespace oneflow {
 
 void XrtLaunchOp::InitFromOpConf() {
@@ -61,8 +62,9 @@ Maybe<void> XrtLaunchOp::InferBlobDescs(
     // Run InferShape pass
     const auto &sbp_signatures = launch_conf.sbp_signatures();
     auto options = xrt::CreateDefaultXrtPassOptions();
+    DeviceType device_type = JUST(DeviceType4DeviceTag(op_conf().device_tag()));
     auto graph =
-        xrt::BuildXrtGraph(launch_conf.function(), op_conf().device_type(), this->job_desc());
+        xrt::BuildXrtGraph(launch_conf.function(), device_type, this->job_desc());
     xrt::RunXrtPass("InferShape", graph.get(), options, &this->job_desc(), parallel_ctx,
                     &sbp_signatures, &blob_descs);
   }
diff --git a/oneflow/xrt/passes/rebuild_job_pass.cpp b/oneflow/xrt/passes/rebuild_job_pass.cpp
index 82736d5ebfb290639dd02e4e09aa60a5288f3057..b83cb16ebb6a672ae335f2f0eb5bd476201a8561 100644
--- a/oneflow/xrt/passes/rebuild_job_pass.cpp
+++ b/oneflow/xrt/passes/rebuild_job_pass.cpp
@@ -13,14 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
-#include "oneflow/xrt/passes/pass.h"
-
-#include <string>
-#include <vector>
-#include "absl/strings/str_cat.h"
-#include "absl/strings/str_split.h"
-#include "glog/logging.h"
-
+#include "oneflow/core/framework/to_string.h"
 #include "oneflow/core/job/job_builder.h"
 #include "oneflow/core/operator/op_conf.pb.h"
 #include "oneflow/xrt/api.h"
@@ -28,9 +21,17 @@ limitations under the License.
 #include "oneflow/xrt/graph/graph.h"
 #include "oneflow/xrt/kernel/op_kernel.h"
 #include "oneflow/xrt/node_util.h"
+#include "oneflow/xrt/passes/pass.h"
 #include "oneflow/xrt/types.h"
 #include "oneflow/xrt/utility/stl.h"
 
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_split.h"
+#include "glog/logging.h"
+
+#include <string>
+#include <vector>
+
 namespace oneflow {
 namespace xrt {
 
@@ -222,7 +223,7 @@ void FoldSubgraphBuilder::BuildXrtLaunchOps() {
     OperatorConf op_conf;
     op_conf.set_name(node->name());
     DeviceType device_type = XrtDeviceToDeviceType(node->device());
-    op_conf.set_device_type(device_type);
+    op_conf.set_device_tag(CHECK_JUST(DeviceTag4DeviceType(device_type)));
 
     XrtLaunchOpConf *launch_conf = op_conf.mutable_xrt_launch_conf();
     // Add inputs and outputs in launch_conf