Unverified Commit 47aba214 authored by Shenghang Tsai, committed by GitHub

Dev pure cpu (#3398)


* cmake dont panic when build cuda

* naive changes

* fix cudaMemcpyKind

* fix acc actor

* fix actor

* fix gdb

* fix vm

* fix work type

* fix cuda type

* fix cuda type

* fix collective backend

* fix global scope

* amp

* rm PackKernelUtil gpu

* fix log

* fix rand

* fix sync size

* fix allocator

* fix vm

* fix kernel

* fix kernel

* fix kernels

* fix kernel

* fix softmax

* fix kernels

* fix reshape kernels

* add workaround

* try fix symbol not found

* fix vm

* fix vm

* fix jpeg

* fix broadcast gpu

* fix broadcast like

* fix transpose

* fix matmul

* fix CopyElemOnGpu

* fix sigmoid

* fix sigmoid and softmax

* fix relu

* fix sparse cross entropy

* fix kernels

* fix tanh

* fix same padding

* fix softmax

* fix undefined symbol: gzgets

* fix CopyField

* fix scalar add

* fix CopyNDGpuImpl

* copier

* fix slice boxing

* fix mem copier

* fix zero like

* fix acc actor

* fix dev pure cpu (#3410)

* add pure cpu message

* add default for src dir in ci

* add cpu workflow

* fix extra_oneflow_cmake_args

* fix link problem add update readme

* fix _GetDefaultConfigProto

* use gpu

* rm default value for gpu dev num

* rm cpu ci

* check in skip code

* refine ci and add back

* add arg for appendix

* fix env arg

* fix env arg

* rm make nccl

* add back to pass gpu ci

* get with cuda in py

* fix fmt

* fix test kernels

* fix activation

* add skips

* fmt

* skip all reduce

* fix assign

* fix bn

* fix bn

* rm tmp_wheel

* rm tmp_wheel

* move clean up

* for CPU-only OneFlow make gpu_device_num equivalent to cpu_device_num

* change warning to info

* only run 3 iters for cpu

* skip bert for cpu

* fix check

* add warning

* add cpu Integration test

* Dev pure cpu test cases (#3422)

* fix test cpu cases

* change os.getenv('ONEFLOW_TEST_CPU_ONLY') == 'True' to os.getenv('ONEFLOW_TEST_CPU_ONLY')

* print traceback for info and warning

* fix test cpu cases

* add more info on why skip check of resnet

* skip gan in cpu

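A note on the `os.getenv('ONEFLOW_TEST_CPU_ONLY')` bullet above: the CPU CI job below sets the variable to `1`, so comparing the raw value against the string `'True'` never matches and the CPU-only skips never fire, while testing the returned value's truthiness works for any non-empty setting. A minimal illustration (not the PR's test code):

```python
import os

# The workflow exports ONEFLOW_TEST_CPU_ONLY=1, not "True".
os.environ["ONEFLOW_TEST_CPU_ONLY"] = "1"

print(os.getenv("ONEFLOW_TEST_CPU_ONLY") == "True")  # False -> skip never triggered
print(bool(os.getenv("ONEFLOW_TEST_CPU_ONLY")))      # True  -> skip triggers as intended
```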
Co-authored-by: tsai <caishenghang@1f-dev.kbaeegfb1x0ubnoznzequyxzve.bx.internal.cloudapp.net>
Co-authored-by: tsai <caishenghang@oneflow.org>
Co-authored-by: oneflow-bot <69100618+oneflow-bot@users.noreply.github.com>
Co-authored-by: JackieWu <wkcn@live.cn>
Co-authored-by: OuYang Yu <xuanjiuye@gmail.com>
parent e5e3eb31
Showing 183 additions and 48 deletions
......@@ -20,6 +20,10 @@ jobs:
runs-on: [self-hosted, linux, gpu]
if: github.event.pull_request.draft == false
steps:
- name: Clean environment
run: |
rm -rf build/third_party
bash ci/build/clean.sh
- uses: actions/checkout@v2
- name: Check license (please run 'make of_format' if failed)
run: |
......@@ -33,8 +37,6 @@ jobs:
- name: Setup environment
run: |
echo $HOSTNAME
rm -rf build/third_party
bash ci/build/clean.sh
bash ci/setup_submodule.sh
- name: Checkout submodules
shell: bash
......@@ -43,6 +45,7 @@ jobs:
git -c "http.extraheader=$auth_header" -c protocol.version=2 submodule update --init --recursive
- name: Build OneFlow
run: |
ONEFLOW_CI_PACKAGE_APPENDIX="_cu102" \
bash ci/build/make.sh
- name: Build docker image for testing
run: |
......@@ -107,3 +110,54 @@ jobs:
if: ${{ always() }}
run: |
bash ci/build/clean.sh
build_and_test_cpu:
runs-on: [self-hosted, linux, gpu]
if: github.event.pull_request.draft == false
steps:
- name: Clean environment
run: |
rm -rf build/third_party
bash ci/build/clean.sh
- uses: actions/checkout@v2
- name: Setup environment
run: |
echo $HOSTNAME
bash ci/setup_submodule.sh
- name: Checkout submodules
shell: bash
run: |
auth_header="$(git config --local --get http.https://github.com/.extraheader)"
git -c "http.extraheader=$auth_header" -c protocol.version=2 submodule update --init --recursive
- name: Build OneFlow
run: |
export ONEFLOW_CI_EXTRA_ONEFLOW_CMAKE_ARGS="-DBUILD_CUDA=OFF"
export ONEFLOW_CI_TMP_DIR=$HOME/ci-tmp-cpu
bash ci/build/make.sh
- name: Build docker image for testing
run: |
bash docker/ci/test/build.sh
- name: Unit test
run: |
docker run --shm-size=8g --rm \
-v $HOME/ci-tmp-cpu:/ci-tmp \
-w $PWD -v $PWD:$PWD -v /dataset:/dataset -v /model_zoo:/model_zoo \
--env ONEFLOW_WHEEL_PATH=/ci-tmp/wheelhouse \
--env ONEFLOW_TEST_CPU_ONLY=1 \
oneflow-test \
bash -c "bash ci/test/try_install.sh && bash ci/test/1node_op_test.sh"
- name: Integration test
run: |
docker run --shm-size=8g --rm \
-v $HOME/ci-tmp-cpu:/ci-tmp \
-w $PWD -v $PWD:$PWD -v /dataset:/dataset -v /model_zoo:/model_zoo \
--env ONEFLOW_WHEEL_PATH=/ci-tmp/wheelhouse \
--env ONEFLOW_TEST_CPU_ONLY=1 \
oneflow-test \
bash -c "bash ci/test/try_install.sh && bash ci/test/1node_model_test.sh"
- name: Clean up files created by root
if: ${{ always() }}
run: |
ONEFLOW_CI_TMP_DIR=$HOME/ci-tmp-cpu \
bash ci/build/clean.sh
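For reference, the same switch can be exercised outside the Docker wrapper; a minimal sketch, assuming a OneFlow wheel is already installed in the current environment:

```bash
# Run the single-node op tests in CPU-only mode, mirroring the CI job above.
ONEFLOW_TEST_CPU_ONLY=1 bash ci/test/1node_op_test.sh
```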
......@@ -122,6 +122,8 @@
make pip_install
```
- For pure CPU build, please add this CMake flag `-DBUILD_CUDA=OFF`.
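A minimal sketch of a CPU-only configure-and-install sequence, assuming an out-of-source `build` directory and that any other flags from the main build instructions are added as needed:

```bash
mkdir -p build && cd build
cmake .. -DBUILD_CUDA=OFF    # pure CPU build: skip all CUDA components
make -j$(nproc)
make pip_install             # install the resulting Python package
```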
### Troubleshooting
Please refer to [troubleshooting](docs/source/troubleshooting.md) for common issues you might encounter when compiling and running OneFlow.
......
set -ex
tmp_dir=${ONEFLOW_CI_TMP_DIR:-"$HOME/ci-tmp"}
docker run --rm \
-v $HOME/ci-tmp:/ci-tmp \
-w $HOME/ci-tmp:/ci-tmp busybox rm -rf /ci-tmp/wheelhouse
-v $tmp_dir:/ci-tmp \
-w $tmp_dir:/ci-tmp busybox rm -rf /ci-tmp/wheelhouse
docker run --rm -v $PWD:/p -w /p busybox rm -rf tmp_wheel
docker run --rm -v $PWD:/p -w /p busybox rm -rf build
......@@ -2,6 +2,8 @@ set -ex
src_dir=${ONEFLOW_SRC_DIR:-"$PWD"}
tmp_dir=${ONEFLOW_CI_TMP_DIR:-"$HOME/ci-tmp"}
extra_oneflow_cmake_args=${ONEFLOW_CI_EXTRA_ONEFLOW_CMAKE_ARGS:-""}
package_appendix=${ONEFLOW_CI_PACKAGE_APPENDIX:-""}
mkdir -p $tmp_dir
docker_tag=${ONEFLOW_CI_DOCKER_TAG:-"oneflow:ci-manylinux2014-cuda10.2"}
......@@ -35,7 +37,8 @@ function build() {
"$docker_tag" \
/oneflow-src/docker/package/manylinux/build_wheel.sh \
--python3.6 \
--package-name oneflow_cu102
--package-name oneflow${package_appendix} \
$extra_oneflow_cmake_args
}
set +e
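Taken together, these hooks let one script drive both CI jobs; the two invocations look roughly like this (values copied from the workflow above):

```bash
# CPU-only build, as in the build_and_test_cpu job:
ONEFLOW_CI_EXTRA_ONEFLOW_CMAKE_ARGS="-DBUILD_CUDA=OFF" \
ONEFLOW_CI_TMP_DIR=$HOME/ci-tmp-cpu \
bash ci/build/make.sh

# CUDA build, keeping the cu102 wheel name via the package appendix:
ONEFLOW_CI_PACKAGE_APPENDIX="_cu102" bash ci/build/make.sh
```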
......
set -x
set -e
python3 ci/setup_submodule.py --oneflow_src_local_path=${ONEFLOW_CI_SRC_DIR}
src_dir=${ONEFLOW_CI_SRC_DIR:-"$HOME/oneflow"}
python3 ci/setup_submodule.py --oneflow_src_local_path=$src_dir
git submodule sync
git submodule update --init --recursive
# main cpp
list(APPEND of_main_cc ${PROJECT_SOURCE_DIR}/oneflow/core/job/oneflow_worker.cpp)
# TODO(tsai): skip for now, fail to link when building CPU only
if (BUILD_CUDA)
list(APPEND of_main_cc ${PROJECT_SOURCE_DIR}/oneflow/core/job/oneflow_worker.cpp)
endif()
function(oneflow_add_executable)
if (BUILD_CUDA)
cuda_add_executable(${ARGV})
......@@ -291,6 +293,14 @@ add_custom_target(of_pyscript_copy ALL
COMMAND ${Python_EXECUTABLE} "${PROJECT_SOURCE_DIR}/tools/generate_oneflow_symbols_export_file.py"
"${PROJECT_SOURCE_DIR}" "${of_pyscript_dir}/oneflow/python/__export_symbols__.py")
file(GLOB_RECURSE oneflow_all_python_file "${PROJECT_SOURCE_DIR}/oneflow/python/*.py")
if (BUILD_CUDA)
add_custom_command(TARGET of_pyscript_copy POST_BUILD
COMMAND echo "with_cuda=True" >> "${of_pyscript_dir}/oneflow/python/compatibility.py")
else()
add_custom_command(TARGET of_pyscript_copy POST_BUILD
COMMAND echo "with_cuda=False" >> "${of_pyscript_dir}/oneflow/python/compatibility.py")
endif()
copy_files("${oneflow_all_python_file}" "${PROJECT_SOURCE_DIR}" "${of_pyscript_dir}" of_pyscript_copy)
file(WRITE ${of_pyscript_dir}/oneflow/python/framework/sysconfig_gen.py "generated_compile_flags = []\n")
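The generated `compatibility.py` can then be consulted from Python ("get with cuda in py" in the commit log); a minimal sketch of a consumer, assuming the module is importable at that path (the PR's actual accessor may differ):

```python
# Hypothetical consumer of the flag written by the CMake rule above.
from oneflow.python.compatibility import with_cuda

if not with_cuda:
    print("CPU-only OneFlow build detected; CUDA tests will be skipped")
```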
......@@ -334,28 +344,29 @@ endforeach()
# build test
if(BUILD_TESTING)
if(NOT BUILD_CUDA)
message(FATAL_ERROR "BUILD_TESTING without BUILD_CUDA")
endif()
if (of_all_test_cc)
oneflow_add_executable(oneflow_testexe ${of_all_test_cc})
target_link_libraries(oneflow_testexe ${of_libs} ${oneflow_third_party_libs})
set_target_properties(oneflow_testexe PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/bin")
add_test(NAME oneflow_test COMMAND oneflow_testexe)
# foreach(cc ${of_all_test_cc})
# get_filename_component(test_name ${cc} NAME_WE)
# string(CONCAT test_exe_name ${test_name} exe)
# oneflow_add_executable(${test_exe_name} ${cc})
# target_link_libraries(${test_exe_name} ${of_libs} ${oneflow_third_party_libs})
# endforeach()
endif()
if (of_separate_test_cc)
foreach(cc ${of_separate_test_cc})
get_filename_component(test_name ${cc} NAME_WE)
string(CONCAT test_exe_name ${test_name} exe)
oneflow_add_executable(${test_exe_name} ${cc})
target_link_libraries(${test_exe_name} ${of_libs} ${oneflow_third_party_libs})
endforeach()
if(BUILD_CUDA)
if (of_all_test_cc)
oneflow_add_executable(oneflow_testexe ${of_all_test_cc})
target_link_libraries(oneflow_testexe ${of_libs} ${oneflow_third_party_libs})
set_target_properties(oneflow_testexe PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/bin")
add_test(NAME oneflow_test COMMAND oneflow_testexe)
# foreach(cc ${of_all_test_cc})
# get_filename_component(test_name ${cc} NAME_WE)
# string(CONCAT test_exe_name ${test_name} exe)
# oneflow_add_executable(${test_exe_name} ${cc})
# target_link_libraries(${test_exe_name} ${of_libs} ${oneflow_third_party_libs})
# endforeach()
endif()
if (of_separate_test_cc)
foreach(cc ${of_separate_test_cc})
get_filename_component(test_name ${cc} NAME_WE)
string(CONCAT test_exe_name ${test_name} exe)
oneflow_add_executable(${test_exe_name} ${cc})
target_link_libraries(${test_exe_name} ${of_libs} ${oneflow_third_party_libs})
endforeach()
endif()
else()
message(ERROR "BUILD_TESTING=ON has no effect when BUILD_CUDA=OFF")
endif()
endif()
......
......@@ -103,12 +103,12 @@ set(oneflow_third_party_libs
${GOOGLEMOCK_STATIC_LIBRARIES}
${PROTOBUF_STATIC_LIBRARIES}
${GRPC_STATIC_LIBRARIES}
${ZLIB_STATIC_LIBRARIES}
${farmhash_STATIC_LIBRARIES}
${BLAS_LIBRARIES}
${LIBJPEG_STATIC_LIBRARIES}
${OPENCV_STATIC_LIBRARIES}
${COCOAPI_STATIC_LIBRARIES}
${LIBJPEG_STATIC_LIBRARIES}
${ZLIB_STATIC_LIBRARIES}
)
if (NOT WITH_XLA)
......
......@@ -58,8 +58,8 @@ if [[ $SKIP_THIRD_PARTY != 1 ]]; then
cmake -DTHIRD_PARTY=ON \
$COMMON_CMAKE_ARGS \
-DONEFLOW=OFF \
$EXTRA_ONEFLOW_CMAKE_ARGS \
$ONEFLOW_SRC_DIR
make -j nccl
make -j`nproc` prepare_oneflow_third_party
popd
......@@ -86,7 +86,7 @@ do
cmake -DTHIRD_PARTY=OFF -DONEFLOW=ON\
$COMMON_CMAKE_ARGS \
-DPython3_ROOT_DIR=$PY_ROOT \
$EXTRA_ONEFLOW_CMAKE_ARGS \
$EXTRA_ONEFLOW_CMAKE_ARGS \
$ONEFLOW_SRC_DIR
cmake --build . -j `nproc`
popd
......
......@@ -21,12 +21,7 @@ void AccumulateCompActor::Init(const TaskProto& task_proto, int32_t max_acc_cnt,
using namespace std::placeholders;
order_ = order;
if (GetDeviceType() == DeviceType::kCPU) {
cpy_func_ = std::bind(Memcpy<DeviceType::kCPU>, _1, _2, _3, _4
#ifdef WITH_CUDA
,
cudaMemcpyHostToHost
#endif
);
cpy_func_ = std::bind(Memcpy<DeviceType::kCPU>, _1, _2, _3, _4, cudaMemcpyHostToHost);
} else {
#ifdef WITH_CUDA
cpy_func_ = std::bind(Memcpy<DeviceType::kGPU>, _1, _2, _3, _4, cudaMemcpyDeviceToDevice);
......@@ -54,8 +49,12 @@ void AccumulateCompActor::Act() {
Memset<DeviceType::kCPU>(kernel_ctx.device_ctx, out_blob->mut_dptr(), 0,
out_blob->ByteSizeOfBlobBody());
} else if (GetDeviceType() == DeviceType::kGPU) {
#ifdef WITH_CUDA
Memset<DeviceType::kGPU>(kernel_ctx.device_ctx, out_blob->mut_dptr(), 0,
out_blob->ByteSizeOfBlobBody());
#else
UNIMPLEMENTED();
#endif
} else {
UNIMPLEMENTED();
}
......
......@@ -236,6 +236,7 @@ void Actor::InitDeviceCtx(const ThreadCtx& thread_ctx) {
device_ctx_.reset(new CpuDeviceCtx());
break;
}
#ifdef WITH_CUDA
case DeviceType::kGPU: {
CudaStreamHandle* cuda_handle = nullptr;
CHECK_EQ(GetLocalWorkStreamId(), 0);
......@@ -243,6 +244,7 @@ void Actor::InitDeviceCtx(const ThreadCtx& thread_ctx) {
device_ctx_.reset(new CudaDeviceCtx(cuda_handle));
break;
}
#endif
default: { UNIMPLEMENTED(); }
}
}
......
......@@ -18,7 +18,9 @@ limitations under the License.
#include <type_traits>
#include <utility>
#ifdef WITH_CUDA
#include <cuda_fp16.h>
#endif // WITH_CUDA
#include "oneflow/core/common/cblas.h"
#include "oneflow/core/common/preprocessor.h"
......
......@@ -32,14 +32,22 @@ namespace {
static char* MallocThenCpyD2H(const char* gpu_src, size_t size) {
char* cpu_dst = reinterpret_cast<char*>(malloc(size));
#ifdef WITH_CUDA
cudaMemcpy(cpu_dst, gpu_src, size, cudaMemcpyDeviceToHost);
#else
UNIMPLEMENTED();
#endif
return cpu_dst;
}
static void CpyH2DThenFree(char* gpu_dst, char* cpu_src, size_t size) {
#ifdef WITH_CUDA
cudaMemcpy(gpu_dst, cpu_src, size, cudaMemcpyHostToDevice);
#else
UNIMPLEMENTED();
#endif
free(cpu_src);
}
} // namespace
template<typename T>
void LoadFromStrFile(T* buf, const std::string& file_name) {
......
......@@ -123,6 +123,16 @@ class CudaCurrentDeviceGuard final {
} // namespace oneflow
#else
namespace oneflow {
enum class CudaWorkType {};
inline size_t GetCudaWorkTypeSize() { return 0; }
} // namespace oneflow
#endif // WITH_CUDA
#endif // ONEFLOW_CORE_DEVICE_CUDA_UTIL_H_
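The CPU-only stub above exists so that call sites need no `#ifdef`; a hypothetical caller, only to illustrate the pattern (not code from this PR):

```cpp
#include "oneflow/core/device/cuda_util.h"

// With BUILD_CUDA=OFF the stub returns 0, so this compiles and behaves
// sensibly in both build modes without an #ifdef at the call site.
size_t TotalDeviceWorkStreams(size_t cpu_streams) {
  return cpu_streams + oneflow::GetCudaWorkTypeSize();
}
```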
......@@ -251,13 +251,11 @@ void CudaAsyncMemoryCopier::CopyND(DeviceCtx* ctx, void* dst, const void* src,
UNIMPLEMENTED();
}
}
#endif
REGISTER_DEFAULT_MEMORY_COPIER(DeviceType::kCPU, []() { return new HostMemoryCopier(); });
#ifdef WITH_CUDA
REGISTER_DEFAULT_MEMORY_COPIER(DeviceType::kGPU, []() { return new CudaAsyncMemoryCopier(); });
#endif
MemoryCopier* NewDefaultMemoryCopier(DeviceType device_type) {
......@@ -266,8 +264,6 @@ MemoryCopier* NewDefaultMemoryCopier(DeviceType device_type) {
->Create();
}
#endif
#define SPECIALIZE_COPY_ELEM(dtype) \
template void MemoryCopier::CopyElem<dtype>(DeviceCtx * ctx, void* dst, const void* src, \
const MemoryCopyNdDesc& desc) const;
......
......@@ -35,8 +35,10 @@ struct MemoryCopyNdDesc {
template<int32_t NDIMS>
void CopyNDCpuImpl(DeviceCtx* ctx, void* dst, const void* src, const MemoryCopyNdDesc& desc);
#ifdef WITH_CUDA
template<int32_t NDIMS>
void CopyNDGpuImpl(DeviceCtx* ctx, void* dst, const void* src, const MemoryCopyNdDesc& desc);
#endif
class MemoryCopier {
public:
......
......@@ -35,6 +35,7 @@ FLAT_MSG_VIEW_END(PinBlobInstruction);
} // namespace
#ifdef WITH_CUDA
class CudaHostRegisterBlobInstructionType final : public vm::InstructionType {
public:
CudaHostRegisterBlobInstructionType() = default;
......@@ -84,6 +85,7 @@ class CudaHostUnregisterBlobInstructionType final : public vm::InstructionType {
};
COMMAND(
vm::RegisterInstructionType<CudaHostUnregisterBlobInstructionType>("CudaHostUnregisterBlob"));
#endif
} // namespace eager
} // namespace oneflow
......@@ -13,6 +13,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifdef WITH_CUDA
#include "oneflow/core/common/util.h"
#include "oneflow/core/job/job_desc.h"
#include "oneflow/core/eager/opkernel_object.h"
......@@ -143,3 +145,5 @@ COMMAND(vm::RegisterInstructionType<GpuFeedBlobInstructionType>("gpu.FeedBlob"))
} // namespace eager
} // namespace oneflow
#endif
......@@ -83,6 +83,7 @@ Maybe<void> SliceBoxingSubTskGphBuilder::Build(
return Error::BoxingNotSupported();
}
const auto GetBoxingGpuThrdId = [](const int64_t dev_id, CudaWorkType work_type) -> int64_t {
#ifdef WITH_CUDA
if (work_type == CudaWorkType::kCopyH2D) {
return Global<IDMgr>::Get()->GetGpuH2DThrdId(dev_id);
} else if (work_type == CudaWorkType::kCopyD2H) {
......@@ -90,7 +91,11 @@ Maybe<void> SliceBoxingSubTskGphBuilder::Build(
} else {
return Global<IDMgr>::Get()->GetGpuMixThrdId(dev_id);
}
#else
UNIMPLEMENTED();
#endif
};
const auto NewEdge = [&ctx]() -> TaskEdge* { return ctx->task_graph()->NewEdge(); };
const auto CreateBoxingNode121 = [&ctx, &lbi, &GetBoxingGpuThrdId](
const ParallelDesc& pd, const int64_t parallel_id,
......@@ -102,7 +107,11 @@ Maybe<void> SliceBoxingSubTskGphBuilder::Build(
if (pd.device_type() == DeviceType::kCPU) {
thrd_id = Global<IDMgr>::Get()->PickCpuThrdIdEvenly(machine_id);
} else if (pd.device_type() == DeviceType::kGPU) {
#ifdef WITH_CUDA
thrd_id = GetBoxingGpuThrdId(pd.DeviceIdForParallelId(parallel_id), CudaWorkType::kCopyH2D);
#else
UNIMPLEMENTED();
#endif
} else {
UNIMPLEMENTED();
}
......@@ -118,7 +127,11 @@ Maybe<void> SliceBoxingSubTskGphBuilder::Build(
if (src_node->device_type() == DeviceType::kCPU) {
thrd_id = Global<IDMgr>::Get()->PickCpuThrdIdEvenly(src_node->machine_id());
} else if (src_node->device_type() == DeviceType::kGPU) {
#ifdef WITH_CUDA
thrd_id = GetBoxingGpuThrdId(src_node->GpuPhyId(), CudaWorkType::kCopyD2H);
#else
UNIMPLEMENTED();
#endif
} else {
UNIMPLEMENTED();
}
......@@ -235,9 +248,13 @@ Maybe<void> SliceBoxingSubTskGphBuilder::Build(
if (in_pd.device_type() == DeviceType::kCPU) {
local_concat_thrd_id = Global<IDMgr>::Get()->PickCpuThrdIdEvenly(in_machine_id);
} else if (in_pd.device_type() == DeviceType::kGPU) {
#ifdef WITH_CUDA
local_concat_thrd_id = GetBoxingGpuThrdId(
in_nodes.at(in_parallel_ids.at(out_id % in_parallel_ids.size()))->GpuPhyId(),
CudaWorkType::kCopyD2H);
#else
UNIMPLEMENTED();
#endif
}
local_concat_node->Init(lbi, concat_slice, kSliceBoxingTaskModeCopy, in_machine_id,
local_concat_thrd_id, Global<IDMgr>::Get()->CpuMemZoneId());
......@@ -293,9 +310,13 @@ Maybe<void> SliceBoxingSubTskGphBuilder::Build(
if (in_pd.device_type() == DeviceType::kCPU) {
local_add_thrd_id = Global<IDMgr>::Get()->PickCpuThrdIdEvenly(in_machine_id);
} else if (in_pd.device_type() == DeviceType::kGPU) {
#ifdef WITH_CUDA
local_add_thrd_id = GetBoxingGpuThrdId(
in_nodes.at(in_parallel_ids.at(out_id % in_parallel_ids.size()))->GpuPhyId(),
CudaWorkType::kCopyD2H);
#else
UNIMPLEMENTED();
#endif
}
local_add_node->Init(lbi, out_slice, kSliceBoxingTaskModeAdd, in_machine_id,
local_add_thrd_id, Global<IDMgr>::Get()->CpuMemZoneId());
......@@ -337,8 +358,12 @@ Maybe<void> SliceBoxingSubTskGphBuilder::Build(
if (in_pd.device_type() == DeviceType::kCPU) {
local_add_thrd_id = Global<IDMgr>::Get()->PickCpuThrdIdEvenly(in_machine_id);
} else if (in_pd.device_type() == DeviceType::kGPU) {
#ifdef WITH_CUDA
local_add_thrd_id = GetBoxingGpuThrdId(in_nodes.at(in_ids_on_machine.front())->GpuPhyId(),
CudaWorkType::kCopyH2D);
#else
UNIMPLEMENTED();
#endif
}
local_add_node->Init(lbi, slice, kSliceBoxingTaskModeAdd, in_machine_id, local_add_thrd_id);
FOR_RANGE(int64_t, i, 0, in_ids_on_machine.size()) {
......
......@@ -30,7 +30,13 @@ class CaseCompTaskNode final : public CompTaskNode {
void ConsumeAllRegsts() override;
TaskType GetTaskType() const override { return TaskType::kCase; }
CudaWorkType GetCudaWorkType() const override { return CudaWorkType::kCompute; }
CudaWorkType GetCudaWorkType() const override {
#ifdef WITH_CUDA
return CudaWorkType::kCompute;
#else
UNIMPLEMENTED();
#endif
}
private:
void BuildExecGphAndRegst() override;
......
......@@ -29,7 +29,13 @@ class CompTaskNode : public TaskNode {
CompTaskNode() = default;
virtual ~CompTaskNode() = default;
virtual CudaWorkType GetCudaWorkType() const { return CudaWorkType::kCompute; }
virtual CudaWorkType GetCudaWorkType() const {
#ifdef WITH_CUDA
return CudaWorkType::kCompute;
#else
UNIMPLEMENTED();
#endif
}
virtual void ToProto(TaskProto*) override;
// parallel_ctx_
......