diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 33ea06d9a4ff437b68480c4a68ef8df9f2085617..f6317b6374b9c1291e543b435712547ff131b72a 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -20,6 +20,10 @@ jobs: runs-on: [self-hosted, linux, gpu] if: github.event.pull_request.draft == false steps: + - name: Clean environment + run: | + rm -rf build/third_party + bash ci/build/clean.sh - uses: actions/checkout@v2 - name: Check license (please run 'make of_format' if failed) run: | @@ -33,8 +37,6 @@ jobs: - name: Setup environment run: | echo $HOSTNAME - rm -rf build/third_party - bash ci/build/clean.sh bash ci/setup_submodule.sh - name: Checkout submodules shell: bash @@ -43,6 +45,7 @@ jobs: git -c "http.extraheader=$auth_header" -c protocol.version=2 submodule update --init --recursive - name: Build OneFlow run: | + ONEFLOW_CI_PACKAGE_APPENDIX="_cu102" \ bash ci/build/make.sh - name: Build docker image for testing run: | @@ -107,3 +110,54 @@ jobs: if: ${{ always() }} run: | bash ci/build/clean.sh + + build_and_test_cpu: + + runs-on: [self-hosted, linux, gpu] + if: github.event.pull_request.draft == false + steps: + - name: Clean environment + run: | + rm -rf build/third_party + bash ci/build/clean.sh + - uses: actions/checkout@v2 + - name: Setup environment + run: | + echo $HOSTNAME + bash ci/setup_submodule.sh + - name: Checkout submodules + shell: bash + run: | + auth_header="$(git config --local --get http.https://github.com/.extraheader)" + git -c "http.extraheader=$auth_header" -c protocol.version=2 submodule update --init --recursive + - name: Build OneFlow + run: | + export ONEFLOW_CI_EXTRA_ONEFLOW_CMAKE_ARGS="-DBUILD_CUDA=OFF" + export ONEFLOW_CI_TMP_DIR=$HOME/ci-tmp-cpu + bash ci/build/make.sh + - name: Build docker image for testing + run: | + bash docker/ci/test/build.sh + - name: Unit test + run: | + docker run --shm-size=8g --rm \ + -v $HOME/ci-tmp-cpu:/ci-tmp \ + -w $PWD -v $PWD:$PWD -v /dataset:/dataset -v /model_zoo:/model_zoo \ + --env ONEFLOW_WHEEL_PATH=/ci-tmp/wheelhouse \ + --env ONEFLOW_TEST_CPU_ONLY=1 \ + oneflow-test \ + bash -c "bash ci/test/try_install.sh && bash ci/test/1node_op_test.sh" + - name: Integration test + run: | + docker run --shm-size=8g --rm \ + -v $HOME/ci-tmp-cpu:/ci-tmp \ + -w $PWD -v $PWD:$PWD -v /dataset:/dataset -v /model_zoo:/model_zoo \ + --env ONEFLOW_WHEEL_PATH=/ci-tmp/wheelhouse \ + --env ONEFLOW_TEST_CPU_ONLY=1 \ + oneflow-test \ + bash -c "bash ci/test/try_install.sh && bash ci/test/1node_model_test.sh" + - name: Clean up files created by root + if: ${{ always() }} + run: | + ONEFLOW_CI_TMP_DIR=$HOME/ci-tmp-cpu \ + bash ci/build/clean.sh diff --git a/README.md b/README.md index f89f23a50193caa6c3e443b0a1cd3e1bdb9c4127..c7c9f4db930898e92207669a1d0597c3f42cc975 100644 --- a/README.md +++ b/README.md @@ -122,6 +122,8 @@ make pip_install ``` + - For pure CPU build, please add this CMake flag `-DBUILD_CUDA=OFF`. + ### Troubleshooting Please refer to [troubleshooting](docs/source/troubleshooting.md) for common issues you might encounter when compiling and running OneFlow. diff --git a/ci/build/clean.sh b/ci/build/clean.sh index b26d4febd3ce7a9538e66958a53897c5a630b0c5..d209cd081696102f3611fe77aa6788eb52afa77f 100644 --- a/ci/build/clean.sh +++ b/ci/build/clean.sh @@ -1,5 +1,7 @@ set -ex +tmp_dir=${ONEFLOW_CI_TMP_DIR:-"$HOME/ci-tmp"} docker run --rm \ - -v $HOME/ci-tmp:/ci-tmp \ - -w $HOME/ci-tmp:/ci-tmp busybox rm -rf /ci-tmp/wheelhouse + -v $tmp_dir:/ci-tmp \ + -w $tmp_dir:/ci-tmp busybox rm -rf /ci-tmp/wheelhouse +docker run --rm -v $PWD:/p -w /p busybox rm -rf tmp_wheel docker run --rm -v $PWD:/p -w /p busybox rm -rf build diff --git a/ci/build/make.sh b/ci/build/make.sh index 44d3bce20be0a0eadfd23c59cfaf68647521d741..f7c6bd6b0a2cdb4b6d5c7393ff8eb34847ed04ab 100644 --- a/ci/build/make.sh +++ b/ci/build/make.sh @@ -2,6 +2,8 @@ set -ex src_dir=${ONEFLOW_SRC_DIR:-"$PWD"} tmp_dir=${ONEFLOW_CI_TMP_DIR:-"$HOME/ci-tmp"} +extra_oneflow_cmake_args=${ONEFLOW_CI_EXTRA_ONEFLOW_CMAKE_ARGS:-""} +package_appendix=${ONEFLOW_CI_PACKAGE_APPENDIX:-""} mkdir -p $tmp_dir docker_tag=${ONEFLOW_CI_DOCKER_TAG:-"oneflow:ci-manylinux2014-cuda10.2"} @@ -35,7 +37,8 @@ function build() { "$docker_tag" \ /oneflow-src/docker/package/manylinux/build_wheel.sh \ --python3.6 \ - --package-name oneflow_cu102 + --package-name oneflow${package_appendix} \ + $extra_oneflow_cmake_args } set +e diff --git a/ci/setup_submodule.sh b/ci/setup_submodule.sh index 1e5560652dc05578a1bb722c07a1e12c090d0110..5fdd8c2a2689779e0edd000dd2dd7410b4a8c1b5 100644 --- a/ci/setup_submodule.sh +++ b/ci/setup_submodule.sh @@ -1,5 +1,6 @@ set -x set -e -python3 ci/setup_submodule.py --oneflow_src_local_path=${ONEFLOW_CI_SRC_DIR} +src_dir=${ONEFLOW_CI_SRC_DIR:-"$HOME/oneflow"} +python3 ci/setup_submodule.py --oneflow_src_local_path=$src_dir git submodule sync git submodule update --init --recursive diff --git a/cmake/oneflow.cmake b/cmake/oneflow.cmake index 4a3adbff6a51eeb3745a4a174513ff26a02dd95b..79ba81950b4b60fbc5ff1c6921b24703b7d0ea05 100644 --- a/cmake/oneflow.cmake +++ b/cmake/oneflow.cmake @@ -1,6 +1,8 @@ # main cpp -list(APPEND of_main_cc ${PROJECT_SOURCE_DIR}/oneflow/core/job/oneflow_worker.cpp) - +# TODO(tsai): skip for now, fail to link when building CPU only +if (BUILD_CUDA) + list(APPEND of_main_cc ${PROJECT_SOURCE_DIR}/oneflow/core/job/oneflow_worker.cpp) +endif() function(oneflow_add_executable) if (BUILD_CUDA) cuda_add_executable(${ARGV}) @@ -291,6 +293,14 @@ add_custom_target(of_pyscript_copy ALL COMMAND ${Python_EXECUTABLE} "${PROJECT_SOURCE_DIR}/tools/generate_oneflow_symbols_export_file.py" "${PROJECT_SOURCE_DIR}" "${of_pyscript_dir}/oneflow/python/__export_symbols__.py") file(GLOB_RECURSE oneflow_all_python_file "${PROJECT_SOURCE_DIR}/oneflow/python/*.py") +if (BUILD_CUDA) + add_custom_command(TARGET of_pyscript_copy POST_BUILD + COMMAND echo "with_cuda=True" >> "${of_pyscript_dir}/oneflow/python/compatibility.py") +else() + add_custom_command(TARGET of_pyscript_copy POST_BUILD + COMMAND echo "with_cuda=False" >> "${of_pyscript_dir}/oneflow/python/compatibility.py") +endif() + copy_files("${oneflow_all_python_file}" "${PROJECT_SOURCE_DIR}" "${of_pyscript_dir}" of_pyscript_copy) file(WRITE ${of_pyscript_dir}/oneflow/python/framework/sysconfig_gen.py "generated_compile_flags = []\n") @@ -334,28 +344,29 @@ endforeach() # build test if(BUILD_TESTING) - if(NOT BUILD_CUDA) - message(FATAL_ERROR "BUILD_TESTING without BUILD_CUDA") - endif() - if (of_all_test_cc) - oneflow_add_executable(oneflow_testexe ${of_all_test_cc}) - target_link_libraries(oneflow_testexe ${of_libs} ${oneflow_third_party_libs}) - set_target_properties(oneflow_testexe PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/bin") - add_test(NAME oneflow_test COMMAND oneflow_testexe) - # foreach(cc ${of_all_test_cc}) - # get_filename_component(test_name ${cc} NAME_WE) - # string(CONCAT test_exe_name ${test_name} exe) - # oneflow_add_executable(${test_exe_name} ${cc}) - # target_link_libraries(${test_exe_name} ${of_libs} ${oneflow_third_party_libs}) - # endforeach() - endif() - if (of_separate_test_cc) - foreach(cc ${of_separate_test_cc}) - get_filename_component(test_name ${cc} NAME_WE) - string(CONCAT test_exe_name ${test_name} exe) - oneflow_add_executable(${test_exe_name} ${cc}) - target_link_libraries(${test_exe_name} ${of_libs} ${oneflow_third_party_libs}) - endforeach() + if(BUILD_CUDA) + if (of_all_test_cc) + oneflow_add_executable(oneflow_testexe ${of_all_test_cc}) + target_link_libraries(oneflow_testexe ${of_libs} ${oneflow_third_party_libs}) + set_target_properties(oneflow_testexe PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/bin") + add_test(NAME oneflow_test COMMAND oneflow_testexe) + # foreach(cc ${of_all_test_cc}) + # get_filename_component(test_name ${cc} NAME_WE) + # string(CONCAT test_exe_name ${test_name} exe) + # oneflow_add_executable(${test_exe_name} ${cc}) + # target_link_libraries(${test_exe_name} ${of_libs} ${oneflow_third_party_libs}) + # endforeach() + endif() + if (of_separate_test_cc) + foreach(cc ${of_separate_test_cc}) + get_filename_component(test_name ${cc} NAME_WE) + string(CONCAT test_exe_name ${test_name} exe) + oneflow_add_executable(${test_exe_name} ${cc}) + target_link_libraries(${test_exe_name} ${of_libs} ${oneflow_third_party_libs}) + endforeach() + endif() + else() + message(ERROR "BUILD_TESTING=ON has no effect when BUILD_CUDA=OFF") endif() endif() diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index dee2d607ebd1e116d2323321518a169423066f6e..a908ecb0bc87240835f21fa4f304423558cb0fb0 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -103,12 +103,12 @@ set(oneflow_third_party_libs ${GOOGLEMOCK_STATIC_LIBRARIES} ${PROTOBUF_STATIC_LIBRARIES} ${GRPC_STATIC_LIBRARIES} - ${ZLIB_STATIC_LIBRARIES} ${farmhash_STATIC_LIBRARIES} ${BLAS_LIBRARIES} - ${LIBJPEG_STATIC_LIBRARIES} ${OPENCV_STATIC_LIBRARIES} ${COCOAPI_STATIC_LIBRARIES} + ${LIBJPEG_STATIC_LIBRARIES} + ${ZLIB_STATIC_LIBRARIES} ) if (NOT WITH_XLA) diff --git a/docker/package/manylinux/build_wheel.sh b/docker/package/manylinux/build_wheel.sh index 96ed699668cdce430ec7671440a6249832a68197..3b3af072c44df278b334ae6300f73eb8a8b6528b 100755 --- a/docker/package/manylinux/build_wheel.sh +++ b/docker/package/manylinux/build_wheel.sh @@ -58,8 +58,8 @@ if [[ $SKIP_THIRD_PARTY != 1 ]]; then cmake -DTHIRD_PARTY=ON \ $COMMON_CMAKE_ARGS \ -DONEFLOW=OFF \ + $EXTRA_ONEFLOW_CMAKE_ARGS \ $ONEFLOW_SRC_DIR - make -j nccl make -j`nproc` prepare_oneflow_third_party popd @@ -86,7 +86,7 @@ do cmake -DTHIRD_PARTY=OFF -DONEFLOW=ON\ $COMMON_CMAKE_ARGS \ -DPython3_ROOT_DIR=$PY_ROOT \ - $EXTRA_ONEFLOW_CMAKE_ARGS \ + $EXTRA_ONEFLOW_CMAKE_ARGS \ $ONEFLOW_SRC_DIR cmake --build . -j `nproc` popd diff --git a/oneflow/core/actor/accumulate_compute_actor.cpp b/oneflow/core/actor/accumulate_compute_actor.cpp index 6388f8be9ad1be047f06b80fc749f63452efbd79..b13e385525480c9b7584f92fd22101426b8b4205 100644 --- a/oneflow/core/actor/accumulate_compute_actor.cpp +++ b/oneflow/core/actor/accumulate_compute_actor.cpp @@ -21,12 +21,7 @@ void AccumulateCompActor::Init(const TaskProto& task_proto, int32_t max_acc_cnt, using namespace std::placeholders; order_ = order; if (GetDeviceType() == DeviceType::kCPU) { - cpy_func_ = std::bind(Memcpy<DeviceType::kCPU>, _1, _2, _3, _4 -#ifdef WITH_CUDA - , - cudaMemcpyHostToHost -#endif - ); + cpy_func_ = std::bind(Memcpy<DeviceType::kCPU>, _1, _2, _3, _4, cudaMemcpyHostToHost); } else { #ifdef WITH_CUDA cpy_func_ = std::bind(Memcpy<DeviceType::kGPU>, _1, _2, _3, _4, cudaMemcpyDeviceToDevice); @@ -54,8 +49,12 @@ void AccumulateCompActor::Act() { Memset<DeviceType::kCPU>(kernel_ctx.device_ctx, out_blob->mut_dptr(), 0, out_blob->ByteSizeOfBlobBody()); } else if (GetDeviceType() == DeviceType::kGPU) { +#ifdef WITH_CUDA Memset<DeviceType::kGPU>(kernel_ctx.device_ctx, out_blob->mut_dptr(), 0, out_blob->ByteSizeOfBlobBody()); +#else + UNIMPLEMENTED(); +#endif } else { UNIMPLEMENTED(); } diff --git a/oneflow/core/actor/actor.cpp b/oneflow/core/actor/actor.cpp index 77eb16a12ed814c8e6a7004749bd398eb9cca5a1..27f69025caa0367963c39ae64e65c299d3f2f75c 100644 --- a/oneflow/core/actor/actor.cpp +++ b/oneflow/core/actor/actor.cpp @@ -236,6 +236,7 @@ void Actor::InitDeviceCtx(const ThreadCtx& thread_ctx) { device_ctx_.reset(new CpuDeviceCtx()); break; } +#ifdef WITH_CUDA case DeviceType::kGPU: { CudaStreamHandle* cuda_handle = nullptr; CHECK_EQ(GetLocalWorkStreamId(), 0); @@ -243,6 +244,7 @@ void Actor::InitDeviceCtx(const ThreadCtx& thread_ctx) { device_ctx_.reset(new CudaDeviceCtx(cuda_handle)); break; } +#endif default: { UNIMPLEMENTED(); } } } diff --git a/oneflow/core/common/blas.h b/oneflow/core/common/blas.h index d4f0c0f87306c291100bf80e7d63f77a6c7acfd2..4184396db07952fdd8c40ee03b187721ba2fb799 100644 --- a/oneflow/core/common/blas.h +++ b/oneflow/core/common/blas.h @@ -18,7 +18,9 @@ limitations under the License. #include <type_traits> #include <utility> +#ifdef WITH_CUDA #include <cuda_fp16.h> +#endif // WITH_CUDA #include "oneflow/core/common/cblas.h" #include "oneflow/core/common/preprocessor.h" diff --git a/oneflow/core/common/gdb.cpp b/oneflow/core/common/gdb.cpp index 27d16b22b93f87b6c3a9b9b6a48eccdf088c7d6f..d38ccb931252a6e7f4b15894d74272bdb7c3445a 100644 --- a/oneflow/core/common/gdb.cpp +++ b/oneflow/core/common/gdb.cpp @@ -32,14 +32,22 @@ namespace { static char* MallocThenCpyD2H(const char* gpu_src, size_t size) { char* cpu_dst = reinterpret_cast<char*>(malloc(size)); +#ifdef WITH_CUDA cudaMemcpy(cpu_dst, gpu_src, size, cudaMemcpyDeviceToHost); +#else + UNIMPLEMENTED(); +#endif return cpu_dst; } static void CpyH2DThenFree(char* gpu_dst, char* cpu_src, size_t size) { +#ifdef WITH_CUDA cudaMemcpy(gpu_dst, cpu_src, size, cudaMemcpyHostToDevice); +#else + UNIMPLEMENTED(); +#endif free(cpu_src); -} +} // namespace template<typename T> void LoadFromStrFile(T* buf, const std::string& file_name) { diff --git a/oneflow/core/device/cuda_util.h b/oneflow/core/device/cuda_util.h index 6b047d21aff97b6a2de1702eef82c894c910cf4a..1261b6ac42685bf4743c638716be9c21c4897cb3 100644 --- a/oneflow/core/device/cuda_util.h +++ b/oneflow/core/device/cuda_util.h @@ -123,6 +123,16 @@ class CudaCurrentDeviceGuard final { } // namespace oneflow +#else + +namespace oneflow { + +enum class CudaWorkType {}; + +inline size_t GetCudaWorkTypeSize() { return 0; } + +} // namespace oneflow + #endif // WITH_CUDA #endif // ONEFLOW_CORE_DEVICE_CUDA_UTIL_H_ diff --git a/oneflow/core/device/memory_copier.cpp b/oneflow/core/device/memory_copier.cpp index a889ea08a3e14ed5376022d16aa56b39bc517deb..c21c70e9f744fbbe1f07806e600d732c4f92fe8d 100644 --- a/oneflow/core/device/memory_copier.cpp +++ b/oneflow/core/device/memory_copier.cpp @@ -251,13 +251,11 @@ void CudaAsyncMemoryCopier::CopyND(DeviceCtx* ctx, void* dst, const void* src, UNIMPLEMENTED(); } } +#endif REGISTER_DEFAULT_MEMORY_COPIER(DeviceType::kCPU, []() { return new HostMemoryCopier(); }); - #ifdef WITH_CUDA - REGISTER_DEFAULT_MEMORY_COPIER(DeviceType::kGPU, []() { return new CudaAsyncMemoryCopier(); }); - #endif MemoryCopier* NewDefaultMemoryCopier(DeviceType device_type) { @@ -266,8 +264,6 @@ MemoryCopier* NewDefaultMemoryCopier(DeviceType device_type) { ->Create(); } -#endif - #define SPECIALIZE_COPY_ELEM(dtype) \ template void MemoryCopier::CopyElem<dtype>(DeviceCtx * ctx, void* dst, const void* src, \ const MemoryCopyNdDesc& desc) const; diff --git a/oneflow/core/device/memory_copier.h b/oneflow/core/device/memory_copier.h index 8a62dcb9d57e5ab1e0599eb1394bcdd765143fea..7b12acfce66e5f2e8abffa664f365f89223cc939 100644 --- a/oneflow/core/device/memory_copier.h +++ b/oneflow/core/device/memory_copier.h @@ -35,8 +35,10 @@ struct MemoryCopyNdDesc { template<int32_t NDIMS> void CopyNDCpuImpl(DeviceCtx* ctx, void* dst, const void* src, const MemoryCopyNdDesc& desc); +#ifdef WITH_CUDA template<int32_t NDIMS> void CopyNDGpuImpl(DeviceCtx* ctx, void* dst, const void* src, const MemoryCopyNdDesc& desc); +#endif class MemoryCopier { public: diff --git a/oneflow/core/eager/blob_instruction_type.cpp b/oneflow/core/eager/blob_instruction_type.cpp index 4fcc2fd0e6d6a2fee3417b41063487ef800bcaf7..13d9bf7362d499bdeaec6aff6ae7ed142b6b7f29 100644 --- a/oneflow/core/eager/blob_instruction_type.cpp +++ b/oneflow/core/eager/blob_instruction_type.cpp @@ -35,6 +35,7 @@ FLAT_MSG_VIEW_END(PinBlobInstruction); } // namespace +#ifdef WITH_CUDA class CudaHostRegisterBlobInstructionType final : public vm::InstructionType { public: CudaHostRegisterBlobInstructionType() = default; @@ -84,6 +85,7 @@ class CudaHostUnregisterBlobInstructionType final : public vm::InstructionType { }; COMMAND( vm::RegisterInstructionType<CudaHostUnregisterBlobInstructionType>("CudaHostUnregisterBlob")); +#endif } // namespace eager } // namespace oneflow diff --git a/oneflow/core/eager/cuda_opkernel_instruction_type.cpp b/oneflow/core/eager/cuda_opkernel_instruction_type.cpp index 0fcebcfe1928eb8b4a0ddd827581a22c8fe91b44..47ced1b8a149ec1c546c57268b40c965e78502d3 100644 --- a/oneflow/core/eager/cuda_opkernel_instruction_type.cpp +++ b/oneflow/core/eager/cuda_opkernel_instruction_type.cpp @@ -13,6 +13,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef WITH_CUDA + #include "oneflow/core/common/util.h" #include "oneflow/core/job/job_desc.h" #include "oneflow/core/eager/opkernel_object.h" @@ -143,3 +145,5 @@ COMMAND(vm::RegisterInstructionType<GpuFeedBlobInstructionType>("gpu.FeedBlob")) } // namespace eager } // namespace oneflow + +#endif diff --git a/oneflow/core/graph/boxing/slice_boxing_sub_task_graph_builder.cpp b/oneflow/core/graph/boxing/slice_boxing_sub_task_graph_builder.cpp index 5e80fb118a229902091f761dc77b39081c6fa253..3923cd3f0db7efaa1266c2ae39c35a5e95d2707d 100644 --- a/oneflow/core/graph/boxing/slice_boxing_sub_task_graph_builder.cpp +++ b/oneflow/core/graph/boxing/slice_boxing_sub_task_graph_builder.cpp @@ -83,6 +83,7 @@ Maybe<void> SliceBoxingSubTskGphBuilder::Build( return Error::BoxingNotSupported(); } const auto GetBoxingGpuThrdId = [](const int64_t dev_id, CudaWorkType work_type) -> int64_t { +#ifdef WITH_CUDA if (work_type == CudaWorkType::kCopyH2D) { return Global<IDMgr>::Get()->GetGpuH2DThrdId(dev_id); } else if (work_type == CudaWorkType::kCopyD2H) { @@ -90,7 +91,11 @@ Maybe<void> SliceBoxingSubTskGphBuilder::Build( } else { return Global<IDMgr>::Get()->GetGpuMixThrdId(dev_id); } +#else + UNIMPLEMENTED(); +#endif }; + const auto NewEdge = [&ctx]() -> TaskEdge* { return ctx->task_graph()->NewEdge(); }; const auto CreateBoxingNode121 = [&ctx, &lbi, &GetBoxingGpuThrdId]( const ParallelDesc& pd, const int64_t parallel_id, @@ -102,7 +107,11 @@ Maybe<void> SliceBoxingSubTskGphBuilder::Build( if (pd.device_type() == DeviceType::kCPU) { thrd_id = Global<IDMgr>::Get()->PickCpuThrdIdEvenly(machine_id); } else if (pd.device_type() == DeviceType::kGPU) { +#ifdef WITH_CUDA thrd_id = GetBoxingGpuThrdId(pd.DeviceIdForParallelId(parallel_id), CudaWorkType::kCopyH2D); +#else + UNIMPLEMENTED(); +#endif } else { UNIMPLEMENTED(); } @@ -118,7 +127,11 @@ Maybe<void> SliceBoxingSubTskGphBuilder::Build( if (src_node->device_type() == DeviceType::kCPU) { thrd_id = Global<IDMgr>::Get()->PickCpuThrdIdEvenly(src_node->machine_id()); } else if (src_node->device_type() == DeviceType::kGPU) { +#ifdef WITH_CUDA thrd_id = GetBoxingGpuThrdId(src_node->GpuPhyId(), CudaWorkType::kCopyD2H); +#else + UNIMPLEMENTED(); +#endif } else { UNIMPLEMENTED(); } @@ -235,9 +248,13 @@ Maybe<void> SliceBoxingSubTskGphBuilder::Build( if (in_pd.device_type() == DeviceType::kCPU) { local_concat_thrd_id = Global<IDMgr>::Get()->PickCpuThrdIdEvenly(in_machine_id); } else if (in_pd.device_type() == DeviceType::kGPU) { +#ifdef WITH_CUDA local_concat_thrd_id = GetBoxingGpuThrdId( in_nodes.at(in_parallel_ids.at(out_id % in_parallel_ids.size()))->GpuPhyId(), CudaWorkType::kCopyD2H); +#else + UNIMPLEMENTED(); +#endif } local_concat_node->Init(lbi, concat_slice, kSliceBoxingTaskModeCopy, in_machine_id, local_concat_thrd_id, Global<IDMgr>::Get()->CpuMemZoneId()); @@ -293,9 +310,13 @@ Maybe<void> SliceBoxingSubTskGphBuilder::Build( if (in_pd.device_type() == DeviceType::kCPU) { local_add_thrd_id = Global<IDMgr>::Get()->PickCpuThrdIdEvenly(in_machine_id); } else if (in_pd.device_type() == DeviceType::kGPU) { +#ifdef WITH_CUDA local_add_thrd_id = GetBoxingGpuThrdId( in_nodes.at(in_parallel_ids.at(out_id % in_parallel_ids.size()))->GpuPhyId(), CudaWorkType::kCopyD2H); +#else + UNIMPLEMENTED(); +#endif } local_add_node->Init(lbi, out_slice, kSliceBoxingTaskModeAdd, in_machine_id, local_add_thrd_id, Global<IDMgr>::Get()->CpuMemZoneId()); @@ -337,8 +358,12 @@ Maybe<void> SliceBoxingSubTskGphBuilder::Build( if (in_pd.device_type() == DeviceType::kCPU) { local_add_thrd_id = Global<IDMgr>::Get()->PickCpuThrdIdEvenly(in_machine_id); } else if (in_pd.device_type() == DeviceType::kGPU) { +#ifdef WITH_CUDA local_add_thrd_id = GetBoxingGpuThrdId(in_nodes.at(in_ids_on_machine.front())->GpuPhyId(), CudaWorkType::kCopyH2D); +#else + UNIMPLEMENTED(); +#endif } local_add_node->Init(lbi, slice, kSliceBoxingTaskModeAdd, in_machine_id, local_add_thrd_id); FOR_RANGE(int64_t, i, 0, in_ids_on_machine.size()) { diff --git a/oneflow/core/graph/case_compute_task_node.h b/oneflow/core/graph/case_compute_task_node.h index 433936470fec651607b4916433c2a69a150ac751..a767483ef21021f45cc7a186ffb94cbae5dabe98 100644 --- a/oneflow/core/graph/case_compute_task_node.h +++ b/oneflow/core/graph/case_compute_task_node.h @@ -30,7 +30,13 @@ class CaseCompTaskNode final : public CompTaskNode { void ConsumeAllRegsts() override; TaskType GetTaskType() const override { return TaskType::kCase; } - CudaWorkType GetCudaWorkType() const override { return CudaWorkType::kCompute; } + CudaWorkType GetCudaWorkType() const override { +#ifdef WITH_CUDA + return CudaWorkType::kCompute; +#else + UNIMPLEMENTED(); +#endif + } private: void BuildExecGphAndRegst() override; diff --git a/oneflow/core/graph/compute_task_node.h b/oneflow/core/graph/compute_task_node.h index d6cac9d593b6a122a3a28c79bf9724b8987129c6..be1008b337d23c2803071d7ee5e32887751d954f 100644 --- a/oneflow/core/graph/compute_task_node.h +++ b/oneflow/core/graph/compute_task_node.h @@ -29,7 +29,13 @@ class CompTaskNode : public TaskNode { CompTaskNode() = default; virtual ~CompTaskNode() = default; - virtual CudaWorkType GetCudaWorkType() const { return CudaWorkType::kCompute; } + virtual CudaWorkType GetCudaWorkType() const { +#ifdef WITH_CUDA + return CudaWorkType::kCompute; +#else + UNIMPLEMENTED(); +#endif + } virtual void ToProto(TaskProto*) override; // parallel_ctx_ diff --git a/oneflow/core/graph/esac_compute_task_node.h b/oneflow/core/graph/esac_compute_task_node.h index 8500562c59879a1cb8aac7139224098e1b6cbb09..5b20565a70028c86c3b0863a42bf634085c54735 100644 --- a/oneflow/core/graph/esac_compute_task_node.h +++ b/oneflow/core/graph/esac_compute_task_node.h @@ -30,7 +30,13 @@ class EsacCompTaskNode final : public CompTaskNode { void ConsumeAllRegsts() override; TaskType GetTaskType() const override { return TaskType::kEsac; } - CudaWorkType GetCudaWorkType() const override { return CudaWorkType::kCompute; } + CudaWorkType GetCudaWorkType() const override { +#ifdef WITH_CUDA + return CudaWorkType::kCompute; +#else + UNIMPLEMENTED(); +#endif + } private: void BuildExecGphAndRegst() override; diff --git a/oneflow/core/graph/logical_node.cpp b/oneflow/core/graph/logical_node.cpp index 979874a55f651729b220234b5952d26e9c33bd03..d6d6b774044eae69c171d5313697cdc90974293c 100644 --- a/oneflow/core/graph/logical_node.cpp +++ b/oneflow/core/graph/logical_node.cpp @@ -134,6 +134,7 @@ void LogicalNode::GenSortedCompTaskNodes( const IDMgr* id_mgr = Global<IDMgr>::Get(); if (parallel_desc_->device_type() == DeviceType::kGPU) { +#ifdef WITH_CUDA switch (comp_task_node->GetCudaWorkType()) { case CudaWorkType::kCompute: { comp_task_node->set_thrd_id(id_mgr->GetGpuComputeThrdId(dev_phy_id)); @@ -161,6 +162,9 @@ void LogicalNode::GenSortedCompTaskNodes( } default: UNIMPLEMENTED(); } +#else + UNIMPLEMENTED(); +#endif } else if (parallel_desc_->device_type() == DeviceType::kCPU) { if (comp_task_node->IsIndependent()) { nodes->push_back({machine_id, comp_task_node}); diff --git a/oneflow/core/graph/optimizer_compute_task_node.h b/oneflow/core/graph/optimizer_compute_task_node.h index c10a4a5457f054926c17d6eb5a685c0f415c93a3..29eab2c2b7fe60904e85efb56cb09c72d4244f23 100644 --- a/oneflow/core/graph/optimizer_compute_task_node.h +++ b/oneflow/core/graph/optimizer_compute_task_node.h @@ -30,7 +30,13 @@ class OptimizerCompTaskNode final : public CompTaskNode { void ConsumeAllRegsts() override; TaskType GetTaskType() const override { return TaskType::kOptimizer; } - CudaWorkType GetCudaWorkType() const override { return CudaWorkType::kCompute; } + CudaWorkType GetCudaWorkType() const override { +#ifdef WITH_CUDA + return CudaWorkType::kCompute; +#else + UNIMPLEMENTED(); +#endif + } private: void BuildExecGphAndRegst() override; diff --git a/oneflow/core/graph/repeat_forward_compute_task_node.h b/oneflow/core/graph/repeat_forward_compute_task_node.h index f2e9f3775f85224d1db63d8633fd8e8825f8f58a..9a7826c069f5c5d624fefd54078e628135293157 100644 --- a/oneflow/core/graph/repeat_forward_compute_task_node.h +++ b/oneflow/core/graph/repeat_forward_compute_task_node.h @@ -30,7 +30,13 @@ class RepeatForwardCompTaskNode final : public CompTaskNode { void ConsumeAllRegsts() override; TaskType GetTaskType() const override { return TaskType::kRepeatForward; } - CudaWorkType GetCudaWorkType() const override { return CudaWorkType::kCompute; } + CudaWorkType GetCudaWorkType() const override { +#ifdef WITH_CUDA + return CudaWorkType::kCompute; +#else + UNIMPLEMENTED(); +#endif + } private: void BuildExecGphAndRegst() override; diff --git a/oneflow/core/job/collective_boxing_executor.cpp b/oneflow/core/job/collective_boxing_executor.cpp index 5bb35f183cdcc218a64b8a927e21a41fa7727143..b9e95f2a52e412aa13fac435497c4a5d133d335a 100644 --- a/oneflow/core/job/collective_boxing_executor.cpp +++ b/oneflow/core/job/collective_boxing_executor.cpp @@ -32,6 +32,7 @@ namespace collective { namespace { +#ifdef WITH_CUDA ncclRedOp_t GetNcclReduceOp(ReduceMethod reduce_method) { if (reduce_method == kReduceMethodSum) { return ncclRedOp_t::ncclSum; @@ -39,6 +40,7 @@ ncclRedOp_t GetNcclReduceOp(ReduceMethod reduce_method) { UNIMPLEMENTED(); } } +#endif void SortRequestsByOrder(std::vector<const RequestDesc*>* requests) { std::sort(requests->begin(), requests->end(), @@ -70,6 +72,8 @@ int64_t GetAlignedRequestSize(const RequestDesc* request) { } // namespace +#ifdef WITH_CUDA + void CollectiveBoxingExecutorBackend::GroupRequests( const std::vector<const RequestDesc*>& requests, std::vector<std::vector<const RequestDesc*>>* groups) { @@ -466,13 +470,17 @@ void NcclCollectiveBoxingExecutorBackend::Init(const CollectiveBoxingPlan& colle } } +#endif // WITH_CUDA + CollectiveBoxingExecutor::CollectiveBoxingExecutor(const Plan& plan) : collective_boxing_plan_(plan.collective_boxing_plan()) { +#ifdef WITH_CUDA auto it = backends_ .emplace(Backend::kBackendNCCL, std::make_unique<NcclCollectiveBoxingExecutorBackend>()) .first; it->second->Init(collective_boxing_plan_); +#endif Init(); DumpSummary(); } diff --git a/oneflow/core/job/env_global_objects_scope.cpp b/oneflow/core/job/env_global_objects_scope.cpp index c64f4d0e9f4955710e1e46a6322f2aaea68cf410..8ae3861e919d37a15db553037c6a51112c0fb248 100644 --- a/oneflow/core/job/env_global_objects_scope.cpp +++ b/oneflow/core/job/env_global_objects_scope.cpp @@ -13,7 +13,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef WITH_CUDA #include <cuda.h> +#endif // WITH_CUDA #include <thread> #include "oneflow/core/thread/thread_pool.h" #include "oneflow/core/job/env_global_objects_scope.h" @@ -87,12 +89,16 @@ Maybe<void> EnvGlobalObjectsScope::Init(const EnvProto& env_proto) { Global<ThreadPool>::New(Global<ResourceDesc, ForSession>::Get()->ComputeThreadPoolSize()); Global<vm::VirtualMachineScope>::New(Global<ResourceDesc, ForSession>::Get()->resource()); Global<EagerJobBuildAndInferCtxMgr>::New(); +#ifdef WITH_CUDA Global<EagerNcclCommMgr>::New(); +#endif return Maybe<void>::Ok(); } EnvGlobalObjectsScope::~EnvGlobalObjectsScope() { +#ifdef WITH_CUDA Global<EagerNcclCommMgr>::Delete(); +#endif Global<EagerJobBuildAndInferCtxMgr>::Delete(); Global<vm::VirtualMachineScope>::Delete(); Global<ThreadPool>::Delete(); diff --git a/oneflow/core/job/job_build_and_infer_ctx.cpp b/oneflow/core/job/job_build_and_infer_ctx.cpp index f46d35dd523dc974d657dfd85df5919712ba059c..04198c25e287b1adab1586a60cd14906a79b45c7 100644 --- a/oneflow/core/job/job_build_and_infer_ctx.cpp +++ b/oneflow/core/job/job_build_and_infer_ctx.cpp @@ -920,7 +920,9 @@ Maybe<void> LazyJobBuildAndInferCtx::Complete() { if (GlobalJobDesc().Bool("__is_user_function__")) { JUST(DoPass("CompleteOfrecordDecoder")); JUST(DoPass("SetDefaultVariableConf")); +#ifdef WITH_CUDA JUST(DoPass("AutoMixedPrecision")); +#endif JUST(DoPass("TieUpChainHeadersUnReachableFromAnyVariableOps")); JUST(DoPass("NonDistributedOptimizerPass")); JUST(DoPass("AutoTrainStep")); diff --git a/oneflow/core/job_rewriter/auto_mixed_precision.cpp b/oneflow/core/job_rewriter/auto_mixed_precision.cpp index a648b1956778901f6c13e1410b8485fb0658f57f..b89bd5babbcb6baaf6b0a58eaeebf0823d262a0a 100644 --- a/oneflow/core/job_rewriter/auto_mixed_precision.cpp +++ b/oneflow/core/job_rewriter/auto_mixed_precision.cpp @@ -13,6 +13,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + +#ifdef WITH_CUDA + #include "oneflow/core/job_rewriter/auto_mixed_precision_lists.h" #include <algorithm> @@ -389,3 +392,5 @@ REGISTER_NO_CAST_REGISTRY("normalization", "beta", 0) } // namespace } // namespace oneflow + +#endif // WITH_CUDA diff --git a/oneflow/core/kernel/assign_kernel.cpp b/oneflow/core/kernel/assign_kernel.cpp index 38b883a1130dd66b2bd8561a4385a15c74698e72..246ee8207192a65a5d62cac13a803203bb3ee38d 100644 --- a/oneflow/core/kernel/assign_kernel.cpp +++ b/oneflow/core/kernel/assign_kernel.cpp @@ -38,7 +38,9 @@ void AssignKernel<device_type>::ForwardDataContent( REGISTER_KERNEL_WITH_DEVICE(OperatorConf::kAssignConf, DeviceType::kCPU, AssignKernel<DeviceType::kCPU>); +#ifdef WITH_CUDA REGISTER_KERNEL_WITH_DEVICE(OperatorConf::kAssignConf, DeviceType::kGPU, AssignKernel<DeviceType::kGPU>); +#endif } // namespace oneflow diff --git a/oneflow/core/kernel/constant_like_kernel.cpp b/oneflow/core/kernel/constant_like_kernel.cpp index d12e646255c21006385fb4d0278f90fbad0e2877..082cb457b31e83728a58fb23c9dea88d1e971ecf 100644 --- a/oneflow/core/kernel/constant_like_kernel.cpp +++ b/oneflow/core/kernel/constant_like_kernel.cpp @@ -47,12 +47,17 @@ class ConstantLikeKernel final : public KernelIf<device_type> { } }; +#ifdef WITH_CUDA #define REGISTER_CONSTANT_LIKE_KERNEL(dtype) \ REGISTER_KERNEL_WITH_DEVICE_AND_DTYPE(OperatorConf::kConstantLikeConf, DeviceType::kCPU, dtype, \ ConstantLikeKernel<DeviceType::kCPU, dtype>) \ REGISTER_KERNEL_WITH_DEVICE_AND_DTYPE(OperatorConf::kConstantLikeConf, DeviceType::kGPU, dtype, \ ConstantLikeKernel<DeviceType::kGPU, dtype>) - +#else +#define REGISTER_CONSTANT_LIKE_KERNEL(dtype) \ + REGISTER_KERNEL_WITH_DEVICE_AND_DTYPE(OperatorConf::kConstantLikeConf, DeviceType::kCPU, dtype, \ + ConstantLikeKernel<DeviceType::kCPU, dtype>) +#endif REGISTER_CONSTANT_LIKE_KERNEL(float); REGISTER_CONSTANT_LIKE_KERNEL(double); REGISTER_CONSTANT_LIKE_KERNEL(int8_t); diff --git a/oneflow/core/kernel/kernel.cpp b/oneflow/core/kernel/kernel.cpp index a49640b90f770c46f0d08096a91e0620cdcbdb0a..249e8dcdaee56b816e93dabec28288b2f63c896b 100644 --- a/oneflow/core/kernel/kernel.cpp +++ b/oneflow/core/kernel/kernel.cpp @@ -134,44 +134,6 @@ void Kernel::ForwardShape(const KernelCtx& ctx, return shape_infer_helper_->InferShape(BnInOp2Blob); } -template<DeviceType device_type> -void KernelIf<device_type>::ForwardPackedHeader( - const KernelCtx& ctx, std::function<Blob*(const std::string&)> BnInOp2Blob) const { - CopyField(ctx.device_ctx, BnInOp2Blob, op_attribute().input_bns(), op_attribute().output_bns(), - &Blob::CopyHeaderFrom); -} - -template<DeviceType device_type> -void KernelIf<device_type>::CopyField(DeviceCtx* ctx, - std::function<Blob*(const std::string&)> BnInOp2Blob, - const Blob* from_blob, const PbRpf<std::string>& to_bns, - void (Blob::*Copy)(DeviceCtx*, const Blob*)) const { - for (const std::string& to_bn : to_bns) { (BnInOp2Blob(to_bn)->*Copy)(ctx, from_blob); } -} - -template<DeviceType device_type> -void KernelIf<device_type>::CopyField(DeviceCtx* ctx, - std::function<Blob*(const std::string&)> BnInOp2Blob, - const PbRpf<std::string>& from_bns, - const PbRpf<std::string>& to_bns, - void (Blob::*Copy)(DeviceCtx*, const Blob*)) const { - if (from_bns.size() == 1) { - const Blob* in_blob = BnInOp2Blob(from_bns[0]); - CopyField(ctx, BnInOp2Blob, in_blob, to_bns, Copy); - } else if (to_bns.size() == 1) { - Blob* in_blob = BnInOp2Blob(from_bns[0]); - Blob* out_blob = BnInOp2Blob(to_bns[0]); - (out_blob->*Copy)(ctx, in_blob); - } else { - CHECK_EQ(from_bns.size(), to_bns.size()); - FOR_RANGE(size_t, i, 0, from_bns.size()) { - Blob* in_blob = BnInOp2Blob(from_bns[i]); - Blob* out_blob = BnInOp2Blob(to_bns[i]); - (out_blob->*Copy)(ctx, in_blob); - } - } -} - std::unique_ptr<const Kernel> ConstructKernel(const JobDesc* job_desc, const KernelConf& conf, DeviceCtx* device_ctx) { auto op_type = conf.op_attribute().op_conf().op_type_case(); diff --git a/oneflow/core/kernel/kernel.h b/oneflow/core/kernel/kernel.h index 36af5ab288ba69191635a572f63adee02c1e1407..aaa3c49b7719570b6d48c921601933ce8c3f63b4 100644 --- a/oneflow/core/kernel/kernel.h +++ b/oneflow/core/kernel/kernel.h @@ -165,13 +165,34 @@ class KernelIf : public Kernel { KernelIf() = default; virtual void ForwardPackedHeader( - const KernelCtx& ctx, std::function<Blob*(const std::string&)> BnInOp2Blob) const override; + const KernelCtx& ctx, std::function<Blob*(const std::string&)> BnInOp2Blob) const override { + CopyField(ctx.device_ctx, BnInOp2Blob, op_attribute().input_bns(), op_attribute().output_bns(), + &Blob::CopyHeaderFrom); + } void CopyField(DeviceCtx* ctx, std::function<Blob*(const std::string&)> BnInOp2Blob, const Blob* from_blob, const PbRpf<std::string>& to_bns, - void (Blob::*Copy)(DeviceCtx*, const Blob*)) const; + void (Blob::*Copy)(DeviceCtx*, const Blob*)) const { + for (const std::string& to_bn : to_bns) { (BnInOp2Blob(to_bn)->*Copy)(ctx, from_blob); } + } void CopyField(DeviceCtx* ctx, std::function<Blob*(const std::string&)> BnInOp2Blob, const PbRpf<std::string>& from_bns, const PbRpf<std::string>& to_bns, - void (Blob::*Copy)(DeviceCtx*, const Blob*)) const; + void (Blob::*Copy)(DeviceCtx*, const Blob*)) const { + if (from_bns.size() == 1) { + const Blob* in_blob = BnInOp2Blob(from_bns[0]); + CopyField(ctx, BnInOp2Blob, in_blob, to_bns, Copy); + } else if (to_bns.size() == 1) { + Blob* in_blob = BnInOp2Blob(from_bns[0]); + Blob* out_blob = BnInOp2Blob(to_bns[0]); + (out_blob->*Copy)(ctx, in_blob); + } else { + CHECK_EQ(from_bns.size(), to_bns.size()); + FOR_RANGE(size_t, i, 0, from_bns.size()) { + Blob* in_blob = BnInOp2Blob(from_bns[i]); + Blob* out_blob = BnInOp2Blob(to_bns[i]); + (out_blob->*Copy)(ctx, in_blob); + } + } + } bool EnableCudnn() const { return op_conf().enable_cudnn(); } }; diff --git a/oneflow/core/kernel/kernel_util.cpp b/oneflow/core/kernel/kernel_util.cpp index c62d83420e5a7b5c4ed8f9f41e9e4ce3f8562505..cb15f471de6fcc56d7edcc797cbcd6beada82cfc 100644 --- a/oneflow/core/kernel/kernel_util.cpp +++ b/oneflow/core/kernel/kernel_util.cpp @@ -263,6 +263,7 @@ void AutoMemcpy(DeviceCtx* ctx, void* dst, const void* src, size_t sz, func = &Memcpy<DeviceType::kCPU>; kind = cudaMemcpyKind::cudaMemcpyHostToHost; } else { +#ifdef WITH_CUDA func = &Memcpy<DeviceType::kGPU>; if (src_mem_case.has_host_mem() && dst_mem_case.has_device_cuda_mem()) { kind = cudaMemcpyKind::cudaMemcpyHostToDevice; @@ -273,6 +274,9 @@ void AutoMemcpy(DeviceCtx* ctx, void* dst, const void* src, size_t sz, } else { UNIMPLEMENTED(); } +#else + UNIMPLEMENTED(); +#endif // WITH_CUDA } func(ctx, dst, src, sz, kind); } @@ -281,7 +285,11 @@ void SyncAutoMemcpy(DeviceCtx* ctx, void* dst, const void* src, size_t sz, const MemoryCase& dst_mem_case, const MemoryCase& src_mem_case) { AutoMemcpy(ctx, dst, src, sz, dst_mem_case, src_mem_case); if (src_mem_case.has_device_cuda_mem() || dst_mem_case.has_device_cuda_mem()) { +#ifdef WITH_CUDA CudaCheck(cudaStreamSynchronize(ctx->cuda_stream())); +#else + UNIMPLEMENTED(); +#endif // WITH_CUDA } } diff --git a/oneflow/core/kernel/kernel_util.cuh b/oneflow/core/kernel/kernel_util.cuh index 10b4b998d7ae65f8fd2ee70e60d204cf09ad6300..01192dc75c6de0ded86c015373cba3469427dd9b 100644 --- a/oneflow/core/kernel/kernel_util.cuh +++ b/oneflow/core/kernel/kernel_util.cuh @@ -18,6 +18,8 @@ limitations under the License. namespace oneflow { +#ifdef WITH_CUDA + template<typename T> __device__ T gpu_atomic_add(T* address, const T val); @@ -35,6 +37,21 @@ __host__ __device__ T SafeLog(T x) { return logf(MaxWithLogThreshold(x)); } +#else + +template<typename T> +T MaxWithLogThreshold(T x) { + const T threshold = 1e-20; + return x > threshold ? x : threshold; +} + +template<typename T> +T SafeLog(T x) { + return logf(MaxWithLogThreshold(x)); +} + +#endif // WITH_CUDA + } // namespace oneflow #endif // ONEFLOW_CORE_KERNEL_KERNEL_UTIL_CUH_ diff --git a/oneflow/core/kernel/kernel_util.h b/oneflow/core/kernel/kernel_util.h index c5a7d03d7c29750e3dc717d6cea92edbe2062463..4ead67550f43c717f40966abb3fe4272a1a0d519 100644 --- a/oneflow/core/kernel/kernel_util.h +++ b/oneflow/core/kernel/kernel_util.h @@ -504,8 +504,10 @@ typename std::enable_if<!std::is_same<T, U>::value>::type CopyElem(const T* in_d FOR_RANGE(int64_t, i, 0, elem_num) { *(out_dptr++) = static_cast<U>(*(in_dptr++)); } } +#ifdef WITH_CUDA template<typename T, typename U> void CopyElemOnGpu(DeviceCtx* ctx, const T* in_dptr, U* out_dptr, int64_t elem_num); +#endif } // namespace oneflow diff --git a/oneflow/core/kernel/new_kernel_util.cpp b/oneflow/core/kernel/new_kernel_util.cpp index cc481b988d96305aa16476c4679cfc1b801ca319..fcc8a36d85fd1129ba28a22e3f0e546ac7cc5563 100644 --- a/oneflow/core/kernel/new_kernel_util.cpp +++ b/oneflow/core/kernel/new_kernel_util.cpp @@ -20,11 +20,8 @@ limitations under the License. namespace oneflow { template<> -void Memcpy<DeviceType::kCPU>(DeviceCtx* ctx, void* dst, const void* src, size_t sz -#ifdef WITH_CUDA - , +void Memcpy<DeviceType::kCPU>(DeviceCtx* ctx, void* dst, const void* src, size_t sz, cudaMemcpyKind kind -#endif ) { if (dst == src) { return; } @@ -38,6 +35,7 @@ void Memset<DeviceType::kCPU>(DeviceCtx* ctx, void* dst, const char value, size_ void WithHostBlobAndStreamSynchronizeEnv(DeviceCtx* ctx, Blob* blob, std::function<void(Blob*)> Callback) { +#ifdef WITH_CUDA char* host_raw_dptr = nullptr; CudaCheck(cudaMallocHost(&host_raw_dptr, blob->AlignedTotalByteSize())); Blob host_blob(MemoryCase(), &blob->blob_desc(), host_raw_dptr); @@ -46,6 +44,9 @@ void WithHostBlobAndStreamSynchronizeEnv(DeviceCtx* ctx, Blob* blob, cudaMemcpyHostToDevice); CudaCheck(cudaStreamSynchronize(ctx->cuda_stream())); CudaCheck(cudaFreeHost(host_raw_dptr)); +#else + UNIMPLEMENTED(); +#endif } } // namespace oneflow diff --git a/oneflow/core/kernel/new_kernel_util.h b/oneflow/core/kernel/new_kernel_util.h index 8160f2b1e05b3711940cd7dfd8bb32a1b1336cea..b753c85c73cf2ab7ac2b3d66064fb581e8ab8257 100644 --- a/oneflow/core/kernel/new_kernel_util.h +++ b/oneflow/core/kernel/new_kernel_util.h @@ -20,6 +20,14 @@ limitations under the License. namespace oneflow { +#ifndef WITH_CUDA +enum cudaMemcpyKind { + cudaMemcpyHostToHost = 0, + cudaMemcpyHostToDevice = 1, + cudaMemcpyDefault = 4, +}; +#endif + template<DeviceType deivce_type> struct NewKernelUtil : public DnnIf<deivce_type>, public BlasIf<deivce_type>, @@ -33,10 +41,12 @@ struct GetCudaMemcpyKind<DeviceType::kCPU> { static const cudaMemcpyKind val = cudaMemcpyKind::cudaMemcpyHostToHost; }; +#ifdef WITH_CUDA template<> struct GetCudaMemcpyKind<DeviceType::kGPU> { static const cudaMemcpyKind val = cudaMemcpyKind::cudaMemcpyDeviceToDevice; }; +#endif // WITH_CUDA template<DeviceType device_type> void Memcpy(DeviceCtx*, void* dst, const void* src, size_t sz, diff --git a/oneflow/core/kernel/pack_kernel_util.cpp b/oneflow/core/kernel/pack_kernel_util.cpp index fc460fa9bda24592bb8e0c0ea2682b87b7ef6f72..06c4fa0945e3e0100b6edbd6c180fcc843e04f3d 100644 --- a/oneflow/core/kernel/pack_kernel_util.cpp +++ b/oneflow/core/kernel/pack_kernel_util.cpp @@ -42,6 +42,8 @@ void PackKernelUtil<device_type>::Unpack(DeviceCtx* ctx, size_t out_index, size_ } template class PackKernelUtil<DeviceType::kCPU>; +#ifdef WITH_CUDA template class PackKernelUtil<DeviceType::kGPU>; +#endif } // namespace oneflow diff --git a/oneflow/core/kernel/random_generator.h b/oneflow/core/kernel/random_generator.h index 5ddb91b23b8771038d9f8118ff17e38d33b05e5c..f76aa66a28ed61e3017c19ddc48f21297e995659 100644 --- a/oneflow/core/kernel/random_generator.h +++ b/oneflow/core/kernel/random_generator.h @@ -54,7 +54,9 @@ class RandomGenerator<DeviceType::kGPU> final { void Uniform(const int64_t elem_cnt, T* dptr); private: +#ifdef WITH_CUDA curandGenerator_t curand_generator_; +#endif }; } // namespace oneflow diff --git a/oneflow/core/kernel/slice_boxing_kernel.cpp b/oneflow/core/kernel/slice_boxing_kernel.cpp index bd05fc8a9ffe3e2f2ab34d52475a803f685f7f5c..4fbfe4b20365963fe79c948ec22edb316832e4ec 100644 --- a/oneflow/core/kernel/slice_boxing_kernel.cpp +++ b/oneflow/core/kernel/slice_boxing_kernel.cpp @@ -121,12 +121,16 @@ void SliceBoxingAddKernel<device_type, T>::ForwardDataContent( } else { bool can_direct_access = (device_type == kCPU) +#ifdef WITH_CUDA || (device_type == DeviceType::kGPU && in_i->mem_case().has_host_mem() && in_i->mem_case().host_mem().has_cuda_pinned_mem()) || (device_type == DeviceType::kGPU && in_i->mem_case().has_device_cuda_mem() && out->mem_case().has_device_cuda_mem() && out->mem_case().device_cuda_mem().device_id() == in_i->mem_case().device_cuda_mem().device_id()); +#else + ; +#endif if (in_i->shape() == out->shape() && can_direct_access) { SliceBoxingKernelUtil<device_type, T>::Add(ctx.device_ctx, out->shape().elem_cnt(), in_i->dptr<T>(), out->dptr<T>(), diff --git a/oneflow/core/kernel/sync_dynamic_resize_kernel.cpp b/oneflow/core/kernel/sync_dynamic_resize_kernel.cpp index 6b2e48c8f90dcce290d42bd950589bb012294ce5..ecb70bfd9296c0c4c204444833a2255da5d8e870 100644 --- a/oneflow/core/kernel/sync_dynamic_resize_kernel.cpp +++ b/oneflow/core/kernel/sync_dynamic_resize_kernel.cpp @@ -25,6 +25,8 @@ limitations under the License. namespace oneflow { +#ifdef WITH_CUDA + namespace { class CudaHostMem { @@ -107,6 +109,8 @@ REGISTER_SYNC_DYNAMIC_RESIZE_GPU_KERNEL(int8_t); REGISTER_SYNC_DYNAMIC_RESIZE_GPU_KERNEL(int32_t); REGISTER_SYNC_DYNAMIC_RESIZE_GPU_KERNEL(int64_t); +#endif // WITH_CUDA + template<typename SizeType> class SyncDynamicResizeCPUKernel final : public KernelIf<DeviceType::kCPU> { public: diff --git a/oneflow/core/kernel/util/cuda_arithemetic_interface.h b/oneflow/core/kernel/util/cuda_arithemetic_interface.h index 5086475c1260e0f85636784cbc1c86b0bc5b7a74..6b2806128f7073f57f5ec4c086e7084dda92045c 100644 --- a/oneflow/core/kernel/util/cuda_arithemetic_interface.h +++ b/oneflow/core/kernel/util/cuda_arithemetic_interface.h @@ -13,6 +13,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef WITH_CUDA + #ifndef ONEFLOW_CORE_KERNEL_UTIL_CUDA_ARITHEMETIC_INTERFACE_H_ #define ONEFLOW_CORE_KERNEL_UTIL_CUDA_ARITHEMETIC_INTERFACE_H_ @@ -133,3 +135,5 @@ struct ArithemeticIf<DeviceType::kGPU> { } // namespace oneflow #endif // ONEFLOW_CORE_KERNEL_UTIL_CUDA_ARITHEMETIC_INTERFACE_H_ + +#endif diff --git a/oneflow/core/memory/memory_allocator.cpp b/oneflow/core/memory/memory_allocator.cpp index c918e28c3c1f2fa82408672aa596129ab7d06573..77ac3488e252a67904eedcc8470ff2480a5974d0 100644 --- a/oneflow/core/memory/memory_allocator.cpp +++ b/oneflow/core/memory/memory_allocator.cpp @@ -28,18 +28,26 @@ void* MemoryAllocatorImpl::Allocate(MemoryCase mem_case, size_t size) { void* ptr = nullptr; if (mem_case.has_host_mem()) { if (mem_case.host_mem().has_cuda_pinned_mem()) { +#ifdef WITH_CUDA if (Global<ResourceDesc, ForSession>::Get()->enable_numa_aware_cuda_malloc_host()) { NumaAwareCudaMallocHost(mem_case.host_mem().cuda_pinned_mem().device_id(), &ptr, size); } else { CudaCheck(cudaMallocHost(&ptr, size)); } +#else + UNIMPLEMENTED(); +#endif } else { ptr = malloc(size); CHECK_NOTNULL(ptr); } } else if (mem_case.has_device_cuda_mem()) { +#ifdef WITH_CUDA CudaCurrentDeviceGuard guard(mem_case.device_cuda_mem().device_id()); CudaCheck(cudaMalloc(&ptr, size)); +#else + UNIMPLEMENTED(); +#endif } else { UNIMPLEMENTED(); } @@ -49,13 +57,21 @@ void* MemoryAllocatorImpl::Allocate(MemoryCase mem_case, size_t size) { void MemoryAllocatorImpl::Deallocate(void* ptr, MemoryCase mem_case) { if (mem_case.has_host_mem()) { if (mem_case.host_mem().has_cuda_pinned_mem()) { +#ifdef WITH_CUDA CudaCheck(cudaFreeHost(ptr)); +#else + UNIMPLEMENTED(); +#endif } else { free(ptr); } } else if (mem_case.has_device_cuda_mem()) { +#ifdef WITH_CUDA CudaCurrentDeviceGuard guard(mem_case.device_cuda_mem().device_id()); CudaCheck(cudaFree(ptr)); +#else + UNIMPLEMENTED(); +#endif } else { UNIMPLEMENTED(); } @@ -79,8 +95,12 @@ char* MemoryAllocator::Allocate(MemoryCase mem_case, std::size_t size) { if (mem_case.has_host_mem()) { memset(dptr, memset_val, size); } else if (mem_case.has_device_cuda_mem()) { +#ifdef WITH_CUDA CudaCurrentDeviceGuard guard(mem_case.device_cuda_mem().device_id()); CudaCheck(cudaMemset(dptr, memset_val, size)); +#else + UNIMPLEMENTED(); +#endif } else { UNIMPLEMENTED(); } diff --git a/oneflow/core/vm/cuda_allocator.cpp b/oneflow/core/vm/cuda_allocator.cpp index 5f6b90f48159e9464515418bbc87a88691b171f7..cf09b4922f2173b35d53259a88c063da0aba179f 100644 --- a/oneflow/core/vm/cuda_allocator.cpp +++ b/oneflow/core/vm/cuda_allocator.cpp @@ -13,6 +13,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + +#ifdef WITH_CUDA + #include "oneflow/core/vm/cuda_allocator.h" #include "oneflow/core/device/cuda_util.h" #include <iostream> @@ -309,3 +312,5 @@ void CudaAllocator::Deallocate(char* mem_ptr, std::size_t size) { } // namespace vm } // namespace oneflow + +#endif diff --git a/oneflow/core/vm/cuda_copy_d2h_stream_type.cpp b/oneflow/core/vm/cuda_copy_d2h_stream_type.cpp index 4adbcf071ac35904d07f7929a4553c032ffb6a4e..59544ca4adc68323bb56ff98ffe34a47394a4157 100644 --- a/oneflow/core/vm/cuda_copy_d2h_stream_type.cpp +++ b/oneflow/core/vm/cuda_copy_d2h_stream_type.cpp @@ -13,6 +13,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef WITH_CUDA + #include "oneflow/core/vm/cuda_copy_d2h_stream_type.h" #include "oneflow/core/vm/cuda_copy_d2h_device_context.h" @@ -69,3 +71,5 @@ ObjectMsgPtr<StreamDesc> CudaCopyD2HStreamType::MakeStreamDesc(const Resource& r } // namespace vm } // namespace oneflow + +#endif diff --git a/oneflow/core/vm/cuda_copy_h2d_stream_type.cpp b/oneflow/core/vm/cuda_copy_h2d_stream_type.cpp index 220b9f29c1871866997ae852978164c837dabfcd..509a53c4302b231b7b146755f55a9f817826373a 100644 --- a/oneflow/core/vm/cuda_copy_h2d_stream_type.cpp +++ b/oneflow/core/vm/cuda_copy_h2d_stream_type.cpp @@ -13,6 +13,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef WITH_CUDA + #include "oneflow/core/vm/cuda_copy_h2d_stream_type.h" namespace oneflow { @@ -69,3 +71,5 @@ ObjectMsgPtr<StreamDesc> CudaCopyH2DStreamType::MakeStreamDesc(const Resource& r } // namespace vm } // namespace oneflow + +#endif diff --git a/oneflow/core/vm/cuda_host_allocator.cpp b/oneflow/core/vm/cuda_host_allocator.cpp index 9abcb18c1eb15290586300593c8c233992428b5e..e6888afb4bcaec23468b244ced4e4ad6bd67fa6f 100644 --- a/oneflow/core/vm/cuda_host_allocator.cpp +++ b/oneflow/core/vm/cuda_host_allocator.cpp @@ -13,6 +13,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef WITH_CUDA + #include "oneflow/core/vm/cuda_host_allocator.h" #include "oneflow/core/device/cuda_util.h" @@ -29,3 +31,5 @@ void CudaHostAllocator::Deallocate(char* mem_ptr, std::size_t size) { } // namespace vm } // namespace oneflow + +#endif diff --git a/oneflow/core/vm/cuda_instruction_status_querier.cpp b/oneflow/core/vm/cuda_instruction_status_querier.cpp index f23869e139b95ff741a035f65465b28f50a6ecb4..6560244950b89a8b2510337a57bda3b5228c02e1 100644 --- a/oneflow/core/vm/cuda_instruction_status_querier.cpp +++ b/oneflow/core/vm/cuda_instruction_status_querier.cpp @@ -13,6 +13,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef WITH_CUDA + #include "oneflow/core/vm/cuda_instruction_status_querier.h" #include "oneflow/core/device/device_context.h" @@ -33,3 +35,5 @@ void CudaInstrStatusQuerier::SetLaunched(DeviceCtx* device_ctx) { } // namespace vm } // namespace oneflow + +#endif diff --git a/oneflow/core/vm/cuda_instruction_status_querier.h b/oneflow/core/vm/cuda_instruction_status_querier.h index 52dc61b726f1131f95ef855a3bf69f99093ebbb2..80c91fa3a6a414fe2c4f9698f1c3dc8c2cf7864d 100644 --- a/oneflow/core/vm/cuda_instruction_status_querier.h +++ b/oneflow/core/vm/cuda_instruction_status_querier.h @@ -24,6 +24,7 @@ class DeviceCtx; namespace vm { +#ifdef WITH_CUDA class CudaInstrStatusQuerier { public: ~CudaInstrStatusQuerier() = default; @@ -50,6 +51,8 @@ class CudaInstrStatusQuerier { cudaEvent_t event_; }; +#endif + } // namespace vm } // namespace oneflow diff --git a/oneflow/core/vm/cuda_stream_type.cpp b/oneflow/core/vm/cuda_stream_type.cpp index aab63aa072754aaddce464e5cab16191b60bb5a6..1b6e9f98b1f656225c5773b0a103cba44d711f46 100644 --- a/oneflow/core/vm/cuda_stream_type.cpp +++ b/oneflow/core/vm/cuda_stream_type.cpp @@ -13,6 +13,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef WITH_CUDA + #include "oneflow/core/vm/cuda_stream_type.h" #include "oneflow/core/vm/instruction_type.h" #include "oneflow/core/vm/stream.msg.h" @@ -75,3 +77,5 @@ ObjectMsgPtr<StreamDesc> CudaStreamType::MakeStreamDesc(const Resource& resource } // namespace vm } // namespace oneflow + +#endif diff --git a/oneflow/core/vm/cuda_stream_type.h b/oneflow/core/vm/cuda_stream_type.h index 868c2e91f850a5c45cfd363f015a0dacf3ae5903..800a8695333d3fcb05209f95cdd9b584198a1df7 100644 --- a/oneflow/core/vm/cuda_stream_type.h +++ b/oneflow/core/vm/cuda_stream_type.h @@ -13,6 +13,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef WITH_CUDA + #ifndef ONEFLOW_CORE_VM_CUDA_STREAM_TYPE_H_ #define ONEFLOW_CORE_VM_CUDA_STREAM_TYPE_H_ @@ -49,3 +51,4 @@ class CudaStreamType final : public StreamType { } // namespace oneflow #endif // ONEFLOW_CORE_VM_CUDA_STREAM_TYPE_H_ +#endif // WITH_CUDA diff --git a/oneflow/core/vm/device_helper_stream_type.cpp b/oneflow/core/vm/device_helper_stream_type.cpp index 9de73a46507ce694d7a68ddd101c5867163f11ba..bcd534034bf441e9b5a11ddb7ca0567c83720ff9 100644 --- a/oneflow/core/vm/device_helper_stream_type.cpp +++ b/oneflow/core/vm/device_helper_stream_type.cpp @@ -27,6 +27,8 @@ limitations under the License. namespace oneflow { namespace vm { +#ifdef WITH_CUDA + namespace { class CudaMallocInstructionType final : public InstructionType { @@ -105,6 +107,8 @@ COMMAND(RegisterInstructionType<CudaFreeInstructionType>("CudaFree")); } // namespace +#endif + void DeviceHelperStreamType::InitInstructionStatus(const Stream& stream, InstructionStatusBuffer* status_buffer) const { static_assert(sizeof(NaiveInstrStatusQuerier) < kInstructionStatusBufferBytes, ""); diff --git a/oneflow/core/vm/host_stream_type.cpp b/oneflow/core/vm/host_stream_type.cpp index 076f1b52a4fcf2b03451ef3c45c2e7c15226494b..f6dfe1dc589629fb4c4b6ab20ae9205fa4a06694 100644 --- a/oneflow/core/vm/host_stream_type.cpp +++ b/oneflow/core/vm/host_stream_type.cpp @@ -30,6 +30,8 @@ namespace vm { namespace { +#ifdef WITH_CUDA + class CudaMallocHostInstructionType final : public InstructionType { public: CudaMallocHostInstructionType() = default; @@ -70,6 +72,8 @@ class CudaMallocHostInstructionType final : public InstructionType { }; COMMAND(RegisterInstructionType<CudaMallocHostInstructionType>("CudaMallocHost")); +#endif // WITH_CUDA + class MallocInstructionType final : public InstructionType { public: MallocInstructionType() = default; @@ -108,6 +112,8 @@ class MallocInstructionType final : public InstructionType { }; COMMAND(RegisterInstructionType<MallocInstructionType>("Malloc")); +#ifdef WITH_CUDA + class CudaFreeHostInstructionType final : public InstructionType { public: CudaFreeHostInstructionType() = default; @@ -142,6 +148,8 @@ class CudaFreeHostInstructionType final : public InstructionType { }; COMMAND(RegisterInstructionType<CudaFreeHostInstructionType>("CudaFreeHost")); +#endif // WITH_CUDA + class FreeInstructionType final : public InstructionType { public: FreeInstructionType() = default; diff --git a/oneflow/python/framework/config_util.py b/oneflow/python/framework/config_util.py index 4eeee5e827e84852ffead2bd4446998493d3826a..0589a03becddaabe72049754a90a8976cda8a0ab 100644 --- a/oneflow/python/framework/config_util.py +++ b/oneflow/python/framework/config_util.py @@ -19,6 +19,7 @@ import oneflow.python.framework.hob as hob import oneflow.python.framework.session_context as session_ctx import oneflow.python.lib.core.enable_if as enable_if from oneflow.python.oneflow_export import oneflow_export +import traceback @oneflow_export("config.load_library") @@ -60,10 +61,19 @@ def api_gpu_device_num(val: int) -> None: r"""Set number of GPUs on each machine to run oneflow on. Args: - val (int): number of GPUs. It is identical on every machine. In other words, + val (int): number of GPUs. It is identical on every machine. In other words, you can't specify different number of GPUs you would like to use on each machine. """ - return enable_if.unique([gpu_device_num, do_nothing])(val) + from oneflow.python.compatibility import with_cuda + + if with_cuda == False: + print( + "INFO: for CPU-only OneFlow, oneflow.config.gpu_device_num is equivalent to oneflow.config.cpu_device_num" + ) + print(traceback.format_stack()[-2]) + return enable_if.unique([cpu_device_num, do_nothing])(val) + else: + return enable_if.unique([gpu_device_num, do_nothing])(val) @enable_if.condition(hob.in_normal_mode & ~hob.session_initialized) @@ -111,7 +121,7 @@ def comm_net_worker_num(val): @oneflow_export("config.max_mdsave_worker_num") def api_max_mdsave_worker_num(val: int) -> None: r"""Set up max number of workers for mdsave process. - + Args: val (int): max number of workers """ @@ -144,7 +154,7 @@ def enable_numa_aware_cuda_malloc_host(val): @oneflow_export("config.compute_thread_pool_size") def api_compute_thread_pool_size(val: int) -> None: - r"""Set up the size of compute thread pool + r"""Set up the size of compute thread pool Args: val (int): size of thread pool @@ -247,7 +257,7 @@ def use_rdma(val=True): @oneflow_export("config.thread_enable_local_message_queue") def api_thread_enable_local_message_queue(val: bool) -> None: - """Whether or not enable thread using local message queue. + """Whether or not enable thread using local message queue. Args: val (bool): True or False diff --git a/oneflow/python/framework/placement_util.py b/oneflow/python/framework/placement_util.py index 23f6538536e75a355ba4545ed9085010a6d08f0e..22b950dd04d3f1f071a6d976a3d43e7aab5d08b5 100644 --- a/oneflow/python/framework/placement_util.py +++ b/oneflow/python/framework/placement_util.py @@ -31,6 +31,7 @@ def api_current_placement_scope() -> placement_ctx.PlacementScope: "WARNING: oneflow.placement.current_scope has been deprecated. " "Please use oneflow.current_scope.device_parallel_desc_symbol instead." ) + print(traceback.format_stack()[-2]) api = enable_if.unique( [global_mode_cur_placement_scope, normal_mode_cur_placement_scope] ) diff --git a/oneflow/python/framework/session_util.py b/oneflow/python/framework/session_util.py index c4218d1649801e14a35560bca7d14eddac383433..24922ae8377a660d902f62375d7bda26a5606cfa 100644 --- a/oneflow/python/framework/session_util.py +++ b/oneflow/python/framework/session_util.py @@ -447,9 +447,15 @@ def _TryCompleteConfigProto(config_proto): def _GetDefaultConfigProto(): + from oneflow.python.compatibility import with_cuda + config_proto = job_set_util.ConfigProto() config_proto.resource.machine_num = 0 - config_proto.resource.gpu_device_num = 1 + if with_cuda: + config_proto.resource.gpu_device_num = 1 + else: + config_proto.resource.cpu_device_num = 1 + config_proto.resource.gpu_device_num = 0 config_proto.io_conf.data_fs_conf.localfs_conf.SetInParent() config_proto.io_conf.snapshot_fs_conf.localfs_conf.SetInParent() return config_proto diff --git a/oneflow/python/test/models/cnns_tests.py b/oneflow/python/test/models/cnns_tests.py index c54465f95596f9f05c42fdf3fa901e3d9746709b..59ddf70dd72738566cbcc87965b0623bb30d9591 100644 --- a/oneflow/python/test/models/cnns_tests.py +++ b/oneflow/python/test/models/cnns_tests.py @@ -41,6 +41,8 @@ class TestNetMixin: self.tf_loss_dir = "" self.of_loss_dir = "" self.num_iter = 10 + if os.getenv("ONEFLOW_TEST_CPU_ONLY"): + self.num_iter = 3 self.set_params() oneflow.clear_default_session() @@ -55,6 +57,8 @@ class TestNetMixin: spec = net_modudle.DLNetSpec(FLAGS.enable_auto_mixed_precision) spec.num_nodes = num_node spec.gpu_num_per_node = num_gpu_per_node + if os.getenv("ONEFLOW_TEST_CPU_ONLY"): + spec.iter_num = 3 net_modudle.main(spec) return if num_node > 1: @@ -81,6 +85,10 @@ class TestNetMixin: return of_loss[0 : self.num_iter] def print_and_check_result(self, result_name): + if os.getenv("ONEFLOW_TEST_CPU_ONLY"): + if self.net == "resnet50": + print("WARNING: skipping check for resnet50 cpu due to GEMM NaN") + return loss_dict = {} loss_dict["tensorflow"] = self.load_tf_loss() loss_dict["oneflow"] = self.load_of_loss(result_name) diff --git a/oneflow/python/test/models/test_bert.py b/oneflow/python/test/models/test_bert.py index b7d0fa650252e1e31ec01bc322993e5b9b67f332..496461178a6cbabd301c5a7c1683bf589f52a2ee 100644 --- a/oneflow/python/test/models/test_bert.py +++ b/oneflow/python/test/models/test_bert.py @@ -20,6 +20,8 @@ import numpy as np import oneflow as flow from absl import flags from pretrain import PreTrain +import unittest +import os FLAGS = flags.FLAGS flags.DEFINE_string("data_dir", "/dataset/bert/bert_seq_len_128_repeat1024", "") @@ -199,6 +201,7 @@ func_config.default_logical_view(flow.scope.consistent_view()) func_config.enable_auto_mixed_precision(FLAGS.enable_auto_mixed_precision) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_1n1c(test_case): flow.config.enable_debug_mode(True) flow.config.gpu_device_num(1) @@ -211,6 +214,7 @@ def test_1n1c(test_case): print(of_loss) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_1n4c(test_case): flow.config.gpu_device_num(4) pretrain_job = flow.global_function(type="train", function_config=func_config)( @@ -222,6 +226,7 @@ def test_1n4c(test_case): print(of_loss) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") @flow.unittest.num_nodes_required(2) def test_2n8c(test_case): flow.config.gpu_device_num(4) @@ -234,6 +239,7 @@ def test_2n8c(test_case): print(of_loss) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_inplace(test_case): test_case.assertTrue( np.allclose(GetSeveralLossesAsNumpy(True), GetSeveralLossesAsNumpy(False)) diff --git a/oneflow/python/test/models/test_dcgan.py b/oneflow/python/test/models/test_dcgan.py index 68e4504a5c627a93af9158a78a1a8478c45c3dea..af6565312695d7bfe1afcd7796966041061463ca 100644 --- a/oneflow/python/test/models/test_dcgan.py +++ b/oneflow/python/test/models/test_dcgan.py @@ -17,13 +17,16 @@ import oneflow as flow import oneflow.typing as oft import numpy as np import os +import unittest +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_1n1c(test_case): dcgan = DCGAN() dcgan.compare_with_tf(1) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_1n4c(test_case): dcgan = DCGAN() dcgan.compare_with_tf(4) diff --git a/oneflow/python/test/ops/test_2d_gpu_variable.py b/oneflow/python/test/ops/test_2d_gpu_variable.py index cb07aeafe2771c96a8ab07b64814c5cdf9584e77..6cb35d83383cc6580751979b9a431103e645a534 100644 --- a/oneflow/python/test/ops/test_2d_gpu_variable.py +++ b/oneflow/python/test/ops/test_2d_gpu_variable.py @@ -14,8 +14,11 @@ See the License for the specific language governing permissions and limitations under the License. """ import oneflow as flow +import os +import unittest +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_2d_gpu_variable(test_case): flow.enable_eager_execution() flow.config.gpu_device_num(2) diff --git a/oneflow/python/test/ops/test_TestDataTypeAttr.py b/oneflow/python/test/ops/test_TestDataTypeAttr.py index 2dafe3379ba49fdc81bc0b04835625b902b50859..0ec43b5b2050fb3e153ae762561fda51c3d41607 100644 --- a/oneflow/python/test/ops/test_TestDataTypeAttr.py +++ b/oneflow/python/test/ops/test_TestDataTypeAttr.py @@ -19,6 +19,8 @@ import numpy as np import oneflow as flow import oneflow.typing as oft from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type +import unittest +import os def TestDataTypeAttr(input, output_type): @@ -49,6 +51,7 @@ def RunTest(data_type): assert output.dtype == type_name_to_np_type[data_type] +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_data_type_attr(test_case): # TODO: fix bugs in ForeignOutputKernel with "float16" and "char" dtype, do not test these two dtypes here for data_type in ["float32", "double", "int8", "int32", "int64", "uint8"]: diff --git a/oneflow/python/test/ops/test_TestDynamicSource.py b/oneflow/python/test/ops/test_TestDynamicSource.py index be8d38d13495369d58e487d34637d5ee69ad0a41..300a8b82df50dfba480f34f8abec408021385a9f 100644 --- a/oneflow/python/test/ops/test_TestDynamicSource.py +++ b/oneflow/python/test/ops/test_TestDynamicSource.py @@ -15,6 +15,8 @@ limitations under the License. """ import numpy as np import oneflow as flow +import unittest +import os def my_test_source(name): @@ -28,6 +30,7 @@ def my_test_source(name): ) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_test_dynamic_source(test_case): func_config = flow.FunctionConfig() func_config.default_data_type(flow.float) diff --git a/oneflow/python/test/ops/test_TestMultiInputGrad.py b/oneflow/python/test/ops/test_TestMultiInputGrad.py index 4178427230e2cf5ec9165d657e78c02c5f1e6138..bd6ce37e9c8514b6a7058471208fba1790e13dbc 100644 --- a/oneflow/python/test/ops/test_TestMultiInputGrad.py +++ b/oneflow/python/test/ops/test_TestMultiInputGrad.py @@ -20,6 +20,7 @@ import numpy as np import oneflow as flow import test_global_storage from test_util import GenArgList +import unittest def TestMultiInput(x1, x2): @@ -35,6 +36,7 @@ def TestMultiInput(x1, x2): ) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_TestMultiInput_grad_mirrored_inplace(test_case): func_config = flow.FunctionConfig() func_config.default_data_type(flow.float) diff --git a/oneflow/python/test/ops/test_TestMultiOutputOrder.py b/oneflow/python/test/ops/test_TestMultiOutputOrder.py index 0e9b8bdb085ea09fd48beb3d3e6bf51f07b6cc06..89dacc0664efb053745acc525de790b49d0662f5 100644 --- a/oneflow/python/test/ops/test_TestMultiOutputOrder.py +++ b/oneflow/python/test/ops/test_TestMultiOutputOrder.py @@ -16,8 +16,11 @@ limitations under the License. import numpy as np import oneflow as flow import oneflow.typing as oft +import unittest +import os +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def TestMultiOutputOrder(x, name): return ( flow.user_op_builder(name) diff --git a/oneflow/python/test/ops/test_TestReshape.py b/oneflow/python/test/ops/test_TestReshape.py index fa2885bab9a90642f05a8947a8635fc8ad3e55f0..9ef9648053891537cec5e2fe85c0641f3380a1ae 100644 --- a/oneflow/python/test/ops/test_TestReshape.py +++ b/oneflow/python/test/ops/test_TestReshape.py @@ -16,6 +16,8 @@ limitations under the License. import numpy as np import oneflow as flow import oneflow.typing as oft +import unittest +import os def TestReshape(x, shape, name): @@ -58,12 +60,14 @@ def mirrored_tensor_def_test(test_case, func_config): test_case.assertTrue(np.array_equal(x.reshape(5, 4), y)) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_fixed_TestReshape(test_case): func_config = flow.FunctionConfig() func_config.default_logical_view(flow.scope.consistent_view()) fixed_tensor_def_test(test_case, func_config) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_mirrored_TestReshape(test_case): func_config = flow.FunctionConfig() func_config.default_logical_view(flow.scope.mirrored_view()) diff --git a/oneflow/python/test/ops/test_activations.py b/oneflow/python/test/ops/test_activations.py index 4496fa7cd79705b6a75144a2ddf06e9151311d34..9befef857900e029eb7cec8fb651d8fe1b92f428 100644 --- a/oneflow/python/test/ops/test_activations.py +++ b/oneflow/python/test/ops/test_activations.py @@ -99,5 +99,6 @@ def test_activations(test_case): for arg in GenArgList(arg_dict): compare_with_tensorflow(*arg) - for act_type in arg_dict["activation_type"]: - compare_with_tensorflow("gpu", act_type, (1024, 1024), flow.float16) + if os.getenv("ONEFLOW_TEST_CPU_ONLY") is None: + for act_type in arg_dict["activation_type"]: + compare_with_tensorflow("gpu", act_type, (1024, 1024), flow.float16) diff --git a/oneflow/python/test/ops/test_all_reduce_group.py b/oneflow/python/test/ops/test_all_reduce_group.py index feeb8ade207dab18a78c4e7ee0edcda838ec38f0..a599db8341a4ef5305b5804420461332c9ae0b37 100644 --- a/oneflow/python/test/ops/test_all_reduce_group.py +++ b/oneflow/python/test/ops/test_all_reduce_group.py @@ -18,6 +18,8 @@ from collections import OrderedDict import numpy as np import oneflow as flow from test_util import GenArgList +import unittest +import os def do_test(test_case, mirrored): @@ -45,6 +47,7 @@ def do_test(test_case, mirrored): test_case.assertTrue(np.all(r2 == 0.5)) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_variable_as_loss_on_two_device(test_case): arg_dict = OrderedDict() arg_dict["mirrored"] = [True, False] diff --git a/oneflow/python/test/ops/test_assign.py b/oneflow/python/test/ops/test_assign.py index d8f47daee74826f03d37a44ed3bb4968f686507c..49e89b8eaf0c0b9140252252633b0e914fbf61c9 100644 --- a/oneflow/python/test/ops/test_assign.py +++ b/oneflow/python/test/ops/test_assign.py @@ -19,6 +19,7 @@ import numpy as np import oneflow as flow from test_util import GenArgDict import oneflow.typing as oft +import os flow_to_np_dtype_dict = { flow.int32: np.int32, @@ -39,7 +40,8 @@ def _random_input(shape, dtype): def _of_assign_and_relu(value, dtype, device_type): flow.clear_default_session() - flow.config.gpu_device_num(1) + if os.getenv("ONEFLOW_TEST_CPU_ONLY") is None: + flow.config.gpu_device_num(1) flow.config.cpu_device_num(1) func_config = flow.FunctionConfig() func_config.default_data_type(dtype) diff --git a/oneflow/python/test/ops/test_batch_normalization.py b/oneflow/python/test/ops/test_batch_normalization.py index c9d92b7a689a5fd4589a69d7fc846eefc6ce348c..6d86b731f462dbd86df58c2271f2283e23306eed 100644 --- a/oneflow/python/test/ops/test_batch_normalization.py +++ b/oneflow/python/test/ops/test_batch_normalization.py @@ -22,8 +22,10 @@ import tensorflow as tf import test_global_storage from test_util import Args, GenArgDict, type_name_to_flow_type, type_name_to_np_type import oneflow.typing as oft +import unittest +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_no_watch_scope_consistent(test_case): func_config = flow.FunctionConfig() func_config.default_logical_view(flow.scope.consistent_view()) @@ -36,6 +38,7 @@ def test_no_watch_scope_consistent(test_case): Foo(np.ones((2, 8, 32, 32), dtype=np.float32)) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_train_consistent(test_case): flow.config.enable_debug_mode(True) func_config = flow.FunctionConfig() @@ -359,6 +362,7 @@ def test_layer_batchnorm_trainable_without_training(test_case): CompareBnWithTensorFlow(**arg, training=False, trainable=True) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_nn_batchnorm(test_case): arg_dict = OrderedDict() arg_dict["input_shape"] = [(2, 4, 3, 5)] diff --git a/oneflow/python/test/ops/test_broadcast_maximum.py b/oneflow/python/test/ops/test_broadcast_maximum.py index 02f5a5701b66d7c40a0b379227a040329e213eda..321c7e8e4fba2928117adb134e4c63dce0bbd0ef 100644 --- a/oneflow/python/test/ops/test_broadcast_maximum.py +++ b/oneflow/python/test/ops/test_broadcast_maximum.py @@ -16,6 +16,8 @@ limitations under the License. import numpy as np import oneflow as flow import oneflow.typing as oft +import unittest +import os func_config = flow.FunctionConfig() func_config.default_data_type(flow.float) @@ -38,12 +40,14 @@ def _run_test(test_case, a, b, dtype, device): _check(test_case, a, b, out.numpy()) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_broadcast_maximum_random_gpu(test_case): a = np.random.rand(1024, 1024).astype(np.float32) b = np.random.rand(1024, 1024).astype(np.float32) _run_test(test_case, a, b, flow.float32, "gpu") +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_broadcast_maximum_broadcast_gpu(test_case): a = np.random.rand(1024, 1).astype(np.float32) b = np.random.rand(1, 1024).astype(np.float32) diff --git a/oneflow/python/test/ops/test_broadcast_minimum.py b/oneflow/python/test/ops/test_broadcast_minimum.py index 86caf2be34c3ca649f03e77e189472a2e8cdb8fc..840a631d46c7686cc8c8bc3b940924e34ed6df7a 100644 --- a/oneflow/python/test/ops/test_broadcast_minimum.py +++ b/oneflow/python/test/ops/test_broadcast_minimum.py @@ -16,6 +16,8 @@ limitations under the License. import numpy as np import oneflow as flow import oneflow.typing as oft +import unittest +import os func_config = flow.FunctionConfig() func_config.default_data_type(flow.float) @@ -38,12 +40,14 @@ def _run_test(test_case, a, b, dtype, device): _check(test_case, a, b, out.numpy()) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_broadcast_minimum_random_gpu(test_case): a = np.random.rand(1024, 1024).astype(np.float32) b = np.random.rand(1024, 1024).astype(np.float32) _run_test(test_case, a, b, flow.float32, "gpu") +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_broadcast_minimum_broadcast_gpu(test_case): a = np.random.rand(1024, 1).astype(np.float32) b = np.random.rand(1, 1024).astype(np.float32) diff --git a/oneflow/python/test/ops/test_categorical_ordinal_encoder.py b/oneflow/python/test/ops/test_categorical_ordinal_encoder.py index df359a737d58370930b601f822c2cfc8994fd79d..90fa55a4726da90ff8910a07776e53baede492bb 100644 --- a/oneflow/python/test/ops/test_categorical_ordinal_encoder.py +++ b/oneflow/python/test/ops/test_categorical_ordinal_encoder.py @@ -18,6 +18,8 @@ import numpy as np import oneflow as flow import oneflow.typing as oft import typing +import unittest +import os def _test_categorical_ordinal_encoder( @@ -74,6 +76,7 @@ def _test_categorical_ordinal_encoder( test_case.assertEqual(len(vk_set), unique_size) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_categorical_ordinal_encoder_gpu_large(test_case): _test_categorical_ordinal_encoder( test_case=test_case, @@ -86,6 +89,7 @@ def test_categorical_ordinal_encoder_gpu_large(test_case): ) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_categorical_ordinal_encoder_gpu_small(test_case): _test_categorical_ordinal_encoder( test_case=test_case, diff --git a/oneflow/python/test/ops/test_ccrelu.py b/oneflow/python/test/ops/test_ccrelu.py index 422de3599c03f14cd2461c68e81cf22b466beaf4..00620beba74cde61d17cc5c3484d2912dee51466 100644 --- a/oneflow/python/test/ops/test_ccrelu.py +++ b/oneflow/python/test/ops/test_ccrelu.py @@ -16,6 +16,8 @@ limitations under the License. import numpy as np import oneflow as flow import oneflow.typing as oft +import unittest +import os def ccrelu(x, name): @@ -54,18 +56,21 @@ def mirrored_tensor_def_test(test_case, func_config): test_case.assertTrue(np.array_equal(y, np.maximum(x, 0))) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_ccrelu(test_case): func_config = flow.FunctionConfig() func_config.default_logical_view(flow.scope.consistent_view()) fixed_tensor_def_test(test_case, func_config) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_mirror_ccrelu(test_case): func_config = flow.FunctionConfig() func_config.default_logical_view(flow.scope.mirrored_view()) mirrored_tensor_def_test(test_case, func_config) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_1n2c_mirror_dynamic_ccrelu(test_case): flow.config.gpu_device_num(2) func_config = flow.FunctionConfig() diff --git a/oneflow/python/test/ops/test_concat.py b/oneflow/python/test/ops/test_concat.py index 6bc7e553338a3041461229734552593f2d408a1a..2f6be5141477db3e9a99d2f336a7137e11bd40f7 100644 --- a/oneflow/python/test/ops/test_concat.py +++ b/oneflow/python/test/ops/test_concat.py @@ -20,6 +20,8 @@ import tensorflow as tf import test_global_storage import random import math +import unittest +import os from test_util import GenArgList, type_name_to_flow_type from collections import OrderedDict @@ -242,6 +244,7 @@ def _test_dynamic_concat(test_case, shape, axis, device_type, verbose=False): test_case.assertTrue(np.array_equal(of_output, output)) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_dynamic_concat_case_0(test_case): _test_dynamic_concat(test_case, (64, 4), 0, "gpu") @@ -250,6 +253,7 @@ def test_dynamic_concat_case_1(test_case): _test_dynamic_concat(test_case, (2, 10), 1, "cpu") +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_dynamic_concat_case_2(test_case): _test_dynamic_concat(test_case, (4, 7, 128), 2, "gpu") diff --git a/oneflow/python/test/ops/test_constant_like.py b/oneflow/python/test/ops/test_constant_like.py index b9bbcb71f74845f3346c960dbd9cb1f9dc6bd185..802108455b4e497dacd674b451ad452563e52625 100644 --- a/oneflow/python/test/ops/test_constant_like.py +++ b/oneflow/python/test/ops/test_constant_like.py @@ -16,6 +16,8 @@ limitations under the License. import numpy as np import oneflow as flow import oneflow.typing as oft +import unittest +import os def _check(test_case, x, y, value, dtype=None): @@ -36,6 +38,7 @@ def _run_test(test_case, x, value, dtype=None, device="gpu"): _check(test_case, x, y.numpy(), value, dtype=dtype) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_constant_like_gpu_float(test_case): x = np.random.rand(10, 3, 32, 1024).astype(np.float32) _run_test(test_case, x, 1.0, flow.float, "gpu") @@ -46,6 +49,7 @@ def test_constant_like_cpu_float(test_case): _run_test(test_case, x, 2.0, flow.float, "cpu") +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_constant_like_gpu_double(test_case): x = np.random.rand(10, 3, 32, 1024).astype(np.float32) _run_test(test_case, x, 3.0, flow.double, "gpu") @@ -56,6 +60,7 @@ def test_constant_like_cpu_double(test_case): _run_test(test_case, x, 4.0, flow.double, "cpu") +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_constant_like_gpu_int8(test_case): x = np.random.rand(10, 3, 32, 1024).astype(np.float32) _run_test(test_case, x, 5.0, flow.int8, "gpu") @@ -66,6 +71,7 @@ def test_constant_like_cpu_int8(test_case): _run_test(test_case, x, 6.0, flow.int8, "cpu") +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_constant_like_gpu_int32(test_case): x = np.random.rand(10, 3, 32, 1024).astype(np.float32) _run_test(test_case, x, 7.0, flow.int32, "gpu") @@ -76,6 +82,7 @@ def test_constant_like_cpu_int32(test_case): _run_test(test_case, x, 8.0, flow.int32, "cpu") +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_constant_like_gpu_int64(test_case): x = np.random.rand(10, 3, 32, 1024).astype(np.float32) _run_test(test_case, x, 9.0, flow.int64, "gpu") @@ -86,6 +93,7 @@ def test_constant_like_cpu_int64(test_case): _run_test(test_case, x, 10.0, flow.int64, "cpu") +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_constant_like_gpu(test_case): x = np.random.rand(10, 3, 32, 1024).astype(np.float32) _run_test(test_case, x, 11.0, device="gpu") diff --git a/oneflow/python/test/ops/test_copy_comm_net_pass_empty.py b/oneflow/python/test/ops/test_copy_comm_net_pass_empty.py index e9b8acf5a88216aa595812e98796e1f5722e9962..ff19569ca0739f735123bcde94e7af60d78c41e1 100644 --- a/oneflow/python/test/ops/test_copy_comm_net_pass_empty.py +++ b/oneflow/python/test/ops/test_copy_comm_net_pass_empty.py @@ -16,6 +16,8 @@ limitations under the License. import numpy as np import oneflow as flow import oneflow.typing as oft +import unittest +import os def ccrelu(x, name): @@ -30,6 +32,7 @@ def ccrelu(x, name): ) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") @flow.unittest.num_nodes_required(2) def test_multi_node_comm_net(test_case): func_config = flow.FunctionConfig() @@ -64,6 +67,7 @@ def test_multi_node_comm_net(test_case): ) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") @flow.unittest.num_nodes_required(2) def test_multi_node_comm_net_dynamic(test_case): func_config = flow.FunctionConfig() diff --git a/oneflow/python/test/ops/test_cpu_only_user_op.py b/oneflow/python/test/ops/test_cpu_only_user_op.py index df93e1b27d083ce86345ac578e57251e50c05880..c4dbe4bcfc0e3af0b635471489bd1399d337d9ba 100644 --- a/oneflow/python/test/ops/test_cpu_only_user_op.py +++ b/oneflow/python/test/ops/test_cpu_only_user_op.py @@ -16,6 +16,8 @@ limitations under the License. import oneflow as flow import numpy as np import oneflow.typing as oft +import unittest +import os def _cpu_only_relu(x): @@ -33,7 +35,7 @@ def _check_cpu_only_relu_device(test_case, verbose=False): flow.clear_default_session() func_config = flow.FunctionConfig() func_config.default_data_type(flow.float) - func_config.default_placement_scope(flow.scope.placement("gpu", "0:0")) + func_config.default_placement_scope(flow.scope.placement("cpu", "0:0")) @flow.global_function(function_config=func_config) def cpu_only_relu_job(x_def: oft.Numpy.Placeholder(shape=(2, 5), dtype=flow.float)): @@ -68,5 +70,6 @@ def test_cpu_only_user_op(test_case): _check_cpu_only_relu_device(test_case) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_non_cpu_only_user_op(test_case): _check_non_cpu_only_relu_device(test_case) diff --git a/oneflow/python/test/ops/test_distribute_concat.py b/oneflow/python/test/ops/test_distribute_concat.py index f33963e6c9a284ca082ae6b8c90724198c469f8b..f1dbb674fe9bb340a10737b93d7e0efa766c3698 100644 --- a/oneflow/python/test/ops/test_distribute_concat.py +++ b/oneflow/python/test/ops/test_distribute_concat.py @@ -15,8 +15,11 @@ limitations under the License. """ import numpy as np import oneflow as flow +import unittest +import os +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_deadlock(test_case): flow.config.gpu_device_num(2) func_config = flow.FunctionConfig() diff --git a/oneflow/python/test/ops/test_gather_nd.py b/oneflow/python/test/ops/test_gather_nd.py index 6097e20f3dbec909a56764e8423d9c57c11ac272..e54f5f8fbe344d1313ebd2d45559f471b549b5d1 100644 --- a/oneflow/python/test/ops/test_gather_nd.py +++ b/oneflow/python/test/ops/test_gather_nd.py @@ -20,6 +20,8 @@ import oneflow as flow import tensorflow as tf from test_util import GenArgList import oneflow.typing as oft +import unittest +import os gpus = tf.config.experimental.list_physical_devices("GPU") for gpu in gpus: @@ -271,6 +273,7 @@ def test_gather_nd_case_4(test_case): _compare_gather_nd_with_tf(test_case, *arg) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_dynamic_gather_nd(test_case): arg_dict = OrderedDict() arg_dict["params_shape"] = [(30, 15)] diff --git a/oneflow/python/test/ops/test_indexed_slices_reduce_sum.py b/oneflow/python/test/ops/test_indexed_slices_reduce_sum.py index d6f769f7ef0ad13cb8a0302dfaab5845e0b7436f..6b13671f8e88fb439506ac321811bed9e1070f60 100644 --- a/oneflow/python/test/ops/test_indexed_slices_reduce_sum.py +++ b/oneflow/python/test/ops/test_indexed_slices_reduce_sum.py @@ -16,6 +16,8 @@ limitations under the License. import numpy as np import oneflow as flow import oneflow.typing as oft +import unittest +import os func_config = flow.FunctionConfig() func_config.default_data_type(flow.float) @@ -59,6 +61,7 @@ def _run_test(test_case, indices, values, indices_dtype, values_dtype, device): ) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_indexed_slices_reduce_sum_gpu(test_case): indices = np.random.randint(0, 32, 1024).astype(np.int32) values = np.random.rand(1024, 8).astype(np.float32) diff --git a/oneflow/python/test/ops/test_scatter_nd.py b/oneflow/python/test/ops/test_scatter_nd.py index 08942caa0a5c11a0e067fe709df1e237e50c8d6e..40fce1777278d981c3e7d689dcf8a994ac59aaad 100644 --- a/oneflow/python/test/ops/test_scatter_nd.py +++ b/oneflow/python/test/ops/test_scatter_nd.py @@ -20,6 +20,8 @@ import oneflow as flow import tensorflow as tf from test_util import GenArgList import oneflow.typing as oft +import unittest +import os gpus = tf.config.experimental.list_physical_devices("GPU") for gpu in gpus: @@ -624,6 +626,7 @@ def test_tensor_scatter_nd_add_case2(test_case): _compare_tensor_scatter_nd_add_with_tf(test_case, *arg) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_scatter_nd_dynamic_indices(test_case): arg_dict = OrderedDict() arg_dict["indices_shape"] = [(12, 10, 2)] @@ -635,6 +638,7 @@ def test_scatter_nd_dynamic_indices(test_case): _compare_scatter_nd_dynamic_indices_with_tf(test_case, *arg) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_scatter_nd_empty_indices(test_case): arg_dict = OrderedDict() arg_dict["indices_shape"] = [(0, 1)] @@ -646,6 +650,7 @@ def test_scatter_nd_empty_indices(test_case): _compare_scatter_nd_dynamic_indices_with_tf(test_case, *arg) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_tensor_scatter_nd_update_dynamic_indices(test_case): arg_dict = OrderedDict() arg_dict["params_shape"] = [(32, 33, 4, 5)] @@ -657,6 +662,7 @@ def test_tensor_scatter_nd_update_dynamic_indices(test_case): _compare_tensor_scatter_nd_update_dynamic_indices_with_tf(test_case, *arg) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_tensor_scatter_nd_update_empty_indices(test_case): arg_dict = OrderedDict() arg_dict["params_shape"] = [(37, 14)] @@ -668,6 +674,7 @@ def test_tensor_scatter_nd_update_empty_indices(test_case): _compare_tensor_scatter_nd_update_dynamic_indices_with_tf(test_case, *arg) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_tensor_scatter_nd_add_dynamic_indices(test_case): arg_dict = OrderedDict() arg_dict["params_shape"] = [(2, 9, 7, 5, 4)] @@ -679,6 +686,7 @@ def test_tensor_scatter_nd_add_dynamic_indices(test_case): _compare_tensor_scatter_nd_add_dynamic_indices_with_tf(test_case, *arg) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_tensor_scatter_nd_add_empty_indices(test_case): arg_dict = OrderedDict() arg_dict["params_shape"] = [(24, 30, 14)] diff --git a/oneflow/python/test/ops/test_slice_v2.py b/oneflow/python/test/ops/test_slice_v2.py index 5feebfce148a7fbca36e9c2acfc078a47a61e399..951d8788d00b141bea587e9629094a4296b66339 100644 --- a/oneflow/python/test/ops/test_slice_v2.py +++ b/oneflow/python/test/ops/test_slice_v2.py @@ -16,6 +16,8 @@ limitations under the License. import numpy as np import oneflow as flow import oneflow.typing as oft +import unittest +import os def _run_slice(input, index_args, dynamic=False, dtype=flow.float, input_shape=None): @@ -60,6 +62,7 @@ def _check(test_case, ref, out): test_case.assertTrue(np.allclose(_ref, _out)) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_slice_into_two_parts(test_case): input = np.random.rand(2, 5, 4).astype(np.float32) results = [input[:, 0:2, :], input[:, 2:, :]] @@ -71,6 +74,7 @@ def test_slice_into_two_parts(test_case): _check(test_case, results, outputs) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_slice_at_first_dim(test_case): input = np.random.rand(4, 5, 4).astype(np.float32) results = [input[2:None, :, :]] @@ -79,6 +83,7 @@ def test_slice_at_first_dim(test_case): _check(test_case, results, outputs) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_slice_at_two_dims(test_case): input = np.random.rand(2, 5, 4).astype(np.float32) results = [input[:, 0:2, 2:]] @@ -87,6 +92,7 @@ def test_slice_at_two_dims(test_case): _check(test_case, results, outputs) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_slice_with_collapse_dims(test_case): input = np.random.rand(2, 5, 4, 4, 3).astype(np.float32) results = [input[:, 0:2, :, :, 1:None]] @@ -103,6 +109,7 @@ def test_slice_with_collapse_dims(test_case): _check(test_case, results, outputs) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_dynamic_slice(test_case): input = np.random.rand(2, 4, 4).astype(np.float32) results = [input[:, 1:, :]] @@ -111,6 +118,7 @@ def test_dynamic_slice(test_case): _check(test_case, results, outputs) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_dynamic_slice_case2(test_case): input = np.random.rand(2, 6, 3).astype(np.float32) results = [input[:, 1:, :]] @@ -119,6 +127,7 @@ def test_dynamic_slice_case2(test_case): _check(test_case, results, outputs) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_dynamic_slice_at_two_dims(test_case): input = np.random.rand(2, 3, 2, 2).astype(np.float32) results = [input[:, 2:, :, 1:]] @@ -127,6 +136,7 @@ def test_dynamic_slice_at_two_dims(test_case): _check(test_case, results, outputs) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_dynamic_slice_at_first_dim_and_anthor_dim(test_case): input = np.random.rand(3, 6, 3, 3).astype(np.float32) results = [input[1:, :, :, 1:]] @@ -135,6 +145,7 @@ def test_dynamic_slice_at_first_dim_and_anthor_dim(test_case): _check(test_case, results, outputs) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_slice_with_stride2(test_case): input = np.random.rand(2, 5, 4).astype(np.float32) results = [input[:, 1::2, :]] @@ -143,6 +154,7 @@ def test_slice_with_stride2(test_case): _check(test_case, results, outputs) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_slice_at_two_dim_with_stride_more_than_one(test_case): input = np.random.rand(2, 5, 4).astype(np.float32) results = [input[:, 1::3, ::2]] @@ -151,6 +163,7 @@ def test_slice_at_two_dim_with_stride_more_than_one(test_case): _check(test_case, results, outputs) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_slice_with_neg_index(test_case): input = np.random.rand(2, 5, 4).astype(np.float32) results = [input[:, 2:-2, :]] @@ -159,6 +172,7 @@ def test_slice_with_neg_index(test_case): _check(test_case, results, outputs) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_slice_with_neg_stride(test_case): input = np.random.rand(2, 5, 4).astype(np.float32) results = [input[:, 3:-4:-1, :]] @@ -167,6 +181,7 @@ def test_slice_with_neg_stride(test_case): _check(test_case, results, outputs) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_slice_with_neg_stride2(test_case): input = np.random.rand(2, 5, 4).astype(np.float32) results = [input[:, -1:1:-2, :]] @@ -175,6 +190,7 @@ def test_slice_with_neg_stride2(test_case): _check(test_case, results, outputs) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_slice_grad(test_case): input = np.random.rand(2, 5, 4).astype(np.float32) ref = np.zeros(input.shape, dtype=np.float32) diff --git a/oneflow/python/test/ops/test_square_sum.py b/oneflow/python/test/ops/test_square_sum.py index 35091851c6a3da770d15fce723b7e5050e89282e..b8da39d3af1888de9c43c37f97e3c32ff43061fc 100644 --- a/oneflow/python/test/ops/test_square_sum.py +++ b/oneflow/python/test/ops/test_square_sum.py @@ -16,6 +16,8 @@ limitations under the License. import numpy as np import oneflow as flow import oneflow.typing as oft +import unittest +import os func_config = flow.FunctionConfig() func_config.default_data_type(flow.float) @@ -36,11 +38,13 @@ def _run_test(test_case, x, dtype, device): _check(test_case, x, y.numpy()) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_square_sum_random_gpu(test_case): x = np.random.uniform(-0.01, 0.01, (64, 64)).astype(np.float32) _run_test(test_case, x, flow.float32, "gpu") +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_square_sum_small_blob_gpu(test_case): x = np.random.uniform(-0.01, 0.01, (64,)).astype(np.float32) _run_test(test_case, x, flow.float32, "gpu") diff --git a/oneflow/python/test/ops/test_squeeze.py b/oneflow/python/test/ops/test_squeeze.py index 4366e652e80f1864b9e9aff0f36224b0e7492d38..34f8f183f86fd059f205656a520689b69889df7c 100644 --- a/oneflow/python/test/ops/test_squeeze.py +++ b/oneflow/python/test/ops/test_squeeze.py @@ -15,6 +15,7 @@ limitations under the License. """ from collections import OrderedDict +import os import numpy as np import oneflow as flow import tensorflow as tf @@ -74,7 +75,8 @@ def gen_arg_list(): def test_squeeze(test_case): for arg in gen_arg_list(): compare_with_tensorflow(*arg) - compare_with_tensorflow("gpu", (1, 1, 1), [0, 1, 2]) + if os.getenv("ONEFLOW_TEST_CPU_ONLY") is None: + compare_with_tensorflow("gpu", (1, 1, 1), [0, 1, 2]) + compare_with_tensorflow("gpu", (5, 6, 7), None) compare_with_tensorflow("cpu", (1, 1, 1), [0, 1, 2]) - compare_with_tensorflow("gpu", (5, 6, 7), None) compare_with_tensorflow("cpu", (5, 6, 7), None) diff --git a/oneflow/python/test/ops/test_tensor_list_split.py b/oneflow/python/test/ops/test_tensor_list_split.py index afacc4a1d67129cdd874b3d6306cb384079da443..e337ea657e6dbb73ee0016c848afc82c3133cf78 100644 --- a/oneflow/python/test/ops/test_tensor_list_split.py +++ b/oneflow/python/test/ops/test_tensor_list_split.py @@ -16,6 +16,8 @@ limitations under the License. import numpy as np import oneflow as flow import oneflow.typing as oft +import unittest +import os def _gen_random_input_list(input_static_shape): @@ -52,6 +54,7 @@ def _of_tensor_list_split(input_tensor_list, input_static_shape, device_tag="gpu return [output.numpy_list()[0] for output in outputs] +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_tensor_list_input_output(test_case, verbose=False): input_shape = [2, 5, 4] input_list = _gen_random_input_list(input_shape) diff --git a/oneflow/python/test/ops/test_unique.py b/oneflow/python/test/ops/test_unique.py index ec5033a5c6df03598fc182541b3fd3478a135e60..895e6ea836975191edb9f8a30a1c84f41ac76ac9 100644 --- a/oneflow/python/test/ops/test_unique.py +++ b/oneflow/python/test/ops/test_unique.py @@ -16,6 +16,8 @@ limitations under the License. import numpy as np import oneflow as flow import oneflow.typing as oft +import unittest +import os func_config = flow.FunctionConfig() func_config.default_data_type(flow.float) @@ -48,18 +50,21 @@ def _run_test(test_case, x, dtype, device): ) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_unique_with_counts_int(test_case): x = np.asarray(list(range(32)) * 2).astype(np.int32) np.random.shuffle(x) _run_test(test_case, x, flow.int32, "gpu") +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_unique_with_counts_float(test_case): x = np.asarray(list(range(32)) * 2).astype(np.float32) np.random.shuffle(x) _run_test(test_case, x, flow.float32, "gpu") +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_unique_with_counts_random_gpu(test_case): x = np.random.randint(0, 32, 1024).astype(np.int32) np.random.shuffle(x) diff --git a/oneflow/python/test/ops/test_util.py b/oneflow/python/test/ops/test_util.py index 63f4972ce222f6fad2be8b244a294d79337cff92..7acfc11743e6d4ff7209b58f01596ec0e7fd2ab7 100644 --- a/oneflow/python/test/ops/test_util.py +++ b/oneflow/python/test/ops/test_util.py @@ -30,6 +30,9 @@ def GenCartesianProduct(sets): assert isinstance(sets, Iterable) for set in sets: assert isinstance(set, Iterable) + if os.getenv("ONEFLOW_TEST_CPU_ONLY"): + if "gpu" in set: + set.remove("gpu") return itertools.product(*sets) diff --git a/oneflow/user/kernels/broadcast_div_grad_kernel.cpp b/oneflow/user/kernels/broadcast_div_grad_kernel.cpp index 7f3f62e4585cd463d8d5795e7517db3056605a09..e97b61f1450aafaa180b1c58298058df364baf53 100644 --- a/oneflow/user/kernels/broadcast_div_grad_kernel.cpp +++ b/oneflow/user/kernels/broadcast_div_grad_kernel.cpp @@ -71,7 +71,9 @@ class BroadcastDivGradKernel final : public user_op::OpKernel { OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_BROADCAST_DIV_GRAD_KERNEL, DEVICE_TYPE_SEQ, ARITHMETIC_DATA_TYPE_SEQ) +#ifdef WITH_CUDA OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_BROADCAST_DIV_GRAD_KERNEL, (DeviceType::kGPU), FLOAT16_DATA_TYPE_SEQ) +#endif } // namespace oneflow diff --git a/oneflow/user/kernels/broadcast_like_kernel.cpp b/oneflow/user/kernels/broadcast_like_kernel.cpp index 6f3326b3c1f1a89dc5ceaae91e33fa623db5e94d..539e3bae330fbdb2e0599b647035237db603cf02 100644 --- a/oneflow/user/kernels/broadcast_like_kernel.cpp +++ b/oneflow/user/kernels/broadcast_like_kernel.cpp @@ -50,9 +50,14 @@ class BroadcastLikeKernel final : public user_op::OpKernel { .SetIsMatchedHob((user_op::HobDeviceType() == device) \ & (user_op::HobDataType("y", 0) == GetDataType<dtype>::value)); +#ifdef WITH_CUDA #define REGISTER_BROADCAST_LIKE_KERNEL(dtype) \ REGISTER_BROADCAST_LIKE_XPU_KERNEL(DeviceType::kCPU, dtype) \ REGISTER_BROADCAST_LIKE_XPU_KERNEL(DeviceType::kGPU, dtype) +#else +#define REGISTER_BROADCAST_LIKE_KERNEL(dtype) \ + REGISTER_BROADCAST_LIKE_XPU_KERNEL(DeviceType::kCPU, dtype) +#endif REGISTER_BROADCAST_LIKE_KERNEL(float) REGISTER_BROADCAST_LIKE_KERNEL(float16) diff --git a/oneflow/user/kernels/cast_kernel.cpp b/oneflow/user/kernels/cast_kernel.cpp index f5b633d6e600436093b206bd98943c5fb9a91351..ba729c649be92b0e3117962ed121dcaf42cf2153 100644 --- a/oneflow/user/kernels/cast_kernel.cpp +++ b/oneflow/user/kernels/cast_kernel.cpp @@ -35,7 +35,11 @@ struct CopyTensor<DeviceType::kCPU, T, U> { template<typename T, typename U> struct CopyTensor<DeviceType::kGPU, T, U> { static void Call(DeviceCtx* ctx, const Tensor* src, Tensor* dst) { +#ifdef WITH_CUDA CopyElemOnGpu(ctx, src->dptr<T>(), dst->mut_dptr<U>(), src->shape().elem_cnt()); +#else + UNIMPLEMENTED(); +#endif } }; diff --git a/oneflow/user/kernels/categorical_ordinal_encode_kernel.cpp b/oneflow/user/kernels/categorical_ordinal_encode_kernel.cpp index b2670d11bc4a75a538eb720482c90502576f568b..575cf06e5a2f02251e525d3e97400de959846ade 100644 --- a/oneflow/user/kernels/categorical_ordinal_encode_kernel.cpp +++ b/oneflow/user/kernels/categorical_ordinal_encode_kernel.cpp @@ -50,7 +50,9 @@ class CategoricalOrdinalEncodeKernel final : public user_op::OpKernel { REGISTER_CATEGORICAL_ORDINAL_ENCODE_KERNEL(DeviceType::kCPU, DataType::kInt32, int32_t); REGISTER_CATEGORICAL_ORDINAL_ENCODE_KERNEL(DeviceType::kCPU, DataType::kInt64, int64_t); +#ifdef WITH_CUDA REGISTER_CATEGORICAL_ORDINAL_ENCODE_KERNEL(DeviceType::kGPU, DataType::kInt32, int32_t); REGISTER_CATEGORICAL_ORDINAL_ENCODE_KERNEL(DeviceType::kGPU, DataType::kInt64, int64_t); +#endif } // namespace oneflow diff --git a/oneflow/user/kernels/concat_kernel.cpp b/oneflow/user/kernels/concat_kernel.cpp index c51d1c3970ef31510fd402f19523864bee03700b..a713b45bd0cbcdec04fbbfa28fa9f075cde9308f 100644 --- a/oneflow/user/kernels/concat_kernel.cpp +++ b/oneflow/user/kernels/concat_kernel.cpp @@ -89,7 +89,9 @@ class ConcatKernel final : public user_op::OpKernel { REGISTER_CONCAT_KERNEL(device, int64_t) REGISTER_CONCAT_KERNEL_WITH_DEVICE(DeviceType::kCPU) +#ifdef WITH_CUDA REGISTER_CONCAT_KERNEL_WITH_DEVICE(DeviceType::kGPU) REGISTER_CONCAT_KERNEL(DeviceType::kGPU, float16) +#endif } // namespace oneflow diff --git a/oneflow/user/kernels/conv_cudnn_kernels.cpp b/oneflow/user/kernels/conv_cudnn_kernels.cpp index 1faf453607ed6c53557993f4f7e2d8ed70985796..b4d9a3cf5297abedc243ecc2bb82ef54ea51cb66 100644 --- a/oneflow/user/kernels/conv_cudnn_kernels.cpp +++ b/oneflow/user/kernels/conv_cudnn_kernels.cpp @@ -13,6 +13,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef WITH_CUDA + #include "oneflow/core/framework/framework.h" #include "oneflow/user/ops/nn_util.h" #include "oneflow/core/device/cudnn_conv_util.h" @@ -382,3 +384,5 @@ REGISTER_CONV_BIAS_GRAD_FLOATING_KERNEL(float16); } // namespace } // namespace oneflow + +#endif diff --git a/oneflow/user/kernels/deconv_cudnn_kernel.cpp b/oneflow/user/kernels/deconv_cudnn_kernel.cpp index dccc2770ac0f51da8d29dc019578a799cae27f95..65f4847567cb779e542071cf3e185f6dbb878db8 100644 --- a/oneflow/user/kernels/deconv_cudnn_kernel.cpp +++ b/oneflow/user/kernels/deconv_cudnn_kernel.cpp @@ -13,6 +13,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef WITH_CUDA + #include "oneflow/core/framework/framework.h" #include "oneflow/user/ops/nn_util.h" #include "oneflow/core/device/cudnn_conv_util.h" @@ -150,3 +152,5 @@ REGISTER_DECONV_KERNEL(deconv2d, double, 2); REGISTER_DECONV_KERNEL(deconv3d, double, 3); } // namespace oneflow + +#endif diff --git a/oneflow/user/kernels/dropout_kernel.cpp b/oneflow/user/kernels/dropout_kernel.cpp index 0639c366f12a83c295059e138054824e91708d48..4605e458df97c0be150052068833b239dece565c 100644 --- a/oneflow/user/kernels/dropout_kernel.cpp +++ b/oneflow/user/kernels/dropout_kernel.cpp @@ -124,7 +124,9 @@ class RandomMaskLikeKernel final : public user_op::OpKernel { .SetIsMatchedHob(user_op::HobDeviceType() == device); REGISTER_RANDOM_MASK_LIKE_KERNEL(DeviceType::kCPU) +#ifdef WITH_CUDA REGISTER_RANDOM_MASK_LIKE_KERNEL(DeviceType::kGPU) +#endif } // namespace } // namespace oneflow diff --git a/oneflow/user/kernels/expand_dims_kernel.cpp b/oneflow/user/kernels/expand_dims_kernel.cpp index 56999e8bc6102426b9d6164137fd08865eb320d4..ffe221771af780f76efd4d147f8a5a4bed9bc35b 100644 --- a/oneflow/user/kernels/expand_dims_kernel.cpp +++ b/oneflow/user/kernels/expand_dims_kernel.cpp @@ -29,6 +29,8 @@ namespace oneflow { }); REGISTER_EXPAND_DIMS_KERNEL(kCPU) +#ifdef WITH_CUDA REGISTER_EXPAND_DIMS_KERNEL(kGPU) +#endif } // namespace oneflow diff --git a/oneflow/user/kernels/identity_kernel.cpp b/oneflow/user/kernels/identity_kernel.cpp index 1be28e4fa0a405e81ecddc34597fbe173b7666fe..b7e5261ae8b4c07a095a2669eef1ae789ae6a23f 100644 --- a/oneflow/user/kernels/identity_kernel.cpp +++ b/oneflow/user/kernels/identity_kernel.cpp @@ -51,7 +51,9 @@ class IdentityKernel final : public user_op::OpKernel { }); REGISTER_IDENTITY_KERNEL(DeviceType::kCPU) +#ifdef WITH_CUDA REGISTER_IDENTITY_KERNEL(DeviceType::kGPU) +#endif } // namespace diff --git a/oneflow/user/kernels/layer_norm_gpu_kernel.cpp b/oneflow/user/kernels/layer_norm_gpu_kernel.cpp index e02942a02275927a264832cb8685219fb1587e5d..0dfba2b0e3160c3646c1458a0366396d26bd00f0 100644 --- a/oneflow/user/kernels/layer_norm_gpu_kernel.cpp +++ b/oneflow/user/kernels/layer_norm_gpu_kernel.cpp @@ -13,6 +13,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef WITH_CUDA + #include "oneflow/core/device/cudnn_util.h" #include "oneflow/core/framework/framework.h" #include "oneflow/core/ndarray/ndarray_util.h" @@ -252,3 +254,5 @@ REGISTER_LAYER_NORM_PARAM_GRAD_GPU_KERNEL(double) REGISTER_LAYER_NORM_PARAM_GRAD_GPU_KERNEL(float16) } // namespace oneflow + +#endif diff --git a/oneflow/user/kernels/math_binary_broadcast_kernels.cpp b/oneflow/user/kernels/math_binary_broadcast_kernels.cpp index b49b3276b450d495885417bcf20c15670ba8870c..068dbf9e2f5ab96ef2063976c13761d0af35285e 100644 --- a/oneflow/user/kernels/math_binary_broadcast_kernels.cpp +++ b/oneflow/user/kernels/math_binary_broadcast_kernels.cpp @@ -59,9 +59,11 @@ OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MATH_BINARY_BROADCAST_KERNEL, MATH_BINARY_BROADCAST_FUNC_SEQ, DEVICE_TYPE_SEQ, ARITHMETIC_DATA_TYPE_SEQ) // gpu half +#ifdef WITH_CUDA OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MATH_BINARY_BROADCAST_KERNEL, MATH_BINARY_BROADCAST_FUNC_SEQ, (DeviceType::kGPU), FLOAT16_DATA_TYPE_SEQ) +#endif #define REGISTER_MATH_BINARY_BROADCAST_LOGICAL_KERNEL(math_type_pair, device, data_type_pair) \ REGISTER_USER_KERNEL(OF_PP_PAIR_FIRST(math_type_pair)) \ diff --git a/oneflow/user/kernels/matmul_kernels.cpp b/oneflow/user/kernels/matmul_kernels.cpp index 93519d578ee12a4ab32e2edb911a609b2c24aafb..f81b044961de7ee5b726626a8bdba04b6f0e81b5 100644 --- a/oneflow/user/kernels/matmul_kernels.cpp +++ b/oneflow/user/kernels/matmul_kernels.cpp @@ -70,9 +70,12 @@ class MatmulFloatingKernel final : public user_op::OpKernel { REGISTER_MATMUL_KERNEL(DeviceType::kCPU, float); REGISTER_MATMUL_KERNEL(DeviceType::kCPU, double); +#ifdef WITH_CUDA REGISTER_MATMUL_KERNEL(DeviceType::kGPU, float); REGISTER_MATMUL_KERNEL(DeviceType::kGPU, double); +#endif +#ifdef WITH_CUDA class MatmulGpuHalfKernel final : public user_op::OpKernel { public: MatmulGpuHalfKernel() = default; @@ -103,10 +106,13 @@ class MatmulGpuHalfKernel final : public user_op::OpKernel { } } }; +#endif +#ifdef WITH_CUDA REGISTER_USER_KERNEL("matmul").SetCreateFn<MatmulGpuHalfKernel>().SetIsMatchedHob( (user_op::HobDeviceType() == DeviceType::kGPU) & (user_op::HobDataType("a", 0) == DataType::kFloat16)); +#endif template<DeviceType device_type, typename T> class BatchMatmulFloatingKernel final : public user_op::OpKernel { @@ -152,9 +158,12 @@ class BatchMatmulFloatingKernel final : public user_op::OpKernel { REGISTER_BATCH_MATMUL_KERNEL(DeviceType::kCPU, float); REGISTER_BATCH_MATMUL_KERNEL(DeviceType::kCPU, double); +#ifdef WITH_CUDA REGISTER_BATCH_MATMUL_KERNEL(DeviceType::kGPU, float); REGISTER_BATCH_MATMUL_KERNEL(DeviceType::kGPU, double); +#endif +#ifdef WITH_CUDA class BatchMatmulGpuHalfKernel final : public user_op::OpKernel { public: BatchMatmulGpuHalfKernel() = default; @@ -202,5 +211,6 @@ REGISTER_USER_KERNEL("batch_matmul") size_t batch_num = a->shape().Count(0, num_axes - 2); return sizeof(int64_t) * 3 * batch_num; }); +#endif } // namespace oneflow diff --git a/oneflow/user/kernels/normalization_kernel.cpp b/oneflow/user/kernels/normalization_kernel.cpp index 350c73c4c01728455fc1b599b9395011e7a4db19..5f3e5dd2733136612aecec62a166060b462b692b 100644 --- a/oneflow/user/kernels/normalization_kernel.cpp +++ b/oneflow/user/kernels/normalization_kernel.cpp @@ -13,6 +13,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef WITH_CUDA + #include "oneflow/core/framework/framework.h" #include "oneflow/core/device/cudnn_util.h" @@ -372,3 +374,5 @@ REGISTER_BN_GRAD_KERNEL(double) } // namespace } // namespace oneflow + +#endif diff --git a/oneflow/user/kernels/pad_kernel.cpp b/oneflow/user/kernels/pad_kernel.cpp index 471794c37d814a5d8888a417c2168719d6be53b0..51fb2c32c8c1a3b9a4304f278d7c7e29d49f3516 100644 --- a/oneflow/user/kernels/pad_kernel.cpp +++ b/oneflow/user/kernels/pad_kernel.cpp @@ -111,12 +111,14 @@ class PadKernel final : public user_op::OpKernel { (user_op::HobDeviceType() == dev) \ & (user_op::HobDataType("y", 0) == GetDataType<dtype>::value)); +#ifdef WITH_CUDA REGISTER_PAD_KERNEL(DeviceType::kGPU, double) REGISTER_PAD_KERNEL(DeviceType::kGPU, float) REGISTER_PAD_KERNEL(DeviceType::kGPU, float16) REGISTER_PAD_KERNEL(DeviceType::kGPU, int32_t) REGISTER_PAD_KERNEL(DeviceType::kGPU, int64_t) REGISTER_PAD_KERNEL(DeviceType::kGPU, int8_t) +#endif REGISTER_PAD_KERNEL(DeviceType::kCPU, double) REGISTER_PAD_KERNEL(DeviceType::kCPU, float) REGISTER_PAD_KERNEL(DeviceType::kCPU, int32_t) @@ -168,12 +170,14 @@ class PadGradKernel final : public user_op::OpKernel { .SetIsMatchedHob((user_op::HobDeviceType() == dev) \ & (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value)); +#ifdef WITH_CUDA REGISTER_PAD_GRAD_KERNEL(DeviceType::kGPU, double) REGISTER_PAD_GRAD_KERNEL(DeviceType::kGPU, float) REGISTER_PAD_GRAD_KERNEL(DeviceType::kGPU, float16) REGISTER_PAD_GRAD_KERNEL(DeviceType::kGPU, int32_t) REGISTER_PAD_GRAD_KERNEL(DeviceType::kGPU, int64_t) REGISTER_PAD_GRAD_KERNEL(DeviceType::kGPU, int8_t) +#endif REGISTER_PAD_GRAD_KERNEL(DeviceType::kCPU, double) REGISTER_PAD_GRAD_KERNEL(DeviceType::kCPU, float) REGISTER_PAD_GRAD_KERNEL(DeviceType::kCPU, int32_t) diff --git a/oneflow/user/kernels/pool_gpu_kernel.cpp b/oneflow/user/kernels/pool_gpu_kernel.cpp index c7848b70de766b1e6d8bd75c4429cb52f2480ed0..aefe39c3a5bc23360abcdd0599618563244035c4 100644 --- a/oneflow/user/kernels/pool_gpu_kernel.cpp +++ b/oneflow/user/kernels/pool_gpu_kernel.cpp @@ -13,6 +13,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef WITH_CUDA + #include "oneflow/core/framework/framework.h" #include "oneflow/user/utils/pool_util.h" #include "oneflow/core/device/cudnn_util.h" @@ -439,3 +441,5 @@ REGISTER_POOL_GPU_KERNEL(double) REGISTER_POOL_GPU_KERNEL(float16) } // namespace oneflow + +#endif diff --git a/oneflow/user/kernels/random_mask_generator.h b/oneflow/user/kernels/random_mask_generator.h index 87d10a8432260b0b8c33bdfb899fe2d9708cd820..b1d87a78db7a5e689f475e6ca362ab302bc01368 100644 --- a/oneflow/user/kernels/random_mask_generator.h +++ b/oneflow/user/kernels/random_mask_generator.h @@ -18,8 +18,10 @@ limitations under the License. #include "oneflow/core/common/data_type.h" #include "oneflow/core/device/device_context.h" +#ifdef WITH_CUDA #include <curand.h> #include <curand_kernel.h> +#endif namespace oneflow { @@ -39,6 +41,7 @@ class RandomMaskGenerator<DeviceType::kCPU> final { std::mt19937 mt19937_generator_; }; +#ifdef WITH_CUDA template<> class RandomMaskGenerator<DeviceType::kGPU> final { public: @@ -53,6 +56,7 @@ class RandomMaskGenerator<DeviceType::kGPU> final { int32_t block_num_; int32_t thread_num_; }; +#endif } // namespace oneflow diff --git a/oneflow/user/kernels/reduce_kernel.cpp b/oneflow/user/kernels/reduce_kernel.cpp index dd7ce0fab8f8d413bea66b839d4553d40c5356c5..2036c98c1dd5179b85f7b757cd925b74d2bab037 100644 --- a/oneflow/user/kernels/reduce_kernel.cpp +++ b/oneflow/user/kernels/reduce_kernel.cpp @@ -69,15 +69,18 @@ class ReduceKernel final : public user_op::OpKernel { REGISTER_REDUCE_ARITHMETIC_KERNELS(device, int64_t) REGISTER_REDUCE_ARITHMETIC_KERNELS_BY_DEVICE(DeviceType::kCPU) +#ifdef WITH_CUDA REGISTER_REDUCE_ARITHMETIC_KERNELS_BY_DEVICE(DeviceType::kGPU) +#endif #define REGISTER_REDUCE_LOGICAL_KERNELS(device) \ REGISTER_REDUCE_XPU_KERNEL("reduce_any", BinaryFuncAny, device, int8_t) \ REGISTER_REDUCE_XPU_KERNEL("reduce_all", BinaryFuncAll, device, int8_t) REGISTER_REDUCE_LOGICAL_KERNELS(DeviceType::kCPU) +#ifdef WITH_CUDA REGISTER_REDUCE_LOGICAL_KERNELS(DeviceType::kGPU) - REGISTER_REDUCE_XPU_KERNEL("reduce_sum", BinaryFuncSum, DeviceType::kGPU, float16) +#endif } // namespace oneflow diff --git a/oneflow/user/kernels/relu_kernel.cpp b/oneflow/user/kernels/relu_kernel.cpp index 6548fd44ac01a68175d0ea783a7bced963974e2e..53dfdb9b5540482ca5592beb44989335da5753f6 100644 --- a/oneflow/user/kernels/relu_kernel.cpp +++ b/oneflow/user/kernels/relu_kernel.cpp @@ -49,9 +49,11 @@ class ReluKernel final : public user_op::OpKernel { REGISTER_RELU_KERNEL(DeviceType::kCPU, float) REGISTER_RELU_KERNEL(DeviceType::kCPU, double) +#ifdef WITH_CUDA REGISTER_RELU_KERNEL(DeviceType::kGPU, float) REGISTER_RELU_KERNEL(DeviceType::kGPU, double) REGISTER_RELU_KERNEL(DeviceType::kGPU, float16) +#endif template<DeviceType device_type, typename T> class ReluGradKernel final : public user_op::OpKernel { @@ -84,9 +86,11 @@ class ReluGradKernel final : public user_op::OpKernel { REGISTER_RELU_GRAD_KERNEL(DeviceType::kCPU, float) REGISTER_RELU_GRAD_KERNEL(DeviceType::kCPU, double) +#ifdef WITH_CUDA REGISTER_RELU_GRAD_KERNEL(DeviceType::kGPU, float) REGISTER_RELU_GRAD_KERNEL(DeviceType::kGPU, double) REGISTER_RELU_GRAD_KERNEL(DeviceType::kGPU, float16) +#endif } // namespace diff --git a/oneflow/user/kernels/reshape_kernel.cpp b/oneflow/user/kernels/reshape_kernel.cpp index c4712e606333676f607ec00a75f48f3889ac208c..e854ac5332dbf3a18aeb59e9274892d8024a533f 100644 --- a/oneflow/user/kernels/reshape_kernel.cpp +++ b/oneflow/user/kernels/reshape_kernel.cpp @@ -29,5 +29,8 @@ namespace oneflow { }); REGISTER_RESHAPE_KERNEL(DeviceType::kCPU) +#ifdef WITH_CUDA REGISTER_RESHAPE_KERNEL(DeviceType::kGPU) +#endif + } // namespace oneflow diff --git a/oneflow/user/kernels/reshape_like_kernel.cpp b/oneflow/user/kernels/reshape_like_kernel.cpp index c394a2a9c64d23bd7eb8f7c57fd95026934ac2e7..4f5dc945519235bf156b079e1439085e1b3120b1 100644 --- a/oneflow/user/kernels/reshape_like_kernel.cpp +++ b/oneflow/user/kernels/reshape_like_kernel.cpp @@ -29,6 +29,8 @@ namespace oneflow { }); REGISTER_RESHAPE_LIKE_KERNEL(kCPU) +#ifdef WITH_CUDA REGISTER_RESHAPE_LIKE_KERNEL(kGPU) +#endif } // namespace oneflow diff --git a/oneflow/user/kernels/same_padding_kernel.cpp b/oneflow/user/kernels/same_padding_kernel.cpp index a15148112c55a793eef3fccf751ba63621bd4409..9929d0df0fdbed439f626439e2de520db972c8f7 100644 --- a/oneflow/user/kernels/same_padding_kernel.cpp +++ b/oneflow/user/kernels/same_padding_kernel.cpp @@ -87,12 +87,14 @@ class SamePaddingKernel final : public user_op::OpKernel { .SetIsMatchedHob((user_op::HobDeviceType() == dev) \ & (user_op::HobDataType("y", 0) == GetDataType<dtype>::value)); +#ifdef WITH_CUDA REGISTER_SAME_PADDING_KERNEL(DeviceType::kGPU, double) REGISTER_SAME_PADDING_KERNEL(DeviceType::kGPU, float) REGISTER_SAME_PADDING_KERNEL(DeviceType::kGPU, float16) REGISTER_SAME_PADDING_KERNEL(DeviceType::kGPU, int32_t) REGISTER_SAME_PADDING_KERNEL(DeviceType::kGPU, int64_t) REGISTER_SAME_PADDING_KERNEL(DeviceType::kGPU, int8_t) +#endif REGISTER_SAME_PADDING_KERNEL(DeviceType::kCPU, double) REGISTER_SAME_PADDING_KERNEL(DeviceType::kCPU, float) REGISTER_SAME_PADDING_KERNEL(DeviceType::kCPU, int32_t) @@ -163,12 +165,14 @@ class SamePaddingGradKernel final : public user_op::OpKernel { .SetIsMatchedHob((user_op::HobDeviceType() == dev) \ & (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value)); +#ifdef WITH_CUDA REGISTER_SAME_PADDING_GRAD_KERNEL(DeviceType::kGPU, double) REGISTER_SAME_PADDING_GRAD_KERNEL(DeviceType::kGPU, float) REGISTER_SAME_PADDING_GRAD_KERNEL(DeviceType::kGPU, float16) REGISTER_SAME_PADDING_GRAD_KERNEL(DeviceType::kGPU, int32_t) REGISTER_SAME_PADDING_GRAD_KERNEL(DeviceType::kGPU, int64_t) REGISTER_SAME_PADDING_GRAD_KERNEL(DeviceType::kGPU, int8_t) +#endif REGISTER_SAME_PADDING_GRAD_KERNEL(DeviceType::kCPU, double) REGISTER_SAME_PADDING_GRAD_KERNEL(DeviceType::kCPU, float) REGISTER_SAME_PADDING_GRAD_KERNEL(DeviceType::kCPU, int32_t) diff --git a/oneflow/user/kernels/scalar_add_kernel.cpp b/oneflow/user/kernels/scalar_add_kernel.cpp index d536844bbb4cd14a8e72966a1f350746a487838d..8a17a70af7a0620cbe90f4a8475e14a4b6fb33e7 100644 --- a/oneflow/user/kernels/scalar_add_kernel.cpp +++ b/oneflow/user/kernels/scalar_add_kernel.cpp @@ -61,10 +61,12 @@ REGISTER_KERNEL(CPU, int32_t) REGISTER_KERNEL(CPU, int64_t) REGISTER_KERNEL(CPU, float) REGISTER_KERNEL(CPU, double) +#ifdef WITH_CUDA REGISTER_KERNEL(GPU, int8_t) REGISTER_KERNEL(GPU, int32_t) REGISTER_KERNEL(GPU, int64_t) REGISTER_KERNEL(GPU, float) REGISTER_KERNEL(GPU, double) +#endif } // namespace oneflow diff --git a/oneflow/user/kernels/scalar_mul_kernel.cpp b/oneflow/user/kernels/scalar_mul_kernel.cpp index 8cd89678a76655942f716d70e9622181cb7a347d..f4bb2a7b2a578f8632f990bee339bfdf2e2ca93a 100644 --- a/oneflow/user/kernels/scalar_mul_kernel.cpp +++ b/oneflow/user/kernels/scalar_mul_kernel.cpp @@ -61,10 +61,12 @@ REGISTER_KERNEL(CPU, int32_t) REGISTER_KERNEL(CPU, int64_t) REGISTER_KERNEL(CPU, float) REGISTER_KERNEL(CPU, double) +#ifdef WITH_CUDA REGISTER_KERNEL(GPU, int32_t) REGISTER_KERNEL(GPU, int64_t) REGISTER_KERNEL(GPU, float) REGISTER_KERNEL(GPU, double) REGISTER_KERNEL(GPU, float16) +#endif } // namespace oneflow diff --git a/oneflow/user/kernels/sigmoid_kernel.cpp b/oneflow/user/kernels/sigmoid_kernel.cpp index fdfe25a89e612e65b56cc4738809f1cbf11afb7e..9f34f0070a26cf6bbf39c4116bbafa9f4169565f 100644 --- a/oneflow/user/kernels/sigmoid_kernel.cpp +++ b/oneflow/user/kernels/sigmoid_kernel.cpp @@ -49,9 +49,11 @@ class SigmoidKernel final : public user_op::OpKernel { REGISTER_SIGMOID_KERNEL(DeviceType::kCPU, float) REGISTER_SIGMOID_KERNEL(DeviceType::kCPU, double) +#ifdef WITH_CUDA REGISTER_SIGMOID_KERNEL(DeviceType::kGPU, float) REGISTER_SIGMOID_KERNEL(DeviceType::kGPU, double) REGISTER_SIGMOID_KERNEL(DeviceType::kGPU, float16) +#endif template<DeviceType device_type, typename T> class SigmoidGradKernel final : public user_op::OpKernel { @@ -84,9 +86,11 @@ class SigmoidGradKernel final : public user_op::OpKernel { REGISTER_SIGMOID_GRAD_KERNEL(DeviceType::kCPU, float) REGISTER_SIGMOID_GRAD_KERNEL(DeviceType::kCPU, double) +#ifdef WITH_CUDA REGISTER_SIGMOID_GRAD_KERNEL(DeviceType::kGPU, float) REGISTER_SIGMOID_GRAD_KERNEL(DeviceType::kGPU, double) REGISTER_SIGMOID_GRAD_KERNEL(DeviceType::kGPU, float16) +#endif } // namespace diff --git a/oneflow/user/kernels/softmax_kernel.cpp b/oneflow/user/kernels/softmax_kernel.cpp index 841a28754cfbe98a6432c19909f88515b2e56c00..9189e6c4d248466a221d54df77bda39ce637d78c 100644 --- a/oneflow/user/kernels/softmax_kernel.cpp +++ b/oneflow/user/kernels/softmax_kernel.cpp @@ -67,9 +67,11 @@ user_op::InferTmpSizeFn GenInferTmpSizeFn(const std::string& bn) { REGISTER_SOFTMAX_KERNEL(DeviceType::kCPU, float) REGISTER_SOFTMAX_KERNEL(DeviceType::kCPU, double) +#ifdef WITH_CUDA REGISTER_SOFTMAX_KERNEL(DeviceType::kGPU, float16) REGISTER_SOFTMAX_KERNEL(DeviceType::kGPU, float) REGISTER_SOFTMAX_KERNEL(DeviceType::kGPU, double) +#endif template<DeviceType device_type, typename T> class SoftmaxGradKernel final : public user_op::OpKernel { @@ -108,9 +110,11 @@ class SoftmaxGradKernel final : public user_op::OpKernel { REGISTER_SOFTMAX_GRAD_KERNEL(DeviceType::kCPU, float) REGISTER_SOFTMAX_GRAD_KERNEL(DeviceType::kCPU, double) +#ifdef WITH_CUDA REGISTER_SOFTMAX_GRAD_KERNEL(DeviceType::kGPU, float16) REGISTER_SOFTMAX_GRAD_KERNEL(DeviceType::kGPU, float) REGISTER_SOFTMAX_GRAD_KERNEL(DeviceType::kGPU, double) +#endif } // namespace diff --git a/oneflow/user/kernels/softmax_kernel_util.cpp b/oneflow/user/kernels/softmax_kernel_util.cpp index 6389217994dd77ef4bca8a12926deb6df62e92a0..4cd30884a87ffb8c2fc230640cae0bcc3472326f 100644 --- a/oneflow/user/kernels/softmax_kernel_util.cpp +++ b/oneflow/user/kernels/softmax_kernel_util.cpp @@ -66,9 +66,11 @@ void SoftmaxKernelUtil<device_type, T>::ComputeDiff(DeviceCtx* ctx, const int64_ #define INSTANTIATE_SOFTMAX_KERNEL_UTIL(device_type, data_type) \ template struct SoftmaxKernelUtil<device_type, data_type>; +#ifdef WITH_CUDA INSTANTIATE_SOFTMAX_KERNEL_UTIL(DeviceType::kGPU, float16) INSTANTIATE_SOFTMAX_KERNEL_UTIL(DeviceType::kGPU, float) INSTANTIATE_SOFTMAX_KERNEL_UTIL(DeviceType::kGPU, double) +#endif INSTANTIATE_SOFTMAX_KERNEL_UTIL(DeviceType::kCPU, float) INSTANTIATE_SOFTMAX_KERNEL_UTIL(DeviceType::kCPU, double) #undef INSTANTIATE_SOFTMAX_KERNEL_UTIL diff --git a/oneflow/user/kernels/sparse_cross_entropy_kernel.cpp b/oneflow/user/kernels/sparse_cross_entropy_kernel.cpp index 45611d96cf46895013917a396541cc3e536c8688..f70639080d66a53b85e75caa18be1ffee81968ca 100644 --- a/oneflow/user/kernels/sparse_cross_entropy_kernel.cpp +++ b/oneflow/user/kernels/sparse_cross_entropy_kernel.cpp @@ -84,17 +84,21 @@ class SparseCrossEntropyMsKernel final : public user_op::OpKernel { OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_SPARSE_CROSS_ENTROPY_KERNEL, (SparseCrossEntropyKernel), ("sparse_cross_entropy"), OF_PP_MAKE_TUPLE_SEQ(DeviceType::kCPU), FLOATING_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) +#ifdef WITH_CUDA OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_SPARSE_CROSS_ENTROPY_KERNEL, (SparseCrossEntropyKernel), ("sparse_cross_entropy"), OF_PP_MAKE_TUPLE_SEQ(DeviceType::kGPU), FLOATING_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) +#endif OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_SPARSE_CROSS_ENTROPY_KERNEL, (SparseCrossEntropyMsKernel), ("sparse_cross_entropy_ms"), OF_PP_MAKE_TUPLE_SEQ(DeviceType::kCPU), FLOATING_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) +#ifdef WITH_CUDA OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_SPARSE_CROSS_ENTROPY_KERNEL, (SparseCrossEntropyMsKernel), ("sparse_cross_entropy_ms"), OF_PP_MAKE_TUPLE_SEQ(DeviceType::kGPU), FLOATING_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) +#endif template<DeviceType device_type, typename T, typename K> class SparseCrossEntropyGradKernel final : public user_op::OpKernel { @@ -170,17 +174,22 @@ OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_SPARSE_CROSS_ENTROPY_GRAD_KERNEL, (SparseCrossEntropyGradKernel), ("sparse_cross_entropy_grad"), OF_PP_MAKE_TUPLE_SEQ(DeviceType::kCPU), FLOATING_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) +#ifdef WITH_CUDA OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_SPARSE_CROSS_ENTROPY_GRAD_KERNEL, (SparseCrossEntropyGradKernel), ("sparse_cross_entropy_grad"), OF_PP_MAKE_TUPLE_SEQ(DeviceType::kGPU), FLOATING_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) +#endif OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_SPARSE_CROSS_ENTROPY_GRAD_KERNEL, (SparseCrossEntropyMsGradKernel), ("sparse_cross_entropy_ms_grad"), OF_PP_MAKE_TUPLE_SEQ(DeviceType::kCPU), FLOATING_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) +#ifdef WITH_CUDA OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_SPARSE_CROSS_ENTROPY_GRAD_KERNEL, (SparseCrossEntropyMsGradKernel), ("sparse_cross_entropy_ms_grad"), OF_PP_MAKE_TUPLE_SEQ(DeviceType::kGPU), FLOATING_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) +#endif + } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/sparse_softmax_cross_entropy_kernel.cpp b/oneflow/user/kernels/sparse_softmax_cross_entropy_kernel.cpp index bd9cbae66199050aa8a8ed8b0d5f24c25a819a7d..b56543eb945c6badaafb1276dad8b489e43afdaa 100644 --- a/oneflow/user/kernels/sparse_softmax_cross_entropy_kernel.cpp +++ b/oneflow/user/kernels/sparse_softmax_cross_entropy_kernel.cpp @@ -80,21 +80,25 @@ OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_SPARSE_SOFTMAX_CROSS_ENTROPY_KERNEL, ("sparse_softmax_cross_entropy"), OF_PP_MAKE_TUPLE_SEQ(DeviceType::kCPU), FLOATING_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) +#ifdef WITH_CUDA OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_SPARSE_SOFTMAX_CROSS_ENTROPY_KERNEL, (SparseSoftmaxCrossEntropyKernel), ("sparse_softmax_cross_entropy"), OF_PP_MAKE_TUPLE_SEQ(DeviceType::kGPU), FLOATING_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) +#endif OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_SPARSE_SOFTMAX_CROSS_ENTROPY_KERNEL, (SparseSoftmaxCrossEntropyMsKernel), ("sparse_softmax_cross_entropy_ms"), OF_PP_MAKE_TUPLE_SEQ(DeviceType::kCPU), FLOATING_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) +#ifdef WITH_CUDA OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_SPARSE_SOFTMAX_CROSS_ENTROPY_KERNEL, (SparseSoftmaxCrossEntropyMsKernel), ("sparse_softmax_cross_entropy_ms"), OF_PP_MAKE_TUPLE_SEQ(DeviceType::kGPU), FLOATING_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) +#endif template<DeviceType device_type, typename T, typename K> class SparseSoftmaxCrossEntropyGradKernel final : public user_op::OpKernel { @@ -168,20 +172,24 @@ OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_SPARSE_SOFTMAX_CROSS_ENTROPY_GRAD_KERN ("sparse_softmax_cross_entropy_grad"), OF_PP_MAKE_TUPLE_SEQ(DeviceType::kCPU), FLOATING_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) +#ifdef WITH_CUDA OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_SPARSE_SOFTMAX_CROSS_ENTROPY_GRAD_KERNEL, (SparseSoftmaxCrossEntropyGradKernel), ("sparse_softmax_cross_entropy_grad"), OF_PP_MAKE_TUPLE_SEQ(DeviceType::kGPU), FLOATING_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) +#endif OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_SPARSE_SOFTMAX_CROSS_ENTROPY_GRAD_KERNEL, (SparseSoftmaxCrossEntropyMsGradKernel), ("sparse_softmax_cross_entropy_ms_grad"), OF_PP_MAKE_TUPLE_SEQ(DeviceType::kCPU), FLOATING_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) +#ifdef WITH_CUDA OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_SPARSE_SOFTMAX_CROSS_ENTROPY_GRAD_KERNEL, (SparseSoftmaxCrossEntropyMsGradKernel), ("sparse_softmax_cross_entropy_ms_grad"), OF_PP_MAKE_TUPLE_SEQ(DeviceType::kGPU), FLOATING_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) +#endif } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/split_like_kernel.cpp b/oneflow/user/kernels/split_like_kernel.cpp index e8bb9e73f349b81ee741b9c0686457c1b0053962..641370d864e27f54e60fa26f1a609a5b4f17f3cd 100644 --- a/oneflow/user/kernels/split_like_kernel.cpp +++ b/oneflow/user/kernels/split_like_kernel.cpp @@ -91,7 +91,9 @@ class SplitLikeKernel final : public user_op::OpKernel { REGISTER_SPLIT_LIKE_KERNEL(device, int64_t) REGISTER_SPLIT_LIKE_KERNEL_WITH_DEVICE(DeviceType::kCPU) +#ifdef WITH_CUDA REGISTER_SPLIT_LIKE_KERNEL_WITH_DEVICE(DeviceType::kGPU) REGISTER_SPLIT_LIKE_KERNEL(DeviceType::kGPU, float16) +#endif } // namespace oneflow diff --git a/oneflow/user/kernels/squeeze_kernel.cpp b/oneflow/user/kernels/squeeze_kernel.cpp index d81056725a3f3abd821956ee8c10509fe37e436d..9e4502e877bf289c3388580687db064bc7782501 100644 --- a/oneflow/user/kernels/squeeze_kernel.cpp +++ b/oneflow/user/kernels/squeeze_kernel.cpp @@ -29,6 +29,8 @@ namespace oneflow { }); REGISTER_SQUEEZE_KERNEL(kCPU) +#ifdef WITH_CUDA REGISTER_SQUEEZE_KERNEL(kGPU) +#endif } // namespace oneflow diff --git a/oneflow/user/kernels/tanh_kernel.cpp b/oneflow/user/kernels/tanh_kernel.cpp index 2b451f93584469375c0d95fafe80befb8903e7d4..bac1e5cfd43a642b5d971180401b7f7978964b74 100644 --- a/oneflow/user/kernels/tanh_kernel.cpp +++ b/oneflow/user/kernels/tanh_kernel.cpp @@ -49,9 +49,11 @@ class TanHKernel final : public user_op::OpKernel { REGISTER_TANH_KERNEL(DeviceType::kCPU, float) REGISTER_TANH_KERNEL(DeviceType::kCPU, double) +#ifdef WITH_CUDA REGISTER_TANH_KERNEL(DeviceType::kGPU, float) REGISTER_TANH_KERNEL(DeviceType::kGPU, double) REGISTER_TANH_KERNEL(DeviceType::kGPU, float16) +#endif template<DeviceType device_type, typename T> class TanHGradKernel final : public user_op::OpKernel { @@ -84,9 +86,11 @@ class TanHGradKernel final : public user_op::OpKernel { REGISTER_TANH_GRAD_KERNEL(DeviceType::kCPU, float) REGISTER_TANH_GRAD_KERNEL(DeviceType::kCPU, double) +#ifdef WITH_CUDA REGISTER_TANH_GRAD_KERNEL(DeviceType::kGPU, float) REGISTER_TANH_GRAD_KERNEL(DeviceType::kGPU, double) REGISTER_TANH_GRAD_KERNEL(DeviceType::kGPU, float16) +#endif } // namespace diff --git a/oneflow/user/kernels/test_kernels.cpp b/oneflow/user/kernels/test_kernels.cpp index 0ba24de1b32b8510b19c8e0841830ba33e83456e..0c3ce7ba31c6af19e6c30ade0d38af2230da056b 100644 --- a/oneflow/user/kernels/test_kernels.cpp +++ b/oneflow/user/kernels/test_kernels.cpp @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + #include "oneflow/core/framework/framework.h" #include "oneflow/core/kernel/new_kernel_util.h" #include "oneflow/core/kernel/random_generator.h" @@ -20,6 +21,8 @@ limitations under the License. namespace oneflow { +#ifdef WITH_CUDA + class ReluKernel final : public user_op::OpKernel { public: ReluKernel() = default; @@ -54,22 +57,6 @@ class ReluGradKernel final : public user_op::OpKernel { bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; -template<typename T> -class ReluCpuKernel final : public user_op::OpKernel { - public: - ReluCpuKernel() = default; - ~ReluCpuKernel() = default; - - private: - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - NewKernelUtil<DeviceType::kCPU>::Relu(ctx->device_ctx(), in->shape().elem_cnt(), in->dptr<T>(), - out->mut_dptr<T>()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - REGISTER_USER_KERNEL("ccrelu") .SetCreateFn<ReluKernel>() .SetIsMatchedHob(user_op::HobTrue()) @@ -80,11 +67,6 @@ REGISTER_USER_KERNEL("ccrelu") return Maybe<void>::Ok(); }); -REGISTER_USER_KERNEL("cpu_only_relu_test") - .SetCreateFn<ReluCpuKernel<float>>() - .SetIsMatchedHob((user_op::HobDataType("in", 0) == DataType::kFloat) - & (user_op::HobDataType("out", 0) == DataType::kFloat)); - REGISTER_USER_KERNEL("ccrelu_grad") .SetCreateFn<ReluGradKernel>() .SetIsMatchedHob(user_op::HobTrue()) @@ -132,25 +114,6 @@ REGISTER_USER_KERNEL("TestReshapeLike4KeepHeaderOnly") .SetCreateFn<CopyIn2OutKernel>() .SetIsMatchedHob(user_op::HobTrue()); -class TestSourceKernel final : public user_op::OpKernel { - public: - TestSourceKernel() = default; - ~TestSourceKernel() = default; - - private: - void Compute(user_op::KernelComputeContext* ctx) const override { - user_op::Tensor* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0); - for (int i = 0; i < 5; ++i) { *(out_blob->mut_dptr<float>() + i) = static_cast<float>(i); } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -REGISTER_USER_KERNEL("TestSource") - .SetCreateFn<TestSourceKernel>() - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU) - & (user_op::HobDataType("out", 0) == DataType::kFloat)) - .SetInferTmpSizeFn([](user_op::InferContext*) { return 0; }); - class TestSourceGpuKernel final : public user_op::OpKernel { public: TestSourceGpuKernel() = default; @@ -190,26 +153,6 @@ REGISTER_USER_KERNEL("TestMultiOutputOrder") .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU) & (user_op::HobDataType("in", 0) == DataType::kFloat)); -class TestSourceMultiGpuFixedOutNumKernel final : public user_op::OpKernel { - public: - TestSourceMultiGpuFixedOutNumKernel() = default; - ~TestSourceMultiGpuFixedOutNumKernel() = default; - - private: - void Compute(user_op::KernelComputeContext* ctx) const override { - user_op::Tensor* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0); - for (int i = 0; i < out_blob->shape().elem_cnt(); ++i) { - *(out_blob->mut_dptr<float>() + i) = static_cast<float>(i); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -REGISTER_USER_KERNEL("TestSourceMultiGpuFixedOutNum") - .SetCreateFn<TestSourceMultiGpuFixedOutNumKernel>() - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU) - & (user_op::HobDataType("out", 0) == DataType::kFloat)); - class TestMultiInputFwKernel final : public user_op::OpKernel { public: TestMultiInputFwKernel() = default; @@ -252,6 +195,68 @@ REGISTER_USER_KERNEL("TestMultiInputGrad") .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kGPU) & (user_op::HobDataType("x1", 0) == DataType::kFloat)); +#endif + +template<typename T> +class ReluCpuKernel final : public user_op::OpKernel { + public: + ReluCpuKernel() = default; + ~ReluCpuKernel() = default; + + private: + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + NewKernelUtil<DeviceType::kCPU>::Relu(ctx->device_ctx(), in->shape().elem_cnt(), in->dptr<T>(), + out->mut_dptr<T>()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +REGISTER_USER_KERNEL("cpu_only_relu_test") + .SetCreateFn<ReluCpuKernel<float>>() + .SetIsMatchedHob((user_op::HobDataType("in", 0) == DataType::kFloat) + & (user_op::HobDataType("out", 0) == DataType::kFloat)); + +class TestSourceKernel final : public user_op::OpKernel { + public: + TestSourceKernel() = default; + ~TestSourceKernel() = default; + + private: + void Compute(user_op::KernelComputeContext* ctx) const override { + user_op::Tensor* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0); + for (int i = 0; i < 5; ++i) { *(out_blob->mut_dptr<float>() + i) = static_cast<float>(i); } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +REGISTER_USER_KERNEL("TestSource") + .SetCreateFn<TestSourceKernel>() + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU) + & (user_op::HobDataType("out", 0) == DataType::kFloat)) + .SetInferTmpSizeFn([](user_op::InferContext*) { return 0; }); + +class TestSourceMultiGpuFixedOutNumKernel final : public user_op::OpKernel { + public: + TestSourceMultiGpuFixedOutNumKernel() = default; + ~TestSourceMultiGpuFixedOutNumKernel() = default; + + private: + void Compute(user_op::KernelComputeContext* ctx) const override { + user_op::Tensor* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0); + for (int i = 0; i < out_blob->shape().elem_cnt(); ++i) { + *(out_blob->mut_dptr<float>() + i) = static_cast<float>(i); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +REGISTER_USER_KERNEL("TestSourceMultiGpuFixedOutNum") + .SetCreateFn<TestSourceMultiGpuFixedOutNumKernel>() + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU) + & (user_op::HobDataType("out", 0) == DataType::kFloat)); + class TestDynamicSourceKernel final : public user_op::OpKernel { public: TestDynamicSourceKernel() = default; diff --git a/oneflow/user/kernels/transpose_kernel.cpp b/oneflow/user/kernels/transpose_kernel.cpp index 1e479962df358fe0e1ba63579dcdc2c97cdaf76b..b433f3a0050f7392c164929096794c3fd4a4455e 100644 --- a/oneflow/user/kernels/transpose_kernel.cpp +++ b/oneflow/user/kernels/transpose_kernel.cpp @@ -53,11 +53,14 @@ REGISTER_TRANSPOSE_KERNEL(DeviceType::kCPU, int64_t) REGISTER_TRANSPOSE_KERNEL(DeviceType::kCPU, float) REGISTER_TRANSPOSE_KERNEL(DeviceType::kCPU, double) +#ifdef WITH_CUDA REGISTER_TRANSPOSE_KERNEL(DeviceType::kGPU, int8_t) REGISTER_TRANSPOSE_KERNEL(DeviceType::kGPU, int32_t) REGISTER_TRANSPOSE_KERNEL(DeviceType::kGPU, int64_t) REGISTER_TRANSPOSE_KERNEL(DeviceType::kGPU, float) REGISTER_TRANSPOSE_KERNEL(DeviceType::kGPU, double) REGISTER_TRANSPOSE_KERNEL(DeviceType::kGPU, float16) +#endif + } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/two_stage_reduce_kernel.cpp b/oneflow/user/kernels/two_stage_reduce_kernel.cpp index f114760406f28e5d69991a6761825bd92178345e..d9cecaf63989d956300511271c16f0dd2de0c2d9 100644 --- a/oneflow/user/kernels/two_stage_reduce_kernel.cpp +++ b/oneflow/user/kernels/two_stage_reduce_kernel.cpp @@ -32,7 +32,11 @@ struct CopyTensor<DeviceType::kCPU, T, U> { template<typename T, typename U> struct CopyTensor<DeviceType::kGPU, T, U> { static void Call(DeviceCtx* ctx, const int64_t n, const T* src, U* dst) { +#ifdef WITH_CUDA CopyElemOnGpu(ctx, src, dst, n); +#else + UNIMPLEMENTED(); +#endif } }; diff --git a/oneflow/user/kernels/zero_like_kernel.cpp b/oneflow/user/kernels/zero_like_kernel.cpp index 7a9b8d9dd62c77947563f52a079e3b7276aa462f..7895296d66dba8e0bcc000403774987872571a38 100644 --- a/oneflow/user/kernels/zero_like_kernel.cpp +++ b/oneflow/user/kernels/zero_like_kernel.cpp @@ -39,6 +39,8 @@ class ZeroLikeKernel final : public user_op::OpKernel { .SetIsMatchedHob(user_op::HobDeviceType() == device_type_v); REGISTER_ZERO_LIKE_KERNEL(DeviceType::kCPU) +#ifdef WITH_CUDA REGISTER_ZERO_LIKE_KERNEL(DeviceType::kGPU) +#endif } // namespace oneflow