Unverified commit 77bcbedd authored by Juncheng, committed by GitHub

Dev cuda 9 arch 70 (#2318)

* kCudaAlignSize = 256

* always compute_70

* __CUDA_API_VERSION >= 10000

* __CUDA_API_VERSION >= 10000

* disable_all_reduce_sequence
parent f4887a09
......@@ -48,9 +48,7 @@ else()
# list(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_52,code=\"sm_52,compute_52\")
list(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_60,code=\"sm_60,compute_60\")
list(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_61,code=\"sm_61,compute_61\")
-if(NOT CUDA_VERSION VERSION_LESS "10.0")
-  list(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_70,code=\"sm_70,compute_70\")
-endif()
+list(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_70,code=\"sm_70,compute_70\")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall -Wno-sign-compare -Wno-unused-function -fPIC")
if (RELEASE_VERSION)
list(APPEND CUDA_NVCC_FLAGS -O3)
......
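For context on the "always compute_70" change above: the CUDA 10 version guard is dropped, so Volta (sm_70) device code is always generated. Below is a minimal runtime check, not part of this commit, that reports whether the visible GPU actually satisfies the compute capability 7.0 those binaries target:

// Hedged sketch: query the device's compute capability with the standard CUDA runtime API.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  cudaDeviceProp prop{};
  if (cudaGetDeviceProperties(&prop, /*device=*/0) != cudaSuccess) {
    std::printf("no CUDA device visible\n");
    return 1;
  }
  // Binaries built with arch=compute_70,code=sm_70 need major.minor >= 7.0 here.
  std::printf("device 0: %s, compute capability %d.%d\n", prop.name, prop.major, prop.minor);
  return 0;
}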
......@@ -180,7 +180,7 @@ inline double GetCurTime() {
return std::chrono::high_resolution_clock::now().time_since_epoch().count();
}
-const size_t kCudaAlignSize = 8;
+const size_t kCudaAlignSize = 256;
const size_t kCudaMemAllocAlignSize = 256;
inline size_t RoundUp(size_t n, size_t val) { return (n + val - 1) / val * val; }
......
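The alignment constant jumps from 8 to 256 bytes, so sizes padded through RoundUp against it now land on 256-byte boundaries. A self-contained sketch using only the two definitions visible in this hunk:

// Sketch: RoundUp pads a byte count to the next multiple of kCudaAlignSize.
#include <cstddef>
#include <cstdio>

const size_t kCudaAlignSize = 256;
inline size_t RoundUp(size_t n, size_t val) { return (n + val - 1) / val * val; }

int main() {
  std::printf("%zu\n", RoundUp(1000, kCudaAlignSize));  // 1024 (4 * 256)
  std::printf("%zu\n", RoundUp(256, kCudaAlignSize));   // 256, already aligned
  std::printf("%zu\n", RoundUp(1, kCudaAlignSize));     // 256, the minimum padded size
  return 0;
}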
......@@ -88,6 +88,7 @@ message JobConfigProto {
optional bool all_reduce_fp16 = 505 [default = true];
optional bool enable_non_distributed_optimizer = 506 [default = false];
optional int64 non_distributed_optimizer_group_size_mbyte = 507 [default = 100];
+optional bool disable_all_reduce_sequence = 508 [default = false];
optional bool enable_true_half_config_when_conv = 600 [default = false];
optional bool enable_float_compute_for_half_gemm = 601 [default = true];
......
......@@ -53,6 +53,7 @@ class JobDesc final {
int64_t non_distributed_optimizer_group_size_mbyte() const {
return job_conf_.non_distributed_optimizer_group_size_mbyte();
}
+bool disable_all_reduce_sequence() const { return job_conf_.disable_all_reduce_sequence(); }
int64_t all_reduce_group_num() const;
int64_t all_reduce_group_min_byte() const;
float all_reduce_group_size_warmup() const;
......
......@@ -321,6 +321,7 @@ void DumpLogicalBlobDescAndSbpSignature(const OpGraph& op_graph, JobBuilder* job
}
void MakeAllReduceSequence(const OpGraph& op_graph, JobBuilder* job_builder) {
+if (GlobalJobDesc().disable_all_reduce_sequence()) { return; }
AllReduceSequencePass().Apply(op_graph, job_builder);
}
......
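Taken together, the three hunks above thread a single switch from the job config into the graph pass: the proto field (508) carries the value, JobDesc exposes it, and MakeAllReduceSequence returns early when it is set. A simplified standalone sketch of that guard, with placeholder types rather than OneFlow's real ones:

// Illustrative sketch only; JobConf here is a stand-in, not OneFlow's generated proto class.
#include <cstdio>

struct JobConf { bool disable_all_reduce_sequence = false; };  // mirrors proto field 508

void MakeAllReduceSequence(const JobConf& conf) {
  if (conf.disable_all_reduce_sequence) { return; }  // same early-return guard as the hunk
  std::printf("applying AllReduceSequencePass\n");   // stands in for AllReduceSequencePass().Apply(...)
}

int main() {
  MakeAllReduceSequence(JobConf{});      // default false: the pass runs
  MakeAllReduceSequence(JobConf{true});  // disabled: the pass is skipped
  return 0;
}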
......@@ -136,7 +136,8 @@ OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INITIATE_GATHER_KERNEL_UTIL_CPU_IMPL, FLOATING_
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INITIATE_GATHER_KERNEL_UTIL, DEVICE_TYPE_SEQ,
FLOATING_DATA_TYPE_SEQ);
#undef INITIATE_GATHER_KERNEL_UTIL
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 && defined(__CUDA_API_VERSION) \
+    && __CUDA_API_VERSION >= 10000
template struct GatherKernelUtil<DeviceType::kGPU, float16>;
#endif
......
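This hunk and the kernel hunks that follow all apply the same change: the float16 paths are compiled only when the device pass targets sm_70 or newer and the toolkit is at least CUDA 10, checked through __CUDA_API_VERSION. A hedged, self-contained sketch of the guard pattern (simplified names, not OneFlow's GatherKernelUtil; __CUDA_API_VERSION is assumed to come from <cuda.h>, as it does in CUDA 9/10 era toolkits):

// fill_util_sketch.cu: a half specialization guarded the same way as in this commit.
#include <cuda.h>
#include <cuda_fp16.h>

template<typename T>
__global__ void FillKernel(T* dst, int n, T value) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) { dst[i] = value; }
}

template<typename T>
struct FillUtil {
  static void Fill(T* dst, int n, T value) {
    FillKernel<T><<<(n + 255) / 256, 256>>>(dst, n, value);
  }
};

template struct FillUtil<float>;  // always instantiated

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 && defined(__CUDA_API_VERSION) \
    && __CUDA_API_VERSION >= 10000
// The half variant exists only in sm_70+ device passes built with a CUDA 10+ toolkit.
template struct FillUtil<half>;
#endif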
......@@ -71,7 +71,8 @@ struct GatherKernelUtilImpl<DeviceType::kGPU, T, K> final {
}
};
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 && defined(__CUDA_API_VERSION) \
+    && __CUDA_API_VERSION >= 10000
template<typename K>
struct GatherKernelUtilImpl<DeviceType::kGPU, float16, K> final {
static void Forward(DeviceCtx* ctx, const K* indices, int64_t num_indices, const float16* in,
......@@ -99,7 +100,8 @@ struct GatherKernelUtilImpl<DeviceType::kGPU, float16, K> final {
OF_PP_PAIR_FIRST(index_type_pair)>;
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INITIATE_GATHER_KERNEL_UTIL_GPU_IMPL,
FLOATING_DATA_TYPE_SEQ
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 && defined(__CUDA_API_VERSION) \
+    && __CUDA_API_VERSION >= 10000
FLOAT16_DATA_TYPE_SEQ
#endif
,
......
......@@ -26,7 +26,8 @@ Kernel* CreateGatherGradKernel(const KernelConf& kernel_conf) {
static const HashMap<std::string, std::function<Kernel*()>> creators = {
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_KERNEL_CREATOR_ENTRY, (GatherMs0GradKernel),
DEVICE_TYPE_SEQ, FLOATING_DATA_TYPE_SEQ)
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 && defined(__CUDA_API_VERSION) \
+    && __CUDA_API_VERSION >= 10000
MAKE_KERNEL_CREATOR_ENTRY(GatherMs0GradKernel, DeviceType::kGPU,
(float16, DataType::kFloat16))
#endif
......
......@@ -633,7 +633,8 @@ __device__ float gpu_atomic_add(float* address, float val) {
return atomicAdd(address, val);
}
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 && defined(__CUDA_API_VERSION) \
+    && __CUDA_API_VERSION >= 10000
template<>
__device__ half gpu_atomic_add(half* address, half val) {
return atomicAdd(address, val);
......
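The same guard wraps the half overload of gpu_atomic_add, because atomicAdd(half*, half) only exists for compute capability 7.0 and above and, as the commit message suggests, is only declared by CUDA 10 and newer headers. A hedged standalone sketch (the names are placeholders; __CUDA_API_VERSION is again assumed to come from <cuda.h>):

// atomic_add_sketch.cu: float always has a native atomic add; half only on Volta+ with CUDA 10+.
#include <cuda.h>
#include <cuda_fp16.h>

__device__ float GpuAtomicAddSketch(float* address, float val) {
  return atomicAdd(address, val);  // available on every architecture listed in the CMake hunk
}

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 && defined(__CUDA_API_VERSION) \
    && __CUDA_API_VERSION >= 10000
__device__ half GpuAtomicAddSketch(half* address, half val) {
  return atomicAdd(address, val);  // native half atomic add, sm_70 and newer only
}
#endif

// Example use: every thread accumulates one element into a single cell.
__global__ void AccumulateFloat(float* sum, const float* in, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) { GpuAtomicAddSketch(sum, in[i]); }
}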
......@@ -25,7 +25,8 @@ Kernel* CreateUnsortedSegmentSumKernel(const KernelConf& kernel_conf) {
static const HashMap<std::string, std::function<Kernel*()>> creators = {
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_KERNEL_CREATOR_ENTRY, (UnsortedSegmentSumKernel),
DEVICE_TYPE_SEQ, FLOATING_DATA_TYPE_SEQ)
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 && defined(__CUDA_API_VERSION) \
+    && __CUDA_API_VERSION >= 10000
MAKE_KERNEL_CREATOR_ENTRY(UnsortedSegmentSumKernel, DeviceType::kGPU,
(float16, DataType::kFloat16))
#endif
......
......@@ -319,6 +319,11 @@ def set_enable_non_distributed_optimizer(value = True):
    _SetJobConfAttr(lambda x:x, 'enable_non_distributed_optimizer', value)
    return oneflow.config

+@oneflow_export('config.disable_all_reduce_sequence')
+def disable_all_reduce_sequence(value=True):
+    _SetJobConfAttr(lambda x: x, 'disable_all_reduce_sequence', value)
+    return oneflow.config

@oneflow_export('config.non_distributed_optimizer_group_size_mbyte')
def set_non_distributed_optimizer_group_size_mbyte(value):
    _SetJobConfAttr(lambda x:x, 'non_distributed_optimizer_group_size_mbyte', value)
......