Unverified commit 77bcbedd authored by Juncheng, committed by GitHub

Dev cuda 9 arch 70 (#2318)

* kCudaAlignSize = 256

* always compute_70

* __CUDA_API_VERSION >= 10000

* __CUDA_API_VERSION >= 10000

* disable_all_reduce_sequence
parent f4887a09
......@@ -48,9 +48,7 @@ else()
# list(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_52,code=\"sm_52,compute_52\")
list(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_60,code=\"sm_60,compute_60\")
list(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_61,code=\"sm_61,compute_61\")
-if(NOT CUDA_VERSION VERSION_LESS "10.0")
-  list(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_70,code=\"sm_70,compute_70\")
-endif()
+list(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_70,code=\"sm_70,compute_70\")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall -Wno-sign-compare -Wno-unused-function -fPIC")
if (RELEASE_VERSION)
list(APPEND CUDA_NVCC_FLAGS -O3)
......
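For context on the "always compute_70" change above: the CUDA 10 version guard is dropped, so Volta (sm_70) device code is always generated. Below is a minimal runtime check, not part of this commit, that reports whether the visible GPU actually satisfies the compute capability 7.0 those binaries target:

// Hedged sketch: query the device's compute capability with the standard CUDA runtime API.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  cudaDeviceProp prop{};
  if (cudaGetDeviceProperties(&prop, /*device=*/0) != cudaSuccess) {
    std::printf("no CUDA device visible\n");
    return 1;
  }
  // Binaries built with arch=compute_70,code=sm_70 need major.minor >= 7.0 here.
  std::printf("device 0: %s, compute capability %d.%d\n", prop.name, prop.major, prop.minor);
  return 0;
}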
......@@ -180,7 +180,7 @@ inline double GetCurTime() {
return std::chrono::high_resolution_clock::now().time_since_epoch().count();
}
-const size_t kCudaAlignSize = 8;
+const size_t kCudaAlignSize = 256;
const size_t kCudaMemAllocAlignSize = 256;
inline size_t RoundUp(size_t n, size_t val) { return (n + val - 1) / val * val; }
......
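The alignment constant jumps from 8 to 256 bytes, so sizes padded through RoundUp against it now land on 256-byte boundaries. A self-contained sketch using only the two definitions visible in this hunk:

// Sketch: RoundUp pads a byte count to the next multiple of kCudaAlignSize.
#include <cstddef>
#include <cstdio>

const size_t kCudaAlignSize = 256;
inline size_t RoundUp(size_t n, size_t val) { return (n + val - 1) / val * val; }

int main() {
  std::printf("%zu\n", RoundUp(1000, kCudaAlignSize));  // 1024 (4 * 256)
  std::printf("%zu\n", RoundUp(256, kCudaAlignSize));   // 256, already aligned
  std::printf("%zu\n", RoundUp(1, kCudaAlignSize));     // 256, the minimum padded size
  return 0;
}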
......@@ -88,6 +88,7 @@ message JobConfigProto {
optional bool all_reduce_fp16 = 505 [default = true];
optional bool enable_non_distributed_optimizer = 506 [default = false];
optional int64 non_distributed_optimizer_group_size_mbyte = 507 [default = 100];
+optional bool disable_all_reduce_sequence = 508 [default = false];
optional bool enable_true_half_config_when_conv = 600 [default = false];
optional bool enable_float_compute_for_half_gemm = 601 [default = true];
......
......@@ -53,6 +53,7 @@ class JobDesc final {
int64_t non_distributed_optimizer_group_size_mbyte() const {
return job_conf_.non_distributed_optimizer_group_size_mbyte();
}
+bool disable_all_reduce_sequence() const { return job_conf_.disable_all_reduce_sequence(); }
int64_t all_reduce_group_num() const;
int64_t all_reduce_group_min_byte() const;
float all_reduce_group_size_warmup() const;
......
......@@ -321,6 +321,7 @@ void DumpLogicalBlobDescAndSbpSignature(const OpGraph& op_graph, JobBuilder* job
}
void MakeAllReduceSequence(const OpGraph& op_graph, JobBuilder* job_builder) {
+if (GlobalJobDesc().disable_all_reduce_sequence()) { return; }
AllReduceSequencePass().Apply(op_graph, job_builder);
}
......
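Taken together, the three hunks above thread a single switch from the job config into the graph pass: the proto field (508) carries the value, JobDesc exposes it, and MakeAllReduceSequence returns early when it is set. A simplified standalone sketch of that guard, with placeholder types rather than OneFlow's real ones:

// Illustrative sketch only; JobConf here is a stand-in, not OneFlow's generated proto class.
#include <cstdio>

struct JobConf { bool disable_all_reduce_sequence = false; };  // mirrors proto field 508

void MakeAllReduceSequence(const JobConf& conf) {
  if (conf.disable_all_reduce_sequence) { return; }  // same early-return guard as the hunk
  std::printf("applying AllReduceSequencePass\n");   // stands in for AllReduceSequencePass().Apply(...)
}

int main() {
  MakeAllReduceSequence(JobConf{});      // default false: the pass runs
  MakeAllReduceSequence(JobConf{true});  // disabled: the pass is skipped
  return 0;
}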
......@@ -136,7 +136,8 @@ OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INITIATE_GATHER_KERNEL_UTIL_CPU_IMPL, FLOATING_
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INITIATE_GATHER_KERNEL_UTIL, DEVICE_TYPE_SEQ,
FLOATING_DATA_TYPE_SEQ);
#undef INITIATE_GATHER_KERNEL_UTIL
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 && defined(__CUDA_API_VERSION) \
+    && __CUDA_API_VERSION >= 10000
template struct GatherKernelUtil<DeviceType::kGPU, float16>;
#endif
......
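This hunk and the kernel hunks that follow all apply the same change: the float16 paths are compiled only when the device pass targets sm_70 or newer and the toolkit is at least CUDA 10, checked through __CUDA_API_VERSION. A hedged, self-contained sketch of the guard pattern (simplified names, not OneFlow's GatherKernelUtil; __CUDA_API_VERSION is assumed to come from <cuda.h>, as it does in CUDA 9/10 era toolkits):

// fill_util_sketch.cu: a half specialization guarded the same way as in this commit.
#include <cuda.h>
#include <cuda_fp16.h>

template<typename T>
__global__ void FillKernel(T* dst, int n, T value) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) { dst[i] = value; }
}

template<typename T>
struct FillUtil {
  static void Fill(T* dst, int n, T value) {
    FillKernel<T><<<(n + 255) / 256, 256>>>(dst, n, value);
  }
};

template struct FillUtil<float>;  // always instantiated

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 && defined(__CUDA_API_VERSION) \
    && __CUDA_API_VERSION >= 10000
// The half variant exists only in sm_70+ device passes built with a CUDA 10+ toolkit.
template struct FillUtil<half>;
#endif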
......@@ -71,7 +71,8 @@ struct GatherKernelUtilImpl<DeviceType::kGPU, T, K> final {
}
};
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 && defined(__CUDA_API_VERSION) \
+    && __CUDA_API_VERSION >= 10000
template<typename K>
struct GatherKernelUtilImpl<DeviceType::kGPU, float16, K> final {
static void Forward(DeviceCtx* ctx, const K* indices, int64_t num_indices, const float16* in,
......@@ -99,7 +100,8 @@ struct GatherKernelUtilImpl<DeviceType::kGPU, float16, K> final {
OF_PP_PAIR_FIRST(index_type_pair)>;
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INITIATE_GATHER_KERNEL_UTIL_GPU_IMPL,
FLOATING_DATA_TYPE_SEQ
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 && defined(__CUDA_API_VERSION) \
+    && __CUDA_API_VERSION >= 10000
FLOAT16_DATA_TYPE_SEQ
#endif
,
......
......@@ -26,7 +26,8 @@ Kernel* CreateGatherGradKernel(const KernelConf& kernel_conf) {
static const HashMap<std::string, std::function<Kernel*()>> creators = {
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_KERNEL_CREATOR_ENTRY, (GatherMs0GradKernel),
DEVICE_TYPE_SEQ, FLOATING_DATA_TYPE_SEQ)
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 && defined(__CUDA_API_VERSION) \
+    && __CUDA_API_VERSION >= 10000
MAKE_KERNEL_CREATOR_ENTRY(GatherMs0GradKernel, DeviceType::kGPU,
(float16, DataType::kFloat16))
#endif
......
......@@ -633,7 +633,8 @@ __device__ float gpu_atomic_add(float* address, float val) {
return atomicAdd(address, val);
}
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 && defined(__CUDA_API_VERSION) \
+    && __CUDA_API_VERSION >= 10000
template<>
__device__ half gpu_atomic_add(half* address, half val) {
return atomicAdd(address, val);
......
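The same guard wraps the half overload of gpu_atomic_add, because atomicAdd(half*, half) only exists for compute capability 7.0 and above and, as the commit message suggests, is only declared by CUDA 10 and newer headers. A hedged standalone sketch (the names are placeholders; __CUDA_API_VERSION is again assumed to come from <cuda.h>):

// atomic_add_sketch.cu: float always has a native atomic add; half only on Volta+ with CUDA 10+.
#include <cuda.h>
#include <cuda_fp16.h>

__device__ float GpuAtomicAddSketch(float* address, float val) {
  return atomicAdd(address, val);  // available on every architecture listed in the CMake hunk
}

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 && defined(__CUDA_API_VERSION) \
    && __CUDA_API_VERSION >= 10000
__device__ half GpuAtomicAddSketch(half* address, half val) {
  return atomicAdd(address, val);  // native half atomic add, sm_70 and newer only
}
#endif

// Example use: every thread accumulates one element into a single cell.
__global__ void AccumulateFloat(float* sum, const float* in, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) { GpuAtomicAddSketch(sum, in[i]); }
}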
......@@ -25,7 +25,8 @@ Kernel* CreateUnsortedSegmentSumKernel(const KernelConf& kernel_conf) {
static const HashMap<std::string, std::function<Kernel*()>> creators = {
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_KERNEL_CREATOR_ENTRY, (UnsortedSegmentSumKernel),
DEVICE_TYPE_SEQ, FLOATING_DATA_TYPE_SEQ)
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 && defined(__CUDA_API_VERSION) \
+    && __CUDA_API_VERSION >= 10000
MAKE_KERNEL_CREATOR_ENTRY(UnsortedSegmentSumKernel, DeviceType::kGPU,
(float16, DataType::kFloat16))
#endif
......
......@@ -319,6 +319,11 @@ def set_enable_non_distributed_optimizer(value = True):
    _SetJobConfAttr(lambda x:x, 'enable_non_distributed_optimizer', value)
    return oneflow.config

+@oneflow_export('config.disable_all_reduce_sequence')
+def disable_all_reduce_sequence(value=True):
+    _SetJobConfAttr(lambda x: x, 'disable_all_reduce_sequence', value)
+    return oneflow.config

@oneflow_export('config.non_distributed_optimizer_group_size_mbyte')
def set_non_distributed_optimizer_group_size_mbyte(value):
    _SetJobConfAttr(lambda x:x, 'non_distributed_optimizer_group_size_mbyte', value)
......