diff --git a/oneflow/core/autograd/autograd_engine.cpp b/oneflow/core/autograd/autograd_engine.cpp
index bda275dc6724a065d41910071cad197dbced4240..ff01fe8737510399ebab89a932a8540875d7d028 100644
--- a/oneflow/core/autograd/autograd_engine.cpp
+++ b/oneflow/core/autograd/autograd_engine.cpp
@@ -57,12 +57,15 @@ Maybe<void> CopyOrAccGrad(AutogradMeta* autograd_meta, bool autograd_mode) {
 StackFunctionNode::StackFunctionNode(
     const std::shared_ptr<const std::function<Maybe<void>(const TensorTuple&, TensorTuple*, bool)>>&
         backward_fn,
-    const TensorTuple& inputs, const TensorTuple& outputs) {
+    const TensorTuple& inputs, const TensorTuple& outputs)
+    : FunctionNode() {
   input_meta_datas_.resize(inputs.size());
-  input_tensors_.resize(inputs.size());
+  next_functions_->reserve(inputs.size());
   for (int i = 0; i < inputs.size(); ++i) {
     input_meta_datas_.at(i) = inputs.at(i)->mut_autograd_meta();
-    if (input_meta_datas_.at(i)->requires_grad()) { input_tensors_.at(i) = inputs.at(i); }
+    if (input_meta_datas_.at(i)->requires_grad()) {
+      next_functions_->emplace_back(inputs.at(i)->grad_fn_node());
+    }
   }

   output_meta_datas_.resize(outputs.size());
@@ -101,7 +104,7 @@ void StackFunctionNode::ReleaseOutTensorArgs() {
 void StackFunctionNode::ReleaseData() {
   // Releases backward function and makes useless tensors release as early as possible
   if (!input_meta_datas_.empty()) { backward_fn_.reset(); }
-  input_tensors_.clear();
+  next_functions_->clear();
   is_in_stack_ = false;
 }

diff --git a/oneflow/core/autograd/autograd_engine.h b/oneflow/core/autograd/autograd_engine.h
index 53f9c3e1361db492b14b0a57dad5636178e913a1..b6df4c57fd2a802e8c957b8c055169595540ffc2 100644
--- a/oneflow/core/autograd/autograd_engine.h
+++ b/oneflow/core/autograd/autograd_engine.h
@@ -53,7 +53,7 @@ class FunctionNode {
   const std::string& GetOpName() const { return op_name_; }

  protected:
-  FunctionNode() = default;
+  FunctionNode() : next_functions_(new std::vector<std::shared_ptr<const FunctionNode>>{}) {}

   const std::string op_name_;
   std::shared_ptr<std::vector<std::shared_ptr<const FunctionNode>>> next_functions_;
@@ -102,7 +102,6 @@ class StackFunctionNode final : public FunctionNode {
   void set_is_in_stack(bool in_stack) { is_in_stack_ = in_stack; }

  private:
-  std::vector<std::shared_ptr<Tensor>> input_tensors_;
   std::vector<std::shared_ptr<AutogradMeta>> input_meta_datas_;
   std::vector<std::shared_ptr<AutogradMeta>> output_meta_datas_;
   std::vector<TensorInfo> output_tensor_infos_;
diff --git a/oneflow/core/vm/cuda_allocator.cpp b/oneflow/core/vm/cuda_allocator.cpp
index 95c60a57982f346c8be1a153649faf114dfafcfd..128798f24c91a42731d4344d4ab7f110f73d72f4 100644
--- a/oneflow/core/vm/cuda_allocator.cpp
+++ b/oneflow/core/vm/cuda_allocator.cpp
@@ -31,6 +31,10 @@ inline bool IsAlignedSize(size_t size) { return size % kCudaMemAllocAlignSize ==

 static const size_t kPieceSplitThreshold = 128 << 20;  // 128MiB

+constexpr size_t kMinBlockSize = 20 << 20;  // 20MiB
+constexpr size_t kMinAlloc =
+    10 << 20;  // allocations less than 10MiB should be packed in kMinBlockSize bytes.
+
 }  // namespace

 CudaAllocator::CudaAllocator(int64_t device_id)
@@ -169,9 +173,10 @@ bool CudaAllocator::AllocateBlockToExtendTotalMem(size_t aligned_size) {
   const size_t available_bytes = free_bytes - remain_bytes;  // remain at least 50MiB memory
   // growth double total memory bytes if could
-  if (total_memory_bytes_ > 0) {
-    allocate_bytes = std::max(allocate_bytes, std::min(total_memory_bytes_, available_bytes));
-  }
+  // if (total_memory_bytes_ > 0) {
+  //   allocate_bytes = std::max(allocate_bytes, std::min(total_memory_bytes_, available_bytes));
+  // }
+  if (allocate_bytes < kMinAlloc) { allocate_bytes = kMinBlockSize; }

   const size_t final_allocate_bytes = CudaMemAlignedBytes(allocate_bytes);

   if (final_allocate_bytes > available_bytes) { return false; }
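Below is a minimal standalone sketch (not part of the patch) of the block-sizing rule this diff adds to `CudaAllocator::AllocateBlockToExtendTotalMem`: instead of the old policy of doubling total device memory on growth (now commented out), requests smaller than `kMinAlloc` (10MiB) are served from a `kMinBlockSize` (20MiB) block, while larger requests keep their own aligned size. `ChooseAllocateBytes` is a hypothetical helper used only for illustration.

```cpp
#include <cstddef>
#include <iostream>

// Constants mirroring the values added in cuda_allocator.cpp.
constexpr size_t kMinBlockSize = 20 << 20;  // 20MiB
constexpr size_t kMinAlloc = 10 << 20;      // 10MiB

// Hypothetical helper for illustration only: small requests are rounded up to
// one kMinBlockSize block; larger requests keep their requested size.
size_t ChooseAllocateBytes(size_t aligned_size) {
  size_t allocate_bytes = aligned_size;
  if (allocate_bytes < kMinAlloc) { allocate_bytes = kMinBlockSize; }
  return allocate_bytes;
}

int main() {
  std::cout << ChooseAllocateBytes(1 << 20) << std::endl;   // 1MiB request  -> 20MiB block
  std::cout << ChooseAllocateBytes(64 << 20) << std::endl;  // 64MiB request -> 64MiB block
  return 0;
}
```

The intent, as far as the diff shows, is to stop small allocations from repeatedly growing the pool by whole-memory doubling while still letting large requests allocate exactly what they need (subject to `CudaMemAlignedBytes` and the `available_bytes` check).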