Unverified commit e5dadaf5, authored by Houjiang Chen, committed by GitHub

Optimize memory occupancy for interface 1.0 (#4844)


* Do not save inputs in function nodes even if requires_grad is true.

* Allocate raw memory with actual size.

Co-authored-by: oneflow-ci-bot <69100618+oneflow-ci-bot@users.noreply.github.com>
parent f450ff89
@@ -57,12 +57,15 @@ Maybe<void> CopyOrAccGrad(AutogradMeta* autograd_meta, bool autograd_mode) {
 StackFunctionNode::StackFunctionNode(
     const std::shared_ptr<const std::function<Maybe<void>(const TensorTuple&, TensorTuple*, bool)>>&
         backward_fn,
-    const TensorTuple& inputs, const TensorTuple& outputs) {
+    const TensorTuple& inputs, const TensorTuple& outputs)
+    : FunctionNode() {
   input_meta_datas_.resize(inputs.size());
-  input_tensors_.resize(inputs.size());
+  next_functions_->reserve(inputs.size());
   for (int i = 0; i < inputs.size(); ++i) {
     input_meta_datas_.at(i) = inputs.at(i)->mut_autograd_meta();
-    if (input_meta_datas_.at(i)->requires_grad()) { input_tensors_.at(i) = inputs.at(i); }
+    if (input_meta_datas_.at(i)->requires_grad()) {
+      next_functions_->emplace_back(inputs.at(i)->grad_fn_node());
+    }
   }
 
   output_meta_datas_.resize(outputs.size());
@@ -101,7 +104,7 @@ void StackFunctionNode::ReleaseOutTensorArgs() {
 
 void StackFunctionNode::ReleaseData() {
   // Releases backward function and makes useless tensors release as early as possible
   if (!input_meta_datas_.empty()) { backward_fn_.reset(); }
-  input_tensors_.clear();
+  next_functions_->clear();
   is_in_stack_ = false;
 }
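The idea behind "Do not save inputs in function nodes" can be illustrated outside the OneFlow codebase. Below is a minimal, self-contained sketch with hypothetical types (Meta, Tensor, Node) rather than the real AutogradMeta/FunctionNode API: the backward node records each input's metadata and an edge to the node that produced it, so the input tensors themselves are no longer kept alive by the graph.

#include <memory>
#include <vector>

// Sketch only (hypothetical types, not OneFlow's API): a backward node keeps
// per-input metadata and edges to upstream nodes instead of holding shared
// ownership of the input tensors themselves.
struct Meta {
  bool requires_grad = false;
};

struct Tensor {
  std::shared_ptr<Meta> meta = std::make_shared<Meta>();
  std::shared_ptr<struct Node> grad_fn;  // node that produced this tensor
};

struct Node {
  std::vector<std::shared_ptr<Meta>> input_metas;           // kept: lightweight metadata
  std::vector<std::shared_ptr<const Node>> next_functions;  // kept: graph edges only

  explicit Node(const std::vector<Tensor>& inputs) {
    input_metas.reserve(inputs.size());
    next_functions.reserve(inputs.size());
    for (const Tensor& t : inputs) {
      input_metas.push_back(t.meta);
      // Only inputs that require grad contribute an edge; the tensor's storage
      // is NOT captured, so it can be freed as soon as user code drops it.
      if (t.meta->requires_grad) { next_functions.push_back(t.grad_fn); }
    }
  }
};

int main() {
  Tensor a;
  a.meta->requires_grad = true;
  Node node({a});
  // `a` can now go out of scope; `node` only holds metadata and an edge to
  // a.grad_fn (null in this toy example), not the tensor itself.
  return node.next_functions.size() == 1 ? 0 : 1;
}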
@@ -53,7 +53,7 @@ class FunctionNode {
   const std::string& GetOpName() const { return op_name_; }
 
  protected:
-  FunctionNode() = default;
+  FunctionNode() : next_functions_(new std::vector<std::shared_ptr<const FunctionNode>>{}) {}
 
   const std::string op_name_;
   std::shared_ptr<std::vector<std::shared_ptr<const FunctionNode>>> next_functions_;
@@ -102,7 +102,6 @@ class StackFunctionNode final : public FunctionNode {
   void set_is_in_stack(bool in_stack) { is_in_stack_ = in_stack; }
 
  private:
-  std::vector<std::shared_ptr<Tensor>> input_tensors_;
   std::vector<std::shared_ptr<AutogradMeta>> input_meta_datas_;
   std::vector<std::shared_ptr<AutogradMeta>> output_meta_datas_;
   std::vector<TensorInfo> output_tensor_infos_;
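A side effect of building next_functions_ inside the StackFunctionNode constructor is the FunctionNode() change above: next_functions_ is a shared_ptr to a vector, and base-class constructors run before the derived constructor body, so the base must allocate the vector or the derived reserve()/emplace_back() calls would dereference a null pointer. A minimal sketch of that ordering, with hypothetical Base/Derived names standing in for FunctionNode/StackFunctionNode:

#include <cstddef>
#include <memory>
#include <vector>

// Sketch only: the base class owns the container through a shared_ptr, so it
// allocates it up front, before any derived constructor body touches it.
class Base {
 protected:
  Base() : edges_(new std::vector<int>{}) {}
  std::shared_ptr<std::vector<int>> edges_;
};

class Derived : public Base {
 public:
  Derived() {
    // Safe: Base() has already run, so edges_ is non-null here.
    edges_->reserve(4);
    edges_->emplace_back(1);
  }
  std::size_t num_edges() const { return edges_->size(); }
};

int main() {
  Derived d;
  return d.num_edges() == 1 ? 0 : 1;
}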
@@ -31,6 +31,10 @@ inline bool IsAlignedSize(size_t size) { return size % kCudaMemAllocAlignSize ==
 
 static const size_t kPieceSplitThreshold = 128 << 20;  // 128MiB
 
+constexpr size_t kMinBlockSize = 20 << 20;  // 20MiB
+constexpr size_t kMinAlloc =
+    10 << 20;  // allocations less than 10MiB should be packed in kMinBlockSize bytes.
+
 }  // namespace
 
 CudaAllocator::CudaAllocator(int64_t device_id)
@@ -169,9 +173,10 @@ bool CudaAllocator::AllocateBlockToExtendTotalMem(size_t aligned_size) {
   const size_t available_bytes = free_bytes - remain_bytes;  // remain at least 50MiB memory
 
   // growth double total memory bytes if could
-  if (total_memory_bytes_ > 0) {
-    allocate_bytes = std::max(allocate_bytes, std::min(total_memory_bytes_, available_bytes));
-  }
+  // if (total_memory_bytes_ > 0) {
+  //   allocate_bytes = std::max(allocate_bytes, std::min(total_memory_bytes_, available_bytes));
+  // }
+  if (allocate_bytes < kMinAlloc) { allocate_bytes = kMinBlockSize; }
 
   const size_t final_allocate_bytes = CudaMemAlignedBytes(allocate_bytes);
   if (final_allocate_bytes > available_bytes) { return false; }
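The allocator hunks replace the "double total memory" growth heuristic with a floor: requests below kMinAlloc (10MiB) are served from a kMinBlockSize (20MiB) block, and the result is still rounded to the CUDA allocation alignment and checked against the free-memory budget. Below is a standalone sketch of that sizing decision; the 512-byte alignment stands in for kCudaMemAllocAlignSize and DecideAllocateBytes is a made-up name for illustration, not the actual CudaAllocator code.

#include <cstddef>
#include <cstdio>

// Assumed constants for the sketch; kMinBlockSize/kMinAlloc mirror the diff.
constexpr std::size_t kAlign = 512;
constexpr std::size_t kMinBlockSize = 20 << 20;  // 20MiB
constexpr std::size_t kMinAlloc = 10 << 20;      // 10MiB

constexpr std::size_t AlignedBytes(std::size_t bytes) {
  return (bytes + kAlign - 1) / kAlign * kAlign;  // round up to the alignment
}

// Decide how many bytes to actually allocate for a request of `aligned_size`
// bytes given `available_bytes` of free device memory. Small requests are
// packed into a fixed 20MiB block instead of doubling total allocator memory.
std::size_t DecideAllocateBytes(std::size_t aligned_size, std::size_t available_bytes) {
  std::size_t allocate_bytes = aligned_size;
  if (allocate_bytes < kMinAlloc) { allocate_bytes = kMinBlockSize; }
  const std::size_t final_allocate_bytes = AlignedBytes(allocate_bytes);
  return final_allocate_bytes > available_bytes ? 0 : final_allocate_bytes;  // 0: cannot extend
}

int main() {
  // A 1MiB request is served by a 20MiB block; a 64MiB request keeps its own size.
  std::printf("%zu\n", DecideAllocateBytes(1 << 20, std::size_t{1} << 30));   // 20971520
  std::printf("%zu\n", DecideAllocateBytes(64 << 20, std::size_t{1} << 30));  // 67108864
  return 0;
}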