diff --git a/oneflow/core/autograd/autograd_engine.cpp b/oneflow/core/autograd/autograd_engine.cpp
index bda275dc6724a065d41910071cad197dbced4240..ff01fe8737510399ebab89a932a8540875d7d028 100644
--- a/oneflow/core/autograd/autograd_engine.cpp
+++ b/oneflow/core/autograd/autograd_engine.cpp
@@ -57,12 +57,15 @@ Maybe<void> CopyOrAccGrad(AutogradMeta* autograd_meta, bool autograd_mode) {
 StackFunctionNode::StackFunctionNode(
     const std::shared_ptr<const std::function<Maybe<void>(const TensorTuple&, TensorTuple*, bool)>>&
         backward_fn,
-    const TensorTuple& inputs, const TensorTuple& outputs) {
+    const TensorTuple& inputs, const TensorTuple& outputs)
+    : FunctionNode() {
   input_meta_datas_.resize(inputs.size());
-  input_tensors_.resize(inputs.size());
+  next_functions_->reserve(inputs.size());
   for (int i = 0; i < inputs.size(); ++i) {
     input_meta_datas_.at(i) = inputs.at(i)->mut_autograd_meta();
-    if (input_meta_datas_.at(i)->requires_grad()) { input_tensors_.at(i) = inputs.at(i); }
+    if (input_meta_datas_.at(i)->requires_grad()) {
+      next_functions_->emplace_back(inputs.at(i)->grad_fn_node());
+    }
   }
 
   output_meta_datas_.resize(outputs.size());
@@ -101,7 +104,7 @@ void StackFunctionNode::ReleaseOutTensorArgs() {
 void StackFunctionNode::ReleaseData() {
   // Releases backward function and makes useless tensors release as early as possible
   if (!input_meta_datas_.empty()) { backward_fn_.reset(); }
-  input_tensors_.clear();
+  next_functions_->clear();
   is_in_stack_ = false;
 }
 
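With this change, `StackFunctionNode` no longer retains the input tensors themselves; it records edges to the producers' `FunctionNode`s in `next_functions_`, which is all the backward traversal needs and lets the input tensors be released earlier. Below is a minimal, self-contained sketch of that edge-collection pattern; the `Tensor`/`Node` types here are simplified stand-ins, not OneFlow's real classes.

```cpp
#include <iostream>
#include <memory>
#include <string>
#include <vector>

struct Node;

struct Tensor {
  bool requires_grad = false;
  std::shared_ptr<const Node> grad_fn;  // producer node; null for leaf tensors
};

struct Node {
  std::string name;
  // Edges to the nodes that produced this node's inputs (the analogue of next_functions_).
  std::shared_ptr<std::vector<std::shared_ptr<const Node>>> next_functions =
      std::make_shared<std::vector<std::shared_ptr<const Node>>>();

  Node(std::string n, const std::vector<std::shared_ptr<Tensor>>& inputs) : name(std::move(n)) {
    next_functions->reserve(inputs.size());
    for (const auto& in : inputs) {
      // Only inputs that require grad contribute an edge, mirroring the guarded
      // emplace_back in StackFunctionNode's constructor.
      if (in->requires_grad) { next_functions->emplace_back(in->grad_fn); }
    }
  }
};

int main() {
  auto x = std::make_shared<Tensor>();
  x->requires_grad = true;
  auto producer = std::make_shared<Node>("producer", std::vector<std::shared_ptr<Tensor>>{});
  x->grad_fn = producer;

  Node consumer("consumer", {x});
  std::cout << "consumer has " << consumer.next_functions->size() << " edge(s)\n";  // prints 1
  return 0;
}
```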
diff --git a/oneflow/core/autograd/autograd_engine.h b/oneflow/core/autograd/autograd_engine.h
index 53f9c3e1361db492b14b0a57dad5636178e913a1..b6df4c57fd2a802e8c957b8c055169595540ffc2 100644
--- a/oneflow/core/autograd/autograd_engine.h
+++ b/oneflow/core/autograd/autograd_engine.h
@@ -53,7 +53,7 @@ class FunctionNode {
   const std::string& GetOpName() const { return op_name_; }
 
  protected:
-  FunctionNode() = default;
+  FunctionNode() : next_functions_(new std::vector<std::shared_ptr<const FunctionNode>>{}) {}
 
   const std::string op_name_;
   std::shared_ptr<std::vector<std::shared_ptr<const FunctionNode>>> next_functions_;
@@ -102,7 +102,6 @@ class StackFunctionNode final : public FunctionNode {
   void set_is_in_stack(bool in_stack) { is_in_stack_ = in_stack; }
 
  private:
-  std::vector<std::shared_ptr<Tensor>> input_tensors_;
   std::vector<std::shared_ptr<AutogradMeta>> input_meta_datas_;
   std::vector<std::shared_ptr<AutogradMeta>> output_meta_datas_;
   std::vector<TensorInfo> output_tensor_infos_;
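The header change makes the base `FunctionNode` constructor allocate `next_functions_` eagerly, so derived constructors such as `StackFunctionNode` can call `reserve`/`emplace_back` without a null check. The sketch below uses hypothetical names (`NodeBase`, `StackNode`) and also shows `std::make_shared` as an equivalent, slightly more idiomatic spelling of the `new std::vector<...>{}` in the patch.

```cpp
#include <cstddef>
#include <memory>
#include <vector>

// NodeBase/StackNode are stand-ins for FunctionNode/StackFunctionNode.
class NodeBase {
 protected:
  // Eagerly allocate the edge list; make_shared does one combined allocation
  // and is behaviorally equivalent to the `new std::vector<...>{}` in the patch.
  NodeBase()
      : next_functions_(std::make_shared<std::vector<std::shared_ptr<const NodeBase>>>()) {}

  std::shared_ptr<std::vector<std::shared_ptr<const NodeBase>>> next_functions_;
};

class StackNode : public NodeBase {
 public:
  explicit StackNode(std::size_t num_inputs) {
    // Safe to dereference: the base constructor has already allocated the vector.
    next_functions_->reserve(num_inputs);
  }
};

int main() {
  StackNode node(4);
  return 0;
}
```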
diff --git a/oneflow/core/vm/cuda_allocator.cpp b/oneflow/core/vm/cuda_allocator.cpp
index 95c60a57982f346c8be1a153649faf114dfafcfd..128798f24c91a42731d4344d4ab7f110f73d72f4 100644
--- a/oneflow/core/vm/cuda_allocator.cpp
+++ b/oneflow/core/vm/cuda_allocator.cpp
@@ -31,6 +31,10 @@ inline bool IsAlignedSize(size_t size) { return size % kCudaMemAllocAlignSize ==
 
 static const size_t kPieceSplitThreshold = 128 << 20;  // 128MiB
 
+constexpr size_t kMinBlockSize = 20 << 20;  // 20MiB
+constexpr size_t kMinAlloc =
+    10 << 20;  // requests smaller than 10MiB are served from a kMinBlockSize block.
+
 }  // namespace
 
 CudaAllocator::CudaAllocator(int64_t device_id)
@@ -169,9 +173,7 @@ bool CudaAllocator::AllocateBlockToExtendTotalMem(size_t aligned_size) {
   const size_t available_bytes = free_bytes - remain_bytes;  // remain at least 50MiB memory
 
-  // growth double total memory bytes if could
-  if (total_memory_bytes_ > 0) {
-    allocate_bytes = std::max(allocate_bytes, std::min(total_memory_bytes_, available_bytes));
-  }
+  // allocate at least kMinBlockSize bytes for requests smaller than kMinAlloc
+  if (allocate_bytes < kMinAlloc) { allocate_bytes = kMinBlockSize; }
   const size_t final_allocate_bytes = CudaMemAlignedBytes(allocate_bytes);
 
   if (final_allocate_bytes > available_bytes) { return false; }
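On the allocator side, the "double the total memory" growth heuristic is replaced with a minimum block size: any request smaller than `kMinAlloc` (10MiB) is served by allocating a `kMinBlockSize` (20MiB) block, presumably so that later small requests can be carved from the same cached block rather than each triggering its own `cudaMalloc`. A standalone sketch of just the sizing rule follows; the helper name `DecideBlockBytes` is illustrative, and the real `AllocateBlockToExtendTotalMem` additionally aligns the result and rejects it if it exceeds available memory.

```cpp
#include <cassert>
#include <cstddef>

constexpr std::size_t kMinBlockSize = 20 << 20;  // 20MiB
constexpr std::size_t kMinAlloc = 10 << 20;      // 10MiB

// Requests below kMinAlloc are rounded up to one kMinBlockSize block; larger
// requests keep their (already aligned) size.
std::size_t DecideBlockBytes(std::size_t aligned_size) {
  return aligned_size < kMinAlloc ? kMinBlockSize : aligned_size;
}

int main() {
  assert(DecideBlockBytes(1 << 20) == kMinBlockSize);   // 1MiB request -> 20MiB block
  assert(DecideBlockBytes(64 << 20) == (64ull << 20));  // large requests unchanged
  return 0;
}
```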