Unverified commit e5dadaf5, authored by Houjiang Chen, committed by GitHub

Optimize memory occupancy for interface 1.0 (#4844)


* Do not save inputs in function nodes even if requires_grad is true.

* Allocate raw memory with actual size.

Co-authored-by: oneflow-ci-bot <69100618+oneflow-ci-bot@users.noreply.github.com>
parent f450ff89
@@ -57,12 +57,15 @@ Maybe<void> CopyOrAccGrad(AutogradMeta* autograd_meta, bool autograd_mode) {
 StackFunctionNode::StackFunctionNode(
     const std::shared_ptr<const std::function<Maybe<void>(const TensorTuple&, TensorTuple*, bool)>>&
         backward_fn,
-    const TensorTuple& inputs, const TensorTuple& outputs) {
+    const TensorTuple& inputs, const TensorTuple& outputs)
+    : FunctionNode() {
   input_meta_datas_.resize(inputs.size());
-  input_tensors_.resize(inputs.size());
+  next_functions_->reserve(inputs.size());
   for (int i = 0; i < inputs.size(); ++i) {
     input_meta_datas_.at(i) = inputs.at(i)->mut_autograd_meta();
-    if (input_meta_datas_.at(i)->requires_grad()) { input_tensors_.at(i) = inputs.at(i); }
+    if (input_meta_datas_.at(i)->requires_grad()) {
+      next_functions_->emplace_back(inputs.at(i)->grad_fn_node());
+    }
   }
 
   output_meta_datas_.resize(outputs.size());
@@ -101,7 +104,7 @@ void StackFunctionNode::ReleaseOutTensorArgs() {
 
 void StackFunctionNode::ReleaseData() {
   // Releases backward function and makes useless tensors release as early as possible
   if (!input_meta_datas_.empty()) { backward_fn_.reset(); }
-  input_tensors_.clear();
+  next_functions_->clear();
   is_in_stack_ = false;
 }
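The idea behind "Do not save inputs in function nodes" can be illustrated outside the OneFlow codebase. Below is a minimal, self-contained sketch with hypothetical types (Meta, Tensor, Node) rather than the real AutogradMeta/FunctionNode API: the backward node records each input's metadata and an edge to the node that produced it, so the input tensors themselves are no longer kept alive by the graph.

#include <memory>
#include <vector>

// Sketch only (hypothetical types, not OneFlow's API): a backward node keeps
// per-input metadata and edges to upstream nodes instead of holding shared
// ownership of the input tensors themselves.
struct Meta {
  bool requires_grad = false;
};

struct Tensor {
  std::shared_ptr<Meta> meta = std::make_shared<Meta>();
  std::shared_ptr<struct Node> grad_fn;  // node that produced this tensor
};

struct Node {
  std::vector<std::shared_ptr<Meta>> input_metas;           // kept: lightweight metadata
  std::vector<std::shared_ptr<const Node>> next_functions;  // kept: graph edges only

  explicit Node(const std::vector<Tensor>& inputs) {
    input_metas.reserve(inputs.size());
    next_functions.reserve(inputs.size());
    for (const Tensor& t : inputs) {
      input_metas.push_back(t.meta);
      // Only inputs that require grad contribute an edge; the tensor's storage
      // is NOT captured, so it can be freed as soon as user code drops it.
      if (t.meta->requires_grad) { next_functions.push_back(t.grad_fn); }
    }
  }
};

int main() {
  Tensor a;
  a.meta->requires_grad = true;
  Node node({a});
  // `a` can now go out of scope; `node` only holds metadata and an edge to
  // a.grad_fn (null in this toy example), not the tensor itself.
  return node.next_functions.size() == 1 ? 0 : 1;
}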
@@ -53,7 +53,7 @@ class FunctionNode {
   const std::string& GetOpName() const { return op_name_; }
 
  protected:
-  FunctionNode() = default;
+  FunctionNode() : next_functions_(new std::vector<std::shared_ptr<const FunctionNode>>{}) {}
 
   const std::string op_name_;
   std::shared_ptr<std::vector<std::shared_ptr<const FunctionNode>>> next_functions_;
@@ -102,7 +102,6 @@ class StackFunctionNode final : public FunctionNode {
   void set_is_in_stack(bool in_stack) { is_in_stack_ = in_stack; }
 
  private:
-  std::vector<std::shared_ptr<Tensor>> input_tensors_;
   std::vector<std::shared_ptr<AutogradMeta>> input_meta_datas_;
   std::vector<std::shared_ptr<AutogradMeta>> output_meta_datas_;
   std::vector<TensorInfo> output_tensor_infos_;
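A side effect of building next_functions_ inside the StackFunctionNode constructor is the FunctionNode() change above: next_functions_ is a shared_ptr to a vector, and base-class constructors run before the derived constructor body, so the base must allocate the vector or the derived reserve()/emplace_back() calls would dereference a null pointer. A minimal sketch of that ordering, with hypothetical Base/Derived names standing in for FunctionNode/StackFunctionNode:

#include <cstddef>
#include <memory>
#include <vector>

// Sketch only: the base class owns the container through a shared_ptr, so it
// allocates it up front, before any derived constructor body touches it.
class Base {
 protected:
  Base() : edges_(new std::vector<int>{}) {}
  std::shared_ptr<std::vector<int>> edges_;
};

class Derived : public Base {
 public:
  Derived() {
    // Safe: Base() has already run, so edges_ is non-null here.
    edges_->reserve(4);
    edges_->emplace_back(1);
  }
  std::size_t num_edges() const { return edges_->size(); }
};

int main() {
  Derived d;
  return d.num_edges() == 1 ? 0 : 1;
}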
@@ -31,6 +31,10 @@ inline bool IsAlignedSize(size_t size) { return size % kCudaMemAllocAlignSize ==
 
 static const size_t kPieceSplitThreshold = 128 << 20;  // 128MiB
 
+constexpr size_t kMinBlockSize = 20 << 20;  // 20MiB
+constexpr size_t kMinAlloc =
+    10 << 20;  // allocations less than 10MiB should be packed in kMinBlockSize bytes.
+
 }  // namespace
 
 CudaAllocator::CudaAllocator(int64_t device_id)
@@ -169,9 +173,10 @@ bool CudaAllocator::AllocateBlockToExtendTotalMem(size_t aligned_size) {
   const size_t available_bytes = free_bytes - remain_bytes;  // remain at least 50MiB memory
 
   // growth double total memory bytes if could
-  if (total_memory_bytes_ > 0) {
-    allocate_bytes = std::max(allocate_bytes, std::min(total_memory_bytes_, available_bytes));
-  }
+  // if (total_memory_bytes_ > 0) {
+  //   allocate_bytes = std::max(allocate_bytes, std::min(total_memory_bytes_, available_bytes));
+  // }
+  if (allocate_bytes < kMinAlloc) { allocate_bytes = kMinBlockSize; }
 
   const size_t final_allocate_bytes = CudaMemAlignedBytes(allocate_bytes);
   if (final_allocate_bytes > available_bytes) { return false; }
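The allocator hunks replace the "double total memory" growth heuristic with a floor: requests below kMinAlloc (10MiB) are served from a kMinBlockSize (20MiB) block, and the result is still rounded to the CUDA allocation alignment and checked against the free-memory budget. Below is a standalone sketch of that sizing decision; the 512-byte alignment stands in for kCudaMemAllocAlignSize and DecideAllocateBytes is a made-up name for illustration, not the actual CudaAllocator code.

#include <cstddef>
#include <cstdio>

// Assumed constants for the sketch; kMinBlockSize/kMinAlloc mirror the diff.
constexpr std::size_t kAlign = 512;
constexpr std::size_t kMinBlockSize = 20 << 20;  // 20MiB
constexpr std::size_t kMinAlloc = 10 << 20;      // 10MiB

constexpr std::size_t AlignedBytes(std::size_t bytes) {
  return (bytes + kAlign - 1) / kAlign * kAlign;  // round up to the alignment
}

// Decide how many bytes to actually allocate for a request of `aligned_size`
// bytes given `available_bytes` of free device memory. Small requests are
// packed into a fixed 20MiB block instead of doubling total allocator memory.
std::size_t DecideAllocateBytes(std::size_t aligned_size, std::size_t available_bytes) {
  std::size_t allocate_bytes = aligned_size;
  if (allocate_bytes < kMinAlloc) { allocate_bytes = kMinBlockSize; }
  const std::size_t final_allocate_bytes = AlignedBytes(allocate_bytes);
  return final_allocate_bytes > available_bytes ? 0 : final_allocate_bytes;  // 0: cannot extend
}

int main() {
  // A 1MiB request is served by a 20MiB block; a 64MiB request keeps its own size.
  std::printf("%zu\n", DecideAllocateBytes(1 << 20, std::size_t{1} << 30));   // 20971520
  std::printf("%zu\n", DecideAllocateBytes(64 << 20, std::size_t{1} << 30));  // 67108864
  return 0;
}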