From cd24775269c13ea6e0cdab5da3974b3a737a54dd Mon Sep 17 00:00:00 2001
From: "jie.wang" <38901892+jievince@users.noreply.github.com>
Date: Mon, 19 Oct 2020 07:44:17 -0500
Subject: [PATCH] Optimizer rule for TopN (#279)

* add TopN PlanNode and Executor

* add topn rule

* fix topn rule

* fix auto

* add topn rule test

* fix indent

* fix conflict

* fix conflict

* fix ;

* add newline

* fix colNames

* update code

* update code

* fix static function naming style

* fix name

* fix name

* check result data in test_optimzier

* fix
---
 src/optimizer/CMakeLists.txt                  |   1 +
 src/optimizer/OptRule.h                       |   4 +
 .../rule/PushFilterDownGetNbrsRule.cpp        |   4 +-
 .../rule/PushFilterDownGetNbrsRule.h          |   2 +
 src/optimizer/rule/TopNRule.cpp               |  75 ++++++++++++
 src/optimizer/rule/TopNRule.h                 |  35 ++++++
 src/planner/Query.h                           |   1 -
 tests/query/v2/test_optimizer.py              | 110 +++++++++++++++---
 8 files changed, 216 insertions(+), 16 deletions(-)
 create mode 100644 src/optimizer/rule/TopNRule.cpp
 create mode 100644 src/optimizer/rule/TopNRule.h

diff --git a/src/optimizer/CMakeLists.txt b/src/optimizer/CMakeLists.txt
index 248d05f0..b9541b65 100644
--- a/src/optimizer/CMakeLists.txt
+++ b/src/optimizer/CMakeLists.txt
@@ -12,6 +12,7 @@ nebula_add_library(
     OptRule.cpp
     rule/PushFilterDownGetNbrsRule.cpp
     rule/IndexScanRule.cpp
+    rule/TopNRule.cpp
 )
 
 nebula_add_subdirectory(test)
diff --git a/src/optimizer/OptRule.h b/src/optimizer/OptRule.h
index af3a764d..fe616376 100644
--- a/src/optimizer/OptRule.h
+++ b/src/optimizer/OptRule.h
@@ -47,6 +47,10 @@ private:
 class OptRule {
 public:
     struct TransformResult {
+        static const TransformResult &noTransform() {
+            static TransformResult kNoTrans{false, false, {}};
+            return kNoTrans;
+        }
         bool eraseCurr{false};
         bool eraseAll{false};
         std::vector<OptGroupExpr *> newGroupExprs;
diff --git a/src/optimizer/rule/PushFilterDownGetNbrsRule.cpp b/src/optimizer/rule/PushFilterDownGetNbrsRule.cpp
index c992e4a4..430fcda8 100644
--- a/src/optimizer/rule/PushFilterDownGetNbrsRule.cpp
+++ b/src/optimizer/rule/PushFilterDownGetNbrsRule.cpp
@@ -50,7 +50,7 @@ StatusOr<OptRule::TransformResult> PushFilterDownGetNbrsRule::transform(
     graph::ExtractFilterExprVisitor visitor;
     condition->accept(&visitor);
     if (!visitor.ok()) {
-        return TransformResult{false, false, {}};
+        return TransformResult::noTransform();
     }
 
     auto pool = qctx->objPool();
@@ -91,8 +91,8 @@ StatusOr<OptRule::TransformResult> PushFilterDownGetNbrsRule::transform(
     }
 
     TransformResult result;
-    result.newGroupExprs.emplace_back(newFilterGroupExpr ? newFilterGroupExpr : newGnGroupExpr);
     result.eraseCurr = true;
+    result.newGroupExprs.emplace_back(newFilterGroupExpr ? newFilterGroupExpr : newGnGroupExpr);
     return result;
 }
 
diff --git a/src/optimizer/rule/PushFilterDownGetNbrsRule.h b/src/optimizer/rule/PushFilterDownGetNbrsRule.h
index 11ebbe6d..4cabc067 100644
--- a/src/optimizer/rule/PushFilterDownGetNbrsRule.h
+++ b/src/optimizer/rule/PushFilterDownGetNbrsRule.h
@@ -21,8 +21,10 @@ namespace opt {
 class PushFilterDownGetNbrsRule final : public OptRule {
 public:
     const Pattern &pattern() const override;
+
     StatusOr<TransformResult> transform(graph::QueryContext *qctx,
                                         const MatchedResult &matched) const override;
+
     std::string toString() const override;
 
 private:
diff --git a/src/optimizer/rule/TopNRule.cpp b/src/optimizer/rule/TopNRule.cpp
new file mode 100644
index 00000000..1912d096
--- /dev/null
+++ b/src/optimizer/rule/TopNRule.cpp
@@ -0,0 +1,75 @@
+/* Copyright (c) 2020 vesoft inc. All rights reserved.
+ *
+ * This source code is licensed under Apache 2.0 License,
+ * attached with Common Clause Condition 1.0, found in the LICENSES directory.
+ */
+
+#include "optimizer/rule/TopNRule.h"
+
+#include "common/expression/BinaryExpression.h"
+#include "common/expression/ConstantExpression.h"
+#include "common/expression/Expression.h"
+#include "common/expression/FunctionCallExpression.h"
+#include "common/expression/LogicalExpression.h"
+#include "common/expression/UnaryExpression.h"
+#include "optimizer/OptGroup.h"
+#include "planner/PlanNode.h"
+#include "planner/Query.h"
+#include "visitor/ExtractFilterExprVisitor.h"
+
+using nebula::graph::Limit;
+using nebula::graph::PlanNode;
+using nebula::graph::QueryContext;
+using nebula::graph::Sort;
+using nebula::graph::TopN;
+
+namespace nebula {
+namespace opt {
+
+std::unique_ptr<OptRule> TopNRule::kInstance = std::unique_ptr<TopNRule>(new TopNRule());
+
+TopNRule::TopNRule() {
+    RuleSet::QueryRules().addRule(this);
+}
+
+const Pattern &TopNRule::pattern() const {
+    static Pattern pattern = Pattern::create(graph::PlanNode::Kind::kLimit,
+                                             {Pattern::create(graph::PlanNode::Kind::kSort)});
+    return pattern;
+}
+
+StatusOr<OptRule::TransformResult> TopNRule::transform(QueryContext *qctx,
+                                                       const MatchedResult &matched) const {
+    auto limitExpr = matched.node;
+    auto sortExpr = matched.dependencies.front().node;
+    auto limit = static_cast<const Limit *>(limitExpr->node());
+    auto sort = static_cast<const Sort *>(sortExpr->node());
+
+    // Currently, we cannot know the total amount of input data,
+    // so only apply topn rule when offset of limit is 0
+    if (limit->offset() != 0) {
+        return TransformResult::noTransform();
+    }
+
+    auto topn = TopN::make(qctx, nullptr, sort->factors(), limit->offset(), limit->count());
+    topn->setOutputVar(limit->outputVar());
+    topn->setInputVar(sort->inputVar());
+    topn->setColNames(sort->colNames());
+    auto topnExpr = OptGroupExpr::create(qctx, topn, limitExpr->group());
+    for (auto dep : sortExpr->dependencies()) {
+        topnExpr->dependsOn(dep);
+    }
+
+    TransformResult result;
+    result.newGroupExprs.emplace_back(topnExpr);
+    result.eraseAll = true;
+    result.eraseCurr = true;
+    return result;
+}
+
+std::string TopNRule::toString() const {
+    return "TopNRule";
+}
+
+}   // namespace opt
+}   // namespace nebula
diff --git a/src/optimizer/rule/TopNRule.h b/src/optimizer/rule/TopNRule.h
new file mode 100644
index 00000000..f6114f23
--- /dev/null
+++ b/src/optimizer/rule/TopNRule.h
@@ -0,0 +1,35 @@
+/* Copyright (c) 2020 vesoft inc. All rights reserved.
+ *
+ * This source code is licensed under Apache 2.0 License,
+ * attached with Common Clause Condition 1.0, found in the LICENSES directory.
+ */
+
+#ifndef OPTIMIZER_RULE_TOPNRULE_H_
+#define OPTIMIZER_RULE_TOPNRULE_H_
+
+#include <memory>
+
+#include "optimizer/OptRule.h"
+
+namespace nebula {
+namespace opt {
+
+class TopNRule final : public OptRule {
+public:
+    const Pattern &pattern() const override;
+
+    StatusOr<OptRule::TransformResult> transform(graph::QueryContext *qctx,
+                                                 const MatchedResult &matched) const override;
+
+    std::string toString() const override;
+
+private:
+    TopNRule();
+
+    static std::unique_ptr<OptRule> kInstance;
+};
+
+}   // namespace opt
+}   // namespace nebula
+
+#endif   // OPTIMIZER_RULE_TOPNRULE_H_
diff --git a/src/planner/Query.h b/src/planner/Query.h
index 10215e34..64fa76d4 100644
--- a/src/planner/Query.h
+++ b/src/planner/Query.h
@@ -747,7 +747,6 @@ private:
     int64_t     count_{-1};
 };
 
-
 /**
  * Do Aggregation with the given set of records,
  * such as AVG(), COUNT()...
diff --git a/tests/query/v2/test_optimizer.py b/tests/query/v2/test_optimizer.py
index 434ce91a..6353588a 100644
--- a/tests/query/v2/test_optimizer.py
+++ b/tests/query/v2/test_optimizer.py
@@ -18,53 +18,61 @@ class TestOptimizer(NebulaTestSuite):
 
     def test_PushFilterDownGetNbrsRule(self):
         resp = self.execute_query('''
-            GO 1 STEPS FROM "Kobe Bryant" OVER serve
-            WHERE $^.player.age > 18 YIELD $^.player.name AS name
+            GO 1 STEPS FROM "Boris Diaw" OVER serve
+            WHERE $^.player.age > 18 YIELD serve.start_year as start_year
         ''')
         expected_plan = [
             ["Project", [1]],
             ["GetNeighbors", [2], ['($^.player.age>18)']],
             ["Start", []]
         ]
+        expected_data = [[2003], [2005], [2008], [2012], [2016]]
         self.check_exec_plan(resp, expected_plan)
+        self.check_out_of_order_result(resp, expected_data)
 
         resp = self.execute_query('''
-            GO 1 STEPS FROM "Kobe Bryant" OVER like REVERSELY
-            WHERE $^.player.age > 18 YIELD $^.player.name AS name
+            GO 1 STEPS FROM "James Harden" OVER like REVERSELY
+            WHERE $^.player.age > 18 YIELD like.likeness as likeness
         ''')
         expected_plan = [
             ["Project", [1]],
             ["GetNeighbors", [2], ['($^.player.age>18)']],
             ["Start", []]
         ]
+        expected_data = [[90], [80], [99]]
         self.check_exec_plan(resp, expected_plan)
+        self.check_out_of_order_result(resp, expected_data)
 
         resp = self.execute_query('''
-            GO 1 STEPS FROM "Kobe Bryant" OVER serve
-            WHERE serve.start_year > 2002 YIELD $^.player.name AS name
+            GO 1 STEPS FROM "Boris Diaw" OVER serve
+            WHERE serve.start_year > 2005 YIELD serve.start_year as start_year
         ''')
         expected_plan = [
             ["Project", [1]],
-            ["GetNeighbors", [2], ['(serve.start_year>2002)']],
+            ["GetNeighbors", [2], ['(serve.start_year>2005)']],
             ["Start", []]
         ]
+        expected_data = [[2008], [2012], [2016]]
         self.check_exec_plan(resp, expected_plan)
+        self.check_out_of_order_result(resp, expected_data)
 
         resp = self.execute_query('''
-            GO 1 STEPS FROM "Lakerys" OVER serve REVERSELY
-            WHERE serve.start_year > 2002 YIELD $^.player.name AS name
+            GO 1 STEPS FROM "Lakers" OVER serve REVERSELY
+            WHERE serve.start_year < 2017 YIELD serve.start_year as start_year
         ''')
         expected_plan = [
             ["Project", [1]],
-            ["GetNeighbors", [2], ['(serve.start_year>2002)']],
+            ["GetNeighbors", [2], ['(serve.start_year<2017)']],
             ["Start", []]
         ]
+        expected_data = [[2012], [1996], [2008], [1996], [2012]]
         self.check_exec_plan(resp, expected_plan)
+        self.check_out_of_order_result(resp, expected_data)
 
     @pytest.mark.skip(reason="Depends on other opt rules to eliminate duplicate project nodes")
     def test_PushFilterDownGetNbrsRule_Failed(self):
         resp = self.execute_query('''
-            GO 1 STEPS FROM "Kobe Bryant" OVER serve
+            GO 1 STEPS FROM "Boris Diaw" OVER serve
             WHERE $^.player.age > 18 AND $$.team.name == "Lakers"
             YIELD $^.player.name AS name
         ''')
@@ -74,10 +82,12 @@ class TestOptimizer(NebulaTestSuite):
             ["GetNeighbors", [3], ['($^.player.age>18)']],
             ["Start", []]
         ]
+        expected_data = [['Boris Diaw']]
         self.check_exec_plan(resp, expected_plan)
+        self.check_out_of_order_result(resp, expected_data)
 
         resp = self.execute_query('''
-            GO 1 STEPS FROM "Kobe Bryant" OVER serve
+            GO 1 STEPS FROM "Boris Diaw" OVER serve
             WHERE $^.player.age > 18 OR $$.team.name == "Lakers"
             YIELD $^.player.name AS name
         ''')
@@ -87,11 +97,13 @@ class TestOptimizer(NebulaTestSuite):
             ["GetNeighbors", [3]],
             ["Start", []]
         ]
+        expected_data = [['Boris Diaw']]
         self.check_exec_plan(resp, expected_plan)
+        self.check_out_of_order_result(resp, expected_data)
 
         # fail to optimize cases
         resp = self.execute_query('''
-            GO 1 STEPS FROM "Kobe Bryant" OVER serve \
+            GO 1 STEPS FROM "Boris Diaw" OVER serve \
             WHERE $$.team.name == "Lakers" YIELD $^.player.name AS name
         ''')
         expected_plan = [
@@ -100,4 +112,76 @@ class TestOptimizer(NebulaTestSuite):
             ["GetNeighbors", [3]],
             ["Start", []]
         ]
+        expected_data = [['Boris Diaw']]
         self.check_exec_plan(resp, expected_plan)
+        self.check_out_of_order_result(resp, expected_data)
+
+    def test_TopNRule(self):
+        resp = self.execute_query('''
+            GO 1 STEPS FROM "Marco Belinelli" OVER like
+            YIELD like.likeness AS likeness
+             | ORDER BY likeness
+             | LIMIT 2
+        ''')
+        expected_plan = [
+            ["DataCollect", [1]],
+            ["TopN", [2]],
+            ["Project", [3]],
+            ["GetNeighbors", [4]],
+            ["Start", []]
+        ]
+        expected_data = [[50], [55]]
+        self.check_exec_plan(resp, expected_plan)
+        self.check_result(resp, expected_data)
+
+        resp = self.execute_query('''
+            GO 1 STEPS FROM "Marco Belinelli" OVER like REVERSELY
+            YIELD like.likeness AS likeness |
+            ORDER BY likeness |
+            LIMIT 1
+        ''')
+        expected_plan = [
+            ["DataCollect", [1]],
+            ["TopN", [2]],
+            ["Project", [3]],
+            ["GetNeighbors", [4]],
+            ["Start", []]
+        ]
+        expected_data = [[83]]
+        self.check_exec_plan(resp, expected_plan)
+        self.check_result(resp, expected_data)
+
+    def test_TopNRule_Failed(self):
+        resp = self.execute_query('''
+            GO 1 STEPS FROM "Marco Belinelli" OVER like
+            YIELD like.likeness as likeness
+             | ORDER BY likeness
+             | LIMIT 2, 3
+        ''')
+        expected_plan = [
+            ["DataCollect", [1]],
+            ["Limit", [2]],
+            ["Sort", [3]],
+            ["Project", [4]],
+            ["GetNeighbors", [5]],
+            ["Start", []]
+        ]
+        expected_data = [[60]]
+        self.check_exec_plan(resp, expected_plan)
+        self.check_result(resp, expected_data)
+
+        resp = self.execute_query('''
+            GO 1 STEPS FROM "Marco Belinelli" OVER like
+            YIELD like.likeness AS likeness
+             | ORDER BY likeness
+        ''')
+        expected_plan = [
+            ["DataCollect", [1]],
+            ["Sort", [2]],
+            ["Project", [3]],
+            ["GetNeighbors", [4]],
+            ["Start", []]
+        ]
+        expected_data = [[50], [55], [60]]
+        self.check_exec_plan(resp, expected_plan)
+        self.check_result(resp, expected_data)
-- 
GitLab