From 26e1c1ea9c611d588fe39e9f09fd30f6d5e46ee6 Mon Sep 17 00:00:00 2001
From: Shylock Hg <33566796+Shylock-Hg@users.noreply.github.com>
Date: Fri, 14 Aug 2020 10:59:54 +0800
Subject: [PATCH] Dedup the subgraph vertices and edges. (#160)

* Dedup the subgraph vertices and edges.

* Fix the dedup logic.
---
 src/context/Iterator.h                  | 13 +++-------
 src/context/test/IteratorTest.cpp       |  6 +++--
 src/exec/query/DataCollectExecutor.cpp  | 32 +++++++++++++++++++++----
 src/exec/query/test/DataCollectTest.cpp | 30 +++++++++++++++++++++--
 src/validator/GetSubgraphValidator.cpp  |  3 ++-
 5 files changed, 65 insertions(+), 19 deletions(-)

diff --git a/src/context/Iterator.h b/src/context/Iterator.h
index 8a4e083c..3d48eb89 100644
--- a/src/context/Iterator.h
+++ b/src/context/Iterator.h
@@ -252,25 +252,18 @@ public:
     Value getEdge() const override;
 
     // getVertices and getEdges arg batch interface use for subgraph
+    // Its unique based on the plan
     List getVertices() {
         DCHECK(iter_ == logicalRows_.begin());
         List vertices;
-        std::unordered_set<Value> vids;
         for (; valid(); next()) {
-            auto vid = getColumn(kVid);
-            if (vid.isNull()) {
-                continue;
-            }
-            auto found = vids.find(vid);
-            if (found == vids.end()) {
-                vertices.values.emplace_back(getVertex());
-                vids.emplace(std::move(vid));
-            }
+            vertices.values.emplace_back(getVertex());
         }
         reset();
         return vertices;
     }
 
+    // Its unique based on the GN interface dedup
     List getEdges() {
         DCHECK(iter_ == logicalRows_.begin());
         List edges;
diff --git a/src/context/test/IteratorTest.cpp b/src/context/test/IteratorTest.cpp
index dc373f9f..4ba47c69 100644
--- a/src/context/test/IteratorTest.cpp
+++ b/src/context/test/IteratorTest.cpp
@@ -339,6 +339,7 @@ TEST(IteratorTest, GetNeighbor) {
             Vertex vertex;
             vertex.vid = folly::to<std::string>(i);
             vertex.tags.emplace_back(tag1);
+            expected.emplace_back(vertex);
             expected.emplace_back(std::move(vertex));
         }
         Tag tag2;
@@ -348,14 +349,15 @@ TEST(IteratorTest, GetNeighbor) {
             Vertex vertex;
             vertex.vid = folly::to<std::string>(i);
             vertex.tags.emplace_back(tag2);
+            expected.emplace_back(vertex);
             expected.emplace_back(std::move(vertex));
         }
         List result = iter.getVertices();
-        EXPECT_EQ(result.values.size(), 20);
+        EXPECT_EQ(result.values.size(), 40);
         EXPECT_EQ(result.values, expected);
 
         result = iter.getVertices();
-        EXPECT_EQ(result.values.size(), 20);
+        EXPECT_EQ(result.values.size(), 40);
         EXPECT_EQ(result.values, expected);
     }
     {
diff --git a/src/exec/query/DataCollectExecutor.cpp b/src/exec/query/DataCollectExecutor.cpp
index bfb8ae0a..2afd2a1d 100644
--- a/src/exec/query/DataCollectExecutor.cpp
+++ b/src/exec/query/DataCollectExecutor.cpp
@@ -44,16 +44,40 @@ folly::Future<Status> DataCollectExecutor::doCollect() {
 Status DataCollectExecutor::collectSubgraph(const std::vector<std::string>& vars) {
     DataSet ds;
     ds.colNames = std::move(colNames_);
+    // the subgraph not need duplicate vertices or edges, so dedup here directly
+    std::unordered_set<std::string> vids;
+    std::unordered_set<std::tuple<std::string, int64_t, int64_t, std::string>> edgeKeys;
     for (auto& var : vars) {
         auto& hist = ectx_->getHistory(var);
         for (auto& result : hist) {
             auto iter = result.iter();
             if (iter->isGetNeighborsIter()) {
-                Row row;
+                List vertices;
+                List edges;
                 auto* gnIter = static_cast<GetNeighborsIter*>(iter.get());
-                row.values.emplace_back(gnIter->getVertices());
-                row.values.emplace_back(gnIter->getEdges());
-                ds.rows.emplace_back(std::move(row));
+                auto originVertices = gnIter->getVertices();
+                for (auto& v : originVertices.values) {
+                    if (!v.isVertex()) {
+                        continue;
+                    }
+                    if (vids.emplace(v.getVertex().vid).second) {
+                        vertices.emplace_back(std::move(v));
+                    }
+                }
+                auto originEdges = gnIter->getEdges();
+                for (auto& e : originEdges.values) {
+                    if (!e.isEdge()) {
+                        continue;
+                    }
+                    auto edgeKey = std::make_tuple(e.getEdge().src,
+                                                   e.getEdge().type,
+                                                   e.getEdge().ranking,
+                                                   e.getEdge().dst);
+                    if (edgeKeys.emplace(std::move(edgeKey)).second) {
+                        edges.emplace_back(std::move(e));
+                    }
+                }
+                ds.rows.emplace_back(Row({std::move(vertices), std::move(edges)}));
             } else {
                 return Status::Error("Iterator should be kind of GetNeighborIter.");
             }
diff --git a/src/exec/query/test/DataCollectTest.cpp b/src/exec/query/test/DataCollectTest.cpp
index c83dad84..51589a57 100644
--- a/src/exec/query/test/DataCollectTest.cpp
+++ b/src/exec/query/test/DataCollectTest.cpp
@@ -144,8 +144,34 @@ TEST_F(DataCollectTest, CollectSubgraph) {
     auto iter = input.iter();
     auto* gNIter = static_cast<GetNeighborsIter*>(iter.get());
     Row row;
-    row.values.emplace_back(gNIter->getVertices());
-    row.values.emplace_back(gNIter->getEdges());
+    std::unordered_set<std::string> vids;
+    std::unordered_set<std::tuple<std::string, int64_t, int64_t, std::string>> edgeKeys;
+    List vertices;
+    List edges;
+    auto originVertices = gNIter->getVertices();
+    for (auto& v : originVertices.values) {
+        if (!v.isVertex()) {
+            continue;
+        }
+        if (vids.emplace(v.getVertex().vid).second) {
+            vertices.emplace_back(std::move(v));
+        }
+    }
+    auto originEdges = gNIter->getEdges();
+    for (auto& e : originEdges.values) {
+        if (!e.isEdge()) {
+            continue;
+        }
+        auto edgeKey = std::make_tuple(e.getEdge().src,
+                                        e.getEdge().type,
+                                        e.getEdge().ranking,
+                                        e.getEdge().dst);
+        if (edgeKeys.emplace(std::move(edgeKey)).second) {
+            edges.emplace_back(std::move(e));
+        }
+    }
+    row.values.emplace_back(std::move(vertices));
+    row.values.emplace_back(std::move(edges));
     expected.rows.emplace_back(std::move(row));
 
     EXPECT_EQ(result.value().getDataSet(), expected);
diff --git a/src/validator/GetSubgraphValidator.cpp b/src/validator/GetSubgraphValidator.cpp
index 15403cbd..0b049809 100644
--- a/src/validator/GetSubgraphValidator.cpp
+++ b/src/validator/GetSubgraphValidator.cpp
@@ -180,7 +180,8 @@ Status GetSubgraphValidator::toPlan() {
             std::move(vertexProps),
             std::move(edgeProps),
             std::move(statProps),
-            std::move(exprs));
+            std::move(exprs),
+            true /*subgraph not need duplicate*/);
     gn1->setInputVar(vidsToSave);
 
     auto* columns = new YieldColumns();
-- 
GitLab