From 26e1c1ea9c611d588fe39e9f09fd30f6d5e46ee6 Mon Sep 17 00:00:00 2001 From: Shylock Hg <33566796+Shylock-Hg@users.noreply.github.com> Date: Fri, 14 Aug 2020 10:59:54 +0800 Subject: [PATCH] Dedup the subgraph vertices and edges. (#160) * Dedup the subgraph vertices and edges. * Fix the dedup logic. --- src/context/Iterator.h | 13 +++------- src/context/test/IteratorTest.cpp | 6 +++-- src/exec/query/DataCollectExecutor.cpp | 32 +++++++++++++++++++++---- src/exec/query/test/DataCollectTest.cpp | 30 +++++++++++++++++++++-- src/validator/GetSubgraphValidator.cpp | 3 ++- 5 files changed, 65 insertions(+), 19 deletions(-) diff --git a/src/context/Iterator.h b/src/context/Iterator.h index 8a4e083c..3d48eb89 100644 --- a/src/context/Iterator.h +++ b/src/context/Iterator.h @@ -252,25 +252,18 @@ public: Value getEdge() const override; // getVertices and getEdges arg batch interface use for subgraph + // Its unique based on the plan List getVertices() { DCHECK(iter_ == logicalRows_.begin()); List vertices; - std::unordered_set<Value> vids; for (; valid(); next()) { - auto vid = getColumn(kVid); - if (vid.isNull()) { - continue; - } - auto found = vids.find(vid); - if (found == vids.end()) { - vertices.values.emplace_back(getVertex()); - vids.emplace(std::move(vid)); - } + vertices.values.emplace_back(getVertex()); } reset(); return vertices; } + // Its unique based on the GN interface dedup List getEdges() { DCHECK(iter_ == logicalRows_.begin()); List edges; diff --git a/src/context/test/IteratorTest.cpp b/src/context/test/IteratorTest.cpp index dc373f9f..4ba47c69 100644 --- a/src/context/test/IteratorTest.cpp +++ b/src/context/test/IteratorTest.cpp @@ -339,6 +339,7 @@ TEST(IteratorTest, GetNeighbor) { Vertex vertex; vertex.vid = folly::to<std::string>(i); vertex.tags.emplace_back(tag1); + expected.emplace_back(vertex); expected.emplace_back(std::move(vertex)); } Tag tag2; @@ -348,14 +349,15 @@ TEST(IteratorTest, GetNeighbor) { Vertex vertex; vertex.vid = folly::to<std::string>(i); vertex.tags.emplace_back(tag2); + expected.emplace_back(vertex); expected.emplace_back(std::move(vertex)); } List result = iter.getVertices(); - EXPECT_EQ(result.values.size(), 20); + EXPECT_EQ(result.values.size(), 40); EXPECT_EQ(result.values, expected); result = iter.getVertices(); - EXPECT_EQ(result.values.size(), 20); + EXPECT_EQ(result.values.size(), 40); EXPECT_EQ(result.values, expected); } { diff --git a/src/exec/query/DataCollectExecutor.cpp b/src/exec/query/DataCollectExecutor.cpp index bfb8ae0a..2afd2a1d 100644 --- a/src/exec/query/DataCollectExecutor.cpp +++ b/src/exec/query/DataCollectExecutor.cpp @@ -44,16 +44,40 @@ folly::Future<Status> DataCollectExecutor::doCollect() { Status DataCollectExecutor::collectSubgraph(const std::vector<std::string>& vars) { DataSet ds; ds.colNames = std::move(colNames_); + // the subgraph not need duplicate vertices or edges, so dedup here directly + std::unordered_set<std::string> vids; + std::unordered_set<std::tuple<std::string, int64_t, int64_t, std::string>> edgeKeys; for (auto& var : vars) { auto& hist = ectx_->getHistory(var); for (auto& result : hist) { auto iter = result.iter(); if (iter->isGetNeighborsIter()) { - Row row; + List vertices; + List edges; auto* gnIter = static_cast<GetNeighborsIter*>(iter.get()); - row.values.emplace_back(gnIter->getVertices()); - row.values.emplace_back(gnIter->getEdges()); - ds.rows.emplace_back(std::move(row)); + auto originVertices = gnIter->getVertices(); + for (auto& v : originVertices.values) { + if (!v.isVertex()) { + continue; + } + if (vids.emplace(v.getVertex().vid).second) { + vertices.emplace_back(std::move(v)); + } + } + auto originEdges = gnIter->getEdges(); + for (auto& e : originEdges.values) { + if (!e.isEdge()) { + continue; + } + auto edgeKey = std::make_tuple(e.getEdge().src, + e.getEdge().type, + e.getEdge().ranking, + e.getEdge().dst); + if (edgeKeys.emplace(std::move(edgeKey)).second) { + edges.emplace_back(std::move(e)); + } + } + ds.rows.emplace_back(Row({std::move(vertices), std::move(edges)})); } else { return Status::Error("Iterator should be kind of GetNeighborIter."); } diff --git a/src/exec/query/test/DataCollectTest.cpp b/src/exec/query/test/DataCollectTest.cpp index c83dad84..51589a57 100644 --- a/src/exec/query/test/DataCollectTest.cpp +++ b/src/exec/query/test/DataCollectTest.cpp @@ -144,8 +144,34 @@ TEST_F(DataCollectTest, CollectSubgraph) { auto iter = input.iter(); auto* gNIter = static_cast<GetNeighborsIter*>(iter.get()); Row row; - row.values.emplace_back(gNIter->getVertices()); - row.values.emplace_back(gNIter->getEdges()); + std::unordered_set<std::string> vids; + std::unordered_set<std::tuple<std::string, int64_t, int64_t, std::string>> edgeKeys; + List vertices; + List edges; + auto originVertices = gNIter->getVertices(); + for (auto& v : originVertices.values) { + if (!v.isVertex()) { + continue; + } + if (vids.emplace(v.getVertex().vid).second) { + vertices.emplace_back(std::move(v)); + } + } + auto originEdges = gNIter->getEdges(); + for (auto& e : originEdges.values) { + if (!e.isEdge()) { + continue; + } + auto edgeKey = std::make_tuple(e.getEdge().src, + e.getEdge().type, + e.getEdge().ranking, + e.getEdge().dst); + if (edgeKeys.emplace(std::move(edgeKey)).second) { + edges.emplace_back(std::move(e)); + } + } + row.values.emplace_back(std::move(vertices)); + row.values.emplace_back(std::move(edges)); expected.rows.emplace_back(std::move(row)); EXPECT_EQ(result.value().getDataSet(), expected); diff --git a/src/validator/GetSubgraphValidator.cpp b/src/validator/GetSubgraphValidator.cpp index 15403cbd..0b049809 100644 --- a/src/validator/GetSubgraphValidator.cpp +++ b/src/validator/GetSubgraphValidator.cpp @@ -180,7 +180,8 @@ Status GetSubgraphValidator::toPlan() { std::move(vertexProps), std::move(edgeProps), std::move(statProps), - std::move(exprs)); + std::move(exprs), + true /*subgraph not need duplicate*/); gn1->setInputVar(vidsToSave); auto* columns = new YieldColumns(); -- GitLab