From 8f68a563290e3daa5d68731e6a9f45ac947c572c Mon Sep 17 00:00:00 2001
From: Valentin Lehuger
Date: Sun, 4 Dec 2016 14:33:35 +0100
Subject: [PATCH 1/2] deduplicated urls in backlink plugin results #65
---
.gitignore | 1 +
plugins/backlinks.py | 4 +--
tests/sparktests/test_plugin_backlinks.py | 40 +++++++++++++++++++++++
3 files changed, 43 insertions(+), 2 deletions(-)
create mode 100644 tests/sparktests/test_plugin_backlinks.py
diff --git a/.gitignore b/.gitignore
index 0ae84c8..98f6139 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,7 @@
/.cache
/src
*.pyc
+*.~
__pycache__
/coverage.xml
/.coverage
diff --git a/plugins/backlinks.py b/plugins/backlinks.py
index 9bb35b5..8003313 100644
--- a/plugins/backlinks.py
+++ b/plugins/backlinks.py
@@ -40,9 +40,9 @@ def hook_spark_pipeline_action(self, sc, sqlc, df, indexer):
CONCAT(
regexp_replace(url_to, "^http(s?)://", ""),
" ",
- COUNT(*),
+ SIZE(COLLECT_SET(url_from)),
" ",
- CONCAT_WS(" ", COLLECT_LIST(url_from))
+ CONCAT_WS(" ", COLLECT_SET(url_from))
) r
FROM (
SELECT url url_from, EXPLODE(external_links.href) url_to
diff --git a/tests/sparktests/test_plugin_backlinks.py b/tests/sparktests/test_plugin_backlinks.py
new file mode 100644
index 0000000..fd9619c
--- /dev/null
+++ b/tests/sparktests/test_plugin_backlinks.py
@@ -0,0 +1,40 @@
+# coding: utf-8
+
+import tempfile
+import os
+import pipes
+import pytest
+import shutil
+import ujson as json
+
+
+CORPUS = {  # in-memory corpus; the test serializes it to JSON and passes it via --source corpus:<json>
+ "desc": "Simple test with to backlinks to one page",  # NOTE(review): "with to" is presumably a typo for "with two" - confirm before touching the literal
+ "docs": [
+ {
+ "url": "http://example-a.com/page1",  # the only source URL the test expects in the plugin output
+ "content": """Page A1First backlinkSecond backlink"""  # NOTE(review): presumably meant to carry two links to example-c.com; no markup is visible in this literal - confirm
+ },
+ {
+ "url": "http://example-c.com",  # same domain as the domain=example-c.com plugin option used by the test
+ "content": """Example C"""
+ }
+ ]
+}
+
+
+def test_spark_plugin_backlinks(sparksubmit):  # runs the pipeline with the backlinks plugin and checks that a source URL is listed once per target
+ tmp_dir = tempfile.mkdtemp()  # scratch directory; the plugin is told to write to <tmp_dir>/out
+ try:
+ sparksubmit("spark/jobs/pipeline.py\
+ --source corpus:{corpus}\
+ --plugin plugins.backlinks.MostExternallyLinkedPages:domain=example-c.com,output={tmpdir}/out\
+ ".format(corpus=pipes.quote(json.dumps(CORPUS)),  # shell-quote the JSON corpus before it is substituted into the command string
+ tmpdir=tmp_dir))
+ parts = [os.path.join(tmp_dir, 'out', f) for f in os.listdir(tmp_dir + '/out/') if f.startswith("part-")]  # keep only the part-* files from the output directory
+ assert len(parts) == 1  # exactly one part file is expected for this corpus
+ with open(parts[0], 'r') as f:
+ data = set(f.read().strip().split('\n'))  # one result row per line; compared as a set, so row order is ignored
+ assert data == set(["example-c.com/ 1 http://example-a.com/page1"])  # row = target without scheme, distinct-source count, sources; page1 must be counted once
+ finally:
+ shutil.rmtree(tmp_dir)  # always remove the scratch directory, even when an assertion fails
From b0da1d5b44cbf825b5e5ffcaaf9b65e015ee7ff2 Mon Sep 17 00:00:00 2001
From: Valentin Lehuger
Date: Sun, 4 Dec 2016 14:47:47 +0100
Subject: [PATCH 2/2] emacs tmp files in gitignore
---
.gitignore | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.gitignore b/.gitignore
index 98f6139..92fc809 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,7 +7,7 @@
/.cache
/src
*.pyc
-*.~
+*~
__pycache__
/coverage.xml
/.coverage