From 8f68a563290e3daa5d68731e6a9f45ac947c572c Mon Sep 17 00:00:00 2001 From: Valentin Lehuger Date: Sun, 4 Dec 2016 14:33:35 +0100 Subject: [PATCH 1/2] deduplicated urls in backlink plugin results #65 --- .gitignore | 1 + plugins/backlinks.py | 4 +-- tests/sparktests/test_plugin_backlinks.py | 40 +++++++++++++++++++++++ 3 files changed, 43 insertions(+), 2 deletions(-) create mode 100644 tests/sparktests/test_plugin_backlinks.py diff --git a/.gitignore b/.gitignore index 0ae84c8..98f6139 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ /.cache /src *.pyc +*.~ __pycache__ /coverage.xml /.coverage diff --git a/plugins/backlinks.py b/plugins/backlinks.py index 9bb35b5..8003313 100644 --- a/plugins/backlinks.py +++ b/plugins/backlinks.py @@ -40,9 +40,9 @@ def hook_spark_pipeline_action(self, sc, sqlc, df, indexer): CONCAT( regexp_replace(url_to, "^http(s?)://", ""), " ", - COUNT(*), + SIZE(COLLECT_SET(url_from)), " ", - CONCAT_WS(" ", COLLECT_LIST(url_from)) + CONCAT_WS(" ", COLLECT_SET(url_from)) ) r FROM ( SELECT url url_from, EXPLODE(external_links.href) url_to diff --git a/tests/sparktests/test_plugin_backlinks.py b/tests/sparktests/test_plugin_backlinks.py new file mode 100644 index 0000000..fd9619c --- /dev/null +++ b/tests/sparktests/test_plugin_backlinks.py @@ -0,0 +1,40 @@ +# coding: utf-8 + +import tempfile +import os +import pipes +import pytest +import shutil +import ujson as json + + +CORPUS = { + "desc": "Simple test with to backlinks to one page", + "docs": [ + { + "url": "http://example-a.com/page1", + "content": """Page A1First backlinkSecond backlink""" + }, + { + "url": "http://example-c.com", + "content": """Example C""" + } + ] +} + + +def test_spark_plugin_backlinks(sparksubmit): + tmp_dir = tempfile.mkdtemp() + try: + sparksubmit("spark/jobs/pipeline.py\ + --source corpus:{corpus}\ + --plugin plugins.backlinks.MostExternallyLinkedPages:domain=example-c.com,output={tmpdir}/out\ + ".format(corpus=pipes.quote(json.dumps(CORPUS)), + tmpdir=tmp_dir)) + parts = [os.path.join(tmp_dir, 'out', f) for f in os.listdir(tmp_dir + '/out/') if f.startswith("part-")] + assert len(parts) == 1 + with open(parts[0], 'r') as f: + data = set(f.read().strip().split('\n')) + assert data == set(["example-c.com/ 1 http://example-a.com/page1"]) + finally: + shutil.rmtree(tmp_dir) From b0da1d5b44cbf825b5e5ffcaaf9b65e015ee7ff2 Mon Sep 17 00:00:00 2001 From: Valentin Lehuger Date: Sun, 4 Dec 2016 14:47:47 +0100 Subject: [PATCH 2/2] emacs tmp files in gitignore --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 98f6139..92fc809 100644 --- a/.gitignore +++ b/.gitignore @@ -7,7 +7,7 @@ /.cache /src *.pyc -*.~ +*~ __pycache__ /coverage.xml /.coverage