diff --git a/.gitignore b/.gitignore index 0ae84c8..92fc809 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ /.cache /src *.pyc +*~ __pycache__ /coverage.xml /.coverage diff --git a/plugins/backlinks.py b/plugins/backlinks.py index 9bb35b5..8003313 100644 --- a/plugins/backlinks.py +++ b/plugins/backlinks.py @@ -40,9 +40,9 @@ def hook_spark_pipeline_action(self, sc, sqlc, df, indexer): CONCAT( regexp_replace(url_to, "^http(s?)://", ""), " ", - COUNT(*), + SIZE(COLLECT_SET(url_from)), " ", - CONCAT_WS(" ", COLLECT_LIST(url_from)) + CONCAT_WS(" ", COLLECT_SET(url_from)) ) r FROM ( SELECT url url_from, EXPLODE(external_links.href) url_to diff --git a/tests/sparktests/test_plugin_backlinks.py b/tests/sparktests/test_plugin_backlinks.py new file mode 100644 index 0000000..fd9619c --- /dev/null +++ b/tests/sparktests/test_plugin_backlinks.py @@ -0,0 +1,40 @@ +# coding: utf-8 + +import tempfile +import os +import pipes +import pytest +import shutil +import ujson as json + + +CORPUS = { + "desc": "Simple test with to backlinks to one page", + "docs": [ + { + "url": "http://example-a.com/page1", + "content": """