|
| 1 | +/* |
| 2 | + * Licensed to the Apache Software Foundation (ASF) under one or more |
| 3 | + * contributor license agreements. See the NOTICE file distributed with |
| 4 | + * this work for additional information regarding copyright ownership. |
| 5 | + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| 6 | + * (the "License"); you may not use this file except in compliance with |
| 7 | + * the License. You may obtain a copy of the License at |
| 8 | + * |
| 9 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | + * |
| 11 | + * Unless required by applicable law or agreed to in writing, software |
| 12 | + * distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | + * See the License for the specific language governing permissions and |
| 15 | + * limitations under the License. |
| 16 | + */ |
| 17 | +package org.apache.nutch.crawl; |
| 18 | + |
| 19 | +import org.apache.hadoop.conf.Configuration; |
| 20 | +import org.apache.hadoop.fs.FileSystem; |
| 21 | +import org.apache.hadoop.fs.Path; |
| 22 | +import org.apache.hadoop.io.SequenceFile; |
| 23 | +import org.apache.hadoop.io.SequenceFile.Reader.Option; |
| 24 | +import org.apache.hadoop.io.Text; |
| 25 | +import org.apache.logging.log4j.core.config.Configurator; |
| 26 | +import org.junit.jupiter.api.AfterEach; |
| 27 | +import org.junit.jupiter.api.BeforeEach; |
| 28 | +import org.junit.jupiter.api.Test; |
| 29 | + |
| 30 | +import java.io.File; |
| 31 | +import java.io.IOException; |
| 32 | +import java.util.ArrayList; |
| 33 | +import java.util.Collections; |
| 34 | +import java.util.List; |
| 35 | + |
| 36 | +import static org.hamcrest.CoreMatchers.is; |
| 37 | +import static org.hamcrest.MatcherAssert.assertThat; |
| 38 | +import static org.junit.jupiter.api.Assertions.assertFalse; |
| 39 | +import static org.junit.jupiter.api.Assertions.assertTrue; |
| 40 | + |
| 41 | +/** |
| 42 | + * End-to-end test for {@link SitemapInjector}: points the injector at a local |
| 43 | + * {@code file://} sitemap, runs the full two-step MapReduce pipeline, then |
| 44 | + * reads the resulting CrawlDb and asserts the expected URLs (including |
| 45 | + * hreflang alternates) are present. |
| 46 | + */ |
| 47 | +public class TestSitemapInjector { |
| 48 | + |
| 49 | + private Configuration conf; |
| 50 | + private FileSystem fs; |
| 51 | + private final static Path testdir = new Path("build/test/sitemap-inject-test"); |
| 52 | + private Path crawldbPath; |
| 53 | + private Path urlPath; |
| 54 | + private String sitemapUrl; |
| 55 | + |
| 56 | + @BeforeEach |
| 57 | + public void setUp() throws Exception { |
| 58 | + Configurator.setLevel( |
| 59 | + "org.apache.hadoop.mapred", org.apache.logging.log4j.Level.DEBUG); |
| 60 | + |
| 61 | + conf = CrawlDBTestUtil.createContext().getConfiguration(); |
| 62 | + |
| 63 | + conf.set("plugin.folders", new File("build/plugins").getAbsolutePath()); |
| 64 | + conf.set("plugin.includes", |
| 65 | + "protocol-file|urlfilter-regex|index-basic"); |
| 66 | + conf.setInt("http.time.limit", 120); |
| 67 | + conf.setBoolean("mime.type.magic", false); |
| 68 | + conf.setBoolean("db.injector.sitemap.check-cross-submits", false); |
| 69 | + conf.setInt("mapreduce.mapper.multithreadedmapper.threads", 1); |
| 70 | + conf.set("http.filter.ipaddress.exclude", ""); |
| 71 | + |
| 72 | + conf.set("urlfilter.regex.rules", "+."); |
| 73 | +// conf.set("urlfilter.regex.file", |
| 74 | +// new File("src/testresources/regex-urlfilter-test.txt").getAbsoluteFile().toURI().toString()); |
| 75 | + |
| 76 | + urlPath = new Path(testdir, "urls"); |
| 77 | + crawldbPath = new Path(testdir, "crawldb"); |
| 78 | + fs = FileSystem.get(conf); |
| 79 | + fs.delete(testdir, true); |
| 80 | + } |
| 81 | + |
| 82 | + @AfterEach |
| 83 | + public void tearDown() throws IOException { |
| 84 | + fs.delete(testdir, true); |
| 85 | + } |
| 86 | + |
| 87 | + @Test |
| 88 | + public void injectsUrlsFromLocalSitemap1() throws Exception { |
| 89 | + sitemapUrl = resolveFixture("sitemaps/sitemap.example.1.xml"); |
| 90 | + |
| 91 | + List<String> seeds = new ArrayList<>(); |
| 92 | + seeds.add(sitemapUrl); |
| 93 | + CrawlDBTestUtil.generateSeedList(fs, urlPath, seeds); |
| 94 | + |
| 95 | + SitemapInjector sitemapInjector = new SitemapInjector(); |
| 96 | + sitemapInjector.setConf(conf); |
| 97 | + sitemapInjector.inject(crawldbPath, urlPath); |
| 98 | + |
| 99 | + List<String> injected = readCrawldb(); |
| 100 | + |
| 101 | + assertFalse(injected.isEmpty(), |
| 102 | + "SitemapInjector produced an empty CrawlDb"); |
| 103 | + |
| 104 | + // Primary <loc> URL from sitemap.example.1.xml |
| 105 | + assertTrue( |
| 106 | + injected.contains("https://example.com/sitemap.html"), |
| 107 | + "Primary <loc> URL missing from CrawlDb"); |
| 108 | + |
| 109 | + // hreflang alternate from the same <url> block - exercises the |
| 110 | + // sitemap-localized-links extraction path. |
| 111 | + assertTrue( |
| 112 | + injected.contains("https://example.com/tr/en/sitemap.html"), |
| 113 | + "hreflang alternate missing from CrawlDb (localized-links extraction failed)"); |
| 114 | + |
| 115 | + assertThat(injected.size(), is(6)); |
| 116 | + } |
| 117 | + |
| 118 | + |
| 119 | + @Test |
| 120 | + public void injectsUrlsFromLocalSitemap2() throws Exception { |
| 121 | + sitemapUrl = resolveFixture("sitemaps/sitemap.example.2.xml"); |
| 122 | + |
| 123 | + List<String> seeds = new ArrayList<>(); |
| 124 | + seeds.add(sitemapUrl); |
| 125 | + CrawlDBTestUtil.generateSeedList(fs, urlPath, seeds); |
| 126 | + |
| 127 | + SitemapInjector sitemapInjector = new SitemapInjector(); |
| 128 | + sitemapInjector.setConf(conf); |
| 129 | + sitemapInjector.inject(crawldbPath, urlPath); |
| 130 | + |
| 131 | + List<String> injected = readCrawldb(); |
| 132 | + |
| 133 | + assertFalse(injected.isEmpty(), |
| 134 | + "SitemapInjector produced an empty CrawlDb"); |
| 135 | + |
| 136 | + assertThat(injected.size(), is(3)); |
| 137 | + } |
| 138 | + |
| 139 | + /** |
| 140 | + * Resolve a fixture path either from {@code build/test/data} (where |
| 141 | + * {@code ant test-core} copies {@code src/testresources}) or directly from |
| 142 | + * {@code src/testresources} when running from an IDE. |
| 143 | + */ |
| 144 | + private String resolveFixture(String relative) { |
| 145 | + String testData = System.getProperty("test.build.data"); |
| 146 | + if (testData != null) { |
| 147 | + File f = new File(testData, relative); |
| 148 | + if (f.exists()) { |
| 149 | + return f.toURI().toString(); |
| 150 | + } |
| 151 | + } |
| 152 | + File src = new File("src/testresources", relative); |
| 153 | + if (!src.exists()) { |
| 154 | + src = new File("../src/testresources", relative); |
| 155 | + } |
| 156 | + return src.getAbsoluteFile().toURI().toString(); |
| 157 | + } |
| 158 | + |
| 159 | + private List<String> readCrawldb() throws IOException { |
| 160 | + Path dbfile = new Path(crawldbPath, |
| 161 | + CrawlDb.CURRENT_NAME + "/part-r-00000/data"); |
| 162 | + Option rFile = SequenceFile.Reader.file(dbfile); |
| 163 | + @SuppressWarnings("resource") |
| 164 | + SequenceFile.Reader reader = new SequenceFile.Reader(conf, rFile); |
| 165 | + List<String> read = new ArrayList<>(); |
| 166 | + try { |
| 167 | + while (true) { |
| 168 | + Text key = new Text(); |
| 169 | + CrawlDatum value = new CrawlDatum(); |
| 170 | + if (!reader.next(key, value)) { |
| 171 | + break; |
| 172 | + } |
| 173 | + read.add(key.toString()); |
| 174 | + } |
| 175 | + } finally { |
| 176 | + reader.close(); |
| 177 | + } |
| 178 | + Collections.sort(read); |
| 179 | + return read; |
| 180 | + } |
| 181 | +} |
0 commit comments