diff --git a/pom.xml b/pom.xml
index 4c019aae..3cafb094 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,13 +1,13 @@
4.0.0
- org.netpreserve.commons
- webarchive-commons
+ org.commoncrawl
+ ia-web-commons
3.0.5-SNAPSHOT
jar
- webarchive-commons
- https://github.com/iipc/webarchive-commons
+ ia-web-commons
+ https://github.com/commoncrawl/ia-web-commons
Common web archive utility code
@@ -66,10 +66,11 @@
- org.json
- json
- 20260522
+ com.github.openjson
+ openjson
+ 1.0.13
+
org.htmlparser
htmlparser
@@ -97,7 +98,7 @@
org.apache.hadoop
hadoop-common
- 3.4.3
+ 3.3.6
true
@@ -110,7 +111,7 @@
org.apache.hadoop
hadoop-mapreduce-client-core
- 3.4.3
+ 3.3.6
true
@@ -139,6 +140,12 @@
3.20.0
+
+ org.jsoup
+ jsoup
+ 1.22.2
+
+
commons-io
commons-io
@@ -152,9 +159,9 @@
- org.apache.httpcomponents
- httpcore
- 4.4.16
+ org.apache.httpcomponents
+ httpcore
+ 4.4.16
@@ -169,6 +176,24 @@
${java.version}
+
+ maven-assembly-plugin
+ 3.8.0
+
+
+ jar-with-dependencies
+
+ webarchive-commons
+
+
+
+ package
+
+ single
+
+
+
+
org.apache.maven.plugins
maven-surefire-plugin
@@ -201,6 +226,11 @@
+
+ org.apache.maven.plugins
+ maven-surefire-plugin
+ 3.2.5
+
@@ -212,78 +242,4 @@
-
-
- release
-
-
-
- org.sonatype.central
- central-publishing-maven-plugin
- 0.10.0
- true
-
- central
- true
-
-
-
- org.apache.maven.plugins
- maven-release-plugin
- 3.3.1
-
-
- org.apache.maven.plugins
- maven-source-plugin
- 3.4.0
-
-
- attach-sources
-
- jar-no-fork
-
-
-
-
-
- org.apache.maven.plugins
- maven-javadoc-plugin
- 3.12.0
-
-
- attach-javadocs
-
- jar
-
-
-
-
-
- org.apache.maven.plugins
- maven-gpg-plugin
- 3.2.8
-
-
- sign-artifacts
- verify
-
- sign
-
-
-
-
-
-
-
-
- jdk9-plus
-
- [9,)
-
-
- 8
-
-
-
-
diff --git a/src/main/java/org/archive/extract/DumpingExtractorOutput.java b/src/main/java/org/archive/extract/DumpingExtractorOutput.java
index 1ccbf771..cf61ee74 100644
--- a/src/main/java/org/archive/extract/DumpingExtractorOutput.java
+++ b/src/main/java/org/archive/extract/DumpingExtractorOutput.java
@@ -8,7 +8,7 @@
import org.archive.resource.Resource;
import org.archive.util.StreamCopy;
-import org.json.JSONException;
+import com.github.openjson.JSONException;
import com.google.common.io.ByteStreams;
import com.google.common.io.CountingOutputStream;
diff --git a/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java b/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java
index 567b1cd8..97a69564 100644
--- a/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java
+++ b/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java
@@ -5,6 +5,7 @@
import java.util.logging.Logger;
import org.archive.format.arc.ARCConstants;
+import org.archive.format.json.SimpleJSONPathSpec;
import org.archive.format.warc.WARCConstants;
import org.archive.format.warc.WARCConstants.WARCRecordType;
import org.archive.resource.MetaData;
@@ -21,8 +22,8 @@
import org.archive.resource.warc.record.DNSResourceFactory;
import org.archive.resource.warc.record.WARCJSONMetaDataResourceFactory;
import org.archive.resource.warc.record.WARCMetaDataResourceFactory;
-import org.json.JSONException;
-import org.json.JSONObject;
+import com.github.openjson.JSONException;
+import com.github.openjson.JSONObject;
public class ExtractingResourceFactoryMapper implements ResourceFactoryMapper {
@@ -126,9 +127,20 @@ private boolean isHTTPARCResource(MetaData envelope) {
ARCConstants.URL_KEY, "http");
}
- private boolean isHTMLHttpResource(MetaData m) {
- String type = caseInsensitiveKeyScan(m,HTTP_HEADERS_LIST,
- "Content-Type");
+ private boolean isHTMLHttpResource(MetaData m, HTTPResponseResource r) {
+ SimpleJSONPathSpec warcIdentifiedPayloadType = new SimpleJSONPathSpec(
+ "Envelope.WARC-Header-Metadata.WARC-Identified-Payload-Type");
+ String type = WARCMetadataRecordExtractorOutput
+ .unwrapFirst(warcIdentifiedPayloadType.extract(m.getTopMetaData()), null);
+ if (type != null) {
+ switch (type) {
+ case "text/html":
+ case "application/xhtml+xml":
+ return true;
+ }
+ return false;
+ }
+ type = caseInsensitiveKeyScan(m, HTTP_HEADERS_LIST, "Content-Type");
return type == null ? false : type.toLowerCase(Locale.ROOT).contains("html");
}
@@ -169,8 +181,8 @@ private boolean isDNSResponseWARCResource(MetaData envelope) {
WARCConstants.CONTENT_TYPE,PAYLOAD_TYPE_DNS);
}
+ @Override
public ResourceFactory mapResourceToFactory(Resource resource) {
-
if(resource instanceof WARCResource) {
WARCResource wr = (WARCResource) resource;
MetaData envelope = wr.getEnvelopeMetaData();
@@ -209,7 +221,7 @@ public ResourceFactory mapResourceToFactory(Resource resource) {
}
} else if(resource instanceof HTTPResponseResource) {
- if(isHTMLHttpResource(resource.getMetaData())) {
+ if(isHTMLHttpResource(resource.getMetaData(), (HTTPResponseResource) resource)) {
return htmlF;
} else {
// TODO: more formats...
diff --git a/src/main/java/org/archive/extract/ExtractingResourceProducer.java b/src/main/java/org/archive/extract/ExtractingResourceProducer.java
index 07cdb88a..93a5c3f3 100644
--- a/src/main/java/org/archive/extract/ExtractingResourceProducer.java
+++ b/src/main/java/org/archive/extract/ExtractingResourceProducer.java
@@ -23,6 +23,7 @@ public ExtractingResourceProducer(ResourceProducer producer,
this.mapper = mapper;
}
+ @Override
public Resource getNext() throws ResourceParseException, IOException {
Resource current = producer.getNext();
if(current == null) {
@@ -33,8 +34,8 @@ public Resource getNext() throws ResourceParseException, IOException {
if(f == null) {
return current;
}
- if(LOG.isLoggable(Level.INFO)) {
- LOG.info(String.format(Locale.ROOT, "Extracting (%s) with (%s)\n",
+ if(LOG.isLoggable(Level.FINE)) {
+ LOG.fine(String.format(Locale.ROOT, "Extracting (%s) with (%s)\n",
current.getClass().toString(),
f.getClass().toString()));
}
@@ -43,10 +44,12 @@ public Resource getNext() throws ResourceParseException, IOException {
}
}
+ @Override
public void close() throws IOException {
producer.close();
}
+ @Override
public String getContext() {
return producer.getContext();
}
diff --git a/src/main/java/org/archive/extract/ProducerUtils.java b/src/main/java/org/archive/extract/ProducerUtils.java
index b75d2f15..d8db9630 100644
--- a/src/main/java/org/archive/extract/ProducerUtils.java
+++ b/src/main/java/org/archive/extract/ProducerUtils.java
@@ -29,7 +29,7 @@ public static ResourceProducer getProducer(String path, long offset) throws IOEx
wf.setStrict(STRICT_GZ);
File file = new File(path);
- if(path.startsWith("hdfs://")) {
+ if(path.startsWith("hdfs://") || path.startsWith("s3a://")) {
String name = file.getName();
Path fsPath = new Path(path);
FileSystem fs = fsPath.getFileSystem(new Configuration());
@@ -65,7 +65,15 @@ public static ResourceProducer getProducer(String path, long offset) throws IOEx
} else {
- if(!(file.exists() && file.canRead())) {
+ if(path.startsWith("file:/")) {
+ file = new File(new URL(path).getPath());
+ }
+
+ if(!file.exists()) {
+ System.err.println(path + ": file not found.");
+ return null;
+ }
+ if(!file.canRead()) {
System.err.println(path + " is not a readable file.");
return null;
}
diff --git a/src/main/java/org/archive/extract/RealCDXExtractorOutput.java b/src/main/java/org/archive/extract/RealCDXExtractorOutput.java
index ff0b9e83..30411f49 100644
--- a/src/main/java/org/archive/extract/RealCDXExtractorOutput.java
+++ b/src/main/java/org/archive/extract/RealCDXExtractorOutput.java
@@ -22,9 +22,9 @@
import org.archive.url.WaybackURLKeyMaker;
import org.archive.util.IAUtils;
import org.archive.util.StreamCopy;
-import org.json.JSONArray;
-import org.json.JSONException;
-import org.json.JSONObject;
+import com.github.openjson.JSONArray;
+import com.github.openjson.JSONException;
+import com.github.openjson.JSONObject;
import com.google.common.io.ByteStreams;
import com.google.common.io.CountingOutputStream;
diff --git a/src/main/java/org/archive/extract/ResourceExtractor.java b/src/main/java/org/archive/extract/ResourceExtractor.java
index d9b9f396..11ae78f6 100644
--- a/src/main/java/org/archive/extract/ResourceExtractor.java
+++ b/src/main/java/org/archive/extract/ResourceExtractor.java
@@ -54,6 +54,7 @@ private static int USAGE(int exitCode) {
System.err.println("\t\t\t (note that column 1 is NOT standard Wayback canonicalized)\n");
System.err.println("\t\t-wat\tembed JSON output in a compressed WARC" +
"wrapper, for storage, or sharing.");
+ System.err.println("\t\t-wet\tembed text extracts in a compressed WARC wrapper, for storage, or sharing.");
return exitCode;
}
@@ -109,6 +110,9 @@ public int run(String[] args)
} else if(args[arg].equals("-wat")) {
path = args[arg+1];
out = new WATExtractorOutput(os, outputFile);
+ } else if (args[arg].equals("-wet")) {
+ path = args[arg + 1];
+ out = new WETExtractorOutput(os);
} else {
String filter = args[arg+1];
out = new JSONViewExtractorOutput(os, filter);
diff --git a/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java b/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java
index b1050a14..438c11c4 100644
--- a/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java
+++ b/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java
@@ -12,8 +12,8 @@
import org.archive.resource.MetaData;
import org.archive.resource.Resource;
import org.archive.util.StreamCopy;
-import org.json.JSONArray;
-import org.json.JSONObject;
+import com.github.openjson.JSONArray;
+import com.github.openjson.JSONObject;
import com.google.common.io.ByteStreams;
import com.google.common.io.CountingOutputStream;
@@ -40,6 +40,7 @@ public WARCMetadataRecordExtractorOutput(PrintWriter out) {
this(out,"outlinks");
}
+ @Override
public void output(Resource resource) throws IOException {
OutputStream nullo = ByteStreams.nullOutputStream();
CountingOutputStream co = new CountingOutputStream(nullo);
@@ -124,7 +125,7 @@ private String getWARCMetadataRecord(MetaData m) {
return unwrapFirst(warcMetadataRecord.extract(m),"-");
}
- private String unwrapFirst(List> l, String defaultValue) {
+ public static String unwrapFirst(List> l, String defaultValue) {
if(l != null) {
if(l.size() > 0) {
if(l.get(0) != null) {
diff --git a/src/main/java/org/archive/extract/WATExtractorOutput.java b/src/main/java/org/archive/extract/WATExtractorOutput.java
index f695796f..11b571bf 100644
--- a/src/main/java/org/archive/extract/WATExtractorOutput.java
+++ b/src/main/java/org/archive/extract/WATExtractorOutput.java
@@ -21,11 +21,10 @@
import org.archive.util.DateUtils;
import org.archive.util.StreamCopy;
import org.archive.util.io.CommitedOutputStream;
-import org.json.JSONException;
+
+import com.github.openjson.JSONObject;
import java.net.InetAddress;
-import java.text.DateFormat;
-import java.text.SimpleDateFormat;
import java.util.logging.Logger;
@@ -41,6 +40,10 @@ public class WATExtractorOutput implements ExtractorOutput {
private static final Logger LOG = Logger.getLogger(WATExtractorOutput.class.getName());
+ public WATExtractorOutput(OutputStream out) {
+ this(out, null);
+ }
+
public WATExtractorOutput(OutputStream out, String outputFile) {
gzW = new GZIPMemberWriter(out);
recW = new WARCRecordWriter();
@@ -52,6 +55,7 @@ private CommitedOutputStream getOutput() {
return new GZIPMemberWriterCommittedOutputStream(gzW,bufferRAM);
}
+ @Override
public void output(Resource resource) throws IOException {
StreamCopy.readToEOF(resource.getInputStream());
MetaData top = resource.getMetaData().getTopMetaData();
@@ -67,6 +71,15 @@ public void output(Resource resource) throws IOException {
// hrm...
throw new IOException("Missing Envelope.Format");
}
+
+ // temporarily remove the text extract (if any) so it is not written to the WAT output
+ String textExtract = null;
+ JSONObject htmlMeta = JSONUtils.extractObject(top, "Envelope.Payload-Metadata.HTTP-Response-Metadata.HTML-Metadata");
+ if (htmlMeta != null && htmlMeta.has("Text")) {
+ textExtract = htmlMeta.getString("Text");
+ htmlMeta.remove("Text");
+ }
+
cos = getOutput();
if(envelopeFormat.startsWith("ARC")) {
writeARC(cos,top);
@@ -77,6 +90,11 @@ public void output(Resource resource) throws IOException {
throw new IOException("Unknown Envelope.Format");
}
cos.commit();
+
+ // restore text extract
+ if (textExtract != null) {
+ htmlMeta.put("Text", textExtract);
+ }
}
private void writeWARCInfo(OutputStream recOut, MetaData md) throws IOException {
@@ -100,8 +118,8 @@ private void writeWARCInfo(OutputStream recOut, MetaData md) throws IOException
File tmpFile = new File(filename);
filename = tmpFile.getName();
HttpHeaders headers = new HttpHeaders();
- headers.add("software", IAUtils.COMMONS_VERSION);
- headers.addDateHeader("extractedDate", new Date());
+ headers.add("Software-Info", IAUtils.COMMONS_VERSION);
+ headers.addDateHeader("Extracted-Date", new Date());
// add ip, hostname
try {
@@ -159,27 +177,29 @@ private void writeWARC(OutputStream recOut, MetaData md) throws IOException {
targetURI = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Target-URI");
}
// handle date of generation in WARC format
- DateFormat dateFormat = new SimpleDateFormat("yyyyMMddHHmmss", Locale.ROOT);
- String capDateString = dateFormat.format(new Date());
+ Date date = new Date();
String recId = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Record-ID");
- writeWARCMDRecord(recOut,md,targetURI,capDateString,recId);
+ writeWARCMDRecord(recOut,md,targetURI,date,recId);
}
private void writeWARCMDRecord(OutputStream recOut, MetaData md,
- String targetURI, String capDateString, String recId)
+ String targetURI, Date capDate, String recId)
throws IOException {
- ByteArrayOutputStream bos = new ByteArrayOutputStream();
+ ByteArrayOutputStream bos = new ByteArrayOutputStream();
OutputStreamWriter osw = new OutputStreamWriter(bos, UTF_8);
- try {
- md.write(osw);
- } catch (JSONException e1) {
- e1.printStackTrace();
- throw new IOException(e1);
- }
+ String contents = md.toString();
+ osw.write(contents, 0, contents.length());
osw.flush();
-// ByteArrayInputStream bais = new ByteArrayInputStream(md.toString().getBytes(UTF_8));
+
+ recW.writeJSONMetadataRecord(recOut, bos.toByteArray(),
+ targetURI, capDate, recId);
+ }
+
+ private void writeWARCMDRecord(OutputStream recOut, MetaData md,
+ String targetURI, String capDateString, String recId)
+ throws IOException {
Date capDate;
try {
capDate = DateUtils.getSecondsSinceEpoch(capDateString);
@@ -189,22 +209,7 @@ private void writeWARCMDRecord(OutputStream recOut, MetaData md,
// TODO... not the write thing...
capDate = new Date();
}
-
- recW.writeJSONMetadataRecord(recOut, bos.toByteArray(),
- targetURI, capDate, recId);
+ writeWARCMDRecord(recOut, md, targetURI, capDate, recId);
}
- private static String transformWARCDate(final String input) {
-
- StringBuilder output = new StringBuilder(14);
-
- output.append(input.substring(0,4));
- output.append(input.substring(5,7));
- output.append(input.substring(8,10));
- output.append(input.substring(11,13));
- output.append(input.substring(14,16));
- output.append(input.substring(17,19));
-
- return output.toString();
- }
}
diff --git a/src/main/java/org/archive/extract/WETExtractorOutput.java b/src/main/java/org/archive/extract/WETExtractorOutput.java
new file mode 100644
index 00000000..b126096f
--- /dev/null
+++ b/src/main/java/org/archive/extract/WETExtractorOutput.java
@@ -0,0 +1,190 @@
+package org.archive.extract;
+
+import org.archive.format.gzip.GZIPMemberWriter;
+import org.archive.format.gzip.GZIPMemberWriterCommittedOutputStream;
+import org.archive.format.http.HttpHeaders;
+import org.archive.format.json.JSONUtils;
+import org.archive.format.warc.WARCRecordWriter;
+import org.archive.resource.MetaData;
+import org.archive.resource.Resource;
+import org.archive.util.IAUtils;
+import org.archive.util.StreamCopy;
+import org.archive.util.io.CommitedOutputStream;
+
+import com.github.openjson.JSONObject;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.nio.charset.StandardCharsets;
+import java.time.Instant;
+import java.util.Date;
+import java.util.Map;
+import java.util.TreeMap;
+
+/**
+ * {@link ExtractorOutput} that generates a WARC Encapsulated Text (WET) file.
+ *
+ * <p>The output starts with a single {@code warcinfo} record. After that, each
+ * {@code response} record whose metadata carries an
+ * {@code Envelope.Payload-Metadata.HTTP-Response-Metadata.HTML-Metadata.Text}
+ * field is written as a WARC {@code conversion} record holding that text; all
+ * other records are skipped.
+ */
+public class WETExtractorOutput implements ExtractorOutput {
+    private static final int DEFAULT_BUFFER_RAM = 1024 * 1024;
+
+    WARCRecordWriter recW;
+    private final GZIPMemberWriter gzW;
+    private final int bufferRAM = DEFAULT_BUFFER_RAM;
+    private final String outFilename;
+    /** Set once the leading warcinfo record has been written. */
+    private boolean wroteFirst;
+
+    /**
+     * Creates an output that takes the warcinfo filename from the
+     * {@code Container.Filename} field of the first resource.
+     *
+     * @param out stream receiving the compressed WET records
+     */
+    public WETExtractorOutput(OutputStream out) {
+        this(out, null);
+    }
+
+    /**
+     * Creates an output with an explicit warcinfo filename.
+     *
+     * @param out stream receiving the compressed WET records
+     * @param filename name recorded in the warcinfo record, or {@code null}
+     *        to fall back to {@code Container.Filename} of the first resource
+     */
+    public WETExtractorOutput(OutputStream out, String filename) {
+        gzW = new GZIPMemberWriter(out);
+        recW = new WARCRecordWriter();
+        wroteFirst = false;
+        outFilename = filename;
+    }
+
+    private CommitedOutputStream getOutput() {
+        return new GZIPMemberWriterCommittedOutputStream(gzW, bufferRAM);
+    }
+
+    /**
+     * Writes the text extract of {@code resource}, if it has one, as a
+     * conversion record. The first call also emits the warcinfo record.
+     *
+     * @param resource resource whose input stream is drained before its
+     *        metadata is read
+     * @throws IOException if {@code Envelope.Format} is missing, if a text
+     *         extract belongs to a non-WARC envelope, or if writing fails
+     */
+    @Override
+    public void output(Resource resource) throws IOException {
+        StreamCopy.readToEOF(resource.getInputStream());
+        MetaData top = resource.getMetaData().getTopMetaData();
+
+        if (!wroteFirst) {
+            CommitedOutputStream infoOut = getOutput();
+            writeWARCInfo(infoOut, top);
+            infoOut.commit();
+            wroteFirst = true;
+        }
+        String envelopeFormat = JSONUtils.extractSingle(top, "Envelope.Format");
+        if (envelopeFormat == null) {
+            throw new IOException("Missing Envelope.Format");
+        }
+
+        String warctype = JSONUtils.extractSingle(top, "Envelope.WARC-Header-Metadata.WARC-Type");
+        if (!"response".equals(warctype)) {
+            // only response records are converted
+            return;
+        }
+
+        String textExtract = JSONUtils.extractSingle(top,
+                "Envelope.Payload-Metadata.HTTP-Response-Metadata.HTML-Metadata.Text");
+        if (textExtract == null) {
+            return;
+        }
+        // Check the format before opening an output stream so that a failure
+        // does not leave an uncommitted stream behind.
+        if (!envelopeFormat.startsWith("WARC")) {
+            throw new IOException("Unknown Envelope.Format");
+        }
+        CommitedOutputStream cos = getOutput();
+        writeWARC(cos, top, textExtract);
+        cos.commit();
+    }
+
+    /**
+     * Writes the warcinfo record that opens the file. When {@code md}
+     * itself describes a warcinfo record, a fixed set of its
+     * WARC-Info-Metadata fields is copied into the new record.
+     */
+    private void writeWARCInfo(OutputStream recOut, MetaData md) throws IOException {
+        String filename = outFilename;
+        if (filename == null) {
+            filename = JSONUtils.extractSingle(md, "Container.Filename");
+            if (filename == null) {
+                throw new IOException("No Container.Filename...");
+            }
+        }
+
+        HttpHeaders headers = new HttpHeaders();
+        headers.add("Software-Info", IAUtils.COMMONS_VERSION);
+        headers.addDateHeader("Extracted-Date", new Date());
+
+        // Dup out some useful headers from the incoming warcinfo
+        String warctype = JSONUtils.extractSingle(md, "Envelope.WARC-Header-Metadata.WARC-Type");
+        if ("warcinfo".equals(warctype)) {
+            final String[] usefulHeaders = {"robots", "isPartOf", "operator", "description", "publisher"};
+            for (String header : usefulHeaders) {
+                String value = JSONUtils.extractSingle(md, "Envelope.Payload-Metadata.WARC-Info-Metadata." + header);
+                if (value != null) {
+                    headers.add(header, value);
+                }
+            }
+        }
+
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        headers.write(baos);
+        recW.writeWARCInfoRecord(recOut, filename, baos.toByteArray());
+    }
+
+    /**
+     * Writes {@code textExtract} as a conversion record, taking target URI,
+     * date and original record id from the WARC headers in {@code md}.
+     */
+    private void writeWARC(OutputStream recOut, MetaData md, String textExtract) throws IOException {
+        JSONObject headers = JSONUtils.extractObject(md, "Envelope.WARC-Header-Metadata");
+        if (headers == null) {
+            throw new IOException("Missing Envelope.WARC-Header-Metadata");
+        }
+        String targetURI = headers.getString("WARC-Target-URI");
+        String capDateString = headers.getString("WARC-Date");
+        String recId = headers.getString("WARC-Record-ID");
+        Map<String, String> addHeaders = null;
+        if (headers.has("WARC-Identified-Content-Language")) {
+            addHeaders = new TreeMap<>();
+            addHeaders.put("WARC-Identified-Content-Language",
+                    headers.getString("WARC-Identified-Content-Language"));
+        }
+        writeWARCMDRecord(recOut, targetURI, parseWarcDate(capDateString), recId, textExtract, addHeaders);
+    }
+
+    /**
+     * Parses a {@code WARC-Date} value given in ISO-8601 instant form, e.g.
+     * {@code 2020-01-01T00:00:00Z}.
+     *
+     * @throws java.time.format.DateTimeParseException if the text is not a valid instant
+     */
+    private static Date parseWarcDate(String capDateString) {
+        return Date.from(Instant.parse(capDateString));
+    }
+
+    private void writeWARCMDRecord(OutputStream recOut, String targetURI, Date capDate, String recId,
+            String textExtract, Map<String, String> addHeaders)
+            throws IOException {
+        recW.writeTextConversionRecord(recOut, textExtract.getBytes(StandardCharsets.UTF_8),
+                targetURI, capDate, recId, addHeaders);
+    }
+}
diff --git a/src/main/java/org/archive/format/arc/FiledescRecord.java b/src/main/java/org/archive/format/arc/FiledescRecord.java
index 9af3d461..dc43765b 100644
--- a/src/main/java/org/archive/format/arc/FiledescRecord.java
+++ b/src/main/java/org/archive/format/arc/FiledescRecord.java
@@ -2,9 +2,9 @@
import java.util.logging.Logger;
-import org.json.JSONArray;
-import org.json.JSONException;
-import org.json.JSONObject;
+import com.github.openjson.JSONArray;
+import com.github.openjson.JSONException;
+import com.github.openjson.JSONObject;
public class FiledescRecord {
private static final Logger LOG =
@@ -87,4 +87,4 @@ public String getFormat() {
public void setFormat(String format) {
this.format = format;
}
-}
\ No newline at end of file
+}
diff --git a/src/main/java/org/archive/format/gzip/GZIPMemberSeries.java b/src/main/java/org/archive/format/gzip/GZIPMemberSeries.java
index 154cf5f1..74dd8b9d 100644
--- a/src/main/java/org/archive/format/gzip/GZIPMemberSeries.java
+++ b/src/main/java/org/archive/format/gzip/GZIPMemberSeries.java
@@ -172,7 +172,7 @@ public GZIPSeriesMember getNextMember() throws GZIPFormatException, IOException
throw new IOException("getNextMember() on IOException Stream at "
+ currentMemberStartOffset + " in " + streamContext);
}
- LOG.info("getNextMember");
+ LOG.fine("getNextMember");
if(gotEOF) {
LOG.info("getNextMember-ATEOF");
@@ -209,9 +209,9 @@ public GZIPSeriesMember getNextMember() throws GZIPFormatException, IOException
while(currentMember == null) {
// scan ahead for another record start:
long amtSkipped = decoder.alignOnMagic3(this);
- if(LOG.isLoggable(Level.INFO)) {
+ if(LOG.isLoggable(Level.FINE)) {
- LOG.info("AlignedResult:" + amtSkipped);
+ LOG.fine("AlignedResult:" + amtSkipped);
}
if(amtSkipped < 0) {
gotEOF = true;
@@ -257,7 +257,7 @@ public GZIPSeriesMember getNextMember() throws GZIPFormatException, IOException
try {
currentMemberStartOffset = offset - 3;
header = decoder.parseHeader(this, true);
- LOG.info("Read next GZip header...");
+ LOG.fine("Read next GZip header...");
currentMember = new GZIPSeriesMember(this,header);
state = STATE_DEFLATING;
@@ -291,8 +291,8 @@ public int read(byte[] b) throws IOException {
public int read(byte[] b, int off, int len) throws IOException {
int amtWritten = 0;
- if(LOG.isLoggable(Level.INFO)) {
- LOG.info("read("+len+" bytes) bufferSize("+bufferSize+")");
+ if(LOG.isLoggable(Level.FINE)) {
+ LOG.fine("read("+len+" bytes) bufferSize("+bufferSize+")");
}
while(len > 0) {
if(bufferSize > 0) {
@@ -341,8 +341,8 @@ public void returnBytes(int bytes) {
if((bytes > bufferPos) || (bytes < 0)) {
throw new IndexOutOfBoundsException();
}
- if(LOG.isLoggable(Level.INFO)) {
- LOG.info("Returned ("+bytes+")bytes");
+ if(LOG.isLoggable(Level.FINE)) {
+ LOG.fine("Returned ("+bytes+")bytes");
}
bufferPos -= bytes;
bufferSize += bytes;
diff --git a/src/main/java/org/archive/format/http/HttpRequestMessageParser.java b/src/main/java/org/archive/format/http/HttpRequestMessageParser.java
index 759bbe5d..78a30641 100644
--- a/src/main/java/org/archive/format/http/HttpRequestMessageParser.java
+++ b/src/main/java/org/archive/format/http/HttpRequestMessageParser.java
@@ -182,9 +182,6 @@ public int parseLax(byte buf[], int len, HttpRequestMessageObserver obs)
}
while(buf[idx] == SP) {
idx++;
- if(idx >= len) {
- throw new HttpParseException("No spaces in message");
- }
}
vs = idx;
while(idx < len) {
diff --git a/src/main/java/org/archive/format/json/CompoundORJSONPathSpec.java b/src/main/java/org/archive/format/json/CompoundORJSONPathSpec.java
index b99e4f23..aa6911e4 100644
--- a/src/main/java/org/archive/format/json/CompoundORJSONPathSpec.java
+++ b/src/main/java/org/archive/format/json/CompoundORJSONPathSpec.java
@@ -3,7 +3,7 @@
import java.util.ArrayList;
import java.util.List;
-import org.json.JSONObject;
+import com.github.openjson.JSONObject;
public class CompoundORJSONPathSpec implements JSONPathSpec {
ArrayList parts;
diff --git a/src/main/java/org/archive/format/json/JSONPathSpec.java b/src/main/java/org/archive/format/json/JSONPathSpec.java
index 68adf0bd..f78eaaff 100644
--- a/src/main/java/org/archive/format/json/JSONPathSpec.java
+++ b/src/main/java/org/archive/format/json/JSONPathSpec.java
@@ -2,7 +2,7 @@
import java.util.List;
-import org.json.JSONObject;
+import com.github.openjson.JSONObject;
public interface JSONPathSpec {
public static final String EMPTY = "";
diff --git a/src/main/java/org/archive/format/json/JSONUtils.java b/src/main/java/org/archive/format/json/JSONUtils.java
index 28f4f43e..6fff07bb 100644
--- a/src/main/java/org/archive/format/json/JSONUtils.java
+++ b/src/main/java/org/archive/format/json/JSONUtils.java
@@ -4,9 +4,9 @@
import java.util.List;
import java.util.logging.Logger;
-import org.json.JSONArray;
-import org.json.JSONException;
-import org.json.JSONObject;
+import com.github.openjson.JSONArray;
+import com.github.openjson.JSONException;
+import com.github.openjson.JSONObject;
public class JSONUtils {
private static final Logger LOG =
diff --git a/src/main/java/org/archive/format/json/JSONView.java b/src/main/java/org/archive/format/json/JSONView.java
index 444ea7e6..9aec108e 100644
--- a/src/main/java/org/archive/format/json/JSONView.java
+++ b/src/main/java/org/archive/format/json/JSONView.java
@@ -7,7 +7,7 @@
import java.util.logging.Logger;
import org.apache.commons.lang3.StringUtils;
-import org.json.JSONObject;
+import com.github.openjson.JSONObject;
/**
*
diff --git a/src/main/java/org/archive/format/json/SimpleJSONPathSpec.java b/src/main/java/org/archive/format/json/SimpleJSONPathSpec.java
index c0b1a8d6..f114d30c 100644
--- a/src/main/java/org/archive/format/json/SimpleJSONPathSpec.java
+++ b/src/main/java/org/archive/format/json/SimpleJSONPathSpec.java
@@ -4,9 +4,9 @@
import java.util.List;
import java.util.logging.Logger;
-import org.json.JSONArray;
-import org.json.JSONException;
-import org.json.JSONObject;
+import com.github.openjson.JSONArray;
+import com.github.openjson.JSONException;
+import com.github.openjson.JSONObject;
public class SimpleJSONPathSpec implements JSONPathSpec {
private static final Logger LOG =
diff --git a/src/main/java/org/archive/format/text/html/CDATALexer.java b/src/main/java/org/archive/format/text/html/CDATALexer.java
index 850aebf0..04919f94 100644
--- a/src/main/java/org/archive/format/text/html/CDATALexer.java
+++ b/src/main/java/org/archive/format/text/html/CDATALexer.java
@@ -1,37 +1,96 @@
package org.archive.format.text.html;
import org.htmlparser.Node;
+import org.htmlparser.Text;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.util.ParserException;
+import static org.archive.format.text.html.NodeUtils.SCRIPT_TAG_NAME;
+import static org.archive.format.text.html.NodeUtils.STYLE_TAG_NAME;
+
public class CDATALexer extends Lexer {
private static final long serialVersionUID = -8513653556979405106L;
private Node cached;
- private boolean inCSS;
private boolean inJS;
- private boolean cachedJS = false;
+ private boolean inCSS;
+
+ private static enum STATE { DEFAULT, START_JS, START_CSS };
+ private STATE state = STATE.DEFAULT;
+
+ private int start = -1;
+ private int end = -1;
@Override
public Node nextNode() throws ParserException {
- inJS = false;
- inCSS = false;
- if(cached != null) {
+ if (cached != null) {
+ inJS = inCSS = false;
Node tmp = cached;
cached = null;
- inJS = cachedJS;
- inCSS = !cachedJS;
return tmp;
}
- Node got = super.nextNode();
- if(NodeUtils.isNonEmptyOpenTagNodeNamed(got, "SCRIPT")) {
- cached = super.parseCDATA(true);
- cachedJS = true;
- } else if (NodeUtils.isNonEmptyOpenTagNodeNamed(got, "STYLE")) {
- cached = super.parseCDATA(true);
- cachedJS = false;
+ Node got = null;
+ switch (state) {
+ case START_JS:
+ got = super.parseCDATA(false);
+ if (got != null) {
+ inJS = true;
+ }
+ break;
+ case START_CSS:
+ got = super.parseCDATA(false);
+ if (got != null) {
+ inCSS = true;
+ }
+ break;
+ default:
+ break;
+ }
+ if (got != null) {
+ Text t = (Text) got;
+ start = t.getStartPosition();
+ end = t.getEndPosition();
+ while ((t = (Text) super.parseCDATA(false)) != null) {
+ end = t.getEndPosition();
+ }
+ while ((got = super.nextNode()) != null) {
+ if (inJS) {
+ if (NodeUtils.isCloseTagNodeNamed(got, SCRIPT_TAG_NAME)) {
+ cached = got;
+ state = STATE.DEFAULT;
+ return createStringNode(getPage(), start, end);
+ } else {
+ end = got.getEndPosition();
+ }
+ } else if (inCSS) {
+ if (NodeUtils.isCloseTagNodeNamed(got, STYLE_TAG_NAME)) {
+ cached = got;
+ state = STATE.DEFAULT;
+ return createStringNode(getPage(), start, end);
+ } else {
+ end = got.getEndPosition();
+ }
+ }
+ }
+ t = createStringNode(getPage(), start, end);
+ state = STATE.DEFAULT;
+ start = end = -1;
+ return t;
+ }
+ got = super.nextNode();
+ if (NodeUtils.isNonEmptyOpenTagNodeNamed(got, SCRIPT_TAG_NAME)) {
+ state = STATE.START_JS;
+ } else if (NodeUtils.isNonEmptyOpenTagNodeNamed(got, STYLE_TAG_NAME)) {
+ state = STATE.START_CSS;
+ } else if (NodeUtils.isCloseTagNodeNamed(got, SCRIPT_TAG_NAME)) {
+ state = STATE.DEFAULT;
+ inJS = false;
+ } else if (NodeUtils.isCloseTagNodeNamed(got, STYLE_TAG_NAME)) {
+ state = STATE.DEFAULT;
+ inCSS = false;
}
return got;
}
+
public boolean inJS() {
return inJS;
}
diff --git a/src/main/java/org/archive/format/warc/WARCRecordWriter.java b/src/main/java/org/archive/format/warc/WARCRecordWriter.java
index 3278b289..02e6700e 100644
--- a/src/main/java/org/archive/format/warc/WARCRecordWriter.java
+++ b/src/main/java/org/archive/format/warc/WARCRecordWriter.java
@@ -2,18 +2,30 @@
import java.io.IOException;
import java.io.OutputStream;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
import java.util.Date;
+import java.util.Map;
import java.util.UUID;
import org.archive.format.http.HttpConstants;
import org.archive.format.http.HttpHeaders;
+import org.archive.util.Base32;
import org.archive.util.DateUtils;
-public class WARCRecordWriter implements WARCConstants, HttpConstants
-{
- private static final String SCHEME = "urn:uuid";
- private static final String SCHEME_COLON = SCHEME + ":";
-
+public class WARCRecordWriter implements WARCConstants, HttpConstants {
+ private static final String SCHEME = "urn:uuid";
+ private static final String SCHEME_COLON = SCHEME + ":";
+ private MessageDigest sha1;
+
+ public WARCRecordWriter() {
+ try {
+ sha1 = MessageDigest.getInstance("SHA1");
+ } catch (NoSuchAlgorithmException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
/**
* Write the headers and contents as a WARC record to the given
* output stream.
@@ -100,6 +112,34 @@ public void writeJSONMetadataRecord( OutputStream out,
writeRecord(out, headers, contents);
}
+ public void writeTextConversionRecord(OutputStream out,
+ byte[] contents,
+ String targetURI,
+ Date originalDate,
+ String origRecordId,
+ Map addHeaders) throws IOException
+ {
+ HttpHeaders headers = new HttpHeaders();
+ headers.add(HEADER_KEY_TYPE, WARCRecordType.conversion.name());
+ headers.add(HEADER_KEY_URI, targetURI);
+ headers.add(HEADER_KEY_DATE, DateUtils.getLog14Date(originalDate));
+ headers.add(HEADER_KEY_ID, makeRecordId());
+ headers.add(HEADER_KEY_REFERS_TO, origRecordId);
+ headers.add(HEADER_KEY_BLOCK_DIGEST, contentHash(contents));
+ if (addHeaders != null) {
+ for (Map.Entry e : addHeaders.entrySet()) {
+ headers.add(e.getKey(), e.getValue());
+ }
+ }
+ headers.add(CONTENT_TYPE, "text/plain");
+ writeRecord(out, headers, contents);
+ }
+
+ private String contentHash(byte[] content) {
+ sha1.reset();
+ return "sha1:" + Base32.encode(sha1.digest(content));
+ }
+
private String makeRecordId()
{
StringBuilder recID = new StringBuilder();
diff --git a/src/main/java/org/archive/hadoop/ArchiveJSONViewLoader.java b/src/main/java/org/archive/hadoop/ArchiveJSONViewLoader.java
index d31e31c9..a3433004 100644
--- a/src/main/java/org/archive/hadoop/ArchiveJSONViewLoader.java
+++ b/src/main/java/org/archive/hadoop/ArchiveJSONViewLoader.java
@@ -11,8 +11,8 @@
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.archive.format.json.JSONView;
-import org.json.JSONException;
-import org.json.JSONObject;
+import com.github.openjson.JSONException;
+import com.github.openjson.JSONObject;
public class ArchiveJSONViewLoader extends ArchiveMetadataLoader {
private final static Logger LOG =
diff --git a/src/main/java/org/archive/hadoop/func/JSONViewEvalFunc.java b/src/main/java/org/archive/hadoop/func/JSONViewEvalFunc.java
index 8d4446b5..bc390ff6 100644
--- a/src/main/java/org/archive/hadoop/func/JSONViewEvalFunc.java
+++ b/src/main/java/org/archive/hadoop/func/JSONViewEvalFunc.java
@@ -8,8 +8,8 @@
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.archive.format.json.JSONUtils;
-import org.json.JSONException;
-import org.json.JSONObject;
+import com.github.openjson.JSONException;
+import com.github.openjson.JSONObject;
public class JSONViewEvalFunc extends EvalFunc {
private static final Logger LOG =
diff --git a/src/main/java/org/archive/io/WriterPool.java b/src/main/java/org/archive/io/WriterPool.java
index 79da16c0..980cd029 100644
--- a/src/main/java/org/archive/io/WriterPool.java
+++ b/src/main/java/org/archive/io/WriterPool.java
@@ -30,10 +30,10 @@
import java.util.logging.Level;
import java.util.logging.Logger;
+import com.github.openjson.JSONArray;
+import com.github.openjson.JSONException;
+import com.github.openjson.JSONObject;
import org.archive.format.ArchiveFileConstants;
-import org.json.JSONArray;
-import org.json.JSONException;
-import org.json.JSONObject;
/**
* Pool of Writers.
diff --git a/src/main/java/org/archive/resource/MetaData.java b/src/main/java/org/archive/resource/MetaData.java
index fb3b24a4..05c0ee06 100755
--- a/src/main/java/org/archive/resource/MetaData.java
+++ b/src/main/java/org/archive/resource/MetaData.java
@@ -2,10 +2,10 @@
import java.util.logging.Logger;
-import org.json.JSONArray;
-import org.json.JSONException;
-import org.json.JSONObject;
-import org.json.JSONTokener;
+import com.github.openjson.JSONArray;
+import com.github.openjson.JSONException;
+import com.github.openjson.JSONObject;
+import com.github.openjson.JSONTokener;
/**
* A nested structure of {@linkplain JSONObject}s to hold the metadata of
@@ -157,19 +157,7 @@ public JSONObject put(String name, long value) throws JSONException {
@Override
public JSONObject put(String key, Object value) {
- if (value instanceof JSONArray) {
- super.remove(key);
- super.put(key, value);
- } else if (has(key)) {
- if (super.get(key) instanceof JSONArray) {
- ((JSONArray) super.get(key)).put(value);
- return this;
- } else {
- JSONArray array = new JSONArray();
- array.put(super.get(key));
- array.put(value);
- super.put(key, array);
- }
+ if (has(key)) {
return super.accumulate(key, value);
}
return super.put(key, value);
diff --git a/src/main/java/org/archive/resource/ResourceConstants.java b/src/main/java/org/archive/resource/ResourceConstants.java
index 3b8bea1c..9eea22b5 100644
--- a/src/main/java/org/archive/resource/ResourceConstants.java
+++ b/src/main/java/org/archive/resource/ResourceConstants.java
@@ -115,5 +115,5 @@ public interface ResourceConstants {
public static final String HTML_LINK_TAGS = "Link";
public static final String HTML_META_TAGS = "Metas";
public static final String HTML_SCRIPT_TAGS = "Scripts";
-
+ public static final String HTML_TEXT = "Text";
}
diff --git a/src/main/java/org/archive/resource/gzip/GZIPMetaData.java b/src/main/java/org/archive/resource/gzip/GZIPMetaData.java
index 1058b01b..dc467b16 100644
--- a/src/main/java/org/archive/resource/gzip/GZIPMetaData.java
+++ b/src/main/java/org/archive/resource/gzip/GZIPMetaData.java
@@ -12,8 +12,8 @@
import org.archive.resource.MetaData;
import org.archive.resource.ResourceConstants;
import org.archive.util.ByteOp;
-import org.json.JSONException;
-import org.json.JSONObject;
+import com.github.openjson.JSONException;
+import com.github.openjson.JSONObject;
import static java.nio.charset.StandardCharsets.UTF_8;
diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
index ab439d5c..43dba1f8 100644
--- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
+++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
@@ -3,6 +3,7 @@
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
+import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
@@ -22,11 +23,15 @@ public class ExtractingParseObserver implements ParseObserver {
HTMLMetaData data;
Stack> openAnchors;
Stack openAnchorTexts;
+ StringBuilder textExtract;
String title = null;
+ boolean inHead = false;
boolean inTitle = false;
+ boolean inPre = false;
+ boolean inSVG = false;
protected static String cssUrlPatString =
- "url\\s*\\(\\s*((?:\\\\?[\"'])?.+?(?:\\\\?[\"'])?)\\s*\\)";
+ "url\\s*\\(\\s*([^)\\s]{1,8000}?)\\s*\\)";
protected static String cssUrlTrimPatString =
"^(?:\\\\?[\"'])+|(?:\\\\?[\"'])+$";
protected static String cssImportNoUrlPatString =
@@ -48,7 +53,31 @@ public class ExtractingParseObserver implements ParseObserver {
Pattern.compile(jsOnClickUrl2PatString)
};
- private final static int MAX_TEXT_LEN = 100;
+ protected static Pattern wsPattern = Pattern.compile("\\s+");
+
+ /** max. length for anchor texts and the title */
+ private final static int MAX_TEXT_LEN = 128;
+
+ private final static String[] BLOCK_ELEMENTS = { "address", "article", "aside", "blockquote", "body", "br",
+ "button", "canvas", "caption", "center", "col", "colgroup", "dd", "div", "dl", "dt", "embed", "fieldset",
+ "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hgroup", "hr",
+ "li", "map", "noframes", "noscript", "object", "ol", "output", "p", "pre", "progress", "section", "table",
+ "tbody", "textarea", "tfoot", "th", "thead", "title", "tr", "ul", "video" };
+ private static final Set blockElements;
+ /* inline elements whose content must not be merged with adjacent words */
+ private final static String[] INLINE_ELEMENTS_SPACING = { "address", "cite", "details", "datalist", "iframe", "img",
+ "input", "label", "legend", "optgroup", "q", "select", "summary", "tbody", "td", "time" };
+ private static final Set inlineSpacingElements;
+ static {
+ blockElements = new HashSet();
+ for (String el : BLOCK_ELEMENTS) {
+ blockElements.add(el.toUpperCase(Locale.ROOT));
+ }
+ inlineSpacingElements = new HashSet();
+ for (String el : INLINE_ELEMENTS_SPACING) {
+ inlineSpacingElements.add(el.toUpperCase(Locale.ROOT));
+ }
+ }
private static final String PATH = "path";
private static final String PATH_SEPARATOR = "@/";
@@ -82,6 +111,8 @@ public class ExtractingParseObserver implements ParseObserver {
extractors.put("AUDIO", new EmbedTagExtractor());
extractors.put("TRACK", new EmbedTagExtractor());
extractors.put("SOURCE", new EmbedTagExtractor());
+ // language from HTML root element
+ extractors.put("HTML", new HTMLTagExtractor());
globalHrefAttributes = new HashSet();
globalHrefAttributes.add("background");
@@ -94,25 +125,47 @@ public ExtractingParseObserver(HTMLMetaData data) {
this.data = data;
openAnchors = new Stack>();
openAnchorTexts = new Stack();
+ textExtract = new StringBuilder(8192);
}
+ @Override
public void handleDocumentStart() {
// no-op
}
+ @Override
public void handleDocumentComplete() {
- // no-op
+ if (textExtract.length() > 0) {
+ data.setTextExtract(textExtract.toString());
+ textExtract = new StringBuilder(8192);
+ }
}
+ @Override
public void handleTagEmpty(TagNode tag) {
handleTagOpen(tag);
}
+ @Override
public void handleTagOpen(TagNode tag) {
String name = tag.getTagName();
- if(name.equals("TITLE")) {
+ if (name.equals("HEAD")) {
+ inHead = true;
+ } else if (name.equals("TITLE")) {
inTitle = !tag.isEmptyXmlTag();
return;
+ } else if (name.equals("PRE")) {
+ inPre = true;
+ } else if (name.equals("SVG")) {
+ inSVG = true;
+ } else if (name.equals("BODY")) {
+ inHead = false;
+ }
+
+ if (blockElements.contains(name)) {
+ appendParagraphSeparator(textExtract);
+ } else if (inlineSpacingElements.contains(name)) {
+ appendSpace(textExtract);
}
// first the global attributes:
@@ -125,6 +178,8 @@ public void handleTagOpen(TagNode tag) {
}
attrName = attrName.toLowerCase(Locale.ROOT);
if (globalHrefAttributes.contains(attrName)) {
+ attrValue = decodeCharEnt(attrValue);
+ attrValue = trimDataUrl(attrValue);
data.addHref(PATH,makePath(name,attrName),"url",attrValue);
}
}
@@ -136,16 +191,32 @@ public void handleTagOpen(TagNode tag) {
}
}
+ @Override
public void handleTagClose(TagNode tag) {
- if(inTitle) {
+ String name = tag.getTagName();
+
+ if (inTitle) {
inTitle = false;
- data.setTitle(title);
+ if (!inSVG && (inHead || !data.hasTitle())) {
+ data.setTitle(title);
+ }
title = null;
- // probably the right thing..
- return;
}
+
+ if (blockElements.contains(name)) {
+ appendParagraphSeparator(textExtract);
+ } else if (inlineSpacingElements.contains(name)) {
+ appendSpace(textExtract);
+ }
+ // also add space to open anchor texts
+ if (blockElements.contains(name) || inlineSpacingElements.contains(name)) {
+ for (StringBuilder s : openAnchorTexts) {
+ appendSpace(s);
+ }
+ }
+
// Only interesting if it's a closing </A> tag:
- if(tag.getTagName().equals("A")) {
+ if(name.equals("A")) {
if(openAnchors.size() > 0) {
// TODO: what happens here when we get unaligned <A> tags (extra </A>'s?)
ArrayList vals = openAnchors.pop();
@@ -153,7 +224,7 @@ public void handleTagClose(TagNode tag) {
if((vals != null) && (vals.size() > 0)) {
if(text != null) {
// contained an href - we want to ignore anchors without an href:
- String trimmed = text.toString().trim().replaceAll("\\s+", " ");
+ String trimmed = text.toString().trim();
if(trimmed.length() > MAX_TEXT_LEN) {
trimmed = trimmed.substring(0,MAX_TEXT_LEN);
}
@@ -165,47 +236,128 @@ public void handleTagClose(TagNode tag) {
data.addHref(vals);
}
}
+ } else if (tag.getTagName().equals("HEAD")) {
+ inHead = false;
+ } else if (tag.getTagName().equals("PRE")) {
+ inPre = false;
+ } else if (tag.getTagName().equals("SVG")) {
+ inSVG = false;
}
}
+ @Override
public void handleTextNode(TextNode text) {
- // TODO: OPTIMIZ: This can be a lot smarter, if StringBuilders are full,
- // this result is thrown away.
- String t = text.getText().replaceAll("\\s+", " ");
-
- if(t.length() > MAX_TEXT_LEN) {
- t = t.substring(0,MAX_TEXT_LEN);
- }
- if(inTitle) {
- title = t;
-
+ String txt = text.getText();
+ StringBuilder t = new StringBuilder(8192);
+ txt = decodeCharEnt(txt, false);
+ if (inPre) {
+ t.append(txt);
} else {
-
- for(StringBuilder s : openAnchorTexts) {
- if(s.length() >= MAX_TEXT_LEN) {
- // if we are full, parents enclosing us should be too..
- break;
+ char c = ' ';
+ boolean cIsWhiteSpace = true;
+ if (textExtract.length() > 0) {
+ c = textExtract.charAt(textExtract.length() - 1);
+ cIsWhiteSpace = Character.isWhitespace(c);
+ }
+ for (int i = 0; i < txt.length(); i++) {
+ char c2 = txt.charAt(i);
+ switch (c2) {
+ /*
+ * normalize ASCII control characters, line breaks and some
+ * Unicode white space for cleaner text and paragraphs
+ */
+ case '\000':
+ case '\001':
+ case '\002':
+ case '\003':
+ case '\004':
+ case '\005':
+ case '\006':
+ case '\007':
+ case '\010':
+ case '\011':
+ case '\012': // = '\n'
+ case '\013':
+ case '\014':
+ case '\015': // = '\r'
+ case '\016':
+ case '\017':
+ case '\020':
+ case '\021':
+ case '\022':
+ case '\023':
+ case '\024':
+ case '\025':
+ case '\026':
+ case '\027':
+ case '\030':
+ case '\031':
+ case '\032':
+ case '\033':
+ case '\034':
+ case '\035':
+ case '\036':
+ case '\037':
+ case '\177':
+ case '\u00a0': // non-breaking space
+ c2 = ' ';
+ }
+ boolean c2IsWhiteSpace = Character.isWhitespace(c2);
+ if (!cIsWhiteSpace || !c2IsWhiteSpace) {
+ t.append(c2);
}
- if(s.length() + t.length() < MAX_TEXT_LEN) {
- s.append(t);
- } else {
- // only add as much as we can:
- s.append(t.substring(0,MAX_TEXT_LEN - s.length()));
+ c = c2;
+ cIsWhiteSpace = c2IsWhiteSpace;
+ }
+ }
+
+ textExtract.append(t);
+
+ if (inTitle || !openAnchorTexts.isEmpty()) {
+
+ if (t.length() > MAX_TEXT_LEN) {
+ t.setLength(MAX_TEXT_LEN);
+ }
+
+ if (inTitle) {
+ title = t.toString().trim();
+
+ } else {
+
+ for (StringBuilder s : openAnchorTexts) {
+ if (s.length() >= MAX_TEXT_LEN) {
+ // if we are full, parents enclosing us should be too..
+ break;
+ }
+ String tClipped;
+ if ((s.length() + t.length()) < MAX_TEXT_LEN) {
+ tClipped = t.toString();
+ } else {
+ // only add as much as we can:
+ tClipped = t.substring(0, MAX_TEXT_LEN - s.length());
+ }
+ if (!tClipped.isEmpty() && (s.length() == 0 || s.charAt(s.length() - 1) == ' ') && tClipped.charAt(0) == ' ') {
+ tClipped = tClipped.substring(1);
+ }
+ s.append(tClipped);
}
- // BUGBUG: check now for multiple trailing spaces, and strip:
}
}
}
+ @Override
public void handleScriptNode(TextNode text) {
// TODO: Find (semi) obvious URLs in JS:
}
+ @Override
public void handleStyleNode(TextNode text) {
- patternCSSExtract(data, cssUrlPattern, text.getText());
- patternCSSExtract(data, cssImportNoUrlPattern, text.getText());
+ String cssStr = decodeCharEnt(text.getText());
+ patternCSSExtract(data, cssUrlPattern, cssStr);
+ patternCSSExtract(data, cssImportNoUrlPattern, cssStr);
}
+ @Override
public void handleRemarkNode(RemarkNode remark) {
// TODO no-op, right??
}
@@ -230,6 +382,8 @@ private static void addBasicHrefs(HTMLMetaData data, TagNode node, String... att
for(String attr : attrs) {
String val = node.getAttribute(attr);
if(val != null) {
+ val = decodeCharEnt(val);
+ val = trimDataUrl(val);
data.addHref(PATH,makePath(node.getTagName(),attr),"url",val);
}
}
@@ -237,16 +391,28 @@ private static void addBasicHrefs(HTMLMetaData data, TagNode node, String... att
private static ArrayList getAttrList(TagNode node, String... attrs) {
ArrayList l = new ArrayList();
+ boolean isOgImage = false;
for(String attr : attrs) {
String val = node.getAttribute(attr);
if(val != null) {
+ val = decodeCharEnt(val);
l.add(attr);
l.add(val);
+ if (attr.equals("property") && val.equals("og:image")) {
+ isOgImage = true;
+ }
}
}
if(l.size() == 0) {
return null;
}
+ if (isOgImage) {
+ // trim data: URLs in og:image metadata
+ int content = l.indexOf("content");
+ if (content > -1 && (content % 2) == 0) {
+ l.set(content + 1, trimDataUrl(l.get(content + 1)));
+ }
+ }
return l;
}
@@ -255,6 +421,8 @@ private static ArrayList getAttrListUrl(TagNode node,
String url = node.getAttribute(urlAttr);
ArrayList l = null;
if(url != null) {
+ url = decodeCharEnt(url);
+ url = trimDataUrl(url);
l = new ArrayList();
l.add(PATH);
l.add(makePath(node.getTagName(),urlAttr));
@@ -264,6 +432,7 @@ private static ArrayList getAttrListUrl(TagNode node,
for(String attr : optionalAttrs) {
String val = node.getAttribute(attr);
if(val != null) {
+ val = decodeCharEnt(val);
l.add(attr);
l.add(val);
}
@@ -287,22 +456,49 @@ private static void addHrefsOnclick(HTMLMetaData data, TagNode node) {
for (Pattern pattern : jsOnClickUrlPatterns) {
String url = patternJSExtract(pattern, onclick);
if (url != null) {
+ url = trimDataUrl(url);
data.addHref(PATH, path, "url", url);
}
}
}
}
+ private static void appendParagraphSeparator(StringBuilder sb) {
+ int length = sb.length();
+ if (length > 0) {
+ // remove white space before paragraph break
+ while (length > 0 && sb.charAt(length - 1) == ' ') {
+ sb.deleteCharAt(--length);
+ }
+ if (length > 0 && sb.charAt(length - 1) != '\n') {
+ sb.append('\n');
+ }
+ }
+ }
+
+ private static void appendSpace(StringBuilder sb) {
+ int length = sb.length();
+ if (length > 0) {
+ char lastBufferChar = sb.charAt(length - 1);
+ if (lastBufferChar != ' ' && lastBufferChar != '\n') {
+ sb.append(' ');
+ }
+ }
+ }
+
private interface TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs);
}
private static class AnchorTagExtractor implements TagExtractor {
+ @Override
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
ArrayList l = new ArrayList();
String url = node.getAttribute("href");
if(url != null) {
// got data:
+ url = decodeCharEnt(url);
+ url = trimDataUrl(url);
l.add(PATH);
l.add(makePath("A","href"));
l.add("url");
@@ -310,6 +506,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
for(String a : new String[] {"target","alt","title","rel","hreflang","type"}) {
String v = node.getAttribute(a);
if(v != null) {
+ v = decodeCharEnt(v);
l.add(a);
l.add(v);
}
@@ -327,15 +524,19 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
private static class AppletTagExtractor implements TagExtractor {
+ @Override
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addBasicHrefs(data,node,"codebase","cdata");
}
}
private static class AreaTagExtractor implements TagExtractor {
+ @Override
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
String url = node.getAttribute("href");
if(url != null) {
+ url = decodeCharEnt(url);
+ url = trimDataUrl(url);
ArrayList l = new ArrayList();
l.add(PATH);
l.add(makePath("AREA","href"));
@@ -354,43 +555,52 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
private static class BaseTagExtractor implements TagExtractor {
+ @Override
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
String url = node.getAttribute("href");
if(url != null) {
+ url = decodeCharEnt(url);
data.setBaseHref(url);
}
}
}
private static class ButtonTagExtractor implements TagExtractor {
+ @Override
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addBasicHrefs(data,node,"formaction");
}
}
private static class DivTagExtractor implements TagExtractor {
+ @Override
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addHrefsOnclick(data,node);
}
}
private static class EmbedTagExtractor implements TagExtractor {
+ @Override
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addBasicHrefs(data,node,"src");
}
}
private static class EmbedVideoTagExtractor implements TagExtractor {
+ @Override
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addBasicHrefs(data,node,"src","poster");
}
}
private static class FormTagExtractor implements TagExtractor {
+ @Override
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
ArrayList l = new ArrayList();
String url = node.getAttribute("action");
if(url != null) {
+ url = decodeCharEnt(url);
+ url = trimDataUrl(url);
// got data:
l.add(PATH);
l.add(makePath("FORM","action"));
@@ -409,18 +619,38 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
private static class FrameTagExtractor implements TagExtractor {
+ @Override
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addBasicHrefs(data,node,"src");
}
}
+ private static class HTMLTagExtractor implements TagExtractor {
+ @Override
+ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
+ ArrayList l = getAttrList(node, "lang", "xml:lang");
+ if(l != null) {
+ Iterator it = l.iterator();
+ while (it.hasNext()) {
+ String name = it.next();
+ if (it.hasNext()) {
+ String lang = it.next();
+ data.addMeta("name", makePath("HTML", name), "content", lang);
+ }
+ }
+ }
+ }
+ }
+
private static class IFrameTagExtractor implements TagExtractor {
+ @Override
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addBasicHrefs(data,node,"src");
}
}
private static class ImgTagExtractor implements TagExtractor {
+ @Override
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addHrefWithAttrs(data,node,"src","alt","title");
addBasicHrefs(data,node,"longdesc");
@@ -428,6 +658,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
private static class InputTagExtractor implements TagExtractor {
+ @Override
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addBasicHrefs(data,node,"src","formaction");
addHrefsOnclick(data,node);
@@ -435,8 +666,9 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
private static class LinkTagExtractor implements TagExtractor {
+ @Override
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
- ArrayList l = getAttrListUrl(node,"href","rel","type");
+ ArrayList l = getAttrListUrl(node,"href","rel","type","hreflang");
if(l != null) {
data.addLink(l);
}
@@ -444,33 +676,56 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
private static class MenuitemTagExtractor implements TagExtractor {
+ @Override
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addBasicHrefs(data,node,"icon");
}
}
private static class MetaTagExtractor implements TagExtractor {
+ @Override
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
ArrayList l = getAttrList(node,"name","rel","content","http-equiv","property");
if(l != null) {
+ if (l.size() == 2) {
+ if (l.get(0).equals("content")) {
+ /*
+ * drop single "content" attributes very likely stemming
+ * from schema.org
+ * annotations embedded in the HTML body, see
+ * https://github.com/commoncrawl/ia-web-commons/issues/40
+ */
+ return;
+ } else {
+ /*
+ * Single key-value metadata pair, e.g. only a "name" attribute (no "content") - no value
+ * or something went wrong with attribute parsing.
+ */
+ return;
+ }
+ }
data.addMeta(l);
}
}
}
private static class ObjectTagExtractor implements TagExtractor {
+ @Override
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addBasicHrefs(data,node,"codebase","cdata","data");
}
}
private static class QuotationLinkTagExtractor implements TagExtractor {
+ @Override
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addBasicHrefs(data,node,"cite");
}
}
private static class ScriptTagExtractor implements TagExtractor {
+ @Override
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
ArrayList l = getAttrListUrl(node,"src","type");
if(l != null) {
@@ -491,7 +746,8 @@ private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String conten
String url = m.group(1);
url = cssUrlTrimPattern.matcher(url).replaceAll("");
if (!url.isEmpty()) {
- data.addHref("path","STYLE/#text","href", url);
+ url = trimDataUrl(url);
+ data.addHref("path", "STYLE/#text", "href", url);
}
}
}
@@ -503,4 +759,53 @@ private static String patternJSExtract(Pattern pattern, String content) {
}
return null;
}
+
+ public static String decodeCharEnt(String text) {
+ return decodeCharEnt(text, true);
+ }
+
+ public static String decodeCharEnt(String text, boolean inAttribute) {
+ if (text.indexOf('&') == -1) {
+ return text;
+ }
+ try {
+ return org.jsoup.parser.Parser.unescapeEntities(text, inAttribute);
+ } catch (Throwable e) {
+ System.err.println(text);
+ e.printStackTrace();
+ return text;
+ }
+ }
+
+ /**
+ * Trim data from
+ * data URLs.
+ *
+ * Any data (after the comma) is trimmed from a data URL. If no comma is
+ * found within the first 128 characters of the URL, the URL is trimmed to
+ * 128 characters.
+ *
+ * @param url
+ * URL to be trimmed
+ * @return the trimmed URL; URLs not starting with "data:" are returned unchanged
+ */
+ public static String trimDataUrl(String url) {
+ if (url.startsWith("data:")) {
+ int posComma = url.indexOf(',', 5);
+ if (posComma == -1) {
+ // no comma, trim to 128 characters if necessary
+ if (url.length() > 128) {
+ return url.substring(0, 128);
+ }
+ return url;
+ } else if (posComma > 128) {
+ return url.substring(0, 128);
+ } else if (posComma == 6) {
+ return "data:,";
+ } else if (posComma > 6) {
+ return url.substring(0, posComma + 1);
+ }
+ }
+ return url;
+ }
}
diff --git a/src/main/java/org/archive/resource/html/HTMLMetaData.java b/src/main/java/org/archive/resource/html/HTMLMetaData.java
index d995cf65..a6d2532b 100644
--- a/src/main/java/org/archive/resource/html/HTMLMetaData.java
+++ b/src/main/java/org/archive/resource/html/HTMLMetaData.java
@@ -6,9 +6,9 @@
import org.archive.resource.MetaData;
import org.archive.resource.ResourceConstants;
-import org.json.JSONArray;
-import org.json.JSONException;
-import org.json.JSONObject;
+import com.github.openjson.JSONArray;
+import com.github.openjson.JSONException;
+import com.github.openjson.JSONObject;
public class HTMLMetaData extends MetaData implements ResourceConstants {
@@ -32,9 +32,15 @@ private JSONObject getHeader() {
public void setBaseHref(String href) {
putUnlessNull(getHeader(),HTML_BASE, href);
}
+
public void setTitle(String title) {
putUnlessNull(getHeader(),HTML_TITLE, title);
}
+
+ public boolean hasTitle() {
+ return header != null && header.has(HTML_TITLE);
+ }
+
private void putUnlessNull(JSONObject o, String k, String v) {
if(o != null) {
try {
@@ -44,6 +50,7 @@ private void putUnlessNull(JSONObject o, String k, String v) {
}
}
}
+
public String[] LtoA(List l) {
String[] a = new String[l.size()];
l.toArray(a);
@@ -70,6 +77,10 @@ public void addHref(String...a) {
appendObj2(this,HTML_LINKS,a);
}
+ public void setTextExtract(String textExtract) {
+ putUnlessNull(this,HTML_TEXT, textExtract);
+ }
+
private void appendObj2(JSONObject o, String arr, String... a) {
if(o == null) {
return;
diff --git a/src/main/java/org/archive/resource/html/HTMLResourceFactory.java b/src/main/java/org/archive/resource/html/HTMLResourceFactory.java
index 410449a1..3ad90db6 100644
--- a/src/main/java/org/archive/resource/html/HTMLResourceFactory.java
+++ b/src/main/java/org/archive/resource/html/HTMLResourceFactory.java
@@ -20,8 +20,8 @@
import org.archive.resource.ResourceParseException;
import org.htmlparser.lexer.Page;
import org.htmlparser.util.ParserException;
-import org.json.JSONException;
-import org.json.JSONObject;
+import com.github.openjson.JSONException;
+import com.github.openjson.JSONObject;
public class HTMLResourceFactory implements ResourceFactory {
diff --git a/src/main/java/org/archive/resource/warc/record/DNSResource.java b/src/main/java/org/archive/resource/warc/record/DNSResource.java
index 2bcb2bc1..86c56652 100644
--- a/src/main/java/org/archive/resource/warc/record/DNSResource.java
+++ b/src/main/java/org/archive/resource/warc/record/DNSResource.java
@@ -9,8 +9,8 @@
import org.archive.resource.MetaData;
import org.archive.resource.ResourceConstants;
import org.archive.resource.ResourceContainer;
-import org.json.JSONException;
-import org.json.JSONObject;
+import com.github.openjson.JSONException;
+import com.github.openjson.JSONObject;
public class DNSResource extends AbstractEmptyResource implements ResourceConstants {
private static final Logger LOG =
diff --git a/src/main/java/org/archive/resource/warc/record/WARCJSONMetaDataResourceFactory.java b/src/main/java/org/archive/resource/warc/record/WARCJSONMetaDataResourceFactory.java
index 8cc8c146..b945b216 100644
--- a/src/main/java/org/archive/resource/warc/record/WARCJSONMetaDataResourceFactory.java
+++ b/src/main/java/org/archive/resource/warc/record/WARCJSONMetaDataResourceFactory.java
@@ -10,8 +10,8 @@
import org.archive.resource.ResourceContainer;
import org.archive.resource.ResourceFactory;
import org.archive.resource.ResourceParseException;
-import org.json.JSONException;
-import org.json.JSONTokener;
+import com.github.openjson.JSONException;
+import com.github.openjson.JSONTokener;
import static java.nio.charset.StandardCharsets.UTF_8;
diff --git a/src/test/java/org/archive/format/json/CompoundORJSONPathSpecTest.java b/src/test/java/org/archive/format/json/CompoundORJSONPathSpecTest.java
index ef8c2fa0..c410932f 100644
--- a/src/test/java/org/archive/format/json/CompoundORJSONPathSpecTest.java
+++ b/src/test/java/org/archive/format/json/CompoundORJSONPathSpecTest.java
@@ -3,8 +3,8 @@
import java.util.ArrayList;
import org.archive.util.TestUtils;
-import org.json.JSONException;
-import org.json.JSONObject;
+import com.github.openjson.JSONException;
+import com.github.openjson.JSONObject;
import org.junit.jupiter.api.Test;
diff --git a/src/test/java/org/archive/format/json/JSONPathSpecFactoryTest.java b/src/test/java/org/archive/format/json/JSONPathSpecFactoryTest.java
index 257cb112..d306920b 100644
--- a/src/test/java/org/archive/format/json/JSONPathSpecFactoryTest.java
+++ b/src/test/java/org/archive/format/json/JSONPathSpecFactoryTest.java
@@ -1,8 +1,8 @@
package org.archive.format.json;
import org.archive.util.TestUtils;
-import org.json.JSONException;
-import org.json.JSONObject;
+import com.github.openjson.JSONException;
+import com.github.openjson.JSONObject;
import org.junit.jupiter.api.Test;
diff --git a/src/test/java/org/archive/format/json/JSONViewTest.java b/src/test/java/org/archive/format/json/JSONViewTest.java
index 6d199025..9a325131 100644
--- a/src/test/java/org/archive/format/json/JSONViewTest.java
+++ b/src/test/java/org/archive/format/json/JSONViewTest.java
@@ -3,8 +3,8 @@
import java.util.Locale;
import org.archive.util.TestUtils;
-import org.json.JSONException;
-import org.json.JSONObject;
+import com.github.openjson.JSONException;
+import com.github.openjson.JSONObject;
import org.junit.jupiter.api.Test;
diff --git a/src/test/java/org/archive/format/json/SimpleJSONPathSpecTest.java b/src/test/java/org/archive/format/json/SimpleJSONPathSpecTest.java
index 640a5a80..8f99f4a3 100644
--- a/src/test/java/org/archive/format/json/SimpleJSONPathSpecTest.java
+++ b/src/test/java/org/archive/format/json/SimpleJSONPathSpecTest.java
@@ -1,8 +1,8 @@
package org.archive.format.json;
import org.archive.util.TestUtils;
-import org.json.JSONException;
-import org.json.JSONObject;
+import com.github.openjson.JSONException;
+import com.github.openjson.JSONObject;
import org.junit.jupiter.api.Test;
diff --git a/src/test/java/org/archive/format/text/html/CDATALexerTest.java b/src/test/java/org/archive/format/text/html/CDATALexerTest.java
index 7c9f24f3..a03a3ec9 100644
--- a/src/test/java/org/archive/format/text/html/CDATALexerTest.java
+++ b/src/test/java/org/archive/format/text/html/CDATALexerTest.java
@@ -76,20 +76,38 @@ public void testInCSS() throws ParserException {
assertFalse(l.inJS());
assertTrue(NodeUtils.isCloseTagNodeNamed(n, "STYLE"));
}
+
+ public void testInCSSEmpty() throws ParserException {
+ l = makeLexer("");
+ assertFalse(l.inCSS());
+ assertFalse(l.inJS());
+ n = l.nextNode();
+ assertFalse(l.inCSS());
+ assertFalse(l.inJS());
+ assertTrue(NodeUtils.isNonEmptyOpenTagNodeNamed(n, "STYLE"));
+ n = l.nextNode();
+ assertFalse(l.inCSS());
+ assertFalse(l.inJS());
+ assertTrue(NodeUtils.isCloseTagNodeNamed(n, "STYLE"));
+ }
+
+ public void testInCSSBachelorTag() throws ParserException {
+ l = makeLexer("
");
+ assertFalse(l.inCSS());
+ assertFalse(l.inJS());
+ n = l.nextNode();
+ assertFalse(l.inCSS());
+ assertFalse(l.inJS());
+ assertTrue(NodeUtils.isTagNode(n));
+ assertTrue(((TagNode) n).isEmptyXmlTag());
+ assertEquals(((TagNode) n).getTagName(), "STYLE");
+ n = l.nextNode();
+ assertFalse(l.inCSS());
+ assertFalse(l.inJS());
+ assertNull(n);
+ }
public void testInJSComment() throws ParserException {
-
-// dumpParse("");
-// dumpParse("");
-// dumpParse("");
-// dumpParse("");
-// dumpParse("");
-// dumpParse("");
-// dumpParse("");
-// dumpParse("");
-// dumpParse("\"); ");
-// dumpParse("");
-
assertJSContentWorks("//");
assertJSContentWorks("");
assertJSContentWorks("//");
@@ -98,9 +116,22 @@ public void testInJSComment() throws ParserException {
assertJSContentWorks("if(1 < 2) { foo(); } ");
assertJSContentWorks("if(1 bold\"); ");
- assertJSContentWorks("document.write(\"\"); ");
+ assertJSContentWorks("document.write(\"