Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/maven.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,4 @@ jobs:
restore-keys: |
${{ runner.os }}-maven-
- name: Build with Maven
run: mvn -B package --file pom.xml
run: mvn -B verify --file pom.xml
32 changes: 30 additions & 2 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<build.time>${maven.build.timestamp}</build.time>
<maven.build.timestamp.format>yyyyMMddhhmmss</maven.build.timestamp.format>
<java.version>8</java.version>
</properties>

<dependencies>
Expand Down Expand Up @@ -164,15 +165,42 @@
<artifactId>maven-compiler-plugin</artifactId>
<version>3.14.1</version>
<configuration>
<source>8</source>
<target>8</target>
<source>${java.version}</source>
<target>${java.version}</target>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>3.2.5</version>
</plugin>
<plugin>
<groupId>de.thetaphi</groupId>
<artifactId>forbiddenapis</artifactId>
<version>3.10</version>
<configuration>
<targetVersion>${java.version}</targetVersion>
<ignoreSignaturesOfMissingClasses>true</ignoreSignaturesOfMissingClasses>
<!-- if the used Java version is too new, don't fail, just do nothing: -->
<failOnUnsupportedJava>false</failOnUnsupportedJava>
<bundledSignatures>
<bundledSignature>jdk-unsafe</bundledSignature>
<bundledSignature>jdk-deprecated</bundledSignature>
<bundledSignature>jdk-non-portable</bundledSignature>
</bundledSignatures>
<signaturesFiles>
<signaturesFile>src/test/resources/forbidden-apis-signatures.txt</signaturesFile>
</signaturesFiles>
</configuration>
<executions>
<execution>
<goals>
<goal>check</goal>
<goal>testCheck</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>

<resources>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.util.logging.Logger;

import org.archive.resource.Resource;
Expand All @@ -12,13 +13,18 @@
import com.google.common.io.ByteStreams;
import com.google.common.io.CountingOutputStream;

import static java.nio.charset.StandardCharsets.UTF_8;

public class DumpingExtractorOutput implements ExtractorOutput {
private static final Logger LOG =
Logger.getLogger(DumpingExtractorOutput.class.getName());

private PrintStream out;
/**
 * Creates an output that writes through a UTF-8 {@link PrintStream}
 * (autoflush disabled) wrapping the given stream.
 *
 * @param out destination stream to wrap
 */
public DumpingExtractorOutput(OutputStream out) {
    try {
        // The charset-name overload is used because the build targets Java 8
        // (see the java.version property); it declares a checked exception.
        this.out = new PrintStream(out, false, UTF_8.name());
    } catch (UnsupportedEncodingException e) {
        // UTF-8 must be supported by every JVM, so this is unreachable in
        // practice. Fail loudly rather than silently leaving this.out null.
        throw new AssertionError("UTF-8 charset is not supported", e);
    }
}

public void output(Resource resource) throws IOException {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.archive.extract;

import java.util.Iterator;
import java.util.Locale;
import java.util.logging.Logger;

import org.archive.format.arc.ARCConstants;
Expand Down Expand Up @@ -68,14 +69,14 @@ private boolean childFieldStartsWith(MetaData m, String child,
String key, String search) {
String val = getChildField(m,child,key);
return val == null ? false :
val.toLowerCase().startsWith(search.toLowerCase());
val.toLowerCase(Locale.ROOT).startsWith(search.toLowerCase(Locale.ROOT));
}

/**
 * Tests whether the value returned by {@code getChildField(m, child, key)}
 * contains {@code search}, ignoring case.
 *
 * @param m      metadata passed through to {@code getChildField}
 * @param child  child name passed through to {@code getChildField}
 * @param key    key passed through to {@code getChildField}
 * @param search text to look for; compared case-insensitively
 * @return {@code false} when the looked-up value is {@code null},
 *         otherwise whether the lower-cased value contains the
 *         lower-cased search text
 */
private boolean childFieldContains(MetaData m, String child,
        String key, String search) {
    String val = getChildField(m, child, key);
    if (val == null) {
        return false;
    }
    // Locale.ROOT keeps case folding independent of the JVM default locale.
    return val.toLowerCase(Locale.ROOT).contains(search.toLowerCase(Locale.ROOT));
}

private boolean childFieldEquals(MetaData m, String child,
Expand All @@ -88,15 +89,15 @@ private boolean childFieldEquals(MetaData m, String child,
private String caseInsensitiveKeyScan(MetaData m, String child, String k) {
try {
if(m.has(child)) {
String kLC = k.toLowerCase();
String kLC = k.toLowerCase(Locale.ROOT);
JSONObject childJSObj = m.getJSONObject(child);
@SuppressWarnings("rawtypes")
Iterator i = childJSObj.keys();
while(i.hasNext()) {
Object kObj = i.next();
if(kObj instanceof String) {
String kString = (String) kObj;
if(kString.toLowerCase().equals(kLC)) {
if(kString.toLowerCase(Locale.ROOT).equals(kLC)) {
return childJSObj.getString(kString);
}
}
Expand Down Expand Up @@ -128,7 +129,7 @@ private boolean isHTTPARCResource(MetaData envelope) {
/**
 * Reports whether the {@code Content-Type} value found by
 * {@code caseInsensitiveKeyScan} under {@code HTTP_HEADERS_LIST}
 * contains {@code "html"}, ignoring case.
 *
 * @param m metadata to inspect
 * @return {@code false} when no Content-Type value is found, otherwise
 *         whether the lower-cased value contains {@code "html"}
 */
private boolean isHTMLHttpResource(MetaData m) {
    String type = caseInsensitiveKeyScan(m, HTTP_HEADERS_LIST,
            "Content-Type");
    // Locale.ROOT keeps case folding independent of the JVM default locale.
    return type != null && type.toLowerCase(Locale.ROOT).contains("html");
}

private boolean isWARCType(MetaData envelope, WARCRecordType type) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.archive.extract;

import java.io.IOException;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;

Expand Down Expand Up @@ -33,7 +34,7 @@ public Resource getNext() throws ResourceParseException, IOException {
return current;
}
if(LOG.isLoggable(Level.INFO)) {
LOG.info(String.format("Extracting (%s) with (%s)\n",
LOG.info(String.format(Locale.ROOT, "Extracting (%s) with (%s)\n",
current.getClass().toString(),
f.getClass().toString()));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,25 @@
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.util.List;

import org.apache.commons.lang3.StringUtils;
import org.archive.format.json.JSONView;
import org.archive.resource.Resource;
import org.archive.util.StreamCopy;

import static java.nio.charset.StandardCharsets.UTF_8;

public class JSONViewExtractorOutput implements ExtractorOutput {
private PrintStream out;
private JSONView view;
/**
 * Creates an output that builds a {@link JSONView} from the comma-separated
 * {@code filterPath} and writes through a UTF-8 {@link PrintStream}
 * (autoflush disabled) wrapping the given stream.
 *
 * @param out        destination stream to wrap
 * @param filterPath comma-separated entries handed to the {@link JSONView}
 *                   constructor
 */
public JSONViewExtractorOutput(OutputStream out, String filterPath) {
    view = new JSONView(filterPath.split(","));
    try {
        // The charset-name overload is used because the build targets Java 8
        // (see the java.version property); it declares a checked exception.
        this.out = new PrintStream(out, false, UTF_8.name());
    } catch (UnsupportedEncodingException e) {
        // UTF-8 must be supported by every JVM, so this is unreachable in
        // practice. Fail loudly rather than silently leaving this.out null.
        throw new AssertionError("UTF-8 charset is not supported", e);
    }
}
public void output(Resource resource) throws IOException {
StreamCopy.readToEOF(resource.getInputStream());
Expand Down
21 changes: 12 additions & 9 deletions src/main/java/org/archive/extract/RealCDXExtractorOutput.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import java.net.URISyntaxException;
import java.net.URL;
import java.util.List;
import java.util.Locale;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
Expand Down Expand Up @@ -131,7 +132,7 @@ public void output(Resource resource) throws IOException {
} else {
meta = "-";
}
if(mime.toLowerCase().contains("html")) {
if(mime.toLowerCase(Locale.ROOT).contains("html")) {
if(redir.equals("-")) {
// maybe an obvious meta-refresh?
redir = extractHTMLMetaRefresh(origUrl,m);
Expand Down Expand Up @@ -202,7 +203,7 @@ public void output(Resource resource) throws IOException {
} else {
meta = "-";
}
if(mime.toLowerCase().contains("html")) {
if(mime.toLowerCase(Locale.ROOT).contains("html")) {
if(redir.equals("-")) {
// maybe an obvious meta-refresh?
redir = extractHTMLMetaRefresh(origUrl,m);
Expand All @@ -222,7 +223,8 @@ public void output(Resource resource) throws IOException {
canUrl = keyMaker.makeKey(origUrl);
// URL DATE OURL MIME HTTP-CODE SHA1 META REDIR OFFSET LENGTH FILE
if(dumpJSON) {
out.format("%s %s %s %s %s %s %s %s %s %s %s %s\n",
out.format(Locale.ROOT,
"%s %s %s %s %s %s %s %s %s %s %s %s\n",
canUrl,
date,
origUrl,
Expand All @@ -236,7 +238,8 @@ public void output(Resource resource) throws IOException {
filename,
m.toString(1));
} else {
out.format("%s %s %s %s %s %s %s %s %s %s %s\n",
out.format(Locale.ROOT,
"%s %s %s %s %s %s %s %s %s %s %s\n",
canUrl,
date,
origUrl,
Expand Down Expand Up @@ -269,7 +272,7 @@ private String extractHTMLRobots(MetaData m) {
if(meta != null) {
String name = scanHeadersLC(meta, "name", null);
if(name != null) {
if(name.toLowerCase().equals("robots")) {
if(name.toLowerCase(Locale.ROOT).equals("robots")) {
// alright - some robot instructions:
String content = scanHeadersLC(meta, "content", null);
if(content != null) {
Expand All @@ -291,7 +294,7 @@ private String extractHTMLMetaRefresh(String origUrl, MetaData m) {
if(meta != null) {
String name = scanHeadersLC(meta, "http-equiv", null);
if(name != null) {
if(name.toLowerCase().equals("refresh")) {
if(name.toLowerCase(Locale.ROOT).equals("refresh")) {
// alright - some robot instructions:
String content = scanHeadersLC(meta, "content", null);
if(content != null) {
Expand Down Expand Up @@ -330,15 +333,15 @@ private String scanHeadersLC(JSONObject o, String match, String defaultVal) {
if(o.length() == 0) {
return defaultVal;
}
String lc = match.toLowerCase().trim();
String lc = match.toLowerCase(Locale.ROOT).trim();
// try {
// System.err.println("REC:" + o.toString(1));
// } catch (JSONException e1) {
// // TODO Auto-generated catch block
// e1.printStackTrace();
// }
for(String key : JSONObject.getNames(o)) {
if(lc.equals(key.toLowerCase().trim())) {
if(lc.equals(key.toLowerCase(Locale.ROOT).trim())) {
try {
return o.getString(key).trim();
} catch (JSONException e) {
Expand Down Expand Up @@ -472,7 +475,7 @@ private String parseRobotInstructions(String input) {
if(input == null) {
return "-";
}
String up = input.replaceAll("-", "").toUpperCase();
String up = input.replaceAll("-", "").toUpperCase(Locale.ROOT);
StringBuilder sb = new StringBuilder(3);
if(up.contains(NO_FOLLOW_MATCH)) {
sb.append("F");
Expand Down
18 changes: 9 additions & 9 deletions src/main/java/org/archive/extract/ResourceExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;

Expand All @@ -26,7 +27,6 @@ public class ResourceExtractor implements ResourceConstants, Tool {

private final static Logger LOG =
Logger.getLogger(ResourceExtractor.class.getName());
Charset UTF8 = Charset.forName("utf-8");
public final static String TOOL_NAME = "extractor";
public static final String TOOL_DESCRIPTION =
"A tool for extracting metadata from WARC, ARC, and WAT files";
Expand Down Expand Up @@ -65,7 +65,7 @@ public static void main(String[] args) throws Exception {

/**
 * Wraps the given stream in a {@link PrintWriter} that encodes its
 * output as UTF-8.
 *
 * @param os destination stream to wrap
 * @return a new UTF-8 {@link PrintWriter} over {@code os}
 */
private PrintWriter makePrintWriter(OutputStream os)
{
    // The StandardCharsets constant avoids a by-name charset lookup.
    return new PrintWriter(new OutputStreamWriter(os, StandardCharsets.UTF_8));
}

public int run(String[] args)
Expand Down Expand Up @@ -138,28 +138,28 @@ public int run(String[] args)

out.output(r);
} catch(GZIPFormatException e) {
LOG.severe(String.format("%s: %s",exProducer.getContext(),e.getMessage()));
LOG.severe(String.format(Locale.ROOT, "%s: %s",exProducer.getContext(),e.getMessage()));
//Log is not coming out for some damn reason....needs to be studied
System.err.format("%s: %s",exProducer.getContext(),e.getMessage());
System.err.format(Locale.ROOT, "%s: %s",exProducer.getContext(),e.getMessage());

if(ProducerUtils.STRICT_GZ) {
throw e;
}
e.printStackTrace();
} catch(ResourceParseException e) {
LOG.severe(String.format("%s: %s",exProducer.getContext(),e.getMessage()));
LOG.severe(String.format(Locale.ROOT, "%s: %s",exProducer.getContext(),e.getMessage()));
//Log is not coming out for some damn reason....needs to be studied
System.err.format("%s: %s",exProducer.getContext(),e.getMessage());
System.err.format(Locale.ROOT, "%s: %s",exProducer.getContext(),e.getMessage());

if(ProducerUtils.STRICT_GZ) {
throw e;
}
e.printStackTrace();
} catch(RecoverableRecordFormatException e) {
// this should not get here - ResourceFactory et al should wrap as ResourceParseExceptions...
LOG.severe(String.format("RECOVERABLE - %s: %s",exProducer.getContext(),e.getMessage()));
LOG.severe(String.format(Locale.ROOT, "RECOVERABLE - %s: %s",exProducer.getContext(),e.getMessage()));
//Log is not coming out for some damn reason....needs to be studied
System.err.format("%s: %s",exProducer.getContext(),e.getMessage());
System.err.format(Locale.ROOT, "%s: %s",exProducer.getContext(),e.getMessage());

e.printStackTrace();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,16 @@
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.List;
import java.util.Locale;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.archive.format.gzip.GZIPFormatException;
import org.archive.format.json.JSONUtils;
import org.archive.format.json.SimpleJSONPathSpec;
import org.archive.resource.MetaData;
import org.archive.resource.Resource;
import org.archive.util.IAUtils;
import org.archive.util.StreamCopy;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;

import com.google.common.io.ByteStreams;
Expand Down Expand Up @@ -87,7 +80,7 @@ public void output(Resource resource) throws IOException {
String[] linkParts = outLinkValue.split(" ");
if(linkParts.length > 2)
//'outlinks': 'origUrl date origOutlinkUrl linktype linktext'
out.format("%s\t%s\t%s\t%s\t\n",origUrl,date,linkParts[0],linkParts[2]);
out.format(Locale.ROOT,"%s\t%s\t%s\t%s\t\n",origUrl,date,linkParts[0],linkParts[2]);
}
} else if(outputType.equals("hopinfo")) {
String key = obj.get("Name").toString();
Expand All @@ -103,7 +96,7 @@ public void output(Resource resource) throws IOException {
}
if(outputType.equals("hopinfo")) {
//'hopinfo': 'origCrawledUrl date origViaUrl hopPathFromVia sourceTag'
out.format("%s\t%s\t%s\t%s\t%s\n",origUrl,date,viaUrl,viaPath,sourceTag);
out.format(Locale.ROOT,"%s\t%s\t%s\t%s\t%s\n",origUrl,date,viaUrl,viaPath,sourceTag);
}
}
}
Expand Down
Loading