From bf3fcb9e787ae7f4f740416a91c2d74b79f31fe7 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Tue, 11 Nov 2025 15:11:12 +0100
Subject: [PATCH 01/16] Add forbiddenAPIs Maven plugin to fail the build when
methods relying on the default locale or charset are used. Also forbid usage of
URL.equals and URL.hashCode, which may resolve host names via DNS lookup.
---
pom.xml | 25 +++++++++++++++++++
.../resources/forbidden-apis-signatures.txt | 2 ++
2 files changed, 27 insertions(+)
create mode 100644 src/test/resources/forbidden-apis-signatures.txt
diff --git a/pom.xml b/pom.xml
index 73ba9ba2..c1c17e9b 100644
--- a/pom.xml
+++ b/pom.xml
@@ -173,6 +173,31 @@
maven-surefire-plugin
3.2.5
+
+ de.thetaphi
+ forbiddenapis
+ 3.10
+
+
+ false
+
+ jdk-unsafe
+ jdk-deprecated
+ jdk-non-portable
+
+
+ src/test/resources/forbidden-apis-signatures.txt
+
+
+
+
+
+ check
+ testCheck
+
+
+
+
diff --git a/src/test/resources/forbidden-apis-signatures.txt b/src/test/resources/forbidden-apis-signatures.txt
new file mode 100644
index 00000000..1eda9eec
--- /dev/null
+++ b/src/test/resources/forbidden-apis-signatures.txt
@@ -0,0 +1,2 @@
+java.net.URL#equals(java.lang.Object) @ may trigger a DNS lookup to resolve the host part
+java.net.URL#hashCode() @ may trigger a DNS lookup to resolve the host part
From c94928e324b633a882783b72c99b4e24a8a23bbb Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Tue, 11 Nov 2025 18:00:01 +0100
Subject: [PATCH 02/16] Add Locale.ROOT as parameter to all occurrences of
String.toLowerCase(), String.toUpperCase() and String.format(...)
---
.../extract/ExtractingResourceFactoryMapper.java | 11 ++++++-----
.../extract/ExtractingResourceProducer.java | 3 ++-
.../archive/extract/RealCDXExtractorOutput.java | 15 ++++++++-------
.../org/archive/extract/ResourceExtractor.java | 7 ++++---
.../org/archive/extract/WATExtractorOutput.java | 3 ++-
.../org/archive/format/gzip/GZIPMemberSeries.java | 9 +++++----
.../java/org/archive/format/http/HttpHeader.java | 3 ++-
.../java/org/archive/format/http/HttpHeaders.java | 5 +++--
.../archive/format/http/HttpMessageParser.java | 7 ++++---
.../format/http/HttpRequestMessageParser.java | 3 ++-
.../archive/format/http/HttpResponseMessage.java | 6 ++++--
.../archive/format/json/CrossProductOfLists.java | 7 ++++---
.../java/org/archive/format/json/JSONView.java | 3 ++-
.../format/text/charset/CharsetDetector.java | 7 ++++---
.../org/archive/format/text/html/NodeUtils.java | 10 ++++++----
.../org/archive/hadoop/ArchiveMetadataLoader.java | 3 ++-
.../org/archive/hadoop/ResourceRecordReader.java | 5 +++--
src/main/java/org/archive/io/ArchiveReader.java | 5 +++--
.../java/org/archive/io/ArchiveReaderFactory.java | 5 +++--
src/main/java/org/archive/io/ArchiveRecord.java | 3 ++-
.../org/archive/io/HeaderedArchiveRecord.java | 5 +++--
src/main/java/org/archive/io/arc/ARCReader.java | 3 ++-
.../java/org/archive/io/arc/ARCReaderFactory.java | 9 +++++----
src/main/java/org/archive/io/arc/ARCRecord.java | 3 ++-
src/main/java/org/archive/io/arc/ARCUtils.java | 5 +++--
src/main/java/org/archive/io/warc/WARCReader.java | 5 +++--
.../org/archive/io/warc/WARCReaderFactory.java | 7 ++++---
src/main/java/org/archive/net/PublicSuffixes.java | 3 ++-
.../resource/generic/GenericResourceProducer.java | 3 ++-
.../resource/gzip/GZIPResourceContainer.java | 3 ++-
.../org/archive/resource/warc/WARCResource.java | 3 ++-
.../org/archive/streamcontext/HTTP11Stream.java | 3 ++-
.../org/archive/url/BasicURLCanonicalizer.java | 9 +++++----
src/main/java/org/archive/url/HandyURL.java | 3 ++-
.../java/org/archive/url/IAURLCanonicalizer.java | 11 ++++++-----
src/main/java/org/archive/url/LaxURI.java | 3 ++-
src/main/java/org/archive/url/URI.java | 4 ++--
.../java/org/archive/url/URLRegexTransformer.java | 3 ++-
.../java/org/archive/url/UsableURIFactory.java | 5 +++--
src/main/java/org/archive/util/ArchiveUtils.java | 6 +++---
src/main/java/org/archive/util/FileNameSpec.java | 3 ++-
src/main/java/org/archive/util/FileUtils.java | 9 +++++----
src/main/java/org/archive/util/Recorder.java | 5 +++--
src/main/java/org/archive/util/SurtPrefixSet.java | 5 +++--
.../archive/util/binsearch/SortedTextFile.java | 7 ++++---
45 files changed, 145 insertions(+), 100 deletions(-)
diff --git a/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java b/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java
index 0afe16fb..567b1cd8 100644
--- a/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java
+++ b/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java
@@ -1,6 +1,7 @@
package org.archive.extract;
import java.util.Iterator;
+import java.util.Locale;
import java.util.logging.Logger;
import org.archive.format.arc.ARCConstants;
@@ -68,14 +69,14 @@ private boolean childFieldStartsWith(MetaData m, String child,
String key, String search) {
String val = getChildField(m,child,key);
return val == null ? false :
- val.toLowerCase().startsWith(search.toLowerCase());
+ val.toLowerCase(Locale.ROOT).startsWith(search.toLowerCase(Locale.ROOT));
}
private boolean childFieldContains(MetaData m, String child,
String key, String search) {
String val = getChildField(m,child,key);
return val == null ? false :
- val.toLowerCase().contains(search.toLowerCase());
+ val.toLowerCase(Locale.ROOT).contains(search.toLowerCase(Locale.ROOT));
}
private boolean childFieldEquals(MetaData m, String child,
@@ -88,7 +89,7 @@ private boolean childFieldEquals(MetaData m, String child,
private String caseInsensitiveKeyScan(MetaData m, String child, String k) {
try {
if(m.has(child)) {
- String kLC = k.toLowerCase();
+ String kLC = k.toLowerCase(Locale.ROOT);
JSONObject childJSObj = m.getJSONObject(child);
@SuppressWarnings("rawtypes")
Iterator i = childJSObj.keys();
@@ -96,7 +97,7 @@ private String caseInsensitiveKeyScan(MetaData m, String child, String k) {
Object kObj = i.next();
if(kObj instanceof String) {
String kString = (String) kObj;
- if(kString.toLowerCase().equals(kLC)) {
+ if(kString.toLowerCase(Locale.ROOT).equals(kLC)) {
return childJSObj.getString(kString);
}
}
@@ -128,7 +129,7 @@ private boolean isHTTPARCResource(MetaData envelope) {
private boolean isHTMLHttpResource(MetaData m) {
String type = caseInsensitiveKeyScan(m,HTTP_HEADERS_LIST,
"Content-Type");
- return type == null ? false : type.toLowerCase().contains("html");
+ return type == null ? false : type.toLowerCase(Locale.ROOT).contains("html");
}
private boolean isWARCType(MetaData envelope, WARCRecordType type) {
diff --git a/src/main/java/org/archive/extract/ExtractingResourceProducer.java b/src/main/java/org/archive/extract/ExtractingResourceProducer.java
index de671bee..07cdb88a 100644
--- a/src/main/java/org/archive/extract/ExtractingResourceProducer.java
+++ b/src/main/java/org/archive/extract/ExtractingResourceProducer.java
@@ -1,6 +1,7 @@
package org.archive.extract;
import java.io.IOException;
+import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
@@ -33,7 +34,7 @@ public Resource getNext() throws ResourceParseException, IOException {
return current;
}
if(LOG.isLoggable(Level.INFO)) {
- LOG.info(String.format("Extracting (%s) with (%s)\n",
+ LOG.info(String.format(Locale.ROOT, "Extracting (%s) with (%s)\n",
current.getClass().toString(),
f.getClass().toString()));
}
diff --git a/src/main/java/org/archive/extract/RealCDXExtractorOutput.java b/src/main/java/org/archive/extract/RealCDXExtractorOutput.java
index e6f6e82f..b8f06034 100644
--- a/src/main/java/org/archive/extract/RealCDXExtractorOutput.java
+++ b/src/main/java/org/archive/extract/RealCDXExtractorOutput.java
@@ -8,6 +8,7 @@
import java.net.URISyntaxException;
import java.net.URL;
import java.util.List;
+import java.util.Locale;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -131,7 +132,7 @@ public void output(Resource resource) throws IOException {
} else {
meta = "-";
}
- if(mime.toLowerCase().contains("html")) {
+ if(mime.toLowerCase(Locale.ROOT).contains("html")) {
if(redir.equals("-")) {
// maybe an obvious meta-refresh?
redir = extractHTMLMetaRefresh(origUrl,m);
@@ -202,7 +203,7 @@ public void output(Resource resource) throws IOException {
} else {
meta = "-";
}
- if(mime.toLowerCase().contains("html")) {
+ if(mime.toLowerCase(Locale.ROOT).contains("html")) {
if(redir.equals("-")) {
// maybe an obvious meta-refresh?
redir = extractHTMLMetaRefresh(origUrl,m);
@@ -269,7 +270,7 @@ private String extractHTMLRobots(MetaData m) {
if(meta != null) {
String name = scanHeadersLC(meta, "name", null);
if(name != null) {
- if(name.toLowerCase().equals("robots")) {
+ if(name.toLowerCase(Locale.ROOT).equals("robots")) {
// alright - some robot instructions:
String content = scanHeadersLC(meta, "content", null);
if(content != null) {
@@ -291,7 +292,7 @@ private String extractHTMLMetaRefresh(String origUrl, MetaData m) {
if(meta != null) {
String name = scanHeadersLC(meta, "http-equiv", null);
if(name != null) {
- if(name.toLowerCase().equals("refresh")) {
+ if(name.toLowerCase(Locale.ROOT).equals("refresh")) {
// alright - some robot instructions:
String content = scanHeadersLC(meta, "content", null);
if(content != null) {
@@ -330,7 +331,7 @@ private String scanHeadersLC(JSONObject o, String match, String defaultVal) {
if(o.length() == 0) {
return defaultVal;
}
- String lc = match.toLowerCase().trim();
+ String lc = match.toLowerCase(Locale.ROOT).trim();
// try {
// System.err.println("REC:" + o.toString(1));
// } catch (JSONException e1) {
@@ -338,7 +339,7 @@ private String scanHeadersLC(JSONObject o, String match, String defaultVal) {
// e1.printStackTrace();
// }
for(String key : JSONObject.getNames(o)) {
- if(lc.equals(key.toLowerCase().trim())) {
+ if(lc.equals(key.toLowerCase(Locale.ROOT).trim())) {
try {
return o.getString(key).trim();
} catch (JSONException e) {
@@ -472,7 +473,7 @@ private String parseRobotInstructions(String input) {
if(input == null) {
return "-";
}
- String up = input.replaceAll("-", "").toUpperCase();
+ String up = input.replaceAll("-", "").toUpperCase(Locale.ROOT);
StringBuilder sb = new StringBuilder(3);
if(up.contains(NO_FOLLOW_MATCH)) {
sb.append("F");
diff --git a/src/main/java/org/archive/extract/ResourceExtractor.java b/src/main/java/org/archive/extract/ResourceExtractor.java
index 2812aa5b..a6fa0a00 100644
--- a/src/main/java/org/archive/extract/ResourceExtractor.java
+++ b/src/main/java/org/archive/extract/ResourceExtractor.java
@@ -8,6 +8,7 @@
import java.io.PrintWriter;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
+import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
@@ -138,7 +139,7 @@ public int run(String[] args)
out.output(r);
} catch(GZIPFormatException e) {
- LOG.severe(String.format("%s: %s",exProducer.getContext(),e.getMessage()));
+ LOG.severe(String.format(Locale.ROOT, "%s: %s",exProducer.getContext(),e.getMessage()));
//Log is not coming out for some damn reason....needs to be studied
System.err.format("%s: %s",exProducer.getContext(),e.getMessage());
@@ -147,7 +148,7 @@ public int run(String[] args)
}
e.printStackTrace();
} catch(ResourceParseException e) {
- LOG.severe(String.format("%s: %s",exProducer.getContext(),e.getMessage()));
+ LOG.severe(String.format(Locale.ROOT, "%s: %s",exProducer.getContext(),e.getMessage()));
//Log is not coming out for some damn reason....needs to be studied
System.err.format("%s: %s",exProducer.getContext(),e.getMessage());
@@ -157,7 +158,7 @@ public int run(String[] args)
e.printStackTrace();
} catch(RecoverableRecordFormatException e) {
// this should not get here - ResourceFactory et al should wrap as ResourceParseExceptions...
- LOG.severe(String.format("RECOVERABLE - %s: %s",exProducer.getContext(),e.getMessage()));
+ LOG.severe(String.format(Locale.ROOT, "RECOVERABLE - %s: %s",exProducer.getContext(),e.getMessage()));
//Log is not coming out for some damn reason....needs to be studied
System.err.format("%s: %s",exProducer.getContext(),e.getMessage());
diff --git a/src/main/java/org/archive/extract/WATExtractorOutput.java b/src/main/java/org/archive/extract/WATExtractorOutput.java
index 4b5f72ed..dbe979e5 100644
--- a/src/main/java/org/archive/extract/WATExtractorOutput.java
+++ b/src/main/java/org/archive/extract/WATExtractorOutput.java
@@ -10,6 +10,7 @@
import java.text.ParseException;
import java.net.UnknownHostException;
import java.util.Date;
+import java.util.Locale;
import org.archive.format.gzip.GZIPMemberWriter;
import org.archive.format.gzip.GZIPMemberWriterCommittedOutputStream;
@@ -143,7 +144,7 @@ private void writeARC(OutputStream recOut, MetaData md) throws IOException {
String capDateString = extractOrIO(md, "Envelope.ARC-Header-Metadata.Date");
String filename = extractOrIO(md, "Container.Filename");
String offset = extractOrIO(md, "Container.Offset");
- String recId = String.format("",filename,offset);
+ String recId = String.format(Locale.ROOT, "",filename,offset);
writeWARCMDRecord(recOut,md,targetURI,capDateString,recId);
}
diff --git a/src/main/java/org/archive/format/gzip/GZIPMemberSeries.java b/src/main/java/org/archive/format/gzip/GZIPMemberSeries.java
index d70bf394..154cf5f1 100644
--- a/src/main/java/org/archive/format/gzip/GZIPMemberSeries.java
+++ b/src/main/java/org/archive/format/gzip/GZIPMemberSeries.java
@@ -2,6 +2,7 @@
import java.io.IOException;
import java.io.InputStream;
+import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.zip.Inflater;
@@ -227,7 +228,7 @@ public GZIPSeriesMember getNextMember() throws GZIPFormatException, IOException
}
if(LOG.isLoggable(Level.INFO)) {
- LOG.info(String.format(
+ LOG.info(String.format(Locale.ROOT,
"Got EOF after %d bytes before finding magic in %s\n",
amtSkipped * -1, streamContext));
}
@@ -237,7 +238,7 @@ public GZIPSeriesMember getNextMember() throws GZIPFormatException, IOException
if(amtSkipped > 0) {
if(strict) {
if(state == STATE_START) {
- LOG.info(String.format(
+ LOG.info(String.format(Locale.ROOT,
"Strict mode Skipped %d bytes in (%s) before finding magic at offset(%d)\n",
amtSkipped, streamContext, offset-3));
} else {
@@ -248,7 +249,7 @@ public GZIPSeriesMember getNextMember() throws GZIPFormatException, IOException
}
if(LOG.isLoggable(Level.INFO)) {
- LOG.info(String.format(
+ LOG.info(String.format(Locale.ROOT,
"Skipped %d bytes in (%s) before finding magic at offset(%d)\n",
amtSkipped, streamContext, offset-3));
}
@@ -268,7 +269,7 @@ public GZIPSeriesMember getNextMember() throws GZIPFormatException, IOException
}
offset = currentMemberStartOffset + 3;
stream.setOffset(currentMemberStartOffset + 3);
- LOG.warning(String.format(
+ LOG.warning(String.format(Locale.ROOT,
"GZIPFormatException with record around offset(%d) in (%s)\n",
offset, streamContext));
}
diff --git a/src/main/java/org/archive/format/http/HttpHeader.java b/src/main/java/org/archive/format/http/HttpHeader.java
index 57b70e1f..9ebe860f 100755
--- a/src/main/java/org/archive/format/http/HttpHeader.java
+++ b/src/main/java/org/archive/format/http/HttpHeader.java
@@ -2,6 +2,7 @@
import java.io.IOException;
import java.io.OutputStream;
+import java.util.Locale;
public class HttpHeader implements HttpConstants {
private String name = null;
@@ -27,7 +28,7 @@ public void write(OutputStream out) throws IOException {
public String toString() {
StringBuilder sb = new StringBuilder(name.length() + value.length()+20);
- sb.append(String.format("HttpHeader(%s)(%s)",name,value));
+ sb.append(String.format(Locale.ROOT, "HttpHeader(%s)(%s)",name,value));
return sb.toString();
}
}
diff --git a/src/main/java/org/archive/format/http/HttpHeaders.java b/src/main/java/org/archive/format/http/HttpHeaders.java
index ed8061d7..a65dd8fb 100755
--- a/src/main/java/org/archive/format/http/HttpHeaders.java
+++ b/src/main/java/org/archive/format/http/HttpHeaders.java
@@ -4,6 +4,7 @@
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Date;
+import java.util.Locale;
import java.util.logging.Logger;
import org.archive.util.ByteOp;
@@ -54,9 +55,9 @@ public String getValue(String name) {
}
public String getValueCaseInsensitive(String name) {
- String lc = name.toLowerCase();
+ String lc = name.toLowerCase(Locale.ROOT);
for(HttpHeader h : this) {
- if(h.getName().toLowerCase().equals(lc)) {
+ if(h.getName().toLowerCase(Locale.ROOT).equals(lc)) {
return h.getValue();
}
}
diff --git a/src/main/java/org/archive/format/http/HttpMessageParser.java b/src/main/java/org/archive/format/http/HttpMessageParser.java
index c4fcdf92..24e59e03 100644
--- a/src/main/java/org/archive/format/http/HttpMessageParser.java
+++ b/src/main/java/org/archive/format/http/HttpMessageParser.java
@@ -1,5 +1,6 @@
package org.archive.format.http;
+import java.util.Locale;
public class HttpMessageParser implements HttpConstants {
@@ -22,11 +23,11 @@ protected int parseVersionLax(byte buf[], int start, int len)
throws HttpParseException {
String v = new String(buf,start,len,UTF8);
- if(v.toLowerCase().compareTo(VERSION_0_STATUS.toLowerCase()) == 0) {
+ if(v.toLowerCase(Locale.ROOT).compareTo(VERSION_0_STATUS.toLowerCase(Locale.ROOT)) == 0) {
return VERSION_0;
- } else if(v.toLowerCase().compareTo(VERSION_1_STATUS.toLowerCase()) == 0) {
+ } else if(v.toLowerCase(Locale.ROOT).compareTo(VERSION_1_STATUS.toLowerCase(Locale.ROOT)) == 0) {
return VERSION_1;
- } else if(v.toLowerCase().compareTo(VERSION_9_STATUS.toLowerCase()) == 0) {
+ } else if(v.toLowerCase(Locale.ROOT).compareTo(VERSION_9_STATUS.toLowerCase(Locale.ROOT)) == 0) {
return VERSION_9;
}
return VERSION_0;
diff --git a/src/main/java/org/archive/format/http/HttpRequestMessageParser.java b/src/main/java/org/archive/format/http/HttpRequestMessageParser.java
index f7bc43c7..759bbe5d 100644
--- a/src/main/java/org/archive/format/http/HttpRequestMessageParser.java
+++ b/src/main/java/org/archive/format/http/HttpRequestMessageParser.java
@@ -2,6 +2,7 @@
import java.io.IOException;
import java.io.InputStream;
+import java.util.Locale;
public class HttpRequestMessageParser extends HttpMessageParser {
public int maxBytes = 1024 * 1024;
@@ -223,7 +224,7 @@ protected int parseMethodStrict(byte buf[], int start, int len)
protected int parseMethodLax(byte buf[], int start, int len)
throws HttpParseException {
- String v = new String(buf,start,len,UTF8).toUpperCase();
+ String v = new String(buf,start,len,UTF8).toUpperCase(Locale.ROOT);
if(v.compareTo(METHOD_GET_STRING) == 0) {
return METHOD_GET;
} else if(v.compareTo(METHOD_HEAD_STRING) == 0) {
diff --git a/src/main/java/org/archive/format/http/HttpResponseMessage.java b/src/main/java/org/archive/format/http/HttpResponseMessage.java
index 0cb7b7e5..6d3f5c35 100755
--- a/src/main/java/org/archive/format/http/HttpResponseMessage.java
+++ b/src/main/java/org/archive/format/http/HttpResponseMessage.java
@@ -1,5 +1,7 @@
package org.archive.format.http;
+import java.util.Locale;
+
public class HttpResponseMessage extends HttpMessage implements HttpResponseMessageObserver {
private int status = 0;
private String reason = null;
@@ -20,10 +22,10 @@ public String getReason() {
return reason;
}
public String toString() {
- return String.format("%s %d %s%s", getVersionString(), status, reason, CRLF);
+ return String.format(Locale.ROOT, "%s %d %s%s", getVersionString(), status, reason, CRLF);
}
public String toDebugString() {
- return String.format("Message(%d):(%s) (%d) (%s)\n",
+ return String.format(Locale.ROOT, "Message(%d):(%s) (%d) (%s)\n",
reason.length(),getVersionString(),status,reason,CRLF);
}
diff --git a/src/main/java/org/archive/format/json/CrossProductOfLists.java b/src/main/java/org/archive/format/json/CrossProductOfLists.java
index f9e2abd2..69cdae33 100644
--- a/src/main/java/org/archive/format/json/CrossProductOfLists.java
+++ b/src/main/java/org/archive/format/json/CrossProductOfLists.java
@@ -4,6 +4,7 @@
import java.util.ArrayList;
import java.util.Deque;
import java.util.List;
+import java.util.Locale;
import java.util.Stack;
import java.util.logging.Level;
import java.util.logging.Logger;
@@ -18,12 +19,12 @@ public List> crossProduct(List>> listOfLists) {
if(LOG.isLoggable(Level.INFO)) {
int count = listOfLists.size();
- LOG.info(String.format("Total of (%d) lists to cross product",count));
+ LOG.info(String.format(Locale.ROOT, "Total of (%d) lists to cross product",count));
for(int i = 0; i < count; i++) {
- LOG.info(String.format("Field (%d) is (%d) deep",i,listOfLists.get(i).size()));
+ LOG.info(String.format(Locale.ROOT, "Field (%d) is (%d) deep",i,listOfLists.get(i).size()));
for(List inner : listOfLists.get(i)) {
LOG.info(
- String.format("----(%d):(%s)"
+ String.format(Locale.ROOT, "----(%d):(%s)"
,i,StringUtils.join(inner.toArray(),",") ) );
}
}
diff --git a/src/main/java/org/archive/format/json/JSONView.java b/src/main/java/org/archive/format/json/JSONView.java
index 7a984ebe..444ea7e6 100644
--- a/src/main/java/org/archive/format/json/JSONView.java
+++ b/src/main/java/org/archive/format/json/JSONView.java
@@ -2,6 +2,7 @@
import java.util.ArrayList;
import java.util.List;
+import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
@@ -28,7 +29,7 @@ public class JSONView {
public JSONView(String... pathSpecs) {
this.pathSpecs = new ArrayList(pathSpecs.length);
if(LOG.isLoggable(Level.INFO)) {
- LOG.info(String.format("Creating JSONView with(%s)",
+ LOG.info(String.format(Locale.ROOT, "Creating JSONView with(%s)",
StringUtils.join(pathSpecs,",")));
}
for(String pathSpec : pathSpecs) {
diff --git a/src/main/java/org/archive/format/text/charset/CharsetDetector.java b/src/main/java/org/archive/format/text/charset/CharsetDetector.java
index 214fde07..49286764 100644
--- a/src/main/java/org/archive/format/text/charset/CharsetDetector.java
+++ b/src/main/java/org/archive/format/text/charset/CharsetDetector.java
@@ -22,6 +22,7 @@
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
+import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -106,7 +107,7 @@ protected boolean isCharsetSupported(String charsetName) {
}
}
protected String mapCharset(String orig) {
- String lc = orig.toLowerCase();
+ String lc = orig.toLowerCase(Locale.ROOT);
if(lc.contains("iso8859-1") || lc.contains("iso-8859-1")) {
return "cp1252";
}
@@ -114,7 +115,7 @@ protected String mapCharset(String orig) {
}
protected String contentTypeToCharset(final String contentType) {
int offset =
- contentType.toUpperCase().indexOf(CHARSET_TOKEN.toUpperCase());
+ contentType.toUpperCase(Locale.ROOT).indexOf(CHARSET_TOKEN.toUpperCase(Locale.ROOT));
if (offset != -1) {
String cs = contentType.substring(offset + CHARSET_TOKEN.length());
@@ -148,7 +149,7 @@ protected String getCharsetFromHeaders(HttpHeaders headers)
return null;
}
for(HttpHeader header : headers) {
- if(header.getName().toUpperCase().trim().equals(
+ if(header.getName().toUpperCase(Locale.ROOT).trim().equals(
HTTP_CONTENT_TYPE_HEADER)) {
return contentTypeToCharset(header.getValue());
}
diff --git a/src/main/java/org/archive/format/text/html/NodeUtils.java b/src/main/java/org/archive/format/text/html/NodeUtils.java
index 625d9099..f231b91a 100644
--- a/src/main/java/org/archive/format/text/html/NodeUtils.java
+++ b/src/main/java/org/archive/format/text/html/NodeUtils.java
@@ -19,6 +19,8 @@
*/
package org.archive.format.text.html;
+import java.util.Locale;
+
import org.htmlparser.Node;
import org.htmlparser.nodes.RemarkNode;
import org.htmlparser.nodes.TagNode;
@@ -41,7 +43,7 @@ public static boolean isTagNodeNamed(Node node, String name) {
if(isTagNode(node)) {
TagNode tagNode = (TagNode) node;
String nodeName = tagNode.getTagName();
- return nodeName.equals(name.toUpperCase());
+ return nodeName.equals(name.toUpperCase(Locale.ROOT));
}
return false;
}
@@ -50,7 +52,7 @@ public static boolean isOpenTagNodeNamed(Node node, String name) {
TagNode tagNode = (TagNode) node;
if(!tagNode.isEndTag()) {
String nodeName = tagNode.getTagName();
- return nodeName.equals(name.toUpperCase());
+ return nodeName.equals(name.toUpperCase(Locale.ROOT));
}
}
return false;
@@ -60,7 +62,7 @@ public static boolean isNonEmptyOpenTagNodeNamed(Node node, String name) {
TagNode tagNode = (TagNode) node;
if(!tagNode.isEndTag() && !tagNode.isEmptyXmlTag()) {
String nodeName = tagNode.getTagName();
- return nodeName.equals(name.toUpperCase());
+ return nodeName.equals(name.toUpperCase(Locale.ROOT));
}
}
return false;
@@ -70,7 +72,7 @@ public static boolean isCloseTagNodeNamed(Node node, String name) {
TagNode tagNode = (TagNode) node;
if(tagNode.isEndTag()) {
String nodeName = tagNode.getTagName();
- return nodeName.equals(name.toUpperCase());
+ return nodeName.equals(name.toUpperCase(Locale.ROOT));
}
}
return false;
diff --git a/src/main/java/org/archive/hadoop/ArchiveMetadataLoader.java b/src/main/java/org/archive/hadoop/ArchiveMetadataLoader.java
index 37c8af99..a3cbb26c 100644
--- a/src/main/java/org/archive/hadoop/ArchiveMetadataLoader.java
+++ b/src/main/java/org/archive/hadoop/ArchiveMetadataLoader.java
@@ -2,6 +2,7 @@
import java.io.IOException;
import java.util.ArrayList;
+import java.util.Locale;
import java.util.logging.Logger;
import org.apache.hadoop.mapreduce.InputFormat;
@@ -54,7 +55,7 @@ public Tuple getNext() throws IOException {
try {
key = reader.getCurrentKey();
- LOG.info(String.format("Loaded key-offset %d\n", key.offset));
+ LOG.info(String.format(Locale.ROOT, "Loaded key-offset %d\n", key.offset));
value = reader.getCurrentValue();
} catch (InterruptedException e) {
// is this needed and the right way?
diff --git a/src/main/java/org/archive/hadoop/ResourceRecordReader.java b/src/main/java/org/archive/hadoop/ResourceRecordReader.java
index 06d3ce2e..88b93dd2 100644
--- a/src/main/java/org/archive/hadoop/ResourceRecordReader.java
+++ b/src/main/java/org/archive/hadoop/ResourceRecordReader.java
@@ -1,6 +1,7 @@
package org.archive.hadoop;
import java.io.IOException;
+import java.util.Locale;
import java.util.logging.Logger;
import org.apache.hadoop.fs.FSDataInputStream;
@@ -111,7 +112,7 @@ public boolean nextKeyValue() throws IOException, InterruptedException {
if(r != null) {
StreamCopy.readToEOF(r.getInputStream());
- LOG.info(String.format("Extracted offset %d\n",
+ LOG.info(String.format(Locale.ROOT, "Extracted offset %d\n",
series.getCurrentMemberStartOffset()));
cachedK = new ResourceContext(name,
series.getCurrentMemberStartOffset());
@@ -121,7 +122,7 @@ public boolean nextKeyValue() throws IOException, InterruptedException {
} catch (ResourceParseException e) {
e.printStackTrace();
throw new IOException(
- String.format("ResourceParseException at(%s)(%d)",
+ String.format(Locale.ROOT, "ResourceParseException at(%s)(%d)",
name,series.getCurrentMemberStartOffset()),
e);
}
diff --git a/src/main/java/org/archive/io/ArchiveReader.java b/src/main/java/org/archive/io/ArchiveReader.java
index 449cdc24..53b8167b 100644
--- a/src/main/java/org/archive/io/ArchiveReader.java
+++ b/src/main/java/org/archive/io/ArchiveReader.java
@@ -32,6 +32,7 @@
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
+import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
@@ -615,7 +616,7 @@ protected static boolean getTrueOrFalse(final String value) {
if (value == null || value.length() <= 0) {
return false;
}
- return Boolean.TRUE.toString().equals(value.toLowerCase());
+ return Boolean.TRUE.toString().equals(value.toLowerCase(Locale.ROOT));
}
/**
@@ -757,4 +758,4 @@ protected static Options getOptions() {
"'or 'nohead'. Default: 'cdx'."));
return options;
}
-}
\ No newline at end of file
+}
diff --git a/src/main/java/org/archive/io/ArchiveReaderFactory.java b/src/main/java/org/archive/io/ArchiveReaderFactory.java
index bc316893..fe72236b 100644
--- a/src/main/java/org/archive/io/ArchiveReaderFactory.java
+++ b/src/main/java/org/archive/io/ArchiveReaderFactory.java
@@ -25,6 +25,7 @@
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
+import java.util.Locale;
import org.archive.io.arc.ARCReaderFactory;
import org.archive.io.warc.WARCReaderFactory;
@@ -296,7 +297,7 @@ protected void addUserAgent(final HttpURLConnection connection) {
* @throws IOException
*/
protected boolean isCompressed(final File f) throws IOException {
- return f.getName().toLowerCase().
+ return f.getName().toLowerCase(Locale.ROOT).
endsWith(DOT_COMPRESSED_FILE_EXTENSION);
}
-}
\ No newline at end of file
+}
diff --git a/src/main/java/org/archive/io/ArchiveRecord.java b/src/main/java/org/archive/io/ArchiveRecord.java
index 4bd1fa02..01e8d5ec 100644
--- a/src/main/java/org/archive/io/ArchiveRecord.java
+++ b/src/main/java/org/archive/io/ArchiveRecord.java
@@ -23,6 +23,7 @@
import java.io.OutputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
+import java.util.Locale;
import java.util.logging.Level;
import org.archive.format.ArchiveFileConstants;
@@ -393,7 +394,7 @@ public boolean hasContentHeaders() {
return false;
}
- if (!url.toLowerCase().startsWith("http")) {
+ if (!url.toLowerCase(Locale.ROOT).startsWith("http")) {
return false;
}
diff --git a/src/main/java/org/archive/io/HeaderedArchiveRecord.java b/src/main/java/org/archive/io/HeaderedArchiveRecord.java
index 809a9e54..70c4fb04 100644
--- a/src/main/java/org/archive/io/HeaderedArchiveRecord.java
+++ b/src/main/java/org/archive/io/HeaderedArchiveRecord.java
@@ -25,6 +25,7 @@
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintStream;
+import java.util.Locale;
import org.archive.format.http.HttpHeader;
import org.archive.format.arc.ARCConstants;
@@ -156,8 +157,8 @@ private InputStream readContentHeaders() throws IOException {
boolean isHttpResponse = statusLine.startsWith("HTTP");
boolean isHttpRequest = false;
if (!isHttpResponse) {
- isHttpRequest = statusLine.toUpperCase().startsWith("GET") ||
- !statusLine.toUpperCase().startsWith("POST");
+ isHttpRequest = statusLine.toUpperCase(Locale.ROOT).startsWith("GET") ||
+ !statusLine.toUpperCase(Locale.ROOT).startsWith("POST");
}
if (!isHttpResponse && !isHttpRequest) {
throw new UnexpectedStartLineIOException("Failed parse of " +
diff --git a/src/main/java/org/archive/io/arc/ARCReader.java b/src/main/java/org/archive/io/arc/ARCReader.java
index c9a88415..ecc742a5 100644
--- a/src/main/java/org/archive/io/arc/ARCReader.java
+++ b/src/main/java/org/archive/io/arc/ARCReader.java
@@ -27,6 +27,7 @@
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
+import java.util.Locale;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Logger;
@@ -493,7 +494,7 @@ public static void main(String [] args)
break;
case 'f':
- format = cmdlineOptions[i].getValue().toLowerCase();
+ format = cmdlineOptions[i].getValue().toLowerCase(Locale.ROOT);
boolean match = false;
// List of supported formats.
final String [] supportedFormats =
diff --git a/src/main/java/org/archive/io/arc/ARCReaderFactory.java b/src/main/java/org/archive/io/arc/ARCReaderFactory.java
index d2f10842..bbcc8b6f 100644
--- a/src/main/java/org/archive/io/arc/ARCReaderFactory.java
+++ b/src/main/java/org/archive/io/arc/ARCReaderFactory.java
@@ -27,6 +27,7 @@
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;
+import java.util.Locale;
import java.util.logging.Level;
import org.archive.io.ArchiveReader;
@@ -230,7 +231,7 @@ public static boolean testCompressedARCFile(File arcFile,
throws IOException {
boolean compressedARCFile = false;
FileUtils.assertReadable(arcFile);
- if(!skipSuffixCheck && !arcFile.getName().toLowerCase()
+ if(!skipSuffixCheck && !arcFile.getName().toLowerCase(Locale.ROOT)
.endsWith(COMPRESSED_ARC_FILE_EXTENSION)) {
return compressedARCFile;
}
@@ -247,9 +248,9 @@ public static boolean testCompressedARCFile(File arcFile,
public static boolean isARCSuffix(final String arcName) {
return (arcName == null)?
false:
- (arcName.toLowerCase().endsWith(DOT_COMPRESSED_ARC_FILE_EXTENSION))?
+ (arcName.toLowerCase(Locale.ROOT).endsWith(DOT_COMPRESSED_ARC_FILE_EXTENSION))?
true:
- (arcName.toLowerCase().endsWith(DOT_ARC_FILE_EXTENSION))?
+ (arcName.toLowerCase(Locale.ROOT).endsWith(DOT_ARC_FILE_EXTENSION))?
true: false;
}
@@ -452,4 +453,4 @@ protected void gotoEOR(ArchiveRecord rec) throws IOException {
logStdErr(Level.WARNING, message);
}
}
-}
\ No newline at end of file
+}
diff --git a/src/main/java/org/archive/io/arc/ARCRecord.java b/src/main/java/org/archive/io/arc/ARCRecord.java
index 0815c18a..14e80728 100644
--- a/src/main/java/org/archive/io/arc/ARCRecord.java
+++ b/src/main/java/org/archive/io/arc/ARCRecord.java
@@ -27,6 +27,7 @@
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
+import java.util.Locale;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
@@ -376,7 +377,7 @@ private ARCRecordMetaData computeMetaData(List keys,
if (keys.size() != values.size()) {
// Early ARCs had a space in mimetype.
if (values.size() == (keys.size() + 1) &&
- values.get(4).toLowerCase().startsWith("charset=")) {
+ values.get(4).toLowerCase(Locale.ROOT).startsWith("charset=")) {
List nuvalues =
new ArrayList(keys.size());
nuvalues.add(0, values.get(0));
diff --git a/src/main/java/org/archive/io/arc/ARCUtils.java b/src/main/java/org/archive/io/arc/ARCUtils.java
index 5bcb4cc3..05c15abb 100644
--- a/src/main/java/org/archive/io/arc/ARCUtils.java
+++ b/src/main/java/org/archive/io/arc/ARCUtils.java
@@ -27,6 +27,7 @@
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
+import java.util.Locale;
import org.archive.url.UsableURI;
import org.archive.util.zip.GzipHeader;
@@ -94,7 +95,7 @@ public static boolean testCompressedARCFile(File arcFile,
throws IOException {
boolean compressedARCFile = false;
isReadable(arcFile);
- if(!skipSuffixCheck && !arcFile.getName().toLowerCase()
+ if(!skipSuffixCheck && !arcFile.getName().toLowerCase(Locale.ROOT)
.endsWith(COMPRESSED_ARC_FILE_EXTENSION)) {
return compressedARCFile;
}
@@ -197,7 +198,7 @@ public static boolean testUncompressedARCFile(File arcFile)
throws IOException {
boolean uncompressedARCFile = false;
isReadable(arcFile);
- if(arcFile.getName().toLowerCase().endsWith(ARC_FILE_EXTENSION)) {
+ if(arcFile.getName().toLowerCase(Locale.ROOT).endsWith(ARC_FILE_EXTENSION)) {
FileInputStream fis = new FileInputStream(arcFile);
try {
byte [] b = new byte[ARC_MAGIC_NUMBER.length()];
diff --git a/src/main/java/org/archive/io/warc/WARCReader.java b/src/main/java/org/archive/io/warc/WARCReader.java
index d33874a3..02756cb1 100644
--- a/src/main/java/org/archive/io/warc/WARCReader.java
+++ b/src/main/java/org/archive/io/warc/WARCReader.java
@@ -24,6 +24,7 @@
import java.io.InputStream;
import java.util.Iterator;
import java.util.List;
+import java.util.Locale;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
@@ -233,7 +234,7 @@ public static void main(String [] args)
break;
case 'f':
- format = cmdlineOptions[i].getValue().toLowerCase();
+ format = cmdlineOptions[i].getValue().toLowerCase(Locale.ROOT);
boolean match = false;
// List of supported formats.
final String [] supportedFormats =
@@ -286,4 +287,4 @@ public static void main(String [] args)
}
}
}
-}
\ No newline at end of file
+}
diff --git a/src/main/java/org/archive/io/warc/WARCReaderFactory.java b/src/main/java/org/archive/io/warc/WARCReaderFactory.java
index 881da869..70b80340 100644
--- a/src/main/java/org/archive/io/warc/WARCReaderFactory.java
+++ b/src/main/java/org/archive/io/warc/WARCReaderFactory.java
@@ -26,6 +26,7 @@
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;
+import java.util.Locale;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveReaderFactory;
@@ -307,9 +308,9 @@ protected void gotoEOR(ArchiveRecord rec) throws IOException {
public static boolean isWARCSuffix(final String f) {
return (f == null)?
false:
- (f.toLowerCase().endsWith(DOT_COMPRESSED_WARC_FILE_EXTENSION))?
+ (f.toLowerCase(Locale.ROOT).endsWith(DOT_COMPRESSED_WARC_FILE_EXTENSION))?
true:
- (f.toLowerCase().endsWith(DOT_WARC_FILE_EXTENSION))?
+ (f.toLowerCase(Locale.ROOT).endsWith(DOT_WARC_FILE_EXTENSION))?
true: false;
}
-}
\ No newline at end of file
+}
diff --git a/src/main/java/org/archive/net/PublicSuffixes.java b/src/main/java/org/archive/net/PublicSuffixes.java
index e436b8dc..a2a2bfb2 100644
--- a/src/main/java/org/archive/net/PublicSuffixes.java
+++ b/src/main/java/org/archive/net/PublicSuffixes.java
@@ -31,6 +31,7 @@
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
+import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -231,7 +232,7 @@ protected static Node readPublishedFileToSurtTrie(BufferedReader reader) throws
// discard utf8 notation after entry
line = line.split("\\s+")[0];
// TODO: maybe we don't need to create lower-cased String
- line = line.toLowerCase();
+ line = line.toLowerCase(Locale.ROOT);
// SURT-order domain segments
String[] segs = line.split("\\.");
StringBuilder sb = new StringBuilder();
diff --git a/src/main/java/org/archive/resource/generic/GenericResourceProducer.java b/src/main/java/org/archive/resource/generic/GenericResourceProducer.java
index 812a3f0d..b111dc1e 100644
--- a/src/main/java/org/archive/resource/generic/GenericResourceProducer.java
+++ b/src/main/java/org/archive/resource/generic/GenericResourceProducer.java
@@ -1,6 +1,7 @@
package org.archive.resource.generic;
import java.io.IOException;
+import java.util.Locale;
import org.archive.resource.MetaData;
import org.archive.resource.Resource;
@@ -45,6 +46,6 @@ public void close() throws IOException {
stream.close();
}
public String getContext() {
- return String.format("Context(%s)(%d)", name, stream.getOffset());
+ return String.format(Locale.ROOT, "Context(%s)(%d)", name, stream.getOffset());
}
}
diff --git a/src/main/java/org/archive/resource/gzip/GZIPResourceContainer.java b/src/main/java/org/archive/resource/gzip/GZIPResourceContainer.java
index 39611ab8..5267a0f9 100644
--- a/src/main/java/org/archive/resource/gzip/GZIPResourceContainer.java
+++ b/src/main/java/org/archive/resource/gzip/GZIPResourceContainer.java
@@ -1,6 +1,7 @@
package org.archive.resource.gzip;
import java.io.IOException;
+import java.util.Locale;
import org.archive.format.gzip.GZIPMemberSeries;
import org.archive.format.gzip.GZIPSeriesMember;
@@ -54,6 +55,6 @@ public void close() throws IOException {
series.close();
}
public String getContext() {
- return String.format("Context(%s)(%d)", series.getStreamContext(), series.getCurrentMemberStartOffset());
+ return String.format(Locale.ROOT, "Context(%s)(%d)", series.getStreamContext(), series.getCurrentMemberStartOffset());
}
}
diff --git a/src/main/java/org/archive/resource/warc/WARCResource.java b/src/main/java/org/archive/resource/warc/WARCResource.java
index a9c3fcc3..a5e5ac35 100644
--- a/src/main/java/org/archive/resource/warc/WARCResource.java
+++ b/src/main/java/org/archive/resource/warc/WARCResource.java
@@ -5,6 +5,7 @@
import java.security.DigestInputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
+import java.util.Locale;
import org.archive.format.http.HttpHeader;
import org.archive.format.http.HttpResponse;
@@ -43,7 +44,7 @@ public WARCResource(MetaData metaData, ResourceContainer container,
String name = h.getName();
String value = h.getValue();
fields.putString(name,value);
- if(name.toLowerCase().equals("content-length")) {
+ if(name.toLowerCase(Locale.ROOT).equals("content-length")) {
// TODO: catch formatexception
length = Long.parseLong(value);
}
diff --git a/src/main/java/org/archive/streamcontext/HTTP11Stream.java b/src/main/java/org/archive/streamcontext/HTTP11Stream.java
index 06f51409..995dc53e 100755
--- a/src/main/java/org/archive/streamcontext/HTTP11Stream.java
+++ b/src/main/java/org/archive/streamcontext/HTTP11Stream.java
@@ -5,6 +5,7 @@
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
+import java.util.Locale;
public class HTTP11Stream extends AbstractBufferingStream {
private URL url;
@@ -42,7 +43,7 @@ public int doRead(byte[] b, int off, int len) throws IOException {
public void doSeek(long offset) throws IOException {
doClose();
conn = url.openConnection();
- conn.setRequestProperty("Range", String.format("bytes=%d-", offset));
+ conn.setRequestProperty("Range", String.format(Locale.ROOT, "bytes=%d-", offset));
conn.connect();
is = conn.getInputStream();
}
diff --git a/src/main/java/org/archive/url/BasicURLCanonicalizer.java b/src/main/java/org/archive/url/BasicURLCanonicalizer.java
index 37b448c1..632d1ea7 100644
--- a/src/main/java/org/archive/url/BasicURLCanonicalizer.java
+++ b/src/main/java/org/archive/url/BasicURLCanonicalizer.java
@@ -7,6 +7,7 @@
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.util.ArrayList;
+import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -64,7 +65,7 @@ public void canonicalize(HandyURL url) {
if (ip != null) {
host = ip;
} else if (host != null) {
- host = escapeOnce(host.toLowerCase());
+ host = escapeOnce(host.toLowerCase(Locale.ROOT));
}
url.setHost(host);
// now the path:
@@ -159,7 +160,7 @@ public String attemptIPFormats(String host) { // throws URIException {
}
ip[i] = octet;
}
- return String.format("%d.%d.%d.%d", ip[0], ip[1], ip[2], ip[3]);
+ return String.format(Locale.ROOT, "%d.%d.%d.%d", ip[0], ip[1], ip[2], ip[3]);
} else {
Matcher m2 = DECIMAL_IP.matcher(host);
if (m2.matches()) {
@@ -190,7 +191,7 @@ public String attemptIPFormats(String host) { // throws URIException {
}
ip[i] = octet;
}
- return String.format("%d.%d.%d.%d", ip[0], ip[1], ip[2],
+ return String.format(Locale.ROOT, "%d.%d.%d.%d", ip[0], ip[1], ip[2],
ip[3]);
}
@@ -261,7 +262,7 @@ public String escapeOnce(String input) {
}
sb.append("%");
- String hex = Integer.toHexString(b).toUpperCase();
+ String hex = Integer.toHexString(b).toUpperCase(Locale.ROOT);
if (hex.length() == 1) {
sb.append('0');
}
diff --git a/src/main/java/org/archive/url/HandyURL.java b/src/main/java/org/archive/url/HandyURL.java
index 91539b3f..0c2c81f7 100644
--- a/src/main/java/org/archive/url/HandyURL.java
+++ b/src/main/java/org/archive/url/HandyURL.java
@@ -2,6 +2,7 @@
import java.net.MalformedURLException;
import java.net.URL;
+import java.util.Locale;
public class HandyURL {
public final static int DEFAULT_PORT = -1;
@@ -277,7 +278,7 @@ public void setOpaque(String opaque) {
}
public String toDebugString() {
- return String.format("Scheme(%s) UserName(%s) UserPass(%s) Host(%s) port(%d) Path(%s) Query(%s) Frag(%s)",
+ return String.format(Locale.ROOT, "Scheme(%s) UserName(%s) UserPass(%s) Host(%s) port(%d) Path(%s) Query(%s) Frag(%s)",
scheme, authUser, authPass, host, port, path, query, hash);
}
diff --git a/src/main/java/org/archive/url/IAURLCanonicalizer.java b/src/main/java/org/archive/url/IAURLCanonicalizer.java
index 0cf7c8a4..e964cd00 100644
--- a/src/main/java/org/archive/url/IAURLCanonicalizer.java
+++ b/src/main/java/org/archive/url/IAURLCanonicalizer.java
@@ -2,6 +2,7 @@
import java.util.Arrays;
import java.util.Comparator;
+import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -20,11 +21,11 @@ public void canonicalize(HandyURL url) {
}
if (rules.isSet(SCHEME_SETTINGS, SCHEME_LOWERCASE)) {
if (url.getScheme() != null) {
- url.setScheme(url.getScheme().toLowerCase());
+ url.setScheme(url.getScheme().toLowerCase(Locale.ROOT));
}
}
if(rules.isSet(HOST_SETTINGS, HOST_LOWERCASE)) {
- url.setHost(url.getHost().toLowerCase());
+ url.setHost(url.getHost().toLowerCase(Locale.ROOT));
}
if(rules.isSet(HOST_SETTINGS, HOST_MASSAGE)) {
url.setHost(massageHost(url.getHost()));
@@ -46,7 +47,7 @@ public void canonicalize(HandyURL url) {
url.setPath(null);
} else {
if(rules.isSet(PATH_SETTINGS, PATH_LOWERCASE)) {
- path = path.toLowerCase();
+ path = path.toLowerCase(Locale.ROOT);
}
if(rules.isSet(PATH_SETTINGS, PATH_STRIP_SESSION_ID)) {
path = URLRegexTransformer.stripPathSessionID(path);
@@ -71,7 +72,7 @@ public void canonicalize(HandyURL url) {
}
// lower-case:
if(rules.isSet(QUERY_SETTINGS, QUERY_LOWERCASE)) {
- query = query.toLowerCase();
+ query = query.toLowerCase(Locale.ROOT);
}
// re-order?
if(rules.isSet(QUERY_SETTINGS, QUERY_ALPHA_REORDER)) {
@@ -155,7 +156,7 @@ public static String massageHost(String host) {
return host;
}
public static int getDefaultPort(String scheme) {
- String lcScheme = scheme.toLowerCase();
+ String lcScheme = scheme.toLowerCase(Locale.ROOT);
if(lcScheme.equals("http")) {
return 80;
} else if(lcScheme.equals("https")) {
diff --git a/src/main/java/org/archive/url/LaxURI.java b/src/main/java/org/archive/url/LaxURI.java
index 57071460..4210c303 100644
--- a/src/main/java/org/archive/url/LaxURI.java
+++ b/src/main/java/org/archive/url/LaxURI.java
@@ -22,6 +22,7 @@
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.BitSet;
+import java.util.Locale;
/**
* URI subclass which allows partial/inconsistent encoding, matching
@@ -321,7 +322,7 @@ protected void parseUriReference(String original, boolean escaped)
*
*/
if (at > 0 && at < length && tmp.charAt(at) == ':') {
- char[] target = tmp.substring(0, at).toLowerCase().toCharArray();
+ char[] target = tmp.substring(0, at).toLowerCase(Locale.ROOT).toCharArray();
if (validate(target, scheme)) {
_scheme = target;
from = ++at;
diff --git a/src/main/java/org/archive/url/URI.java b/src/main/java/org/archive/url/URI.java
index 374e0574..38219556 100644
--- a/src/main/java/org/archive/url/URI.java
+++ b/src/main/java/org/archive/url/URI.java
@@ -261,7 +261,7 @@ public URI(String scheme, String schemeSpecificPart, String fragment)
if (scheme == null) {
throw new URIException(URIException.PARSING, "scheme required");
}
- char[] s = scheme.toLowerCase().toCharArray();
+ char[] s = scheme.toLowerCase(Locale.ROOT).toCharArray();
if (validate(s, URI.scheme)) {
_scheme = s; // is_absoluteURI
} else {
@@ -1954,7 +1954,7 @@ protected void parseUriReference(String original, boolean escaped)
*
*/
if (at > 0 && at < length && tmp.charAt(at) == ':') {
- char[] target = tmp.substring(0, at).toLowerCase().toCharArray();
+ char[] target = tmp.substring(0, at).toLowerCase(Locale.ROOT).toCharArray();
if (validate(target, scheme)) {
_scheme = target;
} else {
diff --git a/src/main/java/org/archive/url/URLRegexTransformer.java b/src/main/java/org/archive/url/URLRegexTransformer.java
index 5f31c81c..182eb218 100644
--- a/src/main/java/org/archive/url/URLRegexTransformer.java
+++ b/src/main/java/org/archive/url/URLRegexTransformer.java
@@ -1,5 +1,6 @@
package org.archive.url;
+import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -27,7 +28,7 @@ public class URLRegexTransformer {
public static String stripOpts(String orig, OptimizedPattern op[]) {
- String origLC = orig.toLowerCase();
+ String origLC = orig.toLowerCase(Locale.ROOT);
StringBuilder sb = null;
int i = 0;
int max = op.length;
diff --git a/src/main/java/org/archive/url/UsableURIFactory.java b/src/main/java/org/archive/url/UsableURIFactory.java
index 08f18999..3038ada5 100644
--- a/src/main/java/org/archive/url/UsableURIFactory.java
+++ b/src/main/java/org/archive/url/UsableURIFactory.java
@@ -23,6 +23,7 @@
import java.io.UnsupportedEncodingException;
import java.util.BitSet;
+import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
@@ -609,7 +610,7 @@ private String fixupDomainlabel(String label)
throw ue;
}
}
- label = label.toLowerCase();
+ label = label.toLowerCase(Locale.ROOT);
return label;
}
@@ -755,6 +756,6 @@ private String checkUriElement(String element) {
*/
private String checkUriElementAndLowerCase(String element) {
String tmp = checkUriElement(element);
- return (tmp != null)? tmp.toLowerCase(): tmp;
+ return (tmp != null)? tmp.toLowerCase(Locale.ROOT): tmp;
}
}
diff --git a/src/main/java/org/archive/util/ArchiveUtils.java b/src/main/java/org/archive/util/ArchiveUtils.java
index 22ba2787..50307b43 100644
--- a/src/main/java/org/archive/util/ArchiveUtils.java
+++ b/src/main/java/org/archive/util/ArchiveUtils.java
@@ -900,7 +900,7 @@ private static String loadVersion() {
if (line.startsWith("#")) {
continue;
}
- TLDS.add(line.trim().toLowerCase());
+ TLDS.add(line.trim().toLowerCase(Locale.ROOT));
}
} catch (Exception e) {
LOGGER.log(Level.SEVERE,"TLD list unavailable",e);
@@ -917,7 +917,7 @@ private static String loadVersion() {
* @return boolean true if recognized as TLD
*/
public static boolean isTld(String dom) {
- return TLDS.contains(dom.toLowerCase());
+ return TLDS.contains(dom.toLowerCase(Locale.ROOT));
}
public static void closeQuietly(Object input) {
@@ -981,7 +981,7 @@ public static int readFully(InputStream input, byte[] buf)
*/
public static BufferedReader getBufferedReader(File source) throws IOException {
InputStream is = new BufferedInputStream(new FileInputStream(source));
- boolean isGzipped = source.getName().toLowerCase().
+ boolean isGzipped = source.getName().toLowerCase(Locale.ROOT).
endsWith(GZIP_SUFFIX);
if(isGzipped) {
is = new GZIPInputStream(is);
diff --git a/src/main/java/org/archive/util/FileNameSpec.java b/src/main/java/org/archive/util/FileNameSpec.java
index a3312cfc..7ace8b59 100644
--- a/src/main/java/org/archive/util/FileNameSpec.java
+++ b/src/main/java/org/archive/util/FileNameSpec.java
@@ -1,5 +1,6 @@
package org.archive.util;
+import java.util.Locale;
import java.util.concurrent.atomic.AtomicInteger;
public class FileNameSpec {
@@ -15,7 +16,7 @@ public FileNameSpec(String prefix, String suffix) {
public String getNextName() {
StringBuilder sb = new StringBuilder();
sb.append(prefix);
- sb.append(String.format("%06d",aInt.incrementAndGet()));
+ sb.append(String.format(Locale.ROOT, "%06d",aInt.incrementAndGet()));
sb.append(suffix);
return sb.toString();
}
diff --git a/src/main/java/org/archive/util/FileUtils.java b/src/main/java/org/archive/util/FileUtils.java
index 70b5ffae..6886e08c 100644
--- a/src/main/java/org/archive/util/FileUtils.java
+++ b/src/main/java/org/archive/util/FileUtils.java
@@ -32,6 +32,7 @@
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
+import java.util.Locale;
import java.util.Properties;
import java.util.logging.Level;
import java.util.logging.Logger;
@@ -219,8 +220,8 @@ protected static void workaroundCopyFile(final File src,
FileFilter prefixFilter = new FileFilter() {
public boolean accept(File pathname)
{
- return pathname.getName().toLowerCase().
- startsWith(prefix.toLowerCase());
+ return pathname.getName().toLowerCase(Locale.ROOT).
+ startsWith(prefix.toLowerCase(Locale.ROOT));
}
};
return dir.listFiles(prefixFilter);
@@ -283,7 +284,7 @@ public static boolean isReadableWithExtensionAndMagic(final File f,
throws IOException {
boolean result = false;
FileUtils.assertReadable(f);
- if(f.getName().toLowerCase().endsWith(uncompressedExtension)) {
+ if(f.getName().toLowerCase(Locale.ROOT).endsWith(uncompressedExtension)) {
FileInputStream fis = new FileInputStream(f);
try {
byte [] b = new byte[magic.length()];
@@ -708,4 +709,4 @@ public static void appendTo(File fileToAppendTo, File fileToAppendFrom) throws I
out.flush();
}
}
-}
\ No newline at end of file
+}
diff --git a/src/main/java/org/archive/util/Recorder.java b/src/main/java/org/archive/util/Recorder.java
index 6a7a53d7..6f9e0117 100644
--- a/src/main/java/org/archive/util/Recorder.java
+++ b/src/main/java/org/archive/util/Recorder.java
@@ -26,6 +26,7 @@
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.util.HashSet;
+import java.util.Locale;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
@@ -338,8 +339,8 @@ public void setInputIsChunked(boolean chunked) {
* @param contentEncoding declared content-encoding of input recording.
*/
public void setContentEncoding(String contentEncoding) {
- String lowerCoding = contentEncoding.toLowerCase();
- if(!SUPPORTED_ENCODINGS.contains(contentEncoding.toLowerCase())) {
+ String lowerCoding = contentEncoding.toLowerCase(Locale.ROOT);
+ if(!SUPPORTED_ENCODINGS.contains(contentEncoding.toLowerCase(Locale.ROOT))) {
throw new IllegalArgumentException("contentEncoding unsupported: "+contentEncoding);
}
this.contentEncoding = lowerCoding;
diff --git a/src/main/java/org/archive/util/SurtPrefixSet.java b/src/main/java/org/archive/util/SurtPrefixSet.java
index 6925cc83..32a34d53 100644
--- a/src/main/java/org/archive/util/SurtPrefixSet.java
+++ b/src/main/java/org/archive/util/SurtPrefixSet.java
@@ -31,6 +31,7 @@
import java.io.PrintStream;
import java.io.Reader;
import java.util.Iterator;
+import java.util.Locale;
import org.archive.url.UsableURI;
import org.archive.util.iterator.LineReadingIterator;
@@ -70,7 +71,7 @@ public void importFrom(Reader r) {
while (iter.hasNext()) {
s = (String) iter.next();
- add(s.toLowerCase());
+ add(s.toLowerCase(Locale.ROOT));
}
}
@@ -145,7 +146,7 @@ public boolean considerAsAddDirective(String suri) {
}
if(u.indexOf("(")>0) {
// formal SURT prefix; toLowerCase just in case
- add(u.toLowerCase());
+ add(u.toLowerCase(Locale.ROOT));
} else {
// hostname/normal form URI from which
// to deduce SURT prefix
diff --git a/src/main/java/org/archive/util/binsearch/SortedTextFile.java b/src/main/java/org/archive/util/binsearch/SortedTextFile.java
index ab8118b7..a4326dc0 100644
--- a/src/main/java/org/archive/util/binsearch/SortedTextFile.java
+++ b/src/main/java/org/archive/util/binsearch/SortedTextFile.java
@@ -2,6 +2,7 @@
import java.io.IOException;
import java.util.Comparator;
+import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
@@ -142,14 +143,14 @@ public long binaryFindOffset(SeekableLineReader slr, final String key, Comparato
if (comparator.compare(key, line) > 0) {
if(LOGGER.isLoggable(Level.FINE)) {
- LOGGER.fine(String.format("Search(%d) (%s)/(%s) : After",
+ LOGGER.fine(String.format(Locale.ROOT, "Search(%d) (%s)/(%s) : After",
mid * blockSize, key,line));
}
min = mid;
} else {
if(LOGGER.isLoggable(Level.FINE)) {
- LOGGER.fine(String.format("Search(%d) (%s)/(%s) : Before",
+ LOGGER.fine(String.format(Locale.ROOT, "Search(%d) (%s)/(%s) : Before",
mid * blockSize, key,line));
}
max = mid;
@@ -391,7 +392,7 @@ private CloseableIterator search(SeekableLineReader slr,
long min = binaryFindOffset(slr, key, comparator);
if (LOGGER.isLoggable(Level.FINE)) {
- LOGGER.fine(String.format("Aligning(%d)",min));
+ LOGGER.fine(String.format(Locale.ROOT, "Aligning(%d)",min));
}
slr.seek(min);
From 72d8a808e7d61173a435cca7ee5a7ae2b24b61d1 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Tue, 11 Nov 2025 18:27:41 +0100
Subject: [PATCH 03/16] Add Locale.ROOT as parameter to all occurrences of
PrintStream.format(...) and number formatters
---
.../java/org/archive/extract/RealCDXExtractorOutput.java | 6 ++++--
src/main/java/org/archive/extract/ResourceExtractor.java | 6 +++---
.../extract/WARCMetadataRecordExtractorOutput.java | 5 +++--
.../java/org/archive/extract/WATExtractorOutput.java | 2 +-
.../archive/format/http/DumpingHTTPParseObserver.java | 5 +++--
.../java/org/archive/io/GenericReplayCharSequence.java | 9 +++++----
src/main/java/org/archive/io/WriterPoolMember.java | 9 ++++++++-
.../java/org/archive/resource/html/HTMLMetaData.java | 3 ++-
8 files changed, 29 insertions(+), 16 deletions(-)
diff --git a/src/main/java/org/archive/extract/RealCDXExtractorOutput.java b/src/main/java/org/archive/extract/RealCDXExtractorOutput.java
index b8f06034..ff0b9e83 100644
--- a/src/main/java/org/archive/extract/RealCDXExtractorOutput.java
+++ b/src/main/java/org/archive/extract/RealCDXExtractorOutput.java
@@ -223,7 +223,8 @@ public void output(Resource resource) throws IOException {
canUrl = keyMaker.makeKey(origUrl);
// URL DATE OURL MIME HTTP-CODE SHA1 META REDIR OFFSET LENGTH FILE
if(dumpJSON) {
- out.format("%s %s %s %s %s %s %s %s %s %s %s %s\n",
+ out.format(Locale.ROOT,
+ "%s %s %s %s %s %s %s %s %s %s %s %s\n",
canUrl,
date,
origUrl,
@@ -237,7 +238,8 @@ public void output(Resource resource) throws IOException {
filename,
m.toString(1));
} else {
- out.format("%s %s %s %s %s %s %s %s %s %s %s\n",
+ out.format(Locale.ROOT,
+ "%s %s %s %s %s %s %s %s %s %s %s\n",
canUrl,
date,
origUrl,
diff --git a/src/main/java/org/archive/extract/ResourceExtractor.java b/src/main/java/org/archive/extract/ResourceExtractor.java
index a6fa0a00..dcbfc122 100644
--- a/src/main/java/org/archive/extract/ResourceExtractor.java
+++ b/src/main/java/org/archive/extract/ResourceExtractor.java
@@ -141,7 +141,7 @@ public int run(String[] args)
} catch(GZIPFormatException e) {
LOG.severe(String.format(Locale.ROOT, "%s: %s",exProducer.getContext(),e.getMessage()));
//Log is not coming out for some damn reason....needs to be studied
- System.err.format("%s: %s",exProducer.getContext(),e.getMessage());
+ System.err.format(Locale.ROOT, "%s: %s",exProducer.getContext(),e.getMessage());
if(ProducerUtils.STRICT_GZ) {
throw e;
@@ -150,7 +150,7 @@ public int run(String[] args)
} catch(ResourceParseException e) {
LOG.severe(String.format(Locale.ROOT, "%s: %s",exProducer.getContext(),e.getMessage()));
//Log is not coming out for some damn reason....needs to be studied
- System.err.format("%s: %s",exProducer.getContext(),e.getMessage());
+ System.err.format(Locale.ROOT, "%s: %s",exProducer.getContext(),e.getMessage());
if(ProducerUtils.STRICT_GZ) {
throw e;
@@ -160,7 +160,7 @@ public int run(String[] args)
// this should not get here - ResourceFactory et al should wrap as ResourceParseExceptions...
LOG.severe(String.format(Locale.ROOT, "RECOVERABLE - %s: %s",exProducer.getContext(),e.getMessage()));
//Log is not coming out for some damn reason....needs to be studied
- System.err.format("%s: %s",exProducer.getContext(),e.getMessage());
+ System.err.format(Locale.ROOT, "%s: %s",exProducer.getContext(),e.getMessage());
e.printStackTrace();
diff --git a/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java b/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java
index 68f9d1c8..426acb02 100644
--- a/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java
+++ b/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java
@@ -7,6 +7,7 @@
import java.net.URISyntaxException;
import java.net.URL;
import java.util.List;
+import java.util.Locale;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -87,7 +88,7 @@ public void output(Resource resource) throws IOException {
String[] linkParts = outLinkValue.split(" ");
if(linkParts.length > 2)
//'outlinks': 'origUrl date origOutlinkUrl linktype linktext'
- out.format("%s\t%s\t%s\t%s\t\n",origUrl,date,linkParts[0],linkParts[2]);
+ out.format(Locale.ROOT,"%s\t%s\t%s\t%s\t\n",origUrl,date,linkParts[0],linkParts[2]);
}
} else if(outputType.equals("hopinfo")) {
String key = obj.get("Name").toString();
@@ -103,7 +104,7 @@ public void output(Resource resource) throws IOException {
}
if(outputType.equals("hopinfo")) {
//'hopinfo': 'origCrawledUrl date origViaUrl hopPathFromVia sourceTag'
- out.format("%s\t%s\t%s\t%s\t%s\n",origUrl,date,viaUrl,viaPath,sourceTag);
+ out.format(Locale.ROOT,"%s\t%s\t%s\t%s\t%s\n",origUrl,date,viaUrl,viaPath,sourceTag);
}
}
}
diff --git a/src/main/java/org/archive/extract/WATExtractorOutput.java b/src/main/java/org/archive/extract/WATExtractorOutput.java
index dbe979e5..79cb0870 100644
--- a/src/main/java/org/archive/extract/WATExtractorOutput.java
+++ b/src/main/java/org/archive/extract/WATExtractorOutput.java
@@ -157,7 +157,7 @@ private void writeWARC(OutputStream recOut, MetaData md) throws IOException {
targetURI = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Target-URI");
}
// handle date of generation in WARC format
- DateFormat dateFormat = new SimpleDateFormat("yyyyMMddHHmmss");
+ DateFormat dateFormat = new SimpleDateFormat("yyyyMMddHHmmss", Locale.ROOT);
String capDateString = dateFormat.format(new Date());
String recId = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Record-ID");
writeWARCMDRecord(recOut,md,targetURI,capDateString,recId);
diff --git a/src/main/java/org/archive/format/http/DumpingHTTPParseObserver.java b/src/main/java/org/archive/format/http/DumpingHTTPParseObserver.java
index ed5dfcb2..11cd9276 100755
--- a/src/main/java/org/archive/format/http/DumpingHTTPParseObserver.java
+++ b/src/main/java/org/archive/format/http/DumpingHTTPParseObserver.java
@@ -2,6 +2,7 @@
import java.io.PrintStream;
import java.nio.charset.Charset;
+import java.util.Locale;
public class DumpingHTTPParseObserver implements HttpHeaderObserver {
private static final Charset UTF8 = Charset.forName("UTF-8");
@@ -15,13 +16,13 @@ public DumpingHTTPParseObserver(PrintStream ps) {
public void headerParsed(byte[] name, int ns, int nl, byte[] value, int vs,
int vl) {
- ps.format("headerParsed:(%d:%d)(%s)(%d:%d)(%s)\n",
+ ps.format(Locale.ROOT,"headerParsed:(%d:%d)(%s)(%d:%d)(%s)\n",
ns,nl,new String(name,0,nl,UTF8),
vs,vl,new String(value,0,vl,UTF8));
}
public void headersComplete(int bytesRead) {
- ps.format("headersComplete(%d)\n",bytesRead);
+ ps.format(Locale.ROOT,"headersComplete(%d)\n",bytesRead);
}
public void headersCorrupt() {
ps.println("headersCorrupted\n");
diff --git a/src/main/java/org/archive/io/GenericReplayCharSequence.java b/src/main/java/org/archive/io/GenericReplayCharSequence.java
index c427550b..7aacb25a 100644
--- a/src/main/java/org/archive/io/GenericReplayCharSequence.java
+++ b/src/main/java/org/archive/io/GenericReplayCharSequence.java
@@ -34,6 +34,7 @@
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.text.NumberFormat;
+import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
@@ -168,8 +169,8 @@ private void updateMemoryMappedBuffer() {
long charLength = (long) this.length() - (long) prefixBuffer.limit(); // in characters
long mapSize = Math.min((charLength * bytesPerChar) - mapByteOffset, MAP_MAX_BYTES);
logger.fine("updateMemoryMappedBuffer: mapOffset="
- + NumberFormat.getInstance().format(mapByteOffset)
- + " mapSize=" + NumberFormat.getInstance().format(mapSize));
+ + NumberFormat.getInstance(Locale.ROOT).format(mapByteOffset)
+ + " mapSize=" + NumberFormat.getInstance(Locale.ROOT).format(mapSize));
try {
// TODO: stress-test without these possibly-costly requests!
// System.gc();
@@ -255,9 +256,9 @@ protected void decode(InputStream inStream, int prefixMax,
this.length = Ints.saturatedCast(count);
if(count>Integer.MAX_VALUE) {
logger.warning("input stream is longer than Integer.MAX_VALUE="
- + NumberFormat.getInstance().format(Integer.MAX_VALUE)
+ + NumberFormat.getInstance(Locale.ROOT).format(Integer.MAX_VALUE)
+ " characters -- only first "
- + NumberFormat.getInstance().format(Integer.MAX_VALUE)
+ + NumberFormat.getInstance(Locale.ROOT).format(Integer.MAX_VALUE)
+ " are accessible through this GenericReplayCharSequence");
}
diff --git a/src/main/java/org/archive/io/WriterPoolMember.java b/src/main/java/org/archive/io/WriterPoolMember.java
index a488354a..4679ea78 100644
--- a/src/main/java/org/archive/io/WriterPoolMember.java
+++ b/src/main/java/org/archive/io/WriterPoolMember.java
@@ -26,9 +26,11 @@
import java.io.InputStream;
import java.io.OutputStream;
import java.text.DecimalFormat;
+import java.text.DecimalFormatSymbols;
import java.text.NumberFormat;
import java.util.Iterator;
import java.util.List;
+import java.util.Locale;
import java.util.Properties;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Logger;
@@ -103,12 +105,17 @@ public abstract class WriterPoolMember {
*/
protected static int roundRobinIndex = 0;
+ /**
+ * Symbol set for serial number formatter.
+ */
+ protected static DecimalFormatSymbols serialNoFormatterSymbols = new DecimalFormatSymbols(Locale.ROOT);
+
/**
* NumberFormat instance for formatting serial number.
*
* Pads serial number with zeros.
*/
- protected static NumberFormat serialNoFormatter = new DecimalFormat("00000");
+ protected static NumberFormat serialNoFormatter = new DecimalFormat("00000", serialNoFormatterSymbols);
/**
diff --git a/src/main/java/org/archive/resource/html/HTMLMetaData.java b/src/main/java/org/archive/resource/html/HTMLMetaData.java
index 024d9677..d995cf65 100644
--- a/src/main/java/org/archive/resource/html/HTMLMetaData.java
+++ b/src/main/java/org/archive/resource/html/HTMLMetaData.java
@@ -1,6 +1,7 @@
package org.archive.resource.html;
import java.util.List;
+import java.util.Locale;
import java.util.logging.Logger;
import org.archive.resource.MetaData;
@@ -98,7 +99,7 @@ private void appendObj2(JSONObject o, String arr, String... a) {
} catch(JSONException e) {
try {
- System.err.format("GotErr(%s) JSON(%s)(%s)", e.getMessage(),
+ System.err.format(Locale.ROOT, "GotErr(%s) JSON(%s)(%s)", e.getMessage(),
o.toString(1),a.toString());
} catch (JSONException e1) {
// TODO Auto-generated catch block
From 56941573a8ea7ef729b550581aadc45647f9826f Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Tue, 11 Nov 2025 18:34:34 +0100
Subject: [PATCH 04/16] Initialize InputStreamReaders using UTF-8 charset
---
src/main/java/org/archive/format/cdx/CDXFile.java | 4 +++-
.../record/WARCJSONMetaDataResourceFactory.java | 7 +++----
src/main/java/org/archive/util/ArchiveUtils.java | 14 ++++++++------
src/main/java/org/archive/util/DevUtils.java | 3 ++-
src/main/java/org/archive/util/IAUtils.java | 4 +++-
src/main/java/org/archive/util/ProcessUtils.java | 4 +++-
6 files changed, 22 insertions(+), 14 deletions(-)
diff --git a/src/main/java/org/archive/format/cdx/CDXFile.java b/src/main/java/org/archive/format/cdx/CDXFile.java
index 7dca0464..612f7454 100644
--- a/src/main/java/org/archive/format/cdx/CDXFile.java
+++ b/src/main/java/org/archive/format/cdx/CDXFile.java
@@ -18,6 +18,8 @@
import org.archive.util.iterator.CloseableIterator;
import org.archive.util.zip.OpenJDK7GZIPInputStream;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
public class CDXFile extends SortedTextFile implements CDXInputSource {
public CDXFile(String uri) throws IOException {
@@ -94,7 +96,7 @@ public static BufferedReader createStreamingLineReader(String uri, boolean gzipp
input = new OpenJDK7GZIPInputStream(swis);
}
- BufferedReader reader = new BufferedReader(new InputStreamReader(input));
+ BufferedReader reader = new BufferedReader(new InputStreamReader(input, UTF_8));
return reader;
}
diff --git a/src/main/java/org/archive/resource/warc/record/WARCJSONMetaDataResourceFactory.java b/src/main/java/org/archive/resource/warc/record/WARCJSONMetaDataResourceFactory.java
index 43041efb..8cc8c146 100644
--- a/src/main/java/org/archive/resource/warc/record/WARCJSONMetaDataResourceFactory.java
+++ b/src/main/java/org/archive/resource/warc/record/WARCJSONMetaDataResourceFactory.java
@@ -3,7 +3,6 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
-import java.nio.charset.Charset;
import org.archive.resource.MetaData;
import org.archive.resource.Resource;
@@ -14,9 +13,9 @@
import org.json.JSONException;
import org.json.JSONTokener;
-public class WARCJSONMetaDataResourceFactory implements ResourceFactory, ResourceConstants {
- private static final Charset UTF8 = Charset.forName("UTF-8");
+import static java.nio.charset.StandardCharsets.UTF_8;
+public class WARCJSONMetaDataResourceFactory implements ResourceFactory, ResourceConstants {
public WARCJSONMetaDataResourceFactory() {
}
@@ -27,7 +26,7 @@ public Resource getResource(InputStream is, MetaData parentMetaData,
MetaData md;
try {
- md = new MetaData(new JSONTokener(new InputStreamReader(is, UTF8)));
+ md = new MetaData(new JSONTokener(new InputStreamReader(is, UTF_8)));
} catch (JSONException e) {
throw new ResourceParseException(e);
}
diff --git a/src/main/java/org/archive/util/ArchiveUtils.java b/src/main/java/org/archive/util/ArchiveUtils.java
index 50307b43..cce411df 100644
--- a/src/main/java/org/archive/util/ArchiveUtils.java
+++ b/src/main/java/org/archive/util/ArchiveUtils.java
@@ -49,6 +49,8 @@
import org.archive.format.gzip.GZIPDecoder;
import org.archive.format.gzip.GZIPFormatException;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
/**
* Miscellaneous useful methods.
*
@@ -851,7 +853,7 @@ private static String loadVersion() {
BufferedReader br = null;
String version;
try {
- br = new BufferedReader(new InputStreamReader(input));
+ br = new BufferedReader(new InputStreamReader(input, UTF_8));
version = br.readLine();
br.readLine();
} catch (IOException e) {
@@ -873,7 +875,7 @@ private static String loadVersion() {
br = null;
String timestamp;
try {
- br = new BufferedReader(new InputStreamReader(input));
+ br = new BufferedReader(new InputStreamReader(input, UTF_8));
timestamp = br.readLine();
} catch (IOException e) {
return version;
@@ -894,7 +896,7 @@ private static String loadVersion() {
TLDS = new HashSet();
InputStream is = ArchiveUtils.class.getResourceAsStream("tlds-alpha-by-domain.txt");
try {
- BufferedReader reader = new BufferedReader(new InputStreamReader(is));
+ BufferedReader reader = new BufferedReader(new InputStreamReader(is, UTF_8));
String line;
while((line = reader.readLine())!=null) {
if (line.startsWith("#")) {
@@ -986,7 +988,7 @@ public static BufferedReader getBufferedReader(File source) throws IOException {
if(isGzipped) {
is = new GZIPInputStream(is);
}
- return new BufferedReader(new InputStreamReader(is));
+ return new BufferedReader(new InputStreamReader(is, UTF_8));
}
/**
@@ -1002,8 +1004,8 @@ public static BufferedReader getBufferedReader(URL source) throws IOException {
|| conn.getContentEncoding() != null && conn.getContentEncoding().equalsIgnoreCase("gzip");
InputStream uis = conn.getInputStream();
return new BufferedReader(isGzipped?
- new InputStreamReader(new GZIPInputStream(uis)):
- new InputStreamReader(uis));
+ new InputStreamReader(new GZIPInputStream(uis), UTF_8):
+ new InputStreamReader(uis, UTF_8));
}
/**
diff --git a/src/main/java/org/archive/util/DevUtils.java b/src/main/java/org/archive/util/DevUtils.java
index f2a1d044..7ee4b13a 100644
--- a/src/main/java/org/archive/util/DevUtils.java
+++ b/src/main/java/org/archive/util/DevUtils.java
@@ -25,6 +25,7 @@
import java.io.StringWriter;
import java.util.logging.Logger;
+import static java.nio.charset.StandardCharsets.UTF_8;
/**
* Write a message and stack trace to the 'org.archive.util.DevUtils' logger.
@@ -92,7 +93,7 @@ public static void sigquitSelf() {
Process p = Runtime.getRuntime().exec(
new String[] {"perl", "-e", "print getppid(). \"\n\";"});
BufferedReader br =
- new BufferedReader(new InputStreamReader(p.getInputStream()));
+ new BufferedReader(new InputStreamReader(p.getInputStream(), UTF_8));
String ppid = br.readLine();
Runtime.getRuntime().exec(
new String[] {"sh", "-c", "kill -3 "+ppid}).waitFor();
diff --git a/src/main/java/org/archive/util/IAUtils.java b/src/main/java/org/archive/util/IAUtils.java
index 4597d723..b0c448f0 100644
--- a/src/main/java/org/archive/util/IAUtils.java
+++ b/src/main/java/org/archive/util/IAUtils.java
@@ -29,6 +29,8 @@
import java.nio.charset.Charset;
import java.util.Properties;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
/**
* Miscellaneous useful methods.
*
@@ -53,7 +55,7 @@ public static String loadCommonsVersion() {
BufferedReader br = null;
String version;
try {
- br = new BufferedReader(new InputStreamReader(input));
+ br = new BufferedReader(new InputStreamReader(input, UTF_8));
version = br.readLine();
br.readLine();
} catch (IOException e) {
diff --git a/src/main/java/org/archive/util/ProcessUtils.java b/src/main/java/org/archive/util/ProcessUtils.java
index af792981..0a3eeb67 100644
--- a/src/main/java/org/archive/util/ProcessUtils.java
+++ b/src/main/java/org/archive/util/ProcessUtils.java
@@ -26,6 +26,8 @@
import java.util.logging.Level;
import java.util.logging.Logger;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
/**
* Class to run an external process.
* @author stack
@@ -55,7 +57,7 @@ protected StreamGobbler(InputStream is, String name) {
public void run() {
try {
BufferedReader br =
- new BufferedReader(new InputStreamReader(this.is));
+ new BufferedReader(new InputStreamReader(this.is, UTF_8));
for (String line = null; (line = br.readLine()) != null;) {
this.sink.append(line);
}
From c013b258be71c1c00b2a016641d60a2fc65195ff Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Tue, 11 Nov 2025 19:44:46 +0100
Subject: [PATCH 05/16] Add charset to invocations of String constructor
---
.../java/org/archive/format/http/HttpHeaderParser.java | 4 +++-
.../archive/format/http/HttpResponseMessageParser.java | 5 +++--
src/main/java/org/archive/io/CompositeFileReader.java | 4 +++-
.../java/org/archive/io/HeaderedArchiveRecord.java | 5 +++--
src/main/java/org/archive/io/arc/ARCRecord.java | 4 ++--
src/main/java/org/archive/url/LaxURI.java | 9 ++++++---
src/main/java/org/archive/url/URI.java | 10 +++++++---
src/main/java/org/archive/util/LaxHttpParser.java | 3 ++-
8 files changed, 29 insertions(+), 15 deletions(-)
diff --git a/src/main/java/org/archive/format/http/HttpHeaderParser.java b/src/main/java/org/archive/format/http/HttpHeaderParser.java
index bee3c28b..ddbb6e47 100755
--- a/src/main/java/org/archive/format/http/HttpHeaderParser.java
+++ b/src/main/java/org/archive/format/http/HttpHeaderParser.java
@@ -2,6 +2,7 @@
import java.io.IOException;
import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
public class HttpHeaderParser implements HttpConstants {
private static final int DEFAULT_MAX_NAME_LENGTH = 1024 * 100;
@@ -288,7 +289,8 @@ public ParseState handleByte(byte b, HttpHeaderParser parser)
return parser.postColonState;
}
if(parser.isStrict) {
- throw new HttpParseException("Illegal char after name("+new String(name,0,nameLength)+")");
+ throw new HttpParseException("Illegal char after name("
+ + new String(name, 0, nameLength, StandardCharsets.ISO_8859_1) + ")");
}
parser.headersCorrupted();
return parser.laxLineEatParseState;
diff --git a/src/main/java/org/archive/format/http/HttpResponseMessageParser.java b/src/main/java/org/archive/format/http/HttpResponseMessageParser.java
index 3aee7c48..4ddef2ad 100755
--- a/src/main/java/org/archive/format/http/HttpResponseMessageParser.java
+++ b/src/main/java/org/archive/format/http/HttpResponseMessageParser.java
@@ -2,6 +2,7 @@
import java.io.IOException;
import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
public class HttpResponseMessageParser extends HttpMessageParser {
public int maxBytes = 1024 * 128;
@@ -97,7 +98,7 @@ public int parseStrict(byte buf[], int len, HttpResponseMessageObserver obs)
version = parseVersionStrict(buf, vs, vl);
status = parseStatusStrict(buf,ss,sl);
- reason = new String(buf,idx+1,(len - idx)-1);
+ reason = new String(buf,idx+1,(len - idx)-1,StandardCharsets.ISO_8859_1);
obs.messageParsed(version, status, reason, len);
@@ -155,7 +156,7 @@ private int parseLax(byte buf[], int len, HttpResponseMessageObserver obs)
idx++;
int reasonLen = bufferEnd - idx;
if(reasonLen > 0) {
- reason = new String(buf,idx,reasonLen);
+ reason = new String(buf,idx,reasonLen,StandardCharsets.ISO_8859_1);
}
} else {
// missed some:
diff --git a/src/main/java/org/archive/io/CompositeFileReader.java b/src/main/java/org/archive/io/CompositeFileReader.java
index 14b56219..6e331565 100644
--- a/src/main/java/org/archive/io/CompositeFileReader.java
+++ b/src/main/java/org/archive/io/CompositeFileReader.java
@@ -23,6 +23,8 @@
import java.io.InputStreamReader;
import java.util.List;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
/**
* @author gojomo
@@ -34,7 +36,7 @@ public class CompositeFileReader extends InputStreamReader {
* @throws IOException
*/
public CompositeFileReader(List filenames) throws IOException {
- super(new CompositeFileInputStream(filenames));
+ super(new CompositeFileInputStream(filenames), UTF_8);
}
}
diff --git a/src/main/java/org/archive/io/HeaderedArchiveRecord.java b/src/main/java/org/archive/io/HeaderedArchiveRecord.java
index 70c4fb04..a149acac 100644
--- a/src/main/java/org/archive/io/HeaderedArchiveRecord.java
+++ b/src/main/java/org/archive/io/HeaderedArchiveRecord.java
@@ -25,6 +25,7 @@
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintStream;
+import java.nio.charset.StandardCharsets;
import java.util.Locale;
import org.archive.format.http.HttpHeader;
@@ -145,7 +146,7 @@ private InputStream readContentHeaders() throws IOException {
int eolCharCount = getEolCharsCount(statusBytes);
if (eolCharCount <= 0) {
throw new IOException("Failed to read raw lie where one " +
- " was expected: " + new String(statusBytes));
+ " was expected: " + new String(statusBytes, ARCConstants.DEFAULT_ENCODING));
}
String statusLine = new String(statusBytes, 0,
statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING);
@@ -186,7 +187,7 @@ private InputStream readContentHeaders() throws IOException {
eolCharCount = getEolCharsCount(lineBytes);
if (eolCharCount <= 0) {
throw new IOException("Failed reading headers: " +
- ((lineBytes != null)? new String(lineBytes): null));
+ ((lineBytes != null)? new String(lineBytes, StandardCharsets.ISO_8859_1): null));
}
// Save the bytes read.
baos.write(lineBytes);
diff --git a/src/main/java/org/archive/io/arc/ARCRecord.java b/src/main/java/org/archive/io/arc/ARCRecord.java
index 14e80728..c14426a5 100644
--- a/src/main/java/org/archive/io/arc/ARCRecord.java
+++ b/src/main/java/org/archive/io/arc/ARCRecord.java
@@ -589,7 +589,7 @@ private InputStream readHttpHeader() throws IOException {
if (eolCharCount <= 0) {
throw new RecoverableIOException(
"Failed to read http status where one was expected: "
- + ((statusBytes == null) ? "" : new String(statusBytes)));
+ + ((statusBytes == null) ? "" : new String(statusBytes, DEFAULT_ENCODING)));
}
statusLine = new String(statusBytes, 0,
@@ -659,7 +659,7 @@ private InputStream readHttpHeader() throws IOException {
break;
} else {
throw new IOException("Failed reading http headers: " +
- ((lineBytes != null)? new String(lineBytes): null));
+ ((lineBytes != null)? new String(lineBytes, DEFAULT_ENCODING): null));
}
} else {
httpHeaderBytesRead += lineBytes.length;
diff --git a/src/main/java/org/archive/url/LaxURI.java b/src/main/java/org/archive/url/LaxURI.java
index 4210c303..3b27e045 100644
--- a/src/main/java/org/archive/url/LaxURI.java
+++ b/src/main/java/org/archive/url/LaxURI.java
@@ -19,6 +19,8 @@
package org.archive.url;
import java.io.UnsupportedEncodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.BitSet;
@@ -122,9 +124,10 @@ protected static String decode(String component, String charset)
byte[] rawdata = null;
rawdata = LaxURLCodec.decodeUrlLoose(component.getBytes(StandardCharsets.US_ASCII));
try {
- return new String(rawdata, charset);
- } catch (UnsupportedEncodingException e) {
- return new String(rawdata);
+ Charset cs = Charset.forName(charset);
+ return new String(rawdata, cs);
+ } catch (IllegalCharsetNameException e) {
+ return new String(rawdata, StandardCharsets.US_ASCII);
}
}
diff --git a/src/main/java/org/archive/url/URI.java b/src/main/java/org/archive/url/URI.java
index 38219556..ff53775e 100644
--- a/src/main/java/org/archive/url/URI.java
+++ b/src/main/java/org/archive/url/URI.java
@@ -34,6 +34,8 @@
import org.apache.commons.codec.net.URLCodec;
import java.io.*;
+import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.BitSet;
@@ -1780,11 +1782,13 @@ protected static String decode(String component, String charset)
throw new URIException(e.getMessage());
}
try {
- return new String(rawdata, charset);
- } catch (UnsupportedEncodingException e) {
- return new String(rawdata);
+ Charset cs = Charset.forName(charset);
+ return new String(rawdata, cs);
+ } catch (IllegalCharsetNameException e) {
+ return new String(rawdata, StandardCharsets.US_ASCII);
}
}
+
/**
* Pre-validate the unescaped URI string within a specific component.
*
diff --git a/src/main/java/org/archive/util/LaxHttpParser.java b/src/main/java/org/archive/util/LaxHttpParser.java
index 0545fd95..05d2469c 100644
--- a/src/main/java/org/archive/util/LaxHttpParser.java
+++ b/src/main/java/org/archive/util/LaxHttpParser.java
@@ -36,6 +36,7 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
+import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.logging.Logger;
@@ -127,7 +128,7 @@ public static String readLine(InputStream inputStream, String charset) throws IO
try {
return new String(rawdata, 0, len - offset, charset);
} catch (UnsupportedEncodingException e) {
- return new String(rawdata, 0, len - offset);
+ return new String(rawdata, 0, len - offset, StandardCharsets.ISO_8859_1);
}
}
From 88ac2989028ed35d52e0e46076d1322040362de3 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Tue, 11 Nov 2025 22:38:54 +0100
Subject: [PATCH 06/16] Initialize PrintStreams using UTF-8 charset, call
String.getBytes() with charset.
---
.../archive/extract/DumpingExtractorOutput.java | 8 +++++++-
.../archive/extract/JSONViewExtractorOutput.java | 8 +++++++-
.../format/gzip/zipnum/ZipNumCluster.java | 3 +++
src/main/java/org/archive/io/arc/ARC2WCDX.java | 4 +++-
.../java/org/archive/io/warc/WARCWriter.java | 6 ++++--
src/main/java/org/archive/url/URI.java | 4 +++-
src/main/java/org/archive/util/SURT.java | 4 +++-
.../java/org/archive/util/SurtPrefixSet.java | 6 ++++--
src/main/java/org/archive/util/TextUtils.java | 16 ++++++----------
.../archive/util/binsearch/SortedTextFile.java | 6 ++++--
10 files changed, 44 insertions(+), 21 deletions(-)
diff --git a/src/main/java/org/archive/extract/DumpingExtractorOutput.java b/src/main/java/org/archive/extract/DumpingExtractorOutput.java
index 69591931..1ccbf771 100644
--- a/src/main/java/org/archive/extract/DumpingExtractorOutput.java
+++ b/src/main/java/org/archive/extract/DumpingExtractorOutput.java
@@ -3,6 +3,7 @@
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
+import java.io.UnsupportedEncodingException;
import java.util.logging.Logger;
import org.archive.resource.Resource;
@@ -12,13 +13,18 @@
import com.google.common.io.ByteStreams;
import com.google.common.io.CountingOutputStream;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
public class DumpingExtractorOutput implements ExtractorOutput {
private static final Logger LOG =
Logger.getLogger(DumpingExtractorOutput.class.getName());
private PrintStream out;
public DumpingExtractorOutput(OutputStream out) {
- this.out = new PrintStream(out);
+ try {
+ this.out = new PrintStream(out, false, UTF_8.name());
+ } catch (UnsupportedEncodingException e) {
+ }
}
public void output(Resource resource) throws IOException {
diff --git a/src/main/java/org/archive/extract/JSONViewExtractorOutput.java b/src/main/java/org/archive/extract/JSONViewExtractorOutput.java
index fb6dc847..6cb7c445 100644
--- a/src/main/java/org/archive/extract/JSONViewExtractorOutput.java
+++ b/src/main/java/org/archive/extract/JSONViewExtractorOutput.java
@@ -3,6 +3,7 @@
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
+import java.io.UnsupportedEncodingException;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
@@ -10,12 +11,17 @@
import org.archive.resource.Resource;
import org.archive.util.StreamCopy;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
public class JSONViewExtractorOutput implements ExtractorOutput {
private PrintStream out;
private JSONView view;
public JSONViewExtractorOutput(OutputStream out, String filterPath) {
view = new JSONView(filterPath.split(","));
- this.out = new PrintStream(out);
+ try {
+ this.out = new PrintStream(out, false, UTF_8.name());
+ } catch (UnsupportedEncodingException e) {
+ }
}
public void output(Resource resource) throws IOException {
StreamCopy.readToEOF(resource.getInputStream());
diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java
index a3d34a4b..edf5857c 100644
--- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java
+++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java
@@ -13,6 +13,7 @@
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
+import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
@@ -35,6 +36,8 @@
import org.archive.util.binsearch.impl.HTTPSeekableLineReader;
import org.archive.util.iterator.CloseableIterator;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
public class ZipNumCluster extends ZipNumIndex {
final static Logger LOGGER = Logger.getLogger(ZipNumCluster.class.getName());
diff --git a/src/main/java/org/archive/io/arc/ARC2WCDX.java b/src/main/java/org/archive/io/arc/ARC2WCDX.java
index f0515694..aec571e9 100644
--- a/src/main/java/org/archive/io/arc/ARC2WCDX.java
+++ b/src/main/java/org/archive/io/arc/ARC2WCDX.java
@@ -32,6 +32,8 @@
import org.archive.util.ArchiveUtils;
import org.archive.util.SURT;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
/**
* Create a 'Wide' CDX from an ARC. Takes one argument, the path to the ARC.
* Writes .wcdx.gz in same directory.
@@ -61,7 +63,7 @@ public static Object[] createWcdx(ARCReader reader) {
PrintStream writer = null;
long count = 0;
try {
- writer = new PrintStream(new GZIPOutputStream(new FileOutputStream(wcdxFile)));
+ writer = new PrintStream(new GZIPOutputStream(new FileOutputStream(wcdxFile)), false, UTF_8.name());
// write header: legend + timestamp
StringBuilder legend = new StringBuilder();
diff --git a/src/main/java/org/archive/io/warc/WARCWriter.java b/src/main/java/org/archive/io/warc/WARCWriter.java
index 5c6a6854..8b571fad 100644
--- a/src/main/java/org/archive/io/warc/WARCWriter.java
+++ b/src/main/java/org/archive/io/warc/WARCWriter.java
@@ -45,6 +45,8 @@
import static org.archive.format.warc.WARCConstants.*;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
/**
* WARC implementation.
@@ -357,12 +359,12 @@ public URI writeWarcinfoRecord(String filename, final String description)
byte [] warcinfoBody = null;
if (settings.getMetadata() == null) {
// TODO: What to write into a warcinfo? What to associate?
- warcinfoBody = "TODO: Unimplemented".getBytes();
+ warcinfoBody = "TODO: Unimplemented".getBytes(UTF_8);
} else {
ByteArrayOutputStream baos = new ByteArrayOutputStream();
for (final Iterator i = settings.getMetadata().iterator();
i.hasNext();) {
- baos.write(i.next().toString().getBytes(UTF8Bytes.UTF8));
+ baos.write(i.next().toString().getBytes(UTF_8));
}
warcinfoBody = baos.toByteArray();
}
diff --git a/src/main/java/org/archive/url/URI.java b/src/main/java/org/archive/url/URI.java
index ff53775e..b19151cd 100644
--- a/src/main/java/org/archive/url/URI.java
+++ b/src/main/java/org/archive/url/URI.java
@@ -42,6 +42,8 @@
import java.util.Hashtable;
import java.util.Locale;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
/**
* The interface for the URI(Uniform Resource Identifiers) version of RFC 2396.
* This class has the purpose of supportting of parsing a URI reference to
@@ -1696,7 +1698,7 @@ private static byte[] getBytes(String original, String charset) {
try {
return original.getBytes(charset);
} catch (UnsupportedEncodingException e) {
- return original.getBytes();
+ return original.getBytes(UTF_8);
}
}
diff --git a/src/main/java/org/archive/util/SURT.java b/src/main/java/org/archive/util/SURT.java
index 059b2ec6..c52582e1 100644
--- a/src/main/java/org/archive/util/SURT.java
+++ b/src/main/java/org/archive/util/SURT.java
@@ -32,6 +32,8 @@
import org.archive.url.URIException;
import org.archive.url.UsableURIFactory;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
/**
* Sort-friendly URI Reordering Transform.
*
@@ -238,7 +240,7 @@ public static void main(String[] args) throws IOException {
InputStream in = args.length > 0 ? new BufferedInputStream(
new FileInputStream(args[0])) : System.in;
PrintStream out = args.length > 1 ? new PrintStream(
- new BufferedOutputStream(new FileOutputStream(args[1])))
+ new BufferedOutputStream(new FileOutputStream(args[1])), false, UTF_8.name())
: System.out;
BufferedReader br =
new BufferedReader(new InputStreamReader(in));
diff --git a/src/main/java/org/archive/util/SurtPrefixSet.java b/src/main/java/org/archive/util/SurtPrefixSet.java
index 32a34d53..b2f0ea4f 100644
--- a/src/main/java/org/archive/util/SurtPrefixSet.java
+++ b/src/main/java/org/archive/util/SurtPrefixSet.java
@@ -37,6 +37,8 @@
import org.archive.util.iterator.LineReadingIterator;
import org.archive.util.iterator.RegexLineIterator;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
/**
* Specialized TreeSet for keeping a set of String prefixes.
*
@@ -343,10 +345,10 @@ public static void main(String[] args) throws IOException {
InputStream in = args.length > 0 ? new BufferedInputStream(
new FileInputStream(args[0])) : System.in;
PrintStream out = args.length > 1 ? new PrintStream(
- new BufferedOutputStream(new FileOutputStream(args[1])))
+ new BufferedOutputStream(new FileOutputStream(args[1])), false, UTF_8.name())
: System.out;
BufferedReader br =
- new BufferedReader(new InputStreamReader(in));
+ new BufferedReader(new InputStreamReader(in, UTF_8.name()));
String line;
while((line = br.readLine())!=null) {
if(line.indexOf("#")>0) line=line.substring(0,line.indexOf("#"));
diff --git a/src/main/java/org/archive/util/TextUtils.java b/src/main/java/org/archive/util/TextUtils.java
index 98b471f8..df3de58b 100644
--- a/src/main/java/org/archive/util/TextUtils.java
+++ b/src/main/java/org/archive/util/TextUtils.java
@@ -40,6 +40,8 @@
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
public class TextUtils {
private static final String FIRSTWORD = "^([^\\s]*).*$";
@@ -279,14 +281,11 @@ public static String exceptionToString(String message, Throwable e) {
* @param s String to escape
* @return URL-escaped string
*/
- @SuppressWarnings("deprecation")
public static String urlEscape(String s) {
try {
- return URLEncoder.encode(s,"UTF8");
+ return URLEncoder.encode(s, UTF_8.name());
} catch (UnsupportedEncodingException e) {
- // should be impossible; all JVMs must support UTF8
- // but have a fallback just in case
- return URLEncoder.encode(s);
+ return s;
}
}
@@ -296,14 +295,11 @@ public static String urlEscape(String s) {
* @param s String do unescape
* @return URL-unescaped String
*/
- @SuppressWarnings("deprecation")
public static String urlUnescape(String s) {
try {
- return URLDecoder.decode(s, "UTF8");
+ return URLDecoder.decode(s, UTF_8.name());
} catch (UnsupportedEncodingException e) {
- // should be impossible; all JVMs must support UTF8
- // but have a fallback just in case
- return URLDecoder.decode(s);
+ return s;
}
}
}
\ No newline at end of file
diff --git a/src/main/java/org/archive/util/binsearch/SortedTextFile.java b/src/main/java/org/archive/util/binsearch/SortedTextFile.java
index a4326dc0..bb4a1f66 100644
--- a/src/main/java/org/archive/util/binsearch/SortedTextFile.java
+++ b/src/main/java/org/archive/util/binsearch/SortedTextFile.java
@@ -9,6 +9,8 @@
import org.archive.util.GeneralURIStreamFactory;
import org.archive.util.iterator.CloseableIterator;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
public class SortedTextFile {
public static class NumericComparator implements Comparator
@@ -371,7 +373,7 @@ private long searchOffset(SeekableLineReader slr,
String prev = null;
while(true) {
if (line != null) {
- offset += line.getBytes().length + 1;
+ offset += line.getBytes(UTF_8).length + 1;
}
line = slr.readLine();
if(line == null) break;
@@ -380,7 +382,7 @@ private long searchOffset(SeekableLineReader slr,
}
if (lessThan && prev != null) {
- offset -= prev.getBytes().length + 1;
+ offset -= prev.getBytes(UTF_8).length + 1;
}
return offset;
From c1e4cd323b73715e04f17502f3abcb12a09da89c Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Tue, 11 Nov 2025 22:54:57 +0100
Subject: [PATCH 07/16] Replace Charset.forName("utf-8") by
StandardCharsets.UTF_8
---
src/main/java/org/archive/extract/ResourceExtractor.java | 5 ++---
src/main/java/org/archive/extract/WATExtractorOutput.java | 7 +++----
src/main/java/org/archive/format/arc/ARCConstants.java | 3 ++-
.../java/org/archive/format/gzip/zipnum/ZipNumWriter.java | 6 +++---
.../org/archive/format/http/DumpingHTTPParseObserver.java | 3 +--
src/main/java/org/archive/format/http/HttpConstants.java | 3 ++-
src/main/java/org/archive/url/BasicURLCanonicalizer.java | 6 ++----
src/main/java/org/archive/url/SURT.java | 4 ++--
src/main/java/org/archive/util/IAUtils.java | 2 +-
.../archive/util/binsearch/AbstractSeekableLineReader.java | 3 ++-
10 files changed, 20 insertions(+), 22 deletions(-)
diff --git a/src/main/java/org/archive/extract/ResourceExtractor.java b/src/main/java/org/archive/extract/ResourceExtractor.java
index dcbfc122..d9b9f396 100644
--- a/src/main/java/org/archive/extract/ResourceExtractor.java
+++ b/src/main/java/org/archive/extract/ResourceExtractor.java
@@ -7,7 +7,7 @@
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.URISyntaxException;
-import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
@@ -27,7 +27,6 @@ public class ResourceExtractor implements ResourceConstants, Tool {
private final static Logger LOG =
Logger.getLogger(ResourceExtractor.class.getName());
- Charset UTF8 = Charset.forName("utf-8");
public final static String TOOL_NAME = "extractor";
public static final String TOOL_DESCRIPTION =
"A tool for extracting metadata from WARC, ARC, and WAT files";
@@ -66,7 +65,7 @@ public static void main(String[] args) throws Exception {
private PrintWriter makePrintWriter(OutputStream os)
{
- return new PrintWriter(new OutputStreamWriter(os, Charset.forName("UTF-8")));
+ return new PrintWriter(new OutputStreamWriter(os, StandardCharsets.UTF_8));
}
public int run(String[] args)
diff --git a/src/main/java/org/archive/extract/WATExtractorOutput.java b/src/main/java/org/archive/extract/WATExtractorOutput.java
index 79cb0870..bb179fd1 100644
--- a/src/main/java/org/archive/extract/WATExtractorOutput.java
+++ b/src/main/java/org/archive/extract/WATExtractorOutput.java
@@ -1,12 +1,10 @@
package org.archive.extract;
-import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
-import java.nio.charset.Charset;
import java.text.ParseException;
import java.net.UnknownHostException;
import java.util.Date;
@@ -31,13 +29,14 @@
import java.util.logging.Logger;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
public class WATExtractorOutput implements ExtractorOutput {
WARCRecordWriter recW;
private boolean wroteFirst;
private GZIPMemberWriter gzW;
private static int DEFAULT_BUFFER_RAM = 1024 * 1024;
private int bufferRAM = DEFAULT_BUFFER_RAM;
- private final static Charset UTF8 = Charset.forName("UTF-8");
private String outputFile;
private static final Logger LOG = Logger.getLogger(WATExtractorOutput.class.getName());
@@ -169,7 +168,7 @@ private void writeWARCMDRecord(OutputStream recOut, MetaData md,
ByteArrayOutputStream bos = new ByteArrayOutputStream();
- OutputStreamWriter osw = new OutputStreamWriter(bos, UTF8);
+ OutputStreamWriter osw = new OutputStreamWriter(bos, UTF_8);
try {
md.write(osw);
} catch (JSONException e1) {
diff --git a/src/main/java/org/archive/format/arc/ARCConstants.java b/src/main/java/org/archive/format/arc/ARCConstants.java
index 5987b49f..39dbf7ed 100755
--- a/src/main/java/org/archive/format/arc/ARCConstants.java
+++ b/src/main/java/org/archive/format/arc/ARCConstants.java
@@ -1,6 +1,7 @@
package org.archive.format.arc;
import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.List;
import java.util.zip.Deflater;
@@ -16,7 +17,7 @@
*/
public interface ARCConstants extends ArchiveFileConstants {
public final static int MAX_META_LENGTH = 1024 * 32;
- public final static Charset ARC_META_CHARSET = Charset.forName("utf-8");
+ public final static Charset ARC_META_CHARSET = StandardCharsets.UTF_8;
public final static int NEW_LINE_ORD = 10;
public static final int CARRIAGE_RETURN_ORD = 13;
public final static String DELIMITER = " ";
diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumWriter.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumWriter.java
index a104244a..c0e4e01d 100644
--- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumWriter.java
+++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumWriter.java
@@ -3,18 +3,18 @@
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
-import java.nio.charset.Charset;
import org.archive.format.gzip.GZIPMemberWriter;
import org.archive.format.gzip.GZIPMemberWriterCommittedOutputStream;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
public class ZipNumWriter extends GZIPMemberWriterCommittedOutputStream {
int limit;
int count;
OutputStream manifestOut;
ByteArrayOutputStream manifestBuffer;
char delimiter = '\t';
- private static final Charset UTF8 = Charset.forName("utf-8");
public ZipNumWriter(OutputStream main, OutputStream manifest, int limit) {
super(new GZIPMemberWriter(main));
manifestOut = manifest;
@@ -51,7 +51,7 @@ private void finishCurrent() throws IOException {
sb.append(delimiter);
sb.append(len);
sb.append(delimiter);
- manifestOut.write(sb.toString().getBytes(UTF8));
+ manifestOut.write(sb.toString().getBytes(UTF_8));
manifestBuffer.writeTo(manifestOut);
manifestOut.flush();
count = 0;
diff --git a/src/main/java/org/archive/format/http/DumpingHTTPParseObserver.java b/src/main/java/org/archive/format/http/DumpingHTTPParseObserver.java
index 11cd9276..f1ac16c6 100755
--- a/src/main/java/org/archive/format/http/DumpingHTTPParseObserver.java
+++ b/src/main/java/org/archive/format/http/DumpingHTTPParseObserver.java
@@ -1,11 +1,10 @@
package org.archive.format.http;
import java.io.PrintStream;
-import java.nio.charset.Charset;
import java.util.Locale;
+
public class DumpingHTTPParseObserver implements HttpHeaderObserver {
- private static final Charset UTF8 = Charset.forName("UTF-8");
private PrintStream ps = null;
public DumpingHTTPParseObserver() {
ps = System.out;
diff --git a/src/main/java/org/archive/format/http/HttpConstants.java b/src/main/java/org/archive/format/http/HttpConstants.java
index fa0a7e10..8ae4d4db 100755
--- a/src/main/java/org/archive/format/http/HttpConstants.java
+++ b/src/main/java/org/archive/format/http/HttpConstants.java
@@ -1,9 +1,10 @@
package org.archive.format.http;
import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
public interface HttpConstants {
- public static final Charset UTF8 = Charset.forName("UTF-8");
+ public static final Charset UTF8 = StandardCharsets.UTF_8;
public static final byte CR = 13;
public static final byte LF = 10;
public static final byte SP = 32;
diff --git a/src/main/java/org/archive/url/BasicURLCanonicalizer.java b/src/main/java/org/archive/url/BasicURLCanonicalizer.java
index 632d1ea7..dd0d9ac7 100644
--- a/src/main/java/org/archive/url/BasicURLCanonicalizer.java
+++ b/src/main/java/org/archive/url/BasicURLCanonicalizer.java
@@ -6,6 +6,7 @@
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
+import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Locale;
import java.util.regex.Matcher;
@@ -204,12 +205,9 @@ public String minimalEscape(String input) {
return escapeOnce(unescapeRepeatedly(input));
}
- protected static Charset _UTF8 = null;
+ protected static Charset _UTF8 = StandardCharsets.UTF_8;
protected static Charset UTF8() {
- if (_UTF8 == null) {
- _UTF8 = Charset.forName("UTF-8");
- }
return _UTF8;
}
diff --git a/src/main/java/org/archive/url/SURT.java b/src/main/java/org/archive/url/SURT.java
index 3e0bcd55..9598f458 100644
--- a/src/main/java/org/archive/url/SURT.java
+++ b/src/main/java/org/archive/url/SURT.java
@@ -2,7 +2,7 @@
import java.io.BufferedReader;
import java.io.InputStreamReader;
-import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.logging.Logger;
@@ -33,7 +33,7 @@ public static String toSURT(String input) {
}
public static void main(String[] args) {
String line;
- InputStreamReader isr = new InputStreamReader(System.in,Charset.forName("UTF-8"));
+ InputStreamReader isr = new InputStreamReader(System.in, StandardCharsets.UTF_8);
BufferedReader br = new BufferedReader(isr);
Iterator i = AbstractPeekableIterator.wrapReader(br);
while(i.hasNext()) {
diff --git a/src/main/java/org/archive/util/IAUtils.java b/src/main/java/org/archive/util/IAUtils.java
index b0c448f0..1d15256e 100644
--- a/src/main/java/org/archive/util/IAUtils.java
+++ b/src/main/java/org/archive/util/IAUtils.java
@@ -37,7 +37,7 @@
* @author gojomo & others
*/
public class IAUtils {
- public final static Charset UTF8 = Charset.forName("utf-8");
+ public final static Charset UTF8 = UTF_8;
final public static String COMMONS_VERSION = loadCommonsVersion();
final public static String PUBLISHER = loadCommons("publisher");
diff --git a/src/main/java/org/archive/util/binsearch/AbstractSeekableLineReader.java b/src/main/java/org/archive/util/binsearch/AbstractSeekableLineReader.java
index de57278e..17d411fa 100644
--- a/src/main/java/org/archive/util/binsearch/AbstractSeekableLineReader.java
+++ b/src/main/java/org/archive/util/binsearch/AbstractSeekableLineReader.java
@@ -7,13 +7,14 @@
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
import org.archive.util.zip.GZIPMembersInputStream;
import com.google.common.io.ByteStreams;
public abstract class AbstractSeekableLineReader implements SeekableLineReader {
- public final static Charset UTF8 = Charset.forName("UTF-8");
+ public final static Charset UTF8 = StandardCharsets.UTF_8;
protected int blockSize = 128 * 1024;
From ed0070b7f6486fe48df0c00b03a9385fbd608fe5 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Tue, 11 Nov 2025 23:29:57 +0100
Subject: [PATCH 08/16] Replace FileReader and FileWriter with classes
that allow configuring the charset. Use the default charset in main methods
when reading from stdin.
---
.../org/archive/format/gzip/zipnum/ZipNumCluster.java | 6 +++---
src/main/java/org/archive/io/ArchiveReader.java | 7 +++++--
src/main/java/org/archive/net/PublicSuffixes.java | 9 ++++++---
src/main/java/org/archive/util/Grep.java | 11 +++++++----
src/main/java/org/archive/util/SURT.java | 3 ++-
.../archive/util/binsearch/SeekCDXBenchmarker.java | 3 ++-
6 files changed, 25 insertions(+), 14 deletions(-)
diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java
index edf5857c..0a3fa1bf 100644
--- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java
+++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java
@@ -11,9 +11,9 @@
*/
import java.io.BufferedReader;
-import java.io.FileReader;
+import java.io.FileInputStream;
import java.io.IOException;
-import java.nio.charset.StandardCharsets;
+import java.io.InputStreamReader;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
@@ -370,7 +370,7 @@ protected void loadLastBlockSizes(String filename)
totalAdjustment = 0;
try {
- reader = new BufferedReader(new FileReader(filename));
+ reader = new BufferedReader(new InputStreamReader(new FileInputStream(filename), UTF_8));
while ((line = reader.readLine()) != null) {
String[] splits = line.split("\t");
diff --git a/src/main/java/org/archive/io/ArchiveReader.java b/src/main/java/org/archive/io/ArchiveReader.java
index 53b8167b..070455a5 100644
--- a/src/main/java/org/archive/io/ArchiveReader.java
+++ b/src/main/java/org/archive/io/ArchiveReader.java
@@ -26,9 +26,10 @@
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
-import java.io.FileWriter;
+import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
@@ -45,6 +46,8 @@
import static org.archive.format.ArchiveFileConstants.*;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
/**
* Reader for an Archive file of Archive {@link ArchiveRecord}s.
@@ -660,7 +663,7 @@ protected void cdxOutput(boolean toFile)
DOT_COMPRESSED_FILE_EXTENSION);
cdxFilename = stripExtension(cdxFilename, getDotFileExtension());
cdxFilename += ('.' + CDX);
- cdxWriter = new BufferedWriter(new FileWriter(cdxFilename));
+ cdxWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(cdxFilename), UTF_8));
}
String header = "CDX b e a m s c " + ((isCompressed()) ? "V" : "v")
diff --git a/src/main/java/org/archive/net/PublicSuffixes.java b/src/main/java/org/archive/net/PublicSuffixes.java
index a2a2bfb2..5b3219d5 100644
--- a/src/main/java/org/archive/net/PublicSuffixes.java
+++ b/src/main/java/org/archive/net/PublicSuffixes.java
@@ -22,13 +22,14 @@
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
-import java.io.FileWriter;
+import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
+import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
@@ -38,6 +39,8 @@
import org.apache.commons.io.IOUtils;
import org.archive.util.TextUtils;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
/**
* Utility class for making use of the information about 'public suffixes' at
* http://publicsuffix.org.
@@ -198,11 +201,11 @@ public static void main(String args[]) throws IOException {
BufferedWriter writer;
if (args.length >= 2) {
// write to specified file
- writer = new BufferedWriter(new FileWriter(args[1]));
+ writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(args[1]), UTF_8));
needsClose = true;
} else {
// write to stdout
- writer = new BufferedWriter(new OutputStreamWriter(System.out));
+ writer = new BufferedWriter(new OutputStreamWriter(System.out, Charset.defaultCharset()));
}
writer.append(regex);
writer.flush();
diff --git a/src/main/java/org/archive/util/Grep.java b/src/main/java/org/archive/util/Grep.java
index e446e47e..892429bd 100644
--- a/src/main/java/org/archive/util/Grep.java
+++ b/src/main/java/org/archive/util/Grep.java
@@ -1,10 +1,13 @@
package org.archive.util;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
import java.io.BufferedReader;
-import java.io.FileReader;
+import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintStream;
+import java.nio.charset.Charset;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
@@ -119,14 +122,14 @@ protected void doTheGrepThing() throws Exception {
if (files != null) {
if (files.size() == 1) {
- grep(new BufferedReader(new FileReader(files.get(0))), "");
+ grep(new BufferedReader(new InputStreamReader(new FileInputStream(files.get(0)), UTF_8)), "");
} else {
for (String path : files) {
- grep(new BufferedReader(new FileReader(path)), path + ": ");
+ grep(new BufferedReader(new InputStreamReader(new FileInputStream(path), UTF_8)), path + ": ");
}
}
} else {
- grep(new BufferedReader(new InputStreamReader(System.in)), "");
+ grep(new BufferedReader(new InputStreamReader(System.in, Charset.defaultCharset())), "");
}
}
diff --git a/src/main/java/org/archive/util/SURT.java b/src/main/java/org/archive/util/SURT.java
index c52582e1..99347e9f 100644
--- a/src/main/java/org/archive/util/SURT.java
+++ b/src/main/java/org/archive/util/SURT.java
@@ -27,6 +27,7 @@
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintStream;
+import java.nio.charset.Charset;
import java.util.regex.Matcher;
import org.archive.url.URIException;
@@ -243,7 +244,7 @@ public static void main(String[] args) throws IOException {
new BufferedOutputStream(new FileOutputStream(args[1])), false, UTF_8.name())
: System.out;
BufferedReader br =
- new BufferedReader(new InputStreamReader(in));
+ new BufferedReader(new InputStreamReader(in, Charset.defaultCharset()));
String line;
while((line = br.readLine())!=null) {
if(line.indexOf("#")>0) line=line.substring(0,line.indexOf("#"));
diff --git a/src/main/java/org/archive/util/binsearch/SeekCDXBenchmarker.java b/src/main/java/org/archive/util/binsearch/SeekCDXBenchmarker.java
index 76b7b2b9..45c2ee04 100644
--- a/src/main/java/org/archive/util/binsearch/SeekCDXBenchmarker.java
+++ b/src/main/java/org/archive/util/binsearch/SeekCDXBenchmarker.java
@@ -3,6 +3,7 @@
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
+import java.nio.charset.Charset;
import org.archive.url.WaybackURLKeyMaker;
import org.archive.util.binsearch.impl.MappedSeekableLineReaderFactory;
@@ -52,7 +53,7 @@ public static void main(String[] args) throws IOException {
SortedTextFile sorted = new SortedTextFile(factory);
sorted.setBinsearchBlockSize(blocksize);
- BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));
+ BufferedReader reader = new BufferedReader(new InputStreamReader(System.in, Charset.defaultCharset()));
WaybackURLKeyMaker keymaker = new WaybackURLKeyMaker(true);
From e3c06efb091377fd0474edd8eb18e0e67b80c3b3 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Wed, 12 Nov 2025 12:51:15 +0100
Subject: [PATCH 09/16] Unit tests: pass charset to all occurrences of
String.getBytes()
---
.../archive/io/HeaderedArchiveRecordTest.java | 10 +++--
.../archive/io/RecordingInputStreamTest.java | 8 ++--
.../archive/io/RecordingOutputStreamTest.java | 40 ++++++++++---------
.../archive/io/ReplayCharSequenceTest.java | 6 ++-
.../org/archive/io/arc/ARCWriterPoolTest.java | 8 ++--
.../org/archive/io/arc/ARCWriterTest.java | 12 +++---
.../org/archive/io/warc/WARCWriterTest.java | 8 ++--
7 files changed, 53 insertions(+), 39 deletions(-)
diff --git a/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java b/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java
index 005e2c49..65027395 100644
--- a/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java
+++ b/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java
@@ -31,6 +31,8 @@
import org.archive.io.warc.WARCRecord;
import org.junit.jupiter.api.Test;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -68,7 +70,7 @@ public void testParseHttpHeadersInWARC() throws IOException {
final String hdr = warcHeader + HTTPHEADER + BODY;
- WARCRecord r = new WARCRecord(new ByteArrayInputStream(hdr.getBytes()),
+ WARCRecord r = new WARCRecord(new ByteArrayInputStream(hdr.getBytes(UTF_8)),
"READER_IDENTIFIER", 0, false, true);
HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true);
@@ -156,7 +158,7 @@ public String getVersion() {
}
};
- ARCRecord r = new ARCRecord(new ByteArrayInputStream(hdr.getBytes()),
+ ARCRecord r = new ARCRecord(new ByteArrayInputStream(hdr.getBytes(UTF_8)),
arh, 0, false, true, false);
HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true);
@@ -175,7 +177,7 @@ public void testEasierParseHttpHeadersInARC() throws IOException {
+ " 192.168.0.1 20070515111004 text/html 167568\n";
final String hdr = arcHeader + HTTPHEADER + BODY;
- ARCRecord r = new ARCRecord(new ByteArrayInputStream(hdr.getBytes()),
+ ARCRecord r = new ARCRecord(new ByteArrayInputStream(hdr.getBytes(UTF_8)),
"READER_IDENTIFIER", 0, false, true, false);
HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true);
@@ -205,7 +207,7 @@ public void testNoheaderWARC() throws IOException {
String c = "WARC/0.12\r\nContent-Type: text/plain\r\n"
+ "Content-Length: " + b.length() + "\r\n\r\n" + b;
org.archive.io.warc.WARCRecord r = new org.archive.io.warc.WARCRecord(
- new ByteArrayInputStream(c.getBytes()), "READER_IDENTIFIER", 0,
+ new ByteArrayInputStream(c.getBytes(UTF_8)), "READER_IDENTIFIER", 0,
false, true);
HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true);
assertTrue(har.isStrict());
diff --git a/src/test/java/org/archive/io/RecordingInputStreamTest.java b/src/test/java/org/archive/io/RecordingInputStreamTest.java
index 49160aa3..8ccee986 100644
--- a/src/test/java/org/archive/io/RecordingInputStreamTest.java
+++ b/src/test/java/org/archive/io/RecordingInputStreamTest.java
@@ -28,6 +28,8 @@
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -55,7 +57,7 @@ public void testReadFullyOrUntil() throws RecorderTimeoutException, IOException,
RecordingInputStream ris = new RecordingInputStream(16384, (new File(
tempDir, "testReadFullyOrUntil").getAbsolutePath()));
ByteArrayInputStream bais = new ByteArrayInputStream(
- "abcdefghijklmnopqrstuvwxyz".getBytes());
+ "abcdefghijklmnopqrstuvwxyz".getBytes(UTF_8));
// test soft max
ris.open(bais);
ris.setLimits(10,0,0);
@@ -87,7 +89,7 @@ public void testReadFullyOrUntil() throws RecorderTimeoutException, IOException,
PipedOutputStream pout = new PipedOutputStream(pin);
ris.open(pin);
exceptionThrown = false;
- trickle("abcdefghijklmnopqrstuvwxyz".getBytes(),pout);
+ trickle("abcdefghijklmnopqrstuvwxyz".getBytes(UTF_8),pout);
int timeout = 200;
try {
ris.setLimits(0, timeout,0);
@@ -133,7 +135,7 @@ public void testAsOutputStream() throws IOException {
RecordingInputStream ris = new RecordingInputStream(16384, (new File(
tempDir, "testAsOutputStream").getAbsolutePath()));
ris.open(null);
- ris.asOutputStream().write("hello".getBytes());
+ ris.asOutputStream().write("hello".getBytes(UTF_8));
ris.close();
ByteArrayOutputStream baos = new ByteArrayOutputStream();
ris.getReplayInputStream().readFullyTo(baos);
diff --git a/src/test/java/org/archive/io/RecordingOutputStreamTest.java b/src/test/java/org/archive/io/RecordingOutputStreamTest.java
index c94f8245..0dba910e 100644
--- a/src/test/java/org/archive/io/RecordingOutputStreamTest.java
+++ b/src/test/java/org/archive/io/RecordingOutputStreamTest.java
@@ -28,6 +28,8 @@
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
@@ -266,61 +268,61 @@ public void testMessageBodyBegin() throws IOException {
ros.setSha1Digest();
ros.open(new ByteArrayOutputStream());
- ros.write("0123456789\n\nabcdefghij".getBytes());
+ ros.write("0123456789\n\nabcdefghij".getBytes(UTF_8));
assertEquals(12, ros.getMessageBodyBegin());
assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue()));
ros.close();
ros.open(new ByteArrayOutputStream());
- ros.write("0123456789\r\n\r\nabcdefghij".getBytes());
+ ros.write("0123456789\r\n\r\nabcdefghij".getBytes(UTF_8));
assertEquals(14, ros.getMessageBodyBegin());
assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue()));
ros.close();
ros.open(new ByteArrayOutputStream());
- ros.write("0123456789\n\r\nabcdefghij".getBytes());
+ ros.write("0123456789\n\r\nabcdefghij".getBytes(UTF_8));
assertEquals(13, ros.getMessageBodyBegin());
assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue()));
ros.close();
ros.open(new ByteArrayOutputStream());
- ros.write("0123456789\n".getBytes());
+ ros.write("0123456789\n".getBytes(UTF_8));
assertEquals(-1, ros.getMessageBodyBegin());
- ros.write("\nabcdefghij".getBytes());
+ ros.write("\nabcdefghij".getBytes(UTF_8));
assertEquals(12, ros.getMessageBodyBegin());
assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue()));
ros.close();
ros.open(new ByteArrayOutputStream());
- ros.write("0123456789\n".getBytes());
+ ros.write("0123456789\n".getBytes(UTF_8));
assertEquals(-1, ros.getMessageBodyBegin());
- ros.write("\r\nabcdefghij".getBytes());
+ ros.write("\r\nabcdefghij".getBytes(UTF_8));
assertEquals(13, ros.getMessageBodyBegin());
assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue()));
ros.close();
ros.open(new ByteArrayOutputStream());
- ros.write("0123456789\n\r".getBytes());
+ ros.write("0123456789\n\r".getBytes(UTF_8));
assertEquals(-1, ros.getMessageBodyBegin());
- ros.write("\nabcdefghij".getBytes());
+ ros.write("\nabcdefghij".getBytes(UTF_8));
assertEquals(13, ros.getMessageBodyBegin());
assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue()));
ros.close();
ros.open(new ByteArrayOutputStream());
- ros.write("0123456789".getBytes());
+ ros.write("0123456789".getBytes(UTF_8));
ros.write('\n');
assertEquals(-1, ros.getMessageBodyBegin());
- ros.write("\nabcdefghij".getBytes());
+ ros.write("\nabcdefghij".getBytes(UTF_8));
assertEquals(12, ros.getMessageBodyBegin());
assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue()));
ros.close();
ros.open(new ByteArrayOutputStream());
- ros.write("0123456789".getBytes());
+ ros.write("0123456789".getBytes(UTF_8));
ros.write('\n');
ros.write('\n');
- for (int b: "abcdefghij".getBytes()) {
+ for (int b: "abcdefghij".getBytes(UTF_8)) {
ros.write(b);
}
assertEquals(12, ros.getMessageBodyBegin());
@@ -328,11 +330,11 @@ public void testMessageBodyBegin() throws IOException {
ros.close();
ros.open(new ByteArrayOutputStream());
- ros.write("0123456789".getBytes());
+ ros.write("0123456789".getBytes(UTF_8));
ros.write('\n');
ros.write('\r');
ros.write('\n');
- for (int b: "abcdefghij".getBytes()) {
+ for (int b: "abcdefghij".getBytes(UTF_8)) {
ros.write(b);
}
assertEquals(13, ros.getMessageBodyBegin());
@@ -340,17 +342,17 @@ public void testMessageBodyBegin() throws IOException {
ros.close();
ros.open(new ByteArrayOutputStream());
- ros.write("0123456789\n".getBytes());
+ ros.write("0123456789\n".getBytes(UTF_8));
ros.write('\n');
- ros.write("abcdefghij".getBytes());
+ ros.write("abcdefghij".getBytes(UTF_8));
assertEquals(12, ros.getMessageBodyBegin());
assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue()));
ros.close();
ros.open(new ByteArrayOutputStream());
- ros.write("0123456789\n\r".getBytes());
+ ros.write("0123456789\n\r".getBytes(UTF_8));
ros.write('\n');
- ros.write("abcdefghij".getBytes());
+ ros.write("abcdefghij".getBytes(UTF_8));
assertEquals(13, ros.getMessageBodyBegin());
assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue()));
ros.close();
diff --git a/src/test/java/org/archive/io/ReplayCharSequenceTest.java b/src/test/java/org/archive/io/ReplayCharSequenceTest.java
index 3234259c..f0b688a9 100644
--- a/src/test/java/org/archive/io/ReplayCharSequenceTest.java
+++ b/src/test/java/org/archive/io/ReplayCharSequenceTest.java
@@ -36,6 +36,8 @@
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
import static org.junit.jupiter.api.Assertions.*;
/**
@@ -143,7 +145,7 @@ public void testGetReplayCharSequenceMultiByteZeroOffset()
@Test
public void testReplayCharSequenceByteToString() throws IOException {
String fileContent = "Some file content";
- byte [] buffer = fileContent.getBytes();
+ byte [] buffer = fileContent.getBytes(UTF_8);
RecordingOutputStream ros = writeTestStream(
buffer,1,
"testReplayCharSequenceByteToString.txt",0);
@@ -207,7 +209,7 @@ public void testSingleByteEncodings() throws IOException {
@Test
public void testReplayCharSequenceByteToStringOverflow() throws IOException {
String fileContent = "Some file content. "; // ascii
- byte [] buffer = fileContent.getBytes();
+ byte [] buffer = fileContent.getBytes(UTF_8);
RecordingOutputStream ros = writeTestStream(
buffer,1,
"testReplayCharSequenceByteToStringOverflow.txt",1);
diff --git a/src/test/java/org/archive/io/arc/ARCWriterPoolTest.java b/src/test/java/org/archive/io/arc/ARCWriterPoolTest.java
index 954da636..f6820337 100644
--- a/src/test/java/org/archive/io/arc/ARCWriterPoolTest.java
+++ b/src/test/java/org/archive/io/arc/ARCWriterPoolTest.java
@@ -30,6 +30,8 @@
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.archive.format.arc.ARCConstants.*;
@@ -51,7 +53,7 @@ public void testARCWriterPool()
WriterPoolMember [] writers = new WriterPoolMember[MAX_ACTIVE];
final String CONTENT = "Any old content";
ByteArrayOutputStream baos = new ByteArrayOutputStream();
- baos.write(CONTENT.getBytes());
+ baos.write(CONTENT.getBytes(UTF_8));
for (int i = 0; i < MAX_ACTIVE; i++) {
writers[i] = pool.borrowFile();
assertEquals(i + 1, pool.getNumActive(), "Number active");
@@ -81,7 +83,7 @@ public void testInvalidate() throws Exception {
WriterPoolMember [] writers = new WriterPoolMember[MAX_ACTIVE];
final String CONTENT = "Any old content";
ByteArrayOutputStream baos = new ByteArrayOutputStream();
- baos.write(CONTENT.getBytes());
+ baos.write(CONTENT.getBytes(UTF_8));
for (int i = 0; i < MAX_ACTIVE; i++) {
writers[i] = pool.borrowFile();
assertEquals(i + 1, pool.getNumActive(), "Number active");
@@ -124,4 +126,4 @@ private WriterPoolSettings getSettings(final boolean isCompressed) {
Arrays.asList(files),
null);
}
-}
\ No newline at end of file
+}
diff --git a/src/test/java/org/archive/io/arc/ARCWriterTest.java b/src/test/java/org/archive/io/arc/ARCWriterTest.java
index ca300697..8b2f7d64 100644
--- a/src/test/java/org/archive/io/arc/ARCWriterTest.java
+++ b/src/test/java/org/archive/io/arc/ARCWriterTest.java
@@ -47,6 +47,8 @@
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
import static org.junit.jupiter.api.Assertions.*;
import static org.archive.format.arc.ARCConstants.*;
@@ -122,11 +124,11 @@ protected int writeRandomHTTPRecord(ARCWriter arcWriter, int index)
// Start the record with an arbitrary 14-digit date per RFC2540
String now = ArchiveUtils.get14DigitDate();
int recordLength = 0;
- byte[] record = (getContent(indexStr)).getBytes();
+ byte[] record = (getContent(indexStr)).getBytes(UTF_8);
recordLength += record.length;
baos.write(record);
// Add the newline between records back in
- baos.write("\n".getBytes());
+ baos.write("\n".getBytes(UTF_8));
recordLength += 1;
arcWriter.write("http://www.one.net/id=" + indexStr, "text/html",
"0.1.2.3", Long.parseLong(now), recordLength, baos);
@@ -305,7 +307,7 @@ protected CorruptibleARCWriter createARCWriter(String name, boolean compress) {
protected static ByteArrayInputStream getBais(String str)
throws IOException {
- return new ByteArrayInputStream(str.getBytes());
+ return new ByteArrayInputStream(str.getBytes(UTF_8));
}
/**
@@ -417,7 +419,7 @@ protected void lengthTooShort(String name, boolean compress, boolean strict)
ByteArrayInputStream bais = getBais(content+"SOME TRAILING BYTES");
writeRecord(writer, SOME_URL, "text/html",
content.length(), bais);
- writer.setEndJunk("SOME TRAILING BYTES".getBytes());
+ writer.setEndJunk("SOME TRAILING BYTES".getBytes(UTF_8));
writeRecord(writer, SOME_URL, "text/html",
content.length(), getBais(content));
} finally {
@@ -518,7 +520,7 @@ public void testGapError() throws IOException {
String content = getContent();
// Make a 'weird' RIS that returns bad 'remaining' length
// awhen remaining should be 0
- ReplayInputStream ris = new ReplayInputStream(content.getBytes(),
+ ReplayInputStream ris = new ReplayInputStream(content.getBytes(UTF_8),
content.length(), null) {
public long remaining() {
return (super.remaining()==0) ? -1 : super.remaining();
diff --git a/src/test/java/org/archive/io/warc/WARCWriterTest.java b/src/test/java/org/archive/io/warc/WARCWriterTest.java
index c0ace5f0..d2684fa4 100644
--- a/src/test/java/org/archive/io/warc/WARCWriterTest.java
+++ b/src/test/java/org/archive/io/warc/WARCWriterTest.java
@@ -42,6 +42,8 @@
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
import static org.junit.jupiter.api.Assertions.*;
import static org.archive.format.warc.WARCConstants.*;
@@ -228,7 +230,7 @@ protected int writeRandomHTTPRecord(WARCWriter w, int index)
String indexStr = Integer.toString(index);
recordInfo.setUrl("http://www.one.net/id=" + indexStr);
- byte[] record = (getContent(indexStr)).getBytes();
+ byte[] record = (getContent(indexStr)).getBytes(UTF_8);
recordInfo.setContentLength((long) record.length);
ByteArrayOutputStream baos = new ByteArrayOutputStream();
@@ -385,7 +387,7 @@ protected WARCWriter createWARCWriter(String name,
protected static ByteArrayOutputStream getBaos(String str)
throws IOException {
ByteArrayOutputStream baos = new ByteArrayOutputStream();
- baos.write(str.getBytes());
+ baos.write(str.getBytes(UTF_8));
return baos;
}
@@ -524,4 +526,4 @@ public void testArcRecordOffsetReads() throws Exception {
assertTrue(totalRead > 0);
}
}
-}
\ No newline at end of file
+}
From 6b0f0f29f8193118396d1cd693dc1a086c63d755 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Wed, 12 Nov 2025 13:20:24 +0100
Subject: [PATCH 10/16] Unit tests: add Locale.ROOT as parameter to all
occurrences of PrintStream.format(...) and number formatters.
Unify usage of charset constants.
---
.../format/gzip/zipnum/ZipNumWriterTest.java | 10 ++++----
.../org/archive/format/json/JSONViewTest.java | 8 ++++---
.../format/text/html/CDATALexerTest.java | 4 +++-
.../archive/io/HeaderedArchiveRecordTest.java | 6 ++---
.../archive/io/RecordingInputStreamTest.java | 9 ++++----
.../archive/io/ReplayCharSequenceTest.java | 23 ++++++++++---------
.../io/RepositionableInputStreamTest.java | 4 +++-
.../html/ExtractingParseObserverTest.java | 3 ++-
.../resource/html/HTMLMetaDataTest.java | 4 +++-
.../url/BasicURLCanonicalizerTest.java | 5 ++--
.../java/org/archive/url/URLParserTest.java | 10 +++++---
.../archive/url/URLRegexTransformerTest.java | 4 +++-
.../java/org/archive/util/ByteOpTest.java | 5 ++--
.../org/archive/util/CrossProductTest.java | 8 ++++++-
src/test/java/org/archive/util/TestUtils.java | 5 ++--
.../util/binsearch/SortedTextFileTest.java | 5 +++-
16 files changed, 72 insertions(+), 41 deletions(-)
diff --git a/src/test/java/org/archive/format/gzip/zipnum/ZipNumWriterTest.java b/src/test/java/org/archive/format/gzip/zipnum/ZipNumWriterTest.java
index 25a5eaa7..13658bcb 100644
--- a/src/test/java/org/archive/format/gzip/zipnum/ZipNumWriterTest.java
+++ b/src/test/java/org/archive/format/gzip/zipnum/ZipNumWriterTest.java
@@ -10,7 +10,7 @@
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
-import java.nio.charset.StandardCharsets;
+import java.util.Locale;
import org.archive.format.gzip.GZIPMemberSeries;
import org.archive.format.gzip.GZIPSeriesMember;
@@ -18,6 +18,8 @@
import org.junit.jupiter.api.Test;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
import static org.junit.jupiter.api.Assertions.assertEquals;
public class ZipNumWriterTest {
@@ -28,16 +30,16 @@ public void testAddRecord() throws IOException {
File summ = File.createTempFile("test-znw",".summ");
main.deleteOnExit();
summ.deleteOnExit();
- System.out.format("Summ: %s\n", summ.getAbsolutePath());
+ System.out.format(Locale.ROOT, "Summ: %s\n", summ.getAbsolutePath());
int limit = 10;
ZipNumWriter znw = new ZipNumWriter(new FileOutputStream(main,false),
new FileOutputStream(summ,false), limit);
for(int i = 0; i < 1000; i++) {
- znw.addRecord(String.format("%06d\n",i).getBytes(StandardCharsets.UTF_8));
+ znw.addRecord(String.format(Locale.ROOT,"%06d\n",i).getBytes(UTF_8));
}
znw.close();
InputStreamReader isr =
- new InputStreamReader(new FileInputStream(summ), StandardCharsets.UTF_8);
+ new InputStreamReader(new FileInputStream(summ), UTF_8);
BufferedReader br = new BufferedReader(isr);
String line = null;
int count = 0;
diff --git a/src/test/java/org/archive/format/json/JSONViewTest.java b/src/test/java/org/archive/format/json/JSONViewTest.java
index aabbe7df..6d199025 100644
--- a/src/test/java/org/archive/format/json/JSONViewTest.java
+++ b/src/test/java/org/archive/format/json/JSONViewTest.java
@@ -1,5 +1,7 @@
package org.archive.format.json;
+import java.util.Locale;
+
import org.archive.util.TestUtils;
import org.json.JSONException;
import org.json.JSONObject;
@@ -17,16 +19,16 @@ public void testBytes() throws JSONException {
JSONObject o = new JSONObject();
o.append("name1", "val\\rue1");
String json = o.toString();
- System.out.format("once: (%s)\n",json);
+ System.out.format(Locale.ROOT, "once: (%s)\n", json);
JSONObject o2 = new JSONObject(json);
- System.out.format("twice: (%s)\n",o2.toString());
+ System.out.format(Locale.ROOT, "twice: (%s)\n", o2.toString());
byte b[] = new byte[2];
for(int i = 0; i < 256; i++) {
b[0] = (byte) i;
int gi = getInt(b);
- System.out.format("I(%d) gi(%d)\n",i,gi);
+ System.out.format(Locale.ROOT, "I(%d) gi(%d)\n", i, gi);
}
}
diff --git a/src/test/java/org/archive/format/text/html/CDATALexerTest.java b/src/test/java/org/archive/format/text/html/CDATALexerTest.java
index 856576ba..7c9f24f3 100644
--- a/src/test/java/org/archive/format/text/html/CDATALexerTest.java
+++ b/src/test/java/org/archive/format/text/html/CDATALexerTest.java
@@ -10,6 +10,8 @@
import static org.junit.jupiter.api.Assertions.*;
+import java.util.Locale;
+
public class CDATALexerTest {
CDATALexer l;
Node n;
@@ -102,7 +104,7 @@ public void testInJSComment() throws ParserException {
}
private void assertJSContentWorks(String js) throws ParserException {
- String html = String.format("",js);
+ String html = String.format(Locale.ROOT,"",js);
l = makeLexer(html);
assertFalse(l.inCSS());
assertFalse(l.inJS());
diff --git a/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java b/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java
index 65027395..5d31b890 100644
--- a/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java
+++ b/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java
@@ -78,7 +78,7 @@ public void testParseHttpHeadersInWARC() throws IOException {
byte[] b = new byte[BODY.length()];
har.read(b);
- String bodyRead = new String(b);
+ String bodyRead = new String(b, UTF_8);
assertEquals(BODY, bodyRead);
assertHeaderCorrectlyParsed(har.getContentHeaders());
assertEquals(har.getHeader().getUrl(), url,
@@ -165,7 +165,7 @@ public String getVersion() {
har.skipHttpHeader();
byte[] b = new byte[BODY.length()];
har.read(b);
- String bodyRead = new String(b);
+ String bodyRead = new String(b, UTF_8);
assertEquals(BODY, bodyRead);
assertHeaderCorrectlyParsed(har.getContentHeaders());
}
@@ -184,7 +184,7 @@ public void testEasierParseHttpHeadersInARC() throws IOException {
har.skipHttpHeader();
byte[] b = new byte[BODY.length()];
har.read(b);
- String bodyRead = new String(b);
+ String bodyRead = new String(b, UTF_8);
assertEquals(BODY, bodyRead);
assertHeaderCorrectlyParsed(har.getContentHeaders());
assertEquals(har.getHeader().getUrl(), url, "failed to retrieve Url from metadata");
diff --git a/src/test/java/org/archive/io/RecordingInputStreamTest.java b/src/test/java/org/archive/io/RecordingInputStreamTest.java
index 8ccee986..d794d925 100644
--- a/src/test/java/org/archive/io/RecordingInputStreamTest.java
+++ b/src/test/java/org/archive/io/RecordingInputStreamTest.java
@@ -66,8 +66,9 @@ public void testReadFullyOrUntil() throws RecorderTimeoutException, IOException,
ReplayInputStream res = ris.getReplayInputStream();
ByteArrayOutputStream baos = new ByteArrayOutputStream();
res.readFullyTo(baos);
- assertEquals("abcdefg",new String(baos.toByteArray()),"soft max cutoff");
- // test hard max
+ assertEquals("abcdefg", new String(baos.toByteArray(), UTF_8),
+ "soft max cutoff");
+ // test hard max
bais.reset();
baos.reset();
ris.open(bais);
@@ -82,8 +83,8 @@ public void testReadFullyOrUntil() throws RecorderTimeoutException, IOException,
ris.close();
res = ris.getReplayInputStream();
res.readFullyTo(baos);
- assertEquals("abcdefghijk",new String(baos.toByteArray()),
- "hard max cutoff");
+ assertEquals("abcdefghijk", new String(baos.toByteArray(), UTF_8),
+ "hard max cutoff");
// test timeout
PipedInputStream pin = new PipedInputStream();
PipedOutputStream pout = new PipedOutputStream(pin);
diff --git a/src/test/java/org/archive/io/ReplayCharSequenceTest.java b/src/test/java/org/archive/io/ReplayCharSequenceTest.java
index f0b688a9..3935837b 100644
--- a/src/test/java/org/archive/io/ReplayCharSequenceTest.java
+++ b/src/test/java/org/archive/io/ReplayCharSequenceTest.java
@@ -25,17 +25,19 @@
import java.nio.charset.StandardCharsets;
import java.text.NumberFormat;
import java.util.Date;
+import java.util.Locale;
import java.util.Random;
import java.util.logging.Logger;
import org.archive.util.FileUtils;
-import com.google.common.base.Charsets;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
+import static java.nio.charset.StandardCharsets.US_ASCII;
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.jupiter.api.Assertions.*;
@@ -135,7 +137,7 @@ public void testGetReplayCharSequenceMultiByteZeroOffset()
RecordingOutputStream ros = writeTestStream(
regularBuffer,MULTIPLIER,
"testGetReplayCharSequenceMultiByteZeroOffset",MULTIPLIER);
- ReplayCharSequence rcs = getReplayCharSequence(ros,Charsets.UTF_8);
+ ReplayCharSequence rcs = getReplayCharSequence(ros, UTF_8);
for (int i = 0; i < MULTIPLIER; i++) {
accessingCharacters(rcs);
@@ -181,7 +183,7 @@ public void testSingleByteEncodings() throws IOException {
String latin1String = new String(bytes, "latin1");
RecordingOutputStream ros = writeTestStream(
bytes, 1, "testSingleByteEncodings-latin1.txt", 0);
- ReplayCharSequence rcs = getReplayCharSequence(ros,Charsets.ISO_8859_1);
+ ReplayCharSequence rcs = getReplayCharSequence(ros, ISO_8859_1);
String result = rcs.toString();
logger.fine("latin1[0] " + toHexString(latin1String));
logger.fine("latin1[1] " + toHexString(result));
@@ -219,8 +221,8 @@ public void testReplayCharSequenceByteToStringOverflow() throws IOException {
// both encodings because they exercise different code paths. UTF-8 is
// decoded to UTF-16 while windows-1252 is memory mapped directly. See
// GenericReplayCharSequence
- ReplayCharSequence rcsUtf8 = getReplayCharSequence(ros,Charsets.UTF_8);
- ReplayCharSequence rcs1252 = getReplayCharSequence(ros,Charset.forName("windows-1252"));
+ ReplayCharSequence rcsUtf8 = getReplayCharSequence(ros, UTF_8);
+ ReplayCharSequence rcs1252 = getReplayCharSequence(ros, Charset.forName("windows-1252"));
String result = rcsUtf8.toString();
assertEquals(expectedContent, result, "Strings don't match");
@@ -244,7 +246,7 @@ public void testReplayCharSequenceByteToStringMulti() throws IOException {
buffer,1,
"testReplayCharSequenceByteToStringMulti.txt",MULTIPLICAND-1);
for (int i = 0; i < 3; i++) {
- ReplayCharSequence rcs = getReplayCharSequence(ros,StandardCharsets.UTF_8);
+ ReplayCharSequence rcs = getReplayCharSequence(ros, UTF_8);
String result = rcs.toString();
assertEquals(result, expectedResult, "Strings don't match");
rcs.close();
@@ -257,8 +259,7 @@ public void testReplayCharSequenceByteToStringMulti() throws IOException {
@Disabled
public void xestHugeReplayCharSequence() throws IOException {
String fileContent = "01234567890123456789";
- String characterEncoding = "ascii";
- byte[] buffer = fileContent.getBytes(characterEncoding);
+ byte[] buffer = fileContent.getBytes(US_ASCII);
long reps = (long) Integer.MAX_VALUE / (long) buffer.length + 1000000l;
@@ -266,7 +267,7 @@ public void xestHugeReplayCharSequence() throws IOException {
+ " bytes to testHugeReplayCharSequence.txt");
RecordingOutputStream ros = writeTestStream(buffer, 0,
"testHugeReplayCharSequence.txt", reps);
- ReplayCharSequence rcs = getReplayCharSequence(ros,Charset.forName(characterEncoding));
+ ReplayCharSequence rcs = getReplayCharSequence(ros, US_ASCII);
if (reps * fileContent.length() > (long) Integer.MAX_VALUE) {
assertEquals(Integer.MAX_VALUE, rcs.length(), "ReplayCharSequence has wrong length (length()="
@@ -285,7 +286,7 @@ public void xestHugeReplayCharSequence() throws IOException {
// NumberFormat.getInstance().format(index));
assertEquals(fileContent.charAt(index % fileContent.length()),
rcs.charAt(index), "Characters don't match (index="
- + NumberFormat.getInstance().format(index) + ")");
+ + NumberFormat.getInstance(Locale.ROOT).format(index) + ")");
}
// check that out of bounds indices throw exception
@@ -309,7 +310,7 @@ public void xestHugeReplayCharSequence() throws IOException {
// NumberFormat.getInstance().format(index));
assertEquals(fileContent.charAt(index % fileContent.length()),
rcs.charAt(index), "Characters don't match (index="
- + NumberFormat.getInstance().format(index) + ")");
+ + NumberFormat.getInstance(Locale.ROOT).format(index) + ")");
}
}
diff --git a/src/test/java/org/archive/io/RepositionableInputStreamTest.java b/src/test/java/org/archive/io/RepositionableInputStreamTest.java
index 228c9042..08143d01 100644
--- a/src/test/java/org/archive/io/RepositionableInputStreamTest.java
+++ b/src/test/java/org/archive/io/RepositionableInputStreamTest.java
@@ -27,6 +27,8 @@
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
import static org.junit.jupiter.api.Assertions.assertEquals;
public class RepositionableInputStreamTest {
@@ -63,7 +65,7 @@ public void testname() throws Exception {
long offset = 0;
for (int i = 0; i < 10; i++) {
ris.read(bytes, 0, LINE.length());
- assertEquals(LINE, new String(bytes));
+ assertEquals(LINE, new String(bytes, UTF_8));
offset += LINE.length();
assertEquals(offset, ris.position());
}
diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
index 157499ff..e34d4e6f 100644
--- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
+++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
@@ -3,6 +3,7 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
+import java.util.Locale;
import java.util.logging.Logger;
import org.archive.extract.ExtractingResourceFactoryMapper;
@@ -52,7 +53,7 @@ public void testHandleStyleNodeExceptions() throws Exception {
TextNode tn = new TextNode(css);
epo.handleStyleNode(tn);
} catch(Exception e) {
- System.err.format("And the winner is....(%s)\n", css);
+ System.err.format(Locale.ROOT, "And the winner is....(%s)\n", css);
e.printStackTrace();
except = true;
throw e;
diff --git a/src/test/java/org/archive/resource/html/HTMLMetaDataTest.java b/src/test/java/org/archive/resource/html/HTMLMetaDataTest.java
index 3b4193b9..a3c8c1c9 100644
--- a/src/test/java/org/archive/resource/html/HTMLMetaDataTest.java
+++ b/src/test/java/org/archive/resource/html/HTMLMetaDataTest.java
@@ -1,5 +1,7 @@
package org.archive.resource.html;
+import java.util.Locale;
+
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
@@ -59,7 +61,7 @@ private void appendStrArr(JSONObject o, String a[][]) throws JSONException {
}
private void appendStrArr2(JSONObject o, String k, String... a) throws JSONException {
- System.out.format("A length(%d)\n", a.length);
+ System.out.format(Locale.ROOT, "A length(%d)\n", a.length);
JSONObject n = new JSONObject();
if((a.length & 1) == 1) {
throw new IllegalArgumentException();
diff --git a/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java b/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java
index 19b1984f..45989416 100644
--- a/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java
+++ b/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java
@@ -1,6 +1,7 @@
package org.archive.url;
import java.net.URISyntaxException;
+import java.util.Locale;
import org.junit.jupiter.api.Test;
@@ -204,12 +205,12 @@ public void testFoo() {
String path = "/a/b/c/";
String[] paths = path.split("/",-1);
for(String p : paths) {
- System.out.format("(%s)",p);
+ System.out.format(Locale.ROOT, "(%s)", p);
}
System.out.println();
paths = path.split("/");
for(String p : paths) {
- System.out.format("(%s)",p);
+ System.out.format(Locale.ROOT, "(%s)", p);
}
System.out.println();
}
diff --git a/src/test/java/org/archive/url/URLParserTest.java b/src/test/java/org/archive/url/URLParserTest.java
index bc8fc3a5..c942a260 100644
--- a/src/test/java/org/archive/url/URLParserTest.java
+++ b/src/test/java/org/archive/url/URLParserTest.java
@@ -3,10 +3,14 @@
import java.io.UnsupportedEncodingException;
import java.net.URISyntaxException;
import java.net.URLDecoder;
+import java.util.Locale;
import com.google.common.net.InetAddresses;
+
import org.junit.jupiter.api.Test;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
import static org.junit.jupiter.api.Assertions.assertEquals;
public class URLParserTest {
@@ -15,7 +19,7 @@ public void testGuava() throws URIException, UnsupportedEncodingException {
Long l = Long.parseLong("3279880203");
int i2 = l.intValue();
// int i = Integer.decode("3279880203");
- System.err.format("FromNum(%s)\n", InetAddresses.fromInteger(i2).getHostAddress());
+ System.err.format(Locale.ROOT, "FromNum(%s)\n", InetAddresses.fromInteger(i2).getHostAddress());
}
@Test
@@ -30,7 +34,7 @@ public void testAddDefaultSchemeIfNeeded() {
@Test
public void testParse() throws UnsupportedEncodingException, URISyntaxException {
- System.out.format("O(%s) E(%s)\n","%66",URLDecoder.decode("%66","UTF-8"));
+ System.out.format(Locale.ROOT, "O(%s) E(%s)\n","%66", URLDecoder.decode("%66", UTF_8.name()));
checkParse("http://www.archive.org/index.html#foo",
null, "http", null, null, "www.archive.org", -1, "/index.html", null, "foo",
"http://www.archive.org/index.html#foo", "/index.html");
@@ -96,7 +100,7 @@ private void checkParse(String s, String opaque, String scheme, String authUser,
String authPass, String host, int port, String path,
String query, String fragment, String urlString, String pathQuery) throws URISyntaxException {
HandyURL h = URLParser.parse(s);
- System.out.format("Input:(%s)\nHandyURL\t%s\n",s,h.toDebugString());
+ System.out.format(Locale.ROOT, "Input:(%s)\nHandyURL\t%s\n", s, h.toDebugString());
assertEquals(scheme, h.getScheme());
assertEquals(authUser, h.getAuthUser());
assertEquals(authPass, h.getAuthPass());
diff --git a/src/test/java/org/archive/url/URLRegexTransformerTest.java b/src/test/java/org/archive/url/URLRegexTransformerTest.java
index 73c43f96..d5c98f6a 100644
--- a/src/test/java/org/archive/url/URLRegexTransformerTest.java
+++ b/src/test/java/org/archive/url/URLRegexTransformerTest.java
@@ -5,6 +5,8 @@
import static org.junit.jupiter.api.Assertions.assertEquals;
+import java.util.Locale;
+
public class URLRegexTransformerTest {
@Test
@@ -49,7 +51,7 @@ public void testStripPathSessionID() {
private static void checkStripPathSessionID(String orig, String want) {
String got = URLRegexTransformer.stripPathSessionID(orig);
- assertEquals(want, got, String.format("FAIL Orig(%s) Got(%s) Want(%s)", orig, got, want));
+ assertEquals(want, got, String.format(Locale.ROOT, "FAIL Orig(%s) Got(%s) Want(%s)", orig, got, want));
}
// private static final String BASE = "http://www.archive.org/index.html";
diff --git a/src/test/java/org/archive/util/ByteOpTest.java b/src/test/java/org/archive/util/ByteOpTest.java
index 49781c36..eb89353e 100644
--- a/src/test/java/org/archive/util/ByteOpTest.java
+++ b/src/test/java/org/archive/util/ByteOpTest.java
@@ -4,6 +4,7 @@
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.IOException;
+import java.util.Locale;
import com.google.common.io.LittleEndianDataOutputStream;
@@ -18,10 +19,10 @@ public void testReadShort() throws IOException {
byte a[] = new byte[]{0,1,2,3};
ByteArrayInputStream bais = new ByteArrayInputStream(a);
int bos = ByteOp.readShort(bais);
- System.out.format("BO.Read short(%d)\n", bos);
+ System.out.format(Locale.ROOT, "BO.Read short(%d)\n", bos);
DataInputStream dis = new DataInputStream(new ByteArrayInputStream(a));
int disv = dis.readUnsignedShort();
- System.out.format("DI.Read short(%d)\n", disv);
+ System.out.format(Locale.ROOT, "DI.Read short(%d)\n", disv);
for(int i = 0; i < 256 * 256; i++) {
ByteArrayOutputStream baos = new ByteArrayOutputStream(2);
LittleEndianDataOutputStream dos = new LittleEndianDataOutputStream(baos);
diff --git a/src/test/java/org/archive/util/CrossProductTest.java b/src/test/java/org/archive/util/CrossProductTest.java
index 211fa65e..a487ab15 100644
--- a/src/test/java/org/archive/util/CrossProductTest.java
+++ b/src/test/java/org/archive/util/CrossProductTest.java
@@ -2,10 +2,12 @@
import java.util.ArrayList;
import java.util.List;
+import java.util.Locale;
import org.junit.jupiter.api.Test;
public class CrossProductTest {
+
private void dumpC(List