From bf3fcb9e787ae7f4f740416a91c2d74b79f31fe7 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Tue, 11 Nov 2025 15:11:12 +0100 Subject: [PATCH 01/16] Add forbiddenAPIs Maven plugin to fail the build when methods relying on default locale or charset are used. Also forbid usage of URL.equals and .hashCode which may resolve host names via DNS lookup. --- pom.xml | 25 +++++++++++++++++++ .../resources/forbidden-apis-signatures.txt | 2 ++ 2 files changed, 27 insertions(+) create mode 100644 src/test/resources/forbidden-apis-signatures.txt diff --git a/pom.xml b/pom.xml index 73ba9ba2..c1c17e9b 100644 --- a/pom.xml +++ b/pom.xml @@ -173,6 +173,31 @@ maven-surefire-plugin 3.2.5 + + de.thetaphi + forbiddenapis + 3.10 + + + false + + jdk-unsafe + jdk-deprecated + jdk-non-portable + + + src/test/resources/forbidden-apis-signatures.txt + + + + + + check + testCheck + + + + diff --git a/src/test/resources/forbidden-apis-signatures.txt b/src/test/resources/forbidden-apis-signatures.txt new file mode 100644 index 00000000..1eda9eec --- /dev/null +++ b/src/test/resources/forbidden-apis-signatures.txt @@ -0,0 +1,2 @@ +java.net.URL#equals(java.lang.Object) @ may trigger a DNS lookup to resolve the host part +java.net.URL#hashCode() @ may trigger a DNS lookup to resolve the host part From c94928e324b633a882783b72c99b4e24a8a23bbb Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Tue, 11 Nov 2025 18:00:01 +0100 Subject: [PATCH 02/16] Add Locale.ROOT as parameter to all occurrences of String.toLowerCase(), String.toUpperCase() and String.format(...) 
--- .../extract/ExtractingResourceFactoryMapper.java | 11 ++++++----- .../extract/ExtractingResourceProducer.java | 3 ++- .../archive/extract/RealCDXExtractorOutput.java | 15 ++++++++------- .../org/archive/extract/ResourceExtractor.java | 7 ++++--- .../org/archive/extract/WATExtractorOutput.java | 3 ++- .../org/archive/format/gzip/GZIPMemberSeries.java | 9 +++++---- .../java/org/archive/format/http/HttpHeader.java | 3 ++- .../java/org/archive/format/http/HttpHeaders.java | 5 +++-- .../archive/format/http/HttpMessageParser.java | 7 ++++--- .../format/http/HttpRequestMessageParser.java | 3 ++- .../archive/format/http/HttpResponseMessage.java | 6 ++++-- .../archive/format/json/CrossProductOfLists.java | 7 ++++--- .../java/org/archive/format/json/JSONView.java | 3 ++- .../format/text/charset/CharsetDetector.java | 7 ++++--- .../org/archive/format/text/html/NodeUtils.java | 10 ++++++---- .../org/archive/hadoop/ArchiveMetadataLoader.java | 3 ++- .../org/archive/hadoop/ResourceRecordReader.java | 5 +++-- src/main/java/org/archive/io/ArchiveReader.java | 5 +++-- .../java/org/archive/io/ArchiveReaderFactory.java | 5 +++-- src/main/java/org/archive/io/ArchiveRecord.java | 3 ++- .../org/archive/io/HeaderedArchiveRecord.java | 5 +++-- src/main/java/org/archive/io/arc/ARCReader.java | 3 ++- .../java/org/archive/io/arc/ARCReaderFactory.java | 9 +++++---- src/main/java/org/archive/io/arc/ARCRecord.java | 3 ++- src/main/java/org/archive/io/arc/ARCUtils.java | 5 +++-- src/main/java/org/archive/io/warc/WARCReader.java | 5 +++-- .../org/archive/io/warc/WARCReaderFactory.java | 7 ++++--- src/main/java/org/archive/net/PublicSuffixes.java | 3 ++- .../resource/generic/GenericResourceProducer.java | 3 ++- .../resource/gzip/GZIPResourceContainer.java | 3 ++- .../org/archive/resource/warc/WARCResource.java | 3 ++- .../org/archive/streamcontext/HTTP11Stream.java | 3 ++- .../org/archive/url/BasicURLCanonicalizer.java | 9 +++++---- src/main/java/org/archive/url/HandyURL.java | 3 ++- 
.../java/org/archive/url/IAURLCanonicalizer.java | 11 ++++++----- src/main/java/org/archive/url/LaxURI.java | 3 ++- src/main/java/org/archive/url/URI.java | 4 ++-- .../java/org/archive/url/URLRegexTransformer.java | 3 ++- .../java/org/archive/url/UsableURIFactory.java | 5 +++-- src/main/java/org/archive/util/ArchiveUtils.java | 6 +++--- src/main/java/org/archive/util/FileNameSpec.java | 3 ++- src/main/java/org/archive/util/FileUtils.java | 9 +++++---- src/main/java/org/archive/util/Recorder.java | 5 +++-- src/main/java/org/archive/util/SurtPrefixSet.java | 5 +++-- .../archive/util/binsearch/SortedTextFile.java | 7 ++++--- 45 files changed, 145 insertions(+), 100 deletions(-) diff --git a/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java b/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java index 0afe16fb..567b1cd8 100644 --- a/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java +++ b/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java @@ -1,6 +1,7 @@ package org.archive.extract; import java.util.Iterator; +import java.util.Locale; import java.util.logging.Logger; import org.archive.format.arc.ARCConstants; @@ -68,14 +69,14 @@ private boolean childFieldStartsWith(MetaData m, String child, String key, String search) { String val = getChildField(m,child,key); return val == null ? false : - val.toLowerCase().startsWith(search.toLowerCase()); + val.toLowerCase(Locale.ROOT).startsWith(search.toLowerCase(Locale.ROOT)); } private boolean childFieldContains(MetaData m, String child, String key, String search) { String val = getChildField(m,child,key); return val == null ? 
false : - val.toLowerCase().contains(search.toLowerCase()); + val.toLowerCase(Locale.ROOT).contains(search.toLowerCase(Locale.ROOT)); } private boolean childFieldEquals(MetaData m, String child, @@ -88,7 +89,7 @@ private boolean childFieldEquals(MetaData m, String child, private String caseInsensitiveKeyScan(MetaData m, String child, String k) { try { if(m.has(child)) { - String kLC = k.toLowerCase(); + String kLC = k.toLowerCase(Locale.ROOT); JSONObject childJSObj = m.getJSONObject(child); @SuppressWarnings("rawtypes") Iterator i = childJSObj.keys(); @@ -96,7 +97,7 @@ private String caseInsensitiveKeyScan(MetaData m, String child, String k) { Object kObj = i.next(); if(kObj instanceof String) { String kString = (String) kObj; - if(kString.toLowerCase().equals(kLC)) { + if(kString.toLowerCase(Locale.ROOT).equals(kLC)) { return childJSObj.getString(kString); } } @@ -128,7 +129,7 @@ private boolean isHTTPARCResource(MetaData envelope) { private boolean isHTMLHttpResource(MetaData m) { String type = caseInsensitiveKeyScan(m,HTTP_HEADERS_LIST, "Content-Type"); - return type == null ? false : type.toLowerCase().contains("html"); + return type == null ? 
false : type.toLowerCase(Locale.ROOT).contains("html"); } private boolean isWARCType(MetaData envelope, WARCRecordType type) { diff --git a/src/main/java/org/archive/extract/ExtractingResourceProducer.java b/src/main/java/org/archive/extract/ExtractingResourceProducer.java index de671bee..07cdb88a 100644 --- a/src/main/java/org/archive/extract/ExtractingResourceProducer.java +++ b/src/main/java/org/archive/extract/ExtractingResourceProducer.java @@ -1,6 +1,7 @@ package org.archive.extract; import java.io.IOException; +import java.util.Locale; import java.util.logging.Level; import java.util.logging.Logger; @@ -33,7 +34,7 @@ public Resource getNext() throws ResourceParseException, IOException { return current; } if(LOG.isLoggable(Level.INFO)) { - LOG.info(String.format("Extracting (%s) with (%s)\n", + LOG.info(String.format(Locale.ROOT, "Extracting (%s) with (%s)\n", current.getClass().toString(), f.getClass().toString())); } diff --git a/src/main/java/org/archive/extract/RealCDXExtractorOutput.java b/src/main/java/org/archive/extract/RealCDXExtractorOutput.java index e6f6e82f..b8f06034 100644 --- a/src/main/java/org/archive/extract/RealCDXExtractorOutput.java +++ b/src/main/java/org/archive/extract/RealCDXExtractorOutput.java @@ -8,6 +8,7 @@ import java.net.URISyntaxException; import java.net.URL; import java.util.List; +import java.util.Locale; import java.util.logging.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -131,7 +132,7 @@ public void output(Resource resource) throws IOException { } else { meta = "-"; } - if(mime.toLowerCase().contains("html")) { + if(mime.toLowerCase(Locale.ROOT).contains("html")) { if(redir.equals("-")) { // maybe an obvious meta-refresh? 
redir = extractHTMLMetaRefresh(origUrl,m); @@ -202,7 +203,7 @@ public void output(Resource resource) throws IOException { } else { meta = "-"; } - if(mime.toLowerCase().contains("html")) { + if(mime.toLowerCase(Locale.ROOT).contains("html")) { if(redir.equals("-")) { // maybe an obvious meta-refresh? redir = extractHTMLMetaRefresh(origUrl,m); @@ -269,7 +270,7 @@ private String extractHTMLRobots(MetaData m) { if(meta != null) { String name = scanHeadersLC(meta, "name", null); if(name != null) { - if(name.toLowerCase().equals("robots")) { + if(name.toLowerCase(Locale.ROOT).equals("robots")) { // alright - some robot instructions: String content = scanHeadersLC(meta, "content", null); if(content != null) { @@ -291,7 +292,7 @@ private String extractHTMLMetaRefresh(String origUrl, MetaData m) { if(meta != null) { String name = scanHeadersLC(meta, "http-equiv", null); if(name != null) { - if(name.toLowerCase().equals("refresh")) { + if(name.toLowerCase(Locale.ROOT).equals("refresh")) { // alright - some robot instructions: String content = scanHeadersLC(meta, "content", null); if(content != null) { @@ -330,7 +331,7 @@ private String scanHeadersLC(JSONObject o, String match, String defaultVal) { if(o.length() == 0) { return defaultVal; } - String lc = match.toLowerCase().trim(); + String lc = match.toLowerCase(Locale.ROOT).trim(); // try { // System.err.println("REC:" + o.toString(1)); // } catch (JSONException e1) { @@ -338,7 +339,7 @@ private String scanHeadersLC(JSONObject o, String match, String defaultVal) { // e1.printStackTrace(); // } for(String key : JSONObject.getNames(o)) { - if(lc.equals(key.toLowerCase().trim())) { + if(lc.equals(key.toLowerCase(Locale.ROOT).trim())) { try { return o.getString(key).trim(); } catch (JSONException e) { @@ -472,7 +473,7 @@ private String parseRobotInstructions(String input) { if(input == null) { return "-"; } - String up = input.replaceAll("-", "").toUpperCase(); + String up = input.replaceAll("-", "").toUpperCase(Locale.ROOT); 
StringBuilder sb = new StringBuilder(3); if(up.contains(NO_FOLLOW_MATCH)) { sb.append("F"); diff --git a/src/main/java/org/archive/extract/ResourceExtractor.java b/src/main/java/org/archive/extract/ResourceExtractor.java index 2812aa5b..a6fa0a00 100644 --- a/src/main/java/org/archive/extract/ResourceExtractor.java +++ b/src/main/java/org/archive/extract/ResourceExtractor.java @@ -8,6 +8,7 @@ import java.io.PrintWriter; import java.net.URISyntaxException; import java.nio.charset.Charset; +import java.util.Locale; import java.util.logging.Level; import java.util.logging.Logger; @@ -138,7 +139,7 @@ public int run(String[] args) out.output(r); } catch(GZIPFormatException e) { - LOG.severe(String.format("%s: %s",exProducer.getContext(),e.getMessage())); + LOG.severe(String.format(Locale.ROOT, "%s: %s",exProducer.getContext(),e.getMessage())); //Log is not coming out for some damn reason....needs to be studied System.err.format("%s: %s",exProducer.getContext(),e.getMessage()); @@ -147,7 +148,7 @@ public int run(String[] args) } e.printStackTrace(); } catch(ResourceParseException e) { - LOG.severe(String.format("%s: %s",exProducer.getContext(),e.getMessage())); + LOG.severe(String.format(Locale.ROOT, "%s: %s",exProducer.getContext(),e.getMessage())); //Log is not coming out for some damn reason....needs to be studied System.err.format("%s: %s",exProducer.getContext(),e.getMessage()); @@ -157,7 +158,7 @@ public int run(String[] args) e.printStackTrace(); } catch(RecoverableRecordFormatException e) { // this should not get here - ResourceFactory et al should wrap as ResourceParseExceptions... 
- LOG.severe(String.format("RECOVERABLE - %s: %s",exProducer.getContext(),e.getMessage())); + LOG.severe(String.format(Locale.ROOT, "RECOVERABLE - %s: %s",exProducer.getContext(),e.getMessage())); //Log is not coming out for some damn reason....needs to be studied System.err.format("%s: %s",exProducer.getContext(),e.getMessage()); diff --git a/src/main/java/org/archive/extract/WATExtractorOutput.java b/src/main/java/org/archive/extract/WATExtractorOutput.java index 4b5f72ed..dbe979e5 100644 --- a/src/main/java/org/archive/extract/WATExtractorOutput.java +++ b/src/main/java/org/archive/extract/WATExtractorOutput.java @@ -10,6 +10,7 @@ import java.text.ParseException; import java.net.UnknownHostException; import java.util.Date; +import java.util.Locale; import org.archive.format.gzip.GZIPMemberWriter; import org.archive.format.gzip.GZIPMemberWriterCommittedOutputStream; @@ -143,7 +144,7 @@ private void writeARC(OutputStream recOut, MetaData md) throws IOException { String capDateString = extractOrIO(md, "Envelope.ARC-Header-Metadata.Date"); String filename = extractOrIO(md, "Container.Filename"); String offset = extractOrIO(md, "Container.Offset"); - String recId = String.format("",filename,offset); + String recId = String.format(Locale.ROOT, "",filename,offset); writeWARCMDRecord(recOut,md,targetURI,capDateString,recId); } diff --git a/src/main/java/org/archive/format/gzip/GZIPMemberSeries.java b/src/main/java/org/archive/format/gzip/GZIPMemberSeries.java index d70bf394..154cf5f1 100644 --- a/src/main/java/org/archive/format/gzip/GZIPMemberSeries.java +++ b/src/main/java/org/archive/format/gzip/GZIPMemberSeries.java @@ -2,6 +2,7 @@ import java.io.IOException; import java.io.InputStream; +import java.util.Locale; import java.util.logging.Level; import java.util.logging.Logger; import java.util.zip.Inflater; @@ -227,7 +228,7 @@ public GZIPSeriesMember getNextMember() throws GZIPFormatException, IOException } if(LOG.isLoggable(Level.INFO)) { - LOG.info(String.format( + 
LOG.info(String.format(Locale.ROOT, "Got EOF after %d bytes before finding magic in %s\n", amtSkipped * -1, streamContext)); } @@ -237,7 +238,7 @@ public GZIPSeriesMember getNextMember() throws GZIPFormatException, IOException if(amtSkipped > 0) { if(strict) { if(state == STATE_START) { - LOG.info(String.format( + LOG.info(String.format(Locale.ROOT, "Strict mode Skipped %d bytes in (%s) before finding magic at offset(%d)\n", amtSkipped, streamContext, offset-3)); } else { @@ -248,7 +249,7 @@ public GZIPSeriesMember getNextMember() throws GZIPFormatException, IOException } if(LOG.isLoggable(Level.INFO)) { - LOG.info(String.format( + LOG.info(String.format(Locale.ROOT, "Skipped %d bytes in (%s) before finding magic at offset(%d)\n", amtSkipped, streamContext, offset-3)); } @@ -268,7 +269,7 @@ public GZIPSeriesMember getNextMember() throws GZIPFormatException, IOException } offset = currentMemberStartOffset + 3; stream.setOffset(currentMemberStartOffset + 3); - LOG.warning(String.format( + LOG.warning(String.format(Locale.ROOT, "GZIPFormatException with record around offset(%d) in (%s)\n", offset, streamContext)); } diff --git a/src/main/java/org/archive/format/http/HttpHeader.java b/src/main/java/org/archive/format/http/HttpHeader.java index 57b70e1f..9ebe860f 100755 --- a/src/main/java/org/archive/format/http/HttpHeader.java +++ b/src/main/java/org/archive/format/http/HttpHeader.java @@ -2,6 +2,7 @@ import java.io.IOException; import java.io.OutputStream; +import java.util.Locale; public class HttpHeader implements HttpConstants { private String name = null; @@ -27,7 +28,7 @@ public void write(OutputStream out) throws IOException { public String toString() { StringBuilder sb = new StringBuilder(name.length() + value.length()+20); - sb.append(String.format("HttpHeader(%s)(%s)",name,value)); + sb.append(String.format(Locale.ROOT, "HttpHeader(%s)(%s)",name,value)); return sb.toString(); } } diff --git a/src/main/java/org/archive/format/http/HttpHeaders.java 
b/src/main/java/org/archive/format/http/HttpHeaders.java index ed8061d7..a65dd8fb 100755 --- a/src/main/java/org/archive/format/http/HttpHeaders.java +++ b/src/main/java/org/archive/format/http/HttpHeaders.java @@ -4,6 +4,7 @@ import java.io.OutputStream; import java.util.ArrayList; import java.util.Date; +import java.util.Locale; import java.util.logging.Logger; import org.archive.util.ByteOp; @@ -54,9 +55,9 @@ public String getValue(String name) { } public String getValueCaseInsensitive(String name) { - String lc = name.toLowerCase(); + String lc = name.toLowerCase(Locale.ROOT); for(HttpHeader h : this) { - if(h.getName().toLowerCase().equals(lc)) { + if(h.getName().toLowerCase(Locale.ROOT).equals(lc)) { return h.getValue(); } } diff --git a/src/main/java/org/archive/format/http/HttpMessageParser.java b/src/main/java/org/archive/format/http/HttpMessageParser.java index c4fcdf92..24e59e03 100644 --- a/src/main/java/org/archive/format/http/HttpMessageParser.java +++ b/src/main/java/org/archive/format/http/HttpMessageParser.java @@ -1,5 +1,6 @@ package org.archive.format.http; +import java.util.Locale; public class HttpMessageParser implements HttpConstants { @@ -22,11 +23,11 @@ protected int parseVersionLax(byte buf[], int start, int len) throws HttpParseException { String v = new String(buf,start,len,UTF8); - if(v.toLowerCase().compareTo(VERSION_0_STATUS.toLowerCase()) == 0) { + if(v.toLowerCase(Locale.ROOT).compareTo(VERSION_0_STATUS.toLowerCase(Locale.ROOT)) == 0) { return VERSION_0; - } else if(v.toLowerCase().compareTo(VERSION_1_STATUS.toLowerCase()) == 0) { + } else if(v.toLowerCase(Locale.ROOT).compareTo(VERSION_1_STATUS.toLowerCase(Locale.ROOT)) == 0) { return VERSION_1; - } else if(v.toLowerCase().compareTo(VERSION_9_STATUS.toLowerCase()) == 0) { + } else if(v.toLowerCase(Locale.ROOT).compareTo(VERSION_9_STATUS.toLowerCase(Locale.ROOT)) == 0) { return VERSION_9; } return VERSION_0; diff --git 
a/src/main/java/org/archive/format/http/HttpRequestMessageParser.java b/src/main/java/org/archive/format/http/HttpRequestMessageParser.java index f7bc43c7..759bbe5d 100644 --- a/src/main/java/org/archive/format/http/HttpRequestMessageParser.java +++ b/src/main/java/org/archive/format/http/HttpRequestMessageParser.java @@ -2,6 +2,7 @@ import java.io.IOException; import java.io.InputStream; +import java.util.Locale; public class HttpRequestMessageParser extends HttpMessageParser { public int maxBytes = 1024 * 1024; @@ -223,7 +224,7 @@ protected int parseMethodStrict(byte buf[], int start, int len) protected int parseMethodLax(byte buf[], int start, int len) throws HttpParseException { - String v = new String(buf,start,len,UTF8).toUpperCase(); + String v = new String(buf,start,len,UTF8).toUpperCase(Locale.ROOT); if(v.compareTo(METHOD_GET_STRING) == 0) { return METHOD_GET; } else if(v.compareTo(METHOD_HEAD_STRING) == 0) { diff --git a/src/main/java/org/archive/format/http/HttpResponseMessage.java b/src/main/java/org/archive/format/http/HttpResponseMessage.java index 0cb7b7e5..6d3f5c35 100755 --- a/src/main/java/org/archive/format/http/HttpResponseMessage.java +++ b/src/main/java/org/archive/format/http/HttpResponseMessage.java @@ -1,5 +1,7 @@ package org.archive.format.http; +import java.util.Locale; + public class HttpResponseMessage extends HttpMessage implements HttpResponseMessageObserver { private int status = 0; private String reason = null; @@ -20,10 +22,10 @@ public String getReason() { return reason; } public String toString() { - return String.format("%s %d %s%s", getVersionString(), status, reason, CRLF); + return String.format(Locale.ROOT, "%s %d %s%s", getVersionString(), status, reason, CRLF); } public String toDebugString() { - return String.format("Message(%d):(%s) (%d) (%s)\n", + return String.format(Locale.ROOT, "Message(%d):(%s) (%d) (%s)\n", reason.length(),getVersionString(),status,reason,CRLF); } diff --git 
a/src/main/java/org/archive/format/json/CrossProductOfLists.java b/src/main/java/org/archive/format/json/CrossProductOfLists.java index f9e2abd2..69cdae33 100644 --- a/src/main/java/org/archive/format/json/CrossProductOfLists.java +++ b/src/main/java/org/archive/format/json/CrossProductOfLists.java @@ -4,6 +4,7 @@ import java.util.ArrayList; import java.util.Deque; import java.util.List; +import java.util.Locale; import java.util.Stack; import java.util.logging.Level; import java.util.logging.Logger; @@ -18,12 +19,12 @@ public List> crossProduct(List>> listOfLists) { if(LOG.isLoggable(Level.INFO)) { int count = listOfLists.size(); - LOG.info(String.format("Total of (%d) lists to cross product",count)); + LOG.info(String.format(Locale.ROOT, "Total of (%d) lists to cross product",count)); for(int i = 0; i < count; i++) { - LOG.info(String.format("Field (%d) is (%d) deep",i,listOfLists.get(i).size())); + LOG.info(String.format(Locale.ROOT, "Field (%d) is (%d) deep",i,listOfLists.get(i).size())); for(List inner : listOfLists.get(i)) { LOG.info( - String.format("----(%d):(%s)" + String.format(Locale.ROOT, "----(%d):(%s)" ,i,StringUtils.join(inner.toArray(),",") ) ); } } diff --git a/src/main/java/org/archive/format/json/JSONView.java b/src/main/java/org/archive/format/json/JSONView.java index 7a984ebe..444ea7e6 100644 --- a/src/main/java/org/archive/format/json/JSONView.java +++ b/src/main/java/org/archive/format/json/JSONView.java @@ -2,6 +2,7 @@ import java.util.ArrayList; import java.util.List; +import java.util.Locale; import java.util.logging.Level; import java.util.logging.Logger; @@ -28,7 +29,7 @@ public class JSONView { public JSONView(String... 
pathSpecs) { this.pathSpecs = new ArrayList(pathSpecs.length); if(LOG.isLoggable(Level.INFO)) { - LOG.info(String.format("Creating JSONView with(%s)", + LOG.info(String.format(Locale.ROOT, "Creating JSONView with(%s)", StringUtils.join(pathSpecs,","))); } for(String pathSpec : pathSpecs) { diff --git a/src/main/java/org/archive/format/text/charset/CharsetDetector.java b/src/main/java/org/archive/format/text/charset/CharsetDetector.java index 214fde07..49286764 100644 --- a/src/main/java/org/archive/format/text/charset/CharsetDetector.java +++ b/src/main/java/org/archive/format/text/charset/CharsetDetector.java @@ -22,6 +22,7 @@ import java.io.IOException; import java.nio.charset.Charset; import java.nio.charset.IllegalCharsetNameException; +import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -106,7 +107,7 @@ protected boolean isCharsetSupported(String charsetName) { } } protected String mapCharset(String orig) { - String lc = orig.toLowerCase(); + String lc = orig.toLowerCase(Locale.ROOT); if(lc.contains("iso8859-1") || lc.contains("iso-8859-1")) { return "cp1252"; } @@ -114,7 +115,7 @@ protected String mapCharset(String orig) { } protected String contentTypeToCharset(final String contentType) { int offset = - contentType.toUpperCase().indexOf(CHARSET_TOKEN.toUpperCase()); + contentType.toUpperCase(Locale.ROOT).indexOf(CHARSET_TOKEN.toUpperCase(Locale.ROOT)); if (offset != -1) { String cs = contentType.substring(offset + CHARSET_TOKEN.length()); @@ -148,7 +149,7 @@ protected String getCharsetFromHeaders(HttpHeaders headers) return null; } for(HttpHeader header : headers) { - if(header.getName().toUpperCase().trim().equals( + if(header.getName().toUpperCase(Locale.ROOT).trim().equals( HTTP_CONTENT_TYPE_HEADER)) { return contentTypeToCharset(header.getValue()); } diff --git a/src/main/java/org/archive/format/text/html/NodeUtils.java b/src/main/java/org/archive/format/text/html/NodeUtils.java index 625d9099..f231b91a 100644 --- 
a/src/main/java/org/archive/format/text/html/NodeUtils.java +++ b/src/main/java/org/archive/format/text/html/NodeUtils.java @@ -19,6 +19,8 @@ */ package org.archive.format.text.html; +import java.util.Locale; + import org.htmlparser.Node; import org.htmlparser.nodes.RemarkNode; import org.htmlparser.nodes.TagNode; @@ -41,7 +43,7 @@ public static boolean isTagNodeNamed(Node node, String name) { if(isTagNode(node)) { TagNode tagNode = (TagNode) node; String nodeName = tagNode.getTagName(); - return nodeName.equals(name.toUpperCase()); + return nodeName.equals(name.toUpperCase(Locale.ROOT)); } return false; } @@ -50,7 +52,7 @@ public static boolean isOpenTagNodeNamed(Node node, String name) { TagNode tagNode = (TagNode) node; if(!tagNode.isEndTag()) { String nodeName = tagNode.getTagName(); - return nodeName.equals(name.toUpperCase()); + return nodeName.equals(name.toUpperCase(Locale.ROOT)); } } return false; @@ -60,7 +62,7 @@ public static boolean isNonEmptyOpenTagNodeNamed(Node node, String name) { TagNode tagNode = (TagNode) node; if(!tagNode.isEndTag() && !tagNode.isEmptyXmlTag()) { String nodeName = tagNode.getTagName(); - return nodeName.equals(name.toUpperCase()); + return nodeName.equals(name.toUpperCase(Locale.ROOT)); } } return false; @@ -70,7 +72,7 @@ public static boolean isCloseTagNodeNamed(Node node, String name) { TagNode tagNode = (TagNode) node; if(tagNode.isEndTag()) { String nodeName = tagNode.getTagName(); - return nodeName.equals(name.toUpperCase()); + return nodeName.equals(name.toUpperCase(Locale.ROOT)); } } return false; diff --git a/src/main/java/org/archive/hadoop/ArchiveMetadataLoader.java b/src/main/java/org/archive/hadoop/ArchiveMetadataLoader.java index 37c8af99..a3cbb26c 100644 --- a/src/main/java/org/archive/hadoop/ArchiveMetadataLoader.java +++ b/src/main/java/org/archive/hadoop/ArchiveMetadataLoader.java @@ -2,6 +2,7 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.Locale; import java.util.logging.Logger; 
import org.apache.hadoop.mapreduce.InputFormat; @@ -54,7 +55,7 @@ public Tuple getNext() throws IOException { try { key = reader.getCurrentKey(); - LOG.info(String.format("Loaded key-offset %d\n", key.offset)); + LOG.info(String.format(Locale.ROOT, "Loaded key-offset %d\n", key.offset)); value = reader.getCurrentValue(); } catch (InterruptedException e) { // is this needed and the right way? diff --git a/src/main/java/org/archive/hadoop/ResourceRecordReader.java b/src/main/java/org/archive/hadoop/ResourceRecordReader.java index 06d3ce2e..88b93dd2 100644 --- a/src/main/java/org/archive/hadoop/ResourceRecordReader.java +++ b/src/main/java/org/archive/hadoop/ResourceRecordReader.java @@ -1,6 +1,7 @@ package org.archive.hadoop; import java.io.IOException; +import java.util.Locale; import java.util.logging.Logger; import org.apache.hadoop.fs.FSDataInputStream; @@ -111,7 +112,7 @@ public boolean nextKeyValue() throws IOException, InterruptedException { if(r != null) { StreamCopy.readToEOF(r.getInputStream()); - LOG.info(String.format("Extracted offset %d\n", + LOG.info(String.format(Locale.ROOT, "Extracted offset %d\n", series.getCurrentMemberStartOffset())); cachedK = new ResourceContext(name, series.getCurrentMemberStartOffset()); @@ -121,7 +122,7 @@ public boolean nextKeyValue() throws IOException, InterruptedException { } catch (ResourceParseException e) { e.printStackTrace(); throw new IOException( - String.format("ResourceParseException at(%s)(%d)", + String.format(Locale.ROOT, "ResourceParseException at(%s)(%d)", name,series.getCurrentMemberStartOffset()), e); } diff --git a/src/main/java/org/archive/io/ArchiveReader.java b/src/main/java/org/archive/io/ArchiveReader.java index 449cdc24..53b8167b 100644 --- a/src/main/java/org/archive/io/ArchiveReader.java +++ b/src/main/java/org/archive/io/ArchiveReader.java @@ -32,6 +32,7 @@ import java.util.ArrayList; import java.util.Iterator; import java.util.List; +import java.util.Locale; import java.util.logging.Level; 
import java.util.logging.Logger; @@ -615,7 +616,7 @@ protected static boolean getTrueOrFalse(final String value) { if (value == null || value.length() <= 0) { return false; } - return Boolean.TRUE.toString().equals(value.toLowerCase()); + return Boolean.TRUE.toString().equals(value.toLowerCase(Locale.ROOT)); } /** @@ -757,4 +758,4 @@ protected static Options getOptions() { "'or 'nohead'. Default: 'cdx'.")); return options; } -} \ No newline at end of file +} diff --git a/src/main/java/org/archive/io/ArchiveReaderFactory.java b/src/main/java/org/archive/io/ArchiveReaderFactory.java index bc316893..fe72236b 100644 --- a/src/main/java/org/archive/io/ArchiveReaderFactory.java +++ b/src/main/java/org/archive/io/ArchiveReaderFactory.java @@ -25,6 +25,7 @@ import java.net.MalformedURLException; import java.net.URL; import java.net.URLConnection; +import java.util.Locale; import org.archive.io.arc.ARCReaderFactory; import org.archive.io.warc.WARCReaderFactory; @@ -296,7 +297,7 @@ protected void addUserAgent(final HttpURLConnection connection) { * @throws IOException */ protected boolean isCompressed(final File f) throws IOException { - return f.getName().toLowerCase(). + return f.getName().toLowerCase(Locale.ROOT). 
endsWith(DOT_COMPRESSED_FILE_EXTENSION); } -} \ No newline at end of file +} diff --git a/src/main/java/org/archive/io/ArchiveRecord.java b/src/main/java/org/archive/io/ArchiveRecord.java index 4bd1fa02..01e8d5ec 100644 --- a/src/main/java/org/archive/io/ArchiveRecord.java +++ b/src/main/java/org/archive/io/ArchiveRecord.java @@ -23,6 +23,7 @@ import java.io.OutputStream; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; +import java.util.Locale; import java.util.logging.Level; import org.archive.format.ArchiveFileConstants; @@ -393,7 +394,7 @@ public boolean hasContentHeaders() { return false; } - if (!url.toLowerCase().startsWith("http")) { + if (!url.toLowerCase(Locale.ROOT).startsWith("http")) { return false; } diff --git a/src/main/java/org/archive/io/HeaderedArchiveRecord.java b/src/main/java/org/archive/io/HeaderedArchiveRecord.java index 809a9e54..70c4fb04 100644 --- a/src/main/java/org/archive/io/HeaderedArchiveRecord.java +++ b/src/main/java/org/archive/io/HeaderedArchiveRecord.java @@ -25,6 +25,7 @@ import java.io.InputStream; import java.io.OutputStream; import java.io.PrintStream; +import java.util.Locale; import org.archive.format.http.HttpHeader; import org.archive.format.arc.ARCConstants; @@ -156,8 +157,8 @@ private InputStream readContentHeaders() throws IOException { boolean isHttpResponse = statusLine.startsWith("HTTP"); boolean isHttpRequest = false; if (!isHttpResponse) { - isHttpRequest = statusLine.toUpperCase().startsWith("GET") || - !statusLine.toUpperCase().startsWith("POST"); + isHttpRequest = statusLine.toUpperCase(Locale.ROOT).startsWith("GET") || + !statusLine.toUpperCase(Locale.ROOT).startsWith("POST"); } if (!isHttpResponse && !isHttpRequest) { throw new UnexpectedStartLineIOException("Failed parse of " + diff --git a/src/main/java/org/archive/io/arc/ARCReader.java b/src/main/java/org/archive/io/arc/ARCReader.java index c9a88415..ecc742a5 100644 --- a/src/main/java/org/archive/io/arc/ARCReader.java 
+++ b/src/main/java/org/archive/io/arc/ARCReader.java @@ -27,6 +27,7 @@ import java.util.ArrayList; import java.util.Iterator; import java.util.List; +import java.util.Locale; import java.util.concurrent.atomic.AtomicInteger; import java.util.logging.Logger; @@ -493,7 +494,7 @@ public static void main(String [] args) break; case 'f': - format = cmdlineOptions[i].getValue().toLowerCase(); + format = cmdlineOptions[i].getValue().toLowerCase(Locale.ROOT); boolean match = false; // List of supported formats. final String [] supportedFormats = diff --git a/src/main/java/org/archive/io/arc/ARCReaderFactory.java b/src/main/java/org/archive/io/arc/ARCReaderFactory.java index d2f10842..bbcc8b6f 100644 --- a/src/main/java/org/archive/io/arc/ARCReaderFactory.java +++ b/src/main/java/org/archive/io/arc/ARCReaderFactory.java @@ -27,6 +27,7 @@ import java.net.MalformedURLException; import java.net.URL; import java.util.Iterator; +import java.util.Locale; import java.util.logging.Level; import org.archive.io.ArchiveReader; @@ -230,7 +231,7 @@ public static boolean testCompressedARCFile(File arcFile, throws IOException { boolean compressedARCFile = false; FileUtils.assertReadable(arcFile); - if(!skipSuffixCheck && !arcFile.getName().toLowerCase() + if(!skipSuffixCheck && !arcFile.getName().toLowerCase(Locale.ROOT) .endsWith(COMPRESSED_ARC_FILE_EXTENSION)) { return compressedARCFile; } @@ -247,9 +248,9 @@ public static boolean testCompressedARCFile(File arcFile, public static boolean isARCSuffix(final String arcName) { return (arcName == null)? false: - (arcName.toLowerCase().endsWith(DOT_COMPRESSED_ARC_FILE_EXTENSION))? + (arcName.toLowerCase(Locale.ROOT).endsWith(DOT_COMPRESSED_ARC_FILE_EXTENSION))? true: - (arcName.toLowerCase().endsWith(DOT_ARC_FILE_EXTENSION))? + (arcName.toLowerCase(Locale.ROOT).endsWith(DOT_ARC_FILE_EXTENSION))? 
true: false; } @@ -452,4 +453,4 @@ protected void gotoEOR(ArchiveRecord rec) throws IOException { logStdErr(Level.WARNING, message); } } -} \ No newline at end of file +} diff --git a/src/main/java/org/archive/io/arc/ARCRecord.java b/src/main/java/org/archive/io/arc/ARCRecord.java index 0815c18a..14e80728 100644 --- a/src/main/java/org/archive/io/arc/ARCRecord.java +++ b/src/main/java/org/archive/io/arc/ARCRecord.java @@ -27,6 +27,7 @@ import java.util.Arrays; import java.util.HashMap; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.logging.Level; import java.util.logging.Logger; @@ -376,7 +377,7 @@ private ARCRecordMetaData computeMetaData(List keys, if (keys.size() != values.size()) { // Early ARCs had a space in mimetype. if (values.size() == (keys.size() + 1) && - values.get(4).toLowerCase().startsWith("charset=")) { + values.get(4).toLowerCase(Locale.ROOT).startsWith("charset=")) { List nuvalues = new ArrayList(keys.size()); nuvalues.add(0, values.get(0)); diff --git a/src/main/java/org/archive/io/arc/ARCUtils.java b/src/main/java/org/archive/io/arc/ARCUtils.java index 5bcb4cc3..05c15abb 100644 --- a/src/main/java/org/archive/io/arc/ARCUtils.java +++ b/src/main/java/org/archive/io/arc/ARCUtils.java @@ -27,6 +27,7 @@ import java.io.InputStream; import java.net.URI; import java.net.URISyntaxException; +import java.util.Locale; import org.archive.url.UsableURI; import org.archive.util.zip.GzipHeader; @@ -94,7 +95,7 @@ public static boolean testCompressedARCFile(File arcFile, throws IOException { boolean compressedARCFile = false; isReadable(arcFile); - if(!skipSuffixCheck && !arcFile.getName().toLowerCase() + if(!skipSuffixCheck && !arcFile.getName().toLowerCase(Locale.ROOT) .endsWith(COMPRESSED_ARC_FILE_EXTENSION)) { return compressedARCFile; } @@ -197,7 +198,7 @@ public static boolean testUncompressedARCFile(File arcFile) throws IOException { boolean uncompressedARCFile = false; isReadable(arcFile); - 
if(arcFile.getName().toLowerCase().endsWith(ARC_FILE_EXTENSION)) { + if(arcFile.getName().toLowerCase(Locale.ROOT).endsWith(ARC_FILE_EXTENSION)) { FileInputStream fis = new FileInputStream(arcFile); try { byte [] b = new byte[ARC_MAGIC_NUMBER.length()]; diff --git a/src/main/java/org/archive/io/warc/WARCReader.java b/src/main/java/org/archive/io/warc/WARCReader.java index d33874a3..02756cb1 100644 --- a/src/main/java/org/archive/io/warc/WARCReader.java +++ b/src/main/java/org/archive/io/warc/WARCReader.java @@ -24,6 +24,7 @@ import java.io.InputStream; import java.util.Iterator; import java.util.List; +import java.util.Locale; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.HelpFormatter; @@ -233,7 +234,7 @@ public static void main(String [] args) break; case 'f': - format = cmdlineOptions[i].getValue().toLowerCase(); + format = cmdlineOptions[i].getValue().toLowerCase(Locale.ROOT); boolean match = false; // List of supported formats. final String [] supportedFormats = @@ -286,4 +287,4 @@ public static void main(String [] args) } } } -} \ No newline at end of file +} diff --git a/src/main/java/org/archive/io/warc/WARCReaderFactory.java b/src/main/java/org/archive/io/warc/WARCReaderFactory.java index 881da869..70b80340 100644 --- a/src/main/java/org/archive/io/warc/WARCReaderFactory.java +++ b/src/main/java/org/archive/io/warc/WARCReaderFactory.java @@ -26,6 +26,7 @@ import java.net.MalformedURLException; import java.net.URL; import java.util.Iterator; +import java.util.Locale; import org.archive.io.ArchiveReader; import org.archive.io.ArchiveReaderFactory; @@ -307,9 +308,9 @@ protected void gotoEOR(ArchiveRecord rec) throws IOException { public static boolean isWARCSuffix(final String f) { return (f == null)? false: - (f.toLowerCase().endsWith(DOT_COMPRESSED_WARC_FILE_EXTENSION))? + (f.toLowerCase(Locale.ROOT).endsWith(DOT_COMPRESSED_WARC_FILE_EXTENSION))? true: - (f.toLowerCase().endsWith(DOT_WARC_FILE_EXTENSION))? 
+ (f.toLowerCase(Locale.ROOT).endsWith(DOT_WARC_FILE_EXTENSION))? true: false; } -} \ No newline at end of file +} diff --git a/src/main/java/org/archive/net/PublicSuffixes.java b/src/main/java/org/archive/net/PublicSuffixes.java index e436b8dc..a2a2bfb2 100644 --- a/src/main/java/org/archive/net/PublicSuffixes.java +++ b/src/main/java/org/archive/net/PublicSuffixes.java @@ -31,6 +31,7 @@ import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.List; +import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -231,7 +232,7 @@ protected static Node readPublishedFileToSurtTrie(BufferedReader reader) throws // discard utf8 notation after entry line = line.split("\\s+")[0]; // TODO: maybe we don't need to create lower-cased String - line = line.toLowerCase(); + line = line.toLowerCase(Locale.ROOT); // SURT-order domain segments String[] segs = line.split("\\."); StringBuilder sb = new StringBuilder(); diff --git a/src/main/java/org/archive/resource/generic/GenericResourceProducer.java b/src/main/java/org/archive/resource/generic/GenericResourceProducer.java index 812a3f0d..b111dc1e 100644 --- a/src/main/java/org/archive/resource/generic/GenericResourceProducer.java +++ b/src/main/java/org/archive/resource/generic/GenericResourceProducer.java @@ -1,6 +1,7 @@ package org.archive.resource.generic; import java.io.IOException; +import java.util.Locale; import org.archive.resource.MetaData; import org.archive.resource.Resource; @@ -45,6 +46,6 @@ public void close() throws IOException { stream.close(); } public String getContext() { - return String.format("Context(%s)(%d)", name, stream.getOffset()); + return String.format(Locale.ROOT, "Context(%s)(%d)", name, stream.getOffset()); } } diff --git a/src/main/java/org/archive/resource/gzip/GZIPResourceContainer.java b/src/main/java/org/archive/resource/gzip/GZIPResourceContainer.java index 39611ab8..5267a0f9 100644 --- 
a/src/main/java/org/archive/resource/gzip/GZIPResourceContainer.java +++ b/src/main/java/org/archive/resource/gzip/GZIPResourceContainer.java @@ -1,6 +1,7 @@ package org.archive.resource.gzip; import java.io.IOException; +import java.util.Locale; import org.archive.format.gzip.GZIPMemberSeries; import org.archive.format.gzip.GZIPSeriesMember; @@ -54,6 +55,6 @@ public void close() throws IOException { series.close(); } public String getContext() { - return String.format("Context(%s)(%d)", series.getStreamContext(), series.getCurrentMemberStartOffset()); + return String.format(Locale.ROOT, "Context(%s)(%d)", series.getStreamContext(), series.getCurrentMemberStartOffset()); } } diff --git a/src/main/java/org/archive/resource/warc/WARCResource.java b/src/main/java/org/archive/resource/warc/WARCResource.java index a9c3fcc3..a5e5ac35 100644 --- a/src/main/java/org/archive/resource/warc/WARCResource.java +++ b/src/main/java/org/archive/resource/warc/WARCResource.java @@ -5,6 +5,7 @@ import java.security.DigestInputStream; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; +import java.util.Locale; import org.archive.format.http.HttpHeader; import org.archive.format.http.HttpResponse; @@ -43,7 +44,7 @@ public WARCResource(MetaData metaData, ResourceContainer container, String name = h.getName(); String value = h.getValue(); fields.putString(name,value); - if(name.toLowerCase().equals("content-length")) { + if(name.toLowerCase(Locale.ROOT).equals("content-length")) { // TODO: catch formatexception length = Long.parseLong(value); } diff --git a/src/main/java/org/archive/streamcontext/HTTP11Stream.java b/src/main/java/org/archive/streamcontext/HTTP11Stream.java index 06f51409..995dc53e 100755 --- a/src/main/java/org/archive/streamcontext/HTTP11Stream.java +++ b/src/main/java/org/archive/streamcontext/HTTP11Stream.java @@ -5,6 +5,7 @@ import java.io.InputStream; import java.net.URL; import java.net.URLConnection; +import java.util.Locale; public 
class HTTP11Stream extends AbstractBufferingStream { private URL url; @@ -42,7 +43,7 @@ public int doRead(byte[] b, int off, int len) throws IOException { public void doSeek(long offset) throws IOException { doClose(); conn = url.openConnection(); - conn.setRequestProperty("Range", String.format("bytes=%d-", offset)); + conn.setRequestProperty("Range", String.format(Locale.ROOT, "bytes=%d-", offset)); conn.connect(); is = conn.getInputStream(); } diff --git a/src/main/java/org/archive/url/BasicURLCanonicalizer.java b/src/main/java/org/archive/url/BasicURLCanonicalizer.java index 37b448c1..632d1ea7 100644 --- a/src/main/java/org/archive/url/BasicURLCanonicalizer.java +++ b/src/main/java/org/archive/url/BasicURLCanonicalizer.java @@ -7,6 +7,7 @@ import java.nio.charset.CharsetDecoder; import java.nio.charset.CoderResult; import java.util.ArrayList; +import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -64,7 +65,7 @@ public void canonicalize(HandyURL url) { if (ip != null) { host = ip; } else if (host != null) { - host = escapeOnce(host.toLowerCase()); + host = escapeOnce(host.toLowerCase(Locale.ROOT)); } url.setHost(host); // now the path: @@ -159,7 +160,7 @@ public String attemptIPFormats(String host) { // throws URIException { } ip[i] = octet; } - return String.format("%d.%d.%d.%d", ip[0], ip[1], ip[2], ip[3]); + return String.format(Locale.ROOT, "%d.%d.%d.%d", ip[0], ip[1], ip[2], ip[3]); } else { Matcher m2 = DECIMAL_IP.matcher(host); if (m2.matches()) { @@ -190,7 +191,7 @@ public String attemptIPFormats(String host) { // throws URIException { } ip[i] = octet; } - return String.format("%d.%d.%d.%d", ip[0], ip[1], ip[2], + return String.format(Locale.ROOT, "%d.%d.%d.%d", ip[0], ip[1], ip[2], ip[3]); } @@ -261,7 +262,7 @@ public String escapeOnce(String input) { } sb.append("%"); - String hex = Integer.toHexString(b).toUpperCase(); + String hex = Integer.toHexString(b).toUpperCase(Locale.ROOT); if (hex.length() == 1) { 
sb.append('0'); } diff --git a/src/main/java/org/archive/url/HandyURL.java b/src/main/java/org/archive/url/HandyURL.java index 91539b3f..0c2c81f7 100644 --- a/src/main/java/org/archive/url/HandyURL.java +++ b/src/main/java/org/archive/url/HandyURL.java @@ -2,6 +2,7 @@ import java.net.MalformedURLException; import java.net.URL; +import java.util.Locale; public class HandyURL { public final static int DEFAULT_PORT = -1; @@ -277,7 +278,7 @@ public void setOpaque(String opaque) { } public String toDebugString() { - return String.format("Scheme(%s) UserName(%s) UserPass(%s) Host(%s) port(%d) Path(%s) Query(%s) Frag(%s)", + return String.format(Locale.ROOT, "Scheme(%s) UserName(%s) UserPass(%s) Host(%s) port(%d) Path(%s) Query(%s) Frag(%s)", scheme, authUser, authPass, host, port, path, query, hash); } diff --git a/src/main/java/org/archive/url/IAURLCanonicalizer.java b/src/main/java/org/archive/url/IAURLCanonicalizer.java index 0cf7c8a4..e964cd00 100644 --- a/src/main/java/org/archive/url/IAURLCanonicalizer.java +++ b/src/main/java/org/archive/url/IAURLCanonicalizer.java @@ -2,6 +2,7 @@ import java.util.Arrays; import java.util.Comparator; +import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -20,11 +21,11 @@ public void canonicalize(HandyURL url) { } if (rules.isSet(SCHEME_SETTINGS, SCHEME_LOWERCASE)) { if (url.getScheme() != null) { - url.setScheme(url.getScheme().toLowerCase()); + url.setScheme(url.getScheme().toLowerCase(Locale.ROOT)); } } if(rules.isSet(HOST_SETTINGS, HOST_LOWERCASE)) { - url.setHost(url.getHost().toLowerCase()); + url.setHost(url.getHost().toLowerCase(Locale.ROOT)); } if(rules.isSet(HOST_SETTINGS, HOST_MASSAGE)) { url.setHost(massageHost(url.getHost())); @@ -46,7 +47,7 @@ public void canonicalize(HandyURL url) { url.setPath(null); } else { if(rules.isSet(PATH_SETTINGS, PATH_LOWERCASE)) { - path = path.toLowerCase(); + path = path.toLowerCase(Locale.ROOT); } if(rules.isSet(PATH_SETTINGS, 
PATH_STRIP_SESSION_ID)) { path = URLRegexTransformer.stripPathSessionID(path); @@ -71,7 +72,7 @@ public void canonicalize(HandyURL url) { } // lower-case: if(rules.isSet(QUERY_SETTINGS, QUERY_LOWERCASE)) { - query = query.toLowerCase(); + query = query.toLowerCase(Locale.ROOT); } // re-order? if(rules.isSet(QUERY_SETTINGS, QUERY_ALPHA_REORDER)) { @@ -155,7 +156,7 @@ public static String massageHost(String host) { return host; } public static int getDefaultPort(String scheme) { - String lcScheme = scheme.toLowerCase(); + String lcScheme = scheme.toLowerCase(Locale.ROOT); if(lcScheme.equals("http")) { return 80; } else if(lcScheme.equals("https")) { diff --git a/src/main/java/org/archive/url/LaxURI.java b/src/main/java/org/archive/url/LaxURI.java index 57071460..4210c303 100644 --- a/src/main/java/org/archive/url/LaxURI.java +++ b/src/main/java/org/archive/url/LaxURI.java @@ -22,6 +22,7 @@ import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.BitSet; +import java.util.Locale; /** * URI subclass which allows partial/inconsistent encoding, matching @@ -321,7 +322,7 @@ protected void parseUriReference(String original, boolean escaped) *

*/ if (at > 0 && at < length && tmp.charAt(at) == ':') { - char[] target = tmp.substring(0, at).toLowerCase().toCharArray(); + char[] target = tmp.substring(0, at).toLowerCase(Locale.ROOT).toCharArray(); if (validate(target, scheme)) { _scheme = target; from = ++at; diff --git a/src/main/java/org/archive/url/URI.java b/src/main/java/org/archive/url/URI.java index 374e0574..38219556 100644 --- a/src/main/java/org/archive/url/URI.java +++ b/src/main/java/org/archive/url/URI.java @@ -261,7 +261,7 @@ public URI(String scheme, String schemeSpecificPart, String fragment) if (scheme == null) { throw new URIException(URIException.PARSING, "scheme required"); } - char[] s = scheme.toLowerCase().toCharArray(); + char[] s = scheme.toLowerCase(Locale.ROOT).toCharArray(); if (validate(s, URI.scheme)) { _scheme = s; // is_absoluteURI } else { @@ -1954,7 +1954,7 @@ protected void parseUriReference(String original, boolean escaped) *

*/ if (at > 0 && at < length && tmp.charAt(at) == ':') { - char[] target = tmp.substring(0, at).toLowerCase().toCharArray(); + char[] target = tmp.substring(0, at).toLowerCase(Locale.ROOT).toCharArray(); if (validate(target, scheme)) { _scheme = target; } else { diff --git a/src/main/java/org/archive/url/URLRegexTransformer.java b/src/main/java/org/archive/url/URLRegexTransformer.java index 5f31c81c..182eb218 100644 --- a/src/main/java/org/archive/url/URLRegexTransformer.java +++ b/src/main/java/org/archive/url/URLRegexTransformer.java @@ -1,5 +1,6 @@ package org.archive.url; +import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -27,7 +28,7 @@ public class URLRegexTransformer { public static String stripOpts(String orig, OptimizedPattern op[]) { - String origLC = orig.toLowerCase(); + String origLC = orig.toLowerCase(Locale.ROOT); StringBuilder sb = null; int i = 0; int max = op.length; diff --git a/src/main/java/org/archive/url/UsableURIFactory.java b/src/main/java/org/archive/url/UsableURIFactory.java index 08f18999..3038ada5 100644 --- a/src/main/java/org/archive/url/UsableURIFactory.java +++ b/src/main/java/org/archive/url/UsableURIFactory.java @@ -23,6 +23,7 @@ import java.io.UnsupportedEncodingException; import java.util.BitSet; +import java.util.Locale; import java.util.logging.Level; import java.util.logging.Logger; import java.util.regex.Matcher; @@ -609,7 +610,7 @@ private String fixupDomainlabel(String label) throw ue; } } - label = label.toLowerCase(); + label = label.toLowerCase(Locale.ROOT); return label; } @@ -755,6 +756,6 @@ private String checkUriElement(String element) { */ private String checkUriElementAndLowerCase(String element) { String tmp = checkUriElement(element); - return (tmp != null)? tmp.toLowerCase(): tmp; + return (tmp != null)? 
tmp.toLowerCase(Locale.ROOT): tmp; } } diff --git a/src/main/java/org/archive/util/ArchiveUtils.java b/src/main/java/org/archive/util/ArchiveUtils.java index 22ba2787..50307b43 100644 --- a/src/main/java/org/archive/util/ArchiveUtils.java +++ b/src/main/java/org/archive/util/ArchiveUtils.java @@ -900,7 +900,7 @@ private static String loadVersion() { if (line.startsWith("#")) { continue; } - TLDS.add(line.trim().toLowerCase()); + TLDS.add(line.trim().toLowerCase(Locale.ROOT)); } } catch (Exception e) { LOGGER.log(Level.SEVERE,"TLD list unavailable",e); @@ -917,7 +917,7 @@ private static String loadVersion() { * @return boolean true if recognized as TLD */ public static boolean isTld(String dom) { - return TLDS.contains(dom.toLowerCase()); + return TLDS.contains(dom.toLowerCase(Locale.ROOT)); } public static void closeQuietly(Object input) { @@ -981,7 +981,7 @@ public static int readFully(InputStream input, byte[] buf) */ public static BufferedReader getBufferedReader(File source) throws IOException { InputStream is = new BufferedInputStream(new FileInputStream(source)); - boolean isGzipped = source.getName().toLowerCase(). + boolean isGzipped = source.getName().toLowerCase(Locale.ROOT). 
endsWith(GZIP_SUFFIX); if(isGzipped) { is = new GZIPInputStream(is); diff --git a/src/main/java/org/archive/util/FileNameSpec.java b/src/main/java/org/archive/util/FileNameSpec.java index a3312cfc..7ace8b59 100644 --- a/src/main/java/org/archive/util/FileNameSpec.java +++ b/src/main/java/org/archive/util/FileNameSpec.java @@ -1,5 +1,6 @@ package org.archive.util; +import java.util.Locale; import java.util.concurrent.atomic.AtomicInteger; public class FileNameSpec { @@ -15,7 +16,7 @@ public FileNameSpec(String prefix, String suffix) { public String getNextName() { StringBuilder sb = new StringBuilder(); sb.append(prefix); - sb.append(String.format("%06d",aInt.incrementAndGet())); + sb.append(String.format(Locale.ROOT, "%06d",aInt.incrementAndGet())); sb.append(suffix); return sb.toString(); } diff --git a/src/main/java/org/archive/util/FileUtils.java b/src/main/java/org/archive/util/FileUtils.java index 70b5ffae..6886e08c 100644 --- a/src/main/java/org/archive/util/FileUtils.java +++ b/src/main/java/org/archive/util/FileUtils.java @@ -32,6 +32,7 @@ import java.util.Iterator; import java.util.LinkedList; import java.util.List; +import java.util.Locale; import java.util.Properties; import java.util.logging.Level; import java.util.logging.Logger; @@ -219,8 +220,8 @@ protected static void workaroundCopyFile(final File src, FileFilter prefixFilter = new FileFilter() { public boolean accept(File pathname) { - return pathname.getName().toLowerCase(). - startsWith(prefix.toLowerCase()); + return pathname.getName().toLowerCase(Locale.ROOT). 
+ startsWith(prefix.toLowerCase(Locale.ROOT)); } }; return dir.listFiles(prefixFilter); @@ -283,7 +284,7 @@ public static boolean isReadableWithExtensionAndMagic(final File f, throws IOException { boolean result = false; FileUtils.assertReadable(f); - if(f.getName().toLowerCase().endsWith(uncompressedExtension)) { + if(f.getName().toLowerCase(Locale.ROOT).endsWith(uncompressedExtension)) { FileInputStream fis = new FileInputStream(f); try { byte [] b = new byte[magic.length()]; @@ -708,4 +709,4 @@ public static void appendTo(File fileToAppendTo, File fileToAppendFrom) throws I out.flush(); } } -} \ No newline at end of file +} diff --git a/src/main/java/org/archive/util/Recorder.java b/src/main/java/org/archive/util/Recorder.java index 6a7a53d7..6f9e0117 100644 --- a/src/main/java/org/archive/util/Recorder.java +++ b/src/main/java/org/archive/util/Recorder.java @@ -26,6 +26,7 @@ import java.io.OutputStream; import java.nio.charset.Charset; import java.util.HashSet; +import java.util.Locale; import java.util.Set; import java.util.logging.Level; import java.util.logging.Logger; @@ -338,8 +339,8 @@ public void setInputIsChunked(boolean chunked) { * @param contentEncoding declared content-encoding of input recording. 
*/ public void setContentEncoding(String contentEncoding) { - String lowerCoding = contentEncoding.toLowerCase(); - if(!SUPPORTED_ENCODINGS.contains(contentEncoding.toLowerCase())) { + String lowerCoding = contentEncoding.toLowerCase(Locale.ROOT); + if(!SUPPORTED_ENCODINGS.contains(contentEncoding.toLowerCase(Locale.ROOT))) { throw new IllegalArgumentException("contentEncoding unsupported: "+contentEncoding); } this.contentEncoding = lowerCoding; diff --git a/src/main/java/org/archive/util/SurtPrefixSet.java b/src/main/java/org/archive/util/SurtPrefixSet.java index 6925cc83..32a34d53 100644 --- a/src/main/java/org/archive/util/SurtPrefixSet.java +++ b/src/main/java/org/archive/util/SurtPrefixSet.java @@ -31,6 +31,7 @@ import java.io.PrintStream; import java.io.Reader; import java.util.Iterator; +import java.util.Locale; import org.archive.url.UsableURI; import org.archive.util.iterator.LineReadingIterator; @@ -70,7 +71,7 @@ public void importFrom(Reader r) { while (iter.hasNext()) { s = (String) iter.next(); - add(s.toLowerCase()); + add(s.toLowerCase(Locale.ROOT)); } } @@ -145,7 +146,7 @@ public boolean considerAsAddDirective(String suri) { } if(u.indexOf("(")>0) { // formal SURT prefix; toLowerCase just in case - add(u.toLowerCase()); + add(u.toLowerCase(Locale.ROOT)); } else { // hostname/normal form URI from which // to deduce SURT prefix diff --git a/src/main/java/org/archive/util/binsearch/SortedTextFile.java b/src/main/java/org/archive/util/binsearch/SortedTextFile.java index ab8118b7..a4326dc0 100644 --- a/src/main/java/org/archive/util/binsearch/SortedTextFile.java +++ b/src/main/java/org/archive/util/binsearch/SortedTextFile.java @@ -2,6 +2,7 @@ import java.io.IOException; import java.util.Comparator; +import java.util.Locale; import java.util.logging.Level; import java.util.logging.Logger; @@ -142,14 +143,14 @@ public long binaryFindOffset(SeekableLineReader slr, final String key, Comparato if (comparator.compare(key, line) > 0) { 
if(LOGGER.isLoggable(Level.FINE)) { - LOGGER.fine(String.format("Search(%d) (%s)/(%s) : After", + LOGGER.fine(String.format(Locale.ROOT, "Search(%d) (%s)/(%s) : After", mid * blockSize, key,line)); } min = mid; } else { if(LOGGER.isLoggable(Level.FINE)) { - LOGGER.fine(String.format("Search(%d) (%s)/(%s) : Before", + LOGGER.fine(String.format(Locale.ROOT, "Search(%d) (%s)/(%s) : Before", mid * blockSize, key,line)); } max = mid; @@ -391,7 +392,7 @@ private CloseableIterator search(SeekableLineReader slr, long min = binaryFindOffset(slr, key, comparator); if (LOGGER.isLoggable(Level.FINE)) { - LOGGER.fine(String.format("Aligning(%d)",min)); + LOGGER.fine(String.format(Locale.ROOT, "Aligning(%d)",min)); } slr.seek(min); From 72d8a808e7d61173a435cca7ee5a7ae2b24b61d1 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Tue, 11 Nov 2025 18:27:41 +0100 Subject: [PATCH 03/16] Add Locale.ROOT as parameter to all occurrences of PrintStream.format(...) and number formatters --- .../java/org/archive/extract/RealCDXExtractorOutput.java | 6 ++++-- src/main/java/org/archive/extract/ResourceExtractor.java | 6 +++--- .../extract/WARCMetadataRecordExtractorOutput.java | 5 +++-- .../java/org/archive/extract/WATExtractorOutput.java | 2 +- .../archive/format/http/DumpingHTTPParseObserver.java | 5 +++-- .../java/org/archive/io/GenericReplayCharSequence.java | 9 +++++---- src/main/java/org/archive/io/WriterPoolMember.java | 9 ++++++++- .../java/org/archive/resource/html/HTMLMetaData.java | 3 ++- 8 files changed, 29 insertions(+), 16 deletions(-) diff --git a/src/main/java/org/archive/extract/RealCDXExtractorOutput.java b/src/main/java/org/archive/extract/RealCDXExtractorOutput.java index b8f06034..ff0b9e83 100644 --- a/src/main/java/org/archive/extract/RealCDXExtractorOutput.java +++ b/src/main/java/org/archive/extract/RealCDXExtractorOutput.java @@ -223,7 +223,8 @@ public void output(Resource resource) throws IOException { canUrl = keyMaker.makeKey(origUrl); // URL DATE OURL MIME 
HTTP-CODE SHA1 META REDIR OFFSET LENGTH FILE if(dumpJSON) { - out.format("%s %s %s %s %s %s %s %s %s %s %s %s\n", + out.format(Locale.ROOT, + "%s %s %s %s %s %s %s %s %s %s %s %s\n", canUrl, date, origUrl, @@ -237,7 +238,8 @@ public void output(Resource resource) throws IOException { filename, m.toString(1)); } else { - out.format("%s %s %s %s %s %s %s %s %s %s %s\n", + out.format(Locale.ROOT, + "%s %s %s %s %s %s %s %s %s %s %s\n", canUrl, date, origUrl, diff --git a/src/main/java/org/archive/extract/ResourceExtractor.java b/src/main/java/org/archive/extract/ResourceExtractor.java index a6fa0a00..dcbfc122 100644 --- a/src/main/java/org/archive/extract/ResourceExtractor.java +++ b/src/main/java/org/archive/extract/ResourceExtractor.java @@ -141,7 +141,7 @@ public int run(String[] args) } catch(GZIPFormatException e) { LOG.severe(String.format(Locale.ROOT, "%s: %s",exProducer.getContext(),e.getMessage())); //Log is not coming out for some damn reason....needs to be studied - System.err.format("%s: %s",exProducer.getContext(),e.getMessage()); + System.err.format(Locale.ROOT, "%s: %s",exProducer.getContext(),e.getMessage()); if(ProducerUtils.STRICT_GZ) { throw e; @@ -150,7 +150,7 @@ public int run(String[] args) } catch(ResourceParseException e) { LOG.severe(String.format(Locale.ROOT, "%s: %s",exProducer.getContext(),e.getMessage())); //Log is not coming out for some damn reason....needs to be studied - System.err.format("%s: %s",exProducer.getContext(),e.getMessage()); + System.err.format(Locale.ROOT, "%s: %s",exProducer.getContext(),e.getMessage()); if(ProducerUtils.STRICT_GZ) { throw e; @@ -160,7 +160,7 @@ public int run(String[] args) // this should not get here - ResourceFactory et al should wrap as ResourceParseExceptions... 
LOG.severe(String.format(Locale.ROOT, "RECOVERABLE - %s: %s",exProducer.getContext(),e.getMessage())); //Log is not coming out for some damn reason....needs to be studied - System.err.format("%s: %s",exProducer.getContext(),e.getMessage()); + System.err.format(Locale.ROOT, "%s: %s",exProducer.getContext(),e.getMessage()); e.printStackTrace(); diff --git a/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java b/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java index 68f9d1c8..426acb02 100644 --- a/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java +++ b/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java @@ -7,6 +7,7 @@ import java.net.URISyntaxException; import java.net.URL; import java.util.List; +import java.util.Locale; import java.util.logging.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -87,7 +88,7 @@ public void output(Resource resource) throws IOException { String[] linkParts = outLinkValue.split(" "); if(linkParts.length > 2) //'outlinks': 'origUrl date origOutlinkUrl linktype linktext' - out.format("%s\t%s\t%s\t%s\t\n",origUrl,date,linkParts[0],linkParts[2]); + out.format(Locale.ROOT,"%s\t%s\t%s\t%s\t\n",origUrl,date,linkParts[0],linkParts[2]); } } else if(outputType.equals("hopinfo")) { String key = obj.get("Name").toString(); @@ -103,7 +104,7 @@ public void output(Resource resource) throws IOException { } if(outputType.equals("hopinfo")) { //'hopinfo': 'origCrawledUrl date origViaUrl hopPathFromVia sourceTag' - out.format("%s\t%s\t%s\t%s\t%s\n",origUrl,date,viaUrl,viaPath,sourceTag); + out.format(Locale.ROOT,"%s\t%s\t%s\t%s\t%s\n",origUrl,date,viaUrl,viaPath,sourceTag); } } } diff --git a/src/main/java/org/archive/extract/WATExtractorOutput.java b/src/main/java/org/archive/extract/WATExtractorOutput.java index dbe979e5..79cb0870 100644 --- a/src/main/java/org/archive/extract/WATExtractorOutput.java +++ 
b/src/main/java/org/archive/extract/WATExtractorOutput.java @@ -157,7 +157,7 @@ private void writeWARC(OutputStream recOut, MetaData md) throws IOException { targetURI = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Target-URI"); } // handle date of generation in WARC format - DateFormat dateFormat = new SimpleDateFormat("yyyyMMddHHmmss"); + DateFormat dateFormat = new SimpleDateFormat("yyyyMMddHHmmss", Locale.ROOT); String capDateString = dateFormat.format(new Date()); String recId = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Record-ID"); writeWARCMDRecord(recOut,md,targetURI,capDateString,recId); diff --git a/src/main/java/org/archive/format/http/DumpingHTTPParseObserver.java b/src/main/java/org/archive/format/http/DumpingHTTPParseObserver.java index ed5dfcb2..11cd9276 100755 --- a/src/main/java/org/archive/format/http/DumpingHTTPParseObserver.java +++ b/src/main/java/org/archive/format/http/DumpingHTTPParseObserver.java @@ -2,6 +2,7 @@ import java.io.PrintStream; import java.nio.charset.Charset; +import java.util.Locale; public class DumpingHTTPParseObserver implements HttpHeaderObserver { private static final Charset UTF8 = Charset.forName("UTF-8"); @@ -15,13 +16,13 @@ public DumpingHTTPParseObserver(PrintStream ps) { public void headerParsed(byte[] name, int ns, int nl, byte[] value, int vs, int vl) { - ps.format("headerParsed:(%d:%d)(%s)(%d:%d)(%s)\n", + ps.format(Locale.ROOT,"headerParsed:(%d:%d)(%s)(%d:%d)(%s)\n", ns,nl,new String(name,0,nl,UTF8), vs,vl,new String(value,0,vl,UTF8)); } public void headersComplete(int bytesRead) { - ps.format("headersComplete(%d)\n",bytesRead); + ps.format(Locale.ROOT,"headersComplete(%d)\n",bytesRead); } public void headersCorrupt() { ps.println("headersCorrupted\n"); diff --git a/src/main/java/org/archive/io/GenericReplayCharSequence.java b/src/main/java/org/archive/io/GenericReplayCharSequence.java index c427550b..7aacb25a 100644 --- a/src/main/java/org/archive/io/GenericReplayCharSequence.java +++ 
b/src/main/java/org/archive/io/GenericReplayCharSequence.java @@ -34,6 +34,7 @@ import java.nio.charset.CharacterCodingException; import java.nio.charset.Charset; import java.text.NumberFormat; +import java.util.Locale; import java.util.logging.Level; import java.util.logging.Logger; @@ -168,8 +169,8 @@ private void updateMemoryMappedBuffer() { long charLength = (long) this.length() - (long) prefixBuffer.limit(); // in characters long mapSize = Math.min((charLength * bytesPerChar) - mapByteOffset, MAP_MAX_BYTES); logger.fine("updateMemoryMappedBuffer: mapOffset=" - + NumberFormat.getInstance().format(mapByteOffset) - + " mapSize=" + NumberFormat.getInstance().format(mapSize)); + + NumberFormat.getInstance(Locale.ROOT).format(mapByteOffset) + + " mapSize=" + NumberFormat.getInstance(Locale.ROOT).format(mapSize)); try { // TODO: stress-test without these possibly-costly requests! // System.gc(); @@ -255,9 +256,9 @@ protected void decode(InputStream inStream, int prefixMax, this.length = Ints.saturatedCast(count); if(count>Integer.MAX_VALUE) { logger.warning("input stream is longer than Integer.MAX_VALUE=" - + NumberFormat.getInstance().format(Integer.MAX_VALUE) + + NumberFormat.getInstance(Locale.ROOT).format(Integer.MAX_VALUE) + " characters -- only first " - + NumberFormat.getInstance().format(Integer.MAX_VALUE) + + NumberFormat.getInstance(Locale.ROOT).format(Integer.MAX_VALUE) + " are accessible through this GenericReplayCharSequence"); } diff --git a/src/main/java/org/archive/io/WriterPoolMember.java b/src/main/java/org/archive/io/WriterPoolMember.java index a488354a..4679ea78 100644 --- a/src/main/java/org/archive/io/WriterPoolMember.java +++ b/src/main/java/org/archive/io/WriterPoolMember.java @@ -26,9 +26,11 @@ import java.io.InputStream; import java.io.OutputStream; import java.text.DecimalFormat; +import java.text.DecimalFormatSymbols; import java.text.NumberFormat; import java.util.Iterator; import java.util.List; +import java.util.Locale; import 
java.util.Properties; import java.util.concurrent.atomic.AtomicInteger; import java.util.logging.Logger; @@ -103,12 +105,17 @@ public abstract class WriterPoolMember { */ protected static int roundRobinIndex = 0; + /** + * Symbol set for serial number formatter. + */ + protected static DecimalFormatSymbols serialNoFormatterSymbols = new DecimalFormatSymbols(Locale.ROOT); + /** * NumberFormat instance for formatting serial number. * * Pads serial number with zeros. */ - protected static NumberFormat serialNoFormatter = new DecimalFormat("00000"); + protected static NumberFormat serialNoFormatter = new DecimalFormat("00000", serialNoFormatterSymbols); /** diff --git a/src/main/java/org/archive/resource/html/HTMLMetaData.java b/src/main/java/org/archive/resource/html/HTMLMetaData.java index 024d9677..d995cf65 100644 --- a/src/main/java/org/archive/resource/html/HTMLMetaData.java +++ b/src/main/java/org/archive/resource/html/HTMLMetaData.java @@ -1,6 +1,7 @@ package org.archive.resource.html; import java.util.List; +import java.util.Locale; import java.util.logging.Logger; import org.archive.resource.MetaData; @@ -98,7 +99,7 @@ private void appendObj2(JSONObject o, String arr, String... 
a) { } catch(JSONException e) { try { - System.err.format("GotErr(%s) JSON(%s)(%s)", e.getMessage(), + System.err.format(Locale.ROOT, "GotErr(%s) JSON(%s)(%s)", e.getMessage(), o.toString(1),a.toString()); } catch (JSONException e1) { // TODO Auto-generated catch block From 56941573a8ea7ef729b550581aadc45647f9826f Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Tue, 11 Nov 2025 18:34:34 +0100 Subject: [PATCH 04/16] Initialize InputStreamReaders using UTF-8 charset --- src/main/java/org/archive/format/cdx/CDXFile.java | 4 +++- .../record/WARCJSONMetaDataResourceFactory.java | 7 +++---- src/main/java/org/archive/util/ArchiveUtils.java | 14 ++++++++------ src/main/java/org/archive/util/DevUtils.java | 3 ++- src/main/java/org/archive/util/IAUtils.java | 4 +++- src/main/java/org/archive/util/ProcessUtils.java | 4 +++- 6 files changed, 22 insertions(+), 14 deletions(-) diff --git a/src/main/java/org/archive/format/cdx/CDXFile.java b/src/main/java/org/archive/format/cdx/CDXFile.java index 7dca0464..612f7454 100644 --- a/src/main/java/org/archive/format/cdx/CDXFile.java +++ b/src/main/java/org/archive/format/cdx/CDXFile.java @@ -18,6 +18,8 @@ import org.archive.util.iterator.CloseableIterator; import org.archive.util.zip.OpenJDK7GZIPInputStream; +import static java.nio.charset.StandardCharsets.UTF_8; + public class CDXFile extends SortedTextFile implements CDXInputSource { public CDXFile(String uri) throws IOException { @@ -94,7 +96,7 @@ public static BufferedReader createStreamingLineReader(String uri, boolean gzipp input = new OpenJDK7GZIPInputStream(swis); } - BufferedReader reader = new BufferedReader(new InputStreamReader(input)); + BufferedReader reader = new BufferedReader(new InputStreamReader(input, UTF_8)); return reader; } diff --git a/src/main/java/org/archive/resource/warc/record/WARCJSONMetaDataResourceFactory.java b/src/main/java/org/archive/resource/warc/record/WARCJSONMetaDataResourceFactory.java index 43041efb..8cc8c146 100644 --- 
a/src/main/java/org/archive/resource/warc/record/WARCJSONMetaDataResourceFactory.java +++ b/src/main/java/org/archive/resource/warc/record/WARCJSONMetaDataResourceFactory.java @@ -3,7 +3,6 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; -import java.nio.charset.Charset; import org.archive.resource.MetaData; import org.archive.resource.Resource; @@ -14,9 +13,9 @@ import org.json.JSONException; import org.json.JSONTokener; -public class WARCJSONMetaDataResourceFactory implements ResourceFactory, ResourceConstants { - private static final Charset UTF8 = Charset.forName("UTF-8"); +import static java.nio.charset.StandardCharsets.UTF_8; +public class WARCJSONMetaDataResourceFactory implements ResourceFactory, ResourceConstants { public WARCJSONMetaDataResourceFactory() { } @@ -27,7 +26,7 @@ public Resource getResource(InputStream is, MetaData parentMetaData, MetaData md; try { - md = new MetaData(new JSONTokener(new InputStreamReader(is, UTF8))); + md = new MetaData(new JSONTokener(new InputStreamReader(is, UTF_8))); } catch (JSONException e) { throw new ResourceParseException(e); } diff --git a/src/main/java/org/archive/util/ArchiveUtils.java b/src/main/java/org/archive/util/ArchiveUtils.java index 50307b43..cce411df 100644 --- a/src/main/java/org/archive/util/ArchiveUtils.java +++ b/src/main/java/org/archive/util/ArchiveUtils.java @@ -49,6 +49,8 @@ import org.archive.format.gzip.GZIPDecoder; import org.archive.format.gzip.GZIPFormatException; +import static java.nio.charset.StandardCharsets.UTF_8; + /** * Miscellaneous useful methods. 
* @@ -851,7 +853,7 @@ private static String loadVersion() { BufferedReader br = null; String version; try { - br = new BufferedReader(new InputStreamReader(input)); + br = new BufferedReader(new InputStreamReader(input, UTF_8)); version = br.readLine(); br.readLine(); } catch (IOException e) { @@ -873,7 +875,7 @@ private static String loadVersion() { br = null; String timestamp; try { - br = new BufferedReader(new InputStreamReader(input)); + br = new BufferedReader(new InputStreamReader(input, UTF_8)); timestamp = br.readLine(); } catch (IOException e) { return version; @@ -894,7 +896,7 @@ private static String loadVersion() { TLDS = new HashSet(); InputStream is = ArchiveUtils.class.getResourceAsStream("tlds-alpha-by-domain.txt"); try { - BufferedReader reader = new BufferedReader(new InputStreamReader(is)); + BufferedReader reader = new BufferedReader(new InputStreamReader(is, UTF_8)); String line; while((line = reader.readLine())!=null) { if (line.startsWith("#")) { @@ -986,7 +988,7 @@ public static BufferedReader getBufferedReader(File source) throws IOException { if(isGzipped) { is = new GZIPInputStream(is); } - return new BufferedReader(new InputStreamReader(is)); + return new BufferedReader(new InputStreamReader(is, UTF_8)); } /** @@ -1002,8 +1004,8 @@ public static BufferedReader getBufferedReader(URL source) throws IOException { || conn.getContentEncoding() != null && conn.getContentEncoding().equalsIgnoreCase("gzip"); InputStream uis = conn.getInputStream(); return new BufferedReader(isGzipped? 
- new InputStreamReader(new GZIPInputStream(uis)): - new InputStreamReader(uis)); + new InputStreamReader(new GZIPInputStream(uis), UTF_8): + new InputStreamReader(uis, UTF_8)); } /** diff --git a/src/main/java/org/archive/util/DevUtils.java b/src/main/java/org/archive/util/DevUtils.java index f2a1d044..7ee4b13a 100644 --- a/src/main/java/org/archive/util/DevUtils.java +++ b/src/main/java/org/archive/util/DevUtils.java @@ -25,6 +25,7 @@ import java.io.StringWriter; import java.util.logging.Logger; +import static java.nio.charset.StandardCharsets.UTF_8; /** * Write a message and stack trace to the 'org.archive.util.DevUtils' logger. @@ -92,7 +93,7 @@ public static void sigquitSelf() { Process p = Runtime.getRuntime().exec( new String[] {"perl", "-e", "print getppid(). \"\n\";"}); BufferedReader br = - new BufferedReader(new InputStreamReader(p.getInputStream())); + new BufferedReader(new InputStreamReader(p.getInputStream(), UTF_8)); String ppid = br.readLine(); Runtime.getRuntime().exec( new String[] {"sh", "-c", "kill -3 "+ppid}).waitFor(); diff --git a/src/main/java/org/archive/util/IAUtils.java b/src/main/java/org/archive/util/IAUtils.java index 4597d723..b0c448f0 100644 --- a/src/main/java/org/archive/util/IAUtils.java +++ b/src/main/java/org/archive/util/IAUtils.java @@ -29,6 +29,8 @@ import java.nio.charset.Charset; import java.util.Properties; +import static java.nio.charset.StandardCharsets.UTF_8; + /** * Miscellaneous useful methods. 
* @@ -53,7 +55,7 @@ public static String loadCommonsVersion() { BufferedReader br = null; String version; try { - br = new BufferedReader(new InputStreamReader(input)); + br = new BufferedReader(new InputStreamReader(input, UTF_8)); version = br.readLine(); br.readLine(); } catch (IOException e) { diff --git a/src/main/java/org/archive/util/ProcessUtils.java b/src/main/java/org/archive/util/ProcessUtils.java index af792981..0a3eeb67 100644 --- a/src/main/java/org/archive/util/ProcessUtils.java +++ b/src/main/java/org/archive/util/ProcessUtils.java @@ -26,6 +26,8 @@ import java.util.logging.Level; import java.util.logging.Logger; +import static java.nio.charset.StandardCharsets.UTF_8; + /** * Class to run an external process. * @author stack @@ -55,7 +57,7 @@ protected StreamGobbler(InputStream is, String name) { public void run() { try { BufferedReader br = - new BufferedReader(new InputStreamReader(this.is)); + new BufferedReader(new InputStreamReader(this.is, UTF_8)); for (String line = null; (line = br.readLine()) != null;) { this.sink.append(line); } From c013b258be71c1c00b2a016641d60a2fc65195ff Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Tue, 11 Nov 2025 19:44:46 +0100 Subject: [PATCH 05/16] Add charset to invocations of String constructor --- .../java/org/archive/format/http/HttpHeaderParser.java | 4 +++- .../archive/format/http/HttpResponseMessageParser.java | 5 +++-- src/main/java/org/archive/io/CompositeFileReader.java | 4 +++- .../java/org/archive/io/HeaderedArchiveRecord.java | 5 +++-- src/main/java/org/archive/io/arc/ARCRecord.java | 4 ++-- src/main/java/org/archive/url/LaxURI.java | 9 ++++++--- src/main/java/org/archive/url/URI.java | 10 +++++++--- src/main/java/org/archive/util/LaxHttpParser.java | 3 ++- 8 files changed, 29 insertions(+), 15 deletions(-) diff --git a/src/main/java/org/archive/format/http/HttpHeaderParser.java b/src/main/java/org/archive/format/http/HttpHeaderParser.java index bee3c28b..ddbb6e47 100755 --- 
a/src/main/java/org/archive/format/http/HttpHeaderParser.java +++ b/src/main/java/org/archive/format/http/HttpHeaderParser.java @@ -2,6 +2,7 @@ import java.io.IOException; import java.io.InputStream; +import java.nio.charset.StandardCharsets; public class HttpHeaderParser implements HttpConstants { private static final int DEFAULT_MAX_NAME_LENGTH = 1024 * 100; @@ -288,7 +289,8 @@ public ParseState handleByte(byte b, HttpHeaderParser parser) return parser.postColonState; } if(parser.isStrict) { - throw new HttpParseException("Illegal char after name("+new String(name,0,nameLength)+")"); + throw new HttpParseException("Illegal char after name(" + + new String(name, 0, nameLength, StandardCharsets.ISO_8859_1) + ")"); } parser.headersCorrupted(); return parser.laxLineEatParseState; diff --git a/src/main/java/org/archive/format/http/HttpResponseMessageParser.java b/src/main/java/org/archive/format/http/HttpResponseMessageParser.java index 3aee7c48..4ddef2ad 100755 --- a/src/main/java/org/archive/format/http/HttpResponseMessageParser.java +++ b/src/main/java/org/archive/format/http/HttpResponseMessageParser.java @@ -2,6 +2,7 @@ import java.io.IOException; import java.io.InputStream; +import java.nio.charset.StandardCharsets; public class HttpResponseMessageParser extends HttpMessageParser { public int maxBytes = 1024 * 128; @@ -97,7 +98,7 @@ public int parseStrict(byte buf[], int len, HttpResponseMessageObserver obs) version = parseVersionStrict(buf, vs, vl); status = parseStatusStrict(buf,ss,sl); - reason = new String(buf,idx+1,(len - idx)-1); + reason = new String(buf,idx+1,(len - idx)-1,StandardCharsets.ISO_8859_1); obs.messageParsed(version, status, reason, len); @@ -155,7 +156,7 @@ private int parseLax(byte buf[], int len, HttpResponseMessageObserver obs) idx++; int reasonLen = bufferEnd - idx; if(reasonLen > 0) { - reason = new String(buf,idx,reasonLen); + reason = new String(buf,idx,reasonLen,StandardCharsets.ISO_8859_1); } } else { // missed some: diff --git 
a/src/main/java/org/archive/io/CompositeFileReader.java b/src/main/java/org/archive/io/CompositeFileReader.java index 14b56219..6e331565 100644 --- a/src/main/java/org/archive/io/CompositeFileReader.java +++ b/src/main/java/org/archive/io/CompositeFileReader.java @@ -23,6 +23,8 @@ import java.io.InputStreamReader; import java.util.List; +import static java.nio.charset.StandardCharsets.UTF_8; + /** * @author gojomo @@ -34,7 +36,7 @@ public class CompositeFileReader extends InputStreamReader { * @throws IOException */ public CompositeFileReader(List filenames) throws IOException { - super(new CompositeFileInputStream(filenames)); + super(new CompositeFileInputStream(filenames), UTF_8); } } diff --git a/src/main/java/org/archive/io/HeaderedArchiveRecord.java b/src/main/java/org/archive/io/HeaderedArchiveRecord.java index 70c4fb04..a149acac 100644 --- a/src/main/java/org/archive/io/HeaderedArchiveRecord.java +++ b/src/main/java/org/archive/io/HeaderedArchiveRecord.java @@ -25,6 +25,7 @@ import java.io.InputStream; import java.io.OutputStream; import java.io.PrintStream; +import java.nio.charset.StandardCharsets; import java.util.Locale; import org.archive.format.http.HttpHeader; @@ -145,7 +146,7 @@ private InputStream readContentHeaders() throws IOException { int eolCharCount = getEolCharsCount(statusBytes); if (eolCharCount <= 0) { throw new IOException("Failed to read raw lie where one " + - " was expected: " + new String(statusBytes)); + " was expected: " + new String(statusBytes, ARCConstants.DEFAULT_ENCODING)); } String statusLine = new String(statusBytes, 0, statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); @@ -186,7 +187,7 @@ private InputStream readContentHeaders() throws IOException { eolCharCount = getEolCharsCount(lineBytes); if (eolCharCount <= 0) { throw new IOException("Failed reading headers: " + - ((lineBytes != null)? new String(lineBytes): null)); + ((lineBytes != null)? 
new String(lineBytes, StandardCharsets.ISO_8859_1): null)); } // Save the bytes read. baos.write(lineBytes); diff --git a/src/main/java/org/archive/io/arc/ARCRecord.java b/src/main/java/org/archive/io/arc/ARCRecord.java index 14e80728..c14426a5 100644 --- a/src/main/java/org/archive/io/arc/ARCRecord.java +++ b/src/main/java/org/archive/io/arc/ARCRecord.java @@ -589,7 +589,7 @@ private InputStream readHttpHeader() throws IOException { if (eolCharCount <= 0) { throw new RecoverableIOException( "Failed to read http status where one was expected: " - + ((statusBytes == null) ? "" : new String(statusBytes))); + + ((statusBytes == null) ? "" : new String(statusBytes, DEFAULT_ENCODING))); } statusLine = new String(statusBytes, 0, @@ -659,7 +659,7 @@ private InputStream readHttpHeader() throws IOException { break; } else { throw new IOException("Failed reading http headers: " + - ((lineBytes != null)? new String(lineBytes): null)); + ((lineBytes != null)? new String(lineBytes, DEFAULT_ENCODING): null)); } } else { httpHeaderBytesRead += lineBytes.length; diff --git a/src/main/java/org/archive/url/LaxURI.java b/src/main/java/org/archive/url/LaxURI.java index 4210c303..3b27e045 100644 --- a/src/main/java/org/archive/url/LaxURI.java +++ b/src/main/java/org/archive/url/LaxURI.java @@ -19,6 +19,8 @@ package org.archive.url; import java.io.UnsupportedEncodingException; +import java.nio.charset.Charset; +import java.nio.charset.IllegalCharsetNameException; import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.BitSet; @@ -122,9 +124,10 @@ protected static String decode(String component, String charset) byte[] rawdata = null; rawdata = LaxURLCodec.decodeUrlLoose(component.getBytes(StandardCharsets.US_ASCII)); try { - return new String(rawdata, charset); - } catch (UnsupportedEncodingException e) { - return new String(rawdata); + Charset cs = Charset.forName(charset); + return new String(rawdata, cs); + } catch (IllegalCharsetNameException e) { + return 
new String(rawdata, StandardCharsets.US_ASCII); } } diff --git a/src/main/java/org/archive/url/URI.java b/src/main/java/org/archive/url/URI.java index 38219556..ff53775e 100644 --- a/src/main/java/org/archive/url/URI.java +++ b/src/main/java/org/archive/url/URI.java @@ -34,6 +34,8 @@ import org.apache.commons.codec.net.URLCodec; import java.io.*; +import java.nio.charset.Charset; +import java.nio.charset.IllegalCharsetNameException; import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.BitSet; @@ -1780,11 +1782,13 @@ protected static String decode(String component, String charset) throw new URIException(e.getMessage()); } try { - return new String(rawdata, charset); - } catch (UnsupportedEncodingException e) { - return new String(rawdata); + Charset cs = Charset.forName(charset); + return new String(rawdata, cs); + } catch (IllegalCharsetNameException e) { + return new String(rawdata, StandardCharsets.US_ASCII); } } + /** * Pre-validate the unescaped URI string within a specific component. 
* diff --git a/src/main/java/org/archive/util/LaxHttpParser.java b/src/main/java/org/archive/util/LaxHttpParser.java index 0545fd95..05d2469c 100644 --- a/src/main/java/org/archive/util/LaxHttpParser.java +++ b/src/main/java/org/archive/util/LaxHttpParser.java @@ -36,6 +36,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.UnsupportedEncodingException; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.logging.Logger; @@ -127,7 +128,7 @@ public static String readLine(InputStream inputStream, String charset) throws IO try { return new String(rawdata, 0, len - offset, charset); } catch (UnsupportedEncodingException e) { - return new String(rawdata, 0, len - offset); + return new String(rawdata, 0, len - offset, StandardCharsets.ISO_8859_1); } } From 88ac2989028ed35d52e0e46076d1322040362de3 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Tue, 11 Nov 2025 22:38:54 +0100 Subject: [PATCH 06/16] Initialize PrintStreams using UTF-8 charset, call String.getBytes() with charset. 
--- .../archive/extract/DumpingExtractorOutput.java | 8 +++++++- .../archive/extract/JSONViewExtractorOutput.java | 8 +++++++- .../format/gzip/zipnum/ZipNumCluster.java | 3 +++ src/main/java/org/archive/io/arc/ARC2WCDX.java | 4 +++- .../java/org/archive/io/warc/WARCWriter.java | 6 ++++-- src/main/java/org/archive/url/URI.java | 4 +++- src/main/java/org/archive/util/SURT.java | 4 +++- .../java/org/archive/util/SurtPrefixSet.java | 6 ++++-- src/main/java/org/archive/util/TextUtils.java | 16 ++++++---------- .../archive/util/binsearch/SortedTextFile.java | 6 ++++-- 10 files changed, 44 insertions(+), 21 deletions(-) diff --git a/src/main/java/org/archive/extract/DumpingExtractorOutput.java b/src/main/java/org/archive/extract/DumpingExtractorOutput.java index 69591931..1ccbf771 100644 --- a/src/main/java/org/archive/extract/DumpingExtractorOutput.java +++ b/src/main/java/org/archive/extract/DumpingExtractorOutput.java @@ -3,6 +3,7 @@ import java.io.IOException; import java.io.OutputStream; import java.io.PrintStream; +import java.io.UnsupportedEncodingException; import java.util.logging.Logger; import org.archive.resource.Resource; @@ -12,13 +13,18 @@ import com.google.common.io.ByteStreams; import com.google.common.io.CountingOutputStream; +import static java.nio.charset.StandardCharsets.UTF_8; + public class DumpingExtractorOutput implements ExtractorOutput { private static final Logger LOG = Logger.getLogger(DumpingExtractorOutput.class.getName()); private PrintStream out; public DumpingExtractorOutput(OutputStream out) { - this.out = new PrintStream(out); + try { + this.out = new PrintStream(out, false, UTF_8.name()); + } catch (UnsupportedEncodingException e) { + } } public void output(Resource resource) throws IOException { diff --git a/src/main/java/org/archive/extract/JSONViewExtractorOutput.java b/src/main/java/org/archive/extract/JSONViewExtractorOutput.java index fb6dc847..6cb7c445 100644 --- a/src/main/java/org/archive/extract/JSONViewExtractorOutput.java 
+++ b/src/main/java/org/archive/extract/JSONViewExtractorOutput.java @@ -3,6 +3,7 @@ import java.io.IOException; import java.io.OutputStream; import java.io.PrintStream; +import java.io.UnsupportedEncodingException; import java.util.List; import org.apache.commons.lang3.StringUtils; @@ -10,12 +11,17 @@ import org.archive.resource.Resource; import org.archive.util.StreamCopy; +import static java.nio.charset.StandardCharsets.UTF_8; + public class JSONViewExtractorOutput implements ExtractorOutput { private PrintStream out; private JSONView view; public JSONViewExtractorOutput(OutputStream out, String filterPath) { view = new JSONView(filterPath.split(",")); - this.out = new PrintStream(out); + try { + this.out = new PrintStream(out, false, UTF_8.name()); + } catch (UnsupportedEncodingException e) { + } } public void output(Resource resource) throws IOException { StreamCopy.readToEOF(resource.getInputStream()); diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java index a3d34a4b..edf5857c 100644 --- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java +++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java @@ -13,6 +13,7 @@ import java.io.BufferedReader; import java.io.FileReader; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; @@ -35,6 +36,8 @@ import org.archive.util.binsearch.impl.HTTPSeekableLineReader; import org.archive.util.iterator.CloseableIterator; +import static java.nio.charset.StandardCharsets.UTF_8; + public class ZipNumCluster extends ZipNumIndex { final static Logger LOGGER = Logger.getLogger(ZipNumCluster.class.getName()); diff --git a/src/main/java/org/archive/io/arc/ARC2WCDX.java b/src/main/java/org/archive/io/arc/ARC2WCDX.java index f0515694..aec571e9 100644 --- a/src/main/java/org/archive/io/arc/ARC2WCDX.java +++ 
b/src/main/java/org/archive/io/arc/ARC2WCDX.java @@ -32,6 +32,8 @@ import org.archive.util.ArchiveUtils; import org.archive.util.SURT; +import static java.nio.charset.StandardCharsets.UTF_8; + /** * Create a 'Wide' CDX from an ARC. Takes one argument, the path to the ARC. * Writes .wcdx.gz in same directory. @@ -61,7 +63,7 @@ public static Object[] createWcdx(ARCReader reader) { PrintStream writer = null; long count = 0; try { - writer = new PrintStream(new GZIPOutputStream(new FileOutputStream(wcdxFile))); + writer = new PrintStream(new GZIPOutputStream(new FileOutputStream(wcdxFile)), false, UTF_8.name()); // write header: legend + timestamp StringBuilder legend = new StringBuilder(); diff --git a/src/main/java/org/archive/io/warc/WARCWriter.java b/src/main/java/org/archive/io/warc/WARCWriter.java index 5c6a6854..8b571fad 100644 --- a/src/main/java/org/archive/io/warc/WARCWriter.java +++ b/src/main/java/org/archive/io/warc/WARCWriter.java @@ -45,6 +45,8 @@ import static org.archive.format.warc.WARCConstants.*; +import static java.nio.charset.StandardCharsets.UTF_8; + /** * WARC implementation. @@ -357,12 +359,12 @@ public URI writeWarcinfoRecord(String filename, final String description) byte [] warcinfoBody = null; if (settings.getMetadata() == null) { // TODO: What to write into a warcinfo? What to associate? 
- warcinfoBody = "TODO: Unimplemented".getBytes(); + warcinfoBody = "TODO: Unimplemented".getBytes(UTF_8); } else { ByteArrayOutputStream baos = new ByteArrayOutputStream(); for (final Iterator i = settings.getMetadata().iterator(); i.hasNext();) { - baos.write(i.next().toString().getBytes(UTF8Bytes.UTF8)); + baos.write(i.next().toString().getBytes(UTF_8)); } warcinfoBody = baos.toByteArray(); } diff --git a/src/main/java/org/archive/url/URI.java b/src/main/java/org/archive/url/URI.java index ff53775e..b19151cd 100644 --- a/src/main/java/org/archive/url/URI.java +++ b/src/main/java/org/archive/url/URI.java @@ -42,6 +42,8 @@ import java.util.Hashtable; import java.util.Locale; +import static java.nio.charset.StandardCharsets.UTF_8; + /** * The interface for the URI(Uniform Resource Identifiers) version of RFC 2396. * This class has the purpose of supportting of parsing a URI reference to @@ -1696,7 +1698,7 @@ private static byte[] getBytes(String original, String charset) { try { return original.getBytes(charset); } catch (UnsupportedEncodingException e) { - return original.getBytes(); + return original.getBytes(UTF_8); } } diff --git a/src/main/java/org/archive/util/SURT.java b/src/main/java/org/archive/util/SURT.java index 059b2ec6..c52582e1 100644 --- a/src/main/java/org/archive/util/SURT.java +++ b/src/main/java/org/archive/util/SURT.java @@ -32,6 +32,8 @@ import org.archive.url.URIException; import org.archive.url.UsableURIFactory; +import static java.nio.charset.StandardCharsets.UTF_8; + /** * Sort-friendly URI Reordering Transform. * @@ -238,7 +240,7 @@ public static void main(String[] args) throws IOException { InputStream in = args.length > 0 ? new BufferedInputStream( new FileInputStream(args[0])) : System.in; PrintStream out = args.length > 1 ? 
new PrintStream( - new BufferedOutputStream(new FileOutputStream(args[1]))) + new BufferedOutputStream(new FileOutputStream(args[1])), false, UTF_8.name()) : System.out; BufferedReader br = new BufferedReader(new InputStreamReader(in)); diff --git a/src/main/java/org/archive/util/SurtPrefixSet.java b/src/main/java/org/archive/util/SurtPrefixSet.java index 32a34d53..b2f0ea4f 100644 --- a/src/main/java/org/archive/util/SurtPrefixSet.java +++ b/src/main/java/org/archive/util/SurtPrefixSet.java @@ -37,6 +37,8 @@ import org.archive.util.iterator.LineReadingIterator; import org.archive.util.iterator.RegexLineIterator; +import static java.nio.charset.StandardCharsets.UTF_8; + /** * Specialized TreeSet for keeping a set of String prefixes. * @@ -343,10 +345,10 @@ public static void main(String[] args) throws IOException { InputStream in = args.length > 0 ? new BufferedInputStream( new FileInputStream(args[0])) : System.in; PrintStream out = args.length > 1 ? new PrintStream( - new BufferedOutputStream(new FileOutputStream(args[1]))) + new BufferedOutputStream(new FileOutputStream(args[1])), false, UTF_8.name()) : System.out; BufferedReader br = - new BufferedReader(new InputStreamReader(in)); + new BufferedReader(new InputStreamReader(in, UTF_8.name())); String line; while((line = br.readLine())!=null) { if(line.indexOf("#")>0) line=line.substring(0,line.indexOf("#")); diff --git a/src/main/java/org/archive/util/TextUtils.java b/src/main/java/org/archive/util/TextUtils.java index 98b471f8..df3de58b 100644 --- a/src/main/java/org/archive/util/TextUtils.java +++ b/src/main/java/org/archive/util/TextUtils.java @@ -40,6 +40,8 @@ import com.google.common.cache.CacheLoader; import com.google.common.cache.LoadingCache; +import static java.nio.charset.StandardCharsets.UTF_8; + public class TextUtils { private static final String FIRSTWORD = "^([^\\s]*).*$"; @@ -279,14 +281,11 @@ public static String exceptionToString(String message, Throwable e) { * @param s String to escape * 
@return URL-escaped string */ - @SuppressWarnings("deprecation") public static String urlEscape(String s) { try { - return URLEncoder.encode(s,"UTF8"); + return URLEncoder.encode(s, UTF_8.name()); } catch (UnsupportedEncodingException e) { - // should be impossible; all JVMs must support UTF8 - // but have a fallback just in case - return URLEncoder.encode(s); + return s; } } @@ -296,14 +295,11 @@ public static String urlEscape(String s) { * @param s String do unescape * @return URL-unescaped String */ - @SuppressWarnings("deprecation") public static String urlUnescape(String s) { try { - return URLDecoder.decode(s, "UTF8"); + return URLDecoder.decode(s, UTF_8.name()); } catch (UnsupportedEncodingException e) { - // should be impossible; all JVMs must support UTF8 - // but have a fallback just in case - return URLDecoder.decode(s); + return s; } } } \ No newline at end of file diff --git a/src/main/java/org/archive/util/binsearch/SortedTextFile.java b/src/main/java/org/archive/util/binsearch/SortedTextFile.java index a4326dc0..bb4a1f66 100644 --- a/src/main/java/org/archive/util/binsearch/SortedTextFile.java +++ b/src/main/java/org/archive/util/binsearch/SortedTextFile.java @@ -9,6 +9,8 @@ import org.archive.util.GeneralURIStreamFactory; import org.archive.util.iterator.CloseableIterator; +import static java.nio.charset.StandardCharsets.UTF_8; + public class SortedTextFile { public static class NumericComparator implements Comparator @@ -371,7 +373,7 @@ private long searchOffset(SeekableLineReader slr, String prev = null; while(true) { if (line != null) { - offset += line.getBytes().length + 1; + offset += line.getBytes(UTF_8).length + 1; } line = slr.readLine(); if(line == null) break; @@ -380,7 +382,7 @@ private long searchOffset(SeekableLineReader slr, } if (lessThan && prev != null) { - offset -= prev.getBytes().length + 1; + offset -= prev.getBytes(UTF_8).length + 1; } return offset; From c1e4cd323b73715e04f17502f3abcb12a09da89c Mon Sep 17 00:00:00 2001 From: 
Sebastian Nagel Date: Tue, 11 Nov 2025 22:54:57 +0100 Subject: [PATCH 07/16] Replace Charset.forName("utf-8") with StandardCharsets.UTF_8 --- src/main/java/org/archive/extract/ResourceExtractor.java | 5 ++--- src/main/java/org/archive/extract/WATExtractorOutput.java | 7 +++---- src/main/java/org/archive/format/arc/ARCConstants.java | 3 ++- .../java/org/archive/format/gzip/zipnum/ZipNumWriter.java | 6 +++--- .../org/archive/format/http/DumpingHTTPParseObserver.java | 3 +-- src/main/java/org/archive/format/http/HttpConstants.java | 3 ++- src/main/java/org/archive/url/BasicURLCanonicalizer.java | 6 ++---- src/main/java/org/archive/url/SURT.java | 4 ++-- src/main/java/org/archive/util/IAUtils.java | 2 +- .../archive/util/binsearch/AbstractSeekableLineReader.java | 3 ++- 10 files changed, 20 insertions(+), 22 deletions(-) diff --git a/src/main/java/org/archive/extract/ResourceExtractor.java b/src/main/java/org/archive/extract/ResourceExtractor.java index dcbfc122..d9b9f396 100644 --- a/src/main/java/org/archive/extract/ResourceExtractor.java +++ b/src/main/java/org/archive/extract/ResourceExtractor.java @@ -7,7 +7,7 @@ import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.net.URISyntaxException; -import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.util.Locale; import java.util.logging.Level; import java.util.logging.Logger; @@ -27,7 +27,6 @@ public class ResourceExtractor implements ResourceConstants, Tool { private final static Logger LOG = Logger.getLogger(ResourceExtractor.class.getName()); - Charset UTF8 = Charset.forName("utf-8"); public final static String TOOL_NAME = "extractor"; public static final String TOOL_DESCRIPTION = "A tool for extracting metadata from WARC, ARC, and WAT files"; @@ -66,7 +65,7 @@ public static void main(String[] args) throws Exception { private PrintWriter makePrintWriter(OutputStream os) { - return new PrintWriter(new OutputStreamWriter(os, Charset.forName("UTF-8"))); + return new
PrintWriter(new OutputStreamWriter(os, StandardCharsets.UTF_8)); } public int run(String[] args) diff --git a/src/main/java/org/archive/extract/WATExtractorOutput.java b/src/main/java/org/archive/extract/WATExtractorOutput.java index 79cb0870..bb179fd1 100644 --- a/src/main/java/org/archive/extract/WATExtractorOutput.java +++ b/src/main/java/org/archive/extract/WATExtractorOutput.java @@ -1,12 +1,10 @@ package org.archive.extract; -import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; import java.io.OutputStream; import java.io.OutputStreamWriter; -import java.nio.charset.Charset; import java.text.ParseException; import java.net.UnknownHostException; import java.util.Date; @@ -31,13 +29,14 @@ import java.util.logging.Logger; +import static java.nio.charset.StandardCharsets.UTF_8; + public class WATExtractorOutput implements ExtractorOutput { WARCRecordWriter recW; private boolean wroteFirst; private GZIPMemberWriter gzW; private static int DEFAULT_BUFFER_RAM = 1024 * 1024; private int bufferRAM = DEFAULT_BUFFER_RAM; - private final static Charset UTF8 = Charset.forName("UTF-8"); private String outputFile; private static final Logger LOG = Logger.getLogger(WATExtractorOutput.class.getName()); @@ -169,7 +168,7 @@ private void writeWARCMDRecord(OutputStream recOut, MetaData md, ByteArrayOutputStream bos = new ByteArrayOutputStream(); - OutputStreamWriter osw = new OutputStreamWriter(bos, UTF8); + OutputStreamWriter osw = new OutputStreamWriter(bos, UTF_8); try { md.write(osw); } catch (JSONException e1) { diff --git a/src/main/java/org/archive/format/arc/ARCConstants.java b/src/main/java/org/archive/format/arc/ARCConstants.java index 5987b49f..39dbf7ed 100755 --- a/src/main/java/org/archive/format/arc/ARCConstants.java +++ b/src/main/java/org/archive/format/arc/ARCConstants.java @@ -1,6 +1,7 @@ package org.archive.format.arc; import java.nio.charset.Charset; +import 
java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.List; import java.util.zip.Deflater; @@ -16,7 +17,7 @@ */ public interface ARCConstants extends ArchiveFileConstants { public final static int MAX_META_LENGTH = 1024 * 32; - public final static Charset ARC_META_CHARSET = Charset.forName("utf-8"); + public final static Charset ARC_META_CHARSET = StandardCharsets.UTF_8; public final static int NEW_LINE_ORD = 10; public static final int CARRIAGE_RETURN_ORD = 13; public final static String DELIMITER = " "; diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumWriter.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumWriter.java index a104244a..c0e4e01d 100644 --- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumWriter.java +++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumWriter.java @@ -3,18 +3,18 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.OutputStream; -import java.nio.charset.Charset; import org.archive.format.gzip.GZIPMemberWriter; import org.archive.format.gzip.GZIPMemberWriterCommittedOutputStream; +import static java.nio.charset.StandardCharsets.UTF_8; + public class ZipNumWriter extends GZIPMemberWriterCommittedOutputStream { int limit; int count; OutputStream manifestOut; ByteArrayOutputStream manifestBuffer; char delimiter = '\t'; - private static final Charset UTF8 = Charset.forName("utf-8"); public ZipNumWriter(OutputStream main, OutputStream manifest, int limit) { super(new GZIPMemberWriter(main)); manifestOut = manifest; @@ -51,7 +51,7 @@ private void finishCurrent() throws IOException { sb.append(delimiter); sb.append(len); sb.append(delimiter); - manifestOut.write(sb.toString().getBytes(UTF8)); + manifestOut.write(sb.toString().getBytes(UTF_8)); manifestBuffer.writeTo(manifestOut); manifestOut.flush(); count = 0; diff --git a/src/main/java/org/archive/format/http/DumpingHTTPParseObserver.java 
b/src/main/java/org/archive/format/http/DumpingHTTPParseObserver.java index 11cd9276..f1ac16c6 100755 --- a/src/main/java/org/archive/format/http/DumpingHTTPParseObserver.java +++ b/src/main/java/org/archive/format/http/DumpingHTTPParseObserver.java @@ -1,11 +1,10 @@ package org.archive.format.http; import java.io.PrintStream; -import java.nio.charset.Charset; import java.util.Locale; + public class DumpingHTTPParseObserver implements HttpHeaderObserver { - private static final Charset UTF8 = Charset.forName("UTF-8"); private PrintStream ps = null; public DumpingHTTPParseObserver() { ps = System.out; diff --git a/src/main/java/org/archive/format/http/HttpConstants.java b/src/main/java/org/archive/format/http/HttpConstants.java index fa0a7e10..8ae4d4db 100755 --- a/src/main/java/org/archive/format/http/HttpConstants.java +++ b/src/main/java/org/archive/format/http/HttpConstants.java @@ -1,9 +1,10 @@ package org.archive.format.http; import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; public interface HttpConstants { - public static final Charset UTF8 = Charset.forName("UTF-8"); + public static final Charset UTF8 = StandardCharsets.UTF_8; public static final byte CR = 13; public static final byte LF = 10; public static final byte SP = 32; diff --git a/src/main/java/org/archive/url/BasicURLCanonicalizer.java b/src/main/java/org/archive/url/BasicURLCanonicalizer.java index 632d1ea7..dd0d9ac7 100644 --- a/src/main/java/org/archive/url/BasicURLCanonicalizer.java +++ b/src/main/java/org/archive/url/BasicURLCanonicalizer.java @@ -6,6 +6,7 @@ import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.nio.charset.CoderResult; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Locale; import java.util.regex.Matcher; @@ -204,12 +205,9 @@ public String minimalEscape(String input) { return escapeOnce(unescapeRepeatedly(input)); } - protected static Charset _UTF8 = null; + protected static 
Charset _UTF8 = StandardCharsets.UTF_8; protected static Charset UTF8() { - if (_UTF8 == null) { - _UTF8 = Charset.forName("UTF-8"); - } return _UTF8; } diff --git a/src/main/java/org/archive/url/SURT.java b/src/main/java/org/archive/url/SURT.java index 3e0bcd55..9598f458 100644 --- a/src/main/java/org/archive/url/SURT.java +++ b/src/main/java/org/archive/url/SURT.java @@ -2,7 +2,7 @@ import java.io.BufferedReader; import java.io.InputStreamReader; -import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.util.Iterator; import java.util.logging.Logger; @@ -33,7 +33,7 @@ public static String toSURT(String input) { } public static void main(String[] args) { String line; - InputStreamReader isr = new InputStreamReader(System.in,Charset.forName("UTF-8")); + InputStreamReader isr = new InputStreamReader(System.in, StandardCharsets.UTF_8); BufferedReader br = new BufferedReader(isr); Iterator i = AbstractPeekableIterator.wrapReader(br); while(i.hasNext()) { diff --git a/src/main/java/org/archive/util/IAUtils.java b/src/main/java/org/archive/util/IAUtils.java index b0c448f0..1d15256e 100644 --- a/src/main/java/org/archive/util/IAUtils.java +++ b/src/main/java/org/archive/util/IAUtils.java @@ -37,7 +37,7 @@ * @author gojomo & others */ public class IAUtils { - public final static Charset UTF8 = Charset.forName("utf-8"); + public final static Charset UTF8 = UTF_8; final public static String COMMONS_VERSION = loadCommonsVersion(); final public static String PUBLISHER = loadCommons("publisher"); diff --git a/src/main/java/org/archive/util/binsearch/AbstractSeekableLineReader.java b/src/main/java/org/archive/util/binsearch/AbstractSeekableLineReader.java index de57278e..17d411fa 100644 --- a/src/main/java/org/archive/util/binsearch/AbstractSeekableLineReader.java +++ b/src/main/java/org/archive/util/binsearch/AbstractSeekableLineReader.java @@ -7,13 +7,14 @@ import java.io.InputStream; import java.io.InputStreamReader; import 
java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import org.archive.util.zip.GZIPMembersInputStream; import com.google.common.io.ByteStreams; public abstract class AbstractSeekableLineReader implements SeekableLineReader { - public final static Charset UTF8 = Charset.forName("UTF-8"); + public final static Charset UTF8 = StandardCharsets.UTF_8; protected int blockSize = 128 * 1024; From ed0070b7f6486fe48df0c00b03a9385fbd608fe5 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Tue, 11 Nov 2025 23:29:57 +0100 Subject: [PATCH 08/16] Replace FileReader and FileWriter with classes that allow the charset to be configured. Use the default charset in main methods when reading from stdin or writing to stdout. --- .../org/archive/format/gzip/zipnum/ZipNumCluster.java | 6 +++--- src/main/java/org/archive/io/ArchiveReader.java | 7 +++++-- src/main/java/org/archive/net/PublicSuffixes.java | 9 ++++++--- src/main/java/org/archive/util/Grep.java | 11 +++++++---- src/main/java/org/archive/util/SURT.java | 3 ++- .../archive/util/binsearch/SeekCDXBenchmarker.java | 3 ++- 6 files changed, 25 insertions(+), 14 deletions(-) diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java index edf5857c..0a3fa1bf 100644 --- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java +++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java @@ -11,9 +11,9 @@ */ import java.io.BufferedReader; -import java.io.FileReader; +import java.io.FileInputStream; import java.io.IOException; -import java.nio.charset.StandardCharsets; +import java.io.InputStreamReader; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; @@ -370,7 +370,7 @@ protected void loadLastBlockSizes(String filename) totalAdjustment = 0; try { - reader = new BufferedReader(new FileReader(filename)); + reader = new BufferedReader(new InputStreamReader(new FileInputStream(filename), UTF_8)); while
((line = reader.readLine()) != null) { String[] splits = line.split("\t"); diff --git a/src/main/java/org/archive/io/ArchiveReader.java b/src/main/java/org/archive/io/ArchiveReader.java index 53b8167b..070455a5 100644 --- a/src/main/java/org/archive/io/ArchiveReader.java +++ b/src/main/java/org/archive/io/ArchiveReader.java @@ -26,9 +26,10 @@ import java.io.EOFException; import java.io.File; import java.io.FileInputStream; -import java.io.FileWriter; +import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; +import java.io.OutputStreamWriter; import java.util.ArrayList; import java.util.Iterator; import java.util.List; @@ -45,6 +46,8 @@ import static org.archive.format.ArchiveFileConstants.*; +import static java.nio.charset.StandardCharsets.UTF_8; + /** * Reader for an Archive file of Archive {@link ArchiveRecord}s. @@ -660,7 +663,7 @@ protected void cdxOutput(boolean toFile) DOT_COMPRESSED_FILE_EXTENSION); cdxFilename = stripExtension(cdxFilename, getDotFileExtension()); cdxFilename += ('.' + CDX); - cdxWriter = new BufferedWriter(new FileWriter(cdxFilename)); + cdxWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(cdxFilename), UTF_8)); } String header = "CDX b e a m s c " + ((isCompressed()) ? 
"V" : "v") diff --git a/src/main/java/org/archive/net/PublicSuffixes.java b/src/main/java/org/archive/net/PublicSuffixes.java index a2a2bfb2..5b3219d5 100644 --- a/src/main/java/org/archive/net/PublicSuffixes.java +++ b/src/main/java/org/archive/net/PublicSuffixes.java @@ -22,13 +22,14 @@ import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileInputStream; -import java.io.FileWriter; +import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.io.UnsupportedEncodingException; +import java.nio.charset.Charset; import java.util.ArrayList; import java.util.List; import java.util.Locale; @@ -38,6 +39,8 @@ import org.apache.commons.io.IOUtils; import org.archive.util.TextUtils; +import static java.nio.charset.StandardCharsets.UTF_8; + /** * Utility class for making use of the information about 'public suffixes' at * http://publicsuffix.org. 
@@ -198,11 +201,11 @@ public static void main(String args[]) throws IOException { BufferedWriter writer; if (args.length >= 2) { // write to specified file - writer = new BufferedWriter(new FileWriter(args[1])); + writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(args[1]), UTF_8)); needsClose = true; } else { // write to stdout - writer = new BufferedWriter(new OutputStreamWriter(System.out)); + writer = new BufferedWriter(new OutputStreamWriter(System.out, Charset.defaultCharset())); } writer.append(regex); writer.flush(); diff --git a/src/main/java/org/archive/util/Grep.java b/src/main/java/org/archive/util/Grep.java index e446e47e..892429bd 100644 --- a/src/main/java/org/archive/util/Grep.java +++ b/src/main/java/org/archive/util/Grep.java @@ -1,10 +1,13 @@ package org.archive.util; +import static java.nio.charset.StandardCharsets.UTF_8; + import java.io.BufferedReader; -import java.io.FileReader; +import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.PrintStream; +import java.nio.charset.Charset; import java.util.LinkedList; import java.util.List; import java.util.regex.Matcher; @@ -119,14 +122,14 @@ protected void doTheGrepThing() throws Exception { if (files != null) { if (files.size() == 1) { - grep(new BufferedReader(new FileReader(files.get(0))), ""); + grep(new BufferedReader(new InputStreamReader(new FileInputStream(files.get(0)), UTF_8)), ""); } else { for (String path : files) { - grep(new BufferedReader(new FileReader(path)), path + ": "); + grep(new BufferedReader(new InputStreamReader(new FileInputStream(path), UTF_8)), path + ": "); } } } else { - grep(new BufferedReader(new InputStreamReader(System.in)), ""); + grep(new BufferedReader(new InputStreamReader(System.in, Charset.defaultCharset())), ""); } } diff --git a/src/main/java/org/archive/util/SURT.java b/src/main/java/org/archive/util/SURT.java index c52582e1..99347e9f 100644 --- 
a/src/main/java/org/archive/util/SURT.java +++ b/src/main/java/org/archive/util/SURT.java @@ -27,6 +27,7 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.io.PrintStream; +import java.nio.charset.Charset; import java.util.regex.Matcher; import org.archive.url.URIException; @@ -243,7 +244,7 @@ public static void main(String[] args) throws IOException { new BufferedOutputStream(new FileOutputStream(args[1])), false, UTF_8.name()) : System.out; BufferedReader br = - new BufferedReader(new InputStreamReader(in)); + new BufferedReader(new InputStreamReader(in, Charset.defaultCharset())); String line; while((line = br.readLine())!=null) { if(line.indexOf("#")>0) line=line.substring(0,line.indexOf("#")); diff --git a/src/main/java/org/archive/util/binsearch/SeekCDXBenchmarker.java b/src/main/java/org/archive/util/binsearch/SeekCDXBenchmarker.java index 76b7b2b9..45c2ee04 100644 --- a/src/main/java/org/archive/util/binsearch/SeekCDXBenchmarker.java +++ b/src/main/java/org/archive/util/binsearch/SeekCDXBenchmarker.java @@ -3,6 +3,7 @@ import java.io.File; import java.io.IOException; import java.io.InputStreamReader; +import java.nio.charset.Charset; import org.archive.url.WaybackURLKeyMaker; import org.archive.util.binsearch.impl.MappedSeekableLineReaderFactory; @@ -52,7 +53,7 @@ public static void main(String[] args) throws IOException { SortedTextFile sorted = new SortedTextFile(factory); sorted.setBinsearchBlockSize(blocksize); - BufferedReader reader = new BufferedReader(new InputStreamReader(System.in)); + BufferedReader reader = new BufferedReader(new InputStreamReader(System.in, Charset.defaultCharset())); WaybackURLKeyMaker keymaker = new WaybackURLKeyMaker(true); From e3c06efb091377fd0474edd8eb18e0e67b80c3b3 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 12 Nov 2025 12:51:15 +0100 Subject: [PATCH 09/16] Unit tests: pass charset to all occurrences of String.getBytes() --- .../archive/io/HeaderedArchiveRecordTest.java | 10 +++-- 
.../archive/io/RecordingInputStreamTest.java | 8 ++-- .../archive/io/RecordingOutputStreamTest.java | 40 ++++++++++--------- .../archive/io/ReplayCharSequenceTest.java | 6 ++- .../org/archive/io/arc/ARCWriterPoolTest.java | 8 ++-- .../org/archive/io/arc/ARCWriterTest.java | 12 +++--- .../org/archive/io/warc/WARCWriterTest.java | 8 ++-- 7 files changed, 53 insertions(+), 39 deletions(-) diff --git a/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java b/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java index 005e2c49..65027395 100644 --- a/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java +++ b/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java @@ -31,6 +31,8 @@ import org.archive.io.warc.WARCRecord; import org.junit.jupiter.api.Test; +import static java.nio.charset.StandardCharsets.UTF_8; + import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -68,7 +70,7 @@ public void testParseHttpHeadersInWARC() throws IOException { final String hdr = warcHeader + HTTPHEADER + BODY; - WARCRecord r = new WARCRecord(new ByteArrayInputStream(hdr.getBytes()), + WARCRecord r = new WARCRecord(new ByteArrayInputStream(hdr.getBytes(UTF_8)), "READER_IDENTIFIER", 0, false, true); HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true); @@ -156,7 +158,7 @@ public String getVersion() { } }; - ARCRecord r = new ARCRecord(new ByteArrayInputStream(hdr.getBytes()), + ARCRecord r = new ARCRecord(new ByteArrayInputStream(hdr.getBytes(UTF_8)), arh, 0, false, true, false); HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true); @@ -175,7 +177,7 @@ public void testEasierParseHttpHeadersInARC() throws IOException { + " 192.168.0.1 20070515111004 text/html 167568\n"; final String hdr = arcHeader + HTTPHEADER + BODY; - ARCRecord r = new ARCRecord(new ByteArrayInputStream(hdr.getBytes()), + ARCRecord r = new ARCRecord(new ByteArrayInputStream(hdr.getBytes(UTF_8)), "READER_IDENTIFIER", 
0, false, true, false); HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true); @@ -205,7 +207,7 @@ public void testNoheaderWARC() throws IOException { String c = "WARC/0.12\r\nContent-Type: text/plain\r\n" + "Content-Length: " + b.length() + "\r\n\r\n" + b; org.archive.io.warc.WARCRecord r = new org.archive.io.warc.WARCRecord( - new ByteArrayInputStream(c.getBytes()), "READER_IDENTIFIER", 0, + new ByteArrayInputStream(c.getBytes(UTF_8)), "READER_IDENTIFIER", 0, false, true); HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true); assertTrue(har.isStrict()); diff --git a/src/test/java/org/archive/io/RecordingInputStreamTest.java b/src/test/java/org/archive/io/RecordingInputStreamTest.java index 49160aa3..8ccee986 100644 --- a/src/test/java/org/archive/io/RecordingInputStreamTest.java +++ b/src/test/java/org/archive/io/RecordingInputStreamTest.java @@ -28,6 +28,8 @@ import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; +import static java.nio.charset.StandardCharsets.UTF_8; + import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -55,7 +57,7 @@ public void testReadFullyOrUntil() throws RecorderTimeoutException, IOException, RecordingInputStream ris = new RecordingInputStream(16384, (new File( tempDir, "testReadFullyOrUntil").getAbsolutePath())); ByteArrayInputStream bais = new ByteArrayInputStream( - "abcdefghijklmnopqrstuvwxyz".getBytes()); + "abcdefghijklmnopqrstuvwxyz".getBytes(UTF_8)); // test soft max ris.open(bais); ris.setLimits(10,0,0); @@ -87,7 +89,7 @@ public void testReadFullyOrUntil() throws RecorderTimeoutException, IOException, PipedOutputStream pout = new PipedOutputStream(pin); ris.open(pin); exceptionThrown = false; - trickle("abcdefghijklmnopqrstuvwxyz".getBytes(),pout); + trickle("abcdefghijklmnopqrstuvwxyz".getBytes(UTF_8),pout); int timeout = 200; try { ris.setLimits(0, timeout,0); @@ -133,7 +135,7 @@ public void testAsOutputStream() throws 
IOException { RecordingInputStream ris = new RecordingInputStream(16384, (new File( tempDir, "testAsOutputStream").getAbsolutePath())); ris.open(null); - ris.asOutputStream().write("hello".getBytes()); + ris.asOutputStream().write("hello".getBytes(UTF_8)); ris.close(); ByteArrayOutputStream baos = new ByteArrayOutputStream(); ris.getReplayInputStream().readFullyTo(baos); diff --git a/src/test/java/org/archive/io/RecordingOutputStreamTest.java b/src/test/java/org/archive/io/RecordingOutputStreamTest.java index c94f8245..0dba910e 100644 --- a/src/test/java/org/archive/io/RecordingOutputStreamTest.java +++ b/src/test/java/org/archive/io/RecordingOutputStreamTest.java @@ -28,6 +28,8 @@ import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; +import static java.nio.charset.StandardCharsets.UTF_8; + import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; @@ -266,61 +268,61 @@ public void testMessageBodyBegin() throws IOException { ros.setSha1Digest(); ros.open(new ByteArrayOutputStream()); - ros.write("0123456789\n\nabcdefghij".getBytes()); + ros.write("0123456789\n\nabcdefghij".getBytes(UTF_8)); assertEquals(12, ros.getMessageBodyBegin()); assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue())); ros.close(); ros.open(new ByteArrayOutputStream()); - ros.write("0123456789\r\n\r\nabcdefghij".getBytes()); + ros.write("0123456789\r\n\r\nabcdefghij".getBytes(UTF_8)); assertEquals(14, ros.getMessageBodyBegin()); assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue())); ros.close(); ros.open(new ByteArrayOutputStream()); - ros.write("0123456789\n\r\nabcdefghij".getBytes()); + ros.write("0123456789\n\r\nabcdefghij".getBytes(UTF_8)); assertEquals(13, ros.getMessageBodyBegin()); assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue())); ros.close(); ros.open(new ByteArrayOutputStream()); - 
ros.write("0123456789\n".getBytes()); + ros.write("0123456789\n".getBytes(UTF_8)); assertEquals(-1, ros.getMessageBodyBegin()); - ros.write("\nabcdefghij".getBytes()); + ros.write("\nabcdefghij".getBytes(UTF_8)); assertEquals(12, ros.getMessageBodyBegin()); assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue())); ros.close(); ros.open(new ByteArrayOutputStream()); - ros.write("0123456789\n".getBytes()); + ros.write("0123456789\n".getBytes(UTF_8)); assertEquals(-1, ros.getMessageBodyBegin()); - ros.write("\r\nabcdefghij".getBytes()); + ros.write("\r\nabcdefghij".getBytes(UTF_8)); assertEquals(13, ros.getMessageBodyBegin()); assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue())); ros.close(); ros.open(new ByteArrayOutputStream()); - ros.write("0123456789\n\r".getBytes()); + ros.write("0123456789\n\r".getBytes(UTF_8)); assertEquals(-1, ros.getMessageBodyBegin()); - ros.write("\nabcdefghij".getBytes()); + ros.write("\nabcdefghij".getBytes(UTF_8)); assertEquals(13, ros.getMessageBodyBegin()); assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue())); ros.close(); ros.open(new ByteArrayOutputStream()); - ros.write("0123456789".getBytes()); + ros.write("0123456789".getBytes(UTF_8)); ros.write('\n'); assertEquals(-1, ros.getMessageBodyBegin()); - ros.write("\nabcdefghij".getBytes()); + ros.write("\nabcdefghij".getBytes(UTF_8)); assertEquals(12, ros.getMessageBodyBegin()); assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue())); ros.close(); ros.open(new ByteArrayOutputStream()); - ros.write("0123456789".getBytes()); + ros.write("0123456789".getBytes(UTF_8)); ros.write('\n'); ros.write('\n'); - for (int b: "abcdefghij".getBytes()) { + for (int b: "abcdefghij".getBytes(UTF_8)) { ros.write(b); } assertEquals(12, ros.getMessageBodyBegin()); @@ -328,11 +330,11 @@ public void testMessageBodyBegin() throws IOException { ros.close(); ros.open(new 
ByteArrayOutputStream()); - ros.write("0123456789".getBytes()); + ros.write("0123456789".getBytes(UTF_8)); ros.write('\n'); ros.write('\r'); ros.write('\n'); - for (int b: "abcdefghij".getBytes()) { + for (int b: "abcdefghij".getBytes(UTF_8)) { ros.write(b); } assertEquals(13, ros.getMessageBodyBegin()); @@ -340,17 +342,17 @@ public void testMessageBodyBegin() throws IOException { ros.close(); ros.open(new ByteArrayOutputStream()); - ros.write("0123456789\n".getBytes()); + ros.write("0123456789\n".getBytes(UTF_8)); ros.write('\n'); - ros.write("abcdefghij".getBytes()); + ros.write("abcdefghij".getBytes(UTF_8)); assertEquals(12, ros.getMessageBodyBegin()); assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue())); ros.close(); ros.open(new ByteArrayOutputStream()); - ros.write("0123456789\n\r".getBytes()); + ros.write("0123456789\n\r".getBytes(UTF_8)); ros.write('\n'); - ros.write("abcdefghij".getBytes()); + ros.write("abcdefghij".getBytes(UTF_8)); assertEquals(13, ros.getMessageBodyBegin()); assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue())); ros.close(); diff --git a/src/test/java/org/archive/io/ReplayCharSequenceTest.java b/src/test/java/org/archive/io/ReplayCharSequenceTest.java index 3234259c..f0b688a9 100644 --- a/src/test/java/org/archive/io/ReplayCharSequenceTest.java +++ b/src/test/java/org/archive/io/ReplayCharSequenceTest.java @@ -36,6 +36,8 @@ import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; +import static java.nio.charset.StandardCharsets.UTF_8; + import static org.junit.jupiter.api.Assertions.*; /** @@ -143,7 +145,7 @@ public void testGetReplayCharSequenceMultiByteZeroOffset() @Test public void testReplayCharSequenceByteToString() throws IOException { String fileContent = "Some file content"; - byte [] buffer = fileContent.getBytes(); + byte [] buffer = fileContent.getBytes(UTF_8); RecordingOutputStream ros = writeTestStream( buffer,1, 
"testReplayCharSequenceByteToString.txt",0); @@ -207,7 +209,7 @@ public void testSingleByteEncodings() throws IOException { @Test public void testReplayCharSequenceByteToStringOverflow() throws IOException { String fileContent = "Some file content. "; // ascii - byte [] buffer = fileContent.getBytes(); + byte [] buffer = fileContent.getBytes(UTF_8); RecordingOutputStream ros = writeTestStream( buffer,1, "testReplayCharSequenceByteToStringOverflow.txt",1); diff --git a/src/test/java/org/archive/io/arc/ARCWriterPoolTest.java b/src/test/java/org/archive/io/arc/ARCWriterPoolTest.java index 954da636..f6820337 100644 --- a/src/test/java/org/archive/io/arc/ARCWriterPoolTest.java +++ b/src/test/java/org/archive/io/arc/ARCWriterPoolTest.java @@ -30,6 +30,8 @@ import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; +import static java.nio.charset.StandardCharsets.UTF_8; + import static org.junit.jupiter.api.Assertions.assertEquals; import static org.archive.format.arc.ARCConstants.*; @@ -51,7 +53,7 @@ public void testARCWriterPool() WriterPoolMember [] writers = new WriterPoolMember[MAX_ACTIVE]; final String CONTENT = "Any old content"; ByteArrayOutputStream baos = new ByteArrayOutputStream(); - baos.write(CONTENT.getBytes()); + baos.write(CONTENT.getBytes(UTF_8)); for (int i = 0; i < MAX_ACTIVE; i++) { writers[i] = pool.borrowFile(); assertEquals(i + 1, pool.getNumActive(), "Number active"); @@ -81,7 +83,7 @@ public void testInvalidate() throws Exception { WriterPoolMember [] writers = new WriterPoolMember[MAX_ACTIVE]; final String CONTENT = "Any old content"; ByteArrayOutputStream baos = new ByteArrayOutputStream(); - baos.write(CONTENT.getBytes()); + baos.write(CONTENT.getBytes(UTF_8)); for (int i = 0; i < MAX_ACTIVE; i++) { writers[i] = pool.borrowFile(); assertEquals(i + 1, pool.getNumActive(), "Number active"); @@ -124,4 +126,4 @@ private WriterPoolSettings getSettings(final boolean isCompressed) { Arrays.asList(files), null); } -} \ No newline at 
end of file +} diff --git a/src/test/java/org/archive/io/arc/ARCWriterTest.java b/src/test/java/org/archive/io/arc/ARCWriterTest.java index ca300697..8b2f7d64 100644 --- a/src/test/java/org/archive/io/arc/ARCWriterTest.java +++ b/src/test/java/org/archive/io/arc/ARCWriterTest.java @@ -47,6 +47,8 @@ import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; +import static java.nio.charset.StandardCharsets.UTF_8; + import static org.junit.jupiter.api.Assertions.*; import static org.archive.format.arc.ARCConstants.*; @@ -122,11 +124,11 @@ protected int writeRandomHTTPRecord(ARCWriter arcWriter, int index) // Start the record with an arbitrary 14-digit date per RFC2540 String now = ArchiveUtils.get14DigitDate(); int recordLength = 0; - byte[] record = (getContent(indexStr)).getBytes(); + byte[] record = (getContent(indexStr)).getBytes(UTF_8); recordLength += record.length; baos.write(record); // Add the newline between records back in - baos.write("\n".getBytes()); + baos.write("\n".getBytes(UTF_8)); recordLength += 1; arcWriter.write("http://www.one.net/id=" + indexStr, "text/html", "0.1.2.3", Long.parseLong(now), recordLength, baos); @@ -305,7 +307,7 @@ protected CorruptibleARCWriter createARCWriter(String name, boolean compress) { protected static ByteArrayInputStream getBais(String str) throws IOException { - return new ByteArrayInputStream(str.getBytes()); + return new ByteArrayInputStream(str.getBytes(UTF_8)); } /** @@ -417,7 +419,7 @@ protected void lengthTooShort(String name, boolean compress, boolean strict) ByteArrayInputStream bais = getBais(content+"SOME TRAILING BYTES"); writeRecord(writer, SOME_URL, "text/html", content.length(), bais); - writer.setEndJunk("SOME TRAILING BYTES".getBytes()); + writer.setEndJunk("SOME TRAILING BYTES".getBytes(UTF_8)); writeRecord(writer, SOME_URL, "text/html", content.length(), getBais(content)); } finally { @@ -518,7 +520,7 @@ public void testGapError() throws IOException { String content = getContent(); 
// Make a 'weird' RIS that returns bad 'remaining' length // awhen remaining should be 0 - ReplayInputStream ris = new ReplayInputStream(content.getBytes(), + ReplayInputStream ris = new ReplayInputStream(content.getBytes(UTF_8), content.length(), null) { public long remaining() { return (super.remaining()==0) ? -1 : super.remaining(); diff --git a/src/test/java/org/archive/io/warc/WARCWriterTest.java b/src/test/java/org/archive/io/warc/WARCWriterTest.java index c0ace5f0..d2684fa4 100644 --- a/src/test/java/org/archive/io/warc/WARCWriterTest.java +++ b/src/test/java/org/archive/io/warc/WARCWriterTest.java @@ -42,6 +42,8 @@ import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; +import static java.nio.charset.StandardCharsets.UTF_8; + import static org.junit.jupiter.api.Assertions.*; import static org.archive.format.warc.WARCConstants.*; @@ -228,7 +230,7 @@ protected int writeRandomHTTPRecord(WARCWriter w, int index) String indexStr = Integer.toString(index); recordInfo.setUrl("http://www.one.net/id=" + indexStr); - byte[] record = (getContent(indexStr)).getBytes(); + byte[] record = (getContent(indexStr)).getBytes(UTF_8); recordInfo.setContentLength((long) record.length); ByteArrayOutputStream baos = new ByteArrayOutputStream(); @@ -385,7 +387,7 @@ protected WARCWriter createWARCWriter(String name, protected static ByteArrayOutputStream getBaos(String str) throws IOException { ByteArrayOutputStream baos = new ByteArrayOutputStream(); - baos.write(str.getBytes()); + baos.write(str.getBytes(UTF_8)); return baos; } @@ -524,4 +526,4 @@ public void testArcRecordOffsetReads() throws Exception { assertTrue(totalRead > 0); } } -} \ No newline at end of file +} From 6b0f0f29f8193118396d1cd693dc1a086c63d755 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 12 Nov 2025 13:20:24 +0100 Subject: [PATCH 10/16] Unit tests: add Locale.ROOT as parameter to all occurrences of PrintStream.format(...) and number formatters. Unify usage charset constants. 
--- .../format/gzip/zipnum/ZipNumWriterTest.java | 10 ++++---- .../org/archive/format/json/JSONViewTest.java | 8 ++++--- .../format/text/html/CDATALexerTest.java | 4 +++- .../archive/io/HeaderedArchiveRecordTest.java | 6 ++--- .../archive/io/RecordingInputStreamTest.java | 9 ++++---- .../archive/io/ReplayCharSequenceTest.java | 23 ++++++++++--------- .../io/RepositionableInputStreamTest.java | 4 +++- .../html/ExtractingParseObserverTest.java | 3 ++- .../resource/html/HTMLMetaDataTest.java | 4 +++- .../url/BasicURLCanonicalizerTest.java | 5 ++-- .../java/org/archive/url/URLParserTest.java | 10 +++++--- .../archive/url/URLRegexTransformerTest.java | 4 +++- .../java/org/archive/util/ByteOpTest.java | 5 ++-- .../org/archive/util/CrossProductTest.java | 8 ++++++- src/test/java/org/archive/util/TestUtils.java | 5 ++-- .../util/binsearch/SortedTextFileTest.java | 5 +++- 16 files changed, 72 insertions(+), 41 deletions(-) diff --git a/src/test/java/org/archive/format/gzip/zipnum/ZipNumWriterTest.java b/src/test/java/org/archive/format/gzip/zipnum/ZipNumWriterTest.java index 25a5eaa7..13658bcb 100644 --- a/src/test/java/org/archive/format/gzip/zipnum/ZipNumWriterTest.java +++ b/src/test/java/org/archive/format/gzip/zipnum/ZipNumWriterTest.java @@ -10,7 +10,7 @@ import java.io.RandomAccessFile; import java.nio.ByteBuffer; import java.nio.channels.FileChannel; -import java.nio.charset.StandardCharsets; +import java.util.Locale; import org.archive.format.gzip.GZIPMemberSeries; import org.archive.format.gzip.GZIPSeriesMember; @@ -18,6 +18,8 @@ import org.junit.jupiter.api.Test; +import static java.nio.charset.StandardCharsets.UTF_8; + import static org.junit.jupiter.api.Assertions.assertEquals; public class ZipNumWriterTest { @@ -28,16 +30,16 @@ public void testAddRecord() throws IOException { File summ = File.createTempFile("test-znw",".summ"); main.deleteOnExit(); summ.deleteOnExit(); - System.out.format("Summ: %s\n", summ.getAbsolutePath()); + System.out.format(Locale.ROOT, 
"Summ: %s\n", summ.getAbsolutePath()); int limit = 10; ZipNumWriter znw = new ZipNumWriter(new FileOutputStream(main,false), new FileOutputStream(summ,false), limit); for(int i = 0; i < 1000; i++) { - znw.addRecord(String.format("%06d\n",i).getBytes(StandardCharsets.UTF_8)); + znw.addRecord(String.format(Locale.ROOT,"%06d\n",i).getBytes(UTF_8)); } znw.close(); InputStreamReader isr = - new InputStreamReader(new FileInputStream(summ), StandardCharsets.UTF_8); + new InputStreamReader(new FileInputStream(summ), UTF_8); BufferedReader br = new BufferedReader(isr); String line = null; int count = 0; diff --git a/src/test/java/org/archive/format/json/JSONViewTest.java b/src/test/java/org/archive/format/json/JSONViewTest.java index aabbe7df..6d199025 100644 --- a/src/test/java/org/archive/format/json/JSONViewTest.java +++ b/src/test/java/org/archive/format/json/JSONViewTest.java @@ -1,5 +1,7 @@ package org.archive.format.json; +import java.util.Locale; + import org.archive.util.TestUtils; import org.json.JSONException; import org.json.JSONObject; @@ -17,16 +19,16 @@ public void testBytes() throws JSONException { JSONObject o = new JSONObject(); o.append("name1", "val\\rue1"); String json = o.toString(); - System.out.format("once: (%s)\n",json); + System.out.format(Locale.ROOT, "once: (%s)\n", json); JSONObject o2 = new JSONObject(json); - System.out.format("twice: (%s)\n",o2.toString()); + System.out.format(Locale.ROOT, "twice: (%s)\n", o2.toString()); byte b[] = new byte[2]; for(int i = 0; i < 256; i++) { b[0] = (byte) i; int gi = getInt(b); - System.out.format("I(%d) gi(%d)\n",i,gi); + System.out.format(Locale.ROOT, "I(%d) gi(%d)\n", i, gi); } } diff --git a/src/test/java/org/archive/format/text/html/CDATALexerTest.java b/src/test/java/org/archive/format/text/html/CDATALexerTest.java index 856576ba..7c9f24f3 100644 --- a/src/test/java/org/archive/format/text/html/CDATALexerTest.java +++ b/src/test/java/org/archive/format/text/html/CDATALexerTest.java @@ -10,6 +10,8 @@ 
import static org.junit.jupiter.api.Assertions.*; +import java.util.Locale; + public class CDATALexerTest { CDATALexer l; Node n; @@ -102,7 +104,7 @@ public void testInJSComment() throws ParserException { } private void assertJSContentWorks(String js) throws ParserException { - String html = String.format("",js); + String html = String.format(Locale.ROOT,"",js); l = makeLexer(html); assertFalse(l.inCSS()); assertFalse(l.inJS()); diff --git a/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java b/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java index 65027395..5d31b890 100644 --- a/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java +++ b/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java @@ -78,7 +78,7 @@ public void testParseHttpHeadersInWARC() throws IOException { byte[] b = new byte[BODY.length()]; har.read(b); - String bodyRead = new String(b); + String bodyRead = new String(b, UTF_8); assertEquals(BODY, bodyRead); assertHeaderCorrectlyParsed(har.getContentHeaders()); assertEquals(har.getHeader().getUrl(), url, @@ -165,7 +165,7 @@ public String getVersion() { har.skipHttpHeader(); byte[] b = new byte[BODY.length()]; har.read(b); - String bodyRead = new String(b); + String bodyRead = new String(b, UTF_8); assertEquals(BODY, bodyRead); assertHeaderCorrectlyParsed(har.getContentHeaders()); } @@ -184,7 +184,7 @@ public void testEasierParseHttpHeadersInARC() throws IOException { har.skipHttpHeader(); byte[] b = new byte[BODY.length()]; har.read(b); - String bodyRead = new String(b); + String bodyRead = new String(b, UTF_8); assertEquals(BODY, bodyRead); assertHeaderCorrectlyParsed(har.getContentHeaders()); assertEquals(har.getHeader().getUrl(), url, "failed to retrieve Url from metadata"); diff --git a/src/test/java/org/archive/io/RecordingInputStreamTest.java b/src/test/java/org/archive/io/RecordingInputStreamTest.java index 8ccee986..d794d925 100644 --- a/src/test/java/org/archive/io/RecordingInputStreamTest.java +++ 
b/src/test/java/org/archive/io/RecordingInputStreamTest.java @@ -66,8 +66,9 @@ public void testReadFullyOrUntil() throws RecorderTimeoutException, IOException, ReplayInputStream res = ris.getReplayInputStream(); ByteArrayOutputStream baos = new ByteArrayOutputStream(); res.readFullyTo(baos); - assertEquals("abcdefg",new String(baos.toByteArray()),"soft max cutoff"); - // test hard max + assertEquals("abcdefg", new String(baos.toByteArray(), UTF_8), + "soft max cutoff"); + // test hard max bais.reset(); baos.reset(); ris.open(bais); @@ -82,8 +83,8 @@ public void testReadFullyOrUntil() throws RecorderTimeoutException, IOException, ris.close(); res = ris.getReplayInputStream(); res.readFullyTo(baos); - assertEquals("abcdefghijk",new String(baos.toByteArray()), - "hard max cutoff"); + assertEquals("abcdefghijk", new String(baos.toByteArray(), UTF_8), + "hard max cutoff"); // test timeout PipedInputStream pin = new PipedInputStream(); PipedOutputStream pout = new PipedOutputStream(pin); diff --git a/src/test/java/org/archive/io/ReplayCharSequenceTest.java b/src/test/java/org/archive/io/ReplayCharSequenceTest.java index f0b688a9..3935837b 100644 --- a/src/test/java/org/archive/io/ReplayCharSequenceTest.java +++ b/src/test/java/org/archive/io/ReplayCharSequenceTest.java @@ -25,17 +25,19 @@ import java.nio.charset.StandardCharsets; import java.text.NumberFormat; import java.util.Date; +import java.util.Locale; import java.util.Random; import java.util.logging.Logger; import org.archive.util.FileUtils; -import com.google.common.base.Charsets; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; +import static java.nio.charset.StandardCharsets.US_ASCII; +import static java.nio.charset.StandardCharsets.ISO_8859_1; import static java.nio.charset.StandardCharsets.UTF_8; import static org.junit.jupiter.api.Assertions.*; @@ -135,7 +137,7 @@ public void 
testGetReplayCharSequenceMultiByteZeroOffset() RecordingOutputStream ros = writeTestStream( regularBuffer,MULTIPLIER, "testGetReplayCharSequenceMultiByteZeroOffset",MULTIPLIER); - ReplayCharSequence rcs = getReplayCharSequence(ros,Charsets.UTF_8); + ReplayCharSequence rcs = getReplayCharSequence(ros, UTF_8); for (int i = 0; i < MULTIPLIER; i++) { accessingCharacters(rcs); @@ -181,7 +183,7 @@ public void testSingleByteEncodings() throws IOException { String latin1String = new String(bytes, "latin1"); RecordingOutputStream ros = writeTestStream( bytes, 1, "testSingleByteEncodings-latin1.txt", 0); - ReplayCharSequence rcs = getReplayCharSequence(ros,Charsets.ISO_8859_1); + ReplayCharSequence rcs = getReplayCharSequence(ros, ISO_8859_1); String result = rcs.toString(); logger.fine("latin1[0] " + toHexString(latin1String)); logger.fine("latin1[1] " + toHexString(result)); @@ -219,8 +221,8 @@ public void testReplayCharSequenceByteToStringOverflow() throws IOException { // both encodings because they exercise different code paths. UTF-8 is // decoded to UTF-16 while windows-1252 is memory mapped directly. 
See // GenericReplayCharSequence - ReplayCharSequence rcsUtf8 = getReplayCharSequence(ros,Charsets.UTF_8); - ReplayCharSequence rcs1252 = getReplayCharSequence(ros,Charset.forName("windows-1252")); + ReplayCharSequence rcsUtf8 = getReplayCharSequence(ros, UTF_8); + ReplayCharSequence rcs1252 = getReplayCharSequence(ros, Charset.forName("windows-1252")); String result = rcsUtf8.toString(); assertEquals(expectedContent, result, "Strings don't match"); @@ -244,7 +246,7 @@ public void testReplayCharSequenceByteToStringMulti() throws IOException { buffer,1, "testReplayCharSequenceByteToStringMulti.txt",MULTIPLICAND-1); for (int i = 0; i < 3; i++) { - ReplayCharSequence rcs = getReplayCharSequence(ros,StandardCharsets.UTF_8); + ReplayCharSequence rcs = getReplayCharSequence(ros, UTF_8); String result = rcs.toString(); assertEquals(result, expectedResult, "Strings don't match"); rcs.close(); @@ -257,8 +259,7 @@ public void testReplayCharSequenceByteToStringMulti() throws IOException { @Disabled public void xestHugeReplayCharSequence() throws IOException { String fileContent = "01234567890123456789"; - String characterEncoding = "ascii"; - byte[] buffer = fileContent.getBytes(characterEncoding); + byte[] buffer = fileContent.getBytes(US_ASCII); long reps = (long) Integer.MAX_VALUE / (long) buffer.length + 1000000l; @@ -266,7 +267,7 @@ public void xestHugeReplayCharSequence() throws IOException { + " bytes to testHugeReplayCharSequence.txt"); RecordingOutputStream ros = writeTestStream(buffer, 0, "testHugeReplayCharSequence.txt", reps); - ReplayCharSequence rcs = getReplayCharSequence(ros,Charset.forName(characterEncoding)); + ReplayCharSequence rcs = getReplayCharSequence(ros, US_ASCII); if (reps * fileContent.length() > (long) Integer.MAX_VALUE) { assertEquals(Integer.MAX_VALUE, rcs.length(), "ReplayCharSequence has wrong length (length()=" @@ -285,7 +286,7 @@ public void xestHugeReplayCharSequence() throws IOException { // NumberFormat.getInstance().format(index)); 
assertEquals(fileContent.charAt(index % fileContent.length()), rcs.charAt(index), "Characters don't match (index=" - + NumberFormat.getInstance().format(index) + ")"); + + NumberFormat.getInstance(Locale.ROOT).format(index) + ")"); } // check that out of bounds indices throw exception @@ -309,7 +310,7 @@ public void xestHugeReplayCharSequence() throws IOException { // NumberFormat.getInstance().format(index)); assertEquals(fileContent.charAt(index % fileContent.length()), rcs.charAt(index), "Characters don't match (index=" - + NumberFormat.getInstance().format(index) + ")"); + + NumberFormat.getInstance(Locale.ROOT).format(index) + ")"); } } diff --git a/src/test/java/org/archive/io/RepositionableInputStreamTest.java b/src/test/java/org/archive/io/RepositionableInputStreamTest.java index 228c9042..08143d01 100644 --- a/src/test/java/org/archive/io/RepositionableInputStreamTest.java +++ b/src/test/java/org/archive/io/RepositionableInputStreamTest.java @@ -27,6 +27,8 @@ import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; +import static java.nio.charset.StandardCharsets.UTF_8; + import static org.junit.jupiter.api.Assertions.assertEquals; public class RepositionableInputStreamTest { @@ -63,7 +65,7 @@ public void testname() throws Exception { long offset = 0; for (int i = 0; i < 10; i++) { ris.read(bytes, 0, LINE.length()); - assertEquals(LINE, new String(bytes)); + assertEquals(LINE, new String(bytes, UTF_8)); offset += LINE.length(); assertEquals(offset, ris.position()); } diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java index 157499ff..e34d4e6f 100644 --- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java +++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java @@ -3,6 +3,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; +import java.util.Locale; import 
java.util.logging.Logger; import org.archive.extract.ExtractingResourceFactoryMapper; @@ -52,7 +53,7 @@ public void testHandleStyleNodeExceptions() throws Exception { TextNode tn = new TextNode(css); epo.handleStyleNode(tn); } catch(Exception e) { - System.err.format("And the winner is....(%s)\n", css); + System.err.format(Locale.ROOT, "And the winner is....(%s)\n", css); e.printStackTrace(); except = true; throw e; diff --git a/src/test/java/org/archive/resource/html/HTMLMetaDataTest.java b/src/test/java/org/archive/resource/html/HTMLMetaDataTest.java index 3b4193b9..a3c8c1c9 100644 --- a/src/test/java/org/archive/resource/html/HTMLMetaDataTest.java +++ b/src/test/java/org/archive/resource/html/HTMLMetaDataTest.java @@ -1,5 +1,7 @@ package org.archive.resource.html; +import java.util.Locale; + import org.json.JSONArray; import org.json.JSONException; import org.json.JSONObject; @@ -59,7 +61,7 @@ private void appendStrArr(JSONObject o, String a[][]) throws JSONException { } private void appendStrArr2(JSONObject o, String k, String... 
a) throws JSONException { - System.out.format("A length(%d)\n", a.length); + System.out.format(Locale.ROOT, "A length(%d)\n", a.length); JSONObject n = new JSONObject(); if((a.length & 1) == 1) { throw new IllegalArgumentException(); diff --git a/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java b/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java index 19b1984f..45989416 100644 --- a/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java +++ b/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java @@ -1,6 +1,7 @@ package org.archive.url; import java.net.URISyntaxException; +import java.util.Locale; import org.junit.jupiter.api.Test; @@ -204,12 +205,12 @@ public void testFoo() { String path = "/a/b/c/"; String[] paths = path.split("/",-1); for(String p : paths) { - System.out.format("(%s)",p); + System.out.format(Locale.ROOT, "(%s)", p); } System.out.println(); paths = path.split("/"); for(String p : paths) { - System.out.format("(%s)",p); + System.out.format(Locale.ROOT, "(%s)", p); } System.out.println(); } diff --git a/src/test/java/org/archive/url/URLParserTest.java b/src/test/java/org/archive/url/URLParserTest.java index bc8fc3a5..c942a260 100644 --- a/src/test/java/org/archive/url/URLParserTest.java +++ b/src/test/java/org/archive/url/URLParserTest.java @@ -3,10 +3,14 @@ import java.io.UnsupportedEncodingException; import java.net.URISyntaxException; import java.net.URLDecoder; +import java.util.Locale; import com.google.common.net.InetAddresses; + import org.junit.jupiter.api.Test; +import static java.nio.charset.StandardCharsets.UTF_8; + import static org.junit.jupiter.api.Assertions.assertEquals; public class URLParserTest { @@ -15,7 +19,7 @@ public void testGuava() throws URIException, UnsupportedEncodingException { Long l = Long.parseLong("3279880203"); int i2 = l.intValue(); // int i = Integer.decode("3279880203"); - System.err.format("FromNum(%s)\n", InetAddresses.fromInteger(i2).getHostAddress()); + 
System.err.format(Locale.ROOT, "FromNum(%s)\n", InetAddresses.fromInteger(i2).getHostAddress()); } @Test @@ -30,7 +34,7 @@ public void testAddDefaultSchemeIfNeeded() { @Test public void testParse() throws UnsupportedEncodingException, URISyntaxException { - System.out.format("O(%s) E(%s)\n","%66",URLDecoder.decode("%66","UTF-8")); + System.out.format(Locale.ROOT, "O(%s) E(%s)\n","%66", URLDecoder.decode("%66", UTF_8.name())); checkParse("http://www.archive.org/index.html#foo", null, "http", null, null, "www.archive.org", -1, "/index.html", null, "foo", "http://www.archive.org/index.html#foo", "/index.html"); @@ -96,7 +100,7 @@ private void checkParse(String s, String opaque, String scheme, String authUser, String authPass, String host, int port, String path, String query, String fragment, String urlString, String pathQuery) throws URISyntaxException { HandyURL h = URLParser.parse(s); - System.out.format("Input:(%s)\nHandyURL\t%s\n",s,h.toDebugString()); + System.out.format(Locale.ROOT, "Input:(%s)\nHandyURL\t%s\n", s, h.toDebugString()); assertEquals(scheme, h.getScheme()); assertEquals(authUser, h.getAuthUser()); assertEquals(authPass, h.getAuthPass()); diff --git a/src/test/java/org/archive/url/URLRegexTransformerTest.java b/src/test/java/org/archive/url/URLRegexTransformerTest.java index 73c43f96..d5c98f6a 100644 --- a/src/test/java/org/archive/url/URLRegexTransformerTest.java +++ b/src/test/java/org/archive/url/URLRegexTransformerTest.java @@ -5,6 +5,8 @@ import static org.junit.jupiter.api.Assertions.assertEquals; +import java.util.Locale; + public class URLRegexTransformerTest { @Test @@ -49,7 +51,7 @@ public void testStripPathSessionID() { private static void checkStripPathSessionID(String orig, String want) { String got = URLRegexTransformer.stripPathSessionID(orig); - assertEquals(want, got, String.format("FAIL Orig(%s) Got(%s) Want(%s)", orig, got, want)); + assertEquals(want, got, String.format(Locale.ROOT, "FAIL Orig(%s) Got(%s) Want(%s)", orig, got, 
want)); } // private static final String BASE = "http://www.archive.org/index.html"; diff --git a/src/test/java/org/archive/util/ByteOpTest.java b/src/test/java/org/archive/util/ByteOpTest.java index 49781c36..eb89353e 100644 --- a/src/test/java/org/archive/util/ByteOpTest.java +++ b/src/test/java/org/archive/util/ByteOpTest.java @@ -4,6 +4,7 @@ import java.io.ByteArrayOutputStream; import java.io.DataInputStream; import java.io.IOException; +import java.util.Locale; import com.google.common.io.LittleEndianDataOutputStream; @@ -18,10 +19,10 @@ public void testReadShort() throws IOException { byte a[] = new byte[]{0,1,2,3}; ByteArrayInputStream bais = new ByteArrayInputStream(a); int bos = ByteOp.readShort(bais); - System.out.format("BO.Read short(%d)\n", bos); + System.out.format(Locale.ROOT, "BO.Read short(%d)\n", bos); DataInputStream dis = new DataInputStream(new ByteArrayInputStream(a)); int disv = dis.readUnsignedShort(); - System.out.format("DI.Read short(%d)\n", disv); + System.out.format(Locale.ROOT, "DI.Read short(%d)\n", disv); for(int i = 0; i < 256 * 256; i++) { ByteArrayOutputStream baos = new ByteArrayOutputStream(2); LittleEndianDataOutputStream dos = new LittleEndianDataOutputStream(baos); diff --git a/src/test/java/org/archive/util/CrossProductTest.java b/src/test/java/org/archive/util/CrossProductTest.java index 211fa65e..a487ab15 100644 --- a/src/test/java/org/archive/util/CrossProductTest.java +++ b/src/test/java/org/archive/util/CrossProductTest.java @@ -2,10 +2,12 @@ import java.util.ArrayList; import java.util.List; +import java.util.Locale; import org.junit.jupiter.api.Test; public class CrossProductTest { + private void dumpC(List a) { StringBuilder sb = new StringBuilder(); boolean first = false; @@ -19,16 +21,19 @@ private void dumpC(List a) { } System.out.println("Dump:" + sb.toString()); } + private void dumpLOL(List> coc) { for(List co : coc) { dumpC(co); } } + @Test public void testVersion() { String version = 
IAUtils.loadCommonsVersion(); - System.out.format("Loaded version(%s)\n", version); + System.out.format(Locale.ROOT, "Loaded version(%s)\n", version); } + @Test public void testCrossProduct() { ArrayList> input = new ArrayList>(); @@ -40,6 +45,7 @@ public void testCrossProduct() { List> cross = xp.crossProduct(input); dumpLOL(cross); } + private List AtoL(Object... a) { ArrayList al = new ArrayList(a.length); for(Object s : a) { diff --git a/src/test/java/org/archive/util/TestUtils.java b/src/test/java/org/archive/util/TestUtils.java index 01b0d099..b8fee0f4 100644 --- a/src/test/java/org/archive/util/TestUtils.java +++ b/src/test/java/org/archive/util/TestUtils.java @@ -3,6 +3,7 @@ import java.io.IOException; import java.io.InputStream; import java.util.List; +import java.util.Locale; import com.google.common.io.ByteStreams; @@ -12,9 +13,9 @@ public class TestUtils { public static void dumpMatch(String context, List> res) { - System.out.format("Context(%s) Found (%d) matches\n", context, res.size()); + System.out.format(Locale.ROOT, "Context(%s) Found (%d) matches\n", context, res.size()); for(List r : res) { - System.out.format("Match(%s)\n", StringParse.join(r)); + System.out.format(Locale.ROOT, "Match(%s)\n", StringParse.join(r)); } } diff --git a/src/test/java/org/archive/util/binsearch/SortedTextFileTest.java b/src/test/java/org/archive/util/binsearch/SortedTextFileTest.java index 5e8889e5..ab8ca627 100644 --- a/src/test/java/org/archive/util/binsearch/SortedTextFileTest.java +++ b/src/test/java/org/archive/util/binsearch/SortedTextFileTest.java @@ -4,6 +4,7 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.io.PrintWriter; +import java.util.Locale; import org.archive.util.binsearch.impl.RandomAccessFileSeekableLineReaderFactory; import org.archive.util.iterator.CloseableIterator; @@ -13,9 +14,11 @@ import static org.junit.jupiter.api.Assertions.assertFalse; public class SortedTextFileTest { + private static String formatS(int i) 
{ - return String.format("%07d",i); + return String.format(Locale.ROOT, "%07d", i); } + private void createFile(File target, int max) throws FileNotFoundException { PrintWriter pw = new PrintWriter(target); for(int i = 0; i < max; i++) { From 1c116c7305c36fc24f56f916e9cb3fa87723b1f0 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 12 Nov 2025 13:27:03 +0100 Subject: [PATCH 11/16] Replace all occurrences of com.google.common.base.Charsets by java.nio.charset.StandardCharsets --- src/main/java/org/archive/io/GenericReplayCharSequence.java | 4 ++-- src/main/java/org/archive/io/ReplayCharSequence.java | 5 ++--- src/main/java/org/archive/url/LaxURLCodec.java | 5 ++--- src/main/java/org/archive/util/Recorder.java | 5 ++--- 4 files changed, 8 insertions(+), 11 deletions(-) diff --git a/src/main/java/org/archive/io/GenericReplayCharSequence.java b/src/main/java/org/archive/io/GenericReplayCharSequence.java index 7aacb25a..ff96717c 100644 --- a/src/main/java/org/archive/io/GenericReplayCharSequence.java +++ b/src/main/java/org/archive/io/GenericReplayCharSequence.java @@ -33,6 +33,7 @@ import java.nio.channels.FileChannel; import java.nio.charset.CharacterCodingException; import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.text.NumberFormat; import java.util.Locale; import java.util.logging.Level; @@ -41,7 +42,6 @@ import org.apache.commons.io.IOUtils; import org.archive.util.DevUtils; -import com.google.common.base.Charsets; import com.google.common.primitives.Ints; /** @@ -68,7 +68,7 @@ public class GenericReplayCharSequence implements ReplayCharSequence { * *

See Encoding. */ - public static final Charset WRITE_ENCODING = Charsets.UTF_16BE; + public static final Charset WRITE_ENCODING = StandardCharsets.UTF_16BE; private static final long MAP_MAX_BYTES = 64 * 1024 * 1024; // 64M diff --git a/src/main/java/org/archive/io/ReplayCharSequence.java b/src/main/java/org/archive/io/ReplayCharSequence.java index e456e293..bd74f2f8 100644 --- a/src/main/java/org/archive/io/ReplayCharSequence.java +++ b/src/main/java/org/archive/io/ReplayCharSequence.java @@ -23,8 +23,7 @@ import java.io.IOException; import java.nio.charset.CharacterCodingException; import java.nio.charset.Charset; - -import com.google.common.base.Charsets; +import java.nio.charset.StandardCharsets; /** @@ -40,7 +39,7 @@ public interface ReplayCharSequence extends CharSequence, Closeable { /** charset to use in replay when declared value * is absent/illegal/unavailable */ - public Charset FALLBACK_CHARSET = Charsets.ISO_8859_1; // TODO: should this be UTF-8? + public Charset FALLBACK_CHARSET = StandardCharsets.ISO_8859_1; // TODO: should this be UTF-8? 
/** * Call this method when done so implementation has chance to clean up diff --git a/src/main/java/org/archive/url/LaxURLCodec.java b/src/main/java/org/archive/url/LaxURLCodec.java index e27d9de0..92c7cae6 100644 --- a/src/main/java/org/archive/url/LaxURLCodec.java +++ b/src/main/java/org/archive/url/LaxURLCodec.java @@ -20,12 +20,11 @@ import java.io.ByteArrayOutputStream; import java.io.UnsupportedEncodingException; +import java.nio.charset.StandardCharsets; import java.util.BitSet; import org.apache.commons.codec.net.URLCodec; -import com.google.common.base.Charsets; - /** * @author gojomo */ @@ -155,6 +154,6 @@ public String encode(BitSet safe, String pString, String cs) if (pString == null) { return null; } - return new String(encodeUrl(safe,pString.getBytes(cs)), Charsets.US_ASCII); + return new String(encodeUrl(safe,pString.getBytes(cs)), StandardCharsets.US_ASCII); } } diff --git a/src/main/java/org/archive/util/Recorder.java b/src/main/java/org/archive/util/Recorder.java index 6f9e0117..9f10ec92 100644 --- a/src/main/java/org/archive/util/Recorder.java +++ b/src/main/java/org/archive/util/Recorder.java @@ -25,6 +25,7 @@ import java.io.InputStreamReader; import java.io.OutputStream; import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.util.HashSet; import java.util.Locale; import java.util.Set; @@ -42,8 +43,6 @@ import org.archive.io.ReplayCharSequence; import org.archive.io.ReplayInputStream; -import com.google.common.base.Charsets; - /** * Pairs together a RecordingInputStream and RecordingOutputStream @@ -96,7 +95,7 @@ public class Recorder { * (current behavior is for consistency with our prior but perhaps not * optimal behavior) */ - protected Charset charset = Charsets.UTF_8; + protected Charset charset = StandardCharsets.UTF_8; /** whether recording-input (ris) message-body is chunked */ protected boolean inputIsChunked = false; From 4a12fa43ed02512219b089af3708189afdd2b8a2 Mon Sep 17 00:00:00 2001 From: Sebastian 
Nagel Date: Wed, 12 Nov 2025 13:51:40 +0100 Subject: [PATCH 12/16] Unit tests: replace FileReader and FileWriter using classes allowing to configure the charset. Add charset to toString() methods of OutputStreams --- .../org/archive/io/RecordingInputStreamTest.java | 2 +- .../archive/io/RepositionableInputStreamTest.java | 3 ++- .../java/org/archive/io/arc/ARCWriterTest.java | 10 +++++----- src/test/java/org/archive/util/FileUtilsTest.java | 2 -- .../util/binsearch/SortedTextFileTest.java | 7 +++++-- .../util/iterator/FilterStringIteratorTest.java | 1 - .../iterator/SortedCompositeIteratorTest.java | 15 +++++++++------ 7 files changed, 22 insertions(+), 18 deletions(-) diff --git a/src/test/java/org/archive/io/RecordingInputStreamTest.java b/src/test/java/org/archive/io/RecordingInputStreamTest.java index d794d925..74e92024 100644 --- a/src/test/java/org/archive/io/RecordingInputStreamTest.java +++ b/src/test/java/org/archive/io/RecordingInputStreamTest.java @@ -140,6 +140,6 @@ public void testAsOutputStream() throws IOException { ris.close(); ByteArrayOutputStream baos = new ByteArrayOutputStream(); ris.getReplayInputStream().readFullyTo(baos); - assertEquals("hello", baos.toString()); + assertEquals("hello", baos.toString(UTF_8.name())); } } diff --git a/src/test/java/org/archive/io/RepositionableInputStreamTest.java b/src/test/java/org/archive/io/RepositionableInputStreamTest.java index 08143d01..4aad11b9 100644 --- a/src/test/java/org/archive/io/RepositionableInputStreamTest.java +++ b/src/test/java/org/archive/io/RepositionableInputStreamTest.java @@ -21,6 +21,7 @@ import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; +import java.io.OutputStreamWriter; import java.io.PrintWriter; import org.junit.jupiter.api.BeforeEach; @@ -40,7 +41,7 @@ public class RepositionableInputStreamTest { @BeforeEach protected void setUp() throws Exception { this.testFile = new File(tempDir, this.getClass().getName()); - PrintWriter pw = new 
PrintWriter(new FileOutputStream(testFile)); + PrintWriter pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream(testFile), UTF_8)); for (int i = 0; i < 100; i++) { pw.print(LINE); } diff --git a/src/test/java/org/archive/io/arc/ARCWriterTest.java b/src/test/java/org/archive/io/arc/ARCWriterTest.java index 8b2f7d64..f6c48462 100644 --- a/src/test/java/org/archive/io/arc/ARCWriterTest.java +++ b/src/test/java/org/archive/io/arc/ARCWriterTest.java @@ -262,7 +262,7 @@ public void testWriteRecordCompressed() throws IOException { } public void testWriteGiantRecord() throws IOException { - PrintStream dummyStream = new PrintStream(new NullOutputStream()); + PrintStream dummyStream = new PrintStream(new NullOutputStream(), false, UTF_8.name()); ARCWriter arcWriter = new ARCWriter( SERIAL_NO, @@ -431,7 +431,7 @@ protected void lengthTooShort(String name, boolean compress, boolean strict) PrintStream origErr = System.err; ARCReader r = null; try { - System.setErr(new PrintStream(os)); + System.setErr(new PrintStream(os, false, UTF_8.name())); r = ARCReaderFactory.get(writer.getFile()); r.setStrict(strict); @@ -440,7 +440,7 @@ protected void lengthTooShort(String name, boolean compress, boolean strict) // Make sure we get the warning string which complains about the // trailing bytes. - String err = os.toString(); + String err = os.toString(UTF_8.name()); assertTrue(err.startsWith("WARNING") && (err.indexOf("Record STARTING at") > 0), "No message " + err); r.close(); @@ -496,7 +496,7 @@ protected void lengthTooLong(String name, boolean compress, PrintStream origErr = System.err; ARCReader r = null; try { - System.setErr(new PrintStream(os)); + System.setErr(new PrintStream(os, false, UTF_8.name())); r = ARCReaderFactory.get(writer.getFile()); r.setStrict(strict); @@ -505,7 +505,7 @@ protected void lengthTooLong(String name, boolean compress, // Make sure we get the warning string which complains about the // trailing bytes. 
- String err = os.toString(); + String err = os.toString(UTF_8.name()); assertTrue(err.startsWith("WARNING Premature EOF before end-of-record"), "No message " + err); } finally { diff --git a/src/test/java/org/archive/util/FileUtilsTest.java b/src/test/java/org/archive/util/FileUtilsTest.java index bd58bd09..51c416f0 100644 --- a/src/test/java/org/archive/util/FileUtilsTest.java +++ b/src/test/java/org/archive/util/FileUtilsTest.java @@ -185,7 +185,6 @@ public void testTailLinesNakedWindows() throws IOException { verifyTailLines(nakedLastLineWindows); } - @SuppressWarnings("unchecked") private void verifyTailLines(File file) throws IOException { List lines = org.apache.commons.io.FileUtils.readLines(file); verifyTailLines(file, lines, 1, 80); @@ -263,7 +262,6 @@ public void testHeadLinesNakedWindows() throws IOException { } - @SuppressWarnings("unchecked") private void verifyHeadLines(File file) throws IOException { List lines = org.apache.commons.io.FileUtils.readLines(file); verifyHeadLines(file, lines, 1, 80); diff --git a/src/test/java/org/archive/util/binsearch/SortedTextFileTest.java b/src/test/java/org/archive/util/binsearch/SortedTextFileTest.java index ab8ca627..26d7a16d 100644 --- a/src/test/java/org/archive/util/binsearch/SortedTextFileTest.java +++ b/src/test/java/org/archive/util/binsearch/SortedTextFileTest.java @@ -4,12 +4,15 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.io.PrintWriter; +import java.io.UnsupportedEncodingException; import java.util.Locale; import org.archive.util.binsearch.impl.RandomAccessFileSeekableLineReaderFactory; import org.archive.util.iterator.CloseableIterator; import org.junit.jupiter.api.Test; +import static java.nio.charset.StandardCharsets.UTF_8; + import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -19,8 +22,8 @@ private static String formatS(int i) { return String.format(Locale.ROOT, "%07d", i); } - private void 
createFile(File target, int max) throws FileNotFoundException { - PrintWriter pw = new PrintWriter(target); + private void createFile(File target, int max) throws FileNotFoundException, UnsupportedEncodingException { + PrintWriter pw = new PrintWriter(target, UTF_8.name()); for(int i = 0; i < max; i++) { pw.println(formatS(i)); } diff --git a/src/test/java/org/archive/util/iterator/FilterStringIteratorTest.java b/src/test/java/org/archive/util/iterator/FilterStringIteratorTest.java index 20143289..6d5685ad 100644 --- a/src/test/java/org/archive/util/iterator/FilterStringIteratorTest.java +++ b/src/test/java/org/archive/util/iterator/FilterStringIteratorTest.java @@ -5,7 +5,6 @@ import java.util.List; import java.util.TreeSet; -import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.*; diff --git a/src/test/java/org/archive/util/iterator/SortedCompositeIteratorTest.java b/src/test/java/org/archive/util/iterator/SortedCompositeIteratorTest.java index 98de1416..fa1213f7 100644 --- a/src/test/java/org/archive/util/iterator/SortedCompositeIteratorTest.java +++ b/src/test/java/org/archive/util/iterator/SortedCompositeIteratorTest.java @@ -2,14 +2,17 @@ import java.io.BufferedReader; import java.io.File; +import java.io.FileInputStream; import java.io.FileNotFoundException; -import java.io.FileReader; import java.io.IOException; +import java.io.InputStreamReader; import java.io.PrintWriter; import java.util.Comparator; import org.junit.jupiter.api.Test; +import static java.nio.charset.StandardCharsets.UTF_8; + import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -21,19 +24,19 @@ public void testHasNext() throws FileNotFoundException, IOException { File a = File.createTempFile("filea", null); File b = File.createTempFile("fileb", null); - PrintWriter apw = new PrintWriter(a); - PrintWriter bpw = new PrintWriter(b); + PrintWriter apw = new 
PrintWriter(a, UTF_8.name()); + PrintWriter bpw = new PrintWriter(b, UTF_8.name()); apw.println("1"); apw.println("3"); bpw.println("2"); bpw.println("4"); apw.close(); bpw.close(); - BufferedReader abr = new BufferedReader(new FileReader(a)); - BufferedReader bbr = new BufferedReader(new FileReader(b)); + BufferedReader abr = new BufferedReader(new InputStreamReader(new FileInputStream(a), UTF_8)); + BufferedReader bbr = new BufferedReader(new InputStreamReader(new FileInputStream(b), UTF_8)); SortedCompositeIterator sci = new SortedCompositeIterator(new Comparator() { - @Override + @Override public int compare(String o1, String o2) { return o1.compareTo(o2); } From 62341dafc3a15ab200e7e5724bf1b9e1f774ce55 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 12 Nov 2025 13:55:54 +0100 Subject: [PATCH 13/16] Code quality: replace obsolete imports and suppressed warnings --- .../extract/WARCMetadataRecordExtractorOutput.java | 8 -------- src/main/java/org/archive/hadoop/FilenameInputFormat.java | 1 - src/main/java/org/archive/hadoop/PerMapOutputFormat.java | 1 - src/main/java/org/archive/io/HeaderedArchiveRecord.java | 3 --- src/main/java/org/archive/io/arc/ARCReader.java | 1 - src/main/java/org/archive/io/warc/WARCReader.java | 1 - src/main/java/org/archive/io/warc/WARCWriter.java | 1 - src/main/java/org/archive/uid/RecordIDGenerator.java | 1 - src/main/java/org/archive/url/LaxURI.java | 1 - src/main/java/org/archive/util/FileUtils.java | 1 - src/main/java/org/archive/util/IterableLineIterator.java | 1 - src/main/java/org/archive/util/TextUtils.java | 1 - 12 files changed, 21 deletions(-) diff --git a/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java b/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java index 426acb02..b1050a14 100644 --- a/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java +++ b/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java @@ -3,24 +3,16 @@ import 
java.io.IOException; import java.io.OutputStream; import java.io.PrintWriter; -import java.net.MalformedURLException; -import java.net.URISyntaxException; -import java.net.URL; import java.util.List; import java.util.Locale; import java.util.logging.Logger; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import org.archive.format.gzip.GZIPFormatException; -import org.archive.format.json.JSONUtils; import org.archive.format.json.SimpleJSONPathSpec; import org.archive.resource.MetaData; import org.archive.resource.Resource; -import org.archive.util.IAUtils; import org.archive.util.StreamCopy; import org.json.JSONArray; -import org.json.JSONException; import org.json.JSONObject; import com.google.common.io.ByteStreams; diff --git a/src/main/java/org/archive/hadoop/FilenameInputFormat.java b/src/main/java/org/archive/hadoop/FilenameInputFormat.java index 5893afb1..3f41cdee 100644 --- a/src/main/java/org/archive/hadoop/FilenameInputFormat.java +++ b/src/main/java/org/archive/hadoop/FilenameInputFormat.java @@ -17,7 +17,6 @@ package org.archive.hadoop; import java.io.*; -import java.util.*; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; diff --git a/src/main/java/org/archive/hadoop/PerMapOutputFormat.java b/src/main/java/org/archive/hadoop/PerMapOutputFormat.java index 28ebca73..684202bb 100644 --- a/src/main/java/org/archive/hadoop/PerMapOutputFormat.java +++ b/src/main/java/org/archive/hadoop/PerMapOutputFormat.java @@ -17,7 +17,6 @@ package org.archive.hadoop; import java.io.*; -import java.util.*; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; diff --git a/src/main/java/org/archive/io/HeaderedArchiveRecord.java b/src/main/java/org/archive/io/HeaderedArchiveRecord.java index a149acac..858edb4d 100644 --- a/src/main/java/org/archive/io/HeaderedArchiveRecord.java +++ b/src/main/java/org/archive/io/HeaderedArchiveRecord.java @@ -150,9 +150,6 @@ private InputStream readContentHeaders() throws 
IOException { } String statusLine = new String(statusBytes, 0, statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); - if (statusLine == null) { - throw new NullPointerException("Expected status line is null"); - } statusLine = statusLine.trim(); // TODO: Tighten up this test. boolean isHttpResponse = statusLine.startsWith("HTTP"); diff --git a/src/main/java/org/archive/io/arc/ARCReader.java b/src/main/java/org/archive/io/arc/ARCReader.java index ecc742a5..f8935e79 100644 --- a/src/main/java/org/archive/io/arc/ARCReader.java +++ b/src/main/java/org/archive/io/arc/ARCReader.java @@ -448,7 +448,6 @@ public static void createCDXIndexFile(String urlOrPath) * @throws IOException * @throws java.text.ParseException */ - @SuppressWarnings("unchecked") public static void main(String [] args) throws ParseException, IOException, java.text.ParseException { Options options = getOptions(); diff --git a/src/main/java/org/archive/io/warc/WARCReader.java b/src/main/java/org/archive/io/warc/WARCReader.java index 02756cb1..34583e58 100644 --- a/src/main/java/org/archive/io/warc/WARCReader.java +++ b/src/main/java/org/archive/io/warc/WARCReader.java @@ -199,7 +199,6 @@ public static void main(String [] args) Options options = getOptions(); PosixParser parser = new PosixParser(); CommandLine cmdline = parser.parse(options, args, false); - @SuppressWarnings("unchecked") List cmdlineArgs = cmdline.getArgList(); Option [] cmdlineOptions = cmdline.getOptions(); HelpFormatter formatter = new HelpFormatter(); diff --git a/src/main/java/org/archive/io/warc/WARCWriter.java b/src/main/java/org/archive/io/warc/WARCWriter.java index 8b571fad..65eb3346 100644 --- a/src/main/java/org/archive/io/warc/WARCWriter.java +++ b/src/main/java/org/archive/io/warc/WARCWriter.java @@ -38,7 +38,6 @@ import org.apache.commons.lang3.StringUtils; import org.archive.format.ArchiveFileConstants; -import org.archive.io.UTF8Bytes; import org.archive.io.WriterPoolMember; import 
org.archive.util.ArchiveUtils; import org.archive.util.anvl.Element; diff --git a/src/main/java/org/archive/uid/RecordIDGenerator.java b/src/main/java/org/archive/uid/RecordIDGenerator.java index 4f16c5ab..80cc5565 100644 --- a/src/main/java/org/archive/uid/RecordIDGenerator.java +++ b/src/main/java/org/archive/uid/RecordIDGenerator.java @@ -19,7 +19,6 @@ package org.archive.uid; import java.net.URI; -import java.net.URISyntaxException; import java.util.Map; /** diff --git a/src/main/java/org/archive/url/LaxURI.java b/src/main/java/org/archive/url/LaxURI.java index 3b27e045..9b7485c7 100644 --- a/src/main/java/org/archive/url/LaxURI.java +++ b/src/main/java/org/archive/url/LaxURI.java @@ -18,7 +18,6 @@ */ package org.archive.url; -import java.io.UnsupportedEncodingException; import java.nio.charset.Charset; import java.nio.charset.IllegalCharsetNameException; import java.nio.charset.StandardCharsets; diff --git a/src/main/java/org/archive/util/FileUtils.java b/src/main/java/org/archive/util/FileUtils.java index 6886e08c..271d0212 100644 --- a/src/main/java/org/archive/util/FileUtils.java +++ b/src/main/java/org/archive/util/FileUtils.java @@ -393,7 +393,6 @@ public static boolean moveAsideIfExists(File file) throws IOException { * after the end of the last line returned * @throws IOException */ - @SuppressWarnings("unchecked") public static LongRange pagedLines(File file, long position, int signedDesiredLineCount, List lines, int lineEstimate) throws IOException { diff --git a/src/main/java/org/archive/util/IterableLineIterator.java b/src/main/java/org/archive/util/IterableLineIterator.java index 33efa1fd..c9010031 100644 --- a/src/main/java/org/archive/util/IterableLineIterator.java +++ b/src/main/java/org/archive/util/IterableLineIterator.java @@ -19,7 +19,6 @@ public IterableLineIterator(final Reader reader) super(reader); } - @SuppressWarnings("unchecked") public Iterator iterator() { return this; } diff --git a/src/main/java/org/archive/util/TextUtils.java 
b/src/main/java/org/archive/util/TextUtils.java index df3de58b..627d411a 100644 --- a/src/main/java/org/archive/util/TextUtils.java +++ b/src/main/java/org/archive/util/TextUtils.java @@ -30,7 +30,6 @@ import java.net.URLEncoder; import java.util.HashMap; import java.util.Map; -import java.util.concurrent.ConcurrentMap; import java.util.regex.Matcher; import java.util.regex.Pattern; From 97f7eb0035f921986b9a82441af3a6603c77961d Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 12 Nov 2025 14:14:37 +0100 Subject: [PATCH 14/16] Use StandardCharsets to replace or initialize String constants defining character sets --- .../archive/extract/WATExtractorOutput.java | 2 +- .../format/arc/FiledescRecordParser.java | 3 ++- .../archive/format/dns/DNSResponseParser.java | 3 ++- .../format/text/charset/CharsetDetector.java | 3 ++- .../org/archive/format/warc/WARCConstants.java | 4 +++- src/main/java/org/archive/io/UTF8Bytes.java | 3 ++- .../java/org/archive/io/WriterPoolMember.java | 3 ++- .../java/org/archive/net/PublicSuffixes.java | 18 ++++++------------ .../archive/resource/gzip/GZIPMetaData.java | 8 ++++---- .../resource/html/HTMLResourceFactory.java | 3 ++- src/main/java/org/archive/url/LaxURLCodec.java | 2 +- src/main/java/org/archive/url/URI.java | 2 +- .../org/archive/util/ChunkedInputStream.java | 3 +-- src/main/java/org/archive/util/HMACSigner.java | 6 ++++-- src/main/java/org/archive/util/IAUtils.java | 6 +----- .../java/org/archive/util/LaxHttpParser.java | 4 ++-- .../format/dns/DNSResponseParserTest.java | 4 +++- 17 files changed, 39 insertions(+), 38 deletions(-) diff --git a/src/main/java/org/archive/extract/WATExtractorOutput.java b/src/main/java/org/archive/extract/WATExtractorOutput.java index bb179fd1..621656b7 100644 --- a/src/main/java/org/archive/extract/WATExtractorOutput.java +++ b/src/main/java/org/archive/extract/WATExtractorOutput.java @@ -176,7 +176,7 @@ private void writeWARCMDRecord(OutputStream recOut, MetaData md, throw new 
IOException(e1); } osw.flush(); -// ByteArrayInputStream bais = new ByteArrayInputStream(md.toString().getBytes("UTF-8")); +// ByteArrayInputStream bais = new ByteArrayInputStream(md.toString().getBytes(UTF_8)); Date capDate; try { capDate = DateUtils.getSecondsSinceEpoch(capDateString); diff --git a/src/main/java/org/archive/format/arc/FiledescRecordParser.java b/src/main/java/org/archive/format/arc/FiledescRecordParser.java index c2d7bb65..6a34eb5d 100644 --- a/src/main/java/org/archive/format/arc/FiledescRecordParser.java +++ b/src/main/java/org/archive/format/arc/FiledescRecordParser.java @@ -5,6 +5,7 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; +import java.nio.charset.StandardCharsets; public class FiledescRecordParser { public boolean strict = false; @@ -12,7 +13,7 @@ public FiledescRecord parse(InputStream is) throws IOException { FiledescRecord rec = new FiledescRecord(); try { // TODO: count input bytes read... - BufferedReader br = new BufferedReader(new InputStreamReader(is,"UTF-8")); + BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8)); String line = br.readLine(); parseLine1(rec,line); line = br.readLine(); diff --git a/src/main/java/org/archive/format/dns/DNSResponseParser.java b/src/main/java/org/archive/format/dns/DNSResponseParser.java index b5f81633..3e868ccf 100644 --- a/src/main/java/org/archive/format/dns/DNSResponseParser.java +++ b/src/main/java/org/archive/format/dns/DNSResponseParser.java @@ -5,6 +5,7 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; +import java.nio.charset.StandardCharsets; public class DNSResponseParser { @@ -28,7 +29,7 @@ public void parse(InputStream is, DNSResponse response) throws IOException, DNSP try { // TODO: should we wrap in a CountingInputStream and indicate // observed octet-length? 
- BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8")); + BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8)); String date = br.readLine().trim(); if(isDate(date)) { response.setDate(date); diff --git a/src/main/java/org/archive/format/text/charset/CharsetDetector.java b/src/main/java/org/archive/format/text/charset/CharsetDetector.java index 49286764..08aac469 100644 --- a/src/main/java/org/archive/format/text/charset/CharsetDetector.java +++ b/src/main/java/org/archive/format/text/charset/CharsetDetector.java @@ -22,6 +22,7 @@ import java.io.IOException; import java.nio.charset.Charset; import java.nio.charset.IllegalCharsetNameException; +import java.nio.charset.StandardCharsets; import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -92,7 +93,7 @@ public abstract class CharsetDetector { // ...and if the chardet library fails, use the Content-Type header protected final static String HTTP_CONTENT_TYPE_HEADER = "CONTENT-TYPE"; /** the default charset name to use when giving up */ - public final static String DEFAULT_CHARSET = "UTF-8"; + public final static String DEFAULT_CHARSET = StandardCharsets.UTF_8.name(); protected boolean isCharsetSupported(String charsetName) { // can you believe that this throws a runtime? Just asking if it's diff --git a/src/main/java/org/archive/format/warc/WARCConstants.java b/src/main/java/org/archive/format/warc/WARCConstants.java index 72dad45a..a6bdb3f4 100644 --- a/src/main/java/org/archive/format/warc/WARCConstants.java +++ b/src/main/java/org/archive/format/warc/WARCConstants.java @@ -19,6 +19,8 @@ package org.archive.format.warc; +import java.nio.charset.StandardCharsets; + import org.archive.format.ArchiveFileConstants; /** @@ -93,7 +95,7 @@ public interface WARCConstants extends ArchiveFileConstants { * till we figure it, DEFAULT_ENCODING is single-byte charset -- same as * ARCs. 
*/ - public static final String DEFAULT_ENCODING = "UTF-8"; + public static final String DEFAULT_ENCODING = StandardCharsets.UTF_8.name(); public static final String HEADER_LINE_ENCODING = DEFAULT_ENCODING; // TODO: Revisit. 8859 isn't correct, especially if we settle on RFC822 diff --git a/src/main/java/org/archive/io/UTF8Bytes.java b/src/main/java/org/archive/io/UTF8Bytes.java index c280b08d..4dc0144b 100644 --- a/src/main/java/org/archive/io/UTF8Bytes.java +++ b/src/main/java/org/archive/io/UTF8Bytes.java @@ -19,6 +19,7 @@ package org.archive.io; import java.io.UnsupportedEncodingException; +import java.nio.charset.StandardCharsets; /** * Marker Interface for instances that can be serialized as UTF8 bytes. @@ -27,7 +28,7 @@ * @version $Date$ $Version$ */ public interface UTF8Bytes { - public static final String UTF8 = "UTF-8"; + public static final String UTF8 = StandardCharsets.UTF_8.name(); /** * @return Instance as UTF-8 bytes. diff --git a/src/main/java/org/archive/io/WriterPoolMember.java b/src/main/java/org/archive/io/WriterPoolMember.java index 4679ea78..5d350534 100644 --- a/src/main/java/org/archive/io/WriterPoolMember.java +++ b/src/main/java/org/archive/io/WriterPoolMember.java @@ -25,6 +25,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import java.nio.charset.StandardCharsets; import java.text.DecimalFormat; import java.text.DecimalFormatSymbols; import java.text.NumberFormat; @@ -54,7 +55,7 @@ public abstract class WriterPoolMember { private final Logger logger = Logger.getLogger(this.getClass().getName()); - public static final String UTF8 = "UTF-8"; + public static final String UTF8 = StandardCharsets.UTF_8.name(); /** * Default archival-aggregate filename template. 
diff --git a/src/main/java/org/archive/net/PublicSuffixes.java b/src/main/java/org/archive/net/PublicSuffixes.java index 5b3219d5..79130332 100644 --- a/src/main/java/org/archive/net/PublicSuffixes.java +++ b/src/main/java/org/archive/net/PublicSuffixes.java @@ -28,7 +28,6 @@ import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.PrintWriter; -import java.io.UnsupportedEncodingException; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.List; @@ -193,7 +192,7 @@ public static void main(String args[]) throws IOException { } else { is = new FileInputStream(args[0]); } - BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8")); + BufferedReader reader = new BufferedReader(new InputStreamReader(is, UTF_8)); String regex = getTopmostAssignedSurtPrefixRegex(reader); IOUtils.closeQuietly(is); @@ -335,16 +334,11 @@ public static synchronized Pattern getTopmostAssignedSurtPrefixPattern() { public static synchronized String getTopmostAssignedSurtPrefixRegex() { if (topmostAssignedSurtPrefixRegex == null) { // use bundled list - try { - BufferedReader reader = new BufferedReader(new InputStreamReader( - PublicSuffixes.class.getResourceAsStream( - "/org/archive/effective_tld_names.dat"), "UTF-8")); - topmostAssignedSurtPrefixRegex = getTopmostAssignedSurtPrefixRegex(reader); - IOUtils.closeQuietly(reader); - } catch (UnsupportedEncodingException ex) { - // should never happen - throw new RuntimeException(ex); - } + BufferedReader reader = new BufferedReader(new InputStreamReader( + PublicSuffixes.class.getResourceAsStream( + "/org/archive/effective_tld_names.dat"), UTF_8)); + topmostAssignedSurtPrefixRegex = getTopmostAssignedSurtPrefixRegex(reader); + IOUtils.closeQuietly(reader); } return topmostAssignedSurtPrefixRegex; } diff --git a/src/main/java/org/archive/resource/gzip/GZIPMetaData.java b/src/main/java/org/archive/resource/gzip/GZIPMetaData.java index 0fc18162..1058b01b 100644 --- 
a/src/main/java/org/archive/resource/gzip/GZIPMetaData.java +++ b/src/main/java/org/archive/resource/gzip/GZIPMetaData.java @@ -15,6 +15,8 @@ import org.json.JSONException; import org.json.JSONObject; +import static java.nio.charset.StandardCharsets.UTF_8; + public class GZIPMetaData extends MetaData implements ResourceConstants { private static final Logger LOG = Logger.getLogger(GZIPMetaData.class.getName()); @@ -26,7 +28,7 @@ public void setData(GZIPSeriesMember member) { GZIPHeader header = member.getHeader(); GZIPStaticHeader staticH = header.getStaticHeader(); if(staticH.isFNameSet()) { - putString(GZIP_FILENAME,new String(header.getFileName(),"UTF-8")); + putString(GZIP_FILENAME, new String(header.getFileName(), UTF_8)); } if(staticH.isFCommentSet()) { putLong(GZIP_COMMENT_LENGTH,header.getCommentLength()); @@ -39,7 +41,7 @@ public void setData(GZIPSeriesMember member) { for(int i = 0; i < records; i++) { GZIPFExtraRecord rec = header.getRecord(i); JSONObject recJO = new JSONObject(); - String name = new String(rec.getName(),"UTF-8"); + String name = new String(rec.getName(), UTF_8); recJO.put(GZIP_FEXTRA_NAME, name); if(name.equals("SL") || name.equals("LX")) { recJO.put(GZIP_FEXTRA_VALUE, ByteOp.bytesToInt(rec.getValue())); @@ -55,8 +57,6 @@ public void setData(GZIPSeriesMember member) { putLong(GZIP_INFLATED_CRC,footer.getCRC()); putLong(GZIP_INFLATED_LENGTH,footer.getLength()); - } catch (UnsupportedEncodingException e) { - LOG.warning(e.getMessage()); } catch (JSONException e) { LOG.warning(e.getMessage()); } diff --git a/src/main/java/org/archive/resource/html/HTMLResourceFactory.java b/src/main/java/org/archive/resource/html/HTMLResourceFactory.java index 6e95270c..410449a1 100644 --- a/src/main/java/org/archive/resource/html/HTMLResourceFactory.java +++ b/src/main/java/org/archive/resource/html/HTMLResourceFactory.java @@ -4,6 +4,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.UnsupportedEncodingException; +import 
java.nio.charset.StandardCharsets; import java.util.logging.Logger; import org.archive.format.http.HttpHeaders; @@ -40,7 +41,7 @@ public Resource getResource(InputStream is, MetaData parentMetaData, CDATALexer lex = new CDATALexer(); // guess charset based on HTTP header and sniffed content chunk - String charset = "UTF-8"; + String charset = StandardCharsets.UTF_8.name(); is = new BufferedInputStream(is, CHARSET_GUESS_CHUNK_SIZE); byte[] chunk = new byte[CHARSET_GUESS_CHUNK_SIZE]; is.mark(0); diff --git a/src/main/java/org/archive/url/LaxURLCodec.java b/src/main/java/org/archive/url/LaxURLCodec.java index 92c7cae6..b68a0c19 100644 --- a/src/main/java/org/archive/url/LaxURLCodec.java +++ b/src/main/java/org/archive/url/LaxURLCodec.java @@ -29,7 +29,7 @@ * @author gojomo */ public class LaxURLCodec extends URLCodec { - public static LaxURLCodec DEFAULT = new LaxURLCodec("UTF-8"); + public static LaxURLCodec DEFAULT = new LaxURLCodec(StandardCharsets.UTF_8.name()); // passthrough constructor public LaxURLCodec(String encoding) { diff --git a/src/main/java/org/archive/url/URI.java b/src/main/java/org/archive/url/URI.java index b19151cd..492f7772 100644 --- a/src/main/java/org/archive/url/URI.java +++ b/src/main/java/org/archive/url/URI.java @@ -626,7 +626,7 @@ public URI(URI base, URI relative) throws URIException { /** * The default charset of the protocol. 
RFC 2277, 2396 */ - protected static String defaultProtocolCharset = "UTF-8"; + protected static String defaultProtocolCharset = UTF_8.name(); /** diff --git a/src/main/java/org/archive/util/ChunkedInputStream.java b/src/main/java/org/archive/util/ChunkedInputStream.java index 69b23047..b6a604c8 100644 --- a/src/main/java/org/archive/util/ChunkedInputStream.java +++ b/src/main/java/org/archive/util/ChunkedInputStream.java @@ -280,8 +280,7 @@ private static int getChunkSizeFromInputStream(final InputStream in) * @throws IOException If an IO problem occurs */ private void parseTrailerHeaders() throws IOException { - String charset = "US-ASCII"; - LaxHttpParser.parseHeaders(in, charset); + LaxHttpParser.parseHeaders(in, StandardCharsets.US_ASCII.name()); } /** diff --git a/src/main/java/org/archive/util/HMACSigner.java b/src/main/java/org/archive/util/HMACSigner.java index d7a5208e..b502b4fb 100644 --- a/src/main/java/org/archive/util/HMACSigner.java +++ b/src/main/java/org/archive/util/HMACSigner.java @@ -1,5 +1,7 @@ package org.archive.util; +import java.nio.charset.StandardCharsets; + /** * Generate an HMAC key given a secret sig, key name and optional id and an expiration time * @@ -63,11 +65,11 @@ public static String hmacDigest(String msg, String keyString, String algo) { String digest = null; try { SecretKeySpec key = new SecretKeySpec( - (keyString).getBytes("UTF-8"), algo); + (keyString).getBytes(StandardCharsets.UTF_8), algo); Mac mac = Mac.getInstance(algo); mac.init(key); - byte[] bytes = mac.doFinal(msg.getBytes("ASCII")); + byte[] bytes = mac.doFinal(msg.getBytes(StandardCharsets.US_ASCII)); StringBuilder hash = new StringBuilder(); diff --git a/src/main/java/org/archive/util/IAUtils.java b/src/main/java/org/archive/util/IAUtils.java index 1d15256e..334a31b4 100644 --- a/src/main/java/org/archive/util/IAUtils.java +++ b/src/main/java/org/archive/util/IAUtils.java @@ -73,11 +73,7 @@ public static String loadCommons(String id) { if (input == null) { return 
"UNKNOWN"; } - try { - reader = new InputStreamReader(input, "UTF-8"); - } catch (UnsupportedEncodingException e) { - return "UNKNOWN"; - } + reader = new InputStreamReader(input, UTF_8); Properties prop = new Properties(); try { prop.load(reader); diff --git a/src/main/java/org/archive/util/LaxHttpParser.java b/src/main/java/org/archive/util/LaxHttpParser.java index 05d2469c..434522c8 100644 --- a/src/main/java/org/archive/util/LaxHttpParser.java +++ b/src/main/java/org/archive/util/LaxHttpParser.java @@ -148,7 +148,7 @@ public static String readLine(InputStream inputStream, String charset) throws IO public static String readLine(InputStream inputStream) throws IOException { LOG.finest("enter LaxHttpParser.readLine(InputStream)"); - return readLine(inputStream, "US-ASCII"); + return readLine(inputStream, StandardCharsets.US_ASCII.name()); } /** @@ -238,6 +238,6 @@ public static HttpHeader[] parseHeaders(InputStream is, String charset) throws I */ public static HttpHeader[] parseHeaders(InputStream is) throws IOException { LOG.finest("enter HeaderParser.parseHeaders(InputStream, String)"); - return parseHeaders(is, "US-ASCII"); + return parseHeaders(is, StandardCharsets.US_ASCII.name()); } } diff --git a/src/test/java/org/archive/format/dns/DNSResponseParserTest.java b/src/test/java/org/archive/format/dns/DNSResponseParserTest.java index 7ade0ad5..73e1fda8 100644 --- a/src/test/java/org/archive/format/dns/DNSResponseParserTest.java +++ b/src/test/java/org/archive/format/dns/DNSResponseParserTest.java @@ -5,6 +5,8 @@ import org.junit.jupiter.api.Test; +import static java.nio.charset.StandardCharsets.UTF_8; + import static org.junit.jupiter.api.Assertions.assertEquals; public class DNSResponseParserTest { @@ -20,7 +22,7 @@ public void testParse() throws DNSParseException, IOException { } private void verifyResults(String res, String date, String d[][]) throws DNSParseException, IOException { ByteArrayInputStream is = - new ByteArrayInputStream(res.getBytes("UTF-8")); 
+ new ByteArrayInputStream(res.getBytes(UTF_8)); DNSResponse response = new DNSResponse(); parser.parse(is, response); verifyResults(response,date,d); From 3f58e5a8161892aedac0dc2435a998dd71eb1a85 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 12 Nov 2025 14:17:48 +0100 Subject: [PATCH 15/16] Github workflow: call `mvn verify` to run package build and forbiddenAPIs checks --- .github/workflows/maven.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml index 2421cef3..bb63cd56 100644 --- a/.github/workflows/maven.yml +++ b/.github/workflows/maven.yml @@ -34,4 +34,4 @@ jobs: restore-keys: | ${{ runner.os }}-maven- - name: Build with Maven - run: mvn -B package --file pom.xml + run: mvn -B verify --file pom.xml From ad6e62d9c233c71798c39e3626396bf58a97271d Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 13 Nov 2025 21:25:50 +0100 Subject: [PATCH 16/16] Fix forbiddenAPIs check for Java 8 - explicitly pass Java version - ignore signatures of missing classes --- pom.xml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index c1c17e9b..3dca19e1 100644 --- a/pom.xml +++ b/pom.xml @@ -48,6 +48,7 @@ UTF-8 ${maven.build.timestamp} yyyyMMddhhmmss + 8 @@ -164,8 +165,8 @@ maven-compiler-plugin 3.14.1 - 8 - 8 + ${java.version} + ${java.version} @@ -178,6 +179,8 @@ forbiddenapis 3.10 + ${java.version} + true false