diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml index 2421cef3..bb63cd56 100644 --- a/.github/workflows/maven.yml +++ b/.github/workflows/maven.yml @@ -34,4 +34,4 @@ jobs: restore-keys: | ${{ runner.os }}-maven- - name: Build with Maven - run: mvn -B package --file pom.xml + run: mvn -B verify --file pom.xml diff --git a/pom.xml b/pom.xml index 73ba9ba2..3dca19e1 100644 --- a/pom.xml +++ b/pom.xml @@ -48,6 +48,7 @@ UTF-8 ${maven.build.timestamp} yyyyMMddhhmmss + 8 @@ -164,8 +165,8 @@ maven-compiler-plugin 3.14.1 - 8 - 8 + ${java.version} + ${java.version} @@ -173,6 +174,33 @@ maven-surefire-plugin 3.2.5 + + de.thetaphi + forbiddenapis + 3.10 + + ${java.version} + true + + false + + jdk-unsafe + jdk-deprecated + jdk-non-portable + + + src/test/resources/forbidden-apis-signatures.txt + + + + + + check + testCheck + + + + diff --git a/src/main/java/org/archive/extract/DumpingExtractorOutput.java b/src/main/java/org/archive/extract/DumpingExtractorOutput.java index 69591931..1ccbf771 100644 --- a/src/main/java/org/archive/extract/DumpingExtractorOutput.java +++ b/src/main/java/org/archive/extract/DumpingExtractorOutput.java @@ -3,6 +3,7 @@ import java.io.IOException; import java.io.OutputStream; import java.io.PrintStream; +import java.io.UnsupportedEncodingException; import java.util.logging.Logger; import org.archive.resource.Resource; @@ -12,13 +13,18 @@ import com.google.common.io.ByteStreams; import com.google.common.io.CountingOutputStream; +import static java.nio.charset.StandardCharsets.UTF_8; + public class DumpingExtractorOutput implements ExtractorOutput { private static final Logger LOG = Logger.getLogger(DumpingExtractorOutput.class.getName()); private PrintStream out; public DumpingExtractorOutput(OutputStream out) { - this.out = new PrintStream(out); + try { + this.out = new PrintStream(out, false, UTF_8.name()); + } catch (UnsupportedEncodingException e) { + } } public void output(Resource resource) throws IOException { diff --git a/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java b/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java index 0afe16fb..567b1cd8 100644 --- a/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java +++ b/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java @@ -1,6 +1,7 @@ package org.archive.extract; import java.util.Iterator; +import java.util.Locale; import java.util.logging.Logger; import org.archive.format.arc.ARCConstants; @@ -68,14 +69,14 @@ private boolean childFieldStartsWith(MetaData m, String child, String key, String search) { String val = getChildField(m,child,key); return val == null ? false : - val.toLowerCase().startsWith(search.toLowerCase()); + val.toLowerCase(Locale.ROOT).startsWith(search.toLowerCase(Locale.ROOT)); } private boolean childFieldContains(MetaData m, String child, String key, String search) { String val = getChildField(m,child,key); return val == null ? false : - val.toLowerCase().contains(search.toLowerCase()); + val.toLowerCase(Locale.ROOT).contains(search.toLowerCase(Locale.ROOT)); } private boolean childFieldEquals(MetaData m, String child, @@ -88,7 +89,7 @@ private boolean childFieldEquals(MetaData m, String child, private String caseInsensitiveKeyScan(MetaData m, String child, String k) { try { if(m.has(child)) { - String kLC = k.toLowerCase(); + String kLC = k.toLowerCase(Locale.ROOT); JSONObject childJSObj = m.getJSONObject(child); @SuppressWarnings("rawtypes") Iterator i = childJSObj.keys(); @@ -96,7 +97,7 @@ private String caseInsensitiveKeyScan(MetaData m, String child, String k) { Object kObj = i.next(); if(kObj instanceof String) { String kString = (String) kObj; - if(kString.toLowerCase().equals(kLC)) { + if(kString.toLowerCase(Locale.ROOT).equals(kLC)) { return childJSObj.getString(kString); } } @@ -128,7 +129,7 @@ private boolean isHTTPARCResource(MetaData envelope) { private boolean isHTMLHttpResource(MetaData m) { String type = caseInsensitiveKeyScan(m,HTTP_HEADERS_LIST, "Content-Type"); - return type == null ? false : type.toLowerCase().contains("html"); + return type == null ? false : type.toLowerCase(Locale.ROOT).contains("html"); } private boolean isWARCType(MetaData envelope, WARCRecordType type) { diff --git a/src/main/java/org/archive/extract/ExtractingResourceProducer.java b/src/main/java/org/archive/extract/ExtractingResourceProducer.java index de671bee..07cdb88a 100644 --- a/src/main/java/org/archive/extract/ExtractingResourceProducer.java +++ b/src/main/java/org/archive/extract/ExtractingResourceProducer.java @@ -1,6 +1,7 @@ package org.archive.extract; import java.io.IOException; +import java.util.Locale; import java.util.logging.Level; import java.util.logging.Logger; @@ -33,7 +34,7 @@ public Resource getNext() throws ResourceParseException, IOException { return current; } if(LOG.isLoggable(Level.INFO)) { - LOG.info(String.format("Extracting (%s) with (%s)\n", + LOG.info(String.format(Locale.ROOT, "Extracting (%s) with (%s)\n", current.getClass().toString(), f.getClass().toString())); } diff --git a/src/main/java/org/archive/extract/JSONViewExtractorOutput.java b/src/main/java/org/archive/extract/JSONViewExtractorOutput.java index fb6dc847..6cb7c445 100644 --- a/src/main/java/org/archive/extract/JSONViewExtractorOutput.java +++ b/src/main/java/org/archive/extract/JSONViewExtractorOutput.java @@ -3,6 +3,7 @@ import java.io.IOException; import java.io.OutputStream; import java.io.PrintStream; +import java.io.UnsupportedEncodingException; import java.util.List; import org.apache.commons.lang3.StringUtils; @@ -10,12 +11,17 @@ import org.archive.resource.Resource; import org.archive.util.StreamCopy; +import static java.nio.charset.StandardCharsets.UTF_8; + public class JSONViewExtractorOutput implements ExtractorOutput { private PrintStream out; private JSONView view; public JSONViewExtractorOutput(OutputStream out, String filterPath) { view = new JSONView(filterPath.split(",")); - this.out = new PrintStream(out); + try { + this.out = new PrintStream(out, false, UTF_8.name()); + } catch (UnsupportedEncodingException e) { + } } public void output(Resource resource) throws IOException { StreamCopy.readToEOF(resource.getInputStream()); diff --git a/src/main/java/org/archive/extract/RealCDXExtractorOutput.java b/src/main/java/org/archive/extract/RealCDXExtractorOutput.java index e6f6e82f..ff0b9e83 100644 --- a/src/main/java/org/archive/extract/RealCDXExtractorOutput.java +++ b/src/main/java/org/archive/extract/RealCDXExtractorOutput.java @@ -8,6 +8,7 @@ import java.net.URISyntaxException; import java.net.URL; import java.util.List; +import java.util.Locale; import java.util.logging.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -131,7 +132,7 @@ public void output(Resource resource) throws IOException { } else { meta = "-"; } - if(mime.toLowerCase().contains("html")) { + if(mime.toLowerCase(Locale.ROOT).contains("html")) { if(redir.equals("-")) { // maybe an obvious meta-refresh? redir = extractHTMLMetaRefresh(origUrl,m); @@ -202,7 +203,7 @@ public void output(Resource resource) throws IOException { } else { meta = "-"; } - if(mime.toLowerCase().contains("html")) { + if(mime.toLowerCase(Locale.ROOT).contains("html")) { if(redir.equals("-")) { // maybe an obvious meta-refresh? redir = extractHTMLMetaRefresh(origUrl,m); @@ -222,7 +223,8 @@ public void output(Resource resource) throws IOException { canUrl = keyMaker.makeKey(origUrl); // URL DATE OURL MIME HTTP-CODE SHA1 META REDIR OFFSET LENGTH FILE if(dumpJSON) { - out.format("%s %s %s %s %s %s %s %s %s %s %s %s\n", + out.format(Locale.ROOT, + "%s %s %s %s %s %s %s %s %s %s %s %s\n", canUrl, date, origUrl, @@ -236,7 +238,8 @@ public void output(Resource resource) throws IOException { filename, m.toString(1)); } else { - out.format("%s %s %s %s %s %s %s %s %s %s %s\n", + out.format(Locale.ROOT, + "%s %s %s %s %s %s %s %s %s %s %s\n", canUrl, date, origUrl, @@ -269,7 +272,7 @@ private String extractHTMLRobots(MetaData m) { if(meta != null) { String name = scanHeadersLC(meta, "name", null); if(name != null) { - if(name.toLowerCase().equals("robots")) { + if(name.toLowerCase(Locale.ROOT).equals("robots")) { // alright - some robot instructions: String content = scanHeadersLC(meta, "content", null); if(content != null) { @@ -291,7 +294,7 @@ private String extractHTMLMetaRefresh(String origUrl, MetaData m) { if(meta != null) { String name = scanHeadersLC(meta, "http-equiv", null); if(name != null) { - if(name.toLowerCase().equals("refresh")) { + if(name.toLowerCase(Locale.ROOT).equals("refresh")) { // alright - some robot instructions: String content = scanHeadersLC(meta, "content", null); if(content != null) { @@ -330,7 +333,7 @@ private String scanHeadersLC(JSONObject o, String match, String defaultVal) { if(o.length() == 0) { return defaultVal; } - String lc = match.toLowerCase().trim(); + String lc = match.toLowerCase(Locale.ROOT).trim(); // try { // System.err.println("REC:" + o.toString(1)); // } catch (JSONException e1) { @@ -338,7 +341,7 @@ private String scanHeadersLC(JSONObject o, String match, String defaultVal) { // e1.printStackTrace(); // } for(String key : JSONObject.getNames(o)) { - if(lc.equals(key.toLowerCase().trim())) { + if(lc.equals(key.toLowerCase(Locale.ROOT).trim())) { try { return o.getString(key).trim(); } catch (JSONException e) { @@ -472,7 +475,7 @@ private String parseRobotInstructions(String input) { if(input == null) { return "-"; } - String up = input.replaceAll("-", "").toUpperCase(); + String up = input.replaceAll("-", "").toUpperCase(Locale.ROOT); StringBuilder sb = new StringBuilder(3); if(up.contains(NO_FOLLOW_MATCH)) { sb.append("F"); diff --git a/src/main/java/org/archive/extract/ResourceExtractor.java b/src/main/java/org/archive/extract/ResourceExtractor.java index 2812aa5b..d9b9f396 100644 --- a/src/main/java/org/archive/extract/ResourceExtractor.java +++ b/src/main/java/org/archive/extract/ResourceExtractor.java @@ -7,7 +7,8 @@ import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.net.URISyntaxException; -import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.Locale; import java.util.logging.Level; import java.util.logging.Logger; @@ -26,7 +27,6 @@ public class ResourceExtractor implements ResourceConstants, Tool { private final static Logger LOG = Logger.getLogger(ResourceExtractor.class.getName()); - Charset UTF8 = Charset.forName("utf-8"); public final static String TOOL_NAME = "extractor"; public static final String TOOL_DESCRIPTION = "A tool for extracting metadata from WARC, ARC, and WAT files"; @@ -65,7 +65,7 @@ public static void main(String[] args) throws Exception { private PrintWriter makePrintWriter(OutputStream os) { - return new PrintWriter(new OutputStreamWriter(os, Charset.forName("UTF-8"))); + return new PrintWriter(new OutputStreamWriter(os, StandardCharsets.UTF_8)); } public int run(String[] args) @@ -138,18 +138,18 @@ public int run(String[] args) out.output(r); } catch(GZIPFormatException e) { - LOG.severe(String.format("%s: %s",exProducer.getContext(),e.getMessage())); + LOG.severe(String.format(Locale.ROOT, "%s: %s",exProducer.getContext(),e.getMessage())); //Log is not coming out for some damn reason....needs to be studied - System.err.format("%s: %s",exProducer.getContext(),e.getMessage()); + System.err.format(Locale.ROOT, "%s: %s",exProducer.getContext(),e.getMessage()); if(ProducerUtils.STRICT_GZ) { throw e; } e.printStackTrace(); } catch(ResourceParseException e) { - LOG.severe(String.format("%s: %s",exProducer.getContext(),e.getMessage())); + LOG.severe(String.format(Locale.ROOT, "%s: %s",exProducer.getContext(),e.getMessage())); //Log is not coming out for some damn reason....needs to be studied - System.err.format("%s: %s",exProducer.getContext(),e.getMessage()); + System.err.format(Locale.ROOT, "%s: %s",exProducer.getContext(),e.getMessage()); if(ProducerUtils.STRICT_GZ) { throw e; @@ -157,9 +157,9 @@ public int run(String[] args) e.printStackTrace(); } catch(RecoverableRecordFormatException e) { // this should not get here - ResourceFactory et al should wrap as ResourceParseExceptions... - LOG.severe(String.format("RECOVERABLE - %s: %s",exProducer.getContext(),e.getMessage())); + LOG.severe(String.format(Locale.ROOT, "RECOVERABLE - %s: %s",exProducer.getContext(),e.getMessage())); //Log is not coming out for some damn reason....needs to be studied - System.err.format("%s: %s",exProducer.getContext(),e.getMessage()); + System.err.format(Locale.ROOT, "%s: %s",exProducer.getContext(),e.getMessage()); e.printStackTrace(); diff --git a/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java b/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java index 68f9d1c8..b1050a14 100644 --- a/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java +++ b/src/main/java/org/archive/extract/WARCMetadataRecordExtractorOutput.java @@ -3,23 +3,16 @@ import java.io.IOException; import java.io.OutputStream; import java.io.PrintWriter; -import java.net.MalformedURLException; -import java.net.URISyntaxException; -import java.net.URL; import java.util.List; +import java.util.Locale; import java.util.logging.Logger; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import org.archive.format.gzip.GZIPFormatException; -import org.archive.format.json.JSONUtils; import org.archive.format.json.SimpleJSONPathSpec; import org.archive.resource.MetaData; import org.archive.resource.Resource; -import org.archive.util.IAUtils; import org.archive.util.StreamCopy; import org.json.JSONArray; -import org.json.JSONException; import org.json.JSONObject; import com.google.common.io.ByteStreams; @@ -87,7 +80,7 @@ public void output(Resource resource) throws IOException { String[] linkParts = outLinkValue.split(" "); if(linkParts.length > 2) //'outlinks': 'origUrl date origOutlinkUrl linktype linktext' - out.format("%s\t%s\t%s\t%s\t\n",origUrl,date,linkParts[0],linkParts[2]); + out.format(Locale.ROOT,"%s\t%s\t%s\t%s\t\n",origUrl,date,linkParts[0],linkParts[2]); } } else if(outputType.equals("hopinfo")) { String key = obj.get("Name").toString(); @@ -103,7 +96,7 @@ public void output(Resource resource) throws IOException { } if(outputType.equals("hopinfo")) { //'hopinfo': 'origCrawledUrl date origViaUrl hopPathFromVia sourceTag' - out.format("%s\t%s\t%s\t%s\t%s\n",origUrl,date,viaUrl,viaPath,sourceTag); + out.format(Locale.ROOT,"%s\t%s\t%s\t%s\t%s\n",origUrl,date,viaUrl,viaPath,sourceTag); } } } diff --git a/src/main/java/org/archive/extract/WATExtractorOutput.java b/src/main/java/org/archive/extract/WATExtractorOutput.java index 4b5f72ed..621656b7 100644 --- a/src/main/java/org/archive/extract/WATExtractorOutput.java +++ b/src/main/java/org/archive/extract/WATExtractorOutput.java @@ -1,15 +1,14 @@ package org.archive.extract; -import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; import java.io.OutputStream; import java.io.OutputStreamWriter; -import java.nio.charset.Charset; import java.text.ParseException; import java.net.UnknownHostException; import java.util.Date; +import java.util.Locale; import org.archive.format.gzip.GZIPMemberWriter; import org.archive.format.gzip.GZIPMemberWriterCommittedOutputStream; @@ -30,13 +29,14 @@ import java.util.logging.Logger; +import static java.nio.charset.StandardCharsets.UTF_8; + public class WATExtractorOutput implements ExtractorOutput { WARCRecordWriter recW; private boolean wroteFirst; private GZIPMemberWriter gzW; private static int DEFAULT_BUFFER_RAM = 1024 * 1024; private int bufferRAM = DEFAULT_BUFFER_RAM; - private final static Charset UTF8 = Charset.forName("UTF-8"); private String outputFile; private static final Logger LOG = Logger.getLogger(WATExtractorOutput.class.getName()); @@ -143,7 +143,7 @@ private void writeARC(OutputStream recOut, MetaData md) throws IOException { String capDateString = extractOrIO(md, "Envelope.ARC-Header-Metadata.Date"); String filename = extractOrIO(md, "Container.Filename"); String offset = extractOrIO(md, "Container.Offset"); - String recId = String.format("",filename,offset); + String recId = String.format(Locale.ROOT, "",filename,offset); writeWARCMDRecord(recOut,md,targetURI,capDateString,recId); } @@ -156,7 +156,7 @@ private void writeWARC(OutputStream recOut, MetaData md) throws IOException { targetURI = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Target-URI"); } // handle date of generation in WARC format - DateFormat dateFormat = new SimpleDateFormat("yyyyMMddHHmmss"); + DateFormat dateFormat = new SimpleDateFormat("yyyyMMddHHmmss", Locale.ROOT); String capDateString = dateFormat.format(new Date()); String recId = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Record-ID"); writeWARCMDRecord(recOut,md,targetURI,capDateString,recId); @@ -168,7 +168,7 @@ private void writeWARCMDRecord(OutputStream recOut, MetaData md, ByteArrayOutputStream bos = new ByteArrayOutputStream(); - OutputStreamWriter osw = new OutputStreamWriter(bos, UTF8); + OutputStreamWriter osw = new OutputStreamWriter(bos, UTF_8); try { md.write(osw); } catch (JSONException e1) { @@ -176,7 +176,7 @@ private void writeWARCMDRecord(OutputStream recOut, MetaData md, throw new IOException(e1); } osw.flush(); -// ByteArrayInputStream bais = new ByteArrayInputStream(md.toString().getBytes("UTF-8")); +// ByteArrayInputStream bais = new ByteArrayInputStream(md.toString().getBytes(UTF_8)); Date capDate; try { capDate = DateUtils.getSecondsSinceEpoch(capDateString); diff --git a/src/main/java/org/archive/format/arc/ARCConstants.java b/src/main/java/org/archive/format/arc/ARCConstants.java index 5987b49f..39dbf7ed 100755 --- a/src/main/java/org/archive/format/arc/ARCConstants.java +++ b/src/main/java/org/archive/format/arc/ARCConstants.java @@ -1,6 +1,7 @@ package org.archive.format.arc; import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.List; import java.util.zip.Deflater; @@ -16,7 +17,7 @@ */ public interface ARCConstants extends ArchiveFileConstants { public final static int MAX_META_LENGTH = 1024 * 32; - public final static Charset ARC_META_CHARSET = Charset.forName("utf-8"); + public final static Charset ARC_META_CHARSET = StandardCharsets.UTF_8; public final static int NEW_LINE_ORD = 10; public static final int CARRIAGE_RETURN_ORD = 13; public final static String DELIMITER = " "; diff --git a/src/main/java/org/archive/format/arc/FiledescRecordParser.java b/src/main/java/org/archive/format/arc/FiledescRecordParser.java index c2d7bb65..6a34eb5d 100644 --- a/src/main/java/org/archive/format/arc/FiledescRecordParser.java +++ b/src/main/java/org/archive/format/arc/FiledescRecordParser.java @@ -5,6 +5,7 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; +import java.nio.charset.StandardCharsets; public class FiledescRecordParser { public boolean strict = false; @@ -12,7 +13,7 @@ public FiledescRecord parse(InputStream is) throws IOException { FiledescRecord rec = new FiledescRecord(); try { // TODO: count input bytes read... - BufferedReader br = new BufferedReader(new InputStreamReader(is,"UTF-8")); + BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8)); String line = br.readLine(); parseLine1(rec,line); line = br.readLine(); diff --git a/src/main/java/org/archive/format/cdx/CDXFile.java b/src/main/java/org/archive/format/cdx/CDXFile.java index 7dca0464..612f7454 100644 --- a/src/main/java/org/archive/format/cdx/CDXFile.java +++ b/src/main/java/org/archive/format/cdx/CDXFile.java @@ -18,6 +18,8 @@ import org.archive.util.iterator.CloseableIterator; import org.archive.util.zip.OpenJDK7GZIPInputStream; +import static java.nio.charset.StandardCharsets.UTF_8; + public class CDXFile extends SortedTextFile implements CDXInputSource { public CDXFile(String uri) throws IOException { @@ -94,7 +96,7 @@ public static BufferedReader createStreamingLineReader(String uri, boolean gzipp input = new OpenJDK7GZIPInputStream(swis); } - BufferedReader reader = new BufferedReader(new InputStreamReader(input)); + BufferedReader reader = new BufferedReader(new InputStreamReader(input, UTF_8)); return reader; } diff --git a/src/main/java/org/archive/format/dns/DNSResponseParser.java b/src/main/java/org/archive/format/dns/DNSResponseParser.java index b5f81633..3e868ccf 100644 --- a/src/main/java/org/archive/format/dns/DNSResponseParser.java +++ b/src/main/java/org/archive/format/dns/DNSResponseParser.java @@ -5,6 +5,7 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; +import java.nio.charset.StandardCharsets; public class DNSResponseParser { @@ -28,7 +29,7 @@ public void parse(InputStream is, DNSResponse response) throws IOException, DNSP try { // TODO: should we wrap in a CountingInputStream and indicate // observed octet-length? - BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8")); + BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8)); String date = br.readLine().trim(); if(isDate(date)) { response.setDate(date); diff --git a/src/main/java/org/archive/format/gzip/GZIPMemberSeries.java b/src/main/java/org/archive/format/gzip/GZIPMemberSeries.java index d70bf394..154cf5f1 100644 --- a/src/main/java/org/archive/format/gzip/GZIPMemberSeries.java +++ b/src/main/java/org/archive/format/gzip/GZIPMemberSeries.java @@ -2,6 +2,7 @@ import java.io.IOException; import java.io.InputStream; +import java.util.Locale; import java.util.logging.Level; import java.util.logging.Logger; import java.util.zip.Inflater; @@ -227,7 +228,7 @@ public GZIPSeriesMember getNextMember() throws GZIPFormatException, IOException } if(LOG.isLoggable(Level.INFO)) { - LOG.info(String.format( + LOG.info(String.format(Locale.ROOT, "Got EOF after %d bytes before finding magic in %s\n", amtSkipped * -1, streamContext)); } @@ -237,7 +238,7 @@ public GZIPSeriesMember getNextMember() throws GZIPFormatException, IOException if(amtSkipped > 0) { if(strict) { if(state == STATE_START) { - LOG.info(String.format( + LOG.info(String.format(Locale.ROOT, "Strict mode Skipped %d bytes in (%s) before finding magic at offset(%d)\n", amtSkipped, streamContext, offset-3)); } else { @@ -248,7 +249,7 @@ public GZIPSeriesMember getNextMember() throws GZIPFormatException, IOException } if(LOG.isLoggable(Level.INFO)) { - LOG.info(String.format( + LOG.info(String.format(Locale.ROOT, "Skipped %d bytes in (%s) before finding magic at offset(%d)\n", amtSkipped, streamContext, offset-3)); } @@ -268,7 +269,7 @@ public GZIPSeriesMember getNextMember() throws GZIPFormatException, IOException } offset = currentMemberStartOffset + 3; stream.setOffset(currentMemberStartOffset + 3); - LOG.warning(String.format( + LOG.warning(String.format(Locale.ROOT, "GZIPFormatException with record around offset(%d) in (%s)\n", offset, streamContext)); } diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java index a3d34a4b..0a3fa1bf 100644 --- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java +++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumCluster.java @@ -11,8 +11,9 @@ */ import java.io.BufferedReader; -import java.io.FileReader; +import java.io.FileInputStream; import java.io.IOException; +import java.io.InputStreamReader; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; @@ -35,6 +36,8 @@ import org.archive.util.binsearch.impl.HTTPSeekableLineReader; import org.archive.util.iterator.CloseableIterator; +import static java.nio.charset.StandardCharsets.UTF_8; + public class ZipNumCluster extends ZipNumIndex { final static Logger LOGGER = Logger.getLogger(ZipNumCluster.class.getName()); @@ -367,7 +370,7 @@ protected void loadLastBlockSizes(String filename) totalAdjustment = 0; try { - reader = new BufferedReader(new FileReader(filename)); + reader = new BufferedReader(new InputStreamReader(new FileInputStream(filename), UTF_8)); while ((line = reader.readLine()) != null) { String[] splits = line.split("\t"); diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumWriter.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumWriter.java index a104244a..c0e4e01d 100644 --- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumWriter.java +++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumWriter.java @@ -3,18 +3,18 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.OutputStream; -import java.nio.charset.Charset; import org.archive.format.gzip.GZIPMemberWriter; import org.archive.format.gzip.GZIPMemberWriterCommittedOutputStream; +import static java.nio.charset.StandardCharsets.UTF_8; + public class ZipNumWriter extends GZIPMemberWriterCommittedOutputStream { int limit; int count; OutputStream manifestOut; ByteArrayOutputStream manifestBuffer; char delimiter = '\t'; - private static final Charset UTF8 = Charset.forName("utf-8"); public ZipNumWriter(OutputStream main, OutputStream manifest, int limit) { super(new GZIPMemberWriter(main)); manifestOut = manifest; @@ -51,7 +51,7 @@ private void finishCurrent() throws IOException { sb.append(delimiter); sb.append(len); sb.append(delimiter); - manifestOut.write(sb.toString().getBytes(UTF8)); + manifestOut.write(sb.toString().getBytes(UTF_8)); manifestBuffer.writeTo(manifestOut); manifestOut.flush(); count = 0; diff --git a/src/main/java/org/archive/format/http/DumpingHTTPParseObserver.java b/src/main/java/org/archive/format/http/DumpingHTTPParseObserver.java index ed5dfcb2..f1ac16c6 100755 --- a/src/main/java/org/archive/format/http/DumpingHTTPParseObserver.java +++ b/src/main/java/org/archive/format/http/DumpingHTTPParseObserver.java @@ -1,10 +1,10 @@ package org.archive.format.http; import java.io.PrintStream; -import java.nio.charset.Charset; +import java.util.Locale; + public class DumpingHTTPParseObserver implements HttpHeaderObserver { - private static final Charset UTF8 = Charset.forName("UTF-8"); private PrintStream ps = null; public DumpingHTTPParseObserver() { ps = System.out; @@ -15,13 +15,13 @@ public DumpingHTTPParseObserver(PrintStream ps) { public void headerParsed(byte[] name, int ns, int nl, byte[] value, int vs, int vl) { - ps.format("headerParsed:(%d:%d)(%s)(%d:%d)(%s)\n", + ps.format(Locale.ROOT,"headerParsed:(%d:%d)(%s)(%d:%d)(%s)\n", ns,nl,new String(name,0,nl,UTF8), vs,vl,new String(value,0,vl,UTF8)); } public void headersComplete(int bytesRead) { - ps.format("headersComplete(%d)\n",bytesRead); + ps.format(Locale.ROOT,"headersComplete(%d)\n",bytesRead); } public void headersCorrupt() { ps.println("headersCorrupted\n"); diff --git a/src/main/java/org/archive/format/http/HttpConstants.java b/src/main/java/org/archive/format/http/HttpConstants.java index fa0a7e10..8ae4d4db 100755 --- a/src/main/java/org/archive/format/http/HttpConstants.java +++ b/src/main/java/org/archive/format/http/HttpConstants.java @@ -1,9 +1,10 @@ package org.archive.format.http; import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; public interface HttpConstants { - public static final Charset UTF8 = Charset.forName("UTF-8"); + public static final Charset UTF8 = StandardCharsets.UTF_8; public static final byte CR = 13; public static final byte LF = 10; public static final byte SP = 32; diff --git a/src/main/java/org/archive/format/http/HttpHeader.java b/src/main/java/org/archive/format/http/HttpHeader.java index 57b70e1f..9ebe860f 100755 --- a/src/main/java/org/archive/format/http/HttpHeader.java +++ b/src/main/java/org/archive/format/http/HttpHeader.java @@ -2,6 +2,7 @@ import java.io.IOException; import java.io.OutputStream; +import java.util.Locale; public class HttpHeader implements HttpConstants { private String name = null; @@ -27,7 +28,7 @@ public void write(OutputStream out) throws IOException { public String toString() { StringBuilder sb = new StringBuilder(name.length() + value.length()+20); - sb.append(String.format("HttpHeader(%s)(%s)",name,value)); + sb.append(String.format(Locale.ROOT, "HttpHeader(%s)(%s)",name,value)); return sb.toString(); } } diff --git a/src/main/java/org/archive/format/http/HttpHeaderParser.java b/src/main/java/org/archive/format/http/HttpHeaderParser.java index bee3c28b..ddbb6e47 100755 --- a/src/main/java/org/archive/format/http/HttpHeaderParser.java +++ b/src/main/java/org/archive/format/http/HttpHeaderParser.java @@ -2,6 +2,7 @@ import java.io.IOException; import java.io.InputStream; +import java.nio.charset.StandardCharsets; public class HttpHeaderParser implements HttpConstants { private static final int DEFAULT_MAX_NAME_LENGTH = 1024 * 100; @@ -288,7 +289,8 @@ public ParseState handleByte(byte b, HttpHeaderParser parser) return parser.postColonState; } if(parser.isStrict) { - throw new HttpParseException("Illegal char after name("+new String(name,0,nameLength)+")"); + throw new HttpParseException("Illegal char after name(" + + new String(name, 0, nameLength, StandardCharsets.ISO_8859_1) + ")"); } parser.headersCorrupted(); return parser.laxLineEatParseState; diff --git a/src/main/java/org/archive/format/http/HttpHeaders.java b/src/main/java/org/archive/format/http/HttpHeaders.java index ed8061d7..a65dd8fb 100755 --- a/src/main/java/org/archive/format/http/HttpHeaders.java +++ b/src/main/java/org/archive/format/http/HttpHeaders.java @@ -4,6 +4,7 @@ import java.io.OutputStream; import java.util.ArrayList; import java.util.Date; +import java.util.Locale; import java.util.logging.Logger; import org.archive.util.ByteOp; @@ -54,9 +55,9 @@ public String getValue(String name) { } public String getValueCaseInsensitive(String name) { - String lc = name.toLowerCase(); + String lc = name.toLowerCase(Locale.ROOT); for(HttpHeader h : this) { - if(h.getName().toLowerCase().equals(lc)) { + if(h.getName().toLowerCase(Locale.ROOT).equals(lc)) { return h.getValue(); } } diff --git a/src/main/java/org/archive/format/http/HttpMessageParser.java b/src/main/java/org/archive/format/http/HttpMessageParser.java index c4fcdf92..24e59e03 100644 --- a/src/main/java/org/archive/format/http/HttpMessageParser.java +++ b/src/main/java/org/archive/format/http/HttpMessageParser.java @@ -1,5 +1,6 @@ package org.archive.format.http; +import java.util.Locale; public class HttpMessageParser implements HttpConstants { @@ -22,11 +23,11 @@ protected int parseVersionLax(byte buf[], int start, int len) throws HttpParseException { String v = new String(buf,start,len,UTF8); - if(v.toLowerCase().compareTo(VERSION_0_STATUS.toLowerCase()) == 0) { + if(v.toLowerCase(Locale.ROOT).compareTo(VERSION_0_STATUS.toLowerCase(Locale.ROOT)) == 0) { return VERSION_0; - } else if(v.toLowerCase().compareTo(VERSION_1_STATUS.toLowerCase()) == 0) { + } else if(v.toLowerCase(Locale.ROOT).compareTo(VERSION_1_STATUS.toLowerCase(Locale.ROOT)) == 0) { return VERSION_1; - } else if(v.toLowerCase().compareTo(VERSION_9_STATUS.toLowerCase()) == 0) { + } else if(v.toLowerCase(Locale.ROOT).compareTo(VERSION_9_STATUS.toLowerCase(Locale.ROOT)) == 0) { return VERSION_9; } return VERSION_0; diff --git a/src/main/java/org/archive/format/http/HttpRequestMessageParser.java b/src/main/java/org/archive/format/http/HttpRequestMessageParser.java index f7bc43c7..759bbe5d 100644 --- a/src/main/java/org/archive/format/http/HttpRequestMessageParser.java +++ b/src/main/java/org/archive/format/http/HttpRequestMessageParser.java @@ -2,6 +2,7 @@ import java.io.IOException; import java.io.InputStream; +import java.util.Locale; public class HttpRequestMessageParser extends HttpMessageParser { public int maxBytes = 1024 * 1024; @@ -223,7 +224,7 @@ protected int parseMethodStrict(byte buf[], int start, int len) protected int parseMethodLax(byte buf[], int start, int len) throws HttpParseException { - String v = new String(buf,start,len,UTF8).toUpperCase(); + String v = new String(buf,start,len,UTF8).toUpperCase(Locale.ROOT); if(v.compareTo(METHOD_GET_STRING) == 0) { return METHOD_GET; } else if(v.compareTo(METHOD_HEAD_STRING) == 0) { diff --git a/src/main/java/org/archive/format/http/HttpResponseMessage.java b/src/main/java/org/archive/format/http/HttpResponseMessage.java index 0cb7b7e5..6d3f5c35 100755 --- a/src/main/java/org/archive/format/http/HttpResponseMessage.java +++ b/src/main/java/org/archive/format/http/HttpResponseMessage.java @@ -1,5 +1,7 @@ package org.archive.format.http; +import java.util.Locale; + public class HttpResponseMessage extends HttpMessage implements HttpResponseMessageObserver { private int status = 0; private String reason = null; @@ -20,10 +22,10 @@ public String getReason() { return reason; } public String toString() { - return String.format("%s %d %s%s", getVersionString(), status, reason, CRLF); + return String.format(Locale.ROOT, "%s %d %s%s", getVersionString(), status, reason, CRLF); } public String toDebugString() { - return String.format("Message(%d):(%s) (%d) (%s)\n", + return String.format(Locale.ROOT, "Message(%d):(%s) (%d) (%s)\n", reason.length(),getVersionString(),status,reason,CRLF); } diff --git a/src/main/java/org/archive/format/http/HttpResponseMessageParser.java b/src/main/java/org/archive/format/http/HttpResponseMessageParser.java index 3aee7c48..4ddef2ad 100755 --- a/src/main/java/org/archive/format/http/HttpResponseMessageParser.java +++ b/src/main/java/org/archive/format/http/HttpResponseMessageParser.java @@ -2,6 +2,7 @@ import java.io.IOException; import java.io.InputStream; +import java.nio.charset.StandardCharsets; public class HttpResponseMessageParser extends HttpMessageParser { public int maxBytes = 1024 * 128; @@ -97,7 +98,7 @@ public int parseStrict(byte buf[], int len, HttpResponseMessageObserver obs) version = parseVersionStrict(buf, vs, vl); status = parseStatusStrict(buf,ss,sl); - reason = new String(buf,idx+1,(len - idx)-1); + reason = new String(buf,idx+1,(len - idx)-1,StandardCharsets.ISO_8859_1); obs.messageParsed(version, status, reason, len); @@ -155,7 +156,7 @@ private int parseLax(byte buf[], int len, HttpResponseMessageObserver obs) idx++; int reasonLen = bufferEnd - idx; if(reasonLen > 0) { - reason = new String(buf,idx,reasonLen); + reason = new String(buf,idx,reasonLen,StandardCharsets.ISO_8859_1); } } else { // missed some: diff --git a/src/main/java/org/archive/format/json/CrossProductOfLists.java b/src/main/java/org/archive/format/json/CrossProductOfLists.java index f9e2abd2..69cdae33 100644 --- a/src/main/java/org/archive/format/json/CrossProductOfLists.java +++ b/src/main/java/org/archive/format/json/CrossProductOfLists.java @@ -4,6 +4,7 @@ import java.util.ArrayList; import java.util.Deque; import java.util.List; +import java.util.Locale; import java.util.Stack; import java.util.logging.Level; import java.util.logging.Logger; @@ -18,12 +19,12 @@ public List> crossProduct(List>> listOfLists) { if(LOG.isLoggable(Level.INFO)) { int count = listOfLists.size(); - LOG.info(String.format("Total of (%d) lists to cross product",count)); + LOG.info(String.format(Locale.ROOT, "Total of (%d) lists to cross product",count)); for(int i = 0; i < count; i++) { - LOG.info(String.format("Field (%d) is (%d) deep",i,listOfLists.get(i).size())); + LOG.info(String.format(Locale.ROOT, "Field (%d) is (%d) deep",i,listOfLists.get(i).size())); for(List inner : listOfLists.get(i)) { LOG.info( - String.format("----(%d):(%s)" + String.format(Locale.ROOT, "----(%d):(%s)" ,i,StringUtils.join(inner.toArray(),",") ) ); } } diff --git a/src/main/java/org/archive/format/json/JSONView.java b/src/main/java/org/archive/format/json/JSONView.java index 7a984ebe..444ea7e6 100644 --- a/src/main/java/org/archive/format/json/JSONView.java +++ b/src/main/java/org/archive/format/json/JSONView.java @@ -2,6 +2,7 @@ import java.util.ArrayList; import java.util.List; +import java.util.Locale; import java.util.logging.Level; import java.util.logging.Logger; @@ -28,7 +29,7 @@ public class JSONView { public JSONView(String... pathSpecs) { this.pathSpecs = new ArrayList(pathSpecs.length); if(LOG.isLoggable(Level.INFO)) { - LOG.info(String.format("Creating JSONView with(%s)", + LOG.info(String.format(Locale.ROOT, "Creating JSONView with(%s)", StringUtils.join(pathSpecs,","))); } for(String pathSpec : pathSpecs) { diff --git a/src/main/java/org/archive/format/text/charset/CharsetDetector.java b/src/main/java/org/archive/format/text/charset/CharsetDetector.java index 214fde07..08aac469 100644 --- a/src/main/java/org/archive/format/text/charset/CharsetDetector.java +++ b/src/main/java/org/archive/format/text/charset/CharsetDetector.java @@ -22,6 +22,8 @@ import java.io.IOException; import java.nio.charset.Charset; import java.nio.charset.IllegalCharsetNameException; +import java.nio.charset.StandardCharsets; +import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -91,7 +93,7 @@ public abstract class CharsetDetector { // ...and if the chardet library fails, use the Content-Type header protected final static String HTTP_CONTENT_TYPE_HEADER = "CONTENT-TYPE"; /** the default charset name to use when giving up */ - public final static String DEFAULT_CHARSET = "UTF-8"; + public final static String DEFAULT_CHARSET = StandardCharsets.UTF_8.name(); protected boolean isCharsetSupported(String charsetName) { // can you believe that this throws a runtime? Just asking if it's @@ -106,7 +108,7 @@ protected boolean isCharsetSupported(String charsetName) { } } protected String mapCharset(String orig) { - String lc = orig.toLowerCase(); + String lc = orig.toLowerCase(Locale.ROOT); if(lc.contains("iso8859-1") || lc.contains("iso-8859-1")) { return "cp1252"; } @@ -114,7 +116,7 @@ protected String mapCharset(String orig) { } protected String contentTypeToCharset(final String contentType) { int offset = - contentType.toUpperCase().indexOf(CHARSET_TOKEN.toUpperCase()); + contentType.toUpperCase(Locale.ROOT).indexOf(CHARSET_TOKEN.toUpperCase(Locale.ROOT)); if (offset != -1) { String cs = contentType.substring(offset + CHARSET_TOKEN.length()); @@ -148,7 +150,7 @@ protected String getCharsetFromHeaders(HttpHeaders headers) return null; } for(HttpHeader header : headers) { - if(header.getName().toUpperCase().trim().equals( + if(header.getName().toUpperCase(Locale.ROOT).trim().equals( HTTP_CONTENT_TYPE_HEADER)) { return contentTypeToCharset(header.getValue()); } diff --git a/src/main/java/org/archive/format/text/html/NodeUtils.java b/src/main/java/org/archive/format/text/html/NodeUtils.java index 625d9099..f231b91a 100644 --- a/src/main/java/org/archive/format/text/html/NodeUtils.java +++ b/src/main/java/org/archive/format/text/html/NodeUtils.java @@ -19,6 +19,8 @@ */ package org.archive.format.text.html; +import java.util.Locale; + import org.htmlparser.Node; import org.htmlparser.nodes.RemarkNode; import org.htmlparser.nodes.TagNode; @@ -41,7 +43,7 @@ public static boolean isTagNodeNamed(Node node, String name) { if(isTagNode(node)) { TagNode tagNode = (TagNode) node; String nodeName = tagNode.getTagName(); - return nodeName.equals(name.toUpperCase()); + return nodeName.equals(name.toUpperCase(Locale.ROOT)); } return false; } @@ -50,7 +52,7 @@ public static boolean isOpenTagNodeNamed(Node node, String name) { TagNode tagNode = (TagNode) node; if(!tagNode.isEndTag()) { String nodeName = tagNode.getTagName(); - return nodeName.equals(name.toUpperCase()); + return nodeName.equals(name.toUpperCase(Locale.ROOT)); } } return false; @@ -60,7 +62,7 @@ public static boolean isNonEmptyOpenTagNodeNamed(Node node, String name) { TagNode tagNode = (TagNode) node; if(!tagNode.isEndTag() && !tagNode.isEmptyXmlTag()) { String nodeName = tagNode.getTagName(); - return nodeName.equals(name.toUpperCase()); + return nodeName.equals(name.toUpperCase(Locale.ROOT)); } } return false; @@ -70,7 +72,7 @@ public static boolean isCloseTagNodeNamed(Node node, String name) { TagNode tagNode = (TagNode) node; if(tagNode.isEndTag()) { String nodeName = tagNode.getTagName(); - return nodeName.equals(name.toUpperCase()); + return nodeName.equals(name.toUpperCase(Locale.ROOT)); } } return false; diff --git a/src/main/java/org/archive/format/warc/WARCConstants.java b/src/main/java/org/archive/format/warc/WARCConstants.java index 72dad45a..a6bdb3f4 100644 --- a/src/main/java/org/archive/format/warc/WARCConstants.java +++ b/src/main/java/org/archive/format/warc/WARCConstants.java @@ -19,6 +19,8 @@ package org.archive.format.warc; +import java.nio.charset.StandardCharsets; + import org.archive.format.ArchiveFileConstants; /** @@ -93,7 +95,7 @@ public interface WARCConstants extends ArchiveFileConstants { * till we figure it, DEFAULT_ENCODING is single-byte charset -- same as * ARCs. */ - public static final String DEFAULT_ENCODING = "UTF-8"; + public static final String DEFAULT_ENCODING = StandardCharsets.UTF_8.name(); public static final String HEADER_LINE_ENCODING = DEFAULT_ENCODING; // TODO: Revisit. 8859 isn't correct, especially if we settle on RFC822 diff --git a/src/main/java/org/archive/hadoop/ArchiveMetadataLoader.java b/src/main/java/org/archive/hadoop/ArchiveMetadataLoader.java index 37c8af99..a3cbb26c 100644 --- a/src/main/java/org/archive/hadoop/ArchiveMetadataLoader.java +++ b/src/main/java/org/archive/hadoop/ArchiveMetadataLoader.java @@ -2,6 +2,7 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.Locale; import java.util.logging.Logger; import org.apache.hadoop.mapreduce.InputFormat; @@ -54,7 +55,7 @@ public Tuple getNext() throws IOException { try { key = reader.getCurrentKey(); - LOG.info(String.format("Loaded key-offset %d\n", key.offset)); + LOG.info(String.format(Locale.ROOT, "Loaded key-offset %d\n", key.offset)); value = reader.getCurrentValue(); } catch (InterruptedException e) { // is this needed and the right way? diff --git a/src/main/java/org/archive/hadoop/FilenameInputFormat.java b/src/main/java/org/archive/hadoop/FilenameInputFormat.java index 5893afb1..3f41cdee 100644 --- a/src/main/java/org/archive/hadoop/FilenameInputFormat.java +++ b/src/main/java/org/archive/hadoop/FilenameInputFormat.java @@ -17,7 +17,6 @@ package org.archive.hadoop; import java.io.*; -import java.util.*; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; diff --git a/src/main/java/org/archive/hadoop/PerMapOutputFormat.java b/src/main/java/org/archive/hadoop/PerMapOutputFormat.java index 28ebca73..684202bb 100644 --- a/src/main/java/org/archive/hadoop/PerMapOutputFormat.java +++ b/src/main/java/org/archive/hadoop/PerMapOutputFormat.java @@ -17,7 +17,6 @@ package org.archive.hadoop; import java.io.*; -import java.util.*; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; diff --git a/src/main/java/org/archive/hadoop/ResourceRecordReader.java b/src/main/java/org/archive/hadoop/ResourceRecordReader.java index 06d3ce2e..88b93dd2 100644 --- a/src/main/java/org/archive/hadoop/ResourceRecordReader.java +++ b/src/main/java/org/archive/hadoop/ResourceRecordReader.java @@ -1,6 +1,7 @@ package org.archive.hadoop; import java.io.IOException; +import java.util.Locale; import java.util.logging.Logger; import org.apache.hadoop.fs.FSDataInputStream; @@ -111,7 +112,7 @@ public boolean nextKeyValue() throws IOException, InterruptedException { if(r != null) { StreamCopy.readToEOF(r.getInputStream()); - LOG.info(String.format("Extracted offset %d\n", + LOG.info(String.format(Locale.ROOT, "Extracted offset %d\n", series.getCurrentMemberStartOffset())); cachedK = new ResourceContext(name, series.getCurrentMemberStartOffset()); @@ -121,7 +122,7 @@ public boolean nextKeyValue() throws IOException, InterruptedException { } catch (ResourceParseException e) { e.printStackTrace(); throw new IOException( - String.format("ResourceParseException at(%s)(%d)", + String.format(Locale.ROOT, "ResourceParseException at(%s)(%d)", name,series.getCurrentMemberStartOffset()), e); } diff --git a/src/main/java/org/archive/io/ArchiveReader.java b/src/main/java/org/archive/io/ArchiveReader.java index 449cdc24..070455a5 100644 --- a/src/main/java/org/archive/io/ArchiveReader.java +++ b/src/main/java/org/archive/io/ArchiveReader.java @@ -26,12 +26,14 @@ import java.io.EOFException; import java.io.File; import java.io.FileInputStream; -import java.io.FileWriter; +import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; +import java.io.OutputStreamWriter; import java.util.ArrayList; import java.util.Iterator; import java.util.List; +import java.util.Locale; import java.util.logging.Level; import java.util.logging.Logger; @@ -44,6 +46,8 @@ import static org.archive.format.ArchiveFileConstants.*; +import static java.nio.charset.StandardCharsets.UTF_8; + /** * Reader for an Archive file of Archive {@link ArchiveRecord}s. @@ -615,7 +619,7 @@ protected static boolean getTrueOrFalse(final String value) { if (value == null || value.length() <= 0) { return false; } - return Boolean.TRUE.toString().equals(value.toLowerCase()); + return Boolean.TRUE.toString().equals(value.toLowerCase(Locale.ROOT)); } /** @@ -659,7 +663,7 @@ protected void cdxOutput(boolean toFile) DOT_COMPRESSED_FILE_EXTENSION); cdxFilename = stripExtension(cdxFilename, getDotFileExtension()); cdxFilename += ('.' + CDX); - cdxWriter = new BufferedWriter(new FileWriter(cdxFilename)); + cdxWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(cdxFilename), UTF_8)); } String header = "CDX b e a m s c " + ((isCompressed()) ? "V" : "v") @@ -757,4 +761,4 @@ protected static Options getOptions() { "'or 'nohead'. Default: 'cdx'.")); return options; } -} \ No newline at end of file +} diff --git a/src/main/java/org/archive/io/ArchiveReaderFactory.java b/src/main/java/org/archive/io/ArchiveReaderFactory.java index bc316893..fe72236b 100644 --- a/src/main/java/org/archive/io/ArchiveReaderFactory.java +++ b/src/main/java/org/archive/io/ArchiveReaderFactory.java @@ -25,6 +25,7 @@ import java.net.MalformedURLException; import java.net.URL; import java.net.URLConnection; +import java.util.Locale; import org.archive.io.arc.ARCReaderFactory; import org.archive.io.warc.WARCReaderFactory; @@ -296,7 +297,7 @@ protected void addUserAgent(final HttpURLConnection connection) { * @throws IOException */ protected boolean isCompressed(final File f) throws IOException { - return f.getName().toLowerCase(). + return f.getName().toLowerCase(Locale.ROOT). endsWith(DOT_COMPRESSED_FILE_EXTENSION); } -} \ No newline at end of file +} diff --git a/src/main/java/org/archive/io/ArchiveRecord.java b/src/main/java/org/archive/io/ArchiveRecord.java index 4bd1fa02..01e8d5ec 100644 --- a/src/main/java/org/archive/io/ArchiveRecord.java +++ b/src/main/java/org/archive/io/ArchiveRecord.java @@ -23,6 +23,7 @@ import java.io.OutputStream; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; +import java.util.Locale; import java.util.logging.Level; import org.archive.format.ArchiveFileConstants; @@ -393,7 +394,7 @@ public boolean hasContentHeaders() { return false; } - if (!url.toLowerCase().startsWith("http")) { + if (!url.toLowerCase(Locale.ROOT).startsWith("http")) { return false; } diff --git a/src/main/java/org/archive/io/CompositeFileReader.java b/src/main/java/org/archive/io/CompositeFileReader.java index 14b56219..6e331565 100644 --- a/src/main/java/org/archive/io/CompositeFileReader.java +++ b/src/main/java/org/archive/io/CompositeFileReader.java @@ -23,6 +23,8 @@ import java.io.InputStreamReader; import java.util.List; +import static java.nio.charset.StandardCharsets.UTF_8; + /** * @author gojomo @@ -34,7 +36,7 @@ public class CompositeFileReader extends InputStreamReader { * @throws IOException */ public CompositeFileReader(List filenames) throws IOException { - super(new CompositeFileInputStream(filenames)); + super(new CompositeFileInputStream(filenames), UTF_8); } } diff --git a/src/main/java/org/archive/io/GenericReplayCharSequence.java b/src/main/java/org/archive/io/GenericReplayCharSequence.java index c427550b..ff96717c 100644 --- a/src/main/java/org/archive/io/GenericReplayCharSequence.java +++ b/src/main/java/org/archive/io/GenericReplayCharSequence.java @@ -33,14 +33,15 @@ import java.nio.channels.FileChannel; import java.nio.charset.CharacterCodingException; import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.text.NumberFormat; +import java.util.Locale; import java.util.logging.Level; import java.util.logging.Logger; import org.apache.commons.io.IOUtils; import org.archive.util.DevUtils; -import com.google.common.base.Charsets; import com.google.common.primitives.Ints; /** @@ -67,7 +68,7 @@ public class GenericReplayCharSequence implements ReplayCharSequence { * *

See Encoding. */ - public static final Charset WRITE_ENCODING = Charsets.UTF_16BE; + public static final Charset WRITE_ENCODING = StandardCharsets.UTF_16BE; private static final long MAP_MAX_BYTES = 64 * 1024 * 1024; // 64M @@ -168,8 +169,8 @@ private void updateMemoryMappedBuffer() { long charLength = (long) this.length() - (long) prefixBuffer.limit(); // in characters long mapSize = Math.min((charLength * bytesPerChar) - mapByteOffset, MAP_MAX_BYTES); logger.fine("updateMemoryMappedBuffer: mapOffset=" - + NumberFormat.getInstance().format(mapByteOffset) - + " mapSize=" + NumberFormat.getInstance().format(mapSize)); + + NumberFormat.getInstance(Locale.ROOT).format(mapByteOffset) + + " mapSize=" + NumberFormat.getInstance(Locale.ROOT).format(mapSize)); try { // TODO: stress-test without these possibly-costly requests! // System.gc(); @@ -255,9 +256,9 @@ protected void decode(InputStream inStream, int prefixMax, this.length = Ints.saturatedCast(count); if(count>Integer.MAX_VALUE) { logger.warning("input stream is longer than Integer.MAX_VALUE=" - + NumberFormat.getInstance().format(Integer.MAX_VALUE) + + NumberFormat.getInstance(Locale.ROOT).format(Integer.MAX_VALUE) + " characters -- only first " - + NumberFormat.getInstance().format(Integer.MAX_VALUE) + + NumberFormat.getInstance(Locale.ROOT).format(Integer.MAX_VALUE) + " are accessible through this GenericReplayCharSequence"); } diff --git a/src/main/java/org/archive/io/HeaderedArchiveRecord.java b/src/main/java/org/archive/io/HeaderedArchiveRecord.java index 809a9e54..858edb4d 100644 --- a/src/main/java/org/archive/io/HeaderedArchiveRecord.java +++ b/src/main/java/org/archive/io/HeaderedArchiveRecord.java @@ -25,6 +25,8 @@ import java.io.InputStream; import java.io.OutputStream; import java.io.PrintStream; +import java.nio.charset.StandardCharsets; +import java.util.Locale; import org.archive.format.http.HttpHeader; import org.archive.format.arc.ARCConstants; @@ -144,20 +146,17 @@ private InputStream readContentHeaders() throws IOException { int eolCharCount = getEolCharsCount(statusBytes); if (eolCharCount <= 0) { throw new IOException("Failed to read raw lie where one " + - " was expected: " + new String(statusBytes)); + " was expected: " + new String(statusBytes, ARCConstants.DEFAULT_ENCODING)); } String statusLine = new String(statusBytes, 0, statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); - if (statusLine == null) { - throw new NullPointerException("Expected status line is null"); - } statusLine = statusLine.trim(); // TODO: Tighten up this test. boolean isHttpResponse = statusLine.startsWith("HTTP"); boolean isHttpRequest = false; if (!isHttpResponse) { - isHttpRequest = statusLine.toUpperCase().startsWith("GET") || - !statusLine.toUpperCase().startsWith("POST"); + isHttpRequest = statusLine.toUpperCase(Locale.ROOT).startsWith("GET") || + !statusLine.toUpperCase(Locale.ROOT).startsWith("POST"); } if (!isHttpResponse && !isHttpRequest) { throw new UnexpectedStartLineIOException("Failed parse of " + @@ -185,7 +184,7 @@ private InputStream readContentHeaders() throws IOException { eolCharCount = getEolCharsCount(lineBytes); if (eolCharCount <= 0) { throw new IOException("Failed reading headers: " + - ((lineBytes != null)? new String(lineBytes): null)); + ((lineBytes != null)? new String(lineBytes, StandardCharsets.ISO_8859_1): null)); } // Save the bytes read. baos.write(lineBytes); diff --git a/src/main/java/org/archive/io/ReplayCharSequence.java b/src/main/java/org/archive/io/ReplayCharSequence.java index e456e293..bd74f2f8 100644 --- a/src/main/java/org/archive/io/ReplayCharSequence.java +++ b/src/main/java/org/archive/io/ReplayCharSequence.java @@ -23,8 +23,7 @@ import java.io.IOException; import java.nio.charset.CharacterCodingException; import java.nio.charset.Charset; - -import com.google.common.base.Charsets; +import java.nio.charset.StandardCharsets; /** @@ -40,7 +39,7 @@ public interface ReplayCharSequence extends CharSequence, Closeable { /** charset to use in replay when declared value * is absent/illegal/unavailable */ - public Charset FALLBACK_CHARSET = Charsets.ISO_8859_1; // TODO: should this be UTF-8? + public Charset FALLBACK_CHARSET = StandardCharsets.ISO_8859_1; // TODO: should this be UTF-8? /** * Call this method when done so implementation has chance to clean up diff --git a/src/main/java/org/archive/io/UTF8Bytes.java b/src/main/java/org/archive/io/UTF8Bytes.java index c280b08d..4dc0144b 100644 --- a/src/main/java/org/archive/io/UTF8Bytes.java +++ b/src/main/java/org/archive/io/UTF8Bytes.java @@ -19,6 +19,7 @@ package org.archive.io; import java.io.UnsupportedEncodingException; +import java.nio.charset.StandardCharsets; /** * Marker Interface for instances that can be serialized as UTF8 bytes. @@ -27,7 +28,7 @@ * @version $Date$ $Version$ */ public interface UTF8Bytes { - public static final String UTF8 = "UTF-8"; + public static final String UTF8 = StandardCharsets.UTF_8.name(); /** * @return Instance as UTF-8 bytes. diff --git a/src/main/java/org/archive/io/WriterPoolMember.java b/src/main/java/org/archive/io/WriterPoolMember.java index a488354a..5d350534 100644 --- a/src/main/java/org/archive/io/WriterPoolMember.java +++ b/src/main/java/org/archive/io/WriterPoolMember.java @@ -25,10 +25,13 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import java.nio.charset.StandardCharsets; import java.text.DecimalFormat; +import java.text.DecimalFormatSymbols; import java.text.NumberFormat; import java.util.Iterator; import java.util.List; +import java.util.Locale; import java.util.Properties; import java.util.concurrent.atomic.AtomicInteger; import java.util.logging.Logger; @@ -52,7 +55,7 @@ public abstract class WriterPoolMember { private final Logger logger = Logger.getLogger(this.getClass().getName()); - public static final String UTF8 = "UTF-8"; + public static final String UTF8 = StandardCharsets.UTF_8.name(); /** * Default archival-aggregate filename template. @@ -103,12 +106,17 @@ public abstract class WriterPoolMember { */ protected static int roundRobinIndex = 0; + /** + * Symbol set for serial number formatter. + */ + protected static DecimalFormatSymbols serialNoFormatterSymbols = new DecimalFormatSymbols(Locale.ROOT); + /** * NumberFormat instance for formatting serial number. * * Pads serial number with zeros. */ - protected static NumberFormat serialNoFormatter = new DecimalFormat("00000"); + protected static NumberFormat serialNoFormatter = new DecimalFormat("00000", serialNoFormatterSymbols); /** diff --git a/src/main/java/org/archive/io/arc/ARC2WCDX.java b/src/main/java/org/archive/io/arc/ARC2WCDX.java index f0515694..aec571e9 100644 --- a/src/main/java/org/archive/io/arc/ARC2WCDX.java +++ b/src/main/java/org/archive/io/arc/ARC2WCDX.java @@ -32,6 +32,8 @@ import org.archive.util.ArchiveUtils; import org.archive.util.SURT; +import static java.nio.charset.StandardCharsets.UTF_8; + /** * Create a 'Wide' CDX from an ARC. Takes one argument, the path to the ARC. * Writes .wcdx.gz in same directory. @@ -61,7 +63,7 @@ public static Object[] createWcdx(ARCReader reader) { PrintStream writer = null; long count = 0; try { - writer = new PrintStream(new GZIPOutputStream(new FileOutputStream(wcdxFile))); + writer = new PrintStream(new GZIPOutputStream(new FileOutputStream(wcdxFile)), false, UTF_8.name()); // write header: legend + timestamp StringBuilder legend = new StringBuilder(); diff --git a/src/main/java/org/archive/io/arc/ARCReader.java b/src/main/java/org/archive/io/arc/ARCReader.java index c9a88415..f8935e79 100644 --- a/src/main/java/org/archive/io/arc/ARCReader.java +++ b/src/main/java/org/archive/io/arc/ARCReader.java @@ -27,6 +27,7 @@ import java.util.ArrayList; import java.util.Iterator; import java.util.List; +import java.util.Locale; import java.util.concurrent.atomic.AtomicInteger; import java.util.logging.Logger; @@ -447,7 +448,6 @@ public static void createCDXIndexFile(String urlOrPath) * @throws IOException * @throws java.text.ParseException */ - @SuppressWarnings("unchecked") public static void main(String [] args) throws ParseException, IOException, java.text.ParseException { Options options = getOptions(); @@ -493,7 +493,7 @@ public static void main(String [] args) break; case 'f': - format = cmdlineOptions[i].getValue().toLowerCase(); + format = cmdlineOptions[i].getValue().toLowerCase(Locale.ROOT); boolean match = false; // List of supported formats. final String [] supportedFormats = diff --git a/src/main/java/org/archive/io/arc/ARCReaderFactory.java b/src/main/java/org/archive/io/arc/ARCReaderFactory.java index d2f10842..bbcc8b6f 100644 --- a/src/main/java/org/archive/io/arc/ARCReaderFactory.java +++ b/src/main/java/org/archive/io/arc/ARCReaderFactory.java @@ -27,6 +27,7 @@ import java.net.MalformedURLException; import java.net.URL; import java.util.Iterator; +import java.util.Locale; import java.util.logging.Level; import org.archive.io.ArchiveReader; @@ -230,7 +231,7 @@ public static boolean testCompressedARCFile(File arcFile, throws IOException { boolean compressedARCFile = false; FileUtils.assertReadable(arcFile); - if(!skipSuffixCheck && !arcFile.getName().toLowerCase() + if(!skipSuffixCheck && !arcFile.getName().toLowerCase(Locale.ROOT) .endsWith(COMPRESSED_ARC_FILE_EXTENSION)) { return compressedARCFile; } @@ -247,9 +248,9 @@ public static boolean testCompressedARCFile(File arcFile, public static boolean isARCSuffix(final String arcName) { return (arcName == null)? false: - (arcName.toLowerCase().endsWith(DOT_COMPRESSED_ARC_FILE_EXTENSION))? + (arcName.toLowerCase(Locale.ROOT).endsWith(DOT_COMPRESSED_ARC_FILE_EXTENSION))? true: - (arcName.toLowerCase().endsWith(DOT_ARC_FILE_EXTENSION))? + (arcName.toLowerCase(Locale.ROOT).endsWith(DOT_ARC_FILE_EXTENSION))? true: false; } @@ -452,4 +453,4 @@ protected void gotoEOR(ArchiveRecord rec) throws IOException { logStdErr(Level.WARNING, message); } } -} \ No newline at end of file +} diff --git a/src/main/java/org/archive/io/arc/ARCRecord.java b/src/main/java/org/archive/io/arc/ARCRecord.java index 0815c18a..c14426a5 100644 --- a/src/main/java/org/archive/io/arc/ARCRecord.java +++ b/src/main/java/org/archive/io/arc/ARCRecord.java @@ -27,6 +27,7 @@ import java.util.Arrays; import java.util.HashMap; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.logging.Level; import java.util.logging.Logger; @@ -376,7 +377,7 @@ private ARCRecordMetaData computeMetaData(List keys, if (keys.size() != values.size()) { // Early ARCs had a space in mimetype. if (values.size() == (keys.size() + 1) && - values.get(4).toLowerCase().startsWith("charset=")) { + values.get(4).toLowerCase(Locale.ROOT).startsWith("charset=")) { List nuvalues = new ArrayList(keys.size()); nuvalues.add(0, values.get(0)); @@ -588,7 +589,7 @@ private InputStream readHttpHeader() throws IOException { if (eolCharCount <= 0) { throw new RecoverableIOException( "Failed to read http status where one was expected: " - + ((statusBytes == null) ? "" : new String(statusBytes))); + + ((statusBytes == null) ? "" : new String(statusBytes, DEFAULT_ENCODING))); } statusLine = new String(statusBytes, 0, @@ -658,7 +659,7 @@ private InputStream readHttpHeader() throws IOException { break; } else { throw new IOException("Failed reading http headers: " + - ((lineBytes != null)? new String(lineBytes): null)); + ((lineBytes != null)? new String(lineBytes, DEFAULT_ENCODING): null)); } } else { httpHeaderBytesRead += lineBytes.length; diff --git a/src/main/java/org/archive/io/arc/ARCUtils.java b/src/main/java/org/archive/io/arc/ARCUtils.java index 5bcb4cc3..05c15abb 100644 --- a/src/main/java/org/archive/io/arc/ARCUtils.java +++ b/src/main/java/org/archive/io/arc/ARCUtils.java @@ -27,6 +27,7 @@ import java.io.InputStream; import java.net.URI; import java.net.URISyntaxException; +import java.util.Locale; import org.archive.url.UsableURI; import org.archive.util.zip.GzipHeader; @@ -94,7 +95,7 @@ public static boolean testCompressedARCFile(File arcFile, throws IOException { boolean compressedARCFile = false; isReadable(arcFile); - if(!skipSuffixCheck && !arcFile.getName().toLowerCase() + if(!skipSuffixCheck && !arcFile.getName().toLowerCase(Locale.ROOT) .endsWith(COMPRESSED_ARC_FILE_EXTENSION)) { return compressedARCFile; } @@ -197,7 +198,7 @@ public static boolean testUncompressedARCFile(File arcFile) throws IOException { boolean uncompressedARCFile = false; isReadable(arcFile); - if(arcFile.getName().toLowerCase().endsWith(ARC_FILE_EXTENSION)) { + if(arcFile.getName().toLowerCase(Locale.ROOT).endsWith(ARC_FILE_EXTENSION)) { FileInputStream fis = new FileInputStream(arcFile); try { byte [] b = new byte[ARC_MAGIC_NUMBER.length()]; diff --git a/src/main/java/org/archive/io/warc/WARCReader.java b/src/main/java/org/archive/io/warc/WARCReader.java index d33874a3..34583e58 100644 --- a/src/main/java/org/archive/io/warc/WARCReader.java +++ b/src/main/java/org/archive/io/warc/WARCReader.java @@ -24,6 +24,7 @@ import java.io.InputStream; import java.util.Iterator; import java.util.List; +import java.util.Locale; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.HelpFormatter; @@ -198,7 +199,6 @@ public static void main(String [] args) Options options = getOptions(); PosixParser parser = new PosixParser(); CommandLine cmdline = parser.parse(options, args, false); - @SuppressWarnings("unchecked") List cmdlineArgs = cmdline.getArgList(); Option [] cmdlineOptions = cmdline.getOptions(); HelpFormatter formatter = new HelpFormatter(); @@ -233,7 +233,7 @@ public static void main(String [] args) break; case 'f': - format = cmdlineOptions[i].getValue().toLowerCase(); + format = cmdlineOptions[i].getValue().toLowerCase(Locale.ROOT); boolean match = false; // List of supported formats. final String [] supportedFormats = @@ -286,4 +286,4 @@ public static void main(String [] args) } } } -} \ No newline at end of file +} diff --git a/src/main/java/org/archive/io/warc/WARCReaderFactory.java b/src/main/java/org/archive/io/warc/WARCReaderFactory.java index 881da869..70b80340 100644 --- a/src/main/java/org/archive/io/warc/WARCReaderFactory.java +++ b/src/main/java/org/archive/io/warc/WARCReaderFactory.java @@ -26,6 +26,7 @@ import java.net.MalformedURLException; import java.net.URL; import java.util.Iterator; +import java.util.Locale; import org.archive.io.ArchiveReader; import org.archive.io.ArchiveReaderFactory; @@ -307,9 +308,9 @@ protected void gotoEOR(ArchiveRecord rec) throws IOException { public static boolean isWARCSuffix(final String f) { return (f == null)? false: - (f.toLowerCase().endsWith(DOT_COMPRESSED_WARC_FILE_EXTENSION))? + (f.toLowerCase(Locale.ROOT).endsWith(DOT_COMPRESSED_WARC_FILE_EXTENSION))? true: - (f.toLowerCase().endsWith(DOT_WARC_FILE_EXTENSION))? + (f.toLowerCase(Locale.ROOT).endsWith(DOT_WARC_FILE_EXTENSION))? true: false; } -} \ No newline at end of file +} diff --git a/src/main/java/org/archive/io/warc/WARCWriter.java b/src/main/java/org/archive/io/warc/WARCWriter.java index 5c6a6854..65eb3346 100644 --- a/src/main/java/org/archive/io/warc/WARCWriter.java +++ b/src/main/java/org/archive/io/warc/WARCWriter.java @@ -38,13 +38,14 @@ import org.apache.commons.lang3.StringUtils; import org.archive.format.ArchiveFileConstants; -import org.archive.io.UTF8Bytes; import org.archive.io.WriterPoolMember; import org.archive.util.ArchiveUtils; import org.archive.util.anvl.Element; import static org.archive.format.warc.WARCConstants.*; +import static java.nio.charset.StandardCharsets.UTF_8; + /** * WARC implementation. @@ -357,12 +358,12 @@ public URI writeWarcinfoRecord(String filename, final String description) byte [] warcinfoBody = null; if (settings.getMetadata() == null) { // TODO: What to write into a warcinfo? What to associate? - warcinfoBody = "TODO: Unimplemented".getBytes(); + warcinfoBody = "TODO: Unimplemented".getBytes(UTF_8); } else { ByteArrayOutputStream baos = new ByteArrayOutputStream(); for (final Iterator i = settings.getMetadata().iterator(); i.hasNext();) { - baos.write(i.next().toString().getBytes(UTF8Bytes.UTF8)); + baos.write(i.next().toString().getBytes(UTF_8)); } warcinfoBody = baos.toByteArray(); } diff --git a/src/main/java/org/archive/net/PublicSuffixes.java b/src/main/java/org/archive/net/PublicSuffixes.java index e436b8dc..79130332 100644 --- a/src/main/java/org/archive/net/PublicSuffixes.java +++ b/src/main/java/org/archive/net/PublicSuffixes.java @@ -22,21 +22,24 @@ import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileInputStream; -import java.io.FileWriter; +import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.PrintWriter; -import java.io.UnsupportedEncodingException; +import java.nio.charset.Charset; import java.util.ArrayList; import java.util.List; +import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.io.IOUtils; import org.archive.util.TextUtils; +import static java.nio.charset.StandardCharsets.UTF_8; + /** * Utility class for making use of the information about 'public suffixes' at * http://publicsuffix.org. @@ -189,7 +192,7 @@ public static void main(String args[]) throws IOException { } else { is = new FileInputStream(args[0]); } - BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8")); + BufferedReader reader = new BufferedReader(new InputStreamReader(is, UTF_8)); String regex = getTopmostAssignedSurtPrefixRegex(reader); IOUtils.closeQuietly(is); @@ -197,11 +200,11 @@ public static void main(String args[]) throws IOException { BufferedWriter writer; if (args.length >= 2) { // write to specified file - writer = new BufferedWriter(new FileWriter(args[1])); + writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(args[1]), UTF_8)); needsClose = true; } else { // write to stdout - writer = new BufferedWriter(new OutputStreamWriter(System.out)); + writer = new BufferedWriter(new OutputStreamWriter(System.out, Charset.defaultCharset())); } writer.append(regex); writer.flush(); @@ -231,7 +234,7 @@ protected static Node readPublishedFileToSurtTrie(BufferedReader reader) throws // discard utf8 notation after entry line = line.split("\\s+")[0]; // TODO: maybe we don't need to create lower-cased String - line = line.toLowerCase(); + line = line.toLowerCase(Locale.ROOT); // SURT-order domain segments String[] segs = line.split("\\."); StringBuilder sb = new StringBuilder(); @@ -331,16 +334,11 @@ public static synchronized Pattern getTopmostAssignedSurtPrefixPattern() { public static synchronized String getTopmostAssignedSurtPrefixRegex() { if (topmostAssignedSurtPrefixRegex == null) { // use bundled list - try { - BufferedReader reader = new BufferedReader(new InputStreamReader( - PublicSuffixes.class.getResourceAsStream( - "/org/archive/effective_tld_names.dat"), "UTF-8")); - topmostAssignedSurtPrefixRegex = getTopmostAssignedSurtPrefixRegex(reader); - IOUtils.closeQuietly(reader); - } catch (UnsupportedEncodingException ex) { - // should never happen - throw new RuntimeException(ex); - } + BufferedReader reader = new BufferedReader(new InputStreamReader( + PublicSuffixes.class.getResourceAsStream( + "/org/archive/effective_tld_names.dat"), UTF_8)); + topmostAssignedSurtPrefixRegex = getTopmostAssignedSurtPrefixRegex(reader); + IOUtils.closeQuietly(reader); } return topmostAssignedSurtPrefixRegex; } diff --git a/src/main/java/org/archive/resource/generic/GenericResourceProducer.java b/src/main/java/org/archive/resource/generic/GenericResourceProducer.java index 812a3f0d..b111dc1e 100644 --- a/src/main/java/org/archive/resource/generic/GenericResourceProducer.java +++ b/src/main/java/org/archive/resource/generic/GenericResourceProducer.java @@ -1,6 +1,7 @@ package org.archive.resource.generic; import java.io.IOException; +import java.util.Locale; import org.archive.resource.MetaData; import org.archive.resource.Resource; @@ -45,6 +46,6 @@ public void close() throws IOException { stream.close(); } public String getContext() { - return String.format("Context(%s)(%d)", name, stream.getOffset()); + return String.format(Locale.ROOT, "Context(%s)(%d)", name, stream.getOffset()); } } diff --git a/src/main/java/org/archive/resource/gzip/GZIPMetaData.java b/src/main/java/org/archive/resource/gzip/GZIPMetaData.java index 0fc18162..1058b01b 100644 --- a/src/main/java/org/archive/resource/gzip/GZIPMetaData.java +++ b/src/main/java/org/archive/resource/gzip/GZIPMetaData.java @@ -15,6 +15,8 @@ import org.json.JSONException; import org.json.JSONObject; +import static java.nio.charset.StandardCharsets.UTF_8; + public class GZIPMetaData extends MetaData implements ResourceConstants { private static final Logger LOG = Logger.getLogger(GZIPMetaData.class.getName()); @@ -26,7 +28,7 @@ public void setData(GZIPSeriesMember member) { GZIPHeader header = member.getHeader(); GZIPStaticHeader staticH = header.getStaticHeader(); if(staticH.isFNameSet()) { - putString(GZIP_FILENAME,new String(header.getFileName(),"UTF-8")); + putString(GZIP_FILENAME, new String(header.getFileName(), UTF_8)); } if(staticH.isFCommentSet()) { putLong(GZIP_COMMENT_LENGTH,header.getCommentLength()); @@ -39,7 +41,7 @@ public void setData(GZIPSeriesMember member) { for(int i = 0; i < records; i++) { GZIPFExtraRecord rec = header.getRecord(i); JSONObject recJO = new JSONObject(); - String name = new String(rec.getName(),"UTF-8"); + String name = new String(rec.getName(), UTF_8); recJO.put(GZIP_FEXTRA_NAME, name); if(name.equals("SL") || name.equals("LX")) { recJO.put(GZIP_FEXTRA_VALUE, ByteOp.bytesToInt(rec.getValue())); @@ -55,8 +57,6 @@ public void setData(GZIPSeriesMember member) { putLong(GZIP_INFLATED_CRC,footer.getCRC()); putLong(GZIP_INFLATED_LENGTH,footer.getLength()); - } catch (UnsupportedEncodingException e) { - LOG.warning(e.getMessage()); } catch (JSONException e) { LOG.warning(e.getMessage()); } diff --git a/src/main/java/org/archive/resource/gzip/GZIPResourceContainer.java b/src/main/java/org/archive/resource/gzip/GZIPResourceContainer.java index 39611ab8..5267a0f9 100644 --- a/src/main/java/org/archive/resource/gzip/GZIPResourceContainer.java +++ b/src/main/java/org/archive/resource/gzip/GZIPResourceContainer.java @@ -1,6 +1,7 @@ package org.archive.resource.gzip; import java.io.IOException; +import java.util.Locale; import org.archive.format.gzip.GZIPMemberSeries; import org.archive.format.gzip.GZIPSeriesMember; @@ -54,6 +55,6 @@ public void close() throws IOException { series.close(); } public String getContext() { - return String.format("Context(%s)(%d)", series.getStreamContext(), series.getCurrentMemberStartOffset()); + return String.format(Locale.ROOT, "Context(%s)(%d)", series.getStreamContext(), series.getCurrentMemberStartOffset()); } } diff --git a/src/main/java/org/archive/resource/html/HTMLMetaData.java b/src/main/java/org/archive/resource/html/HTMLMetaData.java index 024d9677..d995cf65 100644 --- a/src/main/java/org/archive/resource/html/HTMLMetaData.java +++ b/src/main/java/org/archive/resource/html/HTMLMetaData.java @@ -1,6 +1,7 @@ package org.archive.resource.html; import java.util.List; +import java.util.Locale; import java.util.logging.Logger; import org.archive.resource.MetaData; @@ -98,7 +99,7 @@ private void appendObj2(JSONObject o, String arr, String... a) { } catch(JSONException e) { try { - System.err.format("GotErr(%s) JSON(%s)(%s)", e.getMessage(), + System.err.format(Locale.ROOT, "GotErr(%s) JSON(%s)(%s)", e.getMessage(), o.toString(1),a.toString()); } catch (JSONException e1) { // TODO Auto-generated catch block diff --git a/src/main/java/org/archive/resource/html/HTMLResourceFactory.java b/src/main/java/org/archive/resource/html/HTMLResourceFactory.java index 6e95270c..410449a1 100644 --- a/src/main/java/org/archive/resource/html/HTMLResourceFactory.java +++ b/src/main/java/org/archive/resource/html/HTMLResourceFactory.java @@ -4,6 +4,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.UnsupportedEncodingException; +import java.nio.charset.StandardCharsets; import java.util.logging.Logger; import org.archive.format.http.HttpHeaders; @@ -40,7 +41,7 @@ public Resource getResource(InputStream is, MetaData parentMetaData, CDATALexer lex = new CDATALexer(); // guess charset based on HTTP header and sniffed content chunk - String charset = "UTF-8"; + String charset = StandardCharsets.UTF_8.name(); is = new BufferedInputStream(is, CHARSET_GUESS_CHUNK_SIZE); byte[] chunk = new byte[CHARSET_GUESS_CHUNK_SIZE]; is.mark(0); diff --git a/src/main/java/org/archive/resource/warc/WARCResource.java b/src/main/java/org/archive/resource/warc/WARCResource.java index a9c3fcc3..a5e5ac35 100644 --- a/src/main/java/org/archive/resource/warc/WARCResource.java +++ b/src/main/java/org/archive/resource/warc/WARCResource.java @@ -5,6 +5,7 @@ import java.security.DigestInputStream; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; +import java.util.Locale; import org.archive.format.http.HttpHeader; import org.archive.format.http.HttpResponse; @@ -43,7 +44,7 @@ public WARCResource(MetaData metaData, ResourceContainer container, String name = h.getName(); String value = h.getValue(); fields.putString(name,value); - if(name.toLowerCase().equals("content-length")) { + if(name.toLowerCase(Locale.ROOT).equals("content-length")) { // TODO: catch formatexception length = Long.parseLong(value); } diff --git a/src/main/java/org/archive/resource/warc/record/WARCJSONMetaDataResourceFactory.java b/src/main/java/org/archive/resource/warc/record/WARCJSONMetaDataResourceFactory.java index 43041efb..8cc8c146 100644 --- a/src/main/java/org/archive/resource/warc/record/WARCJSONMetaDataResourceFactory.java +++ b/src/main/java/org/archive/resource/warc/record/WARCJSONMetaDataResourceFactory.java @@ -3,7 +3,6 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; -import java.nio.charset.Charset; import org.archive.resource.MetaData; import org.archive.resource.Resource; @@ -14,9 +13,9 @@ import org.json.JSONException; import org.json.JSONTokener; -public class WARCJSONMetaDataResourceFactory implements ResourceFactory, ResourceConstants { - private static final Charset UTF8 = Charset.forName("UTF-8"); +import static java.nio.charset.StandardCharsets.UTF_8; +public class WARCJSONMetaDataResourceFactory implements ResourceFactory, ResourceConstants { public WARCJSONMetaDataResourceFactory() { } @@ -27,7 +26,7 @@ public Resource getResource(InputStream is, MetaData parentMetaData, MetaData md; try { - md = new MetaData(new JSONTokener(new InputStreamReader(is, UTF8))); + md = new MetaData(new JSONTokener(new InputStreamReader(is, UTF_8))); } catch (JSONException e) { throw new ResourceParseException(e); } diff --git a/src/main/java/org/archive/streamcontext/HTTP11Stream.java b/src/main/java/org/archive/streamcontext/HTTP11Stream.java index 06f51409..995dc53e 100755 --- a/src/main/java/org/archive/streamcontext/HTTP11Stream.java +++ b/src/main/java/org/archive/streamcontext/HTTP11Stream.java @@ -5,6 +5,7 @@ import java.io.InputStream; import java.net.URL; import java.net.URLConnection; +import java.util.Locale; public class HTTP11Stream extends AbstractBufferingStream { private URL url; @@ -42,7 +43,7 @@ public int doRead(byte[] b, int off, int len) throws IOException { public void doSeek(long offset) throws IOException { doClose(); conn = url.openConnection(); - conn.setRequestProperty("Range", String.format("bytes=%d-", offset)); + conn.setRequestProperty("Range", String.format(Locale.ROOT, "bytes=%d-", offset)); conn.connect(); is = conn.getInputStream(); } diff --git a/src/main/java/org/archive/uid/RecordIDGenerator.java b/src/main/java/org/archive/uid/RecordIDGenerator.java index 4f16c5ab..80cc5565 100644 --- a/src/main/java/org/archive/uid/RecordIDGenerator.java +++ b/src/main/java/org/archive/uid/RecordIDGenerator.java @@ -19,7 +19,6 @@ package org.archive.uid; import java.net.URI; -import java.net.URISyntaxException; import java.util.Map; /** diff --git a/src/main/java/org/archive/url/BasicURLCanonicalizer.java b/src/main/java/org/archive/url/BasicURLCanonicalizer.java index 37b448c1..dd0d9ac7 100644 --- a/src/main/java/org/archive/url/BasicURLCanonicalizer.java +++ b/src/main/java/org/archive/url/BasicURLCanonicalizer.java @@ -6,7 +6,9 @@ import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.nio.charset.CoderResult; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; +import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -64,7 +66,7 @@ public void canonicalize(HandyURL url) { if (ip != null) { host = ip; } else if (host != null) { - host = escapeOnce(host.toLowerCase()); + host = escapeOnce(host.toLowerCase(Locale.ROOT)); } url.setHost(host); // now the path: @@ -159,7 +161,7 @@ public String attemptIPFormats(String host) { // throws URIException { } ip[i] = octet; } - return String.format("%d.%d.%d.%d", ip[0], ip[1], ip[2], ip[3]); + return String.format(Locale.ROOT, "%d.%d.%d.%d", ip[0], ip[1], ip[2], ip[3]); } else { Matcher m2 = DECIMAL_IP.matcher(host); if (m2.matches()) { @@ -190,7 +192,7 @@ public String attemptIPFormats(String host) { // throws URIException { } ip[i] = octet; } - return String.format("%d.%d.%d.%d", ip[0], ip[1], ip[2], + return String.format(Locale.ROOT, "%d.%d.%d.%d", ip[0], ip[1], ip[2], ip[3]); } @@ -203,12 +205,9 @@ public String minimalEscape(String input) { return escapeOnce(unescapeRepeatedly(input)); } - protected static Charset _UTF8 = null; + protected static Charset _UTF8 = StandardCharsets.UTF_8; protected static Charset UTF8() { - if (_UTF8 == null) { - _UTF8 = Charset.forName("UTF-8"); - } return _UTF8; } @@ -261,7 +260,7 @@ public String escapeOnce(String input) { } sb.append("%"); - String hex = Integer.toHexString(b).toUpperCase(); + String hex = Integer.toHexString(b).toUpperCase(Locale.ROOT); if (hex.length() == 1) { sb.append('0'); } diff --git a/src/main/java/org/archive/url/HandyURL.java b/src/main/java/org/archive/url/HandyURL.java index 91539b3f..0c2c81f7 100644 --- a/src/main/java/org/archive/url/HandyURL.java +++ b/src/main/java/org/archive/url/HandyURL.java @@ -2,6 +2,7 @@ import java.net.MalformedURLException; import java.net.URL; +import java.util.Locale; public class HandyURL { public final static int DEFAULT_PORT = -1; @@ -277,7 +278,7 @@ public void setOpaque(String opaque) { } public String toDebugString() { - return String.format("Scheme(%s) UserName(%s) UserPass(%s) Host(%s) port(%d) Path(%s) Query(%s) Frag(%s)", + return String.format(Locale.ROOT, "Scheme(%s) UserName(%s) UserPass(%s) Host(%s) port(%d) Path(%s) Query(%s) Frag(%s)", scheme, authUser, authPass, host, port, path, query, hash); } diff --git a/src/main/java/org/archive/url/IAURLCanonicalizer.java b/src/main/java/org/archive/url/IAURLCanonicalizer.java index 0cf7c8a4..e964cd00 100644 --- a/src/main/java/org/archive/url/IAURLCanonicalizer.java +++ b/src/main/java/org/archive/url/IAURLCanonicalizer.java @@ -2,6 +2,7 @@ import java.util.Arrays; import java.util.Comparator; +import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -20,11 +21,11 @@ public void canonicalize(HandyURL url) { } if (rules.isSet(SCHEME_SETTINGS, SCHEME_LOWERCASE)) { if (url.getScheme() != null) { - url.setScheme(url.getScheme().toLowerCase()); + url.setScheme(url.getScheme().toLowerCase(Locale.ROOT)); } } if(rules.isSet(HOST_SETTINGS, HOST_LOWERCASE)) { - url.setHost(url.getHost().toLowerCase()); + url.setHost(url.getHost().toLowerCase(Locale.ROOT)); } if(rules.isSet(HOST_SETTINGS, HOST_MASSAGE)) { url.setHost(massageHost(url.getHost())); @@ -46,7 +47,7 @@ public void canonicalize(HandyURL url) { url.setPath(null); } else { if(rules.isSet(PATH_SETTINGS, PATH_LOWERCASE)) { - path = path.toLowerCase(); + path = path.toLowerCase(Locale.ROOT); } if(rules.isSet(PATH_SETTINGS, PATH_STRIP_SESSION_ID)) { path = URLRegexTransformer.stripPathSessionID(path); @@ -71,7 +72,7 @@ public void canonicalize(HandyURL url) { } // lower-case: if(rules.isSet(QUERY_SETTINGS, QUERY_LOWERCASE)) { - query = query.toLowerCase(); + query = query.toLowerCase(Locale.ROOT); } // re-order? if(rules.isSet(QUERY_SETTINGS, QUERY_ALPHA_REORDER)) { @@ -155,7 +156,7 @@ public static String massageHost(String host) { return host; } public static int getDefaultPort(String scheme) { - String lcScheme = scheme.toLowerCase(); + String lcScheme = scheme.toLowerCase(Locale.ROOT); if(lcScheme.equals("http")) { return 80; } else if(lcScheme.equals("https")) { diff --git a/src/main/java/org/archive/url/LaxURI.java b/src/main/java/org/archive/url/LaxURI.java index 57071460..9b7485c7 100644 --- a/src/main/java/org/archive/url/LaxURI.java +++ b/src/main/java/org/archive/url/LaxURI.java @@ -18,10 +18,12 @@ */ package org.archive.url; -import java.io.UnsupportedEncodingException; +import java.nio.charset.Charset; +import java.nio.charset.IllegalCharsetNameException; import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.BitSet; +import java.util.Locale; /** * URI subclass which allows partial/inconsistent encoding, matching @@ -121,9 +123,10 @@ protected static String decode(String component, String charset) byte[] rawdata = null; rawdata = LaxURLCodec.decodeUrlLoose(component.getBytes(StandardCharsets.US_ASCII)); try { - return new String(rawdata, charset); - } catch (UnsupportedEncodingException e) { - return new String(rawdata); + Charset cs = Charset.forName(charset); + return new String(rawdata, cs); + } catch (IllegalCharsetNameException e) { + return new String(rawdata, StandardCharsets.US_ASCII); } } @@ -321,7 +324,7 @@ protected void parseUriReference(String original, boolean escaped) *

*/ if (at > 0 && at < length && tmp.charAt(at) == ':') { - char[] target = tmp.substring(0, at).toLowerCase().toCharArray(); + char[] target = tmp.substring(0, at).toLowerCase(Locale.ROOT).toCharArray(); if (validate(target, scheme)) { _scheme = target; from = ++at; diff --git a/src/main/java/org/archive/url/LaxURLCodec.java b/src/main/java/org/archive/url/LaxURLCodec.java index e27d9de0..b68a0c19 100644 --- a/src/main/java/org/archive/url/LaxURLCodec.java +++ b/src/main/java/org/archive/url/LaxURLCodec.java @@ -20,17 +20,16 @@ import java.io.ByteArrayOutputStream; import java.io.UnsupportedEncodingException; +import java.nio.charset.StandardCharsets; import java.util.BitSet; import org.apache.commons.codec.net.URLCodec; -import com.google.common.base.Charsets; - /** * @author gojomo */ public class LaxURLCodec extends URLCodec { - public static LaxURLCodec DEFAULT = new LaxURLCodec("UTF-8"); + public static LaxURLCodec DEFAULT = new LaxURLCodec(StandardCharsets.UTF_8.name()); // passthrough constructor public LaxURLCodec(String encoding) { @@ -155,6 +154,6 @@ public String encode(BitSet safe, String pString, String cs) if (pString == null) { return null; } - return new String(encodeUrl(safe,pString.getBytes(cs)), Charsets.US_ASCII); + return new String(encodeUrl(safe,pString.getBytes(cs)), StandardCharsets.US_ASCII); } } diff --git a/src/main/java/org/archive/url/SURT.java b/src/main/java/org/archive/url/SURT.java index 3e0bcd55..9598f458 100644 --- a/src/main/java/org/archive/url/SURT.java +++ b/src/main/java/org/archive/url/SURT.java @@ -2,7 +2,7 @@ import java.io.BufferedReader; import java.io.InputStreamReader; -import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.util.Iterator; import java.util.logging.Logger; @@ -33,7 +33,7 @@ public static String toSURT(String input) { } public static void main(String[] args) { String line; - InputStreamReader isr = new InputStreamReader(System.in,Charset.forName("UTF-8")); + InputStreamReader isr = new InputStreamReader(System.in, StandardCharsets.UTF_8); BufferedReader br = new BufferedReader(isr); Iterator i = AbstractPeekableIterator.wrapReader(br); while(i.hasNext()) { diff --git a/src/main/java/org/archive/url/URI.java b/src/main/java/org/archive/url/URI.java index 374e0574..492f7772 100644 --- a/src/main/java/org/archive/url/URI.java +++ b/src/main/java/org/archive/url/URI.java @@ -34,12 +34,16 @@ import org.apache.commons.codec.net.URLCodec; import java.io.*; +import java.nio.charset.Charset; +import java.nio.charset.IllegalCharsetNameException; import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.BitSet; import java.util.Hashtable; import java.util.Locale; +import static java.nio.charset.StandardCharsets.UTF_8; + /** * The interface for the URI(Uniform Resource Identifiers) version of RFC 2396. * This class has the purpose of supportting of parsing a URI reference to @@ -261,7 +265,7 @@ public URI(String scheme, String schemeSpecificPart, String fragment) if (scheme == null) { throw new URIException(URIException.PARSING, "scheme required"); } - char[] s = scheme.toLowerCase().toCharArray(); + char[] s = scheme.toLowerCase(Locale.ROOT).toCharArray(); if (validate(s, URI.scheme)) { _scheme = s; // is_absoluteURI } else { @@ -622,7 +626,7 @@ public URI(URI base, URI relative) throws URIException { /** * The default charset of the protocol. RFC 2277, 2396 */ - protected static String defaultProtocolCharset = "UTF-8"; + protected static String defaultProtocolCharset = UTF_8.name(); /** @@ -1694,7 +1698,7 @@ private static byte[] getBytes(String original, String charset) { try { return original.getBytes(charset); } catch (UnsupportedEncodingException e) { - return original.getBytes(); + return original.getBytes(UTF_8); } } @@ -1780,11 +1784,13 @@ protected static String decode(String component, String charset) throw new URIException(e.getMessage()); } try { - return new String(rawdata, charset); - } catch (UnsupportedEncodingException e) { - return new String(rawdata); + Charset cs = Charset.forName(charset); + return new String(rawdata, cs); + } catch (IllegalCharsetNameException e) { + return new String(rawdata, StandardCharsets.US_ASCII); } } + /** * Pre-validate the unescaped URI string within a specific component. * @@ -1954,7 +1960,7 @@ protected void parseUriReference(String original, boolean escaped) *

*/ if (at > 0 && at < length && tmp.charAt(at) == ':') { - char[] target = tmp.substring(0, at).toLowerCase().toCharArray(); + char[] target = tmp.substring(0, at).toLowerCase(Locale.ROOT).toCharArray(); if (validate(target, scheme)) { _scheme = target; } else { diff --git a/src/main/java/org/archive/url/URLRegexTransformer.java b/src/main/java/org/archive/url/URLRegexTransformer.java index 5f31c81c..182eb218 100644 --- a/src/main/java/org/archive/url/URLRegexTransformer.java +++ b/src/main/java/org/archive/url/URLRegexTransformer.java @@ -1,5 +1,6 @@ package org.archive.url; +import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -27,7 +28,7 @@ public class URLRegexTransformer { public static String stripOpts(String orig, OptimizedPattern op[]) { - String origLC = orig.toLowerCase(); + String origLC = orig.toLowerCase(Locale.ROOT); StringBuilder sb = null; int i = 0; int max = op.length; diff --git a/src/main/java/org/archive/url/UsableURIFactory.java b/src/main/java/org/archive/url/UsableURIFactory.java index 08f18999..3038ada5 100644 --- a/src/main/java/org/archive/url/UsableURIFactory.java +++ b/src/main/java/org/archive/url/UsableURIFactory.java @@ -23,6 +23,7 @@ import java.io.UnsupportedEncodingException; import java.util.BitSet; +import java.util.Locale; import java.util.logging.Level; import java.util.logging.Logger; import java.util.regex.Matcher; @@ -609,7 +610,7 @@ private String fixupDomainlabel(String label) throw ue; } } - label = label.toLowerCase(); + label = label.toLowerCase(Locale.ROOT); return label; } @@ -755,6 +756,6 @@ private String checkUriElement(String element) { */ private String checkUriElementAndLowerCase(String element) { String tmp = checkUriElement(element); - return (tmp != null)? tmp.toLowerCase(): tmp; + return (tmp != null)? tmp.toLowerCase(Locale.ROOT): tmp; } } diff --git a/src/main/java/org/archive/util/ArchiveUtils.java b/src/main/java/org/archive/util/ArchiveUtils.java index 22ba2787..cce411df 100644 --- a/src/main/java/org/archive/util/ArchiveUtils.java +++ b/src/main/java/org/archive/util/ArchiveUtils.java @@ -49,6 +49,8 @@ import org.archive.format.gzip.GZIPDecoder; import org.archive.format.gzip.GZIPFormatException; +import static java.nio.charset.StandardCharsets.UTF_8; + /** * Miscellaneous useful methods. * @@ -851,7 +853,7 @@ private static String loadVersion() { BufferedReader br = null; String version; try { - br = new BufferedReader(new InputStreamReader(input)); + br = new BufferedReader(new InputStreamReader(input, UTF_8)); version = br.readLine(); br.readLine(); } catch (IOException e) { @@ -873,7 +875,7 @@ private static String loadVersion() { br = null; String timestamp; try { - br = new BufferedReader(new InputStreamReader(input)); + br = new BufferedReader(new InputStreamReader(input, UTF_8)); timestamp = br.readLine(); } catch (IOException e) { return version; @@ -894,13 +896,13 @@ private static String loadVersion() { TLDS = new HashSet(); InputStream is = ArchiveUtils.class.getResourceAsStream("tlds-alpha-by-domain.txt"); try { - BufferedReader reader = new BufferedReader(new InputStreamReader(is)); + BufferedReader reader = new BufferedReader(new InputStreamReader(is, UTF_8)); String line; while((line = reader.readLine())!=null) { if (line.startsWith("#")) { continue; } - TLDS.add(line.trim().toLowerCase()); + TLDS.add(line.trim().toLowerCase(Locale.ROOT)); } } catch (Exception e) { LOGGER.log(Level.SEVERE,"TLD list unavailable",e); @@ -917,7 +919,7 @@ private static String loadVersion() { * @return boolean true if recognized as TLD */ public static boolean isTld(String dom) { - return TLDS.contains(dom.toLowerCase()); + return TLDS.contains(dom.toLowerCase(Locale.ROOT)); } public static void closeQuietly(Object input) { @@ -981,12 +983,12 @@ public static int readFully(InputStream input, byte[] buf) */ public static BufferedReader getBufferedReader(File source) throws IOException { InputStream is = new BufferedInputStream(new FileInputStream(source)); - boolean isGzipped = source.getName().toLowerCase(). + boolean isGzipped = source.getName().toLowerCase(Locale.ROOT). endsWith(GZIP_SUFFIX); if(isGzipped) { is = new GZIPInputStream(is); } - return new BufferedReader(new InputStreamReader(is)); + return new BufferedReader(new InputStreamReader(is, UTF_8)); } /** @@ -1002,8 +1004,8 @@ public static BufferedReader getBufferedReader(URL source) throws IOException { || conn.getContentEncoding() != null && conn.getContentEncoding().equalsIgnoreCase("gzip"); InputStream uis = conn.getInputStream(); return new BufferedReader(isGzipped? - new InputStreamReader(new GZIPInputStream(uis)): - new InputStreamReader(uis)); + new InputStreamReader(new GZIPInputStream(uis), UTF_8): + new InputStreamReader(uis, UTF_8)); } /** diff --git a/src/main/java/org/archive/util/ChunkedInputStream.java b/src/main/java/org/archive/util/ChunkedInputStream.java index 69b23047..b6a604c8 100644 --- a/src/main/java/org/archive/util/ChunkedInputStream.java +++ b/src/main/java/org/archive/util/ChunkedInputStream.java @@ -280,8 +280,7 @@ private static int getChunkSizeFromInputStream(final InputStream in) * @throws IOException If an IO problem occurs */ private void parseTrailerHeaders() throws IOException { - String charset = "US-ASCII"; - LaxHttpParser.parseHeaders(in, charset); + LaxHttpParser.parseHeaders(in, StandardCharsets.US_ASCII.name()); } /** diff --git a/src/main/java/org/archive/util/DevUtils.java b/src/main/java/org/archive/util/DevUtils.java index f2a1d044..7ee4b13a 100644 --- a/src/main/java/org/archive/util/DevUtils.java +++ b/src/main/java/org/archive/util/DevUtils.java @@ -25,6 +25,7 @@ import java.io.StringWriter; import java.util.logging.Logger; +import static java.nio.charset.StandardCharsets.UTF_8; /** * Write a message and stack trace to the 'org.archive.util.DevUtils' logger. @@ -92,7 +93,7 @@ public static void sigquitSelf() { Process p = Runtime.getRuntime().exec( new String[] {"perl", "-e", "print getppid(). \"\n\";"}); BufferedReader br = - new BufferedReader(new InputStreamReader(p.getInputStream())); + new BufferedReader(new InputStreamReader(p.getInputStream(), UTF_8)); String ppid = br.readLine(); Runtime.getRuntime().exec( new String[] {"sh", "-c", "kill -3 "+ppid}).waitFor(); diff --git a/src/main/java/org/archive/util/FileNameSpec.java b/src/main/java/org/archive/util/FileNameSpec.java index a3312cfc..7ace8b59 100644 --- a/src/main/java/org/archive/util/FileNameSpec.java +++ b/src/main/java/org/archive/util/FileNameSpec.java @@ -1,5 +1,6 @@ package org.archive.util; +import java.util.Locale; import java.util.concurrent.atomic.AtomicInteger; public class FileNameSpec { @@ -15,7 +16,7 @@ public FileNameSpec(String prefix, String suffix) { public String getNextName() { StringBuilder sb = new StringBuilder(); sb.append(prefix); - sb.append(String.format("%06d",aInt.incrementAndGet())); + sb.append(String.format(Locale.ROOT, "%06d",aInt.incrementAndGet())); sb.append(suffix); return sb.toString(); } diff --git a/src/main/java/org/archive/util/FileUtils.java b/src/main/java/org/archive/util/FileUtils.java index 70b5ffae..271d0212 100644 --- a/src/main/java/org/archive/util/FileUtils.java +++ b/src/main/java/org/archive/util/FileUtils.java @@ -32,6 +32,7 @@ import java.util.Iterator; import java.util.LinkedList; import java.util.List; +import java.util.Locale; import java.util.Properties; import java.util.logging.Level; import java.util.logging.Logger; @@ -219,8 +220,8 @@ protected static void workaroundCopyFile(final File src, FileFilter prefixFilter = new FileFilter() { public boolean accept(File pathname) { - return pathname.getName().toLowerCase(). - startsWith(prefix.toLowerCase()); + return pathname.getName().toLowerCase(Locale.ROOT). + startsWith(prefix.toLowerCase(Locale.ROOT)); } }; return dir.listFiles(prefixFilter); @@ -283,7 +284,7 @@ public static boolean isReadableWithExtensionAndMagic(final File f, throws IOException { boolean result = false; FileUtils.assertReadable(f); - if(f.getName().toLowerCase().endsWith(uncompressedExtension)) { + if(f.getName().toLowerCase(Locale.ROOT).endsWith(uncompressedExtension)) { FileInputStream fis = new FileInputStream(f); try { byte [] b = new byte[magic.length()]; @@ -392,7 +393,6 @@ public static boolean moveAsideIfExists(File file) throws IOException { * after the end of the last line returned * @throws IOException */ - @SuppressWarnings("unchecked") public static LongRange pagedLines(File file, long position, int signedDesiredLineCount, List lines, int lineEstimate) throws IOException { @@ -708,4 +708,4 @@ public static void appendTo(File fileToAppendTo, File fileToAppendFrom) throws I out.flush(); } } -} \ No newline at end of file +} diff --git a/src/main/java/org/archive/util/Grep.java b/src/main/java/org/archive/util/Grep.java index e446e47e..892429bd 100644 --- a/src/main/java/org/archive/util/Grep.java +++ b/src/main/java/org/archive/util/Grep.java @@ -1,10 +1,13 @@ package org.archive.util; +import static java.nio.charset.StandardCharsets.UTF_8; + import java.io.BufferedReader; -import java.io.FileReader; +import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.PrintStream; +import java.nio.charset.Charset; import java.util.LinkedList; import java.util.List; import java.util.regex.Matcher; @@ -119,14 +122,14 @@ protected void doTheGrepThing() throws Exception { if (files != null) { if (files.size() == 1) { - grep(new BufferedReader(new FileReader(files.get(0))), ""); + grep(new BufferedReader(new InputStreamReader(new FileInputStream(files.get(0)), UTF_8)), ""); } else { for (String path : files) { - grep(new BufferedReader(new FileReader(path)), path + ": "); + grep(new BufferedReader(new InputStreamReader(new FileInputStream(path), UTF_8)), path + ": "); } } } else { - grep(new BufferedReader(new InputStreamReader(System.in)), ""); + grep(new BufferedReader(new InputStreamReader(System.in, Charset.defaultCharset())), ""); } } diff --git a/src/main/java/org/archive/util/HMACSigner.java b/src/main/java/org/archive/util/HMACSigner.java index d7a5208e..b502b4fb 100644 --- a/src/main/java/org/archive/util/HMACSigner.java +++ b/src/main/java/org/archive/util/HMACSigner.java @@ -1,5 +1,7 @@ package org.archive.util; +import java.nio.charset.StandardCharsets; + /** * Generate an HMAC key given a secret sig, key name and optional id and an expiration time * @@ -63,11 +65,11 @@ public static String hmacDigest(String msg, String keyString, String algo) { String digest = null; try { SecretKeySpec key = new SecretKeySpec( - (keyString).getBytes("UTF-8"), algo); + (keyString).getBytes(StandardCharsets.UTF_8), algo); Mac mac = Mac.getInstance(algo); mac.init(key); - byte[] bytes = mac.doFinal(msg.getBytes("ASCII")); + byte[] bytes = mac.doFinal(msg.getBytes(StandardCharsets.US_ASCII)); StringBuilder hash = new StringBuilder(); diff --git a/src/main/java/org/archive/util/IAUtils.java b/src/main/java/org/archive/util/IAUtils.java index 4597d723..334a31b4 100644 --- a/src/main/java/org/archive/util/IAUtils.java +++ b/src/main/java/org/archive/util/IAUtils.java @@ -29,13 +29,15 @@ import java.nio.charset.Charset; import java.util.Properties; +import static java.nio.charset.StandardCharsets.UTF_8; + /** * Miscellaneous useful methods. * * @author gojomo & others */ public class IAUtils { - public final static Charset UTF8 = Charset.forName("utf-8"); + public final static Charset UTF8 = UTF_8; final public static String COMMONS_VERSION = loadCommonsVersion(); final public static String PUBLISHER = loadCommons("publisher"); @@ -53,7 +55,7 @@ public static String loadCommonsVersion() { BufferedReader br = null; String version; try { - br = new BufferedReader(new InputStreamReader(input)); + br = new BufferedReader(new InputStreamReader(input, UTF_8)); version = br.readLine(); br.readLine(); } catch (IOException e) { @@ -71,11 +73,7 @@ public static String loadCommons(String id) { if (input == null) { return "UNKNOWN"; } - try { - reader = new InputStreamReader(input, "UTF-8"); - } catch (UnsupportedEncodingException e) { - return "UNKNOWN"; - } + reader = new InputStreamReader(input, UTF_8); Properties prop = new Properties(); try { prop.load(reader); diff --git a/src/main/java/org/archive/util/IterableLineIterator.java b/src/main/java/org/archive/util/IterableLineIterator.java index 33efa1fd..c9010031 100644 --- a/src/main/java/org/archive/util/IterableLineIterator.java +++ b/src/main/java/org/archive/util/IterableLineIterator.java @@ -19,7 +19,6 @@ public IterableLineIterator(final Reader reader) super(reader); } - @SuppressWarnings("unchecked") public Iterator iterator() { return this; } diff --git a/src/main/java/org/archive/util/LaxHttpParser.java b/src/main/java/org/archive/util/LaxHttpParser.java index 0545fd95..434522c8 100644 --- a/src/main/java/org/archive/util/LaxHttpParser.java +++ b/src/main/java/org/archive/util/LaxHttpParser.java @@ -36,6 +36,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.UnsupportedEncodingException; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.logging.Logger; @@ -127,7 +128,7 @@ public static String readLine(InputStream inputStream, String charset) throws IO try { return new String(rawdata, 0, len - offset, charset); } catch (UnsupportedEncodingException e) { - return new String(rawdata, 0, len - offset); + return new String(rawdata, 0, len - offset, StandardCharsets.ISO_8859_1); } } @@ -147,7 +148,7 @@ public static String readLine(InputStream inputStream, String charset) throws IO public static String readLine(InputStream inputStream) throws IOException { LOG.finest("enter LaxHttpParser.readLine(InputStream)"); - return readLine(inputStream, "US-ASCII"); + return readLine(inputStream, StandardCharsets.US_ASCII.name()); } /** @@ -237,6 +238,6 @@ public static HttpHeader[] parseHeaders(InputStream is, String charset) throws I */ public static HttpHeader[] parseHeaders(InputStream is) throws IOException { LOG.finest("enter HeaderParser.parseHeaders(InputStream, String)"); - return parseHeaders(is, "US-ASCII"); + return parseHeaders(is, StandardCharsets.US_ASCII.name()); } } diff --git a/src/main/java/org/archive/util/ProcessUtils.java b/src/main/java/org/archive/util/ProcessUtils.java index af792981..0a3eeb67 100644 --- a/src/main/java/org/archive/util/ProcessUtils.java +++ b/src/main/java/org/archive/util/ProcessUtils.java @@ -26,6 +26,8 @@ import java.util.logging.Level; import java.util.logging.Logger; +import static java.nio.charset.StandardCharsets.UTF_8; + /** * Class to run an external process. * @author stack @@ -55,7 +57,7 @@ protected StreamGobbler(InputStream is, String name) { public void run() { try { BufferedReader br = - new BufferedReader(new InputStreamReader(this.is)); + new BufferedReader(new InputStreamReader(this.is, UTF_8)); for (String line = null; (line = br.readLine()) != null;) { this.sink.append(line); } diff --git a/src/main/java/org/archive/util/Recorder.java b/src/main/java/org/archive/util/Recorder.java index 6a7a53d7..9f10ec92 100644 --- a/src/main/java/org/archive/util/Recorder.java +++ b/src/main/java/org/archive/util/Recorder.java @@ -25,7 +25,9 @@ import java.io.InputStreamReader; import java.io.OutputStream; import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.util.HashSet; +import java.util.Locale; import java.util.Set; import java.util.logging.Level; import java.util.logging.Logger; @@ -41,8 +43,6 @@ import org.archive.io.ReplayCharSequence; import org.archive.io.ReplayInputStream; -import com.google.common.base.Charsets; - /** * Pairs together a RecordingInputStream and RecordingOutputStream @@ -95,7 +95,7 @@ public class Recorder { * (current behavior is for consistency with our prior but perhaps not * optimal behavior) */ - protected Charset charset = Charsets.UTF_8; + protected Charset charset = StandardCharsets.UTF_8; /** whether recording-input (ris) message-body is chunked */ protected boolean inputIsChunked = false; @@ -338,8 +338,8 @@ public void setInputIsChunked(boolean chunked) { * @param contentEncoding declared content-encoding of input recording. */ public void setContentEncoding(String contentEncoding) { - String lowerCoding = contentEncoding.toLowerCase(); - if(!SUPPORTED_ENCODINGS.contains(contentEncoding.toLowerCase())) { + String lowerCoding = contentEncoding.toLowerCase(Locale.ROOT); + if(!SUPPORTED_ENCODINGS.contains(contentEncoding.toLowerCase(Locale.ROOT))) { throw new IllegalArgumentException("contentEncoding unsupported: "+contentEncoding); } this.contentEncoding = lowerCoding; diff --git a/src/main/java/org/archive/util/SURT.java b/src/main/java/org/archive/util/SURT.java index 059b2ec6..99347e9f 100644 --- a/src/main/java/org/archive/util/SURT.java +++ b/src/main/java/org/archive/util/SURT.java @@ -27,11 +27,14 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.io.PrintStream; +import java.nio.charset.Charset; import java.util.regex.Matcher; import org.archive.url.URIException; import org.archive.url.UsableURIFactory; +import static java.nio.charset.StandardCharsets.UTF_8; + /** * Sort-friendly URI Reordering Transform. * @@ -238,10 +241,10 @@ public static void main(String[] args) throws IOException { InputStream in = args.length > 0 ? new BufferedInputStream( new FileInputStream(args[0])) : System.in; PrintStream out = args.length > 1 ? new PrintStream( - new BufferedOutputStream(new FileOutputStream(args[1]))) + new BufferedOutputStream(new FileOutputStream(args[1])), false, UTF_8.name()) : System.out; BufferedReader br = - new BufferedReader(new InputStreamReader(in)); + new BufferedReader(new InputStreamReader(in, Charset.defaultCharset())); String line; while((line = br.readLine())!=null) { if(line.indexOf("#")>0) line=line.substring(0,line.indexOf("#")); diff --git a/src/main/java/org/archive/util/SurtPrefixSet.java b/src/main/java/org/archive/util/SurtPrefixSet.java index 6925cc83..b2f0ea4f 100644 --- a/src/main/java/org/archive/util/SurtPrefixSet.java +++ b/src/main/java/org/archive/util/SurtPrefixSet.java @@ -31,11 +31,14 @@ import java.io.PrintStream; import java.io.Reader; import java.util.Iterator; +import java.util.Locale; import org.archive.url.UsableURI; import org.archive.util.iterator.LineReadingIterator; import org.archive.util.iterator.RegexLineIterator; +import static java.nio.charset.StandardCharsets.UTF_8; + /** * Specialized TreeSet for keeping a set of String prefixes. * @@ -70,7 +73,7 @@ public void importFrom(Reader r) { while (iter.hasNext()) { s = (String) iter.next(); - add(s.toLowerCase()); + add(s.toLowerCase(Locale.ROOT)); } } @@ -145,7 +148,7 @@ public boolean considerAsAddDirective(String suri) { } if(u.indexOf("(")>0) { // formal SURT prefix; toLowerCase just in case - add(u.toLowerCase()); + add(u.toLowerCase(Locale.ROOT)); } else { // hostname/normal form URI from which // to deduce SURT prefix @@ -342,10 +345,10 @@ public static void main(String[] args) throws IOException { InputStream in = args.length > 0 ? new BufferedInputStream( new FileInputStream(args[0])) : System.in; PrintStream out = args.length > 1 ? new PrintStream( - new BufferedOutputStream(new FileOutputStream(args[1]))) + new BufferedOutputStream(new FileOutputStream(args[1])), false, UTF_8.name()) : System.out; BufferedReader br = - new BufferedReader(new InputStreamReader(in)); + new BufferedReader(new InputStreamReader(in, UTF_8.name())); String line; while((line = br.readLine())!=null) { if(line.indexOf("#")>0) line=line.substring(0,line.indexOf("#")); diff --git a/src/main/java/org/archive/util/TextUtils.java b/src/main/java/org/archive/util/TextUtils.java index 98b471f8..627d411a 100644 --- a/src/main/java/org/archive/util/TextUtils.java +++ b/src/main/java/org/archive/util/TextUtils.java @@ -30,7 +30,6 @@ import java.net.URLEncoder; import java.util.HashMap; import java.util.Map; -import java.util.concurrent.ConcurrentMap; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -40,6 +39,8 @@ import com.google.common.cache.CacheLoader; import com.google.common.cache.LoadingCache; +import static java.nio.charset.StandardCharsets.UTF_8; + public class TextUtils { private static final String FIRSTWORD = "^([^\\s]*).*$"; @@ -279,14 +280,11 @@ public static String exceptionToString(String message, Throwable e) { * @param s String to escape * @return URL-escaped string */ - @SuppressWarnings("deprecation") public static String urlEscape(String s) { try { - return URLEncoder.encode(s,"UTF8"); + return URLEncoder.encode(s, UTF_8.name()); } catch (UnsupportedEncodingException e) { - // should be impossible; all JVMs must support UTF8 - // but have a fallback just in case - return URLEncoder.encode(s); + return s; } } @@ -296,14 +294,11 @@ public static String urlEscape(String s) { * @param s String do unescape * @return URL-unescaped String */ - @SuppressWarnings("deprecation") public static String urlUnescape(String s) { try { - return URLDecoder.decode(s, "UTF8"); + return URLDecoder.decode(s, UTF_8.name()); } catch (UnsupportedEncodingException e) { - // should be impossible; all JVMs must support UTF8 - // but have a fallback just in case - return URLDecoder.decode(s); + return s; } } } \ No newline at end of file diff --git a/src/main/java/org/archive/util/binsearch/AbstractSeekableLineReader.java b/src/main/java/org/archive/util/binsearch/AbstractSeekableLineReader.java index de57278e..17d411fa 100644 --- a/src/main/java/org/archive/util/binsearch/AbstractSeekableLineReader.java +++ b/src/main/java/org/archive/util/binsearch/AbstractSeekableLineReader.java @@ -7,13 +7,14 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import org.archive.util.zip.GZIPMembersInputStream; import com.google.common.io.ByteStreams; public abstract class AbstractSeekableLineReader implements SeekableLineReader { - public final static Charset UTF8 = Charset.forName("UTF-8"); + public final static Charset UTF8 = StandardCharsets.UTF_8; protected int blockSize = 128 * 1024; diff --git a/src/main/java/org/archive/util/binsearch/SeekCDXBenchmarker.java b/src/main/java/org/archive/util/binsearch/SeekCDXBenchmarker.java index 76b7b2b9..45c2ee04 100644 --- a/src/main/java/org/archive/util/binsearch/SeekCDXBenchmarker.java +++ b/src/main/java/org/archive/util/binsearch/SeekCDXBenchmarker.java @@ -3,6 +3,7 @@ import java.io.File; import java.io.IOException; import java.io.InputStreamReader; +import java.nio.charset.Charset; import org.archive.url.WaybackURLKeyMaker; import org.archive.util.binsearch.impl.MappedSeekableLineReaderFactory; @@ -52,7 +53,7 @@ public static void main(String[] args) throws IOException { SortedTextFile sorted = new SortedTextFile(factory); sorted.setBinsearchBlockSize(blocksize); - BufferedReader reader = new BufferedReader(new InputStreamReader(System.in)); + BufferedReader reader = new BufferedReader(new InputStreamReader(System.in, Charset.defaultCharset())); WaybackURLKeyMaker keymaker = new WaybackURLKeyMaker(true); diff --git a/src/main/java/org/archive/util/binsearch/SortedTextFile.java b/src/main/java/org/archive/util/binsearch/SortedTextFile.java index ab8118b7..bb4a1f66 100644 --- a/src/main/java/org/archive/util/binsearch/SortedTextFile.java +++ b/src/main/java/org/archive/util/binsearch/SortedTextFile.java @@ -2,12 +2,15 @@ import java.io.IOException; import java.util.Comparator; +import java.util.Locale; import java.util.logging.Level; import java.util.logging.Logger; import org.archive.util.GeneralURIStreamFactory; import org.archive.util.iterator.CloseableIterator; +import static java.nio.charset.StandardCharsets.UTF_8; + public class SortedTextFile { public static class NumericComparator implements Comparator @@ -142,14 +145,14 @@ public long binaryFindOffset(SeekableLineReader slr, final String key, Comparato if (comparator.compare(key, line) > 0) { if(LOGGER.isLoggable(Level.FINE)) { - LOGGER.fine(String.format("Search(%d) (%s)/(%s) : After", + LOGGER.fine(String.format(Locale.ROOT, "Search(%d) (%s)/(%s) : After", mid * blockSize, key,line)); } min = mid; } else { if(LOGGER.isLoggable(Level.FINE)) { - LOGGER.fine(String.format("Search(%d) (%s)/(%s) : Before", + LOGGER.fine(String.format(Locale.ROOT, "Search(%d) (%s)/(%s) : Before", mid * blockSize, key,line)); } max = mid; @@ -370,7 +373,7 @@ private long searchOffset(SeekableLineReader slr, String prev = null; while(true) { if (line != null) { - offset += line.getBytes().length + 1; + offset += line.getBytes(UTF_8).length + 1; } line = slr.readLine(); if(line == null) break; @@ -379,7 +382,7 @@ private long searchOffset(SeekableLineReader slr, } if (lessThan && prev != null) { - offset -= prev.getBytes().length + 1; + offset -= prev.getBytes(UTF_8).length + 1; } return offset; @@ -391,7 +394,7 @@ private CloseableIterator search(SeekableLineReader slr, long min = binaryFindOffset(slr, key, comparator); if (LOGGER.isLoggable(Level.FINE)) { - LOGGER.fine(String.format("Aligning(%d)",min)); + LOGGER.fine(String.format(Locale.ROOT, "Aligning(%d)",min)); } slr.seek(min); diff --git a/src/test/java/org/archive/format/dns/DNSResponseParserTest.java b/src/test/java/org/archive/format/dns/DNSResponseParserTest.java index 7ade0ad5..73e1fda8 100644 --- a/src/test/java/org/archive/format/dns/DNSResponseParserTest.java +++ b/src/test/java/org/archive/format/dns/DNSResponseParserTest.java @@ -5,6 +5,8 @@ import org.junit.jupiter.api.Test; +import static java.nio.charset.StandardCharsets.UTF_8; + import static org.junit.jupiter.api.Assertions.assertEquals; public class DNSResponseParserTest { @@ -20,7 +22,7 @@ public void testParse() throws DNSParseException, IOException { } private void verifyResults(String res, String date, String d[][]) throws DNSParseException, IOException { ByteArrayInputStream is = - new ByteArrayInputStream(res.getBytes("UTF-8")); + new ByteArrayInputStream(res.getBytes(UTF_8)); DNSResponse response = new DNSResponse(); parser.parse(is, response); verifyResults(response,date,d); diff --git a/src/test/java/org/archive/format/gzip/zipnum/ZipNumWriterTest.java b/src/test/java/org/archive/format/gzip/zipnum/ZipNumWriterTest.java index 25a5eaa7..13658bcb 100644 --- a/src/test/java/org/archive/format/gzip/zipnum/ZipNumWriterTest.java +++ b/src/test/java/org/archive/format/gzip/zipnum/ZipNumWriterTest.java @@ -10,7 +10,7 @@ import java.io.RandomAccessFile; import java.nio.ByteBuffer; import java.nio.channels.FileChannel; -import java.nio.charset.StandardCharsets; +import java.util.Locale; import org.archive.format.gzip.GZIPMemberSeries; import org.archive.format.gzip.GZIPSeriesMember; @@ -18,6 +18,8 @@ import org.junit.jupiter.api.Test; +import static java.nio.charset.StandardCharsets.UTF_8; + import static org.junit.jupiter.api.Assertions.assertEquals; public class ZipNumWriterTest { @@ -28,16 +30,16 @@ public void testAddRecord() throws IOException { File summ = File.createTempFile("test-znw",".summ"); main.deleteOnExit(); summ.deleteOnExit(); - System.out.format("Summ: %s\n", summ.getAbsolutePath()); + System.out.format(Locale.ROOT, "Summ: %s\n", summ.getAbsolutePath()); int limit = 10; ZipNumWriter znw = new ZipNumWriter(new FileOutputStream(main,false), new FileOutputStream(summ,false), limit); for(int i = 0; i < 1000; i++) { - znw.addRecord(String.format("%06d\n",i).getBytes(StandardCharsets.UTF_8)); + znw.addRecord(String.format(Locale.ROOT,"%06d\n",i).getBytes(UTF_8)); } znw.close(); InputStreamReader isr = - new InputStreamReader(new FileInputStream(summ), StandardCharsets.UTF_8); + new InputStreamReader(new FileInputStream(summ), UTF_8); BufferedReader br = new BufferedReader(isr); String line = null; int count = 0; diff --git a/src/test/java/org/archive/format/json/JSONViewTest.java b/src/test/java/org/archive/format/json/JSONViewTest.java index aabbe7df..6d199025 100644 --- a/src/test/java/org/archive/format/json/JSONViewTest.java +++ b/src/test/java/org/archive/format/json/JSONViewTest.java @@ -1,5 +1,7 @@ package org.archive.format.json; +import java.util.Locale; + import org.archive.util.TestUtils; import org.json.JSONException; import org.json.JSONObject; @@ -17,16 +19,16 @@ public void testBytes() throws JSONException { JSONObject o = new JSONObject(); o.append("name1", "val\\rue1"); String json = o.toString(); - System.out.format("once: (%s)\n",json); + System.out.format(Locale.ROOT, "once: (%s)\n", json); JSONObject o2 = new JSONObject(json); - System.out.format("twice: (%s)\n",o2.toString()); + System.out.format(Locale.ROOT, "twice: (%s)\n", o2.toString()); byte b[] = new byte[2]; for(int i = 0; i < 256; i++) { b[0] = (byte) i; int gi = getInt(b); - System.out.format("I(%d) gi(%d)\n",i,gi); + System.out.format(Locale.ROOT, "I(%d) gi(%d)\n", i, gi); } } diff --git a/src/test/java/org/archive/format/text/html/CDATALexerTest.java b/src/test/java/org/archive/format/text/html/CDATALexerTest.java index 856576ba..7c9f24f3 100644 --- a/src/test/java/org/archive/format/text/html/CDATALexerTest.java +++ b/src/test/java/org/archive/format/text/html/CDATALexerTest.java @@ -10,6 +10,8 @@ import static org.junit.jupiter.api.Assertions.*; +import java.util.Locale; + public class CDATALexerTest { CDATALexer l; Node n; @@ -102,7 +104,7 @@ public void testInJSComment() throws ParserException { } private void assertJSContentWorks(String js) throws ParserException { - String html = String.format("",js); + String html = String.format(Locale.ROOT,"",js); l = makeLexer(html); assertFalse(l.inCSS()); assertFalse(l.inJS()); diff --git a/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java b/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java index 005e2c49..5d31b890 100644 --- a/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java +++ b/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java @@ -31,6 +31,8 @@ import org.archive.io.warc.WARCRecord; import org.junit.jupiter.api.Test; +import static java.nio.charset.StandardCharsets.UTF_8; + import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -68,7 +70,7 @@ public void testParseHttpHeadersInWARC() throws IOException { final String hdr = warcHeader + HTTPHEADER + BODY; - WARCRecord r = new WARCRecord(new ByteArrayInputStream(hdr.getBytes()), + WARCRecord r = new WARCRecord(new ByteArrayInputStream(hdr.getBytes(UTF_8)), "READER_IDENTIFIER", 0, false, true); HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true); @@ -76,7 +78,7 @@ public void testParseHttpHeadersInWARC() throws IOException { byte[] b = new byte[BODY.length()]; har.read(b); - String bodyRead = new String(b); + String bodyRead = new String(b, UTF_8); assertEquals(BODY, bodyRead); assertHeaderCorrectlyParsed(har.getContentHeaders()); assertEquals(har.getHeader().getUrl(), url, @@ -156,14 +158,14 @@ public String getVersion() { } }; - ARCRecord r = new ARCRecord(new ByteArrayInputStream(hdr.getBytes()), + ARCRecord r = new ARCRecord(new ByteArrayInputStream(hdr.getBytes(UTF_8)), arh, 0, false, true, false); HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true); har.skipHttpHeader(); byte[] b = new byte[BODY.length()]; har.read(b); - String bodyRead = new String(b); + String bodyRead = new String(b, UTF_8); assertEquals(BODY, bodyRead); assertHeaderCorrectlyParsed(har.getContentHeaders()); } @@ -175,14 +177,14 @@ public void testEasierParseHttpHeadersInARC() throws IOException { + " 192.168.0.1 20070515111004 text/html 167568\n"; final String hdr = arcHeader + HTTPHEADER + BODY; - ARCRecord r = new ARCRecord(new ByteArrayInputStream(hdr.getBytes()), + ARCRecord r = new ARCRecord(new ByteArrayInputStream(hdr.getBytes(UTF_8)), "READER_IDENTIFIER", 0, false, true, false); HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true); har.skipHttpHeader(); byte[] b = new byte[BODY.length()]; har.read(b); - String bodyRead = new String(b); + String bodyRead = new String(b, UTF_8); assertEquals(BODY, bodyRead); assertHeaderCorrectlyParsed(har.getContentHeaders()); assertEquals(har.getHeader().getUrl(), url, "failed to retrieve Url from metadata"); @@ -205,7 +207,7 @@ public void testNoheaderWARC() throws IOException { String c = "WARC/0.12\r\nContent-Type: text/plain\r\n" + "Content-Length: " + b.length() + "\r\n\r\n" + b; org.archive.io.warc.WARCRecord r = new org.archive.io.warc.WARCRecord( - new ByteArrayInputStream(c.getBytes()), "READER_IDENTIFIER", 0, + new ByteArrayInputStream(c.getBytes(UTF_8)), "READER_IDENTIFIER", 0, false, true); HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true); assertTrue(har.isStrict()); diff --git a/src/test/java/org/archive/io/RecordingInputStreamTest.java b/src/test/java/org/archive/io/RecordingInputStreamTest.java index 49160aa3..74e92024 100644 --- a/src/test/java/org/archive/io/RecordingInputStreamTest.java +++ b/src/test/java/org/archive/io/RecordingInputStreamTest.java @@ -28,6 +28,8 @@ import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; +import static java.nio.charset.StandardCharsets.UTF_8; + import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -55,7 +57,7 @@ public void testReadFullyOrUntil() throws RecorderTimeoutException, IOException, RecordingInputStream ris = new RecordingInputStream(16384, (new File( tempDir, "testReadFullyOrUntil").getAbsolutePath())); ByteArrayInputStream bais = new ByteArrayInputStream( - "abcdefghijklmnopqrstuvwxyz".getBytes()); + "abcdefghijklmnopqrstuvwxyz".getBytes(UTF_8)); // test soft max ris.open(bais); ris.setLimits(10,0,0); @@ -64,8 +66,9 @@ public void testReadFullyOrUntil() throws RecorderTimeoutException, IOException, ReplayInputStream res = ris.getReplayInputStream(); ByteArrayOutputStream baos = new ByteArrayOutputStream(); res.readFullyTo(baos); - assertEquals("abcdefg",new String(baos.toByteArray()),"soft max cutoff"); - // test hard max + assertEquals("abcdefg", new String(baos.toByteArray(), UTF_8), + "soft max cutoff"); + // test hard max bais.reset(); baos.reset(); ris.open(bais); @@ -80,14 +83,14 @@ public void testReadFullyOrUntil() throws RecorderTimeoutException, IOException, ris.close(); res = ris.getReplayInputStream(); res.readFullyTo(baos); - assertEquals("abcdefghijk",new String(baos.toByteArray()), - "hard max cutoff"); + assertEquals("abcdefghijk", new String(baos.toByteArray(), UTF_8), + "hard max cutoff"); // test timeout PipedInputStream pin = new PipedInputStream(); PipedOutputStream pout = new PipedOutputStream(pin); ris.open(pin); exceptionThrown = false; - trickle("abcdefghijklmnopqrstuvwxyz".getBytes(),pout); + trickle("abcdefghijklmnopqrstuvwxyz".getBytes(UTF_8),pout); int timeout = 200; try { ris.setLimits(0, timeout,0); @@ -133,10 +136,10 @@ public void testAsOutputStream() throws IOException { RecordingInputStream ris = new RecordingInputStream(16384, (new File( tempDir, "testAsOutputStream").getAbsolutePath())); ris.open(null); - ris.asOutputStream().write("hello".getBytes()); + ris.asOutputStream().write("hello".getBytes(UTF_8)); ris.close(); ByteArrayOutputStream baos = new ByteArrayOutputStream(); ris.getReplayInputStream().readFullyTo(baos); - assertEquals("hello", baos.toString()); + assertEquals("hello", baos.toString(UTF_8.name())); } } diff --git a/src/test/java/org/archive/io/RecordingOutputStreamTest.java b/src/test/java/org/archive/io/RecordingOutputStreamTest.java index c94f8245..0dba910e 100644 --- a/src/test/java/org/archive/io/RecordingOutputStreamTest.java +++ b/src/test/java/org/archive/io/RecordingOutputStreamTest.java @@ -28,6 +28,8 @@ import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; +import static java.nio.charset.StandardCharsets.UTF_8; + import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; @@ -266,61 +268,61 @@ public void testMessageBodyBegin() throws IOException { ros.setSha1Digest(); ros.open(new ByteArrayOutputStream()); - ros.write("0123456789\n\nabcdefghij".getBytes()); + ros.write("0123456789\n\nabcdefghij".getBytes(UTF_8)); assertEquals(12, ros.getMessageBodyBegin()); assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue())); ros.close(); ros.open(new ByteArrayOutputStream()); - ros.write("0123456789\r\n\r\nabcdefghij".getBytes()); + ros.write("0123456789\r\n\r\nabcdefghij".getBytes(UTF_8)); assertEquals(14, ros.getMessageBodyBegin()); assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue())); ros.close(); ros.open(new ByteArrayOutputStream()); - ros.write("0123456789\n\r\nabcdefghij".getBytes()); + ros.write("0123456789\n\r\nabcdefghij".getBytes(UTF_8)); assertEquals(13, ros.getMessageBodyBegin()); assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue())); ros.close(); ros.open(new ByteArrayOutputStream()); - ros.write("0123456789\n".getBytes()); + ros.write("0123456789\n".getBytes(UTF_8)); assertEquals(-1, ros.getMessageBodyBegin()); - ros.write("\nabcdefghij".getBytes()); + ros.write("\nabcdefghij".getBytes(UTF_8)); assertEquals(12, ros.getMessageBodyBegin()); assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue())); ros.close(); ros.open(new ByteArrayOutputStream()); - ros.write("0123456789\n".getBytes()); + ros.write("0123456789\n".getBytes(UTF_8)); assertEquals(-1, ros.getMessageBodyBegin()); - ros.write("\r\nabcdefghij".getBytes()); + ros.write("\r\nabcdefghij".getBytes(UTF_8)); assertEquals(13, ros.getMessageBodyBegin()); assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue())); ros.close(); ros.open(new ByteArrayOutputStream()); - ros.write("0123456789\n\r".getBytes()); + ros.write("0123456789\n\r".getBytes(UTF_8)); assertEquals(-1, ros.getMessageBodyBegin()); - ros.write("\nabcdefghij".getBytes()); + ros.write("\nabcdefghij".getBytes(UTF_8)); assertEquals(13, ros.getMessageBodyBegin()); assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue())); ros.close(); ros.open(new ByteArrayOutputStream()); - ros.write("0123456789".getBytes()); + ros.write("0123456789".getBytes(UTF_8)); ros.write('\n'); assertEquals(-1, ros.getMessageBodyBegin()); - ros.write("\nabcdefghij".getBytes()); + ros.write("\nabcdefghij".getBytes(UTF_8)); assertEquals(12, ros.getMessageBodyBegin()); assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue())); ros.close(); ros.open(new ByteArrayOutputStream()); - ros.write("0123456789".getBytes()); + ros.write("0123456789".getBytes(UTF_8)); ros.write('\n'); ros.write('\n'); - for (int b: "abcdefghij".getBytes()) { + for (int b: "abcdefghij".getBytes(UTF_8)) { ros.write(b); } assertEquals(12, ros.getMessageBodyBegin()); @@ -328,11 +330,11 @@ public void testMessageBodyBegin() throws IOException { ros.close(); ros.open(new ByteArrayOutputStream()); - ros.write("0123456789".getBytes()); + ros.write("0123456789".getBytes(UTF_8)); ros.write('\n'); ros.write('\r'); ros.write('\n'); - for (int b: "abcdefghij".getBytes()) { + for (int b: "abcdefghij".getBytes(UTF_8)) { ros.write(b); } assertEquals(13, ros.getMessageBodyBegin()); @@ -340,17 +342,17 @@ public void testMessageBodyBegin() throws IOException { ros.close(); ros.open(new ByteArrayOutputStream()); - ros.write("0123456789\n".getBytes()); + ros.write("0123456789\n".getBytes(UTF_8)); ros.write('\n'); - ros.write("abcdefghij".getBytes()); + ros.write("abcdefghij".getBytes(UTF_8)); assertEquals(12, ros.getMessageBodyBegin()); assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue())); ros.close(); ros.open(new ByteArrayOutputStream()); - ros.write("0123456789\n\r".getBytes()); + ros.write("0123456789\n\r".getBytes(UTF_8)); ros.write('\n'); - ros.write("abcdefghij".getBytes()); + ros.write("abcdefghij".getBytes(UTF_8)); assertEquals(13, ros.getMessageBodyBegin()); assertEquals("22GBTIFDIW36VN4NLYI6TEOAE3WGBW3D", Base32.encode(ros.getDigestValue())); ros.close(); diff --git a/src/test/java/org/archive/io/ReplayCharSequenceTest.java b/src/test/java/org/archive/io/ReplayCharSequenceTest.java index 3234259c..3935837b 100644 --- a/src/test/java/org/archive/io/ReplayCharSequenceTest.java +++ b/src/test/java/org/archive/io/ReplayCharSequenceTest.java @@ -25,17 +25,21 @@ import java.nio.charset.StandardCharsets; import java.text.NumberFormat; import java.util.Date; +import java.util.Locale; import java.util.Random; import java.util.logging.Logger; import org.archive.util.FileUtils; -import com.google.common.base.Charsets; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; +import static java.nio.charset.StandardCharsets.US_ASCII; +import static java.nio.charset.StandardCharsets.ISO_8859_1; +import static java.nio.charset.StandardCharsets.UTF_8; + import static org.junit.jupiter.api.Assertions.*; /** @@ -133,7 +137,7 @@ public void testGetReplayCharSequenceMultiByteZeroOffset() RecordingOutputStream ros = writeTestStream( regularBuffer,MULTIPLIER, "testGetReplayCharSequenceMultiByteZeroOffset",MULTIPLIER); - ReplayCharSequence rcs = getReplayCharSequence(ros,Charsets.UTF_8); + ReplayCharSequence rcs = getReplayCharSequence(ros, UTF_8); for (int i = 0; i < MULTIPLIER; i++) { accessingCharacters(rcs); @@ -143,7 +147,7 @@ public void testGetReplayCharSequenceMultiByteZeroOffset() @Test public void testReplayCharSequenceByteToString() throws IOException { String fileContent = "Some file content"; - byte [] buffer = fileContent.getBytes(); + byte [] buffer = fileContent.getBytes(UTF_8); RecordingOutputStream ros = writeTestStream( buffer,1, "testReplayCharSequenceByteToString.txt",0); @@ -179,7 +183,7 @@ public void testSingleByteEncodings() throws IOException { String latin1String = new String(bytes, "latin1"); RecordingOutputStream ros = writeTestStream( bytes, 1, "testSingleByteEncodings-latin1.txt", 0); - ReplayCharSequence rcs = getReplayCharSequence(ros,Charsets.ISO_8859_1); + ReplayCharSequence rcs = getReplayCharSequence(ros, ISO_8859_1); String result = rcs.toString(); logger.fine("latin1[0] " + toHexString(latin1String)); logger.fine("latin1[1] " + toHexString(result)); @@ -207,7 +211,7 @@ public void testSingleByteEncodings() throws IOException { @Test public void testReplayCharSequenceByteToStringOverflow() throws IOException { String fileContent = "Some file content. "; // ascii - byte [] buffer = fileContent.getBytes(); + byte [] buffer = fileContent.getBytes(UTF_8); RecordingOutputStream ros = writeTestStream( buffer,1, "testReplayCharSequenceByteToStringOverflow.txt",1); @@ -217,8 +221,8 @@ public void testReplayCharSequenceByteToStringOverflow() throws IOException { // both encodings because they exercise different code paths. UTF-8 is // decoded to UTF-16 while windows-1252 is memory mapped directly. See // GenericReplayCharSequence - ReplayCharSequence rcsUtf8 = getReplayCharSequence(ros,Charsets.UTF_8); - ReplayCharSequence rcs1252 = getReplayCharSequence(ros,Charset.forName("windows-1252")); + ReplayCharSequence rcsUtf8 = getReplayCharSequence(ros, UTF_8); + ReplayCharSequence rcs1252 = getReplayCharSequence(ros, Charset.forName("windows-1252")); String result = rcsUtf8.toString(); assertEquals(expectedContent, result, "Strings don't match"); @@ -242,7 +246,7 @@ public void testReplayCharSequenceByteToStringMulti() throws IOException { buffer,1, "testReplayCharSequenceByteToStringMulti.txt",MULTIPLICAND-1); for (int i = 0; i < 3; i++) { - ReplayCharSequence rcs = getReplayCharSequence(ros,StandardCharsets.UTF_8); + ReplayCharSequence rcs = getReplayCharSequence(ros, UTF_8); String result = rcs.toString(); assertEquals(result, expectedResult, "Strings don't match"); rcs.close(); @@ -255,8 +259,7 @@ public void testReplayCharSequenceByteToStringMulti() throws IOException { @Disabled public void xestHugeReplayCharSequence() throws IOException { String fileContent = "01234567890123456789"; - String characterEncoding = "ascii"; - byte[] buffer = fileContent.getBytes(characterEncoding); + byte[] buffer = fileContent.getBytes(US_ASCII); long reps = (long) Integer.MAX_VALUE / (long) buffer.length + 1000000l; @@ -264,7 +267,7 @@ public void xestHugeReplayCharSequence() throws IOException { + " bytes to testHugeReplayCharSequence.txt"); RecordingOutputStream ros = writeTestStream(buffer, 0, "testHugeReplayCharSequence.txt", reps); - ReplayCharSequence rcs = getReplayCharSequence(ros,Charset.forName(characterEncoding)); + ReplayCharSequence rcs = getReplayCharSequence(ros, US_ASCII); if (reps * fileContent.length() > (long) Integer.MAX_VALUE) { assertEquals(Integer.MAX_VALUE, rcs.length(), "ReplayCharSequence has wrong length (length()=" @@ -283,7 +286,7 @@ public void xestHugeReplayCharSequence() throws IOException { // NumberFormat.getInstance().format(index)); assertEquals(fileContent.charAt(index % fileContent.length()), rcs.charAt(index), "Characters don't match (index=" - + NumberFormat.getInstance().format(index) + ")"); + + NumberFormat.getInstance(Locale.ROOT).format(index) + ")"); } // check that out of bounds indices throw exception @@ -307,7 +310,7 @@ public void xestHugeReplayCharSequence() throws IOException { // NumberFormat.getInstance().format(index)); assertEquals(fileContent.charAt(index % fileContent.length()), rcs.charAt(index), "Characters don't match (index=" - + NumberFormat.getInstance().format(index) + ")"); + + NumberFormat.getInstance(Locale.ROOT).format(index) + ")"); } } diff --git a/src/test/java/org/archive/io/RepositionableInputStreamTest.java b/src/test/java/org/archive/io/RepositionableInputStreamTest.java index 228c9042..4aad11b9 100644 --- a/src/test/java/org/archive/io/RepositionableInputStreamTest.java +++ b/src/test/java/org/archive/io/RepositionableInputStreamTest.java @@ -21,12 +21,15 @@ import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; +import java.io.OutputStreamWriter; import java.io.PrintWriter; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; +import static java.nio.charset.StandardCharsets.UTF_8; + import static org.junit.jupiter.api.Assertions.assertEquals; public class RepositionableInputStreamTest { @@ -38,7 +41,7 @@ public class RepositionableInputStreamTest { @BeforeEach protected void setUp() throws Exception { this.testFile = new File(tempDir, this.getClass().getName()); - PrintWriter pw = new PrintWriter(new FileOutputStream(testFile)); + PrintWriter pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream(testFile), UTF_8)); for (int i = 0; i < 100; i++) { pw.print(LINE); } @@ -63,7 +66,7 @@ public void testname() throws Exception { long offset = 0; for (int i = 0; i < 10; i++) { ris.read(bytes, 0, LINE.length()); - assertEquals(LINE, new String(bytes)); + assertEquals(LINE, new String(bytes, UTF_8)); offset += LINE.length(); assertEquals(offset, ris.position()); } diff --git a/src/test/java/org/archive/io/arc/ARCWriterPoolTest.java b/src/test/java/org/archive/io/arc/ARCWriterPoolTest.java index 954da636..f6820337 100644 --- a/src/test/java/org/archive/io/arc/ARCWriterPoolTest.java +++ b/src/test/java/org/archive/io/arc/ARCWriterPoolTest.java @@ -30,6 +30,8 @@ import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; +import static java.nio.charset.StandardCharsets.UTF_8; + import static org.junit.jupiter.api.Assertions.assertEquals; import static org.archive.format.arc.ARCConstants.*; @@ -51,7 +53,7 @@ public void testARCWriterPool() WriterPoolMember [] writers = new WriterPoolMember[MAX_ACTIVE]; final String CONTENT = "Any old content"; ByteArrayOutputStream baos = new ByteArrayOutputStream(); - baos.write(CONTENT.getBytes()); + baos.write(CONTENT.getBytes(UTF_8)); for (int i = 0; i < MAX_ACTIVE; i++) { writers[i] = pool.borrowFile(); assertEquals(i + 1, pool.getNumActive(), "Number active"); @@ -81,7 +83,7 @@ public void testInvalidate() throws Exception { WriterPoolMember [] writers = new WriterPoolMember[MAX_ACTIVE]; final String CONTENT = "Any old content"; ByteArrayOutputStream baos = new ByteArrayOutputStream(); - baos.write(CONTENT.getBytes()); + baos.write(CONTENT.getBytes(UTF_8)); for (int i = 0; i < MAX_ACTIVE; i++) { writers[i] = pool.borrowFile(); assertEquals(i + 1, pool.getNumActive(), "Number active"); @@ -124,4 +126,4 @@ private WriterPoolSettings getSettings(final boolean isCompressed) { Arrays.asList(files), null); } -} \ No newline at end of file +} diff --git a/src/test/java/org/archive/io/arc/ARCWriterTest.java b/src/test/java/org/archive/io/arc/ARCWriterTest.java index ca300697..f6c48462 100644 --- a/src/test/java/org/archive/io/arc/ARCWriterTest.java +++ b/src/test/java/org/archive/io/arc/ARCWriterTest.java @@ -47,6 +47,8 @@ import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; +import static java.nio.charset.StandardCharsets.UTF_8; + import static org.junit.jupiter.api.Assertions.*; import static org.archive.format.arc.ARCConstants.*; @@ -122,11 +124,11 @@ protected int writeRandomHTTPRecord(ARCWriter arcWriter, int index) // Start the record with an arbitrary 14-digit date per RFC2540 String now = ArchiveUtils.get14DigitDate(); int recordLength = 0; - byte[] record = (getContent(indexStr)).getBytes(); + byte[] record = (getContent(indexStr)).getBytes(UTF_8); recordLength += record.length; baos.write(record); // Add the newline between records back in - baos.write("\n".getBytes()); + baos.write("\n".getBytes(UTF_8)); recordLength += 1; arcWriter.write("http://www.one.net/id=" + indexStr, "text/html", "0.1.2.3", Long.parseLong(now), recordLength, baos); @@ -260,7 +262,7 @@ public void testWriteRecordCompressed() throws IOException { } public void testWriteGiantRecord() throws IOException { - PrintStream dummyStream = new PrintStream(new NullOutputStream()); + PrintStream dummyStream = new PrintStream(new NullOutputStream(), false, UTF_8.name()); ARCWriter arcWriter = new ARCWriter( SERIAL_NO, @@ -305,7 +307,7 @@ protected CorruptibleARCWriter createARCWriter(String name, boolean compress) { protected static ByteArrayInputStream getBais(String str) throws IOException { - return new ByteArrayInputStream(str.getBytes()); + return new ByteArrayInputStream(str.getBytes(UTF_8)); } /** @@ -417,7 +419,7 @@ protected void lengthTooShort(String name, boolean compress, boolean strict) ByteArrayInputStream bais = getBais(content+"SOME TRAILING BYTES"); writeRecord(writer, SOME_URL, "text/html", content.length(), bais); - writer.setEndJunk("SOME TRAILING BYTES".getBytes()); + writer.setEndJunk("SOME TRAILING BYTES".getBytes(UTF_8)); writeRecord(writer, SOME_URL, "text/html", content.length(), getBais(content)); } finally { @@ -429,7 +431,7 @@ protected void lengthTooShort(String name, boolean compress, boolean strict) PrintStream origErr = System.err; ARCReader r = null; try { - System.setErr(new PrintStream(os)); + System.setErr(new PrintStream(os, false, UTF_8.name())); r = ARCReaderFactory.get(writer.getFile()); r.setStrict(strict); @@ -438,7 +440,7 @@ protected void lengthTooShort(String name, boolean compress, boolean strict) // Make sure we get the warning string which complains about the // trailing bytes. - String err = os.toString(); + String err = os.toString(UTF_8.name()); assertTrue(err.startsWith("WARNING") && (err.indexOf("Record STARTING at") > 0), "No message " + err); r.close(); @@ -494,7 +496,7 @@ protected void lengthTooLong(String name, boolean compress, PrintStream origErr = System.err; ARCReader r = null; try { - System.setErr(new PrintStream(os)); + System.setErr(new PrintStream(os, false, UTF_8.name())); r = ARCReaderFactory.get(writer.getFile()); r.setStrict(strict); @@ -503,7 +505,7 @@ protected void lengthTooLong(String name, boolean compress, // Make sure we get the warning string which complains about the // trailing bytes. - String err = os.toString(); + String err = os.toString(UTF_8.name()); assertTrue(err.startsWith("WARNING Premature EOF before end-of-record"), "No message " + err); } finally { @@ -518,7 +520,7 @@ public void testGapError() throws IOException { String content = getContent(); // Make a 'weird' RIS that returns bad 'remaining' length // awhen remaining should be 0 - ReplayInputStream ris = new ReplayInputStream(content.getBytes(), + ReplayInputStream ris = new ReplayInputStream(content.getBytes(UTF_8), content.length(), null) { public long remaining() { return (super.remaining()==0) ? -1 : super.remaining(); diff --git a/src/test/java/org/archive/io/warc/WARCWriterTest.java b/src/test/java/org/archive/io/warc/WARCWriterTest.java index c0ace5f0..d2684fa4 100644 --- a/src/test/java/org/archive/io/warc/WARCWriterTest.java +++ b/src/test/java/org/archive/io/warc/WARCWriterTest.java @@ -42,6 +42,8 @@ import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; +import static java.nio.charset.StandardCharsets.UTF_8; + import static org.junit.jupiter.api.Assertions.*; import static org.archive.format.warc.WARCConstants.*; @@ -228,7 +230,7 @@ protected int writeRandomHTTPRecord(WARCWriter w, int index) String indexStr = Integer.toString(index); recordInfo.setUrl("http://www.one.net/id=" + indexStr); - byte[] record = (getContent(indexStr)).getBytes(); + byte[] record = (getContent(indexStr)).getBytes(UTF_8); recordInfo.setContentLength((long) record.length); ByteArrayOutputStream baos = new ByteArrayOutputStream(); @@ -385,7 +387,7 @@ protected WARCWriter createWARCWriter(String name, protected static ByteArrayOutputStream getBaos(String str) throws IOException { ByteArrayOutputStream baos = new ByteArrayOutputStream(); - baos.write(str.getBytes()); + baos.write(str.getBytes(UTF_8)); return baos; } @@ -524,4 +526,4 @@ public void testArcRecordOffsetReads() throws Exception { assertTrue(totalRead > 0); } } -} \ No newline at end of file +} diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java index 157499ff..e34d4e6f 100644 --- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java +++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java @@ -3,6 +3,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; +import java.util.Locale; import java.util.logging.Logger; import org.archive.extract.ExtractingResourceFactoryMapper; @@ -52,7 +53,7 @@ public void testHandleStyleNodeExceptions() throws Exception { TextNode tn = new TextNode(css); epo.handleStyleNode(tn); } catch(Exception e) { - System.err.format("And the winner is....(%s)\n", css); + System.err.format(Locale.ROOT, "And the winner is....(%s)\n", css); e.printStackTrace(); except = true; throw e; diff --git a/src/test/java/org/archive/resource/html/HTMLMetaDataTest.java b/src/test/java/org/archive/resource/html/HTMLMetaDataTest.java index 3b4193b9..a3c8c1c9 100644 --- a/src/test/java/org/archive/resource/html/HTMLMetaDataTest.java +++ b/src/test/java/org/archive/resource/html/HTMLMetaDataTest.java @@ -1,5 +1,7 @@ package org.archive.resource.html; +import java.util.Locale; + import org.json.JSONArray; import org.json.JSONException; import org.json.JSONObject; @@ -59,7 +61,7 @@ private void appendStrArr(JSONObject o, String a[][]) throws JSONException { } private void appendStrArr2(JSONObject o, String k, String... a) throws JSONException { - System.out.format("A length(%d)\n", a.length); + System.out.format(Locale.ROOT, "A length(%d)\n", a.length); JSONObject n = new JSONObject(); if((a.length & 1) == 1) { throw new IllegalArgumentException(); diff --git a/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java b/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java index 19b1984f..45989416 100644 --- a/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java +++ b/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java @@ -1,6 +1,7 @@ package org.archive.url; import java.net.URISyntaxException; +import java.util.Locale; import org.junit.jupiter.api.Test; @@ -204,12 +205,12 @@ public void testFoo() { String path = "/a/b/c/"; String[] paths = path.split("/",-1); for(String p : paths) { - System.out.format("(%s)",p); + System.out.format(Locale.ROOT, "(%s)", p); } System.out.println(); paths = path.split("/"); for(String p : paths) { - System.out.format("(%s)",p); + System.out.format(Locale.ROOT, "(%s)", p); } System.out.println(); } diff --git a/src/test/java/org/archive/url/URLParserTest.java b/src/test/java/org/archive/url/URLParserTest.java index bc8fc3a5..c942a260 100644 --- a/src/test/java/org/archive/url/URLParserTest.java +++ b/src/test/java/org/archive/url/URLParserTest.java @@ -3,10 +3,14 @@ import java.io.UnsupportedEncodingException; import java.net.URISyntaxException; import java.net.URLDecoder; +import java.util.Locale; import com.google.common.net.InetAddresses; + import org.junit.jupiter.api.Test; +import static java.nio.charset.StandardCharsets.UTF_8; + import static org.junit.jupiter.api.Assertions.assertEquals; public class URLParserTest { @@ -15,7 +19,7 @@ public void testGuava() throws URIException, UnsupportedEncodingException { Long l = Long.parseLong("3279880203"); int i2 = l.intValue(); // int i = Integer.decode("3279880203"); - System.err.format("FromNum(%s)\n", InetAddresses.fromInteger(i2).getHostAddress()); + System.err.format(Locale.ROOT, "FromNum(%s)\n", InetAddresses.fromInteger(i2).getHostAddress()); } @Test @@ -30,7 +34,7 @@ public void testAddDefaultSchemeIfNeeded() { @Test public void testParse() throws UnsupportedEncodingException, URISyntaxException { - System.out.format("O(%s) E(%s)\n","%66",URLDecoder.decode("%66","UTF-8")); + System.out.format(Locale.ROOT, "O(%s) E(%s)\n","%66", URLDecoder.decode("%66", UTF_8.name())); checkParse("http://www.archive.org/index.html#foo", null, "http", null, null, "www.archive.org", -1, "/index.html", null, "foo", "http://www.archive.org/index.html#foo", "/index.html"); @@ -96,7 +100,7 @@ private void checkParse(String s, String opaque, String scheme, String authUser, String authPass, String host, int port, String path, String query, String fragment, String urlString, String pathQuery) throws URISyntaxException { HandyURL h = URLParser.parse(s); - System.out.format("Input:(%s)\nHandyURL\t%s\n",s,h.toDebugString()); + System.out.format(Locale.ROOT, "Input:(%s)\nHandyURL\t%s\n", s, h.toDebugString()); assertEquals(scheme, h.getScheme()); assertEquals(authUser, h.getAuthUser()); assertEquals(authPass, h.getAuthPass()); diff --git a/src/test/java/org/archive/url/URLRegexTransformerTest.java b/src/test/java/org/archive/url/URLRegexTransformerTest.java index 73c43f96..d5c98f6a 100644 --- a/src/test/java/org/archive/url/URLRegexTransformerTest.java +++ b/src/test/java/org/archive/url/URLRegexTransformerTest.java @@ -5,6 +5,8 @@ import static org.junit.jupiter.api.Assertions.assertEquals; +import java.util.Locale; + public class URLRegexTransformerTest { @Test @@ -49,7 +51,7 @@ public void testStripPathSessionID() { private static void checkStripPathSessionID(String orig, String want) { String got = URLRegexTransformer.stripPathSessionID(orig); - assertEquals(want, got, String.format("FAIL Orig(%s) Got(%s) Want(%s)", orig, got, want)); + assertEquals(want, got, String.format(Locale.ROOT, "FAIL Orig(%s) Got(%s) Want(%s)", orig, got, want)); } // private static final String BASE = "http://www.archive.org/index.html"; diff --git a/src/test/java/org/archive/util/ByteOpTest.java b/src/test/java/org/archive/util/ByteOpTest.java index 49781c36..eb89353e 100644 --- a/src/test/java/org/archive/util/ByteOpTest.java +++ b/src/test/java/org/archive/util/ByteOpTest.java @@ -4,6 +4,7 @@ import java.io.ByteArrayOutputStream; import java.io.DataInputStream; import java.io.IOException; +import java.util.Locale; import com.google.common.io.LittleEndianDataOutputStream; @@ -18,10 +19,10 @@ public void testReadShort() throws IOException { byte a[] = new byte[]{0,1,2,3}; ByteArrayInputStream bais = new ByteArrayInputStream(a); int bos = ByteOp.readShort(bais); - System.out.format("BO.Read short(%d)\n", bos); + System.out.format(Locale.ROOT, "BO.Read short(%d)\n", bos); DataInputStream dis = new DataInputStream(new ByteArrayInputStream(a)); int disv = dis.readUnsignedShort(); - System.out.format("DI.Read short(%d)\n", disv); + System.out.format(Locale.ROOT, "DI.Read short(%d)\n", disv); for(int i = 0; i < 256 * 256; i++) { ByteArrayOutputStream baos = new ByteArrayOutputStream(2); LittleEndianDataOutputStream dos = new LittleEndianDataOutputStream(baos); diff --git a/src/test/java/org/archive/util/CrossProductTest.java b/src/test/java/org/archive/util/CrossProductTest.java index 211fa65e..a487ab15 100644 --- a/src/test/java/org/archive/util/CrossProductTest.java +++ b/src/test/java/org/archive/util/CrossProductTest.java @@ -2,10 +2,12 @@ import java.util.ArrayList; import java.util.List; +import java.util.Locale; import org.junit.jupiter.api.Test; public class CrossProductTest { + private void dumpC(List a) { StringBuilder sb = new StringBuilder(); boolean first = false; @@ -19,16 +21,19 @@ private void dumpC(List a) { } System.out.println("Dump:" + sb.toString()); } + private void dumpLOL(List> coc) { for(List co : coc) { dumpC(co); } } + @Test public void testVersion() { String version = IAUtils.loadCommonsVersion(); - System.out.format("Loaded version(%s)\n", version); + System.out.format(Locale.ROOT, "Loaded version(%s)\n", version); } + @Test public void testCrossProduct() { ArrayList> input = new ArrayList>(); @@ -40,6 +45,7 @@ public void testCrossProduct() { List> cross = xp.crossProduct(input); dumpLOL(cross); } + private List AtoL(Object... a) { ArrayList al = new ArrayList(a.length); for(Object s : a) { diff --git a/src/test/java/org/archive/util/FileUtilsTest.java b/src/test/java/org/archive/util/FileUtilsTest.java index bd58bd09..51c416f0 100644 --- a/src/test/java/org/archive/util/FileUtilsTest.java +++ b/src/test/java/org/archive/util/FileUtilsTest.java @@ -185,7 +185,6 @@ public void testTailLinesNakedWindows() throws IOException { verifyTailLines(nakedLastLineWindows); } - @SuppressWarnings("unchecked") private void verifyTailLines(File file) throws IOException { List lines = org.apache.commons.io.FileUtils.readLines(file); verifyTailLines(file, lines, 1, 80); @@ -263,7 +262,6 @@ public void testHeadLinesNakedWindows() throws IOException { } - @SuppressWarnings("unchecked") private void verifyHeadLines(File file) throws IOException { List lines = org.apache.commons.io.FileUtils.readLines(file); verifyHeadLines(file, lines, 1, 80); diff --git a/src/test/java/org/archive/util/TestUtils.java b/src/test/java/org/archive/util/TestUtils.java index 01b0d099..b8fee0f4 100644 --- a/src/test/java/org/archive/util/TestUtils.java +++ b/src/test/java/org/archive/util/TestUtils.java @@ -3,6 +3,7 @@ import java.io.IOException; import java.io.InputStream; import java.util.List; +import java.util.Locale; import com.google.common.io.ByteStreams; @@ -12,9 +13,9 @@ public class TestUtils { public static void dumpMatch(String context, List> res) { - System.out.format("Context(%s) Found (%d) matches\n", context, res.size()); + System.out.format(Locale.ROOT, "Context(%s) Found (%d) matches\n", context, res.size()); for(List r : res) { - System.out.format("Match(%s)\n", StringParse.join(r)); + System.out.format(Locale.ROOT, "Match(%s)\n", StringParse.join(r)); } } diff --git a/src/test/java/org/archive/util/binsearch/SortedTextFileTest.java b/src/test/java/org/archive/util/binsearch/SortedTextFileTest.java index 5e8889e5..26d7a16d 100644 --- a/src/test/java/org/archive/util/binsearch/SortedTextFileTest.java +++ b/src/test/java/org/archive/util/binsearch/SortedTextFileTest.java @@ -4,20 +4,26 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.io.PrintWriter; +import java.io.UnsupportedEncodingException; +import java.util.Locale; import org.archive.util.binsearch.impl.RandomAccessFileSeekableLineReaderFactory; import org.archive.util.iterator.CloseableIterator; import org.junit.jupiter.api.Test; +import static java.nio.charset.StandardCharsets.UTF_8; + import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; public class SortedTextFileTest { + private static String formatS(int i) { - return String.format("%07d",i); + return String.format(Locale.ROOT, "%07d", i); } - private void createFile(File target, int max) throws FileNotFoundException { - PrintWriter pw = new PrintWriter(target); + + private void createFile(File target, int max) throws FileNotFoundException, UnsupportedEncodingException { + PrintWriter pw = new PrintWriter(target, UTF_8.name()); for(int i = 0; i < max; i++) { pw.println(formatS(i)); } diff --git a/src/test/java/org/archive/util/iterator/FilterStringIteratorTest.java b/src/test/java/org/archive/util/iterator/FilterStringIteratorTest.java index 20143289..6d5685ad 100644 --- a/src/test/java/org/archive/util/iterator/FilterStringIteratorTest.java +++ b/src/test/java/org/archive/util/iterator/FilterStringIteratorTest.java @@ -5,7 +5,6 @@ import java.util.List; import java.util.TreeSet; -import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.*; diff --git a/src/test/java/org/archive/util/iterator/SortedCompositeIteratorTest.java b/src/test/java/org/archive/util/iterator/SortedCompositeIteratorTest.java index 98de1416..fa1213f7 100644 --- a/src/test/java/org/archive/util/iterator/SortedCompositeIteratorTest.java +++ b/src/test/java/org/archive/util/iterator/SortedCompositeIteratorTest.java @@ -2,14 +2,17 @@ import java.io.BufferedReader; import java.io.File; +import java.io.FileInputStream; import java.io.FileNotFoundException; -import java.io.FileReader; import java.io.IOException; +import java.io.InputStreamReader; import java.io.PrintWriter; import java.util.Comparator; import org.junit.jupiter.api.Test; +import static java.nio.charset.StandardCharsets.UTF_8; + import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -21,19 +24,19 @@ public void testHasNext() throws FileNotFoundException, IOException { File a = File.createTempFile("filea", null); File b = File.createTempFile("fileb", null); - PrintWriter apw = new PrintWriter(a); - PrintWriter bpw = new PrintWriter(b); + PrintWriter apw = new PrintWriter(a, UTF_8.name()); + PrintWriter bpw = new PrintWriter(b, UTF_8.name()); apw.println("1"); apw.println("3"); bpw.println("2"); bpw.println("4"); apw.close(); bpw.close(); - BufferedReader abr = new BufferedReader(new FileReader(a)); - BufferedReader bbr = new BufferedReader(new FileReader(b)); + BufferedReader abr = new BufferedReader(new InputStreamReader(new FileInputStream(a), UTF_8)); + BufferedReader bbr = new BufferedReader(new InputStreamReader(new FileInputStream(b), UTF_8)); SortedCompositeIterator sci = new SortedCompositeIterator(new Comparator() { - @Override + @Override public int compare(String o1, String o2) { return o1.compareTo(o2); } diff --git a/src/test/resources/forbidden-apis-signatures.txt b/src/test/resources/forbidden-apis-signatures.txt new file mode 100644 index 00000000..1eda9eec --- /dev/null +++ b/src/test/resources/forbidden-apis-signatures.txt @@ -0,0 +1,2 @@ +java.net.URL#equals(java.lang.Object) @ may trigger a DNS lookup to resolve the host part +java.net.URL#hashCode() @ may trigger a DNS lookup to resolve the host part