We're not supposed to have access to the underlying connection object;
- * am only violating contract because see cases where httpclient is skipping
- * out w/o cleaning up after itself.
- *
- * @author stack
- * @version $Revision$, $Date$
- * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0
- */
-@Deprecated
-public class HttpRecorderGetMethod extends GetMethod {
-
- protected static Logger logger =
- Logger.getLogger(HttpRecorderGetMethod.class.getName());
-
- /**
- * Instance of http recorder method.
- */
- protected HttpRecorderMethod httpRecorderMethod = null;
-
-
- public HttpRecorderGetMethod(String uri, Recorder recorder) {
- super(uri);
- this.httpRecorderMethod = new HttpRecorderMethod(recorder);
- }
-
- protected void readResponseBody(HttpState state, HttpConnection connection)
- throws IOException, HttpException {
- // We're about to read the body. Mark transition in http recorder.
- this.httpRecorderMethod.markContentBegin(connection);
- super.readResponseBody(state, connection);
- }
-
- protected boolean shouldCloseConnection(HttpConnection conn) {
- // Always close connection after each request. As best I can tell, this
- // is superfluous -- we've set our client to be HTTP/1.0. Doing this
- // out of paranoia.
- return true;
- }
-
- public int execute(HttpState state, HttpConnection conn)
- throws HttpException, IOException {
- // Save off the connection so we can close it on our way out in case
- // httpclient fails to (We're not supposed to have access to the
- // underlying connection object; am only violating contract because
- // see cases where httpclient is skipping out w/o cleaning up
- // after itself).
- this.httpRecorderMethod.setConnection(conn);
- return super.execute(state, conn);
- }
-
- protected void addProxyConnectionHeader(HttpState state, HttpConnection conn)
- throws IOException, HttpException {
- super.addProxyConnectionHeader(state, conn);
- this.httpRecorderMethod.handleAddProxyConnectionHeader(this);
- }
-
- // XXX see https://webarchive.jira.com/browse/HER-2059
- // We never call this method with the implied question mark prepended, so
- // adding it does the trick, since commons-httpclient will strip it later.
- public void setQueryString(String queryString) {
- if (queryString != null) {
- super.setQueryString('?' + queryString);
- } else {
- super.setQueryString(queryString);
- }
- }
-
-}
diff --git a/src/main/java/org/archive/httpclient/HttpRecorderMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderMethod.java
deleted file mode 100644
index b08bc0bd..00000000
--- a/src/main/java/org/archive/httpclient/HttpRecorderMethod.java
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * This file is part of the Heritrix web crawler (crawler.archive.org).
- *
- * Licensed to the Internet Archive (IA) by one or more individual
- * contributors.
- *
- * The IA licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.archive.httpclient;
-
-import java.util.logging.Logger;
-
-import org.apache.commons.httpclient.Header;
-import org.apache.commons.httpclient.HttpConnection;
-import org.apache.commons.httpclient.HttpMethod;
-import org.archive.util.Recorder;
-
-
-/**
- * This class encapsulates the specializations supplied by the
- * overrides {@link HttpRecorderGetMethod} and {@link HttpRecorderPostMethod}.
- *
- * It keeps instance of HttpRecorder and HttpConnection.
- *
- * @author stack
- * @version $Revision$, $Date$
- * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0
- */
-@Deprecated
-public class HttpRecorderMethod {
- protected static Logger logger =
- Logger.getLogger(HttpRecorderMethod.class.getName());
-
- /**
- * Instance of http recorder we're using recording this http get.
- */
- private Recorder httpRecorder = null;
-
- /**
- * Save around so can force close.
- *
- * See [ 922080 ] IllegalArgumentException (size is wrong).
- * https://sourceforge.net/tracker/?func=detail&aid=922080&group_id=73833&atid=539099
- */
- private HttpConnection connection = null;
-
-
- public HttpRecorderMethod(Recorder recorder) {
- this.httpRecorder = recorder;
- }
-
- public void markContentBegin(HttpConnection c) {
- if (c != this.connection) {
- // We're checking that we're not being asked to work on
- // a connection that is other than the one we started
- // this method#execute with.
- throw new IllegalArgumentException("Connections differ: " +
- this.connection + " " + c + " " +
- Thread.currentThread().getName());
- }
- this.httpRecorder.markContentBegin();
- }
-
- /**
- * @return Returns the connection.
- */
- public HttpConnection getConnection() {
- return this.connection;
- }
-
- /**
- * @param connection The connection to set.
- */
- public void setConnection(HttpConnection connection) {
- this.connection = connection;
- }
- /**
- * @return Returns the httpRecorder.
- */
- public Recorder getHttpRecorder() {
- return httpRecorder;
- }
-
- /**
- * If a 'Proxy-Connection' header has been added to the request,
- * it'll be of a 'keep-alive' type. Until we support 'keep-alives',
- * override the Proxy-Connection setting and instead pass a 'close'
- * (Otherwise every request has to timeout before we notice
- * end-of-document).
- * @param method Method to find proxy-connection header in.
- */
- public void handleAddProxyConnectionHeader(HttpMethod method) {
- Header h = method.getRequestHeader("Proxy-Connection");
- if (h != null) {
- h.setValue("close");
- method.setRequestHeader(h);
- }
- }
-}
diff --git a/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java
deleted file mode 100644
index d55d816a..00000000
--- a/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * This file is part of the Heritrix web crawler (crawler.archive.org).
- *
- * Licensed to the Internet Archive (IA) by one or more individual
- * contributors.
- *
- * The IA licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.archive.httpclient;
-
-import java.io.IOException;
-
-import org.apache.commons.httpclient.HttpConnection;
-import org.apache.commons.httpclient.HttpException;
-import org.apache.commons.httpclient.HttpState;
-import org.apache.commons.httpclient.methods.PostMethod;
-import org.archive.util.Recorder;
-
-
-/**
- * Override of PostMethod that marks the passed HttpRecorder w/ the transition
- * from HTTP head to body and that forces a close on the responseConnection.
- *
- * This is a copy of {@link HttpRecorderGetMethod}. Only difference is the
- * parent subclass.
- *
- * @author stack
- * @version $Date$ $Revision$
- * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0
- */
-@Deprecated
-public class HttpRecorderPostMethod extends PostMethod {
- /**
- * Instance of http recorder method.
- */
- protected HttpRecorderMethod httpRecorderMethod = null;
-
-
- public HttpRecorderPostMethod(String uri, Recorder recorder) {
- super(uri);
- this.httpRecorderMethod = new HttpRecorderMethod(recorder);
- }
-
- protected void readResponseBody(HttpState state, HttpConnection connection)
- throws IOException, HttpException {
- // We're about to read the body. Mark transition in http recorder.
- this.httpRecorderMethod.markContentBegin(connection);
- super.readResponseBody(state, connection);
- }
-
- protected boolean shouldCloseConnection(HttpConnection conn) {
- // Always close connection after each request. As best I can tell, this
- // is superfluous -- we've set our client to be HTTP/1.0. Doing this
- // out of paranoia.
- return true;
- }
-
- public int execute(HttpState state, HttpConnection conn)
- throws HttpException, IOException {
- // Save off the connection so we can close it on our way out in case
- // httpclient fails to (We're not supposed to have access to the
- // underlying connection object; am only violating contract because
- // see cases where httpclient is skipping out w/o cleaning up
- // after itself).
- this.httpRecorderMethod.setConnection(conn);
- return super.execute(state, conn);
- }
-
- protected void addProxyConnectionHeader(HttpState state, HttpConnection conn)
- throws IOException, HttpException {
- super.addProxyConnectionHeader(state, conn);
- this.httpRecorderMethod.handleAddProxyConnectionHeader(this);
- }
-}
diff --git a/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java b/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java
deleted file mode 100644
index d6cf27ab..00000000
--- a/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * This file is part of the Heritrix web crawler (crawler.archive.org).
- *
- * Licensed to the Internet Archive (IA) by one or more individual
- * contributors.
- *
- * The IA licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.archive.httpclient;
-
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.commons.httpclient.HostConfiguration;
-import org.apache.commons.httpclient.HttpConnection;
-import org.apache.commons.httpclient.SimpleHttpConnectionManager;
-
-/**
- * An HttpClient-compatible HttpConnection "manager" that actually
- * just gives out a new connection each time -- skipping the overhead
- * of connection management, since we already throttle our crawler
- * with external mechanisms.
- *
- * @author gojomo
- * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0
- */
-@Deprecated
-public class SingleHttpConnectionManager extends SimpleHttpConnectionManager {
-
- public SingleHttpConnectionManager() {
- super();
- }
-
- public HttpConnection getConnectionWithTimeout(
- HostConfiguration hostConfiguration, long timeout) {
-
- HttpConnection conn = new HttpConnection(hostConfiguration);
- conn.setHttpConnectionManager(this);
- conn.getParams().setDefaults(this.getParams());
- return conn;
- }
-
- public void releaseConnection(HttpConnection conn) {
- // ensure connection is closed
- conn.close();
- finishLast(conn);
- }
-
- protected static void finishLast(HttpConnection conn) {
- // copied from superclass because it wasn't made available to subclasses
- InputStream lastResponse = conn.getLastResponseInputStream();
- if (lastResponse != null) {
- conn.setLastResponseInputStream(null);
- try {
- lastResponse.close();
- } catch (IOException ioe) {
- //FIXME: badness - close to force reconnect.
- conn.close();
- }
- }
- }
-}
diff --git a/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java b/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java
deleted file mode 100644
index 16821b36..00000000
--- a/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java
+++ /dev/null
@@ -1,293 +0,0 @@
-/**
- * ====================================================================
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- * ====================================================================
- *
- */
-package org.archive.httpclient;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-
-import org.apache.commons.httpclient.HostConfiguration;
-import org.apache.commons.httpclient.HttpConnection;
-import org.apache.commons.httpclient.HttpConnectionManager;
-import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
-
-/**
- * A simple, but thread-safe HttpClient {@link HttpConnectionManager}.
- * Based on {@link org.apache.commons.httpclient.SimpleHttpConnectionManager}.
- *
- * Java >= 1.4 is recommended.
- *
- * @author Christian Kohlschuetter
- * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0
- */
-@Deprecated
-public final class ThreadLocalHttpConnectionManager implements
- HttpConnectionManager {
-
- private static final CloserThread closer = new CloserThread();
- private static final Logger logger = Logger
- .getLogger(ThreadLocalHttpConnectionManager.class.getName());
-
- private final ThreadLocal tl = new ThreadLocal() {
- protected synchronized ConnectionInfo initialValue() {
- return new ConnectionInfo();
- }
- };
-
- private ConnectionInfo getConnectionInfo() {
- return (ConnectionInfo) tl.get();
- }
-
- private static final class ConnectionInfo {
- /** The http connection */
- private HttpConnection conn = null;
-
- /**
- * The time the connection was made idle.
- */
- private long idleStartTime = Long.MAX_VALUE;
- }
-
- public ThreadLocalHttpConnectionManager() {
- }
-
- /**
- * Since the same connection is about to be reused, make sure the
- * previous request was completely processed, and if not
- * consume it now.
- * @param conn The connection
- * @return true, if the connection is reusable
- */
- private static boolean finishLastResponse(final HttpConnection conn) {
- InputStream lastResponse = conn.getLastResponseInputStream();
- if(lastResponse != null) {
- conn.setLastResponseInputStream(null);
- try {
- lastResponse.close();
- return true;
- } catch (IOException ioe) {
- // force reconnect.
- return false;
- }
- } else {
- return false;
- }
- }
-
- /**
- * Collection of parameters associated with this connection manager.
- */
- private HttpConnectionManagerParams params = new HttpConnectionManagerParams();
-
- /**
- * @see HttpConnectionManager#getConnection(HostConfiguration)
- */
- public HttpConnection getConnection(
- final HostConfiguration hostConfiguration) {
- return getConnection(hostConfiguration, 0);
- }
-
- /**
- * Gets the staleCheckingEnabled value to be set on HttpConnections that are created.
- *
- * @return true if stale checking will be enabled on HttpConections
- *
- * @see HttpConnection#isStaleCheckingEnabled()
- *
- * @deprecated Use {@link HttpConnectionManagerParams#isStaleCheckingEnabled()},
- * {@link HttpConnectionManager#getParams()}.
- */
- public boolean isConnectionStaleCheckingEnabled() {
- return this.params.isStaleCheckingEnabled();
- }
-
- /**
- * Sets the staleCheckingEnabled value to be set on HttpConnections that are created.
- *
- * @param connectionStaleCheckingEnabled true if stale checking will be enabled
- * on HttpConections
- *
- * @see HttpConnection#setStaleCheckingEnabled(boolean)
- *
- * @deprecated Use {@link HttpConnectionManagerParams#setStaleCheckingEnabled(boolean)},
- * {@link HttpConnectionManager#getParams()}.
- */
- public void setConnectionStaleCheckingEnabled(
- final boolean connectionStaleCheckingEnabled) {
- this.params.setStaleCheckingEnabled(connectionStaleCheckingEnabled);
- }
-
- /**
- * @see HttpConnectionManager#getConnectionWithTimeout(HostConfiguration, long)
- *
- * @since 3.0
- */
- public HttpConnection getConnectionWithTimeout(
- final HostConfiguration hostConfiguration, final long timeout) {
-
- final ConnectionInfo ci = getConnectionInfo();
- HttpConnection httpConnection = ci.conn;
-
- // make sure the host and proxy are correct for this connection
- // close it and set the values if they are not
- if(httpConnection == null || !finishLastResponse(httpConnection)
- || !hostConfiguration.hostEquals(httpConnection)
- || !hostConfiguration.proxyEquals(httpConnection)) {
-
- if(httpConnection != null && httpConnection.isOpen()) {
- closer.closeConnection(httpConnection);
- }
-
- httpConnection = new HttpConnection(hostConfiguration);
- httpConnection.setHttpConnectionManager(this);
- httpConnection.getParams().setDefaults(this.params);
- ci.conn = httpConnection;
-
- httpConnection.setHost(hostConfiguration.getHost());
- httpConnection.setPort(hostConfiguration.getPort());
- httpConnection.setProtocol(hostConfiguration.getProtocol());
- httpConnection.setLocalAddress(hostConfiguration.getLocalAddress());
-
- httpConnection.setProxyHost(hostConfiguration.getProxyHost());
- httpConnection.setProxyPort(hostConfiguration.getProxyPort());
- }
-
- // remove the connection from the timeout handler
- ci.idleStartTime = Long.MAX_VALUE;
-
- return httpConnection;
- }
-
- /**
- * @see HttpConnectionManager#getConnection(HostConfiguration, long)
- *
- * @deprecated Use #getConnectionWithTimeout(HostConfiguration, long)
- */
- public HttpConnection getConnection(
- final HostConfiguration hostConfiguration, final long timeout) {
- return getConnectionWithTimeout(hostConfiguration, timeout);
- }
-
- /**
- * @see HttpConnectionManager#releaseConnection(org.apache.commons.httpclient.HttpConnection)
- */
- public void releaseConnection(final HttpConnection conn) {
- final ConnectionInfo ci = getConnectionInfo();
- HttpConnection httpConnection = ci.conn;
-
- if(conn != httpConnection) {
- throw new IllegalStateException(
- "Unexpected release of an unknown connection.");
- }
-
- finishLastResponse(httpConnection);
-
- // track the time the connection was made idle
- ci.idleStartTime = System.currentTimeMillis();
- }
-
- /**
- * Returns {@link HttpConnectionManagerParams parameters} associated
- * with this connection manager.
- *
- * @since 2.1
- *
- * @see HttpConnectionManagerParams
- */
- public HttpConnectionManagerParams getParams() {
- return this.params;
- }
-
- /**
- * Assigns {@link HttpConnectionManagerParams parameters} for this
- * connection manager.
- *
- * @since 2.1
- *
- * @see HttpConnectionManagerParams
- */
- public void setParams(final HttpConnectionManagerParams p) {
- if(p == null) {
- throw new IllegalArgumentException("Parameters may not be null");
- }
- this.params = p;
- }
-
- /**
- * @since 3.0
- */
- public void closeIdleConnections(final long idleTimeout) {
- long maxIdleTime = System.currentTimeMillis() - idleTimeout;
-
- final ConnectionInfo ci = getConnectionInfo();
-
- if(ci.idleStartTime <= maxIdleTime) {
- ci.conn.close();
- }
- }
-
- private static final class CloserThread extends Thread {
- private List connections
- = new ArrayList();
-
- private static final int SLEEP_INTERVAL = 5000;
-
- public CloserThread() {
- super("HttpConnection closer");
- // Make this a daemon thread so it can't be responsible for the JVM
- // not shutting down.
- setDaemon(true);
- start();
- }
-
- public void closeConnection(final HttpConnection conn) {
- synchronized (connections) {
- connections.add(conn);
- }
- }
-
- public void run() {
- try {
- while (!Thread.interrupted()) {
- Thread.sleep(SLEEP_INTERVAL);
-
- List s;
- synchronized (connections) {
- s = connections;
- connections = new ArrayList();
- }
- logger.log(Level.INFO, "Closing " + s.size()
- + " HttpConnections");
- for(final Iterator it = s.iterator();
- it.hasNext();) {
- HttpConnection conn = it.next();
- conn.close();
- conn.setHttpConnectionManager(null);
- it.remove();
- }
- }
- } catch (InterruptedException e) {
- return;
- }
- }
- }
-}
diff --git a/src/main/java/org/archive/io/ArchiveFileConstants.java b/src/main/java/org/archive/io/ArchiveFileConstants.java
deleted file mode 100644
index b1a39194..00000000
--- a/src/main/java/org/archive/io/ArchiveFileConstants.java
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * This file is part of the Heritrix web crawler (crawler.archive.org).
- *
- * Licensed to the Internet Archive (IA) by one or more individual
- * contributors.
- *
- * The IA licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.archive.io;
-
-@Deprecated
-public interface ArchiveFileConstants extends org.archive.format.ArchiveFileConstants {
-}
diff --git a/src/main/java/org/archive/io/ArchiveReader.java b/src/main/java/org/archive/io/ArchiveReader.java
index 0038cccf..070455a5 100644
--- a/src/main/java/org/archive/io/ArchiveReader.java
+++ b/src/main/java/org/archive/io/ArchiveReader.java
@@ -26,12 +26,14 @@
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
-import java.io.FileWriter;
+import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
+import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
@@ -42,13 +44,17 @@
import com.google.common.io.CountingInputStream;
+import static org.archive.format.ArchiveFileConstants.*;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
/**
* Reader for an Archive file of Archive {@link ArchiveRecord}s.
* @author stack
* @version $Date$ $Version$
*/
-public abstract class ArchiveReader implements ArchiveFileConstants, Iterable, Closeable {
+public abstract class ArchiveReader implements Iterable, Closeable {
/**
* Is this Archive file compressed?
*/
@@ -601,8 +607,7 @@ public String getStrippedFileName() {
*/
public static String getStrippedFileName(String name,
final String dotFileExtension) {
- name = stripExtension(name,
- ArchiveFileConstants.DOT_COMPRESSED_FILE_EXTENSION);
+ name = stripExtension(name, DOT_COMPRESSED_FILE_EXTENSION);
return stripExtension(name, dotFileExtension);
}
@@ -614,7 +619,7 @@ protected static boolean getTrueOrFalse(final String value) {
if (value == null || value.length() <= 0) {
return false;
}
- return Boolean.TRUE.toString().equals(value.toLowerCase());
+ return Boolean.TRUE.toString().equals(value.toLowerCase(Locale.ROOT));
}
/**
@@ -658,7 +663,7 @@ protected void cdxOutput(boolean toFile)
DOT_COMPRESSED_FILE_EXTENSION);
cdxFilename = stripExtension(cdxFilename, getDotFileExtension());
cdxFilename += ('.' + CDX);
- cdxWriter = new BufferedWriter(new FileWriter(cdxFilename));
+ cdxWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(cdxFilename), UTF_8));
}
String header = "CDX b e a m s c " + ((isCompressed()) ? "V" : "v")
@@ -699,7 +704,7 @@ public boolean outputRecord(final String format)
boolean result = true;
if (format.equals(CDX)) {
System.out.println(get().outputCdx(getStrippedFileName()));
- } else if(format.equals(ArchiveFileConstants.DUMP)) {
+ } else if(format.equals(DUMP)) {
// No point digesting if dumping content.
setDigest(false);
get().dump();
@@ -756,4 +761,4 @@ protected static Options getOptions() {
"'or 'nohead'. Default: 'cdx'."));
return options;
}
-}
\ No newline at end of file
+}
diff --git a/src/main/java/org/archive/io/ArchiveReaderFactory.java b/src/main/java/org/archive/io/ArchiveReaderFactory.java
index 17f14d3a..fe72236b 100644
--- a/src/main/java/org/archive/io/ArchiveReaderFactory.java
+++ b/src/main/java/org/archive/io/ArchiveReaderFactory.java
@@ -25,6 +25,7 @@
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
+import java.util.Locale;
import org.archive.io.arc.ARCReaderFactory;
import org.archive.io.warc.WARCReaderFactory;
@@ -33,6 +34,7 @@
import org.archive.url.UsableURI;
import org.archive.util.FileUtils;
+import static org.archive.format.ArchiveFileConstants.*;
/**
* Factory that returns an Archive file Reader.
@@ -40,7 +42,7 @@
* @author stack
* @version $Date$ $Revision$
*/
-public class ArchiveReaderFactory implements ArchiveFileConstants {
+public class ArchiveReaderFactory {
// Static block to enable S3 URLs
static {
if (System.getProperty("java.protocol.handler.pkgs") != null) {
@@ -295,7 +297,7 @@ protected void addUserAgent(final HttpURLConnection connection) {
* @throws IOException
*/
protected boolean isCompressed(final File f) throws IOException {
- return f.getName().toLowerCase().
+ return f.getName().toLowerCase(Locale.ROOT).
endsWith(DOT_COMPRESSED_FILE_EXTENSION);
}
-}
\ No newline at end of file
+}
diff --git a/src/main/java/org/archive/io/ArchiveRecord.java b/src/main/java/org/archive/io/ArchiveRecord.java
index 63bfe628..01e8d5ec 100644
--- a/src/main/java/org/archive/io/ArchiveRecord.java
+++ b/src/main/java/org/archive/io/ArchiveRecord.java
@@ -23,8 +23,10 @@
import java.io.OutputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
+import java.util.Locale;
import java.util.logging.Level;
+import org.archive.format.ArchiveFileConstants;
import org.archive.util.Base32;
/**
@@ -392,7 +394,7 @@ public boolean hasContentHeaders() {
return false;
}
- if (!url.toLowerCase().startsWith("http")) {
+ if (!url.toLowerCase(Locale.ROOT).startsWith("http")) {
return false;
}
diff --git a/src/main/java/org/archive/io/CompositeFileReader.java b/src/main/java/org/archive/io/CompositeFileReader.java
index 14b56219..6e331565 100644
--- a/src/main/java/org/archive/io/CompositeFileReader.java
+++ b/src/main/java/org/archive/io/CompositeFileReader.java
@@ -23,6 +23,8 @@
import java.io.InputStreamReader;
import java.util.List;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
/**
* @author gojomo
@@ -34,7 +36,7 @@ public class CompositeFileReader extends InputStreamReader {
* @throws IOException
*/
public CompositeFileReader(List filenames) throws IOException {
- super(new CompositeFileInputStream(filenames));
+ super(new CompositeFileInputStream(filenames), UTF_8);
}
}
diff --git a/src/main/java/org/archive/io/GZIPMembersInputStream.java b/src/main/java/org/archive/io/GZIPMembersInputStream.java
deleted file mode 100644
index 35fb9e90..00000000
--- a/src/main/java/org/archive/io/GZIPMembersInputStream.java
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * This file is part of the Heritrix web crawler (crawler.archive.org).
- *
- * Licensed to the Internet Archive (IA) by one or more individual
- * contributors.
- *
- * The IA licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.archive.io;
-
-import java.io.IOException;
-import java.io.InputStream;
-
-/**
- * @deprecated use {@link org.archive.util.zip.GZIPMembersInputStream}
- */
-@Deprecated
-public class GZIPMembersInputStream extends org.archive.util.zip.GZIPMembersInputStream {
-
- public GZIPMembersInputStream(InputStream in) throws IOException {
- super(in);
- }
-
- public GZIPMembersInputStream(InputStream in, int size) throws IOException {
- super(in, size);
- }
-
-}
\ No newline at end of file
diff --git a/src/main/java/org/archive/io/GenericReplayCharSequence.java b/src/main/java/org/archive/io/GenericReplayCharSequence.java
index c427550b..ff96717c 100644
--- a/src/main/java/org/archive/io/GenericReplayCharSequence.java
+++ b/src/main/java/org/archive/io/GenericReplayCharSequence.java
@@ -33,14 +33,15 @@
import java.nio.channels.FileChannel;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
import java.text.NumberFormat;
+import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.io.IOUtils;
import org.archive.util.DevUtils;
-import com.google.common.base.Charsets;
import com.google.common.primitives.Ints;
/**
@@ -67,7 +68,7 @@ public class GenericReplayCharSequence implements ReplayCharSequence {
*
*
See Encoding.
*/
- public static final Charset WRITE_ENCODING = Charsets.UTF_16BE;
+ public static final Charset WRITE_ENCODING = StandardCharsets.UTF_16BE;
private static final long MAP_MAX_BYTES = 64 * 1024 * 1024; // 64M
@@ -168,8 +169,8 @@ private void updateMemoryMappedBuffer() {
long charLength = (long) this.length() - (long) prefixBuffer.limit(); // in characters
long mapSize = Math.min((charLength * bytesPerChar) - mapByteOffset, MAP_MAX_BYTES);
logger.fine("updateMemoryMappedBuffer: mapOffset="
- + NumberFormat.getInstance().format(mapByteOffset)
- + " mapSize=" + NumberFormat.getInstance().format(mapSize));
+ + NumberFormat.getInstance(Locale.ROOT).format(mapByteOffset)
+ + " mapSize=" + NumberFormat.getInstance(Locale.ROOT).format(mapSize));
try {
// TODO: stress-test without these possibly-costly requests!
// System.gc();
@@ -255,9 +256,9 @@ protected void decode(InputStream inStream, int prefixMax,
this.length = Ints.saturatedCast(count);
if(count>Integer.MAX_VALUE) {
logger.warning("input stream is longer than Integer.MAX_VALUE="
- + NumberFormat.getInstance().format(Integer.MAX_VALUE)
+ + NumberFormat.getInstance(Locale.ROOT).format(Integer.MAX_VALUE)
+ " characters -- only first "
- + NumberFormat.getInstance().format(Integer.MAX_VALUE)
+ + NumberFormat.getInstance(Locale.ROOT).format(Integer.MAX_VALUE)
+ " are accessible through this GenericReplayCharSequence");
}
diff --git a/src/main/java/org/archive/io/GzipHeader.java b/src/main/java/org/archive/io/GzipHeader.java
deleted file mode 100644
index 6b8263bc..00000000
--- a/src/main/java/org/archive/io/GzipHeader.java
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * This file is part of the Heritrix web crawler (crawler.archive.org).
- *
- * Licensed to the Internet Archive (IA) by one or more individual
- * contributors.
- *
- * The IA licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.archive.io;
-
-/**
- * @deprecated use {@link org.archive.util.zip.GzipHeader}
- */
-@Deprecated
-public class GzipHeader extends org.archive.util.zip.GzipHeader {
-}
diff --git a/src/main/java/org/archive/io/HeaderedArchiveRecord.java b/src/main/java/org/archive/io/HeaderedArchiveRecord.java
index 3cce595b..858edb4d 100644
--- a/src/main/java/org/archive/io/HeaderedArchiveRecord.java
+++ b/src/main/java/org/archive/io/HeaderedArchiveRecord.java
@@ -25,12 +25,11 @@
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Locale;
-import org.apache.commons.httpclient.Header;
-import org.apache.commons.httpclient.HttpParser;
-import org.apache.commons.httpclient.StatusLine;
-import org.apache.commons.httpclient.util.EncodingUtil;
-import org.archive.io.arc.ARCConstants;
+import org.archive.format.http.HttpHeader;
+import org.archive.format.arc.ARCConstants;
import org.archive.util.LaxHttpParser;
/**
@@ -59,7 +58,7 @@ public class HeaderedArchiveRecord extends ArchiveRecord {
*
* Only available after the reading of headers.
*/
- private Header [] contentHeaders = null;
+ private HttpHeader[] contentHeaders = null;
public HeaderedArchiveRecord(final ArchiveRecord ar) throws IOException {
@@ -147,27 +146,29 @@ private InputStream readContentHeaders() throws IOException {
int eolCharCount = getEolCharsCount(statusBytes);
if (eolCharCount <= 0) {
throw new IOException("Failed to read raw lie where one " +
- " was expected: " + new String(statusBytes));
+ " was expected: " + new String(statusBytes, ARCConstants.DEFAULT_ENCODING));
}
- String statusLine = EncodingUtil.getString(statusBytes, 0,
+ String statusLine = new String(statusBytes, 0,
statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING);
- if (statusLine == null) {
- throw new NullPointerException("Expected status line is null");
- }
+ statusLine = statusLine.trim();
// TODO: Tighten up this test.
- boolean isHttpResponse = StatusLine.startsWithHTTP(statusLine);
+ boolean isHttpResponse = statusLine.startsWith("HTTP");
boolean isHttpRequest = false;
if (!isHttpResponse) {
- isHttpRequest = statusLine.toUpperCase().startsWith("GET") ||
- !statusLine.toUpperCase().startsWith("POST");
+ isHttpRequest = statusLine.toUpperCase(Locale.ROOT).startsWith("GET") ||
+ !statusLine.toUpperCase(Locale.ROOT).startsWith("POST");
}
if (!isHttpResponse && !isHttpRequest) {
throw new UnexpectedStartLineIOException("Failed parse of " +
"status line: " + statusLine);
}
- this.statusCode = isHttpResponse?
- (new StatusLine(statusLine)).getStatusCode(): -1;
-
+
+ if (isHttpResponse) {
+ this.statusCode = parseStatusCode(statusLine);
+ } else {
+ this.statusCode = -1;
+ }
+
// Save off all bytes read. Keep them as bytes rather than
// convert to strings so we don't have to worry about encodings
// though this should never be a problem doing http headers since
@@ -183,7 +184,7 @@ private InputStream readContentHeaders() throws IOException {
eolCharCount = getEolCharsCount(lineBytes);
if (eolCharCount <= 0) {
throw new IOException("Failed reading headers: " +
- ((lineBytes != null)? new String(lineBytes): null));
+ ((lineBytes != null)? new String(lineBytes, StandardCharsets.ISO_8859_1): null));
}
// Save the bytes read.
baos.write(lineBytes);
@@ -210,7 +211,42 @@ private InputStream readContentHeaders() throws IOException {
bais.reset();
return bais;
}
-
+
+    /**
+     * Extracts the numeric status code from an HTTP status line such as
+     * {@code "HTTP/1.1 200 OK"}.
+     *
+     * <p>The code is the token that follows the first run of spaces, so more
+     * than one space between the protocol version and the code is tolerated.
+     *
+     * @param statusLine status line without its line terminator; may be null
+     * @return the status code, or -1 if the line is null, has no token after
+     *         the first run of spaces, or that token is not a valid integer
+     */
+    public static int parseStatusCode(String statusLine) {
+        if (statusLine == null) {
+            return -1;
+        }
+        int start = statusLine.indexOf(' ');
+        if (start < 0) {
+            return -1;
+        }
+        // Skip the whole run of spaces, not just the first one.
+        while (start < statusLine.length() && statusLine.charAt(start) == ' ') {
+            start++;
+        }
+        int end = statusLine.indexOf(' ', start);
+        if (end < 0) {
+            end = statusLine.length();
+        }
+        try {
+            return Integer.parseInt(statusLine.substring(start, end));
+        } catch (NumberFormatException e) {
+            // Missing or non-numeric code: report "no status" to the caller.
+            return -1;
+        }
+    }
+
public static class UnexpectedStartLineIOException
extends RecoverableIOException {
private static final long serialVersionUID = 1L;
@@ -252,7 +265,7 @@ public int getContentHeadersLength() {
return this.contentHeadersLength;
}
- public Header[] getContentHeaders() {
+ public HttpHeader[] getContentHeaders() {
return contentHeaders;
}
diff --git a/src/main/java/org/archive/io/NoGzipMagicException.java b/src/main/java/org/archive/io/NoGzipMagicException.java
deleted file mode 100644
index 27d1058a..00000000
--- a/src/main/java/org/archive/io/NoGzipMagicException.java
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * This file is part of the Heritrix web crawler (crawler.archive.org).
- *
- * Licensed to the Internet Archive (IA) by one or more individual
- * contributors.
- *
- * The IA licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.archive.io;
-
-/**
- * @deprecated use {@link org.archive.util.zip.NoGzipMagicException}
- */
-@Deprecated
-public class NoGzipMagicException extends org.archive.util.zip.NoGzipMagicException {
-}
diff --git a/src/main/java/org/archive/io/RecordingInputStream.java b/src/main/java/org/archive/io/RecordingInputStream.java
index 95419280..3c9db61f 100644
--- a/src/main/java/org/archive/io/RecordingInputStream.java
+++ b/src/main/java/org/archive/io/RecordingInputStream.java
@@ -383,12 +383,12 @@ public synchronized void mark(int readlimit) {
@Override
public boolean markSupported() {
- return this.in.markSupported();
+ return in != null && this.in.markSupported();
}
@Override
public synchronized void reset() throws IOException {
- this.in.reset();
+ if (in != null) this.in.reset();
this.recordingOutputStream.reset();
}
@@ -418,4 +418,13 @@ public void chopAtMessageBodyBegin() {
public void clearForReuse() throws IOException {
recordingOutputStream.clearForReuse();
}
+
+    /**
+     * Exposes the recording stream so callers can write input data into it directly. Intended for input that
+     * does not arrive as an InputStream, e.g. data handed over chunk by chunk through a periodic callback.
+     * @return the RecordingOutputStream held by this instance
+     */
+    public RecordingOutputStream asOutputStream() {
+        return recordingOutputStream;
+    }
}
diff --git a/src/main/java/org/archive/io/ReplayCharSequence.java b/src/main/java/org/archive/io/ReplayCharSequence.java
index e456e293..bd74f2f8 100644
--- a/src/main/java/org/archive/io/ReplayCharSequence.java
+++ b/src/main/java/org/archive/io/ReplayCharSequence.java
@@ -23,8 +23,7 @@
import java.io.IOException;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
-
-import com.google.common.base.Charsets;
+import java.nio.charset.StandardCharsets;
/**
@@ -40,7 +39,7 @@ public interface ReplayCharSequence extends CharSequence, Closeable {
/** charset to use in replay when declared value
* is absent/illegal/unavailable */
- public Charset FALLBACK_CHARSET = Charsets.ISO_8859_1; // TODO: should this be UTF-8?
+ public Charset FALLBACK_CHARSET = StandardCharsets.ISO_8859_1; // TODO: should this be UTF-8?
/**
* Call this method when done so implementation has chance to clean up
diff --git a/src/main/java/org/archive/io/UTF8Bytes.java b/src/main/java/org/archive/io/UTF8Bytes.java
index c280b08d..4dc0144b 100644
--- a/src/main/java/org/archive/io/UTF8Bytes.java
+++ b/src/main/java/org/archive/io/UTF8Bytes.java
@@ -19,6 +19,7 @@
package org.archive.io;
import java.io.UnsupportedEncodingException;
+import java.nio.charset.StandardCharsets;
/**
* Marker Interface for instances that can be serialized as UTF8 bytes.
@@ -27,7 +28,7 @@
* @version $Date$ $Version$
*/
public interface UTF8Bytes {
- public static final String UTF8 = "UTF-8";
+ public static final String UTF8 = StandardCharsets.UTF_8.name();
/**
* @return Instance as UTF-8 bytes.
diff --git a/src/main/java/org/archive/io/WriterPool.java b/src/main/java/org/archive/io/WriterPool.java
index db184c5f..79da16c0 100644
--- a/src/main/java/org/archive/io/WriterPool.java
+++ b/src/main/java/org/archive/io/WriterPool.java
@@ -30,6 +30,7 @@
import java.util.logging.Level;
import java.util.logging.Logger;
+import org.archive.format.ArchiveFileConstants;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
@@ -215,7 +216,7 @@ public synchronized void invalidateFile(WriterPoolMember f)
// gets attention.
File file = f.getFile();
file.renameTo(new File(file.getAbsoluteFile() +
- WriterPoolMember.INVALID_SUFFIX));
+ ArchiveFileConstants.INVALID_SUFFIX));
}
/**
diff --git a/src/main/java/org/archive/io/WriterPoolMember.java b/src/main/java/org/archive/io/WriterPoolMember.java
index e10d443b..5d350534 100644
--- a/src/main/java/org/archive/io/WriterPoolMember.java
+++ b/src/main/java/org/archive/io/WriterPoolMember.java
@@ -25,10 +25,13 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
+import java.nio.charset.StandardCharsets;
import java.text.DecimalFormat;
+import java.text.DecimalFormatSymbols;
import java.text.NumberFormat;
import java.util.Iterator;
import java.util.List;
+import java.util.Locale;
import java.util.Properties;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Logger;
@@ -38,6 +41,7 @@
import org.archive.util.FileUtils;
import org.archive.util.PropertyUtils;
+import static org.archive.format.ArchiveFileConstants.*;
/**
@@ -48,10 +52,10 @@
* @author stack
* @version $Date$ $Revision$
*/
-public abstract class WriterPoolMember implements ArchiveFileConstants {
+public abstract class WriterPoolMember {
private final Logger logger = Logger.getLogger(this.getClass().getName());
- public static final String UTF8 = "UTF-8";
+ public static final String UTF8 = StandardCharsets.UTF_8.name();
/**
* Default archival-aggregate filename template.
@@ -102,12 +106,17 @@ public abstract class WriterPoolMember implements ArchiveFileConstants {
*/
protected static int roundRobinIndex = 0;
+ /**
+ * Symbol set for serial number formatter.
+ */
+ protected static DecimalFormatSymbols serialNoFormatterSymbols = new DecimalFormatSymbols(Locale.ROOT);
+
/**
* NumberFormat instance for formatting serial number.
*
* Pads serial number with zeros.
*/
- protected static NumberFormat serialNoFormatter = new DecimalFormat("00000");
+ protected static NumberFormat serialNoFormatter = new DecimalFormat("00000", serialNoFormatterSymbols);
/**
diff --git a/src/main/java/org/archive/io/arc/ARC2WCDX.java b/src/main/java/org/archive/io/arc/ARC2WCDX.java
index 19010131..aec571e9 100644
--- a/src/main/java/org/archive/io/arc/ARC2WCDX.java
+++ b/src/main/java/org/archive/io/arc/ARC2WCDX.java
@@ -22,18 +22,18 @@
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
-import java.util.Date;
-import java.util.Iterator;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.*;
import java.util.zip.GZIPOutputStream;
-import org.apache.commons.httpclient.Header;
-import org.apache.commons.httpclient.HeaderGroup;
-import org.apache.commons.httpclient.util.DateParseException;
-import org.apache.commons.httpclient.util.DateUtil;
+import org.archive.format.http.HttpHeader;
import org.archive.io.ArchiveRecord;
import org.archive.util.ArchiveUtils;
import org.archive.util.SURT;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
/**
* Create a 'Wide' CDX from an ARC. Takes one argument, the path to the ARC.
* Writes .wcdx.gz in same directory.
@@ -63,7 +63,7 @@ public static Object[] createWcdx(ARCReader reader) {
PrintStream writer = null;
long count = 0;
try {
- writer = new PrintStream(new GZIPOutputStream(new FileOutputStream(wcdxFile)));
+ writer = new PrintStream(new GZIPOutputStream(new FileOutputStream(wcdxFile)), false, UTF_8.name());
// write header: legend + timestamp
StringBuilder legend = new StringBuilder();
@@ -95,12 +95,15 @@ public static Object[] createWcdx(ARCReader reader) {
ARCRecord record = (ARCRecord) iter.next();
record.close();
ARCRecordMetaData h = (ARCRecordMetaData) record.getHeader();
- Header[] httpHeaders = record.getHttpHeaders();
+ HttpHeader[] httpHeaders = record.getHttpHeaders();
if(httpHeaders==null) {
- httpHeaders = new Header[0];
+ httpHeaders = new HttpHeader[0];
+ }
+ Map headerMap = new HashMap<>();
+ for (HttpHeader header : httpHeaders) {
+ headerMap.putIfAbsent(header.getName().toLowerCase(Locale.ROOT), header);
}
- HeaderGroup hg = new HeaderGroup();
- hg.setHeaders(httpHeaders);
+
StringBuilder builder = new StringBuilder();
// SURT-form URI
@@ -108,7 +111,7 @@ public static Object[] createWcdx(ARCReader reader) {
// record timestamp ('b')
appendField(builder,h.getDate());
// http header date
- appendTimeField(builder,hg.getFirstHeader("Date"));
+ appendTimeField(builder, headerMap.get("date"));
// response code ('s')
appendField(builder,h.getStatusCode());
// media type ('m')
@@ -131,17 +134,17 @@ public static Object[] createWcdx(ARCReader reader) {
// uncompressed (declared in ARC headerline) record length
appendField(builder,h.getLength());
// http header content-length
- appendField(builder,hg.getFirstHeader("Content-Length"));
+ appendField(builder, headerMap.get("content-length"));
// http header mod-date
- appendTimeField(builder,hg.getFirstHeader("Last-Modified"));
+ appendTimeField(builder, headerMap.get("last-modified"));
// http header expires
- appendTimeField(builder,hg.getFirstHeader("Expires"));
+ appendTimeField(builder, headerMap.get("expires"));
// http header etag
- appendField(builder,hg.getFirstHeader("ETag"));
+ appendField(builder, headerMap.get("etag"));
// http header redirect ('Location' header?)
- appendField(builder,hg.getFirstHeader("Location"));
+ appendField(builder, headerMap.get("location"));
// ip ('e')
appendField(builder,h.getIp());
// original URI
@@ -186,8 +189,8 @@ protected static void appendField(StringBuilder builder, Object obj) {
// prepend with delimiter
builder.append(' ');
}
- if(obj instanceof Header) {
- obj = ((Header)obj).getValue().trim();
+ if(obj instanceof HttpHeader) {
+ obj = ((HttpHeader)obj).getValue().trim();
}
builder.append((obj==null||obj.toString().length()==0)?"-":obj);
@@ -202,16 +205,16 @@ protected static void appendTimeField(StringBuilder builder, Object obj) {
builder.append("-");
return;
}
- if(obj instanceof Header) {
- String s = ((Header)obj).getValue().trim();
+ if(obj instanceof HttpHeader) {
+ String s = ((HttpHeader)obj).getValue().trim();
try {
- Date date = DateUtil.parseDate(s);
+ Date date = parseDate(s);
String d = ArchiveUtils.get14DigitDate(date);
if(d.startsWith("209")) {
d = "199"+d.substring(3);
}
obj = d;
- } catch (DateParseException e) {
+ } catch (ParseException e) {
builder.append('e');
return;
}
@@ -219,6 +222,36 @@ protected static void appendTimeField(StringBuilder builder, Object obj) {
}
builder.append(obj);
}
+
+    /**
+     * Parses an HTTP date header value.
+     *
+     * <p>Three layouts are tried in order: {@code "EEE, dd MMM yyyy HH:mm:ss zzz"},
+     * {@code "EEEE, dd-MMM-yy HH:mm:ss zzz"} and {@code "EEE MMM d HH:mm:ss yyyy"}.
+     * Two-digit years are resolved into the century starting 2000-01-01 GMT.
+     *
+     * @param s header value to parse
+     * @return the parsed date
+     * @throws ParseException if the value matches none of the three layouts
+     */
+    private static Date parseDate(String s) throws ParseException {
+        SimpleDateFormat format = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss zzz", Locale.US);
+        format.setTimeZone(TimeZone.getTimeZone("GMT"));
+        // Date(long) takes milliseconds; 946684800 is the epoch-seconds value
+        // for 2000-01-01T00:00:00Z, so it needs the extra three zeros.
+        format.set2DigitYearStart(new Date(946684800000L)); // year 2000
+        try {
+            return format.parse(s);
+        } catch (ParseException e) {
+            try {
+                format.applyPattern("EEEE, dd-MMM-yy HH:mm:ss zzz");
+                return format.parse(s);
+            } catch (ParseException e1) {
+                format.applyPattern("EEE MMM d HH:mm:ss yyyy");
+                return format.parse(s);
+            }
+        }
+    }
}
//'wide' CDX
diff --git a/src/main/java/org/archive/io/arc/ARCConstants.java b/src/main/java/org/archive/io/arc/ARCConstants.java
deleted file mode 100644
index c44cfef7..00000000
--- a/src/main/java/org/archive/io/arc/ARCConstants.java
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * This file is part of the Heritrix web crawler (crawler.archive.org).
- *
- * Licensed to the Internet Archive (IA) by one or more individual
- * contributors.
- *
- * The IA licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.archive.io.arc;
-
-
-/**
- * Constants used by ARC files and in ARC file processing.
- *
- * @author stack
- * @deprecated
- */
-public interface ARCConstants extends org.archive.format.arc.ARCConstants {
-}
diff --git a/src/main/java/org/archive/io/arc/ARCReader.java b/src/main/java/org/archive/io/arc/ARCReader.java
index 7f85cc2a..f8935e79 100644
--- a/src/main/java/org/archive/io/arc/ARCReader.java
+++ b/src/main/java/org/archive/io/arc/ARCReader.java
@@ -27,6 +27,7 @@
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
+import java.util.Locale;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Logger;
@@ -43,6 +44,7 @@
import org.archive.io.WriterPoolMember;
import org.archive.util.ArchiveUtils;
+import static org.archive.format.arc.ARCConstants.*;
/**
* Get an iterator on an ARC file or get a record by absolute position.
@@ -66,7 +68,7 @@
* @version $Date$ $Revision$
*/
public abstract class ARCReader extends ArchiveReader
-implements ARCConstants, Closeable {
+implements Closeable {
private final Logger logger = Logger.getLogger(ARCReader.class.getName());
/**
@@ -446,7 +448,6 @@ public static void createCDXIndexFile(String urlOrPath)
* @throws IOException
* @throws java.text.ParseException
*/
- @SuppressWarnings("unchecked")
public static void main(String [] args)
throws ParseException, IOException, java.text.ParseException {
Options options = getOptions();
@@ -492,7 +493,7 @@ public static void main(String [] args)
break;
case 'f':
- format = cmdlineOptions[i].getValue().toLowerCase();
+ format = cmdlineOptions[i].getValue().toLowerCase(Locale.ROOT);
boolean match = false;
// List of supported formats.
final String [] supportedFormats =
diff --git a/src/main/java/org/archive/io/arc/ARCReaderFactory.java b/src/main/java/org/archive/io/arc/ARCReaderFactory.java
index 44437ed7..bbcc8b6f 100644
--- a/src/main/java/org/archive/io/arc/ARCReaderFactory.java
+++ b/src/main/java/org/archive/io/arc/ARCReaderFactory.java
@@ -27,6 +27,7 @@
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;
+import java.util.Locale;
import java.util.logging.Level;
import org.archive.io.ArchiveReader;
@@ -40,6 +41,7 @@
import com.google.common.io.CountingInputStream;
+import static org.archive.format.arc.ARCConstants.*;
/**
* Factory that returns an ARCReader.
@@ -48,8 +50,7 @@
*
* @author stack
*/
-public class ARCReaderFactory extends ArchiveReaderFactory
-implements ARCConstants {
+public class ARCReaderFactory extends ArchiveReaderFactory {
/**
* This factory instance.
*/
@@ -230,7 +231,7 @@ public static boolean testCompressedARCFile(File arcFile,
throws IOException {
boolean compressedARCFile = false;
FileUtils.assertReadable(arcFile);
- if(!skipSuffixCheck && !arcFile.getName().toLowerCase()
+ if(!skipSuffixCheck && !arcFile.getName().toLowerCase(Locale.ROOT)
.endsWith(COMPRESSED_ARC_FILE_EXTENSION)) {
return compressedARCFile;
}
@@ -247,9 +248,9 @@ public static boolean testCompressedARCFile(File arcFile,
public static boolean isARCSuffix(final String arcName) {
return (arcName == null)?
false:
- (arcName.toLowerCase().endsWith(DOT_COMPRESSED_ARC_FILE_EXTENSION))?
+ (arcName.toLowerCase(Locale.ROOT).endsWith(DOT_COMPRESSED_ARC_FILE_EXTENSION))?
true:
- (arcName.toLowerCase().endsWith(DOT_ARC_FILE_EXTENSION))?
+ (arcName.toLowerCase(Locale.ROOT).endsWith(DOT_ARC_FILE_EXTENSION))?
true: false;
}
@@ -452,4 +453,4 @@ protected void gotoEOR(ArchiveRecord rec) throws IOException {
logStdErr(Level.WARNING, message);
}
}
-}
\ No newline at end of file
+}
diff --git a/src/main/java/org/archive/io/arc/ARCRecord.java b/src/main/java/org/archive/io/arc/ARCRecord.java
index bacaca38..c14426a5 100644
--- a/src/main/java/org/archive/io/arc/ARCRecord.java
+++ b/src/main/java/org/archive/io/arc/ARCRecord.java
@@ -27,34 +27,36 @@
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
+import java.util.Locale;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
-import org.apache.commons.httpclient.Header;
-import org.apache.commons.httpclient.StatusLine;
-import org.apache.commons.httpclient.util.EncodingUtil;
-import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.archive.format.http.HttpHeader;
import org.archive.io.ArchiveRecord;
import org.archive.io.ArchiveRecordHeader;
+import org.archive.io.HeaderedArchiveRecord;
import org.archive.io.RecoverableIOException;
import org.archive.util.InetAddressUtil;
import org.archive.util.LaxHttpParser;
import org.archive.util.TextUtils;
+import static org.archive.format.arc.ARCConstants.*;
+
/**
* An ARC file record.
* Does not compass the ARCRecord metadata line, just the record content.
* @author stack
*/
-public class ARCRecord extends ArchiveRecord implements ARCConstants {
+public class ARCRecord extends ArchiveRecord {
/**
- * Http status line object.
+ * Http status code.
*
- * May be null if record is not http.
+ * May be -1 if record is not http.
*/
- private StatusLine httpStatus = null;
+ private int statusCode = -1;
/**
* Http header bytes.
@@ -69,7 +71,7 @@ public class ARCRecord extends ArchiveRecord implements ARCConstants {
*
* Only populated after reading of headers.
*/
- private Header [] httpHeaders = null;
+ private HttpHeader[] httpHeaders = null;
/**
* Array of field names.
@@ -375,7 +377,7 @@ private ARCRecordMetaData computeMetaData(List keys,
if (keys.size() != values.size()) {
// Early ARCs had a space in mimetype.
if (values.size() == (keys.size() + 1) &&
- values.get(4).toLowerCase().startsWith("charset=")) {
+ values.get(4).toLowerCase(Locale.ROOT).startsWith("charset=")) {
List nuvalues =
new ArrayList(keys.size());
nuvalues.add(0, values.get(0));
@@ -587,11 +589,11 @@ private InputStream readHttpHeader() throws IOException {
if (eolCharCount <= 0) {
throw new RecoverableIOException(
"Failed to read http status where one was expected: "
- + ((statusBytes == null) ? "" : new String(statusBytes)));
+ + ((statusBytes == null) ? "" : new String(statusBytes, DEFAULT_ENCODING)));
}
-
- statusLine = EncodingUtil.getString(statusBytes, 0,
- statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING);
+
+ statusLine = new String(statusBytes, 0,
+ statusBytes.length - eolCharCount, DEFAULT_ENCODING);
// If a null or DELETED break immediately
if ((statusLine == null) || statusLine.startsWith("DELETED")) {
@@ -600,7 +602,7 @@ private InputStream readHttpHeader() throws IOException {
// If it's actually the status line, break, otherwise continue skipping any
// previous header values
- if (!statusLine.contains(":") && StatusLine.startsWithHTTP(statusLine)) {
+ if (!statusLine.contains(":") && statusLine.trim().startsWith("HTTP")) {
break;
}
@@ -613,7 +615,7 @@ private InputStream readHttpHeader() throws IOException {
}
if ((statusLine == null) ||
- !StatusLine.startsWithHTTP(statusLine)) {
+ !statusLine.trim().startsWith("HTTP")) {
if (statusLine.startsWith("DELETED")) {
// Some old ARCs have deleted records like following:
// http://vireo.gatech.edu:80/ebt-bin/nph-dweb/dynaweb/SGI_Developer/SGITCL_PG/@Generic__BookTocView/11108%3Btd%3D2 130.207.168.42 19991010131803 text/html 29202
@@ -629,13 +631,12 @@ private InputStream readHttpHeader() throws IOException {
}
}
- try {
- this.httpStatus = new StatusLine(statusLine);
- } catch(IOException e) {
- logger.warning(e.getMessage() + " at offset: " + h.getOffset());
- this.errors.add(ArcRecordErrors.HTTP_STATUS_LINE_EXCEPTION);
+ this.statusCode = HeaderedArchiveRecord.parseStatusCode(statusLine.trim());
+ if (statusCode == -1) {
+ logger.warning("Bad status line at offset: " + h.getOffset());
+ this.errors.add(ArcRecordErrors.HTTP_STATUS_LINE_EXCEPTION);
}
-
+
// Save off all bytes read. Keep them as bytes rather than
// convert to strings so we don't have to worry about encodings
// though this should never be a problem doing http headers since
@@ -658,7 +659,7 @@ private InputStream readHttpHeader() throws IOException {
break;
} else {
throw new IOException("Failed reading http headers: " +
- ((lineBytes != null)? new String(lineBytes): null));
+ ((lineBytes != null)? new String(lineBytes, DEFAULT_ENCODING): null));
}
} else {
httpHeaderBytesRead += lineBytes.length;
@@ -683,8 +684,7 @@ private InputStream readHttpHeader() throws IOException {
// Read the status line. Don't let it into the parseHeaders function.
// It doesn't know what to do with it.
bais.read(statusBytes, 0, statusBytes.length);
- this.httpHeaders = LaxHttpParser.parseHeaders(bais,
- ARCConstants.DEFAULT_ENCODING);
+ this.httpHeaders = LaxHttpParser.parseHeaders(bais, DEFAULT_ENCODING);
this.getMetaData().setStatusCode(Integer.toString(getStatusCode()));
bais.reset();
return bais;
@@ -706,7 +706,7 @@ public DeletedARCRecordIOException(final String reason) {
* @return Status code.
*/
public int getStatusCode() {
- return (this.httpStatus == null)? -1: this.httpStatus.getStatusCode();
+ return statusCode;
}
/**
@@ -735,7 +735,7 @@ public ARCRecordMetaData getMetaData() {
/**
* @return http headers (Only available after header has been read).
*/
- public Header [] getHttpHeaders() {
+ public HttpHeader[] getHttpHeaders() {
return this.httpHeaders;
}
diff --git a/src/main/java/org/archive/io/arc/ARCRecordMetaData.java b/src/main/java/org/archive/io/arc/ARCRecordMetaData.java
index 02b368e4..2a187477 100644
--- a/src/main/java/org/archive/io/arc/ARCRecordMetaData.java
+++ b/src/main/java/org/archive/io/arc/ARCRecordMetaData.java
@@ -27,13 +27,14 @@
import org.archive.io.ArchiveRecordHeader;
+import static org.archive.format.arc.ARCConstants.*;
/**
* An immutable class to hold an ARC record meta data.
*
* @author stack
*/
-public class ARCRecordMetaData implements ArchiveRecordHeader, ARCConstants {
+public class ARCRecordMetaData implements ArchiveRecordHeader {
/**
* Map of record header fields.
*
diff --git a/src/main/java/org/archive/io/arc/ARCUtils.java b/src/main/java/org/archive/io/arc/ARCUtils.java
index 985457e2..05c15abb 100644
--- a/src/main/java/org/archive/io/arc/ARCUtils.java
+++ b/src/main/java/org/archive/io/arc/ARCUtils.java
@@ -27,12 +27,15 @@
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
+import java.util.Locale;
import org.archive.url.UsableURI;
import org.archive.util.zip.GzipHeader;
import org.archive.util.zip.NoGzipMagicException;
-public class ARCUtils implements ARCConstants {
+import static org.archive.format.arc.ARCConstants.*;
+
+public class ARCUtils {
/**
* @param pathOrUri Path or URI to extract arc filename from.
* @return Extracted arc file name.
@@ -92,7 +95,7 @@ public static boolean testCompressedARCFile(File arcFile,
throws IOException {
boolean compressedARCFile = false;
isReadable(arcFile);
- if(!skipSuffixCheck && !arcFile.getName().toLowerCase()
+ if(!skipSuffixCheck && !arcFile.getName().toLowerCase(Locale.ROOT)
.endsWith(COMPRESSED_ARC_FILE_EXTENSION)) {
return compressedARCFile;
}
@@ -195,7 +198,7 @@ public static boolean testUncompressedARCFile(File arcFile)
throws IOException {
boolean uncompressedARCFile = false;
isReadable(arcFile);
- if(arcFile.getName().toLowerCase().endsWith(ARC_FILE_EXTENSION)) {
+ if(arcFile.getName().toLowerCase(Locale.ROOT).endsWith(ARC_FILE_EXTENSION)) {
FileInputStream fis = new FileInputStream(arcFile);
try {
byte [] b = new byte[ARC_MAGIC_NUMBER.length()];
diff --git a/src/main/java/org/archive/io/arc/ARCWriter.java b/src/main/java/org/archive/io/arc/ARCWriter.java
index c7042943..82d13e9f 100644
--- a/src/main/java/org/archive/io/arc/ARCWriter.java
+++ b/src/main/java/org/archive/io/arc/ARCWriter.java
@@ -42,6 +42,7 @@
import org.archive.util.DevUtils;
import org.archive.util.MimetypeUtils;
+import static org.archive.format.arc.ARCConstants.*;
/**
* Write ARC files.
@@ -110,7 +111,7 @@
*
* @author stack
*/
-public class ARCWriter extends WriterPoolMember implements ARCConstants, Closeable {
+public class ARCWriter extends WriterPoolMember implements Closeable {
private static final Logger logger =
Logger.getLogger(ARCWriter.class.getName());
diff --git a/src/main/java/org/archive/io/warc/WARCConstants.java b/src/main/java/org/archive/io/warc/WARCConstants.java
deleted file mode 100644
index 83cc8a6d..00000000
--- a/src/main/java/org/archive/io/warc/WARCConstants.java
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * This file is part of the Heritrix web crawler (crawler.archive.org).
- *
- * Licensed to the Internet Archive (IA) by one or more individual
- * contributors.
- *
- * The IA licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.archive.io.warc;
-
-@Deprecated
-public interface WARCConstants extends org.archive.format.warc.WARCConstants {
-}
diff --git a/src/main/java/org/archive/io/warc/WARCReader.java b/src/main/java/org/archive/io/warc/WARCReader.java
index a34854ef..34583e58 100644
--- a/src/main/java/org/archive/io/warc/WARCReader.java
+++ b/src/main/java/org/archive/io/warc/WARCReader.java
@@ -24,6 +24,7 @@
import java.io.InputStream;
import java.util.Iterator;
import java.util.List;
+import java.util.Locale;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
@@ -31,17 +32,19 @@
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
-import org.apache.commons.lang.NotImplementedException;
+import org.apache.commons.lang3.NotImplementedException;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveRecord;
+import static org.archive.format.warc.WARCConstants.*;
+
/**
* WARCReader.
* Go via {@link WARCReaderFactory} to get instance.
* @author stack
* @version $Date: 2006-11-27 18:03:03 -0800 (Mon, 27 Nov 2006) $ $Version$
*/
-public class WARCReader extends ArchiveReader implements WARCConstants {
+public class WARCReader extends ArchiveReader {
protected WARCReader() {
super();
}
@@ -196,7 +199,6 @@ public static void main(String [] args)
Options options = getOptions();
PosixParser parser = new PosixParser();
CommandLine cmdline = parser.parse(options, args, false);
- @SuppressWarnings("unchecked")
List cmdlineArgs = cmdline.getArgList();
Option [] cmdlineOptions = cmdline.getOptions();
HelpFormatter formatter = new HelpFormatter();
@@ -231,7 +233,7 @@ public static void main(String [] args)
break;
case 'f':
- format = cmdlineOptions[i].getValue().toLowerCase();
+ format = cmdlineOptions[i].getValue().toLowerCase(Locale.ROOT);
boolean match = false;
// List of supported formats.
final String [] supportedFormats =
@@ -284,4 +286,4 @@ public static void main(String [] args)
}
}
}
-}
\ No newline at end of file
+}
diff --git a/src/main/java/org/archive/io/warc/WARCReaderFactory.java b/src/main/java/org/archive/io/warc/WARCReaderFactory.java
index c3e5baa0..70b80340 100644
--- a/src/main/java/org/archive/io/warc/WARCReaderFactory.java
+++ b/src/main/java/org/archive/io/warc/WARCReaderFactory.java
@@ -26,17 +26,19 @@
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;
+import java.util.Locale;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveReaderFactory;
import org.archive.io.ArchiveRecord;
-import org.archive.io.warc.WARCConstants;
import org.archive.util.ArchiveUtils;
import org.archive.util.FileUtils;
import org.archive.util.zip.GZIPMembersInputStream;
import com.google.common.io.CountingInputStream;
+import static org.archive.format.warc.WARCConstants.*;
+
/**
* Factory for WARC Readers.
* Figures whether to give out a compressed file Reader or an uncompressed
@@ -44,8 +46,7 @@
* @author stack
* @version $Date: 2006-08-23 17:59:04 -0700 (Wed, 23 Aug 2006) $ $Version$
*/
-public class WARCReaderFactory extends ArchiveReaderFactory
-implements WARCConstants {
+public class WARCReaderFactory extends ArchiveReaderFactory {
private static final WARCReaderFactory factory = new WARCReaderFactory();
/**
@@ -307,9 +308,9 @@ protected void gotoEOR(ArchiveRecord rec) throws IOException {
public static boolean isWARCSuffix(final String f) {
return (f == null)?
false:
- (f.toLowerCase().endsWith(DOT_COMPRESSED_WARC_FILE_EXTENSION))?
+ (f.toLowerCase(Locale.ROOT).endsWith(DOT_COMPRESSED_WARC_FILE_EXTENSION))?
true:
- (f.toLowerCase().endsWith(DOT_WARC_FILE_EXTENSION))?
+ (f.toLowerCase(Locale.ROOT).endsWith(DOT_WARC_FILE_EXTENSION))?
true: false;
}
-}
\ No newline at end of file
+}
diff --git a/src/main/java/org/archive/io/warc/WARCRecord.java b/src/main/java/org/archive/io/warc/WARCRecord.java
index 635d1c3b..21f662ea 100644
--- a/src/main/java/org/archive/io/warc/WARCRecord.java
+++ b/src/main/java/org/archive/io/warc/WARCRecord.java
@@ -29,19 +29,22 @@
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-import org.apache.commons.httpclient.Header;
-import org.apache.commons.httpclient.HttpParser;
+import org.archive.format.http.HttpHeader;
import org.archive.io.ArchiveRecord;
import org.archive.io.ArchiveRecordHeader;
import org.archive.util.LaxHttpParser;
+import static org.archive.format.ArchiveFileConstants.ABSOLUTE_OFFSET_KEY;
+import static org.archive.format.ArchiveFileConstants.READER_IDENTIFIER_FIELD_KEY;
+import static org.archive.format.warc.WARCConstants.*;
+
/**
* A WARC file Record.
*
* @author stack
*/
-public class WARCRecord extends ArchiveRecord implements WARCConstants {
+public class WARCRecord extends ArchiveRecord {
private Pattern WHITESPACE = Pattern.compile("\\s");
/**
@@ -123,7 +126,7 @@ protected ArchiveRecordHeader parseHeaders(final InputStream in,
// keep count of bytes read, digest and fail properly if EOR too soon...
// We don't want digesting while reading Headers.
//
- Header [] h = LaxHttpParser.parseHeaders(in, WARC_HEADER_ENCODING);
+ HttpHeader[] h = LaxHttpParser.parseHeaders(in, WARC_HEADER_ENCODING);
for (int i = 0; i < h.length; i++) {
m.put(h[i].getName(), h[i].getValue());
}
diff --git a/src/main/java/org/archive/io/warc/WARCWriter.java b/src/main/java/org/archive/io/warc/WARCWriter.java
index 982b8bc4..65eb3346 100644
--- a/src/main/java/org/archive/io/warc/WARCWriter.java
+++ b/src/main/java/org/archive/io/warc/WARCWriter.java
@@ -36,13 +36,16 @@
import java.util.logging.Level;
import java.util.logging.Logger;
-import org.apache.commons.lang.StringUtils;
-import org.archive.io.ArchiveFileConstants;
-import org.archive.io.UTF8Bytes;
+import org.apache.commons.lang3.StringUtils;
+import org.archive.format.ArchiveFileConstants;
import org.archive.io.WriterPoolMember;
import org.archive.util.ArchiveUtils;
import org.archive.util.anvl.Element;
+import static org.archive.format.warc.WARCConstants.*;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
/**
* WARC implementation.
@@ -56,8 +59,7 @@
* @author stack
* @version $Revision: 4604 $ $Date: 2006-09-05 22:38:18 -0700 (Tue, 05 Sep 2006) $
*/
-public class WARCWriter extends WriterPoolMember
-implements WARCConstants {
+public class WARCWriter extends WriterPoolMember {
public static final String TOTALS = "totals";
public static final String SIZE_ON_DISK = "sizeOnDisk";
public static final String TOTAL_BYTES = "totalBytes";
@@ -343,9 +345,9 @@ public URI writeWarcinfoRecord(String filename, final String description)
recordInfo.setMimetype("application/warc-fields");
// Strip .open suffix if present.
- if (filename.endsWith(WriterPoolMember.OCCUPIED_SUFFIX)) {
+ if (filename.endsWith(ArchiveFileConstants.OCCUPIED_SUFFIX)) {
filename = filename.substring(0,
- filename.length() - WriterPoolMember.OCCUPIED_SUFFIX.length());
+ filename.length() - ArchiveFileConstants.OCCUPIED_SUFFIX.length());
}
recordInfo.addExtraHeader(HEADER_KEY_FILENAME, filename);
if (description != null && description.length() > 0) {
@@ -356,12 +358,12 @@ public URI writeWarcinfoRecord(String filename, final String description)
byte [] warcinfoBody = null;
if (settings.getMetadata() == null) {
// TODO: What to write into a warcinfo? What to associate?
- warcinfoBody = "TODO: Unimplemented".getBytes();
+ warcinfoBody = "TODO: Unimplemented".getBytes(UTF_8);
} else {
ByteArrayOutputStream baos = new ByteArrayOutputStream();
for (final Iterator i = settings.getMetadata().iterator();
i.hasNext();) {
- baos.write(i.next().toString().getBytes(UTF8Bytes.UTF8));
+ baos.write(i.next().toString().getBytes(UTF_8));
}
warcinfoBody = baos.toByteArray();
}
diff --git a/src/main/java/org/archive/net/PublicSuffixes.java b/src/main/java/org/archive/net/PublicSuffixes.java
index af024949..79130332 100644
--- a/src/main/java/org/archive/net/PublicSuffixes.java
+++ b/src/main/java/org/archive/net/PublicSuffixes.java
@@ -22,21 +22,24 @@
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
-import java.io.FileWriter;
+import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
-import java.io.UnsupportedEncodingException;
+import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
+import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.archive.util.TextUtils;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
/**
* Utility class for making use of the information about 'public suffixes' at
* http://publicsuffix.org.
@@ -121,6 +124,7 @@ public boolean add(CharSequence s) {
i++;
// zero-length match holds only when both cs and s are empty.
if (i == 0) return cs.length() == 0 && s.length() == 0;
+ // only the first i chars of cs matched s (s may diverge there or simply end), so cut cs down to that prefix and branch off the rest
if (i < cs.length()) {
CharSequence cs0 = cs.subSequence(0, i);
CharSequence cs1 = cs.subSequence(i, cs.length());
@@ -128,10 +132,21 @@ public boolean add(CharSequence s) {
cs = cs0;
Node alt1 = new Node(cs1, branches);
(branches = new ArrayList()).add(alt1);
- addBranch(cs2);
+ if(cs2.length() == 0) {
+ // if cs2 is empty, we have a terminal node.
+ branches.add(new Node("", null));
+ } else {
+ // otherwise, we have a new branch.
+ addBranch(cs2);
+ }
+
} else {
- assert i == cs.length();
- addBranch(s.subSequence(i, s.length()));
+ // s is longer than cs, so we need to add a branch
+ if(i != s.length()) {
+ // but not if they are equal.
+ assert i == cs.length();
+ addBranch(s.subSequence(i, s.length()));
+ }
}
return true;
}
@@ -172,12 +187,12 @@ public static void main(String args[]) throws IOException {
InputStream is;
if (args.length == 0 || "=".equals(args[0])) {
// use bundled list
- is = PublicSuffixes.class.getClassLoader().getResourceAsStream(
- "effective_tld_names.dat");
+ is = PublicSuffixes.class.getResourceAsStream(
+ "/org/archive/effective_tld_names.dat");
} else {
is = new FileInputStream(args[0]);
}
- BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
+ BufferedReader reader = new BufferedReader(new InputStreamReader(is, UTF_8));
String regex = getTopmostAssignedSurtPrefixRegex(reader);
IOUtils.closeQuietly(is);
@@ -185,11 +200,11 @@ public static void main(String args[]) throws IOException {
BufferedWriter writer;
if (args.length >= 2) {
// write to specified file
- writer = new BufferedWriter(new FileWriter(args[1]));
+ writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(args[1]), UTF_8));
needsClose = true;
} else {
// write to stdout
- writer = new BufferedWriter(new OutputStreamWriter(System.out));
+ writer = new BufferedWriter(new OutputStreamWriter(System.out, Charset.defaultCharset()));
}
writer.append(regex);
writer.flush();
@@ -219,7 +234,7 @@ protected static Node readPublishedFileToSurtTrie(BufferedReader reader) throws
// discard utf8 notation after entry
line = line.split("\\s+")[0];
// TODO: maybe we don't need to create lower-cased String
- line = line.toLowerCase();
+ line = line.toLowerCase(Locale.ROOT);
// SURT-order domain segments
String[] segs = line.split("\\.");
StringBuilder sb = new StringBuilder();
@@ -265,7 +280,7 @@ protected static void buildRegex(Node alt, StringBuilder sb) {
sb.append("(?=");
close = ")";
} else if (c == '*') {
- sb.append("[-\\w]+");
+ sb.append("[-\\w\\u00C0-\\u017F]+");
} else {
sb.append(c);
}
@@ -304,7 +319,7 @@ private static String surtPrefixRegexFromTrie(Node trie) {
regex.append("(?ix)^\n");
trie.addBranch("*,"); // for new/unknown TLDs
buildRegex(trie, regex);
- regex.append("\n([-\\w]+,)");
+ regex.append("\n([-\\w\\u00C0-\\u017F]+,)");
return regex.toString();
}
@@ -319,16 +334,11 @@ public static synchronized Pattern getTopmostAssignedSurtPrefixPattern() {
public static synchronized String getTopmostAssignedSurtPrefixRegex() {
if (topmostAssignedSurtPrefixRegex == null) {
// use bundled list
- try {
- BufferedReader reader = new BufferedReader(new InputStreamReader(
- PublicSuffixes.class.getClassLoader().getResourceAsStream(
- "effective_tld_names.dat"), "UTF-8"));
- topmostAssignedSurtPrefixRegex = getTopmostAssignedSurtPrefixRegex(reader);
- IOUtils.closeQuietly(reader);
- } catch (UnsupportedEncodingException ex) {
- // should never happen
- throw new RuntimeException(ex);
- }
+ BufferedReader reader = new BufferedReader(new InputStreamReader(
+ PublicSuffixes.class.getResourceAsStream(
+ "/org/archive/effective_tld_names.dat"), UTF_8));
+ topmostAssignedSurtPrefixRegex = getTopmostAssignedSurtPrefixRegex(reader);
+ IOUtils.closeQuietly(reader);
}
return topmostAssignedSurtPrefixRegex;
}
diff --git a/src/main/java/org/archive/resource/generic/GenericResourceProducer.java b/src/main/java/org/archive/resource/generic/GenericResourceProducer.java
index 812a3f0d..b111dc1e 100644
--- a/src/main/java/org/archive/resource/generic/GenericResourceProducer.java
+++ b/src/main/java/org/archive/resource/generic/GenericResourceProducer.java
@@ -1,6 +1,7 @@
package org.archive.resource.generic;
import java.io.IOException;
+import java.util.Locale;
import org.archive.resource.MetaData;
import org.archive.resource.Resource;
@@ -45,6 +46,6 @@ public void close() throws IOException {
stream.close();
}
public String getContext() {
- return String.format("Context(%s)(%d)", name, stream.getOffset());
+ return String.format(Locale.ROOT, "Context(%s)(%d)", name, stream.getOffset());
}
}
diff --git a/src/main/java/org/archive/resource/gzip/GZIPMetaData.java b/src/main/java/org/archive/resource/gzip/GZIPMetaData.java
index 0fc18162..1058b01b 100644
--- a/src/main/java/org/archive/resource/gzip/GZIPMetaData.java
+++ b/src/main/java/org/archive/resource/gzip/GZIPMetaData.java
@@ -15,6 +15,8 @@
import org.json.JSONException;
import org.json.JSONObject;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
public class GZIPMetaData extends MetaData implements ResourceConstants {
private static final Logger LOG = Logger.getLogger(GZIPMetaData.class.getName());
@@ -26,7 +28,7 @@ public void setData(GZIPSeriesMember member) {
GZIPHeader header = member.getHeader();
GZIPStaticHeader staticH = header.getStaticHeader();
if(staticH.isFNameSet()) {
- putString(GZIP_FILENAME,new String(header.getFileName(),"UTF-8"));
+ putString(GZIP_FILENAME, new String(header.getFileName(), UTF_8));
}
if(staticH.isFCommentSet()) {
putLong(GZIP_COMMENT_LENGTH,header.getCommentLength());
@@ -39,7 +41,7 @@ public void setData(GZIPSeriesMember member) {
for(int i = 0; i < records; i++) {
GZIPFExtraRecord rec = header.getRecord(i);
JSONObject recJO = new JSONObject();
- String name = new String(rec.getName(),"UTF-8");
+ String name = new String(rec.getName(), UTF_8);
recJO.put(GZIP_FEXTRA_NAME, name);
if(name.equals("SL") || name.equals("LX")) {
recJO.put(GZIP_FEXTRA_VALUE, ByteOp.bytesToInt(rec.getValue()));
@@ -55,8 +57,6 @@ public void setData(GZIPSeriesMember member) {
putLong(GZIP_INFLATED_CRC,footer.getCRC());
putLong(GZIP_INFLATED_LENGTH,footer.getLength());
- } catch (UnsupportedEncodingException e) {
- LOG.warning(e.getMessage());
} catch (JSONException e) {
LOG.warning(e.getMessage());
}
diff --git a/src/main/java/org/archive/resource/gzip/GZIPResourceContainer.java b/src/main/java/org/archive/resource/gzip/GZIPResourceContainer.java
index 39611ab8..5267a0f9 100644
--- a/src/main/java/org/archive/resource/gzip/GZIPResourceContainer.java
+++ b/src/main/java/org/archive/resource/gzip/GZIPResourceContainer.java
@@ -1,6 +1,7 @@
package org.archive.resource.gzip;
import java.io.IOException;
+import java.util.Locale;
import org.archive.format.gzip.GZIPMemberSeries;
import org.archive.format.gzip.GZIPSeriesMember;
@@ -54,6 +55,6 @@ public void close() throws IOException {
series.close();
}
public String getContext() {
- return String.format("Context(%s)(%d)", series.getStreamContext(), series.getCurrentMemberStartOffset());
+ return String.format(Locale.ROOT, "Context(%s)(%d)", series.getStreamContext(), series.getCurrentMemberStartOffset());
}
}
diff --git a/src/main/java/org/archive/resource/html/HTMLMetaData.java b/src/main/java/org/archive/resource/html/HTMLMetaData.java
index 024d9677..d995cf65 100644
--- a/src/main/java/org/archive/resource/html/HTMLMetaData.java
+++ b/src/main/java/org/archive/resource/html/HTMLMetaData.java
@@ -1,6 +1,7 @@
package org.archive.resource.html;
import java.util.List;
+import java.util.Locale;
import java.util.logging.Logger;
import org.archive.resource.MetaData;
@@ -98,7 +99,7 @@ private void appendObj2(JSONObject o, String arr, String... a) {
} catch(JSONException e) {
try {
- System.err.format("GotErr(%s) JSON(%s)(%s)", e.getMessage(),
+ System.err.format(Locale.ROOT, "GotErr(%s) JSON(%s)(%s)", e.getMessage(),
o.toString(1),a.toString());
} catch (JSONException e1) {
// TODO Auto-generated catch block
diff --git a/src/main/java/org/archive/resource/html/HTMLResourceFactory.java b/src/main/java/org/archive/resource/html/HTMLResourceFactory.java
index afb1c850..410449a1 100644
--- a/src/main/java/org/archive/resource/html/HTMLResourceFactory.java
+++ b/src/main/java/org/archive/resource/html/HTMLResourceFactory.java
@@ -4,9 +4,9 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
+import java.nio.charset.StandardCharsets;
+import java.util.logging.Logger;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
import org.archive.format.http.HttpHeaders;
import org.archive.format.json.JSONUtils;
import org.archive.format.text.charset.CharsetDetector;
@@ -25,7 +25,7 @@
public class HTMLResourceFactory implements ResourceFactory {
- public static final Log LOG = LogFactory.getLog(HTMLResourceFactory.class);
+ private static final Logger LOG = Logger.getLogger(HTMLResourceFactory.class.getName());
protected static final int CHARSET_GUESS_CHUNK_SIZE = 8192;
protected static final String HTTP_HEADER_PATH = "Envelope.Payload-Metadata.HTTP-Response-Metadata.Headers";
@@ -41,7 +41,7 @@ public Resource getResource(InputStream is, MetaData parentMetaData,
CDATALexer lex = new CDATALexer();
// guess charset based on HTTP header and sniffed content chunk
- String charset = "UTF-8";
+ String charset = StandardCharsets.UTF_8.name();
is = new BufferedInputStream(is, CHARSET_GUESS_CHUNK_SIZE);
byte[] chunk = new byte[CHARSET_GUESS_CHUNK_SIZE];
is.mark(0);
@@ -58,7 +58,7 @@ public Resource getResource(InputStream is, MetaData parentMetaData,
try {
charset = charSetDetector.getCharset(chunk, chunkSize, httpHeaders);
} catch (Exception e) {
- LOG.error("Failed to guess charset: " + e.getMessage());
+ LOG.severe("Failed to guess charset: " + e.getMessage());
}
}
diff --git a/src/main/java/org/archive/resource/warc/WARCResource.java b/src/main/java/org/archive/resource/warc/WARCResource.java
index a9c3fcc3..a5e5ac35 100644
--- a/src/main/java/org/archive/resource/warc/WARCResource.java
+++ b/src/main/java/org/archive/resource/warc/WARCResource.java
@@ -5,6 +5,7 @@
import java.security.DigestInputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
+import java.util.Locale;
import org.archive.format.http.HttpHeader;
import org.archive.format.http.HttpResponse;
@@ -43,7 +44,7 @@ public WARCResource(MetaData metaData, ResourceContainer container,
String name = h.getName();
String value = h.getValue();
fields.putString(name,value);
- if(name.toLowerCase().equals("content-length")) {
+ if(name.toLowerCase(Locale.ROOT).equals("content-length")) {
// TODO: catch formatexception
length = Long.parseLong(value);
}
diff --git a/src/main/java/org/archive/resource/warc/record/WARCJSONMetaDataResourceFactory.java b/src/main/java/org/archive/resource/warc/record/WARCJSONMetaDataResourceFactory.java
index 43041efb..8cc8c146 100644
--- a/src/main/java/org/archive/resource/warc/record/WARCJSONMetaDataResourceFactory.java
+++ b/src/main/java/org/archive/resource/warc/record/WARCJSONMetaDataResourceFactory.java
@@ -3,7 +3,6 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
-import java.nio.charset.Charset;
import org.archive.resource.MetaData;
import org.archive.resource.Resource;
@@ -14,9 +13,9 @@
import org.json.JSONException;
import org.json.JSONTokener;
-public class WARCJSONMetaDataResourceFactory implements ResourceFactory, ResourceConstants {
- private static final Charset UTF8 = Charset.forName("UTF-8");
+import static java.nio.charset.StandardCharsets.UTF_8;
+public class WARCJSONMetaDataResourceFactory implements ResourceFactory, ResourceConstants {
public WARCJSONMetaDataResourceFactory() {
}
@@ -27,7 +26,7 @@ public Resource getResource(InputStream is, MetaData parentMetaData,
MetaData md;
try {
- md = new MetaData(new JSONTokener(new InputStreamReader(is, UTF8)));
+ md = new MetaData(new JSONTokener(new InputStreamReader(is, UTF_8)));
} catch (JSONException e) {
throw new ResourceParseException(e);
}
diff --git a/src/main/java/org/archive/streamcontext/HTTP11Stream.java b/src/main/java/org/archive/streamcontext/HTTP11Stream.java
index 06f51409..995dc53e 100755
--- a/src/main/java/org/archive/streamcontext/HTTP11Stream.java
+++ b/src/main/java/org/archive/streamcontext/HTTP11Stream.java
@@ -5,6 +5,7 @@
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
+import java.util.Locale;
public class HTTP11Stream extends AbstractBufferingStream {
private URL url;
@@ -42,7 +43,7 @@ public int doRead(byte[] b, int off, int len) throws IOException {
public void doSeek(long offset) throws IOException {
doClose();
conn = url.openConnection();
- conn.setRequestProperty("Range", String.format("bytes=%d-", offset));
+ conn.setRequestProperty("Range", String.format(Locale.ROOT, "bytes=%d-", offset));
conn.connect();
is = conn.getInputStream();
}
diff --git a/src/main/java/org/archive/uid/RecordIDGenerator.java b/src/main/java/org/archive/uid/RecordIDGenerator.java
index 4f16c5ab..80cc5565 100644
--- a/src/main/java/org/archive/uid/RecordIDGenerator.java
+++ b/src/main/java/org/archive/uid/RecordIDGenerator.java
@@ -19,7 +19,6 @@
package org.archive.uid;
import java.net.URI;
-import java.net.URISyntaxException;
import java.util.Map;
/**
diff --git a/src/main/java/org/archive/url/BasicURLCanonicalizer.java b/src/main/java/org/archive/url/BasicURLCanonicalizer.java
index 37b448c1..3957c9ef 100644
--- a/src/main/java/org/archive/url/BasicURLCanonicalizer.java
+++ b/src/main/java/org/archive/url/BasicURLCanonicalizer.java
@@ -6,7 +6,9 @@
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
+import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
+import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -34,7 +36,9 @@ public class BasicURLCanonicalizer implements URLCanonicalizer {
.compile("^(0[0-7]*)(\\.[0-7]+)?(\\.[0-7]+)?(\\.[0-7]+)?$");
Pattern DECIMAL_IP = Pattern
.compile("^([1-9][0-9]*)(\\.[0-9]+)?(\\.[0-9]+)?(\\.[0-9]+)?$");
+ Pattern MULTIDOT = Pattern.compile("\\.{2,}");
+ @Override
public void canonicalize(HandyURL url) {
url.setHash(null);
url.setAuthUser(minimalEscape(url.getAuthUser()));
@@ -55,8 +59,7 @@ public void canonicalize(HandyURL url) {
host = hostE;
}
- host = host.replaceAll("^\\.+", "").replaceAll("\\.\\.+", ".")
- .replaceAll("\\.$", "");
+ host = normalizeDots(host);
}
String ip = null;
@@ -64,7 +67,7 @@ public void canonicalize(HandyURL url) {
if (ip != null) {
host = ip;
} else if (host != null) {
- host = escapeOnce(host.toLowerCase());
+ host = escapeOnce(host.toLowerCase(Locale.ROOT));
}
url.setHost(host);
// now the path:
@@ -74,6 +77,36 @@ public void canonicalize(HandyURL url) {
url.setPath(escapeOnce(normalizePath(path)));
}
+ /**
+ * Normalize dots in the host name.
+ *
+ * @param host the host name to normalize; must not be null
+ * @return host name with all sequences of dots replaced with a single dot,
+ * and all leading and trailing dots removed
+ */
+ private String normalizeDots(String host) {
+ if (host.indexOf('.') == -1) {
+ return host;
+ }
+ int start = 0, end = host.length();
+ boolean changed = false;
+ while (start < end && host.charAt(start) == '.') {
+ start++;
+ changed = true;
+ }
+ while (end > start && host.charAt(end - 1) == '.') {
+ end--;
+ changed = true;
+ }
+ if (changed) {
+ host = host.substring(start, end);
+ }
+ if (host.contains("..")) {
+ host = MULTIDOT.matcher(host).replaceAll(".");
+ }
+ return host;
+ }
+
private static final Pattern SINGLE_FORWARDSLASH_PATTERN = Pattern
.compile("/");
@@ -159,7 +192,7 @@ public String attemptIPFormats(String host) { // throws URIException {
}
ip[i] = octet;
}
- return String.format("%d.%d.%d.%d", ip[0], ip[1], ip[2], ip[3]);
+ return String.format(Locale.ROOT, "%d.%d.%d.%d", ip[0], ip[1], ip[2], ip[3]);
} else {
Matcher m2 = DECIMAL_IP.matcher(host);
if (m2.matches()) {
@@ -190,7 +223,7 @@ public String attemptIPFormats(String host) { // throws URIException {
}
ip[i] = octet;
}
- return String.format("%d.%d.%d.%d", ip[0], ip[1], ip[2],
+ return String.format(Locale.ROOT, "%d.%d.%d.%d", ip[0], ip[1], ip[2],
ip[3]);
}
@@ -203,12 +236,9 @@ public String minimalEscape(String input) {
return escapeOnce(unescapeRepeatedly(input));
}
- protected static Charset _UTF8 = null;
+ protected static Charset _UTF8 = StandardCharsets.UTF_8;
protected static Charset UTF8() {
- if (_UTF8 == null) {
- _UTF8 = Charset.forName("UTF-8");
- }
return _UTF8;
}
@@ -261,7 +291,7 @@ public String escapeOnce(String input) {
}
sb.append("%");
- String hex = Integer.toHexString(b).toUpperCase();
+ String hex = Integer.toHexString(b).toUpperCase(Locale.ROOT);
if (hex.length() == 1) {
sb.append('0');
}
diff --git a/src/main/java/org/archive/url/DefaultIACanonicalizerRules.java b/src/main/java/org/archive/url/DefaultIACanonicalizerRules.java
deleted file mode 100644
index 3d4d8581..00000000
--- a/src/main/java/org/archive/url/DefaultIACanonicalizerRules.java
+++ /dev/null
@@ -1,7 +0,0 @@
-package org.archive.url;
-
-/**
- * @deprecated use AggressiveIACanonicalizerRules
- */
-public class DefaultIACanonicalizerRules extends AggressiveIACanonicalizerRules {
-}
diff --git a/src/main/java/org/archive/url/DefaultIAURLCanonicalizer.java b/src/main/java/org/archive/url/DefaultIAURLCanonicalizer.java
deleted file mode 100644
index 3d1f985d..00000000
--- a/src/main/java/org/archive/url/DefaultIAURLCanonicalizer.java
+++ /dev/null
@@ -1,7 +0,0 @@
-package org.archive.url;
-
-/**
- * @deprecated use AggressiveIAURLCanonicalizer
- */
-public class DefaultIAURLCanonicalizer extends AggressiveIAURLCanonicalizer {
-}
diff --git a/src/main/java/org/archive/url/GoogleURLCanonicalizer.java b/src/main/java/org/archive/url/GoogleURLCanonicalizer.java
deleted file mode 100644
index 388db8aa..00000000
--- a/src/main/java/org/archive/url/GoogleURLCanonicalizer.java
+++ /dev/null
@@ -1,7 +0,0 @@
-package org.archive.url;
-
-/**
- * @deprecated use {@link BasicURLCanonicalizer}
- */
-public class GoogleURLCanonicalizer extends BasicURLCanonicalizer {
-}
diff --git a/src/main/java/org/archive/url/HandyURL.java b/src/main/java/org/archive/url/HandyURL.java
index 91539b3f..0c2c81f7 100644
--- a/src/main/java/org/archive/url/HandyURL.java
+++ b/src/main/java/org/archive/url/HandyURL.java
@@ -2,6 +2,7 @@
import java.net.MalformedURLException;
import java.net.URL;
+import java.util.Locale;
public class HandyURL {
public final static int DEFAULT_PORT = -1;
@@ -277,7 +278,7 @@ public void setOpaque(String opaque) {
}
public String toDebugString() {
- return String.format("Scheme(%s) UserName(%s) UserPass(%s) Host(%s) port(%d) Path(%s) Query(%s) Frag(%s)",
+ return String.format(Locale.ROOT, "Scheme(%s) UserName(%s) UserPass(%s) Host(%s) port(%d) Path(%s) Query(%s) Frag(%s)",
scheme, authUser, authPass, host, port, path, query, hash);
}
diff --git a/src/main/java/org/archive/url/IAURLCanonicalizer.java b/src/main/java/org/archive/url/IAURLCanonicalizer.java
index 0cf7c8a4..e964cd00 100644
--- a/src/main/java/org/archive/url/IAURLCanonicalizer.java
+++ b/src/main/java/org/archive/url/IAURLCanonicalizer.java
@@ -2,6 +2,7 @@
import java.util.Arrays;
import java.util.Comparator;
+import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -20,11 +21,11 @@ public void canonicalize(HandyURL url) {
}
if (rules.isSet(SCHEME_SETTINGS, SCHEME_LOWERCASE)) {
if (url.getScheme() != null) {
- url.setScheme(url.getScheme().toLowerCase());
+ url.setScheme(url.getScheme().toLowerCase(Locale.ROOT));
}
}
if(rules.isSet(HOST_SETTINGS, HOST_LOWERCASE)) {
- url.setHost(url.getHost().toLowerCase());
+ url.setHost(url.getHost().toLowerCase(Locale.ROOT));
}
if(rules.isSet(HOST_SETTINGS, HOST_MASSAGE)) {
url.setHost(massageHost(url.getHost()));
@@ -46,7 +47,7 @@ public void canonicalize(HandyURL url) {
url.setPath(null);
} else {
if(rules.isSet(PATH_SETTINGS, PATH_LOWERCASE)) {
- path = path.toLowerCase();
+ path = path.toLowerCase(Locale.ROOT);
}
if(rules.isSet(PATH_SETTINGS, PATH_STRIP_SESSION_ID)) {
path = URLRegexTransformer.stripPathSessionID(path);
@@ -71,7 +72,7 @@ public void canonicalize(HandyURL url) {
}
// lower-case:
if(rules.isSet(QUERY_SETTINGS, QUERY_LOWERCASE)) {
- query = query.toLowerCase();
+ query = query.toLowerCase(Locale.ROOT);
}
// re-order?
if(rules.isSet(QUERY_SETTINGS, QUERY_ALPHA_REORDER)) {
@@ -155,7 +156,7 @@ public static String massageHost(String host) {
return host;
}
public static int getDefaultPort(String scheme) {
- String lcScheme = scheme.toLowerCase();
+ String lcScheme = scheme.toLowerCase(Locale.ROOT);
if(lcScheme.equals("http")) {
return 80;
} else if(lcScheme.equals("https")) {
diff --git a/src/main/java/org/archive/url/LaxURI.java b/src/main/java/org/archive/url/LaxURI.java
index d7318dfd..9b7485c7 100644
--- a/src/main/java/org/archive/url/LaxURI.java
+++ b/src/main/java/org/archive/url/LaxURI.java
@@ -18,12 +18,12 @@
*/
package org.archive.url;
+import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.BitSet;
-
-import org.apache.commons.httpclient.URI;
-import org.apache.commons.httpclient.URIException;
-import org.apache.commons.httpclient.util.EncodingUtil;
+import java.util.Locale;
/**
* URI subclass which allows partial/inconsistent encoding, matching
@@ -121,13 +121,13 @@ protected static String decode(String component, String charset)
"Component array of chars may not be null");
}
byte[] rawdata = null;
- // try {
- rawdata = LaxURLCodec.decodeUrlLoose(EncodingUtil
- .getAsciiBytes(component));
- // } catch (DecoderException e) {
- // throw new URIException(e.getMessage());
- // }
- return EncodingUtil.getString(rawdata, charset);
+ rawdata = LaxURLCodec.decodeUrlLoose(component.getBytes(StandardCharsets.US_ASCII));
+ try {
+ Charset cs = Charset.forName(charset);
+ return new String(rawdata, cs);
+ } catch (IllegalCharsetNameException e) {
+ return new String(rawdata, StandardCharsets.US_ASCII);
+ }
}
// overidden to lax() the acceptable-char BitSet passed in
@@ -183,7 +183,7 @@ protected BitSet lax(BitSet generous) {
* two instances to one where possible, slimming
* instances.
*
- * @see org.apache.commons.httpclient.URI#parseAuthority(java.lang.String, boolean)
+ * @see URI#parseAuthority(java.lang.String, boolean)
*/
protected void parseAuthority(String original, boolean escaped)
throws URIException {
@@ -204,7 +204,7 @@ protected void parseAuthority(String original, boolean escaped)
* long-lived instance from a static field, saving 12-14 bytes
* per instance.
*
- * @see org.apache.commons.httpclient.URI#setURI()
+ * @see URI#setURI()
*/
protected void setURI() {
if (_scheme != null) {
@@ -324,7 +324,7 @@ protected void parseUriReference(String original, boolean escaped)
*
*/
if (at > 0 && at < length && tmp.charAt(at) == ':') {
- char[] target = tmp.substring(0, at).toLowerCase().toCharArray();
+ char[] target = tmp.substring(0, at).toLowerCase(Locale.ROOT).toCharArray();
if (validate(target, scheme)) {
_scheme = target;
from = ++at;
diff --git a/src/main/java/org/archive/url/LaxURLCodec.java b/src/main/java/org/archive/url/LaxURLCodec.java
index e27d9de0..b68a0c19 100644
--- a/src/main/java/org/archive/url/LaxURLCodec.java
+++ b/src/main/java/org/archive/url/LaxURLCodec.java
@@ -20,17 +20,16 @@
import java.io.ByteArrayOutputStream;
import java.io.UnsupportedEncodingException;
+import java.nio.charset.StandardCharsets;
import java.util.BitSet;
import org.apache.commons.codec.net.URLCodec;
-import com.google.common.base.Charsets;
-
/**
* @author gojomo
*/
public class LaxURLCodec extends URLCodec {
- public static LaxURLCodec DEFAULT = new LaxURLCodec("UTF-8");
+ public static LaxURLCodec DEFAULT = new LaxURLCodec(StandardCharsets.UTF_8.name());
// passthrough constructor
public LaxURLCodec(String encoding) {
@@ -155,6 +154,6 @@ public String encode(BitSet safe, String pString, String cs)
if (pString == null) {
return null;
}
- return new String(encodeUrl(safe,pString.getBytes(cs)), Charsets.US_ASCII);
+ return new String(encodeUrl(safe,pString.getBytes(cs)), StandardCharsets.US_ASCII);
}
}
diff --git a/src/main/java/org/archive/url/NonMassagingIAURLCanonicalizer.java b/src/main/java/org/archive/url/NonMassagingIAURLCanonicalizer.java
index cd579eb0..830b7b92 100644
--- a/src/main/java/org/archive/url/NonMassagingIAURLCanonicalizer.java
+++ b/src/main/java/org/archive/url/NonMassagingIAURLCanonicalizer.java
@@ -1,10 +1,10 @@
package org.archive.url;
public class NonMassagingIAURLCanonicalizer implements URLCanonicalizer {
- private static final GoogleURLCanonicalizer google =
- new GoogleURLCanonicalizer();
+ private static final BasicURLCanonicalizer basic =
+ new BasicURLCanonicalizer();
private static CanonicalizeRules nonMassagingRules =
- new DefaultIACanonicalizerRules();
+ new AggressiveIACanonicalizerRules();
static {
nonMassagingRules.setRule(CanonicalizeRules.HOST_SETTINGS,
CanonicalizeRules.HOST_LOWERCASE);
@@ -14,7 +14,7 @@ public class NonMassagingIAURLCanonicalizer implements URLCanonicalizer {
public void canonicalize(HandyURL url) {
// just google's stuff, followed by the IA default stuff:
- google.canonicalize(url);
+ basic.canonicalize(url);
ia.canonicalize(url);
}
}
diff --git a/src/main/java/org/archive/url/SURT.java b/src/main/java/org/archive/url/SURT.java
index 2c8e1b02..9598f458 100644
--- a/src/main/java/org/archive/url/SURT.java
+++ b/src/main/java/org/archive/url/SURT.java
@@ -2,11 +2,10 @@
import java.io.BufferedReader;
import java.io.InputStreamReader;
-import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.logging.Logger;
-import org.apache.commons.httpclient.URIException;
import org.archive.util.iterator.AbstractPeekableIterator;
public class SURT {
@@ -34,7 +33,7 @@ public static String toSURT(String input) {
}
public static void main(String[] args) {
String line;
- InputStreamReader isr = new InputStreamReader(System.in,Charset.forName("UTF-8"));
+ InputStreamReader isr = new InputStreamReader(System.in, StandardCharsets.UTF_8);
BufferedReader br = new BufferedReader(isr);
Iterator i = AbstractPeekableIterator.wrapReader(br);
while(i.hasNext()) {
diff --git a/src/main/java/org/archive/url/SURTTokenizer.java b/src/main/java/org/archive/url/SURTTokenizer.java
index da8f58f2..52b80a03 100644
--- a/src/main/java/org/archive/url/SURTTokenizer.java
+++ b/src/main/java/org/archive/url/SURTTokenizer.java
@@ -19,7 +19,6 @@
*/
package org.archive.url;
-import org.apache.commons.httpclient.URIException;
import org.archive.util.SURT;
/**
diff --git a/src/main/java/org/archive/url/URI.java b/src/main/java/org/archive/url/URI.java
new file mode 100644
index 00000000..492f7772
--- /dev/null
+++ b/src/main/java/org/archive/url/URI.java
@@ -0,0 +1,3984 @@
+/*
+ * $HeadURL: https://svn.apache.org/repos/asf/jakarta/httpcomponents/oac.hc3x/tags/HTTPCLIENT_3_1/src/java/org/apache/commons/httpclient/URI.java $
+ * $Revision: 564973 $
+ * $Date: 2007-08-11 22:51:47 +0200 (Sat, 11 Aug 2007) $
+ *
+ * ====================================================================
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * .
+ *
+ */
+
+package org.archive.url;
+
+import org.apache.commons.codec.DecoderException;
+import org.apache.commons.codec.net.URLCodec;
+
+import java.io.*;
+import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.BitSet;
+import java.util.Hashtable;
+import java.util.Locale;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * The interface for the URI (Uniform Resource Identifiers) version of RFC 2396.
+ * This class supports parsing a URI reference and is meant to be extended
+ * for specific protocols, taking into account the character encoding of the
+ * protocol being transported and the charset of the document.
+ *
+ * A URI is always in an "escaped" form, since escaping or unescaping a
+ * completed URI might change its semantics.
+ *
+ * Implementers should be careful not to escape or unescape the same string
+ * more than once, since unescaping an already unescaped string might lead to
+ * misinterpreting a percent data character as another escaped character,
+ * or vice versa in the case of escaping an already escaped string.
+ *
+ * In order to avoid these problems, data types used as follows:
+ *
+ * URI character sequence: char
+ * octet sequence: byte
+ * original character sequence: String
+ *
+ *
+ * So, a URI is a sequence of characters as an array of a char type, which
+ * is not always represented as a sequence of octets as an array of byte.
+ *
+ *
+ * The following examples illustrate URI that are in common use.
+ *
+ * ftp://ftp.is.co.za/rfc/rfc1808.txt
+ * -- ftp scheme for File Transfer Protocol services
+ * gopher://spinaltap.micro.umn.edu/00/Weather/California/Los%20Angeles
+ * -- gopher scheme for Gopher and Gopher+ Protocol services
+ * http://www.math.uio.no/faq/compression-faq/part1.html
+ * -- http scheme for Hypertext Transfer Protocol services
+ * mailto:mduerst@ifi.unizh.ch
+ * -- mailto scheme for electronic mail addresses
+ * news:comp.infosystems.www.servers.unix
+ * -- news scheme for USENET news groups and articles
+ * telnet://melvyl.ucop.edu/
+ * -- telnet scheme for interactive services via the TELNET Protocol
+ *
+ * Please, notice that there are many modifications from URL(RFC 1738) and
+ * relative URL(RFC 1808).
+ *
+ * The expressions for a URI
+ *
+ * For escaped URI forms
+ * - URI(char[]) // constructor
+ * - char[] getRawXxx() // method
+ * - String getEscapedXxx() // method
+ * - String toString() // method
+ *
+ * For unescaped URI forms
+ * - URI(String) // constructor
+ * - String getXXX() // method
+ *
+ *
+ * @author Sung-Gu
+ * @author Mike Bowler
+ * @version $Revision: 564973 $ $Date: 2002/03/14 15:14:01
+ */
+class URI implements Cloneable, Comparable, Serializable {
+
+
+ // ----------------------------------------------------------- Constructors
+
+ /** Create an instance as an internal use */
+ protected URI() {
+ }
+
+ /**
+ * Construct a URI from a string with the given charset. The input string can
+ * be either in escaped or unescaped form.
+ *
+ * @param s URI character sequence
+ * @param escaped true if URI character sequence is in escaped form.
+ * false otherwise.
+ * @param charset the charset string to do escape encoding, if required
+ *
+ * @throws URIException If the URI cannot be created.
+ * @throws NullPointerException if input string is null
+ *
+ * @see #getProtocolCharset
+ *
+ * @since 3.0
+ */
+ public URI(String s, boolean escaped, String charset)
+ throws URIException, NullPointerException {
+ protocolCharset = charset;
+ parseUriReference(s, escaped);
+ }
+
+ /**
+ * Construct a URI from a string with the given charset. The input string can
+ * be either in escaped or unescaped form.
+ *
+ * @param s URI character sequence
+ * @param escaped true if URI character sequence is in escaped form.
+ * false otherwise.
+ *
+ * @throws URIException If the URI cannot be created.
+ * @throws NullPointerException if input string is null
+ *
+ * @see #getProtocolCharset
+ *
+ * @since 3.0
+ */
+ public URI(String s, boolean escaped)
+ throws URIException, NullPointerException {
+ parseUriReference(s, escaped);
+ }
+
+ /**
+ * Construct a URI as an escaped form of a character array with the given
+ * charset.
+ *
+ * @param escaped the URI character sequence
+ * @param charset the charset string to do escape encoding
+ * @throws URIException If the URI cannot be created.
+ * @throws NullPointerException if escaped is null
+ * @see #getProtocolCharset
+ *
+ * @deprecated Use #URI(String, boolean, String)
+ */
+ public URI(char[] escaped, String charset)
+ throws URIException, NullPointerException {
+ protocolCharset = charset;
+ parseUriReference(new String(escaped), true);
+ }
+
+
+ /**
+ * Construct a URI as an escaped form of a character array.
+ * An URI can be placed within double-quotes or angle brackets like
+ * "http://test.com/" and <http://test.com/>
+ *
+ * @param escaped the URI character sequence
+ * @throws URIException If the URI cannot be created.
+ * @throws NullPointerException if escaped is null
+ * @see #getDefaultProtocolCharset
+ *
+ * @deprecated Use #URI(String, boolean)
+ */
+ public URI(char[] escaped)
+ throws URIException, NullPointerException {
+ parseUriReference(new String(escaped), true);
+ }
+
+
+ /**
+ * Construct a URI from the given string with the given charset.
+ *
+ * @param original the string to be represented to URI character sequence
+ * It is one of absoluteURI and relativeURI.
+ * @param charset the charset string to do escape encoding
+ * @throws URIException If the URI cannot be created.
+ * @see #getProtocolCharset
+ *
+ * @deprecated Use #URI(String, boolean, String)
+ */
+ public URI(String original, String charset) throws URIException {
+ protocolCharset = charset;
+ parseUriReference(original, false);
+ }
+
+
+ /**
+ * Construct a URI from the given string.
+ *
+ * An URI can be placed within double-quotes or angle brackets like
+ * "http://test.com/" and <http://test.com/>
+ *
+ * @param original the string to be represented to URI character sequence
+ * It is one of absoluteURI and relativeURI.
+ * @throws URIException If the URI cannot be created.
+ * @see #getDefaultProtocolCharset
+ *
+ * @deprecated Use #URI(String, boolean)
+ */
+ public URI(String original) throws URIException {
+ parseUriReference(original, false);
+ }
+
+
+ /**
+ * Construct a general URI from the given components.
+ *
+ * It's for absolute URI = {@code <scheme>:<path>?<query>#<fragment>}
+ * and relative URI = {@code <path>?<query>#<fragment>}; a non-null
+ * authority is emitted as {@code //<authority>} ahead of the path.
+ *
+ * @param scheme the scheme string
+ * @param authority the authority string
+ * @param path the path string
+ * @param query the query string
+ * @param fragment the fragment string
+ * @throws URIException If the new URI cannot be created, e.g. a path without a leading "/" is combined with a scheme or authority.
+ * @see #getDefaultProtocolCharset
+ */
+ public URI(String scheme, String authority, String path, String query,
+ String fragment) throws URIException {
+
+ // validate and construct the URI character sequence
+ StringBuilder buff = new StringBuilder(); // local only, no need for StringBuffer's synchronization
+ if (scheme != null) {
+ buff.append(scheme);
+ buff.append(':');
+ }
+ if (authority != null) {
+ buff.append("//");
+ buff.append(authority);
+ }
+ if (path != null) { // accept empty path
+ if ((scheme != null || authority != null)
+ && !path.startsWith("/")) {
+ throw new URIException(URIException.PARSING,
+ "abs_path requested");
+ }
+ buff.append(path);
+ }
+ if (query != null) {
+ buff.append('?');
+ buff.append(query);
+ }
+ if (fragment != null) {
+ buff.append('#');
+ buff.append(fragment);
+ }
+ parseUriReference(buff.toString(), false);
+ }
+
+
+ /**
+ * Construct a general URI from the given components.
+ *
+ * @param scheme the scheme string
+ * @param userinfo the userinfo string
+ * @param host the host string
+ * @param port the port number
+ * @throws URIException If the new URI cannot be created.
+ * @see #getDefaultProtocolCharset
+ */
+ public URI(String scheme, String userinfo, String host, int port)
+ throws URIException {
+
+ this(scheme, userinfo, host, port, null, null, null);
+ }
+
+
+ /**
+ * Construct a general URI from the given components.
+ *
+ * @param scheme the scheme string
+ * @param userinfo the userinfo string
+ * @param host the host string
+ * @param port the port number
+ * @param path the path string
+ * @throws URIException If the new URI cannot be created.
+ * @see #getDefaultProtocolCharset
+ */
+ public URI(String scheme, String userinfo, String host, int port,
+ String path) throws URIException {
+
+ this(scheme, userinfo, host, port, path, null, null);
+ }
+
+
+ /**
+ * Construct a general URI from the given components.
+ *
+ * @param scheme the scheme string
+ * @param userinfo the userinfo string
+ * @param host the host string
+ * @param port the port number
+ * @param path the path string
+ * @param query the query string
+ * @throws URIException If the new URI cannot be created.
+ * @see #getDefaultProtocolCharset
+ */
+ public URI(String scheme, String userinfo, String host, int port,
+ String path, String query) throws URIException {
+
+ this(scheme, userinfo, host, port, path, query, null);
+ }
+
+
+ /**
+ * Construct a general URI from the given components.
+ *
+ * @param scheme the scheme string
+ * @param userinfo the userinfo string
+ * @param host the host string
+ * @param port the port number
+ * @param path the path string
+ * @param query the query string
+ * @param fragment the fragment string
+ * @throws URIException If the new URI cannot be created.
+ * @see #getDefaultProtocolCharset
+ */
+ public URI(String scheme, String userinfo, String host, int port,
+ String path, String query, String fragment) throws URIException {
+
+ this(scheme, (host == null) ? null
+ : ((userinfo != null) ? userinfo + '@' : "") + host
+ + ((port != -1) ? ":" + port : ""), path, query, fragment);
+ }
+
+
+ /**
+ * Construct a general URI from the given components.
+ *
+ * @param scheme the scheme string
+ * @param host the host string
+ * @param path the path string
+ * @param fragment the fragment string
+ * @throws URIException If the new URI cannot be created.
+ * @see #getDefaultProtocolCharset
+ */
+ public URI(String scheme, String host, String path, String fragment)
+ throws URIException {
+
+ this(scheme, host, path, null, fragment);
+ }
+
+
+ /**
+ * Construct a general URI with the given relative URI string.
+ *
+ * @param base the base URI
+ * @param relative the relative URI string
+ * @throws URIException If the new URI cannot be created.
+ *
+ * @deprecated Use #URI(URI, String, boolean)
+ */
+ public URI(URI base, String relative) throws URIException {
+ this(base, new URI(relative));
+ }
+
+
+ /**
+ * Construct a general URI with the given relative URI string.
+ *
+ * @param base the base URI
+ * @param relative the relative URI string
+ * @param escaped true if URI character sequence is in escaped form.
+ * false otherwise.
+ *
+ * @throws URIException If the new URI cannot be created.
+ *
+ * @since 3.0
+ */
+ public URI(URI base, String relative, boolean escaped) throws URIException {
+ this(base, new URI(relative, escaped));
+ }
+
+
+ /**
+ * Construct a general URI with the given relative URI.
+ *
+ * Resolving Relative References to Absolute Form.
+ *
+ * Examples of Resolving Relative URI References
+ *
+ * Within an object with a well-defined base URI of
+ *
+ * http://a/b/c/d;p?q
+ *
+ * the relative URI would be resolved as follows:
+ *
+ * Normal Examples
+ *
+ *
+ */
+ protected static final BitSet mark = new BitSet(256);
+ // Static initializer for mark
+ static {
+ mark.set('-');
+ mark.set('_');
+ mark.set('.');
+ mark.set('!');
+ mark.set('~');
+ mark.set('*');
+ mark.set('\'');
+ mark.set('(');
+ mark.set(')');
+ }
+
+
+ /**
+ * Data characters that are allowed in a URI but do not have a reserved
+ * purpose are called unreserved.
+ *
+ * unreserved = alphanum | mark
+ *
+ */
+ protected static final BitSet unreserved = new BitSet(256);
+ // Static initializer for unreserved
+ static {
+ unreserved.or(alphanum);
+ unreserved.or(mark);
+ }
+
+
+ /**
+ * BitSet for reserved.
+ *
+ */
+ protected static final BitSet uric_no_slash = new BitSet(256);
+ // Static initializer for uric_no_slash; RFC 2396: unreserved | escaped | ";" | "?" | ":" | "@" | "&" | "=" | "+" | "$" | ","
+ static {
+ uric_no_slash.or(unreserved);
+ uric_no_slash.or(escaped);
+ uric_no_slash.set(';');
+ uric_no_slash.set('?');
+ uric_no_slash.set(':'); // RFC 2396 lists ':' here; ';' is already set above
+ uric_no_slash.set('@');
+ uric_no_slash.set('&');
+ uric_no_slash.set('=');
+ uric_no_slash.set('+');
+ uric_no_slash.set('$');
+ uric_no_slash.set(',');
+ }
+
+
+ /**
+ * URI bitset that combines uric_no_slash and uric.
+ *
+ * opaque_part = uric_no_slash *uric
+ *
+ */
+ protected static final BitSet opaque_part = new BitSet(256);
+ // Static initializer for opaque_part
+ static {
+ // it's generous. because first character must not include a slash
+ opaque_part.or(uric_no_slash);
+ opaque_part.or(uric);
+ }
+
+
+ /**
+ * URI bitset that combines absolute path and opaque part.
+ *
+ * path = [ abs_path | opaque_part ]
+ *
+ */
+ protected static final BitSet path = new BitSet(256);
+ // Static initializer for path
+ static {
+ path.or(abs_path);
+ path.or(opaque_part);
+ }
+
+
+ /**
+ * Port, a logical alias for digit.
+ */
+ protected static final BitSet port = digit;
+
+
+ /**
+ * Bitset that combines digit and dot fo IPv$address.
+ *
+ */
+ protected static final BitSet URI_reference = new BitSet(256);
+ // Static initializer for URI_reference
+ static {
+ URI_reference.or(absoluteURI);
+ URI_reference.or(relativeURI);
+ URI_reference.set('#');
+ URI_reference.or(fragment);
+ }
+
+ // ---------------------------- Characters disallowed within the URI syntax
+ // Excluded US-ASCII Characters are like control, space, delims and unwise
+
+ /**
+ * BitSet for control.
+ */
+ public static final BitSet control = new BitSet(256);
+ // Static initializer for control
+ static {
+ for (int i = 0; i <= 0x1F; i++) {
+ control.set(i);
+ }
+ control.set(0x7F);
+ }
+
+ /**
+ * BitSet for space.
+ */
+ public static final BitSet space = new BitSet(256);
+ // Static initializer for space
+ static {
+ space.set(0x20);
+ }
+
+
+ /**
+ * BitSet for delims.
+ */
+ public static final BitSet delims = new BitSet(256);
+ // Static initializer for delims
+ static {
+ delims.set('<');
+ delims.set('>');
+ delims.set('#');
+ delims.set('%');
+ delims.set('"');
+ }
+
+
+ /**
+ * BitSet for unwise.
+ */
+ public static final BitSet unwise = new BitSet(256);
+ // Static initializer for unwise
+ static {
+ unwise.set('{');
+ unwise.set('}');
+ unwise.set('|');
+ unwise.set('\\');
+ unwise.set('^');
+ unwise.set('[');
+ unwise.set(']');
+ unwise.set('`');
+ }
+
+
+ /**
+ * Disallowed rel_path before escaping.
+ */
+ public static final BitSet disallowed_rel_path = new BitSet(256);
+ // Static initializer for disallowed_rel_path
+ static {
+ disallowed_rel_path.or(uric);
+ disallowed_rel_path.andNot(rel_path);
+ }
+
+
+ /**
+ * Disallowed opaque_part before escaping.
+ */
+ public static final BitSet disallowed_opaque_part = new BitSet(256);
+ // Static initializer for disallowed_opaque_part
+ static {
+ disallowed_opaque_part.or(uric);
+ disallowed_opaque_part.andNot(opaque_part);
+ }
+
+ // ----------------------- Characters allowed within and for each component
+
+ /**
+ * Those characters that are allowed for the authority component.
+ */
+ public static final BitSet allowed_authority = new BitSet(256);
+ // Static initializer for allowed_authority
+ static {
+ allowed_authority.or(authority);
+ allowed_authority.clear('%');
+ }
+
+
+ /**
+ * Those characters that are allowed for the opaque_part.
+ */
+ public static final BitSet allowed_opaque_part = new BitSet(256);
+ // Static initializer for allowed_opaque_part
+ static {
+ allowed_opaque_part.or(opaque_part);
+ allowed_opaque_part.clear('%');
+ }
+
+
+ /**
+ * Those characters that are allowed for the reg_name.
+ */
+ public static final BitSet allowed_reg_name = new BitSet(256);
+ // Static initializer for allowed_reg_name
+ static {
+ allowed_reg_name.or(reg_name);
+ // allowed_reg_name.andNot(percent);
+ allowed_reg_name.clear('%');
+ }
+
+
+ /**
+ * Those characters that are allowed for the userinfo component.
+ */
+ public static final BitSet allowed_userinfo = new BitSet(256);
+ // Static initializer for allowed_userinfo
+ static {
+ allowed_userinfo.or(userinfo);
+ // allowed_userinfo.andNot(percent);
+ allowed_userinfo.clear('%');
+ }
+
+
+ /**
+ * Those characters that are allowed for within the userinfo component.
+ */
+ public static final BitSet allowed_within_userinfo = new BitSet(256);
+ // Static initializer for allowed_within_userinfo
+ static {
+ allowed_within_userinfo.or(within_userinfo);
+ allowed_within_userinfo.clear('%');
+ }
+
+
+ /**
+ * Those characters that are allowed for the IPv6reference component.
+ * The characters '[', ']' in IPv6reference should be excluded.
+ */
+ public static final BitSet allowed_IPv6reference = new BitSet(256);
+ // Static initializer for allowed_IPv6reference
+ static {
+ allowed_IPv6reference.or(IPv6reference);
+ // allowed_IPv6reference.andNot(unwise);
+ allowed_IPv6reference.clear('[');
+ allowed_IPv6reference.clear(']');
+ }
+
+
+ /**
+ * Those characters that are allowed for the host component.
+ * The characters '[', ']' in IPv6reference should be excluded.
+ */
+ public static final BitSet allowed_host = new BitSet(256);
+ // Static initializer for allowed_host
+ static {
+ allowed_host.or(hostname);
+ allowed_host.or(allowed_IPv6reference);
+ }
+
+
+ /**
+ * Those characters that are allowed for the authority component.
+ */
+ public static final BitSet allowed_within_authority = new BitSet(256);
+ // Static initializer for allowed_within_authority
+ static {
+ allowed_within_authority.or(server);
+ allowed_within_authority.or(reg_name);
+ allowed_within_authority.clear(';');
+ allowed_within_authority.clear(':');
+ allowed_within_authority.clear('@');
+ allowed_within_authority.clear('?');
+ allowed_within_authority.clear('/');
+ }
+
+
+ /**
+ * Those characters that are allowed for the abs_path.
+ */
+ public static final BitSet allowed_abs_path = new BitSet(256);
+ // Static initializer for allowed_abs_path
+ static {
+ allowed_abs_path.or(abs_path);
+ // allowed_abs_path.set('/'); // already included
+ allowed_abs_path.andNot(percent);
+ allowed_abs_path.clear('+');
+ }
+
+
+ /**
+ * Those characters that are allowed for the rel_path.
+ */
+ public static final BitSet allowed_rel_path = new BitSet(256);
+ // Static initializer for allowed_rel_path
+ static {
+ allowed_rel_path.or(rel_path);
+ allowed_rel_path.clear('%');
+ allowed_rel_path.clear('+');
+ }
+
+
+ /**
+ * Those characters that are allowed within the path.
+ */
+ public static final BitSet allowed_within_path = new BitSet(256);
+ // Static initializer for allowed_within_path
+ static {
+ allowed_within_path.or(abs_path);
+ allowed_within_path.clear('/');
+ allowed_within_path.clear(';');
+ allowed_within_path.clear('=');
+ allowed_within_path.clear('?');
+ }
+
+
+ /**
+ * Those characters that are allowed for the query component.
+ */
+ public static final BitSet allowed_query = new BitSet(256);
+ // Static initializer for allowed_query
+ static {
+ allowed_query.or(uric);
+ allowed_query.clear('%');
+ }
+
+
+ /**
+ * Those characters that are allowed within the query component.
+ */
+ public static final BitSet allowed_within_query = new BitSet(256);
+ // Static initializer for allowed_within_query
+ static {
+ allowed_within_query.or(allowed_query);
+ allowed_within_query.andNot(reserved); // excluded 'reserved'
+ }
+
+
+ /**
+ * Those characters that are allowed for the fragment component.
+ */
+ public static final BitSet allowed_fragment = new BitSet(256);
+ // Static initializer for allowed_fragment
+ static {
+ allowed_fragment.or(uric);
+ allowed_fragment.clear('%');
+ }
+
+ // ------------------------------------------- Flags for this URI-reference
+
+ // TODO: Figure out what all these variables are for and provide javadoc
+
+ // URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
+ // absoluteURI = scheme ":" ( hier_part | opaque_part )
+ protected boolean _is_hier_part;
+ protected boolean _is_opaque_part;
+ // relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ]
+ // hier_part = ( net_path | abs_path ) [ "?" query ]
+ protected boolean _is_net_path;
+ protected boolean _is_abs_path;
+ protected boolean _is_rel_path;
+ // net_path = "//" authority [ abs_path ]
+ // authority = server | reg_name
+ protected boolean _is_reg_name;
+ protected boolean _is_server; // = _has_server
+ // server = [ [ userinfo "@" ] hostport ]
+ // host = hostname | IPv4address | IPv6reference
+ protected boolean _is_hostname;
+ protected boolean _is_IPv4address;
+ protected boolean _is_IPv6reference;
+
+ // ------------------------------------------ Character and escape encoding
+
+ /**
+ * Encodes URI string.
+ *
+ * This is a two-step mapping: first from original characters to octets,
+ * and then from octets to URI characters:
+ *
+ * original character sequence->octet sequence->URI character sequence
+ *
+ *
+ * An escaped octet is encoded as a character triplet, consisting of the
+ * percent character "%" followed by the two hexadecimal digits
+ * representing the octet code. For example, "%20" is the escaped
+ * encoding for the US-ASCII space character.
+ *
+ * Conversion from the local filesystem character set to UTF-8 will
+ * normally involve a two step process. First convert the local character
+ * set to the UCS; then convert the UCS to UTF-8.
+ * The first step in the process can be performed by maintaining a mapping
+ * table that includes the local character set code and the corresponding
+ * UCS code.
+ * The next step is to convert the UCS character code to the UTF-8 encoding.
+ *
+ * Mapping between vendor codepages can be done in a very similar manner
+ * as described above.
+ *
+ * The only time escape encodings can allowedly be made is when a URI is
+ * being created from its component parts. The escape and validate methods
+ * are internally performed within this method.
+ *
+ * @param original the original character sequence
+ * @param allowed those characters that are allowed within a component
+ * @param charset the protocol charset
+ * @return URI character sequence
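+ * @throws URIException declared for API compatibility; as written here, a null original or allowed set raises IllegalArgumentException and an unsupported charset falls back to UTF-8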
+ */
+
+ protected static char[] encode(String original, BitSet allowed,
+ String charset) throws URIException {
+ if (original == null) {
+ throw new IllegalArgumentException("Original string may not be null");
+ }
+ if (allowed == null) {
+ throw new IllegalArgumentException("Allowed bitset may not be null");
+ }
+ byte[] rawdata = URLCodec.encodeUrl(allowed, getBytes(original, charset));
+ return new String(rawdata, StandardCharsets.US_ASCII).toCharArray();
+ }
+
+ private static byte[] getBytes(String original, String charset) {
+ try {
+ return original.getBytes(charset);
+ } catch (UnsupportedEncodingException e) {
+ return original.getBytes(UTF_8);
+ }
+ }
+
+ /**
+ * Decodes URI encoded string.
+ *
+ * This is a two-step mapping: first from URI characters to octets,
+ * and then from octets to original characters:
+ *
+ * URI character sequence->octet sequence->original character sequence
+ *
+ *
+ * A URI must be separated into its components before the escaped
+ * characters within those components can be allowedly decoded.
+ *
+ * Notice that there is a chance that URI characters that are non UTF-8
+ * may be parsed as valid UTF-8. A recent non-scientific analysis found
+ * that EUC encoded Japanese words had a 2.7% false reading; SJIS had a
+ * 0.0005% false reading; other encoding such as ASCII or KOI-8 have a 0%
+ * false reading.
+ *
+ * The percent "%" character always has the reserved purpose of being
+ * the escape indicator, it must be escaped as "%25" in order to be used
+ * as data within a URI.
+ *
+ * The unescape method is internally performed within this method.
+ *
+ * @param component the URI character sequence
+ * @param charset the protocol charset
+ * @return original character sequence
+ * @throws URIException incomplete trailing escape pattern or unsupported
+ * character encoding
+ */
+ protected static String decode(char[] component, String charset)
+ throws URIException {
+ if (component == null) {
+ throw new IllegalArgumentException("Component array of chars may not be null");
+ }
+ return decode(new String(component), charset);
+ }
+
+ /**
+ * Decodes URI encoded string.
+ *
+ * This is a two-step mapping: first from URI characters to octets,
+ * and then from octets to original characters:
+ *
+ * URI character sequence->octet sequence->original character sequence
+ *
+ *
+ * A URI must be separated into its components before the escaped
+ * characters within those components can be safely decoded.
+ *
+ * Notice that there is a chance that URI characters that are non UTF-8
+ * may be parsed as valid UTF-8. A recent non-scientific analysis found
+ * that EUC encoded Japanese words had a 2.7% false reading; SJIS had a
+ * 0.0005% false reading; other encoding such as ASCII or KOI-8 have a 0%
+ * false reading.
+ *
+ * The percent "%" character always has the reserved purpose of being
+ * the escape indicator, it must be escaped as "%25" in order to be used
+ * as data within a URI.
+ *
+ * The unescape method is internally performed within this method.
+ *
+ * @param component the URI character sequence
+ * @param charset the protocol charset name; if it is null, illegal or unsupported the octets are decoded as US-ASCII
+ * @return original character sequence
+ * @throws URIException if the escape sequences cannot be decoded, e.g. an
+ * incomplete trailing escape pattern
+ *
+ * @since 3.0
+ */
+ protected static String decode(String component, String charset)
+ throws URIException {
+ if (component == null) {
+ throw new IllegalArgumentException("Component array of chars may not be null");
+ }
+ byte[] rawdata = null;
+ try {
+ rawdata = URLCodec.decodeUrl(component.getBytes(StandardCharsets.US_ASCII));
+ } catch (DecoderException e) {
+ throw new URIException(e.getMessage());
+ }
+ try {
+ Charset cs = Charset.forName(charset);
+ return new String(rawdata, cs);
+ } catch (IllegalArgumentException e) { // covers IllegalCharsetNameException, UnsupportedCharsetException and a null name
+ return new String(rawdata, StandardCharsets.US_ASCII);
+ }
+ }
+
+ /**
+ * Pre-validate the unescaped URI string within a specific component.
+ *
+ * @param component the component string within the component
+ * @param disallowed those characters disallowed within the component
+ * @return if true, it doesn't have the disallowed characters
+ * if false, the component is undefined or an incorrect one
+ */
+ protected boolean prevalidate(String component, BitSet disallowed) {
+ // prevalidate the given component by disallowed characters
+ if (component == null) {
+ return false; // undefined
+ }
+ char[] target = component.toCharArray();
+ for (int i = 0; i < target.length; i++) {
+ if (disallowed.get(target[i])) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+
+ /**
+ * Validate the URI characters within a specific component.
+ * The component must be performed after escape encoding. Or it doesn't
+ * include escaped characters.
+ *
+ * @param component the characters sequence within the component
+ * @param generous those characters that are allowed within a component
+ * @return if true, it's the correct URI character sequence
+ */
+ protected boolean validate(char[] component, BitSet generous) {
+ // validate each component by generous characters
+ return validate(component, 0, -1, generous);
+ }
+
+
+ /**
+ * Validate the URI characters within a specific component.
+ * The component must be performed after escape encoding. Or it doesn't
+ * include escaped characters.
+ *
+ * It's not that much strict, generous. The strict validation might be
+ * performed before being called this method.
+ *
+ * @param component the characters sequence within the component
+ * @param soffset the starting offset of the given component
+ * @param eoffset the ending offset of the given component
+ * if -1, it means the length of the component
+ * @param generous those characters that are allowed within a component
+ * @return if true, it's the correct URI character sequence
+ */
+ protected boolean validate(char[] component, int soffset, int eoffset,
+ BitSet generous) {
+ // validate each component by generous characters
+ if (eoffset == -1) {
+ eoffset = component.length - 1;
+ }
+ for (int i = soffset; i <= eoffset; i++) {
+ if (!generous.get(component[i])) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+
+ /**
+ * In order to avoid any possibility of conflict with non-ASCII characters,
+ * Parse a URI reference as a String with the character
+ * encoding of the local system or the document.
+ *
+ * The following line is the regular expression for breaking-down a URI
+ * reference into its components.
+ *
+ * For example, matching the above expression to
+ * http://jakarta.apache.org/ietf/uri/#Related
+ * results in the following subexpression matches:
+ *
+ */
+ if (0 <= at && at + 1 < length && tmp.charAt(at) == '?') {
+ int next = tmp.indexOf('#', at + 1);
+ if (next == -1) {
+ next = tmp.length();
+ }
+ if (escaped) {
+ _query = tmp.substring(at + 1, next).toCharArray();
+ if (!validate(_query, uric)) {
+ throw new URIException("Invalid query");
+ }
+ } else {
+ _query = encode(tmp.substring(at + 1, next), allowed_query, charset);
+ }
+ at = next;
+ }
+
+ /*
+ * Parse the fragment component.
+ *
+ * fragment = $9 = Related
+ * @@@@@@@@
+ * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
+ *
+ */
+ if (0 <= at && at + 1 <= length && tmp.charAt(at) == '#') {
+ if (at + 1 == length) { // empty fragment
+ _fragment = "".toCharArray();
+ } else {
+ _fragment = (escaped) ? tmp.substring(at + 1).toCharArray()
+ : encode(tmp.substring(at + 1), allowed_fragment, charset);
+ }
+ }
+
+ // set this URI.
+ setURI();
+ }
+
+
+ /**
+ * Get the earlier index that to be searched for the first occurrance in
+ * one of any of the given string.
+ *
+ * @param s the string to be indexed
+ * @param delims the delimiters used to index
+ * @return the earlier index if there are delimiters
+ */
+ protected int indexFirstOf(String s, String delims) {
+ return indexFirstOf(s, delims, -1);
+ }
+
+
+ /**
+ * Get the earlier index that to be searched for the first occurrance in
+ * one of any of the given string.
+ *
+ * @param s the string to be indexed
+ * @param delims the delimiters used to index
+ * @param offset the from index
+ * @return the earlier index if there are delimiters
+ */
+ protected int indexFirstOf(String s, String delims, int offset) {
+ if (s == null || s.length() == 0) {
+ return -1;
+ }
+ if (delims == null || delims.length() == 0) {
+ return -1;
+ }
+ // check boundaries
+ if (offset < 0) {
+ offset = 0;
+ } else if (offset > s.length()) {
+ return -1;
+ }
+ // s is never null
+ int min = s.length();
+ char[] delim = delims.toCharArray();
+ for (int i = 0; i < delim.length; i++) {
+ int at = s.indexOf(delim[i], offset);
+ if (at >= 0 && at < min) {
+ min = at;
+ }
+ }
+ return (min == s.length()) ? -1 : min;
+ }
+
+
+ /**
+ * Get the earlier index that to be searched for the first occurrance in
+ * one of any of the given array.
+ *
+ * @param s the character array to be indexed
+ * @param delim the delimiter used to index
+ * @return the ealier index if there are a delimiter
+ */
+ protected int indexFirstOf(char[] s, char delim) {
+ return indexFirstOf(s, delim, 0);
+ }
+
+
+    /**
+     * Find the first index, at or after the given offset, at which the
+     * delimiter occurs in the array.
+     *
+     * @param s the character array to be searched
+     * @param delim the delimiter to look for
+     * @param offset the index to start from; negative values are treated as 0
+     * @return the earliest index of the delimiter, or -1 if it is not found,
+     * if the array is null or empty, or if offset exceeds the array length
+     */
+    protected int indexFirstOf(char[] s, char delim, int offset) {
+        if (s == null || s.length == 0 || offset > s.length) {
+            return -1;
+        }
+        int pos = (offset < 0) ? 0 : offset;
+        while (pos < s.length) {
+            if (s[pos] == delim) {
+                return pos;
+            }
+            pos++;
+        }
+        return -1;
+    }
+
+
+ /**
+ * Parse the authority component and store its parts in this URI.
+ *
+ * All authority-type flags are cleared first. The text before the first
+ * '@', if any, becomes the userinfo. A '[' after that point selects the
+ * IPv6reference branch; otherwise the host runs up to the first ':' and is
+ * classified as IPv4address, hostname or, failing both, reg_name. For a
+ * reg_name the whole original string becomes the authority; otherwise the
+ * port is parsed and the authority is rebuilt from userinfo, host and port.
+ *
+ * Note that _userinfo and _port are only assigned when the corresponding
+ * part is present in original; values already stored in those fields are
+ * otherwise left untouched and are still used when rebuilding the authority.
+ *
+ * @param original the original character sequence of authority component
+ * @param escaped true if original is escaped
+ * @throws URIException if an IPv6reference has no closing ']', if the port
+ * is not a parsable integer, or if an escaped reg_name fails validation;
+ * presumably also when encode fails (encode is defined elsewhere)
+ */
+ protected void parseAuthority(String original, boolean escaped)
+ throws URIException {
+
+ // Reset flags
+ _is_reg_name = _is_server =
+ _is_hostname = _is_IPv4address = _is_IPv6reference = false;
+
+ // set the charset to do escape encoding
+ String charset = getProtocolCharset();
+
+ boolean hasPort = true; // cleared below when no ':' follows the host
+ int from = 0; // index where the host part starts
+ int next = original.indexOf('@');
+ if (next != -1) { // an '@' is present (index 0 included): has userinfo
+ // each protocol extended from URI supports the specific userinfo
+ _userinfo = (escaped) ? original.substring(0, next).toCharArray()
+ : encode(original.substring(0, next), allowed_userinfo,
+ charset);
+ from = next + 1;
+ }
+ next = original.indexOf('[', from);
+ if (next >= from) {
+ next = original.indexOf(']', from);
+ if (next == -1) {
+ throw new URIException(URIException.PARSING, "IPv6reference");
+ } else {
+ next++; // step past the closing ']'
+ }
+ // NOTE: the slice below ends just after ']', so the brackets stay in _host
+ _host = (escaped) ? original.substring(from, next).toCharArray()
+ : encode(original.substring(from, next), allowed_IPv6reference,
+ charset);
+ // Set flag
+ _is_IPv6reference = true;
+ } else { // only for !_is_IPv6reference
+ next = original.indexOf(':', from);
+ if (next == -1) {
+ next = original.length();
+ hasPort = false;
+ }
+ // REMINDME: it doesn't need the pre-validation
+ _host = original.substring(from, next).toCharArray();
+ if (validate(_host, IPv4address)) {
+ // Set flag
+ _is_IPv4address = true;
+ } else if (validate(_host, hostname)) {
+ // Set flag
+ _is_hostname = true;
+ } else {
+ // Set flag
+ _is_reg_name = true;
+ }
+ }
+ if (_is_reg_name) {
+ // Reset flags for a server-based naming authority
+ _is_server = _is_hostname = _is_IPv4address =
+ _is_IPv6reference = false;
+ // set a registry-based naming authority
+ if (escaped) {
+ _authority = original.toCharArray();
+ if (!validate(_authority, reg_name)) {
+ throw new URIException("Invalid authority");
+ }
+ } else {
+ _authority = encode(original, allowed_reg_name, charset);
+ }
+ } else {
+ if (original.length() - 1 > next && hasPort
+ && original.charAt(next) == ':') { // not empty
+ from = next + 1;
+ try {
+ _port = Integer.parseInt(original.substring(from));
+ } catch (NumberFormatException error) {
+ throw new URIException(URIException.PARSING,
+ "invalid port number");
+ }
+ }
+ // set a server-based naming authority
+ StringBuffer buf = new StringBuffer();
+ if (_userinfo != null) { // has_userinfo
+ buf.append(_userinfo);
+ buf.append('@');
+ }
+ if (_host != null) {
+ buf.append(_host);
+ if (_port != -1) { // -1 is treated as "no port"
+ buf.append(':');
+ buf.append(_port);
+ }
+ }
+ _authority = buf.toString().toCharArray();
+ // Set flag
+ _is_server = true;
+ }
+ }
+
+
+    /**
+     * Rebuild the cached raw URI from the parsed components and reset the
+     * cached hash code. The fragment identifier is not included.
+     *
+     * @see #getRawURI
+     */
+    protected void setURI() {
+        // ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
+        StringBuilder out = new StringBuilder();
+        if (_scheme != null) {
+            out.append(_scheme).append(':');
+        }
+        if (_is_net_path) {
+            out.append("//");
+            if (_authority != null) { // has_authority
+                out.append(_authority);
+            }
+        }
+        if (_opaque != null && _is_opaque_part) {
+            out.append(_opaque);
+        } else if (_path != null && _path.length != 0) {
+            // _is_hier_part or _is_relativeURI
+            out.append(_path);
+        }
+        if (_query != null) { // has_query
+            out.append('?').append(_query);
+        }
+        // the fragment identifier is deliberately left out
+        _uri = out.toString().toCharArray();
+        hash = 0;
+    }
+
+ // ----------------------------------------------------------- Test methods
+
+
+    /**
+     * Tell whether or not this URI is absolute, i.e. has a scheme.
+     *
+     * @return true if and only if this URI is an absoluteURI
+     */
+    public boolean isAbsoluteURI() {
+        return this._scheme != null;
+    }
+
+
+    /**
+     * Tell whether or not this URI is relative, i.e. has no scheme.
+     *
+     * @return true if and only if this URI is a relativeURI
+     */
+    public boolean isRelativeURI() {
+        return this._scheme == null;
+    }
+
+
+    /**
+     * Tell whether or not the absoluteURI of this URI is a hier_part.
+     *
+     * @return true if and only if the absoluteURI is a hier_part
+     */
+    public boolean isHierPart() {
+        return this._is_hier_part;
+    }
+
+
+    /**
+     * Tell whether or not the absoluteURI of this URI is an opaque_part.
+     *
+     * @return true if and only if the absoluteURI is an opaque_part
+     */
+    public boolean isOpaquePart() {
+        return this._is_opaque_part;
+    }
+
+
+    /**
+     * Tell whether or not the relativeURI or hier_part of this URI is a
+     * net_path. It answers the same question as {@link #hasAuthority}.
+     *
+     * @return true if and only if the net_path flag is set or an authority
+     * is present
+     * @see #hasAuthority
+     */
+    public boolean isNetPath() {
+        if (_is_net_path) {
+            return true;
+        }
+        return _authority != null;
+    }
+
+
+    /**
+     * Tell whether or not the relativeURI or hier_part of this URI is an
+     * abs_path.
+     *
+     * @return true if and only if the relativeURI or hier_part is an abs_path
+     */
+    public boolean isAbsPath() {
+        return this._is_abs_path;
+    }
+
+
+    /**
+     * Tell whether or not the relativeURI of this URI is a rel_path.
+     *
+     * @return true if and only if the relativeURI is a rel_path
+     */
+    public boolean isRelPath() {
+        return this._is_rel_path;
+    }
+
+
+    /**
+     * Tell whether or not this URI has an authority.
+     * It answers the same question as {@link #isNetPath}.
+     *
+     * @return true if and only if an authority is present or the net_path
+     * flag is set
+     * @see #isNetPath
+     */
+    public boolean hasAuthority() {
+        if (_authority != null) {
+            return true;
+        }
+        return _is_net_path;
+    }
+
+    /**
+     * Tell whether or not the authority component of this URI is a reg_name.
+     *
+     * @return true if and only if the authority component is a reg_name
+     */
+    public boolean isRegName() {
+        return this._is_reg_name;
+    }
+
+
+    /**
+     * Tell whether or not the authority component of this URI is a server.
+     *
+     * @return true if and only if the authority component is a server
+     */
+    public boolean isServer() {
+        return this._is_server;
+    }
+
+
+    /**
+     * Tell whether or not this URI has userinfo.
+     *
+     * @return true if and only if this URI has userinfo
+     */
+    public boolean hasUserinfo() {
+        return this._userinfo != null;
+    }
+
+
+    /**
+     * Tell whether or not the host part of this URI is a hostname.
+     *
+     * @return true if and only if the host part is a hostname
+     */
+    public boolean isHostname() {
+        return this._is_hostname;
+    }
+
+
+    /**
+     * Tell whether or not the host part of this URI is an IPv4address.
+     *
+     * @return true if and only if the host part is an IPv4address
+     */
+    public boolean isIPv4address() {
+        return this._is_IPv4address;
+    }
+
+
+    /**
+     * Tell whether or not the host part of this URI is an IPv6reference.
+     *
+     * @return true if and only if the host part is an IPv6reference
+     */
+    public boolean isIPv6reference() {
+        return this._is_IPv6reference;
+    }
+
+
+    /**
+     * Tell whether or not this URI has a query.
+     *
+     * @return true if and only if this URI has a query
+     */
+    public boolean hasQuery() {
+        return this._query != null;
+    }
+
+
+    /**
+     * Tell whether or not this URI has a fragment.
+     *
+     * @return true if and only if this URI has a fragment
+     */
+    public boolean hasFragment() {
+        return this._fragment != null;
+    }
+
+
+ // ---------------------------------------------------------------- Charset
+
+
+ /**
+ * Set the default charset of the protocol.
+ *
+ * The character set used to store files SHALL remain a local decision and
+ * MAY depend on the capability of local operating systems. Prior to the
+ * exchange of URIs they SHOULD be converted into a ISO/IEC 10646 format
+ * and UTF-8 encoded. This approach, while allowing international exchange
+ * of URIs, will still allow backward compatibility with older systems
+ * because the code set positions for ASCII characters are identical to the
+ * one byte sequence in UTF-8.
+ *
+ * An individual URI scheme may require a single charset, define a default
+ * charset, or provide a way to indicate the charset used.
+ *
+ *
+ * Always all the time, the setter method is always succeeded and throws
+ * DefaultCharsetChanged exception.
+ *
+ * So API programmer must follow the following way:
+ *
+ * import org.apache.util.URI$DefaultCharsetChanged;
+ * .
+ * .
+ * .
+ * try {
+ * URI.setDefaultProtocolCharset("UTF-8");
+ * } catch (DefaultCharsetChanged cc) {
+ * // CASE 1: the exception could be ignored, when it is set by user
+ * if (cc.getReasonCode() == DefaultCharsetChanged.PROTOCOL_CHARSET) {
+ * // CASE 2: let user know the default protocol charset changed
+ * } else {
+ * // CASE 2: let user know the default document charset changed
+ * }
+ * }
+ *
+ *
+ * The API programmer is responsible to set the correct charset.
+ * And each application should remember its own charset to support.
+ *
+ * @param charset the default charset for each protocol
+ * @throws DefaultCharsetChanged default charset changed
+ */
+    public static void setDefaultProtocolCharset(String charset)
+        throws DefaultCharsetChanged {
+        // The assignment always takes effect; the exception thrown right
+        // after it is how callers are notified of the change.
+        defaultProtocolCharset = charset;
+        final DefaultCharsetChanged notice = new DefaultCharsetChanged(
+            DefaultCharsetChanged.PROTOCOL_CHARSET,
+            "the default protocol charset changed");
+        throw notice;
+    }
+
+
+ /**
+ * Get the default charset of the protocol.
+ *
+ * An individual URI scheme may require a single charset, define a default
+ * charset, or provide a way to indicate the charset used.
+ *
+ * To work globally either requires support of a number of character sets
+ * and to be able to convert between them, or the use of a single preferred
+ * character set.
+ * For support of global compatibility it is STRONGLY RECOMMENDED that
+ * clients and servers use UTF-8 encoding when exchanging URIs.
+ *
+ * @return the default charset string
+ */
+    public static String getDefaultProtocolCharset() {
+        // class-wide default, used by instances without their own charset
+        final String current = defaultProtocolCharset;
+        return current;
+    }
+
+
+    /**
+     * Get the protocol charset used by this URI instance.
+     * If no charset was set for this instance (for example by a
+     * constructor), the default protocol charset is returned instead.
+     *
+     * @return the protocol charset string
+     * @see #getDefaultProtocolCharset
+     */
+    public String getProtocolCharset() {
+        if (protocolCharset == null) {
+            return defaultProtocolCharset;
+        }
+        return protocolCharset;
+    }
+
+
+ /**
+ * Set the default charset of the document.
+ *
+ * Notice that it will be possible to contain mixed characters (e.g.
+ * ftp://host/KoreanNamespace/ChineseResource). To handle the Bi-directional
+ * display of these character sets, the protocol charset could be simply
+ * used again. Because it's not yet implemented that the insertion of BIDI
+ * control characters at different points during composition is extracted.
+ *
+ *
+ * Always all the time, the setter method is always succeeded and throws
+ * DefaultCharsetChanged exception.
+ *
+ * So API programmer must follow the following way:
+ *
+ * import org.apache.util.URI$DefaultCharsetChanged;
+ * .
+ * .
+ * .
+ * try {
+ * URI.setDefaultDocumentCharset("EUC-KR");
+ * } catch (DefaultCharsetChanged cc) {
+ * // CASE 1: the exception could be ignored, when it is set by user
+ * if (cc.getReasonCode() == DefaultCharsetChanged.DOCUMENT_CHARSET) {
+ * // CASE 2: let user know the default document charset changed
+ * } else {
+ * // CASE 2: let user know the default protocol charset changed
+ * }
+ * }
+ *
+ *
+ * The API programmer is responsible to set the correct charset.
+ * And each application should remember its own charset to support.
+ *
+ * @param charset the default charset for the document
+ * @throws DefaultCharsetChanged default charset changed
+ */
+    public static void setDefaultDocumentCharset(String charset)
+        throws DefaultCharsetChanged {
+        // The assignment always takes effect; the exception thrown right
+        // after it is how callers are notified of the change.
+        defaultDocumentCharset = charset;
+        final DefaultCharsetChanged notice = new DefaultCharsetChanged(
+            DefaultCharsetChanged.DOCUMENT_CHARSET,
+            "the default document charset changed");
+        throw notice;
+    }
+
+
+    /**
+     * Get the recommended default charset of the document.
+     *
+     * @return the current value of the default document charset
+     */
+    public static String getDefaultDocumentCharset() {
+        final String current = defaultDocumentCharset;
+        return current;
+    }
+
+
+    /**
+     * Get the default charset of the document as determined by locale.
+     *
+     * @return the value of the locale-based default document charset
+     */
+    public static String getDefaultDocumentCharsetByLocale() {
+        final String current = defaultDocumentCharsetByLocale;
+        return current;
+    }
+
+
+    /**
+     * Get the default charset of the document as determined by platform.
+     *
+     * @return the value of the platform-based default document charset
+     */
+    public static String getDefaultDocumentCharsetByPlatform() {
+        final String current = defaultDocumentCharsetByPlatform;
+        return current;
+    }
+
+ // ------------------------------------------------------------- The scheme
+
+    /**
+     * Get the scheme as its internal character array.
+     *
+     * @return the scheme array itself (not a copy), or null if undefined
+     */
+    public char[] getRawScheme() {
+        return this._scheme;
+    }
+
+
+    /**
+     * Get the scheme as a string.
+     *
+     * @return the scheme, or null if the scheme is undefined
+     */
+    public String getScheme() {
+        if (_scheme == null) {
+            return null;
+        }
+        return new String(_scheme);
+    }
+
+ // ---------------------------------------------------------- The authority
+
+    /**
+     * Set the authority from a raw escaped character array and rebuild the
+     * URI. The authority can be one type of server, hostport, hostname,
+     * IPv4address, IPv6reference and reg_name.
+     *
+     *   authority = server | reg_name
+     *
+     * @param escapedAuthority the raw escaped authority
+     * @throws URIException If {@link
+     * #parseAuthority(String,boolean)} fails
+     * @throws NullPointerException null authority
+     */
+    public void setRawAuthority(char[] escapedAuthority)
+        throws URIException, NullPointerException {
+        final String authority = new String(escapedAuthority);
+        parseAuthority(authority, true);
+        setURI();
+    }
+
+
+    /**
+     * Set the authority from an escaped string and rebuild the URI.
+     * The authority can be one type of server, hostport, hostname,
+     * IPv4address, IPv6reference and reg_name.
+     * Note that there is no setAuthority method, for escape-encoding reasons.
+     *
+     * @param escapedAuthority the escaped authority string
+     * @throws URIException If {@link
+     * #parseAuthority(String,boolean)} fails
+     */
+    public void setEscapedAuthority(String escapedAuthority)
+        throws URIException {
+        final boolean alreadyEscaped = true;
+        parseAuthority(escapedAuthority, alreadyEscaped);
+        setURI();
+    }
+
+
+    /**
+     * Get the raw-escaped authority.
+     *
+     * @return the internal authority array (not a copy), or null if undefined
+     */
+    public char[] getRawAuthority() {
+        return this._authority;
+    }
+
+
+    /**
+     * Get the escaped authority as a string.
+     *
+     * @return the escaped authority, or null if undefined
+     */
+    public String getEscapedAuthority() {
+        if (_authority == null) {
+            return null;
+        }
+        return new String(_authority);
+    }
+
+
+    /**
+     * Get the authority, decoded with the protocol charset.
+     *
+     * @return the decoded authority, or null if undefined
+     * @throws URIException If {@link #decode} fails
+     */
+    public String getAuthority() throws URIException {
+        if (_authority == null) {
+            return null;
+        }
+        return decode(_authority, getProtocolCharset());
+    }
+
+ // ----------------------------------------------------------- The userinfo
+
+    /**
+     * Get the raw-escaped userinfo.
+     *
+     * @return the internal userinfo array (not a copy), or null if undefined
+     * @see #getAuthority
+     */
+    public char[] getRawUserinfo() {
+        return this._userinfo;
+    }
+
+
+    /**
+     * Get the escaped userinfo as a string.
+     *
+     * @return the escaped userinfo, or null if undefined
+     * @see #getAuthority
+     */
+    public String getEscapedUserinfo() {
+        if (_userinfo == null) {
+            return null;
+        }
+        return new String(_userinfo);
+    }
+
+
+    /**
+     * Get the userinfo, decoded with the protocol charset.
+     *
+     * @return the decoded userinfo, or null if undefined
+     * @throws URIException If {@link #decode} fails
+     * @see #getAuthority
+     */
+    public String getUserinfo() throws URIException {
+        if (_userinfo == null) {
+            return null;
+        }
+        return decode(_userinfo, getProtocolCharset());
+    }
+
+ // --------------------------------------------------------------- The host
+
+ /**
+ * Get the host.
+ *
+ *
+ * @return the escaped path string
+ */
+    public String getEscapedPath() {
+        // String form of whatever getRawPath() yields; null stays null
+        final char[] raw = getRawPath();
+        if (raw == null) {
+            return null;
+        }
+        return new String(raw);
+    }
+
+
+    /**
+     * Get the path, decoded with the protocol charset.
+     *
+     *   path = [ abs_path | opaque_part ]
+     *
+     * @return the decoded path string, or null if the raw path is null
+     * @throws URIException If {@link #decode} fails.
+     * @see #decode
+     */
+    public String getPath() throws URIException {
+        final char[] raw = getRawPath();
+        if (raw == null) {
+            return null;
+        }
+        return decode(raw, getProtocolCharset());
+    }
+
+
+    /**
+     * Get the raw-escaped basename of the path: everything after the last
+     * '/', or the whole path when it contains no '/'.
+     *
+     * @return a new array holding the raw-escaped basename, or null if the
+     * path is undefined
+     */
+    public char[] getRawName() {
+        if (_path == null) {
+            return null;
+        }
+        // walk back to the last '/'; ends at -1 when there is none
+        int slash = _path.length - 1;
+        while (slash >= 0 && _path[slash] != '/') {
+            slash--;
+        }
+        final int start = slash + 1;
+        final char[] basename = new char[_path.length - start];
+        System.arraycopy(_path, start, basename, 0, basename.length);
+        return basename;
+    }
+
+
+    /**
+     * Get the escaped basename of the path as a string.
+     *
+     * @return the escaped basename string, or null if the path is undefined
+     */
+    public String getEscapedName() {
+        final char[] raw = getRawName();
+        if (raw == null) {
+            return null;
+        }
+        return new String(raw);
+    }
+
+
+    /**
+     * Get the basename of the path, decoded with the protocol charset.
+     *
+     * @return the basename string, or null if the path is undefined
+     * @throws URIException incomplete trailing escape pattern or unsupported
+     * character encoding
+     * @see #decode
+     */
+    public String getName() throws URIException {
+        char[] basename = getRawName();
+        // Decode the array that was just null-checked rather than calling
+        // getRawName() a second time, which would rescan and recopy the path.
+        return (basename == null) ? null : decode(basename,
+            getProtocolCharset());
+    }
+
+ // ----------------------------------------------------- The path and query
+
+    /**
+     * Get the raw-escaped path and query, joined by '?' when a query exists.
+     *
+     * @return the raw-escaped path and query, or null if both are undefined
+     */
+    public char[] getRawPathQuery() {
+        final boolean noPath = (_path == null);
+        final boolean noQuery = (_query == null);
+        if (noPath && noQuery) {
+            return null;
+        }
+        StringBuilder joined = new StringBuilder();
+        if (!noPath) {
+            joined.append(_path);
+        }
+        if (!noQuery) {
+            joined.append('?').append(_query);
+        }
+        return joined.toString().toCharArray();
+    }
+
+
+    /**
+     * Get the escaped path and query as a string.
+     *
+     * @return the escaped path and query string, or null if both are
+     * undefined
+     */
+    public String getEscapedPathQuery() {
+        final char[] raw = getRawPathQuery();
+        if (raw == null) {
+            return null;
+        }
+        return new String(raw);
+    }
+
+
+    /**
+     * Get the path and query, decoded with the protocol charset.
+     *
+     * @return the path and query string, or null if both are undefined
+     * @throws URIException incomplete trailing escape pattern or unsupported
+     * character encoding
+     * @see #decode
+     */
+    public String getPathQuery() throws URIException {
+        final char[] raw = getRawPathQuery();
+        if (raw == null) {
+            return null;
+        }
+        return decode(raw, getProtocolCharset());
+    }
+
+ // -------------------------------------------------------------- The query
+
+ /**
+ * Set the raw-escaped query.
+ *
+ * @param escapedQuery the raw-escaped query
+ * @throws URIException escaped query not valid
+ */
+ public void setRawQuery(char[] escapedQuery) throws URIException {
+ if (escapedQuery == null || escapedQuery.length == 0) {
+ _query = escapedQuery;
+ setURI();
+ return;
+ }
+ // remove the fragment identifier
+ escapedQuery = removeFragmentIdentifier(escapedQuery);
+ if (!validate(escapedQuery, query)) {
+ throw new URIException(URIException.ESCAPING,
+ "escaped query not valid");
+ }
+ _query = escapedQuery;
+ setURI();
+ }
+
+
+    /**
+     * Set the escaped query string and rebuild the URI.
+     * A null argument clears the query.
+     *
+     * @param escapedQuery the escaped query string
+     * @throws URIException escaped query not valid
+     */
+    public void setEscapedQuery(String escapedQuery) throws URIException {
+        if (escapedQuery != null) {
+            setRawQuery(escapedQuery.toCharArray());
+            return;
+        }
+        _query = null;
+        setURI();
+    }
+
+
+ /**
+ * Set the query.
+ *
+ * When a query string is not misunderstood the reserved special characters
+ * ("&", "=", "+", ",", and "$") within a query component, it is
+ * recommended to use in encoding the whole query with this method.
+ *
+ * The additional APIs for the special purpose using by the reserved
+ * special characters used in each protocol are implemented in each protocol
+ * classes inherited from URI. So refer to the same-named APIs
+ * implemented in each specific protocol instance.
+ *
+ * @param query the query string.
+ * @throws URIException incomplete trailing escape pattern or unsupported
+ * character encoding
+ * @see #encode
+ */
+    public void setQuery(String query) throws URIException {
+        if (query == null) {
+            // null clears the query
+            _query = null;
+            setURI();
+        } else if (query.length() == 0) {
+            // an empty query is stored as an empty array, without encoding
+            _query = query.toCharArray();
+            setURI();
+        } else {
+            setRawQuery(encode(query, allowed_query, getProtocolCharset()));
+        }
+    }
+
+
+    /**
+     * Get the raw-escaped query.
+     *
+     * @return the internal query array (not a copy), or null if undefined
+     */
+    public char[] getRawQuery() {
+        return this._query;
+    }
+
+
+    /**
+     * Get the escaped query as a string.
+     *
+     * @return the escaped query string, or null if undefined
+     */
+    public String getEscapedQuery() {
+        if (_query == null) {
+            return null;
+        }
+        return new String(_query);
+    }
+
+
+    /**
+     * Get the query, decoded with the protocol charset.
+     *
+     * @return the query string, or null if undefined
+     * @throws URIException incomplete trailing escape pattern or unsupported
+     * character encoding
+     * @see #decode
+     */
+    public String getQuery() throws URIException {
+        if (_query == null) {
+            return null;
+        }
+        return decode(_query, getProtocolCharset());
+    }
+
+ // ----------------------------------------------------------- The fragment
+
+    /**
+     * Set the raw-escaped fragment and reset the cached hash code.
+     * A null or empty fragment is stored as given; anything else must pass
+     * validation first.
+     *
+     * @param escapedFragment the raw-escaped fragment
+     * @throws URIException escaped fragment not valid
+     */
+    public void setRawFragment(char[] escapedFragment) throws URIException {
+        final boolean nonEmpty =
+            (escapedFragment != null && escapedFragment.length != 0);
+        if (nonEmpty && !validate(escapedFragment, fragment)) {
+            throw new URIException(URIException.ESCAPING,
+                "escaped fragment not valid");
+        }
+        _fragment = escapedFragment;
+        hash = 0;
+    }
+
+
+    /**
+     * Set the escaped fragment string.
+     * A null argument clears the fragment and resets the cached hash code.
+     *
+     * @param escapedFragment the escaped fragment string
+     * @throws URIException escaped fragment not valid
+     */
+    public void setEscapedFragment(String escapedFragment) throws URIException {
+        if (escapedFragment != null) {
+            setRawFragment(escapedFragment.toCharArray());
+            return;
+        }
+        _fragment = null;
+        hash = 0;
+    }
+
+
+    /**
+     * Set the fragment from an unescaped string and reset the cached hash
+     * code. Null clears the fragment, an empty string is stored as an empty
+     * array, and anything else is escape-encoded with the protocol charset.
+     *
+     * @param fragment the fragment string.
+     * @throws URIException If an error occurs.
+     */
+    public void setFragment(String fragment) throws URIException {
+        if (fragment == null) {
+            _fragment = null;
+        } else if (fragment.length() == 0) {
+            _fragment = fragment.toCharArray();
+        } else {
+            _fragment = encode(fragment, allowed_fragment, getProtocolCharset());
+        }
+        hash = 0;
+    }
+
+
+ /**
+ * Get the raw-escaped fragment.
+ *
+ * The optional fragment identifier is not part of a URI, but is often used
+ * in conjunction with a URI.
+ *
+ * The format and interpretation of fragment identifiers is dependent on
+ * the media type [RFC2046] of the retrieval result.
+ *
+ * A fragment identifier is only meaningful when a URI reference is
+ * intended for retrieval and the result of that retrieval is a document
+ * for which the identified fragment is consistently defined.
+ *
+ * @return the raw-escaped fragment
+ */
+    public char[] getRawFragment() {
+        // hands out the internal array itself, not a copy; null when unset
+        return this._fragment;
+    }
+
+
+    /**
+     * Get the escaped fragment as a string.
+     *
+     * @return the escaped fragment string, or null if undefined
+     */
+    public String getEscapedFragment() {
+        if (_fragment == null) {
+            return null;
+        }
+        return new String(_fragment);
+    }
+
+
+    /**
+     * Get the fragment, decoded with the protocol charset.
+     *
+     * @return the fragment string, or null if undefined
+     * @throws URIException incomplete trailing escape pattern or unsupported
+     * character encoding
+     * @see #decode
+     */
+    public String getFragment() throws URIException {
+        if (_fragment == null) {
+            return null;
+        }
+        return decode(_fragment, getProtocolCharset());
+    }
+
+ // ------------------------------------------------------------- Utilities
+
+ /**
+ * Remove the fragment identifier of the given component.
+ *
+ * @param component the component that a fragment may be included
+ * @return the component that the fragment identifier is removed
+ */
+ protected char[] removeFragmentIdentifier(char[] component) {
+ if (component == null) {
+ return null;
+ }
+ int lastIndex = new String(component).indexOf('#');
+ if (lastIndex != -1) {
+ component = new String(component).substring(0,
+ lastIndex).toCharArray();
+ }
+ return component;
+ }
+
+
+ /**
+ * Normalize the given hier path part by resolving its "." and ".."
+ * segments.
+ *
+ * Algorithm taken from URI reference parser at
+ * http://www.apache.org/~fielding/uri/rev-2002/issues.html.
+ *
+ * @param path the path to normalize
+ * @return the normalized path, or null if path is null
+ * @throws URIException no more higher path level to be normalized
+ * (declared in the signature; this body contains no throw statement)
+ */
+ protected char[] normalize(char[] path) throws URIException {
+
+ if (path == null) {
+ return null;
+ }
+
+ String normalized = new String(path);
+
+ // If the buffer begins with "./" or "../", the "." or ".." is removed.
+ if (normalized.startsWith("./")) {
+ normalized = normalized.substring(1);
+ } else if (normalized.startsWith("../")) {
+ normalized = normalized.substring(2);
+ } else if (normalized.startsWith("..")) {
+ normalized = normalized.substring(2);
+ }
+
+ // All occurrences of "/./" in the buffer are replaced with "/"
+ int index = -1;
+ while ((index = normalized.indexOf("/./")) != -1) {
+ normalized = normalized.substring(0, index) + normalized.substring(index + 2);
+ }
+
+ // If the buffer ends with "/.", the "." is removed.
+ if (normalized.endsWith("/.")) {
+ normalized = normalized.substring(0, normalized.length() - 1);
+ }
+
+ int startIndex = 0; // search position; advanced past a "/../" that has no preceding segment
+
+ // All occurrences of "/<segment>/../" in the buffer, where ".."
+ // and <segment> are complete path segments, are iteratively replaced
+ // with "/" in order from left to right until no matching pattern remains.
+ // If the buffer ends with "/<segment>/..", that is also replaced
+ // with "/". Note that <segment> may be empty.
+ while ((index = normalized.indexOf("/../", startIndex)) != -1) {
+ int slashIndex = normalized.lastIndexOf('/', index - 1);
+ if (slashIndex >= 0) {
+ normalized = normalized.substring(0, slashIndex) + normalized.substring(index + 3);
+ } else {
+ startIndex = index + 3;
+ }
+ }
+ if (normalized.endsWith("/..")) {
+ int slashIndex = normalized.lastIndexOf('/', normalized.length() - 4);
+ if (slashIndex >= 0) {
+ normalized = normalized.substring(0, slashIndex + 1);
+ }
+ }
+
+ // All prefixes of "<segment>/../" in the buffer, where ".."
+ // and <segment> are complete path segments, are iteratively replaced
+ // with "/" in order from left to right until no matching pattern remains.
+ // If the buffer ends with "<segment>/..", that is also replaced
+ // with "/". Note that <segment> may be empty.
+ while ((index = normalized.indexOf("/../")) != -1) {
+ int slashIndex = normalized.lastIndexOf('/', index - 1);
+ if (slashIndex >= 0) {
+ break;
+ } else {
+ normalized = normalized.substring(index + 3);
+ }
+ }
+ if (normalized.endsWith("/..")) {
+ int slashIndex = normalized.lastIndexOf('/', normalized.length() - 4);
+ if (slashIndex < 0) {
+ normalized = "/";
+ }
+ }
+
+ return normalized.toCharArray();
+ }
+
+
+    /**
+     * Normalizes the path part of this URI and rebuilds the URI.
+     * Normalization is only performed on URIs with an absolute path; calling
+     * this method on a URI for which {@link #isAbsPath()} is false has no
+     * effect.
+     *
+     * @throws URIException no more higher path level to be normalized
+     *
+     * @see #isAbsPath()
+     */
+    public void normalize() throws URIException {
+        if (!isAbsPath()) {
+            return;
+        }
+        _path = normalize(_path);
+        setURI();
+    }
+
+
+    /**
+     * Test if the first array is equal to the second array: both null, or
+     * both of the same length with the same characters in the same order.
+     *
+     * @param first the first character array
+     * @param second the second character array
+     * @return true if they're equal
+     */
+    protected boolean equals(char[] first, char[] second) {
+        // Arrays.equals has exactly this contract, including the null cases.
+        return java.util.Arrays.equals(first, second);
+    }
+
+
+    /**
+     * Test whether this URI is equal to another object. Two URIs are equal
+     * when their scheme, opaque part, authority, path, query and fragment
+     * arrays are all equal.
+     *
+     * @param obj an object to compare
+     * @return true if two URI objects are equal
+     */
+    public boolean equals(Object obj) {
+        if (obj == this) {
+            return true;
+        }
+        if (!(obj instanceof URI)) {
+            return false;
+        }
+        final URI that = (URI) obj;
+        // compared in order: scheme, opaque, authority, path, query, fragment
+        return equals(_scheme, that._scheme)
+            && equals(_opaque, that._opaque)
+            && equals(_authority, that._authority)
+            && equals(_path, that._path)
+            && equals(_query, that._query)
+            && equals(_fragment, that._fragment);
+    }
+
+ // ---------------------------------------------------------- Serialization
+
+ /**
+ * Write the content of this URI.
+ *
+ * @param oos the object-output stream
+ * @throws IOException If an IO problem occurs.
+ */
+ private void writeObject(ObjectOutputStream oos)
+ throws IOException {
+
+ oos.defaultWriteObject();
+ }
+
+
+ /**
+ * Read a URI.
+ *
+ * @param ois the object-input stream
+ * @throws ClassNotFoundException If one of the classes specified in the
+ * input stream cannot be found.
+ * @throws IOException If an IO problem occurs.
+ */
+ private void readObject(ObjectInputStream ois)
+ throws ClassNotFoundException, IOException {
+
+ ois.defaultReadObject();
+ }
+
+ // -------------------------------------------------------------- Hash code
+
+ /**
+ * Return a hash code for this URI.
+ *
+ * @return a hash code value for this URI
+ */
+ public int hashCode() {
+ if (hash == 0) {
+ char[] c = _uri;
+ if (c != null) {
+ for (int i = 0, len = c.length; i < len; i++) {
+ hash = 31 * hash + c[i];
+ }
+ }
+ c = _fragment;
+ if (c != null) {
+ for (int i = 0, len = c.length; i < len; i++) {
+ hash = 31 * hash + c[i];
+ }
+ }
+ }
+ return hash;
+ }
+
+ // ------------------------------------------------------------- Comparison
+
+ /**
+ * Compare this URI to another object.
+ *
+ * @param obj the object to be compared.
+ * @return -1 if the authority components differ; otherwise the result of
+ * comparing the escaped URI strings (0 if they are the same)
+ * @throws ClassCastException not URI argument
+ */
+ public int compareTo(Object obj) throws ClassCastException {
+
+ URI another = (URI) obj;
+ if (!equals(_authority, another.getRawAuthority())) {
+ return -1;
+ }
+ return toString().compareTo(another.toString());
+ }
+
+ // ------------------------------------------------------------------ Clone
+
+ /**
+ * Create and return a copy of this object, the URI-reference containing
+ * the userinfo component. Notice that the whole URI-reference including
+ * the userinfo component cannot be obtained as a String.
+ *
+ * To copy the identical URI object including the userinfo
+ * component, this method should be used.
+ *
+ * @return a clone of this instance
+ */
+ public synchronized Object clone() throws CloneNotSupportedException {
+
+ URI instance = (URI) super.clone();
+
+ instance._uri = _uri;
+ instance._scheme = _scheme;
+ instance._opaque = _opaque;
+ instance._authority = _authority;
+ instance._userinfo = _userinfo;
+ instance._host = _host;
+ instance._port = _port;
+ instance._path = _path;
+ instance._query = _query;
+ instance._fragment = _fragment;
+ // the charset to do escape encoding for this instance
+ instance.protocolCharset = protocolCharset;
+ // flags
+ instance._is_hier_part = _is_hier_part;
+ instance._is_opaque_part = _is_opaque_part;
+ instance._is_net_path = _is_net_path;
+ instance._is_abs_path = _is_abs_path;
+ instance._is_rel_path = _is_rel_path;
+ instance._is_reg_name = _is_reg_name;
+ instance._is_server = _is_server;
+ instance._is_hostname = _is_hostname;
+ instance._is_IPv4address = _is_IPv4address;
+ instance._is_IPv6reference = _is_IPv6reference;
+
+ return instance;
+ }
+
+ // ------------------------------------------------------------ Get the URI
+
+ /**
+ * Get the URI character sequence in its raw, escaped form.
+ * This is useful when the URI has to be transported over a protocol.
+ *
+ * It is clearly unwise to use a URL that contains a password which is
+ * intended to be secret. In particular, the use of a password within
+ * the 'userinfo' component of a URL is strongly disrecommended except
+ * in those rare cases where the 'password' parameter is intended to be
+ * public.
+ *
+ * When you want to get each part of the userinfo, you need to use the
+ * specific methods in the specific URL. It depends on the specific URL.
+ *
+ * @return the URI character sequence
+ */
+ public char[] getRawURI() {
+ return _uri;
+ }
+
+
+ /**
+ * Get the URI character sequence as an escaped string.
+ * This is useful when the URI has to be transported over a protocol.
+ *
+ * @return the escaped URI string
+ */
+ public String getEscapedURI() {
+ return (_uri == null) ? null : new String(_uri);
+ }
+
+
+ /**
+ * Get the URI character sequence, decoded with the protocol charset.
+ *
+ * @return the original URI string
+ * @throws URIException incomplete trailing escape pattern or unsupported
+ * character encoding
+ * @see #decode
+ */
+ public String getURI() throws URIException {
+ return (_uri == null) ? null : decode(_uri, getProtocolCharset());
+ }
+
+
+ /**
+ * Get the URI reference character sequence.
+ *
+ * @return the URI reference character sequence
+ */
+ public char[] getRawURIReference() {
+ if (_fragment == null) {
+ return _uri;
+ }
+ if (_uri == null) {
+ return _fragment;
+ }
+ // if _uri != null && _fragment != null
+ String uriReference = new String(_uri) + "#" + new String(_fragment);
+ return uriReference.toCharArray();
+ }
+
+
+ /**
+ * Get the escaped URI reference string.
+ *
+ * @return the escaped URI reference string
+ */
+ public String getEscapedURIReference() {
+ char[] uriReference = getRawURIReference();
+ return (uriReference == null) ? null : new String(uriReference);
+ }
+
+
+ /**
+ * Get the original URI reference string.
+ *
+ * @return the original URI reference string
+ * @throws URIException If {@link #decode} fails.
+ */
+ public String getURIReference() throws URIException {
+ char[] uriReference = getRawURIReference();
+ return (uriReference == null) ? null : decode(uriReference,
+ getProtocolCharset());
+ }
+
+
+ /**
+ * Get the escaped URI string.
+ *
+ * On the document, the URI-reference form is only used without the userinfo
+ * component like http://jakarta.apache.org/ by the security reason.
+ * But the URI-reference form with the userinfo component could be parsed.
+ *
+ * In other words, this URI and any its subclasses must not expose the
+ * URI-reference expression with the userinfo component like
+ * http://user:password@hostport/restricted_zone.
+ * It means that the API client programmer should extract each user and
+ * password to access manually. Probably it will be supported in the each
+ * subclass, however, not a whole URI-reference expression.
+ *
+ * @return the escaped URI string
+ * @see #clone()
+ */
+ public String toString() {
+ return getEscapedURI();
+ }
+
+
+ // ------------------------------------------------------------ Inner class
+
+ /**
+ * Signals, as part of normal operation, that the default charset has
+ * been changed, so that the user can be alerted to the fact.
+ */
+ public static class DefaultCharsetChanged extends RuntimeException {
+
+ // ------------------------------------------------------- constructors
+
+ /**
+ * The constructor with a reason string and its code arguments.
+ *
+ * @param reasonCode the reason code
+ * @param reason the reason
+ */
+ public DefaultCharsetChanged(int reasonCode, String reason) {
+ super(reason);
+ this.reason = reason;
+ this.reasonCode = reasonCode;
+ }
+
+ // ---------------------------------------------------------- constants
+
+ /** No specified reason code. */
+ public static final int UNKNOWN = 0;
+
+ /** Protocol charset changed. */
+ public static final int PROTOCOL_CHARSET = 1;
+
+ /** Document charset changed. */
+ public static final int DOCUMENT_CHARSET = 2;
+
+ // ------------------------------------------------- instance variables
+
+ /** The reason code. */
+ private int reasonCode;
+
+ /** The reason message. */
+ private String reason;
+
+ // ------------------------------------------------------------ methods
+
+ /**
+ * Get the reason code.
+ *
+ * @return the reason code
+ */
+ public int getReasonCode() {
+ return reasonCode;
+ }
+
+ /**
+ * Get the reason message.
+ *
+ * @return the reason message
+ */
+ public String getReason() {
+ return reason;
+ }
+
+ }
+
+
+ /**
+ * A mapping to determine the (somewhat arbitrarily) preferred charset for a
+ * given locale. Supports all locales recognized in JDK 1.1.
+ *
+ * The distribution of this class is Servlets.com. It was originally
+ * written by Jason Hunter [jhunter at acm.org] and is used with permission.
+ */
+ public static class LocaleToCharsetMap {
+
+ /** A mapping of language code to charset */
+ private static final Hashtable LOCALE_TO_CHARSET_MAP;
+ static {
+ LOCALE_TO_CHARSET_MAP = new Hashtable();
+ LOCALE_TO_CHARSET_MAP.put("ar", "ISO-8859-6");
+ LOCALE_TO_CHARSET_MAP.put("be", "ISO-8859-5");
+ LOCALE_TO_CHARSET_MAP.put("bg", "ISO-8859-5");
+ LOCALE_TO_CHARSET_MAP.put("ca", "ISO-8859-1");
+ LOCALE_TO_CHARSET_MAP.put("cs", "ISO-8859-2");
+ LOCALE_TO_CHARSET_MAP.put("da", "ISO-8859-1");
+ LOCALE_TO_CHARSET_MAP.put("de", "ISO-8859-1");
+ LOCALE_TO_CHARSET_MAP.put("el", "ISO-8859-7");
+ LOCALE_TO_CHARSET_MAP.put("en", "ISO-8859-1");
+ LOCALE_TO_CHARSET_MAP.put("es", "ISO-8859-1");
+ LOCALE_TO_CHARSET_MAP.put("et", "ISO-8859-1");
+ LOCALE_TO_CHARSET_MAP.put("fi", "ISO-8859-1");
+ LOCALE_TO_CHARSET_MAP.put("fr", "ISO-8859-1");
+ LOCALE_TO_CHARSET_MAP.put("hr", "ISO-8859-2");
+ LOCALE_TO_CHARSET_MAP.put("hu", "ISO-8859-2");
+ LOCALE_TO_CHARSET_MAP.put("is", "ISO-8859-1");
+ LOCALE_TO_CHARSET_MAP.put("it", "ISO-8859-1");
+ LOCALE_TO_CHARSET_MAP.put("iw", "ISO-8859-8");
+ LOCALE_TO_CHARSET_MAP.put("ja", "Shift_JIS");
+ LOCALE_TO_CHARSET_MAP.put("ko", "EUC-KR");
+ LOCALE_TO_CHARSET_MAP.put("lt", "ISO-8859-2");
+ LOCALE_TO_CHARSET_MAP.put("lv", "ISO-8859-2");
+ LOCALE_TO_CHARSET_MAP.put("mk", "ISO-8859-5");
+ LOCALE_TO_CHARSET_MAP.put("nl", "ISO-8859-1");
+ LOCALE_TO_CHARSET_MAP.put("no", "ISO-8859-1");
+ LOCALE_TO_CHARSET_MAP.put("pl", "ISO-8859-2");
+ LOCALE_TO_CHARSET_MAP.put("pt", "ISO-8859-1");
+ LOCALE_TO_CHARSET_MAP.put("ro", "ISO-8859-2");
+ LOCALE_TO_CHARSET_MAP.put("ru", "ISO-8859-5");
+ LOCALE_TO_CHARSET_MAP.put("sh", "ISO-8859-5");
+ LOCALE_TO_CHARSET_MAP.put("sk", "ISO-8859-2");
+ LOCALE_TO_CHARSET_MAP.put("sl", "ISO-8859-2");
+ LOCALE_TO_CHARSET_MAP.put("sq", "ISO-8859-2");
+ LOCALE_TO_CHARSET_MAP.put("sr", "ISO-8859-5");
+ LOCALE_TO_CHARSET_MAP.put("sv", "ISO-8859-1");
+ LOCALE_TO_CHARSET_MAP.put("tr", "ISO-8859-9");
+ LOCALE_TO_CHARSET_MAP.put("uk", "ISO-8859-5");
+ LOCALE_TO_CHARSET_MAP.put("zh", "GB2312");
+ LOCALE_TO_CHARSET_MAP.put("zh_TW", "Big5");
+ }
+
+ /**
+ * Get the preferred charset for the given locale.
+ *
+ * @param locale the locale
+ * @return the preferred charset or null if the locale is not
+ * recognized.
+ */
+ public static String getCharset(Locale locale) {
+ // try for a full name match (may include country)
+ String charset =
+ (String) LOCALE_TO_CHARSET_MAP.get(locale.toString());
+ if (charset != null) {
+ return charset;
+ }
+
+ // if a full name didn't match, try just the language
+ charset = (String) LOCALE_TO_CHARSET_MAP.get(locale.getLanguage());
+ return charset; // may be null
+ }
+
+ }
+
+}
+
diff --git a/src/main/java/org/archive/url/URIException.java b/src/main/java/org/archive/url/URIException.java
new file mode 100644
index 00000000..49fa2cb5
--- /dev/null
+++ b/src/main/java/org/archive/url/URIException.java
@@ -0,0 +1,180 @@
+/*
+ * $Header: /home/jerenkrantz/tmp/commons/commons-convert/cvs/home/cvs/jakarta-commons//httpclient/src/java/org/apache/commons/httpclient/URIException.java,v 1.12 2004/09/30 18:53:20 olegk Exp $
+ * $Revision: 480424 $
+ * $Date: 2006-11-29 06:56:49 +0100 (Wed, 29 Nov 2006) $
+ *
+ * ====================================================================
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ *
+ */
+
+package org.archive.url;
+
+import java.io.IOException;
+
+/**
+ * The URI parsing and escape encoding exception.
+ *
+ * @author Sung-Gu
+ * @author Oleg Kalnichevski
+ * @version $Revision: 480424 $ $Date: 2002/03/14 15:14:01
+ */
+public class URIException extends IOException {
+
+ // ----------------------------------------------------------- constructors
+
+ /**
+ * Default constructor.
+ */
+ public URIException() {
+ }
+
+
+ /**
+ * The constructor with a reason code argument.
+ *
+ * @param reasonCode the reason code
+ */
+ public URIException(int reasonCode) {
+ this.reasonCode = reasonCode;
+ }
+
+
+ /**
+ * The constructor with a reason string and its code arguments.
+ *
+ * @param reasonCode the reason code
+ * @param reason the reason
+ */
+ public URIException(int reasonCode, String reason) {
+ super(reason); // for backward compatibility of Throwable
+ this.reason = reason;
+ this.reasonCode = reasonCode;
+ }
+
+
+ /**
+ * The constructor with a reason string argument.
+ *
+ * @param reason the reason
+ */
+ public URIException(String reason) {
+ super(reason); // for backward compatibility of Throwable
+ this.reason = reason;
+ this.reasonCode = UNKNOWN;
+ }
+
+ // -------------------------------------------------------------- constants
+
+ /**
+ * No specified reason code.
+ */
+ public static final int UNKNOWN = 0;
+
+
+ /**
+ * The URI parsing error.
+ */
+ public static final int PARSING = 1;
+
+
+ /**
+ * The unsupported character encoding.
+ */
+ public static final int UNSUPPORTED_ENCODING = 2;
+
+
+ /**
+ * The URI escape encoding and decoding error.
+ */
+ public static final int ESCAPING = 3;
+
+
+ /**
+ * The DNS punycode encoding or decoding error.
+ */
+ public static final int PUNYCODE = 4;
+
+ // ------------------------------------------------------------- properties
+
+ /**
+ * The reason code.
+ */
+ protected int reasonCode;
+
+
+ /**
+ * The reason message.
+ */
+ protected String reason;
+
+ // ---------------------------------------------------------------- methods
+
+ /**
+ * Get the reason code.
+ *
+ * @return the reason code
+ */
+ public int getReasonCode() {
+ return reasonCode;
+ }
+
+ /**
+ * Set the reason code.
+ *
+ * @param reasonCode the reason code
+ *
+ * @deprecated Callers should set the reason code as a parameter to the
+ * constructor.
+ */
+ public void setReasonCode(int reasonCode) {
+ this.reasonCode = reasonCode;
+ }
+
+
+ /**
+ * Get the reason message.
+ *
+ * @return the reason message
+ *
+ * @deprecated You should instead call {@link #getMessage()}.
+ */
+ public String getReason() {
+ return reason;
+ }
+
+
+ /**
+ * Set the reason message.
+ *
+ * @param reason the reason message
+ *
+ * @deprecated Callers should instead set this via a parameter to the constructor.
+ */
+ public void setReason(String reason) {
+ this.reason = reason;
+ }
+
+
+}
+
diff --git a/src/main/java/org/archive/url/URLRegexTransformer.java b/src/main/java/org/archive/url/URLRegexTransformer.java
index 5f31c81c..182eb218 100644
--- a/src/main/java/org/archive/url/URLRegexTransformer.java
+++ b/src/main/java/org/archive/url/URLRegexTransformer.java
@@ -1,5 +1,6 @@
package org.archive.url;
+import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -27,7 +28,7 @@ public class URLRegexTransformer {
public static String stripOpts(String orig, OptimizedPattern op[]) {
- String origLC = orig.toLowerCase();
+ String origLC = orig.toLowerCase(Locale.ROOT);
StringBuilder sb = null;
int i = 0;
int max = op.length;
diff --git a/src/main/java/org/archive/url/UsableURI.java b/src/main/java/org/archive/url/UsableURI.java
index ed40f41a..b7d0cf71 100644
--- a/src/main/java/org/archive/url/UsableURI.java
+++ b/src/main/java/org/archive/url/UsableURI.java
@@ -26,14 +26,13 @@
import java.net.URI;
import java.net.URISyntaxException;
-import org.apache.commons.httpclient.URIException;
import org.archive.util.SURT;
import org.archive.util.TextUtils;
/**
* Usable URI.
*
- * This class wraps {@link org.apache.commons.httpclient.URI} adding caching
+ * This class wraps {@link org.archive.url.URI} adding caching
* and methods. It cannot be instantiated directly. Go via UURIFactory.
*
* <p>We used to use {@link java.net.URI} for parsing URIs but ran across
@@ -50,7 +49,7 @@
* @author gojomo
* @author stack
*
- * @see org.apache.commons.httpclient.URI
+ * @see org.archive.url.URI
*/
public class UsableURI extends LaxURI
implements CharSequence, Serializable {
@@ -121,7 +120,6 @@ protected UsableURI() {
* @param uri String representation of an absolute URI.
* @param escaped If escaped.
* @param charset Charset to use.
- * @throws org.apache.commons.httpclient.URIException
*/
protected UsableURI(String uri, boolean escaped, String charset)
throws URIException {
@@ -132,7 +130,6 @@ protected UsableURI(String uri, boolean escaped, String charset)
/**
* @param relative String representation of URI.
* @param base Parent UURI to use derelativizing.
- * @throws org.apache.commons.httpclient.URIException
*/
protected UsableURI(UsableURI base, UsableURI relative) throws URIException {
super(base, relative);
@@ -275,7 +272,7 @@ public String toString() {
/**
* In the case of a puny encoded IDN, this method returns the decoded Unicode version.
*
- * Most of this implementation is copied from {@link org.apache.commons.httpclient.URI#setURI()}.
+ * Most of this implementation is copied from {@link org.archive.url.URI#setURI()}.
*
* @return decoded IDN version of URI
*/
diff --git a/src/main/java/org/archive/url/UsableURIFactory.java b/src/main/java/org/archive/url/UsableURIFactory.java
index 3dfc33a7..3038ada5 100644
--- a/src/main/java/org/archive/url/UsableURIFactory.java
+++ b/src/main/java/org/archive/url/UsableURIFactory.java
@@ -23,13 +23,12 @@
import java.io.UnsupportedEncodingException;
import java.util.BitSet;
+import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-import org.apache.commons.httpclient.URI;
-import org.apache.commons.httpclient.URIException;
import org.archive.util.TextUtils;
/**
@@ -611,7 +610,7 @@ private String fixupDomainlabel(String label)
throw ue;
}
}
- label = label.toLowerCase();
+ label = label.toLowerCase(Locale.ROOT);
return label;
}
@@ -757,6 +756,6 @@ private String checkUriElement(String element) {
*/
private String checkUriElementAndLowerCase(String element) {
String tmp = checkUriElement(element);
- return (tmp != null)? tmp.toLowerCase(): tmp;
+ return (tmp != null)? tmp.toLowerCase(Locale.ROOT): tmp;
}
}
diff --git a/src/main/java/org/archive/url/WaybackURLKeyMaker.java b/src/main/java/org/archive/url/WaybackURLKeyMaker.java
index 99fb92e9..56f51b49 100644
--- a/src/main/java/org/archive/url/WaybackURLKeyMaker.java
+++ b/src/main/java/org/archive/url/WaybackURLKeyMaker.java
@@ -5,7 +5,7 @@
public class WaybackURLKeyMaker implements URLKeyMaker {
// URLCanonicalizer canonicalizer = new NonMassagingIAURLCanonicalizer();
- URLCanonicalizer canonicalizer = new DefaultIAURLCanonicalizer();
+ URLCanonicalizer canonicalizer = new AggressiveIAURLCanonicalizer();
public URLCanonicalizer getCanonicalizer() {
return canonicalizer;
diff --git a/src/main/java/org/archive/util/ArchiveUtils.java b/src/main/java/org/archive/util/ArchiveUtils.java
index 22ba2787..cce411df 100644
--- a/src/main/java/org/archive/util/ArchiveUtils.java
+++ b/src/main/java/org/archive/util/ArchiveUtils.java
@@ -49,6 +49,8 @@
import org.archive.format.gzip.GZIPDecoder;
import org.archive.format.gzip.GZIPFormatException;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
/**
* Miscellaneous useful methods.
*
@@ -851,7 +853,7 @@ private static String loadVersion() {
BufferedReader br = null;
String version;
try {
- br = new BufferedReader(new InputStreamReader(input));
+ br = new BufferedReader(new InputStreamReader(input, UTF_8));
version = br.readLine();
br.readLine();
} catch (IOException e) {
@@ -873,7 +875,7 @@ private static String loadVersion() {
br = null;
String timestamp;
try {
- br = new BufferedReader(new InputStreamReader(input));
+ br = new BufferedReader(new InputStreamReader(input, UTF_8));
timestamp = br.readLine();
} catch (IOException e) {
return version;
@@ -894,13 +896,13 @@ private static String loadVersion() {
TLDS = new HashSet();
InputStream is = ArchiveUtils.class.getResourceAsStream("tlds-alpha-by-domain.txt");
try {
- BufferedReader reader = new BufferedReader(new InputStreamReader(is));
+ BufferedReader reader = new BufferedReader(new InputStreamReader(is, UTF_8));
String line;
while((line = reader.readLine())!=null) {
if (line.startsWith("#")) {
continue;
}
- TLDS.add(line.trim().toLowerCase());
+ TLDS.add(line.trim().toLowerCase(Locale.ROOT));
}
} catch (Exception e) {
LOGGER.log(Level.SEVERE,"TLD list unavailable",e);
@@ -917,7 +919,7 @@ private static String loadVersion() {
* @return boolean true if recognized as TLD
*/
public static boolean isTld(String dom) {
- return TLDS.contains(dom.toLowerCase());
+ return TLDS.contains(dom.toLowerCase(Locale.ROOT));
}
public static void closeQuietly(Object input) {
@@ -981,12 +983,12 @@ public static int readFully(InputStream input, byte[] buf)
*/
public static BufferedReader getBufferedReader(File source) throws IOException {
InputStream is = new BufferedInputStream(new FileInputStream(source));
- boolean isGzipped = source.getName().toLowerCase().
+ boolean isGzipped = source.getName().toLowerCase(Locale.ROOT).
endsWith(GZIP_SUFFIX);
if(isGzipped) {
is = new GZIPInputStream(is);
}
- return new BufferedReader(new InputStreamReader(is));
+ return new BufferedReader(new InputStreamReader(is, UTF_8));
}
/**
@@ -1002,8 +1004,8 @@ public static BufferedReader getBufferedReader(URL source) throws IOException {
|| conn.getContentEncoding() != null && conn.getContentEncoding().equalsIgnoreCase("gzip");
InputStream uis = conn.getInputStream();
return new BufferedReader(isGzipped?
- new InputStreamReader(new GZIPInputStream(uis)):
- new InputStreamReader(uis));
+ new InputStreamReader(new GZIPInputStream(uis), UTF_8):
+ new InputStreamReader(uis, UTF_8));
}
/**
diff --git a/src/main/java/org/archive/util/ChunkedInputStream.java b/src/main/java/org/archive/util/ChunkedInputStream.java
new file mode 100644
index 00000000..b6a604c8
--- /dev/null
+++ b/src/main/java/org/archive/util/ChunkedInputStream.java
@@ -0,0 +1,323 @@
+/*
+ * $Header: /home/jerenkrantz/tmp/commons/commons-convert/cvs/home/cvs/jakarta-commons//httpclient/src/java/org/apache/commons/httpclient/ChunkedInputStream.java,v 1.24 2004/10/10 15:18:55 olegk Exp $
+ * $Revision: 480424 $
+ * $Date: 2006-11-29 06:56:49 +0100 (Wed, 29 Nov 2006) $
+ *
+ * ====================================================================
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ *
+ */
+
+package org.archive.util;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+
+
+/**
+ * <p>Transparently coalesces chunks of a HTTP stream that uses
+ * Transfer-Encoding chunked.</p>
+ *
+ * <p>Note that this class NEVER closes the underlying stream, even when close
+ * gets called. Instead, it will read until the "end" of its chunking on close,
+ * which allows for the seamless invocation of subsequent HTTP 1.1 calls, while
+ * not requiring the client to remember to read the entire contents of the
+ * response.</p>
+ *
+ * @author Ortwin Glueck
+ * @author Sean C. Sullivan
+ * @author Martin Elwin
+ * @author Eric Johnson
+ * @author Mike Bowler
+ * @author Michael Becke
+ * @author Oleg Kalnichevski
+ *
+ * @since 2.0
+ *
+ */
+class ChunkedInputStream extends InputStream {
+ /** The inputstream that we're wrapping */
+ private InputStream in;
+
+ /** The chunk size */
+ private int chunkSize;
+
+ /** The current position within the current chunk */
+ private int pos;
+
+ /** True if we are at the beginning of stream */
+ private boolean bof = true;
+
+ /** True if we've reached the end of stream */
+ private boolean eof = false;
+
+ /** True if this stream is closed */
+ private boolean closed = false;
+
+ /**
+ * ChunkedInputStream constructor
+ *
+ * @param in the raw input stream
+ *
+ */
+ public ChunkedInputStream(final InputStream in) {
+
+ if (in == null) {
+ throw new IllegalArgumentException("InputStream parameter may not be null");
+ }
+ this.in = in;
+ this.pos = 0;
+ }
+
+ /**
+ * <p>Returns all the data in a chunked stream in coalesced form. A chunk
+ * is followed by a CRLF. The method returns -1 as soon as a chunksize of 0
+ * is detected.</p>
+ *
+ * <p>Trailer headers are read automatically at the end of the stream and
+ * can be obtained with the getResponseFooters() method.</p>
+ *
+ * @return -1 if the end of the stream has been reached or the next data
+ * byte
+ * @throws IOException If an IO problem occurs
+ */
+ public int read() throws IOException {
+
+ if (closed) {
+ throw new IOException("Attempted read from closed stream.");
+ }
+ if (eof) {
+ return -1;
+ }
+ if (pos >= chunkSize) {
+ nextChunk();
+ if (eof) {
+ return -1;
+ }
+ }
+ pos++;
+ return in.read();
+ }
+
+ /**
+ * Read some bytes from the stream.
+ * @param b The byte array that will hold the contents from the stream.
+ * @param off The offset into the byte array at which bytes will start to be
+ * placed.
+ * @param len the maximum number of bytes that can be returned.
+ * @return The number of bytes returned or -1 if the end of stream has been
+ * reached.
+ * @see InputStream#read(byte[], int, int)
+ * @throws IOException if an IO problem occurs.
+ */
+ public int read (byte[] b, int off, int len) throws IOException {
+
+ if (closed) {
+ throw new IOException("Attempted read from closed stream.");
+ }
+
+ if (eof) {
+ return -1;
+ }
+ if (pos >= chunkSize) {
+ nextChunk();
+ if (eof) {
+ return -1;
+ }
+ }
+ len = Math.min(len, chunkSize - pos);
+ int count = in.read(b, off, len);
+ pos += count;
+ return count;
+ }
+
+ /**
+ * Read some bytes from the stream.
+ * @param b The byte array that will hold the contents from the stream.
+ * @return The number of bytes returned or -1 if the end of stream has been
+ * reached.
+ * @see InputStream#read(byte[])
+ * @throws IOException if an IO problem occurs.
+ */
+ public int read (byte[] b) throws IOException {
+ return read(b, 0, b.length);
+ }
+
+ /**
+ * Read the CRLF terminator.
+ * @throws IOException If an IO error occurs.
+ */
+ private void readCRLF() throws IOException {
+ int cr = in.read();
+ int lf = in.read();
+ if ((cr != '\r') || (lf != '\n')) {
+ throw new IOException(
+ "CRLF expected at end of chunk: " + cr + "/" + lf);
+ }
+ }
+
+
+ /**
+ * Read the next chunk.
+ * @throws IOException If an IO error occurs.
+ */
+ private void nextChunk() throws IOException {
+ if (!bof) {
+ readCRLF();
+ }
+ chunkSize = getChunkSizeFromInputStream(in);
+ bof = false;
+ pos = 0;
+ if (chunkSize == 0) {
+ eof = true;
+ parseTrailerHeaders();
+ }
+ }
+
+ /**
+ * Expects the stream to start with a chunksize in hex with optional
+ * comments after a semicolon. The line must end with a CRLF: "a3; some
+ * comment\r\n" Positions the stream at the start of the next line.
+ *
+ * @param in The new input stream.
+ *
+ * @return the chunk size as integer
+ *
+ * @throws IOException when the chunk size could not be parsed
+ */
+ private static int getChunkSizeFromInputStream(final InputStream in)
+ throws IOException {
+
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ // States: 0=normal, 1=\r was scanned, 2=inside quoted string, -1=end
+ int state = 0;
+ while (state != -1) {
+ int b = in.read();
+ if (b == -1) {
+ throw new IOException("chunked stream ended unexpectedly");
+ }
+ switch (state) {
+ case 0:
+ switch (b) {
+ case '\r':
+ state = 1;
+ break;
+ case '\"':
+ state = 2;
+ /* fall through */
+ default:
+ baos.write(b);
+ }
+ break;
+
+ case 1:
+ if (b == '\n') {
+ state = -1;
+ } else {
+ // this was not CRLF
+ throw new IOException("Protocol violation: Unexpected"
+ + " single newline character in chunk size");
+ }
+ break;
+
+ case 2:
+ switch (b) {
+ case '\\':
+ b = in.read();
+ baos.write(b);
+ break;
+ case '\"':
+ state = 0;
+ /* fall through */
+ default:
+ baos.write(b);
+ }
+ break;
+ default: throw new RuntimeException("assertion failed");
+ }
+ }
+
+ //parse data
+ String dataString = baos.toString(StandardCharsets.US_ASCII.name());
+ int separator = dataString.indexOf(';');
+ dataString = (separator > 0)
+ ? dataString.substring(0, separator).trim()
+ : dataString.trim();
+
+ int result;
+ try {
+ result = Integer.parseInt(dataString.trim(), 16);
+ } catch (NumberFormatException e) {
+ throw new IOException ("Bad chunk size: " + dataString);
+ }
+ return result;
+ }
+
+ /**
+ * Reads and stores the Trailer headers.
+ * @throws IOException If an IO problem occurs
+ */
+ private void parseTrailerHeaders() throws IOException {
+ LaxHttpParser.parseHeaders(in, StandardCharsets.US_ASCII.name());
+ }
+
+ /**
+ * Upon close, this reads the remainder of the chunked message,
+ * leaving the underlying socket at a position to start reading the
+ * next response without scanning.
+ * @throws IOException If an IO problem occurs.
+ */
+ public void close() throws IOException {
+ if (!closed) {
+ try {
+ if (!eof) {
+ exhaustInputStream(this);
+ }
+ } finally {
+ eof = true;
+ closed = true;
+ }
+ }
+ }
+
+ /**
+ * Exhaust an input stream, reading until EOF has been encountered.
+ *
+ * <p>Note that this function is intended as a non-public utility.
+ * This is a little weird, but it seemed silly to make a utility
+ * class for this one function, so instead it is just static and
+ * shared that way.