diff --git a/pom.xml b/pom.xml index c70a2cd7..81bd9b32 100644 --- a/pom.xml +++ b/pom.xml @@ -82,9 +82,9 @@ - commons-httpclient - commons-httpclient - 3.1 + commons-codec + commons-codec + 1.18.0 diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java index 2247eda4..c28ee536 100644 --- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java +++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java @@ -31,7 +31,7 @@ public class ZipNumBlockLoader { protected int signDurationSecs = DEFAULT_SIG_DURATION_SECS; protected boolean useNio = false; - protected String httpLib = HttpLibs.APACHE_31.name(); + protected String httpLib = HttpLibs.APACHE_43.name(); protected boolean bufferFully = true; protected boolean noKeepAlive = true; diff --git a/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java deleted file mode 100644 index 1a94af1f..00000000 --- a/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java +++ /dev/null @@ -1,134 +0,0 @@ -/* - * This file is part of the Heritrix web crawler (crawler.archive.org). - * - * Licensed to the Internet Archive (IA) by one or more individual - * contributors. - * - * The IA licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.archive.httpclient; - -import java.io.IOException; -import java.util.logging.Logger; - -import org.apache.commons.httpclient.HttpConnection; -import org.apache.commons.httpclient.HttpException; -import org.apache.commons.httpclient.HttpState; -import org.apache.commons.httpclient.methods.GetMethod; -import org.archive.util.Recorder; - - -/** - * Override of GetMethod that marks the passed HttpRecorder w/ the transition - * from HTTP head to body and that forces a close on the http connection. - * - * The actions done in this subclass used to be done by copying - * org.apache.commons.HttpMethodBase, overlaying our version in place of the - * one that came w/ httpclient. Here is the patch of the difference between - * shipped httpclient code and our mods: - *
- *    -- -1338,6 +1346,12 --
- *
- *        public void releaseConnection() {
- *
- *   +        // HERITRIX always ants the streams closed.
- *   +        if (responseConnection != null)
- *   +        {
- *   +            responseConnection.close();
- *   +        }
- *   +
- *            if (responseStream != null) {
- *                try {
- *                    // FYI - this may indirectly invoke responseBodyConsumed.
- *   -- -1959,6 +1973,11 --
- *                        this.statusLine = null;
- *                    }
- *                }
- *   +            // HERITRIX mark transition from header to content.
- *   +            if (this.httpRecorder != null)
- *   +            {
- *   +                this.httpRecorder.markContentBegin();
- *   +            }
- *                readResponseBody(state, conn);
- *                processResponseBody(state, conn);
- *            } catch (IOException e) {
- * 
- * - *

We're not supposed to have access to the underlying connection object; - * am only violating contract because see cases where httpclient is skipping - * out w/o cleaning up after itself. - * - * @author stack - * @version $Revision$, $Date$ - * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0 - */ -@Deprecated -public class HttpRecorderGetMethod extends GetMethod { - - protected static Logger logger = - Logger.getLogger(HttpRecorderGetMethod.class.getName()); - - /** - * Instance of http recorder method. - */ - protected HttpRecorderMethod httpRecorderMethod = null; - - - public HttpRecorderGetMethod(String uri, Recorder recorder) { - super(uri); - this.httpRecorderMethod = new HttpRecorderMethod(recorder); - } - - protected void readResponseBody(HttpState state, HttpConnection connection) - throws IOException, HttpException { - // We're about to read the body. Mark transition in http recorder. - this.httpRecorderMethod.markContentBegin(connection); - super.readResponseBody(state, connection); - } - - protected boolean shouldCloseConnection(HttpConnection conn) { - // Always close connection after each request. As best I can tell, this - // is superfluous -- we've set our client to be HTTP/1.0. Doing this - // out of paranoia. - return true; - } - - public int execute(HttpState state, HttpConnection conn) - throws HttpException, IOException { - // Save off the connection so we can close it on our way out in case - // httpclient fails to (We're not supposed to have access to the - // underlying connection object; am only violating contract because - // see cases where httpclient is skipping out w/o cleaning up - // after itself). 
- this.httpRecorderMethod.setConnection(conn); - return super.execute(state, conn); - } - - protected void addProxyConnectionHeader(HttpState state, HttpConnection conn) - throws IOException, HttpException { - super.addProxyConnectionHeader(state, conn); - this.httpRecorderMethod.handleAddProxyConnectionHeader(this); - } - - // XXX see https://webarchive.jira.com/browse/HER-2059 - // We never call this method with the implied question mark prepended, so - // adding it does the trick, since commons-httpclient will strip it later. - public void setQueryString(String queryString) { - if (queryString != null) { - super.setQueryString('?' + queryString); - } else { - super.setQueryString(queryString); - } - } - -} diff --git a/src/main/java/org/archive/httpclient/HttpRecorderMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderMethod.java deleted file mode 100644 index b08bc0bd..00000000 --- a/src/main/java/org/archive/httpclient/HttpRecorderMethod.java +++ /dev/null @@ -1,109 +0,0 @@ -/* - * This file is part of the Heritrix web crawler (crawler.archive.org). - * - * Licensed to the Internet Archive (IA) by one or more individual - * contributors. - * - * The IA licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.archive.httpclient; - -import java.util.logging.Logger; - -import org.apache.commons.httpclient.Header; -import org.apache.commons.httpclient.HttpConnection; -import org.apache.commons.httpclient.HttpMethod; -import org.archive.util.Recorder; - - -/** - * This class encapsulates the specializations supplied by the - * overrides {@link HttpRecorderGetMethod} and {@link HttpRecorderPostMethod}. - * - * It keeps instance of HttpRecorder and HttpConnection. - * - * @author stack - * @version $Revision$, $Date$ - * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0 - */ -@Deprecated -public class HttpRecorderMethod { - protected static Logger logger = - Logger.getLogger(HttpRecorderMethod.class.getName()); - - /** - * Instance of http recorder we're using recording this http get. - */ - private Recorder httpRecorder = null; - - /** - * Save around so can force close. - * - * See [ 922080 ] IllegalArgumentException (size is wrong). - * https://sourceforge.net/tracker/?func=detail&aid=922080&group_id=73833&atid=539099 - */ - private HttpConnection connection = null; - - - public HttpRecorderMethod(Recorder recorder) { - this.httpRecorder = recorder; - } - - public void markContentBegin(HttpConnection c) { - if (c != this.connection) { - // We're checking that we're not being asked to work on - // a connection that is other than the one we started - // this method#execute with. - throw new IllegalArgumentException("Connections differ: " + - this.connection + " " + c + " " + - Thread.currentThread().getName()); - } - this.httpRecorder.markContentBegin(); - } - - /** - * @return Returns the connection. - */ - public HttpConnection getConnection() { - return this.connection; - } - - /** - * @param connection The connection to set. - */ - public void setConnection(HttpConnection connection) { - this.connection = connection; - } - /** - * @return Returns the httpRecorder. 
- */ - public Recorder getHttpRecorder() { - return httpRecorder; - } - - /** - * If a 'Proxy-Connection' header has been added to the request, - * it'll be of a 'keep-alive' type. Until we support 'keep-alives', - * override the Proxy-Connection setting and instead pass a 'close' - * (Otherwise every request has to timeout before we notice - * end-of-document). - * @param method Method to find proxy-connection header in. - */ - public void handleAddProxyConnectionHeader(HttpMethod method) { - Header h = method.getRequestHeader("Proxy-Connection"); - if (h != null) { - h.setValue("close"); - method.setRequestHeader(h); - } - } -} diff --git a/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java deleted file mode 100644 index d55d816a..00000000 --- a/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java +++ /dev/null @@ -1,84 +0,0 @@ -/* - * This file is part of the Heritrix web crawler (crawler.archive.org). - * - * Licensed to the Internet Archive (IA) by one or more individual - * contributors. - * - * The IA licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.archive.httpclient; - -import java.io.IOException; - -import org.apache.commons.httpclient.HttpConnection; -import org.apache.commons.httpclient.HttpException; -import org.apache.commons.httpclient.HttpState; -import org.apache.commons.httpclient.methods.PostMethod; -import org.archive.util.Recorder; - - -/** - * Override of PostMethod that marks the passed HttpRecorder w/ the transition - * from HTTP head to body and that forces a close on the responseConnection. - * - * This is a copy of {@link HttpRecorderGetMethod}. Only difference is the - * parent subclass. - * - * @author stack - * @version $Date$ $Revision$ - * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0 - */ -@Deprecated -public class HttpRecorderPostMethod extends PostMethod { - /** - * Instance of http recorder method. - */ - protected HttpRecorderMethod httpRecorderMethod = null; - - - public HttpRecorderPostMethod(String uri, Recorder recorder) { - super(uri); - this.httpRecorderMethod = new HttpRecorderMethod(recorder); - } - - protected void readResponseBody(HttpState state, HttpConnection connection) - throws IOException, HttpException { - // We're about to read the body. Mark transition in http recorder. - this.httpRecorderMethod.markContentBegin(connection); - super.readResponseBody(state, connection); - } - - protected boolean shouldCloseConnection(HttpConnection conn) { - // Always close connection after each request. As best I can tell, this - // is superfluous -- we've set our client to be HTTP/1.0. Doing this - // out of paranoia. 
- return true; - } - - public int execute(HttpState state, HttpConnection conn) - throws HttpException, IOException { - // Save off the connection so we can close it on our way out in case - // httpclient fails to (We're not supposed to have access to the - // underlying connection object; am only violating contract because - // see cases where httpclient is skipping out w/o cleaning up - // after itself). - this.httpRecorderMethod.setConnection(conn); - return super.execute(state, conn); - } - - protected void addProxyConnectionHeader(HttpState state, HttpConnection conn) - throws IOException, HttpException { - super.addProxyConnectionHeader(state, conn); - this.httpRecorderMethod.handleAddProxyConnectionHeader(this); - } -} diff --git a/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java b/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java deleted file mode 100644 index d6cf27ab..00000000 --- a/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java +++ /dev/null @@ -1,72 +0,0 @@ -/* - * This file is part of the Heritrix web crawler (crawler.archive.org). - * - * Licensed to the Internet Archive (IA) by one or more individual - * contributors. - * - * The IA licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.archive.httpclient; - -import java.io.IOException; -import java.io.InputStream; - -import org.apache.commons.httpclient.HostConfiguration; -import org.apache.commons.httpclient.HttpConnection; -import org.apache.commons.httpclient.SimpleHttpConnectionManager; - -/** - * An HttpClient-compatible HttpConnection "manager" that actually - * just gives out a new connection each time -- skipping the overhead - * of connection management, since we already throttle our crawler - * with external mechanisms. - * - * @author gojomo - * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0 - */ -@Deprecated -public class SingleHttpConnectionManager extends SimpleHttpConnectionManager { - - public SingleHttpConnectionManager() { - super(); - } - - public HttpConnection getConnectionWithTimeout( - HostConfiguration hostConfiguration, long timeout) { - - HttpConnection conn = new HttpConnection(hostConfiguration); - conn.setHttpConnectionManager(this); - conn.getParams().setDefaults(this.getParams()); - return conn; - } - - public void releaseConnection(HttpConnection conn) { - // ensure connection is closed - conn.close(); - finishLast(conn); - } - - protected static void finishLast(HttpConnection conn) { - // copied from superclass because it wasn't made available to subclasses - InputStream lastResponse = conn.getLastResponseInputStream(); - if (lastResponse != null) { - conn.setLastResponseInputStream(null); - try { - lastResponse.close(); - } catch (IOException ioe) { - //FIXME: badness - close to force reconnect. 
- conn.close(); - } - } - } -} diff --git a/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java b/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java deleted file mode 100644 index 16821b36..00000000 --- a/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java +++ /dev/null @@ -1,293 +0,0 @@ -/** - * ==================================================================== - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * ==================================================================== - * - */ -package org.archive.httpclient; - -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.logging.Level; -import java.util.logging.Logger; - -import org.apache.commons.httpclient.HostConfiguration; -import org.apache.commons.httpclient.HttpConnection; -import org.apache.commons.httpclient.HttpConnectionManager; -import org.apache.commons.httpclient.params.HttpConnectionManagerParams; - -/** - * A simple, but thread-safe HttpClient {@link HttpConnectionManager}. - * Based on {@link org.apache.commons.httpclient.SimpleHttpConnectionManager}. - * - * Java >= 1.4 is recommended. 
- * - * @author Christian Kohlschuetter - * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0 - */ -@Deprecated -public final class ThreadLocalHttpConnectionManager implements - HttpConnectionManager { - - private static final CloserThread closer = new CloserThread(); - private static final Logger logger = Logger - .getLogger(ThreadLocalHttpConnectionManager.class.getName()); - - private final ThreadLocal tl = new ThreadLocal() { - protected synchronized ConnectionInfo initialValue() { - return new ConnectionInfo(); - } - }; - - private ConnectionInfo getConnectionInfo() { - return (ConnectionInfo) tl.get(); - } - - private static final class ConnectionInfo { - /** The http connection */ - private HttpConnection conn = null; - - /** - * The time the connection was made idle. - */ - private long idleStartTime = Long.MAX_VALUE; - } - - public ThreadLocalHttpConnectionManager() { - } - - /** - * Since the same connection is about to be reused, make sure the - * previous request was completely processed, and if not - * consume it now. - * @param conn The connection - * @return true, if the connection is reusable - */ - private static boolean finishLastResponse(final HttpConnection conn) { - InputStream lastResponse = conn.getLastResponseInputStream(); - if(lastResponse != null) { - conn.setLastResponseInputStream(null); - try { - lastResponse.close(); - return true; - } catch (IOException ioe) { - // force reconnect. - return false; - } - } else { - return false; - } - } - - /** - * Collection of parameters associated with this connection manager. 
- */ - private HttpConnectionManagerParams params = new HttpConnectionManagerParams(); - - /** - * @see HttpConnectionManager#getConnection(HostConfiguration) - */ - public HttpConnection getConnection( - final HostConfiguration hostConfiguration) { - return getConnection(hostConfiguration, 0); - } - - /** - * Gets the staleCheckingEnabled value to be set on HttpConnections that are created. - * - * @return true if stale checking will be enabled on HttpConections - * - * @see HttpConnection#isStaleCheckingEnabled() - * - * @deprecated Use {@link HttpConnectionManagerParams#isStaleCheckingEnabled()}, - * {@link HttpConnectionManager#getParams()}. - */ - public boolean isConnectionStaleCheckingEnabled() { - return this.params.isStaleCheckingEnabled(); - } - - /** - * Sets the staleCheckingEnabled value to be set on HttpConnections that are created. - * - * @param connectionStaleCheckingEnabled true if stale checking will be enabled - * on HttpConections - * - * @see HttpConnection#setStaleCheckingEnabled(boolean) - * - * @deprecated Use {@link HttpConnectionManagerParams#setStaleCheckingEnabled(boolean)}, - * {@link HttpConnectionManager#getParams()}. 
- */ - public void setConnectionStaleCheckingEnabled( - final boolean connectionStaleCheckingEnabled) { - this.params.setStaleCheckingEnabled(connectionStaleCheckingEnabled); - } - - /** - * @see HttpConnectionManager#getConnectionWithTimeout(HostConfiguration, long) - * - * @since 3.0 - */ - public HttpConnection getConnectionWithTimeout( - final HostConfiguration hostConfiguration, final long timeout) { - - final ConnectionInfo ci = getConnectionInfo(); - HttpConnection httpConnection = ci.conn; - - // make sure the host and proxy are correct for this connection - // close it and set the values if they are not - if(httpConnection == null || !finishLastResponse(httpConnection) - || !hostConfiguration.hostEquals(httpConnection) - || !hostConfiguration.proxyEquals(httpConnection)) { - - if(httpConnection != null && httpConnection.isOpen()) { - closer.closeConnection(httpConnection); - } - - httpConnection = new HttpConnection(hostConfiguration); - httpConnection.setHttpConnectionManager(this); - httpConnection.getParams().setDefaults(this.params); - ci.conn = httpConnection; - - httpConnection.setHost(hostConfiguration.getHost()); - httpConnection.setPort(hostConfiguration.getPort()); - httpConnection.setProtocol(hostConfiguration.getProtocol()); - httpConnection.setLocalAddress(hostConfiguration.getLocalAddress()); - - httpConnection.setProxyHost(hostConfiguration.getProxyHost()); - httpConnection.setProxyPort(hostConfiguration.getProxyPort()); - } - - // remove the connection from the timeout handler - ci.idleStartTime = Long.MAX_VALUE; - - return httpConnection; - } - - /** - * @see HttpConnectionManager#getConnection(HostConfiguration, long) - * - * @deprecated Use #getConnectionWithTimeout(HostConfiguration, long) - */ - public HttpConnection getConnection( - final HostConfiguration hostConfiguration, final long timeout) { - return getConnectionWithTimeout(hostConfiguration, timeout); - } - - /** - * @see 
HttpConnectionManager#releaseConnection(org.apache.commons.httpclient.HttpConnection) - */ - public void releaseConnection(final HttpConnection conn) { - final ConnectionInfo ci = getConnectionInfo(); - HttpConnection httpConnection = ci.conn; - - if(conn != httpConnection) { - throw new IllegalStateException( - "Unexpected release of an unknown connection."); - } - - finishLastResponse(httpConnection); - - // track the time the connection was made idle - ci.idleStartTime = System.currentTimeMillis(); - } - - /** - * Returns {@link HttpConnectionManagerParams parameters} associated - * with this connection manager. - * - * @since 2.1 - * - * @see HttpConnectionManagerParams - */ - public HttpConnectionManagerParams getParams() { - return this.params; - } - - /** - * Assigns {@link HttpConnectionManagerParams parameters} for this - * connection manager. - * - * @since 2.1 - * - * @see HttpConnectionManagerParams - */ - public void setParams(final HttpConnectionManagerParams p) { - if(p == null) { - throw new IllegalArgumentException("Parameters may not be null"); - } - this.params = p; - } - - /** - * @since 3.0 - */ - public void closeIdleConnections(final long idleTimeout) { - long maxIdleTime = System.currentTimeMillis() - idleTimeout; - - final ConnectionInfo ci = getConnectionInfo(); - - if(ci.idleStartTime <= maxIdleTime) { - ci.conn.close(); - } - } - - private static final class CloserThread extends Thread { - private List connections - = new ArrayList(); - - private static final int SLEEP_INTERVAL = 5000; - - public CloserThread() { - super("HttpConnection closer"); - // Make this a daemon thread so it can't be responsible for the JVM - // not shutting down. 
- setDaemon(true); - start(); - } - - public void closeConnection(final HttpConnection conn) { - synchronized (connections) { - connections.add(conn); - } - } - - public void run() { - try { - while (!Thread.interrupted()) { - Thread.sleep(SLEEP_INTERVAL); - - List s; - synchronized (connections) { - s = connections; - connections = new ArrayList(); - } - logger.log(Level.INFO, "Closing " + s.size() - + " HttpConnections"); - for(final Iterator it = s.iterator(); - it.hasNext();) { - HttpConnection conn = it.next(); - conn.close(); - conn.setHttpConnectionManager(null); - it.remove(); - } - } - } catch (InterruptedException e) { - return; - } - } - } -} diff --git a/src/main/java/org/archive/io/HeaderedArchiveRecord.java b/src/main/java/org/archive/io/HeaderedArchiveRecord.java index 3cce595b..ac4b82f6 100644 --- a/src/main/java/org/archive/io/HeaderedArchiveRecord.java +++ b/src/main/java/org/archive/io/HeaderedArchiveRecord.java @@ -26,10 +26,7 @@ import java.io.OutputStream; import java.io.PrintStream; -import org.apache.commons.httpclient.Header; -import org.apache.commons.httpclient.HttpParser; -import org.apache.commons.httpclient.StatusLine; -import org.apache.commons.httpclient.util.EncodingUtil; +import org.archive.format.http.HttpHeader; import org.archive.io.arc.ARCConstants; import org.archive.util.LaxHttpParser; @@ -59,7 +56,7 @@ public class HeaderedArchiveRecord extends ArchiveRecord { * * Only available after the reading of headers. 
*/ - private Header [] contentHeaders = null; + private HttpHeader[] contentHeaders = null; public HeaderedArchiveRecord(final ArchiveRecord ar) throws IOException { @@ -149,13 +146,14 @@ private InputStream readContentHeaders() throws IOException { throw new IOException("Failed to read raw lie where one " + " was expected: " + new String(statusBytes)); } - String statusLine = EncodingUtil.getString(statusBytes, 0, + String statusLine = new String(statusBytes, 0, statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); if (statusLine == null) { throw new NullPointerException("Expected status line is null"); } + statusLine = statusLine.trim(); // TODO: Tighten up this test. - boolean isHttpResponse = StatusLine.startsWithHTTP(statusLine); + boolean isHttpResponse = statusLine.startsWith("HTTP"); boolean isHttpRequest = false; if (!isHttpResponse) { isHttpRequest = statusLine.toUpperCase().startsWith("GET") || @@ -165,9 +163,13 @@ private InputStream readContentHeaders() throws IOException { throw new UnexpectedStartLineIOException("Failed parse of " + "status line: " + statusLine); } - this.statusCode = isHttpResponse? - (new StatusLine(statusLine)).getStatusCode(): -1; - + + if (isHttpResponse) { + this.statusCode = parseStatusCode(statusLine); + } else { + this.statusCode = -1; + } + // Save off all bytes read. 
Keep them as bytes rather than // convert to strings so we don't have to worry about encodings // though this should never be a problem doing http headers since @@ -210,7 +212,21 @@ private InputStream readContentHeaders() throws IOException { bais.reset(); return bais; } - + + /** Parses the numeric status code out of an HTTP status line such as {@code "HTTP/1.1 200 OK"}; runs of spaces between the protocol version and the code are skipped. Returns -1 when the code is missing or not a number. */ + public static int parseStatusCode(String statusLine) { + int i = statusLine.indexOf(' '); + if (i < 0) return -1; + while (i + 1 < statusLine.length() && statusLine.charAt(i + 1) == ' ') i++; + int j = statusLine.indexOf(' ', i + 1); + if (j < 0) j = statusLine.length(); + try { + return Integer.parseInt(statusLine.substring(i + 1, j)); + } catch (NumberFormatException e) { + return -1; + } + } + public static class UnexpectedStartLineIOException extends RecoverableIOException { private static final long serialVersionUID = 1L; @@ -252,7 +268,7 @@ public int getContentHeadersLength() { return this.contentHeadersLength; } - public Header[] getContentHeaders() { + public HttpHeader[] getContentHeaders() { return contentHeaders; } diff --git a/src/main/java/org/archive/io/arc/ARC2WCDX.java b/src/main/java/org/archive/io/arc/ARC2WCDX.java index 19010131..f0515694 100644 --- a/src/main/java/org/archive/io/arc/ARC2WCDX.java +++ b/src/main/java/org/archive/io/arc/ARC2WCDX.java @@ -22,14 +22,12 @@ import java.io.FileOutputStream; import java.io.IOException; import java.io.PrintStream; -import java.util.Date; -import java.util.Iterator; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.*; import java.util.zip.GZIPOutputStream; -import org.apache.commons.httpclient.Header; -import org.apache.commons.httpclient.HeaderGroup; -import org.apache.commons.httpclient.util.DateParseException; -import org.apache.commons.httpclient.util.DateUtil; +import org.archive.format.http.HttpHeader; import org.archive.io.ArchiveRecord; import org.archive.util.ArchiveUtils; import org.archive.util.SURT; @@ -95,12 +93,15 @@ public static Object[] createWcdx(ARCReader reader) { ARCRecord record = (ARCRecord) iter.next(); record.close(); ARCRecordMetaData h = 
(ARCRecordMetaData) record.getHeader(); - Header[] httpHeaders = record.getHttpHeaders(); + HttpHeader[] httpHeaders = record.getHttpHeaders(); if(httpHeaders==null) { - httpHeaders = new Header[0]; + httpHeaders = new HttpHeader[0]; } - HeaderGroup hg = new HeaderGroup(); - hg.setHeaders(httpHeaders); + Map headerMap = new HashMap<>(); + for (HttpHeader header : httpHeaders) { + headerMap.putIfAbsent(header.getName().toLowerCase(Locale.ROOT), header); + } + StringBuilder builder = new StringBuilder(); // SURT-form URI @@ -108,7 +109,7 @@ public static Object[] createWcdx(ARCReader reader) { // record timestamp ('b') appendField(builder,h.getDate()); // http header date - appendTimeField(builder,hg.getFirstHeader("Date")); + appendTimeField(builder, headerMap.get("date")); // response code ('s') appendField(builder,h.getStatusCode()); // media type ('m') @@ -131,17 +132,17 @@ public static Object[] createWcdx(ARCReader reader) { // uncompressed (declared in ARC headerline) record length appendField(builder,h.getLength()); // http header content-length - appendField(builder,hg.getFirstHeader("Content-Length")); + appendField(builder, headerMap.get("content-length")); // http header mod-date - appendTimeField(builder,hg.getFirstHeader("Last-Modified")); + appendTimeField(builder, headerMap.get("last-modified")); // http header expires - appendTimeField(builder,hg.getFirstHeader("Expires")); + appendTimeField(builder, headerMap.get("expires")); // http header etag - appendField(builder,hg.getFirstHeader("ETag")); + appendField(builder, headerMap.get("etag")); // http header redirect ('Location' header?) 
- appendField(builder,hg.getFirstHeader("Location")); + appendField(builder, headerMap.get("location")); // ip ('e') appendField(builder,h.getIp()); // original URI @@ -186,8 +187,8 @@ protected static void appendField(StringBuilder builder, Object obj) { // prepend with delimiter builder.append(' '); } - if(obj instanceof Header) { - obj = ((Header)obj).getValue().trim(); + if(obj instanceof HttpHeader) { + obj = ((HttpHeader)obj).getValue().trim(); } builder.append((obj==null||obj.toString().length()==0)?"-":obj); @@ -202,16 +203,16 @@ protected static void appendTimeField(StringBuilder builder, Object obj) { builder.append("-"); return; } - if(obj instanceof Header) { - String s = ((Header)obj).getValue().trim(); + if(obj instanceof HttpHeader) { + String s = ((HttpHeader)obj).getValue().trim(); try { - Date date = DateUtil.parseDate(s); + Date date = parseDate(s); String d = ArchiveUtils.get14DigitDate(date); if(d.startsWith("209")) { d = "199"+d.substring(3); } obj = d; - } catch (DateParseException e) { + } catch (ParseException e) { builder.append('e'); return; } @@ -219,6 +220,24 @@ protected static void appendTimeField(StringBuilder builder, Object obj) { } builder.append(obj); } + + /** Parses an HTTP date header value, trying the RFC 1123, RFC 850 and asctime patterns in that order; two-digit years are resolved into the century starting 2000-01-01 GMT. Throws ParseException when no pattern matches. */ + private static Date parseDate(String s) throws ParseException { + SimpleDateFormat format = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss zzz", Locale.US); + format.setTimeZone(TimeZone.getTimeZone("GMT")); + format.set2DigitYearStart(new Date(946684800000L)); // 2000-01-01T00:00:00Z in epoch milliseconds + try { + return format.parse(s); + } catch (ParseException e) { + try { + format.applyPattern("EEEE, dd-MMM-yy HH:mm:ss zzz"); + return format.parse(s); + } catch (ParseException e1) { + format.applyPattern("EEE MMM d HH:mm:ss yyyy"); + return format.parse(s); + } + } + } } //'wide' CDX diff --git a/src/main/java/org/archive/io/arc/ARCRecord.java b/src/main/java/org/archive/io/arc/ARCRecord.java index bacaca38..d3c036ba 100644 --- a/src/main/java/org/archive/io/arc/ARCRecord.java +++ 
b/src/main/java/org/archive/io/arc/ARCRecord.java @@ -32,12 +32,11 @@ import java.util.logging.Logger; import java.util.regex.Matcher; -import org.apache.commons.httpclient.Header; -import org.apache.commons.httpclient.StatusLine; -import org.apache.commons.httpclient.util.EncodingUtil; import org.apache.commons.lang.StringUtils; +import org.archive.format.http.HttpHeader; import org.archive.io.ArchiveRecord; import org.archive.io.ArchiveRecordHeader; +import org.archive.io.HeaderedArchiveRecord; import org.archive.io.RecoverableIOException; import org.archive.util.InetAddressUtil; import org.archive.util.LaxHttpParser; @@ -50,11 +49,11 @@ */ public class ARCRecord extends ArchiveRecord implements ARCConstants { /** - * Http status line object. + * Http status code. * - * May be null if record is not http. + * May be -1 if record is not http. */ - private StatusLine httpStatus = null; + private int statusCode = -1; /** * Http header bytes. @@ -69,7 +68,7 @@ public class ARCRecord extends ArchiveRecord implements ARCConstants { * * Only populated after reading of headers. */ - private Header [] httpHeaders = null; + private HttpHeader[] httpHeaders = null; /** * Array of field names. @@ -589,8 +588,8 @@ private InputStream readHttpHeader() throws IOException { "Failed to read http status where one was expected: " + ((statusBytes == null) ? 
"" : new String(statusBytes))); } - - statusLine = EncodingUtil.getString(statusBytes, 0, + + statusLine = new String(statusBytes, 0, statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); // If a null or DELETED break immediately @@ -600,7 +599,7 @@ private InputStream readHttpHeader() throws IOException { // If it's actually the status line, break, otherwise continue skipping any // previous header values - if (!statusLine.contains(":") && StatusLine.startsWithHTTP(statusLine)) { + if (!statusLine.contains(":") && statusLine.trim().startsWith("HTTP")) { break; } @@ -613,7 +612,7 @@ private InputStream readHttpHeader() throws IOException { } if ((statusLine == null) || - !StatusLine.startsWithHTTP(statusLine)) { + !statusLine.trim().startsWith("HTTP")) { if (statusLine.startsWith("DELETED")) { // Some old ARCs have deleted records like following: // http://vireo.gatech.edu:80/ebt-bin/nph-dweb/dynaweb/SGI_Developer/SGITCL_PG/@Generic__BookTocView/11108%3Btd%3D2 130.207.168.42 19991010131803 text/html 29202 @@ -629,13 +628,12 @@ private InputStream readHttpHeader() throws IOException { } } - try { - this.httpStatus = new StatusLine(statusLine); - } catch(IOException e) { - logger.warning(e.getMessage() + " at offset: " + h.getOffset()); - this.errors.add(ArcRecordErrors.HTTP_STATUS_LINE_EXCEPTION); + this.statusCode = HeaderedArchiveRecord.parseStatusCode(statusLine.trim()); + if (statusCode == -1) { + logger.warning("Bad status line at offset: " + h.getOffset()); + this.errors.add(ArcRecordErrors.HTTP_STATUS_LINE_EXCEPTION); } - + // Save off all bytes read. Keep them as bytes rather than // convert to strings so we don't have to worry about encodings // though this should never be a problem doing http headers since @@ -706,7 +704,7 @@ public DeletedARCRecordIOException(final String reason) { * @return Status code. */ public int getStatusCode() { - return (this.httpStatus == null)? 
-1: this.httpStatus.getStatusCode(); + return statusCode; } /** @@ -735,7 +733,7 @@ public ARCRecordMetaData getMetaData() { /** * @return http headers (Only available after header has been read). */ - public Header [] getHttpHeaders() { + public HttpHeader[] getHttpHeaders() { return this.httpHeaders; } diff --git a/src/main/java/org/archive/io/warc/WARCRecord.java b/src/main/java/org/archive/io/warc/WARCRecord.java index 635d1c3b..cf106270 100644 --- a/src/main/java/org/archive/io/warc/WARCRecord.java +++ b/src/main/java/org/archive/io/warc/WARCRecord.java @@ -29,8 +29,7 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; -import org.apache.commons.httpclient.Header; -import org.apache.commons.httpclient.HttpParser; +import org.archive.format.http.HttpHeader; import org.archive.io.ArchiveRecord; import org.archive.io.ArchiveRecordHeader; import org.archive.util.LaxHttpParser; @@ -123,7 +122,7 @@ protected ArchiveRecordHeader parseHeaders(final InputStream in, // keep count of bytes read, digest and fail properly if EOR too soon... // We don't want digesting while reading Headers. 
// - Header [] h = LaxHttpParser.parseHeaders(in, WARC_HEADER_ENCODING); + HttpHeader[] h = LaxHttpParser.parseHeaders(in, WARC_HEADER_ENCODING); for (int i = 0; i < h.length; i++) { m.put(h[i].getName(), h[i].getValue()); } diff --git a/src/main/java/org/archive/resource/html/HTMLResourceFactory.java b/src/main/java/org/archive/resource/html/HTMLResourceFactory.java index afb1c850..6e95270c 100644 --- a/src/main/java/org/archive/resource/html/HTMLResourceFactory.java +++ b/src/main/java/org/archive/resource/html/HTMLResourceFactory.java @@ -4,9 +4,8 @@ import java.io.IOException; import java.io.InputStream; import java.io.UnsupportedEncodingException; +import java.util.logging.Logger; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; import org.archive.format.http.HttpHeaders; import org.archive.format.json.JSONUtils; import org.archive.format.text.charset.CharsetDetector; @@ -25,7 +24,7 @@ public class HTMLResourceFactory implements ResourceFactory { - public static final Log LOG = LogFactory.getLog(HTMLResourceFactory.class); + private static final Logger LOG = Logger.getLogger(HTMLResourceFactory.class.getName()); protected static final int CHARSET_GUESS_CHUNK_SIZE = 8192; protected static final String HTTP_HEADER_PATH = "Envelope.Payload-Metadata.HTTP-Response-Metadata.Headers"; @@ -58,7 +57,7 @@ public Resource getResource(InputStream is, MetaData parentMetaData, try { charset = charSetDetector.getCharset(chunk, chunkSize, httpHeaders); } catch (Exception e) { - LOG.error("Failed to guess charset: " + e.getMessage()); + LOG.severe("Failed to guess charset: " + e.getMessage()); } } diff --git a/src/main/java/org/archive/url/LaxURI.java b/src/main/java/org/archive/url/LaxURI.java index d7318dfd..57071460 100644 --- a/src/main/java/org/archive/url/LaxURI.java +++ b/src/main/java/org/archive/url/LaxURI.java @@ -18,13 +18,11 @@ */ package org.archive.url; +import java.io.UnsupportedEncodingException; +import 
java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.BitSet; -import org.apache.commons.httpclient.URI; -import org.apache.commons.httpclient.URIException; -import org.apache.commons.httpclient.util.EncodingUtil; - /** * URI subclass which allows partial/inconsistent encoding, matching * the URIs which will be relayed in requests from popular web @@ -121,13 +119,12 @@ protected static String decode(String component, String charset) "Component array of chars may not be null"); } byte[] rawdata = null; - // try { - rawdata = LaxURLCodec.decodeUrlLoose(EncodingUtil - .getAsciiBytes(component)); - // } catch (DecoderException e) { - // throw new URIException(e.getMessage()); - // } - return EncodingUtil.getString(rawdata, charset); + rawdata = LaxURLCodec.decodeUrlLoose(component.getBytes(StandardCharsets.US_ASCII)); + try { + return new String(rawdata, charset); + } catch (UnsupportedEncodingException e) { + return new String(rawdata); + } } // overidden to lax() the acceptable-char BitSet passed in @@ -183,7 +180,7 @@ protected BitSet lax(BitSet generous) { * two instances to one where possible, slimming * instances. * - * @see org.apache.commons.httpclient.URI#parseAuthority(java.lang.String, boolean) + * @see URI#parseAuthority(java.lang.String, boolean) */ protected void parseAuthority(String original, boolean escaped) throws URIException { @@ -204,7 +201,7 @@ protected void parseAuthority(String original, boolean escaped) * long-lived instance from a static field, saving 12-14 bytes * per instance. 
* - * @see org.apache.commons.httpclient.URI#setURI() + * @see URI#setURI() */ protected void setURI() { if (_scheme != null) { diff --git a/src/main/java/org/archive/url/SURT.java b/src/main/java/org/archive/url/SURT.java index 2c8e1b02..3e0bcd55 100644 --- a/src/main/java/org/archive/url/SURT.java +++ b/src/main/java/org/archive/url/SURT.java @@ -6,7 +6,6 @@ import java.util.Iterator; import java.util.logging.Logger; -import org.apache.commons.httpclient.URIException; import org.archive.util.iterator.AbstractPeekableIterator; public class SURT { diff --git a/src/main/java/org/archive/url/SURTTokenizer.java b/src/main/java/org/archive/url/SURTTokenizer.java index da8f58f2..52b80a03 100644 --- a/src/main/java/org/archive/url/SURTTokenizer.java +++ b/src/main/java/org/archive/url/SURTTokenizer.java @@ -19,7 +19,6 @@ */ package org.archive.url; -import org.apache.commons.httpclient.URIException; import org.archive.util.SURT; /** diff --git a/src/main/java/org/archive/url/URI.java b/src/main/java/org/archive/url/URI.java new file mode 100644 index 00000000..e420ca51 --- /dev/null +++ b/src/main/java/org/archive/url/URI.java @@ -0,0 +1,3978 @@ +/* + * $HeadURL: https://svn.apache.org/repos/asf/jakarta/httpcomponents/oac.hc3x/tags/HTTPCLIENT_3_1/src/java/org/apache/commons/httpclient/URI.java $ + * $Revision: 564973 $ + * $Date: 2007-08-11 22:51:47 +0200 (Sat, 11 Aug 2007) $ + * + * ==================================================================== + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * . + * + */ + +package org.archive.url; + +import org.apache.commons.codec.DecoderException; +import org.apache.commons.codec.net.URLCodec; + +import java.io.*; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.BitSet; +import java.util.Hashtable; +import java.util.Locale; + +/** + * The interface for the URI(Uniform Resource Identifiers) version of RFC 2396. + * This class has the purpose of supportting of parsing a URI reference to + * extend any specific protocols, the character encoding of the protocol to + * be transported and the charset of the document. + *

+ * A URI is always in an "escaped" form, since escaping or unescaping a + * completed URI might change its semantics. + *

+ * Implementers should be careful not to escape or unescape the same string + * more than once, since unescaping an already unescaped string might lead to + * misinterpreting a percent data character as another escaped character, + * or vice versa in the case of escaping an already escaped string. + *

+ * In order to avoid these problems, data types used as follows: + *

+ *   URI character sequence: char
+ *   octet sequence: byte
+ *   original character sequence: String
+ * 

+ * + * So, a URI is a sequence of characters as an array of a char type, which + * is not always represented as a sequence of octets as an array of byte. + *

+ * + * URI Syntactic Components + *

+ * - In general, written as follows:
+ *   Absolute URI = <scheme>:<scheme-specific-part>
+ *   Generic URI = <scheme>://<authority><path>?<query>
+ *
+ * - Syntax
+ *   absoluteURI   = scheme ":" ( hier_part | opaque_part )
+ *   hier_part     = ( net_path | abs_path ) [ "?" query ]
+ *   net_path      = "//" authority [ abs_path ]
+ *   abs_path      = "/"  path_segments
+ * 

+ * + * The following examples illustrate URI that are in common use. + *

+ * ftp://ftp.is.co.za/rfc/rfc1808.txt
+ *    -- ftp scheme for File Transfer Protocol services
+ * gopher://spinaltap.micro.umn.edu/00/Weather/California/Los%20Angeles
+ *    -- gopher scheme for Gopher and Gopher+ Protocol services
+ * http://www.math.uio.no/faq/compression-faq/part1.html
+ *    -- http scheme for Hypertext Transfer Protocol services
+ * mailto:mduerst@ifi.unizh.ch
+ *    -- mailto scheme for electronic mail addresses
+ * news:comp.infosystems.www.servers.unix
+ *    -- news scheme for USENET news groups and articles
+ * telnet://melvyl.ucop.edu/
+ *    -- telnet scheme for interactive services via the TELNET Protocol
+ * 
+ * Please, notice that there are many modifications from URL(RFC 1738) and + * relative URL(RFC 1808). + *

+ * The expressions for a URI + *

+ * For escaped URI forms
+ *  - URI(char[]) // constructor
+ *  - char[] getRawXxx() // method
+ *  - String getEscapedXxx() // method
+ *  - String toString() // method
+ * 

+ * For unescaped URI forms + * - URI(String) // constructor + * - String getXXX() // method + *

+ * + * @author Sung-Gu + * @author Mike Bowler + * @version $Revision: 564973 $ $Date: 2002/03/14 15:14:01 + */ +class URI implements Cloneable, Comparable, Serializable { + + + // ----------------------------------------------------------- Constructors + + /** Create an instance as an internal use */ + protected URI() { + } + + /** + * Construct a URI from a string with the given charset. The input string can + * be either in escaped or unescaped form. + * + * @param s URI character sequence + * @param escaped true if URI character sequence is in escaped form. + * false otherwise. + * @param charset the charset string to do escape encoding, if required + * + * @throws URIException If the URI cannot be created. + * @throws NullPointerException if input string is null + * + * @see #getProtocolCharset + * + * @since 3.0 + */ + public URI(String s, boolean escaped, String charset) + throws URIException, NullPointerException { + protocolCharset = charset; + parseUriReference(s, escaped); + } + + /** + * Construct a URI from a string with the given charset. The input string can + * be either in escaped or unescaped form. + * + * @param s URI character sequence + * @param escaped true if URI character sequence is in escaped form. + * false otherwise. + * + * @throws URIException If the URI cannot be created. + * @throws NullPointerException if input string is null + * + * @see #getProtocolCharset + * + * @since 3.0 + */ + public URI(String s, boolean escaped) + throws URIException, NullPointerException { + parseUriReference(s, escaped); + } + + /** + * Construct a URI as an escaped form of a character array with the given + * charset. + * + * @param escaped the URI character sequence + * @param charset the charset string to do escape encoding + * @throws URIException If the URI cannot be created. 
+ * @throws NullPointerException if escaped is null + * @see #getProtocolCharset + * + * @deprecated Use #URI(String, boolean, String) + */ + public URI(char[] escaped, String charset) + throws URIException, NullPointerException { + protocolCharset = charset; + parseUriReference(new String(escaped), true); + } + + + /** + * Construct a URI as an escaped form of a character array. + * An URI can be placed within double-quotes or angle brackets like + * "http://test.com/" and <http://test.com/> + * + * @param escaped the URI character sequence + * @throws URIException If the URI cannot be created. + * @throws NullPointerException if escaped is null + * @see #getDefaultProtocolCharset + * + * @deprecated Use #URI(String, boolean) + */ + public URI(char[] escaped) + throws URIException, NullPointerException { + parseUriReference(new String(escaped), true); + } + + + /** + * Construct a URI from the given string with the given charset. + * + * @param original the string to be represented to URI character sequence + * It is one of absoluteURI and relativeURI. + * @param charset the charset string to do escape encoding + * @throws URIException If the URI cannot be created. + * @see #getProtocolCharset + * + * @deprecated Use #URI(String, boolean, String) + */ + public URI(String original, String charset) throws URIException { + protocolCharset = charset; + parseUriReference(original, false); + } + + + /** + * Construct a URI from the given string. + *

+     *   URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
+     * 

+ * An URI can be placed within double-quotes or angle brackets like + * "http://test.com/" and <http://test.com/> + * + * @param original the string to be represented to URI character sequence + * It is one of absoluteURI and relativeURI. + * @throws URIException If the URI cannot be created. + * @see #getDefaultProtocolCharset + * + * @deprecated Use #URI(String, boolean) + */ + public URI(String original) throws URIException { + parseUriReference(original, false); + } + + + /** + * Construct a general URI from the given components. + *

+     *   URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
+     *   absoluteURI   = scheme ":" ( hier_part | opaque_part )
+     *   opaque_part   = uric_no_slash *uric
+     * 

+ * It's for absolute URI = &lt;scheme&gt;:&lt;scheme-specific-part&gt;# + * &lt;fragment&gt;. + * + * @param scheme the scheme string + * @param schemeSpecificPart scheme_specific_part + * @param fragment the fragment string + * @throws URIException If the URI cannot be created. + * @see #getDefaultProtocolCharset + */ + public URI(String scheme, String schemeSpecificPart, String fragment) + throws URIException { + + // validate and construct the URI character sequence + if (scheme == null) { + throw new URIException(URIException.PARSING, "scheme required"); + } + char[] s = scheme.toLowerCase(Locale.ROOT).toCharArray(); + if (validate(s, URI.scheme)) { + _scheme = s; // is_absoluteURI + } else { + throw new URIException(URIException.PARSING, "incorrect scheme"); + } + _opaque = encode(schemeSpecificPart, allowed_opaque_part, + getProtocolCharset()); + // Set flag + _is_opaque_part = true; + _fragment = fragment == null ? null : fragment.toCharArray(); + setURI(); + } + + + /** + * Construct a general URI from the given components. + *

+     *   URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
+     *   absoluteURI   = scheme ":" ( hier_part | opaque_part )
+     *   relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
+     *   hier_part     = ( net_path | abs_path ) [ "?" query ]
+     * 

+ * It's for absolute URI = <scheme>:<path>?<query>#< + * fragment> and relative URI = <path>?<query>#<fragment + * >. + * + * @param scheme the scheme string + * @param authority the authority string + * @param path the path string + * @param query the query string + * @param fragment the fragment string + * @throws URIException If the new URI cannot be created. + * @see #getDefaultProtocolCharset + */ + public URI(String scheme, String authority, String path, String query, + String fragment) throws URIException { + + // validate and contruct the URI character sequence + StringBuffer buff = new StringBuffer(); + if (scheme != null) { + buff.append(scheme); + buff.append(':'); + } + if (authority != null) { + buff.append("//"); + buff.append(authority); + } + if (path != null) { // accept empty path + if ((scheme != null || authority != null) + && !path.startsWith("/")) { + throw new URIException(URIException.PARSING, + "abs_path requested"); + } + buff.append(path); + } + if (query != null) { + buff.append('?'); + buff.append(query); + } + if (fragment != null) { + buff.append('#'); + buff.append(fragment); + } + parseUriReference(buff.toString(), false); + } + + + /** + * Construct a general URI from the given components. + * + * @param scheme the scheme string + * @param userinfo the userinfo string + * @param host the host string + * @param port the port number + * @throws URIException If the new URI cannot be created. + * @see #getDefaultProtocolCharset + */ + public URI(String scheme, String userinfo, String host, int port) + throws URIException { + + this(scheme, userinfo, host, port, null, null, null); + } + + + /** + * Construct a general URI from the given components. + * + * @param scheme the scheme string + * @param userinfo the userinfo string + * @param host the host string + * @param port the port number + * @param path the path string + * @throws URIException If the new URI cannot be created. 
+ * @see #getDefaultProtocolCharset + */ + public URI(String scheme, String userinfo, String host, int port, + String path) throws URIException { + + this(scheme, userinfo, host, port, path, null, null); + } + + + /** + * Construct a general URI from the given components. + * + * @param scheme the scheme string + * @param userinfo the userinfo string + * @param host the host string + * @param port the port number + * @param path the path string + * @param query the query string + * @throws URIException If the new URI cannot be created. + * @see #getDefaultProtocolCharset + */ + public URI(String scheme, String userinfo, String host, int port, + String path, String query) throws URIException { + + this(scheme, userinfo, host, port, path, query, null); + } + + + /** + * Construct a general URI from the given components. + * + * @param scheme the scheme string + * @param userinfo the userinfo string + * @param host the host string + * @param port the port number + * @param path the path string + * @param query the query string + * @param fragment the fragment string + * @throws URIException If the new URI cannot be created. + * @see #getDefaultProtocolCharset + */ + public URI(String scheme, String userinfo, String host, int port, + String path, String query, String fragment) throws URIException { + + this(scheme, (host == null) ? null + : ((userinfo != null) ? userinfo + '@' : "") + host + + ((port != -1) ? ":" + port : ""), path, query, fragment); + } + + + /** + * Construct a general URI from the given components. + * + * @param scheme the scheme string + * @param host the host string + * @param path the path string + * @param fragment the fragment string + * @throws URIException If the new URI cannot be created. 
+ * @see #getDefaultProtocolCharset + */ + public URI(String scheme, String host, String path, String fragment) + throws URIException { + + this(scheme, host, path, null, fragment); + } + + + /** + * Construct a general URI with the given relative URI string. + * + * @param base the base URI + * @param relative the relative URI string + * @throws URIException If the new URI cannot be created. + * + * @deprecated Use #URI(URI, String, boolean) + */ + public URI(URI base, String relative) throws URIException { + this(base, new URI(relative)); + } + + + /** + * Construct a general URI with the given relative URI string. + * + * @param base the base URI + * @param relative the relative URI string + * @param escaped true if URI character sequence is in escaped form. + * false otherwise. + * + * @throws URIException If the new URI cannot be created. + * + * @since 3.0 + */ + public URI(URI base, String relative, boolean escaped) throws URIException { + this(base, new URI(relative, escaped)); + } + + + /** + * Construct a general URI with the given relative URI. + *

+     *   URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
+     *   relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
+     * 

+ * Resolving Relative References to Absolute Form. + * + * Examples of Resolving Relative URI References + * + * Within an object with a well-defined base URI of + *

+     *   http://a/b/c/d;p?q
+     * 

+ * the relative URI would be resolved as follows: + * + * Normal Examples + * + *

+     *   g:h           =  g:h
+     *   g             =  http://a/b/c/g
+     *   ./g           =  http://a/b/c/g
+     *   g/            =  http://a/b/c/g/
+     *   /g            =  http://a/g
+     *   //g           =  http://g
+     *   ?y            =  http://a/b/c/?y
+     *   g?y           =  http://a/b/c/g?y
+     *   #s            =  (current document)#s
+     *   g#s           =  http://a/b/c/g#s
+     *   g?y#s         =  http://a/b/c/g?y#s
+     *   ;x            =  http://a/b/c/;x
+     *   g;x           =  http://a/b/c/g;x
+     *   g;x?y#s       =  http://a/b/c/g;x?y#s
+     *   .             =  http://a/b/c/
+     *   ./            =  http://a/b/c/
+     *   ..            =  http://a/b/
+     *   ../           =  http://a/b/
+     *   ../g          =  http://a/b/g
+     *   ../..         =  http://a/
+     *   ../../        =  http://a/ 
+     *   ../../g       =  http://a/g
+     * 

+ * + * Some URI schemes do not allow a hierarchical syntax matching the + * syntax, and thus cannot use relative references. + * + * @param base the base URI + * @param relative the relative URI + * @throws URIException If the new URI cannot be created. + */ + public URI(URI base, URI relative) throws URIException { + + if (base._scheme == null) { + throw new URIException(URIException.PARSING, "base URI required"); + } + if (base._scheme != null) { + this._scheme = base._scheme; + this._authority = base._authority; + this._is_net_path = base._is_net_path; + } + if (base._is_opaque_part || relative._is_opaque_part) { + this._scheme = base._scheme; + this._is_opaque_part = base._is_opaque_part + || relative._is_opaque_part; + this._opaque = relative._opaque; + this._fragment = relative._fragment; + this.setURI(); + return; + } + boolean schemesEqual = Arrays.equals(base._scheme,relative._scheme); + if (relative._scheme != null + && (!schemesEqual || relative._authority != null)) { + this._scheme = relative._scheme; + this._is_net_path = relative._is_net_path; + this._authority = relative._authority; + if (relative._is_server) { + this._is_server = relative._is_server; + this._userinfo = relative._userinfo; + this._host = relative._host; + this._port = relative._port; + } else if (relative._is_reg_name) { + this._is_reg_name = relative._is_reg_name; + } + this._is_abs_path = relative._is_abs_path; + this._is_rel_path = relative._is_rel_path; + this._path = relative._path; + } else if (base._authority != null && relative._scheme == null) { + this._is_net_path = base._is_net_path; + this._authority = base._authority; + if (base._is_server) { + this._is_server = base._is_server; + this._userinfo = base._userinfo; + this._host = base._host; + this._port = base._port; + } else if (base._is_reg_name) { + this._is_reg_name = base._is_reg_name; + } + } + if (relative._authority != null) { + this._is_net_path = relative._is_net_path; + this._authority = relative._authority; + 
if (relative._is_server) { + this._is_server = relative._is_server; + this._userinfo = relative._userinfo; + this._host = relative._host; + this._port = relative._port; + } else if (relative._is_reg_name) { + this._is_reg_name = relative._is_reg_name; + } + this._is_abs_path = relative._is_abs_path; + this._is_rel_path = relative._is_rel_path; + this._path = relative._path; + } + // resolve the path and query if necessary + if (relative._authority == null + && (relative._scheme == null || schemesEqual)) { + if ((relative._path == null || relative._path.length == 0) + && relative._query == null) { + // handle a reference to the current document, see RFC 2396 + // section 5.2 step 2 + this._path = base._path; + this._query = base._query; + } else { + this._path = resolvePath(base._path, relative._path); + } + } + // base._query removed + if (relative._query != null) { + this._query = relative._query; + } + // base._fragment removed + if (relative._fragment != null) { + this._fragment = relative._fragment; + } + this.setURI(); + // reparse the newly built URI, this will ensure that all flags are set correctly. + // TODO there must be a better way to do this + parseUriReference(new String(_uri), true); + } + + // --------------------------------------------------- Instance Variables + + /** Version ID for serialization */ + static final long serialVersionUID = 604752400577948726L; + + + /** + * Cache the hash code for this URI. + */ + protected int hash = 0; + + + /** + * This Uniform Resource Identifier (URI). + * The URI is always in an "escaped" form, since escaping or unescaping + * a completed URI might change its semantics. + */ + protected char[] _uri = null; + + + /** + * The charset of the protocol used by this URI instance. + */ + protected String protocolCharset = null; + + + /** + * The default charset of the protocol. RFC 2277, 2396 + */ + protected static String defaultProtocolCharset = "UTF-8"; + + + /** + * The default charset of the document. 
RFC 2277, 2396 + * The platform's charset is used for the document by default. + */ + protected static String defaultDocumentCharset = null; + protected static String defaultDocumentCharsetByLocale = null; + protected static String defaultDocumentCharsetByPlatform = null; + // Static initializer for defaultDocumentCharset + static { + Locale locale = Locale.getDefault(); + // in order to support backward compatiblity + if (locale != null) { + defaultDocumentCharsetByLocale = + LocaleToCharsetMap.getCharset(locale); + // set the default document charset + defaultDocumentCharset = defaultDocumentCharsetByLocale; + } + // in order to support platform encoding + try { + defaultDocumentCharsetByPlatform = System.getProperty("file.encoding"); + } catch (SecurityException ignore) { + } + if (defaultDocumentCharset == null) { + // set the default document charset + defaultDocumentCharset = defaultDocumentCharsetByPlatform; + } + } + + + /** + * The scheme. + */ + protected char[] _scheme = null; + + + /** + * The opaque. + */ + protected char[] _opaque = null; + + + /** + * The authority. + */ + protected char[] _authority = null; + + + /** + * The userinfo. + */ + protected char[] _userinfo = null; + + + /** + * The host. + */ + protected char[] _host = null; + + + /** + * The port. + */ + protected int _port = -1; + + + /** + * The path. + */ + protected char[] _path = null; + + + /** + * The query. + */ + protected char[] _query = null; + + + /** + * The fragment. + */ + protected char[] _fragment = null; + + + /** + * The root path. + */ + protected static final char[] rootPath = { '/' }; + + // ---------------------- Generous characters for each component validation + + /** + * The percent "%" character always has the reserved purpose of being the + * escape indicator, it must be escaped as "%25" in order to be used as + * data within a URI. 
+ */ + protected static final BitSet percent = new BitSet(256); + // Static initializer for percent + static { + percent.set('%'); + } + + + /** + * BitSet for digit. + *

+     * digit    = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
+     *            "8" | "9"
+     * 

+ */ + protected static final BitSet digit = new BitSet(256); + // Static initializer for digit + static { + for (int i = '0'; i <= '9'; i++) { + digit.set(i); + } + } + + + /** + * BitSet for alpha. + *

+     * alpha         = lowalpha | upalpha
+     * 

+ */ + protected static final BitSet alpha = new BitSet(256); + // Static initializer for alpha + static { + for (int i = 'a'; i <= 'z'; i++) { + alpha.set(i); + } + for (int i = 'A'; i <= 'Z'; i++) { + alpha.set(i); + } + } + + + /** + * BitSet for alphanum (join of alpha & digit). + *

+     *  alphanum      = alpha | digit
+     * 

+ */ + protected static final BitSet alphanum = new BitSet(256); + // Static initializer for alphanum + static { + alphanum.or(alpha); + alphanum.or(digit); + } + + + /** + * BitSet for hex. + *

+     * hex           = digit | "A" | "B" | "C" | "D" | "E" | "F" |
+     *                         "a" | "b" | "c" | "d" | "e" | "f"
+     * 

+ */ + protected static final BitSet hex = new BitSet(256); + // Static initializer for hex + static { + hex.or(digit); + for (int i = 'a'; i <= 'f'; i++) { + hex.set(i); + } + for (int i = 'A'; i <= 'F'; i++) { + hex.set(i); + } + } + + + /** + * BitSet for escaped. + *

+     * escaped       = "%" hex hex
+     * 

+ */ + protected static final BitSet escaped = new BitSet(256); + // Static initializer for escaped + static { + escaped.or(percent); + escaped.or(hex); + } + + + /** + * BitSet for mark. + *

+     * mark          = "-" | "_" | "." | "!" | "~" | "*" | "'" |
+     *                 "(" | ")"
+     * 

+ */ + protected static final BitSet mark = new BitSet(256); + // Static initializer for mark + static { + mark.set('-'); + mark.set('_'); + mark.set('.'); + mark.set('!'); + mark.set('~'); + mark.set('*'); + mark.set('\''); + mark.set('('); + mark.set(')'); + } + + + /** + * Data characters that are allowed in a URI but do not have a reserved + * purpose are called unreserved. + *

+     * unreserved    = alphanum | mark
+     * 

+ */ + protected static final BitSet unreserved = new BitSet(256); + // Static initializer for unreserved + static { + unreserved.or(alphanum); + unreserved.or(mark); + } + + + /** + * BitSet for reserved. + *

+     * reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
+     *                 "$" | ","
+     * 

+ */ + protected static final BitSet reserved = new BitSet(256); + // Static initializer for reserved + static { + reserved.set(';'); + reserved.set('/'); + reserved.set('?'); + reserved.set(':'); + reserved.set('@'); + reserved.set('&'); + reserved.set('='); + reserved.set('+'); + reserved.set('$'); + reserved.set(','); + } + + + /** + * BitSet for uric. + *

+     * uric          = reserved | unreserved | escaped
+     * 

+ */ + protected static final BitSet uric = new BitSet(256); + // Static initializer for uric + static { + uric.or(reserved); + uric.or(unreserved); + uric.or(escaped); + } + + + /** + * BitSet for fragment (alias for uric). + *

+     * fragment      = *uric
+     * 

+ */ + protected static final BitSet fragment = uric; + + + /** + * BitSet for query (alias for uric). + *

+     * query         = *uric
+     * 

+ */ + protected static final BitSet query = uric; + + + /** + * BitSet for pchar. + *

+     * pchar         = unreserved | escaped |
+     *                 ":" | "@" | "&" | "=" | "+" | "$" | ","
+     * 

+ */ + protected static final BitSet pchar = new BitSet(256); + // Static initializer for pchar + static { + pchar.or(unreserved); + pchar.or(escaped); + pchar.set(':'); + pchar.set('@'); + pchar.set('&'); + pchar.set('='); + pchar.set('+'); + pchar.set('$'); + pchar.set(','); + } + + + /** + * BitSet for param (alias for pchar). + *

+     * param         = *pchar
+     * 

+ */ + protected static final BitSet param = pchar; + + + /** + * BitSet for segment. + *

+     * segment       = *pchar *( ";" param )
+     * 

+ */ + protected static final BitSet segment = new BitSet(256); + // Static initializer for segment + static { + segment.or(pchar); + segment.set(';'); + segment.or(param); + } + + + /** + * BitSet for path segments. + *

+     * path_segments = segment *( "/" segment )
+     * 

+ */ + protected static final BitSet path_segments = new BitSet(256); + // Static initializer for path_segments + static { + path_segments.set('/'); + path_segments.or(segment); + } + + + /** + * URI absolute path. + *

+     * abs_path      = "/"  path_segments
+     * 

+ */ + protected static final BitSet abs_path = new BitSet(256); + // Static initializer for abs_path + static { + abs_path.set('/'); + abs_path.or(path_segments); + } + + + /** + * URI bitset for encoding typical non-slash characters. + *

+     * uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" |
+     *                 "&" | "=" | "+" | "$" | ","
+     * 

+ */ + protected static final BitSet uric_no_slash = new BitSet(256); + // Static initializer for uric_no_slash + static { + uric_no_slash.or(unreserved); + uric_no_slash.or(escaped); + uric_no_slash.set(';'); + uric_no_slash.set('?'); + // ':' belongs to the uric_no_slash production; ';' is already set above + uric_no_slash.set(':'); + uric_no_slash.set('@'); + uric_no_slash.set('&'); + uric_no_slash.set('='); + uric_no_slash.set('+'); + uric_no_slash.set('$'); + uric_no_slash.set(','); + } + + + /** + * URI bitset that combines uric_no_slash and uric. + *

+     * opaque_part   = uric_no_slash *uric
+     * 

+ */ + protected static final BitSet opaque_part = new BitSet(256); + // Static initializer for opaque_part + static { + // it's generous. because first character must not include a slash + opaque_part.or(uric_no_slash); + opaque_part.or(uric); + } + + + /** + * URI bitset that combines absolute path and opaque part. + *

+     * path          = [ abs_path | opaque_part ]
+     * 

+ */ + protected static final BitSet path = new BitSet(256); + // Static initializer for path + static { + path.or(abs_path); + path.or(opaque_part); + } + + + /** + * Port, a logical alias for digit (the same BitSet instance, not a copy). + */ + protected static final BitSet port = digit; + + + /** + * Bitset that combines digit and dot for IPv4address. + *

+     * IPv4address   = 1*digit "." 1*digit "." 1*digit "." 1*digit
+     * 

+ */ + protected static final BitSet IPv4address = new BitSet(256); + // Static initializer for IPv4address + static { + IPv4address.or(digit); + IPv4address.set('.'); + } + + + /** + * RFC 2373. + *

+     * IPv6address = hexpart [ ":" IPv4address ]
+     * 

+ */ + protected static final BitSet IPv6address = new BitSet(256); + // Static initializer for IPv6address reference + static { + IPv6address.or(hex); // hexpart + IPv6address.set(':'); + IPv6address.or(IPv4address); + } + + + /** + * RFC 2732, 2373. + *

+     * IPv6reference   = "[" IPv6address "]"
+     * 

+ */ + protected static final BitSet IPv6reference = new BitSet(256); + // Static initializer for IPv6reference + static { + IPv6reference.set('['); + IPv6reference.or(IPv6address); + IPv6reference.set(']'); + } + + + /** + * BitSet for toplabel. + *

+     * toplabel      = alpha | alpha *( alphanum | "-" ) alphanum
+     * 

+ */ + protected static final BitSet toplabel = new BitSet(256); + // Static initializer for toplabel + static { + toplabel.or(alphanum); + toplabel.set('-'); + } + + + /** + * BitSet for domainlabel. + *

+     * domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum
+     * 

+ */ + protected static final BitSet domainlabel = toplabel; + + + /** + * BitSet for hostname. + *

+     * hostname      = *( domainlabel "." ) toplabel [ "." ]
+     * 

+ */ + protected static final BitSet hostname = new BitSet(256); + // Static initializer for hostname + static { + hostname.or(toplabel); + // hostname.or(domainlabel); + hostname.set('.'); + } + + + /** + * BitSet for host. + *

+     * host          = hostname | IPv4address | IPv6reference
+     * 

+ */ + protected static final BitSet host = new BitSet(256); + // Static initializer for host + static { + host.or(hostname); + // host.or(IPv4address); + host.or(IPv6reference); // IPv4address + } + + + /** + * BitSet for hostport. + *

+     * hostport      = host [ ":" port ]
+     * 

+ */ + protected static final BitSet hostport = new BitSet(256); + // Static initializer for hostport + static { + hostport.or(host); + hostport.set(':'); + hostport.or(port); + } + + + /** + * Bitset for userinfo. + *

+     * userinfo      = *( unreserved | escaped |
+     *                    ";" | ":" | "&" | "=" | "+" | "$" | "," )
+     * 

+ */ + protected static final BitSet userinfo = new BitSet(256); + // Static initializer for userinfo + static { + userinfo.or(unreserved); + userinfo.or(escaped); + userinfo.set(';'); + userinfo.set(':'); + userinfo.set('&'); + userinfo.set('='); + userinfo.set('+'); + userinfo.set('$'); + userinfo.set(','); + } + + + /** + * BitSet for within the userinfo component like user and password. + */ + public static final BitSet within_userinfo = new BitSet(256); + // Static initializer for within_userinfo + static { + within_userinfo.or(userinfo); + within_userinfo.clear(';'); // reserved within authority + within_userinfo.clear(':'); + within_userinfo.clear('@'); + within_userinfo.clear('?'); + within_userinfo.clear('/'); + } + + + /** + * Bitset for server. + *

+     * server        = [ [ userinfo "@" ] hostport ]
+     * 

+ */ + protected static final BitSet server = new BitSet(256); + // Static initializer for server + static { + server.or(userinfo); + server.set('@'); + server.or(hostport); + } + + + /** + * BitSet for reg_name. + *

+     * reg_name      = 1*( unreserved | escaped | "$" | "," |
+     *                     ";" | ":" | "@" | "&" | "=" | "+" )
+     * 

+ */ + protected static final BitSet reg_name = new BitSet(256); + // Static initializer for reg_name + static { + reg_name.or(unreserved); + reg_name.or(escaped); + reg_name.set('$'); + reg_name.set(','); + reg_name.set(';'); + reg_name.set(':'); + reg_name.set('@'); + reg_name.set('&'); + reg_name.set('='); + reg_name.set('+'); + } + + + /** + * BitSet for authority. + *

+     * authority     = server | reg_name
+     * 

+ */ + protected static final BitSet authority = new BitSet(256); + // Static initializer for authority + static { + authority.or(server); + authority.or(reg_name); + } + + + /** + * BitSet for scheme. + *

+     * scheme        = alpha *( alpha | digit | "+" | "-" | "." )
+     * 

+ */ + protected static final BitSet scheme = new BitSet(256); + // Static initializer for scheme + static { + scheme.or(alpha); + scheme.or(digit); + scheme.set('+'); + scheme.set('-'); + scheme.set('.'); + } + + + /** + * BitSet for rel_segment. + *

+     * rel_segment   = 1*( unreserved | escaped |
+     *                     ";" | "@" | "&" | "=" | "+" | "$" | "," )
+     * 

+ */ + protected static final BitSet rel_segment = new BitSet(256); + // Static initializer for rel_segment + static { + rel_segment.or(unreserved); + rel_segment.or(escaped); + rel_segment.set(';'); + rel_segment.set('@'); + rel_segment.set('&'); + rel_segment.set('='); + rel_segment.set('+'); + rel_segment.set('$'); + rel_segment.set(','); + } + + + /** + * BitSet for rel_path. + *

+     * rel_path      = rel_segment [ abs_path ]
+     * 

+ */ + protected static final BitSet rel_path = new BitSet(256); + // Static initializer for rel_path + static { + rel_path.or(rel_segment); + rel_path.or(abs_path); + } + + + /** + * BitSet for net_path. + *

+     * net_path      = "//" authority [ abs_path ]
+     * 

+ */ + protected static final BitSet net_path = new BitSet(256); + // Static initializer for net_path + static { + net_path.set('/'); + net_path.or(authority); + net_path.or(abs_path); + } + + + /** + * BitSet for hier_part. + *

+     * hier_part     = ( net_path | abs_path ) [ "?" query ]
+     * 

+ */ + protected static final BitSet hier_part = new BitSet(256); + // Static initializer for hier_part + static { + hier_part.or(net_path); + hier_part.or(abs_path); + // no explicit hier_part.set('?') is needed: query (an alias of uric) already contains '?' + hier_part.or(query); + } + + + /** + * BitSet for relativeURI. + *

+     * relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
+     * 

+ */ + protected static final BitSet relativeURI = new BitSet(256); + // Static initializer for relativeURI + static { + relativeURI.or(net_path); + relativeURI.or(abs_path); + relativeURI.or(rel_path); + // no explicit relativeURI.set('?') is needed: query (an alias of uric) already contains '?' + relativeURI.or(query); + } + + + /** + * BitSet for absoluteURI. + *

+     * absoluteURI   = scheme ":" ( hier_part | opaque_part )
+     * 

+ */ + protected static final BitSet absoluteURI = new BitSet(256); + // Static initializer for absoluteURI + static { + absoluteURI.or(scheme); + absoluteURI.set(':'); + absoluteURI.or(hier_part); + absoluteURI.or(opaque_part); + } + + + /** + * BitSet for URI-reference. + *

+     * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
+     * 

+ */ + protected static final BitSet URI_reference = new BitSet(256); + // Static initializer for URI_reference + static { + URI_reference.or(absoluteURI); + URI_reference.or(relativeURI); + URI_reference.set('#'); + URI_reference.or(fragment); + } + + // ---------------------------- Characters disallowed within the URI syntax + // Excluded US-ASCII Characters are like control, space, delims and unwise + + /** + * BitSet for control. + */ + public static final BitSet control = new BitSet(256); + // Static initializer for control + static { + for (int i = 0; i <= 0x1F; i++) { + control.set(i); + } + control.set(0x7F); + } + + /** + * BitSet for space. + */ + public static final BitSet space = new BitSet(256); + // Static initializer for space + static { + space.set(0x20); + } + + + /** + * BitSet for delims. + */ + public static final BitSet delims = new BitSet(256); + // Static initializer for delims + static { + delims.set('<'); + delims.set('>'); + delims.set('#'); + delims.set('%'); + delims.set('"'); + } + + + /** + * BitSet for unwise. + */ + public static final BitSet unwise = new BitSet(256); + // Static initializer for unwise + static { + unwise.set('{'); + unwise.set('}'); + unwise.set('|'); + unwise.set('\\'); + unwise.set('^'); + unwise.set('['); + unwise.set(']'); + unwise.set('`'); + } + + + /** + * Disallowed rel_path before escaping. + */ + public static final BitSet disallowed_rel_path = new BitSet(256); + // Static initializer for disallowed_rel_path + static { + disallowed_rel_path.or(uric); + disallowed_rel_path.andNot(rel_path); + } + + + /** + * Disallowed opaque_part before escaping. 
+ */ + public static final BitSet disallowed_opaque_part = new BitSet(256); + // Static initializer for disallowed_opaque_part + static { + disallowed_opaque_part.or(uric); + disallowed_opaque_part.andNot(opaque_part); + } + + // ----------------------- Characters allowed within and for each component + + /** + * Those characters that are allowed for the authority component. + */ + public static final BitSet allowed_authority = new BitSet(256); + // Static initializer for allowed_authority + static { + allowed_authority.or(authority); + allowed_authority.clear('%'); + } + + + /** + * Those characters that are allowed for the opaque_part. + */ + public static final BitSet allowed_opaque_part = new BitSet(256); + // Static initializer for allowed_opaque_part + static { + allowed_opaque_part.or(opaque_part); + allowed_opaque_part.clear('%'); + } + + + /** + * Those characters that are allowed for the reg_name. + */ + public static final BitSet allowed_reg_name = new BitSet(256); + // Static initializer for allowed_reg_name + static { + allowed_reg_name.or(reg_name); + // allowed_reg_name.andNot(percent); + allowed_reg_name.clear('%'); + } + + + /** + * Those characters that are allowed for the userinfo component. + */ + public static final BitSet allowed_userinfo = new BitSet(256); + // Static initializer for allowed_userinfo + static { + allowed_userinfo.or(userinfo); + // allowed_userinfo.andNot(percent); + allowed_userinfo.clear('%'); + } + + + /** + * Those characters that are allowed for within the userinfo component. + */ + public static final BitSet allowed_within_userinfo = new BitSet(256); + // Static initializer for allowed_within_userinfo + static { + allowed_within_userinfo.or(within_userinfo); + allowed_within_userinfo.clear('%'); + } + + + /** + * Those characters that are allowed for the IPv6reference component. + * The characters '[', ']' in IPv6reference should be excluded. 
+ */ + public static final BitSet allowed_IPv6reference = new BitSet(256); + // Static initializer for allowed_IPv6reference + static { + allowed_IPv6reference.or(IPv6reference); + // allowed_IPv6reference.andNot(unwise); + allowed_IPv6reference.clear('['); + allowed_IPv6reference.clear(']'); + } + + + /** + * Those characters that are allowed for the host component. + * The characters '[', ']' in IPv6reference should be excluded. + */ + public static final BitSet allowed_host = new BitSet(256); + // Static initializer for allowed_host + static { + allowed_host.or(hostname); + allowed_host.or(allowed_IPv6reference); + } + + + /** + * Those characters that are allowed for the authority component. + */ + public static final BitSet allowed_within_authority = new BitSet(256); + // Static initializer for allowed_within_authority + static { + allowed_within_authority.or(server); + allowed_within_authority.or(reg_name); + allowed_within_authority.clear(';'); + allowed_within_authority.clear(':'); + allowed_within_authority.clear('@'); + allowed_within_authority.clear('?'); + allowed_within_authority.clear('/'); + } + + + /** + * Those characters that are allowed for the abs_path. + */ + public static final BitSet allowed_abs_path = new BitSet(256); + // Static initializer for allowed_abs_path + static { + allowed_abs_path.or(abs_path); + // allowed_abs_path.set('/'); // aleady included + allowed_abs_path.andNot(percent); + allowed_abs_path.clear('+'); + } + + + /** + * Those characters that are allowed for the rel_path. + */ + public static final BitSet allowed_rel_path = new BitSet(256); + // Static initializer for allowed_rel_path + static { + allowed_rel_path.or(rel_path); + allowed_rel_path.clear('%'); + allowed_rel_path.clear('+'); + } + + + /** + * Those characters that are allowed within the path. 
+ */ + public static final BitSet allowed_within_path = new BitSet(256); + // Static initializer for allowed_within_path + static { + allowed_within_path.or(abs_path); + allowed_within_path.clear('/'); + allowed_within_path.clear(';'); + allowed_within_path.clear('='); + allowed_within_path.clear('?'); + } + + + /** + * Those characters that are allowed for the query component. + */ + public static final BitSet allowed_query = new BitSet(256); + // Static initializer for allowed_query + static { + allowed_query.or(uric); + allowed_query.clear('%'); + } + + + /** + * Those characters that are allowed within the query component. + */ + public static final BitSet allowed_within_query = new BitSet(256); + // Static initializer for allowed_within_query + static { + allowed_within_query.or(allowed_query); + allowed_within_query.andNot(reserved); // excluded 'reserved' + } + + + /** + * Those characters that are allowed for the fragment component. + */ + public static final BitSet allowed_fragment = new BitSet(256); + // Static initializer for allowed_fragment + static { + allowed_fragment.or(uric); + allowed_fragment.clear('%'); + } + + // ------------------------------------------- Flags for this URI-reference + + // TODO: Figure out what all these variables are for and provide javadoc + + // URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ] + // absoluteURI = scheme ":" ( hier_part | opaque_part ) + protected boolean _is_hier_part; + protected boolean _is_opaque_part; + // relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ] + // hier_part = ( net_path | abs_path ) [ "?" 
query ] + protected boolean _is_net_path; + protected boolean _is_abs_path; + protected boolean _is_rel_path; + // net_path = "//" authority [ abs_path ] + // authority = server | reg_name + protected boolean _is_reg_name; + protected boolean _is_server; // = _has_server + // server = [ [ userinfo "@" ] hostport ] + // host = hostname | IPv4address | IPv6reference + protected boolean _is_hostname; + protected boolean _is_IPv4address; + protected boolean _is_IPv6reference; + + // ------------------------------------------ Character and escape encoding + + /** + * Encodes URI string. + * + * This is a two mapping, one from original characters to octets, and + * subsequently a second from octets to URI characters: + *

+     *   original character sequence->octet sequence->URI character sequence
+     * 

+ * + * An escaped octet is encoded as a character triplet, consisting of the + * percent character "%" followed by the two hexadecimal digits + * representing the octet code. For example, "%20" is the escaped + * encoding for the US-ASCII space character. + *

+ * Conversion from the local filesystem character set to UTF-8 will + * normally involve a two step process. First convert the local character + * set to the UCS; then convert the UCS to UTF-8. + * The first step in the process can be performed by maintaining a mapping + * table that includes the local character set code and the corresponding + * UCS code. + * The next step is to convert the UCS character code to the UTF-8 encoding. + *

+ * Mapping between vendor codepages can be done in a very similar manner + * as described above. + *

+ * The only time escape encodings can allowedly be made is when a URI is + * being created from its component parts. The escape and validate methods + * are internally performed within this method. + * + * @param original the original character sequence + * @param allowed those characters that are allowed within a component + * @param charset the protocol charset + * @return URI character sequence + * @throws IllegalArgumentException if original or allowed is null + * @throws URIException declared in the signature; the body shown here does + * not raise it (an unsupported charset falls back to the platform + * default charset in getBytes) + */ + + protected static char[] encode(String original, BitSet allowed, + String charset) throws URIException { + if (original == null) { + throw new IllegalArgumentException("Original string may not be null"); + } + if (allowed == null) { + throw new IllegalArgumentException("Allowed bitset may not be null"); + } + byte[] rawdata = URLCodec.encodeUrl(allowed, getBytes(original, charset)); + return new String(rawdata, StandardCharsets.US_ASCII).toCharArray(); + } + + /** + * Converts the string to bytes in the named charset, falling back to the + * platform default charset when that charset name is unsupported. + */ + private static byte[] getBytes(String original, String charset) { + try { + return original.getBytes(charset); + } catch (UnsupportedEncodingException e) { + // best effort: unknown charset name, use the platform default + return original.getBytes(); + } + } + + /** + * Decodes URI encoded string. + * + * This is a two mapping, one from URI characters to octets, and + * subsequently a second from octets to original characters: + *

+     *   URI character sequence->octet sequence->original character sequence
+     * 

+ * + * A URI must be separated into its components before the escaped + * characters within those components can be allowedly decoded. + *

+ * Notice that there is a chance that URI characters that are non UTF-8 + * may be parsed as valid UTF-8. A recent non-scientific analysis found + * that EUC encoded Japanese words had a 2.7% false reading; SJIS had a + * 0.0005% false reading; other encoding such as ASCII or KOI-8 have a 0% + * false reading. + *

+ * The percent "%" character always has the reserved purpose of being + * the escape indicator, it must be escaped as "%25" in order to be used + * as data within a URI. + *

+ * The unescape method is internally performed within this method. + * + * @param component the URI character sequence + * @param charset the protocol charset + * @return original character sequence + * @throws IllegalArgumentException if component is null + * @throws URIException if the String overload it delegates to cannot + * decode the escape sequences + */ + protected static String decode(char[] component, String charset) + throws URIException { + if (component == null) { + throw new IllegalArgumentException("Component array of chars may not be null"); + } + // delegate to the String overload + return decode(new String(component), charset); + } + + /** + * Decodes URI encoded string. + * + * This is a two mapping, one from URI characters to octets, and + * subsequently a second from octets to original characters: + *

+     *   URI character sequence->octet sequence->original character sequence
+     * 

+ * + * A URI must be separated into its components before the escaped + * characters within those components can be allowedly decoded. + *

+ * Notice that there is a chance that URI characters that are non UTF-8 + * may be parsed as valid UTF-8. A recent non-scientific analysis found + * that EUC encoded Japanese words had a 2.7% false reading; SJIS had a + * 0.0005% false reading; other encoding such as ASCII or KOI-8 have a 0% + * false reading. + *

+ * The percent "%" character always has the reserved purpose of being + * the escape indicator, it must be escaped as "%25" in order to be used + * as data within a URI. + *

+ * The unescape method is internally performed within this method. + * + * @param component the URI character sequence + * @param charset the protocol charset + * @return original character sequence + * @throws IllegalArgumentException if component is null + * @throws URIException if URLCodec reports a DecoderException for the + * escape sequences; an unsupported charset is not an error here, the + * platform default charset is used instead + * + * @since 3.0 + */ + protected static String decode(String component, String charset) + throws URIException { + if (component == null) { + throw new IllegalArgumentException("Component array of chars may not be null"); + } + byte[] rawdata = null; + try { + // NOTE(review): commons-codec URLCodec.decodeUrl also maps '+' to a space - confirm that is intended for every component + rawdata = URLCodec.decodeUrl(component.getBytes(StandardCharsets.US_ASCII)); + } catch (DecoderException e) { + // only the message is carried over into the URIException + throw new URIException(e.getMessage()); + } + try { + return new String(rawdata, charset); + } catch (UnsupportedEncodingException e) { + // best effort: unknown charset name, use the platform default + return new String(rawdata); + } + } + /** + * Pre-validate the unescaped URI string within a specific component. + * + * @param component the component string within the component + * @param disallowed those characters disallowed within the component + * @return if true, it doesn't have the disallowed characters + * if false, the component is undefined or an incorrect one + */ + protected boolean prevalidate(String component, BitSet disallowed) { + // prevalidate the given component by disallowed characters + if (component == null) { + return false; // undefined + } + char[] target = component.toCharArray(); + for (int i = 0; i < target.length; i++) { + if (disallowed.get(target[i])) { + return false; + } + } + return true; + } + + + /** + * Validate the URI characters within a specific component. + * The component must be performed after escape encoding. Or it doesn't + * include escaped characters. 
+ * + * @param component the characters sequence within the component + * @param generous those characters that are allowed within a component + * @return if true, it's the correct URI character sequence + */ + protected boolean validate(char[] component, BitSet generous) { + // validate each component by generous characters + return validate(component, 0, -1, generous); + } + + + /** + * Validate the URI characters within a specific component. + * The component must be performed after escape encoding. Or it doesn't + * include escaped characters. + *

+ * This check is generous rather than strict. Any strict validation is + * expected to be performed before this method is called. + * + * @param component the characters sequence within the component + * @param soffset the starting offset of the given component + * @param eoffset the ending offset (inclusive) of the given component + * if -1, it means the last index of the component + * @param generous those characters that are allowed within a component + * @return if true, it's the correct URI character sequence + */ + protected boolean validate(char[] component, int soffset, int eoffset, + BitSet generous) { + // validate each component by generous characters + if (eoffset == -1) { + // -1 is a sentinel for "through the last character" + eoffset = component.length - 1; + } + for (int i = soffset; i <= eoffset; i++) { + if (!generous.get(component[i])) { + return false; + } + } + return true; + } + + + /** + * In order to avoid any possibility of conflict with non-ASCII characters, + * Parse a URI reference as a String with the character + * encoding of the local system or the document. + *

+ * The following line is the regular expression for breaking-down a URI + * reference into its components. + *

+     *   ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
+     *    12            3  4          5       6  7        8 9
+     * 

+ * For example, matching the above expression to + * http://jakarta.apache.org/ietf/uri/#Related + * results in the following subexpression matches: + *

+     *               $1 = http:
+     *  scheme    =  $2 = http
+     *               $3 = //jakarta.apache.org
+     *  authority =  $4 = jakarta.apache.org
+     *  path      =  $5 = /ietf/uri/
+     *               $6 = 
+     *  query     =  $7 = 
+     *               $8 = #Related
+     *  fragment  =  $9 = Related
+     * 

+ * + * @param original the original character sequence + * @param escaped true if original is escaped + * @throws URIException If an error occurs. + */ + protected void parseUriReference(String original, boolean escaped) + throws URIException { + + // validate and contruct the URI character sequence + if (original == null) { + throw new URIException("URI-Reference required"); + } + + /* @ + * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? + */ + String tmp = original.trim(); + + /* + * The length of the string sequence of characters. + * It may not be equal to the length of the byte array. + */ + int length = tmp.length(); + + /* + * Remove the delimiters like angle brackets around an URI. + */ + if (length > 0) { + char[] firstDelimiter = { tmp.charAt(0) }; + if (validate(firstDelimiter, delims)) { + if (length >= 2) { + char[] lastDelimiter = { tmp.charAt(length - 1) }; + if (validate(lastDelimiter, delims)) { + tmp = tmp.substring(1, length - 1); + length = length - 2; + } + } + } + } + + /* + * The starting index + */ + int from = 0; + + /* + * The test flag whether the URI is started from the path component. + */ + boolean isStartedFromPath = false; + int atColon = tmp.indexOf(':'); + int atSlash = tmp.indexOf('/'); + if ((atColon <= 0 && !tmp.startsWith("//")) + || (atSlash >= 0 && atSlash < atColon)) { + isStartedFromPath = true; + } + + /* + *

+         *     @@@@@@@@
+         *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
+         * 

+ */ + int at = indexFirstOf(tmp, isStartedFromPath ? "/?#" : ":/?#", from); + if (at == -1) { + at = 0; + } + + /* + * Parse the scheme. + *

+         *  scheme    =  $2 = http
+         *              @
+         *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
+         * 

+ */ + if (at > 0 && at < length && tmp.charAt(at) == ':') { + char[] target = tmp.substring(0, at).toLowerCase().toCharArray(); + if (validate(target, scheme)) { + _scheme = target; + } else { + throw new URIException("incorrect scheme"); + } + from = ++at; + } + + /* + * Parse the authority component. + *

+         *  authority =  $4 = jakarta.apache.org
+         *                  @@
+         *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
+         * 

+ */ + // Reset flags + _is_net_path = _is_abs_path = _is_rel_path = _is_hier_part = false; + if (0 <= at && at < length && tmp.charAt(at) == '/') { + // Set flag + _is_hier_part = true; + if (at + 2 < length && tmp.charAt(at + 1) == '/' + && !isStartedFromPath) { + // the temporary index to start the search from + int next = indexFirstOf(tmp, "/?#", at + 2); + if (next == -1) { + next = (tmp.substring(at + 2).length() == 0) ? at + 2 + : tmp.length(); + } + parseAuthority(tmp.substring(at + 2, next), escaped); + from = at = next; + // Set flag + _is_net_path = true; + } + if (from == at) { + // Set flag + _is_abs_path = true; + } + } + + /* + * Parse the path component. + *

+         *  path      =  $5 = /ietf/uri/
+         *                                @@@@@@
+         *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
+         * 

+ */ + if (from < length) { + // rel_path = rel_segment [ abs_path ] + int next = indexFirstOf(tmp, "?#", from); + if (next == -1) { + next = tmp.length(); + } + if (!_is_abs_path) { + if (!escaped + && prevalidate(tmp.substring(from, next), disallowed_rel_path) + || escaped + && validate(tmp.substring(from, next).toCharArray(), rel_path)) { + // Set flag + _is_rel_path = true; + } else if (!escaped + && prevalidate(tmp.substring(from, next), disallowed_opaque_part) + || escaped + && validate(tmp.substring(from, next).toCharArray(), opaque_part)) { + // Set flag + _is_opaque_part = true; + } else { + // the path component may be empty + _path = null; + } + } + String s = tmp.substring(from, next); + if (escaped) { + setRawPath(s.toCharArray()); + } else { + setPath(s); + } + at = next; + } + + // set the charset to do escape encoding + String charset = getProtocolCharset(); + + /* + * Parse the query component. + *

+         *  query     =  $7 = <NOT DEFINED>
+         *                                        @@@@@@@@@
+         *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
+         * 

+ */ + if (0 <= at && at + 1 < length && tmp.charAt(at) == '?') { + int next = tmp.indexOf('#', at + 1); + if (next == -1) { + next = tmp.length(); + } + if (escaped) { + _query = tmp.substring(at + 1, next).toCharArray(); + if (!validate(_query, uric)) { + throw new URIException("Invalid query"); + } + } else { + _query = encode(tmp.substring(at + 1, next), allowed_query, charset); + } + at = next; + } + + /* + * Parse the fragment component. + *

+         *  fragment  =  $9 = Related
+         *                                                   @@@@@@@@
+         *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
+         * 

+         */
+        if (0 <= at && at + 1 <= length && tmp.charAt(at) == '#') {
+            if (at + 1 == length) { // empty fragment
+                _fragment = "".toCharArray();
+            } else {
+                _fragment = (escaped) ? tmp.substring(at + 1).toCharArray()
+                    : encode(tmp.substring(at + 1), allowed_fragment, charset);
+            }
+        }
+
+        // set this URI.
+        setURI();
+    }
+
+
+    /**
+     * Find the lowest index at which any of the given delimiter characters
+     * occurs in the string, scanning from the beginning.
+     *
+     * @param s the string to be indexed
+     * @param delims the delimiter characters to look for
+     * @return the lowest matching index, or -1 if no delimiter occurs
+     */
+    protected int indexFirstOf(String s, String delims) {
+        // a negative offset is clamped to 0 by the three-argument overload
+        final int fromStart = -1;
+        return indexFirstOf(s, delims, fromStart);
+    }
+
+
+    /**
+     * Find the lowest index, at or after the given offset, at which any of
+     * the given delimiter characters occurs in the string.
+     *
+     * @param s the string to be indexed
+     * @param delims the delimiter characters to look for
+     * @param offset the index to start from; negative values mean 0
+     * @return the lowest matching index, or -1 if either argument is null
+     * or empty, the offset lies past the end, or no delimiter occurs
+     */
+    protected int indexFirstOf(String s, String delims, int offset) {
+        final boolean nothingToScan = (s == null || s.length() == 0);
+        if (nothingToScan) {
+            return -1;
+        }
+        final boolean nothingToFind = (delims == null || delims.length() == 0);
+        if (nothingToFind) {
+            return -1;
+        }
+        // clamp a negative start; a start beyond the end cannot match
+        final int start = Math.max(offset, 0);
+        if (start > s.length()) {
+            return -1;
+        }
+        int earliest = -1;
+        for (int k = 0; k < delims.length(); k++) {
+            final int hit = s.indexOf(delims.charAt(k), start);
+            if (hit >= 0 && (earliest == -1 || hit < earliest)) {
+                earliest = hit;
+            }
+        }
+        return earliest;
+    }
+
+
+    /**
+     * Get the earlier index that to be searched for the first occurrance in
+     * one of any of the given array.
+     *
+     * @param s the character array to be indexed
+     * @param delim the delimiter used to index
+     * @return the earlier index if there is a delimiter
+     */
+    protected int indexFirstOf(char[] s, char delim) {
+        return indexFirstOf(s, delim, 0);
+    }
+
+
+    /**
+     * Find the first index, at or after the given offset, at which the
+     * delimiter occurs in the character array.
+     *
+     * @param s the character array to be indexed
+     * @param delim the delimiter used to index
+     * @param offset the index to start from; negative values mean 0
+     * @return the index of the delimiter, or -1 if the array is null or
+     * empty, the offset lies past the end, or the delimiter is absent
+     */
+    protected int indexFirstOf(char[] s, char delim, int offset) {
+        if (s == null || s.length == 0) {
+            return -1;
+        }
+        // check boundaries
+        if (offset < 0) {
+            offset = 0;
+        } else if (offset > s.length) {
+            return -1;
+        }
+        for (int i = offset; i < s.length; i++) {
+            if (s[i] == delim) {
+                return i;
+            }
+        }
+        return -1;
+    }
+
+
+    /**
+     * Parse the authority component into userinfo, host and port, classify
+     * the host, and rebuild the stored authority.
+     *
+     * The userinfo and port of any previously parsed authority are
+     * discarded first, so that re-parsing an authority without them does
+     * not carry the old values into the new authority.
+     *
+     * @param original the original character sequence of authority component
+     * @param escaped <code>true</code> if <code>original</code> is escaped
+     * @throws URIException if an IPv6 reference is not closed, the port is
+     * not a number, or an escaped registry-based authority is not valid
+     */
+    protected void parseAuthority(String original, boolean escaped)
+        throws URIException {
+
+        // Reset flags
+        _is_reg_name = _is_server =
+        _is_hostname = _is_IPv4address = _is_IPv6reference = false;
+
+        // Forget the userinfo and port of an earlier parse; they are only
+        // assigned below when the new authority actually contains them.
+        _userinfo = null;
+        _port = -1;
+
+        // set the charset to do escape encoding
+        String charset = getProtocolCharset();
+
+        boolean hasPort = true;
+        int from = 0;
+        int next = original.indexOf('@');
+        if (next != -1) { // everything before '@' is the userinfo
+            // each protocol extented from URI supports the specific userinfo
+            _userinfo = (escaped) ? original.substring(0, next).toCharArray()
+                : encode(original.substring(0, next), allowed_userinfo,
+                        charset);
+            from = next + 1;
+        }
+        next = original.indexOf('[', from);
+        if (next >= from) {
+            next = original.indexOf(']', from);
+            if (next == -1) {
+                throw new URIException(URIException.PARSING, "IPv6reference");
+            } else {
+                next++;
+            }
+            // the host runs from 'from' up to and including the closing ']'
+            _host = (escaped) ? original.substring(from, next).toCharArray()
+                : encode(original.substring(from, next), allowed_IPv6reference,
+                        charset);
+            // Set flag
+            _is_IPv6reference = true;
+        } else { // only for !_is_IPv6reference
+            next = original.indexOf(':', from);
+            if (next == -1) {
+                next = original.length();
+                hasPort = false;
+            }
+            // REMINDME: it doesn't need the pre-validation
+            _host = original.substring(from, next).toCharArray();
+            if (validate(_host, IPv4address)) {
+                // Set flag
+                _is_IPv4address = true;
+            } else if (validate(_host, hostname)) {
+                // Set flag
+                _is_hostname = true;
+            } else {
+                // Set flag
+                _is_reg_name = true;
+            }
+        }
+        if (_is_reg_name) {
+            // Reset flags for a server-based naming authority
+            _is_server = _is_hostname = _is_IPv4address =
+            _is_IPv6reference = false;
+            // set a registry-based naming authority
+            if (escaped) {
+                _authority = original.toCharArray();
+                if (!validate(_authority, reg_name)) {
+                    throw new URIException("Invalid authority");
+                }
+            } else {
+                _authority = encode(original, allowed_reg_name, charset);
+            }
+        } else {
+            if (original.length() - 1 > next && hasPort
+                && original.charAt(next) == ':') { // not empty
+                from = next + 1;
+                try {
+                    _port = Integer.parseInt(original.substring(from));
+                } catch (NumberFormatException error) {
+                    throw new URIException(URIException.PARSING,
+                            "invalid port number");
+                }
+            }
+            // set a server-based naming authority
+            StringBuffer buf = new StringBuffer();
+            if (_userinfo != null) { // has_userinfo
+                buf.append(_userinfo);
+                buf.append('@');
+            }
+            if (_host != null) {
+                buf.append(_host);
+                if (_port != -1) {
+                    buf.append(':');
+                    buf.append(_port);
+                }
+            }
+            _authority = buf.toString().toCharArray();
+            // Set flag
+            _is_server = true;
+        }
+    }
+
+
+    /**
+     * Once it's parsed successfully, rebuild the raw URI from the scheme,
+     * authority, path (or opaque part) and query, and reset the cached hash.
+     * The fragment is not included.
+     *
+     * @see #getRawURI
+     */
+    protected void setURI() {
+        // set _uri
+        StringBuffer buf = new StringBuffer();
+        // ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
+        if (_scheme != null) {
+            buf.append(_scheme);
+            buf.append(':');
+        }
+        if (_is_net_path) {
+            buf.append("//");
+            if (_authority != null) { // has_authority
+                buf.append(_authority);
+            }
+        }
+        if (_opaque != null && _is_opaque_part) {
+            buf.append(_opaque);
+        } else if (_path != null) {
+            // _is_hier_part or _is_relativeURI
+            if (_path.length != 0) {
+                buf.append(_path);
+            }
+        }
+        if (_query != null) { // has_query
+            buf.append('?');
+            buf.append(_query);
+        }
+        // ignore the fragment identifier
+        _uri = buf.toString().toCharArray();
+        hash = 0;
+    }
+
+    // ----------------------------------------------------------- Test methods
+
+
+    /**
+     * Tell whether or not this URI is absolute.
+     *
+     * @return true iff this URI has a scheme
+     */
+    public boolean isAbsoluteURI() {
+        return (_scheme != null);
+    }
+
+
+    /**
+     * Tell whether or not this URI is relative.
+     *
+     * @return true iff this URI has no scheme
+     */
+    public boolean isRelativeURI() {
+        return (_scheme == null);
+    }
+
+
+    /**
+     * Tell whether or not the absoluteURI of this URI is hier_part.
+     *
+     * @return true iff the absoluteURI is hier_part
+     */
+    public boolean isHierPart() {
+        return _is_hier_part;
+    }
+
+
+    /**
+     * Tell whether or not the absoluteURI of this URI is opaque_part.
+     *
+     * @return true iff the absoluteURI is opaque_part
+     */
+    public boolean isOpaquePart() {
+        return _is_opaque_part;
+    }
+
+
+    /**
+     * Tell whether or not the relativeURI or heir_part of this URI is net_path.
+     * It's the same function as the has_authority() method.
+     *
+     * @return true iff the relativeURI or hier_part is net_path
+     * @see #hasAuthority
+     */
+    public boolean isNetPath() {
+        if (_is_net_path) {
+            return true;
+        }
+        return _authority != null;
+    }
+
+
+    /**
+     * Report whether the relativeURI or hier_part of this URI is an
+     * abs_path.
+     *
+     * @return the abs_path flag recorded for this URI
+     */
+    public boolean isAbsPath() {
+        final boolean absPath = _is_abs_path;
+        return absPath;
+    }
+
+
+    /**
+     * Report whether the relativeURI of this URI is a rel_path.
+     *
+     * @return the rel_path flag recorded for this URI
+     */
+    public boolean isRelPath() {
+        final boolean relPath = _is_rel_path;
+        return relPath;
+    }
+
+
+    /**
+     * Report whether this URI has an authority.
+     * Answers the same question as {@link #isNetPath}.
+     *
+     * @return true iff an authority is stored or the URI is a net_path
+     * @see #isNetPath
+     */
+    public boolean hasAuthority() {
+        if (_authority != null) {
+            return true;
+        }
+        return _is_net_path;
+    }
+
+    /**
+     * Report whether the authority component of this URI is a reg_name.
+     *
+     * @return the reg_name flag recorded for this URI
+     */
+    public boolean isRegName() {
+        final boolean regName = _is_reg_name;
+        return regName;
+    }
+
+
+    /**
+     * Report whether the authority component of this URI is a server.
+     *
+     * @return the server flag recorded for this URI
+     */
+    public boolean isServer() {
+        final boolean server = _is_server;
+        return server;
+    }
+
+
+    /**
+     * Report whether this URI has userinfo.
+     *
+     * @return true iff a userinfo is stored
+     */
+    public boolean hasUserinfo() {
+        return !(_userinfo == null);
+    }
+
+
+    /**
+     * Report whether the host part of this URI is a hostname.
+     *
+     * @return the hostname flag recorded for this URI
+     */
+    public boolean isHostname() {
+        final boolean hostname = _is_hostname;
+        return hostname;
+    }
+
+
+    /**
+     * Report whether the host part of this URI is an IPv4address.
+     *
+     * @return the IPv4address flag recorded for this URI
+     */
+    public boolean isIPv4address() {
+        final boolean ipv4 = _is_IPv4address;
+        return ipv4;
+    }
+
+
+    /**
+     * Tell whether or not the host part of this URI is IPv6reference.
+     *
+     * @return true iff the host part is IPv6reference
+     */
+    public boolean isIPv6reference() {
+        final boolean ipv6 = _is_IPv6reference;
+        return ipv6;
+    }
+
+
+    /**
+     * Report whether this URI has a query.
+     *
+     * @return true iff a query is stored
+     */
+    public boolean hasQuery() {
+        return !(_query == null);
+    }
+
+
+    /**
+     * Report whether this URI has a fragment.
+     *
+     * @return true iff a fragment is stored
+     */
+    public boolean hasFragment() {
+        return !(_fragment == null);
+    }
+
+
+    // ---------------------------------------------------------------- Charset
+
+
+    /**
+     * Set the default charset of the protocol.
+     *

+ * The character set used to store files SHALL remain a local decision and + * MAY depend on the capability of local operating systems. Prior to the + * exchange of URIs they SHOULD be converted into a ISO/IEC 10646 format + * and UTF-8 encoded. This approach, while allowing international exchange + * of URIs, will still allow backward compatibility with older systems + * because the code set positions for ASCII characters are identical to the + * one byte sequence in UTF-8. + *

+ * An individual URI scheme may require a single charset, define a default + * charset, or provide a way to indicate the charset used. + * + *

+ * Always all the time, the setter method is always succeeded and throws + * DefaultCharsetChanged exception. + * + * So API programmer must follow the following way: + *

+     *  import org.apache.util.URI$DefaultCharsetChanged;
+     *      .
+     *      .
+     *      .
+     *  try {
+     *      URI.setDefaultProtocolCharset("UTF-8");
+     *  } catch (DefaultCharsetChanged cc) {
+     *      // CASE 1: the exception could be ignored, when it is set by user
+     *      if (cc.getReasonCode() == DefaultCharsetChanged.PROTOCOL_CHARSET) {
+     *      // CASE 2: let user know the default protocol charset changed
+     *      } else {
+     *      // CASE 3: let user know the default document charset changed
+     *      }
+     *  }
+     *  
+     *
+     * The API programmer is responsible to set the correct charset.
+     * And each application should remember its own charset to support.
+     *
+     * @param charset the default charset for each protocol
+     * @throws DefaultCharsetChanged default charset changed
+     */
+    public static void setDefaultProtocolCharset(String charset)
+        throws DefaultCharsetChanged {
+
+        // the new default is stored first; the exception below is thrown
+        // on every call, after the assignment has taken effect
+        defaultProtocolCharset = charset;
+        final DefaultCharsetChanged changed = new DefaultCharsetChanged(
+                DefaultCharsetChanged.PROTOCOL_CHARSET,
+                "the default protocol charset changed");
+        throw changed;
+    }
+
+
+    /**
+     * Get the default charset of the protocol.
+     *

+ * An individual URI scheme may require a single charset, define a default + * charset, or provide a way to indicate the charset used. + *

+     * To work globally either requires support of a number of character sets
+     * and to be able to convert between them, or the use of a single preferred
+     * character set.
+     * For support of global compatibility it is STRONGLY RECOMMENDED that
+     * clients and servers use UTF-8 encoding when exchanging URIs.
+     *
+     * @return the default charset string
+     */
+    public static String getDefaultProtocolCharset() {
+        final String charset = defaultProtocolCharset;
+        return charset;
+    }
+
+
+    /**
+     * Get the protocol charset in effect for this URI instance: the
+     * instance's own protocol charset when one is set, otherwise the
+     * default protocol charset.
+     *
+     * @return the protocol charset string
+     * @see #getDefaultProtocolCharset
+     */
+    public String getProtocolCharset() {
+        if (protocolCharset == null) {
+            return defaultProtocolCharset;
+        }
+        return protocolCharset;
+    }
+
+
+    /**
+     * Set the default charset of the document.
+     *

+ * Notice that it will be possible to contain mixed characters (e.g. + * ftp://host/KoreanNamespace/ChineseResource). To handle the Bi-directional + * display of these character sets, the protocol charset could be simply + * used again. Because it's not yet implemented that the insertion of BIDI + * control characters at different points during composition is extracted. + *

+ * + * Always all the time, the setter method is always succeeded and throws + * DefaultCharsetChanged exception. + * + * So API programmer must follow the following way: + *

+     *  import org.apache.util.URI$DefaultCharsetChanged;
+     *      .
+     *      .
+     *      .
+     *  try {
+     *      URI.setDefaultDocumentCharset("EUC-KR");
+     *  } catch (DefaultCharsetChanged cc) {
+     *      // CASE 1: the exception could be ignored, when it is set by user
+     *      if (cc.getReasonCode() == DefaultCharsetChanged.DOCUMENT_CHARSET) {
+     *      // CASE 2: let user know the default document charset changed
+     *      } else {
+     *      // CASE 3: let user know the default protocol charset changed
+     *      }
+     *  }
+     *  
+     *
+     * The API programmer is responsible to set the correct charset.
+     * And each application should remember its own charset to support.
+     *
+     * @param charset the default charset for the document
+     * @throws DefaultCharsetChanged default charset changed
+     */
+    public static void setDefaultDocumentCharset(String charset)
+        throws DefaultCharsetChanged {
+
+        // the new default is stored first; the exception below is thrown
+        // on every call, after the assignment has taken effect
+        defaultDocumentCharset = charset;
+        final DefaultCharsetChanged changed = new DefaultCharsetChanged(
+                DefaultCharsetChanged.DOCUMENT_CHARSET,
+                "the default document charset changed");
+        throw changed;
+    }
+
+
+    /**
+     * Get the recommended default charset of the document.
+     *
+     * @return the default document charset string
+     */
+    public static String getDefaultDocumentCharset() {
+        final String charset = defaultDocumentCharset;
+        return charset;
+    }
+
+
+    /**
+     * Get the default charset of the document by locale.
+     *
+     * @return the default document charset string by locale
+     */
+    public static String getDefaultDocumentCharsetByLocale() {
+        final String charset = defaultDocumentCharsetByLocale;
+        return charset;
+    }
+
+
+    /**
+     * Get the default charset of the document by platform.
+     *
+     * @return the default document charset string by platform
+     */
+    public static String getDefaultDocumentCharsetByPlatform() {
+        final String charset = defaultDocumentCharsetByPlatform;
+        return charset;
+    }
+
+    // ------------------------------------------------------------- The scheme
+
+    /**
+     * Get the scheme as the stored character array.
+     *
+     * @return the scheme, or null if no scheme is defined
+     */
+    public char[] getRawScheme() {
+        return _scheme;
+    }
+
+
+    /**
+     * Get the scheme as a string.
+     *
+     * @return the scheme
+     * null if undefined scheme
+     */
+    public String getScheme() {
+        if (_scheme == null) {
+            return null;
+        }
+        return new String(_scheme);
+    }
+
+    // ---------------------------------------------------------- The authority
+
+    /**
+     * Set the authority.  It can be one type of server, hostport, hostname,
+     * IPv4address, IPv6reference and reg_name.
+     *

+     *   authority     = server | reg_name
+     * 

+     *
+     * @param escapedAuthority the raw escaped authority
+     * @throws URIException If {@link
+     * #parseAuthority(String,boolean)} fails
+     * @throws NullPointerException null authority
+     */
+    public void setRawAuthority(char[] escapedAuthority)
+        throws URIException, NullPointerException {
+
+        final String authority = new String(escapedAuthority);
+        parseAuthority(authority, true);
+        setURI();
+    }
+
+
+    /**
+     * Set the authority from an already escaped string.  It can be one type
+     * of server, hostport, hostname, IPv4address, IPv6reference and reg_name.
+     * Note that there is no setAuthority method by the escape encoding reason.
+     *
+     * @param escapedAuthority the escaped authority string
+     * @throws URIException If {@link
+     * #parseAuthority(String,boolean)} fails
+     */
+    public void setEscapedAuthority(String escapedAuthority)
+        throws URIException {
+
+        final boolean alreadyEscaped = true;
+        parseAuthority(escapedAuthority, alreadyEscaped);
+        setURI();
+    }
+
+
+    /**
+     * Get the raw-escaped authority as the stored character array.
+     *
+     * @return the raw-escaped authority, or null if there is none
+     */
+    public char[] getRawAuthority() {
+        return _authority;
+    }
+
+
+    /**
+     * Get the escaped authority as a string.
+     *
+     * @return the escaped authority, or null if there is none
+     */
+    public String getEscapedAuthority() {
+        if (_authority == null) {
+            return null;
+        }
+        return new String(_authority);
+    }
+
+
+    /**
+     * Get the authority, decoded with the protocol charset.
+     *
+     * @return the authority, or null if there is none
+     * @throws URIException If {@link #decode} fails
+     */
+    public String getAuthority() throws URIException {
+        if (_authority == null) {
+            return null;
+        }
+        return decode(_authority, getProtocolCharset());
+    }
+
+    // ----------------------------------------------------------- The userinfo
+
+    /**
+     * Get the raw-escaped userinfo as the stored character array.
+     *
+     * @return the raw-escaped userinfo, or null if there is none
+     * @see #getAuthority
+     */
+    public char[] getRawUserinfo() {
+        return _userinfo;
+    }
+
+
+    /**
+     * Get the escaped userinfo as a string.
+     *
+     * @return the escaped userinfo, or null if there is none
+     * @see #getAuthority
+     */
+    public String getEscapedUserinfo() {
+        if (_userinfo == null) {
+            return null;
+        }
+        return new String(_userinfo);
+    }
+
+
+    /**
+     * Get the userinfo.
+     *
+     * @return the userinfo, or null if there is none
+     * @throws URIException If {@link #decode} fails
+     * @see #getAuthority
+     */
+    public String getUserinfo() throws URIException {
+        if (_userinfo == null) {
+            return null;
+        }
+        // decoded with the protocol charset in effect for this instance
+        return decode(_userinfo, getProtocolCharset());
+    }
+
+    // --------------------------------------------------------------- The host
+
+    /**
+     * Get the host.
+     *

+     *   host          = hostname | IPv4address | IPv6reference
+     * 

+     *
+     * @return the stored host characters, or null if there is none
+     * @see #getAuthority
+     */
+    public char[] getRawHost() {
+        final char[] host = _host;
+        return host;
+    }
+
+
+    /**
+     * Get the host.
+     *

+     *   host          = hostname | IPv4address | IPv6reference
+     * 

+     *
+     * @return the host
+     * @throws URIException If {@link #decode} fails
+     * @see #getAuthority
+     */
+    public String getHost() throws URIException {
+        if (_host != null) {
+            return decode(_host, getProtocolCharset());
+        } else {
+            return null;
+        }
+    }
+
+    // --------------------------------------------------------------- The port
+
+    /**
+     * Get the port.  In order to get the specfic default port, the specific
+     * protocol-supported class extended from the URI class should be used.
+     * It has the server-based naming authority.
+     *
+     * @return the port
+     * if -1, it has the default port for the scheme or the server-based
+     * naming authority is not supported in the specific URI.
+     */
+    public int getPort() {
+        return _port;
+    }
+
+    // --------------------------------------------------------------- The path
+
+    /**
+     * Set the raw-escaped path.
+     *
+     * A null or empty path is stored as both the path and the opaque part.
+     * Otherwise any fragment identifier is cut off and the remainder is
+     * validated according to the kind of URI (net/abs path, relative path
+     * or opaque part) before it is stored.
+     *
+     * @param escapedPath the path character sequence
+     * @throws URIException encoding error or not proper for initial instance
+     * @see #encode
+     */
+    public void setRawPath(char[] escapedPath) throws URIException {
+        if (escapedPath == null || escapedPath.length == 0) {
+            _path = _opaque = escapedPath;
+            setURI();
+            return;
+        }
+        // remove the fragment identifier
+        escapedPath = removeFragmentIdentifier(escapedPath);
+        if (_is_net_path || _is_abs_path) {
+            if (escapedPath[0] != '/') {
+                throw new URIException(URIException.PARSING,
+                        "not absolute path");
+            }
+            if (!validate(escapedPath, abs_path)) {
+                throw new URIException(URIException.ESCAPING,
+                        "escaped absolute path not valid");
+            }
+            _path = escapedPath;
+        } else if (_is_rel_path) {
+            int at = indexFirstOf(escapedPath, '/');
+            if (at == 0) {
+                throw new URIException(URIException.PARSING, "incorrect path");
+            }
+            // The leading rel_segment AND the trailing abs_path (if any)
+            // must both be valid; a failure of either one rejects the path.
+            boolean valid;
+            if (at > 0) {
+                valid = validate(escapedPath, 0, at - 1, rel_segment)
+                    && validate(escapedPath, at, -1, abs_path);
+            } else {
+                valid = validate(escapedPath, 0, -1, rel_segment);
+            }
+            if (!valid) {
+                throw new URIException(URIException.ESCAPING,
+                        "escaped relative path not valid");
+            }
+            _path = escapedPath;
+        } else if (_is_opaque_part) {
+            // opaque_part = uric_no_slash *uric: the first character and
+            // the remainder are checked independently, either may fail.
+            boolean firstValid = uric_no_slash.get(escapedPath[0]);
+            boolean restValid = escapedPath.length == 1
+                || validate(escapedPath, 1, -1, uric);
+            if (!firstValid || !restValid) {
+                throw new URIException(URIException.ESCAPING,
+                    "escaped opaque part not valid");
+            }
+            _opaque = escapedPath;
+        } else {
+            throw new URIException(URIException.PARSING, "incorrect path");
+        }
+        setURI();
+    }
+
+
+    /**
+     * Set the escaped path.
+     *
+     * @param escapedPath the escaped path string; null clears both the
+     * path and the opaque part
+     * @throws URIException encoding error or not proper for initial instance
+     * @see #encode
+     */
+    public void setEscapedPath(String escapedPath) throws URIException {
+        if (escapedPath == null) {
+            _path = _opaque = null;
+            setURI();
+            return;
+        }
+        setRawPath(escapedPath.toCharArray());
+    }
+
+
+    /**
+     * Set the path, escape-encoding it with the protocol charset.
+     *
+     * @param path the path string
+     * @throws URIException set incorrectly or fragment only
+     * @see #encode
+     */
+    public void setPath(String path) throws URIException {
+
+        if (path == null || path.length() == 0) {
+            _path = _opaque = (path == null) ? null : path.toCharArray();
+            setURI();
+            return;
+        }
+        // set the charset to do escape encoding
+        String charset = getProtocolCharset();
+
+        if (_is_net_path || _is_abs_path) {
+            _path = encode(path, allowed_abs_path, charset);
+        } else if (_is_rel_path) {
+            StringBuffer buff = new StringBuffer(path.length());
+            int at = path.indexOf('/');
+            if (at == 0) { // never 0
+                throw new URIException(URIException.PARSING,
+                        "incorrect relative path");
+            }
+            if (at > 0) {
+                buff.append(encode(path.substring(0, at), allowed_rel_path,
+                            charset));
+                buff.append(encode(path.substring(at), allowed_abs_path,
+                            charset));
+            } else {
+                buff.append(encode(path, allowed_rel_path, charset));
+            }
+            _path = buff.toString().toCharArray();
+        } else if (_is_opaque_part) {
+            StringBuffer buf = new StringBuffer();
+            // Append rather than insert at a fixed offset: the encoded
+            // first character may be longer than one char (e.g. "%20").
+            buf.append(encode(path.substring(0, 1), uric_no_slash, charset));
+            buf.append(encode(path.substring(1), uric, charset));
+            _opaque = buf.toString().toCharArray();
+        } else {
+            throw new URIException(URIException.PARSING, "incorrect path");
+        }
+        setURI();
+    }
+
+
+    /**
+     * Resolve the base and relative path.
+     *
+     * An empty relative path yields the normalized base path, a relative
+     * path starting with '/' is normalized on its own, and anything else
+     * is appended to the base path up to and including its last '/'
+     * (or to "/" when the base contains no slash) and then normalized.
+     *
+     * @param basePath a character array of the basePath
+     * @param relPath a character array of the relPath
+     * @return the resolved path
+     * @throws URIException no more higher path level to be resolved
+     */
+    protected char[] resolvePath(char[] basePath, char[] relPath)
+        throws URIException {
+
+        // REMINDME: paths are never null
+        String base = (basePath == null) ? "" : new String(basePath);
+
+        // _path could be empty
+        if (relPath == null || relPath.length == 0) {
+            return normalize(basePath);
+        } else if (relPath[0] == '/') {
+            return normalize(relPath);
+        } else {
+            int at = base.lastIndexOf('/');
+            StringBuffer buff = new StringBuffer(base.length()
+                + relPath.length);
+            buff.append((at != -1) ? base.substring(0, at + 1) : "/");
+            buff.append(relPath);
+            return normalize(buff.toString().toCharArray());
+        }
+    }
+
+
+    /**
+     * Get the raw-escaped current hierarchy level in the given path.
+     * If the last namespace is a collection, the slash mark ('/') should be
+     * ended with at the last character of the path string.
+     *
+     * @param path the path
+     * @return the current hierarchy level
+     * @throws URIException no hierarchy level
+     */
+    protected char[] getRawCurrentHierPath(char[] path) throws URIException {
+
+        if (_is_opaque_part) {
+            throw new URIException(URIException.PARSING, "no hierarchy level");
+        }
+        if (path == null) {
+            throw new URIException(URIException.PARSING, "empty path");
+        }
+        String buff = new String(path);
+        int first = buff.indexOf('/');
+        int last = buff.lastIndexOf('/');
+        if (last == 0) {
+            // the only slash is the leading one
+            return rootPath;
+        } else if (first != last && last != -1) {
+            return buff.substring(0, last).toCharArray();
+        }
+        // FIXME: it could be a document on the server side
+        return path;
+    }
+
+
+    /**
+     * Get the raw-escaped current hierarchy level.
+     *
+     * @return the raw-escaped current hierarchy level, or null if this
+     * URI has no path
+     * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
+     */
+    public char[] getRawCurrentHierPath() throws URIException {
+        return (_path == null) ? null : getRawCurrentHierPath(_path);
+    }
+
+
+    /**
+     * Get the escaped current hierarchy level.
+     *
+     * @return the escaped current hierarchy level, or null if this URI
+     * has no path
+     * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
+     */
+    public String getEscapedCurrentHierPath() throws URIException {
+        char[] path = getRawCurrentHierPath();
+        return (path == null) ? null : new String(path);
+    }
+
+
+    /**
+     * Get the current hierarchy level.
+     *
+     * @return the current hierarchy level
+     * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
+     * @see #decode
+     */
+    public String getCurrentHierPath() throws URIException {
+        final char[] current = getRawCurrentHierPath();
+        if (current == null) {
+            return null;
+        }
+        return decode(current, getProtocolCharset());
+    }
+
+
+    /**
+     * Get the raw-escaped level above the current hierarchy level, obtained
+     * by taking the hierarchy level of the current hierarchy level.
+     *
+     * @return the raw above hierarchy level, or null if this URI has no
+     * current hierarchy level
+     * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
+     */
+    public char[] getRawAboveHierPath() throws URIException {
+        final char[] current = getRawCurrentHierPath();
+        if (current == null) {
+            return null;
+        }
+        return getRawCurrentHierPath(current);
+    }
+
+
+    /**
+     * Get the escaped level above the current hierarchy level.
+     *
+     * @return the escaped above hierarchy level, or null if there is none
+     * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
+     */
+    public String getEscapedAboveHierPath() throws URIException {
+        final char[] above = getRawAboveHierPath();
+        if (above == null) {
+            return null;
+        }
+        return new String(above);
+    }
+
+
+    /**
+     * Get the level above the current hierarchy level, decoded with the
+     * protocol charset.
+     *
+     * @return the above hierarchy level, or null if there is none
+     * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
+     * @see #decode
+     */
+    public String getAboveHierPath() throws URIException {
+        final char[] above = getRawAboveHierPath();
+        if (above == null) {
+            return null;
+        }
+        return decode(above, getProtocolCharset());
+    }
+
+
+    /**
+     * Get the raw-escaped path.
+     *

+     *   path          = [ abs_path | opaque_part ]
+     * 

+     *
+     * @return the raw-escaped path: the opaque part for an opaque URI,
+     * otherwise the path
+     */
+    public char[] getRawPath() {
+        if (_is_opaque_part) {
+            return _opaque;
+        }
+        return _path;
+    }
+
+
+    /**
+     * Get the escaped path.
+     *

+     *   path          = [ abs_path | opaque_part ]
+     *   abs_path      = "/"  path_segments 
+     *   opaque_part   = uric_no_slash *uric
+     * 

+     *
+     * @return the escaped path string, or null if there is no path
+     */
+    public String getEscapedPath() {
+        final char[] raw = getRawPath();
+        if (raw == null) {
+            return null;
+        }
+        return new String(raw);
+    }
+
+
+    /**
+     * Get the path.
+     *

+     *   path          = [ abs_path | opaque_part ]
+     * 

+     * @return the path string
+     * @throws URIException If {@link #decode} fails.
+     * @see #decode
+     */
+    public String getPath() throws URIException {
+        char[] path =  getRawPath();
+        return (path == null) ? null : decode(path, getProtocolCharset());
+    }
+
+
+    /**
+     * Get the raw-escaped basename of the path: everything after the last
+     * '/', or the whole path when it contains no slash.
+     *
+     * @return the raw-escaped basename, or null if this URI has no path
+     */
+    public char[] getRawName() {
+        if (_path == null) {
+            return null;
+        }
+
+        // scan backwards for the last slash; 'at' stays 0 when none exists
+        int at = 0;
+        for (int i = _path.length - 1; i >= 0; i--) {
+            if (_path[i] == '/') {
+                at = i + 1;
+                break;
+            }
+        }
+        int len = _path.length - at;
+        char[] basename =  new char[len];
+        System.arraycopy(_path, at, basename, 0, len);
+        return basename;
+    }
+
+
+    /**
+     * Get the escaped basename of the path.
+     *
+     * @return the escaped basename string, or null if this URI has no path
+     */
+    public String getEscapedName() {
+        char[] basename = getRawName();
+        return (basename == null) ? null : new String(basename);
+    }
+
+
+    /**
+     * Get the basename of the path, decoded with the protocol charset.
+     *
+     * @return the basename string, or null if this URI has no path
+     * @throws URIException incomplete trailing escape pattern or unsupported
+     * character encoding
+     * @see #decode
+     */
+    public String getName() throws URIException {
+        char[] basename = getRawName();
+        // decode the basename already obtained instead of computing it twice
+        return (basename == null) ? null : decode(basename,
+                getProtocolCharset());
+    }
+
+    // ----------------------------------------------------- The path and query
+
+    /**
+     * Get the raw-escaped path and query, joined by '?' when a query is
+     * present.
+     *
+     * @return the raw-escaped path and query, or null if both are absent
+     */
+    public char[] getRawPathQuery() {
+
+        if (_path == null && _query == null) {
+            return null;
+        }
+        StringBuffer buff = new StringBuffer();
+        if (_path != null) {
+            buff.append(_path);
+        }
+        if (_query != null) {
+            buff.append('?');
+            buff.append(_query);
+        }
+        return buff.toString().toCharArray();
+    }
+
+
+    /**
+     * Get the escaped path and query.
+     *
+     * @return the escaped path and query string, or null if both are absent
+     */
+    public String getEscapedPathQuery() {
+        char[] rawPathQuery = getRawPathQuery();
+        return (rawPathQuery == null) ? null : new String(rawPathQuery);
+    }
+
+
+    /**
+     * Get the path and query, decoded with the protocol charset.
+     *
+     * @return the path and query string, or null if both are absent
+     * @throws URIException incomplete trailing escape pattern or unsupported
+     * character encoding
+     * @see #decode
+     */
+    public String getPathQuery() throws URIException {
+        char[] rawPathQuery = getRawPathQuery();
+        return (rawPathQuery == null) ? null : decode(rawPathQuery,
+                getProtocolCharset());
+    }
+
+    // -------------------------------------------------------------- The query
+
+    /**
+     * Set the raw-escaped query.  A null or empty query is stored as is;
+     * otherwise any fragment identifier is cut off and the remainder is
+     * validated before it is stored.
+     *
+     * @param escapedQuery the raw-escaped query
+     * @throws URIException escaped query not valid
+     */
+    public void setRawQuery(char[] escapedQuery) throws URIException {
+        if (escapedQuery == null || escapedQuery.length == 0) {
+            _query = escapedQuery;
+            setURI();
+            return;
+        }
+        // remove the fragment identifier
+        escapedQuery = removeFragmentIdentifier(escapedQuery);
+        if (!validate(escapedQuery, query)) {
+            throw new URIException(URIException.ESCAPING,
+                    "escaped query not valid");
+        }
+        _query = escapedQuery;
+        setURI();
+    }
+
+
+    /**
+     * Set the escaped query string.
+     *
+     * @param escapedQuery the escaped query string; null clears the query
+     * @throws URIException escaped query not valid
+     */
+    public void setEscapedQuery(String escapedQuery) throws URIException {
+        if (escapedQuery == null) {
+            _query = null;
+            setURI();
+            return;
+        }
+        setRawQuery(escapedQuery.toCharArray());
+    }
+
+
+    /**
+     * Set the query.
+     *

+ * When a query string is not misunderstood the reserved special characters + * ("&", "=", "+", ",", and "$") within a query component, it is + * recommended to use in encoding the whole query with this method. + *

+     * The additional APIs for the special purpose using by the reserved
+     * special characters used in each protocol are implemented in each protocol
+     * classes inherited from <code>URI</code>.  So refer to the same-named APIs
+     * implemented in each specific protocol instance.
+     *
+     * @param query the query string.
+     * @throws URIException incomplete trailing escape pattern or unsupported
+     * character encoding
+     * @see #encode
+     */
+    public void setQuery(String query) throws URIException {
+        final boolean blank = (query == null || query.length() == 0);
+        if (!blank) {
+            // encode with the protocol charset, then store via the raw setter
+            setRawQuery(encode(query, allowed_query, getProtocolCharset()));
+            return;
+        }
+        if (query == null) {
+            _query = null;
+        } else {
+            _query = query.toCharArray();
+        }
+        setURI();
+    }
+
+
+    /**
+     * Get the raw-escaped query as the stored character array.
+     *
+     * @return the raw-escaped query, or null if there is none
+     */
+    public char[] getRawQuery() {
+        final char[] raw = _query;
+        return raw;
+    }
+
+
+    /**
+     * Get the escaped query as a string.
+     *
+     * @return the escaped query string, or null if there is none
+     */
+    public String getEscapedQuery() {
+        if (_query == null) {
+            return null;
+        }
+        return new String(_query);
+    }
+
+
+    /**
+     * Get the query, decoded with the protocol charset.
+     *
+     * @return the query string, or null if there is none
+     * @throws URIException incomplete trailing escape pattern or unsupported
+     * character encoding
+     * @see #decode
+     */
+    public String getQuery() throws URIException {
+        if (_query == null) {
+            return null;
+        }
+        return decode(_query, getProtocolCharset());
+    }
+
+    // ----------------------------------------------------------- The fragment
+
+    /**
+     * Set the raw-escaped fragment.  A null or empty fragment is stored
+     * without validation; the cached hash is reset whenever the fragment
+     * is stored.
+     *
+     * @param escapedFragment the raw-escaped fragment
+     * @throws URIException escaped fragment not valid
+     */
+    public void setRawFragment(char[] escapedFragment) throws URIException {
+        final boolean blank =
+            (escapedFragment == null || escapedFragment.length == 0);
+        if (!blank && !validate(escapedFragment, fragment)) {
+            throw new URIException(URIException.ESCAPING,
+                    "escaped fragment not valid");
+        }
+        _fragment = escapedFragment;
+        hash = 0;
+    }
+
+
+    /**
+     * Set the escaped fragment string.
+ * + * @param escapedFragment the escaped fragment string + * @throws URIException escaped fragment not valid + */ + public void setEscapedFragment(String escapedFragment) throws URIException { + if (escapedFragment == null) { + _fragment = null; + hash = 0; + return; + } + setRawFragment(escapedFragment.toCharArray()); + } + + + /** + * Set the fragment. + * + * @param fragment the fragment string. + * @throws URIException If an error occurs. + */ + public void setFragment(String fragment) throws URIException { + if (fragment == null || fragment.length() == 0) { + _fragment = (fragment == null) ? null : fragment.toCharArray(); + hash = 0; + return; + } + _fragment = encode(fragment, allowed_fragment, getProtocolCharset()); + hash = 0; + } + + + /** + * Get the raw-escaped fragment. + *

+ * The optional fragment identifier is not part of a URI, but is often used + * in conjunction with a URI. + *

+ * The format and interpretation of fragment identifiers is dependent on + * the media type [RFC2046] of the retrieval result. + *

+ * A fragment identifier is only meaningful when a URI reference is + * intended for retrieval and the result of that retrieval is a document + * for which the identified fragment is consistently defined. + * + * @return the raw-escaped fragment + */ + public char[] getRawFragment() { + return _fragment; + } + + + /** + * Get the escaped fragment. + * + * @return the escaped fragment string + */ + public String getEscapedFragment() { + return (_fragment == null) ? null : new String(_fragment); + } + + + /** + * Get the fragment. + * + * @return the fragment string + * @throws URIException incomplete trailing escape pattern or unsupported + * character encoding + * @see #decode + */ + public String getFragment() throws URIException { + return (_fragment == null) ? null : decode(_fragment, + getProtocolCharset()); + } + + // ------------------------------------------------------------- Utilities + + /** + * Remove the fragment identifier of the given component. + * + * @param component the component that a fragment may be included + * @return the component that the fragment identifier is removed + */ + protected char[] removeFragmentIdentifier(char[] component) { + if (component == null) { + return null; + } + int lastIndex = new String(component).indexOf('#'); + if (lastIndex != -1) { + component = new String(component).substring(0, + lastIndex).toCharArray(); + } + return component; + } + + + /** + * Normalize the given hier path part. + * + *

Algorithm taken from URI reference parser at + * http://www.apache.org/~fielding/uri/rev-2002/issues.html. + * + * @param path the path to normalize + * @return the normalized path + * @throws URIException no more higher path level to be normalized + */ + protected char[] normalize(char[] path) throws URIException { + + if (path == null) { + return null; + } + + String normalized = new String(path); + + // If the buffer begins with "./" or "../", the "." or ".." is removed. + if (normalized.startsWith("./")) { + normalized = normalized.substring(1); + } else if (normalized.startsWith("../")) { + normalized = normalized.substring(2); + } else if (normalized.startsWith("..")) { + normalized = normalized.substring(2); + } + + // All occurrences of "/./" in the buffer are replaced with "/" + int index = -1; + while ((index = normalized.indexOf("/./")) != -1) { + normalized = normalized.substring(0, index) + normalized.substring(index + 2); + } + + // If the buffer ends with "/.", the "." is removed. + if (normalized.endsWith("/.")) { + normalized = normalized.substring(0, normalized.length() - 1); + } + + int startIndex = 0; + + // All occurrences of "//../" in the buffer, where ".." + // and are complete path segments, are iteratively replaced + // with "/" in order from left to right until no matching pattern remains. + // If the buffer ends with "//..", that is also replaced + // with "/". Note that may be empty. + while ((index = normalized.indexOf("/../", startIndex)) != -1) { + int slashIndex = normalized.lastIndexOf('/', index - 1); + if (slashIndex >= 0) { + normalized = normalized.substring(0, slashIndex) + normalized.substring(index + 3); + } else { + startIndex = index + 3; + } + } + if (normalized.endsWith("/..")) { + int slashIndex = normalized.lastIndexOf('/', normalized.length() - 4); + if (slashIndex >= 0) { + normalized = normalized.substring(0, slashIndex + 1); + } + } + + // All prefixes of "/../" in the buffer, where ".." 
+ // and are complete path segments, are iteratively replaced + // with "/" in order from left to right until no matching pattern remains. + // If the buffer ends with "/..", that is also replaced + // with "/". Note that may be empty. + while ((index = normalized.indexOf("/../")) != -1) { + int slashIndex = normalized.lastIndexOf('/', index - 1); + if (slashIndex >= 0) { + break; + } else { + normalized = normalized.substring(index + 3); + } + } + if (normalized.endsWith("/..")) { + int slashIndex = normalized.lastIndexOf('/', normalized.length() - 4); + if (slashIndex < 0) { + normalized = "/"; + } + } + + return normalized.toCharArray(); + } + + + /** + * Normalizes the path part of this URI. Normalization is only meant to be performed on + * URIs with an absolute path. Calling this method on a relative path URI will have no + * effect. + * + * @throws URIException no more higher path level to be normalized + * + * @see #isAbsPath() + */ + public void normalize() throws URIException { + if (isAbsPath()) { + _path = normalize(_path); + setURI(); + } + } + + + /** + * Test if the first array is equal to the second array. + * + * @param first the first character array + * @param second the second character array + * @return true if they're equal + */ + protected boolean equals(char[] first, char[] second) { + + if (first == null && second == null) { + return true; + } + if (first == null || second == null) { + return false; + } + if (first.length != second.length) { + return false; + } + for (int i = 0; i < first.length; i++) { + if (first[i] != second[i]) { + return false; + } + } + return true; + } + + + /** + * Test an object if this URI is equal to another. 
+ * + * @param obj an object to compare + * @return true if two URI objects are equal + */ + public boolean equals(Object obj) { + + // normalize and test each components + if (obj == this) { + return true; + } + if (!(obj instanceof URI)) { + return false; + } + URI another = (URI) obj; + // scheme + if (!equals(_scheme, another._scheme)) { + return false; + } + // is_opaque_part or is_hier_part? and opaque + if (!equals(_opaque, another._opaque)) { + return false; + } + // is_hier_part + // has_authority + if (!equals(_authority, another._authority)) { + return false; + } + // path + if (!equals(_path, another._path)) { + return false; + } + // has_query + if (!equals(_query, another._query)) { + return false; + } + // has_fragment? should be careful of the only fragment case. + if (!equals(_fragment, another._fragment)) { + return false; + } + return true; + } + + // ---------------------------------------------------------- Serialization + + /** + * Write the content of this URI. + * + * @param oos the object-output stream + * @throws IOException If an IO problem occurs. + */ + private void writeObject(ObjectOutputStream oos) + throws IOException { + + oos.defaultWriteObject(); + } + + + /** + * Read a URI. + * + * @param ois the object-input stream + * @throws ClassNotFoundException If one of the classes specified in the + * input stream cannot be found. + * @throws IOException If an IO problem occurs. + */ + private void readObject(ObjectInputStream ois) + throws ClassNotFoundException, IOException { + + ois.defaultReadObject(); + } + + // -------------------------------------------------------------- Hash code + + /** + * Return a hash code for this URI. 
+ * + * @return a has code value for this URI + */ + public int hashCode() { + if (hash == 0) { + char[] c = _uri; + if (c != null) { + for (int i = 0, len = c.length; i < len; i++) { + hash = 31 * hash + c[i]; + } + } + c = _fragment; + if (c != null) { + for (int i = 0, len = c.length; i < len; i++) { + hash = 31 * hash + c[i]; + } + } + } + return hash; + } + + // ------------------------------------------------------------- Comparison + + /** + * Compare this URI to another object. + * + * @param obj the object to be compared. + * @return 0, if it's same, + * -1, if failed, first being compared with in the authority component + * @throws ClassCastException not URI argument + */ + public int compareTo(Object obj) throws ClassCastException { + + URI another = (URI) obj; + if (!equals(_authority, another.getRawAuthority())) { + return -1; + } + return toString().compareTo(another.toString()); + } + + // ------------------------------------------------------------------ Clone + + /** + * Create and return a copy of this object, the URI-reference containing + * the userinfo component. Notice that the whole URI-reference including + * the userinfo component counld not be gotten as a String. + *

+ * To copy the identical URI object including the userinfo + * component, it should be used. + * + * @return a clone of this instance + */ + public synchronized Object clone() throws CloneNotSupportedException { + + URI instance = (URI) super.clone(); + + instance._uri = _uri; + instance._scheme = _scheme; + instance._opaque = _opaque; + instance._authority = _authority; + instance._userinfo = _userinfo; + instance._host = _host; + instance._port = _port; + instance._path = _path; + instance._query = _query; + instance._fragment = _fragment; + // the charset to do escape encoding for this instance + instance.protocolCharset = protocolCharset; + // flags + instance._is_hier_part = _is_hier_part; + instance._is_opaque_part = _is_opaque_part; + instance._is_net_path = _is_net_path; + instance._is_abs_path = _is_abs_path; + instance._is_rel_path = _is_rel_path; + instance._is_reg_name = _is_reg_name; + instance._is_server = _is_server; + instance._is_hostname = _is_hostname; + instance._is_IPv4address = _is_IPv4address; + instance._is_IPv6reference = _is_IPv6reference; + + return instance; + } + + // ------------------------------------------------------------ Get the URI + + /** + * It can be gotten the URI character sequence. It's raw-escaped. + * For the purpose of the protocol to be transported, it will be useful. + *

+ * It is clearly unwise to use a URL that contains a password which is + * intended to be secret. In particular, the use of a password within + * the 'userinfo' component of a URL is strongly disrecommended except + * in those rare cases where the 'password' parameter is intended to be + * public. + *

+ * When you want to get each part of the userinfo, you need to use the + * specific methods in the specific URL. It depends on the specific URL. + * + * @return the URI character sequence + */ + public char[] getRawURI() { + return _uri; + } + + + /** + * It can be gotten the URI character sequence. It's escaped. + * For the purpose of the protocol to be transported, it will be useful. + * + * @return the escaped URI string + */ + public String getEscapedURI() { + return (_uri == null) ? null : new String(_uri); + } + + + /** + * It can be gotten the URI character sequence. + * + * @return the original URI string + * @throws URIException incomplete trailing escape pattern or unsupported + * character encoding + * @see #decode + */ + public String getURI() throws URIException { + return (_uri == null) ? null : decode(_uri, getProtocolCharset()); + } + + + /** + * Get the URI reference character sequence. + * + * @return the URI reference character sequence + */ + public char[] getRawURIReference() { + if (_fragment == null) { + return _uri; + } + if (_uri == null) { + return _fragment; + } + // if _uri != null && _fragment != null + String uriReference = new String(_uri) + "#" + new String(_fragment); + return uriReference.toCharArray(); + } + + + /** + * Get the escaped URI reference string. + * + * @return the escaped URI reference string + */ + public String getEscapedURIReference() { + char[] uriReference = getRawURIReference(); + return (uriReference == null) ? null : new String(uriReference); + } + + + /** + * Get the original URI reference string. + * + * @return the original URI reference string + * @throws URIException If {@link #decode} fails. + */ + public String getURIReference() throws URIException { + char[] uriReference = getRawURIReference(); + return (uriReference == null) ? null : decode(uriReference, + getProtocolCharset()); + } + + + /** + * Get the escaped URI string. + *

+ * In documents, the URI-reference form is used only without the userinfo + * component, like http://jakarta.apache.org/, for security reasons. + * However, a URI-reference with the userinfo component can still be parsed. + *

+ * In other words, this URI and any of its subclasses must not expose the + * URI-reference expression with the userinfo component, like + * http://user:password@hostport/restricted_zone.
+ * It means that the API client programmer should extract each user and + * password to access manually. Probably it will be supported in the each + * subclass, however, not a whole URI-reference expression. + * + * @return the escaped URI string + * @see #clone() + */ + public String toString() { + return getEscapedURI(); + } + + + // ------------------------------------------------------------ Inner class + + /** + * The charset-changed normal operation to represent to be required to + * alert to user the fact the default charset is changed. + */ + public static class DefaultCharsetChanged extends RuntimeException { + + // ------------------------------------------------------- constructors + + /** + * The constructor with a reason string and its code arguments. + * + * @param reasonCode the reason code + * @param reason the reason + */ + public DefaultCharsetChanged(int reasonCode, String reason) { + super(reason); + this.reason = reason; + this.reasonCode = reasonCode; + } + + // ---------------------------------------------------------- constants + + /** No specified reason code. */ + public static final int UNKNOWN = 0; + + /** Protocol charset changed. */ + public static final int PROTOCOL_CHARSET = 1; + + /** Document charset changed. */ + public static final int DOCUMENT_CHARSET = 2; + + // ------------------------------------------------- instance variables + + /** The reason code. */ + private int reasonCode; + + /** The reason message. */ + private String reason; + + // ------------------------------------------------------------ methods + + /** + * Get the reason code. + * + * @return the reason code + */ + public int getReasonCode() { + return reasonCode; + } + + /** + * Get the reason message. + * + * @return the reason message + */ + public String getReason() { + return reason; + } + + } + + + /** + * A mapping to determine the (somewhat arbitrarily) preferred charset for a + * given locale. Supports all locales recognized in JDK 1.1. + *

+ * The distribution of this class is Servlets.com. It was originally + * written by Jason Hunter [jhunter at acm.org] and used by with permission. + */ + public static class LocaleToCharsetMap { + + /** A mapping of language code to charset */ + private static final Hashtable LOCALE_TO_CHARSET_MAP; + static { + LOCALE_TO_CHARSET_MAP = new Hashtable(); + LOCALE_TO_CHARSET_MAP.put("ar", "ISO-8859-6"); + LOCALE_TO_CHARSET_MAP.put("be", "ISO-8859-5"); + LOCALE_TO_CHARSET_MAP.put("bg", "ISO-8859-5"); + LOCALE_TO_CHARSET_MAP.put("ca", "ISO-8859-1"); + LOCALE_TO_CHARSET_MAP.put("cs", "ISO-8859-2"); + LOCALE_TO_CHARSET_MAP.put("da", "ISO-8859-1"); + LOCALE_TO_CHARSET_MAP.put("de", "ISO-8859-1"); + LOCALE_TO_CHARSET_MAP.put("el", "ISO-8859-7"); + LOCALE_TO_CHARSET_MAP.put("en", "ISO-8859-1"); + LOCALE_TO_CHARSET_MAP.put("es", "ISO-8859-1"); + LOCALE_TO_CHARSET_MAP.put("et", "ISO-8859-1"); + LOCALE_TO_CHARSET_MAP.put("fi", "ISO-8859-1"); + LOCALE_TO_CHARSET_MAP.put("fr", "ISO-8859-1"); + LOCALE_TO_CHARSET_MAP.put("hr", "ISO-8859-2"); + LOCALE_TO_CHARSET_MAP.put("hu", "ISO-8859-2"); + LOCALE_TO_CHARSET_MAP.put("is", "ISO-8859-1"); + LOCALE_TO_CHARSET_MAP.put("it", "ISO-8859-1"); + LOCALE_TO_CHARSET_MAP.put("iw", "ISO-8859-8"); + LOCALE_TO_CHARSET_MAP.put("ja", "Shift_JIS"); + LOCALE_TO_CHARSET_MAP.put("ko", "EUC-KR"); + LOCALE_TO_CHARSET_MAP.put("lt", "ISO-8859-2"); + LOCALE_TO_CHARSET_MAP.put("lv", "ISO-8859-2"); + LOCALE_TO_CHARSET_MAP.put("mk", "ISO-8859-5"); + LOCALE_TO_CHARSET_MAP.put("nl", "ISO-8859-1"); + LOCALE_TO_CHARSET_MAP.put("no", "ISO-8859-1"); + LOCALE_TO_CHARSET_MAP.put("pl", "ISO-8859-2"); + LOCALE_TO_CHARSET_MAP.put("pt", "ISO-8859-1"); + LOCALE_TO_CHARSET_MAP.put("ro", "ISO-8859-2"); + LOCALE_TO_CHARSET_MAP.put("ru", "ISO-8859-5"); + LOCALE_TO_CHARSET_MAP.put("sh", "ISO-8859-5"); + LOCALE_TO_CHARSET_MAP.put("sk", "ISO-8859-2"); + LOCALE_TO_CHARSET_MAP.put("sl", "ISO-8859-2"); + LOCALE_TO_CHARSET_MAP.put("sq", "ISO-8859-2"); + 
LOCALE_TO_CHARSET_MAP.put("sr", "ISO-8859-5"); + LOCALE_TO_CHARSET_MAP.put("sv", "ISO-8859-1"); + LOCALE_TO_CHARSET_MAP.put("tr", "ISO-8859-9"); + LOCALE_TO_CHARSET_MAP.put("uk", "ISO-8859-5"); + LOCALE_TO_CHARSET_MAP.put("zh", "GB2312"); + LOCALE_TO_CHARSET_MAP.put("zh_TW", "Big5"); + } + + /** + * Get the preferred charset for the given locale. + * + * @param locale the locale + * @return the preferred charset or null if the locale is not + * recognized. + */ + public static String getCharset(Locale locale) { + // try for an full name match (may include country) + String charset = + (String) LOCALE_TO_CHARSET_MAP.get(locale.toString()); + if (charset != null) { + return charset; + } + + // if a full name didn't match, try just the language + charset = (String) LOCALE_TO_CHARSET_MAP.get(locale.getLanguage()); + return charset; // may be null + } + + } + +} + diff --git a/src/main/java/org/archive/url/URIException.java b/src/main/java/org/archive/url/URIException.java new file mode 100644 index 00000000..b32c68cf --- /dev/null +++ b/src/main/java/org/archive/url/URIException.java @@ -0,0 +1,180 @@ +/* + * $Header: /home/jerenkrantz/tmp/commons/commons-convert/cvs/home/cvs/jakarta-commons//httpclient/src/java/org/apache/commons/httpclient/URIException.java,v 1.12 2004/09/30 18:53:20 olegk Exp $ + * $Revision: 480424 $ + * $Date: 2006-11-29 06:56:49 +0100 (Wed, 29 Nov 2006) $ + * + * ==================================================================== + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * . + * + */ + +package org.archive.url; + +import java.io.IOException; + +/** + * The URI parsing and escape encoding exception. + * + * @author Sung-Gu + * @author Oleg Kalnichevski + * @version $Revision: 480424 $ $Date: 2002/03/14 15:14:01 + */ +public class URIException extends IOException { + + // ----------------------------------------------------------- constructors + + /** + * Default constructor. + */ + public URIException() { + } + + + /** + * The constructor with a reason code argument. + * + * @param reasonCode the reason code + */ + public URIException(int reasonCode) { + this.reasonCode = reasonCode; + } + + + /** + * The constructor with a reason string and its code arguments. + * + * @param reasonCode the reason code + * @param reason the reason + */ + public URIException(int reasonCode, String reason) { + super(reason); // for backward compatibility of Throwable + this.reason = reason; + this.reasonCode = reasonCode; + } + + + /** + * The constructor with a reason string argument. 
+ * + * @param reason the reason + */ + public URIException(String reason) { + super(reason); // for backward compatibility of Throwable + this.reason = reason; + this.reasonCode = UNKNOWN; + } + + // -------------------------------------------------------------- constants + + /** + * No specified reason code. + */ + public static final int UNKNOWN = 0; + + + /** + * The URI parsing error. + */ + public static final int PARSING = 1; + + + /** + * The unsupported character encoding. + */ + public static final int UNSUPPORTED_ENCODING = 2; + + + /** + * The URI escape encoding and decoding error. + */ + public static final int ESCAPING = 3; + + + /** + * The DNS punycode encoding or decoding error. + */ + public static final int PUNYCODE = 4; + + // ------------------------------------------------------------- properties + + /** + * The reason code. + */ + protected int reasonCode; + + + /** + * The reason message. + */ + protected String reason; + + // ---------------------------------------------------------------- methods + + /** + * Get the reason code. + * + * @return the reason code + */ + public int getReasonCode() { + return reasonCode; + } + + /** + * Set the reason code. + * + * @param reasonCode the reason code + * + * @deprecated Callers should set the reason code as a parameter to the + * constructor. + */ + public void setReasonCode(int reasonCode) { + this.reasonCode = reasonCode; + } + + + /** + * Get the reason message. + * + * @return the reason message + * + * @deprecated You should instead call {@link #getMessage()}. + */ + public String getReason() { + return reason; + } + + + /** + * Set the reason message. + * + * @param reason the reason message + * + * @deprecated Callers should instead set this via a parameter to the constructor. 
+ */ + public void setReason(String reason) { + this.reason = reason; + } + + +} + diff --git a/src/main/java/org/archive/url/UsableURI.java b/src/main/java/org/archive/url/UsableURI.java index ed40f41a..b7d0cf71 100644 --- a/src/main/java/org/archive/url/UsableURI.java +++ b/src/main/java/org/archive/url/UsableURI.java @@ -26,14 +26,13 @@ import java.net.URI; import java.net.URISyntaxException; -import org.apache.commons.httpclient.URIException; import org.archive.util.SURT; import org.archive.util.TextUtils; /** * Usable URI. * - * This class wraps {@link org.apache.commons.httpclient.URI} adding caching + * This class wraps {@link org.archive.url.URI} adding caching * and methods. It cannot be instantiated directly. Go via UURIFactory. * *

We used to use {@link java.net.URI} for parsing URIs but ran across @@ -50,7 +49,7 @@ * @author gojomo * @author stack * - * @see org.apache.commons.httpclient.URI + * @see org.archive.url.URI */ public class UsableURI extends LaxURI implements CharSequence, Serializable { @@ -121,7 +120,6 @@ protected UsableURI() { * @param uri String representation of an absolute URI. * @param escaped If escaped. * @param charset Charset to use. - * @throws org.apache.commons.httpclient.URIException */ protected UsableURI(String uri, boolean escaped, String charset) throws URIException { @@ -132,7 +130,6 @@ protected UsableURI(String uri, boolean escaped, String charset) /** * @param relative String representation of URI. * @param base Parent UURI to use derelativizing. - * @throws org.apache.commons.httpclient.URIException */ protected UsableURI(UsableURI base, UsableURI relative) throws URIException { super(base, relative); @@ -275,7 +272,7 @@ public String toString() { /** * In the case of a puny encoded IDN, this method returns the decoded Unicode version. *

- * Most of this implementation is copied from {@link org.apache.commons.httpclient.URI#setURI()}. + * Most of this implementation is copied from {@link org.archive.url.URI#setURI()}. * * @return decoded IDN version of URI */ diff --git a/src/main/java/org/archive/url/UsableURIFactory.java b/src/main/java/org/archive/url/UsableURIFactory.java index 3dfc33a7..08f18999 100644 --- a/src/main/java/org/archive/url/UsableURIFactory.java +++ b/src/main/java/org/archive/url/UsableURIFactory.java @@ -28,8 +28,6 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; -import org.apache.commons.httpclient.URI; -import org.apache.commons.httpclient.URIException; import org.archive.util.TextUtils; /** diff --git a/src/main/java/org/archive/util/ChunkedInputStream.java b/src/main/java/org/archive/util/ChunkedInputStream.java new file mode 100644 index 00000000..69b23047 --- /dev/null +++ b/src/main/java/org/archive/util/ChunkedInputStream.java @@ -0,0 +1,324 @@ +/* + * $Header: /home/jerenkrantz/tmp/commons/commons-convert/cvs/home/cvs/jakarta-commons//httpclient/src/java/org/apache/commons/httpclient/ChunkedInputStream.java,v 1.24 2004/10/10 15:18:55 olegk Exp $ + * $Revision: 480424 $ + * $Date: 2006-11-29 06:56:49 +0100 (Wed, 29 Nov 2006) $ + * + * ==================================================================== + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * . + * + */ + +package org.archive.util; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; + + +/** + *

Transparently coalesces chunks of an HTTP stream that uses + * Transfer-Encoding: chunked.

+ * + *

Note that this class NEVER closes the underlying stream, even when close + * gets called. Instead, it will read until the "end" of its chunking on close, + * which allows for the seamless invocation of subsequent HTTP 1.1 calls, while + * not requiring the client to remember to read the entire contents of the + * response.

+ * + * @author Ortwin Glueck + * @author Sean C. Sullivan + * @author Martin Elwin + * @author Eric Johnson + * @author Mike Bowler + * @author Michael Becke + * @author Oleg Kalnichevski + * + * @since 2.0 + * + */ +class ChunkedInputStream extends InputStream { + /** The inputstream that we're wrapping */ + private InputStream in; + + /** The chunk size */ + private int chunkSize; + + /** The current position within the current chunk */ + private int pos; + + /** True if we'are at the beginning of stream */ + private boolean bof = true; + + /** True if we've reached the end of stream */ + private boolean eof = false; + + /** True if this stream is closed */ + private boolean closed = false; + + /** + * ChunkedInputStream constructor + * + * @param in the raw input stream + * + */ + public ChunkedInputStream(final InputStream in) { + + if (in == null) { + throw new IllegalArgumentException("InputStream parameter may not be null"); + } + this.in = in; + this.pos = 0; + } + + /** + *

Returns all the data in a chunked stream in coalesced form. A chunk + * is followed by a CRLF. The method returns -1 as soon as a chunksize of 0 + * is detected.

+ * + *

Trailer headers are read automatically at the end of the stream and + * discarded; this class has no getResponseFooters() method to retrieve them.

+ * + * @return -1 of the end of the stream has been reached or the next data + * byte + * @throws IOException If an IO problem occurs + */ + public int read() throws IOException { + + if (closed) { + throw new IOException("Attempted read from closed stream."); + } + if (eof) { + return -1; + } + if (pos >= chunkSize) { + nextChunk(); + if (eof) { + return -1; + } + } + pos++; + return in.read(); + } + + /** + * Read some bytes from the stream. + * @param b The byte array that will hold the contents from the stream. + * @param off The offset into the byte array at which bytes will start to be + * placed. + * @param len the maximum number of bytes that can be returned. + * @return The number of bytes returned or -1 if the end of stream has been + * reached. + * @see InputStream#read(byte[], int, int) + * @throws IOException if an IO problem occurs. + */ + public int read (byte[] b, int off, int len) throws IOException { + + if (closed) { + throw new IOException("Attempted read from closed stream."); + } + + if (eof) { + return -1; + } + if (pos >= chunkSize) { + nextChunk(); + if (eof) { + return -1; + } + } + len = Math.min(len, chunkSize - pos); + int count = in.read(b, off, len); + pos += count; + return count; + } + + /** + * Read some bytes from the stream. + * @param b The byte array that will hold the contents from the stream. + * @return The number of bytes returned or -1 if the end of stream has been + * reached. + * @see InputStream#read(byte[]) + * @throws IOException if an IO problem occurs. + */ + public int read (byte[] b) throws IOException { + return read(b, 0, b.length); + } + + /** + * Read the CRLF terminator. + * @throws IOException If an IO error occurs. + */ + private void readCRLF() throws IOException { + int cr = in.read(); + int lf = in.read(); + if ((cr != '\r') || (lf != '\n')) { + throw new IOException( + "CRLF expected at end of chunk: " + cr + "/" + lf); + } + } + + + /** + * Read the next chunk. 
+ * @throws IOException If an IO error occurs. + */ + private void nextChunk() throws IOException { + if (!bof) { + readCRLF(); + } + chunkSize = getChunkSizeFromInputStream(in); + bof = false; + pos = 0; + if (chunkSize == 0) { + eof = true; + parseTrailerHeaders(); + } + } + + /** + * Expects the stream to start with a chunksize in hex with optional + * comments after a semicolon. The line must end with a CRLF: "a3; some + * comment\r\n" Positions the stream at the start of the next line. + * + * @param in The new input stream. + * + * @return the chunk size as integer + * + * @throws IOException when the chunk size could not be parsed + */ + private static int getChunkSizeFromInputStream(final InputStream in) + throws IOException { + + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + // States: 0=normal, 1=\r was scanned, 2=inside quoted string, -1=end + int state = 0; + while (state != -1) { + int b = in.read(); + if (b == -1) { + throw new IOException("chunked stream ended unexpectedly"); + } + switch (state) { + case 0: + switch (b) { + case '\r': + state = 1; + break; + case '\"': + state = 2; + /* fall through */ + default: + baos.write(b); + } + break; + + case 1: + if (b == '\n') { + state = -1; + } else { + // this was not CRLF + throw new IOException("Protocol violation: Unexpected" + + " single newline character in chunk size"); + } + break; + + case 2: + switch (b) { + case '\\': + b = in.read(); + baos.write(b); + break; + case '\"': + state = 0; + /* fall through */ + default: + baos.write(b); + } + break; + default: throw new RuntimeException("assertion failed"); + } + } + + //parse data + String dataString = baos.toString(StandardCharsets.US_ASCII.name()); + int separator = dataString.indexOf(';'); + dataString = (separator > 0) + ? 
dataString.substring(0, separator).trim() + : dataString.trim(); + + int result; + try { + result = Integer.parseInt(dataString.trim(), 16); + } catch (NumberFormatException e) { + throw new IOException ("Bad chunk size: " + dataString); + } + return result; + } + + /** + * Reads and stores the Trailer headers. + * @throws IOException If an IO problem occurs + */ + private void parseTrailerHeaders() throws IOException { + String charset = "US-ASCII"; + LaxHttpParser.parseHeaders(in, charset); + } + + /** + * Upon close, this reads the remainder of the chunked message, + * leaving the underlying socket at a position to start reading the + * next response without scanning. + * @throws IOException If an IO problem occurs. + */ + public void close() throws IOException { + if (!closed) { + try { + if (!eof) { + exhaustInputStream(this); + } + } finally { + eof = true; + closed = true; + } + } + } + + /** + * Exhaust an input stream, reading until EOF has been encountered. + * + *

Note that this function is intended as a non-public utility. + * This is a little weird, but it seemed silly to make a utility + * class for this one function, so instead it is just static and + * shared that way.

+ * + * @param inStream The {@link InputStream} to exhaust. + * @throws IOException If an IO problem occurs + */ + static void exhaustInputStream(InputStream inStream) throws IOException { + // read and discard the remainder of the message + byte buffer[] = new byte[1024]; + while (inStream.read(buffer) >= 0) { + ; + } + } +} diff --git a/src/main/java/org/archive/util/LaxHttpParser.java b/src/main/java/org/archive/util/LaxHttpParser.java index 9e38669b..0545fd95 100644 --- a/src/main/java/org/archive/util/LaxHttpParser.java +++ b/src/main/java/org/archive/util/LaxHttpParser.java @@ -35,13 +35,11 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; +import java.io.UnsupportedEncodingException; import java.util.ArrayList; +import java.util.logging.Logger; -import org.apache.commons.httpclient.Header; -import org.apache.commons.httpclient.HttpException; -import org.apache.commons.httpclient.util.EncodingUtil; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; +import org.archive.format.http.HttpHeader; /** * A Modified version of HttpParser which doesn't throw exceptions on bad header lines @@ -57,7 +55,7 @@ public class LaxHttpParser { /** Log object for this class. */ - private static final Log LOG = LogFactory.getLog(LaxHttpParser.class); + private static final Logger LOG = Logger.getLogger(LaxHttpParser.class.getName()); /** * Constructor for LaxHttpParser. 
@@ -77,7 +75,7 @@ protected LaxHttpParser() { } * @return a byte array from the stream */ public static byte[] readRawLine(InputStream inputStream) throws IOException { - LOG.trace("enter LaxHttpParser.readRawLine()"); + LOG.finest("enter LaxHttpParser.readRawLine()"); ByteArrayOutputStream buf = new ByteArrayOutputStream(); int ch; @@ -108,7 +106,7 @@ public static byte[] readRawLine(InputStream inputStream) throws IOException { * @since 3.0 */ public static String readLine(InputStream inputStream, String charset) throws IOException { - LOG.trace("enter LaxHttpParser.readLine(InputStream, String)"); + LOG.finest("enter LaxHttpParser.readLine(InputStream, String)"); byte[] rawdata = readRawLine(inputStream); if (rawdata == null) { return null; @@ -126,7 +124,11 @@ public static String readLine(InputStream inputStream, String charset) throws IO } } } - return EncodingUtil.getString(rawdata, 0, len - offset, charset); + try { + return new String(rawdata, 0, len - offset, charset); + } catch (UnsupportedEncodingException e) { + return new String(rawdata, 0, len - offset); + } } /** @@ -144,7 +146,7 @@ public static String readLine(InputStream inputStream, String charset) throws IO */ public static String readLine(InputStream inputStream) throws IOException { - LOG.trace("enter LaxHttpParser.readLine(InputStream)"); + LOG.finest("enter LaxHttpParser.readLine(InputStream)"); return readLine(inputStream, "US-ASCII"); } @@ -158,14 +160,13 @@ public static String readLine(InputStream inputStream) throws IOException { * @return an array of headers in the order in which they were parsed * * @throws IOException if an IO error occurs while reading from the stream - * @throws HttpException if there is an error parsing a header value - * + * * @since 3.0 */ - public static Header[] parseHeaders(InputStream is, String charset) throws IOException, HttpException { - LOG.trace("enter HeaderParser.parseHeaders(InputStream, String)"); + public static HttpHeader[] 
parseHeaders(InputStream is, String charset) throws IOException { + LOG.finest("enter HeaderParser.parseHeaders(InputStream, String)"); - ArrayList<Header> headers = new ArrayList<Header>(); + ArrayList<HttpHeader> headers = new ArrayList<>(); String name = null; StringBuffer value = null; for (; ;) { @@ -188,7 +189,7 @@ public static Header[] parseHeaders(InputStream is, String charset) throws IOExc } else { // make sure we save the previous name,value pair if present if (name != null) { - headers.add(new Header(name, value.toString())); + headers.add(new HttpHeader(name, value.toString())); } // Otherwise we should have normal HTTP header line @@ -216,10 +217,10 @@ public static Header[] parseHeaders(InputStream is, String charset) throws IOExc // make sure we save the last name,value pair if present if (name != null) { - headers.add(new Header(name, value.toString())); + headers.add(new HttpHeader(name, value.toString())); } - return (Header[]) headers.toArray(new Header[headers.size()]); + return headers.toArray(new HttpHeader[0]); } /** @@ -231,12 +232,11 @@ public static Header[] parseHeaders(InputStream is, String charset) throws IOExc * @return an array of headers in the order in which they were parsed * * @throws IOException if an IO error occurs while reading from the stream - * @throws HttpException if there is an error parsing a header value - * + * * @deprecated use #parseHeaders(InputStream, String) */ - public static Header[] parseHeaders(InputStream is) throws IOException, HttpException { - LOG.trace("enter HeaderParser.parseHeaders(InputStream, String)"); + public static HttpHeader[] parseHeaders(InputStream is) throws IOException { + LOG.finest("enter HeaderParser.parseHeaders(InputStream, String)"); return parseHeaders(is, "US-ASCII"); } } diff --git a/src/main/java/org/archive/util/Recorder.java b/src/main/java/org/archive/util/Recorder.java index dff02bff..61cbf871 100644 --- a/src/main/java/org/archive/util/Recorder.java +++ b/src/main/java/org/archive/util/Recorder.java @@ -32,7 +32,6 @@ import java.util.zip.DeflaterInputStream; import java.util.zip.GZIPInputStream; -import org.apache.commons.httpclient.ChunkedInputStream; import
org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.lang.StringUtils; diff --git a/src/main/java/org/archive/util/SURT.java b/src/main/java/org/archive/util/SURT.java index 69daf247..059b2ec6 100644 --- a/src/main/java/org/archive/util/SURT.java +++ b/src/main/java/org/archive/util/SURT.java @@ -29,7 +29,7 @@ import java.io.PrintStream; import java.util.regex.Matcher; -import org.apache.commons.httpclient.URIException; +import org.archive.url.URIException; import org.archive.url.UsableURIFactory; /** diff --git a/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java b/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java index 68ee6551..69189862 100644 --- a/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java +++ b/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java @@ -3,7 +3,6 @@ import java.io.IOException; import org.archive.util.binsearch.SeekableLineReaderFactory; -import org.archive.util.binsearch.impl.http.ApacheHttp31SLRFactory; import org.archive.util.binsearch.impl.http.ApacheHttp43SLRFactory; import org.archive.util.binsearch.impl.http.HTTPURLConnSLRFactory; @@ -20,15 +19,13 @@ protected HTTPSeekableLineReaderFactory() public enum HttpLibs { - @Deprecated - APACHE_31, APACHE_43, URLCONN, } public static HTTPSeekableLineReaderFactory getHttpFactory() { - return getHttpFactory(HttpLibs.APACHE_31); + return getHttpFactory(HttpLibs.APACHE_43); } public static HTTPSeekableLineReaderFactory getHttpFactory(HttpLibs type) @@ -38,7 +35,7 @@ public static HTTPSeekableLineReaderFactory getHttpFactory(HttpLibs type) public static HTTPSeekableLineReaderFactory getHttpFactory(String defaultURL) { - return getHttpFactory(HttpLibs.APACHE_31, defaultURL); + return getHttpFactory(HttpLibs.APACHE_43, defaultURL); } public static HTTPSeekableLineReaderFactory getHttpFactory(HttpLibs type, String defaultURL) @@ -46,10 
+43,6 @@ public static HTTPSeekableLineReaderFactory getHttpFactory(HttpLibs type, String HTTPSeekableLineReaderFactory factory = null; switch (type) { - case APACHE_31: - factory = new ApacheHttp31SLRFactory(); - break; - case URLCONN: factory = new HTTPURLConnSLRFactory(); break; @@ -60,7 +53,7 @@ public static HTTPSeekableLineReaderFactory getHttpFactory(HttpLibs type, String } if (factory == null) { - factory = new ApacheHttp31SLRFactory(); + factory = new ApacheHttp43SLRFactory(); } factory.defaultURL = defaultURL; diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java deleted file mode 100644 index 124d3d03..00000000 --- a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java +++ /dev/null @@ -1,235 +0,0 @@ -package org.archive.util.binsearch.impl.http; - -import java.io.IOException; -import java.io.InputStream; -import java.net.URISyntaxException; - -import org.apache.commons.httpclient.Header; -import org.apache.commons.httpclient.HttpClient; -import org.apache.commons.httpclient.HttpException; -import org.apache.commons.httpclient.HttpMethod; -import org.apache.commons.httpclient.cookie.CookiePolicy; -import org.apache.commons.httpclient.methods.GetMethod; -import org.apache.commons.httpclient.methods.HeadMethod; -import org.apache.commons.io.input.CountingInputStream; -import org.archive.util.binsearch.impl.HTTPSeekableLineReader; - -/** - * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0 - */ -@Deprecated -public class ApacheHttp31SLR extends HTTPSeekableLineReader { - - private HttpClient http; - private String url; - private long length = -1; - - protected CountingInputStream cin; - - private GetMethod activeMethod; - - public ApacheHttp31SLR(HttpClient http, String url) { - this.http = http; - this.url = url; - } - - private void acquireLength() throws URISyntaxException, 
HttpException, IOException { - HttpMethod head = new HeadMethod(url); - int code = http.executeMethod(head); - if(code != 200) { - throw new IOException("Unable to retrieve from " + url); - } - Header lengthHeader = head.getResponseHeader(CONTENT_LENGTH); - if(lengthHeader == null) { - throw new IOException("No Content-Length header for " + url); - } - String val = lengthHeader.getValue(); - try { - length = Long.parseLong(val); - } catch(NumberFormatException e) { - throw new IOException("Bad Content-Length value " +url+ ": " + val); - } - } - - protected String getHeader(String header) throws URISyntaxException, HttpException, IOException { - HttpMethod head = new HeadMethod(url); - int code = http.executeMethod(head); - if(code != 200) { - throw new IOException("Unable to retrieve from " + url); - } - Header theHeader = head.getResponseHeader(header); - if(theHeader == null) { - throw new IOException("No " + header + " header for " + url); - } - String val = theHeader.getValue(); - return val; - } - - /* (non-Javadoc) - * @see org.archive.util.binsearch.impl.HTTPSeekableLineReader#getUrl() - */ - @Override - public String getUrl() - { - return url; - } - -// public void seek(long offset, boolean gzip) throws IOException { -// is = doSeekLoad(offset, -1); -// -// if (gzip) { -// is = new GZIPMembersInputStream(is, blockSize); -// } -// } - -// public void seekWithMaxRead(long offset, boolean gzip, int maxLength) throws IOException { -// is = doSeekLoad(offset, maxLength); -// -// if (bufferFully && (maxLength > 0) && (maxLength < 1e10)) { -// try { -// byte[] buffer = new byte[maxLength]; -// ByteStreams.readFully(is, buffer); -// is.close(); -// -// // Create new stream -// is = new ByteArrayInputStream(buffer); -// } finally { -// activeMethod.releaseConnection(); -// activeMethod = null; -// } -// } -// -// if (gzip) { -// is = new GZIPMembersInputStream(is, blockSize); -// } -// } - - protected InputStream doSeekLoad(long offset, int maxLength) throws 
IOException { - if (activeMethod != null) { - doClose(); - } - - br = null; - - try { - - activeMethod = new GetMethod(url); - - String rangeHeader = makeRangeHeader(offset, maxLength); - - if (rangeHeader != null) { - activeMethod.setRequestHeader("Range", rangeHeader); - } - - if (this.isNoKeepAlive()) { - activeMethod.setRequestHeader("Connection", "close"); - } - - if (this.getCookie() != null) { - activeMethod.getParams().setCookiePolicy(CookiePolicy.IGNORE_COOKIES); - activeMethod.setRequestHeader("Cookie", this.getCookie()); - } - - int code = http.executeMethod(activeMethod); - - connectedUrl = activeMethod.getURI().toString(); - - if ((code != 206) && (code != 200)) { - throw new BadHttpStatusException(code, connectedUrl + " " + rangeHeader); - } - - InputStream is = activeMethod.getResponseBodyAsStream(); - cin = new CountingInputStream(is); - return cin; - - } catch (IOException io) { - if (saveErrHeader != null) { - errHeader = getHeaderValue(saveErrHeader); - } - - connectedUrl = activeMethod.getURI().toString(); - doClose(); - throw io; - } - } - - public GetMethod getHttpMethod() - { - return activeMethod; - } - - public void doClose() throws IOException { - - if (activeMethod == null) { - return; - } - - try { - long contentLength = activeMethod.getResponseContentLength(); - - long bytesRead = (cin != null ? 
cin.getByteCount() : 0); - - // If fully read, close gracefully, otherwise abort - if ((contentLength > 0) && (contentLength == bytesRead)) { -// try { -// cin.close(); -// } catch (IOException e) { -// activeMethod.abort(); -// } - } else { - activeMethod.abort(); - } - - activeMethod.releaseConnection(); - activeMethod = null; - - } finally { - if (activeMethod != null) { - activeMethod.abort(); - activeMethod.releaseConnection(); - activeMethod = null; - } - } - - cin = null; - is = null; - br = null; - } - - /* (non-Javadoc) - * @see org.archive.util.binsearch.impl.HTTPSeekableLineReader#getSize() - */ - @Override - public long getSize() throws IOException { - if (length < 0) { - try { - if (activeMethod != null) { - length = activeMethod.getResponseContentLength(); - } else { - acquireLength(); - } - } catch (URISyntaxException e) { - throw new IOException(e); - } - } - return length; - } - - /* (non-Javadoc) - * @see org.archive.util.binsearch.impl.HTTPSeekableLineReader#getHeaderValue(java.lang.String) - */ - @Override - public String getHeaderValue(String headerName) { - if (activeMethod == null) { - return null; - } - - Header header = activeMethod.getResponseHeader(headerName); - - if (header == null) { - return null; - } - - return header.getValue(); - } -} diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java deleted file mode 100644 index 2af03dab..00000000 --- a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java +++ /dev/null @@ -1,192 +0,0 @@ -package org.archive.util.binsearch.impl.http; - -import java.io.IOException; -import java.text.SimpleDateFormat; -import java.util.Date; -import java.util.Locale; -import java.util.logging.Logger; - -import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler; -import org.apache.commons.httpclient.HostConfiguration; -import 
org.apache.commons.httpclient.HttpClient; -import org.apache.commons.httpclient.HttpConnectionManager; -import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager; -import org.apache.commons.httpclient.params.HttpClientParams; -import org.archive.util.binsearch.impl.HTTPSeekableLineReader; -import org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory; - -/** - * - * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0 - */ -@Deprecated -public class ApacheHttp31SLRFactory extends HTTPSeekableLineReaderFactory { - private final static Logger LOGGER = Logger.getLogger(ApacheHttp31SLRFactory.class.getName()); - - private HttpConnectionManager connectionManager = null; - private HostConfiguration hostConfiguration = null; - private HttpClient http = null; - - public ApacheHttp31SLRFactory(String uriString) { - this(); - } - - public ApacheHttp31SLRFactory() { - connectionManager = new MultiThreadedHttpConnectionManager(); - //connectionManager = new ThreadLocalHttpConnectionManager(); - hostConfiguration = new HostConfiguration(); - HttpClientParams params = new HttpClientParams(); - http = new HttpClient(params,connectionManager); - http.setHostConfiguration(hostConfiguration); - } - - public void close() throws IOException - { - //connectionManager.deleteClosedConnections(); - connectionManager.closeIdleConnections(0); - } - - @Override - public ApacheHttp31SLR get(String url) throws IOException { - -// if (LOGGER.isLoggable(Level.FINEST)) { -// LOGGER.finest("Connections: " + connectionManager.getConnectionsInPool(hostConfiguration)); -// } - - return new ApacheHttp31SLR(http, url); - } - /* (non-Javadoc) - * @see org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory#setProxyHostPort(java.lang.String) - */ - @Override - public void setProxyHostPort(String hostPort) { - int colonIdx = hostPort.indexOf(':'); - if(colonIdx > 0) { - String host = hostPort.substring(0,colonIdx); - int port = 
Integer.valueOf(hostPort.substring(colonIdx+1)); - -// http.getHostConfiguration().setProxy(host, port); - hostConfiguration.setProxy(host, port); - } - } - /* (non-Javadoc) - * @see org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory#setMaxTotalConnections(int) - */ - @Override - public void setMaxTotalConnections(int maxTotalConnections) { - connectionManager.getParams(). - setMaxTotalConnections(maxTotalConnections); - } - /* (non-Javadoc) - * @see org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory#getMaxTotalConnections() - */ - @Override - public int getMaxTotalConnections() { - return connectionManager.getParams().getMaxTotalConnections(); - } - - /* (non-Javadoc) - * @see org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory#setMaxHostConnections(int) - */ - @Override - public void setMaxHostConnections(int maxHostConnections) { - connectionManager.getParams().setDefaultMaxConnectionsPerHost(maxHostConnections); - connectionManager.getParams().setMaxConnectionsPerHost(hostConfiguration, maxHostConnections); - } - - /* (non-Javadoc) - * @see org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory#getMaxHostConnections() - */ - @Override - public int getMaxHostConnections() { - return connectionManager.getParams(). 
- getMaxConnectionsPerHost(hostConfiguration); - } - - /* (non-Javadoc) - * @see org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory#getConnectionTimeoutMS() - */ - @Override - public int getConnectionTimeoutMS() { - return connectionManager.getParams().getConnectionTimeout(); - } - - /* (non-Javadoc) - * @see org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory#setConnectionTimeoutMS(int) - */ - @Override - public void setConnectionTimeoutMS(int connectionTimeoutMS) { - connectionManager.getParams().setConnectionTimeout(connectionTimeoutMS); - http.getParams().setConnectionManagerTimeout(connectionTimeoutMS); - } - - /* (non-Javadoc) - * @see org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory#getSocketTimeoutMS() - */ - @Override - public int getSocketTimeoutMS() { - return connectionManager.getParams().getSoTimeout(); - } - - /* (non-Javadoc) - * @see org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory#setSocketTimeoutMS(int) - */ - @Override - public void setSocketTimeoutMS(int socketTimeoutMS) { - connectionManager.getParams().setSoTimeout(socketTimeoutMS); - } - - /* (non-Javadoc) - * @see org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory#setStaleChecking(boolean) - */ - @Override - public void setStaleChecking(boolean enabled) - { - connectionManager.getParams().setStaleCheckingEnabled(enabled); - } - - /* (non-Javadoc) - * @see org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory#isStaleChecking() - */ - @Override - public boolean isStaleChecking() - { - return connectionManager.getParams().isStaleCheckingEnabled(); - } - - // Experimental - /* (non-Javadoc) - * @see org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory#getModTime() - */ - @Override - public long getModTime() - { - HTTPSeekableLineReader reader = null; - SimpleDateFormat lastModFormat = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss zzz", Locale.ENGLISH); - - try { - reader = get(); - String result = 
reader.getHeaderValue(HTTPSeekableLineReader.LAST_MODIFIED); - Date date = lastModFormat.parse(result); - return date.getTime(); - - } catch (Exception e) { - e.printStackTrace(); - } finally { - if (reader != null) { - try { - reader.close(); - } catch (IOException e) { - - } - } - } - - return 0; - } - - @Override - public void setNumRetries(int numRetries) { - http.getParams().setParameter(HttpClientParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler(numRetries, true)); - } -} diff --git a/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java b/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java index 7988cb2b..005e2c49 100644 --- a/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java +++ b/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java @@ -26,7 +26,7 @@ import java.util.Map; import java.util.Set; -import org.apache.commons.httpclient.Header; +import org.archive.format.http.HttpHeader; import org.archive.io.arc.ARCRecord; import org.archive.io.warc.WARCRecord; import org.junit.jupiter.api.Test; @@ -188,12 +188,12 @@ public void testEasierParseHttpHeadersInARC() throws IOException { assertEquals(har.getHeader().getUrl(), url, "failed to retrieve Url from metadata"); } - private void assertHeaderCorrectlyParsed(Header[] headers) { + private void assertHeaderCorrectlyParsed(HttpHeader[] headers) { final List orgHeaders = Arrays.asList(HTTPHEADER.split("\r\n")); assertEquals(orgHeaders.size(), headers.length + 1, "not all HTTP header entries have been retrieved"); - for (Header header : headers) { + for (HttpHeader header : headers) { assertTrue(orgHeaders.contains(header.getName() + ": " + header.getValue())); } diff --git a/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java b/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java index dc000265..19b1984f 100644 --- a/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java +++ b/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java @@ -2,8 +2,6 @@ 
import java.net.URISyntaxException; -import org.apache.commons.httpclient.URIException; - import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.assertEquals; diff --git a/src/test/java/org/archive/url/URLParserTest.java b/src/test/java/org/archive/url/URLParserTest.java index ff99fe38..bc8fc3a5 100644 --- a/src/test/java/org/archive/url/URLParserTest.java +++ b/src/test/java/org/archive/url/URLParserTest.java @@ -4,8 +4,6 @@ import java.net.URISyntaxException; import java.net.URLDecoder; -import org.apache.commons.httpclient.URIException; - import com.google.common.net.InetAddresses; import org.junit.jupiter.api.Test; diff --git a/src/test/java/org/archive/url/URLRegexTransformerTest.java b/src/test/java/org/archive/url/URLRegexTransformerTest.java index 01e97aac..73c43f96 100644 --- a/src/test/java/org/archive/url/URLRegexTransformerTest.java +++ b/src/test/java/org/archive/url/URLRegexTransformerTest.java @@ -1,8 +1,6 @@ package org.archive.url; -import org.apache.commons.httpclient.URIException; - import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.assertEquals; diff --git a/src/test/java/org/archive/url/UsableURIFactoryTest.java b/src/test/java/org/archive/url/UsableURIFactoryTest.java index 368cc93d..85d423c0 100644 --- a/src/test/java/org/archive/url/UsableURIFactoryTest.java +++ b/src/test/java/org/archive/url/UsableURIFactoryTest.java @@ -21,7 +21,6 @@ import java.util.TreeMap; -import org.apache.commons.httpclient.URIException; import org.apache.commons.lang.SerializationUtils; import org.junit.jupiter.api.Test; diff --git a/src/test/java/org/archive/url/UsableURITest.java b/src/test/java/org/archive/url/UsableURITest.java index 9a4c1860..161e215a 100644 --- a/src/test/java/org/archive/url/UsableURITest.java +++ b/src/test/java/org/archive/url/UsableURITest.java @@ -20,8 +20,6 @@ import java.net.URISyntaxException; -import org.apache.commons.httpclient.URIException; - import 
org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.*;