From 21c81e518d8896cf50a9d0616cc77957ed3cbbc6 Mon Sep 17 00:00:00 2001 From: Alex Osborne Date: Mon, 19 May 2025 10:03:13 +0900 Subject: [PATCH] Remove dependency on Apache Commons HttpClient 3.1 HttpClient 3 was discontinued in 2007 and frequently triggers alerts in dependency vulnerability scanners. We're also not using much of it anymore, with one big exception. The URI class is the foundation of UsableURI and central to Heritrix, which has made removing the library difficult. URIException in particular appears a lot in client code. HttpClient 4+ has switched to java.net.URI and the main reason Heritrix was built on HttpClient URI instead was because java.net.URI is not flexible and differs from how browsers behave. (Although, how browsers behave has shifted over time.) Eventually we'll probably need to rework Heritrix's URI handling to follow the WHATWG URL spec. However, to let us remove the dependency while keeping UsableURI working, this copies HttpClient 3's URI, URIException and ChunkedInputStream with some small tweaks to remove their dependency on other classes in HttpClient. The HttpClient Header class is replaced with our existing HttpHeader. URI and ChunkedInputStream are marked package private for now. This is a breaking API change and will trigger a bump of the major version number. 
--- pom.xml | 6 +- .../format/gzip/zipnum/ZipNumBlockLoader.java | 2 +- .../httpclient/HttpRecorderGetMethod.java | 134 - .../httpclient/HttpRecorderMethod.java | 109 - .../httpclient/HttpRecorderPostMethod.java | 84 - .../SingleHttpConnectionManager.java | 72 - .../ThreadLocalHttpConnectionManager.java | 293 -- .../org/archive/io/HeaderedArchiveRecord.java | 38 +- .../java/org/archive/io/arc/ARC2WCDX.java | 62 +- .../java/org/archive/io/arc/ARCRecord.java | 36 +- .../java/org/archive/io/warc/WARCRecord.java | 5 +- .../resource/html/HTMLResourceFactory.java | 7 +- src/main/java/org/archive/url/LaxURI.java | 23 +- src/main/java/org/archive/url/SURT.java | 1 - .../java/org/archive/url/SURTTokenizer.java | 1 - src/main/java/org/archive/url/URI.java | 3978 +++++++++++++++++ .../java/org/archive/url/URIException.java | 180 + src/main/java/org/archive/url/UsableURI.java | 9 +- .../org/archive/url/UsableURIFactory.java | 2 - .../org/archive/util/ChunkedInputStream.java | 324 ++ .../java/org/archive/util/LaxHttpParser.java | 44 +- src/main/java/org/archive/util/Recorder.java | 1 - src/main/java/org/archive/util/SURT.java | 2 +- .../impl/HTTPSeekableLineReaderFactory.java | 13 +- .../binsearch/impl/http/ApacheHttp31SLR.java | 235 - .../impl/http/ApacheHttp31SLRFactory.java | 192 - .../archive/io/HeaderedArchiveRecordTest.java | 6 +- .../url/BasicURLCanonicalizerTest.java | 2 - .../java/org/archive/url/URLParserTest.java | 2 - .../archive/url/URLRegexTransformerTest.java | 2 - .../org/archive/url/UsableURIFactoryTest.java | 1 - .../java/org/archive/url/UsableURITest.java | 2 - 32 files changed, 4616 insertions(+), 1252 deletions(-) delete mode 100644 src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java delete mode 100644 src/main/java/org/archive/httpclient/HttpRecorderMethod.java delete mode 100644 src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java delete mode 100644 src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java delete mode 
100644 src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java create mode 100644 src/main/java/org/archive/url/URI.java create mode 100644 src/main/java/org/archive/url/URIException.java create mode 100644 src/main/java/org/archive/util/ChunkedInputStream.java delete mode 100644 src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java delete mode 100644 src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java diff --git a/pom.xml b/pom.xml index c70a2cd7..81bd9b32 100644 --- a/pom.xml +++ b/pom.xml @@ -82,9 +82,9 @@ - commons-httpclient - commons-httpclient - 3.1 + commons-codec + commons-codec + 1.18.0 diff --git a/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java b/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java index 2247eda4..c28ee536 100644 --- a/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java +++ b/src/main/java/org/archive/format/gzip/zipnum/ZipNumBlockLoader.java @@ -31,7 +31,7 @@ public class ZipNumBlockLoader { protected int signDurationSecs = DEFAULT_SIG_DURATION_SECS; protected boolean useNio = false; - protected String httpLib = HttpLibs.APACHE_31.name(); + protected String httpLib = HttpLibs.APACHE_43.name(); protected boolean bufferFully = true; protected boolean noKeepAlive = true; diff --git a/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java deleted file mode 100644 index 1a94af1f..00000000 --- a/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java +++ /dev/null @@ -1,134 +0,0 @@ -/* - * This file is part of the Heritrix web crawler (crawler.archive.org). - * - * Licensed to the Internet Archive (IA) by one or more individual - * contributors. - * - * The IA licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.archive.httpclient; - -import java.io.IOException; -import java.util.logging.Logger; - -import org.apache.commons.httpclient.HttpConnection; -import org.apache.commons.httpclient.HttpException; -import org.apache.commons.httpclient.HttpState; -import org.apache.commons.httpclient.methods.GetMethod; -import org.archive.util.Recorder; - - -/** - * Override of GetMethod that marks the passed HttpRecorder w/ the transition - * from HTTP head to body and that forces a close on the http connection. - * - * The actions done in this subclass used to be done by copying - * org.apache.commons.HttpMethodBase, overlaying our version in place of the - * one that came w/ httpclient. Here is the patch of the difference between - * shipped httpclient code and our mods: - *
- *    -- -1338,6 +1346,12 --
- *
- *        public void releaseConnection() {
- *
- *   +        // HERITRIX always ants the streams closed.
- *   +        if (responseConnection != null)
- *   +        {
- *   +            responseConnection.close();
- *   +        }
- *   +
- *            if (responseStream != null) {
- *                try {
- *                    // FYI - this may indirectly invoke responseBodyConsumed.
- *   -- -1959,6 +1973,11 --
- *                        this.statusLine = null;
- *                    }
- *                }
- *   +            // HERITRIX mark transition from header to content.
- *   +            if (this.httpRecorder != null)
- *   +            {
- *   +                this.httpRecorder.markContentBegin();
- *   +            }
- *                readResponseBody(state, conn);
- *                processResponseBody(state, conn);
- *            } catch (IOException e) {
- * 
- * - *

We're not supposed to have access to the underlying connection object; - * am only violating contract because see cases where httpclient is skipping - * out w/o cleaning up after itself. - * - * @author stack - * @version $Revision$, $Date$ - * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0 - */ -@Deprecated -public class HttpRecorderGetMethod extends GetMethod { - - protected static Logger logger = - Logger.getLogger(HttpRecorderGetMethod.class.getName()); - - /** - * Instance of http recorder method. - */ - protected HttpRecorderMethod httpRecorderMethod = null; - - - public HttpRecorderGetMethod(String uri, Recorder recorder) { - super(uri); - this.httpRecorderMethod = new HttpRecorderMethod(recorder); - } - - protected void readResponseBody(HttpState state, HttpConnection connection) - throws IOException, HttpException { - // We're about to read the body. Mark transition in http recorder. - this.httpRecorderMethod.markContentBegin(connection); - super.readResponseBody(state, connection); - } - - protected boolean shouldCloseConnection(HttpConnection conn) { - // Always close connection after each request. As best I can tell, this - // is superfluous -- we've set our client to be HTTP/1.0. Doing this - // out of paranoia. - return true; - } - - public int execute(HttpState state, HttpConnection conn) - throws HttpException, IOException { - // Save off the connection so we can close it on our way out in case - // httpclient fails to (We're not supposed to have access to the - // underlying connection object; am only violating contract because - // see cases where httpclient is skipping out w/o cleaning up - // after itself). 
- this.httpRecorderMethod.setConnection(conn); - return super.execute(state, conn); - } - - protected void addProxyConnectionHeader(HttpState state, HttpConnection conn) - throws IOException, HttpException { - super.addProxyConnectionHeader(state, conn); - this.httpRecorderMethod.handleAddProxyConnectionHeader(this); - } - - // XXX see https://webarchive.jira.com/browse/HER-2059 - // We never call this method with the implied question mark prepended, so - // adding it does the trick, since commons-httpclient will strip it later. - public void setQueryString(String queryString) { - if (queryString != null) { - super.setQueryString('?' + queryString); - } else { - super.setQueryString(queryString); - } - } - -} diff --git a/src/main/java/org/archive/httpclient/HttpRecorderMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderMethod.java deleted file mode 100644 index b08bc0bd..00000000 --- a/src/main/java/org/archive/httpclient/HttpRecorderMethod.java +++ /dev/null @@ -1,109 +0,0 @@ -/* - * This file is part of the Heritrix web crawler (crawler.archive.org). - * - * Licensed to the Internet Archive (IA) by one or more individual - * contributors. - * - * The IA licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.archive.httpclient; - -import java.util.logging.Logger; - -import org.apache.commons.httpclient.Header; -import org.apache.commons.httpclient.HttpConnection; -import org.apache.commons.httpclient.HttpMethod; -import org.archive.util.Recorder; - - -/** - * This class encapsulates the specializations supplied by the - * overrides {@link HttpRecorderGetMethod} and {@link HttpRecorderPostMethod}. - * - * It keeps instance of HttpRecorder and HttpConnection. - * - * @author stack - * @version $Revision$, $Date$ - * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0 - */ -@Deprecated -public class HttpRecorderMethod { - protected static Logger logger = - Logger.getLogger(HttpRecorderMethod.class.getName()); - - /** - * Instance of http recorder we're using recording this http get. - */ - private Recorder httpRecorder = null; - - /** - * Save around so can force close. - * - * See [ 922080 ] IllegalArgumentException (size is wrong). - * https://sourceforge.net/tracker/?func=detail&aid=922080&group_id=73833&atid=539099 - */ - private HttpConnection connection = null; - - - public HttpRecorderMethod(Recorder recorder) { - this.httpRecorder = recorder; - } - - public void markContentBegin(HttpConnection c) { - if (c != this.connection) { - // We're checking that we're not being asked to work on - // a connection that is other than the one we started - // this method#execute with. - throw new IllegalArgumentException("Connections differ: " + - this.connection + " " + c + " " + - Thread.currentThread().getName()); - } - this.httpRecorder.markContentBegin(); - } - - /** - * @return Returns the connection. - */ - public HttpConnection getConnection() { - return this.connection; - } - - /** - * @param connection The connection to set. - */ - public void setConnection(HttpConnection connection) { - this.connection = connection; - } - /** - * @return Returns the httpRecorder. 
- */ - public Recorder getHttpRecorder() { - return httpRecorder; - } - - /** - * If a 'Proxy-Connection' header has been added to the request, - * it'll be of a 'keep-alive' type. Until we support 'keep-alives', - * override the Proxy-Connection setting and instead pass a 'close' - * (Otherwise every request has to timeout before we notice - * end-of-document). - * @param method Method to find proxy-connection header in. - */ - public void handleAddProxyConnectionHeader(HttpMethod method) { - Header h = method.getRequestHeader("Proxy-Connection"); - if (h != null) { - h.setValue("close"); - method.setRequestHeader(h); - } - } -} diff --git a/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java deleted file mode 100644 index d55d816a..00000000 --- a/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java +++ /dev/null @@ -1,84 +0,0 @@ -/* - * This file is part of the Heritrix web crawler (crawler.archive.org). - * - * Licensed to the Internet Archive (IA) by one or more individual - * contributors. - * - * The IA licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.archive.httpclient; - -import java.io.IOException; - -import org.apache.commons.httpclient.HttpConnection; -import org.apache.commons.httpclient.HttpException; -import org.apache.commons.httpclient.HttpState; -import org.apache.commons.httpclient.methods.PostMethod; -import org.archive.util.Recorder; - - -/** - * Override of PostMethod that marks the passed HttpRecorder w/ the transition - * from HTTP head to body and that forces a close on the responseConnection. - * - * This is a copy of {@link HttpRecorderGetMethod}. Only difference is the - * parent subclass. - * - * @author stack - * @version $Date$ $Revision$ - * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0 - */ -@Deprecated -public class HttpRecorderPostMethod extends PostMethod { - /** - * Instance of http recorder method. - */ - protected HttpRecorderMethod httpRecorderMethod = null; - - - public HttpRecorderPostMethod(String uri, Recorder recorder) { - super(uri); - this.httpRecorderMethod = new HttpRecorderMethod(recorder); - } - - protected void readResponseBody(HttpState state, HttpConnection connection) - throws IOException, HttpException { - // We're about to read the body. Mark transition in http recorder. - this.httpRecorderMethod.markContentBegin(connection); - super.readResponseBody(state, connection); - } - - protected boolean shouldCloseConnection(HttpConnection conn) { - // Always close connection after each request. As best I can tell, this - // is superfluous -- we've set our client to be HTTP/1.0. Doing this - // out of paranoia. 
- return true; - } - - public int execute(HttpState state, HttpConnection conn) - throws HttpException, IOException { - // Save off the connection so we can close it on our way out in case - // httpclient fails to (We're not supposed to have access to the - // underlying connection object; am only violating contract because - // see cases where httpclient is skipping out w/o cleaning up - // after itself). - this.httpRecorderMethod.setConnection(conn); - return super.execute(state, conn); - } - - protected void addProxyConnectionHeader(HttpState state, HttpConnection conn) - throws IOException, HttpException { - super.addProxyConnectionHeader(state, conn); - this.httpRecorderMethod.handleAddProxyConnectionHeader(this); - } -} diff --git a/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java b/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java deleted file mode 100644 index d6cf27ab..00000000 --- a/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java +++ /dev/null @@ -1,72 +0,0 @@ -/* - * This file is part of the Heritrix web crawler (crawler.archive.org). - * - * Licensed to the Internet Archive (IA) by one or more individual - * contributors. - * - * The IA licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.archive.httpclient; - -import java.io.IOException; -import java.io.InputStream; - -import org.apache.commons.httpclient.HostConfiguration; -import org.apache.commons.httpclient.HttpConnection; -import org.apache.commons.httpclient.SimpleHttpConnectionManager; - -/** - * An HttpClient-compatible HttpConnection "manager" that actually - * just gives out a new connection each time -- skipping the overhead - * of connection management, since we already throttle our crawler - * with external mechanisms. - * - * @author gojomo - * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0 - */ -@Deprecated -public class SingleHttpConnectionManager extends SimpleHttpConnectionManager { - - public SingleHttpConnectionManager() { - super(); - } - - public HttpConnection getConnectionWithTimeout( - HostConfiguration hostConfiguration, long timeout) { - - HttpConnection conn = new HttpConnection(hostConfiguration); - conn.setHttpConnectionManager(this); - conn.getParams().setDefaults(this.getParams()); - return conn; - } - - public void releaseConnection(HttpConnection conn) { - // ensure connection is closed - conn.close(); - finishLast(conn); - } - - protected static void finishLast(HttpConnection conn) { - // copied from superclass because it wasn't made available to subclasses - InputStream lastResponse = conn.getLastResponseInputStream(); - if (lastResponse != null) { - conn.setLastResponseInputStream(null); - try { - lastResponse.close(); - } catch (IOException ioe) { - //FIXME: badness - close to force reconnect. 
- conn.close(); - } - } - } -} diff --git a/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java b/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java deleted file mode 100644 index 16821b36..00000000 --- a/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java +++ /dev/null @@ -1,293 +0,0 @@ -/** - * ==================================================================== - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * ==================================================================== - * - */ -package org.archive.httpclient; - -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.logging.Level; -import java.util.logging.Logger; - -import org.apache.commons.httpclient.HostConfiguration; -import org.apache.commons.httpclient.HttpConnection; -import org.apache.commons.httpclient.HttpConnectionManager; -import org.apache.commons.httpclient.params.HttpConnectionManagerParams; - -/** - * A simple, but thread-safe HttpClient {@link HttpConnectionManager}. - * Based on {@link org.apache.commons.httpclient.SimpleHttpConnectionManager}. - * - * Java >= 1.4 is recommended. 
- * - * @author Christian Kohlschuetter - * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0 - */ -@Deprecated -public final class ThreadLocalHttpConnectionManager implements - HttpConnectionManager { - - private static final CloserThread closer = new CloserThread(); - private static final Logger logger = Logger - .getLogger(ThreadLocalHttpConnectionManager.class.getName()); - - private final ThreadLocal tl = new ThreadLocal() { - protected synchronized ConnectionInfo initialValue() { - return new ConnectionInfo(); - } - }; - - private ConnectionInfo getConnectionInfo() { - return (ConnectionInfo) tl.get(); - } - - private static final class ConnectionInfo { - /** The http connection */ - private HttpConnection conn = null; - - /** - * The time the connection was made idle. - */ - private long idleStartTime = Long.MAX_VALUE; - } - - public ThreadLocalHttpConnectionManager() { - } - - /** - * Since the same connection is about to be reused, make sure the - * previous request was completely processed, and if not - * consume it now. - * @param conn The connection - * @return true, if the connection is reusable - */ - private static boolean finishLastResponse(final HttpConnection conn) { - InputStream lastResponse = conn.getLastResponseInputStream(); - if(lastResponse != null) { - conn.setLastResponseInputStream(null); - try { - lastResponse.close(); - return true; - } catch (IOException ioe) { - // force reconnect. - return false; - } - } else { - return false; - } - } - - /** - * Collection of parameters associated with this connection manager. 
- */ - private HttpConnectionManagerParams params = new HttpConnectionManagerParams(); - - /** - * @see HttpConnectionManager#getConnection(HostConfiguration) - */ - public HttpConnection getConnection( - final HostConfiguration hostConfiguration) { - return getConnection(hostConfiguration, 0); - } - - /** - * Gets the staleCheckingEnabled value to be set on HttpConnections that are created. - * - * @return true if stale checking will be enabled on HttpConections - * - * @see HttpConnection#isStaleCheckingEnabled() - * - * @deprecated Use {@link HttpConnectionManagerParams#isStaleCheckingEnabled()}, - * {@link HttpConnectionManager#getParams()}. - */ - public boolean isConnectionStaleCheckingEnabled() { - return this.params.isStaleCheckingEnabled(); - } - - /** - * Sets the staleCheckingEnabled value to be set on HttpConnections that are created. - * - * @param connectionStaleCheckingEnabled true if stale checking will be enabled - * on HttpConections - * - * @see HttpConnection#setStaleCheckingEnabled(boolean) - * - * @deprecated Use {@link HttpConnectionManagerParams#setStaleCheckingEnabled(boolean)}, - * {@link HttpConnectionManager#getParams()}. 
- */ - public void setConnectionStaleCheckingEnabled( - final boolean connectionStaleCheckingEnabled) { - this.params.setStaleCheckingEnabled(connectionStaleCheckingEnabled); - } - - /** - * @see HttpConnectionManager#getConnectionWithTimeout(HostConfiguration, long) - * - * @since 3.0 - */ - public HttpConnection getConnectionWithTimeout( - final HostConfiguration hostConfiguration, final long timeout) { - - final ConnectionInfo ci = getConnectionInfo(); - HttpConnection httpConnection = ci.conn; - - // make sure the host and proxy are correct for this connection - // close it and set the values if they are not - if(httpConnection == null || !finishLastResponse(httpConnection) - || !hostConfiguration.hostEquals(httpConnection) - || !hostConfiguration.proxyEquals(httpConnection)) { - - if(httpConnection != null && httpConnection.isOpen()) { - closer.closeConnection(httpConnection); - } - - httpConnection = new HttpConnection(hostConfiguration); - httpConnection.setHttpConnectionManager(this); - httpConnection.getParams().setDefaults(this.params); - ci.conn = httpConnection; - - httpConnection.setHost(hostConfiguration.getHost()); - httpConnection.setPort(hostConfiguration.getPort()); - httpConnection.setProtocol(hostConfiguration.getProtocol()); - httpConnection.setLocalAddress(hostConfiguration.getLocalAddress()); - - httpConnection.setProxyHost(hostConfiguration.getProxyHost()); - httpConnection.setProxyPort(hostConfiguration.getProxyPort()); - } - - // remove the connection from the timeout handler - ci.idleStartTime = Long.MAX_VALUE; - - return httpConnection; - } - - /** - * @see HttpConnectionManager#getConnection(HostConfiguration, long) - * - * @deprecated Use #getConnectionWithTimeout(HostConfiguration, long) - */ - public HttpConnection getConnection( - final HostConfiguration hostConfiguration, final long timeout) { - return getConnectionWithTimeout(hostConfiguration, timeout); - } - - /** - * @see 
HttpConnectionManager#releaseConnection(org.apache.commons.httpclient.HttpConnection) - */ - public void releaseConnection(final HttpConnection conn) { - final ConnectionInfo ci = getConnectionInfo(); - HttpConnection httpConnection = ci.conn; - - if(conn != httpConnection) { - throw new IllegalStateException( - "Unexpected release of an unknown connection."); - } - - finishLastResponse(httpConnection); - - // track the time the connection was made idle - ci.idleStartTime = System.currentTimeMillis(); - } - - /** - * Returns {@link HttpConnectionManagerParams parameters} associated - * with this connection manager. - * - * @since 2.1 - * - * @see HttpConnectionManagerParams - */ - public HttpConnectionManagerParams getParams() { - return this.params; - } - - /** - * Assigns {@link HttpConnectionManagerParams parameters} for this - * connection manager. - * - * @since 2.1 - * - * @see HttpConnectionManagerParams - */ - public void setParams(final HttpConnectionManagerParams p) { - if(p == null) { - throw new IllegalArgumentException("Parameters may not be null"); - } - this.params = p; - } - - /** - * @since 3.0 - */ - public void closeIdleConnections(final long idleTimeout) { - long maxIdleTime = System.currentTimeMillis() - idleTimeout; - - final ConnectionInfo ci = getConnectionInfo(); - - if(ci.idleStartTime <= maxIdleTime) { - ci.conn.close(); - } - } - - private static final class CloserThread extends Thread { - private List connections - = new ArrayList(); - - private static final int SLEEP_INTERVAL = 5000; - - public CloserThread() { - super("HttpConnection closer"); - // Make this a daemon thread so it can't be responsible for the JVM - // not shutting down. 
- setDaemon(true); - start(); - } - - public void closeConnection(final HttpConnection conn) { - synchronized (connections) { - connections.add(conn); - } - } - - public void run() { - try { - while (!Thread.interrupted()) { - Thread.sleep(SLEEP_INTERVAL); - - List s; - synchronized (connections) { - s = connections; - connections = new ArrayList(); - } - logger.log(Level.INFO, "Closing " + s.size() - + " HttpConnections"); - for(final Iterator it = s.iterator(); - it.hasNext();) { - HttpConnection conn = it.next(); - conn.close(); - conn.setHttpConnectionManager(null); - it.remove(); - } - } - } catch (InterruptedException e) { - return; - } - } - } -} diff --git a/src/main/java/org/archive/io/HeaderedArchiveRecord.java b/src/main/java/org/archive/io/HeaderedArchiveRecord.java index 3cce595b..ac4b82f6 100644 --- a/src/main/java/org/archive/io/HeaderedArchiveRecord.java +++ b/src/main/java/org/archive/io/HeaderedArchiveRecord.java @@ -26,10 +26,7 @@ import java.io.OutputStream; import java.io.PrintStream; -import org.apache.commons.httpclient.Header; -import org.apache.commons.httpclient.HttpParser; -import org.apache.commons.httpclient.StatusLine; -import org.apache.commons.httpclient.util.EncodingUtil; +import org.archive.format.http.HttpHeader; import org.archive.io.arc.ARCConstants; import org.archive.util.LaxHttpParser; @@ -59,7 +56,7 @@ public class HeaderedArchiveRecord extends ArchiveRecord { * * Only available after the reading of headers. 
*/ - private Header [] contentHeaders = null; + private HttpHeader[] contentHeaders = null; public HeaderedArchiveRecord(final ArchiveRecord ar) throws IOException { @@ -149,13 +146,14 @@ private InputStream readContentHeaders() throws IOException { throw new IOException("Failed to read raw lie where one " + " was expected: " + new String(statusBytes)); } - String statusLine = EncodingUtil.getString(statusBytes, 0, + String statusLine = new String(statusBytes, 0, statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); if (statusLine == null) { throw new NullPointerException("Expected status line is null"); } + statusLine = statusLine.trim(); // TODO: Tighten up this test. - boolean isHttpResponse = StatusLine.startsWithHTTP(statusLine); + boolean isHttpResponse = statusLine.startsWith("HTTP"); boolean isHttpRequest = false; if (!isHttpResponse) { isHttpRequest = statusLine.toUpperCase().startsWith("GET") || @@ -165,9 +163,13 @@ private InputStream readContentHeaders() throws IOException { throw new UnexpectedStartLineIOException("Failed parse of " + "status line: " + statusLine); } - this.statusCode = isHttpResponse? - (new StatusLine(statusLine)).getStatusCode(): -1; - + + if (isHttpResponse) { + this.statusCode = parseStatusCode(statusLine); + } else { + this.statusCode = -1; + } + // Save off all bytes read. 
Keep them as bytes rather than // convert to strings so we don't have to worry about encodings // though this should never be a problem doing http headers since @@ -210,7 +212,19 @@ private InputStream readContentHeaders() throws IOException { bais.reset(); return bais; } - + + public static int parseStatusCode(String statusLine) { + int i = statusLine.indexOf(' '); + if (i < 0) return -1; + int j = statusLine.indexOf(' ', i + 1); + if (j < 0) j = statusLine.length(); + try { + return Integer.parseInt(statusLine.substring(i + 1, j)); + } catch (NumberFormatException e) { + return -1; + } + } + public static class UnexpectedStartLineIOException extends RecoverableIOException { private static final long serialVersionUID = 1L; @@ -252,7 +266,7 @@ public int getContentHeadersLength() { return this.contentHeadersLength; } - public Header[] getContentHeaders() { + public HttpHeader[] getContentHeaders() { return contentHeaders; } diff --git a/src/main/java/org/archive/io/arc/ARC2WCDX.java b/src/main/java/org/archive/io/arc/ARC2WCDX.java index 19010131..f0515694 100644 --- a/src/main/java/org/archive/io/arc/ARC2WCDX.java +++ b/src/main/java/org/archive/io/arc/ARC2WCDX.java @@ -22,14 +22,12 @@ import java.io.FileOutputStream; import java.io.IOException; import java.io.PrintStream; -import java.util.Date; -import java.util.Iterator; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.*; import java.util.zip.GZIPOutputStream; -import org.apache.commons.httpclient.Header; -import org.apache.commons.httpclient.HeaderGroup; -import org.apache.commons.httpclient.util.DateParseException; -import org.apache.commons.httpclient.util.DateUtil; +import org.archive.format.http.HttpHeader; import org.archive.io.ArchiveRecord; import org.archive.util.ArchiveUtils; import org.archive.util.SURT; @@ -95,12 +93,15 @@ public static Object[] createWcdx(ARCReader reader) { ARCRecord record = (ARCRecord) iter.next(); record.close(); ARCRecordMetaData h = 
(ARCRecordMetaData) record.getHeader(); - Header[] httpHeaders = record.getHttpHeaders(); + HttpHeader[] httpHeaders = record.getHttpHeaders(); if(httpHeaders==null) { - httpHeaders = new Header[0]; + httpHeaders = new HttpHeader[0]; } - HeaderGroup hg = new HeaderGroup(); - hg.setHeaders(httpHeaders); + Map headerMap = new HashMap<>(); + for (HttpHeader header : httpHeaders) { + headerMap.putIfAbsent(header.getName().toLowerCase(Locale.ROOT), header); + } + StringBuilder builder = new StringBuilder(); // SURT-form URI @@ -108,7 +109,7 @@ public static Object[] createWcdx(ARCReader reader) { // record timestamp ('b') appendField(builder,h.getDate()); // http header date - appendTimeField(builder,hg.getFirstHeader("Date")); + appendTimeField(builder, headerMap.get("date")); // response code ('s') appendField(builder,h.getStatusCode()); // media type ('m') @@ -131,17 +132,17 @@ public static Object[] createWcdx(ARCReader reader) { // uncompressed (declared in ARC headerline) record length appendField(builder,h.getLength()); // http header content-length - appendField(builder,hg.getFirstHeader("Content-Length")); + appendField(builder, headerMap.get("content-length")); // http header mod-date - appendTimeField(builder,hg.getFirstHeader("Last-Modified")); + appendTimeField(builder, headerMap.get("last-modified")); // http header expires - appendTimeField(builder,hg.getFirstHeader("Expires")); + appendTimeField(builder, headerMap.get("expires")); // http header etag - appendField(builder,hg.getFirstHeader("ETag")); + appendField(builder, headerMap.get("etag")); // http header redirect ('Location' header?) 
- appendField(builder,hg.getFirstHeader("Location")); + appendField(builder, headerMap.get("location")); // ip ('e') appendField(builder,h.getIp()); // original URI @@ -186,8 +187,8 @@ protected static void appendField(StringBuilder builder, Object obj) { // prepend with delimiter builder.append(' '); } - if(obj instanceof Header) { - obj = ((Header)obj).getValue().trim(); + if(obj instanceof HttpHeader) { + obj = ((HttpHeader)obj).getValue().trim(); } builder.append((obj==null||obj.toString().length()==0)?"-":obj); @@ -202,16 +203,16 @@ protected static void appendTimeField(StringBuilder builder, Object obj) { builder.append("-"); return; } - if(obj instanceof Header) { - String s = ((Header)obj).getValue().trim(); + if(obj instanceof HttpHeader) { + String s = ((HttpHeader)obj).getValue().trim(); try { - Date date = DateUtil.parseDate(s); + Date date = parseDate(s); String d = ArchiveUtils.get14DigitDate(date); if(d.startsWith("209")) { d = "199"+d.substring(3); } obj = d; - } catch (DateParseException e) { + } catch (ParseException e) { builder.append('e'); return; } @@ -219,6 +220,23 @@ protected static void appendTimeField(StringBuilder builder, Object obj) { } builder.append(obj); } + + private static Date parseDate(String s) throws ParseException { + SimpleDateFormat format = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss zzz", Locale.US); + format.setTimeZone(TimeZone.getTimeZone("GMT")); + format.set2DigitYearStart(new Date(946684800000L)); // year 2000 (epoch millis) + try { + return format.parse(s); + } catch (ParseException e) { + try { + format.applyPattern("EEEE, dd-MMM-yy HH:mm:ss zzz"); + return format.parse(s); + } catch (ParseException e1) { + format.applyPattern("EEE MMM d HH:mm:ss yyyy"); + return format.parse(s); + } + } + } } //'wide' CDX diff --git a/src/main/java/org/archive/io/arc/ARCRecord.java b/src/main/java/org/archive/io/arc/ARCRecord.java index bacaca38..d3c036ba 100644 --- a/src/main/java/org/archive/io/arc/ARCRecord.java +++
b/src/main/java/org/archive/io/arc/ARCRecord.java @@ -32,12 +32,11 @@ import java.util.logging.Logger; import java.util.regex.Matcher; -import org.apache.commons.httpclient.Header; -import org.apache.commons.httpclient.StatusLine; -import org.apache.commons.httpclient.util.EncodingUtil; import org.apache.commons.lang.StringUtils; +import org.archive.format.http.HttpHeader; import org.archive.io.ArchiveRecord; import org.archive.io.ArchiveRecordHeader; +import org.archive.io.HeaderedArchiveRecord; import org.archive.io.RecoverableIOException; import org.archive.util.InetAddressUtil; import org.archive.util.LaxHttpParser; @@ -50,11 +49,11 @@ */ public class ARCRecord extends ArchiveRecord implements ARCConstants { /** - * Http status line object. + * Http status code. * - * May be null if record is not http. + * May be -1 if record is not http. */ - private StatusLine httpStatus = null; + private int statusCode = -1; /** * Http header bytes. @@ -69,7 +68,7 @@ public class ARCRecord extends ArchiveRecord implements ARCConstants { * * Only populated after reading of headers. */ - private Header [] httpHeaders = null; + private HttpHeader[] httpHeaders = null; /** * Array of field names. @@ -589,8 +588,8 @@ private InputStream readHttpHeader() throws IOException { "Failed to read http status where one was expected: " + ((statusBytes == null) ? 
"" : new String(statusBytes))); } - - statusLine = EncodingUtil.getString(statusBytes, 0, + + statusLine = new String(statusBytes, 0, statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); // If a null or DELETED break immediately @@ -600,7 +599,7 @@ private InputStream readHttpHeader() throws IOException { // If it's actually the status line, break, otherwise continue skipping any // previous header values - if (!statusLine.contains(":") && StatusLine.startsWithHTTP(statusLine)) { + if (!statusLine.contains(":") && statusLine.trim().startsWith("HTTP")) { break; } @@ -613,7 +612,7 @@ private InputStream readHttpHeader() throws IOException { } if ((statusLine == null) || - !StatusLine.startsWithHTTP(statusLine)) { + !statusLine.trim().startsWith("HTTP")) { if (statusLine.startsWith("DELETED")) { // Some old ARCs have deleted records like following: // http://vireo.gatech.edu:80/ebt-bin/nph-dweb/dynaweb/SGI_Developer/SGITCL_PG/@Generic__BookTocView/11108%3Btd%3D2 130.207.168.42 19991010131803 text/html 29202 @@ -629,13 +628,12 @@ private InputStream readHttpHeader() throws IOException { } } - try { - this.httpStatus = new StatusLine(statusLine); - } catch(IOException e) { - logger.warning(e.getMessage() + " at offset: " + h.getOffset()); - this.errors.add(ArcRecordErrors.HTTP_STATUS_LINE_EXCEPTION); + this.statusCode = HeaderedArchiveRecord.parseStatusCode(statusLine.trim()); + if (statusCode == -1) { + logger.warning("Bad status line at offset: " + h.getOffset()); + this.errors.add(ArcRecordErrors.HTTP_STATUS_LINE_EXCEPTION); } - + // Save off all bytes read. Keep them as bytes rather than // convert to strings so we don't have to worry about encodings // though this should never be a problem doing http headers since @@ -706,7 +704,7 @@ public DeletedARCRecordIOException(final String reason) { * @return Status code. */ public int getStatusCode() { - return (this.httpStatus == null)? 
-1: this.httpStatus.getStatusCode(); + return statusCode; } /** @@ -735,7 +733,7 @@ public ARCRecordMetaData getMetaData() { /** * @return http headers (Only available after header has been read). */ - public Header [] getHttpHeaders() { + public HttpHeader[] getHttpHeaders() { return this.httpHeaders; } diff --git a/src/main/java/org/archive/io/warc/WARCRecord.java b/src/main/java/org/archive/io/warc/WARCRecord.java index 635d1c3b..cf106270 100644 --- a/src/main/java/org/archive/io/warc/WARCRecord.java +++ b/src/main/java/org/archive/io/warc/WARCRecord.java @@ -29,8 +29,7 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; -import org.apache.commons.httpclient.Header; -import org.apache.commons.httpclient.HttpParser; +import org.archive.format.http.HttpHeader; import org.archive.io.ArchiveRecord; import org.archive.io.ArchiveRecordHeader; import org.archive.util.LaxHttpParser; @@ -123,7 +122,7 @@ protected ArchiveRecordHeader parseHeaders(final InputStream in, // keep count of bytes read, digest and fail properly if EOR too soon... // We don't want digesting while reading Headers. 
// - Header [] h = LaxHttpParser.parseHeaders(in, WARC_HEADER_ENCODING); + HttpHeader[] h = LaxHttpParser.parseHeaders(in, WARC_HEADER_ENCODING); for (int i = 0; i < h.length; i++) { m.put(h[i].getName(), h[i].getValue()); } diff --git a/src/main/java/org/archive/resource/html/HTMLResourceFactory.java b/src/main/java/org/archive/resource/html/HTMLResourceFactory.java index afb1c850..6e95270c 100644 --- a/src/main/java/org/archive/resource/html/HTMLResourceFactory.java +++ b/src/main/java/org/archive/resource/html/HTMLResourceFactory.java @@ -4,9 +4,8 @@ import java.io.IOException; import java.io.InputStream; import java.io.UnsupportedEncodingException; +import java.util.logging.Logger; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; import org.archive.format.http.HttpHeaders; import org.archive.format.json.JSONUtils; import org.archive.format.text.charset.CharsetDetector; @@ -25,7 +24,7 @@ public class HTMLResourceFactory implements ResourceFactory { - public static final Log LOG = LogFactory.getLog(HTMLResourceFactory.class); + private static final Logger LOG = Logger.getLogger(HTMLResourceFactory.class.getName()); protected static final int CHARSET_GUESS_CHUNK_SIZE = 8192; protected static final String HTTP_HEADER_PATH = "Envelope.Payload-Metadata.HTTP-Response-Metadata.Headers"; @@ -58,7 +57,7 @@ public Resource getResource(InputStream is, MetaData parentMetaData, try { charset = charSetDetector.getCharset(chunk, chunkSize, httpHeaders); } catch (Exception e) { - LOG.error("Failed to guess charset: " + e.getMessage()); + LOG.severe("Failed to guess charset: " + e.getMessage()); } } diff --git a/src/main/java/org/archive/url/LaxURI.java b/src/main/java/org/archive/url/LaxURI.java index d7318dfd..57071460 100644 --- a/src/main/java/org/archive/url/LaxURI.java +++ b/src/main/java/org/archive/url/LaxURI.java @@ -18,13 +18,11 @@ */ package org.archive.url; +import java.io.UnsupportedEncodingException; +import 
java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.BitSet; -import org.apache.commons.httpclient.URI; -import org.apache.commons.httpclient.URIException; -import org.apache.commons.httpclient.util.EncodingUtil; - /** * URI subclass which allows partial/inconsistent encoding, matching * the URIs which will be relayed in requests from popular web @@ -121,13 +119,12 @@ protected static String decode(String component, String charset) "Component array of chars may not be null"); } byte[] rawdata = null; - // try { - rawdata = LaxURLCodec.decodeUrlLoose(EncodingUtil - .getAsciiBytes(component)); - // } catch (DecoderException e) { - // throw new URIException(e.getMessage()); - // } - return EncodingUtil.getString(rawdata, charset); + rawdata = LaxURLCodec.decodeUrlLoose(component.getBytes(StandardCharsets.US_ASCII)); + try { + return new String(rawdata, charset); + } catch (UnsupportedEncodingException e) { + return new String(rawdata); + } } // overidden to lax() the acceptable-char BitSet passed in @@ -183,7 +180,7 @@ protected BitSet lax(BitSet generous) { * two instances to one where possible, slimming * instances. * - * @see org.apache.commons.httpclient.URI#parseAuthority(java.lang.String, boolean) + * @see URI#parseAuthority(java.lang.String, boolean) */ protected void parseAuthority(String original, boolean escaped) throws URIException { @@ -204,7 +201,7 @@ protected void parseAuthority(String original, boolean escaped) * long-lived instance from a static field, saving 12-14 bytes * per instance. 
* - * @see org.apache.commons.httpclient.URI#setURI() + * @see URI#setURI() */ protected void setURI() { if (_scheme != null) { diff --git a/src/main/java/org/archive/url/SURT.java b/src/main/java/org/archive/url/SURT.java index 2c8e1b02..3e0bcd55 100644 --- a/src/main/java/org/archive/url/SURT.java +++ b/src/main/java/org/archive/url/SURT.java @@ -6,7 +6,6 @@ import java.util.Iterator; import java.util.logging.Logger; -import org.apache.commons.httpclient.URIException; import org.archive.util.iterator.AbstractPeekableIterator; public class SURT { diff --git a/src/main/java/org/archive/url/SURTTokenizer.java b/src/main/java/org/archive/url/SURTTokenizer.java index da8f58f2..52b80a03 100644 --- a/src/main/java/org/archive/url/SURTTokenizer.java +++ b/src/main/java/org/archive/url/SURTTokenizer.java @@ -19,7 +19,6 @@ */ package org.archive.url; -import org.apache.commons.httpclient.URIException; import org.archive.util.SURT; /** diff --git a/src/main/java/org/archive/url/URI.java b/src/main/java/org/archive/url/URI.java new file mode 100644 index 00000000..e420ca51 --- /dev/null +++ b/src/main/java/org/archive/url/URI.java @@ -0,0 +1,3978 @@ +/* + * $HeadURL: https://svn.apache.org/repos/asf/jakarta/httpcomponents/oac.hc3x/tags/HTTPCLIENT_3_1/src/java/org/apache/commons/httpclient/URI.java $ + * $Revision: 564973 $ + * $Date: 2007-08-11 22:51:47 +0200 (Sat, 11 Aug 2007) $ + * + * ==================================================================== + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * <http://www.apache.org/>. + * + */ + +package org.archive.url; + +import org.apache.commons.codec.DecoderException; +import org.apache.commons.codec.net.URLCodec; + +import java.io.*; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.BitSet; +import java.util.Hashtable; +import java.util.Locale; + +/** + * The interface for the URI(Uniform Resource Identifiers) version of RFC 2396. + * This class has the purpose of supporting the parsing of a URI reference to + * extend any specific protocols, the character encoding of the protocol to + * be transported and the charset of the document. + *

+ * A URI is always in an "escaped" form, since escaping or unescaping a + * completed URI might change its semantics. + *

+ * Implementers should be careful not to escape or unescape the same string + * more than once, since unescaping an already unescaped string might lead to + * misinterpreting a percent data character as another escaped character, + * or vice versa in the case of escaping an already escaped string. + *

+ * In order to avoid these problems, data types used as follows: + *

+ *   URI character sequence: char
+ *   octet sequence: byte
+ *   original character sequence: String
+ * 

+ * + * So, a URI is a sequence of characters as an array of a char type, which + * is not always represented as a sequence of octets as an array of byte. + *

+ * + * URI Syntactic Components + *

+ * - In general, written as follows:
+ *   Absolute URI = <scheme>:<scheme-specific-part>
+ *   Generic URI = <scheme>://<authority><path>?<query>
+ *
+ * - Syntax
+ *   absoluteURI   = scheme ":" ( hier_part | opaque_part )
+ *   hier_part     = ( net_path | abs_path ) [ "?" query ]
+ *   net_path      = "//" authority [ abs_path ]
+ *   abs_path      = "/"  path_segments
+ * 

+ * + * The following examples illustrate URI that are in common use. + *

+ * ftp://ftp.is.co.za/rfc/rfc1808.txt
+ *    -- ftp scheme for File Transfer Protocol services
+ * gopher://spinaltap.micro.umn.edu/00/Weather/California/Los%20Angeles
+ *    -- gopher scheme for Gopher and Gopher+ Protocol services
+ * http://www.math.uio.no/faq/compression-faq/part1.html
+ *    -- http scheme for Hypertext Transfer Protocol services
+ * mailto:mduerst@ifi.unizh.ch
+ *    -- mailto scheme for electronic mail addresses
+ * news:comp.infosystems.www.servers.unix
+ *    -- news scheme for USENET news groups and articles
+ * telnet://melvyl.ucop.edu/
+ *    -- telnet scheme for interactive services via the TELNET Protocol
+ * 
+ * Please, notice that there are many modifications from URL(RFC 1738) and + * relative URL(RFC 1808). + *

+ * The expressions for a URI + *

+ * For escaped URI forms
+ *  - URI(char[]) // constructor
+ *  - char[] getRawXxx() // method
+ *  - String getEscapedXxx() // method
+ *  - String toString() // method
+ * 

+ * For unescaped URI forms + * - URI(String) // constructor + * - String getXXX() // method + *

+ * + * @author Sung-Gu + * @author Mike Bowler + * @version $Revision: 564973 $ $Date: 2002/03/14 15:14:01 + */ +class URI implements Cloneable, Comparable, Serializable { + + + // ----------------------------------------------------------- Constructors + + /** Create an instance as an internal use */ + protected URI() { + } + + /** + * Construct a URI from a string with the given charset. The input string can + * be either in escaped or unescaped form. + * + * @param s URI character sequence + * @param escaped true if URI character sequence is in escaped form. + * false otherwise. + * @param charset the charset string to do escape encoding, if required + * + * @throws URIException If the URI cannot be created. + * @throws NullPointerException if input string is null + * + * @see #getProtocolCharset + * + * @since 3.0 + */ + public URI(String s, boolean escaped, String charset) + throws URIException, NullPointerException { + protocolCharset = charset; + parseUriReference(s, escaped); + } + + /** + * Construct a URI from a string with the given charset. The input string can + * be either in escaped or unescaped form. + * + * @param s URI character sequence + * @param escaped true if URI character sequence is in escaped form. + * false otherwise. + * + * @throws URIException If the URI cannot be created. + * @throws NullPointerException if input string is null + * + * @see #getProtocolCharset + * + * @since 3.0 + */ + public URI(String s, boolean escaped) + throws URIException, NullPointerException { + parseUriReference(s, escaped); + } + + /** + * Construct a URI as an escaped form of a character array with the given + * charset. + * + * @param escaped the URI character sequence + * @param charset the charset string to do escape encoding + * @throws URIException If the URI cannot be created. 
+ * @throws NullPointerException if escaped is null + * @see #getProtocolCharset + * + * @deprecated Use #URI(String, boolean, String) + */ + public URI(char[] escaped, String charset) + throws URIException, NullPointerException { + protocolCharset = charset; + parseUriReference(new String(escaped), true); + } + + + /** + * Construct a URI as an escaped form of a character array. + * An URI can be placed within double-quotes or angle brackets like + * "http://test.com/" and <http://test.com/> + * + * @param escaped the URI character sequence + * @throws URIException If the URI cannot be created. + * @throws NullPointerException if escaped is null + * @see #getDefaultProtocolCharset + * + * @deprecated Use #URI(String, boolean) + */ + public URI(char[] escaped) + throws URIException, NullPointerException { + parseUriReference(new String(escaped), true); + } + + + /** + * Construct a URI from the given string with the given charset. + * + * @param original the string to be represented to URI character sequence + * It is one of absoluteURI and relativeURI. + * @param charset the charset string to do escape encoding + * @throws URIException If the URI cannot be created. + * @see #getProtocolCharset + * + * @deprecated Use #URI(String, boolean, String) + */ + public URI(String original, String charset) throws URIException { + protocolCharset = charset; + parseUriReference(original, false); + } + + + /** + * Construct a URI from the given string. + *

+     *   URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
+     * 

+ * An URI can be placed within double-quotes or angle brackets like + * "http://test.com/" and <http://test.com/> + * + * @param original the string to be represented to URI character sequence + * It is one of absoluteURI and relativeURI. + * @throws URIException If the URI cannot be created. + * @see #getDefaultProtocolCharset + * + * @deprecated Use #URI(String, boolean) + */ + public URI(String original) throws URIException { + parseUriReference(original, false); + } + + + /** + * Construct a general URI from the given components. + *

+     *   URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
+     *   absoluteURI   = scheme ":" ( hier_part | opaque_part )
+     *   opaque_part   = uric_no_slash *uric
+     * 

+ * It's for absolute URI = <scheme>:<scheme-specific-part># + * <fragment>. + * + * @param scheme the scheme string + * @param schemeSpecificPart scheme_specific_part + * @param fragment the fragment string + * @throws URIException If the URI cannot be created. + * @see #getDefaultProtocolCharset + */ + public URI(String scheme, String schemeSpecificPart, String fragment) + throws URIException { + + // validate and construct the URI character sequence + if (scheme == null) { + throw new URIException(URIException.PARSING, "scheme required"); + } + char[] s = scheme.toLowerCase().toCharArray(); + if (validate(s, URI.scheme)) { + _scheme = s; // is_absoluteURI + } else { + throw new URIException(URIException.PARSING, "incorrect scheme"); + } + _opaque = encode(schemeSpecificPart, allowed_opaque_part, + getProtocolCharset()); + // Set flag + _is_opaque_part = true; + _fragment = fragment == null ? null : fragment.toCharArray(); + setURI(); + } + + + /** + * Construct a general URI from the given components. + *

+     *   URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
+     *   absoluteURI   = scheme ":" ( hier_part | opaque_part )
+     *   relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
+     *   hier_part     = ( net_path | abs_path ) [ "?" query ]
+     * 

+ * It's for absolute URI = <scheme>:<path>?<query>#< + * fragment> and relative URI = <path>?<query>#<fragment + * >. + * + * @param scheme the scheme string + * @param authority the authority string + * @param path the path string + * @param query the query string + * @param fragment the fragment string + * @throws URIException If the new URI cannot be created. + * @see #getDefaultProtocolCharset + */ + public URI(String scheme, String authority, String path, String query, + String fragment) throws URIException { + + // validate and construct the URI character sequence + StringBuffer buff = new StringBuffer(); + if (scheme != null) { + buff.append(scheme); + buff.append(':'); + } + if (authority != null) { + buff.append("//"); + buff.append(authority); + } + if (path != null) { // accept empty path + if ((scheme != null || authority != null) + && !path.startsWith("/")) { + throw new URIException(URIException.PARSING, + "abs_path requested"); + } + buff.append(path); + } + if (query != null) { + buff.append('?'); + buff.append(query); + } + if (fragment != null) { + buff.append('#'); + buff.append(fragment); + } + parseUriReference(buff.toString(), false); + } + + + /** + * Construct a general URI from the given components. + * + * @param scheme the scheme string + * @param userinfo the userinfo string + * @param host the host string + * @param port the port number + * @throws URIException If the new URI cannot be created. + * @see #getDefaultProtocolCharset + */ + public URI(String scheme, String userinfo, String host, int port) + throws URIException { + + this(scheme, userinfo, host, port, null, null, null); + } + + + /** + * Construct a general URI from the given components. + * + * @param scheme the scheme string + * @param userinfo the userinfo string + * @param host the host string + * @param port the port number + * @param path the path string + * @throws URIException If the new URI cannot be created. 
+ * @see #getDefaultProtocolCharset + */ + public URI(String scheme, String userinfo, String host, int port, + String path) throws URIException { + + this(scheme, userinfo, host, port, path, null, null); + } + + + /** + * Construct a general URI from the given components. + * + * @param scheme the scheme string + * @param userinfo the userinfo string + * @param host the host string + * @param port the port number + * @param path the path string + * @param query the query string + * @throws URIException If the new URI cannot be created. + * @see #getDefaultProtocolCharset + */ + public URI(String scheme, String userinfo, String host, int port, + String path, String query) throws URIException { + + this(scheme, userinfo, host, port, path, query, null); + } + + + /** + * Construct a general URI from the given components. + * + * @param scheme the scheme string + * @param userinfo the userinfo string + * @param host the host string + * @param port the port number + * @param path the path string + * @param query the query string + * @param fragment the fragment string + * @throws URIException If the new URI cannot be created. + * @see #getDefaultProtocolCharset + */ + public URI(String scheme, String userinfo, String host, int port, + String path, String query, String fragment) throws URIException { + + this(scheme, (host == null) ? null + : ((userinfo != null) ? userinfo + '@' : "") + host + + ((port != -1) ? ":" + port : ""), path, query, fragment); + } + + + /** + * Construct a general URI from the given components. + * + * @param scheme the scheme string + * @param host the host string + * @param path the path string + * @param fragment the fragment string + * @throws URIException If the new URI cannot be created. 
+ * @see #getDefaultProtocolCharset + */ + public URI(String scheme, String host, String path, String fragment) + throws URIException { + + this(scheme, host, path, null, fragment); + } + + + /** + * Construct a general URI with the given relative URI string. + * + * @param base the base URI + * @param relative the relative URI string + * @throws URIException If the new URI cannot be created. + * + * @deprecated Use #URI(URI, String, boolean) + */ + public URI(URI base, String relative) throws URIException { + this(base, new URI(relative)); + } + + + /** + * Construct a general URI with the given relative URI string. + * + * @param base the base URI + * @param relative the relative URI string + * @param escaped true if URI character sequence is in escaped form. + * false otherwise. + * + * @throws URIException If the new URI cannot be created. + * + * @since 3.0 + */ + public URI(URI base, String relative, boolean escaped) throws URIException { + this(base, new URI(relative, escaped)); + } + + + /** + * Construct a general URI with the given relative URI. + *

+     *   URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
+     *   relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
+     * 

+ * Resolving Relative References to Absolute Form. + * + * Examples of Resolving Relative URI References + * + * Within an object with a well-defined base URI of + *

+     *   http://a/b/c/d;p?q
+     * 

+ * the relative URI would be resolved as follows: + * + * Normal Examples + * + *

+     *   g:h           =  g:h
+     *   g             =  http://a/b/c/g
+     *   ./g           =  http://a/b/c/g
+     *   g/            =  http://a/b/c/g/
+     *   /g            =  http://a/g
+     *   //g           =  http://g
+     *   ?y            =  http://a/b/c/?y
+     *   g?y           =  http://a/b/c/g?y
+     *   #s            =  (current document)#s
+     *   g#s           =  http://a/b/c/g#s
+     *   g?y#s         =  http://a/b/c/g?y#s
+     *   ;x            =  http://a/b/c/;x
+     *   g;x           =  http://a/b/c/g;x
+     *   g;x?y#s       =  http://a/b/c/g;x?y#s
+     *   .             =  http://a/b/c/
+     *   ./            =  http://a/b/c/
+     *   ..            =  http://a/b/
+     *   ../           =  http://a/b/
+     *   ../g          =  http://a/b/g
+     *   ../..         =  http://a/
+     *   ../../        =  http://a/ 
+     *   ../../g       =  http://a/g
+     * 

+ * + * Some URI schemes do not allow a hierarchical syntax matching the + * syntax, and thus cannot use relative references. + * + * @param base the base URI + * @param relative the relative URI + * @throws URIException If the new URI cannot be created. + */ + public URI(URI base, URI relative) throws URIException { + + if (base._scheme == null) { + throw new URIException(URIException.PARSING, "base URI required"); + } + if (base._scheme != null) { + this._scheme = base._scheme; + this._authority = base._authority; + this._is_net_path = base._is_net_path; + } + if (base._is_opaque_part || relative._is_opaque_part) { + this._scheme = base._scheme; + this._is_opaque_part = base._is_opaque_part + || relative._is_opaque_part; + this._opaque = relative._opaque; + this._fragment = relative._fragment; + this.setURI(); + return; + } + boolean schemesEqual = Arrays.equals(base._scheme,relative._scheme); + if (relative._scheme != null + && (!schemesEqual || relative._authority != null)) { + this._scheme = relative._scheme; + this._is_net_path = relative._is_net_path; + this._authority = relative._authority; + if (relative._is_server) { + this._is_server = relative._is_server; + this._userinfo = relative._userinfo; + this._host = relative._host; + this._port = relative._port; + } else if (relative._is_reg_name) { + this._is_reg_name = relative._is_reg_name; + } + this._is_abs_path = relative._is_abs_path; + this._is_rel_path = relative._is_rel_path; + this._path = relative._path; + } else if (base._authority != null && relative._scheme == null) { + this._is_net_path = base._is_net_path; + this._authority = base._authority; + if (base._is_server) { + this._is_server = base._is_server; + this._userinfo = base._userinfo; + this._host = base._host; + this._port = base._port; + } else if (base._is_reg_name) { + this._is_reg_name = base._is_reg_name; + } + } + if (relative._authority != null) { + this._is_net_path = relative._is_net_path; + this._authority = relative._authority; + 
if (relative._is_server) { + this._is_server = relative._is_server; + this._userinfo = relative._userinfo; + this._host = relative._host; + this._port = relative._port; + } else if (relative._is_reg_name) { + this._is_reg_name = relative._is_reg_name; + } + this._is_abs_path = relative._is_abs_path; + this._is_rel_path = relative._is_rel_path; + this._path = relative._path; + } + // resolve the path and query if necessary + if (relative._authority == null + && (relative._scheme == null || schemesEqual)) { + if ((relative._path == null || relative._path.length == 0) + && relative._query == null) { + // handle a reference to the current document, see RFC 2396 + // section 5.2 step 2 + this._path = base._path; + this._query = base._query; + } else { + this._path = resolvePath(base._path, relative._path); + } + } + // base._query removed + if (relative._query != null) { + this._query = relative._query; + } + // base._fragment removed + if (relative._fragment != null) { + this._fragment = relative._fragment; + } + this.setURI(); + // reparse the newly built URI, this will ensure that all flags are set correctly. + // TODO there must be a better way to do this + parseUriReference(new String(_uri), true); + } + + // --------------------------------------------------- Instance Variables + + /** Version ID for serialization */ + static final long serialVersionUID = 604752400577948726L; + + + /** + * Cache the hash code for this URI. + */ + protected int hash = 0; + + + /** + * This Uniform Resource Identifier (URI). + * The URI is always in an "escaped" form, since escaping or unescaping + * a completed URI might change its semantics. + */ + protected char[] _uri = null; + + + /** + * The charset of the protocol used by this URI instance. + */ + protected String protocolCharset = null; + + + /** + * The default charset of the protocol. RFC 2277, 2396 + */ + protected static String defaultProtocolCharset = "UTF-8"; + + + /** + * The default charset of the document. 
+     * RFC 2277, 2396
+     * The charset mapped from the default locale is used when one is
+     * available; otherwise the platform's "file.encoding" is used.
+     */
+    protected static String defaultDocumentCharset = null;
+    protected static String defaultDocumentCharsetByLocale = null;
+    protected static String defaultDocumentCharsetByPlatform = null;
+    // Static initializer for defaultDocumentCharset
+    static {
+        Locale locale = Locale.getDefault();
+        // in order to support backward compatibility
+        if (locale != null) {
+            defaultDocumentCharsetByLocale =
+                LocaleToCharsetMap.getCharset(locale);
+            // set the default document charset
+            defaultDocumentCharset = defaultDocumentCharsetByLocale;
+        }
+        // in order to support platform encoding
+        try {
+            defaultDocumentCharsetByPlatform = System.getProperty("file.encoding");
+        } catch (SecurityException ignore) {
+            // Deliberately ignored: without permission to read the property
+            // the platform charset simply stays null.
+        }
+        if (defaultDocumentCharset == null) {
+            // set the default document charset
+            defaultDocumentCharset = defaultDocumentCharsetByPlatform;
+        }
+    }
+
+
+    /**
+     * The scheme.
+     */
+    protected char[] _scheme = null;
+
+
+    /**
+     * The opaque.
+     */
+    protected char[] _opaque = null;
+
+
+    /**
+     * The authority.
+     */
+    protected char[] _authority = null;
+
+
+    /**
+     * The userinfo.
+     */
+    protected char[] _userinfo = null;
+
+
+    /**
+     * The host.
+     */
+    protected char[] _host = null;
+
+
+    /**
+     * The port. Defaults to -1.
+     */
+    protected int _port = -1;
+
+
+    /**
+     * The path.
+     */
+    protected char[] _path = null;
+
+
+    /**
+     * The query.
+     */
+    protected char[] _query = null;
+
+
+    /**
+     * The fragment.
+     */
+    protected char[] _fragment = null;
+
+
+    /**
+     * The root path: the single character "/".
+     */
+    protected static final char[] rootPath = { '/' };
+
+    // ---------------------- Generous characters for each component validation
+
+    /**
+     * The percent "%" character always has the reserved purpose of being the
+     * escape indicator, it must be escaped as "%25" in order to be used as
+     * data within a URI.
+     */
+    protected static final BitSet percent = new BitSet(256);
+    // The set contains exactly one character: '%'.
+    static {
+        percent.set('%');
+    }
+
+
+    /**
+     * BitSet for digit.
+     *

+     * digit    = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
+     *            "8" | "9"
+     * 

+     */
+    protected static final BitSet digit = new BitSet(256);
+    // Mark each of the ten decimal digit characters.
+    static {
+        for (char c : "0123456789".toCharArray()) {
+            digit.set(c);
+        }
+    }
+
+
+    /**
+     * BitSet for alpha.
+     *

+     * alpha         = lowalpha | upalpha
+     * 

+     */
+    protected static final BitSet alpha = new BitSet(256);
+    // Mark 'a'..'z' together with the matching upper-case letters 'A'..'Z'.
+    static {
+        for (char lower = 'a'; lower <= 'z'; lower++) {
+            alpha.set(lower);
+            alpha.set(Character.toUpperCase(lower));
+        }
+    }
+
+
+    /**
+     * BitSet for alphanum (join of alpha and digit).
+     *

+     *  alphanum      = alpha | digit
+     * 

+     */
+    protected static final BitSet alphanum = new BitSet(256);
+    // Union of the alpha and digit sets.
+    static {
+        alphanum.or(alpha);
+        alphanum.or(digit);
+    }
+
+
+    /**
+     * BitSet for hex.
+     *

+     * hex           = digit | "A" | "B" | "C" | "D" | "E" | "F" |
+     *                         "a" | "b" | "c" | "d" | "e" | "f"
+     * 

+     */
+    protected static final BitSet hex = new BitSet(256);
+    // Decimal digits plus the letters a-f in both cases.
+    static {
+        hex.or(digit);
+        for (char c : "abcdefABCDEF".toCharArray()) {
+            hex.set(c);
+        }
+    }
+
+
+    /**
+     * BitSet for escaped.
+     *

+     * escaped       = "%" hex hex
+     * 

+     */
+    protected static final BitSet escaped = new BitSet(256);
+    // As a character set this only records that '%' and the hex digits may
+    // appear; it does not enforce the "%" hex hex triplet structure.
+    static {
+        escaped.or(percent);
+        escaped.or(hex);
+    }
+
+
+    /**
+     * BitSet for mark.
+     *

+     * mark          = "-" | "_" | "." | "!" | "~" | "*" | "'" |
+     *                 "(" | ")"
+     * 

+     */
+    protected static final BitSet mark = new BitSet(256);
+    // Mark each of the nine punctuation characters listed in the grammar.
+    static {
+        for (char c : "-_.!~*'()".toCharArray()) {
+            mark.set(c);
+        }
+    }
+
+
+    /**
+     * Data characters that are allowed in a URI but do not have a reserved
+     * purpose are called unreserved.
+     *

+     * unreserved    = alphanum | mark
+     * 

+     */
+    protected static final BitSet unreserved = new BitSet(256);
+    // Union of the alphanum and mark sets.
+    static {
+        unreserved.or(alphanum);
+        unreserved.or(mark);
+    }
+
+
+    /**
+     * BitSet for reserved.
+     *

+     * reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
+     *                 "$" | ","
+     * 

+     */
+    protected static final BitSet reserved = new BitSet(256);
+    // Mark each of the ten reserved delimiters listed in the grammar.
+    static {
+        for (char c : ";/?:@&=+$,".toCharArray()) {
+            reserved.set(c);
+        }
+    }
+
+
+    /**
+     * BitSet for uric.
+     *

+     * uric          = reserved | unreserved | escaped
+     * 

+     */
+    protected static final BitSet uric = new BitSet(256);
+    // Union of the reserved, unreserved and escaped sets.
+    static {
+        uric.or(reserved);
+        uric.or(unreserved);
+        uric.or(escaped);
+    }
+
+
+    /**
+     * BitSet for fragment (alias for uric).
+     *

+     * fragment      = *uric
+     * 

+     */
+    // Same BitSet instance as uric, not a copy.
+    protected static final BitSet fragment = uric;
+
+
+    /**
+     * BitSet for query (alias for uric).
+     *

+     * query         = *uric
+     * 

+     */
+    // Same BitSet instance as uric, not a copy.
+    protected static final BitSet query = uric;
+
+
+    /**
+     * BitSet for pchar.
+     *

+     * pchar         = unreserved | escaped |
+     *                 ":" | "@" | "&" | "=" | "+" | "$" | ","
+     * 

+     */
+    protected static final BitSet pchar = new BitSet(256);
+    // unreserved and escaped characters plus the seven extra delimiters.
+    static {
+        pchar.or(unreserved);
+        pchar.or(escaped);
+        for (char c : ":@&=+$,".toCharArray()) {
+            pchar.set(c);
+        }
+    }
+
+
+    /**
+     * BitSet for param (alias for pchar).
+     *

+     * param         = *pchar
+     * 

+     */
+    // Same BitSet instance as pchar, not a copy.
+    protected static final BitSet param = pchar;
+
+
+    /**
+     * BitSet for segment.
+     *

+     * segment       = *pchar *( ";" param )
+     * 

+     */
+    protected static final BitSet segment = new BitSet(256);
+    // pchar plus the ';' parameter separator. param is the same instance
+    // as pchar, so the final or() adds no further characters.
+    static {
+        segment.or(pchar);
+        segment.set(';');
+        segment.or(param);
+    }
+
+
+    /**
+     * BitSet for path segments.
+     *

+     * path_segments = segment *( "/" segment )
+     * 

+     */
+    protected static final BitSet path_segments = new BitSet(256);
+    // The segment characters plus the '/' separator.
+    static {
+        path_segments.set('/');
+        path_segments.or(segment);
+    }
+
+
+    /**
+     * URI absolute path.
+     *

+     * abs_path      = "/"  path_segments
+     * 

+     */
+    protected static final BitSet abs_path = new BitSet(256);
+    // path_segments already contains '/', so as a character set abs_path
+    // ends up identical to path_segments.
+    static {
+        abs_path.set('/');
+        abs_path.or(path_segments);
+    }
+
+
+    /**
+     * URI bitset for encoding typical non-slash characters.
+     *

+     * uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" |
+     *                 "&" | "=" | "+" | "$" | ","
+     * 

+     */
+    protected static final BitSet uric_no_slash = new BitSet(256);
+    // Static initializer for uric_no_slash: unreserved and escaped
+    // characters plus every reserved delimiter except '/'.
+    static {
+        uric_no_slash.or(unreserved);
+        uric_no_slash.or(escaped);
+        uric_no_slash.set(';');
+        uric_no_slash.set('?');
+        // The grammar lists ':' here; setting ';' a second time instead
+        // would leave ':' out of the set.
+        uric_no_slash.set(':');
+        uric_no_slash.set('@');
+        uric_no_slash.set('&');
+        uric_no_slash.set('=');
+        uric_no_slash.set('+');
+        uric_no_slash.set('$');
+        uric_no_slash.set(',');
+    }
+
+
+    /**
+     * URI bitset that combines uric_no_slash and uric.
+     *

+     * opaque_part   = uric_no_slash *uric
+     * 

+     */
+    protected static final BitSet opaque_part = new BitSet(256);
+    // Static initializer for opaque_part
+    static {
+        // Deliberately generous: the grammar forbids a slash only as the
+        // first character, but a character set cannot express position,
+        // so every uric character (including '/') is accepted.
+        opaque_part.or(uric_no_slash);
+        opaque_part.or(uric);
+    }
+
+
+    /**
+     * URI bitset that combines absolute path and opaque part.
+     *

+     * path          = [ abs_path | opaque_part ]
+     * 

+     */
+    protected static final BitSet path = new BitSet(256);
+    // Union of the abs_path and opaque_part sets.
+    static {
+        path.or(abs_path);
+        path.or(opaque_part);
+    }
+
+
+    /**
+     * Port, a logical alias for digit.
+     * This is the same BitSet instance as digit, not a copy.
+     */
+    protected static final BitSet port = digit;
+
+
+    /**
+     * Bitset that combines digit and dot for IPv4address.
+     *

+     * IPv4address   = 1*digit "." 1*digit "." 1*digit "." 1*digit
+     * 

+     */
+    protected static final BitSet IPv4address = new BitSet(256);
+    // Digits plus the '.' separator; the four-group structure is not
+    // enforced by this character set.
+    static {
+        IPv4address.or(digit);
+        IPv4address.set('.');
+    }
+
+
+    /**
+     * RFC 2373.
+     *

+     * IPv6address = hexpart [ ":" IPv4address ]
+     * 

+     */
+    protected static final BitSet IPv6address = new BitSet(256);
+    // Hex digits, the ':' group separator and the IPv4address characters
+    // (for a trailing dotted-quad).
+    static {
+        IPv6address.or(hex); // hexpart
+        IPv6address.set(':');
+        IPv6address.or(IPv4address);
+    }
+
+
+    /**
+     * RFC 2732, 2373.
+     *

+     * IPv6reference   = "[" IPv6address "]"
+     * 

+     */
+    protected static final BitSet IPv6reference = new BitSet(256);
+    // The IPv6address characters plus the enclosing square brackets.
+    static {
+        IPv6reference.set('[');
+        IPv6reference.or(IPv6address);
+        IPv6reference.set(']');
+    }
+
+
+    /**
+     * BitSet for toplabel.
+     *

+     * toplabel      = alpha | alpha *( alphanum | "-" ) alphanum
+     * 

+     */
+    protected static final BitSet toplabel = new BitSet(256);
+    // alphanum plus '-'. The rule that a label starts with a letter and
+    // ends with an alphanum is not enforced by this character set.
+    static {
+        toplabel.or(alphanum);
+        toplabel.set('-');
+    }
+
+
+    /**
+     * BitSet for domainlabel.
+     *

+     * domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum
+     * 

+     */
+    // Same BitSet instance as toplabel, not a copy.
+    protected static final BitSet domainlabel = toplabel;
+
+
+    /**
+     * BitSet for hostname.
+     *

+     * hostname      = *( domainlabel "." ) toplabel [ "." ]
+     * 

+     */
+    protected static final BitSet hostname = new BitSet(256);
+    // The toplabel characters plus the '.' label separator.
+    static {
+        hostname.or(toplabel);
+        // domainlabel is the same instance as toplabel, so or-ing it in
+        // would add nothing:
+        // hostname.or(domainlabel);
+        hostname.set('.');
+    }
+
+
+    /**
+     * BitSet for host.
+     *

+     * host          = hostname | IPv4address | IPv6reference
+     * 

+     */
+    protected static final BitSet host = new BitSet(256);
+    // Union of hostname and IPv6reference. IPv4address is not or-ed in
+    // separately because IPv6reference already contains its characters.
+    static {
+        host.or(hostname);
+        // host.or(IPv4address);
+        host.or(IPv6reference); // IPv4address
+    }
+
+
+    /**
+     * BitSet for hostport.
+     *

+     * hostport      = host [ ":" port ]
+     * 

+     */
+    protected static final BitSet hostport = new BitSet(256);
+    // The host characters, the ':' separator and the port digits.
+    static {
+        hostport.or(host);
+        hostport.set(':');
+        hostport.or(port);
+    }
+
+
+    /**
+     * Bitset for userinfo.
+     *

+     * userinfo      = *( unreserved | escaped |
+     *                    ";" | ":" | "&" | "=" | "+" | "$" | "," )
+     * 

+     */
+    protected static final BitSet userinfo = new BitSet(256);
+    // unreserved and escaped characters plus the seven extra delimiters.
+    static {
+        userinfo.or(unreserved);
+        userinfo.or(escaped);
+        for (char c : ";:&=+$,".toCharArray()) {
+            userinfo.set(c);
+        }
+    }
+
+
+    /**
+     * BitSet for within the userinfo component like user and password.
+     * Starts from userinfo and removes the delimiters that are reserved
+     * within the authority.
+     */
+    public static final BitSet within_userinfo = new BitSet(256);
+    // Copy userinfo, then clear the delimiters reserved within authority.
+    static {
+        within_userinfo.or(userinfo);
+        for (char c : ";:@?/".toCharArray()) {
+            within_userinfo.clear(c);
+        }
+    }
+
+
+    /**
+     * Bitset for server.
+     *

+     * server        = [ [ userinfo "@" ] hostport ]
+     * 

+     */
+    protected static final BitSet server = new BitSet(256);
+    // The userinfo characters, the '@' separator and the hostport characters.
+    static {
+        server.or(userinfo);
+        server.set('@');
+        server.or(hostport);
+    }
+
+
+    /**
+     * BitSet for reg_name.
+     *

+     * reg_name      = 1*( unreserved | escaped | "$" | "," |
+     *                     ";" | ":" | "@" | "&" | "=" | "+" )
+     * 

+     */
+    protected static final BitSet reg_name = new BitSet(256);
+    // unreserved and escaped characters plus the eight extra delimiters.
+    static {
+        reg_name.or(unreserved);
+        reg_name.or(escaped);
+        for (char c : "$,;:@&=+".toCharArray()) {
+            reg_name.set(c);
+        }
+    }
+
+
+    /**
+     * BitSet for authority.
+     *

+     * authority     = server | reg_name
+     * 

+     */
+    protected static final BitSet authority = new BitSet(256);
+    // Union of the server and reg_name sets.
+    static {
+        authority.or(server);
+        authority.or(reg_name);
+    }
+
+
+    /**
+     * BitSet for scheme.
+     *

+     * scheme        = alpha *( alpha | digit | "+" | "-" | "." )
+     * 

+     */
+    protected static final BitSet scheme = new BitSet(256);
+    // Letters, digits and the three punctuation characters of the grammar.
+    // The rule that a scheme starts with a letter is not enforced here.
+    static {
+        scheme.or(alpha);
+        scheme.or(digit);
+        for (char c : "+-.".toCharArray()) {
+            scheme.set(c);
+        }
+    }
+
+
+    /**
+     * BitSet for rel_segment.
+     *

+     * rel_segment   = 1*( unreserved | escaped |
+     *                     ";" | "@" | "&" | "=" | "+" | "$" | "," )
+     * 

+     */
+    protected static final BitSet rel_segment = new BitSet(256);
+    // unreserved and escaped characters plus the seven extra delimiters.
+    static {
+        rel_segment.or(unreserved);
+        rel_segment.or(escaped);
+        for (char c : ";@&=+$,".toCharArray()) {
+            rel_segment.set(c);
+        }
+    }
+
+
+    /**
+     * BitSet for rel_path.
+     *

+     * rel_path      = rel_segment [ abs_path ]
+     * 

+     */
+    protected static final BitSet rel_path = new BitSet(256);
+    // Union of the rel_segment and abs_path sets.
+    static {
+        rel_path.or(rel_segment);
+        rel_path.or(abs_path);
+    }
+
+
+    /**
+     * BitSet for net_path.
+     *

+     * net_path      = "//" authority [ abs_path ]
+     * 

+     */
+    protected static final BitSet net_path = new BitSet(256);
+    // '/' (for the leading "//") plus the authority and abs_path characters.
+    static {
+        net_path.set('/');
+        net_path.or(authority);
+        net_path.or(abs_path);
+    }
+
+
+    /**
+     * BitSet for hier_part.
+     *

+     * hier_part     = ( net_path | abs_path ) [ "?" query ]
+     * 

+     */
+    protected static final BitSet hier_part = new BitSet(256);
+    // Union of the net_path, abs_path and query sets.
+    static {
+        hier_part.or(net_path);
+        hier_part.or(abs_path);
+        // hier_part.set('?'); already included
+        hier_part.or(query);
+    }
+
+
+    /**
+     * BitSet for relativeURI.
+     *

+     * relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
+     * 

+     */
+    protected static final BitSet relativeURI = new BitSet(256);
+    // Union of the net_path, abs_path, rel_path and query sets.
+    static {
+        relativeURI.or(net_path);
+        relativeURI.or(abs_path);
+        relativeURI.or(rel_path);
+        // relativeURI.set('?'); already included
+        relativeURI.or(query);
+    }
+
+
+    /**
+     * BitSet for absoluteURI.
+     *

+     * absoluteURI   = scheme ":" ( hier_part | opaque_part )
+     * 

+     */
+    protected static final BitSet absoluteURI = new BitSet(256);
+    // The scheme characters, the ':' separator, and the hier_part and
+    // opaque_part sets.
+    static {
+        absoluteURI.or(scheme);
+        absoluteURI.set(':');
+        absoluteURI.or(hier_part);
+        absoluteURI.or(opaque_part);
+    }
+
+
+    /**
+     * BitSet for URI-reference.
+     *

+     * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
+     * 

+     */
+    protected static final BitSet URI_reference = new BitSet(256);
+    // absoluteURI, relativeURI and fragment characters plus the '#'
+    // fragment separator.
+    static {
+        URI_reference.set('#');
+        URI_reference.or(absoluteURI);
+        URI_reference.or(relativeURI);
+        URI_reference.or(fragment);
+    }
+
+    // ---------------------------- Characters disallowed within the URI syntax
+    // Excluded US-ASCII Characters are like control, space, delims and unwise
+
+    /**
+     * BitSet for control: the codes 0x00 through 0x1F plus 0x7F.
+     */
+    public static final BitSet control = new BitSet(256);
+    // Range set for 0x00..0x1F (upper bound exclusive), then DEL.
+    static {
+        control.set(0x00, 0x20);
+        control.set(0x7F);
+    }
+
+    /**
+     * BitSet for space: the single character 0x20.
+     */
+    public static final BitSet space = new BitSet(256);
+    // The set contains exactly one character.
+    static {
+        space.set(' ');
+    }
+
+
+    /**
+     * BitSet for delims: angle brackets, '#', '%' and the double quote.
+     */
+    public static final BitSet delims = new BitSet(256);
+    // Mark each of the five delimiter characters.
+    static {
+        for (char c : "<>#%\"".toCharArray()) {
+            delims.set(c);
+        }
+    }
+
+
+    /**
+     * BitSet for unwise: braces, '|', backslash, '^', square brackets and
+     * the backtick.
+     */
+    public static final BitSet unwise = new BitSet(256);
+    // Mark each of the eight unwise characters.
+    static {
+        for (char c : "{}|\\^[]`".toCharArray()) {
+            unwise.set(c);
+        }
+    }
+
+
+    /**
+     * Disallowed rel_path before escaping: every uric character that is
+     * not also a rel_path character.
+     */
+    public static final BitSet disallowed_rel_path = new BitSet(256);
+    // Set difference: uric minus rel_path.
+    static {
+        disallowed_rel_path.or(uric);
+        disallowed_rel_path.andNot(rel_path);
+    }
+
+
+    /**
+     * Disallowed opaque_part before escaping.
+     */
+    public static final BitSet disallowed_opaque_part = new BitSet(256);
+    // Set difference: uric minus opaque_part.
+    // NOTE(review): opaque_part is itself built by or-ing in uric, so this
+    // difference looks like it is always empty - confirm that is intended.
+    static {
+        disallowed_opaque_part.or(uric);
+        disallowed_opaque_part.andNot(opaque_part);
+    }
+
+    // ----------------------- Characters allowed within and for each component
+
+    /**
+     * Those characters that are allowed for the authority component.
+     * This is the authority set with '%' removed.
+     */
+    public static final BitSet allowed_authority = new BitSet(256);
+    // Static initializer for allowed_authority
+    static {
+        allowed_authority.or(authority);
+        allowed_authority.clear('%');
+    }
+
+
+    /**
+     * Those characters that are allowed for the opaque_part.
+     * This is the opaque_part set with '%' removed.
+     */
+    public static final BitSet allowed_opaque_part = new BitSet(256);
+    // Static initializer for allowed_opaque_part
+    static {
+        allowed_opaque_part.or(opaque_part);
+        allowed_opaque_part.clear('%');
+    }
+
+
+    /**
+     * Those characters that are allowed for the reg_name.
+     * This is the reg_name set with '%' removed.
+     */
+    public static final BitSet allowed_reg_name = new BitSet(256);
+    // Static initializer for allowed_reg_name
+    static {
+        allowed_reg_name.or(reg_name);
+        // allowed_reg_name.andNot(percent);
+        allowed_reg_name.clear('%');
+    }
+
+
+    /**
+     * Those characters that are allowed for the userinfo component.
+     * This is the userinfo set with '%' removed.
+     */
+    public static final BitSet allowed_userinfo = new BitSet(256);
+    // Static initializer for allowed_userinfo
+    static {
+        allowed_userinfo.or(userinfo);
+        // allowed_userinfo.andNot(percent);
+        allowed_userinfo.clear('%');
+    }
+
+
+    /**
+     * Those characters that are allowed for within the userinfo component.
+     * This is the within_userinfo set with '%' removed.
+     */
+    public static final BitSet allowed_within_userinfo = new BitSet(256);
+    // Static initializer for allowed_within_userinfo
+    static {
+        allowed_within_userinfo.or(within_userinfo);
+        allowed_within_userinfo.clear('%');
+    }
+
+
+    /**
+     * Those characters that are allowed for the IPv6reference component.
+     * The characters '[', ']' in IPv6reference should be excluded.
+     */
+    public static final BitSet allowed_IPv6reference = new BitSet(256);
+    // Static initializer for allowed_IPv6reference
+    static {
+        allowed_IPv6reference.or(IPv6reference);
+        // allowed_IPv6reference.andNot(unwise);
+        allowed_IPv6reference.clear('[');
+        allowed_IPv6reference.clear(']');
+    }
+
+
+    /**
+     * Those characters that are allowed for the host component.
+     * The characters '[', ']' in IPv6reference should be excluded.
+     */
+    public static final BitSet allowed_host = new BitSet(256);
+    // Union of hostname and the bracket-free allowed_IPv6reference set.
+    static {
+        allowed_host.or(hostname);
+        allowed_host.or(allowed_IPv6reference);
+    }
+
+
+    /**
+     * Those characters that are allowed within the authority component:
+     * the server and reg_name sets with the delimiters ';', ':', '@', '?'
+     * and '/' removed.
+     */
+    public static final BitSet allowed_within_authority = new BitSet(256);
+    // Static initializer for allowed_within_authority
+    static {
+        allowed_within_authority.or(server);
+        allowed_within_authority.or(reg_name);
+        allowed_within_authority.clear(';');
+        allowed_within_authority.clear(':');
+        allowed_within_authority.clear('@');
+        allowed_within_authority.clear('?');
+        allowed_within_authority.clear('/');
+    }
+
+
+    /**
+     * Those characters that are allowed for the abs_path.
+     * This is the abs_path set with '%' and '+' removed.
+     */
+    public static final BitSet allowed_abs_path = new BitSet(256);
+    // Static initializer for allowed_abs_path
+    static {
+        allowed_abs_path.or(abs_path);
+        // allowed_abs_path.set('/');  // already included
+        allowed_abs_path.andNot(percent);
+        allowed_abs_path.clear('+');
+    }
+
+
+    /**
+     * Those characters that are allowed for the rel_path.
+     * This is the rel_path set with '%' and '+' removed.
+     */
+    public static final BitSet allowed_rel_path = new BitSet(256);
+    // Static initializer for allowed_rel_path
+    static {
+        allowed_rel_path.or(rel_path);
+        allowed_rel_path.clear('%');
+        allowed_rel_path.clear('+');
+    }
+
+
+    /**
+     * Those characters that are allowed within the path.
+     */
+    public static final BitSet allowed_within_path = new BitSet(256);
+    // The abs_path set with the delimiters '/', ';', '=' and '?' removed.
+    static {
+        allowed_within_path.or(abs_path);
+        allowed_within_path.clear('/');
+        allowed_within_path.clear(';');
+        allowed_within_path.clear('=');
+        allowed_within_path.clear('?');
+    }
+
+
+    /**
+     * Those characters that are allowed for the query component.
+     * This is the uric set with '%' removed.
+     */
+    public static final BitSet allowed_query = new BitSet(256);
+    // Static initializer for allowed_query
+    static {
+        allowed_query.or(uric);
+        allowed_query.clear('%');
+    }
+
+
+    /**
+     * Those characters that are allowed within the query component.
+     * This is allowed_query with every reserved character removed.
+     */
+    public static final BitSet allowed_within_query = new BitSet(256);
+    // Static initializer for allowed_within_query
+    static {
+        allowed_within_query.or(allowed_query);
+        allowed_within_query.andNot(reserved); // excluded 'reserved'
+    }
+
+
+    /**
+     * Those characters that are allowed for the fragment component.
+     * This is the uric set with '%' removed.
+     */
+    public static final BitSet allowed_fragment = new BitSet(256);
+    // Static initializer for allowed_fragment
+    static {
+        allowed_fragment.or(uric);
+        allowed_fragment.clear('%');
+    }
+
+    // ------------------------------------------- Flags for this URI-reference
+
+    // TODO: Figure out what all these variables are for and provide javadoc
+
+    // URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
+    // absoluteURI   = scheme ":" ( hier_part | opaque_part )
+    protected boolean _is_hier_part;
+    protected boolean _is_opaque_part;
+    // relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
+    // hier_part     = ( net_path | abs_path ) [ "?" query ]
+    protected boolean _is_net_path;
+    protected boolean _is_abs_path;
+    protected boolean _is_rel_path;
+    // net_path      = "//" authority [ abs_path ]
+    // authority     = server | reg_name
+    protected boolean _is_reg_name;
+    protected boolean _is_server;  // = _has_server
+    // server        = [ [ userinfo "@" ] hostport ]
+    // host          = hostname | IPv4address | IPv6reference
+    protected boolean _is_hostname;
+    protected boolean _is_IPv4address;
+    protected boolean _is_IPv6reference;
+
+    // ------------------------------------------ Character and escape encoding
+
+    /**
+     * Encodes URI string.
+     *
+     * This is a two-step mapping, one from original characters to octets,
+     * and subsequently a second from octets to URI characters:
+     *

+     *   original character sequence->octet sequence->URI character sequence
+     * 

+ * + * An escaped octet is encoded as a character triplet, consisting of the + * percent character "%" followed by the two hexadecimal digits + * representing the octet code. For example, "%20" is the escaped + * encoding for the US-ASCII space character. + *

+ * Conversion from the local filesystem character set to UTF-8 will + * normally involve a two step process. First convert the local character + * set to the UCS; then convert the UCS to UTF-8. + * The first step in the process can be performed by maintaining a mapping + * table that includes the local character set code and the corresponding + * UCS code. + * The next step is to convert the UCS character code to the UTF-8 encoding. + *

+ * Mapping between vendor codepages can be done in a very similar manner + * as described above. + *

+     * Escape encoding may only be applied while a URI is being created
+     * from its component parts. The given string is converted to octets
+     * in the given charset, and every octet outside the allowed set is
+     * percent-escaped.
+     *
+     * @param original the original character sequence
+     * @param allowed those characters that are allowed within a component
+     * @param charset the protocol charset
+     * @return URI character sequence
+     * @throws IllegalArgumentException if original or allowed is null
+     * @throws URIException declared for callers; not thrown directly here
+     */
+
+    protected static char[] encode(String original, BitSet allowed,
+            String charset) throws URIException {
+        if (original == null) {
+            throw new IllegalArgumentException("Original string may not be null");
+        }
+        if (allowed == null) {
+            throw new IllegalArgumentException("Allowed bitset may not be null");
+        }
+        // Step 1: characters to octets; step 2: octets to escaped ASCII.
+        final byte[] octets = getBytes(original, charset);
+        final byte[] escapedOctets = URLCodec.encodeUrl(allowed, octets);
+        final String ascii = new String(escapedOctets, StandardCharsets.US_ASCII);
+        return ascii.toCharArray();
+    }
+
+    /**
+     * Converts the string to bytes in the named charset, falling back to
+     * the platform default charset when the name is not supported.
+     */
+    private static byte[] getBytes(String original, String charset) {
+        byte[] encoded;
+        try {
+            encoded = original.getBytes(charset);
+        } catch (UnsupportedEncodingException e) {
+            encoded = original.getBytes();
+        }
+        return encoded;
+    }
+
+    /**
+     * Decodes URI encoded string.
+     *
+     * This is a two-step mapping, one from URI characters to octets, and
+     * subsequently a second from octets to original characters:
+     *

+     *   URI character sequence->octet sequence->original character sequence
+     * 

+     *
+     * A URI must be separated into its components before the escaped
+     * characters within those components can be safely decoded.
+     *

+ * Notice that there is a chance that URI characters that are non UTF-8 + * may be parsed as valid UTF-8. A recent non-scientific analysis found + * that EUC encoded Japanese words had a 2.7% false reading; SJIS had a + * 0.0005% false reading; other encoding such as ASCII or KOI-8 have a 0% + * false reading. + *

+ * The percent "%" character always has the reserved purpose of being + * the escape indicator, it must be escaped as "%25" in order to be used + * as data within a URI. + *

+     * The unescape method is internally performed within this method.
+     *
+     * @param component the URI character sequence
+     * @param charset the protocol charset
+     * @return original character sequence
+     * @throws IllegalArgumentException if component is null
+     * @throws URIException if the String overload fails to decode
+     */
+    protected static String decode(char[] component, String charset)
+        throws URIException {
+        if (component == null) {
+            throw new IllegalArgumentException("Component array of chars may not be null");
+        }
+        // Delegate to the String overload.
+        final String escapedText = String.valueOf(component);
+        return decode(escapedText, charset);
+    }
+
+    /**
+     * Decodes URI encoded string.
+     *
+     * This is a two-step mapping, one from URI characters to octets, and
+     * subsequently a second from octets to original characters:
+     *

+     *   URI character sequence->octet sequence->original character sequence
+     * 

+     *
+     * A URI must be separated into its components before the escaped
+     * characters within those components can be safely decoded.
+     *

+ * Notice that there is a chance that URI characters that are non UTF-8 + * may be parsed as valid UTF-8. A recent non-scientific analysis found + * that EUC encoded Japanese words had a 2.7% false reading; SJIS had a + * 0.0005% false reading; other encoding such as ASCII or KOI-8 have a 0% + * false reading. + *

+ * The percent "%" character always has the reserved purpose of being + * the escape indicator, it must be escaped as "%25" in order to be used + * as data within a URI. + *

+     * The unescape method is internally performed within this method.
+     * If the named charset is not supported, the decoded octets are
+     * converted with the platform default charset instead.
+     *
+     * @param component the URI character sequence
+     * @param charset the protocol charset
+     * @return original character sequence
+     * @throws IllegalArgumentException if component is null
+     * @throws URIException if the escape sequences cannot be decoded
+     *
+     * @since 3.0
+     */
+    protected static String decode(String component, String charset)
+        throws URIException {
+        if (component == null) {
+            // This overload takes a String, so the message says so.
+            throw new IllegalArgumentException("Component string may not be null");
+        }
+        byte[] rawdata = null;
+        try {
+            rawdata = URLCodec.decodeUrl(component.getBytes(StandardCharsets.US_ASCII));
+        } catch (DecoderException e) {
+            throw new URIException(e.getMessage());
+        }
+        try {
+            return new String(rawdata, charset);
+        } catch (UnsupportedEncodingException e) {
+            // Best-effort fallback to the platform default charset.
+            return new String(rawdata);
+        }
+    }
+    /**
+     * Pre-validate the unescaped URI string within a specific component.
+     *
+     * @param component the component string within the component
+     * @param disallowed those characters disallowed within the component
+     * @return if true, it doesn't have the disallowed characters
+     * if false, the component is undefined or an incorrect one
+     */
+    protected boolean prevalidate(String component, BitSet disallowed) {
+        // prevalidate the given component by disallowed characters
+        if (component == null) {
+            return false; // undefined
+        }
+        for (char c : component.toCharArray()) {
+            if (disallowed.get(c)) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+
+    /**
+     * Validate the URI characters within a specific component.
+     * The component must be performed after escape encoding. Or it doesn't
+     * include escaped characters.
+     *
+     * @param component the characters sequence within the component
+     * @param generous those characters that are allowed within a component
+     * @return if true, it's the correct URI character sequence
+     */
+    protected boolean validate(char[] component, BitSet generous) {
+        // Check the whole array: offset 0 through the end (-1 means
+        // "up to the last character" in the four-argument overload).
+        return validate(component, 0, -1, generous);
+    }
+
+
+    /**
+     * Validate the URI characters within a specific component.
+     * The component must be performed after escape encoding. Or it doesn't
+     * include escaped characters.
+     *

+     * This check is generous rather than strict: it only tests that each
+     * character belongs to the given set. Any stricter validation is
+     * expected to happen before this method is called.
+     *
+     * @param component the characters sequence within the component
+     * @param soffset the starting offset of the given component
+     * @param eoffset the ending offset of the given component (inclusive)
+     * if -1, it means the length of the component
+     * @param generous those characters that are allowed within a component
+     * @return if true, it's the correct URI character sequence
+     */
+    protected boolean validate(char[] component, int soffset, int eoffset,
+        BitSet generous) {
+        // -1 is shorthand for "through the last character".
+        final int last = (eoffset == -1) ? component.length - 1 : eoffset;
+        int idx = soffset;
+        while (idx <= last) {
+            if (!generous.get(component[idx])) {
+                return false;
+            }
+            idx++;
+        }
+        return true;
+    }
+
+
+    /**
+     * In order to avoid any possibility of conflict with non-ASCII characters,
+     * Parse a URI reference as a String with the character
+     * encoding of the local system or the document.
+     *

+ * The following line is the regular expression for breaking-down a URI + * reference into its components. + *

+     *   ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
+     *    12            3  4          5       6  7        8 9
+     * 

+ * For example, matching the above expression to + * http://jakarta.apache.org/ietf/uri/#Related + * results in the following subexpression matches: + *

+     *               $1 = http:
+     *  scheme    =  $2 = http
+     *               $3 = //jakarta.apache.org
+     *  authority =  $4 = jakarta.apache.org
+     *  path      =  $5 = /ietf/uri/
+     *               $6 = 
+     *  query     =  $7 = 
+     *               $8 = #Related
+     *  fragment  =  $9 = Related
+     * 

+ * + * @param original the original character sequence + * @param escaped true if original is escaped + * @throws URIException If an error occurs. + */ + protected void parseUriReference(String original, boolean escaped) + throws URIException { + + // validate and contruct the URI character sequence + if (original == null) { + throw new URIException("URI-Reference required"); + } + + /* @ + * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? + */ + String tmp = original.trim(); + + /* + * The length of the string sequence of characters. + * It may not be equal to the length of the byte array. + */ + int length = tmp.length(); + + /* + * Remove the delimiters like angle brackets around an URI. + */ + if (length > 0) { + char[] firstDelimiter = { tmp.charAt(0) }; + if (validate(firstDelimiter, delims)) { + if (length >= 2) { + char[] lastDelimiter = { tmp.charAt(length - 1) }; + if (validate(lastDelimiter, delims)) { + tmp = tmp.substring(1, length - 1); + length = length - 2; + } + } + } + } + + /* + * The starting index + */ + int from = 0; + + /* + * The test flag whether the URI is started from the path component. + */ + boolean isStartedFromPath = false; + int atColon = tmp.indexOf(':'); + int atSlash = tmp.indexOf('/'); + if ((atColon <= 0 && !tmp.startsWith("//")) + || (atSlash >= 0 && atSlash < atColon)) { + isStartedFromPath = true; + } + + /* + *

+         *     @@@@@@@@
+         *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
+         * 

+         */
+        // Locate the first component delimiter; ':' only counts as one when
+        // the reference is not known to start with its path.
+        int at = indexFirstOf(tmp, isStartedFromPath ? "/?#" : ":/?#", from);
+        if (at == -1) {
+            // no delimiter at all: continue parsing from the beginning
+            at = 0;
+        }
+
+        /*
+         * Parse the scheme.
+         *

+         *  scheme    =  $2 = http
+         *              @
+         *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
+         * 

+         */
+        if (at > 0 && at < length && tmp.charAt(at) == ':') {
+            // Lower-case with a fixed locale: the default-locale variant maps
+            // 'I' to a dotless 'ı' under e.g. a Turkish locale, which would make
+            // a perfectly valid scheme such as "FILE" fail validation below.
+            char[] target = tmp.substring(0, at)
+                .toLowerCase(java.util.Locale.ROOT).toCharArray();
+            if (validate(target, scheme)) {
+                _scheme = target;
+            } else {
+                throw new URIException("incorrect scheme");
+            }
+            // continue right after the ':'
+            from = ++at;
+        }
+
+        /*
+         * Parse the authority component.
+         *

+         *  authority =  $4 = jakarta.apache.org
+         *                  @@
+         *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
+         * 

+         */
+        // Reset flags
+        _is_net_path = _is_abs_path = _is_rel_path = _is_hier_part = false;
+        if (0 <= at && at < length && tmp.charAt(at) == '/') {
+            // Set flag
+            _is_hier_part = true;
+            // "//" introduces an authority, but only when at least one more
+            // character follows it and the reference did not start from a path
+            if (at + 2 < length && tmp.charAt(at + 1) == '/'
+                && !isStartedFromPath) {
+                // the temporary index to start the search from
+                int next = indexFirstOf(tmp, "/?#", at + 2);
+                if (next == -1) {
+                    // the authority runs to the end of the string
+                    next = (tmp.substring(at + 2).length() == 0) ? at + 2
+                        : tmp.length();
+                }
+                parseAuthority(tmp.substring(at + 2, next), escaped);
+                from = at = next;
+                // Set flag
+                _is_net_path = true;
+            }
+            // from and at coincide both after a scheme ("scheme:/...") and
+            // after an authority has been consumed
+            if (from == at) {
+                // Set flag
+                _is_abs_path = true;
+            }
+        }
+
+        /*
+         * Parse the path component.
+         *

+         *  path      =  $5 = /ietf/uri/
+         *                                @@@@@@
+         *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
+         * 

+         */
+        if (from < length) {
+            // rel_path = rel_segment [ abs_path ]
+            int next = indexFirstOf(tmp, "?#", from);
+            if (next == -1) {
+                // neither query nor fragment: the path runs to the end
+                next = tmp.length();
+            }
+            if (!_is_abs_path) {
+                // Classify the path: unescaped input is checked against the
+                // disallowed sets, escaped input against the grammar sets.
+                if (!escaped
+                    && prevalidate(tmp.substring(from, next), disallowed_rel_path)
+                    || escaped
+                    && validate(tmp.substring(from, next).toCharArray(), rel_path)) {
+                    // Set flag
+                    _is_rel_path = true;
+                } else if (!escaped
+                    && prevalidate(tmp.substring(from, next), disallowed_opaque_part)
+                    || escaped
+                    && validate(tmp.substring(from, next).toCharArray(), opaque_part)) {
+                    // Set flag
+                    _is_opaque_part = true;
+                } else {
+                    // the path component may be empty
+                    _path = null;
+                }
+            }
+            String s = tmp.substring(from, next);
+            // the setters validate or encode according to the flags set above
+            if (escaped) {
+                setRawPath(s.toCharArray());
+            } else {
+                setPath(s);
+            }
+            at = next;
+        }
+
+        // set the charset to do escape encoding
+        String charset = getProtocolCharset();
+
+        /*
+         * Parse the query component.
+         *

+         *  query     =  $7 = 
+         *                                        @@@@@@@@@
+         *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
+         * 

+         */
+        // A '?' that is the very last character is skipped by this condition,
+        // so a trailing "?" leaves _query untouched.
+        if (0 <= at && at + 1 < length && tmp.charAt(at) == '?') {
+            // the query ends at the fragment marker, or at the end of input
+            int next = tmp.indexOf('#', at + 1);
+            if (next == -1) {
+                next = tmp.length();
+            }
+            if (escaped) {
+                // already escaped: take it verbatim, but insist it is valid
+                _query = tmp.substring(at + 1, next).toCharArray();
+                if (!validate(_query, uric)) {
+                    throw new URIException("Invalid query");
+                }
+            } else {
+                _query = encode(tmp.substring(at + 1, next), allowed_query, charset);
+            }
+            at = next;
+        }
+
+        /*
+         * Parse the fragment component.
+         *

+         *  fragment  =  $9 = Related
+         *                                                   @@@@@@@@
+         *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
+         * 

+ */ + if (0 <= at && at + 1 <= length && tmp.charAt(at) == '#') { + if (at + 1 == length) { // empty fragment + _fragment = "".toCharArray(); + } else { + _fragment = (escaped) ? tmp.substring(at + 1).toCharArray() + : encode(tmp.substring(at + 1), allowed_fragment, charset); + } + } + + // set this URI. + setURI(); + } + + + /** + * Get the earlier index that to be searched for the first occurrance in + * one of any of the given string. + * + * @param s the string to be indexed + * @param delims the delimiters used to index + * @return the earlier index if there are delimiters + */ + protected int indexFirstOf(String s, String delims) { + return indexFirstOf(s, delims, -1); + } + + + /** + * Get the earlier index that to be searched for the first occurrance in + * one of any of the given string. + * + * @param s the string to be indexed + * @param delims the delimiters used to index + * @param offset the from index + * @return the earlier index if there are delimiters + */ + protected int indexFirstOf(String s, String delims, int offset) { + if (s == null || s.length() == 0) { + return -1; + } + if (delims == null || delims.length() == 0) { + return -1; + } + // check boundaries + if (offset < 0) { + offset = 0; + } else if (offset > s.length()) { + return -1; + } + // s is never null + int min = s.length(); + char[] delim = delims.toCharArray(); + for (int i = 0; i < delim.length; i++) { + int at = s.indexOf(delim[i], offset); + if (at >= 0 && at < min) { + min = at; + } + } + return (min == s.length()) ? -1 : min; + } + + + /** + * Get the earlier index that to be searched for the first occurrance in + * one of any of the given array. 
+ * + * @param s the character array to be indexed + * @param delim the delimiter used to index + * @return the ealier index if there are a delimiter + */ + protected int indexFirstOf(char[] s, char delim) { + return indexFirstOf(s, delim, 0); + } + + + /** + * Get the earlier index that to be searched for the first occurrance in + * one of any of the given array. + * + * @param s the character array to be indexed + * @param delim the delimiter used to index + * @param offset The offset. + * @return the ealier index if there is a delimiter + */ + protected int indexFirstOf(char[] s, char delim, int offset) { + if (s == null || s.length == 0) { + return -1; + } + // check boundaries + if (offset < 0) { + offset = 0; + } else if (offset > s.length) { + return -1; + } + for (int i = offset; i < s.length; i++) { + if (s[i] == delim) { + return i; + } + } + return -1; + } + + + /** + * Parse the authority component. + * + * @param original the original character sequence of authority component + * @param escaped true if original is escaped + * @throws URIException If an error occurs. + */ + protected void parseAuthority(String original, boolean escaped) + throws URIException { + + // Reset flags + _is_reg_name = _is_server = + _is_hostname = _is_IPv4address = _is_IPv6reference = false; + + // set the charset to do escape encoding + String charset = getProtocolCharset(); + + boolean hasPort = true; + int from = 0; + int next = original.indexOf('@'); + if (next != -1) { // neither -1 and 0 + // each protocol extented from URI supports the specific userinfo + _userinfo = (escaped) ? 
original.substring(0, next).toCharArray() + : encode(original.substring(0, next), allowed_userinfo, + charset); + from = next + 1; + } + next = original.indexOf('[', from); + if (next >= from) { + next = original.indexOf(']', from); + if (next == -1) { + throw new URIException(URIException.PARSING, "IPv6reference"); + } else { + next++; + } + // In IPv6reference, '[', ']' should be excluded + _host = (escaped) ? original.substring(from, next).toCharArray() + : encode(original.substring(from, next), allowed_IPv6reference, + charset); + // Set flag + _is_IPv6reference = true; + } else { // only for !_is_IPv6reference + next = original.indexOf(':', from); + if (next == -1) { + next = original.length(); + hasPort = false; + } + // REMINDME: it doesn't need the pre-validation + _host = original.substring(from, next).toCharArray(); + if (validate(_host, IPv4address)) { + // Set flag + _is_IPv4address = true; + } else if (validate(_host, hostname)) { + // Set flag + _is_hostname = true; + } else { + // Set flag + _is_reg_name = true; + } + } + if (_is_reg_name) { + // Reset flags for a server-based naming authority + _is_server = _is_hostname = _is_IPv4address = + _is_IPv6reference = false; + // set a registry-based naming authority + if (escaped) { + _authority = original.toCharArray(); + if (!validate(_authority, reg_name)) { + throw new URIException("Invalid authority"); + } + } else { + _authority = encode(original, allowed_reg_name, charset); + } + } else { + if (original.length() - 1 > next && hasPort + && original.charAt(next) == ':') { // not empty + from = next + 1; + try { + _port = Integer.parseInt(original.substring(from)); + } catch (NumberFormatException error) { + throw new URIException(URIException.PARSING, + "invalid port number"); + } + } + // set a server-based naming authority + StringBuffer buf = new StringBuffer(); + if (_userinfo != null) { // has_userinfo + buf.append(_userinfo); + buf.append('@'); + } + if (_host != null) { + buf.append(_host); + 
if (_port != -1) { + buf.append(':'); + buf.append(_port); + } + } + _authority = buf.toString().toCharArray(); + // Set flag + _is_server = true; + } + } + + + /** + * Once it's parsed successfully, set this URI. + * + * @see #getRawURI + */ + protected void setURI() { + // set _uri + StringBuffer buf = new StringBuffer(); + // ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? + if (_scheme != null) { + buf.append(_scheme); + buf.append(':'); + } + if (_is_net_path) { + buf.append("//"); + if (_authority != null) { // has_authority + buf.append(_authority); + } + } + if (_opaque != null && _is_opaque_part) { + buf.append(_opaque); + } else if (_path != null) { + // _is_hier_part or _is_relativeURI + if (_path.length != 0) { + buf.append(_path); + } + } + if (_query != null) { // has_query + buf.append('?'); + buf.append(_query); + } + // ignore the fragment identifier + _uri = buf.toString().toCharArray(); + hash = 0; + } + + // ----------------------------------------------------------- Test methods + + + /** + * Tell whether or not this URI is absolute. + * + * @return true iif this URI is absoluteURI + */ + public boolean isAbsoluteURI() { + return (_scheme != null); + } + + + /** + * Tell whether or not this URI is relative. + * + * @return true iif this URI is relativeURI + */ + public boolean isRelativeURI() { + return (_scheme == null); + } + + + /** + * Tell whether or not the absoluteURI of this URI is hier_part. + * + * @return true iif the absoluteURI is hier_part + */ + public boolean isHierPart() { + return _is_hier_part; + } + + + /** + * Tell whether or not the absoluteURI of this URI is opaque_part. + * + * @return true iif the absoluteURI is opaque_part + */ + public boolean isOpaquePart() { + return _is_opaque_part; + } + + + /** + * Tell whether or not the relativeURI or heir_part of this URI is net_path. + * It's the same function as the has_authority() method. 
+ * + * @return true iif the relativeURI or heir_part is net_path + * @see #hasAuthority + */ + public boolean isNetPath() { + return _is_net_path || (_authority != null); + } + + + /** + * Tell whether or not the relativeURI or hier_part of this URI is abs_path. + * + * @return true iif the relativeURI or hier_part is abs_path + */ + public boolean isAbsPath() { + return _is_abs_path; + } + + + /** + * Tell whether or not the relativeURI of this URI is rel_path. + * + * @return true iif the relativeURI is rel_path + */ + public boolean isRelPath() { + return _is_rel_path; + } + + + /** + * Tell whether or not this URI has authority. + * It's the same function as the is_net_path() method. + * + * @return true iif this URI has authority + * @see #isNetPath + */ + public boolean hasAuthority() { + return (_authority != null) || _is_net_path; + } + + /** + * Tell whether or not the authority component of this URI is reg_name. + * + * @return true iif the authority component is reg_name + */ + public boolean isRegName() { + return _is_reg_name; + } + + + /** + * Tell whether or not the authority component of this URI is server. + * + * @return true iif the authority component is server + */ + public boolean isServer() { + return _is_server; + } + + + /** + * Tell whether or not this URI has userinfo. + * + * @return true iif this URI has userinfo + */ + public boolean hasUserinfo() { + return (_userinfo != null); + } + + + /** + * Tell whether or not the host part of this URI is hostname. + * + * @return true iif the host part is hostname + */ + public boolean isHostname() { + return _is_hostname; + } + + + /** + * Tell whether or not the host part of this URI is IPv4address. + * + * @return true iif the host part is IPv4address + */ + public boolean isIPv4address() { + return _is_IPv4address; + } + + + /** + * Tell whether or not the host part of this URI is IPv6reference. 
+ * + * @return true iif the host part is IPv6reference + */ + public boolean isIPv6reference() { + return _is_IPv6reference; + } + + + /** + * Tell whether or not this URI has query. + * + * @return true iif this URI has query + */ + public boolean hasQuery() { + return (_query != null); + } + + + /** + * Tell whether or not this URI has fragment. + * + * @return true iif this URI has fragment + */ + public boolean hasFragment() { + return (_fragment != null); + } + + + // ---------------------------------------------------------------- Charset + + + /** + * Set the default charset of the protocol. + *

+ * The character set used to store files SHALL remain a local decision and + * MAY depend on the capability of local operating systems. Prior to the + * exchange of URIs they SHOULD be converted into a ISO/IEC 10646 format + * and UTF-8 encoded. This approach, while allowing international exchange + * of URIs, will still allow backward compatibility with older systems + * because the code set positions for ASCII characters are identical to the + * one byte sequence in UTF-8. + *

+ * An individual URI scheme may require a single charset, define a default + * charset, or provide a way to indicate the charset used. + * + *

+     * The setter always succeeds, and it always throws a
+     * DefaultCharsetChanged exception afterwards.
+     *
+     * So API programmers must call it in the following way:
+     *

+     *  import org.archive.url.URI.DefaultCharsetChanged;
+     *      .
+     *      .
+     *      .
+     *  try {
+     *      URI.setDefaultProtocolCharset("UTF-8");
+     *  } catch (DefaultCharsetChanged cc) {
+     *      // CASE 1: the exception could be ignored, when it is set by user
+     *      if (cc.getReasonCode() == DefaultCharsetChanged.PROTOCOL_CHARSET) {
+     *      // CASE 2: let user know the default protocol charset changed
+     *      } else {
+     *      // CASE 2: let user know the default document charset changed
+     *      }
+     *  }
+     *  
+ * + * The API programmer is responsible to set the correct charset. + * And each application should remember its own charset to support. + * + * @param charset the default charset for each protocol + * @throws DefaultCharsetChanged default charset changed + */ + public static void setDefaultProtocolCharset(String charset) + throws DefaultCharsetChanged { + + defaultProtocolCharset = charset; + throw new DefaultCharsetChanged(DefaultCharsetChanged.PROTOCOL_CHARSET, + "the default protocol charset changed"); + } + + + /** + * Get the default charset of the protocol. + *

+ * An individual URI scheme may require a single charset, define a default + * charset, or provide a way to indicate the charset used. + *

+ * To work globally either requires support of a number of character sets + * and to be able to convert between them, or the use of a single preferred + * character set. + * For support of global compatibility it is STRONGLY RECOMMENDED that + * clients and servers use UTF-8 encoding when exchanging URIs. + * + * @return the default charset string + */ + public static String getDefaultProtocolCharset() { + return defaultProtocolCharset; + } + + + /** + * Get the protocol charset used by this current URI instance. + * It was set by the constructor for this instance. If it was not set by + * contructor, it will return the default protocol charset. + * + * @return the protocol charset string + * @see #getDefaultProtocolCharset + */ + public String getProtocolCharset() { + return (protocolCharset != null) + ? protocolCharset + : defaultProtocolCharset; + } + + + /** + * Set the default charset of the document. + *

+ * Notice that it will be possible to contain mixed characters (e.g. + * ftp://host/KoreanNamespace/ChineseResource). To handle the Bi-directional + * display of these character sets, the protocol charset could be simply + * used again. Because it's not yet implemented that the insertion of BIDI + * control characters at different points during composition is extracted. + *

+     *
+     * The setter always succeeds, and it always throws a
+     * DefaultCharsetChanged exception afterwards.
+     *
+     * So API programmers must call it in the following way:
+     *

+     *  import org.archive.url.URI.DefaultCharsetChanged;
+     *      .
+     *      .
+     *      .
+     *  try {
+     *      URI.setDefaultDocumentCharset("EUC-KR");
+     *  } catch (DefaultCharsetChanged cc) {
+     *      // CASE 1: the exception could be ignored, when it is set by user
+     *      if (cc.getReasonCode() == DefaultCharsetChanged.DOCUMENT_CHARSET) {
+     *      // CASE 2: let user know the default document charset changed
+     *      } else {
+     *      // CASE 2: let user know the default protocol charset changed
+     *      }
+     *  }
+     *  
+ * + * The API programmer is responsible to set the correct charset. + * And each application should remember its own charset to support. + * + * @param charset the default charset for the document + * @throws DefaultCharsetChanged default charset changed + */ + public static void setDefaultDocumentCharset(String charset) + throws DefaultCharsetChanged { + + defaultDocumentCharset = charset; + throw new DefaultCharsetChanged(DefaultCharsetChanged.DOCUMENT_CHARSET, + "the default document charset changed"); + } + + + /** + * Get the recommended default charset of the document. + * + * @return the default charset string + */ + public static String getDefaultDocumentCharset() { + return defaultDocumentCharset; + } + + + /** + * Get the default charset of the document by locale. + * + * @return the default charset string by locale + */ + public static String getDefaultDocumentCharsetByLocale() { + return defaultDocumentCharsetByLocale; + } + + + /** + * Get the default charset of the document by platform. + * + * @return the default charset string by platform + */ + public static String getDefaultDocumentCharsetByPlatform() { + return defaultDocumentCharsetByPlatform; + } + + // ------------------------------------------------------------- The scheme + + /** + * Get the scheme. + * + * @return the scheme + */ + public char[] getRawScheme() { + return _scheme; + } + + + /** + * Get the scheme. + * + * @return the scheme + * null if undefined scheme + */ + public String getScheme() { + return (_scheme == null) ? null : new String(_scheme); + } + + // ---------------------------------------------------------- The authority + + /** + * Set the authority. It can be one type of server, hostport, hostname, + * IPv4address, IPv6reference and reg_name. + *

+     *   authority     = server | reg_name
+     * 

+ * + * @param escapedAuthority the raw escaped authority + * @throws URIException If {@link + * #parseAuthority(String,boolean)} fails + * @throws NullPointerException null authority + */ + public void setRawAuthority(char[] escapedAuthority) + throws URIException, NullPointerException { + + parseAuthority(new String(escapedAuthority), true); + setURI(); + } + + + /** + * Set the authority. It can be one type of server, hostport, hostname, + * IPv4address, IPv6reference and reg_name. + * Note that there is no setAuthority method by the escape encoding reason. + * + * @param escapedAuthority the escaped authority string + * @throws URIException If {@link + * #parseAuthority(String,boolean)} fails + */ + public void setEscapedAuthority(String escapedAuthority) + throws URIException { + + parseAuthority(escapedAuthority, true); + setURI(); + } + + + /** + * Get the raw-escaped authority. + * + * @return the raw-escaped authority + */ + public char[] getRawAuthority() { + return _authority; + } + + + /** + * Get the escaped authority. + * + * @return the escaped authority + */ + public String getEscapedAuthority() { + return (_authority == null) ? null : new String(_authority); + } + + + /** + * Get the authority. + * + * @return the authority + * @throws URIException If {@link #decode} fails + */ + public String getAuthority() throws URIException { + return (_authority == null) ? null : decode(_authority, + getProtocolCharset()); + } + + // ----------------------------------------------------------- The userinfo + + /** + * Get the raw-escaped userinfo. + * + * @return the raw-escaped userinfo + * @see #getAuthority + */ + public char[] getRawUserinfo() { + return _userinfo; + } + + + /** + * Get the escaped userinfo. + * + * @return the escaped userinfo + * @see #getAuthority + */ + public String getEscapedUserinfo() { + return (_userinfo == null) ? null : new String(_userinfo); + } + + + /** + * Get the userinfo. 
+ * + * @return the userinfo + * @throws URIException If {@link #decode} fails + * @see #getAuthority + */ + public String getUserinfo() throws URIException { + return (_userinfo == null) ? null : decode(_userinfo, + getProtocolCharset()); + } + + // --------------------------------------------------------------- The host + + /** + * Get the host. + *

+     *   host          = hostname | IPv4address | IPv6reference
+     * 

+     *
+     * @return the host as raw characters
+     * @see #getAuthority
+     */
+    public char[] getRawHost() {
+        return _host;
+    }
+
+
+    /**
+     * Get the host.
+     *

+     *   host          = hostname | IPv4address | IPv6reference
+     * 

+ * + * @return the host + * @throws URIException If {@link #decode} fails + * @see #getAuthority + */ + public String getHost() throws URIException { + if (_host != null) { + return decode(_host, getProtocolCharset()); + } else { + return null; + } + } + + // --------------------------------------------------------------- The port + + /** + * Get the port. In order to get the specfic default port, the specific + * protocol-supported class extended from the URI class should be used. + * It has the server-based naming authority. + * + * @return the port + * if -1, it has the default port for the scheme or the server-based + * naming authority is not supported in the specific URI. + */ + public int getPort() { + return _port; + } + + // --------------------------------------------------------------- The path + + /** + * Set the raw-escaped path. + * + * @param escapedPath the path character sequence + * @throws URIException encoding error or not proper for initial instance + * @see #encode + */ + public void setRawPath(char[] escapedPath) throws URIException { + if (escapedPath == null || escapedPath.length == 0) { + _path = _opaque = escapedPath; + setURI(); + return; + } + // remove the fragment identifier + escapedPath = removeFragmentIdentifier(escapedPath); + if (_is_net_path || _is_abs_path) { + if (escapedPath[0] != '/') { + throw new URIException(URIException.PARSING, + "not absolute path"); + } + if (!validate(escapedPath, abs_path)) { + throw new URIException(URIException.ESCAPING, + "escaped absolute path not valid"); + } + _path = escapedPath; + } else if (_is_rel_path) { + int at = indexFirstOf(escapedPath, '/'); + if (at == 0) { + throw new URIException(URIException.PARSING, "incorrect path"); + } + if (at > 0 && !validate(escapedPath, 0, at - 1, rel_segment) + && !validate(escapedPath, at, -1, abs_path) + || at < 0 && !validate(escapedPath, 0, -1, rel_segment)) { + + throw new URIException(URIException.ESCAPING, + "escaped relative path not valid"); + 
} + _path = escapedPath; + } else if (_is_opaque_part) { + if (!uric_no_slash.get(escapedPath[0]) + && !validate(escapedPath, 1, -1, uric)) { + throw new URIException(URIException.ESCAPING, + "escaped opaque part not valid"); + } + _opaque = escapedPath; + } else { + throw new URIException(URIException.PARSING, "incorrect path"); + } + setURI(); + } + + + /** + * Set the escaped path. + * + * @param escapedPath the escaped path string + * @throws URIException encoding error or not proper for initial instance + * @see #encode + */ + public void setEscapedPath(String escapedPath) throws URIException { + if (escapedPath == null) { + _path = _opaque = null; + setURI(); + return; + } + setRawPath(escapedPath.toCharArray()); + } + + + /** + * Set the path. + * + * @param path the path string + * @throws URIException set incorrectly or fragment only + * @see #encode + */ + public void setPath(String path) throws URIException { + + if (path == null || path.length() == 0) { + _path = _opaque = (path == null) ? 
null : path.toCharArray(); + setURI(); + return; + } + // set the charset to do escape encoding + String charset = getProtocolCharset(); + + if (_is_net_path || _is_abs_path) { + _path = encode(path, allowed_abs_path, charset); + } else if (_is_rel_path) { + StringBuffer buff = new StringBuffer(path.length()); + int at = path.indexOf('/'); + if (at == 0) { // never 0 + throw new URIException(URIException.PARSING, + "incorrect relative path"); + } + if (at > 0) { + buff.append(encode(path.substring(0, at), allowed_rel_path, + charset)); + buff.append(encode(path.substring(at), allowed_abs_path, + charset)); + } else { + buff.append(encode(path, allowed_rel_path, charset)); + } + _path = buff.toString().toCharArray(); + } else if (_is_opaque_part) { + StringBuffer buf = new StringBuffer(); + buf.insert(0, encode(path.substring(0, 1), uric_no_slash, charset)); + buf.insert(1, encode(path.substring(1), uric, charset)); + _opaque = buf.toString().toCharArray(); + } else { + throw new URIException(URIException.PARSING, "incorrect path"); + } + setURI(); + } + + + /** + * Resolve the base and relative path. + * + * @param basePath a character array of the basePath + * @param relPath a character array of the relPath + * @return the resolved path + * @throws URIException no more higher path level to be resolved + */ + protected char[] resolvePath(char[] basePath, char[] relPath) + throws URIException { + + // REMINDME: paths are never null + String base = (basePath == null) ? "" : new String(basePath); + + // _path could be empty + if (relPath == null || relPath.length == 0) { + return normalize(basePath); + } else if (relPath[0] == '/') { + return normalize(relPath); + } else { + int at = base.lastIndexOf('/'); + if (at != -1) { + basePath = base.substring(0, at + 1).toCharArray(); + } + StringBuffer buff = new StringBuffer(base.length() + + relPath.length); + buff.append((at != -1) ? 
base.substring(0, at + 1) : "/"); + buff.append(relPath); + return normalize(buff.toString().toCharArray()); + } + } + + + /** + * Get the raw-escaped current hierarchy level in the given path. + * If the last namespace is a collection, the slash mark ('/') should be + * ended with at the last character of the path string. + * + * @param path the path + * @return the current hierarchy level + * @throws URIException no hierarchy level + */ + protected char[] getRawCurrentHierPath(char[] path) throws URIException { + + if (_is_opaque_part) { + throw new URIException(URIException.PARSING, "no hierarchy level"); + } + if (path == null) { + throw new URIException(URIException.PARSING, "empty path"); + } + String buff = new String(path); + int first = buff.indexOf('/'); + int last = buff.lastIndexOf('/'); + if (last == 0) { + return rootPath; + } else if (first != last && last != -1) { + return buff.substring(0, last).toCharArray(); + } + // FIXME: it could be a document on the server side + return path; + } + + + /** + * Get the raw-escaped current hierarchy level. + * + * @return the raw-escaped current hierarchy level + * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails. + */ + public char[] getRawCurrentHierPath() throws URIException { + return (_path == null) ? null : getRawCurrentHierPath(_path); + } + + + /** + * Get the escaped current hierarchy level. + * + * @return the escaped current hierarchy level + * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails. + */ + public String getEscapedCurrentHierPath() throws URIException { + char[] path = getRawCurrentHierPath(); + return (path == null) ? null : new String(path); + } + + + /** + * Get the current hierarchy level. + * + * @return the current hierarchy level + * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails. 
+ * @see #decode + */ + public String getCurrentHierPath() throws URIException { + char[] path = getRawCurrentHierPath(); + return (path == null) ? null : decode(path, getProtocolCharset()); + } + + + /** + * Get the level above the this hierarchy level. + * + * @return the raw above hierarchy level + * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails. + */ + public char[] getRawAboveHierPath() throws URIException { + char[] path = getRawCurrentHierPath(); + return (path == null) ? null : getRawCurrentHierPath(path); + } + + + /** + * Get the level above the this hierarchy level. + * + * @return the raw above hierarchy level + * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails. + */ + public String getEscapedAboveHierPath() throws URIException { + char[] path = getRawAboveHierPath(); + return (path == null) ? null : new String(path); + } + + + /** + * Get the level above the this hierarchy level. + * + * @return the above hierarchy level + * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails. + * @see #decode + */ + public String getAboveHierPath() throws URIException { + char[] path = getRawAboveHierPath(); + return (path == null) ? null : decode(path, getProtocolCharset()); + } + + + /** + * Get the raw-escaped path. + *

+     *   path          = [ abs_path | opaque_part ]
+     * 

+     *
+     * @return the raw-escaped opaque part for an opaque URI, otherwise the
+     * raw-escaped path
+     */
+    public char[] getRawPath() {
+        if (_is_opaque_part) {
+            return _opaque;
+        }
+        return _path;
+    }
+
+
+    /**
+     * Get the escaped path.
+     *

+     *   path          = [ abs_path | opaque_part ]
+     *   abs_path      = "/"  path_segments 
+     *   opaque_part   = uric_no_slash *uric
+     * 

+     *
+     * @return the escaped path string, or null if there is no path
+     */
+    public String getEscapedPath() {
+        char[] rawPath = getRawPath();
+        if (rawPath == null) {
+            return null;
+        }
+        return new String(rawPath);
+    }
+
+
+    /**
+     * Get the path.
+     *

+     *   path          = [ abs_path | opaque_part ]
+     * 

+ * @return the path string + * @throws URIException If {@link #decode} fails. + * @see #decode + */ + public String getPath() throws URIException { + char[] path = getRawPath(); + return (path == null) ? null : decode(path, getProtocolCharset()); + } + + + /** + * Get the raw-escaped basename of the path. + * + * @return the raw-escaped basename + */ + public char[] getRawName() { + if (_path == null) { + return null; + } + + int at = 0; + for (int i = _path.length - 1; i >= 0; i--) { + if (_path[i] == '/') { + at = i + 1; + break; + } + } + int len = _path.length - at; + char[] basename = new char[len]; + System.arraycopy(_path, at, basename, 0, len); + return basename; + } + + + /** + * Get the escaped basename of the path. + * + * @return the escaped basename string + */ + public String getEscapedName() { + char[] basename = getRawName(); + return (basename == null) ? null : new String(basename); + } + + + /** + * Get the basename of the path. + * + * @return the basename string + * @throws URIException incomplete trailing escape pattern or unsupported + * character encoding + * @see #decode + */ + public String getName() throws URIException { + char[] basename = getRawName(); + return (basename == null) ? null : decode(getRawName(), + getProtocolCharset()); + } + + // ----------------------------------------------------- The path and query + + /** + * Get the raw-escaped path and query. + * + * @return the raw-escaped path and query + */ + public char[] getRawPathQuery() { + + if (_path == null && _query == null) { + return null; + } + StringBuffer buff = new StringBuffer(); + if (_path != null) { + buff.append(_path); + } + if (_query != null) { + buff.append('?'); + buff.append(_query); + } + return buff.toString().toCharArray(); + } + + + /** + * Get the escaped query. + * + * @return the escaped path and query string + */ + public String getEscapedPathQuery() { + char[] rawPathQuery = getRawPathQuery(); + return (rawPathQuery == null) ? 
null : new String(rawPathQuery); + } + + + /** + * Get the path and query. + * + * @return the path and query string. + * @throws URIException incomplete trailing escape pattern or unsupported + * character encoding + * @see #decode + */ + public String getPathQuery() throws URIException { + char[] rawPathQuery = getRawPathQuery(); + return (rawPathQuery == null) ? null : decode(rawPathQuery, + getProtocolCharset()); + } + + // -------------------------------------------------------------- The query + + /** + * Set the raw-escaped query. + * + * @param escapedQuery the raw-escaped query + * @throws URIException escaped query not valid + */ + public void setRawQuery(char[] escapedQuery) throws URIException { + if (escapedQuery == null || escapedQuery.length == 0) { + _query = escapedQuery; + setURI(); + return; + } + // remove the fragment identifier + escapedQuery = removeFragmentIdentifier(escapedQuery); + if (!validate(escapedQuery, query)) { + throw new URIException(URIException.ESCAPING, + "escaped query not valid"); + } + _query = escapedQuery; + setURI(); + } + + + /** + * Set the escaped query string. + * + * @param escapedQuery the escaped query string + * @throws URIException escaped query not valid + */ + public void setEscapedQuery(String escapedQuery) throws URIException { + if (escapedQuery == null) { + _query = null; + setURI(); + return; + } + setRawQuery(escapedQuery.toCharArray()); + } + + + /** + * Set the query. + *

+ * When the reserved special characters in a query string + * ("&", "=", "+", ",", and "$") are not at risk of being misinterpreted within a query component, it is + * recommended to encode the whole query with this method. + *

+ * The additional APIs for the special purpose using by the reserved + * special characters used in each protocol are implemented in each protocol + * classes inherited from URI. So refer to the same-named APIs + * implemented in each specific protocol instance. + * + * @param query the query string. + * @throws URIException incomplete trailing escape pattern or unsupported + * character encoding + * @see #encode + */ + public void setQuery(String query) throws URIException { + if (query == null || query.length() == 0) { + _query = (query == null) ? null : query.toCharArray(); + setURI(); + return; + } + setRawQuery(encode(query, allowed_query, getProtocolCharset())); + } + + + /** + * Get the raw-escaped query. + * + * @return the raw-escaped query + */ + public char[] getRawQuery() { + return _query; + } + + + /** + * Get the escaped query. + * + * @return the escaped query string + */ + public String getEscapedQuery() { + return (_query == null) ? null : new String(_query); + } + + + /** + * Get the query. + * + * @return the query string. + * @throws URIException incomplete trailing escape pattern or unsupported + * character encoding + * @see #decode + */ + public String getQuery() throws URIException { + return (_query == null) ? null : decode(_query, getProtocolCharset()); + } + + // ----------------------------------------------------------- The fragment + + /** + * Set the raw-escaped fragment. + * + * @param escapedFragment the raw-escaped fragment + * @throws URIException escaped fragment not valid + */ + public void setRawFragment(char[] escapedFragment) throws URIException { + if (escapedFragment == null || escapedFragment.length == 0) { + _fragment = escapedFragment; + hash = 0; + return; + } + if (!validate(escapedFragment, fragment)) { + throw new URIException(URIException.ESCAPING, + "escaped fragment not valid"); + } + _fragment = escapedFragment; + hash = 0; + } + + + /** + * Set the escaped fragment string. 
+ * + * @param escapedFragment the escaped fragment string + * @throws URIException escaped fragment not valid + */ + public void setEscapedFragment(String escapedFragment) throws URIException { + if (escapedFragment == null) { + _fragment = null; + hash = 0; + return; + } + setRawFragment(escapedFragment.toCharArray()); + } + + + /** + * Set the fragment. + * + * @param fragment the fragment string. + * @throws URIException If an error occurs. + */ + public void setFragment(String fragment) throws URIException { + if (fragment == null || fragment.length() == 0) { + _fragment = (fragment == null) ? null : fragment.toCharArray(); + hash = 0; + return; + } + _fragment = encode(fragment, allowed_fragment, getProtocolCharset()); + hash = 0; + } + + + /** + * Get the raw-escaped fragment. + *

+ * The optional fragment identifier is not part of a URI, but is often used + * in conjunction with a URI. + *

+ * The format and interpretation of fragment identifiers is dependent on + * the media type [RFC2046] of the retrieval result. + *

+ * A fragment identifier is only meaningful when a URI reference is + * intended for retrieval and the result of that retrieval is a document + * for which the identified fragment is consistently defined. + * + * @return the raw-escaped fragment + */ + public char[] getRawFragment() { + return _fragment; + } + + + /** + * Get the escaped fragment. + * + * @return the escaped fragment string + */ + public String getEscapedFragment() { + return (_fragment == null) ? null : new String(_fragment); + } + + + /** + * Get the fragment. + * + * @return the fragment string + * @throws URIException incomplete trailing escape pattern or unsupported + * character encoding + * @see #decode + */ + public String getFragment() throws URIException { + return (_fragment == null) ? null : decode(_fragment, + getProtocolCharset()); + } + + // ------------------------------------------------------------- Utilities + + /** + * Remove the fragment identifier of the given component. + * + * @param component the component that a fragment may be included + * @return the component that the fragment identifier is removed + */ + protected char[] removeFragmentIdentifier(char[] component) { + if (component == null) { + return null; + } + int lastIndex = new String(component).indexOf('#'); + if (lastIndex != -1) { + component = new String(component).substring(0, + lastIndex).toCharArray(); + } + return component; + } + + + /** + * Normalize the given hier path part. + * + *

Algorithm taken from URI reference parser at + * http://www.apache.org/~fielding/uri/rev-2002/issues.html. + * + * @param path the path to normalize + * @return the normalized path + * @throws URIException no more higher path level to be normalized + */ + protected char[] normalize(char[] path) throws URIException { + + if (path == null) { + return null; + } + + String normalized = new String(path); + + // If the buffer begins with "./" or "../", the "." or ".." is removed. + if (normalized.startsWith("./")) { + normalized = normalized.substring(1); + } else if (normalized.startsWith("../")) { + normalized = normalized.substring(2); + } else if (normalized.startsWith("..")) { + normalized = normalized.substring(2); + } + + // All occurrences of "/./" in the buffer are replaced with "/" + int index = -1; + while ((index = normalized.indexOf("/./")) != -1) { + normalized = normalized.substring(0, index) + normalized.substring(index + 2); + } + + // If the buffer ends with "/.", the "." is removed. + if (normalized.endsWith("/.")) { + normalized = normalized.substring(0, normalized.length() - 1); + } + + int startIndex = 0; + + // All occurrences of "//../" in the buffer, where ".." + // and are complete path segments, are iteratively replaced + // with "/" in order from left to right until no matching pattern remains. + // If the buffer ends with "//..", that is also replaced + // with "/". Note that may be empty. + while ((index = normalized.indexOf("/../", startIndex)) != -1) { + int slashIndex = normalized.lastIndexOf('/', index - 1); + if (slashIndex >= 0) { + normalized = normalized.substring(0, slashIndex) + normalized.substring(index + 3); + } else { + startIndex = index + 3; + } + } + if (normalized.endsWith("/..")) { + int slashIndex = normalized.lastIndexOf('/', normalized.length() - 4); + if (slashIndex >= 0) { + normalized = normalized.substring(0, slashIndex + 1); + } + } + + // All prefixes of "/../" in the buffer, where ".." 
+ // and are complete path segments, are iteratively replaced + // with "/" in order from left to right until no matching pattern remains. + // If the buffer ends with "/..", that is also replaced + // with "/". Note that may be empty. + while ((index = normalized.indexOf("/../")) != -1) { + int slashIndex = normalized.lastIndexOf('/', index - 1); + if (slashIndex >= 0) { + break; + } else { + normalized = normalized.substring(index + 3); + } + } + if (normalized.endsWith("/..")) { + int slashIndex = normalized.lastIndexOf('/', normalized.length() - 4); + if (slashIndex < 0) { + normalized = "/"; + } + } + + return normalized.toCharArray(); + } + + + /** + * Normalizes the path part of this URI. Normalization is only meant to be performed on + * URIs with an absolute path. Calling this method on a relative path URI will have no + * effect. + * + * @throws URIException no more higher path level to be normalized + * + * @see #isAbsPath() + */ + public void normalize() throws URIException { + if (isAbsPath()) { + _path = normalize(_path); + setURI(); + } + } + + + /** + * Test if the first array is equal to the second array. + * + * @param first the first character array + * @param second the second character array + * @return true if they're equal + */ + protected boolean equals(char[] first, char[] second) { + + if (first == null && second == null) { + return true; + } + if (first == null || second == null) { + return false; + } + if (first.length != second.length) { + return false; + } + for (int i = 0; i < first.length; i++) { + if (first[i] != second[i]) { + return false; + } + } + return true; + } + + + /** + * Test an object if this URI is equal to another. 
+ * + * @param obj an object to compare + * @return true if two URI objects are equal + */ + public boolean equals(Object obj) { + + // normalize and test each components + if (obj == this) { + return true; + } + if (!(obj instanceof URI)) { + return false; + } + URI another = (URI) obj; + // scheme + if (!equals(_scheme, another._scheme)) { + return false; + } + // is_opaque_part or is_hier_part? and opaque + if (!equals(_opaque, another._opaque)) { + return false; + } + // is_hier_part + // has_authority + if (!equals(_authority, another._authority)) { + return false; + } + // path + if (!equals(_path, another._path)) { + return false; + } + // has_query + if (!equals(_query, another._query)) { + return false; + } + // has_fragment? should be careful of the only fragment case. + if (!equals(_fragment, another._fragment)) { + return false; + } + return true; + } + + // ---------------------------------------------------------- Serialization + + /** + * Write the content of this URI. + * + * @param oos the object-output stream + * @throws IOException If an IO problem occurs. + */ + private void writeObject(ObjectOutputStream oos) + throws IOException { + + oos.defaultWriteObject(); + } + + + /** + * Read a URI. + * + * @param ois the object-input stream + * @throws ClassNotFoundException If one of the classes specified in the + * input stream cannot be found. + * @throws IOException If an IO problem occurs. + */ + private void readObject(ObjectInputStream ois) + throws ClassNotFoundException, IOException { + + ois.defaultReadObject(); + } + + // -------------------------------------------------------------- Hash code + + /** + * Return a hash code for this URI. 
+ * + * @return a has code value for this URI + */ + public int hashCode() { + if (hash == 0) { + char[] c = _uri; + if (c != null) { + for (int i = 0, len = c.length; i < len; i++) { + hash = 31 * hash + c[i]; + } + } + c = _fragment; + if (c != null) { + for (int i = 0, len = c.length; i < len; i++) { + hash = 31 * hash + c[i]; + } + } + } + return hash; + } + + // ------------------------------------------------------------- Comparison + + /** + * Compare this URI to another object. + * + * @param obj the object to be compared. + * @return 0, if it's same, + * -1, if failed, first being compared with in the authority component + * @throws ClassCastException not URI argument + */ + public int compareTo(Object obj) throws ClassCastException { + + URI another = (URI) obj; + if (!equals(_authority, another.getRawAuthority())) { + return -1; + } + return toString().compareTo(another.toString()); + } + + // ------------------------------------------------------------------ Clone + + /** + * Create and return a copy of this object, the URI-reference containing + * the userinfo component. Notice that the whole URI-reference including + * the userinfo component counld not be gotten as a String. + *

+ * To copy the identical URI object including the userinfo + * component, it should be used. + * + * @return a clone of this instance + */ + public synchronized Object clone() throws CloneNotSupportedException { + + URI instance = (URI) super.clone(); + + instance._uri = _uri; + instance._scheme = _scheme; + instance._opaque = _opaque; + instance._authority = _authority; + instance._userinfo = _userinfo; + instance._host = _host; + instance._port = _port; + instance._path = _path; + instance._query = _query; + instance._fragment = _fragment; + // the charset to do escape encoding for this instance + instance.protocolCharset = protocolCharset; + // flags + instance._is_hier_part = _is_hier_part; + instance._is_opaque_part = _is_opaque_part; + instance._is_net_path = _is_net_path; + instance._is_abs_path = _is_abs_path; + instance._is_rel_path = _is_rel_path; + instance._is_reg_name = _is_reg_name; + instance._is_server = _is_server; + instance._is_hostname = _is_hostname; + instance._is_IPv4address = _is_IPv4address; + instance._is_IPv6reference = _is_IPv6reference; + + return instance; + } + + // ------------------------------------------------------------ Get the URI + + /** + * It can be gotten the URI character sequence. It's raw-escaped. + * For the purpose of the protocol to be transported, it will be useful. + *

+ * It is clearly unwise to use a URL that contains a password which is + * intended to be secret. In particular, the use of a password within + * the 'userinfo' component of a URL is strongly disrecommended except + * in those rare cases where the 'password' parameter is intended to be + * public. + *

+ * When you want to get each part of the userinfo, you need to use the + * specific methods in the specific URL. It depends on the specific URL. + * + * @return the URI character sequence + */ + public char[] getRawURI() { + return _uri; + } + + + /** + * It can be gotten the URI character sequence. It's escaped. + * For the purpose of the protocol to be transported, it will be useful. + * + * @return the escaped URI string + */ + public String getEscapedURI() { + return (_uri == null) ? null : new String(_uri); + } + + + /** + * It can be gotten the URI character sequence. + * + * @return the original URI string + * @throws URIException incomplete trailing escape pattern or unsupported + * character encoding + * @see #decode + */ + public String getURI() throws URIException { + return (_uri == null) ? null : decode(_uri, getProtocolCharset()); + } + + + /** + * Get the URI reference character sequence. + * + * @return the URI reference character sequence + */ + public char[] getRawURIReference() { + if (_fragment == null) { + return _uri; + } + if (_uri == null) { + return _fragment; + } + // if _uri != null && _fragment != null + String uriReference = new String(_uri) + "#" + new String(_fragment); + return uriReference.toCharArray(); + } + + + /** + * Get the escaped URI reference string. + * + * @return the escaped URI reference string + */ + public String getEscapedURIReference() { + char[] uriReference = getRawURIReference(); + return (uriReference == null) ? null : new String(uriReference); + } + + + /** + * Get the original URI reference string. + * + * @return the original URI reference string + * @throws URIException If {@link #decode} fails. + */ + public String getURIReference() throws URIException { + char[] uriReference = getRawURIReference(); + return (uriReference == null) ? null : decode(uriReference, + getProtocolCharset()); + } + + + /** + * Get the escaped URI string. + *

+ * In documents, the URI-reference form is only used without the userinfo + * component, like http://jakarta.apache.org/, for security reasons. + * However, a URI-reference that includes the userinfo component can still be parsed. + *

+ * In other words, this URI and any of its subclasses must not expose the + * URI-reference expression with the userinfo component like + * http://user:password@hostport/restricted_zone.
+ * It means that the API client programmer should extract each user and + * password to access manually. Probably it will be supported in the each + * subclass, however, not a whole URI-reference expression. + * + * @return the escaped URI string + * @see #clone() + */ + public String toString() { + return getEscapedURI(); + } + + + // ------------------------------------------------------------ Inner class + + /** + * The charset-changed normal operation to represent to be required to + * alert to user the fact the default charset is changed. + */ + public static class DefaultCharsetChanged extends RuntimeException { + + // ------------------------------------------------------- constructors + + /** + * The constructor with a reason string and its code arguments. + * + * @param reasonCode the reason code + * @param reason the reason + */ + public DefaultCharsetChanged(int reasonCode, String reason) { + super(reason); + this.reason = reason; + this.reasonCode = reasonCode; + } + + // ---------------------------------------------------------- constants + + /** No specified reason code. */ + public static final int UNKNOWN = 0; + + /** Protocol charset changed. */ + public static final int PROTOCOL_CHARSET = 1; + + /** Document charset changed. */ + public static final int DOCUMENT_CHARSET = 2; + + // ------------------------------------------------- instance variables + + /** The reason code. */ + private int reasonCode; + + /** The reason message. */ + private String reason; + + // ------------------------------------------------------------ methods + + /** + * Get the reason code. + * + * @return the reason code + */ + public int getReasonCode() { + return reasonCode; + } + + /** + * Get the reason message. + * + * @return the reason message + */ + public String getReason() { + return reason; + } + + } + + + /** + * A mapping to determine the (somewhat arbitrarily) preferred charset for a + * given locale. Supports all locales recognized in JDK 1.1. + *

+ * The distribution of this class is Servlets.com. It was originally + * written by Jason Hunter [jhunter at acm.org] and used by with permission. + */ + public static class LocaleToCharsetMap { + + /** A mapping of language code to charset */ + private static final Hashtable LOCALE_TO_CHARSET_MAP; + static { + LOCALE_TO_CHARSET_MAP = new Hashtable(); + LOCALE_TO_CHARSET_MAP.put("ar", "ISO-8859-6"); + LOCALE_TO_CHARSET_MAP.put("be", "ISO-8859-5"); + LOCALE_TO_CHARSET_MAP.put("bg", "ISO-8859-5"); + LOCALE_TO_CHARSET_MAP.put("ca", "ISO-8859-1"); + LOCALE_TO_CHARSET_MAP.put("cs", "ISO-8859-2"); + LOCALE_TO_CHARSET_MAP.put("da", "ISO-8859-1"); + LOCALE_TO_CHARSET_MAP.put("de", "ISO-8859-1"); + LOCALE_TO_CHARSET_MAP.put("el", "ISO-8859-7"); + LOCALE_TO_CHARSET_MAP.put("en", "ISO-8859-1"); + LOCALE_TO_CHARSET_MAP.put("es", "ISO-8859-1"); + LOCALE_TO_CHARSET_MAP.put("et", "ISO-8859-1"); + LOCALE_TO_CHARSET_MAP.put("fi", "ISO-8859-1"); + LOCALE_TO_CHARSET_MAP.put("fr", "ISO-8859-1"); + LOCALE_TO_CHARSET_MAP.put("hr", "ISO-8859-2"); + LOCALE_TO_CHARSET_MAP.put("hu", "ISO-8859-2"); + LOCALE_TO_CHARSET_MAP.put("is", "ISO-8859-1"); + LOCALE_TO_CHARSET_MAP.put("it", "ISO-8859-1"); + LOCALE_TO_CHARSET_MAP.put("iw", "ISO-8859-8"); + LOCALE_TO_CHARSET_MAP.put("ja", "Shift_JIS"); + LOCALE_TO_CHARSET_MAP.put("ko", "EUC-KR"); + LOCALE_TO_CHARSET_MAP.put("lt", "ISO-8859-2"); + LOCALE_TO_CHARSET_MAP.put("lv", "ISO-8859-2"); + LOCALE_TO_CHARSET_MAP.put("mk", "ISO-8859-5"); + LOCALE_TO_CHARSET_MAP.put("nl", "ISO-8859-1"); + LOCALE_TO_CHARSET_MAP.put("no", "ISO-8859-1"); + LOCALE_TO_CHARSET_MAP.put("pl", "ISO-8859-2"); + LOCALE_TO_CHARSET_MAP.put("pt", "ISO-8859-1"); + LOCALE_TO_CHARSET_MAP.put("ro", "ISO-8859-2"); + LOCALE_TO_CHARSET_MAP.put("ru", "ISO-8859-5"); + LOCALE_TO_CHARSET_MAP.put("sh", "ISO-8859-5"); + LOCALE_TO_CHARSET_MAP.put("sk", "ISO-8859-2"); + LOCALE_TO_CHARSET_MAP.put("sl", "ISO-8859-2"); + LOCALE_TO_CHARSET_MAP.put("sq", "ISO-8859-2"); + 
LOCALE_TO_CHARSET_MAP.put("sr", "ISO-8859-5"); + LOCALE_TO_CHARSET_MAP.put("sv", "ISO-8859-1"); + LOCALE_TO_CHARSET_MAP.put("tr", "ISO-8859-9"); + LOCALE_TO_CHARSET_MAP.put("uk", "ISO-8859-5"); + LOCALE_TO_CHARSET_MAP.put("zh", "GB2312"); + LOCALE_TO_CHARSET_MAP.put("zh_TW", "Big5"); + } + + /** + * Get the preferred charset for the given locale. + * + * @param locale the locale + * @return the preferred charset or null if the locale is not + * recognized. + */ + public static String getCharset(Locale locale) { + // try for an full name match (may include country) + String charset = + (String) LOCALE_TO_CHARSET_MAP.get(locale.toString()); + if (charset != null) { + return charset; + } + + // if a full name didn't match, try just the language + charset = (String) LOCALE_TO_CHARSET_MAP.get(locale.getLanguage()); + return charset; // may be null + } + + } + +} + diff --git a/src/main/java/org/archive/url/URIException.java b/src/main/java/org/archive/url/URIException.java new file mode 100644 index 00000000..b32c68cf --- /dev/null +++ b/src/main/java/org/archive/url/URIException.java @@ -0,0 +1,180 @@ +/* + * $Header: /home/jerenkrantz/tmp/commons/commons-convert/cvs/home/cvs/jakarta-commons//httpclient/src/java/org/apache/commons/httpclient/URIException.java,v 1.12 2004/09/30 18:53:20 olegk Exp $ + * $Revision: 480424 $ + * $Date: 2006-11-29 06:56:49 +0100 (Wed, 29 Nov 2006) $ + * + * ==================================================================== + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * . + * + */ + +package org.archive.url; + +import java.io.IOException; + +/** + * The URI parsing and escape encoding exception. + * + * @author Sung-Gu + * @author Oleg Kalnichevski + * @version $Revision: 480424 $ $Date: 2002/03/14 15:14:01 + */ +public class URIException extends IOException { + + // ----------------------------------------------------------- constructors + + /** + * Default constructor. + */ + public URIException() { + } + + + /** + * The constructor with a reason code argument. + * + * @param reasonCode the reason code + */ + public URIException(int reasonCode) { + this.reasonCode = reasonCode; + } + + + /** + * The constructor with a reason string and its code arguments. + * + * @param reasonCode the reason code + * @param reason the reason + */ + public URIException(int reasonCode, String reason) { + super(reason); // for backward compatibility of Throwable + this.reason = reason; + this.reasonCode = reasonCode; + } + + + /** + * The constructor with a reason string argument. 
+ * + * @param reason the reason + */ + public URIException(String reason) { + super(reason); // for backward compatibility of Throwable + this.reason = reason; + this.reasonCode = UNKNOWN; + } + + // -------------------------------------------------------------- constants + + /** + * No specified reason code. + */ + public static final int UNKNOWN = 0; + + + /** + * The URI parsing error. + */ + public static final int PARSING = 1; + + + /** + * The unsupported character encoding. + */ + public static final int UNSUPPORTED_ENCODING = 2; + + + /** + * The URI escape encoding and decoding error. + */ + public static final int ESCAPING = 3; + + + /** + * The DNS punycode encoding or decoding error. + */ + public static final int PUNYCODE = 4; + + // ------------------------------------------------------------- properties + + /** + * The reason code. + */ + protected int reasonCode; + + + /** + * The reason message. + */ + protected String reason; + + // ---------------------------------------------------------------- methods + + /** + * Get the reason code. + * + * @return the reason code + */ + public int getReasonCode() { + return reasonCode; + } + + /** + * Set the reason code. + * + * @param reasonCode the reason code + * + * @deprecated Callers should set the reason code as a parameter to the + * constructor. + */ + public void setReasonCode(int reasonCode) { + this.reasonCode = reasonCode; + } + + + /** + * Get the reason message. + * + * @return the reason message + * + * @deprecated You should instead call {@link #getMessage()}. + */ + public String getReason() { + return reason; + } + + + /** + * Set the reason message. + * + * @param reason the reason message + * + * @deprecated Callers should instead set this via a parameter to the constructor. 
+ */ + public void setReason(String reason) { + this.reason = reason; + } + + +} + diff --git a/src/main/java/org/archive/url/UsableURI.java b/src/main/java/org/archive/url/UsableURI.java index ed40f41a..b7d0cf71 100644 --- a/src/main/java/org/archive/url/UsableURI.java +++ b/src/main/java/org/archive/url/UsableURI.java @@ -26,14 +26,13 @@ import java.net.URI; import java.net.URISyntaxException; -import org.apache.commons.httpclient.URIException; import org.archive.util.SURT; import org.archive.util.TextUtils; /** * Usable URI. * - * This class wraps {@link org.apache.commons.httpclient.URI} adding caching + * This class wraps {@link org.archive.url.URI} adding caching * and methods. It cannot be instantiated directly. Go via UURIFactory. * *

We used to use {@link java.net.URI} for parsing URIs but ran across @@ -50,7 +49,7 @@ * @author gojomo * @author stack * - * @see org.apache.commons.httpclient.URI + * @see org.archive.url.URI */ public class UsableURI extends LaxURI implements CharSequence, Serializable { @@ -121,7 +120,6 @@ protected UsableURI() { * @param uri String representation of an absolute URI. * @param escaped If escaped. * @param charset Charset to use. - * @throws org.apache.commons.httpclient.URIException */ protected UsableURI(String uri, boolean escaped, String charset) throws URIException { @@ -132,7 +130,6 @@ protected UsableURI(String uri, boolean escaped, String charset) /** * @param relative String representation of URI. * @param base Parent UURI to use derelativizing. - * @throws org.apache.commons.httpclient.URIException */ protected UsableURI(UsableURI base, UsableURI relative) throws URIException { super(base, relative); @@ -275,7 +272,7 @@ public String toString() { /** * In the case of a puny encoded IDN, this method returns the decoded Unicode version. *

- * Most of this implementation is copied from {@link org.apache.commons.httpclient.URI#setURI()}. + * Most of this implementation is copied from {@link org.archive.url.URI#setURI()}. * * @return decoded IDN version of URI */ diff --git a/src/main/java/org/archive/url/UsableURIFactory.java b/src/main/java/org/archive/url/UsableURIFactory.java index 3dfc33a7..08f18999 100644 --- a/src/main/java/org/archive/url/UsableURIFactory.java +++ b/src/main/java/org/archive/url/UsableURIFactory.java @@ -28,8 +28,6 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; -import org.apache.commons.httpclient.URI; -import org.apache.commons.httpclient.URIException; import org.archive.util.TextUtils; /** diff --git a/src/main/java/org/archive/util/ChunkedInputStream.java b/src/main/java/org/archive/util/ChunkedInputStream.java new file mode 100644 index 00000000..69b23047 --- /dev/null +++ b/src/main/java/org/archive/util/ChunkedInputStream.java @@ -0,0 +1,324 @@ +/* + * $Header: /home/jerenkrantz/tmp/commons/commons-convert/cvs/home/cvs/jakarta-commons//httpclient/src/java/org/apache/commons/httpclient/ChunkedInputStream.java,v 1.24 2004/10/10 15:18:55 olegk Exp $ + * $Revision: 480424 $ + * $Date: 2006-11-29 06:56:49 +0100 (Wed, 29 Nov 2006) $ + * + * ==================================================================== + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * . + * + */ + +package org.archive.util; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; + + +/** + *

Transparently coalesces chunks of an HTTP stream that uses + * Transfer-Encoding chunked.

+ * + *

Note that this class NEVER closes the underlying stream, even when close + * gets called. Instead, it will read until the "end" of its chunking on close, + * which allows for the seamless invocation of subsequent HTTP 1.1 calls, while + * not requiring the client to remember to read the entire contents of the + * response.

+ * + * @author Ortwin Glueck + * @author Sean C. Sullivan + * @author Martin Elwin + * @author Eric Johnson + * @author Mike Bowler + * @author Michael Becke + * @author Oleg Kalnichevski + * + * @since 2.0 + * + */ +class ChunkedInputStream extends InputStream { + /** The inputstream that we're wrapping */ + private InputStream in; + + /** The chunk size */ + private int chunkSize; + + /** The current position within the current chunk */ + private int pos; + + /** True if we'are at the beginning of stream */ + private boolean bof = true; + + /** True if we've reached the end of stream */ + private boolean eof = false; + + /** True if this stream is closed */ + private boolean closed = false; + + /** + * ChunkedInputStream constructor + * + * @param in the raw input stream + * + */ + public ChunkedInputStream(final InputStream in) { + + if (in == null) { + throw new IllegalArgumentException("InputStream parameter may not be null"); + } + this.in = in; + this.pos = 0; + } + + /** + *

Returns all the data in a chunked stream in coalesced form. A chunk + * is followed by a CRLF. The method returns -1 as soon as a chunksize of 0 + * is detected.

+ * + *

<p>Trailer headers are read automatically at the end of the stream and + * are then discarded; this class provides no getResponseFooters() method.</p>

+ * + * @return -1 of the end of the stream has been reached or the next data + * byte + * @throws IOException If an IO problem occurs + */ + public int read() throws IOException { + + if (closed) { + throw new IOException("Attempted read from closed stream."); + } + if (eof) { + return -1; + } + if (pos >= chunkSize) { + nextChunk(); + if (eof) { + return -1; + } + } + pos++; + return in.read(); + } + + /** + * Read some bytes from the stream. + * @param b The byte array that will hold the contents from the stream. + * @param off The offset into the byte array at which bytes will start to be + * placed. + * @param len the maximum number of bytes that can be returned. + * @return The number of bytes returned or -1 if the end of stream has been + * reached. + * @see InputStream#read(byte[], int, int) + * @throws IOException if an IO problem occurs. + */ + public int read (byte[] b, int off, int len) throws IOException { + + if (closed) { + throw new IOException("Attempted read from closed stream."); + } + + if (eof) { + return -1; + } + if (pos >= chunkSize) { + nextChunk(); + if (eof) { + return -1; + } + } + len = Math.min(len, chunkSize - pos); + int count = in.read(b, off, len); + pos += count; + return count; + } + + /** + * Read some bytes from the stream. + * @param b The byte array that will hold the contents from the stream. + * @return The number of bytes returned or -1 if the end of stream has been + * reached. + * @see InputStream#read(byte[]) + * @throws IOException if an IO problem occurs. + */ + public int read (byte[] b) throws IOException { + return read(b, 0, b.length); + } + + /** + * Read the CRLF terminator. + * @throws IOException If an IO error occurs. + */ + private void readCRLF() throws IOException { + int cr = in.read(); + int lf = in.read(); + if ((cr != '\r') || (lf != '\n')) { + throw new IOException( + "CRLF expected at end of chunk: " + cr + "/" + lf); + } + } + + + /** + * Read the next chunk. 
+ * @throws IOException If an IO error occurs. + */ + private void nextChunk() throws IOException { + if (!bof) { + readCRLF(); + } + chunkSize = getChunkSizeFromInputStream(in); + bof = false; + pos = 0; + if (chunkSize == 0) { + eof = true; + parseTrailerHeaders(); + } + } + + /** + * Expects the stream to start with a chunksize in hex with optional + * comments after a semicolon. The line must end with a CRLF: "a3; some + * comment\r\n" Positions the stream at the start of the next line. + * + * @param in The new input stream. + * + * @return the chunk size as integer + * + * @throws IOException when the chunk size could not be parsed + */ + private static int getChunkSizeFromInputStream(final InputStream in) + throws IOException { + + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + // States: 0=normal, 1=\r was scanned, 2=inside quoted string, -1=end + int state = 0; + while (state != -1) { + int b = in.read(); + if (b == -1) { + throw new IOException("chunked stream ended unexpectedly"); + } + switch (state) { + case 0: + switch (b) { + case '\r': + state = 1; + break; + case '\"': + state = 2; + /* fall through */ + default: + baos.write(b); + } + break; + + case 1: + if (b == '\n') { + state = -1; + } else { + // this was not CRLF + throw new IOException("Protocol violation: Unexpected" + + " single newline character in chunk size"); + } + break; + + case 2: + switch (b) { + case '\\': + b = in.read(); + baos.write(b); + break; + case '\"': + state = 0; + /* fall through */ + default: + baos.write(b); + } + break; + default: throw new RuntimeException("assertion failed"); + } + } + + //parse data + String dataString = baos.toString(StandardCharsets.US_ASCII.name()); + int separator = dataString.indexOf(';'); + dataString = (separator > 0) + ? 
dataString.substring(0, separator).trim() + : dataString.trim(); + + int result; + try { + result = Integer.parseInt(dataString.trim(), 16); + } catch (NumberFormatException e) { + throw new IOException ("Bad chunk size: " + dataString); + } + return result; + } + + /** + * Reads and stores the Trailer headers. + * @throws IOException If an IO problem occurs + */ + private void parseTrailerHeaders() throws IOException { + String charset = "US-ASCII"; + LaxHttpParser.parseHeaders(in, charset); + } + + /** + * Upon close, this reads the remainder of the chunked message, + * leaving the underlying socket at a position to start reading the + * next response without scanning. + * @throws IOException If an IO problem occurs. + */ + public void close() throws IOException { + if (!closed) { + try { + if (!eof) { + exhaustInputStream(this); + } + } finally { + eof = true; + closed = true; + } + } + } + + /** + * Exhaust an input stream, reading until EOF has been encountered. + * + *

<p>Note that this function is intended as a non-public utility. + * This is a little weird, but it seemed silly to make a utility + * class for this one function, so instead it is just static and + * shared that way.</p>

+ * + * @param inStream The {@link InputStream} to exhaust. + * @throws IOException If an IO problem occurs + */ + static void exhaustInputStream(InputStream inStream) throws IOException { + // read and discard the remainder of the message + byte buffer[] = new byte[1024]; + while (inStream.read(buffer) >= 0) { + ; + } + } +} diff --git a/src/main/java/org/archive/util/LaxHttpParser.java b/src/main/java/org/archive/util/LaxHttpParser.java index 9e38669b..0545fd95 100644 --- a/src/main/java/org/archive/util/LaxHttpParser.java +++ b/src/main/java/org/archive/util/LaxHttpParser.java @@ -35,13 +35,11 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; +import java.io.UnsupportedEncodingException; import java.util.ArrayList; +import java.util.logging.Logger; -import org.apache.commons.httpclient.Header; -import org.apache.commons.httpclient.HttpException; -import org.apache.commons.httpclient.util.EncodingUtil; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; +import org.archive.format.http.HttpHeader; /** * A Modified version of HttpParser which doesn't throw exceptions on bad header lines @@ -57,7 +55,7 @@ public class LaxHttpParser { /** Log object for this class. */ - private static final Log LOG = LogFactory.getLog(LaxHttpParser.class); + private static final Logger LOG = Logger.getLogger(LaxHttpParser.class.getName()); /** * Constructor for LaxHttpParser. 
@@ -77,7 +75,7 @@ protected LaxHttpParser() { } * @return a byte array from the stream */ public static byte[] readRawLine(InputStream inputStream) throws IOException { - LOG.trace("enter LaxHttpParser.readRawLine()"); + LOG.finest("enter LaxHttpParser.readRawLine()"); ByteArrayOutputStream buf = new ByteArrayOutputStream(); int ch; @@ -108,7 +106,7 @@ public static byte[] readRawLine(InputStream inputStream) throws IOException { * @since 3.0 */ public static String readLine(InputStream inputStream, String charset) throws IOException { - LOG.trace("enter LaxHttpParser.readLine(InputStream, String)"); + LOG.finest("enter LaxHttpParser.readLine(InputStream, String)"); byte[] rawdata = readRawLine(inputStream); if (rawdata == null) { return null; @@ -126,7 +124,11 @@ public static String readLine(InputStream inputStream, String charset) throws IO } } } - return EncodingUtil.getString(rawdata, 0, len - offset, charset); + try { + return new String(rawdata, 0, len - offset, charset); + } catch (UnsupportedEncodingException e) { + return new String(rawdata, 0, len - offset); + } } /** @@ -144,7 +146,7 @@ public static String readLine(InputStream inputStream, String charset) throws IO */ public static String readLine(InputStream inputStream) throws IOException { - LOG.trace("enter LaxHttpParser.readLine(InputStream)"); + LOG.finest("enter LaxHttpParser.readLine(InputStream)"); return readLine(inputStream, "US-ASCII"); } @@ -158,14 +160,13 @@ public static String readLine(InputStream inputStream) throws IOException { * @return an array of headers in the order in which they were parsed * * @throws IOException if an IO error occurs while reading from the stream - * @throws HttpException if there is an error parsing a header value - * + * * @since 3.0 */ - public static Header[] parseHeaders(InputStream is, String charset) throws IOException, HttpException { - LOG.trace("enter HeaderParser.parseHeaders(InputStream, String)"); + public static HttpHeader[] 
parseHeaders(InputStream is, String charset) throws IOException { + LOG.finest("enter HeaderParser.parseHeaders(InputStream, String)"); - ArrayList
<Header> headers = new ArrayList<Header>
(); + ArrayList headers = new ArrayList<>(); String name = null; StringBuffer value = null; for (; ;) { @@ -188,7 +189,7 @@ public static Header[] parseHeaders(InputStream is, String charset) throws IOExc } else { // make sure we save the previous name,value pair if present if (name != null) { - headers.add(new Header(name, value.toString())); + headers.add(new HttpHeader(name, value.toString())); } // Otherwise we should have normal HTTP header line @@ -216,10 +217,10 @@ public static Header[] parseHeaders(InputStream is, String charset) throws IOExc // make sure we save the last name,value pair if present if (name != null) { - headers.add(new Header(name, value.toString())); + headers.add(new HttpHeader(name, value.toString())); } - return (Header[]) headers.toArray(new Header[headers.size()]); + return headers.toArray(new HttpHeader[0]); } /** @@ -231,12 +232,11 @@ public static Header[] parseHeaders(InputStream is, String charset) throws IOExc * @return an array of headers in the order in which they were parsed * * @throws IOException if an IO error occurs while reading from the stream - * @throws HttpException if there is an error parsing a header value - * + * * @deprecated use #parseHeaders(InputStream, String) */ - public static Header[] parseHeaders(InputStream is) throws IOException, HttpException { - LOG.trace("enter HeaderParser.parseHeaders(InputStream, String)"); + public static HttpHeader[] parseHeaders(InputStream is) throws IOException { + LOG.finest("enter HeaderParser.parseHeaders(InputStream, String)"); return parseHeaders(is, "US-ASCII"); } } diff --git a/src/main/java/org/archive/util/Recorder.java b/src/main/java/org/archive/util/Recorder.java index dff02bff..61cbf871 100644 --- a/src/main/java/org/archive/util/Recorder.java +++ b/src/main/java/org/archive/util/Recorder.java @@ -32,7 +32,6 @@ import java.util.zip.DeflaterInputStream; import java.util.zip.GZIPInputStream; -import org.apache.commons.httpclient.ChunkedInputStream; import 
org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.lang.StringUtils; diff --git a/src/main/java/org/archive/util/SURT.java b/src/main/java/org/archive/util/SURT.java index 69daf247..059b2ec6 100644 --- a/src/main/java/org/archive/util/SURT.java +++ b/src/main/java/org/archive/util/SURT.java @@ -29,7 +29,7 @@ import java.io.PrintStream; import java.util.regex.Matcher; -import org.apache.commons.httpclient.URIException; +import org.archive.url.URIException; import org.archive.url.UsableURIFactory; /** diff --git a/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java b/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java index 68ee6551..69189862 100644 --- a/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java +++ b/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java @@ -3,7 +3,6 @@ import java.io.IOException; import org.archive.util.binsearch.SeekableLineReaderFactory; -import org.archive.util.binsearch.impl.http.ApacheHttp31SLRFactory; import org.archive.util.binsearch.impl.http.ApacheHttp43SLRFactory; import org.archive.util.binsearch.impl.http.HTTPURLConnSLRFactory; @@ -20,15 +19,13 @@ protected HTTPSeekableLineReaderFactory() public enum HttpLibs { - @Deprecated - APACHE_31, APACHE_43, URLCONN, } public static HTTPSeekableLineReaderFactory getHttpFactory() { - return getHttpFactory(HttpLibs.APACHE_31); + return getHttpFactory(HttpLibs.APACHE_43); } public static HTTPSeekableLineReaderFactory getHttpFactory(HttpLibs type) @@ -38,7 +35,7 @@ public static HTTPSeekableLineReaderFactory getHttpFactory(HttpLibs type) public static HTTPSeekableLineReaderFactory getHttpFactory(String defaultURL) { - return getHttpFactory(HttpLibs.APACHE_31, defaultURL); + return getHttpFactory(HttpLibs.APACHE_43, defaultURL); } public static HTTPSeekableLineReaderFactory getHttpFactory(HttpLibs type, String defaultURL) @@ -46,10 
+43,6 @@ public static HTTPSeekableLineReaderFactory getHttpFactory(HttpLibs type, String HTTPSeekableLineReaderFactory factory = null; switch (type) { - case APACHE_31: - factory = new ApacheHttp31SLRFactory(); - break; - case URLCONN: factory = new HTTPURLConnSLRFactory(); break; @@ -60,7 +53,7 @@ public static HTTPSeekableLineReaderFactory getHttpFactory(HttpLibs type, String } if (factory == null) { - factory = new ApacheHttp31SLRFactory(); + factory = new ApacheHttp43SLRFactory(); } factory.defaultURL = defaultURL; diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java deleted file mode 100644 index 124d3d03..00000000 --- a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLR.java +++ /dev/null @@ -1,235 +0,0 @@ -package org.archive.util.binsearch.impl.http; - -import java.io.IOException; -import java.io.InputStream; -import java.net.URISyntaxException; - -import org.apache.commons.httpclient.Header; -import org.apache.commons.httpclient.HttpClient; -import org.apache.commons.httpclient.HttpException; -import org.apache.commons.httpclient.HttpMethod; -import org.apache.commons.httpclient.cookie.CookiePolicy; -import org.apache.commons.httpclient.methods.GetMethod; -import org.apache.commons.httpclient.methods.HeadMethod; -import org.apache.commons.io.input.CountingInputStream; -import org.archive.util.binsearch.impl.HTTPSeekableLineReader; - -/** - * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0 - */ -@Deprecated -public class ApacheHttp31SLR extends HTTPSeekableLineReader { - - private HttpClient http; - private String url; - private long length = -1; - - protected CountingInputStream cin; - - private GetMethod activeMethod; - - public ApacheHttp31SLR(HttpClient http, String url) { - this.http = http; - this.url = url; - } - - private void acquireLength() throws URISyntaxException, 
HttpException, IOException { - HttpMethod head = new HeadMethod(url); - int code = http.executeMethod(head); - if(code != 200) { - throw new IOException("Unable to retrieve from " + url); - } - Header lengthHeader = head.getResponseHeader(CONTENT_LENGTH); - if(lengthHeader == null) { - throw new IOException("No Content-Length header for " + url); - } - String val = lengthHeader.getValue(); - try { - length = Long.parseLong(val); - } catch(NumberFormatException e) { - throw new IOException("Bad Content-Length value " +url+ ": " + val); - } - } - - protected String getHeader(String header) throws URISyntaxException, HttpException, IOException { - HttpMethod head = new HeadMethod(url); - int code = http.executeMethod(head); - if(code != 200) { - throw new IOException("Unable to retrieve from " + url); - } - Header theHeader = head.getResponseHeader(header); - if(theHeader == null) { - throw new IOException("No " + header + " header for " + url); - } - String val = theHeader.getValue(); - return val; - } - - /* (non-Javadoc) - * @see org.archive.util.binsearch.impl.HTTPSeekableLineReader#getUrl() - */ - @Override - public String getUrl() - { - return url; - } - -// public void seek(long offset, boolean gzip) throws IOException { -// is = doSeekLoad(offset, -1); -// -// if (gzip) { -// is = new GZIPMembersInputStream(is, blockSize); -// } -// } - -// public void seekWithMaxRead(long offset, boolean gzip, int maxLength) throws IOException { -// is = doSeekLoad(offset, maxLength); -// -// if (bufferFully && (maxLength > 0) && (maxLength < 1e10)) { -// try { -// byte[] buffer = new byte[maxLength]; -// ByteStreams.readFully(is, buffer); -// is.close(); -// -// // Create new stream -// is = new ByteArrayInputStream(buffer); -// } finally { -// activeMethod.releaseConnection(); -// activeMethod = null; -// } -// } -// -// if (gzip) { -// is = new GZIPMembersInputStream(is, blockSize); -// } -// } - - protected InputStream doSeekLoad(long offset, int maxLength) throws 
IOException { - if (activeMethod != null) { - doClose(); - } - - br = null; - - try { - - activeMethod = new GetMethod(url); - - String rangeHeader = makeRangeHeader(offset, maxLength); - - if (rangeHeader != null) { - activeMethod.setRequestHeader("Range", rangeHeader); - } - - if (this.isNoKeepAlive()) { - activeMethod.setRequestHeader("Connection", "close"); - } - - if (this.getCookie() != null) { - activeMethod.getParams().setCookiePolicy(CookiePolicy.IGNORE_COOKIES); - activeMethod.setRequestHeader("Cookie", this.getCookie()); - } - - int code = http.executeMethod(activeMethod); - - connectedUrl = activeMethod.getURI().toString(); - - if ((code != 206) && (code != 200)) { - throw new BadHttpStatusException(code, connectedUrl + " " + rangeHeader); - } - - InputStream is = activeMethod.getResponseBodyAsStream(); - cin = new CountingInputStream(is); - return cin; - - } catch (IOException io) { - if (saveErrHeader != null) { - errHeader = getHeaderValue(saveErrHeader); - } - - connectedUrl = activeMethod.getURI().toString(); - doClose(); - throw io; - } - } - - public GetMethod getHttpMethod() - { - return activeMethod; - } - - public void doClose() throws IOException { - - if (activeMethod == null) { - return; - } - - try { - long contentLength = activeMethod.getResponseContentLength(); - - long bytesRead = (cin != null ? 
cin.getByteCount() : 0); - - // If fully read, close gracefully, otherwise abort - if ((contentLength > 0) && (contentLength == bytesRead)) { -// try { -// cin.close(); -// } catch (IOException e) { -// activeMethod.abort(); -// } - } else { - activeMethod.abort(); - } - - activeMethod.releaseConnection(); - activeMethod = null; - - } finally { - if (activeMethod != null) { - activeMethod.abort(); - activeMethod.releaseConnection(); - activeMethod = null; - } - } - - cin = null; - is = null; - br = null; - } - - /* (non-Javadoc) - * @see org.archive.util.binsearch.impl.HTTPSeekableLineReader#getSize() - */ - @Override - public long getSize() throws IOException { - if (length < 0) { - try { - if (activeMethod != null) { - length = activeMethod.getResponseContentLength(); - } else { - acquireLength(); - } - } catch (URISyntaxException e) { - throw new IOException(e); - } - } - return length; - } - - /* (non-Javadoc) - * @see org.archive.util.binsearch.impl.HTTPSeekableLineReader#getHeaderValue(java.lang.String) - */ - @Override - public String getHeaderValue(String headerName) { - if (activeMethod == null) { - return null; - } - - Header header = activeMethod.getResponseHeader(headerName); - - if (header == null) { - return null; - } - - return header.getValue(); - } -} diff --git a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java b/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java deleted file mode 100644 index 2af03dab..00000000 --- a/src/main/java/org/archive/util/binsearch/impl/http/ApacheHttp31SLRFactory.java +++ /dev/null @@ -1,192 +0,0 @@ -package org.archive.util.binsearch.impl.http; - -import java.io.IOException; -import java.text.SimpleDateFormat; -import java.util.Date; -import java.util.Locale; -import java.util.logging.Logger; - -import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler; -import org.apache.commons.httpclient.HostConfiguration; -import 
org.apache.commons.httpclient.HttpClient; -import org.apache.commons.httpclient.HttpConnectionManager; -import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager; -import org.apache.commons.httpclient.params.HttpClientParams; -import org.archive.util.binsearch.impl.HTTPSeekableLineReader; -import org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory; - -/** - * - * @deprecated Commons HttpClient 3 is end of life, this will be removed in webarchive-commons 2.0 - */ -@Deprecated -public class ApacheHttp31SLRFactory extends HTTPSeekableLineReaderFactory { - private final static Logger LOGGER = Logger.getLogger(ApacheHttp31SLRFactory.class.getName()); - - private HttpConnectionManager connectionManager = null; - private HostConfiguration hostConfiguration = null; - private HttpClient http = null; - - public ApacheHttp31SLRFactory(String uriString) { - this(); - } - - public ApacheHttp31SLRFactory() { - connectionManager = new MultiThreadedHttpConnectionManager(); - //connectionManager = new ThreadLocalHttpConnectionManager(); - hostConfiguration = new HostConfiguration(); - HttpClientParams params = new HttpClientParams(); - http = new HttpClient(params,connectionManager); - http.setHostConfiguration(hostConfiguration); - } - - public void close() throws IOException - { - //connectionManager.deleteClosedConnections(); - connectionManager.closeIdleConnections(0); - } - - @Override - public ApacheHttp31SLR get(String url) throws IOException { - -// if (LOGGER.isLoggable(Level.FINEST)) { -// LOGGER.finest("Connections: " + connectionManager.getConnectionsInPool(hostConfiguration)); -// } - - return new ApacheHttp31SLR(http, url); - } - /* (non-Javadoc) - * @see org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory#setProxyHostPort(java.lang.String) - */ - @Override - public void setProxyHostPort(String hostPort) { - int colonIdx = hostPort.indexOf(':'); - if(colonIdx > 0) { - String host = hostPort.substring(0,colonIdx); - int port = 
Integer.valueOf(hostPort.substring(colonIdx+1)); - -// http.getHostConfiguration().setProxy(host, port); - hostConfiguration.setProxy(host, port); - } - } - /* (non-Javadoc) - * @see org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory#setMaxTotalConnections(int) - */ - @Override - public void setMaxTotalConnections(int maxTotalConnections) { - connectionManager.getParams(). - setMaxTotalConnections(maxTotalConnections); - } - /* (non-Javadoc) - * @see org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory#getMaxTotalConnections() - */ - @Override - public int getMaxTotalConnections() { - return connectionManager.getParams().getMaxTotalConnections(); - } - - /* (non-Javadoc) - * @see org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory#setMaxHostConnections(int) - */ - @Override - public void setMaxHostConnections(int maxHostConnections) { - connectionManager.getParams().setDefaultMaxConnectionsPerHost(maxHostConnections); - connectionManager.getParams().setMaxConnectionsPerHost(hostConfiguration, maxHostConnections); - } - - /* (non-Javadoc) - * @see org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory#getMaxHostConnections() - */ - @Override - public int getMaxHostConnections() { - return connectionManager.getParams(). 
- getMaxConnectionsPerHost(hostConfiguration); - } - - /* (non-Javadoc) - * @see org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory#getConnectionTimeoutMS() - */ - @Override - public int getConnectionTimeoutMS() { - return connectionManager.getParams().getConnectionTimeout(); - } - - /* (non-Javadoc) - * @see org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory#setConnectionTimeoutMS(int) - */ - @Override - public void setConnectionTimeoutMS(int connectionTimeoutMS) { - connectionManager.getParams().setConnectionTimeout(connectionTimeoutMS); - http.getParams().setConnectionManagerTimeout(connectionTimeoutMS); - } - - /* (non-Javadoc) - * @see org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory#getSocketTimeoutMS() - */ - @Override - public int getSocketTimeoutMS() { - return connectionManager.getParams().getSoTimeout(); - } - - /* (non-Javadoc) - * @see org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory#setSocketTimeoutMS(int) - */ - @Override - public void setSocketTimeoutMS(int socketTimeoutMS) { - connectionManager.getParams().setSoTimeout(socketTimeoutMS); - } - - /* (non-Javadoc) - * @see org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory#setStaleChecking(boolean) - */ - @Override - public void setStaleChecking(boolean enabled) - { - connectionManager.getParams().setStaleCheckingEnabled(enabled); - } - - /* (non-Javadoc) - * @see org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory#isStaleChecking() - */ - @Override - public boolean isStaleChecking() - { - return connectionManager.getParams().isStaleCheckingEnabled(); - } - - // Experimental - /* (non-Javadoc) - * @see org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory#getModTime() - */ - @Override - public long getModTime() - { - HTTPSeekableLineReader reader = null; - SimpleDateFormat lastModFormat = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss zzz", Locale.ENGLISH); - - try { - reader = get(); - String result = 
reader.getHeaderValue(HTTPSeekableLineReader.LAST_MODIFIED); - Date date = lastModFormat.parse(result); - return date.getTime(); - - } catch (Exception e) { - e.printStackTrace(); - } finally { - if (reader != null) { - try { - reader.close(); - } catch (IOException e) { - - } - } - } - - return 0; - } - - @Override - public void setNumRetries(int numRetries) { - http.getParams().setParameter(HttpClientParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler(numRetries, true)); - } -} diff --git a/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java b/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java index 7988cb2b..005e2c49 100644 --- a/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java +++ b/src/test/java/org/archive/io/HeaderedArchiveRecordTest.java @@ -26,7 +26,7 @@ import java.util.Map; import java.util.Set; -import org.apache.commons.httpclient.Header; +import org.archive.format.http.HttpHeader; import org.archive.io.arc.ARCRecord; import org.archive.io.warc.WARCRecord; import org.junit.jupiter.api.Test; @@ -188,12 +188,12 @@ public void testEasierParseHttpHeadersInARC() throws IOException { assertEquals(har.getHeader().getUrl(), url, "failed to retrieve Url from metadata"); } - private void assertHeaderCorrectlyParsed(Header[] headers) { + private void assertHeaderCorrectlyParsed(HttpHeader[] headers) { final List orgHeaders = Arrays.asList(HTTPHEADER.split("\r\n")); assertEquals(orgHeaders.size(), headers.length + 1, "not all HTTP header entries have been retrieved"); - for (Header header : headers) { + for (HttpHeader header : headers) { assertTrue(orgHeaders.contains(header.getName() + ": " + header.getValue())); } diff --git a/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java b/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java index dc000265..19b1984f 100644 --- a/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java +++ b/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java @@ -2,8 +2,6 @@ 
import java.net.URISyntaxException; -import org.apache.commons.httpclient.URIException; - import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.assertEquals; diff --git a/src/test/java/org/archive/url/URLParserTest.java b/src/test/java/org/archive/url/URLParserTest.java index ff99fe38..bc8fc3a5 100644 --- a/src/test/java/org/archive/url/URLParserTest.java +++ b/src/test/java/org/archive/url/URLParserTest.java @@ -4,8 +4,6 @@ import java.net.URISyntaxException; import java.net.URLDecoder; -import org.apache.commons.httpclient.URIException; - import com.google.common.net.InetAddresses; import org.junit.jupiter.api.Test; diff --git a/src/test/java/org/archive/url/URLRegexTransformerTest.java b/src/test/java/org/archive/url/URLRegexTransformerTest.java index 01e97aac..73c43f96 100644 --- a/src/test/java/org/archive/url/URLRegexTransformerTest.java +++ b/src/test/java/org/archive/url/URLRegexTransformerTest.java @@ -1,8 +1,6 @@ package org.archive.url; -import org.apache.commons.httpclient.URIException; - import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.assertEquals; diff --git a/src/test/java/org/archive/url/UsableURIFactoryTest.java b/src/test/java/org/archive/url/UsableURIFactoryTest.java index 368cc93d..85d423c0 100644 --- a/src/test/java/org/archive/url/UsableURIFactoryTest.java +++ b/src/test/java/org/archive/url/UsableURIFactoryTest.java @@ -21,7 +21,6 @@ import java.util.TreeMap; -import org.apache.commons.httpclient.URIException; import org.apache.commons.lang.SerializationUtils; import org.junit.jupiter.api.Test; diff --git a/src/test/java/org/archive/url/UsableURITest.java b/src/test/java/org/archive/url/UsableURITest.java index 9a4c1860..161e215a 100644 --- a/src/test/java/org/archive/url/UsableURITest.java +++ b/src/test/java/org/archive/url/UsableURITest.java @@ -20,8 +20,6 @@ import java.net.URISyntaxException; -import org.apache.commons.httpclient.URIException; - import 
org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.*;