diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 00000000..209bb31e
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,11 @@
+version: 2
+updates:
+ - package-ecosystem: "maven"
+ directory: "/"
+ open-pull-requests-limit: 10
+ schedule:
+ interval: "monthly"
+ - package-ecosystem: "github-actions"
+ directory: "/"
+ schedule:
+ interval: "monthly"
diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml
new file mode 100644
index 00000000..64b395c2
--- /dev/null
+++ b/.github/workflows/maven.yml
@@ -0,0 +1,37 @@
+name: Java CI with Maven
+
+permissions:
+ contents: read
+
+on:
+ push:
+ branches: [ "master" ]
+ pull_request:
+ branches: [ "master" ]
+
+jobs:
+ build:
+ strategy:
+ matrix:
+ jdk: [8, 11, 17, 21, 25]
+
+ runs-on: ubuntu-latest
+ timeout-minutes: 30
+
+ steps:
+ - uses: actions/checkout@v6
+ - name: Set up JDK ${{ matrix.jdk }}
+ uses: actions/setup-java@v5
+ with:
+ java-version: ${{ matrix.jdk }}
+ distribution: 'temurin'
+ cache: maven
+ - name: Cache local Maven repository
+ uses: actions/cache@v5
+ with:
+ path: ~/.m2/repository
+ key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
+ restore-keys: |
+ ${{ runner.os }}-maven-
+ - name: Build with Maven
+ run: mvn -B verify --file pom.xml
diff --git a/.gitignore b/.gitignore
index fc8f67e9..feee77d8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+.idea
*.pydevproject
.project
.metadata
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index 54daf83b..00000000
--- a/.travis.yml
+++ /dev/null
@@ -1,31 +0,0 @@
-dist: trusty
-language: java
-# sudo required for OpenJDK7 support per:
-# https://github.com/travis-ci/travis-ci/issues/7884#issuecomment-309689557
-sudo: required
-
-jdk:
- - openjdk7
- - oraclejdk8
- - openjdk8
-
-before_install:
- - "git clone https://github.com/iipc/travis.git target/travis"
-
-before_script:
- - "export JAVA_OPTS=-Xmx1024m"
- - "export MAVEN_OPTS=-Xmx512m"
- - "ulimit -u 2048"
-
-script:
- - mvn install -B -V
-
-# whitelist in the master branch only
-branches:
- only:
- - master
-
-env:
- global:
- - secure: "qDKjVdoe4Qcz4WfXiQydU7tyl51T62FUJrjqu4FUPBcgeQhFQiggwhpaE6xCOzOpxbsuBi2R1c8gMQf5esE5iDL5jZMu+kz++dYbuzMTd13ttvZWMW5wRPH0H8iHk609FP/RDtVKKBr7WO0JvvIAZEhWNHZrLXBrrKgdTey171g="
- - secure: "FXGBKJNP9X7ePJfS4eYTZtoFo4RT1sxor34XxncSJr7uV6ggtZb4B4WNd16IlLcDk6E32sx8YoWdltaOGwQ5Vg/kux5Ko/wKZCoccS018Ln1bRT86dD1KoPY34rGoNJVQxe7J/1MPqpBKwmi2XCKfzpsEh3W7bbIqg8w9MEOOZA="
diff --git a/CHANGES.md b/CHANGES.md
index bf985ada..18fb8290 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,9 +1,260 @@
-1.1.10
-------
+Unreleased
+----------
+
+3.0.4 (2026-06-02)
+------------------
+
+### Fixes
+
+* WAT extractor no longer fails on metadata records without WARC-Target-URI [#150](https://github.com/iipc/webarchive-commons/pull/150)
+
+### Dependency upgrades
+
+* **commons-codec**: 1.21.0 → 1.22.0
+* **commons-io**: 2.21.0 → 2.22.0
+* **guava**: 33.5.0-jre → 33.6.0-jre
+* **hadoop** (hadoop-common, hadoop-mapreduce-client-core): 3.4.2 → 3.4.3
+
+3.0.3 (2026-02-06)
+------------------
+
+### Dependency upgrades
+
+* **commons-codec**: 1.20.0 → 1.21.0
+* **commons-lang3**: 3.19.0 → 3.20.0
+* **json**: 20250517 → 20251224
+* **junit-jupiter**: 5.14.1 → 5.14.2
+
+3.0.2 (2025-11-14)
+------------------
+
+### Fixes
+
+- Avoid relying on the default locale or charset. [#128](https://github.com/iipc/webarchive-commons/pull/128)
+- BasicURLCanonicalizer: more efficient normalization of dots in host names. [#129](https://github.com/iipc/webarchive-commons/pull/129)
+
+### Dependency upgrades
+
+* **commons-cli**: 1.10.0 → 1.11.0
+* **commons-codec**: 1.19.0 → 1.20.0
+* **commons-io**: 2.20.0 → 2.21.0
+* **junit-jupiter**: 5.13.3 → 5.14.1
+* **maven-release-plugin**: 3.1.1 → 3.2.0
+
+3.0.1 (2025-10-27)
+------------------
+
+### Fixes
+
+* Fixed a file handle leak in `FileUtils.pagedLines()` and `FileUtils.appendTo()` that could occur during I/O errors.
+
+### Dependency upgrades
+
+* **commons-codec**: 1.18.0 → 1.19.0
+* **commons-lang3**: 3.18.0 → 3.19.0
+* **commons-cli**: 1.9.0 → 1.10.0
+* **guava**: 33.4.8-jre → 33.5.0-jre
+* **hadoop**: 3.4.1 → 3.4.2
+* **pig**: 0.17.0 → 0.18.0
+
+3.0.0 (2025-07-21)
+------------------
+
+### Changes
+
+`FileUtils.pagedLines()` and `FileUtils.expandRange()` now return the Apache Commons Lang 3 version of `LongRange`.
+Users of these methods may need to make the following changes:
+
+| Old | New |
+|-------------------------------------------------|---------------------------------------------|
+| `import org.apache.commons.lang.math.LongRange` | `import org.apache.commons.lang3.LongRange` |
+| `new LongRange(min, max)` | `LongRange.of(min, max)` |
+| `longRange.getMaximumLong()` | `longRange.getMaximum()` |
+| `longRange.getMinimumLong()` | `longRange.getMinimum()` |
+
+### Dependency upgrades
+
+- **commons-io**: 2.19.0 → 2.20.0
+- **commons-lang**: 2.6 → 3.18.0
+
+2.0.2 (2025-07-15)
+------------------
+
+### Fixes
+
+* Fixes for `org.archive.net.PublicSuffixes` [#110](https://github.com/iipc/webarchive-commons/pull/110)
+ * Updated to the latest version of the public suffix list.
+ * Fixed parsing failures with newer list versions.
+ * Moved `effective_tld_names.dat` to `org/archive/effective_tld_names.dat` to prevent conflict with `crawler-commons`.
+
+2.0.1 (2025-05-21)
+------------------
+
+### Changes
+
+* Re-added `Reporter.shortReportLineTo(PrintWriter)` as it turned out to be important to Heritrix.
+
+
+2.0.0 (2025-05-21)
+------------------
+
+### New features
+
+- Added `RecordingInputStream.asOutputStream()` for direct writing of recorded data without an input stream. [#108](https://github.com/iipc/webarchive-commons/pull/108)
+
+### Removals
+
+#### Removed Apache HttpClient 3.1
+
+`HTTPSeekableLineReaderFactory` and `ZipNumBlockLoader` now default to HttpClient 4.3.
+
+| Removed | Replacement |
+|-----------------------------------------------------------|--------------------------------------|
+| `org.apache.commons.httpclient.URIException` | `org.archive.url.URIException` |
+| `org.apache.commons.httpclient.Header` | `org.archive.format.http.HttpHeader` |
+| `org.archive.httpclient.HttpRecorderGetMethod` | |
+| `org.archive.httpclient.HttpRecorderMethod` | |
+| `org.archive.httpclient.HttpRecorderPostMethod` | |
+| `org.archive.httpclient.SingleHttpConnectionManager` | |
+| `org.archive.httpclient.ThreadLocalHttpConnectionManager` | |
+
+#### Removed deprecated versions of renamed classes
+
+| Removed | Replacement |
+|-----------------------------------------------|--------------------------------------------------|
+| `org.archive.io.ArchiveFileConstants` | `org.archive.format.ArchiveFileConstants` |
+| `org.archive.io.GzipHeader` | `org.archive.util.zip.GzipHeader` |
+| `org.archive.io.GZIPMembersInputStream` | `org.archive.util.zip.GZIPMembersInputStream` |
+| `org.archive.io.NoGzipMagicException` | `org.archive.util.zip.NoGzipMagicException` |
+| `org.archive.io.arc.ARCConstants` | `org.archive.format.arc.ARCConstants` |
+| `org.archive.io.warc.WARCConstants` | `org.archive.format.warc.WARCConstants` |
+| `org.archive.url.DefaultIACanonicalizerRules` | `org.archive.url.AggressiveIACanonicalizerRules` |
+| `org.archive.url.DefaultIAURLCanonicalizer` | `org.archive.url.AggressiveIAURLCanonicalizer` |
+| `org.archive.url.GoogleURLCanonicalizer` | `org.archive.url.BasicURLCanonicalizer` |
+
+#### Removed deprecated methods
+
+| Removed | Replacement |
+|-----------------------------------------------|-------------------------------------------|
+| `ANVLRecord(int)` | `ANVLRecord()` |
+| `DevUtils.betterPrintStack(RuntimeException)` | `Throwable.printStackTrace()` |
+| `Recorder.getReplayCharSequence()` | `Recorder.getContentReplayCharSequence()` |
+| `Reporter.shortReportLineTo(PrintWriter)` | `Reporter.reportTo(PrintWriter)` |
+
+##### Removed usages of constant interfaces
+
+Static imports should be used instead.
+
+* `ArchiveFileConstants` is no longer implemented by:
+ * `ArchiveReader`
+ * `ArchiveReaderFactory`
+ * `WARCWriter`
+ * `WriterPool`
+ * `WriterPoolMember`
+* `ARCConstants` is no longer implemented by:
+ * `ARCReader`
+ * `ARCReaderFactory`
+ * `ARCRecord`
+ * `ARCRecordMetaData`
+ * `ARCUtils`
+ * `ARCWriter`
+* `WARCConstants` is no longer implemented by:
+ * `WARCReader`
+ * `WARCReaderFactory`
+ * `WARCRecord`
+ * `WARCWriter`
+
+### Dependency upgrades
+
+- **commons-io**: 2.18.0 → 2.19.0
+- **guava**: 33.3.1-jre → 33.4.8-jre
+- **json**: 20240303 → 20250517
+- **junit**: 4.13.2 → 5.12.2
+
+1.3.0 (2024-12-20)
+------------------
+
+#### URL Canonicalization Changed
+
+The output of WaybackURLKeyMaker and other canonicalizers based on BasicURLCanonicalizer has changed for URLs that
+contain non-UTF-8 percent-encoded sequences. For example, when a URL contains "%C3%23" it will now be normalised to
+"%c3%23" whereas previous releases produced "%25c3%23". This change brings webarchive-commons more in line with pywb,
+surt (Python), warcio.js and RFC 3986. While CDX file compatibility with these newer tools should improve, note that CDX
+files generated by the new release which contain such URLs may not work correctly with existing versions of
+OpenWayback that use the older webarchive-commons. [#102](https://github.com/iipc/webarchive-commons/pull/102)
+
+#### Bug fixes
+
+* WAT: Duplicated payload metadata values for "Actual-Content-Length" and "Trailing-Slop-Length" [#103](https://github.com/iipc/webarchive-commons/pull/103)
+* ObjectPlusFilesOutputStream.hardlinkOrCopy now uses `Files.createLink()` instead of executing `ln`. This
+ prevents the potential for security vulnerabilities from command line option injection and improves portability.
+
+#### Dependency upgrades
+
+* fastutil removed
+* dsiutils removed
+
+#### Deprecations
+
+The following classes and enum members have been marked deprecated as a step towards removal of the dependency on
+Apache Commons HttpClient 3.1.
+
+* org.archive.httpclient.HttpRecorderGetMethod
+* org.archive.httpclient.HttpRecorderMethod
+* org.archive.httpclient.HttpRecorderPostMethod
+* org.archive.httpclient.SingleHttpConnectionManager
+* org.archive.httpclient.ThreadLocalHttpConnectionManager
+* org.archive.util.binsearch.impl.http.ApacheHttp31SLR
+* org.archive.util.binsearch.impl.http.ApacheHttp31SLRFactory
+* org.archive.util.binsearch.impl.http.HTTPSeekableLineReaderFactory.HttpLibs.APACHE_31
+
+1.2.0 (2024-11-29)
+------------------
+
+#### New features
+
+* MetaData is now multivalued to support repeated WARC and HTTP headers. [#98](https://github.com/iipc/webarchive-commons/pull/98/files)
+
+#### Dependency upgrades
+
+* commons-io 2.18.0
+* commons-lang 2.6
+* guava 33.3.1-jre
+* hadoop 3.4.1
+* htmlparser 2.1
+* httpcore 4.4.16
+* json 20240303
+* junit 4.13.2
+
+1.1.11 (2024-11-27)
+-------------------
+
+#### Bug fixes
+
+* Fixed URLParser and WaybackURLKeyMaker failing on URLs with IPv6 address hostnames [#100](https://github.com/iipc/webarchive-commons/pull/100)
+
+1.1.10 (2024-10-15)
+-------------------
+
+#### Bug fixes
+
* [WAT extractor: do not fail on missing WARC-Filename in warcinfo record](https://github.com/iipc/webarchive-commons/pull/89)
* [ExtractingParseObserver: extract rel, hreflang and type attributes](https://github.com/iipc/webarchive-commons/pull/86)
* [ExtractingParseObserver: extract links from onClick attributes](https://github.com/iipc/webarchive-commons/pull/85)
-* [Update TravisCI config](https://github.com/iipc/webarchive-commons/pull/83)
+
+#### Dependency upgrades
+
+* commons-collections 3.2.2
+* commons-io 2.7
+* dsiutils 2.2.8
+* guava 33.3.0-jre
+* hadoop 3.4.0 (now optional)
+* pig 0.17.0
+* org.json 20231013
+
+#### Dependency Removals
+
+* joda-time (was unused)
1.1.9
-----
diff --git a/README.md b/README.md
index 72858a52..55be6e68 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,6 @@
IIPC Web Archive Commons
========================
-
-[](https://travis-ci.org/iipc/webarchive-commons/)
+[Maven Central](https://maven-badges.herokuapp.com/maven-central/org.netpreserve.commons/webarchive-commons) [Javadoc](https://www.javadoc.io/doc/org.netpreserve.commons/webarchive-commons)
This repository contains common utility code for [OpenWayback][1] and other projects.
diff --git a/pom-cdh4.xml b/pom-cdh4.xml
deleted file mode 100644
index de19d8d0..00000000
--- a/pom-cdh4.xml
+++ /dev/null
@@ -1,229 +0,0 @@
-
+ * See WARCWriterProcessor
*/
public static final String HEADER_KEY_REFERS_TO_TARGET_URI = "WARC-Refers-To-Target-URI";
public static final String HEADER_KEY_REFERS_TO_DATE = "WARC-Refers-To-Date";
diff --git a/src/main/java/org/archive/hadoop/ArchiveJSONViewLoader.java b/src/main/java/org/archive/hadoop/ArchiveJSONViewLoader.java
index e92ed7e1..d31e31c9 100644
--- a/src/main/java/org/archive/hadoop/ArchiveJSONViewLoader.java
+++ b/src/main/java/org/archive/hadoop/ArchiveJSONViewLoader.java
@@ -6,7 +6,7 @@
import java.util.logging.Level;
import java.util.logging.Logger;
-import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang3.StringUtils;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
diff --git a/src/main/java/org/archive/hadoop/ArchiveMetadataLoader.java b/src/main/java/org/archive/hadoop/ArchiveMetadataLoader.java
index 37c8af99..a3cbb26c 100644
--- a/src/main/java/org/archive/hadoop/ArchiveMetadataLoader.java
+++ b/src/main/java/org/archive/hadoop/ArchiveMetadataLoader.java
@@ -2,6 +2,7 @@
import java.io.IOException;
import java.util.ArrayList;
+import java.util.Locale;
import java.util.logging.Logger;
import org.apache.hadoop.mapreduce.InputFormat;
@@ -54,7 +55,7 @@ public Tuple getNext() throws IOException {
try {
key = reader.getCurrentKey();
- LOG.info(String.format("Loaded key-offset %d\n", key.offset));
+ LOG.info(String.format(Locale.ROOT, "Loaded key-offset %d\n", key.offset));
value = reader.getCurrentValue();
} catch (InterruptedException e) {
// is this needed and the right way?
diff --git a/src/main/java/org/archive/hadoop/FilenameInputFormat.java b/src/main/java/org/archive/hadoop/FilenameInputFormat.java
index 5893afb1..3f41cdee 100644
--- a/src/main/java/org/archive/hadoop/FilenameInputFormat.java
+++ b/src/main/java/org/archive/hadoop/FilenameInputFormat.java
@@ -17,7 +17,6 @@
package org.archive.hadoop;
import java.io.*;
-import java.util.*;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
diff --git a/src/main/java/org/archive/hadoop/PerMapOutputFormat.java b/src/main/java/org/archive/hadoop/PerMapOutputFormat.java
index 28ebca73..684202bb 100644
--- a/src/main/java/org/archive/hadoop/PerMapOutputFormat.java
+++ b/src/main/java/org/archive/hadoop/PerMapOutputFormat.java
@@ -17,7 +17,6 @@
package org.archive.hadoop;
import java.io.*;
-import java.util.*;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
diff --git a/src/main/java/org/archive/hadoop/ResourceRecordReader.java b/src/main/java/org/archive/hadoop/ResourceRecordReader.java
index 06d3ce2e..88b93dd2 100644
--- a/src/main/java/org/archive/hadoop/ResourceRecordReader.java
+++ b/src/main/java/org/archive/hadoop/ResourceRecordReader.java
@@ -1,6 +1,7 @@
package org.archive.hadoop;
import java.io.IOException;
+import java.util.Locale;
import java.util.logging.Logger;
import org.apache.hadoop.fs.FSDataInputStream;
@@ -111,7 +112,7 @@ public boolean nextKeyValue() throws IOException, InterruptedException {
if(r != null) {
StreamCopy.readToEOF(r.getInputStream());
- LOG.info(String.format("Extracted offset %d\n",
+ LOG.info(String.format(Locale.ROOT, "Extracted offset %d\n",
series.getCurrentMemberStartOffset()));
cachedK = new ResourceContext(name,
series.getCurrentMemberStartOffset());
@@ -121,7 +122,7 @@ public boolean nextKeyValue() throws IOException, InterruptedException {
} catch (ResourceParseException e) {
e.printStackTrace();
throw new IOException(
- String.format("ResourceParseException at(%s)(%d)",
+ String.format(Locale.ROOT, "ResourceParseException at(%s)(%d)",
name,series.getCurrentMemberStartOffset()),
e);
}
diff --git a/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java
deleted file mode 100644
index ef241b48..00000000
--- a/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * This file is part of the Heritrix web crawler (crawler.archive.org).
- *
- * Licensed to the Internet Archive (IA) by one or more individual
- * contributors.
- *
- * The IA licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.archive.httpclient;
-
-import java.io.IOException;
-import java.util.logging.Logger;
-
-import org.apache.commons.httpclient.HttpConnection;
-import org.apache.commons.httpclient.HttpException;
-import org.apache.commons.httpclient.HttpState;
-import org.apache.commons.httpclient.methods.GetMethod;
-import org.archive.util.Recorder;
-
-
-/**
- * Override of GetMethod that marks the passed HttpRecorder w/ the transition
- * from HTTP head to body and that forces a close on the http connection.
- *
- * The actions done in this subclass used to be done by copying
- * org.apache.commons.HttpMethodBase, overlaying our version in place of the
- * one that came w/ httpclient. Here is the patch of the difference between
- * shipped httpclient code and our mods:
- * We're not supposed to have access to the underlying connection object;
- * am only violating contract because see cases where httpclient is skipping
- * out w/o cleaning up after itself.
- *
- * @author stack
- * @version $Revision$, $Date$
- */
-public class HttpRecorderGetMethod extends GetMethod {
-
- protected static Logger logger =
- Logger.getLogger(HttpRecorderGetMethod.class.getName());
-
- /**
- * Instance of http recorder method.
- */
- protected HttpRecorderMethod httpRecorderMethod = null;
-
-
- public HttpRecorderGetMethod(String uri, Recorder recorder) {
- super(uri);
- this.httpRecorderMethod = new HttpRecorderMethod(recorder);
- }
-
- protected void readResponseBody(HttpState state, HttpConnection connection)
- throws IOException, HttpException {
- // We're about to read the body. Mark transition in http recorder.
- this.httpRecorderMethod.markContentBegin(connection);
- super.readResponseBody(state, connection);
- }
-
- protected boolean shouldCloseConnection(HttpConnection conn) {
- // Always close connection after each request. As best I can tell, this
- // is superfluous -- we've set our client to be HTTP/1.0. Doing this
- // out of paranoia.
- return true;
- }
-
- public int execute(HttpState state, HttpConnection conn)
- throws HttpException, IOException {
- // Save off the connection so we can close it on our way out in case
- // httpclient fails to (We're not supposed to have access to the
- // underlying connection object; am only violating contract because
- // see cases where httpclient is skipping out w/o cleaning up
- // after itself).
- this.httpRecorderMethod.setConnection(conn);
- return super.execute(state, conn);
- }
-
- protected void addProxyConnectionHeader(HttpState state, HttpConnection conn)
- throws IOException, HttpException {
- super.addProxyConnectionHeader(state, conn);
- this.httpRecorderMethod.handleAddProxyConnectionHeader(this);
- }
-
- // XXX see https://webarchive.jira.com/browse/HER-2059
- // We never call this method with the implied question mark prepended, so
- // adding it does the trick, since commons-httpclient will strip it later.
- public void setQueryString(String queryString) {
- if (queryString != null) {
- super.setQueryString('?' + queryString);
- } else {
- super.setQueryString(queryString);
- }
- }
-
-}
diff --git a/src/main/java/org/archive/httpclient/HttpRecorderMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderMethod.java
deleted file mode 100644
index 932e7e98..00000000
--- a/src/main/java/org/archive/httpclient/HttpRecorderMethod.java
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * This file is part of the Heritrix web crawler (crawler.archive.org).
- *
- * Licensed to the Internet Archive (IA) by one or more individual
- * contributors.
- *
- * The IA licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.archive.httpclient;
-
-import java.util.logging.Logger;
-
-import org.apache.commons.httpclient.Header;
-import org.apache.commons.httpclient.HttpConnection;
-import org.apache.commons.httpclient.HttpMethod;
-import org.archive.util.Recorder;
-
-
-/**
- * This class encapsulates the specializations supplied by the
- * overrides {@link HttpRecorderGetMethod} and {@link HttpRecorderPostMethod}.
- *
- * It keeps instance of HttpRecorder and HttpConnection.
- *
- * @author stack
- * @version $Revision$, $Date$
- */
-public class HttpRecorderMethod {
- protected static Logger logger =
- Logger.getLogger(HttpRecorderMethod.class.getName());
-
- /**
- * Instance of http recorder we're using recording this http get.
- */
- private Recorder httpRecorder = null;
-
- /**
- * Save around so can force close.
- *
- * See [ 922080 ] IllegalArgumentException (size is wrong).
- * https://sourceforge.net/tracker/?func=detail&aid=922080&group_id=73833&atid=539099
- */
- private HttpConnection connection = null;
-
-
- public HttpRecorderMethod(Recorder recorder) {
- this.httpRecorder = recorder;
- }
-
- public void markContentBegin(HttpConnection c) {
- if (c != this.connection) {
- // We're checking that we're not being asked to work on
- // a connection that is other than the one we started
- // this method#execute with.
- throw new IllegalArgumentException("Connections differ: " +
- this.connection + " " + c + " " +
- Thread.currentThread().getName());
- }
- this.httpRecorder.markContentBegin();
- }
-
- /**
- * @return Returns the connection.
- */
- public HttpConnection getConnection() {
- return this.connection;
- }
-
- /**
- * @param connection The connection to set.
- */
- public void setConnection(HttpConnection connection) {
- this.connection = connection;
- }
- /**
- * @return Returns the httpRecorder.
- */
- public Recorder getHttpRecorder() {
- return httpRecorder;
- }
-
- /**
- * If a 'Proxy-Connection' header has been added to the request,
- * it'll be of a 'keep-alive' type. Until we support 'keep-alives',
- * override the Proxy-Connection setting and instead pass a 'close'
- * (Otherwise every request has to timeout before we notice
- * end-of-document).
- * @param method Method to find proxy-connection header in.
- */
- public void handleAddProxyConnectionHeader(HttpMethod method) {
- Header h = method.getRequestHeader("Proxy-Connection");
- if (h != null) {
- h.setValue("close");
- method.setRequestHeader(h);
- }
- }
-}
diff --git a/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java
deleted file mode 100644
index 20f1bfd1..00000000
--- a/src/main/java/org/archive/httpclient/HttpRecorderPostMethod.java
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * This file is part of the Heritrix web crawler (crawler.archive.org).
- *
- * Licensed to the Internet Archive (IA) by one or more individual
- * contributors.
- *
- * The IA licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.archive.httpclient;
-
-import java.io.IOException;
-
-import org.apache.commons.httpclient.HttpConnection;
-import org.apache.commons.httpclient.HttpException;
-import org.apache.commons.httpclient.HttpState;
-import org.apache.commons.httpclient.methods.PostMethod;
-import org.archive.util.Recorder;
-
-
-/**
- * Override of PostMethod that marks the passed HttpRecorder w/ the transition
- * from HTTP head to body and that forces a close on the responseConnection.
- *
- * This is a copy of {@link HttpRecorderGetMethod}. Only difference is the
- * parent subclass.
- *
- * @author stack
- * @version $Date$ $Revision$
- */
-public class HttpRecorderPostMethod extends PostMethod {
- /**
- * Instance of http recorder method.
- */
- protected HttpRecorderMethod httpRecorderMethod = null;
-
-
- public HttpRecorderPostMethod(String uri, Recorder recorder) {
- super(uri);
- this.httpRecorderMethod = new HttpRecorderMethod(recorder);
- }
-
- protected void readResponseBody(HttpState state, HttpConnection connection)
- throws IOException, HttpException {
- // We're about to read the body. Mark transition in http recorder.
- this.httpRecorderMethod.markContentBegin(connection);
- super.readResponseBody(state, connection);
- }
-
- protected boolean shouldCloseConnection(HttpConnection conn) {
- // Always close connection after each request. As best I can tell, this
- // is superfluous -- we've set our client to be HTTP/1.0. Doing this
- // out of paranoia.
- return true;
- }
-
- public int execute(HttpState state, HttpConnection conn)
- throws HttpException, IOException {
- // Save off the connection so we can close it on our way out in case
- // httpclient fails to (We're not supposed to have access to the
- // underlying connection object; am only violating contract because
- // see cases where httpclient is skipping out w/o cleaning up
- // after itself).
- this.httpRecorderMethod.setConnection(conn);
- return super.execute(state, conn);
- }
-
- protected void addProxyConnectionHeader(HttpState state, HttpConnection conn)
- throws IOException, HttpException {
- super.addProxyConnectionHeader(state, conn);
- this.httpRecorderMethod.handleAddProxyConnectionHeader(this);
- }
-}
diff --git a/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java b/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java
deleted file mode 100644
index 4ba6a837..00000000
--- a/src/main/java/org/archive/httpclient/SingleHttpConnectionManager.java
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * This file is part of the Heritrix web crawler (crawler.archive.org).
- *
- * Licensed to the Internet Archive (IA) by one or more individual
- * contributors.
- *
- * The IA licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.archive.httpclient;
-
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.commons.httpclient.HostConfiguration;
-import org.apache.commons.httpclient.HttpConnection;
-import org.apache.commons.httpclient.SimpleHttpConnectionManager;
-
-/**
- * An HttpClient-compatible HttpConnection "manager" that actually
- * just gives out a new connection each time -- skipping the overhead
- * of connection management, since we already throttle our crawler
- * with external mechanisms.
- *
- * @author gojomo
- */
-public class SingleHttpConnectionManager extends SimpleHttpConnectionManager {
-
- public SingleHttpConnectionManager() {
- super();
- }
-
- public HttpConnection getConnectionWithTimeout(
- HostConfiguration hostConfiguration, long timeout) {
-
- HttpConnection conn = new HttpConnection(hostConfiguration);
- conn.setHttpConnectionManager(this);
- conn.getParams().setDefaults(this.getParams());
- return conn;
- }
-
- public void releaseConnection(HttpConnection conn) {
- // ensure connection is closed
- conn.close();
- finishLast(conn);
- }
-
- protected static void finishLast(HttpConnection conn) {
- // copied from superclass because it wasn't made available to subclasses
- InputStream lastResponse = conn.getLastResponseInputStream();
- if (lastResponse != null) {
- conn.setLastResponseInputStream(null);
- try {
- lastResponse.close();
- } catch (IOException ioe) {
- //FIXME: badness - close to force reconnect.
- conn.close();
- }
- }
- }
-}
diff --git a/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java b/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java
deleted file mode 100644
index 91e850ea..00000000
--- a/src/main/java/org/archive/httpclient/ThreadLocalHttpConnectionManager.java
+++ /dev/null
@@ -1,291 +0,0 @@
-/**
- * ====================================================================
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- * ====================================================================
- *
- */
-package org.archive.httpclient;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-
-import org.apache.commons.httpclient.HostConfiguration;
-import org.apache.commons.httpclient.HttpConnection;
-import org.apache.commons.httpclient.HttpConnectionManager;
-import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
-
-/**
- * A simple, but thread-safe HttpClient {@link HttpConnectionManager}.
- * Based on {@link org.apache.commons.httpclient.SimpleHttpConnectionManager}.
- *
- * Java >= 1.4 is recommended.
- *
- * @author Christian Kohlschuetter
- */
-public final class ThreadLocalHttpConnectionManager implements
- HttpConnectionManager {
-
- private static final CloserThread closer = new CloserThread();
- private static final Logger logger = Logger
- .getLogger(ThreadLocalHttpConnectionManager.class.getName());
-
- private final ThreadLocal Class that the passed HttpRecorder w/ boundary between
HTTP header and content. Also forces a close on the response on
call to releaseConnection. A protocol socket factory that allows setting of trust level on
construction. JavaTM Secure Socket Extension (JSSE): Reference Guide Call {@link #close()} on this class when done to clean up resources.
*
- * For small streams, use {@link InMemoryReplayCharSequence}.
- *
- * Call {@link close()} on this class when done to clean up resources.
- *
- * @contributor stack
- * @contributor nlevitt
+ * @author stack
+ * @author nlevitt
* @version $Revision$, $Date$
*/
public class GenericReplayCharSequence implements ReplayCharSequence {
@@ -67,9 +66,9 @@ public class GenericReplayCharSequence implements ReplayCharSequence {
* decodings. The name of the file that holds the decoding is the name
* of the backing file w/ this encoding for a suffix.
*
- * See Encoding.
+ * See Encoding.
*/
- public static final Charset WRITE_ENCODING = Charsets.UTF_16BE;
+ public static final Charset WRITE_ENCODING = StandardCharsets.UTF_16BE;
private static final long MAP_MAX_BYTES = 64 * 1024 * 1024; // 64M
@@ -170,8 +169,8 @@ private void updateMemoryMappedBuffer() {
long charLength = (long) this.length() - (long) prefixBuffer.limit(); // in characters
long mapSize = Math.min((charLength * bytesPerChar) - mapByteOffset, MAP_MAX_BYTES);
logger.fine("updateMemoryMappedBuffer: mapOffset="
- + NumberFormat.getInstance().format(mapByteOffset)
- + " mapSize=" + NumberFormat.getInstance().format(mapSize));
+ + NumberFormat.getInstance(Locale.ROOT).format(mapByteOffset)
+ + " mapSize=" + NumberFormat.getInstance(Locale.ROOT).format(mapSize));
try {
// TODO: stress-test without these possibly-costly requests!
// System.gc();
@@ -257,9 +256,9 @@ protected void decode(InputStream inStream, int prefixMax,
this.length = Ints.saturatedCast(count);
if(count>Integer.MAX_VALUE) {
logger.warning("input stream is longer than Integer.MAX_VALUE="
- + NumberFormat.getInstance().format(Integer.MAX_VALUE)
+ + NumberFormat.getInstance(Locale.ROOT).format(Integer.MAX_VALUE)
+ " characters -- only first "
- + NumberFormat.getInstance().format(Integer.MAX_VALUE)
+ + NumberFormat.getInstance(Locale.ROOT).format(Integer.MAX_VALUE)
+ " are accessible through this GenericReplayCharSequence");
}
diff --git a/src/main/java/org/archive/io/GzipHeader.java b/src/main/java/org/archive/io/GzipHeader.java
deleted file mode 100644
index 6b8263bc..00000000
--- a/src/main/java/org/archive/io/GzipHeader.java
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * This file is part of the Heritrix web crawler (crawler.archive.org).
- *
- * Licensed to the Internet Archive (IA) by one or more individual
- * contributors.
- *
- * The IA licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.archive.io;
-
-/**
- * @deprecated use {@link org.archive.util.zip.GzipHeader}
- */
-@Deprecated
-public class GzipHeader extends org.archive.util.zip.GzipHeader {
-}
diff --git a/src/main/java/org/archive/io/HeaderedArchiveRecord.java b/src/main/java/org/archive/io/HeaderedArchiveRecord.java
index 3cce595b..858edb4d 100644
--- a/src/main/java/org/archive/io/HeaderedArchiveRecord.java
+++ b/src/main/java/org/archive/io/HeaderedArchiveRecord.java
@@ -25,12 +25,11 @@
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Locale;
-import org.apache.commons.httpclient.Header;
-import org.apache.commons.httpclient.HttpParser;
-import org.apache.commons.httpclient.StatusLine;
-import org.apache.commons.httpclient.util.EncodingUtil;
-import org.archive.io.arc.ARCConstants;
+import org.archive.format.http.HttpHeader;
+import org.archive.format.arc.ARCConstants;
import org.archive.util.LaxHttpParser;
/**
@@ -59,7 +58,7 @@ public class HeaderedArchiveRecord extends ArchiveRecord {
*
* Only available after the reading of headers.
*/
- private Header [] contentHeaders = null;
+ private HttpHeader[] contentHeaders = null;
public HeaderedArchiveRecord(final ArchiveRecord ar) throws IOException {
@@ -147,27 +146,29 @@ private InputStream readContentHeaders() throws IOException {
int eolCharCount = getEolCharsCount(statusBytes);
if (eolCharCount <= 0) {
throw new IOException("Failed to read raw lie where one " +
- " was expected: " + new String(statusBytes));
+ " was expected: " + new String(statusBytes, ARCConstants.DEFAULT_ENCODING));
}
- String statusLine = EncodingUtil.getString(statusBytes, 0,
+ String statusLine = new String(statusBytes, 0,
statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING);
- if (statusLine == null) {
- throw new NullPointerException("Expected status line is null");
- }
+ statusLine = statusLine.trim();
// TODO: Tighten up this test.
- boolean isHttpResponse = StatusLine.startsWithHTTP(statusLine);
+ boolean isHttpResponse = statusLine.startsWith("HTTP");
boolean isHttpRequest = false;
if (!isHttpResponse) {
- isHttpRequest = statusLine.toUpperCase().startsWith("GET") ||
- !statusLine.toUpperCase().startsWith("POST");
+ isHttpRequest = statusLine.toUpperCase(Locale.ROOT).startsWith("GET") ||
+ !statusLine.toUpperCase(Locale.ROOT).startsWith("POST");
}
if (!isHttpResponse && !isHttpRequest) {
throw new UnexpectedStartLineIOException("Failed parse of " +
"status line: " + statusLine);
}
- this.statusCode = isHttpResponse?
- (new StatusLine(statusLine)).getStatusCode(): -1;
-
+
+ if (isHttpResponse) {
+ this.statusCode = parseStatusCode(statusLine);
+ } else {
+ this.statusCode = -1;
+ }
+
// Save off all bytes read. Keep them as bytes rather than
// convert to strings so we don't have to worry about encodings
// though this should never be a problem doing http headers since
@@ -183,7 +184,7 @@ private InputStream readContentHeaders() throws IOException {
eolCharCount = getEolCharsCount(lineBytes);
if (eolCharCount <= 0) {
throw new IOException("Failed reading headers: " +
- ((lineBytes != null)? new String(lineBytes): null));
+ ((lineBytes != null)? new String(lineBytes, StandardCharsets.ISO_8859_1): null));
}
// Save the bytes read.
baos.write(lineBytes);
@@ -210,7 +211,19 @@ private InputStream readContentHeaders() throws IOException {
bais.reset();
return bais;
}
-
+
+ /**
+ * Extracts the numeric status code from an HTTP status line such as
+ * "HTTP/1.1 200 OK". The code is taken to be the second
+ * whitespace-separated token of the line.
+ *
+ * @param statusLine status line to parse; must not be null
+ * @return the status code, or -1 if the line has no second token or that
+ * token is not a valid integer
+ */
+ public static int parseStatusCode(String statusLine) {
+ // Split on whitespace runs so repeated spaces or tabs between the
+ // protocol version and the code do not produce an empty token.
+ String[] tokens = statusLine.trim().split("\\s+", 3);
+ if (tokens.length < 2) return -1;
+ try {
+ return Integer.parseInt(tokens[1]);
+ } catch (NumberFormatException e) {
+ return -1;
+ }
+ }
+
public static class UnexpectedStartLineIOException
extends RecoverableIOException {
private static final long serialVersionUID = 1L;
@@ -252,7 +265,7 @@ public int getContentHeadersLength() {
return this.contentHeadersLength;
}
- public Header[] getContentHeaders() {
+ public HttpHeader[] getContentHeaders() {
return contentHeaders;
}
diff --git a/src/main/java/org/archive/io/MiserOutputStream.java b/src/main/java/org/archive/io/MiserOutputStream.java
index f10ac9ca..f29256fd 100644
--- a/src/main/java/org/archive/io/MiserOutputStream.java
+++ b/src/main/java/org/archive/io/MiserOutputStream.java
@@ -27,7 +27,7 @@
* A filter stream that both counts bytes written, and optionally swallows
* flush() requests.
*
- * @contributor gojomo
+ * @author gojomo
*/
public class MiserOutputStream extends FilterOutputStream {
protected long count;
diff --git a/src/main/java/org/archive/io/NoGzipMagicException.java b/src/main/java/org/archive/io/NoGzipMagicException.java
deleted file mode 100644
index 27d1058a..00000000
--- a/src/main/java/org/archive/io/NoGzipMagicException.java
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * This file is part of the Heritrix web crawler (crawler.archive.org).
- *
- * Licensed to the Internet Archive (IA) by one or more individual
- * contributors.
- *
- * The IA licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.archive.io;
-
-/**
- * @deprecated use {@link org.archive.util.zip.NoGzipMagicException}
- */
-@Deprecated
-public class NoGzipMagicException extends org.archive.util.zip.NoGzipMagicException {
-}
diff --git a/src/main/java/org/archive/io/ObjectPlusFilesOutputStream.java b/src/main/java/org/archive/io/ObjectPlusFilesOutputStream.java
index 224f24e7..bd5c1eea 100644
--- a/src/main/java/org/archive/io/ObjectPlusFilesOutputStream.java
+++ b/src/main/java/org/archive/io/ObjectPlusFilesOutputStream.java
@@ -18,10 +18,8 @@
*/
package org.archive.io;
-import java.io.File;
-import java.io.IOException;
-import java.io.ObjectOutputStream;
-import java.io.OutputStream;
+import java.io.*;
+import java.nio.file.Files;
import java.util.LinkedList;
import org.archive.util.FileUtils;
@@ -116,19 +114,10 @@ public void snapshotAppendOnlyFile(File file) throws IOException {
* @throws IOException
*/
private void hardlinkOrCopy(File file, File destination) throws IOException {
- // For Linux/UNIX, try a hard link first.
- Process link = Runtime.getRuntime().exec("ln "+file.getAbsolutePath()+" "+destination.getAbsolutePath());
- // TODO NTFS also supports hard links; add appropriate try
try {
- link.waitFor();
- } catch (InterruptedException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- if(link.exitValue()!=0) {
- // hard link failed
+ // Try a hard link first; it avoids duplicating the file's data.
+ Files.createLink(destination.toPath(), file.toPath());
+ } catch (UnsupportedOperationException | IOException e) {
+ // Hard links are unsupported here or the link could not be created:
+ // fall back to a full copy, as the old "ln" exit-code check did.
FileUtils.copyFile(file,destination);
}
}
-
}
diff --git a/src/main/java/org/archive/io/Preformatter.java b/src/main/java/org/archive/io/Preformatter.java
index dcd31bb6..c7099c51 100644
--- a/src/main/java/org/archive/io/Preformatter.java
+++ b/src/main/java/org/archive/io/Preformatter.java
@@ -24,7 +24,7 @@
* Interface indicating a logging Formatter can preformat a record (outside
* the standard-implementation synchronized block) and cache it, returning it
* for the next request for formatting from the same thread.
- * @contributor gojomo
+ * @author gojomo
*/
public interface Preformatter {
public void preformat(LogRecord record);
diff --git a/src/main/java/org/archive/io/RecordingInputStream.java b/src/main/java/org/archive/io/RecordingInputStream.java
index 36f539a9..3c9db61f 100644
--- a/src/main/java/org/archive/io/RecordingInputStream.java
+++ b/src/main/java/org/archive/io/RecordingInputStream.java
@@ -225,7 +225,7 @@ public void readToEndOfContent(long contentLength)
/**
* Read all of a stream (Or read until we timeout or have read to the max).
- * @param softMaxLength Maximum length to read; if zero or < 0, then no
+ * @param softMaxLength Maximum length to read; if zero or < 0, then no
* limit. If met, return normally.
* @throws IOException failed read.
* @throws RecorderLengthExceededException
@@ -383,12 +383,12 @@ public synchronized void mark(int readlimit) {
@Override
public boolean markSupported() {
- return this.in.markSupported();
+ return in != null && this.in.markSupported();
}
@Override
public synchronized void reset() throws IOException {
- this.in.reset();
+ if (in != null) this.in.reset();
this.recordingOutputStream.reset();
}
@@ -418,4 +418,13 @@ public void chopAtMessageBodyBegin() {
public void clearForReuse() throws IOException {
recordingOutputStream.clearForReuse();
}
+
+ /**
+ * Returns an OutputStream that can be used for recording input data. This is useful if the input comes in some
+ * form other than an InputStream. For example, if the input is provided by a callback periodically called with
+ * a chunk of data.
+ *
+ * @return the RecordingOutputStream held by this stream
+ */
+ public RecordingOutputStream asOutputStream() {
+ return this.recordingOutputStream;
+ }
}
diff --git a/src/main/java/org/archive/io/RecordingOutputStream.java b/src/main/java/org/archive/io/RecordingOutputStream.java
index 7d2ff212..6c77997b 100644
--- a/src/main/java/org/archive/io/RecordingOutputStream.java
+++ b/src/main/java/org/archive/io/RecordingOutputStream.java
@@ -19,8 +19,7 @@
package org.archive.io;
-import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
-
+import java.io.BufferedOutputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
@@ -207,7 +206,7 @@ public void open(OutputStream wrappedStream) throws IOException {
protected OutputStream ensureDiskStream() throws FileNotFoundException {
if (this.diskStream == null) {
FileOutputStream fis = new FileOutputStream(this.backingFilename);
- this.diskStream = new FastBufferedOutputStream(fis);
+ this.diskStream = new BufferedOutputStream(fis);
}
return this.diskStream;
}
diff --git a/src/main/java/org/archive/io/ReplayCharSequence.java b/src/main/java/org/archive/io/ReplayCharSequence.java
index aa9b9587..bd74f2f8 100644
--- a/src/main/java/org/archive/io/ReplayCharSequence.java
+++ b/src/main/java/org/archive/io/ReplayCharSequence.java
@@ -23,8 +23,7 @@
import java.io.IOException;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
-
-import com.google.common.base.Charsets;
+import java.nio.charset.StandardCharsets;
/**
@@ -40,7 +39,7 @@ public interface ReplayCharSequence extends CharSequence, Closeable {
/** charset to use in replay when declared value
* is absent/illegal/unavailable */
- public Charset FALLBACK_CHARSET = Charsets.ISO_8859_1; // TODO: should this be UTF-8?
+ public Charset FALLBACK_CHARSET = StandardCharsets.ISO_8859_1; // TODO: should this be UTF-8?
/**
* Call this method when done so implementation has chance to clean up
@@ -59,7 +58,7 @@ public interface ReplayCharSequence extends CharSequence, Closeable {
public long getDecodeExceptionCount();
/**
- * Return the first coding-exception encountered, if the count > 0.
+ * Return the first coding-exception encountered, if the count > 0.
* @return CharacterCodingException
*/
public CharacterCodingException getCodingException();
diff --git a/src/main/java/org/archive/io/ReplayInputStream.java b/src/main/java/org/archive/io/ReplayInputStream.java
index 35ea8175..60b0dc85 100644
--- a/src/main/java/org/archive/io/ReplayInputStream.java
+++ b/src/main/java/org/archive/io/ReplayInputStream.java
@@ -64,7 +64,7 @@ public class ReplayInputStream extends SeekInputStream
* @param size Size of data to replay.
* @param responseBodyStart Start of the response body.
* @param backingFilename Backing file that sits behind the buffer. If
- * This class will write until we hit >= maxSize. The check is done at
+ * This class will write until we hit >= maxSize. The check is done at
* record boundary. Records do not span ARC files. We will then close current
* file and open another and then continue writing.
*
@@ -95,9 +96,9 @@
* alexa
* ARC c-tools:
* While being written, WARCs have a '.open' suffix appended.
*
- * @contributor stack
+ * @author stack
* @version $Revision: 4604 $ $Date: 2006-09-05 22:38:18 -0700 (Tue, 05 Sep 2006) $
*/
-public class WARCWriter extends WriterPoolMember
-implements WARCConstants {
+public class WARCWriter extends WriterPoolMember {
public static final String TOTALS = "totals";
public static final String SIZE_ON_DISK = "sizeOnDisk";
public static final String TOTAL_BYTES = "totalBytes";
@@ -81,7 +83,7 @@ public class WARCWriter extends WriterPoolMember
/**
* Temporarily accumulates stats managed externally by
- * {@link WARCWriterProcessor}. WARCWriterProcessor will call
+ * WARCWriterProcessor. WARCWriterProcessor will call
* {@link #resetTmpStats()}, write some records, then add
* {@link #getTmpStats()} into its long-term running totals.
*/
@@ -97,9 +99,6 @@ public class WARCWriter extends WriterPoolMember
* @param serialNo used to generate unique file name sequences
* @param out Where to write.
* @param f File the Initial implementations of
- * This class differs from google in treatment of non-ascii input. Google's
+ * This class differs from Google in treatment of non-ascii input. Google's
* rules don't really address this except with one example test case, which
* seems to suggest taking raw input bytes and pct-encoding them byte for byte.
* Since the input to this class consists of java strings, not raw bytes, that
- * wouldn't be possible, even if deemed preferable. Instead
+ * wouldn't be possible, even if deemed preferable. Instead,
* BasicURLCanonicalizer expresses non-ascii characters pct-encoded UTF-8.
*/
public class BasicURLCanonicalizer implements URLCanonicalizer {
@@ -34,7 +36,9 @@ public class BasicURLCanonicalizer implements URLCanonicalizer {
.compile("^(0[0-7]*)(\\.[0-7]+)?(\\.[0-7]+)?(\\.[0-7]+)?$");
Pattern DECIMAL_IP = Pattern
.compile("^([1-9][0-9]*)(\\.[0-9]+)?(\\.[0-9]+)?(\\.[0-9]+)?$");
+ Pattern MULTIDOT = Pattern.compile("\\.{2,}");
+ @Override
public void canonicalize(HandyURL url) {
url.setHash(null);
url.setAuthUser(minimalEscape(url.getAuthUser()));
@@ -55,8 +59,7 @@ public void canonicalize(HandyURL url) {
host = hostE;
}
- host = host.replaceAll("^\\.+", "").replaceAll("\\.\\.+", ".")
- .replaceAll("\\.$", "");
+ host = normalizeDots(host);
}
String ip = null;
@@ -64,7 +67,7 @@ public void canonicalize(HandyURL url) {
if (ip != null) {
host = ip;
} else if (host != null) {
- host = escapeOnce(host.toLowerCase());
+ host = escapeOnce(host.toLowerCase(Locale.ROOT));
}
url.setHost(host);
// now the path:
@@ -74,6 +77,36 @@ public void canonicalize(HandyURL url) {
url.setPath(escapeOnce(normalizePath(path)));
}
+ /**
+ * Cleans up the dots of a host name: leading and trailing dots are
+ * dropped, and every run of two or more dots collapses into one.
+ *
+ * @param host host name to clean up; must not be null
+ * @return the host with no leading/trailing dots and no consecutive dots;
+ * the same string instance when it contains no dot at all
+ */
+ private String normalizeDots(String host) {
+ if (host.indexOf('.') < 0) {
+ return host;
+ }
+ final int length = host.length();
+ int first = 0;
+ while (first < length && host.charAt(first) == '.') {
+ first++;
+ }
+ int last = length;
+ while (last > first && host.charAt(last - 1) == '.') {
+ last--;
+ }
+ // Only call substring() when something was actually trimmed.
+ String trimmed = (first > 0 || last < length) ? host.substring(first, last) : host;
+ // The contains() pre-check skips the regex for hosts without "..".
+ return trimmed.contains("..") ? MULTIDOT.matcher(trimmed).replaceAll(".") : trimmed;
+ }
+
private static final Pattern SINGLE_FORWARDSLASH_PATTERN = Pattern
.compile("/");
@@ -159,7 +192,7 @@ public String attemptIPFormats(String host) { // throws URIException {
}
ip[i] = octet;
}
- return String.format("%d.%d.%d.%d", ip[0], ip[1], ip[2], ip[3]);
+ return String.format(Locale.ROOT, "%d.%d.%d.%d", ip[0], ip[1], ip[2], ip[3]);
} else {
Matcher m2 = DECIMAL_IP.matcher(host);
if (m2.matches()) {
@@ -190,7 +223,7 @@ public String attemptIPFormats(String host) { // throws URIException {
}
ip[i] = octet;
}
- return String.format("%d.%d.%d.%d", ip[0], ip[1], ip[2],
+ return String.format(Locale.ROOT, "%d.%d.%d.%d", ip[0], ip[1], ip[2],
ip[3]);
}
@@ -203,15 +236,16 @@ public String minimalEscape(String input) {
return escapeOnce(unescapeRepeatedly(input));
}
- protected static Charset _UTF8 = null;
+ protected static Charset _UTF8 = StandardCharsets.UTF_8;
protected static Charset UTF8() {
- if (_UTF8 == null) {
- _UTF8 = Charset.forName("UTF-8");
- }
return _UTF8;
}
+ /**
+ * @param input String to be percent-encoded. Assumed to be fully unescaped.
+ * @return percent-encoded string
+ */
public String escapeOnce(String input) {
if (input == null) {
return null;
@@ -243,8 +277,21 @@ public String escapeOnce(String input) {
*/
sb = new StringBuilder(input.substring(0, i));
}
+ if (b == '%' && i < utf8bytes.length - 2) {
+ // Any hex escapes left at this point represent non-UTF-8 encoded characters
+ // Unescape them, so they don't get double escaped
+ int hex1 = getHex(utf8bytes[i + 1]);
+ if (hex1 >= 0) {
+ int hex2 = getHex(utf8bytes[i + 2]);
+ if (hex2 >= 0) {
+ i = i+2;
+ b = hex1 * 16 + hex2;
+ }
+ }
+
+ }
sb.append("%");
- String hex = Integer.toHexString(b).toUpperCase();
+ String hex = Integer.toHexString(b).toUpperCase(Locale.ROOT);
if (hex.length() == 1) {
sb.append('0');
}
@@ -337,7 +384,7 @@ public String decode(String input) {
* Decodes bytes in bbuf as utf-8 and appends decoded characters to sb. If
* decoding of any portion fails, appends the un-decodable %xx%xx sequence
* extracted from inputStr instead of decoded characters. See "bad unicode"
- * tests in GoogleCanonicalizerTest#testDecode(). Variables only make sense
+ * tests in BasicURLCanonicalizerTest#testDecode(). Variables only make sense
* within context of {@link #decode(String)}.
*
* @param sb
diff --git a/src/main/java/org/archive/url/DefaultIACanonicalizerRules.java b/src/main/java/org/archive/url/DefaultIACanonicalizerRules.java
deleted file mode 100644
index 3d4d8581..00000000
--- a/src/main/java/org/archive/url/DefaultIACanonicalizerRules.java
+++ /dev/null
@@ -1,7 +0,0 @@
-package org.archive.url;
-
-/**
- * @deprecated use AggressiveIACanonicalizerRules
- */
-public class DefaultIACanonicalizerRules extends AggressiveIACanonicalizerRules {
-}
diff --git a/src/main/java/org/archive/url/DefaultIAURLCanonicalizer.java b/src/main/java/org/archive/url/DefaultIAURLCanonicalizer.java
deleted file mode 100644
index 3d1f985d..00000000
--- a/src/main/java/org/archive/url/DefaultIAURLCanonicalizer.java
+++ /dev/null
@@ -1,7 +0,0 @@
-package org.archive.url;
-
-/**
- * @deprecated use AggressiveIAURLCanonicalizer
- */
-public class DefaultIAURLCanonicalizer extends AggressiveIAURLCanonicalizer {
-}
diff --git a/src/main/java/org/archive/url/GoogleURLCanonicalizer.java b/src/main/java/org/archive/url/GoogleURLCanonicalizer.java
deleted file mode 100644
index 388db8aa..00000000
--- a/src/main/java/org/archive/url/GoogleURLCanonicalizer.java
+++ /dev/null
@@ -1,7 +0,0 @@
-package org.archive.url;
-
-/**
- * @deprecated use {@link BasicURLCanonicalizer}
- */
-public class GoogleURLCanonicalizer extends BasicURLCanonicalizer {
-}
diff --git a/src/main/java/org/archive/url/HandyURL.java b/src/main/java/org/archive/url/HandyURL.java
index 91539b3f..0c2c81f7 100644
--- a/src/main/java/org/archive/url/HandyURL.java
+++ b/src/main/java/org/archive/url/HandyURL.java
@@ -2,6 +2,7 @@
import java.net.MalformedURLException;
import java.net.URL;
+import java.util.Locale;
public class HandyURL {
public final static int DEFAULT_PORT = -1;
@@ -277,7 +278,7 @@ public void setOpaque(String opaque) {
}
public String toDebugString() {
- return String.format("Scheme(%s) UserName(%s) UserPass(%s) Host(%s) port(%d) Path(%s) Query(%s) Frag(%s)",
+ return String.format(Locale.ROOT, "Scheme(%s) UserName(%s) UserPass(%s) Host(%s) port(%d) Path(%s) Query(%s) Frag(%s)",
scheme, authUser, authPass, host, port, path, query, hash);
}
diff --git a/src/main/java/org/archive/url/IAURLCanonicalizer.java b/src/main/java/org/archive/url/IAURLCanonicalizer.java
index 0cf7c8a4..e964cd00 100644
--- a/src/main/java/org/archive/url/IAURLCanonicalizer.java
+++ b/src/main/java/org/archive/url/IAURLCanonicalizer.java
@@ -2,6 +2,7 @@
import java.util.Arrays;
import java.util.Comparator;
+import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -20,11 +21,11 @@ public void canonicalize(HandyURL url) {
}
if (rules.isSet(SCHEME_SETTINGS, SCHEME_LOWERCASE)) {
if (url.getScheme() != null) {
- url.setScheme(url.getScheme().toLowerCase());
+ url.setScheme(url.getScheme().toLowerCase(Locale.ROOT));
}
}
if(rules.isSet(HOST_SETTINGS, HOST_LOWERCASE)) {
- url.setHost(url.getHost().toLowerCase());
+ url.setHost(url.getHost().toLowerCase(Locale.ROOT));
}
if(rules.isSet(HOST_SETTINGS, HOST_MASSAGE)) {
url.setHost(massageHost(url.getHost()));
@@ -46,7 +47,7 @@ public void canonicalize(HandyURL url) {
url.setPath(null);
} else {
if(rules.isSet(PATH_SETTINGS, PATH_LOWERCASE)) {
- path = path.toLowerCase();
+ path = path.toLowerCase(Locale.ROOT);
}
if(rules.isSet(PATH_SETTINGS, PATH_STRIP_SESSION_ID)) {
path = URLRegexTransformer.stripPathSessionID(path);
@@ -71,7 +72,7 @@ public void canonicalize(HandyURL url) {
}
// lower-case:
if(rules.isSet(QUERY_SETTINGS, QUERY_LOWERCASE)) {
- query = query.toLowerCase();
+ query = query.toLowerCase(Locale.ROOT);
}
// re-order?
if(rules.isSet(QUERY_SETTINGS, QUERY_ALPHA_REORDER)) {
@@ -155,7 +156,7 @@ public static String massageHost(String host) {
return host;
}
public static int getDefaultPort(String scheme) {
- String lcScheme = scheme.toLowerCase();
+ String lcScheme = scheme.toLowerCase(Locale.ROOT);
if(lcScheme.equals("http")) {
return 80;
} else if(lcScheme.equals("https")) {
diff --git a/src/main/java/org/archive/url/LaxURI.java b/src/main/java/org/archive/url/LaxURI.java
index e1cea9b7..9b7485c7 100644
--- a/src/main/java/org/archive/url/LaxURI.java
+++ b/src/main/java/org/archive/url/LaxURI.java
@@ -18,12 +18,12 @@
*/
package org.archive.url;
+import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.BitSet;
-
-import org.apache.commons.httpclient.URI;
-import org.apache.commons.httpclient.URIException;
-import org.apache.commons.httpclient.util.EncodingUtil;
+import java.util.Locale;
/**
* URI subclass which allows partial/inconsistent encoding, matching
@@ -121,13 +121,13 @@ protected static String decode(String component, String charset)
"Component array of chars may not be null");
}
byte[] rawdata = null;
- // try {
- rawdata = LaxURLCodec.decodeUrlLoose(EncodingUtil
- .getAsciiBytes(component));
- // } catch (DecoderException e) {
- // throw new URIException(e.getMessage());
- // }
- return EncodingUtil.getString(rawdata, charset);
+ rawdata = LaxURLCodec.decodeUrlLoose(component.getBytes(StandardCharsets.US_ASCII));
+ try {
+ Charset cs = Charset.forName(charset);
+ return new String(rawdata, cs);
+ } catch (IllegalArgumentException e) {
+ // Charset.forName throws IllegalCharsetNameException for a malformed
+ // name, UnsupportedCharsetException for an unknown one, and
+ // IllegalArgumentException for null; all share this supertype, so
+ // every bad charset falls back to ASCII instead of propagating.
+ return new String(rawdata, StandardCharsets.US_ASCII);
+ }
}
// overidden to lax() the acceptable-char BitSet passed in
@@ -183,7 +183,7 @@ protected BitSet lax(BitSet generous) {
* two instances to one where possible, slimming
* instances.
*
- * @see org.apache.commons.httpclient.URI#parseAuthority(java.lang.String, boolean)
+ * @see URI#parseAuthority(java.lang.String, boolean)
*/
protected void parseAuthority(String original, boolean escaped)
throws URIException {
@@ -204,7 +204,7 @@ protected void parseAuthority(String original, boolean escaped)
* long-lived instance from a static field, saving 12-14 bytes
* per instance.
*
- * @see org.apache.commons.httpclient.URI#setURI()
+ * @see URI#setURI()
*/
protected void setURI() {
if (_scheme != null) {
@@ -243,8 +243,8 @@ protected void setURI() {
* $3 = //jakarta.apache.org
* authority = $4 = jakarta.apache.org
* path = $5 = /ietf/uri/
- * $6 =
@@ -324,7 +324,7 @@ protected void parseUriReference(String original, boolean escaped)
*
*/
if (at > 0 && at < length && tmp.charAt(at) == ':') {
- char[] target = tmp.substring(0, at).toLowerCase().toCharArray();
+ char[] target = tmp.substring(0, at).toLowerCase(Locale.ROOT).toCharArray();
if (validate(target, scheme)) {
_scheme = target;
from = ++at;
diff --git a/src/main/java/org/archive/url/LaxURLCodec.java b/src/main/java/org/archive/url/LaxURLCodec.java
index e27d9de0..b68a0c19 100644
--- a/src/main/java/org/archive/url/LaxURLCodec.java
+++ b/src/main/java/org/archive/url/LaxURLCodec.java
@@ -20,17 +20,16 @@
import java.io.ByteArrayOutputStream;
import java.io.UnsupportedEncodingException;
+import java.nio.charset.StandardCharsets;
import java.util.BitSet;
import org.apache.commons.codec.net.URLCodec;
-import com.google.common.base.Charsets;
-
/**
* @author gojomo
*/
public class LaxURLCodec extends URLCodec {
- public static LaxURLCodec DEFAULT = new LaxURLCodec("UTF-8");
+ public static LaxURLCodec DEFAULT = new LaxURLCodec(StandardCharsets.UTF_8.name());
// passthrough constructor
public LaxURLCodec(String encoding) {
@@ -155,6 +154,6 @@ public String encode(BitSet safe, String pString, String cs)
if (pString == null) {
return null;
}
- return new String(encodeUrl(safe,pString.getBytes(cs)), Charsets.US_ASCII);
+ return new String(encodeUrl(safe,pString.getBytes(cs)), StandardCharsets.US_ASCII);
}
}
diff --git a/src/main/java/org/archive/url/NonMassagingIAURLCanonicalizer.java b/src/main/java/org/archive/url/NonMassagingIAURLCanonicalizer.java
index cd579eb0..830b7b92 100644
--- a/src/main/java/org/archive/url/NonMassagingIAURLCanonicalizer.java
+++ b/src/main/java/org/archive/url/NonMassagingIAURLCanonicalizer.java
@@ -1,10 +1,10 @@
package org.archive.url;
public class NonMassagingIAURLCanonicalizer implements URLCanonicalizer {
- private static final GoogleURLCanonicalizer google =
- new GoogleURLCanonicalizer();
+ private static final BasicURLCanonicalizer basic =
+ new BasicURLCanonicalizer();
private static CanonicalizeRules nonMassagingRules =
- new DefaultIACanonicalizerRules();
+ new AggressiveIACanonicalizerRules();
static {
nonMassagingRules.setRule(CanonicalizeRules.HOST_SETTINGS,
CanonicalizeRules.HOST_LOWERCASE);
@@ -14,7 +14,7 @@ public class NonMassagingIAURLCanonicalizer implements URLCanonicalizer {
public void canonicalize(HandyURL url) {
// just google's stuff, followed by the IA default stuff:
- google.canonicalize(url);
+ basic.canonicalize(url);
ia.canonicalize(url);
}
}
diff --git a/src/main/java/org/archive/url/SURT.java b/src/main/java/org/archive/url/SURT.java
index 2c8e1b02..9598f458 100644
--- a/src/main/java/org/archive/url/SURT.java
+++ b/src/main/java/org/archive/url/SURT.java
@@ -2,11 +2,10 @@
import java.io.BufferedReader;
import java.io.InputStreamReader;
-import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.logging.Logger;
-import org.apache.commons.httpclient.URIException;
import org.archive.util.iterator.AbstractPeekableIterator;
public class SURT {
@@ -34,7 +33,7 @@ public static String toSURT(String input) {
}
public static void main(String[] args) {
String line;
- InputStreamReader isr = new InputStreamReader(System.in,Charset.forName("UTF-8"));
+ InputStreamReader isr = new InputStreamReader(System.in, StandardCharsets.UTF_8);
BufferedReader br = new BufferedReader(isr);
Iterator
+ * A URI is always in an "escaped" form, since escaping or unescaping a
+ * completed URI might change its semantics.
+ *
+ * Implementers should be careful not to escape or unescape the same string
+ * more than once, since unescaping an already unescaped string might lead to
+ * misinterpreting a percent data character as another escaped character,
+ * or vice versa in the case of escaping an already escaped string.
+ *
+ * In order to avoid these problems, data types used as follows:
+ *
+ *
+ * So, a URI is a sequence of characters as an array of a char type, which
+ * is not always represented as a sequence of octets as an array of byte.
+ *
+ *
+ * URI Syntactic Components
+ *
+ *
+ * The following examples illustrate URI that are in common use.
+ *
+ * The expressions for a URI
+ *
+ *
+ * @author Sung-Gu
+ * @author Mike Bowler
+ * @version $Revision: 564973 $ $Date: 2002/03/14 15:14:01
+ */
+class URI implements Cloneable, Comparable, Serializable {
+
+
+ // ----------------------------------------------------------- Constructors
+
+ /**
+ * Creates an empty instance for internal use; the constructor body
+ * performs no initialization of its own.
+ */
+ protected URI() {
+ }
+
+ /**
+ * Construct a URI from a string with the given charset. The input string can
+ * be either in escaped or unescaped form.
+ *
+ * @param s URI character sequence
+ * @param escaped true if URI character sequence is in escaped form.
+ * false otherwise.
+ * @param charset the charset string to do escape encoding, if required
+ *
+ * @throws URIException If the URI cannot be created.
+ * @throws NullPointerException if input string is
+ * An URI can be placed within double-quotes or angle brackets like
+ * "http://test.com/" and <http://test.com/>
+ *
+ * @param original the string to be represented to URI character sequence
+ * It is one of absoluteURI and relativeURI.
+ * @throws URIException If the URI cannot be created.
+ * @see #getDefaultProtocolCharset
+ *
+ * @deprecated Use #URI(String, boolean)
+ */
+ public URI(String original) throws URIException {
+ // All work is delegated to parseUriReference. The literal false is
+ // presumably the "escaped" flag (input treated as unescaped) - TODO
+ // confirm against parseUriReference's signature.
+ parseUriReference(original, false);
+ }
+
+
+ /**
+ * Construct a general URI from the given components.
+ *
+ * It's for absolute URI = <scheme>:<scheme-specific-part>#
+ * <fragment>.
+ *
+ * @param scheme the scheme string
+ * @param schemeSpecificPart scheme_specific_part
+ * @param fragment the fragment string
+ * @throws URIException If the URI cannot be created.
+ * @see #getDefaultProtocolCharset
+ */
+ public URI(String scheme, String schemeSpecificPart, String fragment)
+ throws URIException {
+
+ // validate and contruct the URI character sequence
+ if (scheme == null) {
+ throw new URIException(URIException.PARSING, "scheme required");
+ }
+ char[] s = scheme.toLowerCase(Locale.ROOT).toCharArray();
+ if (validate(s, URI.scheme)) {
+ _scheme = s; // is_absoluteURI
+ } else {
+ throw new URIException(URIException.PARSING, "incorrect scheme");
+ }
+ _opaque = encode(schemeSpecificPart, allowed_opaque_part,
+ getProtocolCharset());
+ // Set flag
+ _is_opaque_part = true;
+ _fragment = fragment == null ? null : fragment.toCharArray();
+ setURI();
+ }
+
+
+ /**
+ * Construct a general URI from the given components.
+ *
+ * It's for absolute URI = <scheme>:<path>?<query>#<
+ * fragment> and relative URI = <path>?<query>#<fragment
+ * >.
+ *
+ * @param scheme the scheme string
+ * @param authority the authority string
+ * @param path the path string
+ * @param query the query string
+ * @param fragment the fragment string
+ * @throws URIException If the new URI cannot be created.
+ * @see #getDefaultProtocolCharset
+ */
+ public URI(String scheme, String authority, String path, String query,
+ String fragment) throws URIException {
+
+ // validate and contruct the URI character sequence
+ StringBuffer buff = new StringBuffer();
+ if (scheme != null) {
+ buff.append(scheme);
+ buff.append(':');
+ }
+ if (authority != null) {
+ buff.append("//");
+ buff.append(authority);
+ }
+ if (path != null) { // accept empty path
+ if ((scheme != null || authority != null)
+ && !path.startsWith("/")) {
+ throw new URIException(URIException.PARSING,
+ "abs_path requested");
+ }
+ buff.append(path);
+ }
+ if (query != null) {
+ buff.append('?');
+ buff.append(query);
+ }
+ if (fragment != null) {
+ buff.append('#');
+ buff.append(fragment);
+ }
+ parseUriReference(buff.toString(), false);
+ }
+
+
+ /**
+ * Construct a general URI from the given components.
+ *
+ * Delegates to the seven-argument constructor, passing null for the path,
+ * query and fragment.
+ *
+ * @param scheme the scheme string
+ * @param userinfo the userinfo string
+ * @param host the host string
+ * @param port the port number
+ * @throws URIException If the new URI cannot be created.
+ * @see #getDefaultProtocolCharset
+ */
+ public URI(String scheme, String userinfo, String host, int port)
+ throws URIException {
+
+ this(scheme, userinfo, host, port, null, null, null);
+ }
+
+
+ /**
+ * Construct a general URI from the given components.
+ *
+ * Delegates to the seven-argument constructor, passing null for the query
+ * and fragment.
+ *
+ * @param scheme the scheme string
+ * @param userinfo the userinfo string
+ * @param host the host string
+ * @param port the port number
+ * @param path the path string
+ * @throws URIException If the new URI cannot be created.
+ * @see #getDefaultProtocolCharset
+ */
+ public URI(String scheme, String userinfo, String host, int port,
+ String path) throws URIException {
+
+ this(scheme, userinfo, host, port, path, null, null);
+ }
+
+
+ /**
+ * Construct a general URI from the given components.
+ *
+ * Delegates to the seven-argument constructor, passing null for the
+ * fragment.
+ *
+ * @param scheme the scheme string
+ * @param userinfo the userinfo string
+ * @param host the host string
+ * @param port the port number
+ * @param path the path string
+ * @param query the query string
+ * @throws URIException If the new URI cannot be created.
+ * @see #getDefaultProtocolCharset
+ */
+ public URI(String scheme, String userinfo, String host, int port,
+ String path, String query) throws URIException {
+
+ this(scheme, userinfo, host, port, path, query, null);
+ }
+
+
+ /**
+ * Construct a general URI from the given components.
+ *
+ * The authority handed to the five-argument constructor is assembled as
+ * {@code [userinfo@]host[:port]}. A null host produces a null authority,
+ * in which case userinfo and port are ignored. A port of -1 means
+ * "no port" and is left out of the authority.
+ *
+ * @param scheme the scheme string
+ * @param userinfo the userinfo string
+ * @param host the host string
+ * @param port the port number, or -1 to omit it
+ * @param path the path string
+ * @param query the query string
+ * @param fragment the fragment string
+ * @throws URIException If the new URI cannot be created.
+ * @see #getDefaultProtocolCharset
+ */
+ public URI(String scheme, String userinfo, String host, int port,
+ String path, String query, String fragment) throws URIException {
+
+ this(scheme, (host == null) ? null
+ : ((userinfo != null) ? userinfo + '@' : "") + host
+ + ((port != -1) ? ":" + port : ""), path, query, fragment);
+ }
+
+
+ /**
+ * Construct a general URI from the given components.
+ *
+ * Delegates to the five-argument constructor: the host is passed in the
+ * authority position and the query is null.
+ *
+ * @param scheme the scheme string
+ * @param host the host string
+ * @param path the path string
+ * @param fragment the fragment string
+ * @throws URIException If the new URI cannot be created.
+ * @see #getDefaultProtocolCharset
+ */
+ public URI(String scheme, String host, String path, String fragment)
+ throws URIException {
+
+ this(scheme, host, path, null, fragment);
+ }
+
+
+ /**
+ * Construct a general URI with the given relative URI string.
+ *
+ * The relative string is first turned into a URI with the single-string
+ * constructor, and the result is then combined with the base URI.
+ *
+ * @param base the base URI
+ * @param relative the relative URI string
+ * @throws URIException If the new URI cannot be created.
+ *
+ * @deprecated Use #URI(URI, String, boolean)
+ */
+ public URI(URI base, String relative) throws URIException {
+ this(base, new URI(relative));
+ }
+
+
+ /**
+ * Construct a general URI with the given relative URI string.
+ *
+ * The relative string is first turned into a URI, honouring the
+ * <code>escaped</code> flag, and the result is then combined with the
+ * base URI.
+ *
+ * @param base the base URI
+ * @param relative the relative URI string
+ * @param escaped true if URI character sequence is in escaped form.
+ * false otherwise.
+ *
+ * @throws URIException If the new URI cannot be created.
+ *
+ * @since 3.0
+ */
+ public URI(URI base, String relative, boolean escaped) throws URIException {
+ this(base, new URI(relative, escaped));
+ }
+
+
+ /**
+ * Construct a general URI with the given relative URI.
+ *
+ * Resolving Relative References to Absolute Form.
+ *
+ * Examples of Resolving Relative URI References
+ *
+ * Within an object with a well-defined base URI of
+ *
+ * the relative URI would be resolved as follows:
+ *
+ * Normal Examples
+ *
+ *
+ *
+ * Some URI schemes do not allow a hierarchical syntax matching the
+ *
+ */
+ protected static final BitSet digit = new BitSet(256);
+ // Static initializer for digit: the contiguous range '0'..'9'
+ static {
+ digit.set('0', '9' + 1); // upper bound is exclusive
+ }
+
+
+ /**
+ * BitSet for alpha: the ASCII letters a-z and A-Z.
+ */
+ protected static final BitSet alpha = new BitSet(256);
+ // Static initializer for alpha
+ static {
+ // upper bounds are exclusive, hence the + 1
+ alpha.set('a', 'z' + 1);
+ alpha.set('A', 'Z' + 1);
+ }
+
+
+ /**
+ * BitSet for alphanum (join of alpha & digit).
+ *
+ */
+ protected static final BitSet alphanum = new BitSet(256);
+ // Static initializer for alphanum
+ static {
+ alphanum.or(alpha);
+ alphanum.or(digit);
+ }
+
+
+ /**
+ * BitSet for hex: the digits plus a-f and A-F.
+ */
+ protected static final BitSet hex = new BitSet(256);
+ // Static initializer for hex
+ static {
+ hex.or(digit);
+ // upper bounds are exclusive, hence the + 1
+ hex.set('a', 'f' + 1);
+ hex.set('A', 'F' + 1);
+ }
+
+
+ /**
+ * BitSet for escaped.
+ *
+ */
+ protected static final BitSet escaped = new BitSet(256);
+ // Static initializer for escaped
+ static {
+ escaped.or(percent);
+ escaped.or(hex);
+ }
+
+
+ /**
+ * BitSet for mark.
+ *
+ */
+ protected static final BitSet mark = new BitSet(256);
+ // Static initializer for mark
+ static {
+ mark.set('-');
+ mark.set('_');
+ mark.set('.');
+ mark.set('!');
+ mark.set('~');
+ mark.set('*');
+ mark.set('\'');
+ mark.set('(');
+ mark.set(')');
+ }
+
+
+ /**
+ * Data characters that are allowed in a URI but do not have a reserved
+ * purpose are called unreserved.
+ *
+ */
+ protected static final BitSet unreserved = new BitSet(256);
+ // Static initializer for unreserved
+ static {
+ unreserved.or(alphanum);
+ unreserved.or(mark);
+ }
+
+
+ /**
+ * BitSet for reserved.
+ *
+ */
+ protected static final BitSet reserved = new BitSet(256);
+ // Static initializer for reserved
+ static {
+ reserved.set(';');
+ reserved.set('/');
+ reserved.set('?');
+ reserved.set(':');
+ reserved.set('@');
+ reserved.set('&');
+ reserved.set('=');
+ reserved.set('+');
+ reserved.set('$');
+ reserved.set(',');
+ }
+
+
+ /**
+ * BitSet for uric.
+ *
+ */
+ protected static final BitSet uric = new BitSet(256);
+ // Static initializer for uric
+ static {
+ uric.or(reserved);
+ uric.or(unreserved);
+ uric.or(escaped);
+ }
+
+
+ /**
+ * BitSet for fragment (alias for uric).
+ *
+ */
+ protected static final BitSet fragment = uric;
+
+
+ /**
+ * BitSet for query (alias for uric).
+ *
+ */
+ protected static final BitSet query = uric;
+
+
+ /**
+ * BitSet for pchar.
+ *
+ */
+ protected static final BitSet pchar = new BitSet(256);
+ // Static initializer for pchar
+ static {
+ pchar.or(unreserved);
+ pchar.or(escaped);
+ pchar.set(':');
+ pchar.set('@');
+ pchar.set('&');
+ pchar.set('=');
+ pchar.set('+');
+ pchar.set('$');
+ pchar.set(',');
+ }
+
+
+ /**
+ * BitSet for param (alias for pchar).
+ *
+ */
+ protected static final BitSet param = pchar;
+
+
+ /**
+ * BitSet for segment.
+ *
+ */
+ protected static final BitSet segment = new BitSet(256);
+ // Static initializer for segment
+ static {
+ segment.or(pchar);
+ segment.set(';');
+ segment.or(param);
+ }
+
+
+ /**
+ * BitSet for path segments.
+ *
+ */
+ protected static final BitSet path_segments = new BitSet(256);
+ // Static initializer for path_segments
+ static {
+ path_segments.set('/');
+ path_segments.or(segment);
+ }
+
+
+ /**
+ * URI absolute path.
+ *
+ */
+ protected static final BitSet abs_path = new BitSet(256);
+ // Static initializer for abs_path
+ static {
+ abs_path.set('/');
+ abs_path.or(path_segments);
+ }
+
+
+ /**
+ * URI bitset for encoding typical non-slash characters.
+ *
+ * Follows the RFC 2396 production
+ * uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" | "&" |
+ * "=" | "+" | "$" | ","
+ */
+ protected static final BitSet uric_no_slash = new BitSet(256);
+ // Static initializer for uric_no_slash
+ static {
+ uric_no_slash.or(unreserved);
+ uric_no_slash.or(escaped);
+ uric_no_slash.set(';');
+ uric_no_slash.set('?');
+ // this was a second set(';'), which left ':' out of the set
+ uric_no_slash.set(':');
+ uric_no_slash.set('@');
+ uric_no_slash.set('&');
+ uric_no_slash.set('=');
+ uric_no_slash.set('+');
+ uric_no_slash.set('$');
+ uric_no_slash.set(',');
+ }
+
+
+ /**
+ * URI bitset that combines uric_no_slash and uric.
+ *
+ */
+ protected static final BitSet opaque_part = new BitSet(256);
+ // Static initializer for opaque_part
+ static {
+ // it's generous. because first character must not include a slash
+ opaque_part.or(uric_no_slash);
+ opaque_part.or(uric);
+ }
+
+
+ /**
+ * URI bitset that combines absolute path and opaque part.
+ *
+ */
+ protected static final BitSet path = new BitSet(256);
+ // Static initializer for path
+ static {
+ path.or(abs_path);
+ path.or(opaque_part);
+ }
+
+
+ /**
+ * Port, a logical alias for digit.
+ */
+ protected static final BitSet port = digit;
+
+
+ /**
+ * Bitset that combines digit and dot for IPv4address.
+ *
+ */
+ protected static final BitSet IPv4address = new BitSet(256);
+ // Static initializer for IPv4address
+ static {
+ IPv4address.or(digit);
+ IPv4address.set('.');
+ }
+
+
+ /**
+ * RFC 2373.
+ *
+ */
+ protected static final BitSet IPv6address = new BitSet(256);
+ // Static initializer for IPv6address reference
+ static {
+ IPv6address.or(hex); // hexpart
+ IPv6address.set(':');
+ IPv6address.or(IPv4address);
+ }
+
+
+ /**
+ * RFC 2732, 2373.
+ *
+ */
+ protected static final BitSet IPv6reference = new BitSet(256);
+ // Static initializer for IPv6reference
+ static {
+ IPv6reference.set('[');
+ IPv6reference.or(IPv6address);
+ IPv6reference.set(']');
+ }
+
+
+ /**
+ * BitSet for toplabel.
+ *
+ */
+ protected static final BitSet toplabel = new BitSet(256);
+ // Static initializer for toplabel
+ static {
+ toplabel.or(alphanum);
+ toplabel.set('-');
+ }
+
+
+ /**
+ * BitSet for domainlabel.
+ *
+ */
+ protected static final BitSet domainlabel = toplabel;
+
+
+ /**
+ * BitSet for hostname.
+ *
+ */
+ protected static final BitSet hostname = new BitSet(256);
+ // Static initializer for hostname
+ static {
+ hostname.or(toplabel);
+ // hostname.or(domainlabel);
+ hostname.set('.');
+ }
+
+
+ /**
+ * BitSet for host.
+ *
+ */
+ protected static final BitSet host = new BitSet(256);
+ // Static initializer for host
+ static {
+ host.or(hostname);
+ // host.or(IPv4address);
+ host.or(IPv6reference); // IPv4address
+ }
+
+
+ /**
+ * BitSet for hostport.
+ *
+ */
+ protected static final BitSet hostport = new BitSet(256);
+ // Static initializer for hostport
+ static {
+ hostport.or(host);
+ hostport.set(':');
+ hostport.or(port);
+ }
+
+
+ /**
+ * Bitset for userinfo.
+ *
+ */
+ protected static final BitSet userinfo = new BitSet(256);
+ // Static initializer for userinfo
+ static {
+ userinfo.or(unreserved);
+ userinfo.or(escaped);
+ userinfo.set(';');
+ userinfo.set(':');
+ userinfo.set('&');
+ userinfo.set('=');
+ userinfo.set('+');
+ userinfo.set('$');
+ userinfo.set(',');
+ }
+
+
+ /**
+ * BitSet for within the userinfo component like user and password.
+ */
+ public static final BitSet within_userinfo = new BitSet(256);
+ // Static initializer for within_userinfo
+ static {
+ within_userinfo.or(userinfo);
+ within_userinfo.clear(';'); // reserved within authority
+ within_userinfo.clear(':');
+ within_userinfo.clear('@');
+ within_userinfo.clear('?');
+ within_userinfo.clear('/');
+ }
+
+
+ /**
+ * Bitset for server.
+ *
+ */
+ protected static final BitSet server = new BitSet(256);
+ // Static initializer for server
+ static {
+ server.or(userinfo);
+ server.set('@');
+ server.or(hostport);
+ }
+
+
+ /**
+ * BitSet for reg_name.
+ *
+ */
+ protected static final BitSet reg_name = new BitSet(256);
+ // Static initializer for reg_name
+ static {
+ reg_name.or(unreserved);
+ reg_name.or(escaped);
+ reg_name.set('$');
+ reg_name.set(',');
+ reg_name.set(';');
+ reg_name.set(':');
+ reg_name.set('@');
+ reg_name.set('&');
+ reg_name.set('=');
+ reg_name.set('+');
+ }
+
+
+ /**
+ * BitSet for authority.
+ *
+ */
+ protected static final BitSet authority = new BitSet(256);
+ // Static initializer for authority
+ static {
+ authority.or(server);
+ authority.or(reg_name);
+ }
+
+
+ /**
+ * BitSet for scheme.
+ *
+ */
+ protected static final BitSet scheme = new BitSet(256);
+ // Static initializer for scheme
+ static {
+ scheme.or(alpha);
+ scheme.or(digit);
+ scheme.set('+');
+ scheme.set('-');
+ scheme.set('.');
+ }
+
+
+ /**
+ * BitSet for rel_segment.
+ *
+ */
+ protected static final BitSet rel_segment = new BitSet(256);
+ // Static initializer for rel_segment
+ static {
+ rel_segment.or(unreserved);
+ rel_segment.or(escaped);
+ rel_segment.set(';');
+ rel_segment.set('@');
+ rel_segment.set('&');
+ rel_segment.set('=');
+ rel_segment.set('+');
+ rel_segment.set('$');
+ rel_segment.set(',');
+ }
+
+
+ /**
+ * BitSet for rel_path.
+ *
+ */
+ protected static final BitSet rel_path = new BitSet(256);
+ // Static initializer for rel_path
+ static {
+ rel_path.or(rel_segment);
+ rel_path.or(abs_path);
+ }
+
+
+ /**
+ * BitSet for net_path.
+ *
+ */
+ protected static final BitSet net_path = new BitSet(256);
+ // Static initializer for net_path
+ static {
+ net_path.set('/');
+ net_path.or(authority);
+ net_path.or(abs_path);
+ }
+
+
+ /**
+ * BitSet for hier_part.
+ *
+ */
+ protected static final BitSet hier_part = new BitSet(256);
+ // Static initializer for hier_part
+ static {
+ hier_part.or(net_path);
+ hier_part.or(abs_path);
+ // hier_part.set('?'); already included
+ hier_part.or(query);
+ }
+
+
+ /**
+ * BitSet for relativeURI.
+ *
+ */
+ protected static final BitSet relativeURI = new BitSet(256);
+ // Static initializer for relativeURI
+ static {
+ relativeURI.or(net_path);
+ relativeURI.or(abs_path);
+ relativeURI.or(rel_path);
+ // relativeURI.set('?'); already included
+ relativeURI.or(query);
+ }
+
+
+ /**
+ * BitSet for absoluteURI.
+ *
+ */
+ protected static final BitSet absoluteURI = new BitSet(256);
+ // Static initializer for absoluteURI
+ static {
+ absoluteURI.or(scheme);
+ absoluteURI.set(':');
+ absoluteURI.or(hier_part);
+ absoluteURI.or(opaque_part);
+ }
+
+
+ /**
+ * BitSet for URI-reference.
+ *
+ */
+ protected static final BitSet URI_reference = new BitSet(256);
+ // Static initializer for URI_reference
+ static {
+ URI_reference.or(absoluteURI);
+ URI_reference.or(relativeURI);
+ URI_reference.set('#');
+ URI_reference.or(fragment);
+ }
+
+ // ---------------------------- Characters disallowed within the URI syntax
+ // Excluded US-ASCII Characters are like control, space, delims and unwise
+
+ /**
+ * BitSet for control: the US-ASCII control characters 0x00-0x1F and 0x7F.
+ */
+ public static final BitSet control = new BitSet(256);
+ // Static initializer for control
+ static {
+ control.set(0, 0x1F + 1); // upper bound is exclusive
+ control.set(0x7F);
+ }
+
+ /**
+ * BitSet for space.
+ */
+ public static final BitSet space = new BitSet(256);
+ // Static initializer for space
+ static {
+ space.set(0x20);
+ }
+
+
+ /**
+ * BitSet for delims.
+ */
+ public static final BitSet delims = new BitSet(256);
+ // Static initializer for delims
+ static {
+ delims.set('<');
+ delims.set('>');
+ delims.set('#');
+ delims.set('%');
+ delims.set('"');
+ }
+
+
+ /**
+ * BitSet for unwise.
+ */
+ public static final BitSet unwise = new BitSet(256);
+ // Static initializer for unwise
+ static {
+ unwise.set('{');
+ unwise.set('}');
+ unwise.set('|');
+ unwise.set('\\');
+ unwise.set('^');
+ unwise.set('[');
+ unwise.set(']');
+ unwise.set('`');
+ }
+
+
+ /**
+ * Disallowed rel_path before escaping.
+ */
+ public static final BitSet disallowed_rel_path = new BitSet(256);
+ // Static initializer for disallowed_rel_path
+ static {
+ disallowed_rel_path.or(uric);
+ disallowed_rel_path.andNot(rel_path);
+ }
+
+
+ /**
+ * Disallowed opaque_part before escaping.
+ */
+ public static final BitSet disallowed_opaque_part = new BitSet(256);
+ // Static initializer for disallowed_opaque_part
+ static {
+ disallowed_opaque_part.or(uric);
+ disallowed_opaque_part.andNot(opaque_part);
+ }
+
+ // ----------------------- Characters allowed within and for each component
+
+ /**
+ * Those characters that are allowed for the authority component.
+ */
+ public static final BitSet allowed_authority = new BitSet(256);
+ // Static initializer for allowed_authority
+ static {
+ allowed_authority.or(authority);
+ allowed_authority.clear('%');
+ }
+
+
+ /**
+ * Those characters that are allowed for the opaque_part.
+ */
+ public static final BitSet allowed_opaque_part = new BitSet(256);
+ // Static initializer for allowed_opaque_part
+ static {
+ allowed_opaque_part.or(opaque_part);
+ allowed_opaque_part.clear('%');
+ }
+
+
+ /**
+ * Those characters that are allowed for the reg_name.
+ */
+ public static final BitSet allowed_reg_name = new BitSet(256);
+ // Static initializer for allowed_reg_name
+ static {
+ allowed_reg_name.or(reg_name);
+ // allowed_reg_name.andNot(percent);
+ allowed_reg_name.clear('%');
+ }
+
+
+ /**
+ * Those characters that are allowed for the userinfo component.
+ */
+ public static final BitSet allowed_userinfo = new BitSet(256);
+ // Static initializer for allowed_userinfo
+ static {
+ allowed_userinfo.or(userinfo);
+ // allowed_userinfo.andNot(percent);
+ allowed_userinfo.clear('%');
+ }
+
+
+ /**
+ * Those characters that are allowed for within the userinfo component.
+ */
+ public static final BitSet allowed_within_userinfo = new BitSet(256);
+ // Static initializer for allowed_within_userinfo
+ static {
+ allowed_within_userinfo.or(within_userinfo);
+ allowed_within_userinfo.clear('%');
+ }
+
+
+ /**
+ * Those characters that are allowed for the IPv6reference component.
+ * The characters '[', ']' in IPv6reference should be excluded.
+ */
+ public static final BitSet allowed_IPv6reference = new BitSet(256);
+ // Static initializer for allowed_IPv6reference
+ static {
+ allowed_IPv6reference.or(IPv6reference);
+ // allowed_IPv6reference.andNot(unwise);
+ allowed_IPv6reference.clear('[');
+ allowed_IPv6reference.clear(']');
+ }
+
+
+ /**
+ * Those characters that are allowed for the host component.
+ * The characters '[', ']' in IPv6reference should be excluded.
+ */
+ public static final BitSet allowed_host = new BitSet(256);
+ // Static initializer for allowed_host
+ static {
+ allowed_host.or(hostname);
+ allowed_host.or(allowed_IPv6reference);
+ }
+
+
+ /**
+ * Those characters that are allowed within the authority component.
+ */
+ public static final BitSet allowed_within_authority = new BitSet(256);
+ // Static initializer for allowed_within_authority
+ static {
+ allowed_within_authority.or(server);
+ allowed_within_authority.or(reg_name);
+ allowed_within_authority.clear(';');
+ allowed_within_authority.clear(':');
+ allowed_within_authority.clear('@');
+ allowed_within_authority.clear('?');
+ allowed_within_authority.clear('/');
+ }
+
+
+ /**
+ * Those characters that are allowed for the abs_path.
+ */
+ public static final BitSet allowed_abs_path = new BitSet(256);
+ // Static initializer for allowed_abs_path
+ static {
+ allowed_abs_path.or(abs_path);
+ // allowed_abs_path.set('/'); // already included
+ allowed_abs_path.andNot(percent);
+ allowed_abs_path.clear('+');
+ }
+
+
+ /**
+ * Those characters that are allowed for the rel_path.
+ */
+ public static final BitSet allowed_rel_path = new BitSet(256);
+ // Static initializer for allowed_rel_path
+ static {
+ allowed_rel_path.or(rel_path);
+ allowed_rel_path.clear('%');
+ allowed_rel_path.clear('+');
+ }
+
+
+ /**
+ * Those characters that are allowed within the path.
+ */
+ public static final BitSet allowed_within_path = new BitSet(256);
+ // Static initializer for allowed_within_path
+ static {
+ allowed_within_path.or(abs_path);
+ allowed_within_path.clear('/');
+ allowed_within_path.clear(';');
+ allowed_within_path.clear('=');
+ allowed_within_path.clear('?');
+ }
+
+
+ /**
+ * Those characters that are allowed for the query component.
+ */
+ public static final BitSet allowed_query = new BitSet(256);
+ // Static initializer for allowed_query
+ static {
+ allowed_query.or(uric);
+ allowed_query.clear('%');
+ }
+
+
+ /**
+ * Those characters that are allowed within the query component.
+ */
+ public static final BitSet allowed_within_query = new BitSet(256);
+ // Static initializer for allowed_within_query
+ static {
+ allowed_within_query.or(allowed_query);
+ allowed_within_query.andNot(reserved); // excluded 'reserved'
+ }
+
+
+ /**
+ * Those characters that are allowed for the fragment component.
+ */
+ public static final BitSet allowed_fragment = new BitSet(256);
+ // Static initializer for allowed_fragment
+ static {
+ allowed_fragment.or(uric);
+ allowed_fragment.clear('%');
+ }
+
+ // ------------------------------------------- Flags for this URI-reference
+
+ // TODO: Figure out what all these variables are for and provide javadoc
+
+ // URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
+ // absoluteURI = scheme ":" ( hier_part | opaque_part )
+ protected boolean _is_hier_part;
+ protected boolean _is_opaque_part;
+ // relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ]
+ // hier_part = ( net_path | abs_path ) [ "?" query ]
+ protected boolean _is_net_path;
+ protected boolean _is_abs_path;
+ protected boolean _is_rel_path;
+ // net_path = "//" authority [ abs_path ]
+ // authority = server | reg_name
+ protected boolean _is_reg_name;
+ protected boolean _is_server; // = _has_server
+ // server = [ [ userinfo "@" ] hostport ]
+ // host = hostname | IPv4address | IPv6reference
+ protected boolean _is_hostname;
+ protected boolean _is_IPv4address;
+ protected boolean _is_IPv6reference;
+
+ // ------------------------------------------ Character and escape encoding
+
+ /**
+ * Encodes URI string.
+ *
+ * This is a two-step mapping: first from original characters to octets,
+ * and then from octets to URI characters.
+ *
+ *
+ * An escaped octet is encoded as a character triplet, consisting of the
+ * percent character "%" followed by the two hexadecimal digits
+ * representing the octet code. For example, "%20" is the escaped
+ * encoding for the US-ASCII space character.
+ *
+ * Conversion from the local filesystem character set to UTF-8 will
+ * normally involve a two step process. First convert the local character
+ * set to the UCS; then convert the UCS to UTF-8.
+ * The first step in the process can be performed by maintaining a mapping
+ * table that includes the local character set code and the corresponding
+ * UCS code.
+ * The next step is to convert the UCS character code to the UTF-8 encoding.
+ *
+ * Mapping between vendor codepages can be done in a very similar manner
+ * as described above.
+ *
+ * The only time escape encodings can allowedly be made is when a URI is
+ * being created from its component parts. The escape and validate methods
+ * are internally performed within this method.
+ *
+ * @param original the original character sequence
+ * @param allowed those characters that are allowed within a component
+ * @param charset the protocol charset
+ * @return URI character sequence
+ * @throws URIException null component or unsupported character encoding
+ */
+
+ protected static char[] encode(String original, BitSet allowed,
+ String charset) throws URIException {
+ if (original == null) {
+ throw new IllegalArgumentException("Original string may not be null");
+ }
+ if (allowed == null) {
+ throw new IllegalArgumentException("Allowed bitset may not be null");
+ }
+ byte[] rawdata = URLCodec.encodeUrl(allowed, getBytes(original, charset));
+ return new String(rawdata, StandardCharsets.US_ASCII).toCharArray();
+ }
+
+ // Converts the string to octets in the named charset; if that charset name
+ // is not supported, the UTF_8 charset is used instead.
+ private static byte[] getBytes(String original, String charset) {
+ byte[] octets;
+ try {
+ octets = original.getBytes(charset);
+ } catch (UnsupportedEncodingException unsupported) {
+ octets = original.getBytes(UTF_8);
+ }
+ return octets;
+ }
+
+ /**
+ * Decodes URI encoded string.
+ *
+ * This is a two mapping, one from URI characters to octets, and
+ * subsequently a second from octets to original characters:
+ *
+ *
+ * A URI must be separated into its components before the escaped
+ * characters within those components can be allowedly decoded.
+ *
+ * Notice that there is a chance that URI characters that are non UTF-8
+ * may be parsed as valid UTF-8. A recent non-scientific analysis found
+ * that EUC encoded Japanese words had a 2.7% false reading; SJIS had a
+ * 0.0005% false reading; other encoding such as ASCII or KOI-8 have a 0%
+ * false reading.
+ *
+ * The percent "%" character always has the reserved purpose of being
+ * the escape indicator, it must be escaped as "%25" in order to be used
+ * as data within a URI.
+ *
+ * The unescape method is internally performed within this method.
+ *
+ * @param component the URI character sequence
+ * @param charset the protocol charset
+ * @return original character sequence
+ * @throws URIException incomplete trailing escape pattern or unsupported
+ * character encoding
+ */
+ protected static String decode(char[] component, String charset)
+ throws URIException {
+ if (component == null) {
+ throw new IllegalArgumentException("Component array of chars may not be null");
+ }
+ return decode(new String(component), charset);
+ }
+
+ /**
+ * Decodes URI encoded string.
+ *
+ * This is a two mapping, one from URI characters to octets, and
+ * subsequently a second from octets to original characters:
+ *
+ *
+ * A URI must be separated into its components before the escaped
+ * characters within those components can be allowedly decoded.
+ *
+ * Notice that there is a chance that URI characters that are non UTF-8
+ * may be parsed as valid UTF-8. A recent non-scientific analysis found
+ * that EUC encoded Japanese words had a 2.7% false reading; SJIS had a
+ * 0.0005% false reading; other encoding such as ASCII or KOI-8 have a 0%
+ * false reading.
+ *
+ * The percent "%" character always has the reserved purpose of being
+ * the escape indicator, it must be escaped as "%25" in order to be used
+ * as data within a URI.
+ *
+ * The unescape method is internally performed within this method.
+ *
+ * @param component the URI character sequence
+ * @param charset the protocol charset
+ * @return original character sequence
+ * @throws URIException incomplete trailing escape pattern or unsupported
+ * character encoding
+ *
+ * @since 3.0
+ */
+ protected static String decode(String component, String charset)
+ throws URIException {
+ if (component == null) {
+ throw new IllegalArgumentException("Component array of chars may not be null");
+ }
+ byte[] rawdata = null;
+ try {
+ rawdata = URLCodec.decodeUrl(component.getBytes(StandardCharsets.US_ASCII));
+ } catch (DecoderException e) {
+ throw new URIException(e.getMessage());
+ }
+ try {
+ Charset cs = Charset.forName(charset);
+ return new String(rawdata, cs);
+ } catch (IllegalCharsetNameException e) {
+ return new String(rawdata, StandardCharsets.US_ASCII);
+ }
+ }
+
+ /**
+ * Checks an unescaped component string for characters that must not
+ * appear in it.
+ *
+ * @param component the component string to inspect
+ * @param disallowed those characters disallowed within the component
+ * @return true if no character of the component is in the disallowed
+ * set; false if the component is null or contains such a character
+ */
+ protected boolean prevalidate(String component, BitSet disallowed) {
+ // an undefined component never passes
+ if (component == null) {
+ return false;
+ }
+ for (char ch : component.toCharArray()) {
+ if (disallowed.get(ch)) {
+ return false; // found a forbidden character
+ }
+ }
+ return true;
+ }
+
+
+ /**
+ * Checks that every character of the component belongs to the given set.
+ * Validation is meant to run on a component that has already been
+ * escape-encoded, or that contains no escaped characters.
+ *
+ * @param component the characters sequence within the component
+ * @param generous those characters that are allowed within a component
+ * @return if true, it's the correct URI character sequence
+ */
+ protected boolean validate(char[] component, BitSet generous) {
+ // validate each component by generous characters
+ // start at offset 0; the end offset -1 stands for "up to the last character"
+ return validate(component, 0, -1, generous);
+ }
+
+
+ /**
+ * Validate the URI characters within a specific component.
+ * The component must be performed after escape encoding. Or it doesn't
+ * include escaped characters.
+ *
+ * It's not that much strict, generous. The strict validation might be
+ * performed before being called this method.
+ *
+ * @param component the characters sequence within the component
+ * @param soffset the starting offset of the given component
+ * @param eoffset the ending offset of the given component
+ * if -1, it means the length of the component
+ * @param generous those characters that are allowed within a component
+ * @return if true, it's the correct URI character sequence
+ */
+ protected boolean validate(char[] component, int soffset, int eoffset,
+ BitSet generous) {
+ // validate each component by generous characters
+ if (eoffset == -1) {
+ eoffset = component.length - 1;
+ }
+ for (int i = soffset; i <= eoffset; i++) {
+ if (!generous.get(component[i])) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+
+ /**
+ * In order to avoid any possibility of conflict with non-ASCII characters,
+ * Parse a URI reference as a
+ * The following line is the regular expression for breaking-down a URI
+ * reference into its components.
+ *
+ * For example, matching the above expression to
+ * http://jakarta.apache.org/ietf/uri/#Related
+ * results in the following subexpression matches:
+ *
+ *
+ * @param original the original character sequence
+ * @param escaped
+ */
+ int at = indexFirstOf(tmp, isStartedFromPath ? "/?#" : ":/?#", from);
+ if (at == -1) {
+ at = 0;
+ }
+
+ /*
+ * Parse the scheme.
+ *
+ */
+ if (at > 0 && at < length && tmp.charAt(at) == ':') {
+ char[] target = tmp.substring(0, at).toLowerCase(Locale.ROOT).toCharArray();
+ if (validate(target, scheme)) {
+ _scheme = target;
+ } else {
+ throw new URIException("incorrect scheme");
+ }
+ from = ++at;
+ }
+
+ /*
+ * Parse the authority component.
+ *
+ */
+ // Reset flags
+ _is_net_path = _is_abs_path = _is_rel_path = _is_hier_part = false;
+ if (0 <= at && at < length && tmp.charAt(at) == '/') {
+ // Set flag
+ _is_hier_part = true;
+ if (at + 2 < length && tmp.charAt(at + 1) == '/'
+ && !isStartedFromPath) {
+ // the temporary index to start the search from
+ int next = indexFirstOf(tmp, "/?#", at + 2);
+ if (next == -1) {
+ next = (tmp.substring(at + 2).length() == 0) ? at + 2
+ : tmp.length();
+ }
+ parseAuthority(tmp.substring(at + 2, next), escaped);
+ from = at = next;
+ // Set flag
+ _is_net_path = true;
+ }
+ if (from == at) {
+ // Set flag
+ _is_abs_path = true;
+ }
+ }
+
+ /*
+ * Parse the path component.
+ *
+ */
+ if (from < length) {
+ // rel_path = rel_segment [ abs_path ]
+ int next = indexFirstOf(tmp, "?#", from);
+ if (next == -1) {
+ next = tmp.length();
+ }
+ if (!_is_abs_path) {
+ if (!escaped
+ && prevalidate(tmp.substring(from, next), disallowed_rel_path)
+ || escaped
+ && validate(tmp.substring(from, next).toCharArray(), rel_path)) {
+ // Set flag
+ _is_rel_path = true;
+ } else if (!escaped
+ && prevalidate(tmp.substring(from, next), disallowed_opaque_part)
+ || escaped
+ && validate(tmp.substring(from, next).toCharArray(), opaque_part)) {
+ // Set flag
+ _is_opaque_part = true;
+ } else {
+ // the path component may be empty
+ _path = null;
+ }
+ }
+ String s = tmp.substring(from, next);
+ if (escaped) {
+ setRawPath(s.toCharArray());
+ } else {
+ setPath(s);
+ }
+ at = next;
+ }
+
+ // set the charset to do escape encoding
+ String charset = getProtocolCharset();
+
+ /*
+ * Parse the query component.
+ *
+ */
+ if (0 <= at && at + 1 < length && tmp.charAt(at) == '?') {
+ int next = tmp.indexOf('#', at + 1);
+ if (next == -1) {
+ next = tmp.length();
+ }
+ if (escaped) {
+ _query = tmp.substring(at + 1, next).toCharArray();
+ if (!validate(_query, uric)) {
+ throw new URIException("Invalid query");
+ }
+ } else {
+ _query = encode(tmp.substring(at + 1, next), allowed_query, charset);
+ }
+ at = next;
+ }
+
+ /*
+ * Parse the fragment component.
+ *
+ */
+ if (0 <= at && at + 1 <= length && tmp.charAt(at) == '#') {
+ if (at + 1 == length) { // empty fragment
+ _fragment = "".toCharArray();
+ } else {
+ _fragment = (escaped) ? tmp.substring(at + 1).toCharArray()
+ : encode(tmp.substring(at + 1), allowed_fragment, charset);
+ }
+ }
+
+ // set this URI.
+ setURI();
+ }
+
+
+ /**
+ * Find the lowest index at which any one of the given delimiter
+ * characters occurs in the string, searching from the beginning.
+ *
+ * @param s the string to be searched
+ * @param delims the delimiter characters to look for
+ * @return the lowest index of any delimiter, or -1 if none occurs
+ */
+ protected int indexFirstOf(String s, String delims) {
+ // the negative offset is clamped to 0 by the three-argument overload
+ return indexFirstOf(s, delims, -1);
+ }
+
+
+ /**
+  * Find the lowest index, at or after the given offset, at which any one
+  * of the given delimiter characters occurs in the string.
+  *
+  * @param s the string to be searched
+  * @param delims the delimiter characters to look for
+  * @param offset the index to start searching from; negative values are
+  * treated as 0
+  * @return the lowest index of any delimiter, or -1 if none occurs, if
+  * either argument is null or empty, or if the offset lies past the end
+  */
+ protected int indexFirstOf(String s, String delims, int offset) {
+     if (s == null || s.isEmpty() || delims == null || delims.isEmpty()) {
+         return -1;
+     }
+     if (offset > s.length()) {
+         return -1;
+     }
+     // walk the string once and stop at the first delimiter character
+     for (int i = Math.max(offset, 0); i < s.length(); i++) {
+         if (delims.indexOf(s.charAt(i)) >= 0) {
+             return i;
+         }
+     }
+     return -1;
+ }
+
+
+ /**
+ * Find the first index at which the given delimiter occurs in the
+ * character array, searching from the beginning.
+ *
+ * @param s the character array to be searched
+ * @param delim the delimiter to look for
+ * @return the first index of the delimiter, or -1 if it does not occur
+ */
+ protected int indexFirstOf(char[] s, char delim) {
+ // search the whole array, starting at index 0
+ return indexFirstOf(s, delim, 0);
+ }
+
+
+ /**
+  * Find the first index, at or after the given offset, at which the given
+  * delimiter occurs in the character array.
+  *
+  * @param s the character array to be searched
+  * @param delim the delimiter to look for
+  * @param offset the index to start searching from; negative values are
+  * treated as 0
+  * @return the first index of the delimiter, or -1 if it does not occur,
+  * if the array is null or empty, or if the offset lies past the end
+  */
+ protected int indexFirstOf(char[] s, char delim, int offset) {
+     if (s == null || s.length == 0 || offset > s.length) {
+         return -1;
+     }
+     int pos = (offset < 0) ? 0 : offset;
+     while (pos < s.length) {
+         if (s[pos] == delim) {
+             return pos;
+         }
+         pos++;
+     }
+     return -1;
+ }
+
+
+ /**
+ * Parse the authority component.
+ *
+ * @param original the original character sequence of authority component
+ * @param escaped
+ * The character set used to store files SHALL remain a local decision and
+ * MAY depend on the capability of local operating systems. Prior to the
+ * exchange of URIs they SHOULD be converted into a ISO/IEC 10646 format
+ * and UTF-8 encoded. This approach, while allowing international exchange
+ * of URIs, will still allow backward compatibility with older systems
+ * because the code set positions for ASCII characters are identical to the
+ * one byte sequence in UTF-8.
+ *
+ * An individual URI scheme may require a single charset, define a default
+ * charset, or provide a way to indicate the charset used.
+ *
+ *
+ * Always all the time, the setter method is always succeeded and throws
+ *
+ * An individual URI scheme may require a single charset, define a default
+ * charset, or provide a way to indicate the charset used.
+ *
+ * To work globally either requires support of a number of character sets
+ * and to be able to convert between them, or the use of a single preferred
+ * character set.
+ * For support of global compatibility it is STRONGLY RECOMMENDED that
+ * clients and servers use UTF-8 encoding when exchanging URIs.
+ *
+ * @return the default charset string
+ */
+ public static String getDefaultProtocolCharset() {
+ // static accessor: hands back the value of defaultProtocolCharset as is
+ return defaultProtocolCharset;
+ }
+
+
+ /**
+  * Get the protocol charset used by this URI instance.
+  * If no charset was set on this instance, the default protocol charset
+  * is returned instead.
+  *
+  * @return the protocol charset string
+  * @see #getDefaultProtocolCharset
+  */
+ public String getProtocolCharset() {
+     if (protocolCharset == null) {
+         return defaultProtocolCharset;
+     }
+     return protocolCharset;
+ }
+
+
+ /**
+ * Set the default charset of the document.
+ *
+ * Notice that it will be possible to contain mixed characters (e.g.
+ * ftp://host/KoreanNamespace/ChineseResource). To handle the Bi-directional
+ * display of these character sets, the protocol charset could be simply
+ * used again. Because it's not yet implemented that the insertion of BIDI
+ * control characters at different points during composition is extracted.
+ *
+ *
+ * Always all the time, the setter method is always succeeded and throws
+ *
+ *
+ * @param escapedAuthority the raw escaped authority
+ * @throws URIException If {@link
+ * #parseAuthority(String,boolean)} fails
+ * @throws NullPointerException null authority
+ */
+ public void setRawAuthority(char[] escapedAuthority)
+ throws URIException, NullPointerException {
+
+ // the array is parsed as already-escaped text (escaped == true); a null
+ // array makes new String(...) throw the declared NullPointerException
+ parseAuthority(new String(escapedAuthority), true);
+ // rebuild the cached URI after the authority has been replaced
+ setURI();
+ }
+
+
+ /**
+ * Set the authority from an escaped string. It can be one type of server,
+ * hostport, hostname, IPv4address, IPv6reference and reg_name.
+ * Note that there is no setAuthority method, because of escape encoding.
+ *
+ * @param escapedAuthority the escaped authority string
+ * @throws URIException If {@link
+ * #parseAuthority(String,boolean)} fails
+ */
+ public void setEscapedAuthority(String escapedAuthority)
+ throws URIException {
+
+ // the string is parsed as already-escaped text (escaped == true)
+ parseAuthority(escapedAuthority, true);
+ // rebuild the cached URI after the authority has been replaced
+ setURI();
+ }
+
+
+ /**
+ * Get the raw-escaped authority.
+ *
+ * @return the raw-escaped authority; this is the internal array itself,
+ * not a copy, and may be null
+ */
+ public char[] getRawAuthority() {
+ return _authority;
+ }
+
+
+ /**
+  * Get the escaped authority as a string.
+  *
+  * @return the escaped authority, or null if no authority is set
+  */
+ public String getEscapedAuthority() {
+     if (_authority == null) {
+         return null;
+     }
+     return new String(_authority);
+ }
+
+
+ /**
+  * Get the authority, decoded with the protocol charset.
+  *
+  * @return the decoded authority, or null if no authority is set
+  * @throws URIException If {@link #decode} fails
+  */
+ public String getAuthority() throws URIException {
+     if (_authority == null) {
+         return null;
+     }
+     return decode(_authority, getProtocolCharset());
+ }
+
+ // ----------------------------------------------------------- The userinfo
+
+ /**
+ * Get the raw-escaped userinfo.
+ *
+ * @return the raw-escaped userinfo; this is the internal array itself,
+ * not a copy, and may be null
+ * @see #getAuthority
+ */
+ public char[] getRawUserinfo() {
+ return _userinfo;
+ }
+
+
+ /**
+  * Get the escaped userinfo as a string.
+  *
+  * @return the escaped userinfo, or null if no userinfo is set
+  * @see #getAuthority
+  */
+ public String getEscapedUserinfo() {
+     if (_userinfo == null) {
+         return null;
+     }
+     return new String(_userinfo);
+ }
+
+
+ /**
+  * Get the userinfo, decoded with the protocol charset.
+  *
+  * @return the decoded userinfo, or null if no userinfo is set
+  * @throws URIException If {@link #decode} fails
+  * @see #getAuthority
+  */
+ public String getUserinfo() throws URIException {
+     if (_userinfo == null) {
+         return null;
+     }
+     return decode(_userinfo, getProtocolCharset());
+ }
+
+ // --------------------------------------------------------------- The host
+
+ /**
+ * Get the raw host.
+ *
+ * @return the host as stored; this is the internal array itself, not a
+ * copy, and may be null
+ * @see #getAuthority
+ */
+ public char[] getRawHost() {
+ return _host;
+ }
+
+
+ /**
+  * Get the host, decoded with the protocol charset.
+  *
+  * @return the decoded host, or null if no host is set
+  * @throws URIException If {@link #decode} fails
+  * @see #getAuthority
+  */
+ public String getHost() throws URIException {
+     if (_host == null) {
+         return null;
+     }
+     return decode(_host, getProtocolCharset());
+ }
+
+ // --------------------------------------------------------------- The port
+
+ /**
+ * Get the port. In order to get the specific default port, the specific
+ * protocol-supported class extended from the URI class should be used.
+ * It has the server-based naming authority.
+ *
+ * @return the port;
+ * if -1, it has the default port for the scheme or the server-based
+ * naming authority is not supported in the specific URI.
+ */
+ public int getPort() {
+ return _port;
+ }
+
+ // --------------------------------------------------------------- The path
+
+ /**
+  * Set the raw-escaped path.
+  *
+  * Any fragment identifier ('#' and everything after it) is removed first.
+  * A path that is null or empty, or becomes empty once the fragment is
+  * removed, is stored as is for both the path and the opaque part. Any
+  * other path is checked according to the kind of path this instance
+  * currently holds (net/absolute, relative or opaque).
+  *
+  * @param escapedPath the path character sequence
+  * @throws URIException encoding error or not proper for initial instance
+  * @see #encode
+  */
+ public void setRawPath(char[] escapedPath) throws URIException {
+     // Remove the fragment identifier before the emptiness check: input
+     // such as "#frag" is empty afterwards and must not reach the
+     // escapedPath[0] accesses below. A null input is returned unchanged.
+     escapedPath = removeFragmentIdentifier(escapedPath);
+     if (escapedPath == null || escapedPath.length == 0) {
+         _path = _opaque = escapedPath;
+         setURI();
+         return;
+     }
+     if (_is_net_path || _is_abs_path) {
+         if (escapedPath[0] != '/') {
+             throw new URIException(URIException.PARSING,
+                     "not absolute path");
+         }
+         if (!validate(escapedPath, abs_path)) {
+             throw new URIException(URIException.ESCAPING,
+                     "escaped absolute path not valid");
+         }
+         _path = escapedPath;
+     } else if (_is_rel_path) {
+         int at = indexFirstOf(escapedPath, '/');
+         if (at == 0) {
+             throw new URIException(URIException.PARSING, "incorrect path");
+         }
+         // NOTE(review): with '&&' a path containing '/' is rejected only
+         // when BOTH the first segment and the remainder are invalid;
+         // kept unchanged because callers may rely on this leniency.
+         if (at > 0 && !validate(escapedPath, 0, at - 1, rel_segment)
+                 && !validate(escapedPath, at, -1, abs_path)
+                 || at < 0 && !validate(escapedPath, 0, -1, rel_segment)) {
+
+             throw new URIException(URIException.ESCAPING,
+                     "escaped relative path not valid");
+         }
+         _path = escapedPath;
+     } else if (_is_opaque_part) {
+         // NOTE(review): same '&&' leniency as in the relative branch.
+         if (!uric_no_slash.get(escapedPath[0])
+                 && !validate(escapedPath, 1, -1, uric)) {
+             throw new URIException(URIException.ESCAPING,
+                     "escaped opaque part not valid");
+         }
+         _opaque = escapedPath;
+     } else {
+         throw new URIException(URIException.PARSING, "incorrect path");
+     }
+     setURI();
+ }
+
+
+ /**
+  * Set the escaped path from a string.
+  * A null string clears both the path and the opaque part; anything else
+  * is passed on to {@link #setRawPath(char[])}.
+  *
+  * @param escapedPath the escaped path string
+  * @throws URIException encoding error or not proper for initial instance
+  * @see #encode
+  */
+ public void setEscapedPath(String escapedPath) throws URIException {
+     if (escapedPath != null) {
+         setRawPath(escapedPath.toCharArray());
+     } else {
+         _path = _opaque = null;
+         setURI();
+     }
+ }
+
+
+ /**
+  * Set the path from unescaped text, escape-encoding it with the protocol
+  * charset.
+  *
+  * A null or empty path is stored as is for both the path and the opaque
+  * part. Any other path is encoded according to the kind of path this
+  * instance currently holds (net/absolute, relative or opaque).
+  *
+  * @param path the path string
+  * @throws URIException set incorrectly or fragment only
+  * @see #encode
+  */
+ public void setPath(String path) throws URIException {
+
+     if (path == null || path.length() == 0) {
+         _path = _opaque = (path == null) ? null : path.toCharArray();
+         setURI();
+         return;
+     }
+     // set the charset to do escape encoding
+     String charset = getProtocolCharset();
+
+     if (_is_net_path || _is_abs_path) {
+         _path = encode(path, allowed_abs_path, charset);
+     } else if (_is_rel_path) {
+         StringBuffer buff = new StringBuffer(path.length());
+         int at = path.indexOf('/');
+         if (at == 0) { // never 0
+             throw new URIException(URIException.PARSING,
+                     "incorrect relative path");
+         }
+         if (at > 0) {
+             // first segment and the rest use different allowed sets
+             buff.append(encode(path.substring(0, at), allowed_rel_path,
+                     charset));
+             buff.append(encode(path.substring(at), allowed_abs_path,
+                     charset));
+         } else {
+             buff.append(encode(path, allowed_rel_path, charset));
+         }
+         _path = buff.toString().toCharArray();
+     } else if (_is_opaque_part) {
+         StringBuffer buf = new StringBuffer();
+         // Append rather than insert at fixed offsets: the first character
+         // may encode to several characters (for example "%C3%BC"), and
+         // inserting the tail at offset 1 would land inside that escape.
+         buf.append(encode(path.substring(0, 1), uric_no_slash, charset));
+         buf.append(encode(path.substring(1), uric, charset));
+         _opaque = buf.toString().toCharArray();
+     } else {
+         throw new URIException(URIException.PARSING, "incorrect path");
+     }
+     setURI();
+ }
+
+
+ /**
+  * Resolve a relative path against a base path and normalize the result.
+  *
+  * A null or empty relative path yields the normalized base path. A
+  * relative path that starts with '/' is normalized on its own. Otherwise
+  * the relative path is appended to the base path up to and including its
+  * last '/', or to "/" when the base path contains no slash.
+  *
+  * @param basePath a character array of the basePath
+  * @param relPath a character array of the relPath
+  * @return the resolved path
+  * @throws URIException no more higher path level to be resolved
+  */
+ protected char[] resolvePath(char[] basePath, char[] relPath)
+         throws URIException {
+
+     if (relPath == null || relPath.length == 0) {
+         return normalize(basePath);
+     }
+     if (relPath[0] == '/') {
+         return normalize(relPath);
+     }
+     String base = (basePath == null) ? "" : new String(basePath);
+     int lastSlash = base.lastIndexOf('/');
+     String prefix = (lastSlash != -1)
+             ? base.substring(0, lastSlash + 1)
+             : "/";
+     StringBuffer merged = new StringBuffer(base.length()
+             + relPath.length);
+     merged.append(prefix);
+     merged.append(relPath);
+     return normalize(merged.toString().toCharArray());
+ }
+
+
+ /**
+  * Get the raw-escaped current hierarchy level in the given path.
+  * A path whose only slash is the leading one yields the root path; a path
+  * with more than one slash is cut just before its last slash; any other
+  * path is returned unchanged.
+  *
+  * @param path the path
+  * @return the current hierarchy level
+  * @throws URIException no hierarchy level (opaque URI) or null path
+  */
+ protected char[] getRawCurrentHierPath(char[] path) throws URIException {
+
+     if (_is_opaque_part) {
+         throw new URIException(URIException.PARSING, "no hierarchy level");
+     }
+     if (path == null) {
+         throw new URIException(URIException.PARSING, "empty path");
+     }
+     String text = new String(path);
+     int lastSlash = text.lastIndexOf('/');
+     if (lastSlash == 0) {
+         return rootPath;
+     }
+     if (lastSlash > 0 && text.indexOf('/') != lastSlash) {
+         return text.substring(0, lastSlash).toCharArray();
+     }
+     // FIXME: it could be a document on the server side
+     return path;
+ }
+
+
+ /**
+  * Get the raw-escaped current hierarchy level of this URI's path.
+  *
+  * @return the raw-escaped current hierarchy level, or null if this URI
+  * has no path
+  * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
+  */
+ public char[] getRawCurrentHierPath() throws URIException {
+     if (_path == null) {
+         return null;
+     }
+     return getRawCurrentHierPath(_path);
+ }
+
+
+ /**
+  * Get the escaped current hierarchy level as a string.
+  *
+  * @return the escaped current hierarchy level, or null if there is none
+  * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
+  */
+ public String getEscapedCurrentHierPath() throws URIException {
+     char[] raw = getRawCurrentHierPath();
+     if (raw == null) {
+         return null;
+     }
+     return new String(raw);
+ }
+
+
+ /**
+  * Get the current hierarchy level, decoded with the protocol charset.
+  *
+  * @return the current hierarchy level, or null if there is none
+  * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
+  * @see #decode
+  */
+ public String getCurrentHierPath() throws URIException {
+     char[] raw = getRawCurrentHierPath();
+     if (raw == null) {
+         return null;
+     }
+     return decode(raw, getProtocolCharset());
+ }
+
+
+ /**
+  * Get the level above this hierarchy level, raw-escaped. It is obtained
+  * by taking the current hierarchy level of the current hierarchy level.
+  *
+  * @return the raw above hierarchy level, or null if there is no path
+  * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
+  */
+ public char[] getRawAboveHierPath() throws URIException {
+     char[] current = getRawCurrentHierPath();
+     if (current == null) {
+         return null;
+     }
+     return getRawCurrentHierPath(current);
+ }
+
+
+ /**
+  * Get the level above this hierarchy level as an escaped string.
+  *
+  * @return the escaped above hierarchy level, or null if there is none
+  * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
+  */
+ public String getEscapedAboveHierPath() throws URIException {
+     char[] above = getRawAboveHierPath();
+     if (above == null) {
+         return null;
+     }
+     return new String(above);
+ }
+
+
+ /**
+  * Get the level above this hierarchy level, decoded with the protocol
+  * charset.
+  *
+  * @return the above hierarchy level, or null if there is none
+  * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
+  * @see #decode
+  */
+ public String getAboveHierPath() throws URIException {
+     char[] above = getRawAboveHierPath();
+     if (above == null) {
+         return null;
+     }
+     return decode(above, getProtocolCharset());
+ }
+
+
+ /**
+  * Get the raw-escaped path. For an opaque URI this is the opaque part,
+  * otherwise the hierarchical path.
+  *
+  * @return the raw-escaped path
+  */
+ public char[] getRawPath() {
+     if (_is_opaque_part) {
+         return _opaque;
+     }
+     return _path;
+ }
+
+
+ /**
+  * Get the escaped path as a string.
+  *
+  * @return the escaped path string, or null if there is no path
+  */
+ public String getEscapedPath() {
+     char[] raw = getRawPath();
+     if (raw == null) {
+         return null;
+     }
+     return new String(raw);
+ }
+
+
+ /**
+  * Get the path, decoded with the protocol charset.
+  *
+  * @return the path string, or null if there is no path
+  * @throws URIException If {@link #decode} fails.
+  * @see #decode
+  */
+ public String getPath() throws URIException {
+     char[] raw = getRawPath();
+     if (raw == null) {
+         return null;
+     }
+     return decode(raw, getProtocolCharset());
+ }
+
+
+ /**
+  * Get the raw-escaped basename of the path: everything after the last
+  * '/' of the hierarchical path, or the whole path if it has no slash.
+  *
+  * @return a new array holding the raw-escaped basename, or null if this
+  * URI has no path
+  */
+ public char[] getRawName() {
+     if (_path == null) {
+         return null;
+     }
+
+     // step back from the end until just after the last slash (or index 0)
+     int start = _path.length;
+     while (start > 0 && _path[start - 1] != '/') {
+         start--;
+     }
+     char[] basename = new char[_path.length - start];
+     System.arraycopy(_path, start, basename, 0, basename.length);
+     return basename;
+ }
+
+
+ /**
+  * Get the escaped basename of the path as a string.
+  *
+  * @return the escaped basename string, or null if there is no path
+  */
+ public String getEscapedName() {
+     char[] raw = getRawName();
+     if (raw == null) {
+         return null;
+     }
+     return new String(raw);
+ }
+
+
+ /**
+  * Get the basename of the path, decoded with the protocol charset.
+  *
+  * @return the basename string, or null if there is no path
+  * @throws URIException incomplete trailing escape pattern or unsupported
+  * character encoding
+  * @see #decode
+  */
+ public String getName() throws URIException {
+     char[] basename = getRawName();
+     // decode the array already in hand instead of calling getRawName()
+     // again, which would build a second identical copy
+     return (basename == null) ? null : decode(basename,
+             getProtocolCharset());
+ }
+
+ // ----------------------------------------------------- The path and query
+
+ /**
+  * Get the raw-escaped path and query, joined by '?' when a query exists.
+  *
+  * @return the raw-escaped path and query, or null if both are unset
+  */
+ public char[] getRawPathQuery() {
+
+     if (_path == null && _query == null) {
+         return null;
+     }
+     StringBuilder joined = new StringBuilder();
+     if (_path != null) {
+         joined.append(_path);
+     }
+     if (_query != null) {
+         joined.append('?').append(_query);
+     }
+     return joined.toString().toCharArray();
+ }
+
+
+ /**
+  * Get the escaped path and query as a string.
+  *
+  * @return the escaped path and query string, or null if both are unset
+  */
+ public String getEscapedPathQuery() {
+     char[] raw = getRawPathQuery();
+     if (raw == null) {
+         return null;
+     }
+     return new String(raw);
+ }
+
+
+ /**
+  * Get the path and query, decoded with the protocol charset.
+  *
+  * @return the path and query string, or null if both are unset
+  * @throws URIException incomplete trailing escape pattern or unsupported
+  * character encoding
+  * @see #decode
+  */
+ public String getPathQuery() throws URIException {
+     char[] raw = getRawPathQuery();
+     if (raw == null) {
+         return null;
+     }
+     return decode(raw, getProtocolCharset());
+ }
+
+ // -------------------------------------------------------------- The query
+
+ /**
+ * Set the raw-escaped query.
+ *
+ * @param escapedQuery the raw-escaped query
+ * @throws URIException escaped query not valid
+ */
+ public void setRawQuery(char[] escapedQuery) throws URIException {
+ if (escapedQuery == null || escapedQuery.length == 0) {
+ _query = escapedQuery;
+ setURI();
+ return;
+ }
+ // remove the fragment identifier
+ escapedQuery = removeFragmentIdentifier(escapedQuery);
+ if (!validate(escapedQuery, query)) {
+ throw new URIException(URIException.ESCAPING,
+ "escaped query not valid");
+ }
+ _query = escapedQuery;
+ setURI();
+ }
+
+
+ /**
+  * Set the escaped query from a string.
+  * A null string clears the query; anything else is passed on to
+  * {@link #setRawQuery(char[])}.
+  *
+  * @param escapedQuery the escaped query string
+  * @throws URIException escaped query not valid
+  */
+ public void setEscapedQuery(String escapedQuery) throws URIException {
+     if (escapedQuery != null) {
+         setRawQuery(escapedQuery.toCharArray());
+     } else {
+         _query = null;
+         setURI();
+     }
+ }
+
+
+ /**
+ * Set the query.
+ *
+ * When a query string is not misunderstood the reserved special characters
+ * ("&", "=", "+", ",", and "$") within a query component, it is
+ * recommended to use in encoding the whole query with this method.
+ *
+ * The additional APIs for the special purpose using by the reserved
+ * special characters used in each protocol are implemented in each protocol
+ * classes inherited from
+ * The optional fragment identifier is not part of a URI, but is often used
+ * in conjunction with a URI.
+ *
+ * The format and interpretation of fragment identifiers is dependent on
+ * the media type [RFC2046] of the retrieval result.
+ *
+ * A fragment identifier is only meaningful when a URI reference is
+ * intended for retrieval and the result of that retrieval is a document
+ * for which the identified fragment is consistently defined.
+ *
+ * @return the raw-escaped fragment
+ */
+ public char[] getRawFragment() {
+ // returns the stored fragment array itself (no copy); may be null
+ return _fragment;
+ }
+
+
+ /**
+  * Get the escaped fragment as a string.
+  *
+  * @return the escaped fragment string, or null if no fragment is set
+  */
+ public String getEscapedFragment() {
+     if (_fragment == null) {
+         return null;
+     }
+     return new String(_fragment);
+ }
+
+
+ /**
+  * Get the fragment, decoded with the protocol charset.
+  *
+  * @return the fragment string, or null if no fragment is set
+  * @throws URIException incomplete trailing escape pattern or unsupported
+  * character encoding
+  * @see #decode
+  */
+ public String getFragment() throws URIException {
+     if (_fragment == null) {
+         return null;
+     }
+     return decode(_fragment, getProtocolCharset());
+ }
+
+ // ------------------------------------------------------------- Utilities
+
+ /**
+  * Remove the fragment identifier from the given component: the first '#'
+  * and everything after it are cut off.
+  *
+  * @param component the component that may include a fragment
+  * @return null for a null component; the same array if it contains no
+  * '#'; otherwise a new array holding the characters before the first '#'
+  */
+ protected char[] removeFragmentIdentifier(char[] component) {
+     if (component == null) {
+         return null;
+     }
+     for (int i = 0; i < component.length; i++) {
+         if (component[i] == '#') {
+             char[] head = new char[i];
+             System.arraycopy(component, 0, head, 0, i);
+             return head;
+         }
+     }
+     return component;
+ }
+
+
+ /**
+ * Normalize the given hier path part.
+ *
+ * Algorithm taken from URI reference parser at
+ * http://www.apache.org/~fielding/uri/rev-2002/issues.html.
+ *
+ * @param path the path to normalize
+ * @return the normalized path
+ * @throws URIException no more higher path level to be normalized
+ */
+ protected char[] normalize(char[] path) throws URIException {
+
+ if (path == null) {
+ return null;
+ }
+
+ String normalized = new String(path);
+
+ // If the buffer begins with "./" or "../", the "." or ".." is removed.
+ if (normalized.startsWith("./")) {
+ normalized = normalized.substring(1);
+ } else if (normalized.startsWith("../")) {
+ normalized = normalized.substring(2);
+ } else if (normalized.startsWith("..")) {
+ normalized = normalized.substring(2);
+ }
+
+ // All occurrences of "/./" in the buffer are replaced with "/"
+ int index = -1;
+ while ((index = normalized.indexOf("/./")) != -1) {
+ normalized = normalized.substring(0, index) + normalized.substring(index + 2);
+ }
+
+ // If the buffer ends with "/.", the "." is removed.
+ if (normalized.endsWith("/.")) {
+ normalized = normalized.substring(0, normalized.length() - 1);
+ }
+
+ int startIndex = 0;
+
+ // All occurrences of "/
+ * To copy the identical
+ * It is clearly unwise to use a URL that contains a password which is
+ * intended to be secret. In particular, the use of a password within
+ * the 'userinfo' component of a URL is strongly disrecommended except
+ * in those rare cases where the 'password' parameter is intended to be
+ * public.
+ *
+ * When you want to get each part of the userinfo, you need to use the
+ * specific methods in the specific URL. It depends on the specific URL.
+ *
+ * @return the URI character sequence
+ */
+ public char[] getRawURI() {
+ // returns the stored URI array itself (no copy); may be null
+ return _uri;
+ }
+
+
+ /**
+  * Get the URI character sequence as an escaped string. This form is
+  * useful when the URI is to be transported by a protocol.
+  *
+  * @return the escaped URI string, or null if no URI is set
+  */
+ public String getEscapedURI() {
+     if (_uri == null) {
+         return null;
+     }
+     return new String(_uri);
+ }
+
+
+ /**
+  * Get the URI character sequence, decoded with the protocol charset.
+  *
+  * @return the original URI string, or null if no URI is set
+  * @throws URIException incomplete trailing escape pattern or unsupported
+  * character encoding
+  * @see #decode
+  */
+ public String getURI() throws URIException {
+     if (_uri == null) {
+         return null;
+     }
+     return decode(_uri, getProtocolCharset());
+ }
+
+
+ /**
+  * Get the URI reference character sequence: the URI followed by '#' and
+  * the fragment when both are present, otherwise whichever one is set.
+  *
+  * @return the URI reference character sequence, or null if neither the
+  * URI nor the fragment is set
+  */
+ public char[] getRawURIReference() {
+     if (_fragment == null) {
+         return _uri;
+     }
+     if (_uri == null) {
+         return _fragment;
+     }
+     // both parts are present: join them with the fragment delimiter
+     StringBuilder reference = new StringBuilder(_uri.length
+             + 1 + _fragment.length);
+     reference.append(_uri).append("#").append(_fragment);
+     return reference.toString().toCharArray();
+ }
+
+
+ /**
+  * Get the escaped URI reference as a string.
+  *
+  * @return the escaped URI reference string, or null if there is none
+  */
+ public String getEscapedURIReference() {
+     char[] raw = getRawURIReference();
+     if (raw == null) {
+         return null;
+     }
+     return new String(raw);
+ }
+
+
+ /**
+  * Get the URI reference, decoded with the protocol charset.
+  *
+  * @return the original URI reference string, or null if there is none
+  * @throws URIException If {@link #decode} fails.
+  */
+ public String getURIReference() throws URIException {
+     char[] raw = getRawURIReference();
+     if (raw == null) {
+         return null;
+     }
+     return decode(raw, getProtocolCharset());
+ }
+
+
+ /**
+ * Get the escaped URI string.
+ *
+ * On the document, the URI-reference form is only used without the userinfo
+ * component like http://jakarta.apache.org/ by the security reason.
+ * But the URI-reference form with the userinfo component could be parsed.
+ *
+ * In other words, this URI and any its subclasses must not expose the
+ * URI-reference expression with the userinfo component like
+ * http://user:password@hostport/restricted_zone.
+ * The distribution of this class is Servlets.com. It was originally
+ * written by Jason Hunter [jhunter at acm.org] and used by with permission.
+ */
+ public static class LocaleToCharsetMap {
+
+     /**
+      * A mapping of locale name to charset. Keys are either a bare language
+      * code ("ja") or a language_COUNTRY name ("zh_TW").
+      */
+     private static final Hashtable<String, String> LOCALE_TO_CHARSET_MAP;
+     static {
+         LOCALE_TO_CHARSET_MAP = new Hashtable<String, String>();
+         LOCALE_TO_CHARSET_MAP.put("ar", "ISO-8859-6");
+         LOCALE_TO_CHARSET_MAP.put("be", "ISO-8859-5");
+         LOCALE_TO_CHARSET_MAP.put("bg", "ISO-8859-5");
+         LOCALE_TO_CHARSET_MAP.put("ca", "ISO-8859-1");
+         LOCALE_TO_CHARSET_MAP.put("cs", "ISO-8859-2");
+         LOCALE_TO_CHARSET_MAP.put("da", "ISO-8859-1");
+         LOCALE_TO_CHARSET_MAP.put("de", "ISO-8859-1");
+         LOCALE_TO_CHARSET_MAP.put("el", "ISO-8859-7");
+         LOCALE_TO_CHARSET_MAP.put("en", "ISO-8859-1");
+         LOCALE_TO_CHARSET_MAP.put("es", "ISO-8859-1");
+         LOCALE_TO_CHARSET_MAP.put("et", "ISO-8859-1");
+         LOCALE_TO_CHARSET_MAP.put("fi", "ISO-8859-1");
+         LOCALE_TO_CHARSET_MAP.put("fr", "ISO-8859-1");
+         LOCALE_TO_CHARSET_MAP.put("hr", "ISO-8859-2");
+         LOCALE_TO_CHARSET_MAP.put("hu", "ISO-8859-2");
+         LOCALE_TO_CHARSET_MAP.put("is", "ISO-8859-1");
+         LOCALE_TO_CHARSET_MAP.put("it", "ISO-8859-1");
+         LOCALE_TO_CHARSET_MAP.put("iw", "ISO-8859-8");
+         LOCALE_TO_CHARSET_MAP.put("ja", "Shift_JIS");
+         LOCALE_TO_CHARSET_MAP.put("ko", "EUC-KR");
+         LOCALE_TO_CHARSET_MAP.put("lt", "ISO-8859-2");
+         LOCALE_TO_CHARSET_MAP.put("lv", "ISO-8859-2");
+         LOCALE_TO_CHARSET_MAP.put("mk", "ISO-8859-5");
+         LOCALE_TO_CHARSET_MAP.put("nl", "ISO-8859-1");
+         LOCALE_TO_CHARSET_MAP.put("no", "ISO-8859-1");
+         LOCALE_TO_CHARSET_MAP.put("pl", "ISO-8859-2");
+         LOCALE_TO_CHARSET_MAP.put("pt", "ISO-8859-1");
+         LOCALE_TO_CHARSET_MAP.put("ro", "ISO-8859-2");
+         LOCALE_TO_CHARSET_MAP.put("ru", "ISO-8859-5");
+         LOCALE_TO_CHARSET_MAP.put("sh", "ISO-8859-5");
+         LOCALE_TO_CHARSET_MAP.put("sk", "ISO-8859-2");
+         LOCALE_TO_CHARSET_MAP.put("sl", "ISO-8859-2");
+         LOCALE_TO_CHARSET_MAP.put("sq", "ISO-8859-2");
+         LOCALE_TO_CHARSET_MAP.put("sr", "ISO-8859-5");
+         LOCALE_TO_CHARSET_MAP.put("sv", "ISO-8859-1");
+         LOCALE_TO_CHARSET_MAP.put("tr", "ISO-8859-9");
+         LOCALE_TO_CHARSET_MAP.put("uk", "ISO-8859-5");
+         LOCALE_TO_CHARSET_MAP.put("zh", "GB2312");
+         LOCALE_TO_CHARSET_MAP.put("zh_TW", "Big5");
+     }
+
+     /**
+      * Get the preferred charset for the given locale. The locale's full
+      * name (which may include the country) is looked up first; if that has
+      * no entry, the bare language code is looked up.
+      *
+      * @param locale the locale, must not be null
+      * @return the preferred charset or null if the locale is not
+      * recognized.
+      */
+     public static String getCharset(Locale locale) {
+         // try for a full name match (may include country)
+         String charset = LOCALE_TO_CHARSET_MAP.get(locale.toString());
+         if (charset != null) {
+             return charset;
+         }
+
+         // if a full name didn't match, try just the language
+         return LOCALE_TO_CHARSET_MAP.get(locale.getLanguage()); // may be null
+     }
+
+ }
+
+}
+
diff --git a/src/main/java/org/archive/url/URIException.java b/src/main/java/org/archive/url/URIException.java
new file mode 100644
index 00000000..49fa2cb5
--- /dev/null
+++ b/src/main/java/org/archive/url/URIException.java
@@ -0,0 +1,180 @@
+/*
+ * $Header: /home/jerenkrantz/tmp/commons/commons-convert/cvs/home/cvs/jakarta-commons//httpclient/src/java/org/apache/commons/httpclient/URIException.java,v 1.12 2004/09/30 18:53:20 olegk Exp $
+ * $Revision: 480424 $
+ * $Date: 2006-11-29 06:56:49 +0100 (Wed, 29 Nov 2006) $
+ *
+ * ====================================================================
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * We used to use {@link java.net.URI} for parsing URIs but ran across
@@ -50,7 +49,7 @@
* @author gojomo
* @author stack
*
- * @see org.apache.commons.httpclient.URI
+ * @see org.archive.url.URI
*/
public class UsableURI extends LaxURI
implements CharSequence, Serializable {
@@ -121,7 +120,6 @@ protected UsableURI() {
* @param uri String representation of an absolute URI.
* @param escaped If escaped.
* @param charset Charset to use.
- * @throws org.apache.commons.httpclient.URIException
*/
protected UsableURI(String uri, boolean escaped, String charset)
throws URIException {
@@ -132,7 +130,6 @@ protected UsableURI(String uri, boolean escaped, String charset)
/**
* @param relative String representation of URI.
* @param base Parent UURI to use derelativizing.
- * @throws org.apache.commons.httpclient.URIException
*/
protected UsableURI(UsableURI base, UsableURI relative) throws URIException {
super(base, relative);
@@ -275,7 +272,7 @@ public String toString() {
/**
* In the case of a puny encoded IDN, this method returns the decoded Unicode version.
*
- * Most of this implementation is copied from {@link org.apache.commons.httpclient.URI#setURI()}.
+ * Most of this implementation is copied from {@link org.archive.url.URI#setURI()}.
*
* @return decoded IDN version of URI
*/
diff --git a/src/main/java/org/archive/url/UsableURIFactory.java b/src/main/java/org/archive/url/UsableURIFactory.java
index 1059bfbd..3038ada5 100644
--- a/src/main/java/org/archive/url/UsableURIFactory.java
+++ b/src/main/java/org/archive/url/UsableURIFactory.java
@@ -20,17 +20,15 @@
import gnu.inet.encoding.IDNA;
import gnu.inet.encoding.IDNAException;
-import it.unimi.dsi.lang.MutableString;
import java.io.UnsupportedEncodingException;
import java.util.BitSet;
+import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-import org.apache.commons.httpclient.URI;
-import org.apache.commons.httpclient.URIException;
import org.archive.util.TextUtils;
/**
@@ -88,7 +86,7 @@ public class UsableURIFactory extends URI {
* The numbers in the second line above are only to assist readability;
* they indicate the reference points for each subexpression (i.e., each
* paired parenthesis). We refer to the value matched for subexpression
- *
* This involves converting it to the largest unit
- * (of B, KiB, MiB, GiB, TiB) for which the amount will be > 1.
+ * (of B, KiB, MiB, GiB, TiB) for which the amount will be &gt; 1.
*
* Additionally, at least 2 significant digits are always displayed.
*
@@ -807,7 +803,6 @@ public static String prettyString(Object obj) {
/**
* Provide a improved String of a Map's entries
*
- * @param Map
* @return prettified (in curly brackets) string of Map contents
*/
public static String prettyString(Map, ?> map) {
@@ -830,7 +825,6 @@ public static String prettyString(Map, ?> map) {
/**
* Provide a slightly-improved String of Object[]
*
- * @param Object[]
* @return prettified (in square brackets) of Object[]
*/
public static String prettyString(Object[] array) {
@@ -859,7 +853,7 @@ private static String loadVersion() {
BufferedReader br = null;
String version;
try {
- br = new BufferedReader(new InputStreamReader(input));
+ br = new BufferedReader(new InputStreamReader(input, UTF_8));
version = br.readLine();
br.readLine();
} catch (IOException e) {
@@ -881,7 +875,7 @@ private static String loadVersion() {
br = null;
String timestamp;
try {
- br = new BufferedReader(new InputStreamReader(input));
+ br = new BufferedReader(new InputStreamReader(input, UTF_8));
timestamp = br.readLine();
} catch (IOException e) {
return version;
@@ -902,13 +896,13 @@ private static String loadVersion() {
TLDS = new HashSet Transparently coalesces chunks of a HTTP stream that uses
+ * Transfer-Encoding chunked. Note that this class NEVER closes the underlying stream, even when close
+ * gets called. Instead, it will read until the "end" of its chunking on close,
+ * which allows for the seamless invocation of subsequent HTTP 1.1 calls, while
+ * not requiring the client to remember to read the entire contents of the
+ * response. Returns all the data in a chunked stream in coalesced form. A chunk
+ * is followed by a CRLF. The method returns -1 as soon as a chunksize of 0
+ * is detected. Trailer headers are read automatically at the end of the stream and
+ * can be obtained with the getResponseFooters() method. Note that this function is intended as a non-public utility.
+ * This is a little weird, but it seemed silly to make a utility
+ * class for this one function, so instead it is just static and
+ * shared that way.
* This involves converting it to the largest unit
- * (of B, KiB, MiB, GiB, TiB) for which the amount will be > 1.
+ * (of B, KiB, MiB, GiB, TiB) for which the amount will be &gt; 1.
*
* Additionally, at least 2 significant digits are always displayed.
*
diff --git a/src/main/java/org/archive/util/DevUtils.java b/src/main/java/org/archive/util/DevUtils.java
index d630a0b1..7ee4b13a 100644
--- a/src/main/java/org/archive/util/DevUtils.java
+++ b/src/main/java/org/archive/util/DevUtils.java
@@ -25,6 +25,7 @@
import java.io.StringWriter;
import java.util.logging.Logger;
+import static java.nio.charset.StandardCharsets.UTF_8;
/**
* Write a message and stack trace to the 'org.archive.util.DevUtils' logger.
@@ -78,15 +79,6 @@ public static String extraInfo() {
return sw.toString();
}
- /**
- * Nothing to see here, move along.
- * @deprecated This method was never used.
- */
- @Deprecated
- public static void betterPrintStack(RuntimeException re) {
- re.printStackTrace(System.err);
- }
-
/**
* Send this JVM process a SIGQUIT; giving a thread dump and possibly
* a heap histogram (if using -XX:+PrintClassHistogram).
@@ -101,7 +93,7 @@ public static void sigquitSelf() {
Process p = Runtime.getRuntime().exec(
new String[] {"perl", "-e", "print getppid(). \"\n\";"});
BufferedReader br =
- new BufferedReader(new InputStreamReader(p.getInputStream()));
+ new BufferedReader(new InputStreamReader(p.getInputStream(), UTF_8));
String ppid = br.readLine();
Runtime.getRuntime().exec(
new String[] {"sh", "-c", "kill -3 "+ppid}).waitFor();
diff --git a/src/main/java/org/archive/util/FileNameSpec.java b/src/main/java/org/archive/util/FileNameSpec.java
index a3312cfc..7ace8b59 100644
--- a/src/main/java/org/archive/util/FileNameSpec.java
+++ b/src/main/java/org/archive/util/FileNameSpec.java
@@ -1,5 +1,6 @@
package org.archive.util;
+import java.util.Locale;
import java.util.concurrent.atomic.AtomicInteger;
public class FileNameSpec {
@@ -15,7 +16,7 @@ public FileNameSpec(String prefix, String suffix) {
public String getNextName() {
StringBuilder sb = new StringBuilder();
sb.append(prefix);
- sb.append(String.format("%06d",aInt.incrementAndGet()));
+ sb.append(String.format(Locale.ROOT, "%06d",aInt.incrementAndGet()));
sb.append(suffix);
return sb.toString();
}
diff --git a/src/main/java/org/archive/util/FileUtils.java b/src/main/java/org/archive/util/FileUtils.java
index 3de276a9..271d0212 100644
--- a/src/main/java/org/archive/util/FileUtils.java
+++ b/src/main/java/org/archive/util/FileUtils.java
@@ -32,6 +32,7 @@
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
+import java.util.Locale;
import java.util.Properties;
import java.util.logging.Level;
import java.util.logging.Logger;
@@ -39,13 +40,13 @@
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.filefilter.IOFileFilter;
-import org.apache.commons.lang.math.LongRange;
+import org.apache.commons.lang3.LongRange;
/** Utility methods for manipulating files and directories.
*
- * @contributor John Erik Halse
- * @contributor gojomo
+ * @author John Erik Halse
+ * @author gojomo
*/
public class FileUtils {
private static final Logger LOGGER =
@@ -219,8 +220,8 @@ protected static void workaroundCopyFile(final File src,
FileFilter prefixFilter = new FileFilter() {
public boolean accept(File pathname)
{
- return pathname.getName().toLowerCase().
- startsWith(prefix.toLowerCase());
+ return pathname.getName().toLowerCase(Locale.ROOT).
+ startsWith(prefix.toLowerCase(Locale.ROOT));
}
};
return dir.listFiles(prefixFilter);
@@ -283,7 +284,7 @@ public static boolean isReadableWithExtensionAndMagic(final File f,
throws IOException {
boolean result = false;
FileUtils.assertReadable(f);
- if(f.getName().toLowerCase().endsWith(uncompressedExtension)) {
+ if(f.getName().toLowerCase(Locale.ROOT).endsWith(uncompressedExtension)) {
FileInputStream fis = new FileInputStream(f);
try {
byte [] b = new byte[magic.length()];
@@ -384,7 +385,7 @@ public static boolean moveAsideIfExists(File file) throws IOException {
* want this number of lines ending with a line containing
* the position; if positive, want this number of lines,
* all starting at or after position.
- * @param lines List
- * -- -1338,6 +1346,12 --
- *
- * public void releaseConnection() {
- *
- * + // HERITRIX always ants the streams closed.
- * + if (responseConnection != null)
- * + {
- * + responseConnection.close();
- * + }
- * +
- * if (responseStream != null) {
- * try {
- * // FYI - this may indirectly invoke responseBodyConsumed.
- * -- -1959,6 +1973,11 --
- * this.statusLine = null;
- * }
- * }
- * + // HERITRIX mark transition from header to content.
- * + if (this.httpRecorder != null)
- * + {
- * + this.httpRecorder.markContentBegin();
- * + }
- * readResponseBody(state, conn);
- * processResponseBody(state, conn);
- * } catch (IOException e) {
- *
- *
- * true if stale checking will be enabled on HttpConections
- *
- * @see HttpConnection#isStaleCheckingEnabled()
- *
- * @deprecated Use {@link HttpConnectionManagerParams#isStaleCheckingEnabled()},
- * {@link HttpConnectionManager#getParams()}.
- */
- public boolean isConnectionStaleCheckingEnabled() {
- return this.params.isStaleCheckingEnabled();
- }
-
- /**
- * Sets the staleCheckingEnabled value to be set on HttpConnections that are created.
- *
- * @param connectionStaleCheckingEnabled true if stale checking will be enabled
- * on HttpConections
- *
- * @see HttpConnection#setStaleCheckingEnabled(boolean)
- *
- * @deprecated Use {@link HttpConnectionManagerParams#setStaleCheckingEnabled(boolean)},
- * {@link HttpConnectionManager#getParams()}.
- */
- public void setConnectionStaleCheckingEnabled(
- final boolean connectionStaleCheckingEnabled) {
- this.params.setStaleCheckingEnabled(connectionStaleCheckingEnabled);
- }
-
- /**
- * @see HttpConnectionManager#getConnectionWithTimeout(HostConfiguration, long)
- *
- * @since 3.0
- */
- public HttpConnection getConnectionWithTimeout(
- final HostConfiguration hostConfiguration, final long timeout) {
-
- final ConnectionInfo ci = getConnectionInfo();
- HttpConnection httpConnection = ci.conn;
-
- // make sure the host and proxy are correct for this connection
- // close it and set the values if they are not
- if(httpConnection == null || !finishLastResponse(httpConnection)
- || !hostConfiguration.hostEquals(httpConnection)
- || !hostConfiguration.proxyEquals(httpConnection)) {
-
- if(httpConnection != null && httpConnection.isOpen()) {
- closer.closeConnection(httpConnection);
- }
-
- httpConnection = new HttpConnection(hostConfiguration);
- httpConnection.setHttpConnectionManager(this);
- httpConnection.getParams().setDefaults(this.params);
- ci.conn = httpConnection;
-
- httpConnection.setHost(hostConfiguration.getHost());
- httpConnection.setPort(hostConfiguration.getPort());
- httpConnection.setProtocol(hostConfiguration.getProtocol());
- httpConnection.setLocalAddress(hostConfiguration.getLocalAddress());
-
- httpConnection.setProxyHost(hostConfiguration.getProxyHost());
- httpConnection.setProxyPort(hostConfiguration.getProxyPort());
- }
-
- // remove the connection from the timeout handler
- ci.idleStartTime = Long.MAX_VALUE;
-
- return httpConnection;
- }
-
- /**
- * @see HttpConnectionManager#getConnection(HostConfiguration, long)
- *
- * @deprecated Use #getConnectionWithTimeout(HostConfiguration, long)
- */
- public HttpConnection getConnection(
- final HostConfiguration hostConfiguration, final long timeout) {
- return getConnectionWithTimeout(hostConfiguration, timeout);
- }
-
- /**
- * @see HttpConnectionManager#releaseConnection(org.apache.commons.httpclient.HttpConnection)
- */
- public void releaseConnection(final HttpConnection conn) {
- final ConnectionInfo ci = getConnectionInfo();
- HttpConnection httpConnection = ci.conn;
-
- if(conn != httpConnection) {
- throw new IllegalStateException(
- "Unexpected release of an unknown connection.");
- }
-
- finishLastResponse(httpConnection);
-
- // track the time the connection was made idle
- ci.idleStartTime = System.currentTimeMillis();
- }
-
- /**
- * Returns {@link HttpConnectionManagerParams parameters} associated
- * with this connection manager.
- *
- * @since 2.1
- *
- * @see HttpConnectionManagerParams
- */
- public HttpConnectionManagerParams getParams() {
- return this.params;
- }
-
- /**
- * Assigns {@link HttpConnectionManagerParams parameters} for this
- * connection manager.
- *
- * @since 2.1
- *
- * @see HttpConnectionManagerParams
- */
- public void setParams(final HttpConnectionManagerParams p) {
- if(p == null) {
- throw new IllegalArgumentException("Parameters may not be null");
- }
- this.params = p;
- }
-
- /**
- * @since 3.0
- */
- public void closeIdleConnections(final long idleTimeout) {
- long maxIdleTime = System.currentTimeMillis() - idleTimeout;
-
- final ConnectionInfo ci = getConnectionInfo();
-
- if(ci.idleStartTime <= maxIdleTime) {
- ci.conn.close();
- }
- }
-
- private static final class CloserThread extends Thread {
- private ListHttpRecorderGetMethod
+ HttpRecorderGetMethod
ConfigurableTrustManagerProtocolSocketFactory
+ ConfigurableTrustManagerProtocolSocketFactory
References
+ References
size > than buffer then we go to backing file to read
+ * size &gt; than buffer then we go to backing file to read
* data that is beyond buffer.length.
*
* @throws IOException If we fail to open an input stream on
@@ -84,7 +84,7 @@ public ReplayInputStream(byte[] buffer, long size, long responseBodyStart,
* @param buffer Buffer to read from.
* @param size Size of data to replay.
* @param backingFilename Backing file that sits behind the buffer. If
- * size > than buffer then we go to backing file to read
+ * size &gt; than buffer then we go to backing file to read
* data that is beyond buffer.length.
* @throws IOException If we fail to open an input stream on
* backing file.
@@ -130,7 +130,7 @@ public ReplayInputStream(InputStream fillStream) throws IOException {
}
/**
- * Close & destroy any internally-generated temporary files.
+ * Close &amp; destroy any internally-generated temporary files.
*/
public void destroy() {
IOUtils.closeQuietly(this);
diff --git a/src/main/java/org/archive/io/RepositionableInputStream.java b/src/main/java/org/archive/io/RepositionableInputStream.java
index 6f885130..838b5952 100644
--- a/src/main/java/org/archive/io/RepositionableInputStream.java
+++ b/src/main/java/org/archive/io/RepositionableInputStream.java
@@ -29,7 +29,7 @@
* stream. Uses a {@link BufferedInputStream}. Calls mark on every read so
* we'll remember at least the last thing read (You can only backup on the
* last thing read -- not last 2 or 3 things read). Used by
- * {@link GzippedInputStream} when reading streams over a network. Wraps a
+ * GzippedInputStream when reading streams over a network. Wraps a
* HTTP, etc., stream so we can back it up if needs be after the
* GZIP inflater has done a fill of its full buffer though it only needed
* the first few bytes to finish decompressing the current GZIP member.
diff --git a/src/main/java/org/archive/io/UTF8Bytes.java b/src/main/java/org/archive/io/UTF8Bytes.java
index c280b08d..4dc0144b 100644
--- a/src/main/java/org/archive/io/UTF8Bytes.java
+++ b/src/main/java/org/archive/io/UTF8Bytes.java
@@ -19,6 +19,7 @@
package org.archive.io;
import java.io.UnsupportedEncodingException;
+import java.nio.charset.StandardCharsets;
/**
* Marker Interface for instances that can be serialized as UTF8 bytes.
@@ -27,7 +28,7 @@
* @version $Date$ $Version$
*/
public interface UTF8Bytes {
- public static final String UTF8 = "UTF-8";
+ public static final String UTF8 = StandardCharsets.UTF_8.name();
/**
* @return Instance as UTF-8 bytes.
diff --git a/src/main/java/org/archive/io/WriterPool.java b/src/main/java/org/archive/io/WriterPool.java
index 2dc385a1..79da16c0 100644
--- a/src/main/java/org/archive/io/WriterPool.java
+++ b/src/main/java/org/archive/io/WriterPool.java
@@ -30,6 +30,7 @@
import java.util.logging.Level;
import java.util.logging.Logger;
+import org.archive.format.ArchiveFileConstants;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
@@ -88,10 +89,7 @@ public abstract class WriterPool {
/**
* Constructor
* @param serial Used to generate unique filename sequences
- * @param factory Factory that knows how to make a {@link WriterPoolMember}.
* @param settings Settings for this pool.
- * @param poolMaximumActive
- * @param poolMaximumWait
*/
public WriterPool(final AtomicInteger serial,
final WriterPoolSettings settings,
@@ -218,7 +216,7 @@ public synchronized void invalidateFile(WriterPoolMember f)
// gets attention.
File file = f.getFile();
file.renameTo(new File(file.getAbsoluteFile() +
- WriterPoolMember.INVALID_SUFFIX));
+ ArchiveFileConstants.INVALID_SUFFIX));
}
/**
diff --git a/src/main/java/org/archive/io/WriterPoolMember.java b/src/main/java/org/archive/io/WriterPoolMember.java
index 85d44e5d..5d350534 100644
--- a/src/main/java/org/archive/io/WriterPoolMember.java
+++ b/src/main/java/org/archive/io/WriterPoolMember.java
@@ -19,17 +19,19 @@
package org.archive.io;
-import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
-
+import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
+import java.nio.charset.StandardCharsets;
import java.text.DecimalFormat;
+import java.text.DecimalFormatSymbols;
import java.text.NumberFormat;
import java.util.Iterator;
import java.util.List;
+import java.util.Locale;
import java.util.Properties;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Logger;
@@ -39,6 +41,7 @@
import org.archive.util.FileUtils;
import org.archive.util.PropertyUtils;
+import static org.archive.format.ArchiveFileConstants.*;
/**
@@ -49,10 +52,10 @@
* @author stack
* @version $Date$ $Revision$
*/
-public abstract class WriterPoolMember implements ArchiveFileConstants {
+public abstract class WriterPoolMember {
private final Logger logger = Logger.getLogger(this.getClass().getName());
- public static final String UTF8 = "UTF-8";
+ public static final String UTF8 = StandardCharsets.UTF_8.name();
/**
* Default archival-aggregate filename template.
@@ -103,12 +106,17 @@ public abstract class WriterPoolMember implements ArchiveFileConstants {
*/
protected static int roundRobinIndex = 0;
+ /**
+ * Symbol set for serial number formatter.
+ */
+ protected static DecimalFormatSymbols serialNoFormatterSymbols = new DecimalFormatSymbols(Locale.ROOT);
+
/**
* NumberFormat instance for formatting serial number.
*
* Pads serial number with zeros.
*/
- protected static NumberFormat serialNoFormatter = new DecimalFormat("00000");
+ protected static NumberFormat serialNoFormatter = new DecimalFormat("00000", serialNoFormatterSymbols);
/**
@@ -125,9 +133,6 @@ public abstract class WriterPoolMember implements ArchiveFileConstants {
* @param serialNo used to create unique filename sequences
* @param out Where to write.
* @param file File the out is connected to.
- * @param cmprs Compress the content written.
- * @param a14DigitDate If null, we'll write current time.
- * @throws IOException
*/
protected WriterPoolMember(AtomicInteger serialNo,
final OutputStream out, final File file,
@@ -145,11 +150,6 @@ protected WriterPoolMember(AtomicInteger serialNo,
* Constructor.
*
* @param serialNo used to create unique filename sequences
- * @param dirs Where to drop files.
- * @param prefix File prefix to use.
- * @param cmprs Compress the records written.
- * @param maxSize Maximum size for ARC files written.
- * @param template filenaming template to use
* @param extension Extension to give file.
*/
public WriterPoolMember(AtomicInteger serialNo,
@@ -208,7 +208,7 @@ protected String createFile(final File file) throws IOException {
close();
this.f = file;
FileOutputStream fos = new FileOutputStream(this.f);
- this.countOut = new MiserOutputStream(new FastBufferedOutputStream(fos),settings.getFrequentFlushes());
+ this.countOut = new MiserOutputStream(new BufferedOutputStream(fos),settings.getFrequentFlushes());
this.out = this.countOut;
logger.fine("Opened " + this.f.getAbsolutePath());
return this.f.getName();
@@ -361,7 +361,6 @@ protected void postWriteRecordTasks()
* Position in raw output (typically, physical file).
* Used making accounting of bytes written.
* @return Position in final media (assuming all flushing completes)
- * @throws IOException
*/
public long getPosition() {
return (countOut==null)? 0L : this.countOut.getCount();
diff --git a/src/main/java/org/archive/io/arc/ARC2WCDX.java b/src/main/java/org/archive/io/arc/ARC2WCDX.java
index 19010131..aec571e9 100644
--- a/src/main/java/org/archive/io/arc/ARC2WCDX.java
+++ b/src/main/java/org/archive/io/arc/ARC2WCDX.java
@@ -22,18 +22,18 @@
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
-import java.util.Date;
-import java.util.Iterator;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.*;
import java.util.zip.GZIPOutputStream;
-import org.apache.commons.httpclient.Header;
-import org.apache.commons.httpclient.HeaderGroup;
-import org.apache.commons.httpclient.util.DateParseException;
-import org.apache.commons.httpclient.util.DateUtil;
+import org.archive.format.http.HttpHeader;
import org.archive.io.ArchiveRecord;
import org.archive.util.ArchiveUtils;
import org.archive.util.SURT;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
/**
* Create a 'Wide' CDX from an ARC. Takes one argument, the path to the ARC.
* Writes .wcdx.gz in same directory.
@@ -63,7 +63,7 @@ public static Object[] createWcdx(ARCReader reader) {
PrintStream writer = null;
long count = 0;
try {
- writer = new PrintStream(new GZIPOutputStream(new FileOutputStream(wcdxFile)));
+ writer = new PrintStream(new GZIPOutputStream(new FileOutputStream(wcdxFile)), false, UTF_8.name());
// write header: legend + timestamp
StringBuilder legend = new StringBuilder();
@@ -95,12 +95,15 @@ public static Object[] createWcdx(ARCReader reader) {
ARCRecord record = (ARCRecord) iter.next();
record.close();
ARCRecordMetaData h = (ARCRecordMetaData) record.getHeader();
- Header[] httpHeaders = record.getHttpHeaders();
+ HttpHeader[] httpHeaders = record.getHttpHeaders();
if(httpHeaders==null) {
- httpHeaders = new Header[0];
+ httpHeaders = new HttpHeader[0];
+ }
+ Map
- * % av_procarc hx20040109230030-0.arc.gz | av_ziparc > \
+ * % av_procarc hx20040109230030-0.arc.gz | av_ziparc &gt; \
* /tmp/hx20040109230030-0.dat.gz
- * % av_ripdat /tmp/hx20040109230030-0.dat.gz > /tmp/hx20040109230030-0.cdx
+ * % av_ripdat /tmp/hx20040109230030-0.dat.gz &gt; /tmp/hx20040109230030-0.cdx
*
* Examine the produced cdx file to make sure it makes sense. Search
* for 'no-type 0'. If found, then we're opening a gzip record w/o data to
@@ -110,7 +111,7 @@
*
* @author stack
*/
-public class ARCWriter extends WriterPoolMember implements ARCConstants, Closeable {
+public class ARCWriter extends WriterPoolMember implements Closeable {
private static final Logger logger =
Logger.getLogger(ARCWriter.class.getName());
@@ -129,12 +130,7 @@ public class ARCWriter extends WriterPoolMember implements ARCConstants, Closeab
* @param serialNo used to generate unique file name sequences
* @param out Where to write.
* @param arc File the out is connected to.
- * @param cmprs Compress the content written.
- * @param metadata File meta data. Can be null. Is list of File and/or
- * String objects.
- * @param a14DigitDate If null, we'll write current time.
- * @throws IOException
- */
+ */
public ARCWriter(final AtomicInteger serialNo, final PrintStream out,
final File arc, final WriterPoolSettings settings)
throws IOException {
diff --git a/src/main/java/org/archive/io/warc/WARCConstants.java b/src/main/java/org/archive/io/warc/WARCConstants.java
deleted file mode 100644
index 83cc8a6d..00000000
--- a/src/main/java/org/archive/io/warc/WARCConstants.java
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * This file is part of the Heritrix web crawler (crawler.archive.org).
- *
- * Licensed to the Internet Archive (IA) by one or more individual
- * contributors.
- *
- * The IA licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.archive.io.warc;
-
-@Deprecated
-public interface WARCConstants extends org.archive.format.warc.WARCConstants {
-}
diff --git a/src/main/java/org/archive/io/warc/WARCReader.java b/src/main/java/org/archive/io/warc/WARCReader.java
index a34854ef..34583e58 100644
--- a/src/main/java/org/archive/io/warc/WARCReader.java
+++ b/src/main/java/org/archive/io/warc/WARCReader.java
@@ -24,6 +24,7 @@
import java.io.InputStream;
import java.util.Iterator;
import java.util.List;
+import java.util.Locale;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
@@ -31,17 +32,19 @@
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
-import org.apache.commons.lang.NotImplementedException;
+import org.apache.commons.lang3.NotImplementedException;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveRecord;
+import static org.archive.format.warc.WARCConstants.*;
+
/**
* WARCReader.
* Go via {@link WARCReaderFactory} to get instance.
* @author stack
* @version $Date: 2006-11-27 18:03:03 -0800 (Mon, 27 Nov 2006) $ $Version$
*/
-public class WARCReader extends ArchiveReader implements WARCConstants {
+public class WARCReader extends ArchiveReader {
protected WARCReader() {
super();
}
@@ -196,7 +199,6 @@ public static void main(String [] args)
Options options = getOptions();
PosixParser parser = new PosixParser();
CommandLine cmdline = parser.parse(options, args, false);
- @SuppressWarnings("unchecked")
Listout is connected to.
- * @param cmprs Compress the content written.
- * @param a14DigitDate If null, we'll write current time.
- * @throws IOException
*/
public WARCWriter(final AtomicInteger serialNo,
final OutputStream out, final File f,
@@ -110,13 +109,6 @@ public WARCWriter(final AtomicInteger serialNo,
/**
* Constructor.
- *
- * @param dirs Where to drop files.
- * @param prefix File prefix to use.
- * @param cmprs Compress the records written.
- * @param maxSize Maximum size for ARC files written.
- * @param suffix File tail to use. If null, unused.
- * @param warcinfoData File metadata for warcinfo record.
*/
public WARCWriter(final AtomicInteger serialNo,
final WARCWriterPoolSettings settings) {
@@ -353,9 +345,9 @@ public URI writeWarcinfoRecord(String filename, final String description)
recordInfo.setMimetype("application/warc-fields");
// Strip .open suffix if present.
- if (filename.endsWith(WriterPoolMember.OCCUPIED_SUFFIX)) {
+ if (filename.endsWith(ArchiveFileConstants.OCCUPIED_SUFFIX)) {
filename = filename.substring(0,
- filename.length() - WriterPoolMember.OCCUPIED_SUFFIX.length());
+ filename.length() - ArchiveFileConstants.OCCUPIED_SUFFIX.length());
}
recordInfo.addExtraHeader(HEADER_KEY_FILENAME, filename);
if (description != null && description.length() > 0) {
@@ -366,12 +358,12 @@ public URI writeWarcinfoRecord(String filename, final String description)
byte [] warcinfoBody = null;
if (settings.getMetadata() == null) {
// TODO: What to write into a warcinfo? What to associate?
- warcinfoBody = "TODO: Unimplemented".getBytes();
+ warcinfoBody = "TODO: Unimplemented".getBytes(UTF_8);
} else {
ByteArrayOutputStream baos = new ByteArrayOutputStream();
for (final IteratorImplementation Notes
Tools
Arc2Warc and Warc2Arc
-tools can be found in the package above this one, at
-{@link org.archive.io.Arc2Warc} and {@link org.archive.io.Warc2Arc}
+tools can be found in Heritrix, at
+org.archive.io.Arc2Warc and org.archive.io.Warc2Arc
respectively. Pass --help to learn how to use each tool.
record-id generator.
*
- * @contributor stack
- * @contributor gojomo
+ * @author stack
+ * @author gojomo
* @version $Revision$ $Date$
*/
public interface RecordIDGenerator {
/**
* @return A URI that can serve as a record-id.
- * @throws URISyntaxException
*/
public URI getRecordID();
diff --git a/src/main/java/org/archive/uid/package.html b/src/main/java/org/archive/uid/package.html
index dc49f07b..bc69c9e3 100644
--- a/src/main/java/org/archive/uid/package.html
+++ b/src/main/java/org/archive/uid/package.html
@@ -8,13 +8,13 @@
Default is {@link org.archive.uid.UUIDGenerator}.
To use another ID Generator, set the System Property
org.archive.uid.GeneratorFactory.generator to point
-at an alternate implementation of {@link org.archive.uid.Generator}.
+at an alternate implementation of {@link org.archive.uid.RecordIDGenerator}.
TODO
CRLFCRLF) and two leading hyphens. Add to
- {@link org.archive.uid.Generator}
+ {@link org.archive.uid.RecordIDGenerator}
interface an upper-bound on generated ID length.http://archive.org/UID-SCHEME/ID
diff --git a/src/main/java/org/archive/url/BasicURLCanonicalizer.java b/src/main/java/org/archive/url/BasicURLCanonicalizer.java
index c09ad6e6..3957c9ef 100644
--- a/src/main/java/org/archive/url/BasicURLCanonicalizer.java
+++ b/src/main/java/org/archive/url/BasicURLCanonicalizer.java
@@ -6,7 +6,9 @@
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
+import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
+import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -15,18 +17,18 @@
/**
* Canonicalizer that does more or less basic fixup. Based initially on rules
* specified at https://developers.google.com/safe-browsing/developers_guide_v2#
- * Canonicalization. These rules are designed for clients of google's
+ * Canonicalization. These rules are designed for clients of Google's
* "experimental" Safe Browsing API to "check URLs against Google's
* constantly-updated blacklists of suspected phishing and malware pages".
*
*
+ * URI character sequence: char
+ * octet sequence: byte
+ * original character sequence: String
+ *
+ * - In general, written as follows:
+ * Absolute URI = <scheme>:<scheme-specific-part>
+ * Generic URI = <scheme>://<authority><path>?<query>
+ *
+ * - Syntax
+ * absoluteURI = scheme ":" ( hier_part | opaque_part )
+ * hier_part = ( net_path | abs_path ) [ "?" query ]
+ * net_path = "//" authority [ abs_path ]
+ * abs_path = "/" path_segments
+ *
+ * ftp://ftp.is.co.za/rfc/rfc1808.txt
+ * -- ftp scheme for File Transfer Protocol services
+ * gopher://spinaltap.micro.umn.edu/00/Weather/California/Los%20Angeles
+ * -- gopher scheme for Gopher and Gopher+ Protocol services
+ * http://www.math.uio.no/faq/compression-faq/part1.html
+ * -- http scheme for Hypertext Transfer Protocol services
+ * mailto:mduerst@ifi.unizh.ch
+ * -- mailto scheme for electronic mail addresses
+ * news:comp.infosystems.www.servers.unix
+ * -- news scheme for USENET news groups and articles
+ * telnet://melvyl.ucop.edu/
+ * -- telnet scheme for interactive services via the TELNET Protocol
+ *
+ * Please, notice that there are many modifications from URL(RFC 1738) and
+ * relative URL(RFC 1808).
+ *
+ * For escaped URI forms
+ * - URI(char[]) // constructor
+ * - char[] getRawXxx() // method
+ * - String getEscapedXxx() // method
+ * - String toString() // method
+ *
+ * For unescaped URI forms
+ * - URI(String) // constructor
+ * - String getXXX() // method
+ *
null
+ *
+ * @see #getProtocolCharset
+ *
+ * @since 3.0
+ */
+ public URI(String s, boolean escaped, String charset)
+ throws URIException, NullPointerException {
+ // Store the charset before parsing so it is already set when the
+ // reference is processed.
+ protocolCharset = charset;
+ // Parse s as a URI reference; 'escaped' says whether s is already escaped.
+ parseUriReference(s, escaped);
+ }
+
+ /**
+ * Construct a URI from a string. The input string can be either in escaped
+ * or unescaped form. This overload takes no charset argument and leaves
+ * the protocol charset of the new instance unassigned.
+ *
+ * @param s URI character sequence
+ * @param escaped true if URI character sequence is in escaped form.
+ * false otherwise.
+ *
+ * @throws URIException If the URI cannot be created.
+ * @throws NullPointerException if input string is null
+ * (NOTE(review): parseUriReference rejects a null reference with a
+ * URIException, so a null s appears to surface as URIException - confirm)
+ *
+ * @see #getProtocolCharset
+ *
+ * @since 3.0
+ */
+ public URI(String s, boolean escaped)
+ throws URIException, NullPointerException {
+ parseUriReference(s, escaped);
+ }
+
+ /**
+ * Construct a URI from a character array that is already in escaped form,
+ * using the given charset as the protocol charset of the new instance.
+ *
+ * @param escaped the URI character sequence, already escaped
+ * @param charset the charset string to do escape encoding
+ * @throws URIException If the URI cannot be created.
+ * @throws NullPointerException if escaped is null
+ * @see #getProtocolCharset
+ *
+ * @deprecated Use {@link #URI(String, boolean, String)}
+ */
+ public URI(char[] escaped, String charset)
+ throws URIException, NullPointerException {
+ protocolCharset = charset;
+ // The array is copied into a String and parsed as escaped input.
+ parseUriReference(new String(escaped), true);
+ }
+
+
+ /**
+ * Construct a URI from a character array that is already in escaped form.
+ * A URI can be placed within double-quotes or angle brackets like
+ * "http://test.com/" and <http://test.com/>
+ *
+ * @param escaped the URI character sequence, already escaped
+ * @throws URIException If the URI cannot be created.
+ * @throws NullPointerException if escaped is null
+ * @see #getDefaultProtocolCharset
+ *
+ * @deprecated Use {@link #URI(String, boolean)}
+ */
+ public URI(char[] escaped)
+ throws URIException, NullPointerException {
+ // The array is copied into a String and parsed as escaped input.
+ parseUriReference(new String(escaped), true);
+ }
+
+
+ /**
+ * Construct a URI from the given unescaped string, using the given charset
+ * as the protocol charset of the new instance.
+ *
+ * @param original the string to be represented to URI character sequence
+ * It is one of absoluteURI and relativeURI.
+ * @param charset the charset string to do escape encoding
+ * @throws URIException If the URI cannot be created.
+ * @see #getProtocolCharset
+ *
+ * @deprecated Use {@link #URI(String, boolean, String)}
+ */
+ public URI(String original, String charset) throws URIException {
+ protocolCharset = charset;
+ // 'false': the input is treated as not yet escaped.
+ parseUriReference(original, false);
+ }
+
+
+ /**
+ * Construct a URI from the given string.
+ *
+ * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
+ *
+ * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
+ * absoluteURI = scheme ":" ( hier_part | opaque_part )
+ * opaque_part = uric_no_slash *uric
+ *
+ * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
+ * absoluteURI = scheme ":" ( hier_part | opaque_part )
+ * relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ]
+ * hier_part = ( net_path | abs_path ) [ "?" query ]
+ *
+ * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
+ * relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ]
+ *
+ * http://a/b/c/d;p?q
+ *
+ * g:h = g:h
+ * g = http://a/b/c/g
+ * ./g = http://a/b/c/g
+ * g/ = http://a/b/c/g/
+ * /g = http://a/g
+ * //g = http://g
+ * ?y = http://a/b/c/?y
+ * g?y = http://a/b/c/g?y
+ * #s = (current document)#s
+ * g#s = http://a/b/c/g#s
+ * g?y#s = http://a/b/c/g?y#s
+ * ;x = http://a/b/c/;x
+ * g;x = http://a/b/c/g;x
+ * g;x?y#s = http://a/b/c/g;x?y#s
+ * . = http://a/b/c/
+ * ./ = http://a/b/c/
+ * .. = http://a/b/
+ * ../ = http://a/b/
+ * ../g = http://a/b/g
+ * ../.. = http://a/
+ * ../../ = http://a/
+ * ../../g = http://a/g
+ *
+ * digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
+ * "8" | "9"
+ *
+ * alpha = lowalpha | upalpha
+ *
+ * alphanum = alpha | digit
+ *
+ * hex = digit | "A" | "B" | "C" | "D" | "E" | "F" |
+ * "a" | "b" | "c" | "d" | "e" | "f"
+ *
+ * escaped = "%" hex hex
+ *
+ * mark = "-" | "_" | "." | "!" | "~" | "*" | "'" |
+ * "(" | ")"
+ *
+ * unreserved = alphanum | mark
+ *
+ * reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
+ * "$" | ","
+ *
+ * uric = reserved | unreserved | escaped
+ *
+ * fragment = *uric
+ *
+ * query = *uric
+ *
+ * pchar = unreserved | escaped |
+ * ":" | "@" | "&" | "=" | "+" | "$" | ","
+ *
+ * param = *pchar
+ *
+ * segment = *pchar *( ";" param )
+ *
+ * path_segments = segment *( "/" segment )
+ *
+ * abs_path = "/" path_segments
+ *
+ * uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" |
+ * "&" | "=" | "+" | "$" | ","
+ *
+ * opaque_part = uric_no_slash *uric
+ *
+ * path = [ abs_path | opaque_part ]
+ *
+ * IPv4address = 1*digit "." 1*digit "." 1*digit "." 1*digit
+ *
+ * IPv6address = hexpart [ ":" IPv4address ]
+ *
+ * IPv6reference = "[" IPv6address "]"
+ *
+ * toplabel = alpha | alpha *( alphanum | "-" ) alphanum
+ *
+ * domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
+ *
+ * hostname = *( domainlabel "." ) toplabel [ "." ]
+ *
+ * host = hostname | IPv4address | IPv6reference
+ *
+ * hostport = host [ ":" port ]
+ *
+ * userinfo = *( unreserved | escaped |
+ * ";" | ":" | "&" | "=" | "+" | "$" | "," )
+ *
+ * server = [ [ userinfo "@" ] hostport ]
+ *
+ * reg_name = 1*( unreserved | escaped | "$" | "," |
+ * ";" | ":" | "@" | "&" | "=" | "+" )
+ *
+ * authority = server | reg_name
+ *
+ * scheme = alpha *( alpha | digit | "+" | "-" | "." )
+ *
+ * rel_segment = 1*( unreserved | escaped |
+ * ";" | "@" | "&" | "=" | "+" | "$" | "," )
+ *
+ * rel_path = rel_segment [ abs_path ]
+ *
+ * net_path = "//" authority [ abs_path ]
+ *
+ * hier_part = ( net_path | abs_path ) [ "?" query ]
+ *
+ * relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ]
+ *
+ * absoluteURI = scheme ":" ( hier_part | opaque_part )
+ *
+ * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
+ *
+ * original character sequence->octet sequence->URI character sequence
+ *
+ * URI character sequence->octet sequence->original character sequence
+ *
+ * URI character sequence->octet sequence->original character sequence
+ *
String with the character
+ * encoding of the local system or the document.
+ *
+ * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
+ * 12 3 4 5 6 7 8 9
+ *
+ * $1 = http:
+ * scheme = $2 = http
+ * $3 = //jakarta.apache.org
+ * authority = $4 = jakarta.apache.org
+ * path = $5 = /ietf/uri/
+ * $6 = [undefined]
+ * query = $7 = [undefined]
+ * $8 = #Related
+ * fragment = $9 = Related
+ *
true if original is escaped
+ * @throws URIException If an error occurs.
+ */
+ protected void parseUriReference(String original, boolean escaped)
+ throws URIException {
+
+ // validate and construct the URI character sequence
+ if (original == null) {
+ throw new URIException("URI-Reference required");
+ }
+
+ /* @
+ * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
+ */
+ String tmp = original.trim();
+
+ /*
+ * The length of the string sequence of characters.
+ * It may not be equal to the length of the byte array.
+ */
+ int length = tmp.length();
+
+ /*
+ * Remove the delimiters like angle brackets around an URI.
+ */
+ if (length > 0) {
+ char[] firstDelimiter = { tmp.charAt(0) };
+ if (validate(firstDelimiter, delims)) {
+ if (length >= 2) {
+ char[] lastDelimiter = { tmp.charAt(length - 1) };
+ if (validate(lastDelimiter, delims)) {
+ tmp = tmp.substring(1, length - 1);
+ length = length - 2;
+ }
+ }
+ }
+ }
+
+ /*
+ * The starting index
+ */
+ int from = 0;
+
+ /*
+ * The test flag whether the URI is started from the path component.
+ */
+ boolean isStartedFromPath = false;
+ int atColon = tmp.indexOf(':');
+ int atSlash = tmp.indexOf('/');
+ if ((atColon <= 0 && !tmp.startsWith("//"))
+ || (atSlash >= 0 && atSlash < atColon)) {
+ isStartedFromPath = true;
+ }
+
+ /*
+ *
+ * @@@@@@@@
+ * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
+ *
+ * scheme = $2 = http
+ * @
+ * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
+ *
+ * authority = $4 = jakarta.apache.org
+ * @@
+ * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
+ *
+ * path = $5 = /ietf/uri/
+ * @@@@@@
+ * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
+ *
+ * query = $7 =
+ * fragment = $9 = Related
+ * @@@@@@@@
+ * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
+ *
true if original is escaped
+ * @throws URIException If an error occurs.
+ */
+ protected void parseAuthority(String original, boolean escaped)
+ throws URIException {
+
+ // Splits the authority text into optional userinfo, host and optional
+ // port, classifies the host (IPv6 reference, IPv4 address, hostname or
+ // registry name) and stores the assembled result in _authority.
+
+ // Reset flags
+ _is_reg_name = _is_server =
+ _is_hostname = _is_IPv4address = _is_IPv6reference = false;
+ // NOTE(review): only the flags are reset; _userinfo, _host and _port keep
+ // their previous values unless reassigned below - confirm this is
+ // intended if the method can run more than once on an instance.
+
+ // set the charset to do escape encoding
+ String charset = getProtocolCharset();
+
+ boolean hasPort = true;
+ int from = 0;
+ int next = original.indexOf('@');
+ if (next != -1) { // any '@' counts, including one at index 0 (empty userinfo)
+ // each protocol extended from URI supports the specific userinfo
+ _userinfo = (escaped) ? original.substring(0, next).toCharArray()
+ : encode(original.substring(0, next), allowed_userinfo,
+ charset);
+ from = next + 1;
+ }
+ next = original.indexOf('[', from);
+ if (next >= from) {
+ next = original.indexOf(']', from);
+ if (next == -1) {
+ throw new URIException(URIException.PARSING, "IPv6reference");
+ } else {
+ next++;
+ }
+ // next now points just past ']', so the slice below runs from 'from'
+ // through the closing bracket inclusive; the brackets stay in _host.
+ _host = (escaped) ? original.substring(from, next).toCharArray()
+ : encode(original.substring(from, next), allowed_IPv6reference,
+ charset);
+ // Set flag
+ _is_IPv6reference = true;
+ } else { // only for !_is_IPv6reference
+ next = original.indexOf(':', from);
+ if (next == -1) {
+ // no ':' after the userinfo: the host runs to the end, no port
+ next = original.length();
+ hasPort = false;
+ }
+ // REMINDME: it doesn't need the pre-validation
+ _host = original.substring(from, next).toCharArray();
+ // Classify the host; anything that is neither an IPv4 address nor a
+ // hostname is treated as a registry-based name.
+ if (validate(_host, IPv4address)) {
+ // Set flag
+ _is_IPv4address = true;
+ } else if (validate(_host, hostname)) {
+ // Set flag
+ _is_hostname = true;
+ } else {
+ // Set flag
+ _is_reg_name = true;
+ }
+ }
+ if (_is_reg_name) {
+ // Reset flags for a server-based naming authority
+ _is_server = _is_hostname = _is_IPv4address =
+ _is_IPv6reference = false;
+ // set a registry-based naming authority
+ if (escaped) {
+ // escaped input is used verbatim and must already be a valid reg_name
+ _authority = original.toCharArray();
+ if (!validate(_authority, reg_name)) {
+ throw new URIException("Invalid authority");
+ }
+ } else {
+ _authority = encode(original, allowed_reg_name, charset);
+ }
+ } else {
+ // A port is parsed only when a ':' sits at 'next' and at least one
+ // character follows it.
+ if (original.length() - 1 > next && hasPort
+ && original.charAt(next) == ':') { // not empty
+ from = next + 1;
+ try {
+ _port = Integer.parseInt(original.substring(from));
+ } catch (NumberFormatException error) {
+ throw new URIException(URIException.PARSING,
+ "invalid port number");
+ }
+ }
+ // set a server-based naming authority
+ // rebuilt as [userinfo "@"] host [":" port]
+ StringBuffer buf = new StringBuffer();
+ if (_userinfo != null) { // has_userinfo
+ buf.append(_userinfo);
+ buf.append('@');
+ }
+ if (_host != null) {
+ buf.append(_host);
+ if (_port != -1) {
+ buf.append(':');
+ buf.append(_port);
+ }
+ }
+ _authority = buf.toString().toCharArray();
+ // Set flag
+ _is_server = true;
+ }
+ }
+
+
+ /**
+ * Rebuild the raw URI character sequence (_uri) from the components that
+ * are currently set, and clear the cached hash.
+ *
+ * The components are concatenated in the order scheme, "//" authority,
+ * opaque part or path, and "?" query. The fragment is not included.
+ *
+ * @see #getRawURI
+ */
+ protected void setURI() {
+ // A method-local buffer needs no synchronization, hence StringBuilder.
+ StringBuilder buf = new StringBuilder();
+ // ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
+ if (_scheme != null) {
+ buf.append(_scheme);
+ buf.append(':');
+ }
+ if (_is_net_path) {
+ // "//" is written for a net_path even when no authority is set
+ buf.append("//");
+ if (_authority != null) { // has_authority
+ buf.append(_authority);
+ }
+ }
+ if (_opaque != null && _is_opaque_part) {
+ buf.append(_opaque);
+ } else if (_path != null && _path.length != 0) {
+ // _is_hier_part or _is_relativeURI
+ buf.append(_path);
+ }
+ if (_query != null) { // has_query
+ buf.append('?');
+ buf.append(_query);
+ }
+ // ignore the fragment identifier
+ _uri = buf.toString().toCharArray();
+ // reset the cached hash value
+ hash = 0;
+ }
+
+ // ----------------------------------------------------------- Test methods
+
+
+ /**
+ * Reports whether this URI is an absoluteURI, i.e. whether it has a scheme.
+ *
+ * @return true if and only if a scheme is set on this URI
+ */
+ public boolean isAbsoluteURI() {
+ return _scheme != null;
+ }
+
+
+ /**
+ * Reports whether this URI is a relativeURI, i.e. whether it has no scheme.
+ *
+ * @return true if and only if no scheme is set on this URI
+ */
+ public boolean isRelativeURI() {
+ return _scheme == null;
+ }
+
+
+ /**
+ * Tell whether or not the absoluteURI of this URI is hier_part.
+ *
+ * @return true if and only if the absoluteURI is hier_part
+ */
+ public boolean isHierPart() {
+ return _is_hier_part;
+ }
+
+
+ /**
+ * Tell whether or not the absoluteURI of this URI is opaque_part.
+ *
+ * @return true if and only if the absoluteURI is opaque_part
+ */
+ public boolean isOpaquePart() {
+ return _is_opaque_part;
+ }
+
+
+ /**
+ * Reports whether the relativeURI or hier_part of this URI is a net_path.
+ * The result is true when the net_path flag is set or an authority is
+ * present, which is the same condition hasAuthority() evaluates.
+ *
+ * @return true if and only if the relativeURI or hier_part is net_path
+ * @see #hasAuthority
+ */
+ public boolean isNetPath() {
+ return _is_net_path || _authority != null;
+ }
+
+
+ /**
+ * Tell whether or not the relativeURI or hier_part of this URI is abs_path.
+ *
+ * @return true if and only if the relativeURI or hier_part is abs_path
+ */
+ public boolean isAbsPath() {
+ return _is_abs_path;
+ }
+
+
+ /**
+ * Tell whether or not the relativeURI of this URI is rel_path.
+ *
+ * @return true if and only if the relativeURI is rel_path
+ */
+ public boolean isRelPath() {
+ return _is_rel_path;
+ }
+
+
+ /**
+ * Reports whether this URI has an authority. The result is true when an
+ * authority is present or the net_path flag is set, which is the same
+ * condition isNetPath() evaluates.
+ *
+ * @return true if and only if this URI has authority
+ * @see #isNetPath
+ */
+ public boolean hasAuthority() {
+ return _authority != null || _is_net_path;
+ }
+
+ /**
+ * Tell whether or not the authority component of this URI is reg_name.
+ *
+ * @return true if and only if the authority component is reg_name
+ */
+ public boolean isRegName() {
+ return _is_reg_name;
+ }
+
+
+ /**
+ * Tell whether or not the authority component of this URI is server.
+ *
+ * @return true if and only if the authority component is server
+ */
+ public boolean isServer() {
+ return _is_server;
+ }
+
+
+ /**
+ * Reports whether a userinfo component is set on this URI.
+ *
+ * @return true if and only if this URI has userinfo
+ */
+ public boolean hasUserinfo() {
+ return _userinfo != null;
+ }
+
+
+ /**
+ * Tell whether or not the host part of this URI is hostname.
+ *
+ * @return true if and only if the host part is hostname
+ */
+ public boolean isHostname() {
+ return _is_hostname;
+ }
+
+
+ /**
+ * Tell whether or not the host part of this URI is IPv4address.
+ *
+ * @return true if and only if the host part is IPv4address
+ */
+ public boolean isIPv4address() {
+ return _is_IPv4address;
+ }
+
+
+ /**
+ * Tell whether or not the host part of this URI is IPv6reference.
+ *
+ * @return true if and only if the host part is IPv6reference
+ */
+ public boolean isIPv6reference() {
+ return _is_IPv6reference;
+ }
+
+
+ /**
+ * Reports whether a query component is set on this URI.
+ *
+ * @return true if and only if this URI has query
+ */
+ public boolean hasQuery() {
+ return _query != null;
+ }
+
+
+ /**
+ * Reports whether a fragment component is set on this URI.
+ *
+ * @return true if and only if this URI has fragment
+ */
+ public boolean hasFragment() {
+ return _fragment != null;
+ }
+
+
+ // ---------------------------------------------------------------- Charset
+
+
+ /**
+ * Set the default charset of the protocol.
+ *
+ * The assignment always takes effect, and the method then unconditionally
+ * throws a DefaultCharsetChanged exception to announce the change.
+ *
+ * So API programmer must follow the following way:
+ *
+ * import org.apache.util.URI$DefaultCharsetChanged;
+ * .
+ * .
+ * .
+ * try {
+ * URI.setDefaultProtocolCharset("UTF-8");
+ * } catch (DefaultCharsetChanged cc) {
+ * // CASE 1: the exception could be ignored, when it is set by user
+ * if (cc.getReasonCode() == DefaultCharsetChanged.PROTOCOL_CHARSET) {
+ * // CASE 2: let user know the default protocol charset changed
+ * } else {
+ * // CASE 3: let user know the default document charset changed
+ * }
+ * }
+ *
+ *
+ * The API programmer is responsible to set the correct charset.
+ * And each application should remember its own charset to support.
+ *
+ * @param charset the default charset for each protocol
+ * @throws DefaultCharsetChanged always, after the new default has been
+ * stored, with reason code PROTOCOL_CHARSET
+ */
+ public static void setDefaultProtocolCharset(String charset)
+ throws DefaultCharsetChanged {
+
+ defaultProtocolCharset = charset;
+ // thrown on every call, by design, to signal the change to the caller
+ throw new DefaultCharsetChanged(DefaultCharsetChanged.PROTOCOL_CHARSET,
+ "the default protocol charset changed");
+ }
+
+
+ /**
+ * Set the default charset of the document.
+ *
+ * The assignment always takes effect, and the method then unconditionally
+ * throws a DefaultCharsetChanged exception to announce the change.
+ *
+ * So API programmer must follow the following way:
+ *
+ * import org.apache.util.URI$DefaultCharsetChanged;
+ * .
+ * .
+ * .
+ * try {
+ * URI.setDefaultDocumentCharset("EUC-KR");
+ * } catch (DefaultCharsetChanged cc) {
+ * // CASE 1: the exception could be ignored, when it is set by user
+ * if (cc.getReasonCode() == DefaultCharsetChanged.DOCUMENT_CHARSET) {
+ * // CASE 2: let user know the default document charset changed
+ * } else {
+ * // CASE 3: let user know the default protocol charset changed
+ * }
+ * }
+ *
+ *
+ * The API programmer is responsible to set the correct charset.
+ * And each application should remember its own charset to support.
+ *
+ * @param charset the default charset for the document
+ * @throws DefaultCharsetChanged always, after the new default has been
+ * stored, with reason code DOCUMENT_CHARSET
+ */
+ public static void setDefaultDocumentCharset(String charset)
+ throws DefaultCharsetChanged {
+
+ defaultDocumentCharset = charset;
+ // thrown on every call, by design, to signal the change to the caller
+ throw new DefaultCharsetChanged(DefaultCharsetChanged.DOCUMENT_CHARSET,
+ "the default document charset changed");
+ }
+
+
+ /**
+ * Get the recommended default charset of the document, i.e. the value
+ * last stored by setDefaultDocumentCharset or the initial default.
+ *
+ * @return the default document charset string
+ */
+ public static String getDefaultDocumentCharset() {
+ return defaultDocumentCharset;
+ }
+
+
+ /**
+ * Get the default charset of the document by locale.
+ *
+ * @return the default document charset string chosen by locale
+ */
+ public static String getDefaultDocumentCharsetByLocale() {
+ return defaultDocumentCharsetByLocale;
+ }
+
+
+ /**
+ * Get the default charset of the document by platform.
+ *
+ * @return the default document charset string chosen by platform
+ */
+ public static String getDefaultDocumentCharsetByPlatform() {
+ return defaultDocumentCharsetByPlatform;
+ }
+
+ // ------------------------------------------------------------- The scheme
+
+ /**
+ * Get the scheme as a raw character array.
+ *
+ * The internal array itself is returned, not a copy, so callers should
+ * treat it as read-only.
+ *
+ * @return the scheme, or null if no scheme is set
+ */
+ public char[] getRawScheme() {
+ return _scheme;
+ }
+
+
+ /**
+ * Get the scheme as a String.
+ *
+ * @return a new String holding the scheme characters,
+ * or null if the scheme is undefined
+ */
+ public String getScheme() {
+ if (_scheme == null) {
+ return null;
+ }
+ return new String(_scheme);
+ }
+
+ // ---------------------------------------------------------- The authority
+
+ /**
+ * Set the authority. It can be one type of server, hostport, hostname,
+ * IPv4address, IPv6reference and reg_name.
+ *
+ * authority = server | reg_name
+ *
+ * host = hostname | IPv4address | IPv6reference
+ *
+ * host = hostname | IPv4address | IPv6reference
+ *
+ * path = [ abs_path | opaque_part ]
+ *
+ * path = [ abs_path | opaque_part ]
+ * abs_path = "/" path_segments
+ * opaque_part = uric_no_slash *uric
+ *
+ * path = [ abs_path | opaque_part ]
+ *
URI. So refer to the same-named APIs
+ * implemented in each specific protocol instance.
+ *
+ * @param query the query string.
+ * @throws URIException incomplete trailing escape pattern or unsupported
+ * character encoding
+ * @see #encode
+ */
+ public void setQuery(String query) throws URIException {
+ // A non-empty query is escaped and handed to setRawQuery.
+ if (query != null && query.length() != 0) {
+ setRawQuery(encode(query, allowed_query, getProtocolCharset()));
+ return;
+ }
+ // null clears the query; "" is stored as an empty array. Either way
+ // the raw URI is rebuilt directly.
+ _query = (query != null) ? query.toCharArray() : null;
+ setURI();
+ }
+
+
+ /**
+ * Get the raw-escaped query.
+ *
+ * The internal array itself is returned, not a copy.
+ *
+ * @return the raw-escaped query, or null if no query is set
+ */
+ public char[] getRawQuery() {
+ return _query;
+ }
+
+
+ /**
+ * Get the query in its escaped form as a String.
+ *
+ * @return a new String holding the escaped query characters,
+ * or null if no query is set
+ */
+ public String getEscapedQuery() {
+ if (_query == null) {
+ return null;
+ }
+ return new String(_query);
+ }
+
+
+ /**
+ * Get the query, decoded with the protocol charset.
+ *
+ * @return the decoded query string, or null if no query is set
+ * @throws URIException incomplete trailing escape pattern or unsupported
+ * character encoding
+ * @see #decode
+ */
+ public String getQuery() throws URIException {
+ if (_query == null) {
+ return null;
+ }
+ return decode(_query, getProtocolCharset());
+ }
+
+ // ----------------------------------------------------------- The fragment
+
+ /**
+ * Set the raw-escaped fragment.
+ *
+ * @param escapedFragment the raw-escaped fragment
+ * @throws URIException escaped fragment not valid
+ */
+ public void setRawFragment(char[] escapedFragment) throws URIException {
+ if (escapedFragment == null || escapedFragment.length == 0) {
+ _fragment = escapedFragment;
+ hash = 0;
+ return;
+ }
+ if (!validate(escapedFragment, fragment)) {
+ throw new URIException(URIException.ESCAPING,
+ "escaped fragment not valid");
+ }
+ _fragment = escapedFragment;
+ hash = 0;
+ }
+
+
+ /**
+ * Set the fragment from an already-escaped string. Passing null clears
+ * the fragment.
+ *
+ * @param escapedFragment the escaped fragment string
+ * @throws URIException escaped fragment not valid
+ */
+ public void setEscapedFragment(String escapedFragment) throws URIException {
+ if (escapedFragment != null) {
+ setRawFragment(escapedFragment.toCharArray());
+ } else {
+ // clear the fragment and the cached hash
+ _fragment = null;
+ hash = 0;
+ }
+ }
+
+
+ /**
+ * Set the fragment from an unescaped string. Null clears the fragment, an
+ * empty string is stored as an empty array, and anything else is escaped
+ * with the protocol charset before being stored.
+ *
+ * @param fragment the fragment string.
+ * @throws URIException If an error occurs.
+ */
+ public void setFragment(String fragment) throws URIException {
+ final char[] value;
+ if (fragment == null) {
+ value = null;
+ } else if (fragment.length() == 0) {
+ value = fragment.toCharArray();
+ } else {
+ value = encode(fragment, allowed_fragment, getProtocolCharset());
+ }
+ // State is only touched once the value has been computed.
+ _fragment = value;
+ hash = 0;
+ }
+
+
+ /**
+ * Get the raw-escaped fragment.
+ * String.
+ * URI object including the userinfo
+ * component, it should be used.
+ *
+ * @return a clone of this instance
+ */
+ public synchronized Object clone() throws CloneNotSupportedException {
+
+ // Start from the copy produced by the superclass clone().
+ URI instance = (URI) super.clone();
+
+ // Components are assigned by reference, so the clone shares the same
+ // char[] instances with this object (a shallow copy).
+ instance._uri = _uri;
+ instance._scheme = _scheme;
+ instance._opaque = _opaque;
+ instance._authority = _authority;
+ instance._userinfo = _userinfo;
+ instance._host = _host;
+ instance._port = _port;
+ instance._path = _path;
+ instance._query = _query;
+ instance._fragment = _fragment;
+ // the charset to do escape encoding for this instance
+ instance.protocolCharset = protocolCharset;
+ // flags
+ instance._is_hier_part = _is_hier_part;
+ instance._is_opaque_part = _is_opaque_part;
+ instance._is_net_path = _is_net_path;
+ instance._is_abs_path = _is_abs_path;
+ instance._is_rel_path = _is_rel_path;
+ instance._is_reg_name = _is_reg_name;
+ instance._is_server = _is_server;
+ instance._is_hostname = _is_hostname;
+ instance._is_IPv4address = _is_IPv4address;
+ instance._is_IPv6reference = _is_IPv6reference;
+
+ return instance;
+ }
+
+ // ------------------------------------------------------------ Get the URI
+
+ /**
+ * Returns the URI character sequence in its raw-escaped form, exactly as
+ * getEscapedURI() reports it. This form is useful when the URI has to be
+ * transported by the protocol.
+ *
+ * It means that the API client programmer should extract the user and
+ * password manually when they are needed. That will probably be supported
+ * in each subclass, but not as part of a whole URI-reference expression.
+ *
+ * @return the escaped URI string
+ * @see #clone()
+ */
+ public String toString() {
+ return getEscapedURI();
+ }
+
+
+ // ------------------------------------------------------------ Inner class
+
+ /**
+ * Unchecked exception used as a notification: the default-charset setters
+ * throw it after storing the new value, to alert the caller that a default
+ * charset has changed. The reason code says which default was changed.
+ */
+ public static class DefaultCharsetChanged extends RuntimeException {
+
+ // ---------------------------------------------------------- constants
+
+ /** Reason code used when no specific reason applies. */
+ public static final int UNKNOWN = 0;
+
+ /** Reason code: the default protocol charset changed. */
+ public static final int PROTOCOL_CHARSET = 1;
+
+ /** Reason code: the default document charset changed. */
+ public static final int DOCUMENT_CHARSET = 2;
+
+ // ------------------------------------------------- instance variables
+
+ /** The reason message, also passed to the superclass as its message. */
+ private String reason;
+
+ /** One of the reason-code constants declared above. */
+ private int reasonCode;
+
+ // ------------------------------------------------------- constructors
+
+ /**
+ * Creates the exception from a reason code and a reason message.
+ *
+ * @param reasonCode the reason code
+ * @param reason the reason
+ */
+ public DefaultCharsetChanged(int reasonCode, String reason) {
+ super(reason);
+ this.reasonCode = reasonCode;
+ this.reason = reason;
+ }
+
+ // ------------------------------------------------------------ methods
+
+ /**
+ * Returns the reason code given at construction.
+ *
+ * @return the reason code
+ */
+ public int getReasonCode() {
+ return this.reasonCode;
+ }
+
+ /**
+ * Returns the reason message given at construction.
+ *
+ * @return the reason message
+ */
+ public String getReason() {
+ return this.reason;
+ }
+
+ }
+
+
+ /**
+ * A mapping to determine the (somewhat arbitrarily) preferred charset for a
+ * given locale. Supports all locales recognized in JDK 1.1.
+ * str is not null.
* @param suffix True if substr is a suffix.
*/
- private void appendNonNull(MutableString b, String str, String substr,
+ private void appendNonNull(StringBuilder b, String str, String substr,
boolean suffix) {
if (str != null && str.length() > 0) {
if (!suffix) {
@@ -760,6 +756,6 @@ private String checkUriElement(String element) {
*/
private String checkUriElementAndLowerCase(String element) {
String tmp = checkUriElement(element);
- return (tmp != null)? tmp.toLowerCase(): tmp;
+ return (tmp != null)? tmp.toLowerCase(Locale.ROOT): tmp;
}
}
diff --git a/src/main/java/org/archive/url/WaybackURLKeyMaker.java b/src/main/java/org/archive/url/WaybackURLKeyMaker.java
index 99fb92e9..56f51b49 100644
--- a/src/main/java/org/archive/url/WaybackURLKeyMaker.java
+++ b/src/main/java/org/archive/url/WaybackURLKeyMaker.java
@@ -5,7 +5,7 @@
public class WaybackURLKeyMaker implements URLKeyMaker {
// URLCanonicalizer canonicalizer = new NonMassagingIAURLCanonicalizer();
- URLCanonicalizer canonicalizer = new DefaultIAURLCanonicalizer();
+ URLCanonicalizer canonicalizer = new AggressiveIAURLCanonicalizer();
public URLCanonicalizer getCanonicalizer() {
return canonicalizer;
diff --git a/src/main/java/org/archive/util/ArchiveUtils.java b/src/main/java/org/archive/util/ArchiveUtils.java
index e4224384..cce411df 100644
--- a/src/main/java/org/archive/util/ArchiveUtils.java
+++ b/src/main/java/org/archive/util/ArchiveUtils.java
@@ -49,10 +49,12 @@
import org.archive.format.gzip.GZIPDecoder;
import org.archive.format.gzip.GZIPFormatException;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
/**
* Miscellaneous useful methods.
*
- * @author gojomo & others
+ * @author gojomo & others
*/
public class ArchiveUtils {
private static final Logger LOGGER = Logger.getLogger(ArchiveUtils.class.getName());
@@ -317,11 +319,6 @@ public static String get12DigitDate(Date d) {
/**
* A version of getDate which returns the default instead of throwing an exception if parsing fails
- *
- * @param d
- * @param defaultDate
- * @return
- * @throws ParseException
*/
public static Date getDate(String d, Date defaultDate)
{
@@ -337,14 +334,13 @@ public static Date getDate(String d, Date defaultDate)
}
/**
- * Parses an ARC-style date. If passed String is < 12 characters in length,
- * we pad. At a minimum, String should contain a year (>=4 characters).
+ * Parses an ARC-style date. If passed String is < 12 characters in length,
+ * we pad. At a minimum, String should contain a year (>=4 characters).
* Parse will also fail if day or month are incompletely specified. Depends
* on the above getXXDigitDate methods.
- * @param A 4-17 digit date in ARC style (yyyy to
+ * @param d A 4-17 digit date in ARC style (yyyy to
* yyyyMMddHHmmssSSS) formatting.
* @return A Date object representing the passed String.
- * @throws ParseException
*/
public static Date getDate(String d) throws ParseException {
Date date = null;
@@ -605,7 +601,7 @@ public static boolean byteArrayEquals(final byte[] lhs, final byte[] rhs) {
/**
* Converts a double to a string.
* @param val The double to convert
- * @param precision How many characters to include after '.'
+ * @param maxFractionDigits How many characters to include after '.'
* @return the double as a string.
*/
public static String doubleToString(double val, int maxFractionDigits){
@@ -628,7 +624,7 @@ public static String doubleToString(double val, int maxFractionDigits, int minFr
* Takes a byte size and formats it for display with 'friendly' units.
* yyyy to
+ * @param d A 4-17 digit date in ARC style (yyyy to
* yyyyMMddHHmmssSSS) formatting.
* @return A Date object representing the passed String.
* @throws ParseException
@@ -539,7 +539,7 @@ public static boolean byteArrayEquals(final byte[] lhs, final byte[] rhs) {
/**
* Converts a double to a string.
* @param val The double to convert
- * @param precision How many characters to include after '.'
+ * @param maxFractionDigits How many characters to include after '.'
* @return the double as a string.
*/
public static String doubleToString(double val, int maxFractionDigits){
@@ -557,7 +557,7 @@ private static String doubleToString(double val, int maxFractionDigits, int minF
* Takes a byte size and formats it for display with 'friendly' units.
* "\n" terminator encountered
* If the stream ends before the line terminator is found,
* the last part of the string will still be returned.
* If no input data available, null is returned.
@@ -77,7 +76,7 @@ protected LaxHttpParser() { }
* @return a byte array from the stream
*/
public static byte[] readRawLine(InputStream inputStream) throws IOException {
- LOG.trace("enter LaxHttpParser.readRawLine()");
+ LOG.finest("enter LaxHttpParser.readRawLine()");
ByteArrayOutputStream buf = new ByteArrayOutputStream();
int ch;
@@ -94,7 +93,7 @@ public static byte[] readRawLine(InputStream inputStream) throws IOException {
}
/**
- * Read up to "\n" from an (unchunked) input stream.
+ * Read up to "\n" from an (unchunked) input stream.
* If the stream ends before the line terminator is found,
* the last part of the string will still be returned.
* If no input data available, null is returned.
@@ -108,7 +107,7 @@ public static byte[] readRawLine(InputStream inputStream) throws IOException {
* @since 3.0
*/
public static String readLine(InputStream inputStream, String charset) throws IOException {
- LOG.trace("enter LaxHttpParser.readLine(InputStream, String)");
+ LOG.finest("enter LaxHttpParser.readLine(InputStream, String)");
byte[] rawdata = readRawLine(inputStream);
if (rawdata == null) {
return null;
@@ -126,11 +125,15 @@ public static String readLine(InputStream inputStream, String charset) throws IO
}
}
}
- return EncodingUtil.getString(rawdata, 0, len - offset, charset);
+ try {
+ return new String(rawdata, 0, len - offset, charset);
+ } catch (UnsupportedEncodingException e) {
+ return new String(rawdata, 0, len - offset, StandardCharsets.ISO_8859_1);
+ }
}
/**
- * Read up to "\n" from an (unchunked) input stream.
+ * Read up to "\n" from an (unchunked) input stream.
* If the stream ends before the line terminator is found,
* the last part of the string will still be returned.
* If no input data available, null is returned
@@ -144,8 +147,8 @@ public static String readLine(InputStream inputStream, String charset) throws IO
*/
public static String readLine(InputStream inputStream) throws IOException {
- LOG.trace("enter LaxHttpParser.readLine(InputStream)");
- return readLine(inputStream, "US-ASCII");
+ LOG.finest("enter LaxHttpParser.readLine(InputStream)");
+ return readLine(inputStream, StandardCharsets.US_ASCII.name());
}
/**
@@ -158,14 +161,13 @@ public static String readLine(InputStream inputStream) throws IOException {
* @return an array of headers in the order in which they were parsed
*
* @throws IOException if an IO error occurs while reading from the stream
- * @throws HttpException if there is an error parsing a header value
- *
+ *
* @since 3.0
*/
- public static Header[] parseHeaders(InputStream is, String charset) throws IOException, HttpException {
- LOG.trace("enter HeaderParser.parseHeaders(InputStream, String)");
+ public static HttpHeader[] parseHeaders(InputStream is, String charset) throws IOException {
+ LOG.finest("enter HeaderParser.parseHeaders(InputStream, String)");
- ArrayListImplementation Details
of combinations of atom, quoted-string, and
specials tokens, or else consisting of texts>
-