Skip to content

Commit 924540a

Browse files
committed
[CODEC-240] Add Percent-Encoding Codec (described in RFC3986 and RFC7578).
git-svn-id: https://svn.apache.org/repos/asf/commons/proper/codec/trunk@1814505 13f79535-47bb-0310-9956-ffa450edef68
1 parent f1c4b63 commit 924540a

3 files changed

Lines changed: 399 additions & 0 deletions

File tree

src/changes/changes.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ The <action> type attribute can be add,update,fix,remove.
4444
<body>
4545
<release version="1.12" date="2017-MM-DD" description="Feature and fix release.">
4646
<action issue="CODEC-244" dev="ggregory" type="update">Update from Java 6 to Java 7</action>
47+
<action issue="CODEC-240" dev="ggregory" type="add" due-to="Ioannis Sermetziadis">Add Percent-Encoding Codec (described in RFC3986 and RFC7578)</action>
4748
</release>
4849
<release version="1.11" date="2017-10-20" description="Feature and fix release.">
4950
<!-- The first attribute below should be the issue id; makes it easier to navigate in the IDE outline -->
Lines changed: 260 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,260 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.commons.codec.net;
19+
20+
import java.nio.ByteBuffer;
21+
import java.util.BitSet;
22+
import org.apache.commons.codec.BinaryDecoder;
23+
import org.apache.commons.codec.BinaryEncoder;
24+
import org.apache.commons.codec.DecoderException;
25+
import org.apache.commons.codec.EncoderException;
26+
27+
/**
28+
* Implements the Percent-Encoding scheme, as described in HTTP 1.1 specification. For extensibility, an array of
29+
* special US-ASCII characters can be specified in order to perform proper URI encoding for the different parts
30+
* of the URI.
31+
* <p>
32+
* This class is immutable. It is also thread-safe besides using BitSet which is not thread-safe, but its public
33+
* interface only call the access
34+
* </p>
35+
*
36+
* @see <a href="https://tools.ietf.org/html/rfc3986#section-2.1">Percent-Encoding</a>
37+
* @since 1.11
38+
*/
39+
public class PercentCodec implements BinaryEncoder, BinaryDecoder {
40+
41+
/**
42+
* The escape character used by the Percent-Encoding in order to introduce an encoded character.
43+
*/
44+
private final byte ESCAPE_CHAR = '%';
45+
/**
46+
* The bit set used to store the character that should be always encoded
47+
*/
48+
private final BitSet alwaysEncodeChars = new BitSet();
49+
/**
50+
* The flag defining if the space character should be encoded as '+'
51+
*/
52+
private final boolean plusForSpace;
53+
/**
54+
* The minimum and maximum code of the bytes that is inserted in the bit set, used to prevent look-ups
55+
*/
56+
private int alwaysEncodeCharsMin = Integer.MAX_VALUE, alwaysEncodeCharsMax = Integer.MIN_VALUE;
57+
58+
/**
59+
* Constructs a Percent coded that will encode all the non US-ASCII characters using the Percent-Encoding
60+
* while it will not encode all the US-ASCII characters, except for character '%' that is used as escape
61+
* character for Percent-Encoding.
62+
*/
63+
public PercentCodec() {
64+
this.plusForSpace = false;
65+
insertAlwaysEncodeChar(ESCAPE_CHAR);
66+
}
67+
68+
/**
69+
* Constructs a Percent codec by specifying the characters that belong to US-ASCII that should
70+
* always be encoded. The rest US-ASCII characters will not be encoded, except for character '%' that
71+
* is used as escape character for Percent-Encoding.
72+
*
73+
* @param alwaysEncodeChars the unsafe characters that should always be encoded
74+
* @param plusForSpace the flag defining if the space character should be encoded as '+'
75+
*/
76+
public PercentCodec(final byte[] alwaysEncodeChars, final boolean plusForSpace) {
77+
this.plusForSpace = plusForSpace;
78+
insertAlwaysEncodeChars(alwaysEncodeChars);
79+
}
80+
81+
/**
82+
* Adds the byte array into a BitSet for faster lookup
83+
*
84+
* @param alwaysEncodeChars
85+
*/
86+
private void insertAlwaysEncodeChars(final byte[] alwaysEncodeChars) {
87+
if (alwaysEncodeChars != null) {
88+
for (byte b : alwaysEncodeChars) {
89+
insertAlwaysEncodeChar(b);
90+
}
91+
}
92+
insertAlwaysEncodeChar(ESCAPE_CHAR);
93+
}
94+
95+
/**
96+
* Inserts a single character into a BitSet and maintains the min and max of the characters of the
97+
* {@code BitSet alwaysEncodeChars} in order to avoid look-ups when a byte is out of this range.
98+
*
99+
* @param b the byte that is candidate for min and max limit
100+
*/
101+
private void insertAlwaysEncodeChar(final byte b) {
102+
this.alwaysEncodeChars.set(b);
103+
if (b < alwaysEncodeCharsMin) {
104+
alwaysEncodeCharsMin = b;
105+
}
106+
if (b > alwaysEncodeCharsMax) {
107+
alwaysEncodeCharsMax = b;
108+
}
109+
}
110+
111+
/**
112+
* Percent-Encoding based on RFC 3986. The non US-ASCII characters are encoded, as well as the
113+
* US-ASCII characters that are configured to be always encoded.
114+
*/
115+
@Override
116+
public byte[] encode(final byte[] bytes) throws EncoderException {
117+
if (bytes == null) {
118+
return null;
119+
}
120+
121+
int expectedEncodingBytes = expectedEncodingBytes(bytes);
122+
boolean willEncode = expectedEncodingBytes != bytes.length;
123+
if (willEncode || (plusForSpace && containsSpace(bytes))) {
124+
return doEncode(bytes, expectedEncodingBytes, willEncode);
125+
} else {
126+
return bytes;
127+
}
128+
}
129+
130+
private byte[] doEncode(final byte[] bytes, int expectedLength, boolean willEncode) {
131+
final ByteBuffer buffer = ByteBuffer.allocate(expectedLength);
132+
for (final byte b : bytes) {
133+
if (willEncode && canEncode(b)) {
134+
byte bb = b;
135+
if (bb < 0) {
136+
bb = (byte) (256 + bb);
137+
}
138+
final char hex1 = Utils.hexDigit(bb >> 4);
139+
final char hex2 = Utils.hexDigit(bb);
140+
buffer.put(ESCAPE_CHAR);
141+
buffer.put((byte) hex1);
142+
buffer.put((byte) hex2);
143+
} else {
144+
if (plusForSpace && b == ' ') {
145+
buffer.put((byte) '+');
146+
} else {
147+
buffer.put(b);
148+
}
149+
}
150+
}
151+
return buffer.array();
152+
}
153+
154+
private int expectedEncodingBytes(final byte[] bytes) {
155+
int byteCount = 0;
156+
for (final byte b : bytes) {
157+
byteCount += canEncode(b) ? 3: 1;
158+
}
159+
return byteCount;
160+
}
161+
162+
private boolean containsSpace(final byte[] bytes) {
163+
for (final byte b : bytes) {
164+
if (b == ' ') {
165+
return true;
166+
}
167+
}
168+
return false;
169+
}
170+
171+
private boolean canEncode(final byte c) {
172+
return !isAsciiChar(c) || (inAlwaysEncodeCharsRange(c) && alwaysEncodeChars.get(c));
173+
}
174+
175+
private boolean inAlwaysEncodeCharsRange(final byte c) {
176+
return c >= alwaysEncodeCharsMin && c <= alwaysEncodeCharsMax;
177+
}
178+
179+
private boolean isAsciiChar(final byte c) {
180+
return c >= 0;
181+
}
182+
183+
/**
184+
* Decode bytes encoded with Percent-Encoding based on RFC 3986. The reverse process is performed in order to
185+
* decode the encoded characters to Unicode.
186+
*/
187+
@Override
188+
public byte[] decode(final byte[] bytes) throws DecoderException {
189+
if (bytes == null) {
190+
return null;
191+
}
192+
193+
final ByteBuffer buffer = ByteBuffer.allocate(expectedDecodingBytes(bytes));
194+
for (int i = 0; i < bytes.length; i++) {
195+
final byte b = bytes[i];
196+
if (b == ESCAPE_CHAR) {
197+
try {
198+
final int u = Utils.digit16(bytes[++i]);
199+
final int l = Utils.digit16(bytes[++i]);
200+
buffer.put((byte) ((u << 4) + l));
201+
} catch (final ArrayIndexOutOfBoundsException e) {
202+
throw new DecoderException("Invalid percent decoding: ", e);
203+
}
204+
} else {
205+
if (plusForSpace && b == '+') {
206+
buffer.put((byte) ' ');
207+
} else {
208+
buffer.put(b);
209+
}
210+
}
211+
}
212+
return buffer.array();
213+
}
214+
215+
private int expectedDecodingBytes(final byte[] bytes) {
216+
int byteCount = 0;
217+
for (int i = 0; i < bytes.length; ) {
218+
byte b = bytes[i];
219+
i += b == ESCAPE_CHAR ? 3: 1;
220+
byteCount++;
221+
}
222+
return byteCount;
223+
}
224+
225+
/**
226+
* Encodes an object into using the Percent-Encoding. Only byte[] objects are accepted.
227+
*
228+
* @param obj the object to encode
229+
* @return the encoding result byte[] as Object
230+
* @throws EncoderException
231+
*/
232+
@Override
233+
public Object encode(final Object obj) throws EncoderException {
234+
if (obj == null) {
235+
return null;
236+
} else if (obj instanceof byte[]) {
237+
return encode((byte[]) obj);
238+
} else {
239+
throw new EncoderException("Objects of type " + obj.getClass().getName() + " cannot be Percent encoded");
240+
}
241+
}
242+
243+
/**
244+
* Decodes a byte[] Object, whose bytes are encoded with Percent-Encoding.
245+
*
246+
* @param obj the object to decode
247+
* @return the decoding result byte[] as Object
248+
* @throws DecoderException
249+
*/
250+
@Override
251+
public Object decode(final Object obj) throws DecoderException {
252+
if (obj == null) {
253+
return null;
254+
} else if (obj instanceof byte[]) {
255+
return decode((byte[]) obj);
256+
} else {
257+
throw new DecoderException("Objects of type " + obj.getClass().getName() + " cannot be Percent decoded");
258+
}
259+
}
260+
}

0 commit comments

Comments
 (0)