001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018 package org.apache.commons.codec.binary;
019
020 /**
021 * Provides Base32 encoding and decoding as defined by <a href="http://www.ietf.org/rfc/rfc4648.txt">RFC 4648</a>.
022 *
023 * <p>
024 * The class can be parameterized in the following manner with various constructors:
025 * <ul>
026 * <li>Whether to use the "base32hex" variant instead of the default "base32"</li>
027 * <li>Line length: Default 0 (no chunking). Line lengths that aren't multiples of 8 will still essentially end up
028 * being multiples of 8 in the encoded data.</li>
029 * <li>Line separator: Default is CRLF ("\r\n")</li>
030 * </ul>
031 * </p>
032 * <p>
033 * This class operates directly on byte streams, and not character streams.
034 * </p>
035 * <p>
036 * This class is not thread-safe. Each thread should use its own instance.
037 * </p>
038 *
039 * @see <a href="http://www.ietf.org/rfc/rfc4648.txt">RFC 4648</a>
040 *
041 * @since 1.5
042 * @version $Revision: 1080712 $
043 */
044 public class Base32 extends BaseNCodec {
045
046 /**
047 * BASE32 characters are 5 bits in length.
048 * They are formed by taking a block of five octets to form a 40-bit string,
049 * which is converted into eight BASE32 characters.
050 */
051 private static final int BITS_PER_ENCODED_BYTE = 5;
052 private static final int BYTES_PER_ENCODED_BLOCK = 8;
053 private static final int BYTES_PER_UNENCODED_BLOCK = 5;
054
055 /**
056 * Chunk separator per RFC 2045 section 2.1.
057 *
058 * @see <a href="http://www.ietf.org/rfc/rfc2045.txt">RFC 2045 section 2.1</a>
059 */
060 private static final byte[] CHUNK_SEPARATOR = {'\r', '\n'};
061
062 /**
063 * This array is a lookup table that translates Unicode characters drawn from the "Base32 Alphabet" (as specified in
064 * Table 3 of RFC 2045) into their 5-bit positive integer equivalents. Characters that are not in the Base32
065 * alphabet but fall within the bounds of the array are translated to -1.
066 *
067 */
068 private static final byte[] DECODE_TABLE = {
069 // 0 1 2 3 4 5 6 7 8 9 A B C D E F
070 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 00-0f
071 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 10-1f
072 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 63, // 20-2f
073 -1, -1, 26, 27, 28, 29, 30, 31, -1, -1, -1, -1, -1, -1, -1, -1, // 30-3f 2-7
074 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, // 40-4f A-N
075 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, // 50-5a O-Z
076 };
077
078 /**
079 * This array is a lookup table that translates 5-bit positive integer index values into their "Base32 Alphabet"
080 * equivalents as specified in Table 3 of RFC 2045.
081 */
082 private static final byte[] ENCODE_TABLE = {
083 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
084 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
085 '2', '3', '4', '5', '6', '7',
086 };
087
088 /**
089 * This array is a lookup table that translates Unicode characters drawn from the "Base32 |Hex Alphabet" (as specified in
090 * Table 3 of RFC 2045) into their 5-bit positive integer equivalents. Characters that are not in the Base32 Hex
091 * alphabet but fall within the bounds of the array are translated to -1.
092 *
093 */
094 private static final byte[] HEX_DECODE_TABLE = {
095 // 0 1 2 3 4 5 6 7 8 9 A B C D E F
096 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 00-0f
097 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 10-1f
098 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 63, // 20-2f
099 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, // 30-3f 2-7
100 -1, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, // 40-4f A-N
101 25, 26, 27, 28, 29, 30, 31, 32, // 50-57 O-V
102 };
103
104 /**
105 * This array is a lookup table that translates 5-bit positive integer index values into their "Base32 Hex Alphabet"
106 * equivalents as specified in Table 3 of RFC 2045.
107 */
108 private static final byte[] HEX_ENCODE_TABLE = {
109 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
110 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
111 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
112 };
113
114 /** Mask used to extract 5 bits, used when encoding Base32 bytes */
115 private static final int MASK_5BITS = 0x1f;
116
117 // The static final fields above are used for the original static byte[] methods on Base32.
118 // The private member fields below are used with the new streaming approach, which requires
119 // some state be preserved between calls of encode() and decode().
120
121 /**
122 * Place holder for the bytes we're dealing with for our based logic.
123 * Bitwise operations store and extract the encoding or decoding from this variable.
124 */
125 private long bitWorkArea;
126
127 /**
128 * Convenience variable to help us determine when our buffer is going to run out of room and needs resizing.
129 * <code>decodeSize = {@link BYTES_PER_ENCODED_BLOCK} - 1 + lineSeparator.length;</code>
130 */
131 private final int decodeSize;
132
133 /**
134 * Decode table to use.
135 */
136 private final byte[] decodeTable;
137
138 /**
139 * Convenience variable to help us determine when our buffer is going to run out of room and needs resizing.
140 * <code>encodeSize = {@link BYTES_PER_ENCODED_BLOCK} + lineSeparator.length;</code>
141 */
142 private final int encodeSize;
143
144 /**
145 * Encode table to use.
146 */
147 private final byte[] encodeTable;
148
149 /**
150 * Line separator for encoding. Not used when decoding. Only used if lineLength > 0.
151 */
152 private final byte[] lineSeparator;
153
154 /**
155 * Creates a Base32 codec used for decoding and encoding.
156 * <p>
157 * When encoding the line length is 0 (no chunking).
158 * </p>
159 *
160 */
161 public Base32() {
162 this(false);
163 }
164
165 /**
166 * Creates a Base32 codec used for decoding and encoding.
167 * <p>
168 * When encoding the line length is 0 (no chunking).
169 * </p>
170 * @param useHex if <code>true</code> then use Base32 Hex alphabet
171 */
172 public Base32(boolean useHex) {
173 this(0, null, useHex);
174 }
175
176 /**
177 * Creates a Base32 codec used for decoding and encoding.
178 * <p>
179 * When encoding the line length is given in the constructor, the line separator is CRLF.
180 * </p>
181 *
182 * @param lineLength
183 * Each line of encoded data will be at most of the given length (rounded down to nearest multiple of 8).
184 * If lineLength <= 0, then the output will not be divided into lines (chunks). Ignored when decoding.
185 */
186 public Base32(int lineLength) {
187 this(lineLength, CHUNK_SEPARATOR);
188 }
189
190 /**
191 * Creates a Base32 codec used for decoding and encoding.
192 * <p>
193 * When encoding the line length and line separator are given in the constructor.
194 * </p>
195 * <p>
196 * Line lengths that aren't multiples of 8 will still essentially end up being multiples of 8 in the encoded data.
197 * </p>
198 *
199 * @param lineLength
200 * Each line of encoded data will be at most of the given length (rounded down to nearest multiple of 8).
201 * If lineLength <= 0, then the output will not be divided into lines (chunks). Ignored when decoding.
202 * @param lineSeparator
203 * Each line of encoded data will end with this sequence of bytes.
204 * @throws IllegalArgumentException
205 * The provided lineSeparator included some Base32 characters. That's not going to work!
206 */
207 public Base32(int lineLength, byte[] lineSeparator) {
208 this(lineLength, lineSeparator, false);
209 }
210
211 /**
212 * Creates a Base32 / Base32 Hex codec used for decoding and encoding.
213 * <p>
214 * When encoding the line length and line separator are given in the constructor.
215 * </p>
216 * <p>
217 * Line lengths that aren't multiples of 8 will still essentially end up being multiples of 8 in the encoded data.
218 * </p>
219 *
220 * @param lineLength
221 * Each line of encoded data will be at most of the given length (rounded down to nearest multiple of 8).
222 * If lineLength <= 0, then the output will not be divided into lines (chunks). Ignored when decoding.
223 * @param lineSeparator
224 * Each line of encoded data will end with this sequence of bytes.
225 * @param useHex if <code>true</code>, then use Base32 Hex alphabet, otherwise use Base32 alphabet
226 * @throws IllegalArgumentException
227 * The provided lineSeparator included some Base32 characters. That's not going to work!
228 * Or the lineLength > 0 and lineSeparator is null.
229 */
230 public Base32(int lineLength, byte[] lineSeparator, boolean useHex) {
231 super(BYTES_PER_UNENCODED_BLOCK, BYTES_PER_ENCODED_BLOCK,
232 lineLength,
233 lineSeparator == null ? 0 : lineSeparator.length);
234 if (useHex){
235 this.encodeTable = HEX_ENCODE_TABLE;
236 this.decodeTable = HEX_DECODE_TABLE;
237 } else {
238 this.encodeTable = ENCODE_TABLE;
239 this.decodeTable = DECODE_TABLE;
240 }
241 if (lineLength > 0) {
242 if (lineSeparator == null) {
243 throw new IllegalArgumentException("lineLength "+lineLength+" > 0, but lineSeparator is null");
244 }
245 // Must be done after initializing the tables
246 if (containsAlphabetOrPad(lineSeparator)) {
247 String sep = StringUtils.newStringUtf8(lineSeparator);
248 throw new IllegalArgumentException("lineSeparator must not contain Base32 characters: [" + sep + "]");
249 }
250 this.encodeSize = BYTES_PER_ENCODED_BLOCK + lineSeparator.length;
251 this.lineSeparator = new byte[lineSeparator.length];
252 System.arraycopy(lineSeparator, 0, this.lineSeparator, 0, lineSeparator.length);
253 } else {
254 this.encodeSize = BYTES_PER_ENCODED_BLOCK;
255 this.lineSeparator = null;
256 }
257 this.decodeSize = this.encodeSize - 1;
258 }
259
260 /**
261 * <p>
262 * Decodes all of the provided data, starting at inPos, for inAvail bytes. Should be called at least twice: once
263 * with the data to decode, and once with inAvail set to "-1" to alert decoder that EOF has been reached. The "-1"
264 * call is not necessary when decoding, but it doesn't hurt, either.
265 * </p>
266 * <p>
267 * Ignores all non-Base32 characters. This is how chunked (e.g. 76 character) data is handled, since CR and LF are
268 * silently ignored, but has implications for other bytes, too. This method subscribes to the garbage-in,
269 * garbage-out philosophy: it will not check the provided data for validity.
270 * </p>
271 *
272 * @param in
273 * byte[] array of ascii data to Base32 decode.
274 * @param inPos
275 * Position to start reading data from.
276 * @param inAvail
277 * Amount of bytes available from input for encoding.
278 *
279 * Output is written to {@link #buffer} as 8-bit octets, using {@link pos} as the buffer position
280 */
281 void decode(byte[] in, int inPos, int inAvail) { // package protected for access from I/O streams
282 if (eof) {
283 return;
284 }
285 if (inAvail < 0) {
286 eof = true;
287 }
288 for (int i = 0; i < inAvail; i++) {
289 byte b = in[inPos++];
290 if (b == PAD) {
291 // We're done.
292 eof = true;
293 break;
294 } else {
295 ensureBufferSize(decodeSize);
296 if (b >= 0 && b < this.decodeTable.length) {
297 int result = this.decodeTable[b];
298 if (result >= 0) {
299 modulus = (modulus+1) % BYTES_PER_ENCODED_BLOCK;
300 bitWorkArea = (bitWorkArea << BITS_PER_ENCODED_BYTE) + result; // collect decoded bytes
301 if (modulus == 0) { // we can output the 5 bytes
302 buffer[pos++] = (byte) ((bitWorkArea >> 32) & MASK_8BITS);
303 buffer[pos++] = (byte) ((bitWorkArea >> 24) & MASK_8BITS);
304 buffer[pos++] = (byte) ((bitWorkArea >> 16) & MASK_8BITS);
305 buffer[pos++] = (byte) ((bitWorkArea >> 8) & MASK_8BITS);
306 buffer[pos++] = (byte) (bitWorkArea & MASK_8BITS);
307 }
308 }
309 }
310 }
311 }
312
313 // Two forms of EOF as far as Base32 decoder is concerned: actual
314 // EOF (-1) and first time '=' character is encountered in stream.
315 // This approach makes the '=' padding characters completely optional.
316 if (eof && modulus >= 2) { // if modulus < 2, nothing to do
317 ensureBufferSize(decodeSize);
318
319 // we ignore partial bytes, i.e. only multiples of 8 count
320 switch (modulus) {
321 case 2 : // 10 bits, drop 2 and output one byte
322 buffer[pos++] = (byte) ((bitWorkArea >> 2) & MASK_8BITS);
323 break;
324 case 3 : // 15 bits, drop 7 and output 1 byte
325 buffer[pos++] = (byte) ((bitWorkArea >> 7) & MASK_8BITS);
326 break;
327 case 4 : // 20 bits = 2*8 + 4
328 bitWorkArea = bitWorkArea >> 4; // drop 4 bits
329 buffer[pos++] = (byte) ((bitWorkArea >> 8) & MASK_8BITS);
330 buffer[pos++] = (byte) ((bitWorkArea) & MASK_8BITS);
331 break;
332 case 5 : // 25bits = 3*8 + 1
333 bitWorkArea = bitWorkArea >> 1;
334 buffer[pos++] = (byte) ((bitWorkArea >> 16) & MASK_8BITS);
335 buffer[pos++] = (byte) ((bitWorkArea >> 8) & MASK_8BITS);
336 buffer[pos++] = (byte) ((bitWorkArea) & MASK_8BITS);
337 break;
338 case 6 : // 30bits = 3*8 + 6
339 bitWorkArea = bitWorkArea >> 6;
340 buffer[pos++] = (byte) ((bitWorkArea >> 16) & MASK_8BITS);
341 buffer[pos++] = (byte) ((bitWorkArea >> 8) & MASK_8BITS);
342 buffer[pos++] = (byte) ((bitWorkArea) & MASK_8BITS);
343 break;
344 case 7 : // 35 = 4*8 +3
345 bitWorkArea = bitWorkArea >> 3;
346 buffer[pos++] = (byte) ((bitWorkArea >> 24) & MASK_8BITS);
347 buffer[pos++] = (byte) ((bitWorkArea >> 16) & MASK_8BITS);
348 buffer[pos++] = (byte) ((bitWorkArea >> 8) & MASK_8BITS);
349 buffer[pos++] = (byte) ((bitWorkArea) & MASK_8BITS);
350 break;
351 }
352 }
353 }
354
355 /**
356 * <p>
357 * Encodes all of the provided data, starting at inPos, for inAvail bytes. Must be called at least twice: once with
358 * the data to encode, and once with inAvail set to "-1" to alert encoder that EOF has been reached, so flush last
359 * remaining bytes (if not multiple of 5).
360 * </p>
361 *
362 * @param in
363 * byte[] array of binary data to Base32 encode.
364 * @param inPos
365 * Position to start reading data from.
366 * @param inAvail
367 * Amount of bytes available from input for encoding.
368 */
369 void encode(byte[] in, int inPos, int inAvail) { // package protected for access from I/O streams
370 if (eof) {
371 return;
372 }
373 // inAvail < 0 is how we're informed of EOF in the underlying data we're
374 // encoding.
375 if (inAvail < 0) {
376 eof = true;
377 if (0 == modulus && lineLength == 0) {
378 return; // no leftovers to process and not using chunking
379 }
380 ensureBufferSize(encodeSize);
381 int savedPos = pos;
382 switch (modulus) { // % 5
383 case 1 : // Only 1 octet; take top 5 bits then remainder
384 buffer[pos++] = encodeTable[(int)(bitWorkArea >> 3) & MASK_5BITS]; // 8-1*5 = 3
385 buffer[pos++] = encodeTable[(int)(bitWorkArea << 2) & MASK_5BITS]; // 5-3=2
386 buffer[pos++] = PAD;
387 buffer[pos++] = PAD;
388 buffer[pos++] = PAD;
389 buffer[pos++] = PAD;
390 buffer[pos++] = PAD;
391 buffer[pos++] = PAD;
392 break;
393
394 case 2 : // 2 octets = 16 bits to use
395 buffer[pos++] = encodeTable[(int)(bitWorkArea >> 11) & MASK_5BITS]; // 16-1*5 = 11
396 buffer[pos++] = encodeTable[(int)(bitWorkArea >> 6) & MASK_5BITS]; // 16-2*5 = 6
397 buffer[pos++] = encodeTable[(int)(bitWorkArea >> 1) & MASK_5BITS]; // 16-3*5 = 1
398 buffer[pos++] = encodeTable[(int)(bitWorkArea << 4) & MASK_5BITS]; // 5-1 = 4
399 buffer[pos++] = PAD;
400 buffer[pos++] = PAD;
401 buffer[pos++] = PAD;
402 buffer[pos++] = PAD;
403 break;
404 case 3 : // 3 octets = 24 bits to use
405 buffer[pos++] = encodeTable[(int)(bitWorkArea >> 19) & MASK_5BITS]; // 24-1*5 = 19
406 buffer[pos++] = encodeTable[(int)(bitWorkArea >> 14) & MASK_5BITS]; // 24-2*5 = 14
407 buffer[pos++] = encodeTable[(int)(bitWorkArea >> 9) & MASK_5BITS]; // 24-3*5 = 9
408 buffer[pos++] = encodeTable[(int)(bitWorkArea >> 4) & MASK_5BITS]; // 24-4*5 = 4
409 buffer[pos++] = encodeTable[(int)(bitWorkArea << 1) & MASK_5BITS]; // 5-4 = 1
410 buffer[pos++] = PAD;
411 buffer[pos++] = PAD;
412 buffer[pos++] = PAD;
413 break;
414 case 4 : // 4 octets = 32 bits to use
415 buffer[pos++] = encodeTable[(int)(bitWorkArea >> 27) & MASK_5BITS]; // 32-1*5 = 27
416 buffer[pos++] = encodeTable[(int)(bitWorkArea >> 22) & MASK_5BITS]; // 32-2*5 = 22
417 buffer[pos++] = encodeTable[(int)(bitWorkArea >> 17) & MASK_5BITS]; // 32-3*5 = 17
418 buffer[pos++] = encodeTable[(int)(bitWorkArea >> 12) & MASK_5BITS]; // 32-4*5 = 12
419 buffer[pos++] = encodeTable[(int)(bitWorkArea >> 7) & MASK_5BITS]; // 32-5*5 = 7
420 buffer[pos++] = encodeTable[(int)(bitWorkArea >> 2) & MASK_5BITS]; // 32-6*5 = 2
421 buffer[pos++] = encodeTable[(int)(bitWorkArea << 3) & MASK_5BITS]; // 5-2 = 3
422 buffer[pos++] = PAD;
423 break;
424 }
425 currentLinePos += pos - savedPos; // keep track of current line position
426 // if currentPos == 0 we are at the start of a line, so don't add CRLF
427 if (lineLength > 0 && currentLinePos > 0){ // add chunk separator if required
428 System.arraycopy(lineSeparator, 0, buffer, pos, lineSeparator.length);
429 pos += lineSeparator.length;
430 }
431 } else {
432 for (int i = 0; i < inAvail; i++) {
433 ensureBufferSize(encodeSize);
434 modulus = (modulus+1) % BYTES_PER_UNENCODED_BLOCK;
435 int b = in[inPos++];
436 if (b < 0) {
437 b += 256;
438 }
439 bitWorkArea = (bitWorkArea << 8) + b; // BITS_PER_BYTE
440 if (0 == modulus) { // we have enough bytes to create our output
441 buffer[pos++] = encodeTable[(int)(bitWorkArea >> 35) & MASK_5BITS];
442 buffer[pos++] = encodeTable[(int)(bitWorkArea >> 30) & MASK_5BITS];
443 buffer[pos++] = encodeTable[(int)(bitWorkArea >> 25) & MASK_5BITS];
444 buffer[pos++] = encodeTable[(int)(bitWorkArea >> 20) & MASK_5BITS];
445 buffer[pos++] = encodeTable[(int)(bitWorkArea >> 15) & MASK_5BITS];
446 buffer[pos++] = encodeTable[(int)(bitWorkArea >> 10) & MASK_5BITS];
447 buffer[pos++] = encodeTable[(int)(bitWorkArea >> 5) & MASK_5BITS];
448 buffer[pos++] = encodeTable[(int)bitWorkArea & MASK_5BITS];
449 currentLinePos += BYTES_PER_ENCODED_BLOCK;
450 if (lineLength > 0 && lineLength <= currentLinePos) {
451 System.arraycopy(lineSeparator, 0, buffer, pos, lineSeparator.length);
452 pos += lineSeparator.length;
453 currentLinePos = 0;
454 }
455 }
456 }
457 }
458 }
459
460 /**
461 * Returns whether or not the <code>octet</code> is in the Base32 alphabet.
462 *
463 * @param octet
464 * The value to test
465 * @return <code>true</code> if the value is defined in the the Base32 alphabet <code>false</code> otherwise.
466 */
467 public boolean isInAlphabet(byte octet) {
468 return octet >= 0 && octet < decodeTable.length && decodeTable[octet] != -1;
469 }
470 }