001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 *
017 */
018
019 /*
020 * This package is based on the work done by Timothy Gerard Endres
021 * (time@ice.com) to whom the Ant project is very grateful for his great code.
022 */
023
024 package org.apache.commons.compress.archivers.tar;
025
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.utils.ArchiveUtils;
038
039 /**
040 * The TarInputStream reads a UNIX tar archive as an InputStream.
041 * methods are provided to position at each successive entry in
042 * the archive, and the read each entry as a normal input stream
043 * using read().
044 * @NotThreadSafe
045 */
046 public class TarArchiveInputStream extends ArchiveInputStream {
047 private static final int SMALL_BUFFER_SIZE = 256;
048 private static final int BUFFER_SIZE = 8 * 1024;
049
050 private boolean hasHitEOF;
051 private long entrySize;
052 private long entryOffset;
053 private byte[] readBuf;
054 protected final TarBuffer buffer;
055 private TarArchiveEntry currEntry;
056
057 /**
058 * Constructor for TarInputStream.
059 * @param is the input stream to use
060 */
061 public TarArchiveInputStream(InputStream is) {
062 this(is, TarBuffer.DEFAULT_BLKSIZE, TarBuffer.DEFAULT_RCDSIZE);
063 }
064
065 /**
066 * Constructor for TarInputStream.
067 * @param is the input stream to use
068 * @param blockSize the block size to use
069 */
070 public TarArchiveInputStream(InputStream is, int blockSize) {
071 this(is, blockSize, TarBuffer.DEFAULT_RCDSIZE);
072 }
073
074 /**
075 * Constructor for TarInputStream.
076 * @param is the input stream to use
077 * @param blockSize the block size to use
078 * @param recordSize the record size to use
079 */
080 public TarArchiveInputStream(InputStream is, int blockSize, int recordSize) {
081 this.buffer = new TarBuffer(is, blockSize, recordSize);
082 this.readBuf = null;
083 this.hasHitEOF = false;
084 }
085
086 /**
087 * Closes this stream. Calls the TarBuffer's close() method.
088 * @throws IOException on error
089 */
090 public void close() throws IOException {
091 buffer.close();
092 }
093
094 /**
095 * Get the record size being used by this stream's TarBuffer.
096 *
097 * @return The TarBuffer record size.
098 */
099 public int getRecordSize() {
100 return buffer.getRecordSize();
101 }
102
103 /**
104 * Get the available data that can be read from the current
105 * entry in the archive. This does not indicate how much data
106 * is left in the entire archive, only in the current entry.
107 * This value is determined from the entry's size header field
108 * and the amount of data already read from the current entry.
109 * Integer.MAX_VALUE is returen in case more than Integer.MAX_VALUE
110 * bytes are left in the current entry in the archive.
111 *
112 * @return The number of available bytes for the current entry.
113 * @throws IOException for signature
114 */
115 public int available() throws IOException {
116 if (entrySize - entryOffset > Integer.MAX_VALUE) {
117 return Integer.MAX_VALUE;
118 }
119 return (int) (entrySize - entryOffset);
120 }
121
122 /**
123 * Skip bytes in the input buffer. This skips bytes in the
124 * current entry's data, not the entire archive, and will
125 * stop at the end of the current entry's data if the number
126 * to skip extends beyond that point.
127 *
128 * @param numToSkip The number of bytes to skip.
129 * @return the number actually skipped
130 * @throws IOException on error
131 */
132 public long skip(long numToSkip) throws IOException {
133 // REVIEW
134 // This is horribly inefficient, but it ensures that we
135 // properly skip over bytes via the TarBuffer...
136 //
137 byte[] skipBuf = new byte[BUFFER_SIZE];
138 long skip = numToSkip;
139 while (skip > 0) {
140 int realSkip = (int) (skip > skipBuf.length ? skipBuf.length : skip);
141 int numRead = read(skipBuf, 0, realSkip);
142 if (numRead == -1) {
143 break;
144 }
145 skip -= numRead;
146 }
147 return (numToSkip - skip);
148 }
149
150 /**
151 * Since we do not support marking just yet, we do nothing.
152 */
153 public synchronized void reset() {
154 }
155
156 /**
157 * Get the next entry in this tar archive. This will skip
158 * over any remaining data in the current entry, if there
159 * is one, and place the input stream at the header of the
160 * next entry, and read the header and instantiate a new
161 * TarEntry from the header bytes and return that entry.
162 * If there are no more entries in the archive, null will
163 * be returned to indicate that the end of the archive has
164 * been reached.
165 *
166 * @return The next TarEntry in the archive, or null.
167 * @throws IOException on error
168 */
169 public TarArchiveEntry getNextTarEntry() throws IOException {
170 if (hasHitEOF) {
171 return null;
172 }
173
174 if (currEntry != null) {
175 long numToSkip = entrySize - entryOffset;
176
177 while (numToSkip > 0) {
178 long skipped = skip(numToSkip);
179 if (skipped <= 0) {
180 throw new RuntimeException("failed to skip current tar entry");
181 }
182 numToSkip -= skipped;
183 }
184
185 readBuf = null;
186 }
187
188 byte[] headerBuf = getRecord();
189
190 if (hasHitEOF) {
191 currEntry = null;
192 return null;
193 }
194
195 currEntry = new TarArchiveEntry(headerBuf);
196 entryOffset = 0;
197 entrySize = currEntry.getSize();
198
199 if (currEntry.isGNULongNameEntry()) {
200 // read in the name
201 StringBuffer longName = new StringBuffer();
202 byte[] buf = new byte[SMALL_BUFFER_SIZE];
203 int length = 0;
204 while ((length = read(buf)) >= 0) {
205 longName.append(new String(buf, 0, length));
206 }
207 getNextEntry();
208 if (currEntry == null) {
209 // Bugzilla: 40334
210 // Malformed tar file - long entry name not followed by entry
211 return null;
212 }
213 // remove trailing null terminator
214 if (longName.length() > 0
215 && longName.charAt(longName.length() - 1) == 0) {
216 longName.deleteCharAt(longName.length() - 1);
217 }
218 currEntry.setName(longName.toString());
219 }
220
221 if (currEntry.isPaxHeader()){ // Process Pax headers
222 paxHeaders();
223 }
224
225 if (currEntry.isGNUSparse()){ // Process sparse files
226 readGNUSparse();
227 }
228
229 return currEntry;
230 }
231
232 /**
233 * Get the next record in this tar archive. This will skip
234 * over any remaining data in the current entry, if there
235 * is one, and place the input stream at the header of the
236 * next entry.
237 * If there are no more entries in the archive, null will
238 * be returned to indicate that the end of the archive has
239 * been reached.
240 *
241 * @return The next header in the archive, or null.
242 * @throws IOException on error
243 */
244 private byte[] getRecord() throws IOException {
245 if (hasHitEOF) {
246 return null;
247 }
248
249 byte[] headerBuf = buffer.readRecord();
250
251 if (headerBuf == null) {
252 hasHitEOF = true;
253 } else if (buffer.isEOFRecord(headerBuf)) {
254 hasHitEOF = true;
255 }
256
257 return hasHitEOF ? null : headerBuf;
258 }
259
260 private void paxHeaders() throws IOException{
261 BufferedReader br = new BufferedReader(new InputStreamReader(this, "UTF-8"));
262 Map headers = new HashMap();
263 // Format is "length keyword=value\n";
264 while(true){ // get length
265 int ch;
266 int len=0;
267 int read=0;
268 while((ch = br.read()) != -1){
269 read++;
270 if (ch == ' '){ // End of length string
271 // Get keyword
272 StringBuffer sb = new StringBuffer();
273 while((ch = br.read()) != -1){
274 read++;
275 if (ch == '='){ // end of keyword
276 String keyword = sb.toString();
277 // Get rest of entry
278 char[] cbuf = new char[len-read];
279 int got = br.read(cbuf);
280 if (got != len-read){
281 throw new IOException("Failed to read Paxheader. Expected "+(len-read)+" chars, read "+got);
282 }
283 String value = new String(cbuf, 0 , len-read-1); // Drop trailing NL
284 headers.put(keyword, value);
285 break;
286 }
287 sb.append((char)ch);
288 }
289 break; // Processed single header
290 }
291 len *= 10;
292 len += ch - '0';
293 }
294 if (ch == -1){ // EOF
295 break;
296 }
297 }
298 getNextEntry(); // Get the actual file entry
299 /*
300 * The following headers are defined for Pax.
301 * atime, ctime, mtime, charset: cannot use these without changing TarArchiveEntry fields
302 * comment
303 * gid, gname
304 * linkpath
305 * size
306 * uid,uname
307 */
308 Iterator hdrs = headers.entrySet().iterator();
309 while(hdrs.hasNext()){
310 Entry ent = (Entry) hdrs.next();
311 String key = (String) ent.getKey();
312 String val = (String) ent.getValue();
313 if ("path".equals(key)){
314 currEntry.setName(val);
315 } else if ("linkpath".equals(key)){
316 currEntry.setLinkName(val);
317 } else if ("gid".equals(key)){
318 currEntry.setGroupId(Integer.parseInt(val));
319 } else if ("gname".equals(key)){
320 currEntry.setGroupName(val);
321 } else if ("uid".equals(key)){
322 currEntry.setUserId(Integer.parseInt(val));
323 } else if ("uname".equals(key)){
324 currEntry.setUserName(val);
325 } else if ("size".equals(key)){
326 currEntry.setSize(Long.parseLong(val));
327 }
328 }
329 }
330
331 /**
332 * Adds the sparse chunks from the current entry to the sparse chunks,
333 * including any additional sparse entries following the current entry.
334 *
335 * @throws IOException on error
336 *
337 * @todo Sparse files get not yet really processed.
338 */
339 private void readGNUSparse() throws IOException {
340 /* we do not really process sparse files yet
341 sparses = new ArrayList();
342 sparses.addAll(currEntry.getSparses());
343 */
344 if (currEntry.isExtended()) {
345 TarArchiveSparseEntry entry;
346 do {
347 byte[] headerBuf = getRecord();
348 if (hasHitEOF) {
349 currEntry = null;
350 break;
351 }
352 entry = new TarArchiveSparseEntry(headerBuf);
353 /* we do not really process sparse files yet
354 sparses.addAll(entry.getSparses());
355 */
356 } while (entry.isExtended());
357 }
358 }
359
360 public ArchiveEntry getNextEntry() throws IOException {
361 return getNextTarEntry();
362 }
363
364 /**
365 * Reads bytes from the current tar archive entry.
366 *
367 * This method is aware of the boundaries of the current
368 * entry in the archive and will deal with them as if they
369 * were this stream's start and EOF.
370 *
371 * @param buf The buffer into which to place bytes read.
372 * @param offset The offset at which to place bytes read.
373 * @param numToRead The number of bytes to read.
374 * @return The number of bytes read, or -1 at EOF.
375 * @throws IOException on error
376 */
377 public int read(byte[] buf, int offset, int numToRead) throws IOException {
378 int totalRead = 0;
379
380 if (entryOffset >= entrySize) {
381 return -1;
382 }
383
384 if ((numToRead + entryOffset) > entrySize) {
385 numToRead = (int) (entrySize - entryOffset);
386 }
387
388 if (readBuf != null) {
389 int sz = (numToRead > readBuf.length) ? readBuf.length
390 : numToRead;
391
392 System.arraycopy(readBuf, 0, buf, offset, sz);
393
394 if (sz >= readBuf.length) {
395 readBuf = null;
396 } else {
397 int newLen = readBuf.length - sz;
398 byte[] newBuf = new byte[newLen];
399
400 System.arraycopy(readBuf, sz, newBuf, 0, newLen);
401
402 readBuf = newBuf;
403 }
404
405 totalRead += sz;
406 numToRead -= sz;
407 offset += sz;
408 }
409
410 while (numToRead > 0) {
411 byte[] rec = buffer.readRecord();
412
413 if (rec == null) {
414 // Unexpected EOF!
415 throw new IOException("unexpected EOF with " + numToRead
416 + " bytes unread. Occured at byte: " + getBytesRead());
417 }
418 count(rec.length);
419 int sz = numToRead;
420 int recLen = rec.length;
421
422 if (recLen > sz) {
423 System.arraycopy(rec, 0, buf, offset, sz);
424
425 readBuf = new byte[recLen - sz];
426
427 System.arraycopy(rec, sz, readBuf, 0, recLen - sz);
428 } else {
429 sz = recLen;
430
431 System.arraycopy(rec, 0, buf, offset, recLen);
432 }
433
434 totalRead += sz;
435 numToRead -= sz;
436 offset += sz;
437 }
438
439 entryOffset += totalRead;
440
441 return totalRead;
442 }
443
444 /**
445 * Whether this class is able to read the given entry.
446 *
447 * <p>May return false if the current entry is a sparse file.</p>
448 */
449 public boolean canReadEntryData(ArchiveEntry ae) {
450 if (ae instanceof TarArchiveEntry) {
451 TarArchiveEntry te = (TarArchiveEntry) ae;
452 return !te.isGNUSparse();
453 }
454 return false;
455 }
456
457 protected final TarArchiveEntry getCurrentEntry() {
458 return currEntry;
459 }
460
461 protected final void setCurrentEntry(TarArchiveEntry e) {
462 currEntry = e;
463 }
464
465 protected final boolean isAtEOF() {
466 return hasHitEOF;
467 }
468
469 protected final void setAtEOF(boolean b) {
470 hasHitEOF = b;
471 }
472
473 /**
474 * Checks if the signature matches what is expected for a tar file.
475 *
476 * @param signature
477 * the bytes to check
478 * @param length
479 * the number of bytes to check
480 * @return true, if this stream is a tar archive stream, false otherwise
481 */
482 public static boolean matches(byte[] signature, int length) {
483 if (length < TarConstants.VERSION_OFFSET+TarConstants.VERSIONLEN) {
484 return false;
485 }
486
487 if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_POSIX,
488 signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
489 &&
490 ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_POSIX,
491 signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
492 ){
493 return true;
494 }
495 if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_GNU,
496 signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
497 &&
498 (
499 ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_SPACE,
500 signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
501 ||
502 ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_ZERO,
503 signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
504 )
505 ){
506 return true;
507 }
508 // COMPRESS-107 - recognise Ant tar files
509 if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_ANT,
510 signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
511 &&
512 ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_ANT,
513 signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
514 ){
515 return true;
516 }
517 return false;
518 }
519
520 }