diff --git a/src/changes/changes.xml b/src/changes/changes.xml index 22b0182a5f9..b35baba53bf 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -57,7 +57,7 @@ The type attribute can be add,update,fix,remove. Don't loose precision while reading folders from a SevenZFile. Improve some exception messages in TarUtils and TarArchiveEntry. SevenZFile now enforces the same folder and coder limits as the CPP implementation. - + BZip2CompressorInputStream now throw CompressorException (a subclass of IOException) for invalid or corrupted data, providing more specific error reporting. BZip2 input streams treat Huffman codes longer than 20 bits as corrupted data, matching the behavior of the reference implementation. @@ -82,7 +82,8 @@ The type attribute can be add,update,fix,remove. Simplify handling of special AR records in ArArchiveInputStream. Correct byte accounting and truncation errors in ARJ input stream. - + Add strict header validation in ARJ input stream and `selfExtracting` option. + org.apache.commons.compress.harmony.unpack200 now throws Pack200Exception, IllegalArgumentException, and IllegalStateException instead of other runtime exceptions and Error. org.apache.commons.compress.harmony.pack200 now throws Pack200Exception, IllegalArgumentException, IllegalStateException, instead of other runtime exceptions and Error. diff --git a/src/main/java/org/apache/commons/compress/archivers/arj/ArjArchiveInputStream.java b/src/main/java/org/apache/commons/compress/archivers/arj/ArjArchiveInputStream.java index 01984c253ad..499bf59a319 100644 --- a/src/main/java/org/apache/commons/compress/archivers/arj/ArjArchiveInputStream.java +++ b/src/main/java/org/apache/commons/compress/archivers/arj/ArjArchiveInputStream.java @@ -63,6 +63,8 @@ public class ArjArchiveInputStream extends ArchiveInputStream { */ public static final class Builder extends AbstractArchiveBuilder { + private boolean selfExtracting; + private Builder() { setCharset(ENCODING_NAME); } @@ -71,11 +73,43 @@ private Builder() { public ArjArchiveInputStream get() throws IOException { return new ArjArchiveInputStream(this); } + + /** + * Enables compatibility with self-extracting (SFX) ARJ files. + * + *

When {@code true}, the stream is scanned forward to locate the first + * valid ARJ main header. All bytes before that point are ignored, which + * allows reading ARJ data embedded in an executable stub.

+ * + *

Caveat: this lenient pre-scan can mask corruption that + * would otherwise be reported at the start of a normal {@code .arj} file. + * Enable only when you expect an SFX input.

+ * + *

Default: {@code false}.

+ * + * @param selfExtracting {@code true} if the input stream is for a self-extracting archive + * @return {@code this} instance + * @since 1.29.0 + */ + public Builder setSelfExtracting(final boolean selfExtracting) { + this.selfExtracting = selfExtracting; + return asThis(); + } } private static final String ENCODING_NAME = "CP437"; private static final int ARJ_MAGIC_1 = 0x60; private static final int ARJ_MAGIC_2 = 0xEA; + /** + * Maximum size of the basic header, in bytes. + * + *

The value is taken from the reference implementation

+ */ + private static final int MAX_BASIC_HEADER_SIZE = 2600; + /** + * Minimum size of the first header (the fixed-size part of the basic header), in bytes. + */ + private static final int MIN_FIRST_HEADER_SIZE = 30; /** * Creates a new builder. @@ -98,21 +132,10 @@ public static boolean matches(final byte[] signature, final int length) { return length >= 2 && (0xff & signature[0]) == ARJ_MAGIC_1 && (0xff & signature[1]) == ARJ_MAGIC_2; } - private static void readExtraData(final int firstHeaderSize, final InputStream firstHeader, final LocalFileHeader localFileHeader) throws IOException { - if (firstHeaderSize >= 33) { - localFileHeader.extendedFilePosition = EndianUtils.readSwappedInteger(firstHeader); - if (firstHeaderSize >= 45) { - localFileHeader.dateTimeAccessed = EndianUtils.readSwappedInteger(firstHeader); - localFileHeader.dateTimeCreated = EndianUtils.readSwappedInteger(firstHeader); - localFileHeader.originalSizeEvenForVolumes = EndianUtils.readSwappedInteger(firstHeader); - } - } - } - private static int readUnsignedByte(InputStream in) throws IOException { final int value = in.read(); if (value == -1) { - throw new EOFException(); + throw new EOFException("Truncated ARJ archive: expected more data"); } return value & 0xff; } @@ -123,7 +146,7 @@ private static int readUnsignedByte(InputStream in) throws IOException { private ArjArchiveInputStream(final Builder builder) throws IOException { super(builder); - mainHeader = readMainHeader(); + mainHeader = readMainHeader(builder.selfExtracting); if ((mainHeader.arjFlags & MainHeader.Flags.GARBLED) != 0) { throw new ArchiveException("Encrypted ARJ files are unsupported"); } @@ -164,6 +187,55 @@ public boolean canReadEntryData(final ArchiveEntry ae) { return ae instanceof ArjArchiveEntry && ((ArjArchiveEntry) ae).getMethod() == LocalFileHeader.Methods.STORED; } + /** + * Verifies the CRC32 checksum of the given data against the next four bytes read from the input stream. + * + * @param data The data to verify. + * @return true if the checksum matches, false otherwise. + * @throws EOFException If the end of the stream is reached before reading the checksum. + * @throws IOException If an I/O error occurs. + */ + @SuppressWarnings("Since15") + private boolean checkCRC32(final byte[] data) throws IOException { + final CRC32 crc32 = new CRC32(); + crc32.update(data); + final long expectedCrc32 = readSwappedUnsignedInteger(); + return crc32.getValue() == expectedCrc32; + } + + /** + * Scans for the next valid ARJ header. + * + * @return The header bytes. + * @throws EOFException If the end of the stream is reached before a valid header is found. + * @throws IOException If an I/O error occurs. + */ + private byte[] findMainHeader() throws IOException { + byte[] basicHeaderBytes; + try { + while (true) { + int first; + int second = readUnsignedByte(); + do { + first = second; + second = readUnsignedByte(); + } while (first != ARJ_MAGIC_1 && second != ARJ_MAGIC_2); + final int basicHeaderSize = readSwappedUnsignedShort(); + // At least two bytes are required for the null-terminated name and comment + if (MIN_FIRST_HEADER_SIZE + 2 <= basicHeaderSize && basicHeaderSize <= MAX_BASIC_HEADER_SIZE) { + basicHeaderBytes = org.apache.commons.io.IOUtils.toByteArray(in, basicHeaderSize); + count(basicHeaderSize); + if (checkCRC32(basicHeaderBytes)) { + return basicHeaderBytes; + } + } + // CRC32 failed, continue scanning + } + } catch (EOFException e) { + throw new ArchiveException("Corrupted ARJ archive: unable to find valid main header"); + } + } + /** * Gets the archive's comment. * @@ -263,33 +335,26 @@ private String readEntryName(final InputStream dataIn) throws IOException { * @throws IOException If an I/O error occurs. */ private byte[] readHeader() throws IOException { - byte[] basicHeaderBytes; - // TODO: Explain why we are scanning for a valid ARJ header - // and don't throw, when an invalid/corrupted header is found, - // which might indicate a corrupted archive. - while (true) { - int first; - int second = readUnsignedByte(); - do { - first = second; - second = readUnsignedByte(); - } while (first != ARJ_MAGIC_1 && second != ARJ_MAGIC_2); - final int basicHeaderSize = readSwappedUnsignedShort(); - if (basicHeaderSize == 0) { - // end of archive - return null; - } else if (basicHeaderSize <= 2600) { - basicHeaderBytes = org.apache.commons.io.IOUtils.toByteArray(in, basicHeaderSize); - count(basicHeaderSize); - final long basicHeaderCrc32 = EndianUtils.readSwappedUnsignedInteger(in); - count(4); - final CRC32 crc32 = new CRC32(); - crc32.update(basicHeaderBytes); - if (basicHeaderCrc32 == crc32.getValue()) { - return basicHeaderBytes; - } - } + final int first = readUnsignedByte(); + final int second = readUnsignedByte(); + if (first != ARJ_MAGIC_1 || second != ARJ_MAGIC_2) { + throw new ArchiveException("Corrupted ARJ archive: invalid ARJ header signature 0x%02X 0x%02X", first, second); } + final int basicHeaderSize = readSwappedUnsignedShort(); + if (basicHeaderSize == 0) { + // End of archive + return null; + } + // At least two bytes are required for the null-terminated name and comment + if (basicHeaderSize < MIN_FIRST_HEADER_SIZE + 2 || basicHeaderSize > MAX_BASIC_HEADER_SIZE) { + throw new ArchiveException("Corrupted ARJ archive: invalid ARJ header size %,d", basicHeaderSize); + } + final byte[] basicHeaderBytes = org.apache.commons.io.IOUtils.toByteArray(in, basicHeaderSize); + count(basicHeaderSize); + if (!checkCRC32(basicHeaderBytes)) { + throw new ArchiveException("Corrupted ARJ archive: invalid ARJ header CRC32 checksum"); + } + return basicHeaderBytes; } private LocalFileHeader readLocalFileHeader() throws IOException { @@ -318,8 +383,18 @@ private LocalFileHeader readLocalFileHeader() throws IOException { localFileHeader.fileAccessMode = EndianUtils.readSwappedShort(firstHeader); localFileHeader.firstChapter = readUnsignedByte(firstHeader); localFileHeader.lastChapter = readUnsignedByte(firstHeader); - - readExtraData(firstHeaderSize, firstHeader, localFileHeader); + // Total read (including size byte): 10 + 4 * 4 + 2 * 2 = 30 bytes + + if (firstHeaderSize >= MIN_FIRST_HEADER_SIZE + 4) { + localFileHeader.extendedFilePosition = EndianUtils.readSwappedInteger(firstHeader); + // Total read (including size byte): 30 + 4 = 34 bytes + if (firstHeaderSize >= MIN_FIRST_HEADER_SIZE + 4 + 12) { + localFileHeader.dateTimeAccessed = EndianUtils.readSwappedInteger(firstHeader); + localFileHeader.dateTimeCreated = EndianUtils.readSwappedInteger(firstHeader); + localFileHeader.originalSizeEvenForVolumes = EndianUtils.readSwappedInteger(firstHeader); + // Total read (including size byte): 34 + 12 = 46 bytes + } + } } localFileHeader.name = readEntryName(basicHeader); @@ -331,12 +406,8 @@ private LocalFileHeader readLocalFileHeader() throws IOException { while ((extendedHeaderSize = readSwappedUnsignedShort()) > 0) { final byte[] extendedHeaderBytes = org.apache.commons.io.IOUtils.toByteArray(in, extendedHeaderSize); count(extendedHeaderSize); - final long extendedHeaderCrc32 = EndianUtils.readSwappedUnsignedInteger(in); - count(4); - final CRC32 crc32 = new CRC32(); - crc32.update(extendedHeaderBytes); - if (extendedHeaderCrc32 != crc32.getValue()) { - throw new ArchiveException("Extended header CRC32 verification failure"); + if (!checkCRC32(extendedHeaderBytes)) { + throw new ArchiveException("Corrupted ARJ archive: extended header CRC32 verification failure"); } extendedHeaders.add(extendedHeaderBytes); } @@ -345,8 +416,8 @@ private LocalFileHeader readLocalFileHeader() throws IOException { return localFileHeader; } - private MainHeader readMainHeader() throws IOException { - final byte[] basicHeaderBytes = readHeader(); + private MainHeader readMainHeader(final boolean selfExtracting) throws IOException { + final byte[] basicHeaderBytes = selfExtracting ? findMainHeader() : readHeader(); final MainHeader header = new MainHeader(); try (InputStream basicHeader = new ByteArrayInputStream(basicHeaderBytes)) { @@ -368,12 +439,14 @@ private MainHeader readMainHeader() throws IOException { header.securityEnvelopeLength = EndianUtils.readSwappedShort(firstHeader); header.encryptionVersion = readUnsignedByte(firstHeader); header.lastChapter = readUnsignedByte(firstHeader); + // Total read (including size byte): 10 + 4 * 4 + 2 * 2 = 30 bytes - if (firstHeaderSize >= 33) { + if (firstHeaderSize >= MIN_FIRST_HEADER_SIZE + 4) { header.arjProtectionFactor = readUnsignedByte(firstHeader); header.arjFlags2 = readUnsignedByte(firstHeader); readUnsignedByte(firstHeader); readUnsignedByte(firstHeader); + // Total read (including size byte): 30 + 4 = 34 bytes } } @@ -385,12 +458,8 @@ private MainHeader readMainHeader() throws IOException { if (extendedHeaderSize > 0) { header.extendedHeaderBytes = org.apache.commons.io.IOUtils.toByteArray(in, extendedHeaderSize); count(extendedHeaderSize); - final long extendedHeaderCrc32 = EndianUtils.readSwappedUnsignedInteger(in); - count(4); - final CRC32 crc32 = new CRC32(); - crc32.update(header.extendedHeaderBytes); - if (extendedHeaderCrc32 != crc32.getValue()) { - throw new ArchiveException("Extended header CRC32 verification failure"); + if (!checkCRC32(header.extendedHeaderBytes)) { + throw new ArchiveException("Corrupted ARJ archive: extended header CRC32 verification failure"); } } @@ -407,6 +476,12 @@ private ByteArrayOutputStream readString(final InputStream dataIn) throws IOExce } } + private long readSwappedUnsignedInteger() throws IOException { + final long value = EndianUtils.readSwappedUnsignedInteger(in); + count(4); + return value; + } + private int readSwappedUnsignedShort() throws IOException { final int value = EndianUtils.readSwappedUnsignedShort(in); count(2); diff --git a/src/test/java/org/apache/commons/compress/archivers/arj/ArjArchiveInputStreamTest.java b/src/test/java/org/apache/commons/compress/archivers/arj/ArjArchiveInputStreamTest.java index 8d32be9e15b..cb856e163f0 100644 --- a/src/test/java/org/apache/commons/compress/archivers/arj/ArjArchiveInputStreamTest.java +++ b/src/test/java/org/apache/commons/compress/archivers/arj/ArjArchiveInputStreamTest.java @@ -23,30 +23,71 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertThrows; +import java.io.ByteArrayInputStream; import java.io.EOFException; import java.io.IOException; import java.io.InputStream; +import java.io.SequenceInputStream; import java.nio.charset.Charset; import java.nio.file.Files; import java.nio.file.Path; import java.util.Calendar; import java.util.TimeZone; +import java.util.stream.Stream; +import java.util.zip.CRC32; import org.apache.commons.compress.AbstractTest; +import org.apache.commons.compress.archivers.ArchiveException; +import org.apache.commons.io.EndianUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.io.input.BoundedInputStream; import org.apache.commons.io.output.ByteArrayOutputStream; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; import org.junit.jupiter.params.provider.ValueSource; +import org.junitpioneer.jupiter.cartesian.CartesianTest; /** * Tests {@link ArjArchiveInputStream}. */ class ArjArchiveInputStreamTest extends AbstractTest { + private static byte[] createArjArchiveHeader(int size, boolean computeCrc) { + // Enough space for the fixed-size portion of the header plus: + // - signature (2 bytes) + // - the 2-byte basic header size field itself (2 bytes) + // - at least one byte each for the filename and comment C-strings (2 bytes) + // - the 4-byte CRC-32 that follows the basic header + final byte[] bytes = new byte[4 + size + 10]; + bytes[0] = (byte) 0x60; // ARJ signature + bytes[1] = (byte) 0xEA; + // Basic header size (little-endian) + EndianUtils.writeSwappedShort(bytes, 2, (short) (size + 2)); + // First header size + bytes[4] = (byte) size; + // Compute valid CRC-32 for the basic header + if (computeCrc) { + final CRC32 crc32 = new CRC32(); + crc32.update(bytes, 4, size + 2); + EndianUtils.writeSwappedInteger(bytes, 4 + size + 2, (int) crc32.getValue()); + } + return bytes; + } + + static Stream testSelfExtractingArchive() { + return Stream.of( + new byte[] { 0x10, 0x11, 0x12, 0x13, 0x14 }, + // In a normal context: an archive trailer. + new byte[] { 0x60, (byte) 0xEA, 0x00, 0x00 }, + // Header of valid size, but with an invalid CRC-32. + createArjArchiveHeader(30, false) + ); + } + private void assertArjArchiveEntry(final ArjArchiveEntry entry) { assertNotNull(entry.getName()); assertNotNull(entry.getLastModifiedDate()); @@ -83,12 +124,6 @@ private void assertForEach(final ArjArchiveInputStream archive) throws IOExcepti }); } - @Test - void testFirstHeaderSizeSetToZero() { - assertThrows(IOException.class, - () -> ArjArchiveInputStream.builder().setURI(getURI("org/apache/commons/compress/arj/zero_sized_headers-fail.arj")).get().close()); - } - @Test void testForEach() throws Exception { final StringBuilder expected = new StringBuilder(); @@ -264,6 +299,25 @@ void testReadingOfAttributesUnixVersion() throws Exception { } } + @ParameterizedTest + @MethodSource + void testSelfExtractingArchive(byte[] junk) throws Exception { + final Path validArj = getPath("bla.arj"); + try (InputStream first = new ByteArrayInputStream(junk); + InputStream second = Files.newInputStream(validArj); + SequenceInputStream seq = new SequenceInputStream(first, second); + ArjArchiveInputStream in = ArjArchiveInputStream.builder().setInputStream(seq).setSelfExtracting(true).get()) { + ArjArchiveEntry entry = in.getNextEntry(); + assertNotNull(entry); + assertEquals("test1.xml", entry.getName()); + entry = in.getNextEntry(); + assertNotNull(entry); + assertEquals("test2.xml", entry.getName()); + entry = in.getNextEntry(); + assertNull(entry); + } + } + @Test void testSingleArgumentConstructor() throws Exception { try (InputStream inputStream = Files.newInputStream(getPath("bla.arj")); @@ -283,6 +337,14 @@ void testSingleByteReadConsistentlyReturnsMinusOneAtEof() throws Exception { } } + @CartesianTest + void testSmallFirstHeaderSize( + // 30 is the minimum valid size + @CartesianTest.Values(ints = {0, 1, 10, 29}) int size, @CartesianTest.Values(booleans = {false, true}) boolean selfExtracting) { + final byte[] bytes = createArjArchiveHeader(size, true); + assertThrows(ArchiveException.class, () -> ArjArchiveInputStream.builder().setByteArray(bytes).setSelfExtracting(selfExtracting).get()); + } + /** * Verifies that reading an ARJ header record cut short at various boundaries * results in an {@link EOFException}. @@ -319,7 +381,7 @@ void testSingleByteReadConsistentlyReturnsMinusOneAtEof() throws Exception { // One byte before the first file’s data 0x95 }) - void testTruncatedLocalHeader(final long maxCount) throws Exception { + void testTruncatedLocalHeader(long maxCount) throws Exception { try (InputStream input = BoundedInputStream.builder().setURI(getURI("bla.arj")).setMaxCount(maxCount).get(); ArjArchiveInputStream archive = ArjArchiveInputStream.builder().setInputStream(input).get()) { assertThrows(EOFException.class, () -> { @@ -361,7 +423,7 @@ void testTruncatedLocalHeader(final long maxCount) throws Exception { 0x30, 0x33, // Inside the extended-header length (2 bytes) 0x34}) - void testTruncatedMainHeader(final long maxCount) throws Exception { + void testTruncatedMainHeader(long maxCount) throws Exception { try (InputStream input = BoundedInputStream.builder() .setURI(getURI("bla.arj")) .setMaxCount(maxCount)