Skip to content

Commit 1645ab9

Browse files
committed
[CODEC-267] Add hash32x86 methods to fix sign extension error.
Adds a new API to preserve behavioural compatibility with the released version: hash32x86(byte[]) hash32x86(byte[], int offset, int length, int seed) IncrementalHash32x64 The methods with the sign extension bug have been deprecated. The new API methods use zero as the default seed. mvn clirr:check is OK with adding a new class IncrementalHash32x64 as the parent of IncrementalHash32.
1 parent 1490fbf commit 1645ab9

3 files changed

Lines changed: 267 additions & 17 deletions

File tree

src/changes/changes.xml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,8 @@ The <action> type attribute can be add,update,fix,remove.
4343
<body>
4444

4545
<release version="1.14" date="TBD" description="Feature and fix release.">
46-
<action issue="CODEC-269" dev="aherbert" type="fix">Allow repeat calls to IncrementalHash32.end() to generate the same value.</action>
46+
<action issue="CODEC-267" dev="aherbert" due-to="Claude Warren" type="add">Add MurmurHash3.hash32x86 methods and IncrementalHash32x86 to fix sign extension error in hash32 methods.</action>
47+
<action issue="CODEC-269" dev="aherbert" type="fix">Allow repeat calls to MurmurHash3.IncrementalHash32.end() to generate the same value.</action>
4748
</release>
4849

4950
<release version="1.13" date="2019-07-20" description="Feature and fix release.">

src/main/java/org/apache/commons/codec/digest/MurmurHash3.java

Lines changed: 144 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,9 @@ public static int hash32(final long data, final int seed) {
217217
* @param data The input byte array
218218
* @return The 32-bit hash
219219
* @see #hash32(byte[], int, int, int)
220+
* @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes.
220221
*/
222+
@Deprecated
221223
public static int hash32(final byte[] data) {
222224
return hash32(data, 0, data.length, DEFAULT_SEED);
223225
}
@@ -240,7 +242,9 @@ public static int hash32(final byte[] data) {
240242
* @param data The input string
241243
* @return The 32-bit hash
242244
* @see #hash32(byte[], int, int, int)
245+
* @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes.
243246
*/
247+
@Deprecated
244248
public static int hash32(final String data) {
245249
final byte[] bytes = data.getBytes();
246250
return hash32(bytes, 0, bytes.length, DEFAULT_SEED);
@@ -263,7 +267,9 @@ public static int hash32(final String data) {
263267
* @param length The length of array
264268
* @return The 32-bit hash
265269
* @see #hash32(byte[], int, int, int)
270+
* @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes.
266271
*/
272+
@Deprecated
267273
public static int hash32(final byte[] data, final int length) {
268274
return hash32(data, length, DEFAULT_SEED);
269275
}
@@ -285,7 +291,9 @@ public static int hash32(final byte[] data, final int length) {
285291
* @param seed The initial seed value
286292
* @return The 32-bit hash
287293
* @see #hash32(byte[], int, int, int)
294+
* @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes.
288295
*/
296+
@Deprecated
289297
public static int hash32(final byte[] data, final int length, final int seed) {
290298
return hash32(data, 0, length, seed);
291299
}
@@ -305,7 +313,9 @@ public static int hash32(final byte[] data, final int length, final int seed) {
305313
* @param length The length of array
306314
* @param seed The initial seed value
307315
* @return The 32-bit hash
316+
* @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes.
308317
*/
318+
@Deprecated
309319
public static int hash32(final byte[] data, final int offset, final int length, final int seed) {
310320
int hash = seed;
311321
final int nblocks = length >> 2;
@@ -342,6 +352,69 @@ public static int hash32(final byte[] data, final int offset, final int length,
342352
return fmix32(hash);
343353
}
344354

355+
/**
356+
* Generates 32-bit hash from the byte array with a seed of zero.
357+
* This is a helper method that will produce the same result as:
358+
*
359+
* <pre>
360+
* int offset = 0;
361+
* int seed = 0;
362+
* int hash = hash32x86(data, offset, data.length, seed);
363+
* </pre>
364+
*
365+
* @param data The input byte array
366+
* @return The 32-bit hash
367+
* @see #hash32x86(byte[], int, int, int)
368+
*/
369+
public static int hash32x86(final byte[] data) {
370+
return hash32x86(data, 0, data.length, 0);
371+
}
372+
373+
/**
374+
* Generates 32-bit hash from the byte array with the given offset, length and seed.
375+
*
376+
* <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32}
377+
* from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p>
378+
*
379+
* @param data The input byte array
380+
* @param offset The offset of data
381+
* @param length The length of array
382+
* @param seed The initial seed value
383+
* @return The 32-bit hash
384+
*/
385+
public static int hash32x86(final byte[] data, final int offset, final int length, final int seed) {
386+
int hash = seed;
387+
final int nblocks = length >> 2;
388+
389+
// body
390+
for (int i = 0; i < nblocks; i++) {
391+
final int index = offset + (i << 2);
392+
final int k = getLittleEndianInt(data, index);
393+
hash = mix32(k, hash);
394+
}
395+
396+
// tail
397+
final int index = offset + (nblocks << 2);
398+
int k1 = 0;
399+
switch (offset + length - index) {
400+
case 3:
401+
k1 ^= (data[index + 2] & 0xff) << 16;
402+
case 2:
403+
k1 ^= (data[index + 1] & 0xff) << 8;
404+
case 1:
405+
k1 ^= (data[index] & 0xff);
406+
407+
// mix functions
408+
k1 *= C1_32;
409+
k1 = Integer.rotateLeft(k1, R1_32);
410+
k1 *= C2_32;
411+
hash ^= k1;
412+
}
413+
414+
hash ^= length;
415+
return fmix32(hash);
416+
}
417+
345418
/**
346419
* Generates 64-bit hash from a long with a default seed.
347420
*
@@ -804,12 +877,8 @@ private static long fmix64(long hash) {
804877
*
805878
* <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32}
806879
* from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p>
807-
*
808-
* <p>This implementation contains a sign-extension bug in the finalisation step of
809-
* any bytes left over from dividing the length by 4. This manifests if any of these
810-
* bytes are negative.<p>
811880
*/
812-
public static class IncrementalHash32 {
881+
public static class IncrementalHash32x86 {
813882
/** The size of byte blocks that are processed together. */
814883
private static final int BLOCK_SIZE = 4;
815884

@@ -916,26 +985,33 @@ public final void add(final byte[] data, final int offset, final int length) {
916985
* Generate the 32-bit hash value. Repeat calls to this method with no additional data
917986
* will generate the same hash value.
918987
*
919-
* <p>This implementation contains a sign-extension bug in the finalisation step of
920-
* any bytes left over from dividing the length by 4. This manifests if any of these
921-
* bytes are negative.<p>
922-
*
923988
* @return The 32-bit hash
924989
*/
925990
public final int end() {
926991
// Allow calling end() again after adding no data to return the same result.
992+
return finalise(hash, unprocessedLength, unprocessed, totalLen);
993+
}
994+
995+
/**
996+
* Finalise the running hash to the output 32-bit hash by processing remaining bytes
997+
* and performing final mixing.
998+
*
999+
* @param hash The running hash
1000+
* @param unprocessedLength The number of unprocessed bytes in the tail data.
1001+
* @param unprocessed Up to 3 unprocessed bytes from input data.
1002+
* @param totalLen The total number of input bytes added since the start.
1003+
* @return The 32-bit hash
1004+
*/
1005+
int finalise(int hash, int unprocessedLength, byte[] unprocessed, int totalLen) {
9271006
int result = hash;
928-
// ************
929-
// Note: This fails to apply masking using 0xff to the 3 remaining bytes.
930-
// ************
9311007
int k1 = 0;
9321008
switch (unprocessedLength) {
9331009
case 3:
934-
k1 ^= unprocessed[2] << 16;
1010+
k1 ^= (unprocessed[2] & 0xff) << 16;
9351011
case 2:
936-
k1 ^= unprocessed[1] << 8;
1012+
k1 ^= (unprocessed[1] & 0xff) << 8;
9371013
case 1:
938-
k1 ^= unprocessed[0];
1014+
k1 ^= (unprocessed[0] & 0xff);
9391015

9401016
// mix functions
9411017
k1 *= C1_32;
@@ -964,4 +1040,57 @@ private static int orBytes(final byte b1, final byte b2, final byte b3, final by
9641040
return (b1 & 0xff) | ((b2 & 0xff) << 8) | ((b3 & 0xff) << 16) | ((b4 & 0xff) << 24);
9651041
}
9661042
}
1043+
1044+
/**
1045+
* Generates 32-bit hash from input bytes. Bytes can be added incrementally and the new
1046+
* hash computed.
1047+
*
1048+
* <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32}
1049+
* from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p>
1050+
*
1051+
* <p>This implementation contains a sign-extension bug in the finalisation step of
1052+
* any bytes left over from dividing the length by 4. This manifests if any of these
1053+
* bytes are negative.<p>
1054+
*
1055+
* @deprecated Use IncrementalHash32x86. This corrects the processing of trailing bytes.
1056+
*/
1057+
@Deprecated
1058+
public static class IncrementalHash32 extends IncrementalHash32x86 {
1059+
/**
1060+
* {@inheritDoc}
1061+
*
1062+
* <p>This implementation contains a sign-extension bug in the finalisation step of
1063+
* any bytes left over from dividing the length by 4. This manifests if any of these
1064+
* bytes are negative.<p>
1065+
*
1066+
* @deprecated Use IncrementalHash32x86. This corrects the processing of trailing bytes.
1067+
*/
1068+
@Override
1069+
@Deprecated
1070+
int finalise(int hash, int unprocessedLength, byte[] unprocessed, int totalLen) {
1071+
int result = hash;
1072+
// ************
1073+
// Note: This fails to apply masking using 0xff to the 3 remaining bytes.
1074+
// ************
1075+
int k1 = 0;
1076+
switch (unprocessedLength) {
1077+
case 3:
1078+
k1 ^= unprocessed[2] << 16;
1079+
case 2:
1080+
k1 ^= unprocessed[1] << 8;
1081+
case 1:
1082+
k1 ^= unprocessed[0];
1083+
1084+
// mix functions
1085+
k1 *= C1_32;
1086+
k1 = Integer.rotateLeft(k1, R1_32);
1087+
k1 *= C2_32;
1088+
result ^= k1;
1089+
}
1090+
1091+
// finalization
1092+
result ^= totalLen;
1093+
return fmix32(result);
1094+
}
1095+
}
9671096
}

src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java

Lines changed: 121 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import java.util.concurrent.ThreadLocalRandom;
2525

2626
import org.apache.commons.codec.digest.MurmurHash3.IncrementalHash32;
27+
import org.apache.commons.codec.digest.MurmurHash3.IncrementalHash32x86;
2728
import org.junit.Test;
2829

2930
/**
@@ -349,7 +350,7 @@ public void testHash32String() {
349350
* if the final 1, 2, or 3 bytes are negative.
350351
*/
351352
@Test
352-
public void testHash32With1TrailingSignedByteIsInvalid() {
353+
public void testHash32WithTrailingNegativeSignedBytesIsInvalid() {
353354
// import mmh3
354355
// import numpy as np
355356
// mmh3.hash(np.uint8([-1]))
@@ -366,6 +367,71 @@ public void testHash32With1TrailingSignedByteIsInvalid() {
366367
Assert.assertNotEquals(-225068062, MurmurHash3.hash32(new byte[] {0, -1, 0}, 0, 3, 0));
367368
}
368369

370+
/**
371+
* Test the {@link MurmurHash3#hash32x86(byte[])} algorithm.
372+
*
373+
* <p>Reference data is taken from the Python library {@code mmh3}.</p>
374+
*
375+
* @see <a href="https://pypi.org/project/mmh3/">mmh3</a>
376+
*/
377+
@Test
378+
public void testhash32x86() {
379+
// Note: Default seed is zero.
380+
381+
// mmh3.hash(bytes, 0)
382+
Assert.assertEquals(1546271276, MurmurHash3.hash32x86(RANDOM_BYTES));
383+
384+
// Test with all sizes up to 31 bytes. This ensures a full round of 16-bytes plus up to
385+
// 15 bytes remaining.
386+
// for x in range(0, 32):
387+
// print(mmh3.hash(bytes[:x], 0), ',')
388+
final int[] answers = {0, -1353253853, 915381745, -734983419, 1271125654, -1042265893, -1204521619, 735845843,
389+
138310876, -1918938664, 1399647898, -1126342309, 2067593280, 1220975287, 1941281084, -1289513180, 942412060,
390+
-618173583, -269546647, -1645631262, 1162379906, -1960125577, -1856773195, 1980513522, 1174612855,
391+
905810751, 1044578220, -1758486689, -491393913, 839836946, -435014415, 2044851178,};
392+
for (int i = 0; i < answers.length; i++) {
393+
final byte[] bytes = Arrays.copyOf(RANDOM_BYTES, i);
394+
Assert.assertEquals(answers[i], MurmurHash3.hash32x86(bytes));
395+
}
396+
}
397+
398+
/**
399+
* Test the {@link MurmurHash3#hash32x86(byte[], int, int, int)} algorithm.
400+
*
401+
* <p>Reference data is taken from the Python library {@code mmh3}.</p>
402+
*
403+
* @see <a href="https://pypi.org/project/mmh3/">mmh3</a>
404+
*/
405+
@Test
406+
public void testHash32x86WithOffsetLengthAndSeed() {
407+
// Data as above for testing MurmurHash3.hash32(byte[], int, int, int).
408+
final int seed = -42;
409+
final int offset = 13;
410+
final int[] answers = {192929823, -27171978, -1282326280, -816314453, -1176217753, -1904531247, 1962794233,
411+
-1302316624, -1151850323, -1464386748, -369299427, 972232488, 1747314487, 2137398916, 690986564,
412+
-1985866226, -678669121, -2123325690, -253319081, 46181235, 656058278, 1401175653, 1750113912, -1567219725,
413+
2032742772, -2024269989, -305340794, 1161737942, -661265418, 172838872, -650122718, -1934812417,};
414+
for (int i = 0; i < answers.length; i++) {
415+
Assert.assertEquals(answers[i], MurmurHash3.hash32x86(RANDOM_BYTES, offset, i, seed));
416+
}
417+
}
418+
419+
/**
420+
* Test to demonstrate {@link MurmurHash3#hash32x86(byte[], int, int, int)} is OK
421+
* if the final 1, 2, or 3 bytes are negative.
422+
*/
423+
@Test
424+
public void testHash32x86WithTrailingNegativeSignedBytes() {
425+
// Data as above for testing MurmurHash3.hash32(byte[], int, int, int).
426+
// This test uses assertEquals().
427+
Assert.assertEquals(-43192051, MurmurHash3.hash32x86(new byte[] {-1}, 0, 1, 0));
428+
Assert.assertEquals(-582037868, MurmurHash3.hash32x86(new byte[] {0, -1}, 0, 2, 0));
429+
Assert.assertEquals(922088087, MurmurHash3.hash32x86(new byte[] {0, 0, -1}, 0, 3, 0));
430+
Assert.assertEquals(-1309567588, MurmurHash3.hash32x86(new byte[] {-1, 0}, 0, 2, 0));
431+
Assert.assertEquals(-363779670, MurmurHash3.hash32x86(new byte[] {-1, 0, 0}, 0, 3, 0));
432+
Assert.assertEquals(-225068062, MurmurHash3.hash32x86(new byte[] {0, -1, 0}, 0, 3, 0));
433+
}
434+
369435
/**
370436
* Test the {@link MurmurHash3#hash64(byte[])} algorithm.
371437
* Unknown origin of test data. It may be from the Apache Hive project.
@@ -600,4 +666,58 @@ private static void assertIncrementalHash32(byte[] bytes, int seed, int... block
600666
Assert.assertEquals("Hashes differ after no additional data", h1, inc.end());
601667
}
602668
}
669+
670+
/**
671+
* Test {@link IncrementalHash32x86} returns the same values as
672+
* {@link MurmurHash3#hash32x86(byte[], int, int, int)}.
673+
*/
674+
@Test
675+
public void testIncrementalHash32x86() {
676+
final byte[] bytes = new byte[1023];
677+
ThreadLocalRandom.current().nextBytes(bytes);
678+
// The seed does not matter
679+
for (final int seed : new int[] {-567, 0, 6787990}) {
680+
// Cases are constructed to hit all edge cases of processing:
681+
// Nothing added
682+
assertIncrementalHash32x86(bytes, seed, 0, 0);
683+
// Add single bytes
684+
assertIncrementalHash32x86(bytes, seed, 1, 1, 1, 1, 1, 1, 1, 1);
685+
// Leading unprocessed 1, 2, 3
686+
assertIncrementalHash32x86(bytes, seed, 1, 4);
687+
assertIncrementalHash32x86(bytes, seed, 2, 4);
688+
assertIncrementalHash32x86(bytes, seed, 3, 4);
689+
// Trailing unprocessed 1, 2, 3
690+
assertIncrementalHash32x86(bytes, seed, 4, 1);
691+
assertIncrementalHash32x86(bytes, seed, 4, 2);
692+
assertIncrementalHash32x86(bytes, seed, 4, 3);
693+
// Complete blocks
694+
assertIncrementalHash32x86(bytes, seed, 4, 16, 64);
695+
}
696+
}
697+
698+
/**
699+
* Assert {@link IncrementalHash32x86} returns the same values as
700+
* {@link MurmurHash3#hash32x86(byte[], int, int, int)}.
701+
*
702+
* <p>The bytes are added to the incremental hash in the given blocks.</p>
703+
*
704+
* @param bytes the bytes
705+
* @param seed the seed
706+
* @param blocks the blocks
707+
*/
708+
private static void assertIncrementalHash32x86(byte[] bytes, int seed, int... blocks) {
709+
int offset = 0;
710+
int total = 0;
711+
final IncrementalHash32x86 inc = new IncrementalHash32x86();
712+
inc.start(seed);
713+
for (final int block : blocks) {
714+
total += block;
715+
final int h1 = MurmurHash3.hash32x86(bytes, 0, total, seed);
716+
inc.add(bytes, offset, block);
717+
offset += block;
718+
final int h2 = inc.end();
719+
Assert.assertEquals("Hashes differ", h1, h2);
720+
Assert.assertEquals("Hashes differ after no additional data", h1, inc.end());
721+
}
722+
}
603723
}

0 commit comments

Comments
 (0)