Skip to content

Commit 82b108f

Browse files
committed
[CODEC-264] Add hash128x64 methods to fix sign extension error.
Adds a new API to preserve behavioural compatibility with the released version: hash128x64(byte[]) hash128x64(byte[], int offset, int length, int seed) The methods with the sign extension bug have been deprecated. The new API methods use zero as the default seed.
1 parent 1645ab9 commit 82b108f

3 files changed

Lines changed: 167 additions & 3 deletions

File tree

src/changes/changes.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ The <action> type attribute can be add,update,fix,remove.
4343
<body>
4444

4545
<release version="1.14" date="TBD" description="Feature and fix release.">
46+
<action issue="CODEC-264" dev="aherbert" due-to="Claude Warren" type="add">Add MurmurHash3.hash128x64 methods to fix sign extension error during seeding in hash128 methods.</action>
4647
<action issue="CODEC-267" dev="aherbert" due-to="Claude Warren" type="add">Add MurmurHash3.hash32x86 methods and IncrementalHash32x86 to fix sign extension error in hash32 methods.</action>
4748
<action issue="CODEC-269" dev="aherbert" type="fix">Allow repeat calls to MurmurHash3.IncrementalHash32.end() to generate the same value.</action>
4849
</release>

src/main/java/org/apache/commons/codec/digest/MurmurHash3.java

Lines changed: 57 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -554,7 +554,7 @@ public static long hash64(final byte[] data) {
554554
*
555555
* <pre>
556556
* int seed = 104729;
557-
* long hash = hash64(data, offset, length, seed);
557+
* long hash = MurmurHash3.hash64(data, offset, length, seed);
558558
* <pre>
559559
*
560560
* @param data The input byte array
@@ -645,8 +645,9 @@ public static long hash64(final byte[] data, final int offset, final int length,
645645
* This is a helper method that will produce the same result as:
646646
*
647647
* <pre>
648+
* int offset = 0;
648649
* int seed = 104729;
649-
* int hash = hash128(data, 0, data.length, seed);
650+
* int hash = MurmurHash3.hash128(data, offset, data.length, seed);
650651
* </pre>
651652
*
652653
* <p>Note: The sign extension bug in {@link #hash128(byte[], int, int, int)} does not effect
@@ -660,6 +661,24 @@ public static long[] hash128(final byte[] data) {
660661
return hash128(data, 0, data.length, DEFAULT_SEED);
661662
}
662663

664+
/**
665+
* Generates 128-bit hash from the byte array with a seed of zero.
666+
* This is a helper method that will produce the same result as:
667+
*
668+
* <pre>
669+
* int offset = 0;
670+
* int seed = 0;
671+
* int hash = MurmurHash3.hash128x64(data, offset, data.length, 0);
672+
* </pre>
673+
*
674+
* @param data The input byte array
675+
* @return The 128-bit hash (2 longs)
676+
* @see #hash128x64(byte[], int, int, int)
677+
*/
678+
public static long[] hash128x64(final byte[] data) {
679+
return hash128x64(data, 0, data.length, 0);
680+
}
681+
663682
/**
664683
* Generates 128-bit hash from a string with a default seed.
665684
* The string is converted to bytes using the default encoding.
@@ -668,7 +687,7 @@ public static long[] hash128(final byte[] data) {
668687
* <pre>
669688
* int seed = 104729;
670689
* byte[] bytes = data.getBytes();
671-
* int hash = hash128(bytes, 0, bytes.length, seed);
690+
* int hash = MurmurHash3.hash128(bytes, 0, bytes.length, seed);
672691
* </pre>
673692
*
674693
* <p>Note: The sign extension bug in {@link #hash128(byte[], int, int, int)} does not effect
@@ -697,11 +716,46 @@ public static long[] hash128(final String data) {
697716
* @param length The length of array
698717
* @param seed The initial seed value
699718
* @return The 128-bit hash (2 longs)
719+
* @deprecated Use {@link #hash128x64(byte[], int, int, int)}. This corrects the seed initialisation.
700720
*/
721+
@Deprecated
701722
public static long[] hash128(final byte[] data, final int offset, final int length, final int seed) {
702723
// ************
703724
// Note: This fails to apply masking using 0xffffffffL to the seed.
704725
// ************
726+
return hash128x64(data, offset, length, seed);
727+
}
728+
729+
/**
730+
* Generates 128-bit hash from the byte array with the given offset, length and seed.
731+
*
732+
* <p>This is an implementation of the 128-bit hash function {@code MurmurHash3_x64_128}
733+
* from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p>
734+
*
735+
* @param data The input byte array
736+
* @param offset The first element of array
737+
* @param length The length of array
738+
* @param seed The initial seed value
739+
* @return The 128-bit hash (2 longs)
740+
*/
741+
public static long[] hash128x64(final byte[] data, final int offset, final int length, final int seed) {
742+
// Use an unsigned 32-bit integer as the seed
743+
return hash128x64(data, offset, length, seed & 0xffffffffL);
744+
}
745+
746+
/**
747+
* Generates 128-bit hash from the byte array with the given offset, length and seed.
748+
*
749+
* <p>This is an implementation of the 128-bit hash function {@code MurmurHash3_x64_128}
750+
* from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p>
751+
*
752+
* @param data The input byte array
753+
* @param offset The first element of array
754+
* @param length The length of array
755+
* @param seed The initial seed value
756+
* @return The 128-bit hash (2 longs)
757+
*/
758+
private static long[] hash128x64(final byte[] data, final int offset, final int length, final long seed) {
705759
long h1 = seed;
706760
long h2 = seed;
707761
final int nblocks = length >> 4;

src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
/**
3131
* Test for {@link MurmurHash3}.
3232
*/
33+
@SuppressWarnings("deprecation")
3334
public class MurmurHash3Test {
3435

3536
/** Test data for the hash64 method. */
@@ -613,6 +614,114 @@ public void testHash128String() {
613614
}
614615
}
615616

617+
/**
618+
* Test the {@link MurmurHash3#hash128x64(byte[])} algorithm.
619+
*
620+
* <p>Reference data is taken from the Python library {@code mmh3}.</p>
621+
*
622+
* @see <a href="https://pypi.org/project/mmh3/">mmh3</a>
623+
*/
624+
@Test
625+
public void testHash128x64() {
626+
// Note: Default seed is zero.
627+
628+
// mmh3.hash64(bytes, 0)
629+
Assert.assertArrayEquals(new long[] {1972113670104592209L, 5171809317673151911L},
630+
MurmurHash3.hash128x64(RANDOM_BYTES));
631+
632+
// Test with all sizes up to 31 bytes. This ensures a full round of 16-bytes plus up to
633+
// 15 bytes remaining.
634+
// for x in range(0, 32):
635+
// print(mmh3.hash64(bytes[:x], 0), ',')
636+
final long[][] answers = {{0L, 0L}, {-2808653841080383123L, -2531784594030660343L},
637+
{-1284575471001240306L, -8226941173794461820L}, {1645529003294647142L, 4109127559758330427L},
638+
{-4117979116203940765L, -8362902660322042742L}, {2559943399590596158L, 4738005461125350075L},
639+
{-1651760031591552651L, -5386079254924224461L}, {-6208043960690815609L, 7862371518025305074L},
640+
{-5150023478423646337L, 8346305334874564507L}, {7658274117911906792L, -4962914659382404165L},
641+
{1309458104226302269L, 570003296096149119L}, {7440169453173347487L, -3489345781066813740L},
642+
{-5698784298612201352L, 3595618450161835420L}, {-3822574792738072442L, 6878153771369862041L},
643+
{3705084673301918328L, 3202155281274291907L}, {-6797166743928506931L, -4447271093653551597L},
644+
{5240533565589385084L, -5575481185288758327L}, {-8467620131382649428L, -6450630367251114468L},
645+
{3632866961828686471L, -5957695976089313500L}, {-6450283648077271139L, -7908632714374518059L},
646+
{226350826556351719L, 8225586794606475685L}, {-2382996224496980401L, 2188369078123678011L},
647+
{-1337544762358780825L, 7004253486151757299L}, {2889033453638709716L, -4099509333153901374L},
648+
{-8644950936809596954L, -5144522919639618331L}, {-5628571865255520773L, -839021001655132087L},
649+
{-5226774667293212446L, -505255961194269502L}, {1337107025517938142L, 3260952073019398505L},
650+
{9149852874328582511L, 1880188360994521535L}, {-4035957988359881846L, -7709057850766490780L},
651+
{-3842593823306330815L, 3805147088291453755L}, {4030161393619149616L, -2813603781312455238L},};
652+
for (int i = 0; i < answers.length; i++) {
653+
final byte[] bytes = Arrays.copyOf(RANDOM_BYTES, i);
654+
Assert.assertArrayEquals(answers[i], MurmurHash3.hash128x64(bytes));
655+
}
656+
}
657+
658+
/**
659+
* Test the {@link MurmurHash3#hash128x64(byte[], int, int, int)} algorithm.
660+
*
661+
* <p>Reference data is taken from the Python library {@code mmh3}.</p>
662+
*
663+
* @see <a href="https://pypi.org/project/mmh3/">mmh3</a>
664+
*/
665+
@Test
666+
public void testHash128x64WithOffsetLengthAndSeed() {
667+
// Seed can be positive
668+
final int seed = 42;
669+
final int offset = 13;
670+
671+
// Test with all sizes up to 31 bytes. This ensures a full round of 16-bytes plus up to
672+
// 15 bytes remaining.
673+
// for x in range(0, 32):
674+
// print(mmh3.hash64(bytes[13:x+13], 42), ',')
675+
final long[][] answers = {{-1140915396076141277L, -3386313222241793095L},
676+
{2745805417334040752L, -3045882272665292331L}, {6807939080212835946L, -1975749467247671127L},
677+
{-7924884987449335214L, -4468571497642087939L}, {3005389733967167773L, -5809440073240597398L},
678+
{8032745196600164727L, 4545709434702374224L}, {2095398623732573832L, 1778447136435513908L},
679+
{4492723708121417255L, -7411125500882394867L}, {8467397417110552178L, -1503802302645548949L},
680+
{4189760269121918355L, -8004336343217265057L}, {4939298084211301953L, -8419135013628844658L},
681+
{5497136916151148085L, -394028342910298191L}, {3405983294878231737L, -3216533807498089078L},
682+
{5833223403351466775L, -1792451370239813325L}, {7730583391236194819L, 5356157313842354092L},
683+
{3111977482488580945L, -3119414725698132191L}, {3314524606404365027L, -1923219843083192742L},
684+
{7299569240140613949L, 4176392429810027494L}, {6398084683727166117L, 7703960505857395788L},
685+
{-8594572031068184774L, 4394224719145783692L}, {-7589785442804461713L, 4110439243215224554L},
686+
{-5343610105946840628L, -4423992782020122809L}, {-522490326525787270L, 289136460641968781L},
687+
{-5320637070354802556L, -7845553044730489027L}, {1344456408744313334L, 3803048032054968586L},
688+
{1131205296221907191L, -6256656049039287019L}, {8583339267101027117L, 8934225022848628726L},
689+
{-6379552869905441749L, 8973517768420051734L}, {5076646564516328801L, 8561479196844000567L},
690+
{-4610341636137642517L, -6694266039505142069L}, {-758896383254029789L, 4050360662271552727L},
691+
{-6123628195475753507L, 4283875822581966645L},};
692+
for (int i = 0; i < answers.length; i++) {
693+
Assert.assertArrayEquals("Length: " + i, answers[i], MurmurHash3.hash128x64(RANDOM_BYTES, offset, i, seed));
694+
}
695+
696+
// Seed can be negative
697+
final int seed2 = -42;
698+
699+
// Test with all sizes up to 31 bytes. This ensures a full round of 16-bytes plus up to
700+
// 15 bytes remaining.
701+
// for x in range(0, 32):
702+
// print(mmh3.hash64(bytes[13:x+13], -42), ',')
703+
final long[][] answers2 = {{7182599573337898253L, -6490979146667806054L},
704+
{-461284136738605467L, 7073284964362976233L}, {-3090354666589400212L, 2978755180788824810L},
705+
{5052807367580803906L, -4497188744879598335L}, {5003711854877353474L, -6616808651483337088L},
706+
{2043501804923817748L, -760668448196918637L}, {6813003268375229932L, -1818545210475363684L},
707+
{4488070015393027916L, 8520186429078977003L}, {4709278711722456062L, -2262080641289046033L},
708+
{-5944514262756048380L, 5968714500873552518L}, {-2304397529301122510L, 6451500469518446488L},
709+
{-1054078041081348909L, -915114408909600132L}, {1300471646869277217L, -399493387666437046L},
710+
{-2821780479886030222L, -9061571187511294733L}, {8005764841242557505L, 4135287855434326053L},
711+
{318307346637037498L, -5355856739901286522L}, {3380719536119187025L, 1890890833937151467L},
712+
{2691044185935730001L, 7963546423617895734L}, {-5277462388534000227L, 3613853764390780573L},
713+
{8504421722476165699L, 2058020162708103700L}, {-6578421288092422241L, 3311200163790829579L},
714+
{-5915037218487974215L, -7385137075895184179L}, {659642911937398022L, 854071824595671049L},
715+
{-7007237968866727198L, 1372258010932080058L}, {622891376282772539L, -4140783491297489868L},
716+
{8357110718969014985L, -4737117827581590306L}, {2208857857926305405L, -8360240839768465042L},
717+
{858120048221036376L, -5822288789703639119L}, {-1988334009458340679L, 1262479472434068698L},
718+
{-8580307083590783934L, 3634449965473715778L}, {6705664584730187559L, 5192304951463791556L},
719+
{-6426410954037604142L, -1579122709247558101L},};
720+
for (int i = 0; i < answers.length; i++) {
721+
Assert.assertArrayEquals("Length: " + i, answers2[i], MurmurHash3.hash128x64(RANDOM_BYTES, offset, i, seed2));
722+
}
723+
}
724+
616725
/**
617726
* Test {@link IncrementalHash32} returns the same values as
618727
* {@link MurmurHash3#hash32(byte[], int, int, int)}.

0 commit comments

Comments
 (0)