diff --git a/pom.xml b/pom.xml index cd58814c9c..825f66bfed 100644 --- a/pom.xml +++ b/pom.xml @@ -476,6 +476,12 @@ ${commons.junit.version} test + + org.junit.jupiter + junit-jupiter-params + ${commons.junit.version} + test + org.junit.vintage junit-vintage-engine @@ -488,12 +494,6 @@ - - org.junit.jupiter - junit-jupiter-params - ${commons.junit.version} - test - org.hamcrest hamcrest diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilter.java deleted file mode 100644 index 18e1fee029..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilter.java +++ /dev/null @@ -1,200 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.commons.collections4.bloomfilter; - -import java.util.PrimitiveIterator.OfInt; -import java.util.function.LongBinaryOperator; - -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity; -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; - -/** - * An abstract Bloom filter providing default implementations for most Bloom filter - * functions. Specific implementations are encouraged to override the methods that can be - * more efficiently implemented. - *

- * This abstract class provides additional functionality not declared in the interface. - * Specifically: - *

- * - * @since 4.5 - */ -public abstract class AbstractBloomFilter implements BloomFilter { - - /** - * The shape used by this BloomFilter - */ - private final Shape shape; - - /** - * Construct a Bloom filter with the specified shape. - * - * @param shape The shape. - */ - protected AbstractBloomFilter(final Shape shape) { - this.shape = shape; - } - - @Override - public int andCardinality(final BloomFilter other) { - verifyShape(other); - final long[] mine = getBits(); - final long[] theirs = other.getBits(); - final int limit = Integer.min(mine.length, theirs.length); - int count = 0; - for (int i = 0; i < limit; i++) { - count += Long.bitCount(mine[i] & theirs[i]); - } - return count; - } - - @Override - public int cardinality() { - int count = 0; - for (final long bits : getBits()) { - count += Long.bitCount(bits); - } - return count; - } - - @Override - public boolean contains(final BloomFilter other) { - verifyShape(other); - return other.cardinality() == andCardinality(other); - } - - @Override - public boolean contains(final Hasher hasher) { - verifyHasher(hasher); - final long[] buff = getBits(); - - final OfInt iter = hasher.iterator(shape); - while (iter.hasNext()) { - final int idx = iter.nextInt(); - BloomFilterIndexer.checkPositive(idx); - final int buffIdx = BloomFilterIndexer.getLongIndex(idx); - final long buffOffset = BloomFilterIndexer.getLongBit(idx); - if ((buff[buffIdx] & buffOffset) == 0) { - return false; - } - } - return true; - } - - @Override - public final Shape getShape() { - return shape; - } - - /** - * Determines if the bloom filter is "full". Full is defined as having no unset - * bits. - * - * @return true if the filter is full. - */ - public final boolean isFull() { - return cardinality() == getShape().getNumberOfBits(); - } - - @Override - public int orCardinality(final BloomFilter other) { - // Logical OR - return opCardinality(other, (a, b) -> a | b); - } - - /** - * Verifies that the hasher has the same name as the shape. 
- * - * @param hasher the Hasher to check - */ - protected void verifyHasher(final Hasher hasher) { - // It is assumed that the filter and hasher have been constructed using the - // same hash function. Use the signature for a fast check the hash function is equal. - // Collisions will occur at a rate of 1 in 2^64. - if (shape.getHashFunctionIdentity().getSignature() != hasher.getHashFunctionIdentity().getSignature()) { - throw new IllegalArgumentException( - String.format("Hasher (%s) is not the hasher for shape (%s)", - HashFunctionIdentity.asCommonString(hasher.getHashFunctionIdentity()), - shape.toString())); - } - } - - /** - * Verify the other Bloom filter has the same shape as this Bloom filter. - * - * @param other the other filter to check. - * @throws IllegalArgumentException if the shapes are not the same. - */ - protected void verifyShape(final BloomFilter other) { - verifyShape(other.getShape()); - } - - /** - * Verify the specified shape has the same shape as this Bloom filter. - * - * @param shape the other shape to check. - * @throws IllegalArgumentException if the shapes are not the same. - */ - protected void verifyShape(final Shape shape) { - if (!this.shape.equals(shape)) { - throw new IllegalArgumentException(String.format("Shape %s is not the same as %s", shape, this.shape)); - } - } - - @Override - public int xorCardinality(final BloomFilter other) { - // Logical XOR - return opCardinality(other, (a, b) -> a ^ b); - } - - /** - * Perform the operation on the matched longs from this filter and the other filter - * and count the cardinality. - * - *

The remaining unmatched longs from the larger filter are always counted. This - * method is suitable for OR and XOR cardinality. - * - * @param other the other Bloom filter. - * @param operation the operation (e.g. OR, XOR) - * @return the cardinality - */ - private int opCardinality(final BloomFilter other, final LongBinaryOperator operation) { - verifyShape(other); - final long[] mine = getBits(); - final long[] theirs = other.getBits(); - final long[] small; - final long[] big; - if (mine.length > theirs.length) { - big = mine; - small = theirs; - } else { - small = mine; - big = theirs; - } - int count = 0; - for (int i = 0; i < small.length; i++) { - count += Long.bitCount(operation.applyAsLong(small[i], big[i])); - } - for (int i = small.length; i < big.length; i++) { - count += Long.bitCount(big[i]); - } - return count; - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java index 0722b92576..b6f120ce2c 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java @@ -16,18 +16,13 @@ */ package org.apache.commons.collections4.bloomfilter; -import java.util.BitSet; -import java.util.NoSuchElementException; -import java.util.PrimitiveIterator; -import java.util.PrimitiveIterator.OfInt; -import java.util.function.IntConsumer; - -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; -import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher; +import java.util.Objects; +import java.util.function.IntPredicate; +import java.util.function.LongPredicate; +import java.util.stream.IntStream; /** - * A counting Bloom filter using an array to track counts for each enabled bit + * A counting Bloom filter using an 
int array to track counts for each enabled bit * index. * *

Any operation that results in negative counts or integer overflow of @@ -35,13 +30,13 @@ * The operation is completed in full, no exception is raised and the state is * set to invalid. This allows the counts for the filter immediately prior to the * operation that created the invalid state to be recovered. See the documentation - * in {@link #isValid()} for details. + * in {@link #isValid()} for details.

* *

All the operations in the filter assume the counts are currently valid, - * for example cardinality or contains operations. Behaviour of an invalid + * for example {@code cardinality} or {@code contains} operations. Behavior of an invalid * filter is undefined. It will no longer function identically to a standard * Bloom filter that is the merge of all the Bloom filters that have been added - * to and not later subtracted from the counting Bloom filter. + * to and not later subtracted from the counting Bloom filter.

* *

The maximum supported number of items that can be stored in the filter is * limited by the maximum array size combined with the {@link Shape}. For @@ -53,7 +48,12 @@ * @see Shape * @since 4.5 */ -public class ArrayCountingBloomFilter extends AbstractBloomFilter implements CountingBloomFilter { +public final class ArrayCountingBloomFilter implements CountingBloomFilter { + + /** + * The shape of this Bloom filter. + */ + private final Shape shape; /** * The count of each bit index in the filter. @@ -61,20 +61,20 @@ public class ArrayCountingBloomFilter extends AbstractBloomFilter implements Cou private final int[] counts; /** - * The state flag. This is a bitwise OR of the entire history of all updated + * The state flag. This is a bitwise {@code OR} of the entire history of all updated * counts. If negative then a negative count or integer overflow has occurred on * one or more counts in the history of the filter and the state is invalid. * *

Maintenance of this state flag is branch-free for improved performance. It * eliminates a conditional check for a negative count during remove/subtract * operations and a conditional check for integer overflow during merge/add - * operations. + * operations.

* *

Note: Integer overflow is unlikely in realistic usage scenarios. A count * that overflows indicates that the number of items in the filter exceeds the * maximum possible size (number of bits) of any Bloom filter constrained by * integer indices. At this point the filter is most likely full (all bits are - * non-zero) and thus useless. + * non-zero) and thus useless.

* *

Negative counts are a concern if the filter is used incorrectly by * removing an item that was never added. It is expected that a user of a @@ -82,174 +82,108 @@ public class ArrayCountingBloomFilter extends AbstractBloomFilter implements Cou * Enabling an explicit recovery path for negative or overflow counts is a major * performance burden not deemed necessary for the unlikely scenarios when an * invalid state is created. Maintenance of the state flag is a concession to - * flag improper use that should not have a major performance impact. + * flag improper use that should not have a major performance impact.

*/ private int state; - /** - * An iterator of all indexes with non-zero counts. - * - *

In the event that the filter state is invalid any index with a negative count - * will also be produced by the iterator. - */ - private class IndexIterator implements PrimitiveIterator.OfInt { - /** The next non-zero index (or counts.length). */ - private int next; - - /** - * Create an instance. - */ - IndexIterator() { - advance(); - } - - /** - * Advance to the next non-zero index. - */ - void advance() { - while (next < counts.length && counts[next] == 0) { - next++; - } - } - - @Override - public boolean hasNext() { - return next < counts.length; - } - - @Override - public int nextInt() { - if (hasNext()) { - final int result = next++; - advance(); - return result; - } - // Currently unreachable as the iterator is only used by - // the StaticHasher which iterates correctly. - throw new NoSuchElementException(); - } - } - /** * Constructs an empty counting Bloom filter with the specified shape. * * @param shape the shape of the filter + * */ public ArrayCountingBloomFilter(final Shape shape) { - super(shape); + Objects.requireNonNull(shape, "shape"); + this.shape = shape; counts = new int[shape.getNumberOfBits()]; } - @Override - public int cardinality() { - int size = 0; - for (final int c : counts) { - if (c != 0) { - size++; - } - } - return size; + private ArrayCountingBloomFilter(ArrayCountingBloomFilter source) { + this.shape = source.shape; + this.state = source.state; + this.counts = source.counts.clone(); } @Override - public boolean contains(final BloomFilter other) { - // The AbstractBloomFilter implementation converts both filters to long[] bits. - // This would involve checking all indexes in this filter against zero. - // Ideally we use an iterator of bit indexes to allow fail-fast on the - // first bit index that is zero. - if (other instanceof ArrayCountingBloomFilter) { - verifyShape(other); - return contains(((ArrayCountingBloomFilter) other).iterator()); - } - - // Note: - // This currently creates a StaticHasher which stores all the indexes. 
- // It would greatly benefit from direct generation of the index iterator - // avoiding the intermediate storage. - return contains(other.getHasher()); + public ArrayCountingBloomFilter copy() { + return new ArrayCountingBloomFilter(this); } @Override - public boolean contains(final Hasher hasher) { - verifyHasher(hasher); - return contains(hasher.iterator(getShape())); - } - - /** - * Return true if this filter is has non-zero counts for each index in the iterator. - * - * @param iter the iterator - * @return true if this filter contains all the indexes - */ - private boolean contains(final OfInt iter) { - while (iter.hasNext()) { - if (counts[iter.nextInt()] == 0) { - return false; - } - } + public boolean isSparse() { return true; } @Override - public long[] getBits() { - final BitSet bs = new BitSet(); - for (int i = 0; i < counts.length; i++) { - if (counts[i] != 0) { - bs.set(i); - } - } - return bs.toLongArray(); + public int cardinality() { + return (int) IntStream.range(0, counts.length).filter(i -> counts[i] > 0).count(); } @Override - public StaticHasher getHasher() { - return new StaticHasher(iterator(), getShape()); + public CountingBloomFilter merge(BloomFilter other) { + Objects.requireNonNull(other, "other"); + CountingBloomFilter filter = copy(); + filter.add(BitCountProducer.from(other)); + return filter; } - /** - * Returns an iterator over the enabled indexes in this filter. - * Any index with a non-zero count is considered enabled. - * The iterator returns indexes in their natural order. 
- * - * @return an iterator over the enabled indexes - */ - private PrimitiveIterator.OfInt iterator() { - return new IndexIterator(); + @Override + public CountingBloomFilter merge(Hasher hasher) { + Objects.requireNonNull(hasher, "hasher"); + ArrayCountingBloomFilter filter = copy(); + try { + filter.add(BitCountProducer.from(hasher.uniqueIndices(shape))); + } catch (IndexOutOfBoundsException e) { + throw new IllegalArgumentException( + String.format("Filter only accepts values in the [0,%d) range", shape.getNumberOfBits())); + } + return filter; } @Override - public boolean merge(final BloomFilter other) { - applyAsBloomFilter(other, this::increment); - return isValid(); + public boolean mergeInPlace(final BloomFilter other) { + Objects.requireNonNull(other, "other"); + try { + return add(BitCountProducer.from(other)); + } catch (IndexOutOfBoundsException e) { + throw new IllegalArgumentException( e ); + } } @Override - public boolean merge(final Hasher hasher) { - applyAsHasher(hasher, this::increment); - return isValid(); + public boolean mergeInPlace(final Hasher hasher) { + Objects.requireNonNull(hasher, "hasher"); + try { + return add(BitCountProducer.from(hasher.uniqueIndices(shape))); + } catch (IndexOutOfBoundsException e) { + throw new IllegalArgumentException( + String.format("Filter only accepts values in the [0,%d) range", shape.getNumberOfBits())); + } } @Override public boolean remove(final BloomFilter other) { - applyAsBloomFilter(other, this::decrement); - return isValid(); + Objects.requireNonNull(other, "other"); + return subtract(BitCountProducer.from(other)); } @Override public boolean remove(final Hasher hasher) { - applyAsHasher(hasher, this::decrement); - return isValid(); + Objects.requireNonNull(hasher, "hasher"); + return subtract(BitCountProducer.from(hasher.uniqueIndices(shape))); } @Override - public boolean add(final CountingBloomFilter other) { - applyAsCountingBloomFilter(other, this::add); + public boolean add(final 
BitCountProducer other) { + Objects.requireNonNull(other, "other"); + other.forEachCount(this::add); return isValid(); } @Override - public boolean subtract(final CountingBloomFilter other) { - applyAsCountingBloomFilter(other, this::subtract); + public boolean subtract(final BitCountProducer other) { + Objects.requireNonNull(other, "other"); + other.forEachCount(this::subtract); return isValid(); } @@ -258,14 +192,14 @@ public boolean subtract(final CountingBloomFilter other) { * *

Implementation note * - *

The state transition to invalid is permanent. + *

The state transition to invalid is permanent.

* *

This implementation does not correct negative counts to zero or integer * overflow counts to {@link Integer#MAX_VALUE}. Thus the operation that * generated invalid counts can be reversed by using the complement of the * original operation with the same Bloom filter. This will restore the counts * to the state prior to the invalid operation. Counts can then be extracted - * using {@link #forEachCount(BitCountConsumer)}. + * using {@link #forEachCount(BitCountConsumer)}.

*/ @Override public boolean isValid() { @@ -273,69 +207,31 @@ public boolean isValid() { } @Override - public void forEachCount(final BitCountConsumer action) { + public boolean forEachCount(final BitCountProducer.BitCountConsumer consumer) { + Objects.requireNonNull(consumer, "consumer"); for (int i = 0; i < counts.length; i++) { - if (counts[i] != 0) { - action.accept(i, counts[i]); + if (counts[i] != 0 && !consumer.test(i, counts[i])) { + return false; } } + return true; } - /** - * Apply the action for each index in the Bloom filter. - */ - private void applyAsBloomFilter(final BloomFilter other, final IntConsumer action) { - verifyShape(other); - if (other instanceof ArrayCountingBloomFilter) { - // Only use the presence of non-zero and not the counts - final int[] counts2 = ((ArrayCountingBloomFilter) other).counts; - for (int i = 0; i < counts2.length; i++) { - if (counts2[i] != 0) { - action.accept(i); - } + @Override + public boolean forEachIndex(IntPredicate consumer) { + Objects.requireNonNull(consumer, "consumer"); + for (int i = 0; i < counts.length; i++) { + if (counts[i] != 0 && !consumer.test(i)) { + return false; } - } else { - BitSet.valueOf(other.getBits()).stream().forEach(action); } + return true; } - /** - * Apply the action for each index in the hasher. - */ - private void applyAsHasher(final Hasher hasher, final IntConsumer action) { - verifyHasher(hasher); - // We do not naturally handle duplicates so filter them. - IndexFilters.distinctIndexes(hasher, getShape(), action); - } - - /** - * Apply the action for each index in the Bloom filter. - */ - private void applyAsCountingBloomFilter(final CountingBloomFilter other, final BitCountConsumer action) { - verifyShape(other); - other.forEachCount(action); - } - - /** - * Increment to the count for the bit index. 
- * - * @param idx the index - */ - private void increment(final int idx) { - final int updated = counts[idx] + 1; - state |= updated; - counts[idx] = updated; - } - - /** - * Decrement from the count for the bit index. - * - * @param idx the index - */ - private void decrement(final int idx) { - final int updated = counts[idx] - 1; - state |= updated; - counts[idx] = updated; + @Override + public boolean forEachBitMap(LongPredicate consumer) { + Objects.requireNonNull(consumer, "consumer"); + return BitMapProducer.fromIndexProducer(this, shape.getNumberOfBits()).forEachBitMap(consumer); } /** @@ -343,11 +239,13 @@ private void decrement(final int idx) { * * @param idx the index * @param addend the amount to add + * @return {@code true} always. */ - private void add(final int idx, final int addend) { + private boolean add(final int idx, final int addend) { final int updated = counts[idx] + addend; state |= updated; counts[idx] = updated; + return true; } /** @@ -355,10 +253,32 @@ private void add(final int idx, final int addend) { * * @param idx the index * @param subtrahend the amount to subtract + * @return {@code true} always. 
*/ - private void subtract(final int idx, final int subtrahend) { + private boolean subtract(final int idx, final int subtrahend) { final int updated = counts[idx] - subtrahend; state |= updated; counts[idx] = updated; + return true; + } + + @Override + public Shape getShape() { + return shape; + } + + @Override + public boolean contains(IndexProducer indexProducer) { + return indexProducer.forEachIndex(idx -> this.counts[idx] != 0); + } + + @Override + public boolean contains(BitMapProducer bitMapProducer) { + return contains(IndexProducer.fromBitMapProducer(bitMapProducer)); + } + + @Override + public int[] asIndexArray() { + return IntStream.range(0, counts.length).filter(i -> counts[i] > 0).toArray(); } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java new file mode 100644 index 0000000000..aea07b36e7 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.function.IntPredicate; + +/** + * Produces bit counts for counting type Bloom filters. + * + * @since 4.5 + */ +@FunctionalInterface +public interface BitCountProducer extends IndexProducer { + + /** + * Performs the given action for each {@code } pair where the count is non-zero. + * Any exceptions thrown by the action are relayed to the caller. The consumer is applied to each + * index-count pair, if the consumer returns {@code false} the execution is stopped, {@code false} + * is returned, and no further pairs are processed. + * + *

Must only process each index once, and must process indexes in order.

+ * + * @param consumer the action to be performed for each non-zero bit count + * @return {@code true} if all count pairs return true from consumer, {@code false} otherwise. + * @throws NullPointerException if the specified action is null + */ + boolean forEachCount(BitCountConsumer consumer); + + @Override + default boolean forEachIndex(IntPredicate predicate) { + return forEachCount((i, v) -> predicate.test(i)); + } + + /** + * Creates a BitCountProducer from an IndexProducer. The resulting + * producer will count each enabled bit once. + * @param idx An index producer. + * @return A BitCountProducer with the same indices as the IndexProducer. + */ + static BitCountProducer from(IndexProducer idx) { + return new BitCountProducer() { + @Override + public boolean forEachCount(BitCountConsumer consumer) { + return idx.forEachIndex(i -> consumer.test(i, 1)); + } + }; + } + + /** + * Represents an operation that accepts an {@code } pair representing + * the count for a bit index in a Bit Count Producer Bloom filter and returns {@code true} + * if processing should continue, {@code false} otherwise. + * + *

Note: This is a functional interface as a specialization of + * {@link java.util.function.BiPredicate} for {@code int}.

+ */ + @FunctionalInterface + interface BitCountConsumer { + /** + * Performs this operation on the given {@code <index, count>} pair. + * + * @param index the bit index. + * @param count the count at the specified bit index. + * @return {@code true} if processing should continue, {@code false} if processing should stop. + */ + boolean test(int index, int count); + } +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilterIndexer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java similarity index 53% rename from src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilterIndexer.java rename to src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java index fe9b1161a9..f6f744ef78 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilterIndexer.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java @@ -17,25 +17,57 @@ package org.apache.commons.collections4.bloomfilter; /** - * Contains functions to convert {@code int} indices into Bloom filter bit positions. + * Contains functions to convert {@code int} indices into Bloom filter bit positions and vice versa. + * + *

The functions view an array of longs as a collection of bit maps each containing 64 bits. The bits are arranged + * in memory as a little-endian long value. This matches the requirements of the BitMapProducer interface.

+ * + * @since 4.5 */ -public final class BloomFilterIndexer { +public class BitMap { /** A bit shift to apply to an integer to divided by 64 (2^6). */ private static final int DIVIDE_BY_64 = 6; /** Do not instantiate. */ - private BloomFilterIndexer() {} + private BitMap() { + } + + /** + * Calculates the number of bit maps (longs) required for the numberOfBits parameter. + * + *

If the input is negative the behavior is not defined.

+ * + * @param numberOfBits the number of bits to store in the array of bit maps. + * @return the number of bit maps necessary. + */ + public static int numberOfBitMaps(int numberOfBits) { + return ((numberOfBits - 1) >> DIVIDE_BY_64) + 1; + } + + /** + * Checks if the specified index bit is enabled in the array of bit maps. + * + * If the bit specified by bitIndex is not in the bit map false is returned. + * + * @param bitMaps The array of bit maps. + * @param bitIndex the index of the bit to locate. + * @return {@code true} if the bit is enabled, {@code false} otherwise. + * @throws IndexOutOfBoundsException if bitIndex specifies a bit not in the range being tracked. + */ + public static boolean contains(long[] bitMaps, int bitIndex) { + return (bitMaps[getLongIndex(bitIndex)] & getLongBit(bitIndex)) != 0; + } /** - * Check the index is positive. + * Sets the bit in the bit maps. + *

Does not perform range checking

* - * @param bitIndex the bit index - * @throws IndexOutOfBoundsException if the index is not positive + * @param bitMaps The array of bit maps. + * @param bitIndex the index of the bit to set. + * @throws IndexOutOfBoundsException if bitIndex specifies a bit not in the range being tracked. */ - public static void checkPositive(final int bitIndex) { - if (bitIndex < 0) { - throw new IndexOutOfBoundsException("Negative bitIndex: " + bitIndex); - } + public static void set(long[] bitMaps, int bitIndex) { + bitMaps[getLongIndex(bitIndex)] |= getLongBit(bitIndex); } /** @@ -43,20 +75,21 @@ public static void checkPositive(final int bitIndex) { * to store bits starting at index 0. * *

The index is assumed to be positive. For a positive index the result will match - * {@code bitIndex / 64}. + * {@code bitIndex / 64}.

* - *

The divide is performed using bit shifts. If the input is negative the behavior - * is not defined. + *

The divide is performed using bit shifts. If the input is negative the behavior + * is not defined.

* * @param bitIndex the bit index (assumed to be positive) - * @return the filter index - * @see #checkPositive(int) + * @return the index of the bit map in an array of bit maps. */ public static int getLongIndex(final int bitIndex) { - // An integer divide by 64 is equivalent to a shift of 6 bits if the integer is positive. + // An integer divide by 64 is equivalent to a shift of 6 bits if the integer is + // positive. // We do not explicitly check for a negative here. Instead we use a // a signed shift. Any negative index will produce a negative value - // by sign-extension and if used as an index into an array it will throw an exception. + // by sign-extension and if used as an index into an array it will throw an + // exception. return bitIndex >> DIVIDE_BY_64; } @@ -66,13 +99,12 @@ public static int getLongIndex(final int bitIndex) { * 1 bit set. * *

The index is assumed to be positive. For a positive index the result will match - * {@code 1L << (bitIndex % 64)}. + * {@code 1L << (bitIndex % 64)}.

* - *

If the input is negative the behavior is not defined. + *

If the input is negative the behavior is not defined.

* * @param bitIndex the bit index (assumed to be positive) * @return the filter bit - * @see #checkPositive(int) */ public static long getLongBit(final int bitIndex) { // Bit shifts only use the first 6 bits. Thus it is not necessary to mask this diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java new file mode 100644 index 0000000000..84561eba55 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java @@ -0,0 +1,185 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.Arrays; +import java.util.Objects; +import java.util.function.LongPredicate; + +/** + * Produces bit map longs for a Bloom filter. + * + * Each bit map is a little-endian long value representing a block of bits of in a filter. + * + *

The returned array will have length {@code ceil(m / 64)} where {@code m} is the + * number of bits in the filter and {@code ceil} is the ceiling function. + * Bits 0-63 are in the first long. A value of 1 at a bit position indicates the bit + * index is enabled. + *

+ * The default implementations of the {@code makePredicate()} and {@code asBitMapArray} methods + * are slow and should be reimplemented in the implementing classes where possible.

+ * + * @since 4.5 + */ +@FunctionalInterface +public interface BitMapProducer { + + /** + * Each bit map is passed to the predicate in order. The predicate is applied to each + * bit map value, if the predicate returns {@code false} the execution is stopped, {@code false} + * is returned, and no further bit maps are processed. + * + *

If the producer is empty this method will return true.

+ * + *

Any exceptions thrown by the action are relayed to the caller.

+ * + * @param predicate the function to execute + * @return {@code true} if all bit maps returned {@code true}, {@code false} otherwise. + * @throws NullPointerException if the specified consumer is null + */ + boolean forEachBitMap(LongPredicate predicate); + + /** + * Applies the {@code func} to each bit map pair in order. Will apply all of the bit maps from the other + * BitMapProducer to this producer. If this producer does not have as many bit maps it will provide 0 (zero) + * for all excess calls to the LongBiPredicate. + *

+ * The default implementation of this method uses {@code asBitMapArray()}. It is recommended that implementations + * of BitMapProducer that have local arrays reimplement this method.

+ * + * @param other The other BitMapProducer that provides the y values in the (x,y) pair. + * @param func The function to apply. + * @return A LongPredicate that tests this BitMapProducers bitmap values in order. + */ + default boolean forEachBitMapPair(BitMapProducer other, LongBiPredicate func) { + CountingLongPredicate p = new CountingLongPredicate(asBitMapArray(), func); + return other.forEachBitMap(p) && p.forEachRemaining(); + } + + /** + * Return a copy of the BitMapProducer data as a bit map array. + *

+ * The default implementation of this method is slow. It is recommended + * that implementing classes reimplement this method. + *

+ * @return An array of bit map data. + */ + default long[] asBitMapArray() { + class Bits { + private long[] data = new long[16]; + private int size; + + boolean add(long bits) { + if (size == data.length) { + // This will throw an out-of-memory error if there are too many bits. + // Since bits are addressed using 32-bit signed integer indices + // the maximum length should be ~2^31 / 2^6 = ~2^25. + // Any more is a broken implementation. + data = Arrays.copyOf(data, size * 2); + } + data[size++] = bits; + return true; + } + + long[] toArray() { + // Edge case to avoid a large array copy + return size == data.length ? data : Arrays.copyOf(data, size); + } + } + Bits bits = new Bits(); + forEachBitMap(bits::add); + return bits.toArray(); + } + + /** + * Creates a BitMapProducer from an array of Long. + * @param bitMaps the bit maps to return. + * @return a BitMapProducer. + */ + static BitMapProducer fromBitMapArray(long... bitMaps) { + return new BitMapProducer() { + @Override + public boolean forEachBitMap(LongPredicate predicate) { + for (long word : bitMaps) { + if (!predicate.test(word)) { + return false; + } + } + return true; + } + + @Override + public long[] asBitMapArray() { + return Arrays.copyOf(bitMaps, bitMaps.length); + } + + @Override + public boolean forEachBitMapPair(BitMapProducer other, LongBiPredicate func) { + CountingLongPredicate p = new CountingLongPredicate(bitMaps, func); + return other.forEachBitMap(p) && p.forEachRemaining(); + } + }; + } + + /** + * Creates a BitMapProducer from an IndexProducer. + * @param producer the IndexProducer that specifies the indexes of the bits to enable. + * @param numberOfBits the number of bits in the Bloom filter. + * @return A BitMapProducer that produces the bit maps equivalent of the Indices from the producer. 
+ */ + static BitMapProducer fromIndexProducer(IndexProducer producer, int numberOfBits) { + Objects.requireNonNull(producer, "producer"); + Objects.requireNonNull(numberOfBits, "numberOfBits"); + + long[] result = new long[BitMap.numberOfBitMaps(numberOfBits)]; + producer.forEachIndex(i -> { + BitMap.set(result, i); + return true; + }); + return fromBitMapArray(result); + } + + /** + * A long predicate that applies the test func to each member of the @{code ary} in sequence for each call to @{code test()}. + * if the @{code ary} is exhausted, the subsequent calls to to @{code test} are executed with a zero value. + * If the calls to @{code test} do not exhaust the @{code ary} the @{code forEachRemaining} method can be called to + * execute the @code{text} with a zero value for each remaining @{code idx} value. + * + */ + class CountingLongPredicate implements LongPredicate { + int idx = 0; + final long[] ary; + final LongBiPredicate func; + + CountingLongPredicate(long[] ary, LongBiPredicate func) { + this.ary = ary; + this.func = func; + } + + @Override + public boolean test(long other) { + return func.test(idx == ary.length ? 0 : ary[idx++], other); + } + + boolean forEachRemaining() { + while (idx != ary.length && func.test(ary[idx], 0)) { + idx++; + } + return idx == ary.length; + } + } +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BitSetBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitSetBloomFilter.java deleted file mode 100644 index de55cbe93d..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BitSetBloomFilter.java +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter; - -import java.util.BitSet; -import java.util.PrimitiveIterator.OfInt; -import java.util.function.IntConsumer; - -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; -import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher; - -/** - * A bloom filter using a Java BitSet to track enabled bits. This is a standard - * implementation and should work well for most Bloom filters. - * @since 4.5 - */ -public class BitSetBloomFilter extends AbstractBloomFilter { - - /** - * The bitSet that defines this BloomFilter. - */ - private final BitSet bitSet; - - /** - * Constructs an empty BitSetBloomFilter. - * - * @param shape the desired shape of the filter. 
- */ - public BitSetBloomFilter(final Shape shape) { - super(shape); - this.bitSet = new BitSet(); - } - - @Override - public int andCardinality(final BloomFilter other) { - if (other instanceof BitSetBloomFilter) { - verifyShape(other); - final BitSet result = (BitSet) bitSet.clone(); - result.and(((BitSetBloomFilter) other).bitSet); - return result.cardinality(); - } - return super.andCardinality(other); - } - - @Override - public int cardinality() { - return bitSet.cardinality(); - } - - @Override - public boolean contains(final Hasher hasher) { - verifyHasher(hasher); - final OfInt iter = hasher.iterator(getShape()); - while (iter.hasNext()) { - if (!bitSet.get(iter.nextInt())) { - return false; - } - } - return true; - } - - @Override - public long[] getBits() { - return bitSet.toLongArray(); - } - - @Override - public StaticHasher getHasher() { - return new StaticHasher(bitSet.stream().iterator(), getShape()); - } - - @Override - public boolean merge(final BloomFilter other) { - verifyShape(other); - if (other instanceof BitSetBloomFilter) { - bitSet.or(((BitSetBloomFilter) other).bitSet); - } else { - bitSet.or(BitSet.valueOf(other.getBits())); - } - return true; - } - - @Override - public boolean merge(final Hasher hasher) { - verifyHasher(hasher); - hasher.iterator(getShape()).forEachRemaining((IntConsumer) bitSet::set); - return true; - } - - @Override - public int orCardinality(final BloomFilter other) { - if (other instanceof BitSetBloomFilter) { - verifyShape(other); - final BitSet result = (BitSet) bitSet.clone(); - result.or(((BitSetBloomFilter) other).bitSet); - return result.cardinality(); - } - return super.orCardinality(other); - } - - @Override - public int xorCardinality(final BloomFilter other) { - if (other instanceof BitSetBloomFilter) { - verifyShape(other); - final BitSet result = (BitSet) bitSet.clone(); - result.xor(((BitSetBloomFilter) other).bitSet); - return result.cardinality(); - } - return super.xorCardinality(other); - } -} diff 
--git a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java index af43ddd51e..9a4e6324fa 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java @@ -16,138 +16,237 @@ */ package org.apache.commons.collections4.bloomfilter; -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; -import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher; +import java.util.Objects; /** * The interface that describes a Bloom filter. + *

+ * See implementation notes for BitMapProducer and IndexProducer. + *

+ * @see BitMapProducer + * @see IndexProducer * @since 4.5 */ -public interface BloomFilter { - - // Query Operations +public interface BloomFilter extends IndexProducer, BitMapProducer { /** - * Gets the shape of this filter. - * - * @return the shape of this filter + * Creates a new instance of the BloomFilter with the same properties as the current one. + * @return a copy of this BloomFilter */ - Shape getShape(); + BloomFilter copy(); + + // Query Operations /** - * Gets an array of little-endian long values representing the bits of this filter. + * This method is used to determine the best method for matching. * - *

The returned array will have length {@code ceil(m / 64)} where {@code m} is the - * number of bits in the filter and {@code ceil} is the ceiling function. - * Bits 0-63 are in the first long. A value of 1 at a bit position indicates the bit - * index is enabled. + *

For `sparse` implementations + * the {@code forEachIndex(IntPredicate predicate)} method is more efficient. For non `sparse` implementations + * the {@code forEachBitMap(LongPredicate predicate)} is more efficient. Implementers should determine if it is easier + * for the implementation to produce indexes or bit map blocks.

* - * @return the {@code long[]} representation of this filter + * @return {@code true} if the implementation is sparse {@code false} otherwise. + * @see BitMap */ - long[] getBits(); + boolean isSparse(); /** - * Creates a StaticHasher that contains the indexes of the bits that are on in this - * filter. - * - * @return a StaticHasher for that produces this Bloom filter + * Gets the shape that was used when the filter was built. + * @return The shape the filter was built with. */ - StaticHasher getHasher(); + Shape getShape(); /** - * Returns {@code true} if this filter contains the specified filter. Specifically this + * Returns {@code true} if this filter contains the specified filter. + * + *

Specifically this * returns {@code true} if this filter is enabled for all bits that are enabled in the * {@code other} filter. Using the bit representations this is - * effectively {@code (this AND other) == other}. + * effectively {@code (this AND other) == other}.

* * @param other the other Bloom filter - * @return true if this filter is enabled for all enabled bits in the other filter - * @throws IllegalArgumentException if the shape of the other filter does not match - * the shape of this filter + * @return true if all enabled bits in the other filter are enabled in this filter. */ - boolean contains(BloomFilter other); + default boolean contains(BloomFilter other) { + Objects.requireNonNull(other, "other"); + return isSparse() ? contains((IndexProducer) other) : contains((BitMapProducer) other); + } /** - * Returns {@code true} if this filter contains the specified decomposed Bloom filter. - * Specifically this returns {@code true} if this filter is enabled for all bit indexes - * identified by the {@code hasher}. Using the bit representations this is - * effectively {@code (this AND hasher) == hasher}. + * Returns {@code true} if this filter contains the bits specified in the hasher. + * + *

Specifically this returns {@code true} if this filter is enabled for all bit indexes + * identified by the {@code hasher}. Using the bit map representations this is + * effectively {@code (this AND hasher) == hasher}.

* * @param hasher the hasher to provide the indexes * @return true if this filter is enabled for all bits specified by the hasher - * @throws IllegalArgumentException if the hasher cannot generate indices for the shape of - * this filter */ - boolean contains(Hasher hasher); + default boolean contains(Hasher hasher) { + Objects.requireNonNull(hasher, "Hasher"); + Shape shape = getShape(); + return contains(hasher.indices(shape)); + } - // Modification Operations + /** + * Returns {@code true} if this filter contains the indices specified IndexProducer. + * + *

Specifically this returns {@code true} if this filter is enabled for all bit indexes + * identified by the {@code IndexProducer}.

+ * + * @param indexProducer the IndexProducer to provide the indexes + * @return {@code true} if this filter is enabled for all bits specified by the IndexProducer + */ + boolean contains(IndexProducer indexProducer); /** - * Merges the specified Bloom filter into this Bloom filter. Specifically all bit indexes - * that are enabled in the {@code other} filter will be enabled in this filter. + * Returns {@code true} if this filter contains the bits specified in the bit maps produced by the + * bitMapProducer. * - *

Note: This method should return {@code true} even if no additional bit indexes were - * enabled. A {@code false} result indicates that this filter is not ensured to contain - * the {@code other} Bloom filter. + * @param bitMapProducer the the {@code BitMapProducer} to provide the bit maps. + * @return {@code true} if this filter is enabled for all bits specified by the bit maps + */ + default boolean contains(BitMapProducer bitMapProducer) { + return forEachBitMapPair(bitMapProducer, (x, y) -> (x & y) == y); + } + + // update operations + + /** + * Merges the specified Bloom filter with this Bloom filter creating a new Bloom filter. + * + *

Specifically all bit indexes that are enabled in the {@code other} and in {@code this} filter will be + * enabled in the resulting filter.

* * @param other the other Bloom filter + * @return The new Bloom filter. + */ + default BloomFilter merge(BloomFilter other) { + Objects.requireNonNull(other, "other"); + BloomFilter result = copy(); + result.mergeInPlace(other); + return result; + } + + /** + * Merges the specified Hasher with this Bloom filter and returns a new Bloom filter. + * + *

Specifically all bit indexes that are identified by the {@code hasher} and in {@code this} Bloom filter + * will be enabled in the resulting filter.

+ * + * @param hasher the hasher to provide the indices + * @return the new Bloom filter. + */ + default BloomFilter merge(Hasher hasher) { + Objects.requireNonNull(hasher, "hasher"); + BloomFilter result = copy(); + result.mergeInPlace(hasher); + return result; + } + + /** + * Merges the specified Bloom filter into this Bloom filter. + * + *

Specifically all + * bit indexes that are identified by the {@code other} will be enabled in this filter.

+ * + *

Note: This method should return {@code true} even if no additional bit indexes were + * enabled. A {@code false} result indicates that this filter may or may not contain + * the {@code other} Bloom filter. This state may occur in complex Bloom filter implementations like + * counting Bloom filters.

+ * + * @param other The bloom filter to merge into this one. * @return true if the merge was successful - * @throws IllegalArgumentException if the shape of the other filter does not match - * the shape of this filter */ - boolean merge(BloomFilter other); + boolean mergeInPlace(BloomFilter other); /** - * Merges the specified decomposed Bloom filter into this Bloom filter. Specifically all + * Merges the specified hasher into this Bloom filter. Specifically all * bit indexes that are identified by the {@code hasher} will be enabled in this filter. * - *

Note: This method should return {@code true} even if no additional bit indexes were - * enabled. A {@code false} result indicates that this filter is not ensured to contain - * the specified decomposed Bloom filter. + *

Note: This method should return {@code true} even if no additional bit indexes were + * enabled. A {@code false} result indicates that this filter may or may not contain + * the {@code other} Bloom filter. This state may occur in complex Bloom filter implementations like + * counting Bloom filters.

* - * @param hasher the hasher to provide the indexes + * @param hasher The hasher to merge. * @return true if the merge was successful - * @throws IllegalArgumentException if the hasher cannot generate indices for the shape of - * this filter */ - boolean merge(Hasher hasher); + default boolean mergeInPlace(Hasher hasher) { + Objects.requireNonNull(hasher, "hasher"); + Shape shape = getShape(); + // create the bloomfilter that is most likely to merge quickly with this one + BloomFilter result = isSparse() ? new SparseBloomFilter(shape, hasher) : new SimpleBloomFilter(shape, hasher); + return mergeInPlace(result); + } // Counting Operations + /** + * Determines if the bloom filter is "full". + * + *

Full is defined as having no unset bits.

+ * + * @return {@code true} if the filter is full, {@code false} otherwise. + */ + default boolean isFull() { + return cardinality() == getShape().getNumberOfBits(); + } + /** * Gets the cardinality (number of enabled bits) of this Bloom filter. * - *

This is also known as the Hamming value.

+ *

This is also known as the Hamming value or Hamming number.

* * @return the cardinality of this filter */ int cardinality(); /** - * Performs a logical "AND" with the other Bloom filter and returns the cardinality - * (number of enabled bits) of the result. + * Estimates the number of items in the Bloom filter. * - * @param other the other Bloom filter - * @return the cardinality of the result of {@code (this AND other)} + *

By default this is the rounding of the {@code Shape.estimateN(cardinality)} calculation for the + * shape and cardinality of this filter.

+ * + *

This produces an estimate roughly equivalent to the number of Hashers that have been merged into the filter.

+ * + * @return an estimate of the number of items in the bloom filter. + * @see Shape#estimateN(int) */ - int andCardinality(BloomFilter other); + default int estimateN() { + return (int) Math.round(getShape().estimateN(cardinality())); + } /** - * Performs a logical "OR" with the other Bloom filter and returns the cardinality - * (number of enabled bits) of the result. + * Estimates the number of items in the union of this Bloom filter with the other bloom filter. * - * @param other the other Bloom filter - * @return the cardinality of the result of {@code (this OR other)} + *

By default this is the {@code estimateN()} of the merging of this filter with the {@code other} filter.

+ * + *

This produces an estimate roughly equivalent to the number of unique Hashers that have been merged into either + * of the filters.

+ * + * @param other The other Bloom filter + * @return an estimate of the number of items in the union. + * @see #estimateN() */ - int orCardinality(BloomFilter other); + default int estimateUnion(BloomFilter other) { + Objects.requireNonNull(other, "other"); + return this.merge(other).estimateN(); + } /** - * Performs a logical "XOR" with the other Bloom filter and returns the cardinality - * (number of enabled bits) of the result. + * Estimates the number of items in the intersection of this Bloom filter with the other bloom filter. * - * @param other the other Bloom filter - * @return the cardinality of the result of {@code (this XOR other)} + *

By default this is the {@code estimateN() + other.estimateN() - estimateUnion(other)}

+ * + *

This produces an estimate roughly equivalent to the number of unique Hashers that have been merged into both + * of the filters.

+ * + * @param other The other Bloom filter + * @return an estimate of the number of items in the intersection. */ - int xorCardinality(BloomFilter other); + default int estimateIntersection(BloomFilter other) { + Objects.requireNonNull(other, "other"); + return estimateN() + other.estimateN() - estimateUnion(other); + } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java index 0c414ebe93..49655351f5 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java @@ -16,8 +16,6 @@ */ package org.apache.commons.collections4.bloomfilter; -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; - /** * The interface that describes a Bloom filter that associates a count with each * bit index to allow reversal of merge operations with remove operations. @@ -27,7 +25,7 @@ * to and not later subtracted from the counting Bloom filter. The functional * state of a CountingBloomFilter at the start and end of a series of merge and * subsequent remove operations of the same Bloom filters, irrespective of - * remove order, is expected to be the same. + * remove order, is expected to be the same.

* *

Removal of a filter that has not previously been merged results in an * invalid state where the counts no longer represent a sum of merged Bloom @@ -36,166 +34,148 @@ * undetected. The CountingBloomFilter maintains a state flag that is used as a * warning that an operation was performed that resulted in invalid counts and * thus an invalid state. For example this may occur if a count for an index was - * set to negative following a remove operation. + * set to negative following a remove operation.

* *

Implementations should document the expected state of the filter after an * operation that generates invalid counts, and any potential recovery options. * An implementation may support a reversal of the operation to restore the * state to that prior to the operation. In the event that invalid counts are * adjusted to a valid range then it should be documented if there has been - * irreversible information loss. + * irreversible information loss.

* *

Implementations may choose to throw an exception during an operation that * generates invalid counts. Implementations should document the expected state * of the filter after such an operation. For example are the counts not updated, - * partially updated or updated entirely before the exception is raised. + * partially updated or updated entirely before the exception is raised.

* * @since 4.5 */ -public interface CountingBloomFilter extends BloomFilter { - - /** - * Represents an operation that accepts an {@code } pair representing - * the count for a bit index in a counting Bloom filter and returns no result. - * - *

Note: This is a functional interface as a primitive type specialization of - * {@link java.util.function.BiConsumer} for {@code int}. - */ - @FunctionalInterface - interface BitCountConsumer { - /** - * Performs this operation on the given {@code } pair. - * - * @param index the bit index - * @param count the count at the specified bit index - */ - void accept(int index, int count); - } +public interface CountingBloomFilter extends BloomFilter, BitCountProducer { // Query Operations /** - * Returns true if the internal state is valid. This flag is a warning that an addition or + * Returns {@code true} if the internal state is valid. + * + *

This flag is a warning that an addition or * subtraction of counts from this filter resulted in an invalid count for one or more * indexes. For example this may occur if a count for an index was * set to negative following a subtraction operation, or overflows an {@code int} following an - * addition operation. + * addition operation.

* *

A counting Bloom filter that has an invalid state is no longer ensured to function * identically to a standard Bloom filter instance that is the merge of all the Bloom filters - * that have been added to and not later subtracted from this counting Bloom filter. + * that have been added to and not later subtracted from this counting Bloom filter.

* *

Note: The change to an invalid state may or may not be reversible. Implementations * are expected to document their policy on recovery from an addition or removal operation - * that generated an invalid state. + * that generated an invalid state.

* - * @return true if the state is valid + * @return {@code true} if the state is valid */ boolean isValid(); - /** - * Performs the given action for each {@code } pair where the count is non-zero. - * Any exceptions thrown by the action are relayed to the caller. - * - * @param action the action to be performed for each non-zero bit count - * @throws NullPointerException if the specified action is null - */ - void forEachCount(BitCountConsumer action); - // Modification Operations /** - * Merges the specified Bloom filter into this Bloom filter. Specifically all counts for - * indexes that are enabled in the {@code other} filter will be incremented by 1. + * Removes the specified Bloom filter from this Bloom filter. + * + *

Specifically: all counts for the indexes identified by the {@code other} filter will be decremented by 1.

* - *

Note: If the other filter is a counting Bloom filter the index counts are ignored; only - * the enabled indexes are used. + *

Note: If the other filter is a counting Bloom filter the index counts are ignored and it is treated as an + * IndexProducer.

* - *

This method will return true if the filter is valid after the operation. + *

This method will return {@code true} if the filter is valid after the operation.

* - * @param other {@inheritDoc} - * @return true if the merge was successful and the state is valid - * @throws IllegalArgumentException {@inheritDoc} + * @param other the other Bloom filter + * @return {@code true} if the removal was successful and the state is valid * @see #isValid() + * @see #subtract(BitCountProducer) */ - @Override - boolean merge(BloomFilter other); + boolean remove(BloomFilter other); /** - * Merges the specified decomposed Bloom filter into this Bloom filter. Specifically all - * counts for the distinct indexes that are identified by the {@code hasher} will - * be incremented by 1. If the {@code hasher} contains duplicate bit indexes these are ignored. + * Removes the specified hasher from the Bloom filter from this Bloom filter. + * + *

Specifically all counts for the indices produced by the {@code hasher} will be + * decremented by 1.

* - *

This method will return true if the filter is valid after the operation. + *

For HasherCollections each enclosed Hasher will be considered a single item and decremented + * from the counts separately.

* - * @param hasher {@inheritDoc} - * @return true if the merge was successful and the state is valid - * @throws IllegalArgumentException {@inheritDoc} + *

This method will return {@code true} if the filter is valid after the operation.

+ * + * @param hasher the hasher to provide the indexes + * @return {@code true} if the removal was successful and the state is valid * @see #isValid() + * @see #subtract(BitCountProducer) */ - @Override - boolean merge(Hasher hasher); + boolean remove(Hasher hasher); /** - * Removes the specified Bloom filter from this Bloom filter. Specifically - * all counts for the indexes identified by the {@code other} filter will be decremented by 1. + * Adds the specified BitCountProducer to this Bloom filter. * - *

Note: If the other filter is a counting Bloom filter the index counts are ignored; only - * the enabled indexes are used. + *

Specifically + * all counts for the indexes identified by the {@code other} will be incremented + * by their corresponding values in the {@code other}.

* - *

This method will return true if the filter is valid after the operation. + *

This method will return {@code true} if the filter is valid after the operation.

* - * @param other the other Bloom filter - * @return true if the removal was successful and the state is valid - * @throws IllegalArgumentException if the shape of the other filter does not match - * the shape of this filter + * @param other the BitCountProducer to add. + * @return {@code true} if the addition was successful and the state is valid * @see #isValid() - * @see #subtract(CountingBloomFilter) + * @see #subtract(BitCountProducer) */ - boolean remove(BloomFilter other); + boolean add(BitCountProducer other); /** - * Removes the specified decomposed Bloom filter from this Bloom filter. Specifically - * all counts for the distinct indexes identified by the {@code hasher} will be - * decremented by 1. If the {@code hasher} contains duplicate bit indexes these are ignored. + * Subtracts the specified BitCountProducer from this Bloom filter. * -

This method will return true if the filter is valid after the operation. + *

Specifically + * all counts for the indexes identified by the {@code other} will be decremented + * by their corresponding values in the {@code other}.

* - * @param hasher the hasher to provide the indexes - * @return true if the removal was successful and the state is valid - * @throws IllegalArgumentException if the hasher cannot generate indices for the shape of - * this filter + *

This method will return {@code true} if the filter is valid after the operation.

+ * + * @param other the BitCountProducer to subtract. + * @return {@code true} if the subtraction was successful and the state is valid * @see #isValid() + * @see #add(BitCountProducer) */ - boolean remove(Hasher hasher); + boolean subtract(BitCountProducer other); /** - * Adds the specified counting Bloom filter to this Bloom filter. Specifically - * all counts for the indexes identified by the {@code other} filter will be incremented - * by their corresponding counts in the {@code other} filter. + * Merges the specified Bloom filter into this Bloom filter to produce a new CountingBloomFilter. * - *

This method will return true if the filter is valid after the operation. + *

Specifically the new Bloom filter will contain all the counts of this filter and in addition + * all bit indexes that are enabled in the {@code other} filter will be incremented + * by one in the new filter.

* - * @param other the other counting Bloom filter - * @return true if the addition was successful and the state is valid - * @throws IllegalArgumentException if the shape of the other filter does not match - * the shape of this filter - * @see #isValid() + *

Note: the validity of the resulting filter is not guaranteed. When in doubt {@code isValid()} + * should be called on the new filter.

+ * + * @param other the other Bloom filter + * @return A new CountingBloomFilter instance. */ - boolean add(CountingBloomFilter other); + @Override + CountingBloomFilter merge(BloomFilter other); /** - * Adds the specified counting Bloom filter to this Bloom filter. Specifically - * all counts for the indexes identified by the {@code other} filter will be decremented - * by their corresponding counts in the {@code other} filter. + * Merges the specified hasher with this Bloom filter to create a new CountingBloomFilter. * - *

This method will return true if the filter is valid after the operation. + *

Specifically the new Bloom filter will contain all the counts of this filter and in addition + * all bit indexes specified by the {@code hasher} will be incremented + * by one in the new filter.

* - * @param other the other counting Bloom filter - * @return true if the subtraction was successful and the state is valid - * @throws IllegalArgumentException if the shape of the other filter does not match - * the shape of this filter - * @see #isValid() + *

For HasherCollections each enclosed Hasher will be considered a single item and increment + * the counts separately.

+ * + *

Note: the validity of the resulting filter is not guaranteed. When in doubt {@code isValid()} + * should be called on the new filter.

+ * + * @param hasher the hasher to provide the indexes + * @return A new CountingBloomFilter instance. */ - boolean subtract(CountingBloomFilter other); + @Override + CountingBloomFilter merge(Hasher hasher); } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/Hasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/Hasher.java new file mode 100644 index 0000000000..3afd9fbe08 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/Hasher.java @@ -0,0 +1,173 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.function.IntPredicate; + +/** + * A Hasher creates IndexProducer based on the hash implementation and the + * provided Shape. + * + * @since 4.5 + */ +public interface Hasher { + + /** + * Creates an IndexProducer for this hasher based on the Shape. + * + *

The {@code IndexProducer} will create indices within the range defined by the number of bits in + * the shape. The total number of indices will respect the number of hash functions per item + * defined by the shape. However the count of indices may not be a multiple of the number of + * hash functions if the implementation has removed duplicates.

+ * + *

This IndexProducer must be deterministic in that it must return the same indices for the + * same Shape.

+ * + *

No guarantee is made as to order of indices.

+ *

Duplicate indices for a single item may be produced.

+ * + * @param shape the shape of the desired Bloom filter. + * @return the iterator of integers + */ + IndexProducer indices(Shape shape); + + /** + * Creates an IndexProducer of unique indices for this hasher based on the Shape. + * + *

This is like the {@code indices(Shape)} method except that it adds the guarantee that no + * duplicate values will be returned.

+ * + * @param shape the shape of the desired Bloom filter. + * @return the iterator of integers + */ + IndexProducer uniqueIndices(Shape shape); + + /** + * A convenience class for Hasher implementations to filter out duplicate indices. + * + *

If the index is negative the behavior is not defined.

+ * + *

This is conceptually a unique filter implemented as an {@code IntPredicate}.

+ * @since 4.5 + */ + final class IndexFilter implements IntPredicate { + private final IntPredicate tracker; + private final int size; + private final IntPredicate consumer; + + /** + * Creates an instance optimized for the specified shape. + * @param shape The shape that is being generated. + * @param consumer The consumer to accept the values. + * @return an IndexFilter optimized for the specified shape. + */ + public static IndexFilter create(Shape shape, IntPredicate consumer) { + return new IndexFilter(shape, consumer); + } + + /** + * Creates an instance optimized for the specified shape. + * @param shape The shape that is being generated. + * @param consumer The consumer to accept the values. + */ + private IndexFilter(Shape shape, IntPredicate consumer) { + this.size = shape.getNumberOfBits(); + this.consumer = consumer; + if (BitMap.numberOfBitMaps(shape.getNumberOfBits()) * Long.BYTES < (long) shape.getNumberOfHashFunctions() + * Integer.BYTES) { + this.tracker = new BitMapTracker(shape); + } else { + this.tracker = new ArrayTracker(shape); + } + } + + /** + * Test if the number should be processed by the {@code consumer}. + * + *

If the number has not been seen before it is passed to the {@code consumer} and the result returned. + * If the number has been seen before the {@code consumer} is not called and {@code true} returned.

+ * + *

If the input is not in the range [0,size) an {@code IndexOutOfBoundsException} is thrown.

+ * + * @param number the number to check. + * @return {@code true} if processing should continue, {@code false} otherwise. + */ + @Override + public boolean test(int number) { + if (number >= size) { + throw new IndexOutOfBoundsException(String.format("number too large %d >= %d", number, size)); + } + return tracker.test(number) ? consumer.test(number) : true; + } + + /** + * An IndexTracker implementation that uses an array of integers to track whether or not a + * number has been seen. Suitable for Shapes that have few hash functions. + * @since 4.5 + */ + static class ArrayTracker implements IntPredicate { + private int[] seen; + private int populated; + + /** + * Constructs the tracker based on the shape. + * @param shape the shape to build the tracker for. + */ + ArrayTracker(Shape shape) { + seen = new int[shape.getNumberOfHashFunctions()]; + } + + @Override + public boolean test(int number) { + if (number < 0) { + throw new IndexOutOfBoundsException("number may not be less than zero. " + number); + } + for (int i = 0; i < populated; i++) { + if (seen[i] == number) { + return false; + } + } + seen[populated++] = number; + return true; + } + } + + /** + * An IndexTracker implementation that uses an array of bit maps to track whether or not a + * number has been seen. + * @since 4.5 + */ + static class BitMapTracker implements IntPredicate { + private long[] bits; + + /** + * Constructs a bit map based tracker for the specified shape. + * @param shape The shape that is being generated. 
+ */ + BitMapTracker(Shape shape) { + bits = new long[BitMap.numberOfBitMaps(shape.getNumberOfBits())]; + } + + @Override + public boolean test(int number) { + boolean retval = !BitMap.contains(bits, number); + BitMap.set(bits, number); + return retval; + } + } + } +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/HasherBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/HasherBloomFilter.java deleted file mode 100644 index 1ae2b79d1e..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/HasherBloomFilter.java +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.commons.collections4.bloomfilter; - -import java.util.Arrays; -import java.util.Set; -import java.util.TreeSet; -import java.util.PrimitiveIterator.OfInt; -import java.util.function.IntConsumer; - -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; -import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher; -import org.apache.commons.collections4.iterators.EmptyIterator; -import org.apache.commons.collections4.iterators.IteratorChain; - -/** - * A Bloom filter built on a single hasher. This filter type should only be used for small - * filters (few on bits). While this implementation correctly supports the merge() methods - * it is recommended that if merges are expected that one of the other Bloom filter - * implementations be used. - * @since 4.5 - */ -public class HasherBloomFilter extends AbstractBloomFilter { - /** The bit representation for an empty Bloom filter. */ - private static final long[] EMPTY = {}; - - /** - * The internal hasher representation. - */ - private StaticHasher hasher; - - /** - * Constructs a HasherBloomFilter from a hasher and a shape. - * - * @param hasher the hasher to use. - * @param shape the shape of the Bloom filter. - */ - public HasherBloomFilter(final Hasher hasher, final Shape shape) { - super(shape); - verifyHasher(hasher); - if (hasher instanceof StaticHasher) { - this.hasher = (StaticHasher) hasher; - verifyShape(this.hasher.getShape()); - } else { - this.hasher = new StaticHasher(hasher, shape); - } - } - - /** - * Constructs an empty HasherBloomFilter from a shape. - * - * @param shape the shape of the Bloom filter. 
- */ - public HasherBloomFilter(final Shape shape) { - super(shape); - this.hasher = new StaticHasher(EmptyIterator.emptyIterator(), shape); - } - - @Override - public int cardinality() { - return hasher.size(); - } - - @Override - public boolean contains(final Hasher hasher) { - verifyHasher(hasher); - final Set set = new TreeSet<>(); - hasher.iterator(getShape()).forEachRemaining((IntConsumer) set::add); - final OfInt iter = this.hasher.iterator(getShape()); - while (iter.hasNext()) { - final int idx = iter.nextInt(); - set.remove(idx); - if (set.isEmpty()) { - return true; - } - } - return false; - } - - @Override - public long[] getBits() { - if (hasher.isEmpty()) { - return EMPTY; - } - - // Note: This can be simplified if the StaticHasher exposed a getMaxIndex() - // method. Since it maintains an ordered list of unique indices the maximum - // is the last value in the iterator. Knowing this value would allow - // exact allocation of the long[]. - // For now we assume that the long[] will have a positive length and at least - // 1 bit set in the entire array. - - final int n = (int) Math.ceil(hasher.getShape().getNumberOfBits() * (1.0 / Long.SIZE)); - final long[] result = new long[n]; - final OfInt iter = hasher.iterator(hasher.getShape()); - iter.forEachRemaining((IntConsumer) idx -> { - BloomFilterIndexer.checkPositive(idx); - final int buffIdx = BloomFilterIndexer.getLongIndex(idx); - final long buffOffset = BloomFilterIndexer.getLongBit(idx); - result[buffIdx] |= buffOffset; - }); - - int limit = result.length; - - // Assume the array has a non-zero length and at least 1 bit set. - // This is tested using assertions. - assert limit > 0 : "Number of bits in Shape is 0"; - while (result[limit - 1] == 0) { - limit--; - // If the hasher was not empty it is not possible to return - // an array of length zero. 
- assert limit > 0 : "Hasher reported a non-zero size but has no indices"; - } - if (limit < result.length) { - return Arrays.copyOf(result, limit); - } - return result; - } - - @Override - public StaticHasher getHasher() { - return hasher; - } - - @Override - public boolean merge(final BloomFilter other) { - return merge(other.getHasher()); - } - - @Override - public boolean merge(final Hasher hasher) { - verifyHasher(hasher); - final IteratorChain iter = new IteratorChain<>(this.hasher.iterator(getShape()), - hasher.iterator(getShape())); - this.hasher = new StaticHasher(iter, getShape()); - return true; - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/HasherCollection.java b/src/main/java/org/apache/commons/collections4/bloomfilter/HasherCollection.java new file mode 100644 index 0000000000..91aa43b1ef --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/HasherCollection.java @@ -0,0 +1,165 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Objects; +import java.util.function.IntPredicate; + +/** + * A collection of Hashers. Useful when the generation of a Bloom filter depends upon + * multiple items. + *

+ * Hashers for each item are added to the HasherCollection and then + * the collection is used wherever a Hasher can be used in the API. + *

+ * @since 4.5 + */ +public class HasherCollection implements Hasher { + + /** + * The list of hashers to be used to generate the indices. + */ + private final List hashers; + + /** + * Constructs an empty HasherCollection. + */ + public HasherCollection() { + this.hashers = new ArrayList<>(); + } + + /** + * Constructs a HasherCollection from a collection of Hasher objects. + * + * @param hashers A collections of Hashers to build the indices with. + */ + public HasherCollection(final Collection hashers) { + Objects.requireNonNull(hashers, "hashers"); + this.hashers = new ArrayList<>(hashers); + } + + /** + * Constructor. + * + * @param hashers A list of Hashers to initialize the collection with. + */ + public HasherCollection(Hasher... hashers) { + this(Arrays.asList(hashers)); + } + + /** + * Adds a hasher to the collection. + * @param hasher The hasher to add. + */ + public void add(Hasher hasher) { + Objects.requireNonNull(hasher, "hasher"); + hashers.add(hasher); + } + + /** + * Add all the Hashers in a collection to this HasherCollection. + * @param hashers The hashers to add. + */ + public void add(Collection hashers) { + Objects.requireNonNull(hashers, "hashers"); + this.hashers.addAll(hashers); + } + + @Override + public IndexProducer indices(final Shape shape) { + Objects.requireNonNull(shape, "shape"); + return new HasherCollectionIndexProducer(shape); + } + + @Override + public IndexProducer uniqueIndices(final Shape shape) { + Objects.requireNonNull(shape, "shape"); + return new HasherCollectionIndexProducer(shape) { + @Override + public boolean forEachIndex(IntPredicate consumer) { + for (Hasher hasher : hashers) { + if (!hasher.uniqueIndices(shape).forEachIndex(consumer)) { + return false; + } + } + return true; + } + }; + } + + /** + * Allow child classes access to the hashers. 
+ * @return hashers + */ + protected List getHashers() { + return Collections.unmodifiableList(hashers); + } + + /** + * IndexProducer that will return duplicates from the collection. + * + */ + class HasherCollectionIndexProducer implements IndexProducer { + private final Shape shape; + + HasherCollectionIndexProducer(Shape shape) { + this.shape = shape; + } + + @Override + public boolean forEachIndex(IntPredicate consumer) { + for (Hasher hasher : hashers) { + if (!hasher.indices(shape).forEachIndex(consumer)) { + return false; + } + } + return true; + } + + @Override + public int[] asIndexArray() { + List lst = new ArrayList<>(); + int[] count = new int[1]; + /* + * This method needs to return duplicate indices + */ + for (Hasher hasher : hashers) { + int[] ary = hasher.indices(shape).asIndexArray(); + lst.add(ary); + count[0] += ary.length; + } + if (lst.isEmpty()) { + return new int[0]; + } + if (lst.size() == 1) { + return lst.get(0); + } + int[] result = new int[count[0]]; + int offset = 0; + for (int[] ary : lst) { + System.arraycopy(ary, 0, result, offset, ary.length); + offset += ary.length; + } + return result; + } + } +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexFilters.java b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexFilters.java deleted file mode 100644 index e4adb4fc66..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexFilters.java +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter; - -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; - -import java.util.Objects; -import java.util.Set; -import java.util.TreeSet; -import java.util.function.Consumer; -import java.util.function.IntConsumer; - -/** - * Contains functions to filter indexes. - */ -final class IndexFilters { - /** Do not instantiate. */ - private IndexFilters() { - } - - /** - * Transfer all distinct indexes in the specified {@code hasher} generated for the - * specified {@code shape} to the specified {@code consumer}. For example this - * can be used to merge a {@link Hasher} representation of a Bloom filter into a - * {@link BloomFilter} instance that does not naturally handle duplicate indexes. - * - *

This method is functionally equivalent to: - * - *

-     *     final Set<Integer> distinct = new TreeSet<>();
-     *     hasher.iterator(shape).forEachRemaining((Consumer<Integer>) i -> {
-     *         if (distinct.add(i)) {
-     *             consumer.accept(i);
-     *         }
-     *     });
-     * 
- * - * @param hasher the hasher - * @param shape the shape - * @param consumer the consumer to receive distinct indexes - * @throws NullPointerException if the hasher, shape or action are null - * @see Hasher#iterator(Shape) - */ - static void distinctIndexes(final Hasher hasher, final Shape shape, final IntConsumer consumer) { - Objects.requireNonNull(hasher, "hasher"); - Objects.requireNonNull(shape, "shape"); - Objects.requireNonNull(consumer, "consumer"); - - // TODO - // This function can be optimised based on the expected size - // (number of indexes) of the hasher and the number of bits in the shape. - // - // A large size would benefit from a pre-allocated BitSet-type filter. - // A very small size may be more efficient as a simple array of values - // that have already been seen that is scanned for each new index. - // - // A default is to use a Set to filter distinct values. The choice of set - // should be evaluated. A HashSet would be optimal if size is known. - // A TreeSet has lower memory consumption and performance is not as - // sensitive to knowing the size in advance. - - final Set distinct = new TreeSet<>(); - hasher.iterator(shape).forEachRemaining((Consumer) i -> { - if (distinct.add(i)) { - consumer.accept(i); - } - }); - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java new file mode 100644 index 0000000000..ca6ac6e8cd --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.BitSet; +import java.util.Objects; +import java.util.function.IntPredicate; +import java.util.function.LongPredicate; + +/** + * An object that produces indices of a Bloom filter. + *

+ * The default implementation of {@code asIndexArray} is slow. Implementers should reimplement the + * method where possible.

+ * + * @since 4.5 + */ +@FunctionalInterface +public interface IndexProducer { + + /** + * Each index is passed to the predicate. The predicate is applied to each + * index value, if the predicate returns {@code false} the execution is stopped, {@code false} + * is returned, and no further indices are processed. + * + *

Any exceptions thrown by the action are relayed to the caller.

+ * + *

Index ordering is not guaranteed.

+ * + * @param predicate the action to be performed for each non-zero bit index. + * @return {@code true} if all indexes return true from consumer, {@code false} otherwise. + * @throws NullPointerException if the specified action is null + */ + boolean forEachIndex(IntPredicate predicate); + + /** + * Creates an IndexProducer from an array of integers. + * @param values the index values + * @return an IndexProducer that uses the values. + */ + static IndexProducer fromIndexArray(final int... values) { + return new IndexProducer() { + + @Override + public boolean forEachIndex(IntPredicate predicate) { + for (int value : values) { + if (!predicate.test(value)) { + return false; + } + } + return true; + } + }; + } + + /** + * Creates an IndexProducer from a {@code BitMapProducer}. + * @param producer the {@code BitMapProducer} + * @return a new {@code IndexProducer}. + */ + static IndexProducer fromBitMapProducer(BitMapProducer producer) { + Objects.requireNonNull(producer, "producer"); + return new IndexProducer() { + @Override + public boolean forEachIndex(IntPredicate consumer) { + LongPredicate longPredicate = new LongPredicate() { + int wordIdx = 0; + + @Override + public boolean test(long word) { + int i = wordIdx; + while (word != 0) { + if ((word & 1) == 1) { + if (!consumer.test(i)) { + return false; + } + } + word >>>= 1; + i++; + } + wordIdx += 64; + return true; + } + }; + return producer.forEachBitMap(longPredicate::test); + } + }; + } + + /** + * Return a copy of the IndexProducer data as an int array. + *

+ * The default implementation of this method is slow. It is recommended + * that implementing classes reimplement this method. + *

+ * @return An int array of the data. + */ + default int[] asIndexArray() { + BitSet result = new BitSet(); + forEachIndex(i -> { + result.set(i); + return true; + }); + return result.stream().toArray(); + } +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/LongBiPredicate.java b/src/main/java/org/apache/commons/collections4/bloomfilter/LongBiPredicate.java new file mode 100644 index 0000000000..9b45a09afb --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/LongBiPredicate.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +/** + * Represents a function that accepts a two long-valued argument and produces a binary result. + * This is the long-consuming primitive specialization for {@code BiPredicate}. + * + * This is a functional interface whose functional method is {@code test(long,long)}. + * + * @since 4.5 + */ +@FunctionalInterface +public interface LongBiPredicate { + + /** + * A function that takes to long arguments and returns a boolean. + * @param x the first long argument. + * @param y the second long argument. + * @return true or false. 
+ */ + boolean test(long x, long y); +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java b/src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java index 48c43620ad..fa28559715 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java @@ -16,147 +16,176 @@ */ package org.apache.commons.collections4.bloomfilter; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; +import java.util.function.LongBinaryOperator; /** - * Implementations of set operations on Bloom filters. + * Implementations of set operations on BitMapProducers. * + * @since 4.5 */ public final class SetOperations { /** - * Calculates the Cosine distance between two Bloom filters. + * Calculates the cardinality of the result of a LongBinaryOperator using the + * {@code BitMapProducer.makePredicate} method. + * @param first the first BitMapProducer + * @param second the second BitMapProducer + * @param op a long binary operation on where x = {@code first} and y = {@code second} bitmap producers. + * @return the calculated cardinality. + */ + private static int cardinality(BitMapProducer first, BitMapProducer second, LongBinaryOperator op) { + int[] cardinality = new int[1]; + + first.forEachBitMapPair(second, (x, y) -> { + cardinality[0] += Long.bitCount(op.applyAsLong(x, y)); + return true; + }); + return cardinality[0]; + } + + /** + * Calculates the cardinality of a BitMapProducer. By necessity this method will visit each bit map + * created by the producer. + * @param producer the Producer to calculate the cardinality for. + * @return the cardinality of the bit maps produced by the producer. 
+ */ + public static int cardinality(BitMapProducer producer) { + int[] cardinality = new int[1]; + producer.forEachBitMap(l -> { + cardinality[0] += Long.bitCount(l); + return true; + }); + return cardinality[0]; + } + + /** + * Calculates the cardinality of the logical {@code AND} of the bit maps for the two filters. + * @param first the first BitMapProducer. + * @param second the second BitMapProducer + * @return the cardinality of the {@code AND} of the filters. + */ + public static int andCardinality(final BitMapProducer first, final BitMapProducer second) { + return cardinality(first, second, (x, y) -> x & y); + } + + /** + * Calculates the cardinality of the logical {@code OR} of the bit maps for the two filters. + * @param first the first BitMapProducer. + * @param second the second BitMapProducer + * @return the cardinality of the {@code OR} of the filters. + */ + public static int orCardinality(final BitMapProducer first, final BitMapProducer second) { + return cardinality(first, second, (x, y) -> x | y); + } + + /** + * Calculates the cardinality of the logical {@code XOR} of the bit maps for the two filters. + * @param first the first BitMapProducer. + * @param second the second BitMapProducer + * @return the cardinality of the {@code XOR} of the filters. + */ + public static int xorCardinality(final BitMapProducer first, final BitMapProducer second) { + return cardinality(first, second, (x, y) -> x ^ y); + } + + /** + * Calculates the Cosine distance between two BitMapProducer. * *

Cosine distance is defined as {@code 1 - Cosine similarity}

* - * @param first the first Bloom filter. - * @param second the second Bloom filter. + * @param first the first BitMapProducer. + * @param second the second BitMapProducer. * @return the jaccard distance. */ - public static double cosineDistance(final BloomFilter first, final BloomFilter second) { + public static double cosineDistance(final BitMapProducer first, final BitMapProducer second) { return 1.0 - cosineSimilarity(first, second); } /** - * Calculates the Cosine similarity between two Bloom filters. + * Calculates the Cosine similarity between two BitMapProducers. *

Also known as Orchini similarity and the Tucker coefficient of congruence or * Ochiai similarity.

* - *

If either filter is empty (no enabled bits) the result is 0 (zero)

+ *

If either producer is empty the result is 0 (zero)

* - * @param first the first Bloom filter. - * @param second the second Bloom filter. + * @param first the first BitMapProducer. + * @param second the second BitMapProducer. * @return the Cosine similarity. */ - public static double cosineSimilarity(final BloomFilter first, final BloomFilter second) { - verifyShape(first, second); - final int numerator = first.andCardinality(second); - return numerator == 0 ? 0 : numerator / (Math.sqrt(first.cardinality()) * Math.sqrt(second.cardinality())); + public static double cosineSimilarity(final BitMapProducer first, final BitMapProducer second) { + final int numerator = andCardinality(first, second); + // Widen to double before multiplying: the product of two int cardinalities can + // overflow an int but cannot overflow a double, so a single sqrt is safe: + return numerator == 0 ? 0 : numerator / Math.sqrt((double) cardinality(first) * cardinality(second)); } /** - * Estimates the number of items in the intersection of the sets represented by two - * Bloom filters. + * Calculates the Cosine similarity between two Bloom filters. + *

Also known as Orchini similarity and the Tucker coefficient of congruence or + * Ochiai similarity.

* - * @param first the first Bloom filter. - * @param second the second Bloom filter. - * @return an estimate of the size of the intersection between the two filters. - */ - public static long estimateIntersectionSize(final BloomFilter first, final BloomFilter second) { - verifyShape(first, second); - // do subtraction early to avoid Long overflow. - return estimateSize(first) - estimateUnionSize(first, second) + estimateSize(second); - } - - /** - * Estimates the number of items in the Bloom filter based on the shape and the number - * of bits that are enabled. + *

If either filter is empty (no enabled bits) the result is 0 (zero)

* - * @param filter the Bloom filter to estimate size for. - * @return an estimate of the number of items that were placed in the Bloom filter. - */ - public static long estimateSize(final BloomFilter filter) { - final Shape shape = filter.getShape(); - final double estimate = -(shape.getNumberOfBits() * - Math.log(1.0 - filter.cardinality() * 1.0 / shape.getNumberOfBits())) / - shape.getNumberOfHashFunctions(); - return Math.round(estimate); - } - - /** - * Estimates the number of items in the union of the sets represented by two - * Bloom filters. + *

This is a version of cosineSimilarity optimized for Bloom filters.

* * @param first the first Bloom filter. * @param second the second Bloom filter. - * @return an estimate of the size of the union between the two filters. + * @return the Cosine similarity. */ - public static long estimateUnionSize(final BloomFilter first, final BloomFilter second) { - verifyShape(first, second); - final Shape shape = first.getShape(); - final double estimate = -(shape.getNumberOfBits() * - Math.log(1.0 - first.orCardinality(second) * 1.0 / shape.getNumberOfBits())) / - shape.getNumberOfHashFunctions(); - return Math.round(estimate); + public static double cosineSimilarity(final BloomFilter first, final BloomFilter second) { + final int numerator = andCardinality(first, second); + // Given that the cardinality is an int then the product as a double will not + // overflow, we can use one sqrt: + return numerator == 0 ? 0 : numerator / Math.sqrt(first.cardinality() * second.cardinality()); } /** - * Calculates the Hamming distance between two Bloom filters. + * Calculates the Hamming distance between two BitMapProducers. * - * @param first the first Bloom filter. - * @param second the second Bloom filter. + * @param first the first BitMapProducer. + * @param second the second BitMapProducer. * @return the Hamming distance. */ - public static int hammingDistance(final BloomFilter first, final BloomFilter second) { - verifyShape(first, second); - return first.xorCardinality(second); + public static int hammingDistance(final BitMapProducer first, final BitMapProducer second) { + return xorCardinality(first, second); } /** - * Calculates the Jaccard distance between two Bloom filters. + * Calculates the Jaccard distance between two BitMapProducer. * *

Jaccard distance is defined as {@code 1 - Jaccard similarity}

* - * @param first the first Bloom filter. - * @param second the second Bloom filter. + * @param first the first BitMapProducer. + * @param second the second BitMapProducer. * @return the Jaccard distance. */ - public static double jaccardDistance(final BloomFilter first, final BloomFilter second) { + public static double jaccardDistance(final BitMapProducer first, final BitMapProducer second) { return 1.0 - jaccardSimilarity(first, second); } /** - * Calculates the Jaccard similarity between two Bloom filters. + * Calculates the Jaccard similarity between two BitMapProducer. * *

Also known as Jaccard index, Intersection over Union, and Jaccard similarity coefficient

* - * @param first the first Bloom filter. - * @param second the second Bloom filter. + * @param first the first BitMapProducer. + * @param second the second BitMapProducer. * @return the Jaccard similarity. */ - public static double jaccardSimilarity(final BloomFilter first, final BloomFilter second) { - verifyShape(first, second); - final int orCard = first.orCardinality(second); - // if the orCard is zero then the hamming distance will also be zero. - return orCard == 0 ? 0 : hammingDistance(first, second) / (double) orCard; - } - - /** - * Verifies the Bloom filters have the same shape. - * - * @param first the first filter to check. - * @param second the second filter to check. - * @throws IllegalArgumentException if the shapes are not the same. - */ - private static void verifyShape(final BloomFilter first, final BloomFilter second) { - if (!first.getShape().equals(second.getShape())) { - throw new IllegalArgumentException(String.format("Shape %s is not the same as %s", - first.getShape(), second.getShape())); - } + public static double jaccardSimilarity(final BitMapProducer first, final BitMapProducer second) { + int[] cardinality = new int[2]; + first.forEachBitMapPair(second, (x, y) -> { + cardinality[0] += Long.bitCount(x & y); + cardinality[1] += Long.bitCount(x | y); + return true; + }); + final int intersection = cardinality[0]; + return intersection == 0 ? 0 : intersection / (double) cardinality[1]; } /** * Do not instantiate. 
*/ - private SetOperations() {} + private SetOperations() { + } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Shape.java b/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java similarity index 59% rename from src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Shape.java rename to src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java index a82586fe4e..40db56516a 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Shape.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.commons.collections4.bloomfilter.hasher; +package org.apache.commons.collections4.bloomfilter; import java.util.Objects; @@ -27,23 +27,23 @@ * *

Interrelatedness of values

* - *
Number of Items ({@code n})
- *
{@code n = ceil(m / (-k / ln(1 - exp(ln(p) / k))))}
Probability of - * False Positives ({@code p})
{@code p = pow(1 - exp(-k / (m / n)), k)}
Number - * of Bits ({@code m})
- *
{@code m = ceil((n * ln(p)) / ln(1 / pow(2, ln(2))))}
Number of - * Functions ({@code k})
{@code k = round((m / n) * ln(2))}
+ *
+ *
Number of Items ({@code n})
+ *
{@code n = ceil(m / (-k / ln(1 - exp(ln(p) / k))))}
+ *
Probability of False Positives ({@code p})
+ *
{@code p = pow(1 - exp(-k / (m / n)), k)}
+ *
Number of Bits ({@code m})
+ *
{@code m = ceil((n * ln(p)) / ln(1 / pow(2, ln(2))))}
+ *
Number of Functions ({@code k})
+ *
{@code k = round((m / n) * ln(2))}
+ *
* - *

Comparisons

For purposes of equality checking and hashCode - * calculations a {@code Shape} is defined by the hashing function identity, the number of - * bits ({@code m}), and the number of functions ({@code k}).

- * - * @see Bloom Filter calculator + * @see Bloom Filter calculator * @see Bloom filter * [Wikipedia] * @since 4.5 */ -public final class Shape { +public final class Shape implements Comparable { /** * The natural logarithm of 2. Used in several calculations. Approximately 0.693147180559945. @@ -58,9 +58,9 @@ public final class Shape { private static final double DENOMINATOR = -LN_2 * LN_2; /** - * Number of items in the filter ({@code n}). + * Number of hash functions to create a filter ({@code k}). */ - private final int numberOfItems; + private final int numberOfHashFunctions; /** * Number of bits in the filter ({@code m}). @@ -68,19 +68,137 @@ public final class Shape { private final int numberOfBits; /** - * Number of hash functions ({@code k}). + * Constructs a filter configuration with the specified number of hashFunctions ({@code k}) and + * bits ({@code m}). + * + * @param numberOfHashFunctions Number of hash functions to use for each item placed in the filter. + * @param numberOfBits The number of bits in the filter + * @throws IllegalArgumentException if {@code numberOfHashFunctions < 1} or {@code numberOfBits < 1} */ - private final int numberOfHashFunctions; + private Shape(final int numberOfHashFunctions, final int numberOfBits) { + this.numberOfHashFunctions = checkNumberOfHashFunctions(numberOfHashFunctions); + this.numberOfBits = checkNumberOfBits(numberOfBits); + } + + @Override + public int compareTo(Shape other) { + int i = Integer.compare(numberOfBits, other.numberOfBits); + return i == 0 ? Integer.compare(numberOfHashFunctions, other.numberOfHashFunctions) : i; + } + + @Override + public boolean equals(final Object o) { + return (o instanceof Shape) ? compareTo((Shape) o) == 0 : false; + } + + @Override + public int hashCode() { + return Objects.hash(numberOfBits, numberOfHashFunctions); + } + + /** + * Gets the number of bits in the Bloom filter. + * This is also known as {@code m}. 
+ * + * @return the number of bits in the Bloom filter ({@code m}). + */ + public int getNumberOfBits() { + return numberOfBits; + } + + /** + * Gets the number of hash functions used to construct the filter. + * This is also known as {@code k}. + * + * @return the number of hash functions used to construct the filter ({@code k}). + */ + public int getNumberOfHashFunctions() { + return numberOfHashFunctions; + } + + /** + * Calculates the probability of false positives ({@code p}) given + * numberOfItems ({@code n}), numberOfBits ({@code m}) and numberOfHashFunctions ({@code k}). + *
p = pow(1 - exp(-k / (m / n)), k)
+ * + *

This is the probability that a Bloom filter will return true for the presence of an item + * when it does not contain the item.

+ * + *

The probability assumes that the Bloom filter is filled with the specified number of + * items. If the filter contains fewer items then the actual probability will be lower. + * Thus, this returns the worst-case false positive probability for a filter that has not + * exceeded the specified number of items.

+ * + * @param numberOfItems the number of items hashed into the Bloom filter. + * @return the probability of false positives. + */ + public double getProbability(int numberOfItems) { + if (numberOfItems < 0) { + throw new IllegalArgumentException("Number of items must be greater than or equal to 0: " + numberOfItems); + } + if (numberOfItems == 0) { + return 0; + } + return Math.pow(-Math.expm1(-1.0 * numberOfHashFunctions * numberOfItems / numberOfBits), + numberOfHashFunctions); + } + + @Override + public String toString() { + return String.format("Shape[k=%s m=%s]", numberOfHashFunctions, numberOfBits); + } + + /** + * Determines if a cardinality is sparse based on the shape. + *

This method assumes that bit maps are 64 bits and indexes are 32 bits. If the memory + * necessary to store the cardinality as indexes is less than or equal to the estimated memory for bit maps, + * the cardinality is determined to be {@code sparse}.

+ * @param cardinality the cardinality to check. + * @return true if the cardinality is sparse within the shape. + */ + public boolean isSparse(int cardinality) { + /* + * Since the size of a bit map is a long and the size of an index is an int, + * there can be 2 indexes for each bit map. In Bloom filters indexes are evenly + * distributed across the range of possible values, Thus if the cardinality + * (number of indexes) is less than or equal to 2*number of bit maps the + * cardinality is sparse within the shape. + */ + return cardinality <= (BitMap.numberOfBitMaps(getNumberOfBits()) * 2); + } /** - * The hash code for this filter. + * Estimate the number of items in a Bloom filter with this shape and the specified number of bits enabled. + * + *

Note:

+ *
    + *
  • if cardinality == numberOfBits, then result is infinity.
  • + *
  • if cardinality > numberOfBits, then result is NaN.
  • + *
+ * + * @param cardinality the number of enabled bits also known as the hamming value. + * @return An estimate of the number of items in the Bloom filter. */ - private final int hashCode; + public double estimateN(int cardinality) { + double c = cardinality; + double m = numberOfBits; + double k = numberOfHashFunctions; + return -(m / k) * Math.log1p(-c / m); + } /** - * The identity of the hasher function. + * The factory to assist in the creation of proper Shapes. + * + * In the methods of this factory the `from` names are appended with the standard variable + * names in the order expected: + * + *
+ *
{@code N}
The number of items to be placed in the Bloom filter
+ *
{@code M}
The number of bits in the Bloom filter
+ *
{@code K}
The number of hash functions for each item placed in the Bloom filter
+ *
{@code P}
The probability of a collision once N items have been placed in the Bloom filter
+ *
*/ - private final HashFunctionIdentity hashFunctionIdentity; /** * Constructs a filter configuration with a desired false-positive probability ({@code p}) and the @@ -94,27 +212,23 @@ public final class Shape { * (number of items). An exception is raised if this is greater than or equal to 1 (i.e. the * shape is invalid for use as a Bloom filter). * - * @param hashFunctionIdentity The identity of the hash function this shape uses * @param probability The desired false-positive probability in the range {@code (0, 1)} * @param numberOfBits The number of bits in the filter * @param numberOfHashFunctions The number of hash functions in the filter - * @throws NullPointerException if the hash function identity is null - * @throws IllegalArgumentException if the desired probability is not in the range {@code (0, 1)}; - * if {@code numberOfBits < 1}; if {@code numberOfHashFunctions < 1}; or if the actual + * @return a valid Shape. + * @throws IllegalArgumentException if the desired probability is not in the range {@code (0, 1)}, + * {@code numberOfBits < 1}, {@code numberOfHashFunctions < 1}, or the actual * probability is {@code >= 1.0} - * @see #getProbability() */ - public Shape(final HashFunctionIdentity hashFunctionIdentity, final double probability, final int numberOfBits, - final int numberOfHashFunctions) { - this.hashFunctionIdentity = Objects.requireNonNull(hashFunctionIdentity, "hashFunctionIdentity"); + public static Shape fromPMK(final double probability, final int numberOfBits, final int numberOfHashFunctions) { checkProbability(probability); - this.numberOfBits = checkNumberOfBits(numberOfBits); - this.numberOfHashFunctions = checkNumberOfHashFunctions(numberOfHashFunctions); + checkNumberOfBits(numberOfBits); + checkNumberOfHashFunctions(numberOfHashFunctions); // Number of items (n): // n = ceil(m / (-k / ln(1 - exp(ln(p) / k)))) - final double n = Math.ceil(numberOfBits / - (-numberOfHashFunctions / Math.log(1 - Math.exp(Math.log(probability) / 
numberOfHashFunctions)))); + final double n = Math.ceil(numberOfBits + / (-numberOfHashFunctions / Math.log(-Math.expm1(Math.log(probability) / numberOfHashFunctions)))); // log of probability is always < 0 // number of hash functions is >= 1 @@ -126,10 +240,11 @@ public Shape(final HashFunctionIdentity hashFunctionIdentity, final double proba // // similarly we can not produce a number greater than numberOfBits so we // do not have to check for Integer.MAX_VALUE either. - this.numberOfItems = (int) n; + + Shape shape = new Shape(numberOfHashFunctions, numberOfBits); // check that probability is within range - checkCalculatedProbability(getProbability()); - this.hashCode = generateHashCode(); + checkCalculatedProbability(shape.getProbability((int) n)); + return shape; } /** @@ -147,17 +262,14 @@ public Shape(final HashFunctionIdentity hashFunctionIdentity, final double proba * functions. An exception is raised if this is greater than or equal to 1 (i.e. the * shape is invalid for use as a Bloom filter). * - * @param hashFunctionIdentity The identity of the hash function this shape uses * @param numberOfItems Number of items to be placed in the filter * @param probability The desired false-positive probability in the range {@code (0, 1)} - * @throws NullPointerException if the hash function identity is null - * @throws IllegalArgumentException if {@code numberOfItems < 1}; if the desired probability - * is not in the range {@code (0, 1)}; or if the actual probability is {@code >= 1.0} - * @see #getProbability() + * @return a valid Shape + * @throws IllegalArgumentException if {@code numberOfItems < 1}, if the desired probability + * is not in the range {@code (0, 1)} or if the actual probability is {@code >= 1.0}. 
*/ - public Shape(final HashFunctionIdentity hashFunctionIdentity, final int numberOfItems, final double probability) { - this.hashFunctionIdentity = Objects.requireNonNull(hashFunctionIdentity, "hashFunctionIdentity"); - this.numberOfItems = checkNumberOfItems(numberOfItems); + public static Shape fromNP(final int numberOfItems, final double probability) { + checkNumberOfItems(numberOfItems); checkProbability(probability); // Number of bits (m) @@ -165,12 +277,26 @@ public Shape(final HashFunctionIdentity hashFunctionIdentity, final int numberOf if (m > Integer.MAX_VALUE) { throw new IllegalArgumentException("Resulting filter has more than " + Integer.MAX_VALUE + " bits: " + m); } - this.numberOfBits = (int) m; + int numberOfBits = (int) m; - this.numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits); + int numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits); + Shape shape = new Shape(numberOfHashFunctions, numberOfBits); // check that probability is within range - checkCalculatedProbability(getProbability()); - this.hashCode = generateHashCode(); + checkCalculatedProbability(shape.getProbability(numberOfItems)); + return shape; + } + + /** + * Constructs a filter configuration with the specified number of hashFunctions ({@code k}) and + * bits ({@code m}). + * + * @param numberOfHashFunctions Number of hash functions to use for each item placed in the filter. + * @param numberOfBits The number of bits in the filter + * @return a valid Shape. + * @throws IllegalArgumentException if {@code numberOfHashFunctions < 1} or {@code numberOfBits < 1} + */ + public static Shape fromKM(final int numberOfHashFunctions, final int numberOfBits) { + return new Shape(numberOfHashFunctions, numberOfBits); } /** @@ -184,23 +310,20 @@ public Shape(final HashFunctionIdentity hashFunctionIdentity, final int numberOf * functions. An exception is raised if this is greater than or equal to 1 (i.e. 
the * shape is invalid for use as a Bloom filter). * - * @param hashFunctionIdentity The identity of the hash function this shape uses * @param numberOfItems Number of items to be placed in the filter * @param numberOfBits The number of bits in the filter - * @throws NullPointerException if the hash function identity is null - * @throws IllegalArgumentException if {@code numberOfItems < 1}; if {@code numberOfBits < 1}; - * if the calculated number of hash function is {@code < 1}; - * or if the actual probability is {@code >= 1.0} - * @see #getProbability() + * @return a valid Shape. + * @throws IllegalArgumentException if {@code numberOfItems < 1}, {@code numberOfBits < 1}, + * the calculated number of hash function is {@code < 1}, or if the actual probability is {@code >= 1.0} */ - public Shape(final HashFunctionIdentity hashFunctionIdentity, final int numberOfItems, final int numberOfBits) { - this.hashFunctionIdentity = Objects.requireNonNull(hashFunctionIdentity, "hashFunctionIdentity"); - this.numberOfItems = checkNumberOfItems(numberOfItems); - this.numberOfBits = checkNumberOfBits(numberOfBits); - this.numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits); + public static Shape fromNM(final int numberOfItems, final int numberOfBits) { + checkNumberOfItems(numberOfItems); + checkNumberOfBits(numberOfBits); + int numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits); + Shape shape = new Shape(numberOfHashFunctions, numberOfBits); // check that probability is within range - checkCalculatedProbability(getProbability()); - this.hashCode = generateHashCode(); + checkCalculatedProbability(shape.getProbability(numberOfItems)); + return shape; } /** @@ -211,24 +334,22 @@ public Shape(final HashFunctionIdentity hashFunctionIdentity, final int numberOf * functions. An exception is raised if this is greater than or equal to 1 (i.e. the * shape is invalid for use as a Bloom filter). 
* - * @param hashFunctionIdentity The identity of the hash function this shape uses * @param numberOfItems Number of items to be placed in the filter * @param numberOfBits The number of bits in the filter. * @param numberOfHashFunctions The number of hash functions in the filter - * @throws NullPointerException if the hash function identity is null - * @throws IllegalArgumentException if {@code numberOfItems < 1}; if {@code numberOfBits < 1}; - * if {@code numberOfHashFunctions < 1}; or if the actual probability is {@code >= 1.0} - * @see #getProbability() + * @return a valid Shape. + * @throws IllegalArgumentException if {@code numberOfItems < 1}, {@code numberOfBits < 1}, + * {@code numberOfHashFunctions < 1}, or if the actual probability is {@code >= 1.0}. */ - public Shape(final HashFunctionIdentity hashFunctionIdentity, final int numberOfItems, final int numberOfBits, - final int numberOfHashFunctions) { - this.hashFunctionIdentity = Objects.requireNonNull(hashFunctionIdentity, "hashFunctionIdentity"); - this.numberOfItems = checkNumberOfItems(numberOfItems); - this.numberOfBits = checkNumberOfBits(numberOfBits); - this.numberOfHashFunctions = checkNumberOfHashFunctions(numberOfHashFunctions); + public static Shape fromNMK(final int numberOfItems, final int numberOfBits, final int numberOfHashFunctions) { + checkNumberOfItems(numberOfItems); + checkNumberOfBits(numberOfBits); + checkNumberOfHashFunctions(numberOfHashFunctions); + // check that probability is within range + Shape shape = new Shape(numberOfHashFunctions, numberOfBits); // check that probability is within range - checkCalculatedProbability(getProbability()); - this.hashCode = generateHashCode(); + checkCalculatedProbability(shape.getProbability(numberOfItems)); + return shape; } /** @@ -236,7 +357,7 @@ public Shape(final HashFunctionIdentity hashFunctionIdentity, final int numberOf * * @param numberOfItems the number of items * @return the number of items - * @throws IllegalArgumentException if 
the number of items is {@code < 1} + * @throws IllegalArgumentException if the number of items is {@code < 1}. */ private static int checkNumberOfItems(final int numberOfItems) { if (numberOfItems < 1) { @@ -250,7 +371,7 @@ private static int checkNumberOfItems(final int numberOfItems) { * * @param numberOfBits the number of bits * @return the number of bits - * @throws IllegalArgumentException if the number of bits is {@code < 1} + * @throws IllegalArgumentException if the number of bits is {@code < 1}. */ private static int checkNumberOfBits(final int numberOfBits) { if (numberOfBits < 1) { @@ -260,15 +381,16 @@ private static int checkNumberOfBits(final int numberOfBits) { } /** - * Check number of hash functions is strictly positive + * Check number of hash functions is strictly positive. * * @param numberOfHashFunctions the number of hash functions * @return the number of hash functions - * @throws IllegalArgumentException if the number of hash functions is {@code < 1} + * @throws IllegalArgumentException if the number of hash functions is {@code < 1}. */ private static int checkNumberOfHashFunctions(final int numberOfHashFunctions) { if (numberOfHashFunctions < 1) { - throw new IllegalArgumentException("Number of hash functions must be greater than 0: " + numberOfHashFunctions); + throw new IllegalArgumentException( + "Number of hash functions must be greater than 0: " + numberOfHashFunctions); } return numberOfHashFunctions; } @@ -294,7 +416,7 @@ private static void checkProbability(final double probability) { * construction. * * @param probability the probability - * @throws IllegalArgumentException if the probability is {@code >= 1.0} + * @throws IllegalArgumentException if the probability is {@code >= 1.0}. 
*/ private static void checkCalculatedProbability(final double probability) { // We do not need to check for p <= 0.0 since we only allow positive values for @@ -303,7 +425,7 @@ private static void checkCalculatedProbability(final double probability) { // always be 0<1 and y>0 if (probability >= 1.0) { throw new IllegalArgumentException( - String.format("Calculated probability is greater than or equal to 1: " + probability)); + String.format("Calculated probability is greater than or equal to 1: " + probability)); } } @@ -322,7 +444,7 @@ private static int calculateNumberOfHashFunctions(final int numberOfItems, final final long k = Math.round(LN_2 * numberOfBits / numberOfItems); if (k < 1) { throw new IllegalArgumentException( - String.format("Filter too small: Calculated number of hash functions (%s) was less than 1", k)); + String.format("Filter too small: Calculated number of hash functions (%s) was less than 1", k)); } // Normally we would check that numberofHashFunctions <= Integer.MAX_VALUE but // since numberOfBits is at most Integer.MAX_VALUE the numerator of @@ -330,91 +452,4 @@ private static int calculateNumberOfHashFunctions(final int numberOfItems, final // value of k can not be above Integer.MAX_VALUE. return (int) k; } - - @Override - public boolean equals(final Object o) { - if (o instanceof Shape) { - final Shape other = (Shape) o; - return numberOfBits == other.numberOfBits && - numberOfHashFunctions == other.numberOfHashFunctions && - HashFunctionValidator.areEqual(hashFunctionIdentity, - other.hashFunctionIdentity); - } - return false; - } - - @Override - public int hashCode() { - return hashCode; - } - - private int generateHashCode() { - return Objects.hash(numberOfBits, numberOfHashFunctions, HashFunctionValidator.hash(hashFunctionIdentity)); - } - - /** - * Gets the HashFunctionIdentity of the hash function this shape uses. - * @return the HashFunctionIdentity of the hash function this shape uses. 
- */ - public HashFunctionIdentity getHashFunctionIdentity() { - return hashFunctionIdentity; - } - - /** - * Gets the number of bits in the Bloom filter. - * This is also known as {@code m}. - * - * @return the number of bits in the Bloom filter ({@code m}). - */ - public int getNumberOfBits() { - return numberOfBits; - } - - /** - * Gets the number of hash functions used to construct the filter. - * This is also known as {@code k}. - * - * @return the number of hash functions used to construct the filter ({@code k}). - */ - public int getNumberOfHashFunctions() { - return numberOfHashFunctions; - } - - /** - * Gets the number of items that are expected in the filter. - * This is also known as {@code n}. - * - * @return the number of items ({@code n}). - */ - public int getNumberOfItems() { - return numberOfItems; - } - - /** - * Calculates the probability of false positives ({@code p}) given - * numberOfItems ({@code n}), numberOfBits ({@code m}) and numberOfHashFunctions ({@code k}). - *
p = pow(1 - exp(-k / (m / n)), k)
- * - *

This is the probability that a Bloom filter will return true for the presence of an item - * when it does not contain the item. - * - *

The probability assumes that the Bloom filter is filled with the expected number of - * items. If the filter contains fewer items then the actual probability will be lower. - * Thus this returns the worst-case false positive probability for a filter that has not - * exceeded its expected number of items. - * - * @return the probability of false positives. - * @see #getNumberOfItems() - */ - public double getProbability() { - return Math.pow(1.0 - Math.exp(-1.0 * numberOfHashFunctions * numberOfItems / numberOfBits), - numberOfHashFunctions); - } - - @Override - public String toString() { - return String.format("Shape[ %s n=%s m=%s k=%s ]", - HashFunctionIdentity.asCommonString(hashFunctionIdentity), - numberOfItems, numberOfBits, numberOfHashFunctions); - } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java new file mode 100644 index 0000000000..fcdcf36d5b --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java @@ -0,0 +1,246 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.commons.collections4.bloomfilter;
+
+import java.util.Arrays;
+import java.util.Objects;
+import java.util.function.IntPredicate;
+import java.util.function.LongPredicate;
+
+/**
+ * A bloom filter using an array of bit maps to track enabled bits. This is a standard
+ * implementation and should work well for most Bloom filters.
+ * @since 4.5
+ */
+public final class SimpleBloomFilter implements BloomFilter {
+
+    /**
+     * The array of bit map longs that defines this Bloom filter. Allocated by every constructor, so never null.
+     */
+    private final long[] bitMap;
+
+    /**
+     * The Shape of this Bloom filter.
+     */
+    private final Shape shape;
+
+    /**
+     * The cached cardinality of this Bloom filter; a negative value means it must be recomputed.
+     */
+    private int cardinality;
+
+    /**
+     * Creates an empty instance.
+     *
+     * @param shape The shape for the filter.
+     */
+    public SimpleBloomFilter(Shape shape) {
+        Objects.requireNonNull(shape, "shape");
+        this.shape = shape;
+        this.bitMap = new long[BitMap.numberOfBitMaps(shape.getNumberOfBits())];
+        this.cardinality = 0;
+    }
+
+    /**
+     * Creates an instance that is equivalent to {@code other}.
+     *
+     * @param other The bloom filter to copy.
+     */
+    public SimpleBloomFilter(BloomFilter other) {
+        Objects.requireNonNull(other, "other");
+        this.shape = other.getShape();
+        this.bitMap = new long[BitMap.numberOfBitMaps(shape.getNumberOfBits())];
+        this.cardinality = 0;
+        if (other.isSparse()) {
+            mergeInPlace((IndexProducer) other);
+        } else {
+            mergeInPlace((BitMapProducer) other);
+        }
+    }
+
+    /**
+     * Creates a populated instance.
+     * @param shape The shape for the filter.
+     * @param hasher the Hasher to initialize the filter with.
+     */
+    public SimpleBloomFilter(final Shape shape, Hasher hasher) {
+        this(shape);
+        Objects.requireNonNull(hasher, "hasher");
+        mergeInPlace(hasher);
+    }
+
+    /**
+     * Creates a populated instance.
+     * @param shape The shape for the filter.
+     * @param indices the IndexProducer to initialize the filter with.
+     * @throws IllegalArgumentException if producer sends illegal value.
+     */
+    public SimpleBloomFilter(final Shape shape, IndexProducer indices) {
+        this(shape);
+        Objects.requireNonNull(indices, "indices");
+        mergeInPlace(indices);
+    }
+
+    /**
+     * Creates a populated instance.
+     * @param shape The shape for the filter.
+     * @param bitMaps the BitMapProducer to initialize the filter with.
+     * @throws IllegalArgumentException if the producer returns too many bit maps or sets a bit beyond the shape.
+     */
+    public SimpleBloomFilter(final Shape shape, BitMapProducer bitMaps) {
+        this(shape);
+        Objects.requireNonNull(bitMaps, "bitMaps");
+        mergeInPlace(bitMaps);
+    }
+
+    /**
+     * Copy constructor for {@code copy()} use.
+     * @param source the filter to copy; its bit map array is cloned so the copy does not share it.
+     */
+    private SimpleBloomFilter(SimpleBloomFilter source) {
+        this.shape = source.shape;
+        this.bitMap = source.bitMap.clone();
+        this.cardinality = source.cardinality;
+    }
+
+    @Override
+    public long[] asBitMapArray() {
+        return Arrays.copyOf(bitMap, bitMap.length);
+    }
+
+    @Override
+    public boolean forEachBitMapPair(BitMapProducer other, LongBiPredicate func) {
+        CountingLongPredicate p = new CountingLongPredicate(bitMap, func);
+        return other.forEachBitMap(p) && p.forEachRemaining();
+    }
+
+    @Override
+    public SimpleBloomFilter copy() {
+        return new SimpleBloomFilter(this);
+    }
+
+    /**
+     * Performs a merge in place using an IndexProducer.
+     * @param indexProducer the IndexProducer to merge from.
+     * @throws IllegalArgumentException if producer sends a value outside {@code [0, numberOfBits)}.
+     */
+    private void mergeInPlace(IndexProducer indexProducer) {
+        cardinality = -1; // invalidate first: bits set before a rejected index must not leave a stale count
+        indexProducer.forEachIndex(idx -> {
+            if (idx < 0 || idx >= shape.getNumberOfBits()) {
+                throw new IllegalArgumentException(String.format(
+                        "IndexProducer should only send values in the range[0,%s]", shape.getNumberOfBits() - 1));
+            }
+            BitMap.set(bitMap, idx);
+            return true;
+        });
+    }
+
+    /**
+     * Performs a merge in place using a BitMapProducer.
+     * @param bitMapProducer the BitMapProducer to merge from.
+     * @throws IllegalArgumentException if the producer sends too many bit maps or sets a bit beyond the shape.
+     */
+    private void mergeInPlace(BitMapProducer bitMapProducer) {
+        try {
+            cardinality = -1; // invalidate first: maps OR-ed in before a failure must not leave a stale count
+            int[] idx = new int[1];
+            bitMapProducer.forEachBitMap(value -> {
+                bitMap[idx[0]++] |= value;
+                return true;
+            });
+            // idx[0] will be limit+1 so decrement it
+            idx[0]--;
+            int idxLimit = BitMap.getLongIndex(shape.getNumberOfBits());
+            if (idxLimit < idx[0]) {
+                throw new IllegalArgumentException(String.format(
+                        "BitMapProducer set a bit higher than the limit for the shape: %s", shape.getNumberOfBits()));
+            }
+            if (idxLimit == idx[0]) {
+                long excess = (bitMap[idxLimit] >> shape.getNumberOfBits()); // long shifts use only the low 6 bits of the distance
+                if (excess != 0) {
+                    throw new IllegalArgumentException(
+                            String.format("BitMapProducer set a bit higher than the limit for the shape: %s",
+                                    shape.getNumberOfBits()));
+                }
+            }
+        } catch (IndexOutOfBoundsException e) {
+            throw new IllegalArgumentException(
+                    String.format("BitMapProducer should send at most %s maps", bitMap.length), e);
+        }
+    }
+
+    @Override
+    public boolean mergeInPlace(Hasher hasher) {
+        Objects.requireNonNull(hasher, "hasher");
+        mergeInPlace(hasher.indices(shape));
+        return true;
+    }
+
+    @Override
+    public boolean mergeInPlace(BloomFilter other) {
+        Objects.requireNonNull(other, "other");
+        if (other.isSparse()) {
+            mergeInPlace((IndexProducer) other);
+        } else {
+            mergeInPlace((BitMapProducer) other);
+        }
+        return true;
+    }
+
+    @Override
+    public Shape getShape() {
+        return shape;
+    }
+
+    @Override
+    public boolean isSparse() {
+        return false;
+    }
+
+    @Override
+    public int cardinality() {
+        // Lazy evaluation with caching
+        int c = cardinality;
+        if (c < 0) {
+            cardinality = c = SetOperations.cardinality(this);
+        }
+        return c;
+    }
+
+    @Override
+    public boolean forEachIndex(IntPredicate consumer) {
+        Objects.requireNonNull(consumer, "consumer");
+        return IndexProducer.fromBitMapProducer(this).forEachIndex(consumer);
+    }
+
+    @Override
+    public boolean forEachBitMap(LongPredicate consumer) {
+        Objects.requireNonNull(consumer, "consumer");
+        for (long l : bitMap) {
+            if (!consumer.test(l)) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    @Override
+    public boolean contains(IndexProducer indexProducer) {
+        return indexProducer.forEachIndex(idx -> BitMap.contains(bitMap, idx));
+    }
+}
diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleHasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleHasher.java
new file mode 100644
index 0000000000..9bc7a99648
--- /dev/null
+++ b/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleHasher.java
@@ -0,0 +1,196 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter;
+
+import java.util.Objects;
+import java.util.function.IntPredicate;
+
+/**
+ * A Hasher that implements combinatorial hashing as described by
+ * Kirsch and Mitzenmacher.
+ * <p>

+ * Common use for this hasher is to generate a byte array as the output of a hashing
+ * or MessageDigest algorithm.
+ *
+ * @since 4.5
+ */
+public class SimpleHasher implements Hasher {
+
+    /**
+     * The initial hash value.
+     */
+    private final long initial;
+
+    /**
+     * The value to increment the hash value by.
+     */
+    private final long increment;
+
+    /**
+     * Convert bytes to long.
+     * @param byteArray the byte array to extract the values from.
+     * @param offset the offset to start extraction from.
+     * @param len the length of the extraction, may be longer than 8.
+     * @return the value of at most the first 8 bytes of the range, read with the first byte most significant.
+     */
+    private static long toLong(byte[] byteArray, int offset, int len) {
+        long val = 0;
+        len = Math.min(len, Long.BYTES);
+        for (int i = 0; i < len; i++) {
+            val <<= 8;
+            val |= (byteArray[offset + i] & 0x00FF);
+        }
+        return val;
+    }
+
+    /**
+     * Constructs the SimpleHasher from a byte array.
+     * <p>The byte array is split in 2 and each half is interpreted as a long value.
+     * Excess bytes are ignored. This simplifies the conversion from a Digest or hasher algorithm output
+     * to the two values used by the SimpleHasher.</p>
+     * <p>If the second long is zero the default increment is used instead.</p>
+     * @param buffer the buffer to extract the longs from.
+     * @throws IllegalArgumentException if buffer length is zero.
+     * @see #getDefaultIncrement()
+     */
+    public SimpleHasher(byte[] buffer) {
+        if (buffer.length == 0) {
+            throw new IllegalArgumentException("buffer length must be greater than 0");
+        }
+        int segment = buffer.length / 2;
+        this.initial = toLong(buffer, 0, segment);
+        long possibleIncrement = toLong(buffer, segment, buffer.length - segment);
+        this.increment = possibleIncrement == 0 ? getDefaultIncrement() : possibleIncrement;
+    }
+
+    /**
+     * Constructs the SimpleHasher from 2 longs. The long values will be interpreted as unsigned values.
+     * <p>If the increment is zero the default increment is used instead.</p>
+     * @param initial The initial value for the hasher.
+     * @param increment The value to increment the hash by on each iteration.
+     * @see #getDefaultIncrement()
+     */
+    public SimpleHasher(long initial, long increment) {
+        this.initial = initial;
+        this.increment = increment == 0 ? getDefaultIncrement() : increment;
+    }
+
+    /**
+     * Get the default increment used when the requested increment is zero.
+     * <p>
+     * By default this is the same
+     * default increment used in Java's SplittableRandom random number generator. It is the
+     * fractional representation of the golden ratio (0.618...) with a base of 2^64.
+     * </p><p>
+     * Implementations may want to override this value to match defaults in legacy implementations.
+     * </p>
+     * @return The default increment to use when the requested increment is zero.
+     */
+    public long getDefaultIncrement() {
+        return 0x9e3779b97f4a7c15L;
+    }
+
+    /**
+     * Performs a modulus calculation on an unsigned long and an integer divisor.
+     * @param dividend an unsigned long value to calculate the modulus of.
+     * @param divisor the divisor for the modulus calculation; assumed to be positive.
+     * @return the remainder or modulus value.
+     */
+    static int mod(long dividend, int divisor) {
+        // See Hacker's Delight (2nd ed), section 9.3.
+        // Assume divisor is positive.
+        // Divide half the unsigned number and then double the quotient result.
+        final long quotient = ((dividend >>> 1) / divisor) << 1;
+        final long remainder = dividend - quotient * divisor;
+        // remainder in [0, 2 * divisor)
+        return (int) (remainder >= divisor ? remainder - divisor : remainder);
+    }
+
+    @Override
+    public IndexProducer indices(final Shape shape) {
+        Objects.requireNonNull(shape, "shape");
+
+        return new IndexProducer() {
+
+            @Override
+            public boolean forEachIndex(IntPredicate consumer) {
+                Objects.requireNonNull(consumer, "consumer");
+                int bits = shape.getNumberOfBits();
+                /*
+                 * Essentially this is computing a wrapped modulus from a start point and an
+                 * increment. So actually you only need two modulus operations before the loop.
+                 * This avoids any modulus operation inside the loop. It uses a long index
+                 * to avoid overflow.
+                 */
+                long index = mod(initial, bits);
+                int inc = mod(increment, bits);
+
+                for (int functionalCount = 0; functionalCount < shape.getNumberOfHashFunctions(); functionalCount++) {
+
+                    if (!consumer.test((int) index)) {
+                        return false;
+                    }
+                    index += inc;
+                    index = index >= bits ? index - bits : index;
+                }
+                return true;
+            }
+
+            @Override
+            public int[] asIndexArray() {
+                int[] result = new int[shape.getNumberOfHashFunctions()];
+                int[] idx = new int[1];
+                /*
+                 * This method needs to return duplicate indices
+                 */
+                forEachIndex(i -> {
+                    result[idx[0]++] = i;
+                    return true;
+                });
+                return result;
+            }
+        };
+    }
+
+    @Override
+    public IndexProducer uniqueIndices(final Shape shape) {
+        Objects.requireNonNull(shape, "shape");
+        return new IndexProducer() {
+            @Override
+            public boolean forEachIndex(IntPredicate consumer) {
+                Objects.requireNonNull(consumer, "consumer");
+                IndexFilter filter = IndexFilter.create(shape, consumer);
+
+                int bits = shape.getNumberOfBits();
+
+                // Set up for the modulus. Use a long index to avoid overflow.
+                long index = mod(initial, bits);
+                int inc = mod(increment, bits);
+
+                for (int functionalCount = 0; functionalCount < shape.getNumberOfHashFunctions(); functionalCount++) {
+
+                    if (!filter.test((int) index)) {
+                        return false;
+                    }
+                    index += inc;
+                    index = index >= bits ? index - bits : index;
+                }
+                return true;
+            }
+        };
+    }
+}
diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java
new file mode 100644
index 0000000000..4711667ac1
--- /dev/null
+++ b/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java
@@ -0,0 +1,250 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. 
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter;
+
+import java.util.Objects;
+import java.util.TreeSet;
+import java.util.function.IntPredicate;
+import java.util.function.LongPredicate;
+
+/**
+ * A bloom filter using a TreeSet of integers to track enabled bits. This is a standard
+ * implementation and should work well for most low cardinality Bloom filters.
+ * @since 4.5
+ */
+public final class SparseBloomFilter implements BloomFilter {
+
+    /**
+     * The sorted set of enabled bit indices that defines this BloomFilter.
+     */
+    private final TreeSet<Integer> indices;
+
+    /**
+     * The shape of this BloomFilter.
+     */
+    private final Shape shape;
+
+    /**
+     * Constructs an empty SparseBloomFilter.
+     *
+     * @param shape The shape of the filter.
+     */
+    public SparseBloomFilter(Shape shape) {
+        Objects.requireNonNull(shape, "shape");
+        this.shape = shape;
+        this.indices = new TreeSet<>();
+    }
+
+    /**
+     * Creates an instance that is equivalent to {@code other}.
+     *
+     * @param other The bloom filter to copy.
+     */
+    public SparseBloomFilter(BloomFilter other) {
+        Objects.requireNonNull(other, "other");
+        this.shape = other.getShape();
+        this.indices = new TreeSet<>();
+        if (other.isSparse()) {
+            mergeInPlace((IndexProducer) other);
+        } else {
+            mergeInPlace(IndexProducer.fromBitMapProducer(other));
+        }
+    }
+
+    private void checkIndices(Shape shape) { // rejects any stored index outside [0, numberOfBits)
+        if (this.indices.floor(-1) != null || this.indices.ceiling(shape.getNumberOfBits()) != null) {
+            throw new IllegalArgumentException(
+                    String.format("Filter only accepts values in the [0,%d) range", shape.getNumberOfBits()));
+        }
+    }
+
+    /**
+     * Constructs a populated Bloom filter.
+     * @param shape the shape for the bloom filter.
+     * @param hasher the hasher to provide the initial data.
+     */
+    public SparseBloomFilter(final Shape shape, Hasher hasher) {
+        this(shape);
+        Objects.requireNonNull(hasher, "hasher");
+        hasher.indices(shape).forEachIndex(this::add);
+        checkIndices(shape);
+    }
+
+    /**
+     * Constructs a populated Bloom filter.
+     * @param shape the shape of the filter.
+     * @param indices an index producer for the indices to enable.
+     * @throws IllegalArgumentException if indices contains a negative value or a value not less than
+     * the number of bits in the shape.
+     */
+    public SparseBloomFilter(Shape shape, IndexProducer indices) {
+        this(shape);
+        Objects.requireNonNull(indices, "indices");
+        indices.forEachIndex(this::add);
+        checkIndices(shape);
+    }
+
+    /**
+     * Constructs a populated Bloom filter.
+     * @param shape the shape of the filter.
+     * @param bitMaps a BitMapProducer for the bit maps to add.
+     * @throws IllegalArgumentException if the bit maps contain a bit index not less than the number
+     * of bits in the shape.
+     */
+    public SparseBloomFilter(Shape shape, BitMapProducer bitMaps) {
+        this(shape);
+        Objects.requireNonNull(bitMaps, "bitMaps");
+        mergeInPlace(IndexProducer.fromBitMapProducer(bitMaps));
+    }
+
+    private SparseBloomFilter(SparseBloomFilter source) {
+        shape = source.shape;
+        indices = new TreeSet<>(source.indices);
+    }
+
+    @Override
+    public long[] asBitMapArray() {
+        long[] result = new long[BitMap.numberOfBitMaps(shape.getNumberOfBits())];
+        for (int i : indices) {
+            BitMap.set(result, i);
+        }
+        return result;
+    }
+
+    @Override
+    public SparseBloomFilter copy() {
+        return new SparseBloomFilter(this);
+    }
+
+    /**
+     * Adds the index to the indices.
+     * @param idx the index to add.
+     * @return {@code true} always
+     */
+    private boolean add(int idx) {
+        indices.add(idx);
+        return true;
+    }
+
+    /**
+     * Performs a merge in place using an IndexProducer. Each index is validated before it is stored.
+     * @param indexProducer the IndexProducer to merge from.
+     * @throws IllegalArgumentException if producer sends a value outside {@code [0, numberOfBits)}.
+     */
+    private void mergeInPlace(IndexProducer indexProducer) {
+        indexProducer.forEachIndex(idx -> {
+            if (idx >= shape.getNumberOfBits()) {
+                throw new IllegalArgumentException(String.format("Value in list %s is greater than maximum value (%s)",
+                        idx, shape.getNumberOfBits() - 1));
+            }
+            if (idx < 0) {
+                throw new IllegalArgumentException(
+                        String.format("Value in list %s is less than 0", idx));
+            }
+            return add(idx);
+        });
+    }
+
+    @Override
+    public boolean mergeInPlace(Hasher hasher) {
+        Objects.requireNonNull(hasher, "hasher");
+        mergeInPlace(hasher.indices(shape));
+        return true;
+    }
+
+    @Override
+    public boolean mergeInPlace(BloomFilter other) {
+        Objects.requireNonNull(other, "other");
+        IndexProducer producer = other.isSparse() ? (IndexProducer) other : IndexProducer.fromBitMapProducer(other);
+        mergeInPlace(producer);
+        return true;
+    }
+
+    @Override
+    public Shape getShape() {
+        return shape;
+    }
+
+    @Override
+    public boolean isSparse() {
+        return true;
+    }
+
+    @Override
+    public int cardinality() {
+        return indices.size();
+    }
+
+    @Override
+    public boolean forEachIndex(IntPredicate consumer) {
+        Objects.requireNonNull(consumer, "consumer");
+        for (int value : indices) {
+            if (!consumer.test(value)) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    @Override
+    public boolean forEachBitMap(LongPredicate consumer) {
+        Objects.requireNonNull(consumer, "consumer");
+        int limit = BitMap.numberOfBitMaps(shape.getNumberOfBits());
+        /*
+         * because our indices are always in order we can shorten the time necessary to
+         * create the longs for the consumer
+         */
+        // the currently constructed bitMap
+        long bitMap = 0;
+        // the bitmap we are working on
+        int idx = 0;
+        for (int i : indices) {
+            while (BitMap.getLongIndex(i) != idx) {
+                if (!consumer.test(bitMap)) {
+                    return false;
+                }
+                bitMap = 0;
+                idx++;
+            }
+            bitMap |= BitMap.getLongBit(i);
+        }
+        // we fall through with data in the bitMap
+        if (!consumer.test(bitMap)) {
+            return false;
+        }
+        // account for the bitMap in the previous block + the next one
+        idx++;
+        // while there are more blocks to generate send zero to the consumer.
+        while (idx < limit) {
+            if (!consumer.test(0L)) {
+                return false;
+            }
+            idx++;
+        }
+        return true;
+    }
+
+    @Override
+    public boolean contains(IndexProducer indexProducer) {
+        return indexProducer.forEachIndex(indices::contains);
+    }
+
+    @Override
+    public boolean contains(BitMapProducer bitMapProducer) {
+        return contains(IndexProducer.fromBitMapProducer(bitMapProducer));
+    }
+}
diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasher.java
deleted file mode 100644
index ab6b773d6c..0000000000
--- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasher.java
+++ /dev/null
@@ -1,204 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.commons.collections4.bloomfilter.hasher;
-
-import java.nio.charset.Charset;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.NoSuchElementException;
-import java.util.PrimitiveIterator;
-
-/**
- * The class that performs hashing on demand.
- * @since 4.5
- */
-public class DynamicHasher implements Hasher {
-
-    /**
-     * The builder for DynamicHashers. 
- * @since 4.5 - */ - public static class Builder implements Hasher.Builder { - - /** - * The list of items (each as a byte[]) that are to be hashed. - */ - private final List buffers; - - /** - * The function that the resulting DynamicHasher will use. - */ - private final HashFunction function; - - /** - * Constructs a DynamicHasher builder. - * - * @param function the function implementation. - */ - public Builder(final HashFunction function) { - this.function = function; - this.buffers = new ArrayList<>(); - } - - @Override - public DynamicHasher build() throws IllegalArgumentException { - // Assumes the hasher will create a copy of the buffers - final DynamicHasher hasher = new DynamicHasher(function, buffers); - // Reset for further use - buffers.clear(); - return hasher; - } - - @Override - public final DynamicHasher.Builder with(final byte[] property) { - buffers.add(property); - return this; - } - - @Override - public DynamicHasher.Builder with(final CharSequence item, final Charset charset) { - Hasher.Builder.super.with(item, charset); - return this; - } - - @Override - public DynamicHasher.Builder withUnencoded(final CharSequence item) { - Hasher.Builder.super.withUnencoded(item); - return this; - } - } - - /** - * The iterator of integers. - * - *

This assumes that the list of buffers is not empty. - */ - private class Iterator implements PrimitiveIterator.OfInt { - /** The number of hash functions per item. */ - private final int k; - /** The number of bits in the shape. */ - private final int m; - /** The current item. */ - private byte[] item; - /** The index of the next item. */ - private int nextItem; - /** The count of hash functions for the current item. */ - private int functionCount; - - /** - * Constructs iterator with the specified shape. - * - * @param shape - */ - private Iterator(final Shape shape) { - // Assumes that shape returns non-zero positive values for hash functions and bits - k = shape.getNumberOfHashFunctions(); - m = shape.getNumberOfBits(); - // Assume non-empty - item = buffers.get(0); - nextItem = 1; - } - - @Override - public boolean hasNext() { - if (functionCount != k) { - return true; - } - // Reached the number of hash functions for the current item. - // Try and advance to the next item. - if (nextItem != buffers.size()) { - item = buffers.get(nextItem++); - functionCount = 0; - return true; - } - // Finished. - // functionCount == shape.getNumberOfHashFunctions() - // nextItem == buffers.size() - return false; - } - - @SuppressWarnings("cast") // Cast to long to workaround a bug in animal-sniffer. - @Override - public int nextInt() { - if (hasNext()) { - return (int) Math.floorMod(function.apply(item, functionCount++), - // Cast to long to workaround a bug in animal-sniffer. - (long) m); - } - throw new NoSuchElementException(); - } - } - - /** - * An iterator of integers to use when there are no values. - */ - private static class NoValuesIterator implements PrimitiveIterator.OfInt { - /** The singleton instance. */ - private static final NoValuesIterator INSTANCE = new NoValuesIterator(); - - /** - * Empty constructor. 
- */ - private NoValuesIterator() {} - - @Override - public boolean hasNext() { - return false; - } - - @Override - public int nextInt() { - throw new NoSuchElementException(); - } - } - - /** - * The list of byte arrays that are to be hashed. - * Package private for access by the iterator. - */ - final List buffers; - - /** - * The function to hash the buffers. - * Package private for access by the iterator. - */ - final HashFunction function; - - /** - * Constructs a DynamicHasher. - * - * @param function the function to use. - * @param buffers the byte buffers that will be hashed. - */ - public DynamicHasher(final HashFunction function, final List buffers) { - this.buffers = new ArrayList<>(buffers); - this.function = function; - } - - @Override - public PrimitiveIterator.OfInt iterator(final Shape shape) { - HashFunctionValidator.checkAreEqual(getHashFunctionIdentity(), - shape.getHashFunctionIdentity()); - // Use optimised iterator for no values - return buffers.isEmpty() ? NoValuesIterator.INSTANCE : new Iterator(shape); - } - - @Override - public HashFunctionIdentity getHashFunctionIdentity() { - return function; - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunction.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunction.java deleted file mode 100644 index d14fd3d830..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunction.java +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher; - -/** - * Defines a hash function used by a {@link Hasher} . - * @since 4.5 - */ -public interface HashFunction extends HashFunctionIdentity { - - /** - * Applies the hash function to the buffer. - * - * @param buffer the buffer to apply the hash function to. - * @param seed the seed for the hashing. - * @return the long value of the hash. - */ - long apply(byte[] buffer, int seed); - - /** - * Gets the signature of this function. - * - *

The signature of this function is calculated as: - *


-     * int seed = 0;
-     * apply(String.format("%s-%s-%s",
-     *                     getName().toUpperCase(Locale.ROOT), getSignedness(), getProcess())
-     *             .getBytes("UTF-8"), seed);
-     * 
- * - * @see HashFunctionIdentity#prepareSignatureBuffer(HashFunctionIdentity) - */ - @Override - long getSignature(); -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentity.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentity.java deleted file mode 100644 index 0ff2edb8d4..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentity.java +++ /dev/null @@ -1,173 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher; - -import java.nio.charset.StandardCharsets; -import java.util.Locale; - -/** - * Defines the hash function used by a {@link Hasher}. - * - * @since 4.5 - */ -public interface HashFunctionIdentity { - - /** - * Identifies the process type of this function. - * - *
- *
Iterative processes
- *
Call the underlying hash algorithm for each (buffer, seed) pair passed to - * {@link HashFunction#apply(byte[], int)}.
- *
Cyclic processes
- *
Call the underlying hash algorithm using a (buffer, seed) pair passed to - * {@link HashFunction#apply(byte[], int)} to initialize the state. Subsequent - * calls can generate hash values without calling the underlying algorithm.
- *
- */ - enum ProcessType { - /** - * Call the underlying hash algorithm for a (buffer, seed) pair passed to - * {@link HashFunction#apply(byte[], int)} when the state is uninitialized or - * the seed is zero. This initializes the state. Subsequent calls with a non-zero - * seed use the state to generate a new value. - */ - CYCLIC, - /** - * Call the underlying hash algorithm for each (buffer, seed) pair passed to - * {@link HashFunction#apply(byte[], int)}. - */ - ITERATIVE - } - - /** - * Identifies the signedness of the calculations for this function. - *

- * When the hash function executes it typically returns an array of bytes. - * That array is converted into one or more numerical values which will be provided - * as a {@code long} primitive type. - * The signedness identifies if those {@code long} values are signed or unsigned. - * For example a hash function that outputs only 32-bits can be unsigned if converted - * using {@link Integer#toUnsignedLong(int)}. A hash function that outputs more than - * 64-bits is typically signed. - *

- */ - enum Signedness { - /** - * The result of {@link HashFunction#apply(byte[], int)} is signed, - * thus the sign bit may be set. - * - *

- * The result can be used with {@code Math.floorMod(x, y)} to generate a positive - * value if y is positive. - *

- * - * @see Math#floorMod(int, int) - */ - SIGNED, - /** - * The result of {@link HashFunction#apply(byte[], int)} is unsigned, - * thus the sign bit is never set. - * - *

- * The result can be used with {@code x % y} to generate a positive - * value if y is positive. - *

- */ - UNSIGNED - } - - /** - * Gets a common formatted string for general display. - * - * @param identity the identity to format. - * @return the String representing the identity. - */ - static String asCommonString(final HashFunctionIdentity identity) { - return String.format("%s-%s-%s", identity.getName(), identity.getSignedness(), identity.getProcessType()); - } - - /** - * Gets a {@code byte[]} buffer for a HashFunctionIdentity to create a signature. The - * {@code byte[]} is composed using properties of the hash function as: - * - *

-     * String.format("%s-%s-%s",
-     *               getName().toUpperCase(Locale.ROOT), getSignedness(), getProcess())
-     *       .getBytes("UTF-8");
-     * 
- * - * @param identity The HashFunctionIdentity to create the buffer for. - * @return the signature buffer for the identity - * @see #getSignature() - */ - static byte[] prepareSignatureBuffer(final HashFunctionIdentity identity) { - return String.format("%s-%s-%s", - identity.getName().toUpperCase(Locale.ROOT), identity.getSignedness(), - identity.getProcessType()).getBytes(StandardCharsets.UTF_8); - } - - /** - * Gets the name of this hash function. - *

- * Hash function should be the common name - * for the hash. This may include indications as to hash length - *

- *

- * Names are not case specific. Thus, "MD5" and "md5" should be considered as the same. - *

- * @return the Hash name - */ - String getName(); - - /** - * Gets the process type of this function. - * - * @return process type of this function. - */ - ProcessType getProcessType(); - - /** - * Gets the name of the provider of this hash function implementation. - *

- * Provider names are not case specific. Thus, "Apache Commons Collection" and - * "apache commons collection" should be considered as the same. - *

- * @return the name of the provider of this hash implementation. - */ - String getProvider(); - - /** - * Gets the signature of this function. The signature is the output of the hash function - * when applied to a set of bytes composed using properties of the hash function. - * - *

- * Implementations should define the method used to generate the signature. - *

- * - * @return the signature of this function. - * @see #prepareSignatureBuffer(HashFunctionIdentity) - */ - long getSignature(); - - /** - * Gets the signedness of this function. - * - * @return signedness of this function. - */ - Signedness getSignedness(); -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentityImpl.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentityImpl.java deleted file mode 100644 index c75973a376..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentityImpl.java +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher; - -/** - * An instance of HashFunctionIdentity that is suitable for deserializing - * HashFunctionIdentity data from a stream or any other situation where the - * hash function is not available but the identify of the function is required. 
- * - * @since 4.5 - */ -public final class HashFunctionIdentityImpl implements HashFunctionIdentity { - private final String name; - private final String provider; - private final Signedness signedness; - private final ProcessType process; - private final long signature; - - /** - * Creates a copy of the HashFunctionIdentity. - * @param identity the identity to copy. - */ - public HashFunctionIdentityImpl(final HashFunctionIdentity identity) { - this.name = identity.getName(); - this.provider = identity.getProvider(); - this.signedness = identity.getSignedness(); - this.process = identity.getProcessType(); - this.signature = identity.getSignature(); - } - - /** - * Creates a HashFunctionIdentity from component values. - * @param provider the name of the provider. - * @param name the name of the hash function. - * @param signedness the signedness of the hash function. - * @param process the processes of the hash function. - * @param signature the signature for the hash function. - */ - public HashFunctionIdentityImpl(final String provider, final String name, final Signedness signedness, final ProcessType process, - final long signature) { - this.name = name; - this.provider = provider; - this.signedness = signedness; - this.process = process; - this.signature = signature; - } - - @Override - public String getName() { - return name; - } - - @Override - public ProcessType getProcessType() { - return process; - } - - @Override - public String getProvider() { - return provider; - } - - @Override - public long getSignature() { - return signature; - } - - @Override - public Signedness getSignedness() { - return signedness; - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionValidator.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionValidator.java deleted file mode 100644 index 3ec0753e4a..0000000000 --- 
a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionValidator.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher; - -import java.util.Locale; -import java.util.Objects; - -/** - * Contains validation for hash functions. - */ -public final class HashFunctionValidator { - /** Do not instantiate. */ - private HashFunctionValidator() {} - - /** - * Generates a hash code for the identity of the hash function. The hash code is - * generated using the same properties as those tested in - * {@link #areEqual(HashFunctionIdentity, HashFunctionIdentity)}, that is the - * signedness, process type and name. The name is not case specific and is converted - * to lower-case using the {@link Locale#ROOT root locale}. - * - *

The generated value is suitable for use in generation of a hash code that satisfies - * the contract of {@link Object#hashCode()} if the {@link Object#equals(Object)} method - * is implemented using {@link #areEqual(HashFunctionIdentity, HashFunctionIdentity)}. That - * is two objects considered equal will have the same hash code. - * - *

If the hash function identity is a field within a larger object the generated hash code - * should be incorporated into the entire hash, for example using - * {@link Objects#hash(Object...)}. - * - * @param a hash function. - * @return hash code - * @see String#toLowerCase(Locale) - * @see Locale#ROOT - */ - static int hash(final HashFunctionIdentity a) { - return Objects.hash(a.getSignedness(), - a.getProcessType(), - a.getName().toLowerCase(Locale.ROOT)); - } - - /** - * Compares the identity of the two hash functions. The functions are considered - * equal if the signedness, process type and name are equal. The name is not - * case specific. - * - *

A pair of functions that are equal would be expected to produce the same - * hash output from the same input. - * - * @param a First hash function. - * @param b Second hash function. - * @return true, if successful - * @see String#equalsIgnoreCase(String) - */ - public static boolean areEqual(final HashFunctionIdentity a, final HashFunctionIdentity b) { - return (a.getSignedness() == b.getSignedness() && - a.getProcessType() == b.getProcessType() && - a.getName().equalsIgnoreCase(b.getName())); - } - - /** - * Compares the identity of the two hash functions and throws an exception if they - * are not equal. - * - * @param a First hash function. - * @param b Second hash function. - * @see #areEqual(HashFunctionIdentity, HashFunctionIdentity) - * @throws IllegalArgumentException if the hash functions are not equal - */ - public static void checkAreEqual(final HashFunctionIdentity a, final HashFunctionIdentity b) { - if (!areEqual(a, b)) { - throw new IllegalArgumentException(String.format("Hash functions are not equal: (%s) != (%s)", - HashFunctionIdentity.asCommonString(a), HashFunctionIdentity.asCommonString(b))); - } - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java deleted file mode 100644 index 3700567f1a..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher; - -import java.nio.charset.Charset; -import java.util.PrimitiveIterator; - -/** - * A Hasher represents items of arbitrary byte size as a byte representation of - * fixed size (a hash). The hash representations can be used to create indexes - * for a Bloom filter. - * - *

The hash for each item is created using a hash function; use of different - * seeds allows generation of different hashes for the same item. The hashes can - * be dynamically converted into the bit index representation used by a Bloom - * filter. The shape of the Bloom filter defines the number of indexes per item - * and the range of the indexes. The hasher can generate the correct number of - * indexes in the range required by the Bloom filter for each item it - * represents. - * - *

Note that the process of generating hashes and mapping them to a Bloom - * filter shape may create duplicate indexes. The hasher may generate fewer than - * the required number of hash functions per item if duplicates have been - * removed. Implementations of {@code iterator()} may return duplicate values - * and may return values in a random order. See implementation javadoc notes as - * to the guarantees provided by the specific implementation. - * - *

Hashers have an identity based on the hashing algorithm used. - * - * @since 4.5 - */ -public interface Hasher { - - /** - * A builder to build a hasher. - * - *

A hasher represents one or more items of arbitrary byte size. The builder - * contains methods to collect byte representations of items. Each method to add - * to the builder will add an entire item to the final hasher created by the - * {@link #build()} method. - * - * @since 4.5 - */ - interface Builder { - - /** - * Builds the hasher from all the items. - * - *

This method will clear the builder for future use. - * - * @return the fully constructed hasher - */ - Hasher build(); - - /** - * Adds a byte array item to the hasher. - * - * @param item the item to add - * @return a reference to this object - */ - Builder with(byte[] item); - - /** - * Adds a character sequence item to the hasher using the specified {@code charset} - * encoding. - * - * @param item the item to add - * @param charset the character set - * @return a reference to this object - */ - default Builder with(final CharSequence item, final Charset charset) { - return with(item.toString().getBytes(charset)); - } - - /** - * Adds a character sequence item to the hasher. Each 16-bit character is - * converted to 2 bytes using little-endian order. - * - * @param item the item to add - * @return a reference to this object - */ - default Builder withUnencoded(final CharSequence item) { - final int length = item.length(); - final byte[] bytes = new byte[length * 2]; - for (int i = 0; i < length; i++) { - final char ch = item.charAt(i); - bytes[i * 2] = (byte) ch; - bytes[i * 2 + 1] = (byte) (ch >>> 8); - } - return with(bytes); - } - } - - /** - * Gets an iterator of integers that are the bits to enable in the Bloom - * filter based on the shape. - * - *

The iterator will create indexes within the range defined by the number of bits in - * the shape. The total number of indexes will respect the number of hash functions per item - * defined by the shape. However the count of indexes may not be a multiple of the number of - * hash functions if the implementation has removed duplicates. - * - *

No guarantee is made as to order of values. - * - * @param shape the shape of the desired Bloom filter - * @return the iterator of integers - * @throws IllegalArgumentException if the hasher cannot generate indexes for - * the specified @{@code shape} - */ - PrimitiveIterator.OfInt iterator(Shape shape); - - /** - * Gets the identify of the hash function used by the the hasher. - * - * @return the identity of the hash function - */ - HashFunctionIdentity getHashFunctionIdentity(); -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/StaticHasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/StaticHasher.java deleted file mode 100644 index 430f99b565..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/StaticHasher.java +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher; - -import java.util.Arrays; -import java.util.Iterator; -import java.util.PrimitiveIterator.OfInt; -import java.util.Set; -import java.util.TreeSet; - -/** - * A Hasher implementation that contains the index for all enabled bits for a specific - * Shape. 
- * @since 4.5 - */ -public final class StaticHasher implements Hasher { - - /** - * The shape of this hasher - */ - private final Shape shape; - - /** - * The ordered set of values that this hasher will return. - */ - private final int[] values; - - /** - * Constructs the StaticHasher from a Hasher and a Shape. - * @param hasher the Hasher to read. - * @param shape the Shape for the resulting values. - * @throws IllegalArgumentException if the hasher function and the shape function are not the same. - */ - public StaticHasher(final Hasher hasher, final Shape shape) { - this(hasher.iterator(shape), shape); - HashFunctionValidator.checkAreEqual(hasher.getHashFunctionIdentity(), - shape.getHashFunctionIdentity()); - } - - /** - * Constructs a StaticHasher from an Iterator of Integers and a Shape. - * @param iter the Iterator of Integers. - * @param shape the Shape that the integers were generated for. - * @throws IllegalArgumentException if any Integer is outside the range [0,shape.getNumberOfBits()) - */ - public StaticHasher(final Iterator iter, final Shape shape) { - this.shape = shape; - final Set workingValues = new TreeSet<>(); - iter.forEachRemaining(idx -> { - if (idx >= this.shape.getNumberOfBits()) { - throw new IllegalArgumentException(String.format("Bit index (%s) is too big for %s", idx, shape)); - } - if (idx < 0) { - throw new IllegalArgumentException(String.format("Bit index (%s) may not be less than zero", idx)); - } - workingValues.add(idx); - }); - this.values = new int[workingValues.size()]; - int i = 0; - for (final Integer value : workingValues) { - values[i++] = value.intValue(); - } - } - - /** - * Constructs the StaticHasher from a StaticHasher and a Shape. - * @param hasher the StaticHasher to read. - * @param shape the Shape for the resulting values. - * @throws IllegalArgumentException if the shape of the hasher and the shape parameter are not the same. 
- */ - public StaticHasher(final StaticHasher hasher, final Shape shape) { - if (!hasher.shape.equals(shape)) { - throw new IllegalArgumentException(String.format("Hasher shape (%s) is not the same as shape (%s)", - hasher.getShape().toString(), shape.toString())); - } - this.shape = shape; - this.values = hasher.values; - } - - @Override - public HashFunctionIdentity getHashFunctionIdentity() { - return shape.getHashFunctionIdentity(); - } - - /** - * Gets the shape this static hasher was created with. - * - * @return the Shape of this hasher. - */ - public Shape getShape() { - return shape; - } - - /** - * Tests emptiness (size == 0). - * - * @return Whether or not this is empty. - */ - public boolean isEmpty() { - return size() == 0; - } - - /** - * Gets an iterator of integers that are the bits to enable in the Bloom - * filter based on the shape. The iterator will not return the same value multiple - * times. Values will be returned in ascending order. - * - * @param shape {@inheritDoc} - * @return {@inheritDoc} - * @throws IllegalArgumentException {@inheritDoc} - */ - @Override - public OfInt iterator(final Shape shape) { - if (!this.shape.equals(shape)) { - throw new IllegalArgumentException( - String.format("shape (%s) does not match internal shape (%s)", shape, this.shape)); - } - return Arrays.stream(values).iterator(); - } - - /** - * Gets the the number of unique values in this hasher. - * @return the number of unique values. 
- */ - public int size() { - return values.length; - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/MD5Cyclic.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/MD5Cyclic.java deleted file mode 100644 index 8e07793b7f..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/MD5Cyclic.java +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher.function; - -import java.nio.ByteBuffer; - -import java.nio.LongBuffer; -import java.security.MessageDigest; -import java.security.NoSuchAlgorithmException; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunction; - -/** - * An implementation of HashFunction that - * performs MD5 hashing using a signed cyclic method. - * @since 4.5 - */ -public final class MD5Cyclic implements HashFunction { - - /** - * The name of this hash function. - */ - public static final String NAME = "MD5"; - - /** - * The MD5 digest implementation. - */ - private final MessageDigest messageDigest; - - /** - * The signature for this hash function. - * - *

TODO: Make static akin to a serialVersionUID? - */ - private final long signature; - - /** - * The result from the digest 0 - */ - private final long[] result = new long[2]; - - /** - * Constructs the MD5 hashing function. - */ - public MD5Cyclic() { - try { - messageDigest = MessageDigest.getInstance(NAME); - } catch (final NoSuchAlgorithmException e) { - // This should not happen - throw new IllegalStateException("Missing the standard MD5 message digest algorithm", e); - } - signature = Signatures.getSignature(this); - } - - @Override - public long apply(final byte[] buffer, final int seed) { - - if (seed == 0) { - final byte[] hash; - synchronized (messageDigest) { - messageDigest.update(buffer); - hash = messageDigest.digest(); - messageDigest.reset(); - } - - final LongBuffer lb = ByteBuffer.wrap(hash).asLongBuffer(); - result[0] = lb.get(0); - result[1] = lb.get(1); - } else { - result[0] += result[1]; - } - return result[0]; - } - - @Override - public String getName() { - return NAME; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.CYCLIC; - } - - @Override - public String getProvider() { - return "Apache Commons Collections"; - } - - @Override - public long getSignature() { - return signature; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur128x64Cyclic.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur128x64Cyclic.java deleted file mode 100644 index 99c27c8819..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur128x64Cyclic.java +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher.function; - -import org.apache.commons.codec.digest.MurmurHash3; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunction; - -/** - * An implementation of HashFunction that - * uses an underlying Murmur3 128-bit hash with a signed cyclic method. - * - *

Requires the optional Apache Commons Codec - * library which contains a Java port of the 128-bit hash function - * {@code MurmurHash3_x64_128} from Austin Applyby's original {@code c++} - * code in SMHasher.

- * - * @see SMHasher - * @since 4.5 - */ -public final class Murmur128x64Cyclic implements HashFunction { - - /** - * The name of this hash method. - */ - public static final String NAME = "Murmur3_x64_128"; - - /** - * The result of the hash 0 call. - */ - private long[] parts; - - /** - * The signature for this hash function. - * - *

TODO: Make static akin to a serialVersionUID? - */ - private final long signature; - - /** - * Constructs a Murmur3 x64 128 hash. - */ - public Murmur128x64Cyclic() { - signature = Signatures.getSignature(this); - } - - @Override - public long apply(final byte[] buffer, final int seed) { - if (parts == null || seed == 0) { - parts = MurmurHash3.hash128x64(buffer, 0, buffer.length, 0); - } else { - parts[0] += parts[1]; - } - return parts[0]; - } - - @Override - public String getName() { - return NAME; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.CYCLIC; - } - - @Override - public String getProvider() { - return "Apache Commons Collections"; - } - - @Override - public long getSignature() { - return signature; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur32x86Iterative.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur32x86Iterative.java deleted file mode 100644 index 982ef5c869..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur32x86Iterative.java +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher.function; - -import org.apache.commons.codec.digest.MurmurHash3; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunction; - -/** - * An implementation of HashFunction that - * uses an underlying Murmur3 32-bit hash with a signed iterative method. - * - *

Requires the optional Apache Commons Codec - * library which contains a Java port of the 32-bit hash function - * {@code MurmurHash3_x86_32} from Austin Applyby's original {@code c++} - * code in SMHasher.

- * - * @see Apache Commons Codec - * @see SMHasher - * @since 4.5 - */ -public final class Murmur32x86Iterative implements HashFunction { - - /** - * The name of this hash function. - */ - public static final String NAME = "Murmur3_x86_32"; - - /** - * The signature for this hash function. - * - *

TODO: Make static akin to a serialVersionUID? - */ - private final long signature; - - /** - * Constructs a Murmur3 x86 32 hash - */ - public Murmur32x86Iterative() { - signature = Signatures.getSignature(this); - } - - @Override - public long apply(final byte[] buffer, final int seed) { - return MurmurHash3.hash32x86(buffer, 0, buffer.length, seed); - } - - @Override - public String getName() { - return NAME; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.ITERATIVE; - } - - @Override - public String getProvider() { - return "Apache Commons Collections"; - } - - @Override - public long getSignature() { - return signature; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/ObjectsHashIterative.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/ObjectsHashIterative.java deleted file mode 100644 index da0fc2c2db..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/ObjectsHashIterative.java +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.commons.collections4.bloomfilter.hasher.function; - -import java.util.Arrays; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunction; - -/** - * An implementation of HashFunction that - * performs {@code Objects.hash} hashing using a signed iterative method. - *

- * Except in the case of seed 0, the value of the previous hash is - * used as a seed for the next hash. Hashes are seeded by calling - * {@code Arrays.deepHashCode( new Object[]{seed, buffer} )}. - *

- * @since 4.5 - */ -public final class ObjectsHashIterative implements HashFunction { - - /** - * The name of the hash function. - */ - public static final String NAME = "Objects32"; - - /** - * The signature for this hash function. - * - *

TODO: Make static akin to a serialVersionUID? - */ - private final long signature; - - /** - * The value of the last hash. - */ - private long last; - - /** - * Constructs a hash that uses the Objects.hash method to has values. - */ - public ObjectsHashIterative() { - signature = Signatures.getSignature(this); - } - - @Override - public long apply(final byte[] buffer, final int seed) { - if (seed == 0) { - last = 0; - } - // Effectively: - // result = Arrays.deepHashCode(new Object[] { last, buffer }); - // The method loops over items starting with result=1 - // for i in items: - // result = 31 * result + hashCode(i) - // Here we unroll the computation to 2 iterations. - // The computation is done using 32-bit integers then cast to a long - final long result = 31 * (31 + Long.hashCode(last)) + Arrays.hashCode(buffer); - last += result; - return result; - } - - @Override - public String getName() { - return NAME; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.ITERATIVE; - } - - @Override - public String getProvider() { - return "Apache Commons Collections"; - } - - @Override - public long getSignature() { - return signature; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Signatures.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Signatures.java deleted file mode 100644 index b7f35ac051..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Signatures.java +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher.function; - -import org.apache.commons.collections4.bloomfilter.hasher.HashFunction; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity; - -/** - * Allow computation of HashFunction signatures. - * @since 4.5 - */ -final class Signatures { - - /** No instances. */ - private Signatures() {} - - /** - * Gets the standard signature for the hash function. The signature is prepared as: - *


-     * int seed = 0;
-     * return hashFunction.apply(HashFunctionIdentity.prepareSignatureBuffer(hashFunction), seed);
-     * 
- * - * @param hashFunction the hash function - * @return the signature - * @see HashFunctionIdentity#prepareSignatureBuffer(HashFunctionIdentity) - * @see HashFunction#apply(byte[], int) - */ - static long getSignature(final HashFunction hashFunction) { - return hashFunction.apply(HashFunctionIdentity.prepareSignatureBuffer(hashFunction), 0); - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java b/src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java index bfc3d67abe..c207254561 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java @@ -18,101 +18,78 @@ /** * A collection of extensible Bloom filter classes and interfaces. * - *

- * Background:

- *

- * A Bloom filter is conceptually a bit vector. It is used to - * tell you where things are not. Basically, you create a Bloom filter by creating hashes - * and converting those to enabled bits in a vector. You can merge the Bloom filters - * together with logical "or" (call this filter "B"). You can then check to see if filter - * "A" was "or"ed into "B" by testing A & B == A. if the statement is false then "A" was - * not merged into "B", otherwise it _might_ have. They are generally used where hash - * tables would be too large or as a filter front end for longer processes. For example + *

Background:

+ * + *

The Bloom filter is a probabilistic data structure that indicates where things are not. + * Conceptually it is a bit vector. You create a Bloom filter by creating hashes + * and converting those to enabled bits in the vector. Multiple Bloom filters may be merged + * together into one Bloom filter. It is possible to test if a filter {@code B} has been merged into + * another filter {@code A} by verifying that {@code (A & B) == B}.

+ * + *

Bloom filters are generally used where hash + * tables would be too large, or as a filter front end for longer processes. For example * most browsers have a Bloom filter that is built from all known bad URLs (ones that * serve up malware). When you enter a URL the browser builds a Bloom filter and checks to * see if it is "in" the bad URL filter. If not the URL is good, if it matches, then the * expensive lookup on a remote system is made to see if it actually is in the list. There * are lots of other uses, and in most cases the reason is to perform a fast check as a * gateway for a longer operation.

- *

- * BloomFilter

- *

- * The bloom filter code is - * an abstract class that requires implementation of 4 methods:

    - *
  • - * getBits() which - * returns the set bits as a buffer encoded into an array of long.
  • - *
  • - * getHasher() - * which returns a list of integers that are indexes of the bits that are enabled. These - * are returned in a Hasher construct.
  • - *
  • - * merge( BloomFilter ) to merge another - * Bloom filter into this one.
  • - *
  • - * merge( Hasher ) to merge the values in a hasher - * into this Bloom filter.
  • - *
- * There are 3 implementations of Bloom filter - * provided:
    - *
  • - * BitSetBloomFilter - based on the Java BitSet class.
  • - *
  • - * - * CountingBloomFilter - uses a sparse array of integers (Map) to implement a counting - * Bloom filter. This filter also implements remove() methods as that is the great - * advantage of a counting Bloom filter.
  • - *
  • - * HasherBloomFilter - implements bloom - * filter on a Hasher. A rather slow implementation but convenient in some - * situations.
  • - *
- * - *

- * Shape

- *

- * Describes the Bloom filter using the - * standard number of bits, number of hash functions and number of items along with a - * description of the HashFunction. It is this description that has caused the most issues - * of late.

- *

- * Hasher

- *

- * converts byte buffers into an iterator if int based - * on a Shape. There are 2 implementations of Hasher provided

    - *
  • - * Dynamic - calls - * the HashFunction for each value required in the Bloom filter.
  • - *
  • - * Static - based - * on a pre-calculated list of Bloom filter index values. It is also limited to generating - * values for a specific Shape.
  • - *
- * - *

- * Hash Functions

- *

- * Hash - * functions generate individual index values for the filter from a byte buffer. There are - * four implementations provided.

- *

- * HashFunctionIdentity

- *

- * The - * HashFunctionIdentity is the base interface for the HashFunction. It tracks three (3) - * properties:

    - *
  • - * The Hashing algorithm
  • - *
  • - * Whether the contents of the - * resulting hash buffer are read as signed or unsigned values.
  • - *
  • - * Whether the hash - * function uses an iterative or cyclic method. In traditional iterative methods this is - * done by calling the selected hash function with a different seed for each hash - * required. The second method described by Adam Kirsch and Micheal Mitzenmacher[1] has - * become more common and is used in applications like Cassandra[2].
  • - *
+ * + *

BloomFilter

+ * + *

The Bloom filter architecture here is designed so that the implementation of the storage of bits is abstracted. + * Programs that utilize the Bloom filters may use the {@code BitMapProducer} or {@code IndexProducer} to retrieve a + * representation of the internal structure. Additional methods are available in the {@code BitMap} to assist in + * manipulation of the representations.

+ * + *

The bloom filter code is an interface that requires implementation of 6 methods:

+ *
    + *
  • {@code cardinality()} + * returns the number of bits enabled in the Bloom filter.
  • + * + *
  • {@code contains(BitMapProducer)} which + * returns true if the bits specified by the bit maps generated by the BitMapProducer are enabled in the Bloom filter.
  • + * + *
  • {@code contains(IndexProducer)} which + * returns true if the bits specified by the indices generated by IndexProducer are enabled in the Bloom filter.
  • + * + *
  • {@code getShape()} which + * returns the shape the Bloom filter was created with.
  • + + *
  • {@code isSparse()} which + * returns true if the implementation tracks indices natively, false if bit maps are used. In cases where + * neither is used the {@code isSparse} return value should reflect which is faster to produce.
  • + * + *
  • {@code mergeInPlace(BloomFilter)} which + * utilizes either the {@code BitMapProducer} or {@code IndexProducer} from the argument to enable extra bits + * in the internal representation of the Bloom filter.
  • + *
+ * + *

Other methods should be overridden where they can be implemented more efficiently than the default implementations. + *

+ * + *

CountingBloomFilter

+ * + *

The counting bloom filter extends the Bloom filter by counting the number of times a specific bit has been + * enabled or disabled. This allows the removal (opposite of merge) of Bloom filters at the expense of additional + * overhead.

+ * + *

Shape

+ * + *

The Shape describes the Bloom filter using the number of bits and the number of hash functions.

+ * + *

Hasher

+ * + *

A Hasher converts bytes into a series of integers based on a Shape. With the exception of the HasherCollection, + * each hasher represents one item being added to the Bloom filter. The HasherCollection represents the + * number of items as the sum of the number of items represented by the Hashers in the collection.

+ * + *

The SimpleHasher uses a combinatorial generation technique to create the integers. It is easily + * initialized by using a standard {@code MessageDigest} or other Hash function to hash the item to insert and + * then splitting the hash bytes in half and considering each as a long value.

+ * + *

Other implementations of the Hasher are easy to implement, and should make use of the {@code Hasher.Filter} + * and/or {@code Hasher.FilteredIntConsumer} classes to filter out duplicate indices.

* *

References

* diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBitCountProducerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBitCountProducerTest.java new file mode 100644 index 0000000000..5894b7c376 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBitCountProducerTest.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.apache.commons.collections4.bloomfilter.BitCountProducer.BitCountConsumer; +import org.junit.jupiter.api.Test; + +public abstract class AbstractBitCountProducerTest extends AbstractIndexProducerTest { + + /** + * A testing BitCountConsumer that always returns false. + */ + public static BitCountConsumer FALSE_CONSUMER = new BitCountConsumer() { + + @Override + public boolean test(int index, int count) { + return false; + } + }; + + /** + * A testing BitCountConsumer that always returns true. 
+ */ + public static BitCountConsumer TRUE_CONSUMER = new BitCountConsumer() { + + @Override + public boolean test(int index, int count) { + return true; + } + }; + + /** + * Creates a producer with some data. + * @return a producer with some data + */ + @Override + protected abstract BitCountProducer createProducer(); + + /** + * Creates an producer without data. + * @return a producer that has no data. + */ + @Override + protected abstract BitCountProducer createEmptyProducer(); + + /** + * Determines if empty tests should be run. Some producers do not implement an empty + * version. Tests for those classes should return false. + * @return + */ + protected boolean supportsEmpty() { + return true; + } + + @Test + public final void testForEachCount() { + + assertFalse(createProducer().forEachCount(FALSE_CONSUMER), "non-empty should be false"); + assertTrue(createProducer().forEachCount(TRUE_CONSUMER), "non-empty should be true"); + if (supportsEmpty()) { + assertTrue(createEmptyProducer().forEachCount(FALSE_CONSUMER), "empty should be true"); + assertTrue(createEmptyProducer().forEachCount(TRUE_CONSUMER), "empty should be true"); + } + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBitMapProducerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBitMapProducerTest.java new file mode 100644 index 0000000000..fa6f6cb4df --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBitMapProducerTest.java @@ -0,0 +1,169 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.Arrays; +import java.util.function.LongPredicate; + +import org.junit.jupiter.api.Test; + +public abstract class AbstractBitMapProducerTest { + + /** + * A testing consumer that always returns false. + */ + public static final LongPredicate FALSE_CONSUMER = new LongPredicate() { + + @Override + public boolean test(long arg0) { + return false; + } + }; + + /** + * A testing consumer that always returns true. + */ + public static final LongPredicate TRUE_CONSUMER = new LongPredicate() { + + @Override + public boolean test(long arg0) { + return true; + } + }; + + /** + * Creates a producer with some data. + * @return a producer with some data + */ + protected abstract BitMapProducer createProducer(); + + /** + * Creates an producer without data. + * @return a producer that has no data. 
+ */ + protected abstract BitMapProducer createEmptyProducer(); + + protected boolean emptyIsZeroLength() { + return false; + } + + @Test + public final void testForEachBitMap() { + assertFalse(createProducer().forEachBitMap(FALSE_CONSUMER), "non-empty should be false"); + if (emptyIsZeroLength()) { + assertTrue(createEmptyProducer().forEachBitMap(FALSE_CONSUMER), "empty should be true"); + } else { + assertFalse(createEmptyProducer().forEachBitMap(FALSE_CONSUMER), "empty should be false"); + } + + assertTrue(createProducer().forEachBitMap(TRUE_CONSUMER), "non-empty should be true"); + assertTrue(createEmptyProducer().forEachBitMap(TRUE_CONSUMER), "empty should be true"); + } + + @Test + public final void testAsBitMapArray() { + long[] array = createEmptyProducer().asBitMapArray(); + for (int i = 0; i < array.length; i++) { + assertEquals(0, array[i], "Wrong value at " + i); + } + + array = createProducer().asBitMapArray(); + assertFalse(array.length == 0); + } + + @Test + public final void testForEachBitMapPair() { + LongBiPredicate func = (x, y) -> x == y; + assertTrue(createEmptyProducer().forEachBitMapPair(createEmptyProducer(), func), "empty == empty failed"); + assertFalse(createEmptyProducer().forEachBitMapPair(createProducer(), func), "empty == not_empty failed"); + assertFalse(createProducer().forEachBitMapPair(createEmptyProducer(), func), "not_empty == empty passed"); + assertTrue(createProducer().forEachBitMapPair(createProducer(), func), "not_empty == not_empty failed"); + + // test BitMapProducers of different length send 0 for missing values. 
+ int[] count = new int[3]; + LongBiPredicate lbp = new LongBiPredicate() { + + @Override + public boolean test(long x, long y) { + if (x == 0) { + count[0]++; + } + if (y == 0) { + count[1]++; + } + count[2]++; + return true; + } + }; + createEmptyProducer().forEachBitMapPair(createProducer(), lbp); + assertEquals(count[2], count[0]); + + Arrays.fill(count, 0); + createProducer().forEachBitMapPair(createEmptyProducer(), lbp); + assertEquals(count[2], count[1]); + } + + @Test + public void testForEachBitMapEarlyExit() { + int[] passes = new int[1]; + assertFalse(createProducer().forEachBitMap(l -> { + passes[0]++; + return false; + })); + assertEquals(1, passes[0]); + + passes[0] = 0; + if (emptyIsZeroLength()) { + assertTrue(createEmptyProducer().forEachBitMap(l -> { + passes[0]++; + return false; + })); + assertEquals(0, passes[0]); + } else { + assertFalse(createEmptyProducer().forEachBitMap(l -> { + passes[0]++; + return false; + })); + assertEquals(1, passes[0]); + } + } + + @Test + public void testForEachBitMapPairEarlyExit() { + + // test BitMapProducers of different length send 0 for missing values. 
+ int[] count = new int[1]; + LongBiPredicate lbp = new LongBiPredicate() { + + @Override + public boolean test(long x, long y) { + count[0]++; + return false; + } + }; + createProducer().forEachBitMapPair(createEmptyProducer(), lbp); + assertEquals(1, count[0]); + + Arrays.fill(count, 0); + createEmptyProducer().forEachBitMapPair(createProducer(), lbp); + assertEquals(1, count[0]); + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java index 8ba620c8c2..9e681f6693 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java @@ -16,608 +16,413 @@ */ package org.apache.commons.collections4.bloomfilter; -import static org.junit.jupiter.api.Assertions.assertThrows; - import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; -import java.util.List; -import java.util.PrimitiveIterator.OfInt; -import java.util.function.BiFunction; -import java.util.function.IntConsumer; import java.util.ArrayList; -import java.util.Arrays; -import java.util.BitSet; - -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity; -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; -import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher; +import java.util.List; import org.junit.jupiter.api.Test; /** * Test standard methods in the {@link BloomFilter} interface. 
*/ -public abstract class AbstractBloomFilterTest { +public abstract class AbstractBloomFilterTest { + + protected final SimpleHasher from1 = new SimpleHasher(1, 1); + protected final long from1Value = 0x3fffeL; + protected final SimpleHasher from11 = new SimpleHasher(11, 1); + protected final long from11Value = 0xffff800L; + protected final HasherCollection bigHasher = new HasherCollection(from1, from11); + protected final long bigHashValue = 0xffffffeL; + protected final HasherCollection fullHasher = new HasherCollection(new SimpleHasher(0, 1)/* 0-16 */, + new SimpleHasher(17, 1)/* 17-33 */, new SimpleHasher(33, 1)/* 33-49 */, new SimpleHasher(50, 1)/* 50-66 */, + new SimpleHasher(67, 1)/* 67-83 */ + ); + protected final long[] fullHashValue = { 0xffffffffffffffffL, 0xfffffL }; /** - * An implementation of BloomFilter that is used to test merge and cardinality - * operations with a filter type that does not match the type of the filter - * being tested. + * The shape of the Bloom filters for testing. + *
    + *
  • Hash functions (k) = 17 + *
  • Number of bits (m) = 72 + *
+ * @return the testing shape. */ - private static class TestBloomFilter extends AbstractBloomFilter { - /** The bits. */ - final BitSet bits; - - protected TestBloomFilter(final Shape shape, final BitSet bits) { - super(shape); - this.bits = bits; - } - - @Override - public long[] getBits() { - return bits.toLongArray(); - } - - @Override - public StaticHasher getHasher() { - return new StaticHasher(bits.stream().iterator(), getShape()); - } - - @Override - public boolean merge(final BloomFilter other) { - throw new UnsupportedOperationException(); - } - - @Override - public boolean merge(final Hasher hasher) { - throw new UnsupportedOperationException(); - } + protected final Shape getTestShape() { + return Shape.fromKM(17, 72); } /** - * A HashFunctionIdentity for testing. + * Create an empty version of the BloomFilter implementation we are testing. + * + * @param shape the shape of the filter. + * @return a BloomFilter implementation. */ - protected HashFunctionIdentity testFunction = new HashFunctionIdentity() { - - @Override - public String getName() { - return "Test Function"; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.CYCLIC; - } - - @Override - public String getProvider() { - return "Apache Commons Collection Tests"; - } - - @Override - public long getSignature() { - return 0; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } - }; + protected abstract T createEmptyFilter(Shape shape); /** - * A second HashFunctionIdentity for testing. + * Create the BloomFilter implementation we are testing. + * + * @param shape the shape of the filter. + * @param hasher the hasher to use to create the filter. + * @return a BloomFilter implementation. 
*/ - protected HashFunctionIdentity testFunctionX = new HashFunctionIdentity() { - - @Override - public String getName() { - return "Test FunctionX"; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.CYCLIC; - } - - @Override - public String getProvider() { - return "Apache Commons Collection Tests"; - } - - @Override - public long getSignature() { - return 1; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } - }; + protected abstract T createFilter(Shape shape, Hasher hasher); /** - * The shape of the Bloom filters for testing + * Create the BloomFilter implementation we are testing. + * + * @param shape the shape of the filter. + * @param producer A BitMap producer to build the filter with. + * @return a BloomFilter implementation. */ - protected Shape shape = new Shape(testFunction, 3, 72, 17); + protected abstract T createFilter(Shape shape, BitMapProducer producer); /** - * Tests that the andCardinality calculations are correct. + * Create the BloomFilter implementation we are testing. + * + * @param shape the shape of the filter. + * @param producer An Index producer to build the filter with. + * @return a BloomFilter implementation. */ - @Test - public final void andCardinalityTest() { - andCardinalityTest(this::createFilter); - } + protected abstract T createFilter(Shape shape, IndexProducer producer); /** - * Tests that the andCardinality calculations are correct with a generic BloomFilter. 
+ * */ @Test - public final void andCardinalityTest_GenericBloomFilter() { - andCardinalityTest(this::createGenericFilter); + public void testConstructWithBadHasher() { + // value too large + assertThrows(IllegalArgumentException.class, + () -> createFilter(getTestShape(), new BadHasher(getTestShape().getNumberOfBits()))); + // negative value + assertThrows(IllegalArgumentException.class, () -> createFilter(getTestShape(), new BadHasher(-1))); } - /** - * Tests that the andCardinality calculations are correct. - * - * @param filterFactory the factory function to create the filter - */ - private void andCardinalityTest(final BiFunction filterFactory) { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - - final BloomFilter bf = createFilter(hasher, shape); + @Test + public void testConstructWitBitMapProducer() { + long[] values = { from11Value, 0x9L }; + BloomFilter f = createFilter(getTestShape(), BitMapProducer.fromBitMapArray(values)); + List lst = new ArrayList<>(); + for (long l : values) { + lst.add(l); + } + assertTrue(f.forEachBitMap(l -> { + return lst.remove(Long.valueOf(l)); + })); + assertTrue(lst.isEmpty()); - final List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); + BitMapProducer badProducer = BitMapProducer.fromBitMapArray(0L, Long.MAX_VALUE); + // values too large + assertThrows(IllegalArgumentException.class, () -> createFilter(getTestShape(), badProducer)); + } - final BloomFilter bf2 = filterFactory.apply(hasher2, shape); + @Test + public void testConstructWithIndexProducer() { + int[] values = new int[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 }; + BloomFilter f = createFilter(getTestShape(), IndexProducer.fromIndexArray(values)); + List lst = new ArrayList<>(); + for (int i : values) { + lst.add(i); + } + 
assertTrue(f.forEachIndex(i -> { + return lst.remove(Integer.valueOf(i)); + })); + assertTrue(lst.isEmpty()); - assertEquals(7, bf.andCardinality(bf2)); + // value to large + assertThrows(IllegalArgumentException.class, () -> createFilter(getTestShape(), + IndexProducer.fromIndexArray(new int[] { getTestShape().getNumberOfBits() }))); + // negative value + assertThrows(IllegalArgumentException.class, + () -> createFilter(getTestShape(), IndexProducer.fromIndexArray(new int[] { -1 }))); } - /** - * Tests that the andCardinality calculations are correct when there are more than Long.LENGTH bits. - */ @Test - public final void andCardinalityTest_ExtraLongs() { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); + public final void testContains() { + BloomFilter bf1 = createFilter(getTestShape(), from1); + final BloomFilter bf2 = createFilter(getTestShape(), bigHasher); - final BloomFilter bf = createFilter(hasher, shape); + assertTrue(bf1.contains(bf1), "BF Should contain itself"); + assertTrue(bf2.contains(bf2), "BF2 Should contain itself"); + assertFalse(bf1.contains(bf2), "BF should not contain BF2"); + assertTrue(bf2.contains(bf1), "BF2 should contain BF"); - final List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); + assertTrue(bf2.contains(new SimpleHasher(1, 1)), "BF2 Should contain this hasher"); + assertFalse(bf2.contains(new SimpleHasher(1, 3)), "BF2 Should not contain this hasher"); - final BloomFilter bf2 = createFilter(hasher2, shape); + IndexProducer indexProducer = new SimpleHasher(1, 1).indices(getTestShape()); + assertTrue(bf2.contains(indexProducer), "BF2 Should contain this hasher"); + indexProducer = new SimpleHasher(1, 3).indices(getTestShape()); + assertFalse(bf2.contains(indexProducer), "BF2 Should not contain this hasher"); - 
assertEquals(7, bf.andCardinality(bf2)); - assertEquals(7, bf2.andCardinality(bf)); - } + BitMapProducer bitMapProducer = BitMapProducer.fromIndexProducer(new SimpleHasher(1, 1).indices(getTestShape()), + getTestShape().getNumberOfBits()); + assertTrue(bf2.contains(bitMapProducer), "BF2 Should contain this hasher"); + bitMapProducer = BitMapProducer.fromIndexProducer(new SimpleHasher(1, 3).indices(getTestShape()), + getTestShape().getNumberOfBits()); + assertFalse(bf2.contains(bitMapProducer), "BF2 Should not contain this hasher"); - /** - * Compare 2 static hashers to verify they have the same bits enabled. - * - * @param hasher1 the first static hasher. - * @param hasher2 the second static hasher. - */ - private void assertSameBits(final StaticHasher hasher1, final StaticHasher hasher2) { - final OfInt iter1 = hasher1.iterator(shape); - final OfInt iter2 = hasher2.iterator(shape); + // Test different lengths + bf1 = createFilter(getTestShape(), from1); + final BloomFilter bf3 = createFilter(Shape.fromKM(getTestShape().getNumberOfHashFunctions(), Long.SIZE - 1), + from1); + assertTrue(bf1.contains(bf3)); + assertTrue(bf3.contains(bf1)); - while (iter1.hasNext()) { - assertTrue(iter2.hasNext(), "Not enough data in second hasher"); - assertEquals(iter1.nextInt(), iter2.nextInt()); - } - assertFalse(iter2.hasNext(), "Too much data in second hasher"); + final BloomFilter bf4 = createFilter(Shape.fromKM(getTestShape().getNumberOfHashFunctions(), Long.SIZE - 1), + bigHasher); + assertFalse(bf1.contains(bf4)); + assertTrue(bf4.contains(bf1)); } /** - * Tests that cardinality is correct. + * Tests that the andCardinality calculations are correct. 
+ * + * @param filterFactory the factory function to create the filter */ @Test - public final void cardinalityTest() { + public final void testEstimateIntersection() { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); + final BloomFilter bf = createFilter(getTestShape(), from1); + final BloomFilter bf2 = createFilter(getTestShape(), bigHasher); - final BloomFilter bf = createFilter(hasher, shape); - assertEquals(17, bf.cardinality()); - } + assertEquals(1, bf.estimateIntersection(bf2)); + assertEquals(1, bf2.estimateIntersection(bf)); - /** - * Tests that creating an empty hasher works as expected. - */ - @Test - public final void constructorTest_Empty() { + final BloomFilter bf3 = createEmptyFilter(getTestShape()); - final BloomFilter bf = createEmptyFilter(shape); - final long[] lb = bf.getBits(); - assertEquals(0, lb.length); + assertEquals(0, bf.estimateIntersection(bf3)); + assertEquals(0, bf3.estimateIntersection(bf)); } /** - * Tests that creating a filter with a hasher works as expected. - */ - @Test - public final void constructorTest_Hasher() { - final List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - - final BloomFilter bf = createFilter(hasher, shape); - final long[] lb = bf.getBits(); - assertEquals(0x1FFFF, lb[0]); - assertEquals(1, lb.length); - } - - /** - * Tests that creating a Bloom filter with a Static hasher that has one shape and a - * different specified shape fails. + * Tests that the andCardinality calculations are correct. 
+ * + * @param filterFactory the factory function to create the filter */ @Test - public final void constructorTest_WrongShape() { - final Shape anotherShape = new Shape(testFunctionX, 3, 72, 17); + public final void testEstimateUnion() { + final BloomFilter bf = createFilter(getTestShape(), from1); + final BloomFilter bf2 = createFilter(getTestShape(), from11); - final List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - final Hasher hasher = new StaticHasher(lst.iterator(), anotherShape); - assertThrows(IllegalArgumentException.class, () -> createFilter(hasher, shape), - "Should throw IllegalArgumentException"); - } - - /** - * Tests that contains() with a Bloom filter argument returns the proper results. - */ - @Test - public final void containsTest_BloomFilter() { - final List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter bf = createFilter(hasher, shape); - - final List lst2 = Arrays.asList(4, 5, 6, 7, 8, 9, 10); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); - final BloomFilter bf2 = createFilter(hasher2, shape); - assertTrue(bf.contains(bf2)); - assertFalse(bf2.contains(bf)); - } + assertEquals(2, bf.estimateUnion(bf2)); + assertEquals(2, bf2.estimateUnion(bf)); - /** - * Tests that contains() fails properly if the other Bloom filter is not of the proper shape. 
- */ - @Test - public final void containsTest_BloomFilter_WrongShape() { - final List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter bf = createFilter(hasher, shape); - - final Shape anotherShape = new Shape(testFunctionX, 3, 72, 17); - final Hasher hasher2 = new StaticHasher(lst.iterator(), anotherShape); - final BloomFilter bf2 = createFilter(hasher2, anotherShape); - assertThrows(IllegalArgumentException.class, () -> bf.contains(bf2), - "Should throw IllegalArgumentException"); - } + final BloomFilter bf3 = createEmptyFilter(getTestShape()); - /** - * Tests that contains() with a Hasher argument returns the proper results. - */ - @Test - public final void containsTest_Hasher() { - final List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter bf = createFilter(hasher, shape); - - List lst2 = Arrays.asList(4, 5, 6, 7, 8, 9, 10); - Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); - assertTrue(bf.contains(hasher2)); - - lst2 = Arrays.asList(17, 18, 19, 20); - hasher2 = new StaticHasher(lst2.iterator(), shape); - assertFalse(bf.contains(hasher2)); - - lst2 = Arrays.asList(10, 11, 12, 17, 18, 19, 20); - hasher2 = new StaticHasher(lst2.iterator(), shape); - assertFalse(bf.contains(hasher2)); + assertEquals(1, bf.estimateUnion(bf3)); + assertEquals(1, bf3.estimateUnion(bf)); } /** - * Tests that contains() fails properly if the hasher is not of the proper shape. + * Tests that the size estimate is correctly calculated. 
*/ @Test - public final void containsTest_Hasher_WrongShape() { - final List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter bf = createFilter(hasher, shape); + public final void testEstimateN() { + // build a filter + BloomFilter filter1 = new SimpleBloomFilter(getTestShape(), from1); + assertEquals(1, filter1.estimateN()); - final Shape anotherShape = new Shape(testFunctionX, 3, 72, 17); + // the data provided above do not generate an estimate that is equivalent to the + // actual. + filter1.mergeInPlace(new SimpleHasher(4, 1)); - final List lst2 = Arrays.asList(4, 5, 6, 7, 8, 9, 10); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), anotherShape); - assertThrows(IllegalArgumentException.class, () -> bf.contains(hasher2), - "Should have thrown IllegalArgumentException"); - } - - /** - * Create an empty version of the BloomFilter implementation we are testing. - * - * @param shape the shape of the filter. - * @return a BloomFilter implementation. - */ - protected abstract AbstractBloomFilter createEmptyFilter(Shape shape); + assertEquals(1, filter1.estimateN()); - /** - * Create the BloomFilter implementation we are testing. - * - * @param hasher the hasher to use to create the filter. - * @param shape the shape of the filter. - * @return a BloomFilter implementation. - */ - protected abstract AbstractBloomFilter createFilter(Hasher hasher, Shape shape); + filter1.mergeInPlace(new SimpleHasher(17, 1)); - /** - * Create a generic BloomFilter implementation. - * - * @param hasher the hasher to use to create the filter. - * @param shape the shape of the filter. - * @return a BloomFilter implementation. 
- */ - private AbstractBloomFilter createGenericFilter(final Hasher hasher, final Shape shape) { - final BitSet bits = new BitSet(); - hasher.iterator(shape).forEachRemaining((IntConsumer) bits::set); - return new TestBloomFilter(shape, bits); + assertEquals(3, filter1.estimateN()); } /** - * Tests that getBits() works correctly when multiple long values are returned. + * Tests that asBitMapArray works correctly. */ @Test - public final void getBitsTest_SpanLong() { - final List lst = Arrays.asList(63, 64); - final StaticHasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter bf = createFilter(hasher, shape); - final long[] lb = bf.getBits(); + public final void testAsBitMapArray() { + + // test when multiple long values are returned. + final SimpleHasher hasher = new SimpleHasher(63, 1); + final BloomFilter bf = createFilter(Shape.fromKM(2, 72), hasher); + final long[] lb = bf.asBitMapArray(); assertEquals(2, lb.length); assertEquals(0x8000000000000000L, lb[0]); assertEquals(0x1, lb[1]); } - /** - * Tests that the the hasher returned from getHasher() works correctly. - */ - @Test - public final void getHasherTest() { - final List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - final StaticHasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter bf = createFilter(hasher, shape); - - final StaticHasher hasher2 = bf.getHasher(); - - assertEquals(shape, hasher2.getShape()); - assertSameBits(hasher, hasher2); - } - /** * Tests that isFull() returns the proper values. 
*/ @Test - public final void isFullTest() { + public final void testIsFull() { // create empty filter - AbstractBloomFilter filter = createEmptyFilter(shape); - assertFalse(filter.isFull()); - - final List values = new ArrayList<>(shape.getNumberOfBits()); - for (int i = 0; i < shape.getNumberOfBits(); i++) { - values.add(i); - } - - StaticHasher hasher2 = new StaticHasher(values.iterator(), shape); - filter = createFilter(hasher2, shape); + BloomFilter filter = createEmptyFilter(getTestShape()); + assertFalse(filter.isFull(), "Should not be full"); - assertTrue(filter.isFull()); + filter = createFilter(getTestShape(), fullHasher); + assertTrue(filter.isFull(), "Should be full"); - final int mid = shape.getNumberOfBits() / 2; - values.remove(Integer.valueOf(mid)); - hasher2 = new StaticHasher(values.iterator(), shape); - filter = createFilter(hasher2, shape); - assertFalse(filter.isFull()); - } - - /** - * Tests that merging bloom filters works as expected. - */ - @Test - public final void mergeTest_BloomFilter() { - mergeTest_BloomFilter(this::createFilter); + filter = createFilter(getTestShape(), new SimpleHasher(1, 3)); + assertFalse(filter.isFull(), "Should not be full"); } /** * Tests that merging bloom filters works as expected with a generic BloomFilter. */ @Test - public final void mergeTest_GenericBloomFilter() { - mergeTest_BloomFilter(this::createGenericFilter); - } - - /** - * Tests that merging bloom filters works as expected. 
- * - * @param filterFactory the factory function to create the filter - */ - private void mergeTest_BloomFilter(final BiFunction filterFactory) { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); + public final void testMerge() { - final BloomFilter bf = createFilter(hasher, shape); + // test with BloomFilter + final BloomFilter bf1 = createFilter(getTestShape(), from1); - final List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); + final BloomFilter bf2 = createFilter(getTestShape(), from11); - final BloomFilter bf2 = filterFactory.apply(hasher2, shape); + final BloomFilter bf3 = bf1.merge(bf2); + assertTrue(bf3.contains(bf1), "Should contain bf1"); + assertTrue(bf3.contains(bf2), "Should contain bf2"); - assertTrue(bf.merge(bf2), "Merge should not fail"); - assertEquals(27, bf.cardinality()); - } + final BloomFilter bf4 = bf2.merge(bf1); + assertTrue(bf4.contains(bf1), "Should contain bf1"); + assertTrue(bf4.contains(bf2), "Should contain bf2"); + assertTrue(bf4.contains(bf3), "Should contain bf3"); + assertTrue(bf3.contains(bf4), "Should contain bf4"); - /** - * Tests that merging bloom filters with different shapes fails properly - */ - @Test - public final void mergeTest_BloomFilter_WrongShape() { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - - final BloomFilter bf = createFilter(hasher, shape); + // test with Hasher - final Shape anotherShape = new Shape(testFunctionX, 3, 72, 17); - final List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), anotherShape); - final BloomFilter bf2 = createFilter(hasher2, anotherShape); + final 
BloomFilter bf5 = bf1.merge(from11); + assertTrue(bf5.contains(bf1), "Should contain bf1"); + assertTrue(bf5.contains(bf2), "Should contain bf2"); - assertThrows(IllegalArgumentException.class, () -> bf.merge(bf2), - "Should throw IllegalArgumentException"); + // test with hasher returning numbers out of range + assertThrows(IllegalArgumentException.class, () -> bf1.merge(new BadHasher(bf1.getShape().getNumberOfBits()))); + assertThrows(IllegalArgumentException.class, () -> bf1.merge(new BadHasher(-1))); } /** - * Tests that merging a hasher into a Bloom filter works as expected + * Tests that merging in place works as expected. */ @Test - public final void mergeTest_Hasher() { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); + public final void testMergeInPlace() { - final BloomFilter bf = createFilter(hasher, shape); + final BloomFilter bf1 = createFilter(getTestShape(), from1); + final BloomFilter bf2 = createFilter(getTestShape(), from11); + final BloomFilter bf3 = bf1.merge(bf2); - final List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); + // test with BloomFilter - assertTrue(bf.merge(hasher2), "Merge should not fail"); - assertEquals(27, bf.cardinality()); - } - - /** - * Tests that merging a static hasher with the wrong shape into a Bloom filter fails as expected - */ - @Test - public final void mergeTest_Hasher_WrongShape() { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - - final BloomFilter bf = createFilter(hasher, shape); + long[] bf1Val = bf1.asBitMapArray(); + long[] bf2Val = bf2.asBitMapArray(); + for (int i = 0; i < bf1Val.length; i++) { + bf1Val[i] |= bf2Val[i]; + } + bf1.mergeInPlace(bf2); - final Shape anotherShape = 
new Shape(testFunctionX, 3, 72, 17); - final List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), anotherShape); + long[] bf1New = bf1.asBitMapArray(); + for (int i = 0; i < bf1Val.length; i++) { + assertEquals(bf1Val[i], bf1New[i], "Bad value at " + i); + } - assertThrows(IllegalArgumentException.class, () -> bf.merge(hasher2), - "Should throw IllegalArgumentException"); + assertTrue(bf1.contains(bf2), "Should contain bf2"); + assertTrue(bf1.contains(bf3), "Should contain bf3"); + + // test with hasher + + BloomFilter bf4 = createFilter(getTestShape(), from1); + bf4.mergeInPlace(from11); + + assertTrue(bf4.contains(bf2), "Should contain Bf2"); + assertTrue(bf4.contains(bf3), "Should contain Bf3"); + + // test with hasher returning numbers out of range + assertThrows(IllegalArgumentException.class, + () -> bf1.mergeInPlace(new BadHasher(bf1.getShape().getNumberOfBits()))); + assertThrows(IllegalArgumentException.class, () -> bf1.mergeInPlace(new BadHasher(-1))); + + // test error when bloom filter returns values out of range + final BloomFilter bf5 = new SimpleBloomFilter( + Shape.fromKM(getTestShape().getNumberOfHashFunctions(), 3 * Long.SIZE), + new SimpleHasher(Long.SIZE * 2, 1)); + assertThrows(IllegalArgumentException.class, () -> bf1.mergeInPlace(bf5)); + + final BloomFilter bf6 = new SparseBloomFilter( + Shape.fromKM(getTestShape().getNumberOfHashFunctions(), 3 * Long.SIZE), + new SimpleHasher(Long.SIZE * 2, 1)); + assertThrows(IllegalArgumentException.class, () -> bf1.mergeInPlace(bf6)); + } + + private void assertIndexProducerConstructor(Shape shape, int[] values, int[] expected) { + IndexProducer indices = IndexProducer.fromIndexArray(values); + SparseBloomFilter filter = new SparseBloomFilter(shape, indices); + List lst = new ArrayList<>(); + filter.forEachIndex(x -> { + lst.add(x); + return true; + }); + assertEquals(expected.length, lst.size()); + for 
(int value : expected) { + assertTrue(lst.contains(Integer.valueOf(value)), "Missing " + value); + } } - /** - * Tests that the orCardinality calculations are correct. - */ - @Test - public final void orCardinalityTest() { - orCardinalityTest(this::createFilter); + private void assertFailedIndexProducerConstructor(Shape shape, int[] values) { + IndexProducer indices = IndexProducer.fromIndexArray(values); + assertThrows(IllegalArgumentException.class, () -> createFilter(shape, indices)); } - /** - * Tests that the orCardinality calculations are correct with a generic BloomFilter. - */ @Test - public final void orCardinalityTest_GenericBloomFilter() { - orCardinalityTest(this::createGenericFilter); - } + public void testIndexProducerConstructor() { + Shape shape = Shape.fromKM(5, 10); - /** - * Tests that the andCardinality calculations are correct. - * - * @param filterFactory the factory function to create the filter - */ - private void orCardinalityTest(final BiFunction filterFactory) { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - - final AbstractBloomFilter bf = createFilter(hasher, shape); - - final List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); - - final BloomFilter bf2 = filterFactory.apply(hasher2, shape); - - assertEquals(27, bf.orCardinality(bf2)); + assertIndexProducerConstructor(shape, new int[] { 0, 2, 4, 6, 8 }, new int[] { 0, 2, 4, 6, 8 }); + // test duplicate values + assertIndexProducerConstructor(shape, new int[] { 0, 2, 4, 2, 8 }, new int[] { 0, 2, 4, 8 }); + // test negative values + assertFailedIndexProducerConstructor(shape, new int[] { 0, 2, 4, -2, 8 }); + // test index too large + assertFailedIndexProducerConstructor(shape, new int[] { 0, 2, 4, 12, 8 }); + // test no indices + assertIndexProducerConstructor(shape, 
new int[0], new int[0]); } - /** - * Tests that the orCardinality calculations are correct when there are more than Long.LENGTH bits. - */ @Test - public final void orCardinalityTest_ExtraLongs() { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - - final AbstractBloomFilter bf = createFilter(hasher, shape); - - final List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); - - final AbstractBloomFilter bf2 = createFilter(hasher2, shape); + public void testBitMapProducerSize() { + int[] idx = new int[1]; + createFilter(getTestShape(), from1).forEachBitMap(i -> { + idx[0]++; + return true; + }); + assertEquals(BitMap.numberOfBitMaps(getTestShape().getNumberOfBits()), idx[0]); - assertEquals(27, bf.orCardinality(bf2)); - assertEquals(27, bf2.orCardinality(bf)); + idx[0] = 0; + createEmptyFilter(getTestShape()).forEachBitMap(i -> { + idx[0]++; + return true; + }); + assertEquals(BitMap.numberOfBitMaps(getTestShape().getNumberOfBits()), idx[0]); } /** - * Tests that the xorCardinality calculations are correct. + * Testing class returns the value as the only value. */ - @Test - public final void xorCardinalityTest() { - xorCardinalityTest(this::createFilter); - } - - /** - * Tests that the xorCardinality calculations are correct with a generic BloomFilter. - */ - @Test - public final void xorCardinalityTest_GenericBloomFilter() { - xorCardinalityTest(this::createGenericFilter); - } - - /** - * Tests that the andCardinality calculations are correct. 
- * - * @param filterFactory the factory function to create the filter - */ - private void xorCardinalityTest(final BiFunction filterFactory) { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - - final BloomFilter bf = createFilter(hasher, shape); + class BadHasher implements Hasher { - final List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); + IndexProducer producer; - final BloomFilter bf2 = filterFactory.apply(hasher2, shape); - - assertEquals(20, bf.xorCardinality(bf2)); - } - - /** - * Tests that the xorCardinality calculations are correct when there are more than Long.LENGTH bits. - */ - @Test - public final void xorCardinalityTest_ExtraLongs() { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - - final BloomFilter bf = createFilter(hasher, shape); + BadHasher(int value) { + this.producer = IndexProducer.fromIndexArray(new int[] { value }); + } - final List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); - final BloomFilter bf2 = createFilter(hasher2, shape); + @Override + public IndexProducer indices(Shape shape) { + return producer; + } - assertEquals(20, bf.xorCardinality(bf2)); - assertEquals(20, bf2.xorCardinality(bf)); + @Override + public IndexProducer uniqueIndices(Shape shape) { + return producer; + } } - } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractCountingBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractCountingBloomFilterTest.java new file mode 100644 index 0000000000..a839e2d9c5 --- /dev/null +++ 
b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractCountingBloomFilterTest.java @@ -0,0 +1,264 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.util.HashMap; +import java.util.Map; + +import org.junit.jupiter.api.Test; + +/** + * Tests for the {@link ArrayCountingBloomFilter}. 
+ */ +public abstract class AbstractCountingBloomFilterTest + extends AbstractBloomFilterTest { + protected int[] from1Counts = { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 }; + protected int[] from11Counts = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0 }; + protected int[] bigHashCounts = { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 0 }; + + protected final BitCountProducer maximumValueProducer = new BitCountProducer() { + + @Override + public boolean forEachCount(BitCountProducer.BitCountConsumer consumer) { + for (int i = 1; i < 18; i++) { + if (!consumer.test(i, Integer.MAX_VALUE)) { + return false; + } + } + return true; + } + }; + + /** + * Assert the counts match the expected values. Values are for indices starting + * at 0. Assert the cardinality equals the number of non-zero counts. + * + * @param bf the bloom filter + * @param expected the expected counts + */ + private static void assertCounts(final CountingBloomFilter bf, final int[] expected) { + final Map m = new HashMap<>(); + bf.forEachCount((i, c) -> { + m.put(i, c); + return true; + }); + int zeros = 0; + for (int i = 0; i < expected.length; i++) { + if (m.get(i) == null) { + assertEquals(expected[i], 0, "Wrong value for " + i); + zeros++; + } else { + assertEquals(expected[i], m.get(i).intValue(), "Wrong value for " + i); + } + } + assertEquals(expected.length - zeros, bf.cardinality()); + } + + /** + * Tests that counts are correct when a hasher with duplicates is used in the + * constructor. + */ + @Test + public final void testCountingSpecificConstructor() { + // verify hasher duplicates are counted. 
+ // bit hasher has duplicates for 11, 12,13,14,15,16, and 17 + final CountingBloomFilter bf = createFilter(getTestShape(), from1); + bf.add(BitCountProducer.from(from11.indices(getTestShape()))); + + final long[] lb = bf.asBitMapArray(); + assertEquals(2, lb.length); + assertEquals(bigHashValue, lb[0]); + + assertCounts(bf, bigHashCounts); + } + + @Test + public final void testCountingBloomFilterSpecificContains() { + final BloomFilter bf = new SimpleBloomFilter(getTestShape(), from1); + final CountingBloomFilter bf2 = createFilter(getTestShape(), bigHasher); + + assertTrue(bf.contains(bf), "BF Should contain itself"); + assertTrue(bf2.contains(bf2), "BF2 Should contain itself"); + assertFalse(bf.contains(bf2), "BF should not contain BF2"); + assertTrue(bf2.contains(bf), "BF2 should contain BF"); + BitMapProducer producer = bf2; + assertTrue(bf2.contains(producer), "BF2 should contain BF bitMapProducer"); + } + + /** + * Tests that merging bloom filters works as expected with a generic BloomFilter. 
+ */ + @Test + public final void testCountingSpecificMerge() { + final BloomFilter bf1 = createFilter(getTestShape(), from1); + + final BloomFilter bf2 = new SimpleBloomFilter(getTestShape(), from11); + + final BloomFilter bf3 = bf1.merge(bf2); + assertTrue(bf3.contains(bf1), "Should contain"); + assertTrue(bf3.contains(bf2), "Should contain"); + + final BloomFilter bf4 = bf2.merge(bf1); + assertTrue(bf4.contains(bf1), "Should contain"); + assertTrue(bf4.contains(bf2), "Should contain"); + assertTrue(bf4.contains(bf3), "Should contain"); + assertTrue(bf3.contains(bf4), "Should contain"); + + // test overflow + + final CountingBloomFilter bf5 = createEmptyFilter(getTestShape()); + assertTrue(bf5.add(maximumValueProducer), "Should add to empty"); + assertTrue(bf5.isValid(), "Should be valid"); + + CountingBloomFilter bf6 = bf5.merge(new SimpleBloomFilter(getTestShape(), from1)); + assertFalse(bf6.isValid(), "Should not be valid"); + } + + /** + * Tests that merge correctly updates the counts when a CountingBloomFilter is + * passed. + */ + @Test + public void testAdd() { + final CountingBloomFilter bf1 = createFilter(getTestShape(), from1); + assertTrue(bf1.add(createFilter(getTestShape(), from11)), "Add should work"); + assertTrue(bf1.contains(from1), "Should contain"); + assertTrue(bf1.contains(from11), "Should contain"); + assertCounts(bf1, bigHashCounts); + + // test overflow + + final CountingBloomFilter bf2 = createEmptyFilter(getTestShape()); + assertTrue(bf2.add(maximumValueProducer), "Should add to empty"); + assertTrue(bf2.isValid(), "Should be valid"); + + assertFalse(bf2.add(createFilter(getTestShape(), from1)), "Should not add"); + assertFalse(bf2.isValid(), "Should not be valid"); + } + + /** + * Tests that merge correctly updates the counts when a CountingBloomFilter is + * passed. 
+ */ + @Test + public final void testSubtract() { + final CountingBloomFilter bf1 = createFilter(getTestShape(), from1); + bf1.add(BitCountProducer.from(from11.indices(getTestShape()))); + + final CountingBloomFilter bf2 = createFilter(getTestShape(), from11); + + assertTrue(bf1.subtract(bf2), "Subtract should work"); + assertFalse(bf1.contains(bigHasher), "Should not contain bitHasher"); + assertTrue(bf1.contains(from1), "Should contain from1"); + + assertCounts(bf1, from1Counts); + + // test underflow + final CountingBloomFilter bf3 = createFilter(getTestShape(), from1); + + final CountingBloomFilter bf4 = createFilter(getTestShape(), from11); + + assertFalse(bf3.subtract(bf4), "Subtract should not work"); + assertFalse(bf3.isValid(), "isValid should return false"); + assertFalse(bf3.contains(from1), "Should not contain"); + assertFalse(bf3.contains(bf4), "Should not contain"); + + assertCounts(bf3, new int[] { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 }); + } + + /** + * Tests that merge correctly updates the counts when a CountingBloomFilter is + * passed. 
+ */ + @Test + public final void testRemove() { + final CountingBloomFilter bf1 = createFilter(getTestShape(), from1); + bf1.add(BitCountProducer.from(from11.indices(getTestShape()))); + + assertTrue(bf1.remove(new SimpleBloomFilter(getTestShape(), from11)), "Remove should work"); + assertFalse(bf1.contains(from11), "Should not contain"); + assertTrue(bf1.contains(from1), "Should contain"); + + assertCounts(bf1, from1Counts); + + // with hasher + final CountingBloomFilter bf2 = createFilter(getTestShape(), from1); + bf2.add(BitCountProducer.from(from11.indices(getTestShape()))); + + assertTrue(bf2.remove(from11), "Remove should work"); + assertFalse(bf2.contains(from11), "Should not contain"); + assertTrue(bf2.contains(from1), "Should contain"); + + assertCounts(bf2, from1Counts); + + // test underflow + + final CountingBloomFilter bf3 = createFilter(getTestShape(), from1); + + final BloomFilter bf4 = new SimpleBloomFilter(getTestShape(), from11); + + assertFalse(bf3.remove(bf4), "Subtract should not work"); + assertFalse(bf3.isValid(), "isValid should return false"); + assertFalse(bf3.contains(from1), "Should not contain"); + assertFalse(bf3.contains(bf4), "Should not contain"); + + assertCounts(bf3, new int[] { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }); + } + + @Test + public void testExcludesDuplicates() { + + // create a hasher that produces duplicates with the specified shape. 
+ // this setup produces 5, 17, 29, 41, 53, 65 two times + Shape shape = Shape.fromKM(12, 72); + SimpleHasher hasher = new SimpleHasher(5, 12); + + CountingBloomFilter bf1 = createFilter(shape, hasher); + assertEquals(6, bf1.cardinality()); + bf1.forEachCount((x, y) -> { + assertEquals(1, y, "Hasher in constructor results in value not equal to 1"); + return true; + }); + + bf1 = createEmptyFilter(shape); + bf1.mergeInPlace(hasher); + assertEquals(6, bf1.cardinality()); + bf1.forEachCount((x, y) -> { + assertEquals(1, y, "Hasher in mergeInPlace results in value not equal to 1"); + return true; + }); + + bf1 = createEmptyFilter(shape); + CountingBloomFilter bf2 = bf1.merge(hasher); + assertEquals(6, bf2.cardinality()); + bf2.forEachCount((x, y) -> { + assertEquals(1, y, "Hasher in merge results in value not equal to 1"); + return true; + }); + + bf1 = createFilter(shape, hasher); + bf1.remove(hasher); + assertEquals(0, bf1.cardinality()); + assertTrue(bf1.forEachCount((x, y) -> (false)), "Hasher in removes results in value not equal to 0"); + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractHasherTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractHasherTest.java new file mode 100644 index 0000000000..95b2e59fbf --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractHasherTest.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.HashSet; +import java.util.Set; + +import org.junit.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.CsvSource; + +public abstract class AbstractHasherTest extends AbstractIndexProducerTest { + + protected abstract Hasher createHasher(); + + protected abstract Hasher createEmptyHasher(); + + /** + * A method to get the number of items in a hasher. Mostly applies to + * Collections of hashers. + * @param hasher the hasher to check. + * @return the number of hashers in the hasher + */ + protected abstract int getHasherSize(Hasher hasher); + + /** + * The shape of the Hashers filters for testing. + *
    + *
  • Hash functions (k) = 17 + *
  • Number of bits (m) = 72 + *
+ * @return the testing shape. + */ + protected final Shape getTestShape() { + return Shape.fromKM(17, 72); + } + + @Override + protected IndexProducer createProducer() { + return createHasher().indices(getTestShape()); + } + + @Override + protected IndexProducer createEmptyProducer() { + return createEmptyHasher().indices(getTestShape()); + } + + @ParameterizedTest + @CsvSource({ "17, 72", "3, 14", "5, 67868", }) + public void testHashing(int k, int m) { + int[] count = { 0 }; + Hasher hasher = createHasher(); + hasher.indices(Shape.fromKM(k, m)).forEachIndex(i -> { + assertTrue(i >= 0 && i < m, () -> "Out of range: " + i + ", m=" + m); + count[0]++; + return true; + }); + assertEquals(k * getHasherSize(hasher), count[0], + () -> String.format("Did not produce k=%d * m=%d indices", k, getHasherSize(hasher))); + } + + @Test + public void testUniqueIndex() { + // create a hasher that produces duplicates with the specified shape. + // this setup produces 5, 17, 29, 41, 53, 65 two times + Shape shape = Shape.fromKM(12, 72); + Hasher hasher = new SimpleHasher(5, 12); + Set set = new HashSet<>(); + assertTrue(hasher.uniqueIndices(shape).forEachIndex(set::add), "Duplicate detected"); + assertEquals(6, set.size()); + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractIndexProducerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractIndexProducerTest.java new file mode 100644 index 0000000000..54dc01c7d4 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractIndexProducerTest.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.ArrayList; +import java.util.List; +import java.util.function.IntPredicate; + +import org.junit.jupiter.api.Test; + +public abstract class AbstractIndexProducerTest { + + public static final IntPredicate TRUE_PREDICATE = new IntPredicate() { + + @Override + public boolean test(int arg0) { + return true; + } + }; + + public static final IntPredicate FALSE_PREDICATE = new IntPredicate() { + + @Override + public boolean test(int arg0) { + return false; + } + }; + + /** + * Creates a producer with some data. + * @return a producer with some data + */ + protected abstract IndexProducer createProducer(); + + /** + * Creates an producer without data. + * @return a producer that has no data. 
+ */ + protected abstract IndexProducer createEmptyProducer(); + + @Test + public final void testForEachIndex() { + + IndexProducer populated = createProducer(); + IndexProducer empty = createEmptyProducer(); + assertFalse(populated.forEachIndex(FALSE_PREDICATE), "non-empty should be false"); + + assertTrue(empty.forEachIndex(FALSE_PREDICATE), "empty should be true"); + + assertTrue(populated.forEachIndex(TRUE_PREDICATE), "non-empty should be true"); + assertTrue(empty.forEachIndex(TRUE_PREDICATE), "empty should be true"); + } + + @Test + public final void testAsIndexArray() { + int ary[] = createEmptyProducer().asIndexArray(); + assertEquals(0, ary.length); + + IndexProducer producer = createProducer(); + List lst = new ArrayList(); + for (int i : producer.asIndexArray()) { + lst.add(i); + } + assertTrue(producer.forEachIndex(new IntPredicate() { + + @Override + public boolean test(int value) { + assertTrue(lst.remove(Integer.valueOf(value)), + String.format("Instance of %d was not found in lst", value)); + return true; + } + })); + } + + @Test + public void testForIndexEarlyExit() { + int[] passes = new int[1]; + assertFalse(createProducer().forEachIndex(i -> { + passes[0]++; + return false; + })); + assertEquals(1, passes[0]); + + passes[0] = 0; + assertTrue(createEmptyProducer().forEachIndex(i -> { + passes[0]++; + return false; + })); + assertEquals(0, passes[0]); + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java index a661f93fde..86bd638b73 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java @@ -16,520 +16,37 @@ */ package org.apache.commons.collections4.bloomfilter; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static 
org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertTrue; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; -import java.util.concurrent.ThreadLocalRandom; -import java.util.function.BiPredicate; -import java.util.function.Function; -import java.util.function.ToIntBiFunction; - -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; -import org.junit.jupiter.api.Test; - /** * Tests for the {@link ArrayCountingBloomFilter}. */ -public class ArrayCountingBloomFilterTest extends AbstractBloomFilterTest { - - /** - * Function to convert int arrays to BloomFilters for testing. - */ - private final Function converter = counts -> { - final BloomFilter testingFilter = new BitSetBloomFilter(shape); - testingFilter.merge(new FixedIndexesTestHasher(shape, counts)); - return testingFilter; - }; +public class ArrayCountingBloomFilterTest extends AbstractCountingBloomFilterTest { @Override - protected ArrayCountingBloomFilter createEmptyFilter(final Shape shape) { + protected ArrayCountingBloomFilter createEmptyFilter(Shape shape) { return new ArrayCountingBloomFilter(shape); } @Override - protected ArrayCountingBloomFilter createFilter(final Hasher hasher, final Shape shape) { - final ArrayCountingBloomFilter result = new ArrayCountingBloomFilter(shape); - result.merge( hasher ); - return result; - } - - private ArrayCountingBloomFilter createFromCounts(final int[] counts) { - // Use a dummy filter to add the counts to an empty filter - final CountingBloomFilter dummy = new ArrayCountingBloomFilter(shape) { - @Override - public void forEachCount(final BitCountConsumer action) { - for (int i = 0; i < counts.length; i++) { - action.accept(i, counts[i]); - } - } - }; - final ArrayCountingBloomFilter bf = new ArrayCountingBloomFilter(shape); - bf.add(dummy); - return bf; - } - - /** - * Assert the counts match the expected values. 
Values are for indices starting - * at 0. Assert the cardinality equals the number of non-zero counts. - * - * @param bf the bloom filter - * @param expected the expected counts - */ - private static void assertCounts(final CountingBloomFilter bf, final int[] expected) { - final Map m = new HashMap<>(); - bf.forEachCount(m::put); - int zeros = 0; - for (int i = 0; i < expected.length; i++) { - if (m.get(i) == null) { - assertEquals(expected[i], 0, "Wrong value for " + i); - zeros++; - } else { - assertEquals(expected[i], m.get(i).intValue(), "Wrong value for " + i); - } - } - assertEquals(expected.length - zeros, bf.cardinality()); - } - - /** - * Tests that counts are correct when a hasher with duplicates is used in the - * constructor. - */ - @Test - public void constructorTest_Hasher_Duplicates() { - final int[] expected = {0, 1, 1, 0, 0, 1}; - // Some indexes with duplicates - final Hasher hasher = new FixedIndexesTestHasher(shape, 1, 2, 2, 5); - - final ArrayCountingBloomFilter bf = createFilter(hasher, shape); - final long[] lb = bf.getBits(); - assertEquals(1, lb.length); - assertEquals(0b100110L, lb[0]); - - assertCounts(bf, expected); - } - - /** - * Test the contains function with a standard Bloom filter. - * The contains function is tested using a counting Bloom filter in the parent test class. - */ - @Test - public void contains_BloomFilter() { - // Some indexes with duplicates - final Hasher hasher = new FixedIndexesTestHasher(shape, 1, 2, 5); - final ArrayCountingBloomFilter bf = createFilter(hasher, shape); - BitSetBloomFilter testingFilter = new BitSetBloomFilter(shape); - testingFilter.merge( new FixedIndexesTestHasher(shape, 3, 4)); - assertFalse(bf.contains(testingFilter)); - testingFilter = new BitSetBloomFilter(shape); - testingFilter.merge( new FixedIndexesTestHasher(shape, 2, 5)); - assertTrue(bf.contains(testingFilter)); - } - - /** - * Tests that merge correctly updates the counts when a CountingBloomFilter is - * passed. 
- */ - @Test - public void mergeTest_Counts_CountingBloomFilter() { - assertMerge(counts -> createFilter(new FixedIndexesTestHasher(shape, counts), shape), - BloomFilter::merge); - } - - /** - * Tests that merge correctly updates the counts when a BloomFilter is passed. - */ - @Test - public void mergeTest_Counts_BloomFilter() { - assertMerge(converter, BloomFilter::merge); - } - - /** - * Test that merge correctly updates the counts when a Hasher is passed. - */ - @Test - public void mergeTest_Counts_Hasher() { - assertMerge(counts -> new FixedIndexesTestHasher(shape, counts), - BloomFilter::merge); - } - - /** - * Test that merge correctly updates the counts when a Hasher is passed with duplicates. - */ - @Test - public void mergeTest_Counts_Hasher_Duplicates() { - assertMerge(counts -> new FixedIndexesTestHasher(shape, createDuplicates(counts)), - BloomFilter::merge); - } - - /** - * Tests that remove correctly updates the counts when a CountingBloomFilter is - * passed. - */ - @Test - public void removeTest_Counts_CountingBloomFilter() { - assertRemove(counts -> createFilter(new FixedIndexesTestHasher(shape, counts), shape), - CountingBloomFilter::remove); - } - - /** - * Tests that remove correctly updates the counts when a BloomFilter is passed. - */ - @Test - public void removeTest_Counts_BloomFilter() { - assertRemove(converter, CountingBloomFilter::remove); - } - - /** - * Test that remove correctly updates the counts when a Hasher is passed. - */ - @Test - public void removeTest_Counts_Hasher() { - assertRemove(counts -> new FixedIndexesTestHasher(shape, counts), - CountingBloomFilter::remove); - } - - /** - * Test that remove correctly updates the counts when a Hasher is passed with duplicates. - */ - @Test - public void removeTest_Counts_Hasher_Duplicates() { - assertRemove(counts -> new FixedIndexesTestHasher(shape, createDuplicates(counts)), - CountingBloomFilter::remove); - } - - /** - * Creates duplicates in the counts. 
- * - * @param counts the counts - * @return the new counts - */ - private static int[] createDuplicates(final int[] counts) { - // Duplicate some values randomly - final int length = counts.length; - final int[] countsWithDuplicates = Arrays.copyOf(counts, 2 * length); - for (int i = length; i < countsWithDuplicates.length; i++) { - // Copy a random value from the counts into the end position - countsWithDuplicates[i] = countsWithDuplicates[ThreadLocalRandom.current().nextInt(i)]; - } - return countsWithDuplicates; - } - - /** - * Assert a merge operation. The converter should construct a suitable object - * to remove the indices from the provided Bloom filter with the remove operation. - * - * @param the type of the filter - * @param converter the converter - * @param merge the merge operation - */ - private void assertMerge(final Function converter, - final BiPredicate merge) { - final int[] indexes1 = { 1, 2, 4, 5, 6}; - final int[] indexes2 = { 3, 4, 6}; - final int[] expected = {0, 1, 1, 1, 2, 1, 2}; - assertOperation(indexes1, indexes2, converter, merge, true, expected); - } - - /** - * Assert a remove operation. The converter should construct a suitable object - * to remove the indices from the provided Bloom filter with the remove operation. - * - * @param the type of the filter - * @param converter the converter - * @param remove the remove operation - */ - private void assertRemove(final Function converter, - final BiPredicate remove) { - final int[] indexes1 = { 1, 2, 4, 5, 6}; - final int[] indexes2 = { 2, 5, 6}; - final int[] expected = {0, 1, 0, 0, 1, 0, 0}; - assertOperation(indexes1, indexes2, converter, remove, true, expected); - } - - /** - * Assert a counting operation. The first set of indexes is used to create the - * CountingBloomFilter. The second set of indices is passed to the converter to - * construct a suitable object to combine with the counting Bloom filter. 
The counts - * of the first Bloom filter are checked using the expected counts. - * - *

Counts are assumed to map to indexes starting from 0. - * - * @param the type of the filter - * @param indexes1 the first set of indexes - * @param indexes2 the second set of indexes - * @param converter the converter - * @param operation the operation - * @param isValid the expected value for the operation result - * @param expected the expected counts after the operation - */ - private void assertOperation(final int[] indexes1, final int[] indexes2, - final Function converter, - final BiPredicate operation, - final boolean isValid, final int[] expected) { - final Hasher hasher = new FixedIndexesTestHasher(shape, indexes1); - final ArrayCountingBloomFilter bf = createFilter(hasher, shape); - final F filter = converter.apply(indexes2); - final boolean result = operation.test(bf, filter); - assertEquals(isValid, result); - assertEquals(isValid, bf.isValid()); - assertCounts(bf, expected); - } - - /** - * Tests that merge errors when the counts overflow the maximum integer value. - */ - @Test - public void mergeTest_Overflow() { - final Hasher hasher = new FixedIndexesTestHasher(shape, 1, 2, 3); - final ArrayCountingBloomFilter bf = createFilter(hasher, shape); - - final ArrayCountingBloomFilter bf2 = createFromCounts(new int[] {0, 0, Integer.MAX_VALUE}); - - // Small + 1 = OK - // should not fail as the counts are ignored - assertTrue(bf.merge(bf2)); - assertTrue(bf.isValid()); - assertCounts(bf, new int[] {0, 1, 2, 1}); - - // Big + 1 = Overflow - assertTrue(bf2.isValid()); - assertFalse(bf2.merge(bf)); - assertFalse(bf2.isValid(), "Merge should overflow and the filter is invalid"); - - // The counts are not clipped to max. They have simply overflowed. - // Note that this is a merge and the count is only incremented by 1 - // and not the actual count at each index. So it is not 2 + Integer.MAX_VALUE. 
- assertCounts(bf2, new int[] {0, 1, 1 + Integer.MAX_VALUE, 1}); + protected ArrayCountingBloomFilter createFilter(Shape shape, Hasher hasher) { + return createFilter(shape, hasher.uniqueIndices(shape)); } - /** - * Tests that removal errors when the counts become negative. - */ - @Test - public void removeTest_Negative() { - final Hasher hasher = new FixedIndexesTestHasher(shape, 1, 2, 3); - final ArrayCountingBloomFilter bf = createFilter(hasher, shape); - - final Hasher hasher2 = new FixedIndexesTestHasher(shape, 2); - final ArrayCountingBloomFilter bf2 = createFilter(hasher2, shape); - - // More - Less = OK - bf.remove(bf2); - assertTrue(bf.isValid()); - assertCounts(bf, new int[] {0, 1, 0, 1}); - - // Less - More = Negative - assertTrue(bf2.isValid()); - bf2.remove(bf); - assertFalse(bf2.isValid(), "Remove should create negative counts and the filter is invalid"); - - // The counts are not clipped to zero. They have been left as negative. - assertCounts(bf2, new int[] {0, -1, 1, -1}); + @Override + protected ArrayCountingBloomFilter createFilter(Shape shape, BitMapProducer producer) { + return createFilter(shape, IndexProducer.fromBitMapProducer(producer)); } - /** - * Tests that counts can be added to a new instance. - * - *

Note: This test ensures the CountingBloomFilter - * can be created with whatever counts are required for other tests. - */ - @Test - public void addTest_NewInstance() { - for (final int[] counts : new int[][] { - { /* empty */}, - {0, 0, 1}, - {0, 1, 2}, - {2, 3, 4}, - {66, 77, 0, 99}, - {Integer.MAX_VALUE, 42}, - }) { - assertCounts(createFromCounts(counts), counts); + @Override + protected ArrayCountingBloomFilter createFilter(Shape shape, IndexProducer producer) { + ArrayCountingBloomFilter filter = createEmptyFilter(shape); + try { + filter.add(BitCountProducer.from(producer)); + return filter; + } catch (ArrayIndexOutOfBoundsException e) { + // since ArrayCountingBloomFilter does not have a constructor that takes a + // hasher + // we have to duplicate the expected results here. + throw new IllegalArgumentException(e); } } - - /** - * Test that add correctly ignores an empty CountingBloomFilter. - */ - @Test - public void addTest_Empty() { - assertCountingOperation(new int[] {5, 2, 1}, - new int[0], - CountingBloomFilter::add, - true, - new int[] {5, 2, 1}); - } - - /** - * Test that add correctly updates the counts when a CountingBloomFilter is - * passed. - */ - @Test - public void addTest_Counts() { - assertCountingOperation(new int[] {5, 2, 1}, - new int[] {0, 6, 4, 1}, - CountingBloomFilter::add, - true, - new int[] {5, 8, 5, 1}); - } - - /** - * Test that add correctly updates the isValid state when a CountingBloomFilter is - * passed and an integer overflow occurs. - */ - @Test - public void addTest_Overflow() { - assertCountingOperation(new int[] {5, 2, 1}, - new int[] {0, 6, Integer.MAX_VALUE}, - CountingBloomFilter::add, - false, - new int[] {5, 8, 1 + Integer.MAX_VALUE}); - } - - /** - * Test that subtract correctly ignores an empty CountingBloomFilter. 
- */ - @Test - public void subtractTest_Empty() { - assertCountingOperation(new int[] {5, 2, 1}, - new int[0], - CountingBloomFilter::subtract, - true, - new int[] {5, 2, 1}); - } - - /** - * Test that subtract correctly updates the counts when a CountingBloomFilter is - * passed. - */ - @Test - public void subtractTest_Counts() { - assertCountingOperation(new int[] {5, 9, 1, 1}, - new int[] {0, 2, 1}, - CountingBloomFilter::subtract, - true, - new int[] {5, 7, 0, 1}); - } - - /** - * Test that subtract correctly updates the isValid state when a CountingBloomFilter is - * passed and the counts become negative. - */ - @Test - public void subtractTest_Negative() { - assertCountingOperation(new int[] {5, 2, 1}, - new int[] {0, 6, 1}, - CountingBloomFilter::subtract, - false, - new int[] {5, -4, 0}); - } - - /** - * Assert a counting operation. Two CountingBloomFilters are created from the - * two sets of counts. The operation is applied and the counts of the first - * Bloom filter is checked using the expected counts. - * - *

Counts are assumed to map to indexes starting from 0. - * - * @param counts1 the first set counts - * @param counts2 the first set counts - * @param operation the operation - * @param isValid the expected value for the operation result - * @param expected the expected counts after the operation - */ - private void assertCountingOperation(final int[] counts1, final int[] counts2, - final BiPredicate operation, - final boolean isValid, final int[] expected) { - final ArrayCountingBloomFilter bf1 = createFromCounts(counts1); - final ArrayCountingBloomFilter bf2 = createFromCounts(counts2); - final boolean result = operation.test(bf1, bf2); - assertEquals(isValid, result); - assertEquals(isValid, bf1.isValid()); - assertCounts(bf1, expected); - } - - /** - * Tests that the andCardinality calculation executes correctly when using a - * CountingBloomFilter argument. - */ - @Test - public void andCardinalityTest_CountingBloomFilter() { - assertCardinalityOperation(new int[] {1, 1}, - new int[] {1, 1}, - BloomFilter::andCardinality, - 2); - assertCardinalityOperation(new int[] {0, 1, 0, 1, 1, 1, 0, 1, 0}, - new int[] {1, 1, 0, 0, 0, 1}, - BloomFilter::andCardinality, - 2); - assertCardinalityOperation(new int[] {1, 1}, - new int[] {0, 0, 1, 1, 1}, - BloomFilter::andCardinality, - 0); - } - - /** - * Tests that the orCardinality calculation executes correctly when using a - * CountingBloomFilter argument. - */ - @Test - public void orCardinalityTest_CountingBloomFilter() { - assertCardinalityOperation(new int[] {1, 1}, - new int[] {1, 1}, - BloomFilter::orCardinality, - 2); - assertCardinalityOperation(new int[] {0, 1, 0, 1, 1, 1, 0, 1, 0}, - new int[] {1, 1, 0, 0, 0, 1}, - BloomFilter::orCardinality, - 6); - assertCardinalityOperation(new int[] {1, 1}, - new int[] {0, 0, 1, 1, 1}, - BloomFilter::orCardinality, - 5); - } - - /** - * Tests that the xorCardinality calculation executes correctly when using a - * CountingBloomFilter argument. 
- */ - @Test - public void xorCardinalityTest_CountingBloomFilter() { - assertCardinalityOperation(new int[] {1, 1}, - new int[] {1, 1}, - BloomFilter::xorCardinality, - 0); - assertCardinalityOperation(new int[] {0, 1, 0, 1, 1, 1, 0, 1, 0}, - new int[] {1, 1, 0, 0, 0, 1}, - BloomFilter::xorCardinality, - 4); - assertCardinalityOperation(new int[] {1, 1}, - new int[] {0, 0, 1, 1, 1}, - BloomFilter::xorCardinality, - 5); - } - - /** - * Assert a cardinality operation. Two CountingBloomFilters are created from the - * two sets of counts. The operation is applied and the counts of the first - * Bloom filter is checked using the expected counts. - * - *

Counts are assumed to map to indexes starting from 0. - * - * @param counts1 the first set counts - * @param counts2 the first set counts - * @param operation the operation - * @param expected the expected cardinality - */ - private void assertCardinalityOperation(final int[] counts1, final int[] counts2, - final ToIntBiFunction operation, - final int expected) { - final ArrayCountingBloomFilter bf1 = createFromCounts(counts1); - final ArrayCountingBloomFilter bf2 = createFromCounts(counts2); - assertEquals(expected, operation.applyAsInt(bf1, bf2)); - assertEquals(expected, operation.applyAsInt(bf2, bf1)); - } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayTrackerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayTrackerTest.java new file mode 100644 index 0000000000..d340505406 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayTrackerTest.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.function.IntPredicate; + +import org.apache.commons.collections4.bloomfilter.Hasher.IndexFilter.ArrayTracker; +import org.junit.jupiter.api.Test; + +/** + * Tests the {@link ArrayTracker} class. + */ +public class ArrayTrackerTest { + + @Test + public void testSeen() { + Shape shape = Shape.fromKM(3, 12); + IntPredicate tracker = new ArrayTracker(shape); + + assertTrue(tracker.test(0)); + assertFalse(tracker.test(0)); + assertTrue(tracker.test(1)); + assertFalse(tracker.test(1)); + assertTrue(tracker.test(2)); + assertFalse(tracker.test(2)); + + assertThrows(IndexOutOfBoundsException.class, () -> tracker.test(3)); + assertThrows(IndexOutOfBoundsException.class, () -> tracker.test(-1)); + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerFromArrayCountingBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerFromArrayCountingBloomFilterTest.java new file mode 100644 index 0000000000..59fc4fc4e5 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerFromArrayCountingBloomFilterTest.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +public class BitCountProducerFromArrayCountingBloomFilterTest extends AbstractBitCountProducerTest { + + protected Shape shape = Shape.fromKM(17, 72); + + @Override + protected BitCountProducer createProducer() { + ArrayCountingBloomFilter filter = new ArrayCountingBloomFilter(shape); + Hasher hasher = new SimpleHasher(0, 1); + return filter.merge(hasher); + } + + @Override + protected BitCountProducer createEmptyProducer() { + return new ArrayCountingBloomFilter(shape); + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerFromIndexProducerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerFromIndexProducerTest.java new file mode 100644 index 0000000000..d4c2be603e --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerFromIndexProducerTest.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.Assert.assertEquals; + +import java.util.HashMap; +import java.util.Map; +import org.junit.jupiter.api.Test; + +public class BitCountProducerFromIndexProducerTest extends AbstractBitCountProducerTest { + + @Override + protected BitCountProducer createProducer() { + return BitCountProducer.from(IndexProducer.fromIndexArray(new int[] { 0, 1, 63, 64, 127, 128 })); + } + + @Override + protected BitCountProducer createEmptyProducer() { + return BitCountProducer.from(IndexProducer.fromIndexArray(new int[0])); + } + + @Test + public final void testFromIndexProducer() { + + BitCountProducer producer = createProducer(); + Map<Integer, Integer> m = new HashMap<>(); + + producer.forEachCount((i, v) -> { + m.put(i, v); + return true; + }); + + assertEquals(6, m.size()); + assertEquals(Integer.valueOf(1), m.get(0)); + assertEquals(Integer.valueOf(1), m.get(1)); + assertEquals(Integer.valueOf(1), m.get(63)); + assertEquals(Integer.valueOf(1), m.get(64)); + assertEquals(Integer.valueOf(1), m.get(127)); + assertEquals(Integer.valueOf(1), m.get(128)); + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerFromArrayCountingBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerFromArrayCountingBloomFilterTest.java new file mode 100644 index 0000000000..25cd04ed34 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerFromArrayCountingBloomFilterTest.java @@ -0,0 +1,34 @@ +/* + * Licensed to 
the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +public class BitMapProducerFromArrayCountingBloomFilterTest extends AbstractBitMapProducerTest { + + protected Shape shape = Shape.fromKM(17, 72); + + @Override + protected BitMapProducer createProducer() { + ArrayCountingBloomFilter filter = new ArrayCountingBloomFilter(shape); + Hasher hasher = new SimpleHasher(0, 1); + return filter.merge(hasher); + } + + @Override + protected BitMapProducer createEmptyProducer() { + return new ArrayCountingBloomFilter(shape); + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerFromIndexProducerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerFromIndexProducerTest.java new file mode 100644 index 0000000000..4a9500d435 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerFromIndexProducerTest.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.ArrayList; +import java.util.List; +import java.util.function.IntPredicate; + +import org.junit.jupiter.api.Test; + +public class BitMapProducerFromIndexProducerTest extends AbstractBitMapProducerTest { + + @Override + protected BitMapProducer createProducer() { + IndexProducer iProducer = new IndexProducer() { + + @Override + public boolean forEachIndex(IntPredicate consumer) { + return consumer.test(0) && consumer.test(1) && consumer.test(63) && consumer.test(64) + && consumer.test(127) && consumer.test(128); + } + }; + return BitMapProducer.fromIndexProducer(iProducer, 200); + } + + @Override + protected BitMapProducer createEmptyProducer() { + IndexProducer iProducer = new IndexProducer() { + + @Override + public boolean forEachIndex(IntPredicate consumer) { + return true; + } + }; + return BitMapProducer.fromIndexProducer(iProducer, 200); + } + + @Test + public final void testFromIndexProducer() { + List<Long> lst = new ArrayList<>(); + createProducer().forEachBitMap(lst::add); + long[] buckets = lst.stream().mapToLong(l -> l.longValue()).toArray(); + assertTrue(BitMap.contains(buckets, 0)); + assertTrue(BitMap.contains(buckets, 1)); + assertTrue(BitMap.contains(buckets, 63)); + assertTrue(BitMap.contains(buckets, 64)); + 
assertTrue(BitMap.contains(buckets, 127)); + assertTrue(BitMap.contains(buckets, 128)); + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerFromLongArrayTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerFromLongArrayTest.java new file mode 100644 index 0000000000..77bc6ed455 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerFromLongArrayTest.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.util.ArrayList; +import java.util.List; +import java.util.function.IntPredicate; + +import org.junit.jupiter.api.Test; + +public class BitMapProducerFromLongArrayTest extends AbstractBitMapProducerTest { + + @Override + protected BitMapProducer createProducer() { + long[] ary = new long[] { 1L, 2L, 3L, 4L, 5L }; + return BitMapProducer.fromBitMapArray(ary); + } + + @Override + protected BitMapProducer createEmptyProducer() { + return BitMapProducer.fromBitMapArray(new long[0]); + } + + @Override + protected boolean emptyIsZeroLength() { + return true; + } + + @Test + public void constructorTest() { + List<Long> lst = new ArrayList<>(); + createProducer().forEachBitMap(lst::add); + assertEquals(Long.valueOf(1), lst.get(0)); + assertEquals(Long.valueOf(2), lst.get(1)); + assertEquals(Long.valueOf(3), lst.get(2)); + assertEquals(Long.valueOf(4), lst.get(3)); + assertEquals(Long.valueOf(5), lst.get(4)); + } + + @Test + public void testFromIndexProducer() { + int limit = Integer.SIZE + Long.SIZE; + IndexProducer iProducer = new IndexProducer() { + + @Override + public boolean forEachIndex(IntPredicate consumer) { + for (int i = 0; i < limit; i++) { + if (!consumer.test(i)) { + return false; + } + } + return true; + } + }; + BitMapProducer producer = BitMapProducer.fromIndexProducer(iProducer, limit); + List<Long> lst = new ArrayList<>(); + producer.forEachBitMap(lst::add); + long expected = ~0L; + assertEquals(expected, lst.get(0).longValue()); + expected &= 0XFFFFFFFFL; + assertEquals(expected, lst.get(1).longValue()); + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerFromSimpleBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerFromSimpleBloomFilterTest.java new file mode 100644 index 0000000000..f73b4807bc --- /dev/null +++ 
b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerFromSimpleBloomFilterTest.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +public class BitMapProducerFromSimpleBloomFilterTest extends AbstractBitMapProducerTest { + + protected Shape shape = Shape.fromKM(17, 72); + + @Override + protected BitMapProducer createProducer() { + Hasher hasher = new SimpleHasher(0, 1); + return new SimpleBloomFilter(shape, hasher); + } + + @Override + protected BitMapProducer createEmptyProducer() { + return new SimpleBloomFilter(shape); + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerFromSparseBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerFromSparseBloomFilterTest.java new file mode 100644 index 0000000000..0a6331ce78 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerFromSparseBloomFilterTest.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +public class BitMapProducerFromSparseBloomFilterTest extends AbstractBitMapProducerTest { + + protected Shape shape = Shape.fromKM(17, 72); + + @Override + protected BitMapProducer createProducer() { + Hasher hasher = new SimpleHasher(0, 1); + return new SparseBloomFilter(shape, hasher); + } + + @Override + protected BitMapProducer createEmptyProducer() { + return new SparseBloomFilter(shape); + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapTest.java new file mode 100644 index 0000000000..f68ca7df35 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapTest.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.junit.jupiter.api.Test; + +public class BitMapTest { + + @Test + public final void testGetLongBit() { + assertEquals(1, BitMap.getLongBit(0)); + assertEquals(0x8000000000000000L, BitMap.getLongBit(63)); + assertEquals(1, BitMap.getLongBit(64)); + assertEquals(0x8000000000000000L, BitMap.getLongBit(127)); + assertEquals(1, BitMap.getLongBit(128)); + } + + @Test + public final void testGetLongIndex() { + assertEquals(0, BitMap.getLongIndex(0)); + assertEquals(0, BitMap.getLongIndex(63)); + assertEquals(1, BitMap.getLongIndex(64)); + assertEquals(1, BitMap.getLongIndex(127)); + assertEquals(2, BitMap.getLongIndex(128)); + } + + @Test + public final void testNumberOfBitMaps() { + assertEquals(0, BitMap.numberOfBitMaps(0), "Number of bits 0"); + for (int i = 1; i < 65; i++) { + assertEquals(1, BitMap.numberOfBitMaps(i), String.format("Number of bits %d", i)); + } + for (int i = 65; i < 129; i++) { + assertEquals(2, BitMap.numberOfBitMaps(i), String.format("Number of bits %d", i)); + } + assertEquals(3, BitMap.numberOfBitMaps(129), "Number of bits 129"); + } + + @Test + public final void testSet() { + long[] bitMaps = new long[BitMap.numberOfBitMaps(129)]; + for (int i = 0; i < 129; i++) { + BitMap.set(bitMaps, i); + 
assertTrue(BitMap.contains(bitMaps, i), String.format("Failed at index: %d", i)); + } + assertEquals(0xFFFFFFFFFFFFFFFFL, bitMaps[0]); + assertEquals(0xFFFFFFFFFFFFFFFFL, bitMaps[1]); + assertEquals(1L, bitMaps[2]); + } + + @Test + public final void testContains() { + long[] bitMaps = new long[1]; + + for (int i = 0; i < 64; i++) { + bitMaps[0] = 0L; + BitMap.set(bitMaps, i); + for (int j = 0; j < 64; j++) { + if (j == i) { + assertTrue(BitMap.contains(bitMaps, j), String.format("Failed at index: %d for %d", i, j)); + } else { + assertFalse(BitMap.contains(bitMaps, j), String.format("Failed at index %d for %d", i, j)); + } + } + } + + // test boundary conditions + long[] ary = new long[1]; + + final long[] aryT = ary; + assertThrows(ArrayIndexOutOfBoundsException.class, () -> BitMap.contains(aryT, -1)); + assertFalse(BitMap.contains(ary, 0)); + ary[0] = 0x01; + assertTrue(BitMap.contains(ary, 0)); + + assertFalse(BitMap.contains(ary, 63)); + ary[0] = (1L << 63); + assertTrue(BitMap.contains(ary, 63)); + assertThrows(ArrayIndexOutOfBoundsException.class, () -> BitMap.contains(aryT, 64)); + + ary = new long[2]; + assertFalse(BitMap.contains(ary, 64)); + ary[1] = 1; + assertTrue(BitMap.contains(ary, 64)); + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapTrackerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapTrackerTest.java new file mode 100644 index 0000000000..8d6df5c3de --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapTrackerTest.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.function.IntPredicate; + +import org.apache.commons.collections4.bloomfilter.Hasher.IndexFilter.BitMapTracker; +import org.junit.jupiter.api.Test; + +/** + * Tests that a {@link BitMapTracker} returns {@code true} the first time an index is tested and {@code false} afterwards. + */ +public class BitMapTrackerTest { + + @Test + public void testSeen() { + Shape shape = Shape.fromKM(3, 12); + IntPredicate tracker = new BitMapTracker(shape); + + assertTrue(tracker.test(0)); + assertFalse(tracker.test(0)); + assertTrue(tracker.test(1)); + assertFalse(tracker.test(1)); + assertTrue(tracker.test(2)); + assertFalse(tracker.test(2)); + + assertTrue(tracker.test(4)); + assertFalse(tracker.test(4)); + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BloomFilterIndexerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/BloomFilterIndexerTest.java deleted file mode 100644 index ffd2d0d8c5..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/BloomFilterIndexerTest.java +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter; - -import org.junit.jupiter.api.Test; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertTrue; - -import java.util.ArrayList; -import java.util.Random; -import java.util.concurrent.ThreadLocalRandom; - -import static org.junit.jupiter.api.Assertions.assertThrows; - -/** - * Tests for the {@link BloomFilterIndexer}. - */ -public class BloomFilterIndexerTest { - - @Test - public void testCheckPositiveThrows() { - assertThrows(IndexOutOfBoundsException.class, () -> BloomFilterIndexer.checkPositive(-1)); - } - - @Test - public void testGetLongIndex() { - assertEquals(0, BloomFilterIndexer.getLongIndex(0)); - - for (final int index : getIndexes()) { - // getLongIndex is expected to identify a block of 64-bits (starting from zero) - assertEquals(index / Long.SIZE, BloomFilterIndexer.getLongIndex(index)); - - // Verify the behavior for negatives. It should produce a negative (invalid) - // as a simple trip for incorrect usage. 
- assertTrue(BloomFilterIndexer.getLongIndex(-index) < 0); - - // If index is not zero then when negated this is what a signed shift - // of 6-bits actually does - assertEquals(((1 - index) / Long.SIZE) - 1, - BloomFilterIndexer.getLongIndex(-index)); - } - } - - @Test - public void testGetLongBit() { - assertEquals(1L, BloomFilterIndexer.getLongBit(0)); - - for (final int index : getIndexes()) { - // getLongBit is expected to identify a single bit in a 64-bit block - assertEquals(1L << (index % Long.SIZE), BloomFilterIndexer.getLongBit(index)); - - // Verify the behavior for negatives - assertEquals(1L << (64 - (index & 0x3f)), BloomFilterIndexer.getLongBit(-index)); - } - } - - /** - * Gets non-zero positive indexes for testing. - * - * @return the indices - */ - private static int[] getIndexes() { - final Random rng = ThreadLocalRandom.current(); - final ArrayList indexes = new ArrayList<>(40); - for (int i = 0; i < 10; i++) { - // random positive numbers - indexes.add(rng.nextInt() >>> 1); - indexes.add(rng.nextInt(23647826)); - indexes.add(rng.nextInt(245)); - } - // Quickly remove zeros (as these cannot be negated) - indexes.removeIf(i -> i == 0); - // Add edge cases here - indexes.add(1); - indexes.add(2); - indexes.add(63); - indexes.add(64); - indexes.add(Integer.MAX_VALUE); - return indexes.stream().mapToInt(Integer::intValue).toArray(); - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBitMapProducerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBitMapProducerTest.java new file mode 100644 index 0000000000..e7af149b1b --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBitMapProducerTest.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.function.LongPredicate; + +public class DefaultBitMapProducerTest extends AbstractBitMapProducerTest { + + @Override + protected BitMapProducer createProducer() { + return new DefaultBitMapProducer(new long[] { 1L, 2L }); + } + + @Override + protected BitMapProducer createEmptyProducer() { + return new DefaultBitMapProducer(new long[0]); + } + + @Override + protected boolean emptyIsZeroLength() { + return true; + } + + class DefaultBitMapProducer implements BitMapProducer { + long[] bitMaps; + + DefaultBitMapProducer(long[] bitMaps) { + this.bitMaps = bitMaps; + } + + @Override + public boolean forEachBitMap(LongPredicate predicate) { + for (long bitmap : bitMaps) { + if (!predicate.test(bitmap)) { + return false; + } + } + return true; + } + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBloomFilterMethodsTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBloomFilterMethodsTest.java deleted file mode 100644 index 0d6443355c..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBloomFilterMethodsTest.java +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter; - -import java.util.BitSet; -import java.util.function.IntConsumer; - -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; -import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher; - -/** - * Test all the default implementations of the BloomFilter in {@link AbstractBloomFilter}. - */ -public class DefaultBloomFilterMethodsTest extends AbstractBloomFilterTest { - - /** - * A testing class that implements only the abstract methods from BloomFilter. - * - */ - private static class BF extends AbstractBloomFilter { - - /** - * The bits for this BloomFilter. - */ - private final BitSet bitSet; - - /** - * Constructs a BitSetBloomFilter from a hasher and a shape. - * - * @param hasher the Hasher to use. - * @param shape the desired shape of the filter. - */ - BF(final Hasher hasher, final Shape shape) { - this(shape); - verifyHasher(hasher); - hasher.iterator(shape).forEachRemaining((IntConsumer) bitSet::set); - } - - /** - * Constructs an empty BitSetBloomFilter. - * - * @param shape the desired shape of the filter. 
- */ - BF(final Shape shape) { - super(shape); - this.bitSet = new BitSet(); - } - - @Override - public long[] getBits() { - return bitSet.toLongArray(); - } - - @Override - public StaticHasher getHasher() { - return new StaticHasher(bitSet.stream().iterator(), getShape()); - } - - @Override - public boolean merge(final BloomFilter other) { - verifyShape(other); - bitSet.or(BitSet.valueOf(other.getBits())); - return true; - } - - @Override - public boolean merge(final Hasher hasher) { - verifyHasher(hasher); - hasher.iterator(getShape()).forEachRemaining((IntConsumer) bitSet::set); - return true; - } - } - - @Override - protected AbstractBloomFilter createEmptyFilter(final Shape shape) { - return new BF(shape); - } - - @Override - protected AbstractBloomFilter createFilter(final Hasher hasher, final Shape shape) { - return new BF(hasher, shape); - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBloomFilterTest.java new file mode 100644 index 0000000000..aab0f43b2f --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBloomFilterTest.java @@ -0,0 +1,237 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.TreeSet; +import java.util.function.IntPredicate; +import java.util.function.LongPredicate; + +import org.junit.jupiter.api.Test; + +/** + * Tests for the {@link BloomFilter}. + */ +public class DefaultBloomFilterTest extends AbstractBloomFilterTest { + @Override + protected AbstractDefaultBloomFilter createEmptyFilter(final Shape shape) { + return new SparseDefaultBloomFilter(shape); + } + + @Override + protected AbstractDefaultBloomFilter createFilter(final Shape shape, final Hasher hasher) { + return new SparseDefaultBloomFilter(shape, hasher); + } + + @Override + protected AbstractDefaultBloomFilter createFilter(final Shape shape, final BitMapProducer producer) { + return new SparseDefaultBloomFilter(shape, producer); + } + + @Override + protected AbstractDefaultBloomFilter createFilter(final Shape shape, final IndexProducer producer) { + return new SparseDefaultBloomFilter(shape, producer); + } + + @Test + public void testDefaultBloomFilterSimpleSpecificMergeInPlace() { + AbstractDefaultBloomFilter filter = new SparseDefaultBloomFilter(Shape.fromKM(3, 150)); + Hasher hasher = new SimpleHasher(0, 1); + assertTrue(filter.mergeInPlace(hasher)); + assertEquals(3, filter.cardinality()); + } + + @Test + public void testDefaultBloomFilterSparseSpecificMergeInPlace() { + AbstractDefaultBloomFilter filter = new SparseDefaultBloomFilter(Shape.fromKM(3, 150)); + Hasher hasher = new SimpleHasher(0, 1); + BloomFilter newFilter = filter.merge(hasher); + assertEquals(3, newFilter.cardinality()); + } + + @Test + public void testDefaultBloomFilterSparseSpecificMerge() { + Shape shape = Shape.fromKM(3, 150); + AbstractDefaultBloomFilter filter = new 
SparseDefaultBloomFilter(shape); + AbstractDefaultBloomFilter filter2 = new SparseDefaultBloomFilter(shape, new SimpleHasher(0, 1)); + BloomFilter newFilter = filter.merge(filter2); + assertEquals(3, newFilter.cardinality()); + } + + @Test + public void testHasherBasedMergeInPlaceWithDifferingSparseness() { + Hasher hasher = new SimpleHasher(1, 1); + + BloomFilter bf1 = new NonSparseDefaultBloomFilter(getTestShape()); + bf1.mergeInPlace(hasher); + assertTrue(BitMapProducer.fromIndexProducer(hasher.indices(getTestShape()), getTestShape().getNumberOfBits()) + .forEachBitMapPair(bf1, (x, y) -> x == y)); + + bf1 = new SparseDefaultBloomFilter(getTestShape()); + bf1.mergeInPlace(hasher); + assertTrue(BitMapProducer.fromIndexProducer(hasher.indices(getTestShape()), getTestShape().getNumberOfBits()) + .forEachBitMapPair(bf1, (x, y) -> x == y)); + } + + abstract static class AbstractDefaultBloomFilter implements BloomFilter { + private Shape shape; + protected TreeSet indices; + + AbstractDefaultBloomFilter(Shape shape) { + this.shape = shape; + this.indices = new TreeSet<>(); + } + + AbstractDefaultBloomFilter(Shape shape, Hasher hasher) { + this(shape, hasher.indices(shape)); + } + + AbstractDefaultBloomFilter(Shape shape, BitMapProducer producer) { + this(shape, IndexProducer.fromBitMapProducer(producer)); + } + + AbstractDefaultBloomFilter(Shape shape, IndexProducer producer) { + this(shape); + producer.forEachIndex((i) -> { + indices.add(i); + return true; + }); + if (this.indices.floor(-1) != null || this.indices.ceiling(shape.getNumberOfBits()) != null) { + throw new IllegalArgumentException( + String.format("Filter only accepts values in the [0,%d) range", shape.getNumberOfBits())); + } + } + + @Override + public boolean forEachIndex(IntPredicate consumer) { + for (Integer i : indices) { + if (!consumer.test(i)) { + return false; + } + } + return true; + } + + @Override + public boolean forEachBitMap(LongPredicate consumer) { + return 
BitMapProducer.fromIndexProducer(this, shape.getNumberOfBits()).forEachBitMap(consumer); + } + + @Override + public Shape getShape() { + return shape; + } + + @Override + public boolean contains(IndexProducer indexProducer) { + return indexProducer.forEachIndex((i) -> indices.contains(i)); + } + + @Override + public boolean contains(BitMapProducer bitMapProducer) { + return contains(IndexProducer.fromBitMapProducer(bitMapProducer)); + } + + @Override + public boolean mergeInPlace(BloomFilter other) { + other.forEachIndex((i) -> { + indices.add(i); + return true; + }); + if (!indices.isEmpty()) { + if (indices.last() >= shape.getNumberOfBits()) { + throw new IllegalArgumentException(String.format("Value in list %s is greater than maximum value (%s)", + indices.last(), shape.getNumberOfBits())); + } + if (indices.first() < 0) { + throw new IllegalArgumentException( + String.format("Value in list %s is less than 0", indices.first())); + } + } + return true; + } + + @Override + public int cardinality() { + return indices.size(); + } + } + + static class SparseDefaultBloomFilter extends AbstractDefaultBloomFilter { + + SparseDefaultBloomFilter(Shape shape, BitMapProducer producer) { + super(shape, producer); + } + + SparseDefaultBloomFilter(Shape shape, Hasher hasher) { + super(shape, hasher); + } + + SparseDefaultBloomFilter(Shape shape, IndexProducer producer) { + super(shape, producer); + } + + SparseDefaultBloomFilter(Shape shape) { + super(shape); + } + + @Override + public boolean isSparse() { + return true; + } + + @Override + public AbstractDefaultBloomFilter copy() { + AbstractDefaultBloomFilter result = new SparseDefaultBloomFilter(getShape()); + result.indices.addAll(indices); + return result; + } + } + + static class NonSparseDefaultBloomFilter extends AbstractDefaultBloomFilter { + + NonSparseDefaultBloomFilter(Shape shape, BitMapProducer producer) { + super(shape, producer); + } + + NonSparseDefaultBloomFilter(Shape shape, Hasher hasher) { + super(shape, 
hasher); + } + + NonSparseDefaultBloomFilter(Shape shape, IndexProducer producer) { + super(shape, producer); + } + + NonSparseDefaultBloomFilter(Shape shape) { + super(shape); + } + + @Override + public boolean isSparse() { + return false; + } + + @Override + public AbstractDefaultBloomFilter copy() { + AbstractDefaultBloomFilter result = new NonSparseDefaultBloomFilter(getShape()); // the copy must stay non-sparse so isSparse() is unchanged + result.indices.addAll(indices); + return result; + } + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/FixedIndexesTestHasher.java b/src/test/java/org/apache/commons/collections4/bloomfilter/FixedIndexesTestHasher.java deleted file mode 100644 index ec4886294c..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/FixedIndexesTestHasher.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.commons.collections4.bloomfilter; - -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity; -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; - -import java.util.Arrays; -import java.util.PrimitiveIterator.OfInt; - -/** - * A Hasher implementation to return fixed indexes. Duplicates are allowed. - * The shape is ignored when generating the indexes. - * - *

This is not a real hasher and is used for testing only. - */ -class FixedIndexesTestHasher implements Hasher { - /** The shape. */ - private final Shape shape; - /** The indexes. */ - private final int[] indexes; - - /** - * Create an instance. - * - * @param shape the shape - * @param indexes the indexes - */ - FixedIndexesTestHasher(final Shape shape, final int... indexes) { - this.shape = shape; - this.indexes = indexes; - } - - @Override - public OfInt iterator(final Shape shape) { - if (!this.shape.equals(shape)) { - throw new IllegalArgumentException( - String.format("shape (%s) does not match internal shape (%s)", shape, this.shape)); - } - return Arrays.stream(indexes).iterator(); - } - - @Override - public HashFunctionIdentity getHashFunctionIdentity() { - return shape.getHashFunctionIdentity(); - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/HasherBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/HasherBloomFilterTest.java deleted file mode 100644 index a10df81643..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/HasherBloomFilterTest.java +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.commons.collections4.bloomfilter; - -import static org.junit.jupiter.api.Assertions.assertArrayEquals; -import static org.junit.jupiter.api.Assertions.assertEquals; - -import org.apache.commons.collections4.bloomfilter.hasher.DynamicHasher; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity; -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; -import org.apache.commons.collections4.bloomfilter.hasher.function.MD5Cyclic; -import org.junit.jupiter.api.Test; -import java.nio.charset.StandardCharsets; -import java.util.Arrays; -import java.util.PrimitiveIterator.OfInt; - -/** - * Tests the {@link HasherBloomFilter}. - */ -public class HasherBloomFilterTest extends AbstractBloomFilterTest { - - /** - * Tests that the constructor works correctly. - */ - @Test - public void constructorTest_NonStatic() { - final Shape shape = new Shape(new MD5Cyclic(), 3, 72, 17); - final DynamicHasher hasher = new DynamicHasher.Builder(new MD5Cyclic()).with("Hello", StandardCharsets.UTF_8).build(); - final HasherBloomFilter filter = createFilter(hasher, shape); - final long[] lb = filter.getBits(); - assertEquals(2, lb.length); - assertEquals(0x6203101001888c44L, lb[0]); - assertEquals(0x60L, lb[1]); - } - - @Override - protected AbstractBloomFilter createEmptyFilter(final Shape shape) { - return new HasherBloomFilter(shape); - } - - @Override - protected HasherBloomFilter createFilter(final Hasher hasher, final Shape shape) { - return new HasherBloomFilter(hasher, shape); - } - - /** - * Test the edge case where the filter is empty and the getBits() function returns a - * zero length array. 
- */ - @Test - public void getBitsTest_Empty() { - final BloomFilter filter = createEmptyFilter(shape); - assertArrayEquals(new long[0], filter.getBits()); - } - - /** - * Test the edge case where the filter has only 1 bit in the lowest index and the getBits() - * function returns an array of length 1. - */ - @Test - public void getBitsTest_LowestBitOnly() { - final BloomFilter filter = createEmptyFilter(shape); - // Set the lowest bit index only. - filter.merge(new Hasher() { - @Override - public OfInt iterator(final Shape shape) { - return Arrays.stream(new int[] {0}).iterator(); - } - - @Override - public HashFunctionIdentity getHashFunctionIdentity() { - return shape.getHashFunctionIdentity(); - } - }); - assertArrayEquals(new long[] {1L}, filter.getBits()); - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/HasherCollectionTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/HasherCollectionTest.java new file mode 100644 index 0000000000..419196ab7e --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/HasherCollectionTest.java @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +/** + * Tests the {@link HasherCollection}. + */ +public class HasherCollectionTest extends AbstractHasherTest { + + @Override + protected HasherCollection createHasher() { + return new HasherCollection(new SimpleHasher(1, 1), new SimpleHasher(2, 2)); + } + + @Override + protected HasherCollection createEmptyHasher() { + return new HasherCollection(); + } + + @Override + protected int getHasherSize(Hasher hasher) { + return ((HasherCollection) hasher).getHashers().size(); + } + + protected void nestedTest(HasherCollectionTest nestedTest) { + nestedTest.testAsIndexArray(); + nestedTest.testForEachIndex(); + nestedTest.testAdd(); + } + + @Test + public void testCollectionConstructor() { + List lst = Arrays.asList(new SimpleHasher(3, 2), new SimpleHasher(4, 2)); + HasherCollectionTest nestedTest = new HasherCollectionTest() { + @Override + protected HasherCollection createHasher() { + return new HasherCollection(lst); + } + + @Override + protected HasherCollection createEmptyHasher() { + return new HasherCollection(); + } + }; + nestedTest(nestedTest); + + nestedTest = new HasherCollectionTest() { + @Override + protected HasherCollection createHasher() { + return new HasherCollection(new SimpleHasher(3, 2), new SimpleHasher(4, 2)); + } + + @Override + protected HasherCollection createEmptyHasher() { + return new HasherCollection(); + } + }; + nestedTest(nestedTest); + } + + @Test + public void testAdd() { + HasherCollection hasher = createHasher(); + hasher.add(new SimpleHasher(2, 2)); + assertEquals(3, hasher.getHashers().size()); + + hasher.add(Arrays.asList(new SimpleHasher(3, 2), new SimpleHasher(4, 2))); + assertEquals(5, 
hasher.getHashers().size()); + } + + @Override + public void testUniqueIndex() { + // create a hasher that produces duplicates with the specified shape. + // this setup produces 5, 17, 29, 41, 53, 65 two times + Shape shape = Shape.fromKM(12, 72); + Hasher h1 = new SimpleHasher(5, 12); + HasherCollection hasher = createEmptyHasher(); + hasher.add(h1); + hasher.add(h1); + List lst = new ArrayList<>(); + for (int i : new int[] { 5, 17, 29, 41, 53, 65 }) { + lst.add(i); + lst.add(i); + } + + assertTrue(hasher.uniqueIndices(shape).forEachIndex(i -> { + return lst.remove(Integer.valueOf(i)); + }), "unable to remove value"); + assertEquals(0, lst.size()); + } + + @Test + void testHasherCollection() { + Hasher h1 = new SimpleHasher(13, 4678); + Hasher h2 = new SimpleHasher(42, 987); + Hasher h3 = new SimpleHasher(454, 2342); + + HasherCollection hc1 = new HasherCollection(Arrays.asList(h1, h1)); + HasherCollection hc2 = new HasherCollection(Arrays.asList(h2, h3)); + HasherCollection hc3 = new HasherCollection(Arrays.asList(hc1, hc2)); + + ArrayCountingBloomFilter bf = new ArrayCountingBloomFilter(Shape.fromKM(5, 10000)); + + // Should add h1, h1, h2, h3 + Assertions.assertTrue(bf.mergeInPlace(hc3)); + Assertions.assertTrue(bf.remove(h1)); + Assertions.assertTrue(bf.remove(h1)); + Assertions.assertNotEquals(0, bf.cardinality()); + Assertions.assertTrue(bf.remove(hc2)); + Assertions.assertEquals(0, bf.cardinality()); + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexFilterTest.java index 5aa6c94bef..80c93c1448 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexFilterTest.java @@ -16,76 +16,123 @@ */ package org.apache.commons.collections4.bloomfilter; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentityImpl; -import 
org.apache.commons.collections4.bloomfilter.hasher.Shape; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity.ProcessType; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity.Signedness; -import org.junit.jupiter.api.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Set; -import java.util.function.IntConsumer; -import java.util.stream.Collectors; - -import static org.junit.jupiter.api.Assertions.assertAll; +import static org.junit.jupiter.api.Assertions.assertArrayEquals; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; +import java.lang.reflect.Field; +import java.util.ArrayList; +import java.util.BitSet; +import java.util.List; +import java.util.SplittableRandom; +import java.util.concurrent.ThreadLocalRandom; + +import org.apache.commons.collections4.bloomfilter.Hasher.IndexFilter; +import org.apache.commons.collections4.bloomfilter.Hasher.IndexFilter.ArrayTracker; +import org.apache.commons.collections4.bloomfilter.Hasher.IndexFilter.BitMapTracker; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.CsvSource; + /** - * Tests for the {@link IndexFilters}. + * Tests the {@link IndexFilter}. */ public class IndexFilterTest { - /** - * The shape of the dummy Bloom filter. - * This is used as an argument to a Hasher that just returns fixed indexes - * so the parameters do not matter. 
- */ - private final Shape shape = new Shape(new HashFunctionIdentityImpl( - "Apache Commons Collections", "Dummy", Signedness.SIGNED, ProcessType.CYCLIC, 0L), - 50, 3000, 4); - @Test - public void testApplyThrowsWithNullArguments() { - final FixedIndexesTestHasher hasher = new FixedIndexesTestHasher(shape, 1, 2, 3); - final Shape shape = this.shape; - final ArrayList actual = new ArrayList<>(); - final IntConsumer consumer = actual::add; - assertAll( - () -> assertThrows(NullPointerException.class, () -> IndexFilters.distinctIndexes(null, shape, consumer), "null hasher"), - () -> assertThrows(NullPointerException.class, () -> IndexFilters.distinctIndexes(hasher, null, consumer), "null shape"), - () -> assertThrows(NullPointerException.class, () -> IndexFilters.distinctIndexes(hasher, shape, null), "null consumer") - ); - - // All OK together - IndexFilters.distinctIndexes(hasher, shape, consumer); + public void testFiltering() { + Shape shape = Shape.fromKM(3, 12); + List consumer = new ArrayList(); + IndexFilter filter = IndexFilter.create(shape, consumer::add); + + for (int i = 0; i < 12; i++) { + assertTrue(filter.test(i)); + } + assertEquals(12, consumer.size()); + + for (int i = 0; i < 12; i++) { + assertTrue(filter.test(i)); + } + assertEquals(12, consumer.size()); } - @Test - public void testApply() { - assertFilter(1, 4, 6, 7, 9); + @ParameterizedTest + @CsvSource({ "1, 64", "2, 64", "3, 64", "7, 357", "7, 17", }) + void testFilter(int k, int m) { + Shape shape = Shape.fromKM(k, m); + BitSet used = new BitSet(m); + for (int n = 0; n < 10; n++) { + used.clear(); + List consumer = new ArrayList<>(); + IndexFilter filter = IndexFilter.create(shape, consumer::add); + + // Make random indices; these may be duplicates + long seed = ThreadLocalRandom.current().nextLong(); + SplittableRandom rng = new SplittableRandom(seed); + for (int i = Math.min(k, m / 2); i-- > 0;) { + int bit = rng.nextInt(m); + // duplicates should not alter the list size + int newSize = 
consumer.size() + (used.get(bit) ? 0 : 1); + assertTrue(filter.test(bit)); + assertEquals(newSize, consumer.size(), () -> String.format("Bad filter. Seed=%d, bit=%d", seed, bit)); + used.set(bit); + } + + // The list should have unique entries + assertArrayEquals(used.stream().toArray(), consumer.stream().mapToInt(i -> (int) i).sorted().toArray()); + final int size = consumer.size(); + + // Second observations do not change the list size + used.stream().forEach(bit -> { + assertTrue(filter.test(bit)); + assertEquals(size, consumer.size(), () -> String.format("Bad filter. Seed=%d, bit=%d", seed, bit)); + }); + + assertThrows(IndexOutOfBoundsException.class, () -> filter.test(m)); + assertThrows(IndexOutOfBoundsException.class, () -> filter.test(-1)); + } } @Test - public void testApplyWithDuplicates() { - assertFilter(1, 4, 4, 6, 7, 7, 7, 7, 7, 9); - } + public void testConstructor() + throws IllegalArgumentException, IllegalAccessException, NoSuchFieldException, SecurityException { + Field tracker = IndexFilter.class.getDeclaredField("tracker"); + tracker.setAccessible(true); + List consumer = new ArrayList(); - private void assertFilter(final int... indexes) { - final FixedIndexesTestHasher hasher = new FixedIndexesTestHasher(shape, indexes); - final Set expected = Arrays.stream(indexes).boxed().collect(Collectors.toSet()); - final ArrayList actual = new ArrayList<>(); + // test even split + int k = 2; + int m = Long.SIZE; + Shape shape = Shape.fromKM(k, m); + IndexFilter filter = IndexFilter.create(shape, consumer::add); + assertTrue(tracker.get(filter) instanceof ArrayTracker); - IndexFilters.distinctIndexes(hasher, shape, actual::add); + // test k ints < longs for m + k = 1; + shape = Shape.fromKM(k, m); + filter = IndexFilter.create(shape, consumer::add); + assertTrue(tracker.get(filter) instanceof ArrayTracker); - assertEquals(expected.size(), actual.size()); - // Check the array has all the values. 
- // We do not currently check the order of indexes from the - // hasher.iterator() function. - for (final Integer index : actual) { - assertTrue(expected.contains(index)); - } + // test k ints > longs for m + k = 3; + shape = Shape.fromKM(k, m); + filter = IndexFilter.create(shape, consumer::add); + assertTrue(tracker.get(filter) instanceof BitMapTracker); + + /* test overflows */ + shape = Shape.fromKM(2, Integer.MAX_VALUE); + filter = IndexFilter.create(shape, consumer::add); + assertTrue(tracker.get(filter) instanceof ArrayTracker); + + // overflow when computing the storage of the int array + shape = Shape.fromKM(Integer.MAX_VALUE, 123); + filter = IndexFilter.create(shape, consumer::add); + // *** fails *** + assertTrue(tracker.get(filter) instanceof BitMapTracker); + + shape = Shape.fromKM(Integer.MAX_VALUE, Integer.MAX_VALUE); + filter = IndexFilter.create(shape, consumer::add); + assertTrue(tracker.get(filter) instanceof BitMapTracker); } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromArrayCountingBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromArrayCountingBloomFilterTest.java new file mode 100644 index 0000000000..7e9941d017 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromArrayCountingBloomFilterTest.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +public class IndexProducerFromArrayCountingBloomFilterTest extends AbstractIndexProducerTest { + + protected Shape shape = Shape.fromKM(17, 72); + + @Override + protected IndexProducer createProducer() { + ArrayCountingBloomFilter filter = new ArrayCountingBloomFilter(shape); + Hasher hasher = new SimpleHasher(0, 1); + return filter.merge(hasher); + } + + @Override + protected IndexProducer createEmptyProducer() { + return new ArrayCountingBloomFilter(shape); + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromBitmapProducerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromBitmapProducerTest.java new file mode 100644 index 0000000000..a208d3a2f9 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromBitmapProducerTest.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.Assert.assertEquals; + +import java.util.ArrayList; +import java.util.List; +import java.util.function.LongPredicate; + +import org.junit.jupiter.api.Test; + +public class IndexProducerFromBitmapProducerTest extends AbstractIndexProducerTest { + + @Override + protected IndexProducer createEmptyProducer() { + TestingBitMapProducer producer = new TestingBitMapProducer(new long[0]); + return IndexProducer.fromBitMapProducer(producer); + } + + @Override + protected IndexProducer createProducer() { + /* Creates an index producer that produces the values: + * 0, 65, 128, and 129 + @formatter:off + Index2 Index1 Index0 + bit 128 64 0 + | | | + 1L => | | ...0001 + 2L => | ...0010 + 3L => ...0011 + @formatter:on + */ + TestingBitMapProducer producer = new TestingBitMapProducer(new long[] { 1L, 2L, 3L }); + return IndexProducer.fromBitMapProducer(producer); + } + + @Test + public final void testFromBitMapProducerTest() { + IndexProducer underTest = createProducer(); + List lst = new ArrayList<>(); + + underTest.forEachIndex(lst::add); + assertEquals(4, lst.size()); + assertEquals(Integer.valueOf(0), lst.get(0)); + assertEquals(Integer.valueOf(1 + 64), lst.get(1)); + assertEquals(Integer.valueOf(0 + 128), lst.get(2)); + assertEquals(Integer.valueOf(1 + 128), lst.get(3)); + + BitMapProducer producer = new TestingBitMapProducer(new long[] { 0xFFFFFFFFFFFFFFFFL }); + underTest = IndexProducer.fromBitMapProducer(producer); + lst = new ArrayList<>(); + + 
underTest.forEachIndex(lst::add); + + assertEquals(64, lst.size()); + for (int i = 0; i < 64; i++) { + assertEquals(Integer.valueOf(i), lst.get(i)); + } + } + + private class TestingBitMapProducer implements BitMapProducer { + long[] values; + + TestingBitMapProducer(long[] values) { + this.values = values; + } + + @Override + public boolean forEachBitMap(LongPredicate consumer) { + for (long l : values) { + if (!consumer.test(l)) { + return false; + } + } + return true; + } + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromHasherCollectionTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromHasherCollectionTest.java new file mode 100644 index 0000000000..d7e61d796d --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromHasherCollectionTest.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.collections4.bloomfilter; + +public class IndexProducerFromHasherCollectionTest extends AbstractIndexProducerTest { + + @Override + protected IndexProducer createProducer() { + return new HasherCollection(new SimpleHasher(0, 1), new SimpleHasher(0, 2)).indices(Shape.fromKM(17, 72)); + } + + @Override + protected IndexProducer createEmptyProducer() { + return new HasherCollection().indices(Shape.fromKM(17, 72)); + } +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/package-info.java b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromHasherTest.java similarity index 66% rename from src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/package-info.java rename to src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromHasherTest.java index 95951ad7fe..c089b4b420 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/package-info.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromHasherTest.java @@ -14,11 +14,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +package org.apache.commons.collections4.bloomfilter; -/** - * Provides implementations of the Bloom filter - * {@link org.apache.commons.collections4.bloomfilter.hasher.HashFunction HashFunction} interface. 
- * - * @since 4.5 - */ -package org.apache.commons.collections4.bloomfilter.hasher.function; +public class IndexProducerFromHasherTest extends AbstractIndexProducerTest { + + @Override + protected IndexProducer createProducer() { + return new SimpleHasher(0, 1).indices(Shape.fromKM(17, 72)); + } + + @Override + protected IndexProducer createEmptyProducer() { + return NullHasher.INSTANCE.indices(Shape.fromKM(17, 72)); + } +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromIntArrayTest.java similarity index 66% rename from src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java rename to src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromIntArrayTest.java index b73675ed28..4755d24a85 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromIntArrayTest.java @@ -14,12 +14,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +package org.apache.commons.collections4.bloomfilter; -/** - * Provides classes and interfaces to define the shape of a Bloom filter and the conversion - * of generic bytes to a hash of bit indexes to be used with a Bloom filter. 
- * - * @since 4.5 - */ -package org.apache.commons.collections4.bloomfilter.hasher; +public class IndexProducerFromIntArrayTest extends AbstractIndexProducerTest { + + @Override + protected IndexProducer createEmptyProducer() { + return IndexProducer.fromIndexArray(new int[0]); + } + @Override + protected IndexProducer createProducer() { + return IndexProducer.fromIndexArray(new int[] { 1, 2, 3, 4, 5 }); + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSimpleBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSimpleBloomFilterTest.java new file mode 100644 index 0000000000..8525428671 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSimpleBloomFilterTest.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.collections4.bloomfilter; + +public class IndexProducerFromSimpleBloomFilterTest extends AbstractIndexProducerTest { + + protected Shape shape = Shape.fromKM(17, 72); + + @Override + protected IndexProducer createProducer() { + Hasher hasher = new SimpleHasher(0, 1); + return new SparseBloomFilter(shape, hasher); + } + + @Override + protected IndexProducer createEmptyProducer() { + return new SparseBloomFilter(shape); + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSparseBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSparseBloomFilterTest.java new file mode 100644 index 0000000000..4204c90fe7 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSparseBloomFilterTest.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.collections4.bloomfilter; + +public class IndexProducerFromSparseBloomFilterTest extends AbstractIndexProducerTest { + + protected Shape shape = Shape.fromKM(17, 72); + + @Override + protected IndexProducer createProducer() { + Hasher hasher = new SimpleHasher(0, 1); + return new SimpleBloomFilter(shape, hasher); + } + + @Override + protected IndexProducer createEmptyProducer() { + return new SimpleBloomFilter(shape); + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerTest.java new file mode 100644 index 0000000000..fc11df6391 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerTest.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.util.ArrayList; +import java.util.List; +import java.util.function.LongPredicate; + +import org.junit.jupiter.api.Test; + +public class IndexProducerTest { + + @Test + public void fromBitMapProducerTest() { + TestingBitMapProducer producer = new TestingBitMapProducer(new long[] { 1L, 2L, 3L }); + IndexProducer underTest = IndexProducer.fromBitMapProducer(producer); + List lst = new ArrayList<>(); + + underTest.forEachIndex(lst::add); + assertEquals(4, lst.size()); + assertEquals(Integer.valueOf(0), lst.get(0)); + assertEquals(Integer.valueOf(1 + 64), lst.get(1)); + assertEquals(Integer.valueOf(0 + 128), lst.get(2)); + assertEquals(Integer.valueOf(1 + 128), lst.get(3)); + + producer = new TestingBitMapProducer(new long[] { 0xFFFFFFFFFFFFFFFFL }); + underTest = IndexProducer.fromBitMapProducer(producer); + lst = new ArrayList<>(); + + underTest.forEachIndex(lst::add); + + assertEquals(64, lst.size()); + for (int i = 0; i < 64; i++) { + assertEquals(Integer.valueOf(i), lst.get(i)); + } + } + + private class TestingBitMapProducer implements BitMapProducer { + long[] values; + + TestingBitMapProducer(long[] values) { + this.values = values; + } + + @Override + public boolean forEachBitMap(LongPredicate consumer) { + for (long l : values) { + if (!consumer.test(l)) { + return false; + } + } + return true; + } + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/NullHasher.java b/src/test/java/org/apache/commons/collections4/bloomfilter/NullHasher.java new file mode 100644 index 0000000000..537c60d285 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/NullHasher.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.Objects; +import java.util.function.IntPredicate; + +/** + * A Hasher that returns no values. + * + * @since 4.5 + */ +public final class NullHasher implements Hasher { + + /** + * The instance of the Null Hasher. + */ + public static final NullHasher INSTANCE = new NullHasher(); + + private static final IndexProducer PRODUCER = new IndexProducer() { + @Override + public boolean forEachIndex(IntPredicate consumer) { + return true; + } + }; + + private NullHasher() { + } + + @Override + public IndexProducer indices(final Shape shape) { + Objects.requireNonNull(shape, "shape"); + return PRODUCER; + } + + @Override + public IndexProducer uniqueIndices(Shape shape) { + return PRODUCER; + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java index 45a9943e85..9d7659d1fa 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java @@ -17,14 +17,7 @@ package org.apache.commons.collections4.bloomfilter; import static org.junit.jupiter.api.Assertions.assertEquals; 
-import static org.junit.jupiter.api.Assertions.assertThrows; - -import java.util.List; -import java.util.Arrays; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity; -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; -import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher; + import org.junit.jupiter.api.Test; /** @@ -32,319 +25,274 @@ */ public class SetOperationsTest { - private final HashFunctionIdentity testFunction = new HashFunctionIdentity() { - - @Override - public String getName() { - return "Test Function"; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.CYCLIC; - } - - @Override - public String getProvider() { - return "Apache Commons Collection Tests"; - } - - @Override - public long getSignature() { - return 0; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } - }; - - private final Shape shape = new Shape(testFunction, 3, 72, 17); - - @Test - public void testDifferentShapesThrows() { - final List lst = Arrays.asList(1, 2); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter filter1 = new HasherBloomFilter(hasher, shape); - - final Shape shape2 = new Shape(testFunction, 3, 72, 18); - final List lst2 = Arrays.asList(2, 3); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), shape2); - final BloomFilter filter2 = new HasherBloomFilter(hasher2, shape2); - - assertThrows(IllegalArgumentException.class, () -> SetOperations.cosineDistance(filter1, filter2)); - } + protected final SimpleHasher from1 = new SimpleHasher(1, 1); + protected final long from1Value = 0x3FFFEL; + protected final SimpleHasher from11 = new SimpleHasher(11, 1); + protected final long from11Value = 0xFFFF800L; + protected final HasherCollection bigHasher = new HasherCollection(from1, from11); + protected final long bigHashValue = 0xFFFFFFEL; + private final 
Shape shape = Shape.fromKM(17, 72); /** * Tests that the Cosine similarity is correctly calculated. */ @Test - public final void cosineDistanceTest() { - List lst = Arrays.asList(1, 2); - Hasher hasher = new StaticHasher(lst.iterator(), shape); - BloomFilter filter1 = new HasherBloomFilter(hasher, shape); - - List lst2 = Arrays.asList(2, 3); - Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); - BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); - - assertEquals(0.5, SetOperations.cosineDistance(filter1, filter2), 0.0001); - assertEquals(0.5, SetOperations.cosineDistance(filter2, filter1), 0.0001); - - lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - hasher = new StaticHasher(lst.iterator(), shape); - filter1 = new HasherBloomFilter(hasher, shape); - - lst2 = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - hasher2 = new StaticHasher(lst2.iterator(), shape); - filter2 = new HasherBloomFilter(hasher2, shape); - - assertEquals(0.0, SetOperations.cosineDistance(filter1, filter2), 0.0001); - assertEquals(0.0, SetOperations.cosineDistance(filter2, filter1), 0.0001); - - lst2 = Arrays.asList(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25); - hasher2 = new StaticHasher(lst2.iterator(), shape); - filter2 = new HasherBloomFilter(hasher2, shape); - - assertEquals(0.514928749927334, SetOperations.cosineDistance(filter1, filter2), 0.000000000000001); - assertEquals(0.514928749927334, SetOperations.cosineDistance(filter2, filter1), 0.000000000000001); - } - - /** - * Tests that the Cosine distance is correctly calculated when one or - * both filters are empty - */ - @Test - public final void cosineDistanceTest_NoValues() { - final BloomFilter filter1 = new HasherBloomFilter(shape); - final BloomFilter filter2 = new HasherBloomFilter(shape); - // build a filter - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new 
StaticHasher(lst.iterator(), shape); - final BloomFilter filter3 = new HasherBloomFilter(hasher, shape); - - assertEquals(1.0, SetOperations.cosineDistance(filter1, filter2), 0.0001); - assertEquals(1.0, SetOperations.cosineDistance(filter2, filter1), 0.0001); - assertEquals(1.0, SetOperations.cosineDistance(filter1, filter3), 0.0001); - assertEquals(1.0, SetOperations.cosineDistance(filter3, filter1), 0.0001); + public final void testCosineDistance() { + + BloomFilter filter1 = new SimpleBloomFilter(shape, from1); + BloomFilter filter2 = new SimpleBloomFilter(shape, from1); + + // identical filters should have no distance. + double expected = 0; + assertEquals(expected, SetOperations.cosineDistance(filter1, filter2)); + assertEquals(expected, SetOperations.cosineDistance(filter2, filter1)); + + Shape shape2 = Shape.fromKM(2, 72); + filter1 = new SimpleBloomFilter(shape2, from1); + filter2 = new SimpleBloomFilter(shape2, new SimpleHasher(2, 1)); + + int dotProduct = /* [1,2] & [2,3] = [2] = */ 1; + int cardinalityA = 2; + int cardinalityB = 2; + expected = 1 - (dotProduct / Math.sqrt(cardinalityA * cardinalityB)); + assertEquals(expected, SetOperations.cosineDistance(filter1, filter2)); + assertEquals(expected, SetOperations.cosineDistance(filter2, filter1)); + + filter1 = new SimpleBloomFilter(shape, from1); + filter2 = new SimpleBloomFilter(shape, from11); + dotProduct = /* [1..17] & [11..27] = [11..17] = */ 7; + cardinalityA = 17; + cardinalityB = 17; + expected = 1 - (dotProduct / Math.sqrt(cardinalityA * cardinalityB)); + assertEquals(expected, SetOperations.cosineDistance(filter1, filter2)); + assertEquals(expected, SetOperations.cosineDistance(filter2, filter1)); + + // test with no values + filter1 = new SimpleBloomFilter(shape, from1); + filter2 = new SimpleBloomFilter(shape); + BloomFilter filter3 = new SimpleBloomFilter(shape); + + dotProduct = /* [1,2] & [] = [] = */ 0; + cardinalityA = 2; + cardinalityB = 0; + expected = /* 1 - (dotProduct/Math.sqrt(
cardinalityA * cardinalityB )) = */ 1.0; + assertEquals(expected, SetOperations.cosineDistance(filter1, filter2)); + assertEquals(expected, SetOperations.cosineDistance(filter2, filter1)); + + dotProduct = /* [] & [] = [] = */ 0; + cardinalityA = 0; + cardinalityB = 0; + expected = /* 1 - (dotProduct/Math.sqrt( cardinalityA * cardinalityB )) = */ 1.0; + assertEquals(1.0, SetOperations.cosineDistance(filter2, filter3)); + assertEquals(1.0, SetOperations.cosineDistance(filter3, filter2)); } /** * Tests that the Cosine similarity is correctly calculated. */ @Test - public final void cosineSimilarityTest() { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter filter1 = new HasherBloomFilter(hasher, shape); - - List lst2 = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); - BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); - - assertEquals(1.0, SetOperations.cosineSimilarity(filter1, filter2), 0.0001); - assertEquals(1.0, SetOperations.cosineSimilarity(filter2, filter1), 0.0001); - - lst2 = Arrays.asList(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25); - hasher2 = new StaticHasher(lst2.iterator(), shape); - filter2 = new HasherBloomFilter(hasher2, shape); - - assertEquals(0.485071250072666, SetOperations.cosineSimilarity(filter1, filter2), 0.000000000000001); - assertEquals(0.485071250072666, SetOperations.cosineSimilarity(filter2, filter1), 0.000000000000001); - } - - /** - * Tests that the Cosine similarity is correctly calculated when one or - * both filters are empty - */ - @Test - public final void cosineSimilarityTest_NoValues() { - final BloomFilter filter1 = new HasherBloomFilter(shape); - final BloomFilter filter2 = new HasherBloomFilter(shape); + public final void testCosineSimilarity() { + BloomFilter filter1 = new 
SimpleBloomFilter(shape, from1); + BloomFilter filter2 = new SimpleBloomFilter(shape, from1); + + int dotProduct = /* [1..17] & [1..17] = [1..17] = */ 17; + int cardinalityA = 17; + int cardinalityB = 17; + double expected = /* dotProduct/Sqrt( cardinalityA * cardinalityB ) = */ 1.0; + assertEquals(expected, SetOperations.cosineSimilarity(filter1, filter2)); + assertEquals(expected, SetOperations.cosineSimilarity(filter2, filter1)); + + dotProduct = /* [1..17] & [11..27] = [11..17] = */ 7; + cardinalityA = 17; + cardinalityB = 17; + expected = dotProduct / Math.sqrt(cardinalityA * cardinalityB); + filter2 = new SimpleBloomFilter(shape, from11); + assertEquals(expected, SetOperations.cosineSimilarity(filter1, filter2)); + assertEquals(expected, SetOperations.cosineSimilarity(filter2, filter1)); + + // test no values + filter1 = new SimpleBloomFilter(shape); + filter2 = new SimpleBloomFilter(shape); // build a filter - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter filter3 = new HasherBloomFilter(hasher, shape); - - assertEquals(0.0, SetOperations.cosineSimilarity(filter1, filter2), 0.0001); - assertEquals(0.0, SetOperations.cosineSimilarity(filter2, filter1), 0.0001); - assertEquals(0.0, SetOperations.cosineSimilarity(filter1, filter3), 0.0001); - assertEquals(0.0, SetOperations.cosineSimilarity(filter3, filter1), 0.0001); - } + BloomFilter filter3 = new SimpleBloomFilter(shape, from1); - /** - * Tests that the intersection size estimate is correctly calculated. 
- */ - @Test - public final void estimateIntersectionSizeTest() { - // build a filter - List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter filter1 = new HasherBloomFilter(hasher, shape); - - lst = Arrays.asList(8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, - 31, 32, 33, 34, 35, 36, 37, 38, 39, 40); - final Hasher hasher2 = new StaticHasher(lst.iterator(), shape); - final BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); - - final long estimate = SetOperations.estimateIntersectionSize(filter1, filter2); - assertEquals(1, estimate); + assertEquals(0.0, SetOperations.cosineSimilarity(filter1, filter2)); + assertEquals(0.0, SetOperations.cosineSimilarity(filter2, filter1)); + assertEquals(0.0, SetOperations.cosineSimilarity(filter1, filter3)); + assertEquals(0.0, SetOperations.cosineSimilarity(filter3, filter1)); } /** - * Tests that the size estimate is correctly calculated. + * Tests that the Hamming distance is correctly calculated. */ @Test - public final void estimateSizeTest() { - // build a filter - List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - Hasher hasher = new StaticHasher(lst.iterator(), shape); - BloomFilter filter1 = new HasherBloomFilter(hasher, shape); - assertEquals(1, SetOperations.estimateSize(filter1)); - - // the data provided above do not generate an estimate that is equivalent to the - // actual. 
- lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20); - hasher = new StaticHasher(lst.iterator(), shape); - filter1 = new HasherBloomFilter(hasher, shape); - assertEquals(1, SetOperations.estimateSize(filter1)); - - lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, - 26, 27, 28, 29, 30, 31, 32, 33); - final Hasher hasher2 = new StaticHasher(lst.iterator(), shape); - final BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); - - assertEquals(3, SetOperations.estimateSize(filter2)); + public final void testHammingDistance() { + final BloomFilter filter1 = new SimpleBloomFilter(shape, from1); + BloomFilter filter2 = new SimpleBloomFilter(shape, from1); + + int hammingDistance = /* [1..17] ^ [1..17] = [] = */ 0; + assertEquals(hammingDistance, SetOperations.hammingDistance(filter1, filter2)); + assertEquals(hammingDistance, SetOperations.hammingDistance(filter2, filter1)); + + filter2 = new SimpleBloomFilter(shape, from11); + hammingDistance = /* [1..17] ^ [11..27] = [1..10][18..27] = */ 20; + assertEquals(hammingDistance, SetOperations.hammingDistance(filter1, filter2)); + assertEquals(hammingDistance, SetOperations.hammingDistance(filter2, filter1)); } /** - * Tests that the union size estimate is correctly calculated. + * Tests that the Jaccard distance is correctly calculated.
*/ @Test - public final void estimateUnionSizeTest() { - // build a filter - List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter filter1 = new HasherBloomFilter(hasher, shape); - - lst = Arrays.asList(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, - 40); - final Hasher hasher2 = new StaticHasher(lst.iterator(), shape); - final BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); - - final long estimate = SetOperations.estimateUnionSize(filter1, filter2); - assertEquals(3, estimate); + public final void testJaccardDistance() { + BloomFilter filter1 = new SimpleBloomFilter(shape, from1); + BloomFilter filter2 = new SimpleBloomFilter(shape, from1); + + // 1 - jaccardSimilarity -- see testJaccardSimilarity + + assertEquals(0.0, SetOperations.jaccardDistance(filter1, filter2)); + assertEquals(0.0, SetOperations.jaccardDistance(filter2, filter1)); + + filter2 = new SimpleBloomFilter(shape, from11); + double intersection = /* [1..17] & [11..27] = [11..17] = */ 7.0; + int union = /* [1..17] | [11..27] = [1..27] = */ 27; + assertEquals(1 - (intersection / union), SetOperations.jaccardDistance(filter1, filter2)); + assertEquals(1 - (intersection / union), SetOperations.jaccardDistance(filter2, filter1)); + + // test no values + filter1 = new SimpleBloomFilter(shape); + filter2 = new SimpleBloomFilter(shape); + BloomFilter filter3 = new SimpleBloomFilter(shape, from1); + + // 1 - jaccardSimilarity -- see testJaccardSimilarity + assertEquals(1.0, SetOperations.jaccardDistance(filter1, filter2)); + assertEquals(1.0, SetOperations.jaccardDistance(filter2, filter1)); + assertEquals(1.0, SetOperations.jaccardDistance(filter1, filter3)); + assertEquals(1.0, SetOperations.jaccardDistance(filter3, filter1)); } /** - * Tests that the Hamming distance is correctly calculated.
+ * Tests that the Jaccard similarity is correctly calculated. */ @Test - public final void hammingDistanceTest() { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter filter1 = new HasherBloomFilter(hasher, shape); - - List lst2 = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); - BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); - - assertEquals(0, SetOperations.hammingDistance(filter1, filter2)); - assertEquals(0, SetOperations.hammingDistance(filter2, filter1)); - - lst2 = Arrays.asList(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25); - hasher2 = new StaticHasher(lst2.iterator(), shape); - filter2 = new HasherBloomFilter(hasher2, shape); - - assertEquals(17, SetOperations.hammingDistance(filter1, filter2)); - assertEquals(17, SetOperations.hammingDistance(filter2, filter1)); + public final void testJaccardSimilarity() { + BloomFilter filter1 = new SimpleBloomFilter(shape, from1); + BloomFilter filter2 = new SimpleBloomFilter(shape, from1); + + double intersection = /* [1..17] & [1..17] = [1..17] = */ 17.0; + int union = /* [1..17] | [1..17] = [1..17] = */ 17; + + assertEquals(intersection / union, SetOperations.jaccardSimilarity(filter1, filter2)); + assertEquals(intersection / union, SetOperations.jaccardSimilarity(filter2, filter1)); + + filter2 = new SimpleBloomFilter(shape, from11); + intersection = /* [1..17] & [11..27] = [11..17] = */ 7.0; + union = /* [1..17] | [11..27] = [1..27] = */ 27; + assertEquals(intersection / union, SetOperations.jaccardSimilarity(filter1, filter2)); + assertEquals(intersection / union, SetOperations.jaccardSimilarity(filter2, filter1)); + + // test no values + filter1 = new SimpleBloomFilter(shape); + filter2 = new SimpleBloomFilter(shape); + BloomFilter filter3 = new SimpleBloomFilter(shape, 
from1); + + assertEquals(0.0, SetOperations.jaccardSimilarity(filter1, filter2)); + assertEquals(0.0, SetOperations.jaccardSimilarity(filter2, filter1)); + + intersection = /* [] & [1..17] = [] = */ 0.0; + union = /* [] | [1..17] = [1..17] = */ 17; + assertEquals(intersection / union, SetOperations.jaccardSimilarity(filter1, filter3)); + assertEquals(intersection / union, SetOperations.jaccardSimilarity(filter3, filter1)); } - /** - * Tests that the Jaccard distance is correctly calculated. - */ @Test - public final void jaccardDistanceTest() { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter filter1 = new HasherBloomFilter(hasher, shape); - - List lst2 = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); - BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); - - assertEquals(1.0, SetOperations.jaccardDistance(filter1, filter2), 0.0001); - assertEquals(1.0, SetOperations.jaccardDistance(filter2, filter1), 0.0001); - - lst2 = Arrays.asList(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25); - hasher2 = new StaticHasher(lst2.iterator(), shape); - filter2 = new HasherBloomFilter(hasher2, shape); - - assertEquals(0.32, SetOperations.jaccardDistance(filter1, filter2), 0.001); - assertEquals(0.32, SetOperations.jaccardDistance(filter2, filter1), 0.001); + public final void testOrCardinality() { + Shape shape = Shape.fromKM(3, 128); + SparseBloomFilter filter1 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 1, 63, 64 })); + SparseBloomFilter filter2 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 64, 69 })); + assertEquals(5, SetOperations.orCardinality(filter1, filter2)); + assertEquals(5, SetOperations.orCardinality(filter2, filter1)); + + filter1 = new SparseBloomFilter(shape,
IndexProducer.fromIndexArray(new int[] { 1, 63 })); + filter2 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 64, 69 })); + assertEquals(5, SetOperations.orCardinality(filter1, filter2)); + assertEquals(5, SetOperations.orCardinality(filter2, filter1)); + + filter1 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 63 })); + filter2 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 64, 69 })); + assertEquals(4, SetOperations.orCardinality(filter1, filter2)); + assertEquals(4, SetOperations.orCardinality(filter2, filter1)); + + Shape bigShape = Shape.fromKM(3, 192); + filter1 = new SparseBloomFilter(bigShape, IndexProducer.fromIndexArray(new int[] { 1, 63, 185})); + filter2 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 63, 69 })); + assertEquals(5, SetOperations.orCardinality(filter1, filter2)); + assertEquals(5, SetOperations.orCardinality(filter2, filter1)); } - /** - * Tests that the Jaccard distance is correctly calculated when one or - * both filters are empty - */ @Test - public final void jaccardDistanceTest_NoValues() { - final BloomFilter filter1 = new HasherBloomFilter(shape); - final BloomFilter filter2 = new HasherBloomFilter(shape); - // build a filter - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter filter3 = new HasherBloomFilter(hasher, shape); - - assertEquals(1.0, SetOperations.jaccardDistance(filter1, filter2), 0.0001); - assertEquals(1.0, SetOperations.jaccardDistance(filter2, filter1), 0.0001); - assertEquals(0.0, SetOperations.jaccardDistance(filter1, filter3), 0.0001); - assertEquals(0.0, SetOperations.jaccardDistance(filter3, filter1), 0.0001); + public final void testAndCardinality() { + Shape shape = Shape.fromKM(3, 128); + SparseBloomFilter filter1 = new SparseBloomFilter(shape, 
IndexProducer.fromIndexArray(new int[] { 1, 63, 64 })); + SparseBloomFilter filter2 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 64, 69 })); + assertEquals(1, SetOperations.andCardinality(filter1, filter2)); + assertEquals(1, SetOperations.andCardinality(filter2, filter1)); + + filter1 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 1, 63 })); + filter2 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 64, 69 })); + assertEquals(0, SetOperations.andCardinality(filter1, filter2)); + assertEquals(0, SetOperations.andCardinality(filter2, filter1)); + + filter1 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 63 })); + filter2 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 64, 69 })); + assertEquals(1, SetOperations.andCardinality(filter1, filter2)); + assertEquals(1, SetOperations.andCardinality(filter2, filter1)); + + Shape bigShape = Shape.fromKM(3, 192); + filter1 = new SparseBloomFilter(bigShape, IndexProducer.fromIndexArray(new int[] { 1, 63, 185})); + filter2 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 63, 69 })); + assertEquals(1, SetOperations.andCardinality(filter1, filter2)); + assertEquals(1, SetOperations.andCardinality(filter2, filter1)); } - /** - * Tests that the Jaccard similarity is correctly calculated. 
- */ @Test - public final void jaccardSimilarityTest() { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter filter1 = new HasherBloomFilter(hasher, shape); - - List lst2 = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); - BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); - - assertEquals(0.0, SetOperations.jaccardSimilarity(filter1, filter2), 0.0001); - assertEquals(0.0, SetOperations.jaccardSimilarity(filter2, filter1), 0.0001); - - lst2 = Arrays.asList(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25); - hasher2 = new StaticHasher(lst2.iterator(), shape); - filter2 = new HasherBloomFilter(hasher2, shape); - - assertEquals(0.68, SetOperations.jaccardSimilarity(filter1, filter2), 0.001); - assertEquals(0.68, SetOperations.jaccardSimilarity(filter2, filter1), 0.001); + public final void testXorCardinality() { + Shape shape = Shape.fromKM(3, 128); + SparseBloomFilter filter1 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 1, 63, 64 })); + SparseBloomFilter filter2 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 64, 69 })); + assertEquals(4, SetOperations.xorCardinality(filter1, filter2)); + assertEquals(4, SetOperations.xorCardinality(filter2, filter1)); + + filter1 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 1, 63 })); + filter2 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 64, 69 })); + assertEquals(5, SetOperations.xorCardinality(filter1, filter2)); + assertEquals(5, SetOperations.xorCardinality(filter2, filter1)); + + filter1 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 63 })); + filter2 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 64, 69 })); + assertEquals(3, 
SetOperations.xorCardinality(filter1, filter2)); + assertEquals(3, SetOperations.xorCardinality(filter2, filter1)); + + Shape bigShape = Shape.fromKM(3, 192); + filter1 = new SparseBloomFilter(bigShape, IndexProducer.fromIndexArray(new int[] { 1, 63, 185})); + filter2 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 63, 69 })); + assertEquals(4, SetOperations.xorCardinality(filter1, filter2)); + assertEquals(4, SetOperations.xorCardinality(filter2, filter1)); } - /** - * Tests that the Jaccard similarity is correctly calculated when one or - * both filters are empty - */ + @Test - public final void jaccardSimilarityTest_NoValues() { - final BloomFilter filter1 = new HasherBloomFilter(shape); - final BloomFilter filter2 = new HasherBloomFilter(shape); - // build a filter - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter filter3 = new HasherBloomFilter(hasher, shape); - - assertEquals(0.0, SetOperations.jaccardSimilarity(filter1, filter2), 0.0001); - assertEquals(0.0, SetOperations.jaccardSimilarity(filter2, filter1), 0.0001); - assertEquals(1.0, SetOperations.jaccardSimilarity(filter1, filter3), 0.0001); - assertEquals(1.0, SetOperations.jaccardSimilarity(filter3, filter1), 0.0001); + public final void testCommutativityOnMismatchedSizes() { + BitMapProducer p1 = BitMapProducer.fromBitMapArray(new long[] { 0x3L, 0x5L }); + BitMapProducer p2 = BitMapProducer.fromBitMapArray(new long[] { 0x1L }); + + assertEquals(SetOperations.orCardinality(p1, p2), SetOperations.orCardinality(p2, p1)); + assertEquals(SetOperations.xorCardinality(p1, p2), SetOperations.xorCardinality(p2, p1)); + assertEquals(SetOperations.andCardinality(p1, p2), SetOperations.andCardinality(p2, p1)); + assertEquals(SetOperations.hammingDistance(p1, p2), SetOperations.hammingDistance(p2, p1)); + assertEquals(SetOperations.cosineDistance(p1, p2), 
SetOperations.cosineDistance(p2, p1)); + assertEquals(SetOperations.cosineSimilarity(p1, p2), SetOperations.cosineSimilarity(p2, p1)); + assertEquals(SetOperations.jaccardDistance(p1, p2), SetOperations.jaccardDistance(p2, p1)); + assertEquals(SetOperations.jaccardSimilarity(p1, p2), SetOperations.jaccardSimilarity(p2, p1)); } - } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeTest.java new file mode 100644 index 0000000000..ecafa2ef42 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeTest.java @@ -0,0 +1,273 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotEquals; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +/** + * Tests the {@link Shape} class. 
+ */ +public class ShapeTest { + + /* + * values from https://hur.st/bloomfilter/?n=5&p=.1&m=&k= + * + * n = 5 + * + * p = 0.100375138 (1 in 10) + * + * m = 24 (3B) + * + * k = 3 + */ + + private final Shape shape = Shape.fromKM(3, 24); + + /** + * Test equality of shape. + */ + @Test + public void testEquals() { + + assertEquals(shape, shape); + assertEquals(3, shape.getNumberOfHashFunctions()); + assertEquals(24, shape.getNumberOfBits()); + assertEquals(shape.hashCode(), Shape.fromKM(3, 24).hashCode()); + assertNotEquals(shape, null); + assertNotEquals(shape, Shape.fromKM(3, 25)); + assertNotEquals(shape, Shape.fromKM(4, 24)); + assertNotEquals(shape, "text"); + assertNotEquals(shape, Integer.valueOf(3)); + } + + @Test + public void testEstimateN() { + for (int i = 0; i < 24; i++) { + double c = i; + double expected = -(24.0 / 3.0) * Math.log1p(-c / 24.0); + assertEquals(expected, shape.estimateN(i), "Error on " + i); + } + + assertEquals(Double.POSITIVE_INFINITY, shape.estimateN(24)); + + assertEquals(Double.NaN, shape.estimateN(25)); + } + + @Test + public void testGetProbability() { + for (int i = 0; i <= 24; i++) { + double expected = Math.pow(-Math.expm1(-3.0 * i / 24), 3); + assertEquals(expected, shape.getProbability(i), "error at " + i); + } + + assertEquals(0.0, shape.getProbability(0), 0.0); + + assertThrows(IllegalArgumentException.class, () -> shape.getProbability(-1)); + } + + @Test + public void testIsSparse() { + int functions = 1; // Ignored + for (int i = 1; i <= 3; i++) { + int bits = i * Long.SIZE; + Shape shape = Shape.fromKM(functions, bits); + for (int n = 0; n <= bits; n++) { + final int c = n; + // is sparse when number of bits stored as integers is less than 2 times the + // number of bitmaps + Assertions.assertEquals(n * Integer.SIZE <= Math.ceil((double) bits / Long.SIZE) * Long.SIZE, + shape.isSparse(n), () -> String.format("n=%d : bits=%d", c, bits)); + } + } + } + + @Test + public void testToString() { + assertEquals("Shape[k=3 
m=5]", Shape.fromKM(3, 5).toString()); + } + + /* + * values from https://hur.st/bloomfilter/?n=5&p=.1&m=&k= + * + * n = 5 + * + * p = 0.100375138 (1 in 10) + * + * m = 24 (3B) + * + * k = 3 + */ + + /** + * Tests that if the number of items is less than 1 an IllegalArgumentException is thrown. + */ + @Test + public void testBadNumberOfItems() { + assertThrows(IllegalArgumentException.class, () -> Shape.fromNM(0, 24)); + assertThrows(IllegalArgumentException.class, () -> Shape.fromNMK(0, 24, 5)); + assertThrows(IllegalArgumentException.class, () -> Shape.fromNP(0, 0.02)); + } + + /** + * Tests that if the number of bits is less than 1 an IllegalArgumentException is thrown. + */ + @Test + public void testBadNumberOfBits() { + assertThrows(IllegalArgumentException.class, () -> Shape.fromKM(5, 0)); + assertThrows(IllegalArgumentException.class, () -> Shape.fromNM(5, 0)); + assertThrows(IllegalArgumentException.class, () -> Shape.fromNMK(5, 0, 7)); + assertThrows(IllegalArgumentException.class, () -> Shape.fromPMK(0.035, 0, 7)); + } + + /** + * Tests that if the number of hash functions is less than 1 an exception is thrown.
+ */ + @Test + public void testBadNumberOfHashFunctions() { + assertThrows(IllegalArgumentException.class, () -> Shape.fromKM(0, 7)); + assertThrows(IllegalArgumentException.class, () -> Shape.fromNMK(5, 26, 0)); + assertThrows(IllegalArgumentException.class, () -> Shape.fromPMK(0.35, 26, 0)); + assertThrows(IllegalArgumentException.class, () -> Shape.fromNM(2, 1)); + } + + /** + * Tests that if the supplied or calculated probability is not in the open interval (0, 1) an IllegalArgumentException is thrown. + */ + @Test + public void testBadProbability() { + assertThrows(IllegalArgumentException.class, () -> Shape.fromNMK(4000, 8, 1)); + assertThrows(IllegalArgumentException.class, () -> Shape.fromNP(10, 0.0)); + assertThrows(IllegalArgumentException.class, () -> Shape.fromNP(10, 1.0)); + assertThrows(IllegalArgumentException.class, () -> Shape.fromNP(10, Double.NaN)); + assertThrows(IllegalArgumentException.class, () -> Shape.fromNP(10, Double.POSITIVE_INFINITY)); + assertThrows(IllegalArgumentException.class, () -> Shape.fromNP(10, Double.NEGATIVE_INFINITY)); + } + + /** + * Tests that when the number of items, number of bits and number of hash functions are passed the values are + * calculated correctly. + */ + @Test + public void testFromNMK() { + /* + * values from https://hur.st/bloomfilter/?n=5&m=24&k=4 + */ + Shape shape = Shape.fromNMK(5, 24, 4); + + assertEquals(24, shape.getNumberOfBits()); + assertEquals(4, shape.getNumberOfHashFunctions()); + assertEquals(0.102194782, shape.getProbability(5), 0.000001); + + assertThrows(IllegalArgumentException.class, + () -> Shape.fromNMK(Integer.MAX_VALUE, Integer.MAX_VALUE, Integer.MAX_VALUE)); + assertThrows(IllegalArgumentException.class, () -> Shape.fromNMK(5, 5, 0)); + assertThrows(IllegalArgumentException.class, () -> Shape.fromNMK(5, 0, 5)); + assertThrows(IllegalArgumentException.class, () -> Shape.fromNMK(0, 5, 5)); + } + + /** + * Tests that if the number of hash functions or the number of bits is less than 1 an IllegalArgumentException is thrown.
+ */ + @Test + public void testFromKM() { + assertThrows(IllegalArgumentException.class, () -> Shape.fromKM(5, 0)); + assertThrows(IllegalArgumentException.class, () -> Shape.fromKM(0, 5)); + } + + /** + * Tests that when the number of items and number of bits are passed the other values are calculated correctly. + */ + @Test + public void testFromNM() { + /* + * values from https://hur.st/bloomfilter/?n=5&m=24 + */ + Shape shape = Shape.fromNM(5, 24); + + assertEquals(24, shape.getNumberOfBits()); + assertEquals(3, shape.getNumberOfHashFunctions()); + assertEquals(0.100375138, shape.getProbability(5), 0.000001); + + assertThrows(IllegalArgumentException.class, () -> Shape.fromNM(5, 0)); + assertThrows(IllegalArgumentException.class, () -> Shape.fromNM(0, 5)); + } + + /** + * Tests that the probability is calculated correctly. + */ + @Test + public void testProbability() { + Shape shape = Shape.fromNMK(5, 24, 3); + assertEquals(24, shape.getNumberOfBits()); + assertEquals(3, shape.getNumberOfHashFunctions()); + assertEquals(0.100375138, shape.getProbability(5), 0.000001); + } + + /** + * Tests the calculated values of calling the constructor with the probability, number of bits and number of hash + * functions.
+ */ + @Test + public void testFromPMK() { + /* + * values from https://hur.st/bloomfilter/?n=5&p=.1&m=24&k=3 + */ + Shape shape = Shape.fromPMK(0.1, 24, 3); + + assertEquals(24, shape.getNumberOfBits()); + assertEquals(3, shape.getNumberOfHashFunctions()); + assertEquals(0.100375138, shape.getProbability(5), 0.000001); + + assertThrows(IllegalArgumentException.class, + () -> Shape.fromPMK(Math.nextDown(1.0), Integer.MAX_VALUE, Integer.MAX_VALUE)); + shape = Shape.fromPMK(Math.nextUp(0.0), 5, 5); + assertEquals(1.0, shape.getProbability(Integer.MAX_VALUE)); + assertThrows(IllegalArgumentException.class, () -> Shape.fromPMK(Math.nextDown(1.0), 5, 5)); + assertThrows(IllegalArgumentException.class, () -> Shape.fromPMK(0.0, 5, 5)); + assertThrows(IllegalArgumentException.class, () -> Shape.fromPMK(0.5, 0, 5)); + assertThrows(IllegalArgumentException.class, () -> Shape.fromPMK(0.5, 5, 0)); + } + + /** + * Tests the calculated values of calling the constructor with the number of items and the + * probability. + */ + @Test + public void testFromNP() { + /* + * values from https://hur.st/bloomfilter/ for n = 10 and p = 1/2000000 + */ + final double probability = 1.0 / 2000000; + Shape shape = Shape.fromNP(10, probability); + + assertEquals(302, shape.getNumberOfBits()); + assertEquals(21, shape.getNumberOfHashFunctions()); + + assertThrows(IllegalArgumentException.class, () -> Shape.fromNP(Integer.MAX_VALUE, Math.nextDown(1.0))); + assertThrows(IllegalArgumentException.class, () -> Shape.fromNP(0, probability)); + assertThrows(IllegalArgumentException.class, () -> Shape.fromNP(5, 0.0)); + assertThrows(IllegalArgumentException.class, () -> Shape.fromNP(Integer.MAX_VALUE, Math.nextUp(0.0))); + // Test that if calculated number of bits is greater than Integer.MAX_VALUE an + // IllegalArgumentException is thrown.
+ assertThrows(IllegalArgumentException.class, () -> Shape.fromNP(Integer.MAX_VALUE, 0.1)); + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilterTest.java new file mode 100644 index 0000000000..c5c91f4748 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilterTest.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import org.junit.jupiter.api.Test; + +/** + * Tests for the {@link SimpleBloomFilter}. 
+ */ +public class SimpleBloomFilterTest extends AbstractBloomFilterTest { + @Override + protected SimpleBloomFilter createEmptyFilter(final Shape shape) { + return new SimpleBloomFilter(shape); + } + + @Override + protected SimpleBloomFilter createFilter(final Shape shape, final Hasher hasher) { + return new SimpleBloomFilter(shape, hasher); + } + + @Override + protected SimpleBloomFilter createFilter(final Shape shape, final BitMapProducer producer) { + return new SimpleBloomFilter(shape, producer); + } + + @Override + protected SimpleBloomFilter createFilter(final Shape shape, final IndexProducer producer) { + return new SimpleBloomFilter(shape, producer); + } + + private void executeNestedTest(SimpleBloomFilterTest nestedTest) { + nestedTest.testAsBitMapArray(); + nestedTest.testContains(); + nestedTest.testEstimateIntersection(); + nestedTest.testEstimateN(); + nestedTest.testEstimateUnion(); + nestedTest.testIsFull(); + nestedTest.testMerge(); + nestedTest.testMergeInPlace(); + } + + @Test + public void testConstructors() { + + // // copy of Sparse + SimpleBloomFilterTest nestedTest = new SimpleBloomFilterTest() { + + @Override + protected SimpleBloomFilter createEmptyFilter(Shape shape) { + return new SimpleBloomFilter(new SparseBloomFilter(shape)); + } + + @Override + protected SimpleBloomFilter createFilter(Shape shape, Hasher hasher) { + return new SimpleBloomFilter(new SparseBloomFilter(shape, hasher)); + } + }; + executeNestedTest(nestedTest); + + // copy of Simple + nestedTest = new SimpleBloomFilterTest() { + + @Override + protected SimpleBloomFilter createEmptyFilter(Shape shape) { + return new SimpleBloomFilter(new SimpleBloomFilter(shape)); + } + + @Override + protected SimpleBloomFilter createFilter(Shape shape, Hasher hasher) { + return new SimpleBloomFilter(new SimpleBloomFilter(shape, hasher)); + } + }; + executeNestedTest(nestedTest); + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleHasherTest.java 
b/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleHasherTest.java new file mode 100644 index 0000000000..cb52bf80a8 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleHasherTest.java @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; +import java.util.ArrayList; +import java.util.List; +import org.junit.jupiter.api.Test; + +/** + * Tests the {@link SimpleHasher}. 
+ */ +public class SimpleHasherTest extends AbstractHasherTest { + + @Override + protected Hasher createHasher() { + return new SimpleHasher(1, 1); + } + + @Override + protected Hasher createEmptyHasher() { + return NullHasher.INSTANCE; + } + + @Override + protected int getHasherSize(Hasher hasher) { + return 1; + } + + private void assertConstructorBuffer(Shape shape, byte[] buffer, Integer[] expected) { + SimpleHasher hasher = new SimpleHasher(buffer); + List lst = new ArrayList<>(); + IndexProducer producer = hasher.indices(shape); + producer.forEachIndex(lst::add); + assertEquals(expected.length, lst.size()); + for (int i = 0; i < expected.length; i++) { + assertEquals(expected[i], lst.get(i)); + } + } + + private void assertIncrement(SimpleHasher hasher, long defaultIncrement) { + assertEquals(defaultIncrement, hasher.getDefaultIncrement()); + int[] values = hasher.indices(Shape.fromKM(2, Integer.MAX_VALUE)).asIndexArray(); + assertEquals(0, values[0]); + assertEquals(Long.remainderUnsigned(defaultIncrement, Integer.MAX_VALUE), values[1]); + } + + @Test + public void testConstructor() { + Shape shape = Shape.fromKM(5, 10); + assertConstructorBuffer(shape, new byte[] { 1, 1 }, new Integer[] { 1, 2, 3, 4, 5 }); + assertConstructorBuffer(shape, new byte[] { 1 }, new Integer[] { 0, 1, 2, 3, 4 }); + assertConstructorBuffer(shape, new byte[] { 1, 0, 1 }, new Integer[] { 1, 2, 3, 4, 5 }); + assertConstructorBuffer(shape, new byte[] { 0, 1, 0, 1 }, new Integer[] { 1, 2, 3, 4, 5 }); + assertConstructorBuffer(shape, new byte[] { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1 }, + new Integer[] { 1, 2, 3, 4, 5 }); + assertConstructorBuffer(shape, new byte[] { 0, 0, 0, 0, 0, 0, 0, 1, 5, 5, 0, 0, 0, 0, 0, 0, 0, 1, 5, 5 }, + new Integer[] { 1, 2, 3, 4, 5 }); + assertConstructorBuffer(shape, new byte[] { 0, 0, 0, 0, 0, 0, 0, 1, 5, 0, 0, 0, 0, 0, 0, 0, 1, 5, 5 }, + new Integer[] { 1, 2, 3, 4, 5 }); + + // test empty buffer + assertThrows(IllegalArgumentException.class, () -> 
new SimpleHasher(new byte[0])); + + // test zero incrementer gets default + // default increment from SimpleHasher. + long defaultIncrement = 0x9e3779b97f4a7c15L; + SimpleHasher hasher = new SimpleHasher(0, 0); + assertIncrement(new SimpleHasher(0, 0), defaultIncrement); + assertIncrement(new SimpleHasher(new byte[2]), defaultIncrement); + + // test that changing default increment works + defaultIncrement = 4; + defaultIncrement = 4L; + hasher = new SimpleHasher(0, 0) { + @Override + public long getDefaultIncrement() { + return 4L; + } + }; + assertIncrement(hasher, defaultIncrement); + hasher = new SimpleHasher(new byte[2]) { + @Override + public long getDefaultIncrement() { + return 4L; + } + }; + + assertEquals(defaultIncrement, hasher.getDefaultIncrement()); + } + + @Test + void testModEdgeCases() { + for (long dividend : new long[] { -1, -2, -3, -6378683, -23567468136887892L, Long.MIN_VALUE, 345, 678686, + 67868768686878924L, Long.MAX_VALUE }) { + for (int divisor : new int[] { 1, 2, 3, 5, 13, Integer.MAX_VALUE }) { + assertEquals((int) Long.remainderUnsigned(dividend, divisor), SimpleHasher.mod(dividend, divisor), + () -> String.format("failure with dividend=%s and divisor=%s.", dividend, divisor)); + } + } + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilterTest.java new file mode 100644 index 0000000000..2b2ba0fb5a --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilterTest.java @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.junit.jupiter.api.Test; + +/** + * Tests for the {@link SparseBloomFilter}. + */ +public class SparseBloomFilterTest extends AbstractBloomFilterTest { + @Override + protected SparseBloomFilter createEmptyFilter(final Shape shape) { + return new SparseBloomFilter(shape); + } + + @Override + protected SparseBloomFilter createFilter(final Shape shape, final Hasher hasher) { + return new SparseBloomFilter(shape, hasher); + } + + @Override + protected SparseBloomFilter createFilter(final Shape shape, final BitMapProducer producer) { + return new SparseBloomFilter(shape, producer); + } + + @Override + protected SparseBloomFilter createFilter(final Shape shape, final IndexProducer producer) { + return new SparseBloomFilter(shape, producer); + } + + private void executeNestedTest(SparseBloomFilterTest nestedTest) { + nestedTest.testContains(); + nestedTest.testEstimateIntersection(); + nestedTest.testEstimateN(); + nestedTest.testEstimateUnion(); + nestedTest.testIsFull(); + nestedTest.testMerge(); + nestedTest.testMergeInPlace(); + } + + @Test + public void testConstructors() { + + // copy of Sparse + SparseBloomFilterTest nestedTest = new SparseBloomFilterTest() { + + @Override + protected SparseBloomFilter createEmptyFilter(Shape shape) { + return new SparseBloomFilter(new SparseBloomFilter(shape)); + } 
+ + @Override + protected SparseBloomFilter createFilter(Shape shape, Hasher hasher) { + return new SparseBloomFilter(new SparseBloomFilter(shape, hasher)); + } + }; + executeNestedTest(nestedTest); + + // copy of Simple + nestedTest = new SparseBloomFilterTest() { + + @Override + protected SparseBloomFilter createEmptyFilter(Shape shape) { + return new SparseBloomFilter(new SimpleBloomFilter(shape)); + } + + @Override + protected SparseBloomFilter createFilter(Shape shape, Hasher hasher) { + return new SparseBloomFilter(new SimpleBloomFilter(shape, hasher)); + } + }; + executeNestedTest(nestedTest); + } + + @Test + public void testBitMapProducerEdgeCases() { + int[] values = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 65, 66, 67, 68, 69, 70, 71 }; + BloomFilter bf = createFilter(getTestShape(), IndexProducer.fromIndexArray(values)); + + // verify exit early before bitmap boundary + int[] passes = new int[1]; + assertFalse(bf.forEachBitMap(l -> { + passes[0]++; + return false; + })); + assertEquals(1, passes[0]); + + // verify exit early at bitmap boundary + bf = createFilter(getTestShape(), IndexProducer.fromIndexArray(values)); + passes[0] = 0; + assertFalse(bf.forEachBitMap(l -> { + boolean result = passes[0] == 0; + if (result) { + passes[0]++; + } + return result; + })); + assertEquals(1, passes[0]); + + // verify add extra if all values in first bitmap + values = new int[] { 1, 2, 3, 4 }; + bf = createFilter(getTestShape(), IndexProducer.fromIndexArray(values)); + passes[0] = 0; + assertTrue(bf.forEachBitMap(l -> { + passes[0]++; + return true; + })); + assertEquals(2, passes[0]); + + // verify exit early if all values in first bitmap and predicate returns false + // on 2nd block + values = new int[] { 1, 2, 3, 4 }; + bf = createFilter(getTestShape(), IndexProducer.fromIndexArray(values)); + passes[0] = 0; + assertFalse(bf.forEachBitMap(l -> { + boolean result = passes[0] == 0; + if (result) { + passes[0]++; + } + return result; + })); + assertEquals(1, passes[0]); + } + + 
@Test + public void testBloomFilterBasedMergeInPlaceEdgeCases() { + BloomFilter bf1 = createEmptyFilter(getTestShape()); + BloomFilter bf2 = new SimpleBloomFilter(getTestShape(), from1); + bf1.mergeInPlace(bf2); + assertTrue(bf2.forEachBitMapPair(bf1, (x, y) -> x == y)); + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitSetBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/UniqueIndexProducerFromHasherCollectionTest.java similarity index 63% rename from src/test/java/org/apache/commons/collections4/bloomfilter/BitSetBloomFilterTest.java rename to src/test/java/org/apache/commons/collections4/bloomfilter/UniqueIndexProducerFromHasherCollectionTest.java index 9a2078d80c..4aaf9141a0 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/BitSetBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/UniqueIndexProducerFromHasherCollectionTest.java @@ -16,22 +16,15 @@ */ package org.apache.commons.collections4.bloomfilter; -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; +public class UniqueIndexProducerFromHasherCollectionTest extends AbstractIndexProducerTest { -/** - * Tests for the {@link BitSetBloomFilter}. 
- */ -public class BitSetBloomFilterTest extends AbstractBloomFilterTest { @Override - protected BitSetBloomFilter createEmptyFilter(final Shape shape) { - return new BitSetBloomFilter(shape); + protected IndexProducer createProducer() { + return new HasherCollection(new SimpleHasher(0, 1), new SimpleHasher(0, 2)).uniqueIndices(Shape.fromKM(17, 72)); } @Override - protected BitSetBloomFilter createFilter(final Hasher hasher, final Shape shape) { - final BitSetBloomFilter testFilter = new BitSetBloomFilter(shape); - testFilter.merge( hasher ); - return testFilter; + protected IndexProducer createEmptyProducer() { + return new HasherCollection().uniqueIndices(Shape.fromKM(17, 72)); } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/UniqueIndexProducerFromHasherTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/UniqueIndexProducerFromHasherTest.java new file mode 100644 index 0000000000..f711a57201 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/UniqueIndexProducerFromHasherTest.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.collections4.bloomfilter; + +public class UniqueIndexProducerFromHasherTest extends AbstractIndexProducerTest { + + @Override + protected IndexProducer createProducer() { + return new SimpleHasher(0, 1).uniqueIndices(Shape.fromKM(17, 72)); + } + + @Override + protected IndexProducer createEmptyProducer() { + return NullHasher.INSTANCE.indices(Shape.fromKM(17, 72)); + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/checkstyle.xml b/src/test/java/org/apache/commons/collections4/bloomfilter/checkstyle.xml new file mode 100644 index 0000000000..0b79c22dca --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/checkstyle.xml @@ -0,0 +1,67 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherBuilderTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherBuilderTest.java deleted file mode 100644 index 5e925b9755..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherBuilderTest.java +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertThrows; -import static org.junit.jupiter.api.Assertions.assertTrue; - -import java.nio.charset.StandardCharsets; -import java.util.NoSuchElementException; -import java.util.PrimitiveIterator.OfInt; - -import org.apache.commons.collections4.bloomfilter.hasher.function.MD5Cyclic; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -/** - * {@link DynamicHasher.Builder} tests. - */ -public class DynamicHasherBuilderTest { - - private DynamicHasher.Builder builder; - private final HashFunction hf = new MD5Cyclic(); - private final Shape shape = new Shape(hf, 1, 345, 1); - private final String testString = HasherBuilderTest.getExtendedString(); - - /** - * Tests that hashing a byte array works as expected. - */ - @Test - public void buildTest_byteArray() { - final byte[] bytes = testString.getBytes(); - final DynamicHasher hasher = builder.with(bytes).build(); - final int expected = (int) Math.floorMod((long) hf.apply(bytes, 0), (long) shape.getNumberOfBits()); - - final OfInt iter = hasher.iterator(shape); - - assertTrue(iter.hasNext()); - assertEquals(expected, iter.nextInt()); - assertFalse(iter.hasNext()); - } - - /** - * Tests that an empty hasher works as expected. - */ - @Test - public void buildTest_Empty() { - final DynamicHasher hasher = builder.build(); - - final OfInt iter = hasher.iterator(shape); - - assertFalse(iter.hasNext()); - - assertThrows(NoSuchElementException.class, () -> iter.nextInt(), "Should have thrown NoSuchElementException"); - } - - /** - * Tests that hashing a string works as expected. 
- */ - @Test - public void buildTest_String() { - final byte[] bytes = testString.getBytes(StandardCharsets.UTF_8); - final DynamicHasher hasher = builder.with(testString, StandardCharsets.UTF_8).build(); - final int expected = (int) Math.floorMod((long) hf.apply(bytes, 0), (long) shape.getNumberOfBits()); - - final OfInt iter = hasher.iterator(shape); - - assertTrue(iter.hasNext()); - assertEquals(expected, iter.nextInt()); - assertFalse(iter.hasNext()); - } - - /** - * Tests that hashing a string works as expected. - */ - @Test - public void buildTest_UnencodedString() { - final byte[] bytes = testString.getBytes(StandardCharsets.UTF_16LE); - final DynamicHasher hasher = builder.withUnencoded(testString).build(); - final int expected = (int) Math.floorMod((long) hf.apply(bytes, 0), (long) shape.getNumberOfBits()); - - final OfInt iter = hasher.iterator(shape); - - assertTrue(iter.hasNext()); - assertEquals(expected, iter.nextInt()); - assertFalse(iter.hasNext()); - } - - /** - * Tests that build resets the builder. - */ - @Test - public void buildResetTest() { - builder.with(new byte[] {123}); - final OfInt iter = builder.build().iterator(shape); - - assertTrue(iter.hasNext()); - iter.next(); - assertFalse(iter.hasNext()); - - // Nothing added since last build so it should be an empty hasher - final OfInt iter2 = builder.build().iterator(shape); - assertFalse(iter2.hasNext()); - } - - /** - * Sets up the builder for testing. 
- */ - @BeforeEach - public void setup() { - builder = new DynamicHasher.Builder(hf); - } - -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherTest.java deleted file mode 100644 index 10de0b5b30..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherTest.java +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertThrows; -import static org.junit.jupiter.api.Assertions.assertTrue; - -import java.nio.charset.StandardCharsets; -import java.util.NoSuchElementException; -import java.util.PrimitiveIterator.OfInt; - -import org.apache.commons.collections4.bloomfilter.hasher.function.MD5Cyclic; - -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -/** - * Tests the {@link DynamicHasher}. 
- */ -public class DynamicHasherTest { - - private DynamicHasher.Builder builder; - private Shape shape; - - private final HashFunctionIdentity testFunction = new HashFunctionIdentity() { - - @Override - public String getName() { - return "Test Function"; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.CYCLIC; - } - - @Override - public String getProvider() { - return "Apache Commons Collection Tests"; - } - - @Override - public long getSignature() { - return 0; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } - - }; - - /** - * Sets up the DynamicHasher. - */ - @BeforeEach - public void setup() { - builder = new DynamicHasher.Builder(new MD5Cyclic()); - shape = new Shape(new MD5Cyclic(), 3, 72, 17); - } - - /** - * Tests that the expected bits are returned from hashing. - */ - @Test - public void testGetBits() { - - final int[] expected = {6, 69, 44, 19, 10, 57, 48, 23, 70, 61, 36, 11, 2, 49, 24, 15, 62}; - - final Hasher hasher = builder.with("Hello", StandardCharsets.UTF_8).build(); - - final OfInt iter = hasher.iterator(shape); - - for (final int element : expected) { - assertTrue(iter.hasNext()); - assertEquals(element, iter.nextInt()); - } - assertFalse(iter.hasNext()); - } - - /** - * Tests that bits from multiple hashes are returned correctly. 
- */ - @Test - public void testGetBits_MultipleHashes() { - final int[] expected = {6, 69, 44, 19, 10, 57, 48, 23, 70, 61, 36, 11, 2, 49, 24, 15, 62, 1, 63, 53, 43, 17, 7, 69, - 59, 49, 39, 13, 3, 65, 55, 45, 35, 25}; - - final Hasher hasher = builder.with("Hello", StandardCharsets.UTF_8).with("World", StandardCharsets.UTF_8).build(); - - final OfInt iter = hasher.iterator(shape); - - for (final int element : expected) { - assertTrue(iter.hasNext()); - assertEquals(element, iter.nextInt()); - } - assertFalse(iter.hasNext()); - - assertThrows(NoSuchElementException.class, () -> iter.next(), "Should have thrown NoSuchElementException"); - } - - /** - * Tests that retrieving bits for the wrong shape throws an exception. - */ - @Test - public void testGetBits_WrongShape() { - - final Hasher hasher = builder.with("Hello", StandardCharsets.UTF_8).build(); - - assertThrows(IllegalArgumentException.class, () -> hasher.iterator(new Shape(testFunction, 3, 72, 17)), "Should have thrown IllegalArgumentException"); - } - -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentityImplTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentityImplTest.java deleted file mode 100644 index 479cfa5188..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentityImplTest.java +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher; - -import static org.junit.jupiter.api.Assertions.assertEquals; - -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity.Signedness; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity.ProcessType; -import org.junit.jupiter.api.Test; - -/** - * Tests the HashFunctionIdentity implementation ({@link HashFunctionIdentityImpl}).. - */ -public class HashFunctionIdentityImplTest { - - /** - * Tests a copy constructor of the HashFunctionIdentity. - */ - @Test - public void copyConstructorTest() { - final HashFunctionIdentity identity = new HashFunctionIdentity() { - - @Override - public String getName() { - return "NAME"; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.CYCLIC; - } - - @Override - public String getProvider() { - return "Provider"; - } - - @Override - public long getSignature() { - return -1L; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } - - }; - final HashFunctionIdentityImpl impl = new HashFunctionIdentityImpl(identity); - assertEquals("NAME", impl.getName()); - assertEquals("Provider", impl.getProvider()); - assertEquals(Signedness.SIGNED, impl.getSignedness()); - assertEquals(ProcessType.CYCLIC, impl.getProcessType()); - assertEquals(-1L, impl.getSignature()); - } - - /** - * Test the constructor from component values. 
- */ - @Test - public void valuesConstructorTest() { - final HashFunctionIdentityImpl impl = new HashFunctionIdentityImpl("Provider", "NAME", Signedness.UNSIGNED, - ProcessType.ITERATIVE, -2L); - assertEquals("NAME", impl.getName()); - assertEquals("Provider", impl.getProvider()); - assertEquals(Signedness.UNSIGNED, impl.getSignedness()); - assertEquals(ProcessType.ITERATIVE, impl.getProcessType()); - assertEquals(-2L, impl.getSignature()); - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionValidatorTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionValidatorTest.java deleted file mode 100644 index e68df55b26..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionValidatorTest.java +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.commons.collections4.bloomfilter.hasher; - -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertThrows; -import static org.junit.jupiter.api.Assertions.assertTrue; - -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity.ProcessType; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity.Signedness; -import org.junit.jupiter.api.Test; - -/** - * Tests of the {@link HashFunctionValidator}. - */ -public class HashFunctionValidatorTest { - - /** - * Tests that name is used in the equality check. - */ - @Test - public void testName() { - final HashFunctionIdentityImpl impl1 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED, - ProcessType.CYCLIC, 300L); - final HashFunctionIdentityImpl impl2 = new HashFunctionIdentityImpl("Testing Suite", "impl2", Signedness.SIGNED, - ProcessType.CYCLIC, 300L); - - assertTrue(HashFunctionValidator.areEqual(impl1, impl1)); - assertTrue(HashFunctionValidator.areEqual(impl2, impl2)); - assertFalse(HashFunctionValidator.areEqual(impl1, impl2)); - assertFalse(HashFunctionValidator.areEqual(impl2, impl1)); - } - - /** - * Tests that name is not affected by case. - */ - @Test - public void testNameIsCaseInsensitive() { - final HashFunctionIdentityImpl impl1 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED, - ProcessType.CYCLIC, 300L); - final HashFunctionIdentityImpl impl2 = new HashFunctionIdentityImpl("Testing Suite", "IMPL1", Signedness.SIGNED, - ProcessType.CYCLIC, 300L); - - assertTrue(HashFunctionValidator.areEqual(impl1, impl2)); - } - - /** - * Tests that process type is used in the equality check. 
- */ - @Test - public void testProcessType() { - final HashFunctionIdentityImpl impl1 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED, - ProcessType.CYCLIC, 300L); - final HashFunctionIdentityImpl impl2 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED, - ProcessType.ITERATIVE, 300L); - - assertTrue(HashFunctionValidator.areEqual(impl1, impl1)); - assertTrue(HashFunctionValidator.areEqual(impl2, impl2)); - assertFalse(HashFunctionValidator.areEqual(impl1, impl2)); - assertFalse(HashFunctionValidator.areEqual(impl2, impl1)); - } - - /** - * Tests that provider is not used in the equality check. - */ - @Test - public void testProviderIsNotUsedInEqualityCheck() { - final HashFunctionIdentityImpl impl1 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED, - ProcessType.CYCLIC, 300L); - final HashFunctionIdentityImpl impl2 = new HashFunctionIdentityImpl("Testing Suite2", "impl1", Signedness.SIGNED, - ProcessType.CYCLIC, 300L); - - assertTrue(HashFunctionValidator.areEqual(impl1, impl1)); - assertTrue(HashFunctionValidator.areEqual(impl2, impl2)); - assertTrue(HashFunctionValidator.areEqual(impl1, impl2)); - assertTrue(HashFunctionValidator.areEqual(impl2, impl1)); - } - - /** - * Tests that signedness is used in the equality check. - */ - @Test - public void testSignedness() { - final HashFunctionIdentityImpl impl1 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED, - ProcessType.CYCLIC, 300L); - final HashFunctionIdentityImpl impl2 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.UNSIGNED, - ProcessType.CYCLIC, 300L); - - assertTrue(HashFunctionValidator.areEqual(impl1, impl1)); - assertTrue(HashFunctionValidator.areEqual(impl2, impl2)); - assertFalse(HashFunctionValidator.areEqual(impl1, impl2)); - assertFalse(HashFunctionValidator.areEqual(impl2, impl1)); - } - - /** - * Test the check method throws when the two hash functions are not equal. 
- */ - @Test - public void testCheckThrows() { - final HashFunctionIdentityImpl impl1 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED, - ProcessType.CYCLIC, 300L); - final HashFunctionIdentityImpl impl2 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.UNSIGNED, - ProcessType.CYCLIC, 300L); - assertThrows(IllegalArgumentException.class, () -> HashFunctionValidator.checkAreEqual(impl1, impl2)); - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherBuilderTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherBuilderTest.java deleted file mode 100644 index 303034053a..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherBuilderTest.java +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.commons.collections4.bloomfilter.hasher; - -import org.apache.commons.collections4.bloomfilter.hasher.Hasher.Builder; -import org.apache.commons.lang3.NotImplementedException; -import org.junit.jupiter.api.Test; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.nio.CharBuffer; -import java.nio.charset.Charset; -import java.nio.charset.StandardCharsets; -import java.util.ArrayList; - -import static org.junit.jupiter.api.Assertions.assertArrayEquals; -import static org.junit.jupiter.api.Assertions.assertEquals; - -/** - * Tests the - * {@link org.apache.commons.collections4.bloomfilter.hasher.Hasher.Builder Hasher.Builder}. - */ -public class HasherBuilderTest { - - /** - * Simple class to collect byte[] items added to the builder. - */ - private static class TestBuilder implements Hasher.Builder { - ArrayList items = new ArrayList<>(); - - @Override - public Hasher build() { - throw new NotImplementedException("Not required"); - } - - @Override - public Builder with(final byte[] item) { - items.add(item); - return this; - } - } - - /** - * Tests that adding CharSequence items works correctly. - */ - @Test - public void withCharSequenceTest() { - final String ascii = "plain"; - final String extended = getExtendedString(); - for (final String s : new String[] {ascii, extended}) { - for (final Charset cs : new Charset[] { - StandardCharsets.ISO_8859_1, StandardCharsets.UTF_8, StandardCharsets.UTF_16 - }) { - final TestBuilder builder = new TestBuilder(); - builder.with(s, cs); - assertArrayEquals(s.getBytes(cs), builder.items.get(0)); - } - } - } - - /** - * Tests that adding unencoded CharSequence items works correctly. 
- */ - @Test - public void withUnencodedCharSequenceTest() { - final String ascii = "plain"; - final String extended = getExtendedString(); - for (final String s : new String[] {ascii, extended}) { - final TestBuilder builder = new TestBuilder(); - builder.withUnencoded(s); - final byte[] encoded = builder.items.get(0); - final char[] original = s.toCharArray(); - // Should be twice the length - assertEquals(original.length * 2, encoded.length); - // Should be little endian (lower bits first) - final CharBuffer buffer = ByteBuffer.wrap(encoded) - .order(ByteOrder.LITTLE_ENDIAN).asCharBuffer(); - for (int i = 0; i < original.length; i++) { - assertEquals(original[i], buffer.get(i)); - } - } - } - - /** - * Gets a string with non-standard characters. - * - * @return the extended string - */ - static String getExtendedString() { - final char[] data = {'e', 'x', 't', 'e', 'n', 'd', 'e', 'd', ' ', - // Add some characters that are non standard - // non-ascii - 0xCA98, - // UTF-16 surrogate pair - 0xD803, 0xDE6D - // Add other cases here ... - }; - return String.valueOf(data); - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/ShapeTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/ShapeTest.java deleted file mode 100644 index a393451a50..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/ShapeTest.java +++ /dev/null @@ -1,412 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher; - -import static org.junit.jupiter.api.Assertions.assertAll; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotEquals; -import static org.junit.jupiter.api.Assertions.assertThrows; - -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity.ProcessType; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity.Signedness; - -import java.util.ArrayList; - -import org.junit.jupiter.api.Test; - -/** - * Tests the {@link Shape} class. - */ -public class ShapeTest { - - private final HashFunctionIdentity testFunction = new HashFunctionIdentity() { - - @Override - public String getName() { - return "Test Function"; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.CYCLIC; - } - - @Override - public String getProvider() { - return "Apache Commons Collection Tests"; - } - - @Override - public long getSignature() { - return 0; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } - - }; - - /* - * values from https://hur.st/bloomfilter/?n=5&p=.1&m=&k= - * - * n = 5 - * - * p = 0.100375138 (1 in 10) - * - * m = 24 (3B) - * - * k = 3 - */ - - private final Shape shape = new Shape(testFunction, 5, 0.1); - - /** - * Tests that if the number of bits less than 1 an IllegalArgumentException is thrown. 
- */ - @Test - public void constructor_items_bits_BadNumberOfBitsTest() { - assertThrows(IllegalArgumentException.class, () -> new Shape(testFunction, 5, 0), - "Should have thrown IllegalArgumentException"); - } - - /** - * Tests that if the number of hash functions is less than 1 an IllegalArgumentException is thrown. - */ - @Test - public void constructor_items_bits_BadNumberOfHashFunctionsTest() { - assertThrows(IllegalArgumentException.class, () -> new Shape(testFunction, 16, 8), - "Should have thrown IllegalArgumentException"); - } - - /** - * Tests that if the number of items less than 1 an IllegalArgumentException is thrown. - */ - @Test - public void constructor_items_bits_BadNumberOfItemsTest() { - assertThrows(IllegalArgumentException.class, () -> new Shape(testFunction, 0, 24), - "Should have thrown IllegalArgumentException"); - } - - /** - * Tests that if the number of bits is less than 1 an exception is thrown - */ - @Test - public void constructor_items_bits_hash_BadNumberOfBitsTest() { - assertThrows(IllegalArgumentException.class, () -> new Shape(testFunction, 5, 0, 1), - "Should have thrown IllegalArgumentException"); - } - - /** - * Tests that if the number of hash functions is less than 1 an exception is thrown. - */ - @Test - public void constructor_items_bits_hash_BadNumberOfHashFunctionsTest() { - assertThrows(IllegalArgumentException.class, () -> new Shape(testFunction, 5, 24, 0), - "Should have thrown IllegalArgumentException"); - } - - /** - * Tests that if the number of items is less than 1 an exception is thrown. 
- */ - @Test - public void constructor_items_bits_hash_BadNumberOfItemsTest() { - assertThrows(IllegalArgumentException.class, () -> new Shape(testFunction, 0, 24, 1), - "Should have thrown IllegalArgumentException"); - } - - /** - * Tests that if the calculated probability is greater than or equal to 1 an IllegalArgumentException is thrown - */ - @Test - public void constructor_items_bits_hash_BadProbabilityTest() { - assertThrows(IllegalArgumentException.class, () -> new Shape(testFunction, 4000, 8, 1), - "Should have thrown IllegalArgumentException"); - } - - /** - * Tests that when the number of items, number of bits and number of hash functions is passed the values are - * calculated correctly. - */ - @Test - public void constructor_items_bits_hashTest() { - /* - * values from https://hur.st/bloomfilter/?n=5&m=24&k=4 - */ - final Shape filterConfig = new Shape(testFunction, 5, 24, 4); - - assertEquals(24, filterConfig.getNumberOfBits()); - assertEquals(4, filterConfig.getNumberOfHashFunctions()); - assertEquals(5, filterConfig.getNumberOfItems()); - assertEquals(0.102194782, filterConfig.getProbability(), 0.000001); - } - - /** - * Tests that the number of items and number of bits is passed the other values are calculated correctly. - */ - @Test - public void constructor_items_bitsTest() { - /* - * values from https://hur.st/bloomfilter/?n=5&m=24 - */ - final Shape filterConfig = new Shape(testFunction, 5, 24); - - assertEquals(24, filterConfig.getNumberOfBits()); - assertEquals(3, filterConfig.getNumberOfHashFunctions()); - assertEquals(5, filterConfig.getNumberOfItems()); - assertEquals(0.100375138, filterConfig.getProbability(), 0.000001); - } - - /** - * Tests that if the number of items is less than 1 an IllegalArgumentException is thrown. 
- */ - @Test - public void constructor_items_probability_BadNumberOfItemsTest() { - assertThrows(IllegalArgumentException.class, () -> new Shape(testFunction, 0, 1.0 / 10), - "Should have thrown IllegalArgumentException"); - } - - /** - * Tests that if the probability is less than or equal to 0 or more than or equal to 1 an IllegalArgumentException is thrown. - */ - @Test - public void constructor_items_probability_BadProbabilityTest() { - assertAll( - () -> assertThrows(IllegalArgumentException.class, () -> new Shape(testFunction, 10, 0.0), - "Should have thrown IllegalArgumentException"), - () -> assertThrows(IllegalArgumentException.class, () -> new Shape(testFunction, 10, 1.0), - "Should have thrown IllegalArgumentException"), - () -> assertThrows(IllegalArgumentException.class, () -> new Shape(testFunction, 10, Double.NaN), - "Should have thrown IllegalArgumentException") - ); - } - - /** - * Tests that if calculated number of bits is greater than Integer.MAX_VALUE an IllegalArgumentException is thrown. - */ - @Test - public void constructor_items_probability_NumberOfBitsOverflowTest() { - assertThrows(IllegalArgumentException.class, () -> new Shape(testFunction, Integer.MAX_VALUE, 1.0 / 10), - "Should have thrown IllegalArgumentException"); - } - - /** - * Tests the the probability is calculated correctly. - */ - @Test - public void constructor_items_probability_Test() { - - assertEquals(24, shape.getNumberOfBits()); - assertEquals(3, shape.getNumberOfHashFunctions()); - assertEquals(5, shape.getNumberOfItems()); - assertEquals(0.100375138, shape.getProbability(), 0.000001); - } - - /** - * Tests that the constructor with a null name, number of items and size of filter fails. 
- */ - @Test - public void constructor_nm_noName() { - assertThrows(NullPointerException.class, () -> new Shape(null, 5, 72), - "Should throw NullPointerException"); - } - - /** - * Tests that the constructor with a null name, number of items, size of filter, and number of functions fails. - */ - @Test - public void constructor_nmk_noName() { - assertThrows(NullPointerException.class, () -> new Shape(null, 5, 72, 17), - "Should throw NullPointerException"); - } - - /** - * Tests that the constructor with a null name, number of items, and probability fails. - */ - @Test - public void constructor_np_noName() { - assertThrows(NullPointerException.class, () -> new Shape(null, 5, 0.1), - "Should throw NullPointerException"); - } - - /** - * Tests that the constructor with a null name, probability, size of filter, and number of functions fails. - */ - @Test - public void constructor_pmk_noName() { - assertThrows(NullPointerException.class, () -> new Shape(null, 0.1, 72, 17), - "Should throw NullPointerException"); - } - - /** - * Tests that if the number of bits is less than 1 an exception is thrown - */ - @Test - public void constructor_probability_bits_hash_BadNumberOfBitsTest() { - assertThrows(IllegalArgumentException.class, () -> new Shape(testFunction, 0.5, 0, 1), - "Should have thrown IllegalArgumentException"); - } - - /** - * Tests that if the number of functions is less than 1 an exception is thrown - */ - @Test - public void constructor_probability_bits_hash_BadNumberOfHashFunctionsTest() { - assertThrows(IllegalArgumentException.class, () -> new Shape(testFunction, 0.5, 24, 0), - "Should have thrown IllegalArgumentException"); - } - - /** - * Tests that invalid probability values cause and IllegalArgumentException to be thrown. 
- */ - @Test - public void constructor_probability_bits_hash_BadProbabilityTest() { - assertAll( - // probability should not be 0 - () -> assertThrows(IllegalArgumentException.class, () -> new Shape(testFunction, 0.0, 24, 1), - "Should have thrown IllegalArgumentException"), - // probability should not be = -1 - () -> assertThrows(IllegalArgumentException.class, () -> new Shape(testFunction, -1.0, 24, 1), - "Should have thrown IllegalArgumentException"), - // probability should not be < -1 - () -> assertThrows(IllegalArgumentException.class, () -> new Shape(testFunction, -1.5, 24, 1), - "Should have thrown IllegalArgumentException"), - // probability should not be = 1 - () -> assertThrows(IllegalArgumentException.class, () -> new Shape(testFunction, 1.0, 24, 1), - "Should have thrown IllegalArgumentException"), - // probability should not be > 1 - () -> assertThrows(IllegalArgumentException.class, () -> new Shape(testFunction, 2.0, 24, 1), - "Should have thrown IllegalArgumentException") - ); - } - - /** - * Tests the calculated values of calling the constructor with the probability, number of bits and number of hash - * functions. - */ - @Test - public void constructor_probability_bits_hashTest() { - /* - * values from https://hur.st/bloomfilter/?n=5&p=.1&m=24&k=3 - */ - final Shape filterConfig = new Shape(testFunction, 0.1, 24, 3); - - assertEquals(24, filterConfig.getNumberOfBits()); - assertEquals(3, filterConfig.getNumberOfHashFunctions()); - assertEquals(5, filterConfig.getNumberOfItems()); - assertEquals(0.100375138, filterConfig.getProbability(), 0.000001); - } - - /** - * Test equality of shape. 
- */ - @Test - public void equalsTest() { - - assertEquals(shape, shape); - assertEquals(shape, new Shape(testFunction, 5, 1.0 / 10)); - assertNotEquals(shape, null); - assertNotEquals(shape, new Shape(testFunction, 5, 1.0 / 11)); - assertNotEquals(shape, new Shape(testFunction, 4, 1.0 / 10)); - // Number of bits does not change equality, - // only the number of bits and the number of hash functions - final int numberOfBits = 10000; - final int numberOfItems = 15; - final int numberOfHashFunctions = 4; - assertEquals(new Shape(testFunction, numberOfItems, numberOfBits, numberOfHashFunctions), - new Shape(testFunction, numberOfItems + 1, numberOfBits, numberOfHashFunctions)); - assertNotEquals(new Shape(testFunction, numberOfItems, numberOfBits, numberOfHashFunctions), - new Shape(testFunction, numberOfItems, numberOfBits + 1, numberOfHashFunctions)); - assertNotEquals(new Shape(testFunction, numberOfItems, numberOfBits, numberOfHashFunctions), - new Shape(testFunction, numberOfItems, numberOfBits, numberOfHashFunctions + 1)); - - final HashFunctionIdentity testFunction2 = new HashFunctionIdentity() { - - @Override - public String getName() { - return "Test Function2"; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.CYCLIC; - } - - @Override - public String getProvider() { - return "Apache Commons Collection Tests"; - } - - @Override - public long getSignature() { - return 0; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } - - }; - - assertNotEquals(shape, new Shape(testFunction2, 4, 1.0 / 10)); - } - - /** - * Test that hashCode satisfies the contract between {@link Object#hashCode()} and - * {@link Object#equals(Object)}. Equal shapes must have the same hash code. 
- */ - @Test - public void hashCodeTest() { - // Hash function equality is based on process type, signedness and name (case insensitive) - final ArrayList list = new ArrayList<>(); - list.add(new HashFunctionIdentityImpl("Provider", "Name", Signedness.SIGNED, ProcessType.ITERATIVE, 0L)); - // Provider changes - list.add(new HashFunctionIdentityImpl("PROVIDER", "Name", Signedness.SIGNED, ProcessType.ITERATIVE, 0L)); - list.add(new HashFunctionIdentityImpl("Provider2", "Name", Signedness.SIGNED, ProcessType.ITERATIVE, 0L)); - // Name changes - list.add(new HashFunctionIdentityImpl("Provider", "name", Signedness.SIGNED, ProcessType.ITERATIVE, 0L)); - list.add(new HashFunctionIdentityImpl("Provider", "NAME", Signedness.SIGNED, ProcessType.ITERATIVE, 0L)); - list.add(new HashFunctionIdentityImpl("Provider", "Other", Signedness.SIGNED, ProcessType.ITERATIVE, 0L)); - // Signedness changes - list.add(new HashFunctionIdentityImpl("Provider", "Name", Signedness.UNSIGNED, ProcessType.ITERATIVE, 0L)); - // ProcessType changes - list.add(new HashFunctionIdentityImpl("Provider", "Name", Signedness.SIGNED, ProcessType.CYCLIC, 0L)); - // Signature changes - list.add(new HashFunctionIdentityImpl("Provider", "Name", Signedness.SIGNED, ProcessType.ITERATIVE, 1L)); - - // Create shapes that only differ in the hash function. 
- final int numberOfItems = 30; - final int numberOfBits = 3000; - final int numberOfHashFunctions = 10; - final Shape shape1 = new Shape(list.get(0), numberOfItems, numberOfBits, numberOfHashFunctions); - assertEquals(shape1, shape1); - - // Try variations - for (int i = 1; i < list.size(); i++) { - final Shape shape2 = new Shape(list.get(i), numberOfItems, numberOfBits, numberOfHashFunctions); - assertEquals(shape2, shape2); - - // Equal shapes must have the same hash code - if (shape1.equals(shape2)) { - assertEquals(shape1.hashCode(), shape2.hashCode()); - } - } - } - -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/StaticHasherTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/StaticHasherTest.java deleted file mode 100644 index 7522d36ad4..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/StaticHasherTest.java +++ /dev/null @@ -1,300 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.commons.collections4.bloomfilter.hasher; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertThrows; -import static org.junit.jupiter.api.Assertions.assertTrue; - -import java.util.Arrays; -import java.util.Iterator; -import java.util.List; -import java.util.PrimitiveIterator.OfInt; - -import org.junit.jupiter.api.Test; - -/** - * Tests the {@link StaticHasher}. - */ -public class StaticHasherTest { - - private final HashFunctionIdentity testFunction = new HashFunctionIdentity() { - - @Override - public String getName() { - return "Test Function"; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.CYCLIC; - } - - @Override - public String getProvider() { - return "Apache Commons Collection Tests"; - } - - @Override - public long getSignature() { - return 0; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } - - }; - - private final HashFunctionIdentity testFunctionX = new HashFunctionIdentity() { - - @Override - public String getName() { - return "Test FunctionX"; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.CYCLIC; - } - - @Override - public String getProvider() { - return "Apache Commons Collection Tests"; - } - - @Override - public long getSignature() { - return 0; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } - - }; - - private final Shape shape = new Shape(testFunction, 3, 72, 17); - - /** - * Compare 2 static hashers to verify they have the same bits enabled. - * - * @param hasher1 the first static hasher. - * @param hasher2 the second static hasher. 
- */ - private void assertSameBits(final StaticHasher hasher1, final StaticHasher hasher2) { - final OfInt iter1 = hasher1.iterator(shape); - final OfInt iter2 = hasher2.iterator(shape); - - while (iter1.hasNext()) { - assertTrue(iter2.hasNext(), "Not enough data in second hasher"); - assertEquals(iter1.nextInt(), iter2.nextInt()); - } - assertFalse(iter2.hasNext(), "Too much data in second hasher"); - } - - /** - * Tests that passing a hasher other than a StaticHasher to the constructor works as - * expected. - */ - @Test - public void testConstructor_Hasher() { - final int[] expected = {1, 3, 5, 7, 9}; - - final Hasher testHasher = new Hasher() { - - @Override - public OfInt iterator(final Shape shape) { - final int[] values = {1, 3, 5, 7, 9, 3, 5, 1}; - return Arrays.stream(values).iterator(); - } - - @Override - public HashFunctionIdentity getHashFunctionIdentity() { - return testFunction; - } - }; - - final StaticHasher hasher = new StaticHasher(testHasher, shape); - final OfInt iter = hasher.iterator(shape); - for (final int element : expected) { - assertTrue(iter.hasNext()); - assertEquals(element, iter.nextInt()); - } - assertFalse(iter.hasNext()); - } - - /** - * Tests that passing a hasher other than a StaticHasher and the wrong Shape to the - * constructor throws an IllegalArgumentException. - */ - @Test - public void testConstructor_Hasher_WrongShape() { - final Hasher testHasher = new Hasher() { - - @Override - public OfInt iterator(final Shape shape) { - final int[] values = {1, 3, 5, 7, 9, 3, 5, 1}; - return Arrays.stream(values).iterator(); - } - - @Override - public HashFunctionIdentity getHashFunctionIdentity() { - return testFunctionX; - } - }; - - assertThrows(IllegalArgumentException.class, () -> new StaticHasher(testHasher, shape), - "Should have thrown IllegalArgumentException"); - } - - /** - * Test that the iterator based constructor works correctly and removes duplicates. 
- */ - @Test - public void testConstructor_Iterator() { - - final int[] values = {1, 3, 5, 7, 9, 3, 5, 1}; - Iterator iter = Arrays.stream(values).iterator(); - final StaticHasher hasher = new StaticHasher(iter, shape); - - assertEquals(5, hasher.size()); - assertEquals(shape, hasher.getShape()); - // All function properties are equal - assertEquals(testFunction.getName(), hasher.getHashFunctionIdentity().getName()); - assertEquals(testFunction.getProcessType(), hasher.getHashFunctionIdentity().getProcessType()); - assertEquals(testFunction.getProvider(), hasher.getHashFunctionIdentity().getProvider()); - assertEquals(testFunction.getSignedness(), hasher.getHashFunctionIdentity().getSignedness()); - - iter = hasher.iterator(shape); - int idx = 0; - while (iter.hasNext()) { - assertEquals(Integer.valueOf(values[idx]), iter.next(), "Error at idx " + idx); - idx++; - } - assertEquals(5, idx); - } - - /** - * Tests that if the iterator passed to the constructor contains a value greater than - * or equal to Shape.numberOfBits() an exception is thrown. - */ - @Test - public void testConstructor_Iterator_ValueTooBig() { - - final int[] values = {shape.getNumberOfBits(), 3, 5, 7, 9, 3, 5, 1}; - final Iterator iter = Arrays.stream(values).iterator(); - - assertThrows(IllegalArgumentException.class, () -> new StaticHasher(iter, shape), - "Should have thrown IllegalArgumentException"); - } - - /** - * Tests that if the iterator passed to the constructor contains a value less than 0 - * (zero) an exception is thrown. - */ - @Test - public void testConstructor_Iterator_ValueTooSmall() { - - final int[] values = {-1, 3, 5, 7, 9, 3, 5, 1}; - final Iterator iter = Arrays.stream(values).iterator(); - - assertThrows(IllegalArgumentException.class, () -> new StaticHasher(iter, shape), - "Should have thrown IllegalArgumentException"); - } - - /** - * Tests that the constructor that accepts a static hasher properly builds the hasher. 
- */ - @Test - public void testConstructor_StaticHasher() { - final int[] values = {1, 3, 5, 7, 9, 3, 5, 1}; - final Iterator iter = Arrays.stream(values).iterator(); - final StaticHasher hasher = new StaticHasher(iter, shape); - - final StaticHasher hasher2 = new StaticHasher(hasher, shape); - assertEquals(shape, hasher2.getShape()); - assertSameBits(hasher, hasher2); - } - - /** - * Tests that calling the constructor with a hasher and the wrong shape throws an - * IllegalArgumentException. - */ - @Test - public void testConstructor_StaticHasher_WrongShape() { - final int[] values = {1, 3, 5, 7, 9, 3, 5, 1}; - final Iterator iter = Arrays.stream(values).iterator(); - final StaticHasher hasher = new StaticHasher(iter, new Shape(testFunctionX, 3, 72, 17)); - - assertThrows(IllegalArgumentException.class, () -> new StaticHasher(hasher, shape), - "Should have thrown IllegalArgumentException"); - } - - /** - * Tests that iterator returns the proper values. - */ - @Test - public void testGetBits() { - final List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - - final StaticHasher hasher = new StaticHasher(lst.iterator(), shape); - assertEquals(17, hasher.size()); - final OfInt iter = hasher.iterator(shape); - for (int i = 0; i < 17; i++) { - assertTrue(iter.hasNext()); - assertEquals(i, iter.nextInt()); - } - assertFalse(iter.hasNext()); - } - - /** - * Tests that iterator does not return duplicates and orders the indices. 
- */ - @Test - public void testGetBits_DuplicateValues() { - final int[] input = {6, 69, 44, 19, 10, 57, 48, 23, 70, 61, 36, 11, 2, 49, 24, 15, 62, 1, 63, 53, 43, 17, 7, 69, 59, - 49, 39, 13, 3, 65, 55, 45, 35, 25}; - final int[] expected = {1, 2, 3, 6, 7, 10, 11, 13, 15, 17, 19, 23, 24, 25, 35, 36, 39, 43, 44, 45, 48, 49, 53, 55, 57, - 59, 61, 62, 63, 65, 69, 70}; - - final StaticHasher hasher = new StaticHasher(Arrays.stream(input).iterator(), shape); - - final OfInt iter = hasher.iterator(shape); - for (final int element : expected) { - assertTrue(iter.hasNext()); - assertEquals(element, iter.nextInt()); - } - assertFalse(iter.hasNext()); - } - - /** - * Tests that gitBits is called with the wrong shape an exception is thrown. - */ - @Test - public void testGetBits_WrongShape() { - final List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - final StaticHasher hasher = new StaticHasher(lst.iterator(), shape); - - assertThrows(IllegalArgumentException.class, () -> hasher.iterator(new Shape(testFunctionX, 3, 72, 17)), - "Should have thrown IllegalArgumentException"); - } - -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/AbstractHashFunctionTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/AbstractHashFunctionTest.java deleted file mode 100644 index 5498d699cb..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/AbstractHashFunctionTest.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher.function; - -import static org.junit.jupiter.api.Assertions.assertEquals; - -import org.apache.commons.collections4.bloomfilter.hasher.HashFunction; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity; -import org.junit.jupiter.api.Test; - -/** - * Tests the signature of a hash function. - */ -public abstract class AbstractHashFunctionTest { - - /** - * Test that the signature is properly generated. - */ - @Test - public void signatureTest() { - final HashFunction hf = createHashFunction(); - final long expected = hf.apply(HashFunctionIdentity.prepareSignatureBuffer(hf), 0); - assertEquals(expected, hf.getSignature()); - // Should be repeatable - final long expected2 = hf.apply(HashFunctionIdentity.prepareSignatureBuffer(hf), 0); - assertEquals(expected, expected2); - assertEquals("Apache Commons Collections", hf.getProvider()); - } - - /** - * Creates the hash function. - * - * @return the hash function - */ - protected abstract HashFunction createHashFunction(); -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/MD5CyclicTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/MD5CyclicTest.java deleted file mode 100644 index 9b0d9a83e1..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/MD5CyclicTest.java +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher.function; - -import static org.junit.jupiter.api.Assertions.assertEquals; - -import org.apache.commons.collections4.bloomfilter.hasher.HashFunction; -import org.junit.jupiter.api.Test; - -/** - * Tests the MD5 cyclic hash function. - */ -public class MD5CyclicTest extends AbstractHashFunctionTest { - - /** - * Test that the apply function returns the proper values. 
- */ - @Test - public void applyTest() { - final MD5Cyclic md5 = new MD5Cyclic(); - final long l1 = 0x8b1a9953c4611296L; - final long l2 = 0xa827abf8c47804d7L; - final byte[] buffer = "Hello".getBytes(); - - long l = md5.apply(buffer, 0); - assertEquals(l1, l); - l = md5.apply(buffer, 1); - assertEquals(l1 + l2, l); - l = md5.apply(buffer, 2); - assertEquals(l1 + l2 + l2, l); - } - - @Override - protected HashFunction createHashFunction() { - return new MD5Cyclic(); - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur128x64CyclicTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur128x64CyclicTest.java deleted file mode 100644 index 9e17c2ec89..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur128x64CyclicTest.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.commons.collections4.bloomfilter.hasher.function; - -import static org.junit.jupiter.api.Assertions.assertEquals; - -import java.nio.charset.StandardCharsets; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunction; -import org.junit.jupiter.api.Test; - -/** - * Test that the Murmur3 128 x64 hash function works correctly. - */ -public class Murmur128x64CyclicTest extends AbstractHashFunctionTest { - - /** - * Test that the apply function returns the proper values. - */ - @Test - public void applyTest() { - final Murmur128x64Cyclic murmur = new Murmur128x64Cyclic(); - - final long l1 = 0xe7eb60dabb386407L; - final long l2 = 0xc3ca49f691f73056L; - final byte[] buffer = "Now is the time for all good men to come to the aid of their country" - .getBytes(StandardCharsets.UTF_8); - - long l = murmur.apply(buffer, 0); - assertEquals(l1, l); - l = murmur.apply(buffer, 1); - assertEquals(l1 + l2, l); - l = murmur.apply(buffer, 2); - assertEquals(l1 + l2 + l2, l); - } - - @Override - protected HashFunction createHashFunction() { - return new Murmur128x64Cyclic(); - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur32x86IterativeTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur32x86IterativeTest.java deleted file mode 100644 index bca60c1e4b..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur32x86IterativeTest.java +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher.function; - -import static org.junit.jupiter.api.Assertions.assertEquals; - -import java.nio.charset.StandardCharsets; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunction; -import org.junit.jupiter.api.Test; - -/** - * Test that the Murmur3 32 x86 hash function works correctly. - */ -public class Murmur32x86IterativeTest extends AbstractHashFunctionTest { - - /** - * Test that the apply function returns the proper values. - */ - @Test - public void applyTest() { - final Murmur32x86Iterative murmur = new Murmur32x86Iterative(); - - final byte[] buffer = "Now is the time for all good men to come to the aid of their country" - .getBytes(StandardCharsets.UTF_8); - - long l = murmur.apply(buffer, 0); - assertEquals(82674681, l); - l = murmur.apply(buffer, 1); - assertEquals(-1475490736, l); - l = murmur.apply(buffer, 2); - assertEquals(-1561435247, l); - } - - @Override - protected HashFunction createHashFunction() { - return new Murmur32x86Iterative(); - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/ObjectsHashIterativeTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/ObjectsHashIterativeTest.java deleted file mode 100644 index 5595efdc77..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/ObjectsHashIterativeTest.java +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license 
agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher.function; - -import static org.junit.jupiter.api.Assertions.assertEquals; - -import java.nio.charset.StandardCharsets; -import java.util.Arrays; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunction; -import org.junit.jupiter.api.Test; - -/** - * Tests that the Objects hash works correctly. - */ -public class ObjectsHashIterativeTest extends AbstractHashFunctionTest { - - /** - * Test that the apply function returns the proper values. 
- */ - @Test - public void applyTest() { - final ObjectsHashIterative obj = new ObjectsHashIterative(); - - final byte[] buffer = "Now is the time for all good men to come to the aid of their country" - .getBytes(StandardCharsets.UTF_8); - - long l = obj.apply(buffer, 0); - long prev = 0; - assertEquals(Arrays.deepHashCode(new Object[] {prev, buffer}), l); - for (int i = 1; i <= 5; i++) { - prev += l; - l = obj.apply(buffer, i); - assertEquals(Arrays.deepHashCode(new Object[] {prev, buffer}), l); - } - } - - @Override - protected HashFunction createHashFunction() { - return new ObjectsHashIterative(); - } -} diff --git a/src/test/java/org/apache/commons/collections4/map/AbstractMapTest.java b/src/test/java/org/apache/commons/collections4/map/AbstractMapTest.java index a9a45197c0..21fca7c321 100644 --- a/src/test/java/org/apache/commons/collections4/map/AbstractMapTest.java +++ b/src/test/java/org/apache/commons/collections4/map/AbstractMapTest.java @@ -1015,7 +1015,7 @@ public void testMapRemove() { } /** - * Tests that the {@link Map#values} collection is backed by + * Tests that the {@link Map#bitMaps} collection is backed by * the underlying map for clear(). */ @Test @@ -1184,7 +1184,7 @@ public void testEntrySetRemove3() { } /** - * Tests that the {@link Map#values} collection is backed by + * Tests that the {@link Map#bitMaps} collection is backed by * the underlying map by removing from the values collection * and testing if the value was removed from the map. *