Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -82,21 +82,15 @@
* @see <a href="http://portal.acm.org/citation.cfm?id=362692&dl=ACM&coll=portal">Space/Time Trade-Offs in Hash Coding with Allowable Errors</a>
*/
public class InternalBloomFilter extends InternalFilter {
private static final byte[] BIT_VALUES = new byte[] {
(byte) 0x01,
(byte) 0x02,
(byte) 0x04,
(byte) 0x08,
(byte) 0x10,
(byte) 0x20,
(byte) 0x40,
(byte) 0x80
};

/**
* The bit vector.
* The bit vector, as little-endian 64-bit words: bit {@code i} lives at
* {@code words[i >> 6]} under mask {@code 1L << (i & 63)}. The serialized layout
* (bit {@code i} at byte {@code i >> 3} under mask {@code 1 << (i & 7)}) is the
* little-endian byte view of this array, so {@link #write} and {@link #readFields}
* translate between the two by byte position alone. Bits at positions greater than
* or equal to {@code vectorSize} are always zero.
*/
BitSet bits;
long[] words;

/**
* Default constructor - use with readFields
Expand All @@ -116,7 +110,7 @@ public InternalBloomFilter() {
public InternalBloomFilter(int vectorSize, int nbHash, int hashType) {
super(vectorSize, nbHash, hashType);

bits = new BitSet(this.vectorSize);
words = new long[wordCount(this.vectorSize)];
}

/**
Expand All @@ -134,7 +128,7 @@ public void add(Key key) {
hash.clear();

for (int i = 0; i < nbHash; i++) {
bits.set(h[i]);
words[h[i] >>> 6] |= 1L << (h[i] & 63);
}
}

Expand All @@ -147,7 +141,10 @@ public void and(InternalFilter filter) {
throw new IllegalArgumentException("filters cannot be and-ed");
}

this.bits.and(((InternalBloomFilter) filter).bits);
long[] other = ((InternalBloomFilter) filter).words;
for (int i = 0; i < words.length; i++) {
words[i] &= other[i];
}
}

@Override
Expand All @@ -159,7 +156,7 @@ public boolean membershipTest(Key key) {
int[] h = hash.hash(key);
hash.clear();
for (int i = 0; i < nbHash; i++) {
if (!bits.get(h[i])) {
if ((words[h[i] >>> 6] & (1L << (h[i] & 63))) == 0) {
return false;
}
}
Expand All @@ -168,7 +165,10 @@ public boolean membershipTest(Key key) {

@Override
public void not() {
bits.flip(0, vectorSize);
for (int i = 0; i < words.length; i++) {
words[i] = ~words[i];
}
clearUnusedBits();
}

@Override
Expand All @@ -179,7 +179,10 @@ public void or(InternalFilter filter) {
|| filter.nbHash != this.nbHash) {
throw new IllegalArgumentException("filters cannot be or-ed");
}
bits.or(((InternalBloomFilter) filter).bits);
long[] other = ((InternalBloomFilter) filter).words;
for (int i = 0; i < words.length; i++) {
words[i] |= other[i];
}
}

@Override
Expand All @@ -190,12 +193,15 @@ public void xor(InternalFilter filter) {
|| filter.nbHash != this.nbHash) {
throw new IllegalArgumentException("filters cannot be xor-ed");
}
bits.xor(((InternalBloomFilter) filter).bits);
long[] other = ((InternalBloomFilter) filter).words;
for (int i = 0; i < words.length; i++) {
words[i] ^= other[i];
}
}

@Override
public String toString() {
return bits.toString();
return BitSet.valueOf(words).toString();
}

/**
Expand All @@ -209,40 +215,41 @@ public int getVectorSize() {
public void write(DataOutput out) throws IOException {
super.write(out);
byte[] bytes = new byte[getNBytes()];
for (int i = 0, byteIndex = 0, bitIndex = 0; i < vectorSize; i++, bitIndex++) {
if (bitIndex == 8) {
bitIndex = 0;
byteIndex++;
}
if (bitIndex == 0) {
bytes[byteIndex] = 0;
}
if (bits.get(i)) {
bytes[byteIndex] |= BIT_VALUES[bitIndex];
}
for (int byteIndex = 0; byteIndex < bytes.length; byteIndex++) {
bytes[byteIndex] = (byte) (words[byteIndex >>> 3] >>> ((byteIndex & 7) << 3));
}
out.write(bytes);
}

@Override
public void readFields(DataInput in) throws IOException {
super.readFields(in);
bits = new BitSet(this.vectorSize);
words = new long[wordCount(vectorSize)];
byte[] bytes = new byte[getNBytes()];
in.readFully(bytes);
for (int i = 0, byteIndex = 0, bitIndex = 0; i < vectorSize; i++, bitIndex++) {
if (bitIndex == 8) {
bitIndex = 0;
byteIndex++;
}
if ((bytes[byteIndex] & BIT_VALUES[bitIndex]) != 0) {
bits.set(i);
}
for (int byteIndex = 0; byteIndex < bytes.length; byteIndex++) {
words[byteIndex >>> 3] |= (bytes[byteIndex] & 0xFFL) << ((byteIndex & 7) << 3);
}
clearUnusedBits();
}

/* @return number of bytes needed to hold bit vector */
private int getNBytes() {
return (int) (((long) vectorSize + 7) / 8);
}

private static int wordCount(int vectorSize) {
return (vectorSize + 63) >>> 6;
}

/**
* Clears bits at positions greater than or equal to {@code vectorSize}, such as the unused
* trailing bits of the last serialized byte, so bitwise ops and serialization stay exact.
*/
private void clearUnusedBits() {
int usedBitsInLastWord = vectorSize & 63;
if (usedBitsInLastWord != 0) {
words[words.length - 1] &= (1L << usedBitsInLastWord) - 1;
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hudi.common.bloom;

import org.junit.jupiter.api.Test;

import java.util.Locale;
import java.util.Random;
import java.util.function.Supplier;

/**
* Manual microbenchmark for bloom filter hot paths: key adds (the write-path cost paid per
* record by the HFile writer), membership tests, and serialization round trips.
* <p>
* The class name intentionally does not match the surefire test patterns, so it never runs
* in CI. Run it explicitly with:
* <pre>
* mvn test -pl hudi-common -Dtest=InternalBloomFilterBenchmark -Dsurefire.failIfNoSpecifiedTests=false
* </pre>
*/
public class InternalBloomFilterBenchmark {

private static final int WARMUP_ROUNDS = 1;
private static final int MEASURED_ROUNDS = 3;
private static final int KEY_POOL_SIZE = 1_000_000;
private static final int MEMBERSHIP_PROBES = 100_000;

@Test
public void benchmarkBloomFilterHotPaths() {
runScenario("SIMPLE, 1M entries, fpp 1e-3, 1M adds",
() -> BloomFilterFactory.createBloomFilter(
1_000_000, 0.001, -1, BloomFilterTypeCode.SIMPLE.name()),
1_000_000);
runScenario("SIMPLE, 10M entries, fpp 1e-9, 10M adds",
() -> BloomFilterFactory.createBloomFilter(
10_000_000, 0.000000001, -1, BloomFilterTypeCode.SIMPLE.name()),
10_000_000);
runScenario("DYNAMIC_V0, 60K entries, fpp 1e-9, max 100K, 10M adds",
() -> BloomFilterFactory.createBloomFilter(
60_000, 0.000000001, 100_000, BloomFilterTypeCode.DYNAMIC_V0.name()),
10_000_000);
}

private void runScenario(String name, Supplier<BloomFilter> filterSupplier, int numAdds) {
String[] keys = generateKeys(KEY_POOL_SIZE, 42);
String[] absentKeys = generateKeys(MEMBERSHIP_PROBES, 4242);
System.out.println("== " + name + " ==");
BloomFilter filter = null;
for (int round = 0; round < WARMUP_ROUNDS + MEASURED_ROUNDS; round++) {
filter = filterSupplier.get();
long start = System.nanoTime();
for (int i = 0; i < numAdds; i++) {
filter.add(keys[i % KEY_POOL_SIZE]);
}
long addMs = (System.nanoTime() - start) / 1_000_000;

start = System.nanoTime();
int hits = 0;
for (int i = 0; i < MEMBERSHIP_PROBES; i++) {
if (filter.mightContain(keys[i])) {
hits++;
}
}
long hitMs = (System.nanoTime() - start) / 1_000_000;

start = System.nanoTime();
int falsePositives = 0;
for (String absentKey : absentKeys) {
if (filter.mightContain(absentKey)) {
falsePositives++;
}
}
long missMs = (System.nanoTime() - start) / 1_000_000;

String label = round < WARMUP_ROUNDS ? "warmup" : "round" + (round - WARMUP_ROUNDS + 1);
System.out.println(String.format(Locale.ROOT,
"%s: adds(%d)=%d ms, membership hits(%d)=%d ms, misses(%d)=%d ms (hits=%d, falsePositives=%d)",
label, numAdds, addMs, MEMBERSHIP_PROBES, hitMs, absentKeys.length, missMs, hits, falsePositives));
}

for (int round = 0; round < WARMUP_ROUNDS + MEASURED_ROUNDS; round++) {
long start = System.nanoTime();
String serialized = filter.serializeToString();
long serMs = (System.nanoTime() - start) / 1_000_000;
start = System.nanoTime();
BloomFilterFactory.fromString(serialized, filter.getBloomFilterTypeCode().name());
long deserMs = (System.nanoTime() - start) / 1_000_000;
String label = round < WARMUP_ROUNDS ? "warmup" : "round" + (round - WARMUP_ROUNDS + 1);
System.out.println(String.format(Locale.ROOT,
"%s: serialize=%d ms, deserialize=%d ms (serialized length=%d)",
label, serMs, deserMs, serialized.length()));
}
}

/** Generates fixed-seed 32-character hex keys, mirroring hash-based record keys. */
private static String[] generateKeys(int count, long seed) {
Random random = new Random(seed);
String[] keys = new String[count];
byte[] buffer = new byte[16];
StringBuilder sb = new StringBuilder(32);
for (int i = 0; i < count; i++) {
random.nextBytes(buffer);
sb.setLength(0);
for (byte b : buffer) {
sb.append(Character.forDigit((b >> 4) & 0xF, 16)).append(Character.forDigit(b & 0xF, 16));
}
keys[i] = sb.toString();
}
return keys;
}
}
Loading
Loading