From 7842c01bed1fbbe6826d9aea72f95eb88c270311 Mon Sep 17 00:00:00 2001 From: yangxf Date: Wed, 13 Mar 2019 09:52:46 +0800 Subject: [PATCH] add BloomFilter --- src/main/java/com/search/BloomFilter.java | 190 ++++++++++++++++++ src/test/java/com/search/BloomFilterTest.java | 70 +++++++ 2 files changed, 260 insertions(+) create mode 100644 src/main/java/com/search/BloomFilter.java create mode 100644 src/test/java/com/search/BloomFilterTest.java diff --git a/src/main/java/com/search/BloomFilter.java b/src/main/java/com/search/BloomFilter.java new file mode 100644 index 00000000..122a3029 --- /dev/null +++ b/src/main/java/com/search/BloomFilter.java @@ -0,0 +1,190 @@ +package src.main.java.com.search; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.function.Function; + +/** + * A simple implementation of Bloom filter. + *

+ * Bloom filter have a chance of being wrong. + *

+ * The Bloom filter assert that elements that do not exist must not exist, + * if assert an element exists, but not necessarily. + *

+ * The accuracy rate depends on capacity and hash functions. + * + * @author yangxf + */ +public class BloomFilter implements Serializable { + private static final long serialVersionUID = -4466610350741278658L; + + private static final int LONG_SHIFT = 6; + private static final int LONG_MASK = 63; + + /** + * hash functions + */ + private final List> hashFunctions; + + private final long[] table; + private final int tableMask; + private int size; + + /** + * @param capacity the filter capacity + * @param hashFunctions hash functions + * @see Builder must be build by {@link Builder}. + */ + private BloomFilter(int capacity, List> hashFunctions) { + this.hashFunctions = hashFunctions; + int cap = nextPowerOf2(capacity); + tableMask = (cap << LONG_SHIFT) - 1; + table = new long[cap]; + } + + public static Builder builder(int capacity) { + if (capacity < 1) { + throw new IllegalStateException("capacity must be > 0"); + } + + return new Builder(capacity); + } + + /** + * Add an element to the Bloom filter. + */ + public void add(String element) { + checkNotNull(element, "element"); + + for (Function hashFunction : hashFunctions) { + int key = hashFunction.apply(element) & tableMask; + table[key >>> LONG_SHIFT] |= (1 << (key & LONG_MASK)); + } + size++; + } + + /** + * @return true if the element exists, otherwise false. + */ + public boolean contains(String element) { + if (element == null) { + return false; + } + + for (Function hashFunction : hashFunctions) { + int key = hashFunction.apply(element) & tableMask; + if ((table[key >>> LONG_SHIFT] & (1 << (key & LONG_MASK))) == 0) { + return false; + } + } + return true; + } + + public List> getHashFunctions() { + return hashFunctions; + } + + public int size() { + return size; + } + + private static void checkNotNull(String element, String msg) { + if (element == null) { + throw new NullPointerException(msg + " must be not null"); + } + } + + private static int nextPowerOf2(int i) { + int n = i - 1; + n |= n >>> 1; + n |= n >>> 2; + n |= n >>> 4; + n |= n >>> 8; + n |= n >>> 16; + return (n < 0) ? 1 : (n >= 0x40000000) ? 0x40000000 : n + 1; + } + + /** + * We need a list of unmodifiable hash functions. + */ + public static class Builder { + private int capacity; + private List> hashFunctions = new ArrayList<>(); + + private Builder(int capacity) { + this.capacity = capacity; + } + + public Builder addHashFunction(Function function) { + hashFunctions.add(function); + return this; + } + + public BloomFilter build() { + if (hashFunctions.isEmpty()) { + addDefaultHashFunction(); + } + return new BloomFilter(capacity, Collections.unmodifiableList(hashFunctions)); + } + + /** + * I provides several default hash functions + */ + private void addDefaultHashFunction() { + // Java String Hash Function + hashFunctions.add(String::hashCode); + + // SDBM Hash Function + hashFunctions.add(key -> { + if (key == null || key.isEmpty()) { + return 0; + } + + int hash = 0; + for (int i = 0; i < key.length(); i++) { + hash = key.charAt(i) + (hash << 6) + (hash << 16) - hash; + } + hash &= 0x7ffffff; + return hash; + }); + + // Robert Sedgwicks Hash Function + hashFunctions.add(key -> { + if (key == null || key.isEmpty()) { + return 0; + } + + int hash = 0; + int magic = 63689; + for (int i = 0; i < key.length(); i++) { + hash = hash * magic + key.charAt(i); + magic *= 378551; + } + return hash; + }); + + // Arash Partow Hash Function + hashFunctions.add(key -> { + if (key == null || key.isEmpty()) { + return 0; + } + + int hash = 0; + for (int i = 0; i < key.length(); i++) { + char ch = key.charAt(i); + if ((i & 1) == 0) { + hash ^= ((hash << 7) ^ ch ^ (hash >> 3)); + } else { + hash ^= (~((hash << 11) ^ ch ^ (hash >> 5))); + } + } + return hash; + }); + } + + } + +} diff --git a/src/test/java/com/search/BloomFilterTest.java b/src/test/java/com/search/BloomFilterTest.java new file mode 100644 index 00000000..0d12b911 --- /dev/null +++ b/src/test/java/com/search/BloomFilterTest.java @@ -0,0 +1,70 @@ +package src.test.java.com.search; + +import org.junit.Test; +import src.main.java.com.search.BloomFilter; + +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.ThreadLocalRandom; + +import static org.junit.Assert.*; + +public class BloomFilterTest { + + @Test + public void test() { + int count = 100000; + int low = 50, up = 100; + BloomFilter filter = BloomFilter.builder(10000).build(); + String[] data = new String[count]; + Set dataSet = new HashSet<>(); + for (int i = 0; i < count; i++) { + String str = randomString(low, up); + data[i] = str; + if (i % 2 == 0) { + dataSet.add(str); + filter.add(str); + } + } + + int error = 0, total = 0; + for (int i = 0; i < count; i++) { + String str = data[i]; + if (filter.contains(str)) { + total++; + if (!dataSet.contains(str)) { + error++; + } + } else { + assertFalse(dataSet.contains(str)); + } + } + System.out.println("error: " + error); + System.out.println("total: " + total); + System.out.println("error rate : " + (double) error / total); + } + + public static String randomString(int minLength, int maxLength) { + ThreadLocalRandom r = ThreadLocalRandom.current(); + int chLen = r.nextInt(minLength, maxLength), + poolSize = CHAR_POOL.length; + char[] chars = new char[chLen]; + for (int i = 0; i < chLen; i++) { + chars[i] = CHAR_POOL[r.nextInt(poolSize)]; + } + + return new String(chars); + } + + private static final char[] CHAR_POOL; + + static { + CHAR_POOL = new char[52]; + int i = 0; + for (char c = 'a'; c <= 'z'; c++) { + CHAR_POOL[i++] = c; + CHAR_POOL[i++] = (char) (c - 32); + } + } + +} \ No newline at end of file