Merge pull request #714 from MrYangxf/Development

Add a simple implementation of Bloom filter.
This commit is contained in:
yanglbme 2019-03-13 10:11:08 +08:00 committed by GitHub
commit ee8147cdda
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 260 additions and 0 deletions

View File

@ -0,0 +1,190 @@
package src.main.java.com.search;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.function.Function;
/**
 * A simple Bloom filter for {@link String} elements.
 * <p>
 * A Bloom filter is a probabilistic set: {@link #contains(String)} may report
 * {@code true} for an element that was never added (a false positive), but it
 * never reports {@code false} for an element that was added, because bits are
 * only ever set and never cleared.
 * <p>
 * The false-positive rate depends on the table capacity and on the number and
 * quality of the hash functions.
 * <p>
 * Instances are created through {@link #builder(int)}. This class performs no
 * synchronization.
 * <p>
 * NOTE(review): the class is {@link Serializable}, but the default hash
 * functions are plain lambdas / method references whose target type
 * ({@link Function}) is not serializable, so serializing a filter built with
 * the defaults is expected to fail unless serializable functions are supplied.
 *
 * @author yangxf
 */
public class BloomFilter implements Serializable {

    private static final long serialVersionUID = -4466610350741278658L;

    /** log2 of the number of bits in a {@code long}; turns a bit index into a word index. */
    private static final int LONG_SHIFT = 6;

    /** Extracts the position of a bit inside its {@code long} word (0..63). */
    private static final int LONG_MASK = 63;

    /** Hash functions applied to every element; unmodifiable. */
    private final List<Function<String, Integer>> hashFunctions;

    /** The bit table, stored as 64-bit words. */
    private final long[] table;

    /** Mask that reduces a raw hash value to a bit index inside {@link #table}. */
    private final int tableMask;

    /** Number of successful {@link #add(String)} calls. */
    private int size;

    /**
     * Instances must be created through {@link Builder}.
     *
     * @param capacity      requested number of 64-bit words; rounded up to a power of two
     * @param hashFunctions the hash functions to use
     */
    private BloomFilter(int capacity, List<Function<String, Integer>> hashFunctions) {
        this.hashFunctions = hashFunctions;
        int cap = nextPowerOf2(capacity);
        // For cap >= 2^26 the shift overflows to 0 and the mask becomes -1, i.e. all
        // 32 hash bits are kept. That is still in range: (key >>> 6) < 2^26 <= cap.
        tableMask = (cap << LONG_SHIFT) - 1;
        table = new long[cap];
    }

    /**
     * Creates a builder for a filter with the given capacity.
     *
     * @param capacity requested table size in 64-bit words, must be positive
     * @return a new builder
     * @throws IllegalStateException if {@code capacity} is less than 1
     */
    public static Builder builder(int capacity) {
        if (capacity < 1) {
            throw new IllegalStateException("capacity must be > 0");
        }
        return new Builder(capacity);
    }

    /**
     * Adds an element to the Bloom filter.
     *
     * @param element the element to add
     * @throws NullPointerException if {@code element} is {@code null}
     */
    public void add(String element) {
        checkNotNull(element, "element");
        for (Function<String, Integer> hashFunction : hashFunctions) {
            int key = hashFunction.apply(element) & tableMask;
            // The literal must be a long: an int shift only uses the low five bits of
            // the distance, which would fold bit positions 32..63 onto 0..31.
            table[key >>> LONG_SHIFT] |= 1L << (key & LONG_MASK);
        }
        size++;
    }

    /**
     * Tests whether an element may have been added.
     *
     * @param element the element to look up; {@code null} is never contained
     * @return {@code false} if the element was definitely not added,
     *         {@code true} if it was added or is a false positive
     */
    public boolean contains(String element) {
        if (element == null) {
            return false;
        }
        for (Function<String, Integer> hashFunction : hashFunctions) {
            int key = hashFunction.apply(element) & tableMask;
            if ((table[key >>> LONG_SHIFT] & (1L << (key & LONG_MASK))) == 0) {
                return false;
            }
        }
        return true;
    }

    /**
     * @return the unmodifiable list of hash functions used by this filter
     */
    public List<Function<String, Integer>> getHashFunctions() {
        return hashFunctions;
    }

    /**
     * @return the number of {@link #add(String)} calls made so far; repeated
     *         additions of the same element are counted each time
     */
    public int size() {
        return size;
    }

    private static void checkNotNull(String element, String msg) {
        if (element == null) {
            throw new NullPointerException(msg + " must be not null");
        }
    }

    /**
     * Rounds {@code i} up to the next power of two, clamped to the range
     * {@code [1, 0x40000000]}.
     */
    private static int nextPowerOf2(int i) {
        int n = i - 1;
        // Smear the highest set bit into every lower position.
        n |= n >>> 1;
        n |= n >>> 2;
        n |= n >>> 4;
        n |= n >>> 8;
        n |= n >>> 16;
        return (n < 0) ? 1 : (n >= 0x40000000) ? 0x40000000 : n + 1;
    }

    /**
     * Builder for {@link BloomFilter}. Collects the hash functions and hands an
     * unmodifiable snapshot of them to the filter.
     */
    public static class Builder {

        private final int capacity;
        private final List<Function<String, Integer>> hashFunctions = new ArrayList<>();

        private Builder(int capacity) {
            this.capacity = capacity;
        }

        /**
         * Registers a hash function.
         *
         * @param function the hash function to add
         * @return this builder
         * @throws NullPointerException if {@code function} is {@code null}
         */
        public Builder addHashFunction(Function<String, Integer> function) {
            if (function == null) {
                throw new NullPointerException("function must be not null");
            }
            hashFunctions.add(function);
            return this;
        }

        /**
         * Builds the filter. If no hash function was registered, the default
         * set is installed first.
         *
         * @return a new, empty Bloom filter
         */
        public BloomFilter build() {
            if (hashFunctions.isEmpty()) {
                addDefaultHashFunction();
            }
            // Copy before wrapping so that later changes to this builder cannot
            // alter the hash functions of a filter that was already built.
            List<Function<String, Integer>> snapshot =
                    Collections.unmodifiableList(new ArrayList<>(hashFunctions));
            return new BloomFilter(capacity, snapshot);
        }

        /**
         * Installs the default hash functions.
         */
        private void addDefaultHashFunction() {
            // Java String hash function
            hashFunctions.add(String::hashCode);

            // SDBM hash function
            hashFunctions.add(key -> {
                if (key == null || key.isEmpty()) {
                    return 0;
                }
                int hash = 0;
                for (int i = 0; i < key.length(); i++) {
                    hash = key.charAt(i) + (hash << 6) + (hash << 16) - hash;
                }
                // Clear only the sign bit; the previous 27-bit mask (0x7ffffff)
                // made the upper part of large tables unreachable for this function.
                hash &= 0x7fffffff;
                return hash;
            });

            // Robert Sedgewick's hash function
            hashFunctions.add(key -> {
                if (key == null || key.isEmpty()) {
                    return 0;
                }
                int hash = 0;
                int magic = 63689;
                for (int i = 0; i < key.length(); i++) {
                    hash = hash * magic + key.charAt(i);
                    magic *= 378551;
                }
                return hash;
            });

            // Arash Partow's hash function
            hashFunctions.add(key -> {
                if (key == null || key.isEmpty()) {
                    return 0;
                }
                int hash = 0;
                for (int i = 0; i < key.length(); i++) {
                    char ch = key.charAt(i);
                    if ((i & 1) == 0) {
                        hash ^= ((hash << 7) ^ ch ^ (hash >> 3));
                    } else {
                        hash ^= (~((hash << 11) ^ ch ^ (hash >> 5)));
                    }
                }
                return hash;
            });
        }
    }
}

View File

@ -0,0 +1,70 @@
package src.test.java.com.search;
import org.junit.Test;
import src.main.java.com.search.BloomFilter;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ThreadLocalRandom;
import static org.junit.Assert.*;
/**
 * Smoke test for {@link BloomFilter}: checks that the filter never reports a
 * false negative and prints the observed false-positive statistics.
 */
public class BloomFilterTest {

    /** The 52 ASCII letters, interleaved as a, A, b, B, ... */
    private static final char[] CHAR_POOL;

    static {
        CHAR_POOL = new char[52];
        for (int k = 0; k < 26; k++) {
            char lower = (char) ('a' + k);
            CHAR_POOL[2 * k] = lower;
            CHAR_POOL[2 * k + 1] = (char) (lower - 32);
        }
    }

    /**
     * Generates random strings, inserts every second one into the filter and
     * then queries all of them. Any string the filter rejects must not have
     * been inserted; accepted strings that were not inserted are counted as
     * errors and reported on standard output.
     */
    @Test
    public void test() {
        int count = 100000;
        int low = 50, up = 100;

        BloomFilter filter = BloomFilter.builder(10000).build();
        Set<String> dataSet = new HashSet<>();
        String[] data = new String[count];

        for (int idx = 0; idx < count; idx++) {
            data[idx] = randomString(low, up);
            boolean evenIndex = (idx % 2 == 0);
            if (evenIndex) {
                dataSet.add(data[idx]);
                filter.add(data[idx]);
            }
        }

        int error = 0, total = 0;
        for (String candidate : data) {
            if (!filter.contains(candidate)) {
                // A rejected string must never have been inserted.
                assertFalse(dataSet.contains(candidate));
                continue;
            }
            total++;
            if (!dataSet.contains(candidate)) {
                error++;
            }
        }

        System.out.println("error: " + error);
        System.out.println("total: " + total);
        System.out.println("error rate : " + (double) error / total);
    }

    /**
     * Builds a random string of ASCII letters.
     *
     * @param minLength minimum length (inclusive)
     * @param maxLength maximum length (exclusive)
     * @return the generated string
     */
    public static String randomString(int minLength, int maxLength) {
        ThreadLocalRandom random = ThreadLocalRandom.current();
        int length = random.nextInt(minLength, maxLength);
        StringBuilder sb = new StringBuilder(length);
        for (int pos = 0; pos < length; pos++) {
            sb.append(CHAR_POOL[random.nextInt(CHAR_POOL.length)]);
        }
        return sb.toString();
    }
}