Merge pull request #714 from MrYangxf/Development
Add a simple implementation of Bloom filter.
This commit is contained in:
commit
ee8147cdda
190
src/main/java/com/search/BloomFilter.java
Normal file
190
src/main/java/com/search/BloomFilter.java
Normal file
@ -0,0 +1,190 @@
|
||||
package src.main.java.com.search;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.function.Function;
|
||||
|
||||
/**
 * A simple Bloom filter for {@link String} elements.
 * <p>
 * A Bloom filter is a probabilistic set that can only be wrong in one direction:
 * if {@link #contains(String)} returns {@code false}, the element was definitely never
 * added; if it returns {@code true}, the element was <em>probably</em> added, but a
 * false positive is possible.
 * <p>
 * The false-positive rate depends on the capacity and on the hash functions in use.
 * <p>
 * Instances are created through {@link #builder(int)}. This class performs no
 * synchronization of its own.
 * <p>
 * Note on serialization: the list of hash functions is an ordinary (non-transient)
 * field, so an instance can only be serialized when every hash function in it is itself
 * serializable. The built-in default functions are plain lambdas.
 *
 * @author yangxf
 */
public class BloomFilter implements Serializable {
    private static final long serialVersionUID = -4466610350741278658L;

    /** log2(64): shifting a bit index right by this amount yields the index of its word. */
    private static final int LONG_SHIFT = 6;
    /** Masking a bit index with this value yields the bit position inside its 64-bit word. */
    private static final int LONG_MASK = 63;

    /**
     * Hash functions applied to every element; each one selects one bit of the table.
     */
    private final List<Function<String, Integer>> hashFunctions;

    /** Bit table, stored as 64-bit words. Its length is always a power of two. */
    private final long[] table;
    /** Mask that reduces a raw hash value to a bit index of {@link #table}. */
    private final int tableMask;
    /** Number of {@link #add(String)} calls so far (duplicates are counted again). */
    private int size;

    /**
     * Creates a filter; only reachable through {@link Builder#build()}.
     *
     * @param capacity      requested number of 64-bit words in the bit table; rounded up
     *                      to the next power of two
     * @param hashFunctions hash functions to use, stored as given (not copied here)
     */
    private BloomFilter(int capacity, List<Function<String, Integer>> hashFunctions) {
        this.hashFunctions = hashFunctions;
        int cap = nextPowerOf2(capacity);
        // cap is a power of two, so (cap * 64) - 1 is an all-ones mask over the bit indexes.
        tableMask = (cap << LONG_SHIFT) - 1;
        table = new long[cap];
    }

    /**
     * Starts building a filter.
     *
     * @param capacity requested number of 64-bit words in the bit table, must be positive
     * @return a new builder
     * @throws IllegalStateException if {@code capacity} is less than 1
     */
    public static Builder builder(int capacity) {
        if (capacity < 1) {
            throw new IllegalStateException("capacity must be > 0");
        }

        return new Builder(capacity);
    }

    /**
     * Adds an element to the Bloom filter.
     *
     * @param element the element to add
     * @throws NullPointerException if {@code element} is {@code null}
     */
    public void add(String element) {
        checkNotNull(element, "element");

        for (Function<String, Integer> hashFunction : hashFunctions) {
            int key = hashFunction.apply(element) & tableMask;
            table[wordIndex(key)] |= bitMask(key);
        }
        size++;
    }

    /**
     * Tests whether an element may have been added.
     *
     * @param element the element to look up; {@code null} is never contained
     * @return {@code false} if the element was definitely not added, {@code true} if every
     *         bit selected by the hash functions is set (the element was probably added)
     */
    public boolean contains(String element) {
        if (element == null) {
            return false;
        }

        for (Function<String, Integer> hashFunction : hashFunctions) {
            int key = hashFunction.apply(element) & tableMask;
            if ((table[wordIndex(key)] & bitMask(key)) == 0) {
                return false;
            }
        }
        return true;
    }

    /**
     * @return the hash functions used by this filter, as an unmodifiable list
     */
    public List<Function<String, Integer>> getHashFunctions() {
        return hashFunctions;
    }

    /**
     * @return how many times {@link #add(String)} has been called on this filter
     */
    public int size() {
        return size;
    }

    /** Returns the index of the 64-bit word that holds the given bit index. */
    private static int wordIndex(int key) {
        return key >>> LONG_SHIFT;
    }

    /**
     * Returns the single-bit mask for the given bit index inside its word.
     * <p>
     * The shift must be done on a {@code long}: shifting the {@code int} literal
     * {@code 1} only honours the low five bits of the distance, so positions 32..63 would
     * alias onto 0..31, and position 31 would sign-extend into the whole upper half of
     * the word.
     */
    private static long bitMask(int key) {
        return 1L << (key & LONG_MASK);
    }

    /** Throws a {@link NullPointerException} naming {@code msg} when {@code element} is null. */
    private static void checkNotNull(String element, String msg) {
        if (element == null) {
            throw new NullPointerException(msg + " must be not null");
        }
    }

    /**
     * Rounds {@code i} up to the next power of two, with a minimum of 1 and a maximum of
     * {@code 0x40000000}.
     */
    private static int nextPowerOf2(int i) {
        // Smear the highest set bit of (i - 1) into every lower position, then add one.
        int n = i - 1;
        n |= n >>> 1;
        n |= n >>> 2;
        n |= n >>> 4;
        n |= n >>> 8;
        n |= n >>> 16;
        return (n < 0) ? 1 : (n >= 0x40000000) ? 0x40000000 : n + 1;
    }

    /**
     * Builder for {@link BloomFilter}. It collects the hash functions and hands the
     * filter an unmodifiable snapshot of them.
     */
    public static class Builder {
        private final int capacity;
        private final List<Function<String, Integer>> hashFunctions = new ArrayList<>();

        private Builder(int capacity) {
            this.capacity = capacity;
        }

        /**
         * Registers a hash function to be used by filters built afterwards.
         *
         * @param function maps an element to an arbitrary {@code int}; the filter reduces
         *                 the value to a bit index itself
         * @return this builder
         */
        public Builder addHashFunction(Function<String, Integer> function) {
            hashFunctions.add(function);
            return this;
        }

        /**
         * Builds the filter. If no hash function was registered, a default set is
         * installed first.
         *
         * @return a new, empty filter
         */
        public BloomFilter build() {
            if (hashFunctions.isEmpty()) {
                addDefaultHashFunction();
            }
            // Snapshot the list: the filter must not observe later addHashFunction calls,
            // because changing the functions after elements were added would invalidate
            // the bits already set.
            List<Function<String, Integer>> snapshot = new ArrayList<>(hashFunctions);
            return new BloomFilter(capacity, Collections.unmodifiableList(snapshot));
        }

        /**
         * Installs the default hash functions.
         */
        private void addDefaultHashFunction() {
            // Java String Hash Function
            hashFunctions.add(String::hashCode);

            // SDBM Hash Function
            hashFunctions.add(key -> {
                if (key == null || key.isEmpty()) {
                    return 0;
                }

                int hash = 0;
                for (int i = 0; i < key.length(); i++) {
                    hash = key.charAt(i) + (hash << 6) + (hash << 16) - hash;
                }
                // Clear only the sign bit. The previous mask (0x7ffffff) was one hex digit
                // short and discarded the top five bits of the hash as well.
                hash &= 0x7fffffff;
                return hash;
            });

            // Robert Sedgwicks Hash Function
            hashFunctions.add(key -> {
                if (key == null || key.isEmpty()) {
                    return 0;
                }

                int hash = 0;
                int magic = 63689;
                for (int i = 0; i < key.length(); i++) {
                    hash = hash * magic + key.charAt(i);
                    magic *= 378551;
                }
                return hash;
            });

            // Arash Partow Hash Function
            hashFunctions.add(key -> {
                if (key == null || key.isEmpty()) {
                    return 0;
                }

                int hash = 0;
                for (int i = 0; i < key.length(); i++) {
                    char ch = key.charAt(i);
                    // Even and odd positions are mixed in with different shifts.
                    if ((i & 1) == 0) {
                        hash ^= ((hash << 7) ^ ch ^ (hash >> 3));
                    } else {
                        hash ^= (~((hash << 11) ^ ch ^ (hash >> 5)));
                    }
                }
                return hash;
            });
        }

    }

}
|
70
src/test/java/com/search/BloomFilterTest.java
Normal file
70
src/test/java/com/search/BloomFilterTest.java
Normal file
@ -0,0 +1,70 @@
|
||||
package src.test.java.com.search;
|
||||
|
||||
import org.junit.Test;
|
||||
import src.main.java.com.search.BloomFilter;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.ThreadLocalRandom;
|
||||
|
||||
import static org.junit.Assert.*;
|
||||
|
||||
public class BloomFilterTest {
|
||||
|
||||
@Test
|
||||
public void test() {
|
||||
int count = 100000;
|
||||
int low = 50, up = 100;
|
||||
BloomFilter filter = BloomFilter.builder(10000).build();
|
||||
String[] data = new String[count];
|
||||
Set<String> dataSet = new HashSet<>();
|
||||
for (int i = 0; i < count; i++) {
|
||||
String str = randomString(low, up);
|
||||
data[i] = str;
|
||||
if (i % 2 == 0) {
|
||||
dataSet.add(str);
|
||||
filter.add(str);
|
||||
}
|
||||
}
|
||||
|
||||
int error = 0, total = 0;
|
||||
for (int i = 0; i < count; i++) {
|
||||
String str = data[i];
|
||||
if (filter.contains(str)) {
|
||||
total++;
|
||||
if (!dataSet.contains(str)) {
|
||||
error++;
|
||||
}
|
||||
} else {
|
||||
assertFalse(dataSet.contains(str));
|
||||
}
|
||||
}
|
||||
System.out.println("error: " + error);
|
||||
System.out.println("total: " + total);
|
||||
System.out.println("error rate : " + (double) error / total);
|
||||
}
|
||||
|
||||
public static String randomString(int minLength, int maxLength) {
|
||||
ThreadLocalRandom r = ThreadLocalRandom.current();
|
||||
int chLen = r.nextInt(minLength, maxLength),
|
||||
poolSize = CHAR_POOL.length;
|
||||
char[] chars = new char[chLen];
|
||||
for (int i = 0; i < chLen; i++) {
|
||||
chars[i] = CHAR_POOL[r.nextInt(poolSize)];
|
||||
}
|
||||
|
||||
return new String(chars);
|
||||
}
|
||||
|
||||
private static final char[] CHAR_POOL;
|
||||
|
||||
static {
|
||||
CHAR_POOL = new char[52];
|
||||
int i = 0;
|
||||
for (char c = 'a'; c <= 'z'; c++) {
|
||||
CHAR_POOL[i++] = c;
|
||||
CHAR_POOL[i++] = (char) (c - 32);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue
Block a user