Skip to content

Commit c50b5f8

Browse files
authored
Merge branch 'master' into add_fuse16
2 parents 48bbddf + c79e291 commit c50b5f8

File tree

7 files changed

+313
-8
lines changed

7 files changed

+313
-8
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ The following filter types are currently implemented:
66

77
* Xor filter: 8 and 16 bit variants; needs less space than cuckoo filters, with faster lookup
88
* Xor+ filter: 8 and 16 bit variants; compressed xor filter
9-
* Xor binary fuse filter: 8 bit variant; needs less space than xor filters, with faster lookup
9+
* Xor binary fuse filter: 8, 16 and 32 bit variants; needs less space than xor filters, with faster lookup
1010
* Cuckoo filter: 8 and 16 bit variants; uses cuckoo hashing to store fingerprints
1111
* Cuckoo+ filter: 8 and 16 bit variants, need a bit less space than regular cuckoo filters
1212
* Bloom filter: the 'standard' algorithm
@@ -32,7 +32,7 @@ When using Maven:
3232
<dependency>
3333
<groupId>io.github.fastfilter</groupId>
3434
<artifactId>fastfilter</artifactId>
35-
<version>1.0.2</version>
35+
<version>1.0.3</version>
3636
</dependency>
3737

3838
# Other Xor Filter Implementations

fastfilter/src/main/java/org/fastfilter/FilterType.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,12 @@ public Filter construct(long[] keys, int setting) {
7575
return XorBinaryFuse8.construct(keys);
7676
}
7777
},
78+
XOR_BINARY_FUSE_32 {
79+
@Override
80+
public Filter construct(long[] keys, int setting) {
81+
return XorBinaryFuse32.construct(keys);
82+
}
83+
},
7884
XOR_PLUS_8 {
7985
@Override
8086
public Filter construct(long[] keys, int setting) {
Lines changed: 264 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,264 @@
1+
package org.fastfilter.xor;
2+
3+
import java.util.Arrays;
4+
5+
import org.fastfilter.Filter;
6+
import org.fastfilter.utils.Hash;
7+
8+
/**
9+
* The xor binary fuse filter, a new algorithm that can replace a Bloom filter.
10+
*/
11+
public class XorBinaryFuse32 implements Filter {
12+
13+
private static final int ARITY = 3;
14+
15+
private final int segmentCount;
16+
private final int segmentCountLength;
17+
private final int segmentLength;
18+
private final int segmentLengthMask;
19+
private final int arrayLength;
20+
private final int[] fingerprints;
21+
private long seed;
22+
23+
public XorBinaryFuse32(int segmentCount, int segmentLength) {
24+
if (segmentLength < 0 || Integer.bitCount(segmentLength) != 1) {
25+
throw new IllegalArgumentException("Segment length needs to be a power of 2, is " + segmentLength);
26+
}
27+
if (segmentCount <= 0) {
28+
throw new IllegalArgumentException("Illegal segment count: " + segmentCount);
29+
}
30+
this.segmentLength = segmentLength;
31+
this.segmentCount = segmentCount;
32+
this.segmentLengthMask = segmentLength - 1;
33+
this.segmentCountLength = segmentCount * segmentLength;
34+
this.arrayLength = (segmentCount + ARITY - 1) * segmentLength;
35+
this.fingerprints = new int[arrayLength];
36+
}
37+
38+
public long getBitCount() {
39+
return ((long) (arrayLength)) * Integer.SIZE;
40+
}
41+
42+
static int calculateSegmentLength(int arity, int size) {
43+
int segmentLength;
44+
if (arity == 3) {
45+
segmentLength = 1 << (int) Math.floor(Math.log(size) / Math.log(3.33) + 2.11);
46+
} else if (arity == 4) {
47+
segmentLength = 1 << (int) Math.floor(Math.log(size) / Math.log(2.91) - 0.5);
48+
} else {
49+
// not supported
50+
segmentLength = 65536;
51+
}
52+
return segmentLength;
53+
}
54+
55+
static double calculateSizeFactor(int arity, int size) {
56+
double sizeFactor;
57+
if (arity == 3) {
58+
sizeFactor = Math.max(1.125, 0.875 + 0.25 * Math.log(1000000) / Math.log(size));
59+
} else if (arity == 4) {
60+
sizeFactor = Math.max(1.075, 0.77 + 0.305 * Math.log(600000) / Math.log(size));
61+
} else {
62+
// not supported
63+
sizeFactor = 2.0;
64+
}
65+
return sizeFactor;
66+
}
67+
68+
private static int mod3(int x) {
69+
if (x > 2) {
70+
x -= 3;
71+
}
72+
return x;
73+
}
74+
75+
public static XorBinaryFuse32 construct(long[] keys) {
76+
int size = keys.length;
77+
int segmentLength = calculateSegmentLength(ARITY, size);
78+
// the current implementation hardcodes a 18-bit limit to
79+
// to the segment length.
80+
if (segmentLength > (1 << 18)) {
81+
segmentLength = (1 << 18);
82+
}
83+
double sizeFactor = calculateSizeFactor(ARITY, size);
84+
int capacity = (int) (size * sizeFactor);
85+
int segmentCount = (capacity + segmentLength - 1) / segmentLength - (ARITY - 1);
86+
int arrayLength = (segmentCount + ARITY - 1) * segmentLength;
87+
segmentCount = (arrayLength + segmentLength - 1) / segmentLength;
88+
segmentCount = segmentCount <= ARITY - 1 ? 1 : segmentCount - (ARITY - 1);
89+
XorBinaryFuse32 filter = new XorBinaryFuse32(segmentCount, segmentLength);
90+
filter.addAll(keys);
91+
return filter;
92+
}
93+
94+
private void addAll(long[] keys) {
95+
int size = keys.length;
96+
long[] reverseOrder = new long[size + 1];
97+
int[] reverseH = new int[size];
98+
int reverseOrderPos = 0;
99+
100+
// the lowest 2 bits are the h index (0, 1, or 2)
101+
// so we only have 6 bits for counting;
102+
// but that's sufficient
103+
int[] t2count = new int[arrayLength];
104+
long[] t2hash = new long[arrayLength];
105+
int[] alone = new int[arrayLength];
106+
int hashIndex = 0;
107+
// the array h0, h1, h2, h0, h1, h2
108+
int[] h012 = new int[5];
109+
int blockBits = 1;
110+
while ((1 << blockBits) < segmentCount) {
111+
blockBits++;
112+
}
113+
int block = 1 << blockBits;
114+
mainloop:
115+
while (true) {
116+
reverseOrder[size] = 1;
117+
int[] startPos = new int[block];
118+
for (int i = 0; i < 1 << blockBits; i++) {
119+
startPos[i] = (int) ((long) i * size / block);
120+
}
121+
// counting sort
122+
123+
for (long key : keys) {
124+
long hash = Hash.hash64(key, seed);
125+
int segmentIndex = (int) (hash >>> (64 - blockBits));
126+
// We only overwrite when the hash was zero. Zero hash values
127+
// may be misplaced (unlikely).
128+
while (reverseOrder[startPos[segmentIndex]] != 0) {
129+
segmentIndex++;
130+
segmentIndex &= (1 << blockBits) - 1;
131+
}
132+
reverseOrder[startPos[segmentIndex]] = hash;
133+
startPos[segmentIndex]++;
134+
}
135+
int countMask = 0;
136+
for (int i = 0; i < size; i++) {
137+
long hash = reverseOrder[i];
138+
for (int hi = 0; hi < 3; hi++) {
139+
int index = getHashFromHash(hash, hi);
140+
t2count[index] += 4;
141+
t2count[index] ^= hi;
142+
t2hash[index] ^= hash;
143+
countMask |= t2count[index];
144+
}
145+
}
146+
startPos = null;
147+
if (countMask < 0) {
148+
// we have a possible counter overflow
149+
continue mainloop;
150+
}
151+
152+
reverseOrderPos = 0;
153+
int alonePos = 0;
154+
for (int i = 0; i < arrayLength; i++) {
155+
alone[alonePos] = i;
156+
int inc = (t2count[i] >> 2) == 1 ? 1 : 0;
157+
alonePos += inc;
158+
}
159+
160+
while (alonePos > 0) {
161+
alonePos--;
162+
int index = alone[alonePos];
163+
if ((t2count[index] >> 2) == 1) {
164+
// It is still there!
165+
long hash = t2hash[index];
166+
int found = t2count[index] & 3;
167+
168+
reverseH[reverseOrderPos] = found;
169+
reverseOrder[reverseOrderPos] = hash;
170+
171+
h012[0] = getHashFromHash(hash, 0);
172+
h012[1] = getHashFromHash(hash, 1);
173+
h012[2] = getHashFromHash(hash, 2);
174+
175+
int index3 = h012[mod3(found + 1)];
176+
alone[alonePos] = index3;
177+
alonePos += ((t2count[index3] >> 2) == 2 ? 1 : 0);
178+
t2count[index3] -= 4;
179+
t2count[index3] ^= mod3(found + 1);
180+
t2hash[index3] ^= hash;
181+
182+
index3 = h012[mod3(found + 2)];
183+
alone[alonePos] = index3;
184+
alonePos += ((t2count[index3] >> 2) == 2 ? 1 : 0);
185+
t2count[index3] -= 4;
186+
t2count[index3] ^= mod3(found + 2);
187+
t2hash[index3] ^= hash;
188+
189+
reverseOrderPos++;
190+
}
191+
}
192+
193+
if (reverseOrderPos == size) {
194+
break;
195+
}
196+
hashIndex++;
197+
Arrays.fill(t2count, 0);
198+
Arrays.fill(t2hash, 0);
199+
Arrays.fill(reverseOrder, 0);
200+
201+
if (hashIndex > 100) {
202+
// if construction doesn't succeed eventually,
203+
// then there is likely a problem with the hash function
204+
// let us not crash the system:
205+
for (int i = 0; i < fingerprints.length; i++) {
206+
fingerprints[i] = (int) 0xFFFFFFFF;
207+
}
208+
return;
209+
}
210+
// use a new random numbers
211+
seed = Hash.randomSeed();
212+
}
213+
alone = null;
214+
t2count = null;
215+
t2hash = null;
216+
217+
for (int i = reverseOrderPos - 1; i >= 0; i--) {
218+
long hash = reverseOrder[i];
219+
int found = reverseH[i];
220+
int xor2 = fingerprint(hash);
221+
h012[0] = getHashFromHash(hash, 0);
222+
h012[1] = getHashFromHash(hash, 1);
223+
h012[2] = getHashFromHash(hash, 2);
224+
h012[3] = h012[0];
225+
h012[4] = h012[1];
226+
fingerprints[h012[found]] = (xor2 ^ fingerprints[h012[found + 1]] ^ fingerprints[h012[found + 2]]);
227+
}
228+
}
229+
230+
@Override
231+
public boolean mayContain(long key) {
232+
long hash = Hash.hash64(key, seed);
233+
int f = fingerprint(hash);
234+
int h0 = Hash.reduce((int) (hash >>> 32), segmentCountLength);
235+
int h1 = h0 + segmentLength;
236+
int h2 = h1 + segmentLength;
237+
long hh = hash;
238+
h1 ^= (int) ((hh >> 18) & segmentLengthMask);
239+
h2 ^= (int) ((hh) & segmentLengthMask);
240+
f ^= fingerprints[h0] ^ fingerprints[h1] ^ fingerprints[h2];
241+
return (f & 0xffffffffL) == 0;
242+
}
243+
244+
@Override
245+
public String toString() {
246+
return "segmentLength " + segmentLength + " segmentCount " + segmentCount;
247+
}
248+
249+
int getHashFromHash(long hash, int index) {
250+
long h = Hash.reduce((int) (hash >>> 32), segmentCountLength);
251+
// long h = Hash.multiplyHighUnsigned(hash, segmentCountLength);
252+
h += index * segmentLength;
253+
// keep the lower 36 bits
254+
long hh = hash & ((1L << 36) - 1);
255+
// index 0: right shift by 36; index 1: right shift by 18; index 2: no shift
256+
h ^= (int) ((hh >>> (36 - 18 * index)) & segmentLengthMask);
257+
return (int) h;
258+
}
259+
260+
private int fingerprint(long hash) {
261+
return (int) (hash ^ (hash >>> 32));
262+
}
263+
264+
}

fastfilter/src/test/java/org/fastfilter/TestAllFilters.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,14 @@ public static void main(String... args) {
107107
System.out.println("size " + size);
108108
test(TestFilterType.XOR_BINARY_FUSE_8, size, 0, true);
109109
}
110+
for (int size = 1; size <= 100; size++) {
111+
System.out.println("size " + size);
112+
test(TestFilterType.XOR_BINARY_FUSE_32, size, 0, true);
113+
}
114+
for (int size = 100; size <= 100000; size *= 1.1) {
115+
System.out.println("size " + size);
116+
test(TestFilterType.XOR_BINARY_FUSE_32, size, 0, true);
117+
}
110118
for (int size = 1_000_000; size <= 8_000_000; size *= 2) {
111119
System.out.println("size " + size);
112120
testAll(size, true);

fastfilter/src/test/java/org/fastfilter/TestFilterType.java

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,7 @@
1111
import org.fastfilter.gcs.GolombCompressedSet;
1212
import org.fastfilter.gcs.GolombCompressedSet2;
1313
import org.fastfilter.mphf.MPHFilter;
14-
import org.fastfilter.xor.Xor16;
15-
import org.fastfilter.xor.Xor8;
16-
import org.fastfilter.xor.XorBinaryFuse8;
17-
import org.fastfilter.xor.XorBinaryFuse16;
14+
import org.fastfilter.xor.*;
1815
import org.fastfilter.xorplus.XorPlus8;
1916

2017
/**
@@ -87,6 +84,12 @@ public Filter construct(long[] keys, int setting) {
8784
return XorBinaryFuse16.construct(keys);
8885
}
8986
},
87+
XOR_BINARY_FUSE_32 {
88+
@Override
89+
public Filter construct(long[] keys, int setting) {
90+
return XorBinaryFuse32.construct(keys);
91+
}
92+
},
9093
CUCKOO_8 {
9194
@Override
9295
public Filter construct(long[] keys, int setting) {
@@ -133,7 +136,7 @@ public Filter construct(long[] keys, int setting) {
133136
/**
134137
* Construct the filter with the given keys and the setting.
135138
*
136-
* @param keys the keys
139+
* @param keys the keys
137140
* @param setting the setting (roughly bits per fingerprint)
138141
* @return the constructed filter
139142
*/

fastfilter/src/test/java/org/fastfilter/xor/SmallSetTest.java

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ public void small() {
1414
Xor16.construct(new long[]{0xef9bddc5166c081cL, 0x33bf87adaa46dcfcL});
1515
XorBinaryFuse8.construct(new long[]{0xef9bddc5166c081cL, 0x33bf87adaa46dcfcL});
1616
XorBinaryFuse16.construct(new long[]{0xef9bddc5166c081cL, 0x33bf87adaa46dcfcL});
17+
XorBinaryFuse32.construct(new long[]{0xef9bddc5166c081cL, 0x33bf87adaa46dcfcL});
1718
}
1819

1920
@Test
@@ -26,7 +27,29 @@ public void verySmallSizes() {
2627
testWithSize(n);
2728
}
2829
}
29-
30+
31+
32+
@Test
33+
public void smallSizes32() {
34+
long lastTime = System.currentTimeMillis();
35+
for (int n = 1; n < 1_500_000; n = (int) ((n * 1.01) + 7)) {
36+
XorBinaryFuse32 f = testWithSize32(n);
37+
long now = System.currentTimeMillis();
38+
if (now - lastTime > 5000) {
39+
lastTime = now;
40+
System.out.println("n=" + n + " " + f.toString());
41+
}
42+
}
43+
}
44+
45+
private static XorBinaryFuse32 testWithSize32(int n) {
46+
long[] keys = new long[n];
47+
for (int i = 0; i < n; i++) {
48+
keys[i] = i;
49+
}
50+
return XorBinaryFuse32.construct(keys);
51+
}
52+
3053
@Test
3154
public void smallSizes() {
3255
long lastTime = System.currentTimeMillis();

jmh/src/main/java/org/fastfilter/ConstructionState.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ public class ConstructionState {
2222
"XOR_8",
2323
"XOR_16",
2424
"XOR_BINARY_FUSE_8",
25+
"XOR_BINARY_FUSE_32",
2526
"XOR_PLUS_8",
2627
"CUCKOO_8",
2728
"CUCKOO_16",

0 commit comments

Comments
 (0)