1+ package org .fastfilter .xor ;
2+
3+ import java .util .Arrays ;
4+
5+ import org .fastfilter .Filter ;
6+ import org .fastfilter .utils .Hash ;
7+
8+ /**
9+ * The xor binary fuse filter, a new algorithm that can replace a Bloom filter.
10+ */
11+ public class XorBinaryFuse16 implements Filter {
12+
13+ private static final int ARITY = 3 ;
14+
15+ private final int segmentCount ;
16+ private final int segmentCountLength ;
17+ private final int segmentLength ;
18+ private final int segmentLengthMask ;
19+ private final int arrayLength ;
20+ private final short [] fingerprints ;
21+ private long seed ;
22+
23+ public XorBinaryFuse16 (int segmentCount , int segmentLength ) {
24+ if (segmentLength < 0 || Integer .bitCount (segmentLength ) != 1 ) {
25+ throw new IllegalArgumentException ("Segment length needs to be a power of 2, is " + segmentLength );
26+ }
27+ if (segmentCount <= 0 ) {
28+ throw new IllegalArgumentException ("Illegal segment count: " + segmentCount );
29+ }
30+ this .segmentLength = segmentLength ;
31+ this .segmentCount = segmentCount ;
32+ this .segmentLengthMask = segmentLength - 1 ;
33+ this .segmentCountLength = segmentCount * segmentLength ;
34+ this .arrayLength = (segmentCount + ARITY - 1 ) * segmentLength ;
35+ this .fingerprints = new short [arrayLength ];
36+ }
37+
38+ public long getBitCount () {
39+ return arrayLength * 16L ;
40+ }
41+
42+ static int calculateSegmentLength (int arity , int size ) {
43+ int segmentLength ;
44+ if (arity == 3 ) {
45+ segmentLength = 1 << (int ) Math .floor (Math .log (size ) / Math .log (3.33 ) + 2.11 );
46+ } else if (arity == 4 ) {
47+ segmentLength = 1 << (int ) Math .floor (Math .log (size ) / Math .log (2.91 ) - 0.5 );
48+ } else {
49+ // not supported
50+ segmentLength = 65536 ;
51+ }
52+ return segmentLength ;
53+ }
54+
55+ static double calculateSizeFactor (int arity , int size ) {
56+ double sizeFactor ;
57+ if (arity == 3 ) {
58+ sizeFactor = Math .max (1.125 , 0.875 + 0.25 * Math .log (1000000 ) / Math .log (size ));
59+ } else if (arity == 4 ) {
60+ sizeFactor = Math .max (1.075 , 0.77 + 0.305 * Math .log (600000 ) / Math .log (size ));
61+ } else {
62+ // not supported
63+ sizeFactor = 2.0 ;
64+ }
65+ return sizeFactor ;
66+ }
67+
68+ private static int mod3 (int x ) {
69+ if (x > 2 ) {
70+ x -= 3 ;
71+ }
72+ return x ;
73+ }
74+
75+ public static XorBinaryFuse16 construct (long [] keys ) {
76+ int size = keys .length ;
77+ int segmentLength = calculateSegmentLength (ARITY , size );
78+ // the current implementation hardcodes a 18-bit limit to
79+ // to the segment length.
80+ if (segmentLength > (1 << 18 )) {
81+ segmentLength = (1 << 18 );
82+ }
83+ double sizeFactor = calculateSizeFactor (ARITY , size );
84+ int capacity = (int ) (size * sizeFactor );
85+ int segmentCount = (capacity + segmentLength - 1 ) / segmentLength - (ARITY - 1 );
86+ int arrayLength = (segmentCount + ARITY - 1 ) * segmentLength ;
87+ segmentCount = (arrayLength + segmentLength - 1 ) / segmentLength ;
88+ segmentCount = segmentCount <= ARITY - 1 ? 1 : segmentCount - (ARITY - 1 );
89+ XorBinaryFuse16 filter = new XorBinaryFuse16 (segmentCount , segmentLength );
90+ filter .addAll (keys );
91+ return filter ;
92+ }
93+
94+ private void addAll (long [] keys ) {
95+ int size = keys .length ;
96+ long [] reverseOrder = new long [size + 1 ];
97+ byte [] reverseH = new byte [size ];
98+ int reverseOrderPos = 0 ;
99+
100+ // the lowest 2 bits are the h index (0, 1, or 2)
101+ // so we only have 6 bits for counting;
102+ // but that's sufficient
103+ byte [] t2count = new byte [arrayLength ];
104+ long [] t2hash = new long [arrayLength ];
105+ int [] alone = new int [arrayLength ];
106+ int hashIndex = 0 ;
107+ // the array h0, h1, h2, h0, h1, h2
108+ int [] h012 = new int [5 ];
109+ int blockBits = 1 ;
110+ while ((1 << blockBits ) < segmentCount ) {
111+ blockBits ++;
112+ }
113+ int block = 1 << blockBits ;
114+ mainloop :
115+ while (true ) {
116+ reverseOrder [size ] = 1 ;
117+ int [] startPos = new int [block ];
118+ for (int i = 0 ; i < 1 << blockBits ; i ++) {
119+ startPos [i ] = (int ) ((long ) i * size / block );
120+ }
121+ // counting sort
122+
123+ for (long key : keys ) {
124+ long hash = Hash .hash64 (key , seed );
125+ int segmentIndex = (int ) (hash >>> (64 - blockBits ));
126+ // We only overwrite when the hash was zero. Zero hash values
127+ // may be misplaced (unlikely).
128+ while (reverseOrder [startPos [segmentIndex ]] != 0 ) {
129+ segmentIndex ++;
130+ segmentIndex &= (1 << blockBits ) - 1 ;
131+ }
132+ reverseOrder [startPos [segmentIndex ]] = hash ;
133+ startPos [segmentIndex ]++;
134+ }
135+ byte countMask = 0 ;
136+ for (int i = 0 ; i < size ; i ++) {
137+ long hash = reverseOrder [i ];
138+ for (int hi = 0 ; hi < 3 ; hi ++) {
139+ int index = getHashFromHash (hash , hi );
140+ t2count [index ] += 4 ;
141+ t2count [index ] ^= hi ;
142+ t2hash [index ] ^= hash ;
143+ countMask |= t2count [index ];
144+ }
145+ }
146+ startPos = null ;
147+ if (countMask < 0 ) {
148+ // we have a possible counter overflow
149+ continue mainloop ;
150+ }
151+
152+ reverseOrderPos = 0 ;
153+ int alonePos = 0 ;
154+ for (int i = 0 ; i < arrayLength ; i ++) {
155+ alone [alonePos ] = i ;
156+ int inc = (t2count [i ] >> 2 ) == 1 ? 1 : 0 ;
157+ alonePos += inc ;
158+ }
159+
160+ while (alonePos > 0 ) {
161+ alonePos --;
162+ int index = alone [alonePos ];
163+ if ((t2count [index ] >> 2 ) == 1 ) {
164+ // It is still there!
165+ long hash = t2hash [index ];
166+ byte found = (byte ) (t2count [index ] & 3 );
167+
168+ reverseH [reverseOrderPos ] = found ;
169+ reverseOrder [reverseOrderPos ] = hash ;
170+
171+ h012 [0 ] = getHashFromHash (hash , 0 );
172+ h012 [1 ] = getHashFromHash (hash , 1 );
173+ h012 [2 ] = getHashFromHash (hash , 2 );
174+
175+ int index3 = h012 [mod3 (found + 1 )];
176+ alone [alonePos ] = index3 ;
177+ alonePos += ((t2count [index3 ] >> 2 ) == 2 ? 1 : 0 );
178+ t2count [index3 ] -= 4 ;
179+ t2count [index3 ] ^= mod3 (found + 1 );
180+ t2hash [index3 ] ^= hash ;
181+
182+ index3 = h012 [mod3 (found + 2 )];
183+ alone [alonePos ] = index3 ;
184+ alonePos += ((t2count [index3 ] >> 2 ) == 2 ? 1 : 0 );
185+ t2count [index3 ] -= 4 ;
186+ t2count [index3 ] ^= mod3 (found + 2 );
187+ t2hash [index3 ] ^= hash ;
188+
189+ reverseOrderPos ++;
190+ }
191+ }
192+
193+ if (reverseOrderPos == size ) {
194+ break ;
195+ }
196+ hashIndex ++;
197+ Arrays .fill (t2count , (byte ) 0 );
198+ Arrays .fill (t2hash , 0 );
199+ Arrays .fill (reverseOrder , 0 );
200+
201+ if (hashIndex > 100 ) {
202+ // if construction doesn't succeed eventually,
203+ // then there is likely a problem with the hash function
204+ // let us not crash the system:
205+ for (int i = 0 ; i < fingerprints .length ; i ++) {
206+ fingerprints [i ] = (short )0xFFFF ;
207+ }
208+ return ;
209+ }
210+ // use a new random numbers
211+ seed = Hash .randomSeed ();
212+ }
213+ alone = null ;
214+ t2count = null ;
215+ t2hash = null ;
216+
217+ for (int i = reverseOrderPos - 1 ; i >= 0 ; i --) {
218+ long hash = reverseOrder [i ];
219+ int found = reverseH [i ];
220+ short xor2 = fingerprint (hash );
221+ h012 [0 ] = getHashFromHash (hash , 0 );
222+ h012 [1 ] = getHashFromHash (hash , 1 );
223+ h012 [2 ] = getHashFromHash (hash , 2 );
224+ h012 [3 ] = h012 [0 ];
225+ h012 [4 ] = h012 [1 ];
226+ fingerprints [h012 [found ]] = (short ) (xor2 ^ fingerprints [h012 [found + 1 ]] ^ fingerprints [h012 [found + 2 ]]);
227+ }
228+ }
229+
230+ @ Override
231+ public boolean mayContain (long key ) {
232+ long hash = Hash .hash64 (key , seed );
233+ short f = fingerprint (hash );
234+ int h0 = Hash .reduce ((int ) (hash >>> 32 ), segmentCountLength );
235+ int h1 = h0 + segmentLength ;
236+ int h2 = h1 + segmentLength ;
237+ long hh = hash ;
238+ h1 ^= (int ) ((hh >> 18 ) & segmentLengthMask );
239+ h2 ^= (int ) ((hh ) & segmentLengthMask );
240+ f ^= fingerprints [h0 ] ^ fingerprints [h1 ] ^ fingerprints [h2 ];
241+ return (f & 0xffff ) == 0 ;
242+ }
243+
244+ @ Override
245+ public String toString () {
246+ return "segmentLength " + segmentLength + " segmentCount " + segmentCount ;
247+ }
248+
249+ int getHashFromHash (long hash , int index ) {
250+ long h = Hash .reduce ((int ) (hash >>> 32 ), segmentCountLength );
251+ // long h = Hash.multiplyHighUnsigned(hash, segmentCountLength);
252+ h += index * segmentLength ;
253+ // keep the lower 36 bits
254+ long hh = hash & ((1L << 36 ) - 1 );
255+ // index 0: right shift by 36; index 1: right shift by 18; index 2: no shift
256+ h ^= (int ) ((hh >>> (36 - 18 * index )) & segmentLengthMask );
257+ return (int ) h ;
258+ }
259+
260+ private short fingerprint (long hash ) {
261+ return (short ) hash ;
262+ }
263+
264+ }
0 commit comments