001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.collections4.bloomfilter; 018 019import java.util.Objects; 020import java.util.function.IntPredicate; 021 022/** 023 * A Hasher that implements combinatorial hashing as described by 024 * <a href="https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf">Krisch and Mitzenmacher</a> using the enhanced double hashing technique 025 * described in the wikipedia article <a href="https://en.wikipedia.org/wiki/Double_hashing#Enhanced_double_hashing">Double Hashing</a>. 026 * <p> 027 * Common use for this hasher is to generate bit indices from a byte array output of a hashing 028 * or MessageDigest algorithm.</p> 029 * 030 * <h2>Thoughts on the hasher input</h2> 031 * 032 *<p>Note that it is worse to create smaller numbers for the {@code initial} and {@code increment}. If the {@code initial} is smaller than 033 * the number of bits in a filter then hashing will start at the same point when the size increases; likewise the {@code increment} will be 034 * the same if it remains smaller than the number of bits in the filter and so the first few indices will be the same if the number of bits 035 * changes (but is still larger than the {@code increment}). In a worse case scenario with small {@code initial} and {@code increment} for 036 * all items, hashing may not create indices that fill the full region within a much larger filter. Imagine hashers created with {@code initial} 037 * and {@code increment} values less than 255 with a filter size of 30000 and number of hash functions as 5. Ignoring the 038 * tetrahedral addition (a maximum of 20 for k=5) the max index is 255 * 4 + 255 = 1275, this covers 4.25% of the filter. This also 039 * ignores the negative wrapping but the behavior is the same, some bits cannot be reached. 040 * </p><p> 041 * So this needs to be avoided as the filter probability assumptions will be void. If the {@code initial} and {@code increment} are larger 042 * than the number of bits then the modulus will create a 'random' position and increment within the size. 043 * </p> 044 * 045 * @since 4.5 046 */ 047public class EnhancedDoubleHasher implements Hasher { 048 049 /** 050 * Convert bytes to big-endian long filling with zero bytes as necessary.. 051 * @param byteArray the byte array to extract the values from. 052 * @param offset the offset to start extraction from. 053 * @param len the length of the extraction, may be longer than 8. 054 * @return 055 */ 056 private static long toLong(final byte[] byteArray, final int offset, final int len) { 057 long val = 0; 058 int shift = Long.SIZE; 059 final int end = offset + Math.min(len, Long.BYTES); 060 for (int i = offset; i < end; i++) { 061 shift -= Byte.SIZE; 062 val |= (long) (byteArray[i] & 0xFF) << shift; 063 } 064 return val; 065 } 066 067 /** 068 * The initial hash value. 069 */ 070 private final long initial; 071 072 /** 073 * The value to increment the hash value by. 074 */ 075 private final long increment; 076 077 /** 078 * Constructs the EnhancedDoubleHasher from a byte array. 079 * <p> 080 * This method simplifies the conversion from a Digest or hasher algorithm output 081 * to the two values used by the EnhancedDoubleHasher.</p> 082 * <p>The byte array is split in 2 and the first 8 bytes of each half are interpreted as a big-endian long value. 083 * Excess bytes are ignored. 084 * If there are fewer than 16 bytes the following conversions are made. 085 *</p> 086 * <ol> 087 * <li>If there is an odd number of bytes the excess byte is assigned to the increment value</li> 088 * <li>The bytes alloted are read in big-endian order any byte not populated is set to zero.</li> 089 * </ol> 090 * <p> 091 * This ensures that small arrays generate the largest possible increment and initial values. 092 * </p> 093 * @param buffer the buffer to extract the longs from. 094 * @throws IllegalArgumentException is buffer length is zero. 095 */ 096 public EnhancedDoubleHasher(final byte[] buffer) { 097 if (buffer.length == 0) { 098 throw new IllegalArgumentException("buffer length must be greater than 0"); 099 } 100 // divide by 2 101 final int segment = buffer.length / 2; 102 this.initial = toLong(buffer, 0, segment); 103 this.increment = toLong(buffer, segment, buffer.length - segment); 104 } 105 106 /** 107 * Constructs the EnhancedDoubleHasher from 2 longs. The long values will be interpreted as unsigned values. 108 * @param initial The initial value for the hasher. 109 * @param increment The value to increment the hash by on each iteration. 110 */ 111 public EnhancedDoubleHasher(final long initial, final long increment) { 112 this.initial = initial; 113 this.increment = increment; 114 } 115 116 /** 117 * Gets the increment value for the hash calculation. 118 * @return the increment value for the hash calculation. 119 */ 120 long getIncrement() { 121 return increment; 122 } 123 124 /** 125 * Gets the initial value for the hash calculation. 126 * @return the initial value for the hash calculation. 127 */ 128 long getInitial() { 129 return initial; 130 } 131 132 @Override 133 public IndexExtractor indices(final Shape shape) { 134 Objects.requireNonNull(shape, "shape"); 135 136 return new IndexExtractor() { 137 138 @Override 139 public int[] asIndexArray() { 140 final int[] result = new int[shape.getNumberOfHashFunctions()]; 141 final int[] idx = new int[1]; 142 143 // This method needs to return duplicate indices 144 145 processIndices(i -> { 146 result[idx[0]++] = i; 147 return true; 148 }); 149 return result; 150 } 151 152 @Override 153 public boolean processIndices(final IntPredicate consumer) { 154 Objects.requireNonNull(consumer, "consumer"); 155 final int bits = shape.getNumberOfBits(); 156 // Enhanced double hashing: 157 // hash[i] = ( h1(x) + i*h2(x) + (i*i*i - i)/6 ) mod bits 158 // See: https://en.wikipedia.org/wiki/Double_hashing#Enhanced_double_hashing 159 // 160 // Essentially this is computing a wrapped modulus from a start point and an 161 // increment and an additional term as a tetrahedral number. 162 // You only need two modulus operations before the loop. Within the loop 163 // the modulus is handled using the sign bit to detect wrapping to ensure: 164 // 0 <= index < bits 165 // 0 <= inc < bits 166 // The final hash is: 167 // hash[i] = ( h1(x) - i*h2(x) - (i*i*i - i)/6 ) wrapped in [0, bits) 168 169 int index = BitMaps.mod(initial, bits); 170 if (!consumer.test(index)) { 171 return false; 172 } 173 int inc = BitMaps.mod(increment, bits); 174 175 final int k = shape.getNumberOfHashFunctions(); 176 177 if (k >= bits) { 178 // the tetraheadral incrementer. We need to ensure that this 179 // number does not exceed bits-1 or we may end up with an index > bits. 180 int tet = 1; 181 for (int i = 1; i < k; i++) { 182 // Update index and handle wrapping 183 index -= inc; 184 index = index < 0 ? index + bits : index; 185 if (!consumer.test(index)) { 186 return false; 187 } 188 189 // Incorporate the counter into the increment to create a 190 // tetrahedral number additional term, and handle wrapping. 191 inc -= tet; 192 inc = inc < 0 ? inc + bits : inc; 193 if (++tet == bits) { 194 tet = 0; 195 } 196 } 197 } else { 198 for (int i = 1; i < k; i++) { 199 // Update index and handle wrapping 200 index -= inc; 201 index = index < 0 ? index + bits : index; 202 if (!consumer.test(index)) { 203 return false; 204 } 205 206 // Incorporate the counter into the increment to create a 207 // tetrahedral number additional term, and handle wrapping. 208 inc -= i; 209 inc = inc < 0 ? inc + bits : inc; 210 } 211 212 } 213 return true; 214 } 215 }; 216 } 217}