From 647a7f44dc6dd036ef5417dd15f1d6789c6bd368 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Sun, 31 May 2015 23:31:31 +0100 Subject: [PATCH] Tidy-up initial files and add testing to optimise bst bloom filters --- ...eled_internal.erl => leveled_iterator.erl} | 52 ++-- src/leveled_rice.erl | 283 ++++++++++++++++++ test/lookup_test.beam | Bin 4096 -> 0 bytes test/rice_test.erl | 59 ++++ 4 files changed, 375 insertions(+), 19 deletions(-) rename src/{leveled_internal.erl => leveled_iterator.erl} (73%) create mode 100644 src/leveled_rice.erl delete mode 100644 test/lookup_test.beam create mode 100644 test/rice_test.erl diff --git a/src/leveled_internal.erl b/src/leveled_iterator.erl similarity index 73% rename from src/leveled_internal.erl rename to src/leveled_iterator.erl index 874fe61..f9b97c7 100644 --- a/src/leveled_internal.erl +++ b/src/leveled_iterator.erl @@ -1,19 +1,25 @@ -module(leveled_internal). + -export([termiterator/6]). + -include_lib("eunit/include/eunit.hrl"). %% We will have a sorted list of terms -%% Some terms will be dummy terms which are pointers to more terms which can be found -%% If a pointer is hit need to replenish the term list before proceeding +%% Some terms will be dummy terms which are pointers to more terms which can be +%% found. If a pointer is hit need to replenish the term list before +%% proceeding. 
%% -%% Helper Functions should have free functions - FolderFun, CompareFun, PointerCheck} -%% FolderFun - function which takes the next item and the accumulator and returns an updated accunulator -%% CompareFun - function which should be able to compare two keys (which are not pointers) +%% Helper Functions should have free functions - +%% {FolderFun, CompareFun, PointerCheck} +%% FolderFun - function which takes the next item and the accumulator and +%% returns an updated accumulator +%% CompareFun - function which should be able to compare two keys (which are +%% not pointers), and return a winning item (or combination of items) %% PointerCheck - function for differentiating between keys and pointer -termiterator(HeadItem, [], Acc, HelperFuns, _StartKey, _EndKey) -> - io:format("Reached empty list with head item of ~w~n", [HeadItem]), +termiterator(HeadItem, [], Acc, HelperFuns, + _StartKey, _EndKey) -> case HeadItem of null -> Acc; @@ -21,7 +27,8 @@ termiterator(HeadItem, [], Acc, HelperFuns, _StartKey, _EndKey) -> {FolderFun, _, _} = HelperFuns, FolderFun(Acc, HeadItem) end; -termiterator(null, [NextItem|TailList], Acc, HelperFuns, StartKey, EndKey) -> +termiterator(null, [NextItem|TailList], Acc, HelperFuns, + StartKey, EndKey) -> %% Check that the NextItem is not a pointer before promoting to HeadItem %% Cannot now promote a HeadItem which is a pointer {_, _, PointerCheck} = HelperFuns, @@ -29,30 +36,37 @@ termiterator(null, [NextItem|TailList], Acc, HelperFuns, StartKey, EndKey) -> {true, Pointer} -> NewSlice = getnextslice(Pointer, EndKey), ExtendedList = lists:merge(NewSlice, TailList), - termiterator(null, ExtendedList, Acc, HelperFuns, StartKey, EndKey); + termiterator(null, ExtendedList, Acc, HelperFuns, + StartKey, EndKey); false -> - termiterator(NextItem, TailList, Acc, HelperFuns, StartKey, EndKey) + termiterator(NextItem, TailList, Acc, HelperFuns, + StartKey, EndKey) end; -termiterator(HeadItem, [NextItem|TailList], Acc, HelperFuns, StartKey, 
EndKey) -> - io:format("Checking head item of ~w~n", [HeadItem]), +termiterator(HeadItem, [NextItem|TailList], Acc, HelperFuns, + StartKey, EndKey) -> {FolderFun, CompareFun, PointerCheck} = HelperFuns, - %% HeadItem cannot be pointer, but NextItem might be, so check before comparison + %% HeadItem cannot be pointer, but NextItem might be, so check before + %% comparison case PointerCheck(NextItem) of {true, Pointer} -> NewSlice = getnextslice(Pointer, EndKey), ExtendedList = lists:merge(NewSlice, [NextItem|TailList]), - termiterator(null, ExtendedList, Acc, HelperFuns, StartKey, EndKey); + termiterator(null, ExtendedList, Acc, HelperFuns, + StartKey, EndKey); false -> - %% Compare to see if Head and Next match, or if Head is a winner to be added - %% to accumulator + %% Compare to see if Head and Next match, or if Head is a winner + %% to be added to accumulator case CompareFun(HeadItem, NextItem) of {match, StrongItem, _WeakItem} -> - %% Discard WeakItem - termiterator(StrongItem, TailList, Acc, HelperFuns, StartKey, EndKey); + %% Discard WeakItem, Strong Item might be an aggregation of + %% the items + termiterator(StrongItem, TailList, Acc, HelperFuns, + StartKey, EndKey); {winner, HeadItem} -> %% Add next item to accumulator, and proceed with next item AccPlus = FolderFun(Acc, HeadItem), - termiterator(NextItem, TailList, AccPlus, HelperFuns, HeadItem, EndKey) + termiterator(NextItem, TailList, AccPlus, HelperFuns, + HeadItem, EndKey) end end. diff --git a/src/leveled_rice.erl b/src/leveled_rice.erl new file mode 100644 index 0000000..f432944 --- /dev/null +++ b/src/leveled_rice.erl @@ -0,0 +1,283 @@ +%% Used for creating fixed-size self-regulating encoded bloom filters +%% +%% Normally a bloom filter in order to achieve optimium size increases the +%% number of hashes as the desired false positive rate increases. 
There is +%% a processing overhead for checking this bloom, both because of the number +%% of hash calculations required, and also because of the need to CRC check +%% the bloom to ensure a false negative result is not returned due to +%% corruption. +%% +%% A more space efficient bloom can be achieved through the compression of +%% bloom filters with fewer hashes (and in an optimal case a single hash). +%% This can be achieved using rice encoding. +%% +%% Rice-encoding and single hash blooms are used here in order to provide an +%% optimally space efficient solution, but also as the processing required to +%% support uncompression can be concurrently performing a checksum role. +%% +%% For this to work, the bloom is divided into 64 parts and a 32-bit hash is +%% required. Each hash is placed into one of 64 blooms based on the six least +%% significant bits of the hash, and the most significant 26-bits are used +%% to indicate the bit to be added to the bloom. +%% +%% The bloom is then created by calculating the differences between the ordered +%% elements of the hash list and representing the difference using an exponent +%% and a 13-bit remainder i.e. +%% 8000 -> 0 11111 01000000 +%% 10000 -> 10 00000 00010000 +%% 20000 -> 110 01110 00100000 +%% +%% Each bloom should have approximately 64 differences. +%% +%% Fronting the bloom is a bloom index, formed first by 64 pairs of 3-byte +%% max hash, 2-byte length (bits) - with then each of the encoded bitstrings +%% appended. The max hash is the total of all the differences (which should +%% be the highest hash in the bloom). +%% +%% To check a key against the bloom, hash it, take the six least significant +%% bits and read the start pointer, max hash end pointer from the expected +%% positions in the bloom index. Then roll through from the start pointer to +%% the end pointer, accumulating each difference.
There is a possible match if +%% either the accumulator hits the expected hash or the max hash doesn't match +%% the final accumulator (to cover if the bloom has been corrupted by a bit +%% flip somwhere). A miss is more than twice as expensive (on average) than a +%% potential match - but still only requires around 64 integer additions +%% and the processing of <100 bytes of data. +%% +%% For 2048 keys, this takes up <4KB. The false positive rate is 0.000122 +%% This compares favourably for the equivalent size optimal bloom which +%% would require 11 hashes and have a false positive rate of 0.000459. +%% Checking with a positive match should take on average about 6 microseconds, +%% and a negative match should take around 11 microseconds. +%% +%% See ../test/rice_test.erl for proving timings and fpr. + + + +-module(leveled_rice). + +-export([create_bloom/1, + check_key/2, + check_keys/2]). + +-include_lib("eunit/include/eunit.hrl"). + +-define(SLOT_COUNT, 64). +-define(MAX_HASH, 16777216). +-define(DIVISOR_BITS, 13). +-define(DIVISOR, 8092). + +%% Create a bitstring representing the bloom filter from a key list + +create_bloom(KeyList) -> + create_bloom(KeyList, ?SLOT_COUNT, ?MAX_HASH). + +create_bloom(KeyList, SlotCount, MaxHash) -> + HashLists = array:new(SlotCount, [{default, []}]), + OrdHashLists = create_hashlist(KeyList, HashLists, SlotCount, MaxHash), + serialise_bloom(OrdHashLists). + + +%% Checking for a key + +check_keys([], _) -> + true; +check_keys([Key|Rest], BitStr) -> + case check_key(Key, BitStr) of + false -> + false; + true -> + check_keys(Rest, BitStr) + end. + +check_key(Key, BitStr) -> + check_key(Key, BitStr, ?SLOT_COUNT, ?MAX_HASH, ?DIVISOR_BITS, ?DIVISOR). 
+ +check_key(Key, BitStr, SlotCount, MaxHash, Factor, Divisor) -> + {Slot, Hash} = get_slothash(Key, MaxHash, SlotCount), + {StartPos, Length, TopHash} = find_position(Slot, BitStr, 0, 40 * SlotCount), + case BitStr of + <<_:StartPos/bitstring, Bloom:Length/bitstring, _/bitstring>> -> + check_hash(Hash, Bloom, Factor, Divisor, 0, TopHash); + _ -> + io:format("Possible corruption of bloom index ~n"), + true + end. + +find_position(Slot, BloomIndex, Counter, StartPosition) -> + <> = BloomIndex, + case Slot of + Counter -> + {StartPosition, Length, TopHash}; + _ -> + find_position(Slot, Rest, Counter + 1, StartPosition + Length) + end. + + +% Checking for a hash within a bloom + +check_hash(_, <<>>, _, _, Acc, MaxHash) -> + case Acc of + MaxHash -> + false; + _ -> + io:format("Failure of CRC check on bloom filter~n"), + true + end; +check_hash(HashToCheck, BitStr, Factor, Divisor, Acc, TopHash) -> + case findexponent(BitStr) of + {ok, Exponent, BitStrTail} -> + case findremainder(BitStrTail, Factor) of + {ok, Remainder, BitStrTail2} -> + NextHash = Acc + Divisor * Exponent + Remainder, + case NextHash of + HashToCheck -> + true; + _ -> + check_hash(HashToCheck, BitStrTail2, Factor, + Divisor, NextHash, TopHash) + end; + error -> + io:format("Failure of CRC check on bloom filter~n"), + true + end; + error -> + io:format("Failure of CRC check on bloom filter~n"), + true + end. + +%% Convert the key list into an array of sorted hash lists + +create_hashlist([], HashLists, _, _) -> + HashLists; +create_hashlist([HeadKey|Rest], HashLists, SlotCount, MaxHash) -> + {Slot, Hash} = get_slothash(HeadKey, MaxHash, SlotCount), + HashList = array:get(Slot, HashLists), + create_hashlist(Rest, + array:set(Slot, lists:usort([Hash|HashList]), HashLists), + SlotCount, MaxHash). + +%% Convert an array of hash lists into an serialsed bloom + +serialise_bloom(HashLists) -> + SlotCount = array:size(HashLists), + serialise_bloom(HashLists, SlotCount, 0, []). 
+ +serialise_bloom(HashLists, SlotCount, Counter, Blooms) -> + case Counter of + SlotCount -> + finalise_bloom(Blooms); + _ -> + Bloom = serialise_singlebloom(array:get(Counter, HashLists)), + serialise_bloom(HashLists, SlotCount, Counter + 1, [Bloom|Blooms]) + end. + +serialise_singlebloom(HashList) -> + serialise_singlebloom(HashList, <<>>, 0, ?DIVISOR, ?DIVISOR_BITS). + +serialise_singlebloom([], BloomStr, TopHash, _, _) -> + % io:format("Single bloom created with bloom of ~w and top hash of ~w~n", [BloomStr, TopHash]), + {BloomStr, TopHash}; +serialise_singlebloom([Hash|Rest], BloomStr, TopHash, Divisor, Factor) -> + HashGap = Hash - TopHash, + Exponent = buildexponent(HashGap div Divisor), + Remainder = HashGap rem Divisor, + NewBloomStr = <>, + serialise_singlebloom(Rest, NewBloomStr, Hash, Divisor, Factor). + + +finalise_bloom(Blooms) -> + finalise_bloom(Blooms, {<<>>, <<>>}). + +finalise_bloom([], BloomAcc) -> + {BloomIndex, BloomStr} = BloomAcc, + <>; +finalise_bloom([Bloom|Rest], BloomAcc) -> + {BloomStr, TopHash} = Bloom, + {BloomIndexAcc, BloomStrAcc} = BloomAcc, + Length = bit_size(BloomStr), + UpdIdx = <>, + % io:format("Adding bloom string of ~w to bloom~n", [BloomStr]), + UpdBloomStr = <>, + finalise_bloom(Rest, {UpdIdx, UpdBloomStr}). + + + + +buildexponent(Exponent) -> + buildexponent(Exponent, <<0:1>>). + +buildexponent(0, OutputBits) -> + OutputBits; +buildexponent(Exponent, OutputBits) -> + buildexponent(Exponent - 1, <<1:1, OutputBits/bitstring>>). + + +findexponent(BitStr) -> + findexponent(BitStr, 0). + +findexponent(<<>>, _) -> + error; +findexponent(<>, Acc) -> + case H of + 1 -> findexponent(T, Acc + 1); + 0 -> {ok, Acc, T} + end. + + +findremainder(BitStr, Factor) -> + case BitStr of + <> -> + {ok, Remainder, BitStrTail}; + _ -> + error + end. + + +get_slothash(Key, MaxHash, SlotCount) -> + Hash = erlang:phash2(Key, MaxHash), + {Hash rem SlotCount, Hash div SlotCount}. 
+ + +%%%%%%%%%%%%%%%% +% T E S T +%%%%%%%%%%%%%%% + +corrupt_bloom(Bloom) -> + Length = bit_size(Bloom), + Random = random:uniform(Length), + <> = Bloom, + case Bit of + 1 -> + <>; + 0 -> + <> + end. + +bloom_test() -> + KeyList = ["key1", "key2", "key3", "key4"], + Bloom = create_bloom(KeyList), + io:format("Bloom of ~w of length ~w ~n", [Bloom, bit_size(Bloom)]), + ?assertMatch(true, check_key("key1", Bloom)), + ?assertMatch(true, check_key("key2", Bloom)), + ?assertMatch(true, check_key("key3", Bloom)), + ?assertMatch(true, check_key("key4", Bloom)), + ?assertMatch(false, check_key("key5", Bloom)). + +bloom_corruption_test() -> + KeyList = ["key1", "key2", "key3", "key4"], + Bloom = create_bloom(KeyList), + Bloom1 = corrupt_bloom(Bloom), + ?assertMatch(true, check_keys(KeyList, Bloom1)), + Bloom2 = corrupt_bloom(Bloom), + ?assertMatch(true, check_keys(KeyList, Bloom2)), + Bloom3 = corrupt_bloom(Bloom), + ?assertMatch(true, check_keys(KeyList, Bloom3)), + Bloom4 = corrupt_bloom(Bloom), + ?assertMatch(true, check_keys(KeyList, Bloom4)), + Bloom5 = corrupt_bloom(Bloom), + ?assertMatch(true, check_keys(KeyList, Bloom5)), + Bloom6 = corrupt_bloom(Bloom), + ?assertMatch(true, check_keys(KeyList, Bloom6)). + + diff --git a/test/lookup_test.beam b/test/lookup_test.beam deleted file mode 100644 index 3c8d76474f8d8ff741da4eecc8fc903230a9a869..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4096 zcmZ`+du$wM5#ROR;~U49?A1zw(>CyZHQI zrMaD*Z)U!k-^}cHd*syFAxYZu{-Me7$HpByCrR>GB}pF~vbYei z3X947yf2l^XZ$P}l6!rrt9Bv853`Ovo6TpetHF7DHj_;|KK@bi*3*HUm9q=iBJ;^Y zDmib>rtQT=EA3?Me0|YLI@zL=O&0?Nc(rrEQa(Fp7jphZh>L-=olhqn-y9s**_ZuA z$1YfMzuzhp>_PzhSWZ!ZyciWQfpW}&Y`$m}9D>Gt$|+cum`}~}z38`cOU^a_mE>Z{ z3Y3;INyqZ#?Yy-as>wnjdCi{Vu!9+EE?HW1{GLpiw;ZKtIqHg1*=@^;fURbq&N>U} zWO~60B$t+~e8yMIK4bZFR$<<1@P22#jfuF! 
z1geH>)J4wfvbq1fHm*g7UGuu%R2Aw_+|%H6C90UiT0ozaPmk=?G&$iW>LSmo@{+sc zxuOpgO}^pYsEfR;$~PNS?B{cD1|`GvMNOaPH(pcaTkb6^8}jY=>zce-mE7Aw$%LNy zmI78its3ePU7PYvDW;}GR879qU>aecsf?J|U1>D6n0qIvCR9_MRO+@5;Sh&7YW{K3~R^)X&Ca|_w1`ia;amiY8VzWD$vnfH%V>npf}wzX4!EYcLE`i7{Rp zfNOL`%UZx#0!&&N6ZAw3+qVVvI}Dqz3Y*yG>1>Bir6|gj&r{tI)bF~Y{kmVel?L1PtXvmD|%{c9BM{3 zt#LT0@7$;@dSQct_O7#Ob452gLu_+J9H2yj&-N;EeVCmJ_VqDLM300*^#yeid5C8_ z^mn+TpG^@#dpxwWg6BXDo&y`=xpQ*@jWXj6)`+Q1xMFY}nveiZ!)uh7D~5b($(+$# zu?JCT<=zQHjz@3kCLVUruxB_@HQZY^^xE-}6NcxYFpOb2Vqba2Bm?OInRb6uGr zL7B#@XdkMU>F^5FTqZnv!?04O3D0n{YIvlkOj9eiUYU-1Q?eUn>aHzQcV$Y}mT8)& zWNn>}mGPZkg>M%S*&T=1^60Fs({b2@*E%{;c65B5qi$Cmj(tx zadM-1I_VXPkA+hkEuZ3Lue!tDsnqDb@P_$?*Qd)O?uygA4N~Zfhx#i;JyTQEGaDAQ ze>4;al5s_ESi<=hZVbn86lHJ})z0JDb%llmg+5oU*Gx^l&ac$V6?nK|W)Q0qM&ch? zS>iUp$SlJZ(0Ws{7|_K;-Mi-syR`eE+2346za1TMLQ50 zl*Is4)(xSjcCkEzvKYj1k3-899}B9YDIPMEX&iR5{O19qJ`)YkAQcv1&iS3?rRIu6 zIXK_E_^DZH{fYBV1$O1b53D~|J+UImT;Gkg53sE zKt51ig%8y5!3tlG`z8=ytl!{qWqvbw8MFn&{~wtTgCZ54Z8Z@4XMQV)eK0kFSl$L= z|5cvjs`A?_d^_$tL3}Y^)nUE^#QvCPedfDBRiDi70D|{4pCYE#DJ1RV5)KcLY zQ>yTEq?G5qSeI=K5XUhJ@-F4z&;NNM>#)uUD1P;obRv;RB@*`*em$3X*2xiH@h;#`>Lp5R=VXa9^B=fQpE2QkLgUSxUQxZ@OX?|yC(zw$y|MpCX6 z^WH;Vl}caQ_l?ia{Hrgzvy z@4uu`Bel~`>Yz^QqFuC`dMQqQWY7RTN(bm5nRJ*YXp)Z7G##VkG(+dD P-KW3PKj`1|ALQ{rJu=`t diff --git a/test/rice_test.erl b/test/rice_test.erl new file mode 100644 index 0000000..1bbb43f --- /dev/null +++ b/test/rice_test.erl @@ -0,0 +1,59 @@ +%% Test performance and accuracy of rice-encoded bloom filters +%% +%% Calling check_negative(2048, 1000000) should return about 122 false +%% positives in around 11 seconds, with a size below 4KB +%% +%% The equivalent positive check is check_positive(2048, 488) and this +%% should take around 6 seconds. +%% +%% So a blooom with 2048 members should support o(100K) checks per second +%% on a modern CPU, whilst requiring 2 bytes per member. + +-module(rice_test). + +-export([check_positive/2, check_negative/2, calc_hash/2]). 
+ + + +check_positive(KeyCount, LoopCount) -> + KeyList = produce_keylist(KeyCount), + Bloom = leveled_rice:create_bloom(KeyList), + check_positive(KeyList, Bloom, LoopCount). + +check_positive(_, Bloom, 0) -> + {ok, byte_size(Bloom)}; +check_positive(KeyList, Bloom, LoopCount) -> + true = leveled_rice:check_keys(KeyList, Bloom), + check_positive(KeyList, Bloom, LoopCount - 1). + + +produce_keylist(KeyCount) -> + KeyPrefix = lists:concat(["PositiveKey-", random:uniform(KeyCount)]), + produce_keylist(KeyCount, [], KeyPrefix). + +produce_keylist(0, KeyList, _) -> + KeyList; +produce_keylist(KeyCount, KeyList, KeyPrefix) -> + Key = lists:concat([KeyPrefix, KeyCount]), + produce_keylist(KeyCount - 1, [Key|KeyList], KeyPrefix). + + +check_negative(KeyCount, CheckCount) -> + KeyList = produce_keylist(KeyCount), + Bloom = leveled_rice:create_bloom(KeyList), + check_negative(Bloom, CheckCount, 0). + +check_negative(Bloom, 0, FalsePos) -> + {byte_size(Bloom), FalsePos}; +check_negative(Bloom, CheckCount, FalsePos) -> + Key = lists:concat(["NegativeKey-", CheckCount, random:uniform(CheckCount)]), + case leveled_rice:check_key(Key, Bloom) of + true -> check_negative(Bloom, CheckCount - 1, FalsePos + 1); + false -> check_negative(Bloom, CheckCount - 1, FalsePos) + end. + +calc_hash(_, 0) -> + ok; +calc_hash(Key, Count) -> + erlang:phash2(lists:concat([Key, Count, "sometxt"])), + calc_hash(Key, Count -1).