Tidy-up initial files and add testing to optimise bst bloom filters

2015-05-31 23:31:31 +01:00 · 2015-05-31 23:31:31 +01:00 · 647a7f44dc
commit 647a7f44dc
parent b09246ef04
4 changed files with 375 additions and 19 deletions
--- a/src/leveled_iterator.erl
+++ b/src/leveled_iterator.erl
@ -1,19 +1,25 @@
 %% The module name has to match the file name (leveled_iterator.erl),
 %% otherwise the compiler rejects the file and the code loader cannot load it.
 -module(leveled_iterator).
 -export([termiterator/6]).
 -include_lib("eunit/include/eunit.hrl").
 %% We will have a sorted list of terms
-%% Some terms will be dummy terms which are pointers to more terms which can be found
+%% Some terms will be dummy terms which are pointers to more terms which can be 
-%% If a pointer is hit need to replenish the term list before proceeding
+%% found.  If a pointer is hit need to replenish the term list before 
 %% proceeding.
 %%
-%% Helper Functions should have free functions - FolderFun, CompareFun, PointerCheck}
+%% Helper Functions should have free functions - 
-%% FolderFun - function which takes the next item and the accumulator and returns an updated accunulator
+%% {FolderFun, CompareFun, PointerCheck}
-%% CompareFun - function which should be able to compare two keys (which are not pointers)
+%% FolderFun - function which takes the next item and the accumulator and 
 %% returns an updated accumulator
 %% CompareFun - function which should be able to compare two keys (which are 
 %% not pointers), and return a winning item (or combination of items)
 %% PointerCheck - function for differentiating between keys and pointer
-termiterator(HeadItem, [], Acc, HelperFuns, _StartKey, _EndKey) ->
+termiterator(HeadItem, [], Acc, HelperFuns, 
-	io:format("Reached empty list with head item of ~w~n", [HeadItem]),
+	_StartKey, _EndKey) ->
 	case HeadItem of 
 		null ->
 			Acc;
@ -21,7 +27,8 @@ termiterator(HeadItem, [], Acc, HelperFuns, _StartKey, _EndKey) ->
 			{FolderFun, _, _} = HelperFuns,
 			FolderFun(Acc, HeadItem)
 	end;
-termiterator(null, [NextItem|TailList], Acc, HelperFuns, StartKey, EndKey) ->
+termiterator(null, [NextItem|TailList], Acc, HelperFuns, 
 	StartKey, EndKey) ->
 	%% Check that the NextItem is not a pointer before promoting to HeadItem
 	%% Cannot now promote a HeadItem which is a pointer
 	{_, _, PointerCheck} = HelperFuns,
@ -29,30 +36,37 @@ termiterator(null, [NextItem|TailList], Acc, HelperFuns, StartKey, EndKey) ->
 		{true, Pointer} ->
 			NewSlice = getnextslice(Pointer, EndKey),
 			ExtendedList = lists:merge(NewSlice, TailList),
-			termiterator(null, ExtendedList, Acc, HelperFuns, StartKey, EndKey);
+			termiterator(null, ExtendedList, Acc, HelperFuns, 
 				StartKey, EndKey);
 		false ->
-			termiterator(NextItem, TailList, Acc, HelperFuns, StartKey, EndKey)
+			termiterator(NextItem, TailList, Acc, HelperFuns, 
 				StartKey, EndKey)
 	end;
-termiterator(HeadItem, [NextItem|TailList], Acc, HelperFuns, StartKey, EndKey) ->
+termiterator(HeadItem, [NextItem|TailList], Acc, HelperFuns, 
-	io:format("Checking head item of ~w~n", [HeadItem]),
+	StartKey, EndKey) ->
 	{FolderFun, CompareFun, PointerCheck} = HelperFuns,
-	%% HeadItem cannot be pointer, but NextItem might be, so check before comparison
+	%% HeadItem cannot be pointer, but NextItem might be, so check before 
 	%% comparison
 	case PointerCheck(NextItem) of 
 		{true, Pointer} ->
 			NewSlice = getnextslice(Pointer, EndKey),
 			ExtendedList = lists:merge(NewSlice, [NextItem|TailList]),
-			termiterator(null, ExtendedList, Acc, HelperFuns, StartKey, EndKey);
+			termiterator(null, ExtendedList, Acc, HelperFuns, 
 				StartKey, EndKey);
 		false ->
-			%% Compare to see if Head and Next match, or if Head is a winner to be added
+			%% Compare to see if Head and Next match, or if Head is a winner 
-			%% to accumulator
+			%% to be added to accumulator
 			case CompareFun(HeadItem, NextItem) of 
 				{match, StrongItem, _WeakItem} ->
-					%% Discard WeakItem
+					%% Discard WeakItem, Strong Item might be an aggregation of
-					termiterator(StrongItem, TailList, Acc, HelperFuns, StartKey, EndKey);
+					%% the items  
 					termiterator(StrongItem, TailList, Acc, HelperFuns, 
 						StartKey, EndKey);
 				{winner, HeadItem} ->
 					%% Add next item to accumulator, and proceed with next item
 					AccPlus = FolderFun(Acc, HeadItem),
-					termiterator(NextItem, TailList, AccPlus, HelperFuns, HeadItem, EndKey)
+					termiterator(NextItem, TailList, AccPlus, HelperFuns, 
 						HeadItem, EndKey)
 			end
 	end.
--- a/src/leveled_rice.erl
+++ b/src/leveled_rice.erl
@ -0,0 +1,283 @@
 %% Used for creating fixed-size self-regulating encoded bloom filters
 %%
 %% Normally a bloom filter, in order to achieve optimum size, increases the
 %% number of hashes as the desired false positive rate decreases.  There is 
 %% a processing overhead for checking this bloom, both because of the number
 %% of hash calculations required, and also because of the need to CRC check
 %% the bloom to ensure a false negative result is not returned due to 
 %% corruption.
 %%
 %% A more space efficient bloom can be achieved through the compression of 
 %% bloom filters with less hashes (and in an optimal case a single hash).  
 %% This can be achieved using rice encoding.
 %%
 %% Rice-encoding and single hash blooms are used here in order to provide an
 %% optimally space efficient solution, but also as the processing required to
 %% support uncompression can be concurrently performing a checksum role.
 %%
 %% For this to work, the bloom is divided into 64 parts and a 24-bit hash is 
 %% required.  Each hash is placed into one of 64 blooms based on the six least
 %% significant bits of the hash, and the most significant 18 bits are used 
 %% to indicate the bit to be added to the bloom.
 %%
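 %% The bloom is then created by calculating the differences between the ordered
 %% elements of the hash list and representing the difference using an exponent 
 %% and a 13-bit remainder, e.g. for a divisor of 8192 (2^13):
 %% 8000  ->   0  11111 01000000
 %% 10000 ->  10  00111 00010000
 %% 20000 -> 110  01110 00100000
 %% (The ?DIVISOR macro in this module is 8092, so the remainders actually 
 %% written differ slightly from these examples.)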
 %%
 %% Each bloom should have approximately 64 differences.  
 %%
 %% Fronting the bloom is a bloom index, formed first by 64 pairs of 3-byte 
 %% max hash, 2-byte length (bits) - with then each of the encoded bitstrings 
 %% appended.  The max hash is the total of all the differences (which should 
 %% be the highest hash in the bloom).
 %%
 %% To check a key against the bloom, hash it, take the six least significant 
 %% bits and read the start pointer, max hash and end pointer from the expected 
 %% positions in the bloom index.  Then roll through from the start pointer to 
 %% the end pointer, accumulating each difference. There is a possible match if 
 %% either the accumulator hits the expected hash or the max hash doesn't match 
 %% the final accumulator (to cover if the bloom has been corrupted by a bit 
 %% flip somewhere). A miss is more than twice as expensive (on average) as a
 %% potential match - but still only requires around 64 integer additions
 %% and the processing of <100 bytes of data.
 %%
 %% For 2048 keys, this takes up <4KB.  The false positive rate is 0.000122
 %% This compares favourably for the equivalent size optimal bloom which 
 %% would require 11 hashes and have a false positive rate of 0.000459.
 %% Checking with a positive match should take on average about 6 microseconds, 
 %% and a negative match should take around 11 microseconds.  
 %%
 %% See ../test/rice_test.erl for proving timings and fpr.
 -module(leveled_rice).
 -export([create_bloom/1, 
 	check_key/2,
 	check_keys/2]).
 -include_lib("eunit/include/eunit.hrl").
 %% Number of sub-blooms (slots); get_slothash/3 selects the slot with
 %% `Hash rem SlotCount`, and the index holds one entry per slot.
 -define(SLOT_COUNT, 64).
 %% Range handed to erlang:phash2/2, so hashes fall in 0..16777215 (2^24 - 1).
 -define(MAX_HASH, 16777216).
 %% Width in bits of the fixed-size remainder field written for every gap.
 -define(DIVISOR_BITS, 13).
 %% Gap divisor: the quotient is written in unary and the remainder in
 %% ?DIVISOR_BITS bits.
 %% NOTE(review): 2^13 is 8192, not 8092 - this looks like a typo.  Encoding
 %% and decoding still agree because every remainder is below 8192, but
 %% changing the value alters the serialised format - confirm before touching.
 -define(DIVISOR, 8092).
 %% Build the serialised bloom (a bitstring) for a list of keys, using the
 %% module's default slot count and hash range.
 create_bloom(KeyList) ->
 	create_bloom(KeyList, ?SLOT_COUNT, ?MAX_HASH).
 %% As create_bloom/1 but with an explicit slot count and hash range: every key
 %% is hashed into the list for its slot and the slots are then serialised.
 create_bloom(KeyList, SlotCount, MaxHash) ->
 	EmptySlots = array:new(SlotCount, [{default, []}]),
 	serialise_bloom(
 		create_hashlist(KeyList, EmptySlots, SlotCount, MaxHash)).
 %% Check a list of keys against the bloom.  Returns true only when every key
 %% is a possible member (so true for an empty list), and stops at the first
 %% key that check_key/2 reports as absent.
 check_keys(KeyList, BitStr) ->
 	lists:all(fun(Key) -> check_key(Key, BitStr) end, KeyList).
 %% Check a single key against the bloom using the module defaults.  Returns
 %% false when the key is definitely absent and true otherwise.
 check_key(Key, BitStr) ->
 	check_key(Key, BitStr, ?SLOT_COUNT, ?MAX_HASH, ?DIVISOR_BITS, ?DIVISOR).
 %% Locate the key's slot through the index, slice that slot's encoded gaps
 %% out of the bitstring and scan them for the key's hash.
 check_key(Key, BitStr, SlotCount, MaxHash, Factor, Divisor) ->
 	{Slot, Hash} = get_slothash(Key, MaxHash, SlotCount),
 	%% Each index entry is 40 bits (24-bit top hash + 16-bit length), so the
 	%% encoded slots start immediately after SlotCount entries.
 	IndexBits = 40 * SlotCount,
 	{Offset, SlotBits, TopHash} = find_position(Slot, BitStr, 0, IndexBits),
 	case BitStr of
 		<<_:Offset/bitstring, SlotBloom:SlotBits/bitstring, _/bitstring>> ->
 			check_hash(Hash, SlotBloom, Factor, Divisor, 0, TopHash);
 		_ ->
 			%% The index points outside the bitstring: answer "maybe present"
 			%% rather than risk a false negative.
 			io:format("Possible corruption of bloom index ~n"),
 			true
 	end.
 %% Walk the bloom index one 40-bit entry at a time until entry number Slot is
 %% reached.  Returns {StartPosition, Length, TopHash} for that slot, where
 %% StartPosition is the initial offset plus the lengths of all earlier slots.
 find_position(Slot, BloomIndex, Counter, StartPosition) ->
 	<<TopHash:24/integer, Length:16/integer, Rest/bitstring>> = BloomIndex,
 	if
 		Slot =:= Counter ->
 			{StartPosition, Length, TopHash};
 		true ->
 			find_position(Slot, Rest, Counter + 1, StartPosition + Length)
 	end.
 % Scan one slot's encoded gaps for HashToCheck.  Acc is the running total of
 % the gaps decoded so far.  Returns true as soon as the total equals the hash,
 % false when the slot is exhausted and the total equals the slot's top hash,
 % and true (after logging) when the data cannot be decoded or the total does
 % not agree with the top hash.
 check_hash(_, <<>>, _, _, Acc, MaxHash) when Acc =:= MaxHash ->
 	false;
 check_hash(_, <<>>, _, _, _, _) ->
 	crc_failure();
 check_hash(HashToCheck, BitStr, Factor, Divisor, Acc, TopHash) ->
 	case read_gap(BitStr, Factor) of
 		{ok, Exponent, Remainder, Tail} ->
 			case Acc + Divisor * Exponent + Remainder of
 				HashToCheck ->
 					true;
 				NextHash ->
 					check_hash(HashToCheck, Tail, Factor,
 						Divisor, NextHash, TopHash)
 			end;
 		error ->
 			crc_failure()
 	end.
 % Decode one gap: a unary exponent followed by a Factor-bit remainder.
 read_gap(BitStr, Factor) ->
 	case findexponent(BitStr) of
 		{ok, Exponent, AfterExponent} ->
 			case findremainder(AfterExponent, Factor) of
 				{ok, Remainder, Tail} ->
 					{ok, Exponent, Remainder, Tail};
 				error ->
 					error
 			end;
 		error ->
 			error
 	end.
 % Log the failed integrity check and answer "maybe present".
 crc_failure() ->
 	io:format("Failure of CRC check on bloom filter~n"),
 	true.
 %% Convert the key list into an array of sorted, de-duplicated hash lists,
 %% one list per slot.
 %%
 %% Hashes are prepended to their slot's list as the keys are consumed and each
 %% slot is sorted/de-duplicated once, when the key list is exhausted, instead
 %% of re-sorting the whole slot list on every insertion.
 create_hashlist([], HashLists, _, _) ->
 	array:map(fun(_Slot, Hashes) -> lists:usort(Hashes) end, HashLists);
 create_hashlist([HeadKey|Rest], HashLists, SlotCount, MaxHash) ->
 	{Slot, Hash} = get_slothash(HeadKey, MaxHash, SlotCount),
 	SlotHashes = [Hash|array:get(Slot, HashLists)],
 	create_hashlist(Rest,
 		array:set(Slot, SlotHashes, HashLists),
 		SlotCount, MaxHash).
 %% Convert an array of hash lists into a serialised bloom.
 serialise_bloom(HashLists) ->
 	serialise_bloom(HashLists, array:size(HashLists), 0, []).
 %% Encode slot number Counter and move on; once Counter reaches SlotCount the
 %% collected {BloomStr, TopHash} pairs (latest slot first) are finalised.
 serialise_bloom(_, SlotCount, SlotCount, Blooms) ->
 	finalise_bloom(Blooms);
 serialise_bloom(HashLists, SlotCount, Counter, Blooms) ->
 	SlotBloom = serialise_singlebloom(array:get(Counter, HashLists)),
 	serialise_bloom(HashLists, SlotCount, Counter + 1, [SlotBloom|Blooms]).
 %% Encode one slot's hash list with the module's divisor settings.
 serialise_singlebloom(HashList) ->
 	serialise_singlebloom(HashList, <<>>, 0, ?DIVISOR, ?DIVISOR_BITS).
 %% Append each hash as the gap from the previous hash (TopHash): a unary
 %% quotient from buildexponent/1 followed by a Factor-bit remainder.  Returns
 %% {BloomStr, TopHash} where TopHash is the last hash encoded (0 if none).
 serialise_singlebloom([], BloomStr, TopHash, _, _) ->
 	{BloomStr, TopHash};
 serialise_singlebloom([Hash|Rest], BloomStr, TopHash, Divisor, Factor) ->
 	Gap = Hash - TopHash,
 	Unary = buildexponent(Gap div Divisor),
 	Extended = <<BloomStr/bitstring, Unary/bitstring,
 		(Gap rem Divisor):Factor/integer>>,
 	serialise_singlebloom(Rest, Extended, Hash, Divisor, Factor).
 %% Assemble the final bitstring from a list of {BloomStr, TopHash} pairs.
 %% Every pair is prepended to both the index and the data, so a list given
 %% latest-slot-first comes out in slot order.  Each index entry is a 24-bit
 %% top hash followed by the 16-bit length (in bits) of that slot's data.
 finalise_bloom(Blooms) ->
 	finalise_bloom(Blooms, {<<>>, <<>>}).
 finalise_bloom([], {BloomIndex, BloomStr}) ->
 	<<BloomIndex/bitstring, BloomStr/bitstring>>;
 finalise_bloom([{SlotStr, TopHash}|Rest], {IndexAcc, DataAcc}) ->
 	SlotBits = bit_size(SlotStr),
 	Index = <<TopHash:24/integer, SlotBits:16/integer, IndexAcc/bitstring>>,
 	Data = <<SlotStr/bitstring, DataAcc/bitstring>>,
 	finalise_bloom(Rest, {Index, Data}).
 %% Unary-encode a non-negative quotient: Exponent one-bits followed by a
 %% single terminating zero-bit.
 buildexponent(Exponent) ->
 	Ones = (1 bsl Exponent) - 1,
 	<<Ones:Exponent/integer, 0:1>>.
 %% Read a unary-encoded quotient from the front of a bitstring.  Returns
 %% {ok, Count, Tail} where Count is the number of one-bits before the first
 %% zero-bit and Tail is what follows that zero-bit, or error when the
 %% bitstring runs out before a zero-bit is seen.
 findexponent(BitStr) ->
 	findexponent(BitStr, 0).
 findexponent(<<1:1, Tail/bitstring>>, Ones) ->
 	findexponent(Tail, Ones + 1);
 findexponent(<<0:1, Tail/bitstring>>, Ones) ->
 	{ok, Ones, Tail};
 findexponent(<<>>, _) ->
 	error.
 %% Read a Factor-bit unsigned remainder from the front of a bitstring.
 %% Returns {ok, Remainder, Tail}, or error when fewer than Factor bits remain.
 findremainder(BitStr, Factor)
 		when is_integer(Factor), Factor >= 0, bit_size(BitStr) >= Factor ->
 	<<Remainder:Factor/integer, Tail/bitstring>> = BitStr,
 	{ok, Remainder, Tail};
 findremainder(_, _) ->
 	error.
 %% Hash a key into the range 0..MaxHash-1 and split the result into
 %% {Slot, SlotHash}: the remainder by SlotCount selects the slot and the
 %% quotient is the value recorded within that slot.
 get_slothash(Key, MaxHash, SlotCount) ->
 	FullHash = erlang:phash2(Key, MaxHash),
 	Slot = FullHash rem SlotCount,
 	{Slot, FullHash div SlotCount}.
 %%%%%%%%%%%%%%%%
 % T E S T 
 %%%%%%%%%%%%%%%  
 %% Test helper: return a copy of Bloom with exactly one randomly chosen bit
 %% inverted.  Bloom must be a non-empty bitstring.
 corrupt_bloom(Bloom) ->
 	Length = bit_size(Bloom),
 	%% uniform/1 returns 1..Length; the number of bits to skip before the
 	%% flipped bit must be 0..Length-1, otherwise the first bit can never be
 	%% chosen and skipping Length bits leaves no bit to flip (badmatch).
 	Offset = rand:uniform(Length) - 1,
 	<<Before:Offset/bitstring, Bit:1/integer, After/bitstring>> = Bloom,
 	<<Before/bitstring, (1 - Bit):1/integer, After/bitstring>>.
 %% Every key used to build the bloom must be reported as present, and a key
 %% that was not added ("key5") is expected to be reported as absent.
 bloom_test() ->
 	Members = ["key1", "key2", "key3", "key4"],
 	Bloom = create_bloom(Members),
 	io:format("Bloom of ~w of length ~w ~n", [Bloom, bit_size(Bloom)]),
 	lists:foreach(
 		fun(Key) -> ?assertMatch(true, check_key(Key, Bloom)) end,
 		Members),
 	?assertMatch(false, check_key("key5", Bloom)).
 %% Six times over, flip one random bit of the freshly built bloom and confirm
 %% that none of the member keys is reported as absent.
 bloom_corruption_test() ->
 	KeyList = ["key1", "key2", "key3", "key4"],
 	Bloom = create_bloom(KeyList),
 	lists:foreach(
 		fun(_Attempt) ->
 			Corrupted = corrupt_bloom(Bloom),
 			?assertMatch(true, check_keys(KeyList, Corrupted))
 		end,
 		lists:seq(1, 6)).
--- a/test/lookup_test.beam
+++ b/test/lookup_test.beam
--- a/test/rice_test.erl
+++ b/test/rice_test.erl
@ -0,0 +1,59 @@
 %% Test performance and accuracy of rice-encoded bloom filters
 %%
 %% Calling check_negative(2048, 1000000) should return about 122 false 
 %% positives in around 11 seconds, with a size below 4KB
 %%
 %% The equivalent positive check is check_positive(2048, 488) and this
 %% should take around 6 seconds.
 %%
 %% So a bloom with 2048 members should support of the order of 100K checks
 %% per second on a modern CPU, whilst requiring 2 bytes per member.
 -module(rice_test).
 -export([check_positive/2, check_negative/2, calc_hash/2]).
 %% Build a bloom from KeyCount generated keys, then check the whole key list
 %% against it LoopCount times.  Every pass must find all keys.  Returns
 %% {ok, SizeOfBloomInBytes}.
 check_positive(KeyCount, LoopCount) ->
 	Members = produce_keylist(KeyCount),
 	check_positive(Members, leveled_rice:create_bloom(Members), LoopCount).
 check_positive(_, Bloom, 0) ->
 	{ok, byte_size(Bloom)};
 check_positive(KeyList, Bloom, LoopCount) ->
 	%% Crash on purpose if any member key is reported as absent
 	true = leveled_rice:check_keys(KeyList, Bloom),
 	check_positive(KeyList, Bloom, LoopCount - 1).
 %% Generate KeyCount string keys sharing a "PositiveKey-<random integer>"
 %% prefix, suffixed with 1..KeyCount in ascending order.
 produce_keylist(KeyCount) ->
 	KeyPrefix = lists:concat(["PositiveKey-", random:uniform(KeyCount)]),
 	produce_keylist(KeyCount, [], KeyPrefix).
 %% Put the keys KeyPrefix ++ 1 .. KeyPrefix ++ KeyCount in front of KeyList.
 produce_keylist(KeyCount, KeyList, KeyPrefix) ->
 	Generated = [lists:concat([KeyPrefix, N]) || N <- lists:seq(1, KeyCount)],
 	Generated ++ KeyList.
 %% Build a bloom from KeyCount generated keys, then probe it with CheckCount
 %% generated "NegativeKey-" keys, counting how many are reported as present.
 %% Returns {SizeOfBloomInBytes, FalsePositiveCount}.
 check_negative(KeyCount, CheckCount) ->
 	Bloom = leveled_rice:create_bloom(produce_keylist(KeyCount)),
 	check_negative(Bloom, CheckCount, 0).
 check_negative(Bloom, 0, FalsePos) ->
 	{byte_size(Bloom), FalsePos};
 check_negative(Bloom, CheckCount, FalsePos) ->
 	Probe = lists:concat(
 		["NegativeKey-", CheckCount, random:uniform(CheckCount)]),
 	Hit = case leveled_rice:check_key(Probe, Bloom) of
 		true -> 1;
 		false -> 0
 	end,
 	check_negative(Bloom, CheckCount - 1, FalsePos + Hit).
 %% Compute (and discard) Count hashes of keys derived from Key, counting down
 %% from Count to 1.  Always returns ok.
 calc_hash(Key, Count) ->
 	lists:foreach(
 		fun(N) -> erlang:phash2(lists:concat([Key, N, "sometxt"])) end,
 		lists:seq(Count, 1, -1)),
 	ok.