From 2d981cb2e7b3108b5f08783735d994ff87927c5a Mon Sep 17 00:00:00 2001 From: martinsumner Date: Wed, 12 Oct 2016 18:10:47 +0100 Subject: [PATCH] FAdvise Add fadvise magic to SFT files. Also delete unnecessary rice module --- src/leveled_rice.erl | 283 ------------------------------------------- src/leveled_sft.erl | 28 ++--- 2 files changed, 7 insertions(+), 304 deletions(-) delete mode 100644 src/leveled_rice.erl diff --git a/src/leveled_rice.erl b/src/leveled_rice.erl deleted file mode 100644 index f432944..0000000 --- a/src/leveled_rice.erl +++ /dev/null @@ -1,283 +0,0 @@ -%% Used for creating fixed-size self-regulating encoded bloom filters -%% -%% Normally a bloom filter in order to achieve optimium size increases the -%% number of hashes as the desired false positive rate increases. There is -%% a processing overhead for checking this bloom, both because of the number -%% of hash calculations required, and also because of the need to CRC check -%% the bloom to ensure a false negative result is not returned due to -%% corruption. -%% -%% A more space efficient bloom can be achieved through the compression of -%% bloom filters with less hashes (and in an optimal case a single hash). -%% This can be achieved using rice encoding. -%% -%% Rice-encoding and single hash blooms are used here in order to provide an -%% optimally space efficient solution, but also as the processing required to -%% support uncompression can be concurrently performing a checksum role. -%% -%% For this to work, the bloom is divided into 64 parts and a 32-bit hash is -%% required. Each hash is placed into one of 64 blooms based on the six least -%% significant bits of the hash, and the fmost significant 26-bits are used -%% to indicate the bit to be added to the bloom. -%% -%% The bloom is then created by calculating the differences between the ordered -%% elements of the hash list and representing the difference using an exponent -%% and a 13-bit remainder i.e. 
-%% 8000 -> 0 11111 01000000 -%% 10000 -> 10 00000 00010000 -%% 20000 -> 110 01110 00100000 -%% -%% Each bloom should have approximately 64 differences. -%% -%% Fronting the bloom is a bloom index, formed first by 16 pairs of 3-byte -%% max hash, 2-byte length (bits) - with then each of the encoded bitstrings -%% appended. The max hash is the total of all the differences (which should -%% be the highest hash in the bloom). -%% -%% To check a key against the bloom, hash it, take the four least signifcant -%% bits and read the start pointer, max hash end pointer from the expected -%% positions in the bloom index. Then roll through from the start pointer to -%% the end pointer, accumulating each difference. There is a possible match if -%% either the accumulator hits the expected hash or the max hash doesn't match -%% the final accumulator (to cover if the bloom has been corrupted by a bit -%% flip somwhere). A miss is more than twice as expensive (on average) than a -%% potential match - but still only requires around 64 integer additions -%% and the processing of <100 bytes of data. -%% -%% For 2048 keys, this takes up <4KB. The false positive rate is 0.000122 -%% This compares favourably for the equivalent size optimal bloom which -%% would require 11 hashes and have a false positive rate of 0.000459. -%% Checking with a positive match should take on average about 6 microseconds, -%% and a negative match should take around 11 microseconds. -%% -%% See ../test/rice_test.erl for proving timings and fpr. - - - --module(leveled_rice). - --export([create_bloom/1, - check_key/2, - check_keys/2]). - --include_lib("eunit/include/eunit.hrl"). - --define(SLOT_COUNT, 64). --define(MAX_HASH, 16777216). --define(DIVISOR_BITS, 13). --define(DIVISOR, 8092). - -%% Create a bitstring representing the bloom filter from a key list - -create_bloom(KeyList) -> - create_bloom(KeyList, ?SLOT_COUNT, ?MAX_HASH). 
- -create_bloom(KeyList, SlotCount, MaxHash) -> - HashLists = array:new(SlotCount, [{default, []}]), - OrdHashLists = create_hashlist(KeyList, HashLists, SlotCount, MaxHash), - serialise_bloom(OrdHashLists). - - -%% Checking for a key - -check_keys([], _) -> - true; -check_keys([Key|Rest], BitStr) -> - case check_key(Key, BitStr) of - false -> - false; - true -> - check_keys(Rest, BitStr) - end. - -check_key(Key, BitStr) -> - check_key(Key, BitStr, ?SLOT_COUNT, ?MAX_HASH, ?DIVISOR_BITS, ?DIVISOR). - -check_key(Key, BitStr, SlotCount, MaxHash, Factor, Divisor) -> - {Slot, Hash} = get_slothash(Key, MaxHash, SlotCount), - {StartPos, Length, TopHash} = find_position(Slot, BitStr, 0, 40 * SlotCount), - case BitStr of - <<_:StartPos/bitstring, Bloom:Length/bitstring, _/bitstring>> -> - check_hash(Hash, Bloom, Factor, Divisor, 0, TopHash); - _ -> - io:format("Possible corruption of bloom index ~n"), - true - end. - -find_position(Slot, BloomIndex, Counter, StartPosition) -> - <> = BloomIndex, - case Slot of - Counter -> - {StartPosition, Length, TopHash}; - _ -> - find_position(Slot, Rest, Counter + 1, StartPosition + Length) - end. - - -% Checking for a hash within a bloom - -check_hash(_, <<>>, _, _, Acc, MaxHash) -> - case Acc of - MaxHash -> - false; - _ -> - io:format("Failure of CRC check on bloom filter~n"), - true - end; -check_hash(HashToCheck, BitStr, Factor, Divisor, Acc, TopHash) -> - case findexponent(BitStr) of - {ok, Exponent, BitStrTail} -> - case findremainder(BitStrTail, Factor) of - {ok, Remainder, BitStrTail2} -> - NextHash = Acc + Divisor * Exponent + Remainder, - case NextHash of - HashToCheck -> - true; - _ -> - check_hash(HashToCheck, BitStrTail2, Factor, - Divisor, NextHash, TopHash) - end; - error -> - io:format("Failure of CRC check on bloom filter~n"), - true - end; - error -> - io:format("Failure of CRC check on bloom filter~n"), - true - end. 
- -%% Convert the key list into an array of sorted hash lists - -create_hashlist([], HashLists, _, _) -> - HashLists; -create_hashlist([HeadKey|Rest], HashLists, SlotCount, MaxHash) -> - {Slot, Hash} = get_slothash(HeadKey, MaxHash, SlotCount), - HashList = array:get(Slot, HashLists), - create_hashlist(Rest, - array:set(Slot, lists:usort([Hash|HashList]), HashLists), - SlotCount, MaxHash). - -%% Convert an array of hash lists into an serialsed bloom - -serialise_bloom(HashLists) -> - SlotCount = array:size(HashLists), - serialise_bloom(HashLists, SlotCount, 0, []). - -serialise_bloom(HashLists, SlotCount, Counter, Blooms) -> - case Counter of - SlotCount -> - finalise_bloom(Blooms); - _ -> - Bloom = serialise_singlebloom(array:get(Counter, HashLists)), - serialise_bloom(HashLists, SlotCount, Counter + 1, [Bloom|Blooms]) - end. - -serialise_singlebloom(HashList) -> - serialise_singlebloom(HashList, <<>>, 0, ?DIVISOR, ?DIVISOR_BITS). - -serialise_singlebloom([], BloomStr, TopHash, _, _) -> - % io:format("Single bloom created with bloom of ~w and top hash of ~w~n", [BloomStr, TopHash]), - {BloomStr, TopHash}; -serialise_singlebloom([Hash|Rest], BloomStr, TopHash, Divisor, Factor) -> - HashGap = Hash - TopHash, - Exponent = buildexponent(HashGap div Divisor), - Remainder = HashGap rem Divisor, - NewBloomStr = <>, - serialise_singlebloom(Rest, NewBloomStr, Hash, Divisor, Factor). - - -finalise_bloom(Blooms) -> - finalise_bloom(Blooms, {<<>>, <<>>}). - -finalise_bloom([], BloomAcc) -> - {BloomIndex, BloomStr} = BloomAcc, - <>; -finalise_bloom([Bloom|Rest], BloomAcc) -> - {BloomStr, TopHash} = Bloom, - {BloomIndexAcc, BloomStrAcc} = BloomAcc, - Length = bit_size(BloomStr), - UpdIdx = <>, - % io:format("Adding bloom string of ~w to bloom~n", [BloomStr]), - UpdBloomStr = <>, - finalise_bloom(Rest, {UpdIdx, UpdBloomStr}). - - - - -buildexponent(Exponent) -> - buildexponent(Exponent, <<0:1>>). 
- -buildexponent(0, OutputBits) -> - OutputBits; -buildexponent(Exponent, OutputBits) -> - buildexponent(Exponent - 1, <<1:1, OutputBits/bitstring>>). - - -findexponent(BitStr) -> - findexponent(BitStr, 0). - -findexponent(<<>>, _) -> - error; -findexponent(<>, Acc) -> - case H of - 1 -> findexponent(T, Acc + 1); - 0 -> {ok, Acc, T} - end. - - -findremainder(BitStr, Factor) -> - case BitStr of - <> -> - {ok, Remainder, BitStrTail}; - _ -> - error - end. - - -get_slothash(Key, MaxHash, SlotCount) -> - Hash = erlang:phash2(Key, MaxHash), - {Hash rem SlotCount, Hash div SlotCount}. - - -%%%%%%%%%%%%%%%% -% T E S T -%%%%%%%%%%%%%%% - -corrupt_bloom(Bloom) -> - Length = bit_size(Bloom), - Random = random:uniform(Length), - <> = Bloom, - case Bit of - 1 -> - <>; - 0 -> - <> - end. - -bloom_test() -> - KeyList = ["key1", "key2", "key3", "key4"], - Bloom = create_bloom(KeyList), - io:format("Bloom of ~w of length ~w ~n", [Bloom, bit_size(Bloom)]), - ?assertMatch(true, check_key("key1", Bloom)), - ?assertMatch(true, check_key("key2", Bloom)), - ?assertMatch(true, check_key("key3", Bloom)), - ?assertMatch(true, check_key("key4", Bloom)), - ?assertMatch(false, check_key("key5", Bloom)). - -bloom_corruption_test() -> - KeyList = ["key1", "key2", "key3", "key4"], - Bloom = create_bloom(KeyList), - Bloom1 = corrupt_bloom(Bloom), - ?assertMatch(true, check_keys(KeyList, Bloom1)), - Bloom2 = corrupt_bloom(Bloom), - ?assertMatch(true, check_keys(KeyList, Bloom2)), - Bloom3 = corrupt_bloom(Bloom), - ?assertMatch(true, check_keys(KeyList, Bloom3)), - Bloom4 = corrupt_bloom(Bloom), - ?assertMatch(true, check_keys(KeyList, Bloom4)), - Bloom5 = corrupt_bloom(Bloom), - ?assertMatch(true, check_keys(KeyList, Bloom5)), - Bloom6 = corrupt_bloom(Bloom), - ?assertMatch(true, check_keys(KeyList, Bloom6)). 
- - diff --git a/src/leveled_sft.erl b/src/leveled_sft.erl index 9cc3a68..4a54eb6 100644 --- a/src/leveled_sft.erl +++ b/src/leveled_sft.erl @@ -159,7 +159,6 @@ sft_close/1, sft_clear/1, sft_checkready/1, - sft_getfilename/1, sft_setfordelete/2, sft_getmaxsequencenumber/1, generate_randomkeys/1]). @@ -255,9 +254,6 @@ sft_close(Pid) -> sft_checkready(Pid) -> gen_server:call(Pid, background_complete, infinity). -sft_getfilename(Pid) -> - gen_server:call(Pid, get_filename, infinty). - sft_getmaxsequencenumber(Pid) -> gen_server:call(Pid, get_maxsqn, infinity). @@ -330,8 +326,6 @@ handle_call(background_complete, _From, State) -> false -> {reply, {error, State#state.background_failure}, State} end; -handle_call(get_filename, _From, State) -> - {reply, State#state.filename, State}; handle_call({set_for_delete, Penciller}, _From, State) -> {reply, ok, @@ -362,9 +356,7 @@ handle_info(timeout, State) -> end; false -> {noreply, State} - end; -handle_info(_Info, State) -> - {noreply, State}. + end. terminate(Reason, State) -> io:format("Exit called for reason ~w on filename ~s~n", @@ -878,18 +870,12 @@ sftwrite_function(finalise, IndexLength:32/integer, FilterLength:32/integer, SummaryLength:32/integer>>), - file:close(Handle); -sftwrite_function(finalise, - {Handle, - SlotIndex, - SNExtremes, - KeyExtremes}) -> - {SlotFilters, PointerIndex} = convert_slotindex(SlotIndex), - sftwrite_function(finalise, - {Handle, - {SlotFilters, PointerIndex}, - SNExtremes, - KeyExtremes}). + {ok, _Position} = file:position(Handle, bof), + ok = file:advise(Handle, + BlocksLength + IndexLength, + FilterLength, + will_need), + file:close(Handle). %% Level 0 files are of variable (infinite) size to avoid issues with having %% any remainders when flushing from memory