Add fadvise magic to SFT files.  Also delete unnecessary rice module
This commit is contained in:
martinsumner 2016-10-12 18:10:47 +01:00
parent 938cc0fc16
commit 2d981cb2e7
2 changed files with 7 additions and 304 deletions

View file

@ -1,283 +0,0 @@
%% Used for creating fixed-size self-regulating encoded bloom filters
%%
%% Normally a bloom filter, in order to achieve optimum size, increases the
%% number of hashes as the desired false positive rate increases. There is
%% a processing overhead for checking this bloom, both because of the number
%% of hash calculations required, and also because of the need to CRC check
%% the bloom to ensure a false negative result is not returned due to
%% corruption.
%%
%% A more space efficient bloom can be achieved through the compression of
%% bloom filters with less hashes (and in an optimal case a single hash).
%% This can be achieved using rice encoding.
%%
%% Rice-encoding and single hash blooms are used here in order to provide an
%% optimally space efficient solution, but also as the processing required to
%% support uncompression can be concurrently performing a checksum role.
%%
%% For this to work, the bloom is divided into 64 parts and a 24-bit hash is
%% required. Each hash is placed into one of 64 blooms based on the six least
%% significant bits of the hash, and the most significant 18 bits are used
%% to indicate the bit to be added to the bloom.
%%
%% The bloom is then created by calculating the differences between the ordered
%% elements of the hash list and representing the difference using an exponent
%% and a 13-bit remainder i.e.
%% 8000 -> 0 11111 01000000
%% 10000 -> 10 00000 00010000
%% 20000 -> 110 01110 00100000
%%
%% Each bloom should have approximately 64 differences.
%%
%% Fronting the bloom is a bloom index, formed first by 64 pairs of 3-byte
%% max hash, 2-byte length (bits) - with then each of the encoded bitstrings
%% appended. The max hash is the total of all the differences (which should
%% be the highest hash in the bloom).
%%
%% To check a key against the bloom, hash it, take the six least significant
%% bits and read the start pointer, max hash and end pointer from the expected
%% positions in the bloom index. Then roll through from the start pointer to
%% the end pointer, accumulating each difference. There is a possible match if
%% either the accumulator hits the expected hash or the max hash doesn't match
%% the final accumulator (to cover if the bloom has been corrupted by a bit
%% flip somewhere). A miss is more than twice as expensive (on average) than a
%% potential match - but still only requires around 64 integer additions
%% and the processing of <100 bytes of data.
%%
%% For 2048 keys, this takes up <4KB. The false positive rate is 0.000122
%% This compares favourably for the equivalent size optimal bloom which
%% would require 11 hashes and have a false positive rate of 0.000459.
%% Checking with a positive match should take on average about 6 microseconds,
%% and a negative match should take around 11 microseconds.
%%
%% See ../test/rice_test.erl for proving timings and fpr.
-module(leveled_rice).
-export([create_bloom/1,
check_key/2,
check_keys/2]).
-include_lib("eunit/include/eunit.hrl").
%% Number of per-slot blooms the filter is split into; a key's slot is its
%% hash `rem` this value.
-define(SLOT_COUNT, 64).
%% Range passed to erlang:phash2/2, so key hashes fall in 0..(2^24 - 1).
-define(MAX_HASH, 16777216).
%% Width in bits of the fixed-size remainder field of each encoded gap.
-define(DIVISOR_BITS, 13).
%% Gaps are split into a unary quotient (gap div DIVISOR) and a remainder
%% (gap rem DIVISOR) written in DIVISOR_BITS bits.
%% NOTE(review): 2^13 is 8192, not 8092. Encoding and decoding both use this
%% macro and every remainder below 8092 still fits in 13 bits, so the format
%% round-trips; confirm whether 8192 was intended before changing it, as the
%% value is baked into every serialised bloom.
-define(DIVISOR, 8092).
%% Build the serialised bloom (a single bitstring) for a list of keys, using
%% the module-wide slot count and hash range.
create_bloom(KeyList) ->
    create_bloom(KeyList, ?SLOT_COUNT, ?MAX_HASH).

%% As create_bloom/1, with the slot count and hash range given explicitly.
%% Every slot starts as an empty hash list; the keys are then distributed
%% over the slots and the resulting array is serialised.
create_bloom(KeyList, SlotCount, MaxHash) ->
    EmptySlots = array:new(SlotCount, [{default, []}]),
    SlotHashes = create_hashlist(KeyList, EmptySlots, SlotCount, MaxHash),
    serialise_bloom(SlotHashes).
%% Check a list of keys against a serialised bloom.
%% Returns true only when check_key/2 returns true for every key (and for an
%% empty key list); stops at, and returns, the first false.
check_keys([], _) ->
    true;
check_keys([Key | Rest], BitStr) ->
    check_key(Key, BitStr) andalso check_keys(Rest, BitStr).
%% Check a single key against a serialised bloom using the module defaults.
%% true means "possibly present" (or the filter looks corrupt); false means
%% the key is not in the filter.
check_key(Key, BitStr) ->
    check_key(Key, BitStr, ?SLOT_COUNT, ?MAX_HASH, ?DIVISOR_BITS, ?DIVISOR).

%% Parameterised form of check_key/2.
%% The index at the front of BitStr holds one 40-bit entry per slot, so the
%% encoded slot blooms begin at bit 40 * SlotCount.
check_key(Key, BitStr, SlotCount, MaxHash, Factor, Divisor) ->
    {Slot, Hash} = get_slothash(Key, MaxHash, SlotCount),
    IndexBits = 40 * SlotCount,
    {Offset, BloomBits, TopHash} = find_position(Slot, BitStr, 0, IndexBits),
    case BitStr of
        <<_:Offset/bitstring, SlotBloom:BloomBits/bitstring, _/bitstring>> ->
            check_hash(Hash, SlotBloom, Factor, Divisor, 0, TopHash);
        _ ->
            %% The offset/length read from the index do not fit inside the
            %% bitstring: err on the side of a possible match.
            io:format("Possible corruption of bloom index ~n"),
            true
    end.
%% Walk the bloom index (40-bit entries: 24-bit top hash, 16-bit length in
%% bits) until entry number Slot is reached, summing the lengths of the
%% entries passed over.
%% Returns {StartPosition, Length, TopHash} for that slot, where StartPosition
%% is the initial StartPosition plus the lengths of all earlier slots.
find_position(Slot, BloomIndex, Counter, StartPosition) ->
    <<TopHash:24/integer, SlotBits:16/integer, IndexTail/bitstring>> = BloomIndex,
    if
        Slot =:= Counter ->
            {StartPosition, SlotBits, TopHash};
        true ->
            find_position(Slot, IndexTail, Counter + 1, StartPosition + SlotBits)
    end.
%% Look for HashToCheck inside one slot's encoded bloom.
%% The bloom is a sequence of encoded gaps; Acc is the running total of the
%% gaps decoded so far. Returns true as soon as the running total equals
%% HashToCheck. When the bloom is exhausted without a hit, the running total
%% must equal the slot's recorded top hash - if so the answer is false,
%% otherwise (or if a gap cannot be decoded) the bloom is treated as corrupt
%% and true is returned.
check_hash(_, <<>>, _, _, MaxHash, MaxHash) ->
    false;
check_hash(_, <<>>, _, _, _, _) ->
    io:format("Failure of CRC check on bloom filter~n"),
    true;
check_hash(HashToCheck, BitStr, Factor, Divisor, Acc, TopHash) ->
    case decode_gap(BitStr, Factor, Divisor) of
        {ok, Gap, _} when Acc + Gap =:= HashToCheck ->
            true;
        {ok, Gap, Remaining} ->
            check_hash(HashToCheck, Remaining, Factor, Divisor,
                       Acc + Gap, TopHash);
        error ->
            io:format("Failure of CRC check on bloom filter~n"),
            true
    end.

%% Decode one gap (unary exponent followed by a Factor-bit remainder) from the
%% front of BitStr. Returns {ok, Gap, Tail} or error if either part is missing.
decode_gap(BitStr, Factor, Divisor) ->
    case findexponent(BitStr) of
        {ok, Exponent, AfterExponent} ->
            case findremainder(AfterExponent, Factor) of
                {ok, Remainder, Tail} ->
                    {ok, Divisor * Exponent + Remainder, Tail};
                error ->
                    error
            end;
        error ->
            error
    end.
%% Distribute the keys over the slot array: each key is hashed to a
%% {Slot, Hash} pair and Hash is merged into that slot's list, which is kept
%% sorted and free of duplicates.
create_hashlist(KeyList, HashLists, SlotCount, MaxHash) ->
    AddKey =
        fun(Key, Slots) ->
            {Slot, Hash} = get_slothash(Key, MaxHash, SlotCount),
            Existing = array:get(Slot, Slots),
            array:set(Slot, lists:usort([Hash | Existing]), Slots)
        end,
    lists:foldl(AddKey, HashLists, KeyList).
%% Convert an array of hash lists into a serialised bloom.
serialise_bloom(HashLists) ->
    serialise_bloom(HashLists, array:size(HashLists), 0, []).

%% Encode the slots one at a time, from index 0 upwards, pushing each encoded
%% slot onto Blooms (so the list ends up in reverse slot order). Once Counter
%% reaches SlotCount the list is handed to finalise_bloom/1.
serialise_bloom(_HashLists, SlotCount, SlotCount, Blooms) ->
    finalise_bloom(Blooms);
serialise_bloom(HashLists, SlotCount, Counter, Blooms) ->
    SlotBloom = serialise_singlebloom(array:get(Counter, HashLists)),
    serialise_bloom(HashLists, SlotCount, Counter + 1, [SlotBloom | Blooms]).
%% Encode one slot's sorted hash list using the module's divisor settings.
%% Returns {EncodedBits, TopHash}, where TopHash is the last hash encoded
%% (0 for an empty list).
serialise_singlebloom(HashList) ->
    serialise_singlebloom(HashList, <<>>, 0, ?DIVISOR, ?DIVISOR_BITS).

%% Each hash is written as its gap from the previous hash: the quotient
%% (gap div Divisor) in unary via buildexponent/1, then the remainder
%% (gap rem Divisor) as a Factor-bit integer.
serialise_singlebloom([], BloomStr, TopHash, _, _) ->
    {BloomStr, TopHash};
serialise_singlebloom([Hash | Rest], BloomStr, TopHash, Divisor, Factor) ->
    Gap = Hash - TopHash,
    UnaryQuotient = buildexponent(Gap div Divisor),
    Extended = <<BloomStr/bitstring,
                 UnaryQuotient/bitstring,
                 (Gap rem Divisor):Factor/integer>>,
    serialise_singlebloom(Rest, Extended, Hash, Divisor, Factor).
%% Assemble the final bitstring from a list of {EncodedBits, TopHash} pairs:
%% first the index (per pair: 24-bit top hash, 16-bit length of the encoded
%% bits), then the encoded bits themselves.
finalise_bloom(Blooms) ->
    finalise_bloom(Blooms, {<<>>, <<>>}).

%% Each pair is prepended to both accumulators, so the output order is the
%% reverse of the input list order.
finalise_bloom([], {BloomIndex, BloomStr}) ->
    <<BloomIndex/bitstring, BloomStr/bitstring>>;
finalise_bloom([{SlotStr, TopHash} | Rest], {IndexAcc, BodyAcc}) ->
    SlotBits = bit_size(SlotStr),
    NewIndex = <<TopHash:24/integer, SlotBits:16/integer, IndexAcc/bitstring>>,
    NewBody = <<SlotStr/bitstring, BodyAcc/bitstring>>,
    finalise_bloom(Rest, {NewIndex, NewBody}).
%% Unary-encode Exponent: that many 1 bits followed by a single 0 bit.
buildexponent(Exponent) ->
    buildexponent(Exponent, <<0:1>>).

%% Prepend one 1 bit per remaining count until the count reaches 0.
buildexponent(Remaining, Bits) ->
    case Remaining of
        0 ->
            Bits;
        _ ->
            buildexponent(Remaining - 1, <<1:1, Bits/bitstring>>)
    end.
%% Decode a unary exponent from the front of BitStr.
%% Returns {ok, CountOfLeadingOnes, BitsAfterTheTerminatingZero}, or error if
%% the bits run out before a 0 bit is seen.
findexponent(BitStr) ->
    findexponent(BitStr, 0).

findexponent(<<1:1, Tail/bitstring>>, Ones) ->
    findexponent(Tail, Ones + 1);
findexponent(<<0:1, Tail/bitstring>>, Ones) ->
    {ok, Ones, Tail};
findexponent(<<>>, _) ->
    error.
%% Read a Factor-bit unsigned integer from the front of BitStr.
%% Returns {ok, Value, RemainingBits}, or error when BitStr does not start
%% with a field of that width.
findremainder(BitStr, Factor) ->
    case BitStr of
        <<Value:Factor/integer, Tail/bitstring>> -> {ok, Value, Tail};
        _NoMatch -> error
    end.
%% Hash Key into the range 0..MaxHash-1 with erlang:phash2/2 and split the
%% result into {Slot, SlotHash}: Slot is the hash modulo SlotCount and
%% SlotHash is the hash integer-divided by SlotCount.
get_slothash(Key, MaxHash, SlotCount) ->
    FullHash = erlang:phash2(Key, MaxHash),
    Slot = FullHash rem SlotCount,
    SlotHash = FullHash div SlotCount,
    {Slot, SlotHash}.
%%%%%%%%%%%%%%%%
% T E S T
%%%%%%%%%%%%%%%
%% Test helper: return a copy of Bloom with exactly one randomly chosen bit
%% inverted.
%%
%% The bit offset is drawn uniformly from 0..bit_size(Bloom)-1.
%% rand:uniform/1 yields 1..N, so 1 is subtracted: using the raw value as the
%% offset would never flip the first bit and would fail to match when the
%% draw equals the bit size (no bit left to flip).
%% Bloom must be a non-empty bitstring; rand:uniform/1 rejects a range of 0.
corrupt_bloom(Bloom) ->
    Offset = rand:uniform(bit_size(Bloom)) - 1,
    <<Before:Offset/bitstring, Bit:1/integer, After/bitstring>> = Bloom,
    <<Before/bitstring, (1 - Bit):1/integer, After/bitstring>>.
%% Every key used to build the bloom must be reported as possibly present,
%% and "key5", which was not added, must be reported as absent.
bloom_test() ->
    KeyList = ["key1", "key2", "key3", "key4"],
    Bloom = create_bloom(KeyList),
    io:format("Bloom of ~w of length ~w ~n", [Bloom, bit_size(Bloom)]),
    lists:foreach(
        fun(Key) -> ?assertMatch(true, check_key(Key, Bloom)) end,
        KeyList),
    ?assertMatch(false, check_key("key5", Bloom)).
%% Flip one random bit of a freshly built bloom, six times over (each time
%% starting from the uncorrupted bloom), and require that all the original
%% keys are still reported as possibly present.
bloom_corruption_test() ->
    KeyList = ["key1", "key2", "key3", "key4"],
    Bloom = create_bloom(KeyList),
    lists:foreach(
        fun(_Attempt) ->
            Corrupted = corrupt_bloom(Bloom),
            ?assertMatch(true, check_keys(KeyList, Corrupted))
        end,
        lists:seq(1, 6)).

View file

@ -159,7 +159,6 @@
sft_close/1,
sft_clear/1,
sft_checkready/1,
sft_getfilename/1,
sft_setfordelete/2,
sft_getmaxsequencenumber/1,
generate_randomkeys/1]).
@ -255,9 +254,6 @@ sft_close(Pid) ->
%% Synchronously send `background_complete` to the SFT process Pid and return
%% its reply, waiting indefinitely. One visible branch of the handler replies
%% {error, BackgroundFailure}; the other branch is not visible here.
sft_checkready(Pid) ->
    Request = background_complete,
    gen_server:call(Pid, Request, infinity).
%% Synchronously ask the SFT process Pid for the filename held in its state,
%% waiting indefinitely for the reply.
%% The timeout must be the atom `infinity` (or a non-negative integer); the
%% previous spelling `infinty` is not a valid gen_server:call/3 timeout.
sft_getfilename(Pid) ->
    gen_server:call(Pid, get_filename, infinity).
%% Synchronously send `get_maxsqn` to the SFT process Pid and return its
%% reply, waiting indefinitely.
%% NOTE(review): presumably the reply is the file's highest sequence number;
%% the handler is not visible here - confirm.
sft_getmaxsequencenumber(Pid) ->
    Request = get_maxsqn,
    gen_server:call(Pid, Request, infinity).
@ -330,8 +326,6 @@ handle_call(background_complete, _From, State) ->
false ->
{reply, {error, State#state.background_failure}, State}
end;
handle_call(get_filename, _From, State) ->
{reply, State#state.filename, State};
handle_call({set_for_delete, Penciller}, _From, State) ->
{reply,
ok,
@ -362,9 +356,7 @@ handle_info(timeout, State) ->
end;
false ->
{noreply, State}
end;
handle_info(_Info, State) ->
{noreply, State}.
end.
terminate(Reason, State) ->
io:format("Exit called for reason ~w on filename ~s~n",
@ -878,18 +870,12 @@ sftwrite_function(finalise,
IndexLength:32/integer,
FilterLength:32/integer,
SummaryLength:32/integer>>),
file:close(Handle);
sftwrite_function(finalise,
{Handle,
SlotIndex,
SNExtremes,
KeyExtremes}) ->
{SlotFilters, PointerIndex} = convert_slotindex(SlotIndex),
sftwrite_function(finalise,
{Handle,
{SlotFilters, PointerIndex},
SNExtremes,
KeyExtremes}).
{ok, _Position} = file:position(Handle, bof),
ok = file:advise(Handle,
BlocksLength + IndexLength,
FilterLength,
will_need),
file:close(Handle).
%% Level 0 files are of variable (infinite) size to avoid issues with having
%% any remainders when flushing from memory