Entropy fiddle

Try to increase the effectiveness of the bloom by combining Magic Hash with
phash2
This commit is contained in:
martinsumner 2016-12-29 11:59:07 +00:00
parent fb75a26497
commit 4784f8521a
2 changed files with 37 additions and 27 deletions

View file

@ -401,13 +401,14 @@ code_change(_OldVsn, StateName, State, _Extra) ->
fetch(LedgerKey, Hash, State) ->
Summary = State#state.summary,
case leveled_tinybloom:check({hash, Hash}, Summary#summary.bloom) of
case leveled_tinybloom:check({hash, Hash},
Summary#summary.bloom) of
false ->
{not_present, summary_bloom, null};
true ->
Slot = lookup_slot(LedgerKey, Summary#summary.index),
SlotBloom = Slot#slot_index_value.bloom,
case is_check_slot_required({hash, Hash}, SlotBloom) of
case is_check_slot_required({hash, Hash}, LedgerKey, SlotBloom) of
false ->
{not_present, slot_bloom, null};
true ->
@ -570,7 +571,9 @@ build_table_summary(SlotIndex, AllHashes, Level, FirstKey, L, MaxSQN) ->
false ->
element(2, lists:keyfind(default, 1, ?LEVEL_BLOOM_SLOTS))
end,
Bloom = lists:foldr(fun leveled_tinybloom:enter/2,
BloomAddFun =
fun({H, _K}, Bloom) -> leveled_tinybloom:enter(H, Bloom) end,
Bloom = lists:foldr(BloomAddFun,
leveled_tinybloom:empty(BloomSlots),
AllHashes),
[{LastKey, _LastV}|_Rest] = SlotIndex,
@ -627,7 +630,7 @@ build_all_slots(KVL, Count, Start, AllHashes, SlotID, SlotIndex, SlotsBin) ->
no_lookup ->
Acc;
H ->
[{hash, H}|Acc]
[{{hash, H}, K}|Acc]
end
end,
HashList = lists:foldr(ExtractHashFun, [], SlotList),
@ -649,16 +652,18 @@ build_all_slots(KVL, Count, Start, AllHashes, SlotID, SlotIndex, SlotsBin) ->
build_slot(KVList, HashList) ->
Tree = gb_trees:from_orddict(KVList),
Bloom = lists:foldr(fun leveled_tinybloom:tiny_enter/2,
BloomAddFun =
fun({H, K}, Bloom) -> leveled_tinybloom:tiny_enter(H, K, Bloom) end,
Bloom = lists:foldr(BloomAddFun,
leveled_tinybloom:tiny_empty(),
HashList),
SlotBin = term_to_binary(Tree, [{compressed, ?COMPRESSION_LEVEL}]),
{SlotBin, Bloom}.
is_check_slot_required(_Hash, none) ->
is_check_slot_required(_Hash, _Key, none) ->
true;
is_check_slot_required(Hash, Bloom) ->
leveled_tinybloom:tiny_check(Hash, Bloom).
is_check_slot_required(Hash, Key, Bloom) ->
leveled_tinybloom:tiny_check(Hash, Key, Bloom).
%% Returns a section from the summary index and two booleans to indicate if
%% the first slot needs trimming, or the last slot
@ -1030,7 +1035,7 @@ simple_slotbin_test() ->
ExtractHashFun =
fun({K, V}) ->
{_SQN, H} = leveled_codec:strip_to_seqnhashonly({K, V}),
{hash, H} end,
{{hash, H}, K} end,
HashList = lists:map(ExtractHashFun, KVList1),
SW0 = os:timestamp(),
{SlotBin0, Bloom0} = build_slot(KVList1, HashList),
@ -1038,8 +1043,10 @@ simple_slotbin_test() ->
[timer:now_diff(os:timestamp(), SW0), byte_size(SlotBin0)]),
SW1 = os:timestamp(),
lists:foreach(fun(H) -> ?assertMatch(true,
is_check_slot_required(H, Bloom0))
lists:foreach(fun({H, K}) -> ?assertMatch(true,
is_check_slot_required(H,
K,
Bloom0))
end,
HashList),
lists:foreach(fun({K, V}) ->

View file

@ -20,8 +20,8 @@
enter/2,
check/2,
empty/1,
tiny_enter/2,
tiny_check/2,
tiny_enter/3,
tiny_check/3,
tiny_empty/0
]).
@ -75,16 +75,16 @@ check(Key, Bloom) ->
tiny_empty() ->
<<0:1024>>.
tiny_enter({hash, no_lookup}, Bloom) ->
tiny_enter({hash, no_lookup}, _Key, Bloom) ->
Bloom;
tiny_enter({hash, Hash}, Bloom) ->
{Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash),
tiny_enter({hash, Hash}, Key, Bloom) ->
{Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash, Key),
AddFun = fun(Bit, Arr0) -> add_to_array(Bit, Arr0, 1024) end,
lists:foldl(AddFun, Bloom, [Bit0, Bit1, Bit2]).
tiny_check({hash, Hash}, Bloom) ->
{Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash),
tiny_check({hash, Hash}, Key, Bloom) ->
{Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash, Key),
case getbit(Bit0, Bloom, 1024) of
<<0:1>> ->
false;
@ -113,8 +113,9 @@ split_hash(Hash) ->
H2 = Hash bsr 20,
{H0, H1, H2}.
split_hash_for_tinybloom(Hash) ->
split_hash_for_tinybloom(MagicHash, Key) ->
% Tiny bloom can make k=3 from one hash
Hash = MagicHash bxor erlang:phash2(Key),
H0 = Hash band 1023,
H1 = (Hash bsr 11) band 1023,
H2 = (Hash bsr 22) band 1023,
@ -194,8 +195,8 @@ simple_test() ->
?assertMatch(true, FP < (N div 4)).
tiny_test() ->
N = 256,
K = 32, % more checks out then in K * checks
N = 128,
K = 64, % more checks out than in K * checks
KLin = lists:map(fun(X) -> "Key_" ++
integer_to_list(X) ++
integer_to_list(random:uniform(100)) ++
@ -211,27 +212,29 @@ tiny_test() ->
lists:seq(1, N * K)),
HashIn = lists:map(fun(X) ->
{hash, leveled_codec:magic_hash(X)} end,
{{hash, leveled_codec:magic_hash(X)}, X} end,
KLin),
HashOut = lists:map(fun(X) ->
{hash, leveled_codec:magic_hash(X)} end,
{{hash, leveled_codec:magic_hash(X)}, X} end,
KLout),
SW1 = os:timestamp(),
Bloom = lists:foldr(fun tiny_enter/2, tiny_empty(), HashIn),
Bloom = lists:foldr(fun({H0, K0}, B) -> tiny_enter(H0, K0, B) end,
tiny_empty(),
HashIn),
io:format(user,
"~nAdding ~w hashes to tiny bloom took ~w microseconds~n",
[N, timer:now_diff(os:timestamp(), SW1)]),
SW2 = os:timestamp(),
lists:foreach(fun(X) ->
?assertMatch(true, tiny_check(X, Bloom)) end, HashIn),
lists:foreach(fun({H1, K1}) ->
?assertMatch(true, tiny_check(H1, K1, Bloom)) end, HashIn),
io:format(user,
"~nChecking ~w hashes in tiny bloom took ~w microseconds~n",
[N, timer:now_diff(os:timestamp(), SW2)]),
SW3 = os:timestamp(),
FP = lists:foldr(fun(X, Acc) -> case tiny_check(X, Bloom) of
FP = lists:foldr(fun({H3, K3}, Acc) -> case tiny_check(H3, K3, Bloom) of
true -> Acc + 1;
false -> Acc
end end,