Entropy fiddle

Try to increase the effectiveness of the bloom by combining Magic Hash with
phash2
This commit is contained in:
martinsumner 2016-12-29 11:59:07 +00:00
parent fb75a26497
commit 4784f8521a
2 changed files with 37 additions and 27 deletions

View file

@ -401,13 +401,14 @@ code_change(_OldVsn, StateName, State, _Extra) ->
fetch(LedgerKey, Hash, State) -> fetch(LedgerKey, Hash, State) ->
Summary = State#state.summary, Summary = State#state.summary,
case leveled_tinybloom:check({hash, Hash}, Summary#summary.bloom) of case leveled_tinybloom:check({hash, Hash},
Summary#summary.bloom) of
false -> false ->
{not_present, summary_bloom, null}; {not_present, summary_bloom, null};
true -> true ->
Slot = lookup_slot(LedgerKey, Summary#summary.index), Slot = lookup_slot(LedgerKey, Summary#summary.index),
SlotBloom = Slot#slot_index_value.bloom, SlotBloom = Slot#slot_index_value.bloom,
case is_check_slot_required({hash, Hash}, SlotBloom) of case is_check_slot_required({hash, Hash}, LedgerKey, SlotBloom) of
false -> false ->
{not_present, slot_bloom, null}; {not_present, slot_bloom, null};
true -> true ->
@ -570,7 +571,9 @@ build_table_summary(SlotIndex, AllHashes, Level, FirstKey, L, MaxSQN) ->
false -> false ->
element(2, lists:keyfind(default, 1, ?LEVEL_BLOOM_SLOTS)) element(2, lists:keyfind(default, 1, ?LEVEL_BLOOM_SLOTS))
end, end,
Bloom = lists:foldr(fun leveled_tinybloom:enter/2, BloomAddFun =
fun({H, _K}, Bloom) -> leveled_tinybloom:enter(H, Bloom) end,
Bloom = lists:foldr(BloomAddFun,
leveled_tinybloom:empty(BloomSlots), leveled_tinybloom:empty(BloomSlots),
AllHashes), AllHashes),
[{LastKey, _LastV}|_Rest] = SlotIndex, [{LastKey, _LastV}|_Rest] = SlotIndex,
@ -627,7 +630,7 @@ build_all_slots(KVL, Count, Start, AllHashes, SlotID, SlotIndex, SlotsBin) ->
no_lookup -> no_lookup ->
Acc; Acc;
H -> H ->
[{hash, H}|Acc] [{{hash, H}, K}|Acc]
end end
end, end,
HashList = lists:foldr(ExtractHashFun, [], SlotList), HashList = lists:foldr(ExtractHashFun, [], SlotList),
@ -649,16 +652,18 @@ build_all_slots(KVL, Count, Start, AllHashes, SlotID, SlotIndex, SlotsBin) ->
build_slot(KVList, HashList) -> build_slot(KVList, HashList) ->
Tree = gb_trees:from_orddict(KVList), Tree = gb_trees:from_orddict(KVList),
Bloom = lists:foldr(fun leveled_tinybloom:tiny_enter/2, BloomAddFun =
fun({H, K}, Bloom) -> leveled_tinybloom:tiny_enter(H, K, Bloom) end,
Bloom = lists:foldr(BloomAddFun,
leveled_tinybloom:tiny_empty(), leveled_tinybloom:tiny_empty(),
HashList), HashList),
SlotBin = term_to_binary(Tree, [{compressed, ?COMPRESSION_LEVEL}]), SlotBin = term_to_binary(Tree, [{compressed, ?COMPRESSION_LEVEL}]),
{SlotBin, Bloom}. {SlotBin, Bloom}.
is_check_slot_required(_Hash, none) -> is_check_slot_required(_Hash, _Key, none) ->
true; true;
is_check_slot_required(Hash, Bloom) -> is_check_slot_required(Hash, Key, Bloom) ->
leveled_tinybloom:tiny_check(Hash, Bloom). leveled_tinybloom:tiny_check(Hash, Key, Bloom).
%% Returns a section from the summary index and two booleans to indicate if %% Returns a section from the summary index and two booleans to indicate if
%% the first slot needs trimming, or the last slot %% the first slot needs trimming, or the last slot
@ -1030,7 +1035,7 @@ simple_slotbin_test() ->
ExtractHashFun = ExtractHashFun =
fun({K, V}) -> fun({K, V}) ->
{_SQN, H} = leveled_codec:strip_to_seqnhashonly({K, V}), {_SQN, H} = leveled_codec:strip_to_seqnhashonly({K, V}),
{hash, H} end, {{hash, H}, K} end,
HashList = lists:map(ExtractHashFun, KVList1), HashList = lists:map(ExtractHashFun, KVList1),
SW0 = os:timestamp(), SW0 = os:timestamp(),
{SlotBin0, Bloom0} = build_slot(KVList1, HashList), {SlotBin0, Bloom0} = build_slot(KVList1, HashList),
@ -1038,8 +1043,10 @@ simple_slotbin_test() ->
[timer:now_diff(os:timestamp(), SW0), byte_size(SlotBin0)]), [timer:now_diff(os:timestamp(), SW0), byte_size(SlotBin0)]),
SW1 = os:timestamp(), SW1 = os:timestamp(),
lists:foreach(fun(H) -> ?assertMatch(true, lists:foreach(fun({H, K}) -> ?assertMatch(true,
is_check_slot_required(H, Bloom0)) is_check_slot_required(H,
K,
Bloom0))
end, end,
HashList), HashList),
lists:foreach(fun({K, V}) -> lists:foreach(fun({K, V}) ->

View file

@ -20,8 +20,8 @@
enter/2, enter/2,
check/2, check/2,
empty/1, empty/1,
tiny_enter/2, tiny_enter/3,
tiny_check/2, tiny_check/3,
tiny_empty/0 tiny_empty/0
]). ]).
@ -75,16 +75,16 @@ check(Key, Bloom) ->
tiny_empty() -> tiny_empty() ->
<<0:1024>>. <<0:1024>>.
tiny_enter({hash, no_lookup}, Bloom) -> tiny_enter({hash, no_lookup}, _Key, Bloom) ->
Bloom; Bloom;
tiny_enter({hash, Hash}, Bloom) -> tiny_enter({hash, Hash}, Key, Bloom) ->
{Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash), {Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash, Key),
AddFun = fun(Bit, Arr0) -> add_to_array(Bit, Arr0, 1024) end, AddFun = fun(Bit, Arr0) -> add_to_array(Bit, Arr0, 1024) end,
lists:foldl(AddFun, Bloom, [Bit0, Bit1, Bit2]). lists:foldl(AddFun, Bloom, [Bit0, Bit1, Bit2]).
tiny_check({hash, Hash}, Bloom) -> tiny_check({hash, Hash}, Key, Bloom) ->
{Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash), {Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash, Key),
case getbit(Bit0, Bloom, 1024) of case getbit(Bit0, Bloom, 1024) of
<<0:1>> -> <<0:1>> ->
false; false;
@ -113,8 +113,9 @@ split_hash(Hash) ->
H2 = Hash bsr 20, H2 = Hash bsr 20,
{H0, H1, H2}. {H0, H1, H2}.
split_hash_for_tinybloom(Hash) -> split_hash_for_tinybloom(MagicHash, Key) ->
% Tiny bloom can make k=3 from one hash % Tiny bloom can make k=3 from one hash
Hash = MagicHash bxor erlang:phash2(Key),
H0 = Hash band 1023, H0 = Hash band 1023,
H1 = (Hash bsr 11) band 1023, H1 = (Hash bsr 11) band 1023,
H2 = (Hash bsr 22) band 1023, H2 = (Hash bsr 22) band 1023,
@ -194,8 +195,8 @@ simple_test() ->
?assertMatch(true, FP < (N div 4)). ?assertMatch(true, FP < (N div 4)).
tiny_test() -> tiny_test() ->
N = 256, N = 128,
K = 32, % more checks out then in K * checks K = 64, % more checks out than in K * checks
KLin = lists:map(fun(X) -> "Key_" ++ KLin = lists:map(fun(X) -> "Key_" ++
integer_to_list(X) ++ integer_to_list(X) ++
integer_to_list(random:uniform(100)) ++ integer_to_list(random:uniform(100)) ++
@ -211,27 +212,29 @@ tiny_test() ->
lists:seq(1, N * K)), lists:seq(1, N * K)),
HashIn = lists:map(fun(X) -> HashIn = lists:map(fun(X) ->
{hash, leveled_codec:magic_hash(X)} end, {{hash, leveled_codec:magic_hash(X)}, X} end,
KLin), KLin),
HashOut = lists:map(fun(X) -> HashOut = lists:map(fun(X) ->
{hash, leveled_codec:magic_hash(X)} end, {{hash, leveled_codec:magic_hash(X)}, X} end,
KLout), KLout),
SW1 = os:timestamp(), SW1 = os:timestamp(),
Bloom = lists:foldr(fun tiny_enter/2, tiny_empty(), HashIn), Bloom = lists:foldr(fun({H0, K0}, B) -> tiny_enter(H0, K0, B) end,
tiny_empty(),
HashIn),
io:format(user, io:format(user,
"~nAdding ~w hashes to tiny bloom took ~w microseconds~n", "~nAdding ~w hashes to tiny bloom took ~w microseconds~n",
[N, timer:now_diff(os:timestamp(), SW1)]), [N, timer:now_diff(os:timestamp(), SW1)]),
SW2 = os:timestamp(), SW2 = os:timestamp(),
lists:foreach(fun(X) -> lists:foreach(fun({H1, K1}) ->
?assertMatch(true, tiny_check(X, Bloom)) end, HashIn), ?assertMatch(true, tiny_check(H1, K1, Bloom)) end, HashIn),
io:format(user, io:format(user,
"~nChecking ~w hashes in tiny bloom took ~w microseconds~n", "~nChecking ~w hashes in tiny bloom took ~w microseconds~n",
[N, timer:now_diff(os:timestamp(), SW2)]), [N, timer:now_diff(os:timestamp(), SW2)]),
SW3 = os:timestamp(), SW3 = os:timestamp(),
FP = lists:foldr(fun(X, Acc) -> case tiny_check(X, Bloom) of FP = lists:foldr(fun({H3, K3}, Acc) -> case tiny_check(H3, K3, Bloom) of
true -> Acc + 1; true -> Acc + 1;
false -> Acc false -> Acc
end end, end end,