From 4784f8521aa0875f82511e4dbb06a1e12fb6f22e Mon Sep 17 00:00:00 2001 From: martinsumner Date: Thu, 29 Dec 2016 11:59:07 +0000 Subject: [PATCH] Entropy fiddle Try and increase effectiveness of bloom by combining Magic Hash with phash2 --- src/leveled_sst.erl | 29 ++++++++++++++++++----------- src/leveled_tinybloom.erl | 35 +++++++++++++++++++---------------- 2 files changed, 37 insertions(+), 27 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 7078064..f84d43a 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -401,13 +401,14 @@ code_change(_OldVsn, StateName, State, _Extra) -> fetch(LedgerKey, Hash, State) -> Summary = State#state.summary, - case leveled_tinybloom:check({hash, Hash}, Summary#summary.bloom) of + case leveled_tinybloom:check({hash, Hash}, + Summary#summary.bloom) of false -> {not_present, summary_bloom, null}; true -> Slot = lookup_slot(LedgerKey, Summary#summary.index), SlotBloom = Slot#slot_index_value.bloom, - case is_check_slot_required({hash, Hash}, SlotBloom) of + case is_check_slot_required({hash, Hash}, LedgerKey, SlotBloom) of false -> {not_present, slot_bloom, null}; true -> @@ -570,7 +571,9 @@ build_table_summary(SlotIndex, AllHashes, Level, FirstKey, L, MaxSQN) -> false -> element(2, lists:keyfind(default, 1, ?LEVEL_BLOOM_SLOTS)) end, - Bloom = lists:foldr(fun leveled_tinybloom:enter/2, + BloomAddFun = + fun({H, _K}, Bloom) -> leveled_tinybloom:enter(H, Bloom) end, + Bloom = lists:foldr(BloomAddFun, leveled_tinybloom:empty(BloomSlots), AllHashes), [{LastKey, _LastV}|_Rest] = SlotIndex, @@ -627,7 +630,7 @@ build_all_slots(KVL, Count, Start, AllHashes, SlotID, SlotIndex, SlotsBin) -> no_lookup -> Acc; H -> - [{hash, H}|Acc] + [{{hash, H}, K}|Acc] end end, HashList = lists:foldr(ExtractHashFun, [], SlotList), @@ -649,16 +652,18 @@ build_all_slots(KVL, Count, Start, AllHashes, SlotID, SlotIndex, SlotsBin) -> build_slot(KVList, HashList) -> Tree = gb_trees:from_orddict(KVList), - Bloom = lists:foldr(fun 
leveled_tinybloom:tiny_enter/2, + BloomAddFun = + fun({H, K}, Bloom) -> leveled_tinybloom:tiny_enter(H, K, Bloom) end, + Bloom = lists:foldr(BloomAddFun, leveled_tinybloom:tiny_empty(), HashList), SlotBin = term_to_binary(Tree, [{compressed, ?COMPRESSION_LEVEL}]), {SlotBin, Bloom}. -is_check_slot_required(_Hash, none) -> +is_check_slot_required(_Hash, _Key, none) -> true; -is_check_slot_required(Hash, Bloom) -> - leveled_tinybloom:tiny_check(Hash, Bloom). +is_check_slot_required(Hash, Key, Bloom) -> + leveled_tinybloom:tiny_check(Hash, Key, Bloom). %% Returns a section from the summary index and two booleans to indicate if %% the first slot needs trimming, or the last slot @@ -1030,7 +1035,7 @@ simple_slotbin_test() -> ExtractHashFun = fun({K, V}) -> {_SQN, H} = leveled_codec:strip_to_seqnhashonly({K, V}), - {hash, H} end, + {{hash, H}, K} end, HashList = lists:map(ExtractHashFun, KVList1), SW0 = os:timestamp(), {SlotBin0, Bloom0} = build_slot(KVList1, HashList), @@ -1038,8 +1043,10 @@ simple_slotbin_test() -> [timer:now_diff(os:timestamp(), SW0), byte_size(SlotBin0)]), SW1 = os:timestamp(), - lists:foreach(fun(H) -> ?assertMatch(true, - is_check_slot_required(H, Bloom0)) + lists:foreach(fun({H, K}) -> ?assertMatch(true, + is_check_slot_required(H, + K, + Bloom0)) end, HashList), lists:foreach(fun({K, V}) -> diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl index 5428917..c03a5b5 100644 --- a/src/leveled_tinybloom.erl +++ b/src/leveled_tinybloom.erl @@ -20,8 +20,8 @@ enter/2, check/2, empty/1, - tiny_enter/2, - tiny_check/2, + tiny_enter/3, + tiny_check/3, tiny_empty/0 ]). @@ -75,16 +75,16 @@ check(Key, Bloom) -> tiny_empty() -> <<0:1024>>. 
-tiny_enter({hash, no_lookup}, Bloom) -> +tiny_enter({hash, no_lookup}, _Key, Bloom) -> Bloom; -tiny_enter({hash, Hash}, Bloom) -> - {Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash), +tiny_enter({hash, Hash}, Key, Bloom) -> + {Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash, Key), AddFun = fun(Bit, Arr0) -> add_to_array(Bit, Arr0, 1024) end, lists:foldl(AddFun, Bloom, [Bit0, Bit1, Bit2]). -tiny_check({hash, Hash}, Bloom) -> - {Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash), +tiny_check({hash, Hash}, Key, Bloom) -> + {Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash, Key), case getbit(Bit0, Bloom, 1024) of <<0:1>> -> false; @@ -113,8 +113,9 @@ split_hash(Hash) -> H2 = Hash bsr 20, {H0, H1, H2}. -split_hash_for_tinybloom(Hash) -> +split_hash_for_tinybloom(MagicHash, Key) -> % Tiny bloom can make k=3 from one hash + Hash = MagicHash bxor erlang:phash2(Key), H0 = Hash band 1023, H1 = (Hash bsr 11) band 1023, H2 = (Hash bsr 22) band 1023, @@ -194,8 +195,8 @@ simple_test() -> ?assertMatch(true, FP < (N div 4)). 
tiny_test() -> - N = 256, - K = 32, % more checks out then in K * checks + N = 128, + K = 64, % more checks out than in K * checks KLin = lists:map(fun(X) -> "Key_" ++ integer_to_list(X) ++ integer_to_list(random:uniform(100)) ++ @@ -211,27 +212,29 @@ tiny_test() -> lists:seq(1, N * K)), HashIn = lists:map(fun(X) -> - {hash, leveled_codec:magic_hash(X)} end, + {{hash, leveled_codec:magic_hash(X)}, X} end, KLin), HashOut = lists:map(fun(X) -> - {hash, leveled_codec:magic_hash(X)} end, + {{hash, leveled_codec:magic_hash(X)}, X} end, KLout), SW1 = os:timestamp(), - Bloom = lists:foldr(fun tiny_enter/2, tiny_empty(), HashIn), + Bloom = lists:foldr(fun({H0, K0}, B) -> tiny_enter(H0, K0, B) end, + tiny_empty(), + HashIn), io:format(user, "~nAdding ~w hashes to tiny bloom took ~w microseconds~n", [N, timer:now_diff(os:timestamp(), SW1)]), SW2 = os:timestamp(), - lists:foreach(fun(X) -> - ?assertMatch(true, tiny_check(X, Bloom)) end, HashIn), + lists:foreach(fun({H1, K1}) -> + ?assertMatch(true, tiny_check(H1, K1, Bloom)) end, HashIn), io:format(user, "~nChecking ~w hashes in tiny bloom took ~w microseconds~n", [N, timer:now_diff(os:timestamp(), SW2)]), SW3 = os:timestamp(), - FP = lists:foldr(fun(X, Acc) -> case tiny_check(X, Bloom) of + FP = lists:foldr(fun({H3, K3}, Acc) -> case tiny_check(H3, K3, Bloom) of true -> Acc + 1; false -> Acc end end,