Entropy fiddle

Try to increase the effectiveness of the bloom by combining Magic Hash with phash2
2016-12-29 11:59:07 +00:00 · 2016-12-29 11:59:07 +00:00 · 4784f8521a
commit 4784f8521a
parent fb75a26497
2 changed files with 37 additions and 27 deletions
--- a/src/leveled_sst.erl
+++ b/src/leveled_sst.erl
@ -401,13 +401,14 @@ code_change(_OldVsn, StateName, State, _Extra) ->

 fetch(LedgerKey, Hash, State) ->
    Summary = State#state.summary,
-    case leveled_tinybloom:check({hash, Hash}, Summary#summary.bloom) of
+    case leveled_tinybloom:check({hash, Hash},
+                                    Summary#summary.bloom) of
        false ->
            {not_present, summary_bloom, null};
        true ->
            Slot = lookup_slot(LedgerKey, Summary#summary.index),
            SlotBloom = Slot#slot_index_value.bloom,
-            case is_check_slot_required({hash, Hash}, SlotBloom) of
+            case is_check_slot_required({hash, Hash}, LedgerKey, SlotBloom) of
                false ->
                    {not_present, slot_bloom, null};
                true ->
@ -570,7 +571,9 @@ build_table_summary(SlotIndex, AllHashes, Level, FirstKey, L, MaxSQN) ->
            false ->
                element(2, lists:keyfind(default, 1, ?LEVEL_BLOOM_SLOTS))
        end,
-    Bloom = lists:foldr(fun leveled_tinybloom:enter/2,
+    BloomAddFun =
+        fun({H, _K}, Bloom) -> leveled_tinybloom:enter(H, Bloom) end,
+    Bloom = lists:foldr(BloomAddFun,
                            leveled_tinybloom:empty(BloomSlots),
                            AllHashes),
    [{LastKey, _LastV}|_Rest] = SlotIndex,
@ -627,7 +630,7 @@ build_all_slots(KVL, Count, Start, AllHashes, SlotID, SlotIndex, SlotsBin) ->
                no_lookup ->
                    Acc;
                H ->
-                    [{hash, H}|Acc]
+                    [{{hash, H}, K}|Acc]
            end
            end,
    HashList = lists:foldr(ExtractHashFun, [], SlotList),
@ -649,16 +652,18 @@ build_all_slots(KVL, Count, Start, AllHashes, SlotID, SlotIndex, SlotsBin) ->

 build_slot(KVList, HashList) ->
    Tree = gb_trees:from_orddict(KVList),
-    Bloom = lists:foldr(fun leveled_tinybloom:tiny_enter/2,
+    BloomAddFun =
+        fun({H, K}, Bloom) -> leveled_tinybloom:tiny_enter(H, K, Bloom) end,
+    Bloom = lists:foldr(BloomAddFun,
                        leveled_tinybloom:tiny_empty(),
                        HashList),
    SlotBin = term_to_binary(Tree, [{compressed, ?COMPRESSION_LEVEL}]),
    {SlotBin, Bloom}.

-is_check_slot_required(_Hash, none) ->
+is_check_slot_required(_Hash, _Key, none) ->
    true;
-is_check_slot_required(Hash, Bloom) ->
-    leveled_tinybloom:tiny_check(Hash, Bloom).
+is_check_slot_required(Hash, Key, Bloom) ->
+    leveled_tinybloom:tiny_check(Hash, Key, Bloom).

 %% Returns a section from the summary index and two booleans to indicate if
 %% the first slot needs trimming, or the last slot
@ -1030,7 +1035,7 @@ simple_slotbin_test() ->
    ExtractHashFun =
        fun({K, V}) ->
            {_SQN, H} = leveled_codec:strip_to_seqnhashonly({K, V}),
-            {hash, H} end,
+            {{hash, H}, K} end,
    HashList = lists:map(ExtractHashFun, KVList1),
    SW0 = os:timestamp(),
    {SlotBin0, Bloom0} = build_slot(KVList1, HashList),
@ -1038,8 +1043,10 @@ simple_slotbin_test() ->
                [timer:now_diff(os:timestamp(), SW0), byte_size(SlotBin0)]),
    
    SW1 = os:timestamp(),
-    lists:foreach(fun(H) -> ?assertMatch(true,
-                                            is_check_slot_required(H, Bloom0))
+    lists:foreach(fun({H, K}) -> ?assertMatch(true,
+                                            is_check_slot_required(H,
+                                                                    K,
+                                                                    Bloom0))
                                            end,
                    HashList),
    lists:foreach(fun({K, V}) ->
--- a/src/leveled_tinybloom.erl
+++ b/src/leveled_tinybloom.erl
@ -20,8 +20,8 @@
        enter/2,
        check/2,
        empty/1,
-        tiny_enter/2,
-        tiny_check/2,
+        tiny_enter/3,
+        tiny_check/3,
        tiny_empty/0
        ]).      

@ -75,16 +75,16 @@ check(Key, Bloom) ->
 tiny_empty() ->
    <<0:1024>>.

-tiny_enter({hash, no_lookup}, Bloom) ->
+tiny_enter({hash, no_lookup}, _Key, Bloom) ->
    Bloom;
-tiny_enter({hash, Hash}, Bloom) ->
-    {Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash),
+tiny_enter({hash, Hash}, Key, Bloom) ->
+    {Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash, Key),
    AddFun = fun(Bit, Arr0) -> add_to_array(Bit, Arr0, 1024) end,
    lists:foldl(AddFun, Bloom, [Bit0, Bit1, Bit2]).


-tiny_check({hash, Hash}, Bloom) ->
-    {Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash),
+tiny_check({hash, Hash}, Key, Bloom) ->
+    {Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash, Key),
    case getbit(Bit0, Bloom, 1024) of
        <<0:1>> ->
            false;
@ -113,8 +113,9 @@ split_hash(Hash) ->
    H2 = Hash bsr 20,
    {H0, H1, H2}.

-split_hash_for_tinybloom(Hash) ->
+split_hash_for_tinybloom(MagicHash, Key) ->
    % Tiny bloom can make k=3 from one hash
+    Hash = MagicHash bxor erlang:phash2(Key),
    H0 = Hash band 1023,
    H1 = (Hash bsr 11) band 1023,
    H2 = (Hash bsr 22) band 1023,
@ -194,8 +195,8 @@ simple_test() ->
    ?assertMatch(true, FP < (N div 4)).

 tiny_test() ->
-    N = 256,
-    K = 32, % more checks out then in K * checks
+    N = 128,
+    K = 64, % more checks out than in K * checks
    KLin = lists:map(fun(X) -> "Key_" ++
                                integer_to_list(X) ++
                                integer_to_list(random:uniform(100)) ++
@ -211,27 +212,29 @@ tiny_test() ->
                        lists:seq(1, N * K)),
    
    HashIn = lists:map(fun(X) ->
-                            {hash, leveled_codec:magic_hash(X)} end,
+                            {{hash, leveled_codec:magic_hash(X)}, X} end,
                            KLin),
    HashOut = lists:map(fun(X) ->
-                            {hash, leveled_codec:magic_hash(X)} end,
+                            {{hash, leveled_codec:magic_hash(X)}, X} end,
                            KLout),
       
    SW1 = os:timestamp(),
-    Bloom = lists:foldr(fun tiny_enter/2, tiny_empty(), HashIn),
+    Bloom = lists:foldr(fun({H0, K0}, B) -> tiny_enter(H0, K0, B) end,
+                        tiny_empty(),
+                        HashIn),
    io:format(user,
                "~nAdding ~w hashes to tiny bloom took ~w microseconds~n",
                [N, timer:now_diff(os:timestamp(), SW1)]),
    
    SW2 = os:timestamp(),
-    lists:foreach(fun(X) ->
-                    ?assertMatch(true, tiny_check(X, Bloom)) end, HashIn),
+    lists:foreach(fun({H1, K1}) ->
+                    ?assertMatch(true, tiny_check(H1, K1, Bloom)) end, HashIn),
    io:format(user,
                "~nChecking ~w hashes in tiny bloom took ~w microseconds~n",
                [N, timer:now_diff(os:timestamp(), SW2)]),
    
    SW3 = os:timestamp(),
-    FP = lists:foldr(fun(X, Acc) -> case tiny_check(X, Bloom) of
+    FP = lists:foldr(fun({H3, K3}, Acc) -> case tiny_check(H3, K3, Bloom) of
                                        true -> Acc + 1;
                                        false -> Acc
                                    end end,