Initial functions and unit tests

Try to replace SFT files with a file format that more natively supports features already in use (e.g. skiplist, tinybloom and magic_hash)
2016-12-23 12:30:58 +00:00 · 2016-12-23 12:30:58 +00:00 · 90e587dcee
commit 90e587dcee
parent 05ddcadbf9
3 changed files with 240 additions and 9 deletions
--- a/src/leveled_skiplist.erl
+++ b/src/leveled_skiplist.erl
@ -18,6 +18,7 @@
 -export([
        from_list/1,
        from_list/2,
+        to_sstlist/1,
        from_sortedlist/1,
        from_sortedlist/2,
        to_list/1,
@ -37,6 +38,7 @@

 -define(SKIP_WIDTH, 16).
 -define(LIST_HEIGHT, 2).
+-define(SST_WIDTH, 8).
 -define(INFINITY_KEY, {null, null, null, null, null}).
 -define(BITARRAY_SIZE, 2048).

@ -94,6 +96,9 @@ from_sortedlist(SortedKVL, BloomProtect) ->
    end,
    {Bloom0, from_list(SortedKVL, ?SKIP_WIDTH, ?LIST_HEIGHT)}.

+%% Build the skiplist used for an SST slot from a list of Key/Value pairs
+%% (presumably already sorted, as the argument name suggests).  The list is
+%% built with ?SST_WIDTH rather than ?SKIP_WIDTH, and the result is tagged
+%% list_only in the first element of the returned tuple.
+to_sstlist(SortedKVL) ->
+    SlotList = from_list(SortedKVL, ?SST_WIDTH, ?LIST_HEIGHT),
+    {list_only, SlotList}.
+
 lookup(Key, SkipList) ->
    case element(1, SkipList) of
        list_only ->
--- a/src/leveled_sst.erl
+++ b/src/leveled_sst.erl
@ -0,0 +1,133 @@
+%% -------- SST (Variant) ---------
+%%
+%% An FSM module intended to wrap a persisted, ordered view of Keys and Values
+%%
+%% The persisted view is built from a list (which may be created by merging
+%% multiple lists)
+
+-module(leveled_sst).
+
+-include("include/leveled.hrl").
+
+-define(SLOT_SIZE, 128).
+-define(COMPRESSION_LEVEL, 1).
+
+-include_lib("eunit/include/eunit.hrl").
+
+%%%============================================================================
+%%% API
+%%%============================================================================
+
+
+
+
+%%%============================================================================
+%%% Internal Functions
+%%%============================================================================
+
+
+%% Build a single slot from no more than ?SLOT_SIZE Key/Value pairs.
+%%
+%% KVList is converted with leveled_skiplist:to_sstlist/1 and serialised with
+%% term_to_binary/2, compressed at ?COMPRESSION_LEVEL.  Every entry of
+%% HashList is entered into a fresh tiny bloom via
+%% leveled_tinybloom:tiny_enter/2.  Returns {SlotBin, Bloom}.
+%%
+%% There is no clause for longer lists, so passing more than ?SLOT_SIZE
+%% pairs fails with function_clause.
+build_slot(KVList, HashList) when length(KVList) =< ?SLOT_SIZE ->
+    SlotBin = term_to_binary(leveled_skiplist:to_sstlist(KVList),
+                                [{compressed, ?COMPRESSION_LEVEL}]),
+    EmptyBloom = leveled_tinybloom:tiny_empty(),
+    Bloom = lists:foldr(fun leveled_tinybloom:tiny_enter/2,
+                        EmptyBloom,
+                        HashList),
+    {SlotBin, Bloom}.
+
+%% Decide whether a slot has to be examined for Hash.  When no bloom is held
+%% for the slot (the atom none) the answer is always true; otherwise the
+%% answer is whatever leveled_tinybloom:tiny_check/2 returns for the bloom.
+is_check_slot_required(Hash, Bloom) ->
+    Bloom =:= none orelse leveled_tinybloom:tiny_check(Hash, Bloom).
+
+%% Look up Key within a slot.  The slot is given either as the serialised
+%% slot binary, or as a {pointer, Handle, Pos, Length} reference which is
+%% resolved through read_slot/3 before the lookup is retried.  Returns the
+%% result of leveled_skiplist:lookup/2 on the deserialised skiplist.
+lookup_in_slot(Key, {pointer, FileHandle, Position, Len}) ->
+    Fetched = read_slot(FileHandle, Position, Len),
+    lookup_in_slot(Key, Fetched);
+lookup_in_slot(Key, SlotBin) ->
+    leveled_skiplist:lookup(Key, binary_to_term(SlotBin)).
+
+%% Fetch a key range from a slot.  The slot is given either as the serialised
+%% slot binary, or as a {pointer, Handle, Pos, Length} reference which is
+%% resolved through read_slot/3 first.  Returns the result of
+%% leveled_skiplist:to_range/3 for StartKey and EndKey on the deserialised
+%% skiplist.
+range_from_slot(StartKey, EndKey, {pointer, FileHandle, Position, Len}) ->
+    Fetched = read_slot(FileHandle, Position, Len),
+    range_from_slot(StartKey, EndKey, Fetched);
+range_from_slot(StartKey, EndKey, SlotBin) ->
+    leveled_skiplist:to_range(binary_to_term(SlotBin), StartKey, EndKey).
+
+%% Return the full contents of a slot.  The slot is given either as the
+%% serialised slot binary, or as a {pointer, Handle, Pos, Length} reference
+%% which is resolved through read_slot/3 first.  Returns the result of
+%% leveled_skiplist:to_list/1 on the deserialised skiplist.
+all_from_slot({pointer, FileHandle, Position, Len}) ->
+    Fetched = read_slot(FileHandle, Position, Len),
+    all_from_slot(Fetched);
+all_from_slot(SlotBin) ->
+    leveled_skiplist:to_list(binary_to_term(SlotBin)).
+
+
+%% Placeholder for fetching a serialised slot identified by Handle, Pos and
+%% Length.  Reading is not implemented yet, so raise an explicit error here:
+%% returning an atom instead would be passed on by the callers to
+%% binary_to_term/1 and only surface there as a misleading badarg.
+read_slot(_Handle, _Pos, _Length) ->
+    erlang:error(not_yet_implemented).
+
+
+%%%============================================================================
+%%% Test
+%%%============================================================================
+
+-ifdef(TEST).
+
+%% Test helper: generate Count random ledger Key/Value pairs, numbering them
+%% from Seqn.  Delegates to generate_randomkeys/5 starting from an empty
+%% accumulator.
+generate_randomkeys(Seqn, Count, BucketRangeLow, BucketRangeHigh) ->
+    EmptyAcc = [],
+    generate_randomkeys(Seqn, Count, EmptyAcc, BucketRangeLow, BucketRangeHigh).
+
+%% Test helper: accumulate Count random ledger Key/Value pairs onto Acc.
+%%
+%% Each pair gets a bucket name "Bucket" ++ a four-digit zero-padded number
+%% and a key name "Key" ++ a four-digit zero-padded random number between 1
+%% and 1000.  When BRange is 0 the bucket number is always BucketLow;
+%% otherwise a random offset between 1 and BRange is added to BucketLow (so
+%% the last argument is the width of the range, not an upper bound).
+%% Sequence numbers start at Seqn and increase by one per pair.  Pairs are
+%% prepended to Acc, so the result is in reverse order of generation.
+%%
+%% NOTE(review): random:uniform/1 is deprecated in favour of rand:uniform/1;
+%% it is left unchanged here so as not to raise the minimum OTP release.
+generate_randomkeys(_Seqn, 0, Acc, _BucketLow, _BucketHigh) ->
+    Acc;
+generate_randomkeys(Seqn, Count, Acc, BucketLow, BRange) ->
+    BNumber =
+        case BRange of
+            0 ->
+                string:right(integer_to_list(BucketLow), 4, $0);
+            _ ->
+                BRand = random:uniform(BRange),
+                string:right(integer_to_list(BucketLow + BRand), 4, $0)
+        end,
+    KNumber = string:right(integer_to_list(random:uniform(1000)), 4, $0),
+    LedgerKey = leveled_codec:to_ledgerkey("Bucket" ++ BNumber,
+                                            "Key" ++ KNumber,
+                                            o),
+    % crypto:strong_rand_bytes/1 replaces crypto:rand_bytes/1, which is no
+    % longer available from OTP 20 onwards
+    {_B, _K, KV} = leveled_codec:generate_ledgerkv(LedgerKey,
+                                                    Seqn,
+                                                    crypto:strong_rand_bytes(64),
+                                                    64,
+                                                    infinity),
+    generate_randomkeys(Seqn + 1,
+                        Count - 1,
+                        [KV|Acc],
+                        BucketLow,
+                        BRange).
+
+
+%% Build one slot from up to 128 unique-keyed random Key/Value pairs, then
+%% confirm that every hash entered passes the bloom check and that every key
+%% can be found (with its value) in the slot binary.  Timings for the build
+%% and for the checks are printed to the user process.
+simple_slotbin_test() ->
+    RandomKVL = generate_randomkeys(1, 256, 1, 4),
+    SlotKVL = lists:sublist(lists:ukeysort(1, RandomKVL), 1, 128),
+    ToHashFun =
+        fun({K, V}) ->
+            {_SQN, H} = leveled_codec:strip_to_seqnhashonly({K, V}),
+            {hash, H}
+        end,
+    HashList = [ToHashFun(KV) || KV <- SlotKVL],
+
+    BuildStart = os:timestamp(),
+    {SlotBin, Bloom} = build_slot(SlotKVL, HashList),
+    BuildTime = timer:now_diff(os:timestamp(), BuildStart),
+    io:format(user, "~nSlot built in ~w microseconds~n", [BuildTime]),
+
+    CheckStart = os:timestamp(),
+    BloomCheckFun =
+        fun(Hash) ->
+            ?assertMatch(true, is_check_slot_required(Hash, Bloom))
+        end,
+    LookupCheckFun =
+        fun({K, V}) ->
+            ?assertMatch({value, V}, lookup_in_slot(K, SlotBin))
+        end,
+    lists:foreach(BloomCheckFun, HashList),
+    lists:foreach(LookupCheckFun, SlotKVL),
+    CheckTime = timer:now_diff(os:timestamp(), CheckStart),
+    io:format(user, "~nSlot checked for all keys in ~w microseconds~n",
+                [CheckTime]).
+
+
+-endif.
--- a/src/leveled_tinybloom.erl
+++ b/src/leveled_tinybloom.erl
@ -2,7 +2,7 @@
 %%
 %% For sheltering relatively expensive lookups with a probabilistic check
 %%
-%% Uses multiple 256 byte blooms.  Can sensibly hold up to 1000 keys per array.
+%% Uses multiple 512 byte blooms.  Can sensibly hold up to 1000 keys per array.
 %% Even at 1000 keys should still offer only a 20% false positive
 %%
 %% Restricted to no more than 256 arrays - so can't handle more than 250K keys
@ -19,9 +19,13 @@
 -export([
        enter/2,
        check/2,
-        empty/1
+        empty/1,
+        tiny_enter/2,
+        tiny_check/2,
+        tiny_empty/0
        ]).      

+
 -include_lib("eunit/include/eunit.hrl").

 %%%============================================================================
@ -39,7 +43,9 @@ enter({hash, Hash}, Bloom) ->
    {H0, Bit1, Bit2} = split_hash(Hash),
    Slot = H0 rem dict:size(Bloom),
    BitArray0 = dict:fetch(Slot, Bloom),
-    BitArray1 = lists:foldl(fun add_to_array/2,
+    FoldFun =
+        fun(K, Arr) -> add_to_array(K, Arr, 4096) end,
+    BitArray1 = lists:foldl(FoldFun,
                                BitArray0,
                                lists:usort([Bit1, Bit2])),
    dict:store(Slot, BitArray1, Bloom);
@ -51,11 +57,11 @@ check({hash, Hash}, Bloom) ->
    {H0, Bit1, Bit2} = split_hash(Hash),
    Slot = H0 rem dict:size(Bloom),
    BitArray = dict:fetch(Slot, Bloom),
-    case getbit(Bit1, BitArray) of
+    case getbit(Bit1, BitArray, 4096) of
        <<0:1>> ->
            false;
        <<1:1>> ->
-            case getbit(Bit2, BitArray) of
+            case getbit(Bit2, BitArray, 4096) of
                <<0:1>> ->
                    false;
                <<1:1>> ->
@ -66,6 +72,37 @@ check(Key, Bloom) ->
    Hash = leveled_codec:magic_hash(Key),
    check({hash, Hash}, Bloom).

+%% Return an empty tiny bloom: a 1024-bit (128 byte) binary with no bits set.
+tiny_empty() ->
+    <<0:(128 * 8)>>.
+
+%% Add a hash to a tiny bloom and return the updated bloom.  A hash of
+%% no_lookup leaves the bloom unchanged.  Otherwise the three bit positions
+%% derived from the hash by split_hash_for_tinybloom/1 are set in the
+%% 1024-bit array.
+tiny_enter({hash, no_lookup}, Bloom) ->
+    Bloom;
+tiny_enter({hash, Hash}, Bloom) ->
+    {PosA, PosB, PosC} = split_hash_for_tinybloom(Hash),
+    % usort drops repeated positions so that each bit is set only once
+    Positions = lists:usort([PosA, PosB, PosC]),
+    lists:foldl(fun(Pos, Acc) -> add_to_array(Pos, Acc, 1024) end,
+                Bloom,
+                Positions).
+
+%% Check a hash against a tiny bloom.  Returns true only when all three bit
+%% positions derived from the hash by split_hash_for_tinybloom/1 are set in
+%% the 1024-bit array, and false as soon as one of them is found unset (the
+%% remaining positions are then not inspected).
+tiny_check({hash, Hash}, Bloom) ->
+    {PosA, PosB, PosC} = split_hash_for_tinybloom(Hash),
+    IsSet = fun(Pos) -> getbit(Pos, Bloom, 1024) =:= <<1:1>> end,
+    IsSet(PosA) andalso IsSet(PosB) andalso IsSet(PosC).
+
+
 %%%============================================================================
 %%% Internal Functions
 %%%============================================================================
@ -76,15 +113,21 @@ split_hash(Hash) ->
    H2 = Hash bsr 20,
    {H0, H1, H2}.

-add_to_array(Bit, BitArray) ->
-    RestLen = 4096 - Bit - 1,
+%% Derive three bit positions from an integer hash, each in the range 0 to
+%% 1023: the lowest ten bits, the next ten bits, and the ten bits after that.
+split_hash_for_tinybloom(Hash) ->
+    Mask = 1023,
+    {Hash band Mask, (Hash bsr 10) band Mask, (Hash bsr 20) band Mask}.
+
+%% Return a copy of BitArray with the bit at offset Bit (counted from the
+%% start of the bitstring, beginning at 0) set to 1.  ArrayLength is the
+%% total number of bits in BitArray; the match below fails with badmatch if
+%% BitArray is not exactly that long.
+add_to_array(Bit, BitArray, ArrayLength) ->
+    RestLen = ArrayLength - Bit - 1,
    <<Head:Bit/bitstring,
        _B:1/bitstring,
        Rest:RestLen/bitstring>> = BitArray,
    <<Head/bitstring, 1:1, Rest/bitstring>>.

-getbit(Bit, BitArray) ->
-    RestLen = 4096 - Bit - 1,
+getbit(Bit, BitArray, ArrayLength) ->
+    RestLen = ArrayLength - Bit - 1,
    <<_Head:Bit/bitstring,
        B:1/bitstring,
        _Rest:RestLen/bitstring>> = BitArray,
@ -148,6 +191,56 @@ simple_test() ->
                    "with ~w false positive rate~n",
                [N, timer:now_diff(os:timestamp(), SW3), FP / N]),
    ?assertMatch(true, FP < (N div 4)).
+
+%% Enter the hashes of N random keys into a tiny bloom and confirm that
+%% every one of them passes tiny_check/2.  Then check the hashes of N * K
+%% differently-prefixed keys that were never entered, and require the number
+%% of false positives to stay below one in eight.  Timings are printed to
+%% the user process.
+tiny_test() ->
+    N = 128,
+    K = 32, % K times more checks for keys left out than for keys put in
+    % crypto:strong_rand_bytes/1 replaces crypto:rand_bytes/1, which is no
+    % longer available from OTP 20 onwards
+    KLin = lists:map(fun(X) -> "Key_" ++
+                                integer_to_list(X) ++
+                                integer_to_list(random:uniform(100)) ++
+                                binary_to_list(crypto:strong_rand_bytes(2))
+                                end,
+                        lists:seq(1, N)),
+    KLout = lists:map(fun(X) ->
+                            "NotKey_" ++
+                            integer_to_list(X) ++
+                            integer_to_list(random:uniform(100)) ++
+                            binary_to_list(crypto:strong_rand_bytes(2))
+                            end,
+                        lists:seq(1, N * K)),
    
+    HashIn = lists:map(fun(X) ->
+                            {hash, leveled_codec:magic_hash(X)} end,
+                            KLin),
+    HashOut = lists:map(fun(X) ->
+                            {hash, leveled_codec:magic_hash(X)} end,
+                            KLout),
+
+    SW1 = os:timestamp(),
+    Bloom = lists:foldr(fun tiny_enter/2, tiny_empty(), HashIn),
+    io:format(user,
+                "~nAdding ~w hashes to tiny bloom took ~w microseconds~n",
+                [N, timer:now_diff(os:timestamp(), SW1)]),
+
+    SW2 = os:timestamp(),
+    lists:foreach(fun(X) ->
+                    ?assertMatch(true, tiny_check(X, Bloom)) end, HashIn),
+    io:format(user,
+                "~nChecking ~w hashes in tiny bloom took ~w microseconds~n",
+                [N, timer:now_diff(os:timestamp(), SW2)]),
+
+    SW3 = os:timestamp(),
+    FP = lists:foldr(fun(X, Acc) -> case tiny_check(X, Bloom) of
+                                        true -> Acc + 1;
+                                        false -> Acc
+                                    end end,
+                        0,
+                        HashOut),
+    io:format(user,
+                "~nChecking ~w hashes out of tiny bloom took ~w microseconds "
+                    ++ "with ~w false positive rate~n",
+                [N * K, timer:now_diff(os:timestamp(), SW3), FP / (N * K)]),
+    ?assertMatch(true, FP < ((N * K) div 8)).
+

 -endif.