diff --git a/src/leveled_skiplist.erl b/src/leveled_skiplist.erl index 7fcc81a..dd40590 100644 --- a/src/leveled_skiplist.erl +++ b/src/leveled_skiplist.erl @@ -18,6 +18,7 @@ -export([ from_list/1, from_list/2, + to_sstlist/1, from_sortedlist/1, from_sortedlist/2, to_list/1, @@ -37,6 +38,7 @@ -define(SKIP_WIDTH, 16). -define(LIST_HEIGHT, 2). +-define(SST_WIDTH, 8). -define(INFINITY_KEY, {null, null, null, null, null}). -define(BITARRAY_SIZE, 2048). @@ -94,6 +96,9 @@ from_sortedlist(SortedKVL, BloomProtect) -> end, {Bloom0, from_list(SortedKVL, ?SKIP_WIDTH, ?LIST_HEIGHT)}. +to_sstlist(SortedKVL) -> + {list_only, from_list(SortedKVL, ?SST_WIDTH, ?LIST_HEIGHT)}. + lookup(Key, SkipList) -> case element(1, SkipList) of list_only -> diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl new file mode 100644 index 0000000..713e6c8 --- /dev/null +++ b/src/leveled_sst.erl @@ -0,0 +1,133 @@ +%% -------- SST (Variant) --------- +%% +%% An FSM module intended to wrap a persisted, ordered view of Keys and Values +%% +%% The persisted view is built from a list (which may be created by merging +%% multiple lists) + +-module(leveled_sst). + +-include("include/leveled.hrl"). + +-define(SLOT_SIZE, 128). +-define(COMPRESSION_LEVEL, 1). + +-include_lib("eunit/include/eunit.hrl"). + +%%%============================================================================ +%%% API +%%%============================================================================ + + + + +%%%============================================================================ +%%% Internal Functions +%%%============================================================================ + + +build_slot(KVList, HashList) when length(KVList) =< ?SLOT_SIZE -> + SkipList = leveled_skiplist:to_sstlist(KVList), + Bloom = lists:foldr(fun leveled_tinybloom:tiny_enter/2, + leveled_tinybloom:tiny_empty(), + HashList), + SlotBin = term_to_binary(SkipList, [{compressed, ?COMPRESSION_LEVEL}]), + {SlotBin, Bloom}. 
+ +is_check_slot_required(_Hash, none) -> + true; +is_check_slot_required(Hash, Bloom) -> + leveled_tinybloom:tiny_check(Hash, Bloom). + +lookup_in_slot(Key, {pointer, Handle, Pos, Length}) -> + lookup_in_slot(Key, read_slot(Handle, Pos, Length)); +lookup_in_slot(Key, SlotBin) -> + SkipList = binary_to_term(SlotBin), + leveled_skiplist:lookup(Key, SkipList). + +range_from_slot(StartKey, EndKey, {pointer, Handle, Pos, Length}) -> + range_from_slot(StartKey, EndKey, read_slot(Handle, Pos, Length)); +range_from_slot(StartKey, EndKey, SlotBin) -> + SkipList = binary_to_term(SlotBin), + leveled_skiplist:to_range(SkipList, StartKey, EndKey). + +all_from_slot({pointer, Handle, Pos, Length}) -> + all_from_slot(read_slot(Handle, Pos, Length)); +all_from_slot(SlotBin) -> + SkipList = binary_to_term(SlotBin), + leveled_skiplist:to_list(SkipList). + + +read_slot(_Handle, _Pos, _Length) -> + not_yet_implemented. + + +%%%============================================================================ +%%% Test +%%%============================================================================ + +-ifdef(TEST). + +generate_randomkeys(Seqn, Count, BucketRangeLow, BucketRangeHigh) -> + generate_randomkeys(Seqn, + Count, + [], + BucketRangeLow, + BucketRangeHigh). + +generate_randomkeys(_Seqn, 0, Acc, _BucketLow, _BucketHigh) -> + Acc; +generate_randomkeys(Seqn, Count, Acc, BucketLow, BRange) -> + BNumber = + case BRange of + 0 -> + string:right(integer_to_list(BucketLow), 4, $0); + _ -> + BRand = random:uniform(BRange), + string:right(integer_to_list(BucketLow + BRand), 4, $0) + end, + KNumber = string:right(integer_to_list(random:uniform(1000)), 4, $0), + LedgerKey = leveled_codec:to_ledgerkey("Bucket" ++ BNumber, + "Key" ++ KNumber, + o), + {_B, _K, KV} = leveled_codec:generate_ledgerkv(LedgerKey, + Seqn, + crypto:rand_bytes(64), + 64, + infinity), + generate_randomkeys(Seqn + 1, + Count - 1, + [KV|Acc], + BucketLow, + BRange). 
+ + +simple_slotbin_test() -> + KVList0 = generate_randomkeys(1, 256, 1, 4), + KVList1 = lists:sublist(lists:ukeysort(1, KVList0), 1, 128), + ExtractHashFun = + fun({K, V}) -> + {_SQN, H} = leveled_codec:strip_to_seqnhashonly({K, V}), + {hash, H} end, + HashList = lists:map(ExtractHashFun, KVList1), + + SW0 = os:timestamp(), + {SlotBin0, Bloom0} = build_slot(KVList1, HashList), + io:format(user, "~nSlot built in ~w microseconds~n", + [timer:now_diff(os:timestamp(), SW0)]), + + SW1 = os:timestamp(), + lists:foreach(fun(H) -> ?assertMatch(true, + is_check_slot_required(H, Bloom0)) + end, + HashList), + lists:foreach(fun({K, V}) -> + ?assertMatch({value, V}, + lookup_in_slot(K, SlotBin0)) + end, + KVList1), + io:format(user, "~nSlot checked for all keys in ~w microseconds~n", + [timer:now_diff(os:timestamp(), SW1)]). + + +-endif. \ No newline at end of file diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl index f9212ad..9e76c44 100644 --- a/src/leveled_tinybloom.erl +++ b/src/leveled_tinybloom.erl @@ -2,7 +2,7 @@ %% %% For sheltering relatively expensive lookups with a probabilistic check %% -%% Uses multiple 256 byte blooms. Can sensibly hold up to 1000 keys per array. +%% Uses multiple 512 byte blooms. Can sensibly hold up to 1000 keys per array. %% Even at 1000 keys should still offer only a 20% false positive %% %% Restricted to no more than 256 arrays - so can't handle more than 250K keys @@ -19,9 +19,13 @@ -export([ enter/2, check/2, - empty/1 + empty/1, + tiny_enter/2, + tiny_check/2, + tiny_empty/0 ]). + -include_lib("eunit/include/eunit.hrl"). 
%%%============================================================================ @@ -39,7 +43,9 @@ enter({hash, Hash}, Bloom) -> {H0, Bit1, Bit2} = split_hash(Hash), Slot = H0 rem dict:size(Bloom), BitArray0 = dict:fetch(Slot, Bloom), - BitArray1 = lists:foldl(fun add_to_array/2, + FoldFun = + fun(K, Arr) -> add_to_array(K, Arr, 4096) end, + BitArray1 = lists:foldl(FoldFun, BitArray0, lists:usort([Bit1, Bit2])), dict:store(Slot, BitArray1, Bloom); @@ -51,11 +57,11 @@ check({hash, Hash}, Bloom) -> {H0, Bit1, Bit2} = split_hash(Hash), Slot = H0 rem dict:size(Bloom), BitArray = dict:fetch(Slot, Bloom), - case getbit(Bit1, BitArray) of + case getbit(Bit1, BitArray, 4096) of <<0:1>> -> false; <<1:1>> -> - case getbit(Bit2, BitArray) of + case getbit(Bit2, BitArray, 4096) of <<0:1>> -> false; <<1:1>> -> @@ -66,6 +72,37 @@ check(Key, Bloom) -> Hash = leveled_codec:magic_hash(Key), check({hash, Hash}, Bloom). +tiny_empty() -> + <<0:1024>>. + +tiny_enter({hash, no_lookup}, Bloom) -> + Bloom; +tiny_enter({hash, Hash}, Bloom) -> + {Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash), + FoldFun = + fun(K, Arr) -> add_to_array(K, Arr, 1024) end, + lists:foldl(FoldFun, Bloom, lists:usort([Bit0, Bit1, Bit2])). + +tiny_check({hash, Hash}, Bloom) -> + {Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash), + case getbit(Bit0, Bloom, 1024) of + <<0:1>> -> + false; + <<1:1>> -> + case getbit(Bit1, Bloom, 1024) of + <<0:1>> -> + false; + <<1:1>> -> + case getbit(Bit2, Bloom, 1024) of + <<0:1>> -> + false; + <<1:1>> -> + true + end + end + end. + + %%%============================================================================ %%% Internal Functions %%%============================================================================ @@ -76,15 +113,21 @@ split_hash(Hash) -> H2 = Hash bsr 20, {H0, H1, H2}. 
-add_to_array(Bit, BitArray) -> - RestLen = 4096 - Bit - 1, +split_hash_for_tinybloom(Hash) -> + H0 = Hash band 1023, + H1 = (Hash bsr 10) band 1023, + H2 = (Hash bsr 20) band 1023, + {H0, H1, H2}. + +add_to_array(Bit, BitArray, ArrayLength) -> + RestLen = ArrayLength - Bit - 1, <<Head:Bit/bitstring, _B:1/bitstring, Rest:RestLen/bitstring>> = BitArray, <<Head/bitstring, 1:1, Rest/bitstring>>. -getbit(Bit, BitArray) -> - RestLen = 4096 - Bit - 1, +getbit(Bit, BitArray, ArrayLength) -> + RestLen = ArrayLength - Bit - 1, <<_Head:Bit/bitstring, B:1/bitstring, _Rest:RestLen/bitstring>> = BitArray, @@ -148,6 +191,56 @@ simple_test() -> "with ~w false positive rate~n", [N, timer:now_diff(os:timestamp(), SW3), FP / N]), ?assertMatch(true, FP < (N div 4)). + +tiny_test() -> + N = 128, + K = 32, % more checks out than in K * checks + KLin = lists:map(fun(X) -> "Key_" ++ + integer_to_list(X) ++ + integer_to_list(random:uniform(100)) ++ + binary_to_list(crypto:rand_bytes(2)) + end, + lists:seq(1, N)), + KLout = lists:map(fun(X) -> + "NotKey_" ++ + integer_to_list(X) ++ + integer_to_list(random:uniform(100)) ++ + binary_to_list(crypto:rand_bytes(2)) + end, + lists:seq(1, N * K)), + HashIn = lists:map(fun(X) -> + {hash, leveled_codec:magic_hash(X)} end, + KLin), + HashOut = lists:map(fun(X) -> + {hash, leveled_codec:magic_hash(X)} end, + KLout), + + SW1 = os:timestamp(), + Bloom = lists:foldr(fun tiny_enter/2, tiny_empty(), HashIn), + io:format(user, + "~nAdding ~w hashes to tiny bloom took ~w microseconds~n", + [N, timer:now_diff(os:timestamp(), SW1)]), + + SW2 = os:timestamp(), + lists:foreach(fun(X) -> + ?assertMatch(true, tiny_check(X, Bloom)) end, HashIn), + io:format(user, + "~nChecking ~w hashes in tiny bloom took ~w microseconds~n", + [N, timer:now_diff(os:timestamp(), SW2)]), + + SW3 = os:timestamp(), + FP = lists:foldr(fun(X, Acc) -> case tiny_check(X, Bloom) of + true -> Acc + 1; + false -> Acc + end end, + 0, + HashOut), + io:format(user, + "~nChecking ~w hashes out of tiny bloom took ~w microseconds " + ++ "with ~w false positive rate~n", + [N * K, 
timer:now_diff(os:timestamp(), SW3), FP / (N * K)]), + ?assertMatch(true, FP < ((N * K) div 8)). + -endif. \ No newline at end of file