From 90e587dcee3b83a78835730ae78da5bdc9b862cf Mon Sep 17 00:00:00 2001 From: martinsumner Date: Fri, 23 Dec 2016 12:30:58 +0000 Subject: [PATCH 01/58] Initial functions and unit tests Try to replace SFT files with one that more natively supports features already in use (e.g. skiplist, tinybloom and magic_hash) --- src/leveled_skiplist.erl | 5 ++ src/leveled_sst.erl | 133 ++++++++++++++++++++++++++++++++++++++ src/leveled_tinybloom.erl | 111 ++++++++++++++++++++++++++++--- 3 files changed, 240 insertions(+), 9 deletions(-) create mode 100644 src/leveled_sst.erl diff --git a/src/leveled_skiplist.erl b/src/leveled_skiplist.erl index 7fcc81a..dd40590 100644 --- a/src/leveled_skiplist.erl +++ b/src/leveled_skiplist.erl @@ -18,6 +18,7 @@ -export([ from_list/1, from_list/2, + to_sstlist/1, from_sortedlist/1, from_sortedlist/2, to_list/1, @@ -37,6 +38,7 @@ -define(SKIP_WIDTH, 16). -define(LIST_HEIGHT, 2). +-define(SST_WIDTH, 8). -define(INFINITY_KEY, {null, null, null, null, null}). -define(BITARRAY_SIZE, 2048). @@ -94,6 +96,9 @@ from_sortedlist(SortedKVL, BloomProtect) -> end, {Bloom0, from_list(SortedKVL, ?SKIP_WIDTH, ?LIST_HEIGHT)}. +to_sstlist(SortedKVL) -> + {list_only, from_list(SortedKVL, ?SST_WIDTH, ?LIST_HEIGHT)}. + lookup(Key, SkipList) -> case element(1, SkipList) of list_only -> diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl new file mode 100644 index 0000000..713e6c8 --- /dev/null +++ b/src/leveled_sst.erl @@ -0,0 +1,133 @@ +%% -------- SST (Variant) --------- +%% +%% A FSM module intended to wrap a persisted, ordered view of Keys and Values +%% +%% The persisted view is built from a list (which may be created by merging +%% multiple lists) + +-module(leveled_sst). + +-include("include/leveled.hrl"). + +-define(SLOT_SIZE, 128). +-define(COMPRESSION_LEVEL, 1). + +-include_lib("eunit/include/eunit.hrl"). 
+ +%%%============================================================================ +%%% API +%%%============================================================================ + + + + +%%%============================================================================ +%%% Internal Functions +%%%============================================================================ + + +build_slot(KVList, HashList) when length(KVList) =< ?SLOT_SIZE -> + SkipList = leveled_skiplist:to_sstlist(KVList), + Bloom = lists:foldr(fun leveled_tinybloom:tiny_enter/2, + leveled_tinybloom:tiny_empty(), + HashList), + SlotBin = term_to_binary(SkipList, [{compressed, ?COMPRESSION_LEVEL}]), + {SlotBin, Bloom}. + +is_check_slot_required(_Hash, none) -> + true; +is_check_slot_required(Hash, Bloom) -> + leveled_tinybloom:tiny_check(Hash, Bloom). + +lookup_in_slot(Key, {pointer, Handle, Pos, Length}) -> + lookup_in_slot(Key, read_slot(Handle, Pos, Length)); +lookup_in_slot(Key, SlotBin) -> + SkipList = binary_to_term(SlotBin), + leveled_skiplist:lookup(Key, SkipList). + +range_from_slot(StartKey, EndKey, {pointer, Handle, Pos, Length}) -> + range_from_slot(StartKey, EndKey, read_slot(Handle, Pos, Length)); +range_from_slot(StartKey, EndKey, SlotBin) -> + SkipList = binary_to_term(SlotBin), + leveled_skiplist:to_range(SkipList, StartKey, EndKey). + +all_from_slot({pointer, Handle, Pos, Length}) -> + all_from_slot(read_slot(Handle, Pos, Length)); +all_from_slot(SlotBin) -> + SkipList = binary_to_term(SlotBin), + leveled_skiplist:to_list(SkipList). + + +read_slot(_Handle, _Pos, _Length) -> + not_yet_implemented. + + +%%%============================================================================ +%%% Test +%%%============================================================================ + +-ifdef(TEST). + +generate_randomkeys(Seqn, Count, BucketRangeLow, BucketRangeHigh) -> + generate_randomkeys(Seqn, + Count, + [], + BucketRangeLow, + BucketRangeHigh). 
+ +generate_randomkeys(_Seqn, 0, Acc, _BucketLow, _BucketHigh) -> + Acc; +generate_randomkeys(Seqn, Count, Acc, BucketLow, BRange) -> + BNumber = + case BRange of + 0 -> + string:right(integer_to_list(BucketLow), 4, $0); + _ -> + BRand = random:uniform(BRange), + string:right(integer_to_list(BucketLow + BRand), 4, $0) + end, + KNumber = string:right(integer_to_list(random:uniform(1000)), 4, $0), + LedgerKey = leveled_codec:to_ledgerkey("Bucket" ++ BNumber, + "Key" ++ KNumber, + o), + {_B, _K, KV} = leveled_codec:generate_ledgerkv(LedgerKey, + Seqn, + crypto:rand_bytes(64), + 64, + infinity), + generate_randomkeys(Seqn + 1, + Count - 1, + [KV|Acc], + BucketLow, + BRange). + + +simple_slotbin_test() -> + KVList0 = generate_randomkeys(1, 256, 1, 4), + KVList1 = lists:sublist(lists:ukeysort(1, KVList0), 1, 128), + ExtractHashFun = + fun({K, V}) -> + {_SQN, H} = leveled_codec:strip_to_seqnhashonly({K, V}), + {hash, H} end, + HashList = lists:map(ExtractHashFun, KVList1), + + SW0 = os:timestamp(), + {SlotBin0, Bloom0} = build_slot(KVList1, HashList), + io:format(user, "~nSlot built in ~w microseconds~n", + [timer:now_diff(os:timestamp(), SW0)]), + + SW1 = os:timestamp(), + lists:foreach(fun(H) -> ?assertMatch(true, + is_check_slot_required(H, Bloom0)) + end, + HashList), + lists:foreach(fun({K, V}) -> + ?assertMatch({value, V}, + lookup_in_slot(K, SlotBin0)) + end, + KVList1), + io:format(user, "~nSlot checked for all keys in ~w microseconds~n", + [timer:now_diff(os:timestamp(), SW1)]). + + +-endif. \ No newline at end of file diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl index f9212ad..9e76c44 100644 --- a/src/leveled_tinybloom.erl +++ b/src/leveled_tinybloom.erl @@ -2,7 +2,7 @@ %% %% For sheltering relatively expensive lookups with a probabilistic check %% -%% Uses multiple 256 byte blooms. Can sensibly hold up to 1000 keys per array. +%% Uses multiple 512 byte blooms. Can sensibly hold up to 1000 keys per array. 
%% Even at 1000 keys should still offer only a 20% false positive %% %% Restricted to no more than 256 arrays - so can't handle more than 250K keys @@ -19,9 +19,13 @@ -export([ enter/2, check/2, - empty/1 + empty/1, + tiny_enter/2, + tiny_check/2, + tiny_empty/0 ]). + -include_lib("eunit/include/eunit.hrl"). %%%============================================================================ @@ -39,7 +43,9 @@ enter({hash, Hash}, Bloom) -> {H0, Bit1, Bit2} = split_hash(Hash), Slot = H0 rem dict:size(Bloom), BitArray0 = dict:fetch(Slot, Bloom), - BitArray1 = lists:foldl(fun add_to_array/2, + FoldFun = + fun(K, Arr) -> add_to_array(K, Arr, 4096) end, + BitArray1 = lists:foldl(FoldFun, BitArray0, lists:usort([Bit1, Bit2])), dict:store(Slot, BitArray1, Bloom); @@ -51,11 +57,11 @@ check({hash, Hash}, Bloom) -> {H0, Bit1, Bit2} = split_hash(Hash), Slot = H0 rem dict:size(Bloom), BitArray = dict:fetch(Slot, Bloom), - case getbit(Bit1, BitArray) of + case getbit(Bit1, BitArray, 4096) of <<0:1>> -> false; <<1:1>> -> - case getbit(Bit2, BitArray) of + case getbit(Bit2, BitArray, 4096) of <<0:1>> -> false; <<1:1>> -> @@ -66,6 +72,37 @@ check(Key, Bloom) -> Hash = leveled_codec:magic_hash(Key), check({hash, Hash}, Bloom). +tiny_empty() -> + <<0:1024>>. + +tiny_enter({hash, no_lookup}, Bloom) -> + Bloom; +tiny_enter({hash, Hash}, Bloom) -> + {Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash), + FoldFun = + fun(K, Arr) -> add_to_array(K, Arr, 1024) end, + lists:foldl(FoldFun, Bloom, lists:usort([Bit0, Bit1, Bit2])). + +tiny_check({hash, Hash}, Bloom) -> + {Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash), + case getbit(Bit0, Bloom, 1024) of + <<0:1>> -> + false; + <<1:1>> -> + case getbit(Bit1, Bloom, 1024) of + <<0:1>> -> + false; + <<1:1>> -> + case getbit(Bit2, Bloom, 1024) of + <<0:1>> -> + false; + <<1:1>> -> + true + end + end + end. 
+ + %%%============================================================================ %%% Internal Functions %%%============================================================================ @@ -76,15 +113,21 @@ split_hash(Hash) -> H2 = Hash bsr 20, {H0, H1, H2}. -add_to_array(Bit, BitArray) -> - RestLen = 4096 - Bit - 1, +split_hash_for_tinybloom(Hash) -> + H0 = Hash band 1023, + H1 = (Hash bsr 10) band 1023, + H2 = (Hash bsr 20) band 1023, + {H0, H1, H2}. + +add_to_array(Bit, BitArray, ArrayLength) -> + RestLen = ArrayLength - Bit - 1, <> = BitArray, <>. -getbit(Bit, BitArray) -> - RestLen = 4096 - Bit - 1, +getbit(Bit, BitArray, ArrayLength) -> + RestLen = ArrayLength - Bit - 1, <<_Head:Bit/bitstring, B:1/bitstring, _Rest:RestLen/bitstring>> = BitArray, @@ -148,6 +191,56 @@ simple_test() -> "with ~w false positive rate~n", [N, timer:now_diff(os:timestamp(), SW3), FP / N]), ?assertMatch(true, FP < (N div 4)). + +tiny_test() -> + N = 128, + K = 32, % more checks out then in K * checks + KLin = lists:map(fun(X) -> "Key_" ++ + integer_to_list(X) ++ + integer_to_list(random:uniform(100)) ++ + binary_to_list(crypto:rand_bytes(2)) + end, + lists:seq(1, N)), + KLout = lists:map(fun(X) -> + "NotKey_" ++ + integer_to_list(X) ++ + integer_to_list(random:uniform(100)) ++ + binary_to_list(crypto:rand_bytes(2)) + end, + lists:seq(1, N * K)), + HashIn = lists:map(fun(X) -> + {hash, leveled_codec:magic_hash(X)} end, + KLin), + HashOut = lists:map(fun(X) -> + {hash, leveled_codec:magic_hash(X)} end, + KLout), + + SW1 = os:timestamp(), + Bloom = lists:foldr(fun tiny_enter/2, tiny_empty(), HashIn), + io:format(user, + "~nAdding ~w hashes to tiny bloom took ~w microseconds~n", + [N, timer:now_diff(os:timestamp(), SW1)]), + + SW2 = os:timestamp(), + lists:foreach(fun(X) -> + ?assertMatch(true, tiny_check(X, Bloom)) end, HashIn), + io:format(user, + "~nChecking ~w hashes in tiny bloom took ~w microseconds~n", + [N, timer:now_diff(os:timestamp(), SW2)]), + + SW3 = os:timestamp(), + FP = 
lists:foldr(fun(X, Acc) -> case tiny_check(X, Bloom) of + true -> Acc + 1; + false -> Acc + end end, + 0, + HashOut), + io:format(user, + "~nChecking ~w hashes out of tiny bloom took ~w microseconds " + ++ "with ~w false positive rate~n", + [N * K, timer:now_diff(os:timestamp(), SW3), FP / (N * K)]), + ?assertMatch(true, FP < ((N * K) div 8)). + -endif. \ No newline at end of file From b37f3acb1ed8978b45c32e64445bcbbb04bfd43c Mon Sep 17 00:00:00 2001 From: martinsumner Date: Fri, 23 Dec 2016 12:44:44 +0000 Subject: [PATCH 02/58] Extra timings --- src/leveled_sst.erl | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 713e6c8..7a85d90 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -27,11 +27,18 @@ build_slot(KVList, HashList) when length(KVList) =< ?SLOT_SIZE -> + SW = os:timestamp(), SkipList = leveled_skiplist:to_sstlist(KVList), + io:format(user, "Changed to skiplist in ~w microseconds~n", + [timer:now_diff(os:timestamp(), SW)]), Bloom = lists:foldr(fun leveled_tinybloom:tiny_enter/2, leveled_tinybloom:tiny_empty(), HashList), + io:format(user, "Bloom added in ~w microseconds~n", + [timer:now_diff(os:timestamp(), SW)]), SlotBin = term_to_binary(SkipList, [{compressed, ?COMPRESSION_LEVEL}]), + io:format(user, "Converted to binary in ~w microseconds~n", + [timer:now_diff(os:timestamp(), SW)]), {SlotBin, Bloom}. 
is_check_slot_required(_Hash, none) -> @@ -113,8 +120,8 @@ simple_slotbin_test() -> SW0 = os:timestamp(), {SlotBin0, Bloom0} = build_slot(KVList1, HashList), - io:format(user, "~nSlot built in ~w microseconds~n", - [timer:now_diff(os:timestamp(), SW0)]), + io:format(user, "~nSlot built in ~w microseconds with size ~w~n", + [timer:now_diff(os:timestamp(), SW0), byte_size(SlotBin0)]), SW1 = os:timestamp(), lists:foreach(fun(H) -> ?assertMatch(true, From 60bddbc87436cfebad1dbde12c04cf7318246fe1 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Fri, 23 Dec 2016 13:17:59 +0000 Subject: [PATCH 03/58] More timing - and changes slot width --- src/leveled_sst.erl | 6 +++--- src/leveled_tinybloom.erl | 43 ++++++++++++++++++++++++++++----------- 2 files changed, 34 insertions(+), 15 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 7a85d90..14c6390 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -9,7 +9,7 @@ -include("include/leveled.hrl"). --define(SLOT_SIZE, 128). +-define(SLOT_SIZE, 256). -define(COMPRESSION_LEVEL, 1). -include_lib("eunit/include/eunit.hrl"). @@ -110,8 +110,8 @@ generate_randomkeys(Seqn, Count, Acc, BucketLow, BRange) -> simple_slotbin_test() -> - KVList0 = generate_randomkeys(1, 256, 1, 4), - KVList1 = lists:sublist(lists:ukeysort(1, KVList0), 1, 128), + KVList0 = generate_randomkeys(1, ?SLOT_SIZE * 2, 1, 4), + KVList1 = lists:sublist(lists:ukeysort(1, KVList0), 1, ?SLOT_SIZE), ExtractHashFun = fun({K, V}) -> {_SQN, H} = leveled_codec:strip_to_seqnhashonly({K, V}), diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl index 9e76c44..14aca98 100644 --- a/src/leveled_tinybloom.erl +++ b/src/leveled_tinybloom.erl @@ -73,18 +73,35 @@ check(Key, Bloom) -> check({hash, Hash}, Bloom). tiny_empty() -> - <<0:1024>>. + <<0:2048>>. 
tiny_enter({hash, no_lookup}, Bloom) -> Bloom; tiny_enter({hash, Hash}, Bloom) -> - {Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash), - FoldFun = - fun(K, Arr) -> add_to_array(K, Arr, 1024) end, - lists:foldl(FoldFun, Bloom, lists:usort([Bit0, Bit1, Bit2])). + {Half, Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash), + AddFun = fun(Bit, Arr0) -> add_to_array(Bit, Arr0, 1024) end, + case Half of + 0 -> + <> = Bloom, + NewBin = lists:foldl(AddFun, Bin1, [Bit0, Bit1, Bit2]), + <>; + 1 -> + <> = Bloom, + NewBin = lists:foldl(AddFun, Bin2, [Bit0, Bit1, Bit2]), + <> + end. -tiny_check({hash, Hash}, Bloom) -> - {Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash), +tiny_check({hash, Hash}, FullBloom) -> + {Half, Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash), + Bloom = + case Half of + 0 -> + <> = FullBloom, + Bin1; + 1 -> + <<_Bin1:1024/bitstring, Bin2:1024/bitstring>> = FullBloom, + Bin2 + end, case getbit(Bit0, Bloom, 1024) of <<0:1>> -> false; @@ -114,10 +131,12 @@ split_hash(Hash) -> {H0, H1, H2}. split_hash_for_tinybloom(Hash) -> - H0 = Hash band 1023, - H1 = (Hash bsr 10) band 1023, - H2 = (Hash bsr 20) band 1023, - {H0, H1, H2}. + % Tiny bloom can make k=3 from one hash + Half = Hash band 1, + H0 = (Hash bsr 1) band 1023, + H1 = (Hash bsr 11) band 1023, + H2 = (Hash bsr 21) band 1023, + {Half, H0, H1, H2}. add_to_array(Bit, BitArray, ArrayLength) -> RestLen = ArrayLength - Bit - 1, @@ -193,7 +212,7 @@ simple_test() -> ?assertMatch(true, FP < (N div 4)). 
tiny_test() -> - N = 128, + N = 256, K = 32, % more checks out then in K * checks KLin = lists:map(fun(X) -> "Key_" ++ integer_to_list(X) ++ From b1429a7330040ddd565075b54ba31a3e9926762d Mon Sep 17 00:00:00 2001 From: martinsumner Date: Fri, 23 Dec 2016 16:49:16 +0000 Subject: [PATCH 04/58] Experiment with slot width of 512 --- src/leveled_sst.erl | 2 +- src/leveled_tinybloom.erl | 52 ++++++++++++++++++++++++++++----------- 2 files changed, 38 insertions(+), 16 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 14c6390..6491f48 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -9,7 +9,7 @@ -include("include/leveled.hrl"). --define(SLOT_SIZE, 256). +-define(SLOT_SIZE, 512). -define(COMPRESSION_LEVEL, 1). -include_lib("eunit/include/eunit.hrl"). diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl index 14aca98..a437326 100644 --- a/src/leveled_tinybloom.erl +++ b/src/leveled_tinybloom.erl @@ -73,33 +73,55 @@ check(Key, Bloom) -> check({hash, Hash}, Bloom). tiny_empty() -> - <<0:2048>>. + <<0:4096>>. tiny_enter({hash, no_lookup}, Bloom) -> Bloom; tiny_enter({hash, Hash}, Bloom) -> - {Half, Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash), + {Q, Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash), AddFun = fun(Bit, Arr0) -> add_to_array(Bit, Arr0, 1024) end, - case Half of + case Q of 0 -> - <> = Bloom, + <> = Bloom, NewBin = lists:foldl(AddFun, Bin1, [Bit0, Bit1, Bit2]), <>; 1 -> - <> = Bloom, + <> = Bloom, + NewBin = lists:foldl(AddFun, Bin2, [Bit0, Bit1, Bit2]), + <>; + 2 -> + <> = Bloom, + NewBin = lists:foldl(AddFun, Bin2, [Bit0, Bit1, Bit2]), + <>; + 3 -> + <> = Bloom, NewBin = lists:foldl(AddFun, Bin2, [Bit0, Bit1, Bit2]), <> end. 
tiny_check({hash, Hash}, FullBloom) -> - {Half, Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash), + {Q, Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash), Bloom = - case Half of + case Q of 0 -> - <> = FullBloom, + <> = FullBloom, Bin1; 1 -> - <<_Bin1:1024/bitstring, Bin2:1024/bitstring>> = FullBloom, + <<_Bin1:1024/bitstring, + Bin2:1024/bitstring, + _Bin3:2048/bitstring>> = FullBloom, + Bin2; + 2 -> + <<_Bin1:2048/bitstring, + Bin2:1024/bitstring, + _Bin3:1024/bitstring>> = FullBloom, + Bin2; + 3 -> + <<_Bin1:3072/bitstring, Bin2:1024/bitstring>> = FullBloom, Bin2 end, case getbit(Bit0, Bloom, 1024) of @@ -132,11 +154,11 @@ split_hash(Hash) -> split_hash_for_tinybloom(Hash) -> % Tiny bloom can make k=3 from one hash - Half = Hash band 1, - H0 = (Hash bsr 1) band 1023, - H1 = (Hash bsr 11) band 1023, - H2 = (Hash bsr 21) band 1023, - {Half, H0, H1, H2}. + Q = Hash band 3, + H0 = (Hash bsr 2) band 1023, + H1 = (Hash bsr 12) band 1023, + H2 = (Hash bsr 22) band 1023, + {Q, H0, H1, H2}. add_to_array(Bit, BitArray, ArrayLength) -> RestLen = ArrayLength - Bit - 1, @@ -212,7 +234,7 @@ simple_test() -> ?assertMatch(true, FP < (N div 4)). tiny_test() -> - N = 256, + N = 512, K = 32, % more checks out then in K * checks KLin = lists:map(fun(X) -> "Key_" ++ integer_to_list(X) ++ From 4466210ac8515f1d71dc4dedd520701dc3607d9d Mon Sep 17 00:00:00 2001 From: martinsumner Date: Fri, 23 Dec 2016 17:07:05 +0000 Subject: [PATCH 05/58] Revert back to slot size of 256 Changing the slot size higher has a significant impact on the fetch time, although it allows for more constant time on write. i.e. doubling the size means 5 x cost of read, if only a 10% increase at write time. 
--- src/leveled_skiplist.erl | 2 +- src/leveled_sst.erl | 2 +- src/leveled_tinybloom.erl | 42 ++++++++++----------------------------- 3 files changed, 12 insertions(+), 34 deletions(-) diff --git a/src/leveled_skiplist.erl b/src/leveled_skiplist.erl index dd40590..7505a61 100644 --- a/src/leveled_skiplist.erl +++ b/src/leveled_skiplist.erl @@ -38,7 +38,7 @@ -define(SKIP_WIDTH, 16). -define(LIST_HEIGHT, 2). --define(SST_WIDTH, 8). +-define(SST_WIDTH, 16). -define(INFINITY_KEY, {null, null, null, null, null}). -define(BITARRAY_SIZE, 2048). diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 6491f48..14c6390 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -9,7 +9,7 @@ -include("include/leveled.hrl"). --define(SLOT_SIZE, 512). +-define(SLOT_SIZE, 256). -define(COMPRESSION_LEVEL, 1). -include_lib("eunit/include/eunit.hrl"). diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl index a437326..8868479 100644 --- a/src/leveled_tinybloom.erl +++ b/src/leveled_tinybloom.erl @@ -73,7 +73,7 @@ check(Key, Bloom) -> check({hash, Hash}, Bloom). tiny_empty() -> - <<0:4096>>. + <<0:2048>>. tiny_enter({hash, no_lookup}, Bloom) -> Bloom; @@ -81,24 +81,12 @@ tiny_enter({hash, Hash}, Bloom) -> {Q, Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash), AddFun = fun(Bit, Arr0) -> add_to_array(Bit, Arr0, 1024) end, case Q of - 0 -> - <> = Bloom, + N when N < 2 -> + <> = Bloom, NewBin = lists:foldl(AddFun, Bin1, [Bit0, Bit1, Bit2]), <>; - 1 -> - <> = Bloom, - NewBin = lists:foldl(AddFun, Bin2, [Bit0, Bit1, Bit2]), - <>; - 2 -> - <> = Bloom, - NewBin = lists:foldl(AddFun, Bin2, [Bit0, Bit1, Bit2]), - <>; - 3 -> - <> = Bloom, + _N -> + <> = Bloom, NewBin = lists:foldl(AddFun, Bin2, [Bit0, Bit1, Bit2]), <> end. 
@@ -107,21 +95,11 @@ tiny_check({hash, Hash}, FullBloom) -> {Q, Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash), Bloom = case Q of - 0 -> - <> = FullBloom, + N when N < 2 -> + <> = FullBloom, Bin1; - 1 -> - <<_Bin1:1024/bitstring, - Bin2:1024/bitstring, - _Bin3:2048/bitstring>> = FullBloom, - Bin2; - 2 -> - <<_Bin1:2048/bitstring, - Bin2:1024/bitstring, - _Bin3:1024/bitstring>> = FullBloom, - Bin2; - 3 -> - <<_Bin1:3072/bitstring, Bin2:1024/bitstring>> = FullBloom, + _N -> + <<_Bin1:1024/bitstring, Bin2:1024/bitstring>> = FullBloom, Bin2 end, case getbit(Bit0, Bloom, 1024) of @@ -234,7 +212,7 @@ simple_test() -> ?assertMatch(true, FP < (N div 4)). tiny_test() -> - N = 512, + N = 256, K = 32, % more checks out then in K * checks KLin = lists:map(fun(X) -> "Key_" ++ integer_to_list(X) ++ From 2d0881644567116df064599e7b37e9d266cba8ca Mon Sep 17 00:00:00 2001 From: martinsumner Date: Fri, 23 Dec 2016 18:08:22 +0000 Subject: [PATCH 06/58] Confirm timings --- src/leveled_sst.erl | 73 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 61 insertions(+), 12 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 14c6390..bd0a7f8 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -9,11 +9,17 @@ -include("include/leveled.hrl"). --define(SLOT_SIZE, 256). +-define(SLOT_SIZE, 128). -define(COMPRESSION_LEVEL, 1). -include_lib("eunit/include/eunit.hrl"). +-record(slot_index_value, {slot_id :: integer(), + bloom :: dict:dict(), + cache :: tuple(), + start_position :: integer(), + length :: integer()}). + %%%============================================================================ %%% API %%%============================================================================ @@ -25,20 +31,45 @@ %%% Internal Functions %%%============================================================================ +build_all_slots(KVList, BasePosition) -> + build_all_slots(KVList, BasePosition, [], 1, []). 
+ +build_all_slots([], _Start, AllHashes, _SlotID, SlotIndex) -> + {SlotIndex, AllHashes}; +build_all_slots(KVList, StartPosition, AllHashes, SlotID, SlotIndex) -> + {SlotList, KVRem} = lists:split(?SLOT_SIZE, KVList), + {LastKey, _V} = lists:tail(SlotList), + ExtractHashFun = + fun({K, V}, Acc) -> + {_SQN, H} = leveled_codec:strip_to_seqnhashonly({K, V}), + case H of + no_lookup -> + Acc; + H -> + [{hash, H}|Acc] + end + end, + HashList = lists:foldr(ExtractHashFun, [], KVList), + {SlotBin, Bloom} = build_slot(KVList, HashList), + Length = byte_size(SlotBin), + SlotIndexV = #slot_index_value{slot_id = SlotID, + bloom = Bloom, + start_position = StartPosition, + length = Length}, + build_all_slots(KVRem, + StartPosition + Length, + HashList ++ AllHashes, + SlotID + 1, + [{LastKey, SlotIndexV}|SlotIndex]). + + build_slot(KVList, HashList) when length(KVList) =< ?SLOT_SIZE -> - SW = os:timestamp(), SkipList = leveled_skiplist:to_sstlist(KVList), - io:format(user, "Changed to skiplist in ~w microseconds~n", - [timer:now_diff(os:timestamp(), SW)]), Bloom = lists:foldr(fun leveled_tinybloom:tiny_enter/2, leveled_tinybloom:tiny_empty(), HashList), - io:format(user, "Bloom added in ~w microseconds~n", - [timer:now_diff(os:timestamp(), SW)]), SlotBin = term_to_binary(SkipList, [{compressed, ?COMPRESSION_LEVEL}]), - io:format(user, "Converted to binary in ~w microseconds~n", - [timer:now_diff(os:timestamp(), SW)]), {SlotBin, Bloom}. 
is_check_slot_required(_Hash, none) -> @@ -117,10 +148,10 @@ simple_slotbin_test() -> {_SQN, H} = leveled_codec:strip_to_seqnhashonly({K, V}), {hash, H} end, HashList = lists:map(ExtractHashFun, KVList1), - + io:format(user, "~nSkiplist with bloom timing test~n", []), SW0 = os:timestamp(), {SlotBin0, Bloom0} = build_slot(KVList1, HashList), - io:format(user, "~nSlot built in ~w microseconds with size ~w~n", + io:format(user, "Slot built in ~w microseconds with size ~w~n", [timer:now_diff(os:timestamp(), SW0), byte_size(SlotBin0)]), SW1 = os:timestamp(), @@ -133,8 +164,26 @@ simple_slotbin_test() -> lookup_in_slot(K, SlotBin0)) end, KVList1), - io:format(user, "~nSlot checked for all keys in ~w microseconds~n", - [timer:now_diff(os:timestamp(), SW1)]). + io:format(user, "Slot checked for all keys in ~w microsconds~n", + [timer:now_diff(os:timestamp(), SW1)]), + + io:format(user, "~ngb_tree comparison~n", []), + SW2 = os:timestamp(), + Bloom = lists:foldr(fun leveled_tinybloom:tiny_enter/2, + leveled_tinybloom:tiny_empty(), + HashList), + Tree0 = gb_trees:from_orddict(KVList1), + TreeBin = term_to_binary(Tree0, [{compressed, ?COMPRESSION_LEVEL}]), + io:format(user, "Bloom and Tree created for all keys in ~w microsconds~n", + [timer:now_diff(os:timestamp(), SW2)]), + SW3 = os:timestamp(), + lists:foreach(fun({K, V}) -> + ?assertMatch({value, V}, + gb_trees:lookup(K, binary_to_term(TreeBin))) + end, + KVList1), + io:format(user, "Tree checked for all keys in ~w microsconds~n", + [timer:now_diff(os:timestamp(), SW3)]). -endif. 
\ No newline at end of file From 0cea470b70100c4fd9657672a82430c0eea3803f Mon Sep 17 00:00:00 2001 From: martinsumner Date: Fri, 23 Dec 2016 23:30:15 +0000 Subject: [PATCH 07/58] Share final timing test --- src/leveled_sst.erl | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index bd0a7f8..f728643 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -164,8 +164,13 @@ simple_slotbin_test() -> lookup_in_slot(K, SlotBin0)) end, KVList1), - io:format(user, "Slot checked for all keys in ~w microsconds~n", + io:format(user, "Slot checked for all keys in ~w microseconds~n", [timer:now_diff(os:timestamp(), SW1)]), + SW5 = os:timestamp(), + leveled_skiplist:to_list(binary_to_term(SlotBin0)), + io:format(user, "Skiplist flattened in ~w microseconds~n", + [timer:now_diff(os:timestamp(), SW5)]), + io:format(user, "~ngb_tree comparison~n", []), SW2 = os:timestamp(), @@ -174,16 +179,21 @@ simple_slotbin_test() -> HashList), Tree0 = gb_trees:from_orddict(KVList1), TreeBin = term_to_binary(Tree0, [{compressed, ?COMPRESSION_LEVEL}]), - io:format(user, "Bloom and Tree created for all keys in ~w microsconds~n", - [timer:now_diff(os:timestamp(), SW2)]), + io:format(user, "Bloom and Tree created for all keys in ~w microseconds " ++ + "with size ~w~n", + [timer:now_diff(os:timestamp(), SW2), byte_size(TreeBin)]), SW3 = os:timestamp(), lists:foreach(fun({K, V}) -> ?assertMatch({value, V}, gb_trees:lookup(K, binary_to_term(TreeBin))) end, KVList1), - io:format(user, "Tree checked for all keys in ~w microsconds~n", - [timer:now_diff(os:timestamp(), SW3)]). + io:format(user, "Tree checked for all keys in ~w microseconds~n", + [timer:now_diff(os:timestamp(), SW3)]), + SW4 = os:timestamp(), + gb_trees:to_list(binary_to_term(TreeBin)), + io:format(user, "Tree flattened in ~w microseconds~n", + [timer:now_diff(os:timestamp(), SW4)]). -endif. 
\ No newline at end of file From b1a3b4ad137b94646acebe690ea9b1ec932f5911 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Sat, 24 Dec 2016 00:02:06 +0000 Subject: [PATCH 08/58] Switch slot to gb_trees and size of 128 --- src/leveled_sst.erl | 82 ++++++++++++++++++--------------------- src/leveled_tinybloom.erl | 28 +++---------- 2 files changed, 43 insertions(+), 67 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index f728643..948972f 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -3,7 +3,34 @@ %% A FSM module intended to wrap a persisted, ordered view of Keys and Values %% %% The persisted view is built from a list (which may be created by merging -%% multiple lists) +%% multiple lists). +%% +%% +%% -------- Slots --------- +%% +%% The view is built from sublists referred to as slot. Each slot is up to 128 +%% keys and values in size. The slots are each themselves a gb_tree. The +%% gb_tree is slightly slower than the skiplist at fetch time, and doesn't +%% support directly the useful to_range function. However the from_orddict +%% capability is much faster than from_sortedlist in skiplist, saving on CPU +%% at sst build time: +%% +%% Skiplist: +%% build and serialise slot 3233 microseconds +%% de-serialise and check * 128 - 14669 microseconds +%% flatten back to list - 164 microseconds +%% +%% GBTree: +%% build and serialise tree 402 microseconds +%% de-serialise and check * 128 - 15263 microseconds +%% flatten back to list - 175 microseconds +%% +%% The performance advantage at lookup time is no negligible as the time to +%% de-deserialise for each check is dominant. This time grows linearly with +%% the size of the slot, wherease the serialisation time is relatively constant +%% with growth. So bigger slots would be quicker to build, but the penalty for +%% that speed is too high at lookup time. + -module(leveled_sst). 
@@ -16,7 +43,6 @@ -record(slot_index_value, {slot_id :: integer(), bloom :: dict:dict(), - cache :: tuple(), start_position :: integer(), length :: integer()}). @@ -63,13 +89,12 @@ build_all_slots(KVList, StartPosition, AllHashes, SlotID, SlotIndex) -> [{LastKey, SlotIndexV}|SlotIndex]). - build_slot(KVList, HashList) when length(KVList) =< ?SLOT_SIZE -> - SkipList = leveled_skiplist:to_sstlist(KVList), + Tree = gb_trees:from_orddict(KVList), Bloom = lists:foldr(fun leveled_tinybloom:tiny_enter/2, leveled_tinybloom:tiny_empty(), HashList), - SlotBin = term_to_binary(SkipList, [{compressed, ?COMPRESSION_LEVEL}]), + SlotBin = term_to_binary(Tree, [{compressed, ?COMPRESSION_LEVEL}]), {SlotBin, Bloom}. is_check_slot_required(_Hash, none) -> @@ -80,20 +105,14 @@ is_check_slot_required(Hash, Bloom) -> lookup_in_slot(Key, {pointer, Handle, Pos, Length}) -> lookup_in_slot(Key, read_slot(Handle, Pos, Length)); lookup_in_slot(Key, SlotBin) -> - SkipList = binary_to_term(SlotBin), - leveled_skiplist:lookup(Key, SkipList). - -range_from_slot(StartKey, EndKey, {pointer, Handle, Pos, Length}) -> - range_from_slot(StartKey, EndKey, read_slot(Handle, Pos, Length)); -range_from_slot(StartKey, EndKey, SlotBin) -> - SkipList = binary_to_term(SlotBin), - leveled_skiplist:to_range(SkipList, StartKey, EndKey). + Tree = binary_to_term(SlotBin), + gb_trees:lookup(Key, Tree). all_from_slot({pointer, Handle, Pos, Length}) -> all_from_slot(read_slot(Handle, Pos, Length)); all_from_slot(SlotBin) -> SkipList = binary_to_term(SlotBin), - leveled_skiplist:to_list(SkipList). + gb_trees:to_list(SkipList). 
read_slot(_Handle, _Pos, _Length) -> @@ -148,7 +167,6 @@ simple_slotbin_test() -> {_SQN, H} = leveled_codec:strip_to_seqnhashonly({K, V}), {hash, H} end, HashList = lists:map(ExtractHashFun, KVList1), - io:format(user, "~nSkiplist with bloom timing test~n", []), SW0 = os:timestamp(), {SlotBin0, Bloom0} = build_slot(KVList1, HashList), io:format(user, "Slot built in ~w microseconds with size ~w~n", @@ -166,34 +184,10 @@ simple_slotbin_test() -> KVList1), io:format(user, "Slot checked for all keys in ~w microseconds~n", [timer:now_diff(os:timestamp(), SW1)]), - SW5 = os:timestamp(), - leveled_skiplist:to_list(binary_to_term(SlotBin0)), - io:format(user, "Skiplist flattened in ~w microseconds~n", - [timer:now_diff(os:timestamp(), SW5)]), - - - io:format(user, "~ngb_tree comparison~n", []), SW2 = os:timestamp(), - Bloom = lists:foldr(fun leveled_tinybloom:tiny_enter/2, - leveled_tinybloom:tiny_empty(), - HashList), - Tree0 = gb_trees:from_orddict(KVList1), - TreeBin = term_to_binary(Tree0, [{compressed, ?COMPRESSION_LEVEL}]), - io:format(user, "Bloom and Tree created for all keys in ~w microseconds " ++ - "with size ~w~n", - [timer:now_diff(os:timestamp(), SW2), byte_size(TreeBin)]), - SW3 = os:timestamp(), - lists:foreach(fun({K, V}) -> - ?assertMatch({value, V}, - gb_trees:lookup(K, binary_to_term(TreeBin))) - end, - KVList1), - io:format(user, "Tree checked for all keys in ~w microseconds~n", - [timer:now_diff(os:timestamp(), SW3)]), - SW4 = os:timestamp(), - gb_trees:to_list(binary_to_term(TreeBin)), - io:format(user, "Tree flattened in ~w microseconds~n", - [timer:now_diff(os:timestamp(), SW4)]). - - + ?assertMatch(KVList1, all_from_slot(SlotBin0)), + io:format(user, "Slot flattened in ~w microseconds~n", + [timer:now_diff(os:timestamp(), SW2)]). + + -endif. 
\ No newline at end of file diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl index 8868479..a0af8a5 100644 --- a/src/leveled_tinybloom.erl +++ b/src/leveled_tinybloom.erl @@ -73,35 +73,17 @@ check(Key, Bloom) -> check({hash, Hash}, Bloom). tiny_empty() -> - <<0:2048>>. + <<0:1024>>. tiny_enter({hash, no_lookup}, Bloom) -> Bloom; tiny_enter({hash, Hash}, Bloom) -> - {Q, Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash), + {_Q, Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash), AddFun = fun(Bit, Arr0) -> add_to_array(Bit, Arr0, 1024) end, - case Q of - N when N < 2 -> - <> = Bloom, - NewBin = lists:foldl(AddFun, Bin1, [Bit0, Bit1, Bit2]), - <>; - _N -> - <> = Bloom, - NewBin = lists:foldl(AddFun, Bin2, [Bit0, Bit1, Bit2]), - <> - end. + lists:foldl(AddFun, Bloom, [Bit0, Bit1, Bit2]). -tiny_check({hash, Hash}, FullBloom) -> - {Q, Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash), - Bloom = - case Q of - N when N < 2 -> - <> = FullBloom, - Bin1; - _N -> - <<_Bin1:1024/bitstring, Bin2:1024/bitstring>> = FullBloom, - Bin2 - end, +tiny_check({hash, Hash}, Bloom) -> + {_Q, Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash), case getbit(Bit0, Bloom, 1024) of <<0:1>> -> false; From 4f838f6f88fad1c98c6e8991b79f41b6429e8d29 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Sat, 24 Dec 2016 00:41:50 +0000 Subject: [PATCH 09/58] Settled on sizes Also removed length check due to warning in Erlang guidance about non-constant time nature of this command. Intend to remove lengths from elsewhere (especially when used simply for logging). 
--- src/leveled_skiplist.erl | 5 ----- src/leveled_sst.erl | 48 +++++++++++++++++++++++++++++++++++----- 2 files changed, 42 insertions(+), 11 deletions(-) diff --git a/src/leveled_skiplist.erl b/src/leveled_skiplist.erl index 7505a61..7fcc81a 100644 --- a/src/leveled_skiplist.erl +++ b/src/leveled_skiplist.erl @@ -18,7 +18,6 @@ -export([ from_list/1, from_list/2, - to_sstlist/1, from_sortedlist/1, from_sortedlist/2, to_list/1, @@ -38,7 +37,6 @@ -define(SKIP_WIDTH, 16). -define(LIST_HEIGHT, 2). --define(SST_WIDTH, 16). -define(INFINITY_KEY, {null, null, null, null, null}). -define(BITARRAY_SIZE, 2048). @@ -96,9 +94,6 @@ from_sortedlist(SortedKVL, BloomProtect) -> end, {Bloom0, from_list(SortedKVL, ?SKIP_WIDTH, ?LIST_HEIGHT)}. -to_sstlist(SortedKVL) -> - {list_only, from_list(SortedKVL, ?SST_WIDTH, ?LIST_HEIGHT)}. - lookup(Key, SkipList) -> case element(1, SkipList) of list_only -> diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 948972f..5d747ec 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -3,8 +3,7 @@ %% A FSM module intended to wrap a persisted, ordered view of Keys and Values %% %% The persisted view is built from a list (which may be created by merging -%% multiple lists). -%% +%% multiple lists). The list is built first, then the view is created in bulk. %% %% -------- Slots --------- %% @@ -21,7 +20,7 @@ %% flatten back to list - 164 microseconds %% %% GBTree: -%% build and serialise tree 402 microseconds +%% build and serialise tree 1433 microseconds %% de-serialise and check * 128 - 15263 microseconds %% flatten back to list - 175 microseconds %% @@ -30,6 +29,28 @@ %% the size of the slot, wherease the serialisation time is relatively constant %% with growth. So bigger slots would be quicker to build, but the penalty for %% that speed is too high at lookup time. +%% +%% -------- Blooms --------- +%% +%% There are two different tiny blooms for each table. 
One is split by the +%% first byte of the hash, and consists of two hashes (derived from the +%% remainder of the hash). This is the top bloom, and the size vaires by +%% level. +%% Level 0 has 8 bits per key - 0.05 fpr +%% Level 1 has 6 bits per key - 0.08 fpr +%% Other Levels have 4 bits per key - 0.15 fpr +%% +%% If this level is passed, then each slot has its own bloom based on the +%% same hash, but now split into three hashes and having a fixed 8 bit per +%% key size at all levels. +%% Slot Bloom has 8 bits per key - 0.03 fpr +%% +%% All blooms are base don the DJ Bernstein magic hash which proved to give +%% the predicted fpr in tests (unlike phash2 which has significantly higher +%% fpr). Due to the cost of producing the magic hash, it is read from the +%% value not reproduced each time. If the value is set to no_lookup no bloom +%% entry is added, and if all hashes are no_lookup in the slot then no bloom +%% is produced. -module(leveled_sst). @@ -38,6 +59,7 @@ -define(SLOT_SIZE, 128). -define(COMPRESSION_LEVEL, 1). +-define(LEVEL_BLOOM_SLOTS, [{0, 64}, {1, 48}, {default, 32}]). -include_lib("eunit/include/eunit.hrl"). @@ -57,6 +79,20 @@ %%% Internal Functions %%%============================================================================ +build_table_summary(SlotIndex, AllHashes, Level) -> + BloomSlots = + case lists:keyfind(Level, ?LEVEL_BLOOM_SLOTS) of + {Level, N} -> + N; + false -> + element(2, lists:keyfind(default, ?LEVEL_BLOOM_SLOTS)) + end, + Bloom = lists:foldr(fun leveled_tinybloom:enter/2, + leveled_bloom:empty(BloomSlots), + AllHashes), + SkipSlot = leveled_skiplist:from_sortedlist(lists:reverse(SlotIndex)), + term_to_binary({SkipSlot, Bloom}, [{comprressed, ?COMPRESSION_LEVEL}]). + build_all_slots(KVList, BasePosition) -> build_all_slots(KVList, BasePosition, [], 1, []). 
@@ -75,8 +111,8 @@ build_all_slots(KVList, StartPosition, AllHashes, SlotID, SlotIndex) -> [{hash, H}|Acc] end end, - HashList = lists:foldr(ExtractHashFun, [], KVList), - {SlotBin, Bloom} = build_slot(KVList, HashList), + HashList = lists:foldr(ExtractHashFun, [], SlotList), + {SlotBin, Bloom} = build_slot(SlotList, HashList), Length = byte_size(SlotBin), SlotIndexV = #slot_index_value{slot_id = SlotID, bloom = Bloom, @@ -89,7 +125,7 @@ build_all_slots(KVList, StartPosition, AllHashes, SlotID, SlotIndex) -> [{LastKey, SlotIndexV}|SlotIndex]). -build_slot(KVList, HashList) when length(KVList) =< ?SLOT_SIZE -> +build_slot(KVList, HashList) -> Tree = gb_trees:from_orddict(KVList), Bloom = lists:foldr(fun leveled_tinybloom:tiny_enter/2, leveled_tinybloom:tiny_empty(), From cb654b132512057b914552d7fa06131b5c82bd7b Mon Sep 17 00:00:00 2001 From: martinsumner Date: Sat, 24 Dec 2016 01:23:40 +0000 Subject: [PATCH 10/58] Build the table summary The table summary will be a skiplist, and this and the slot binary will be CRC checked --- src/leveled_sst.erl | 65 +++++++++++++++++++++++++++++++++------------ 1 file changed, 48 insertions(+), 17 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 5d747ec..9482030 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -81,26 +81,49 @@ build_table_summary(SlotIndex, AllHashes, Level) -> BloomSlots = - case lists:keyfind(Level, ?LEVEL_BLOOM_SLOTS) of + case lists:keyfind(Level, 1, ?LEVEL_BLOOM_SLOTS) of {Level, N} -> N; false -> - element(2, lists:keyfind(default, ?LEVEL_BLOOM_SLOTS)) + element(2, lists:keyfind(default, 1, ?LEVEL_BLOOM_SLOTS)) end, Bloom = lists:foldr(fun leveled_tinybloom:enter/2, - leveled_bloom:empty(BloomSlots), + leveled_tinybloom:empty(BloomSlots), AllHashes), SkipSlot = leveled_skiplist:from_sortedlist(lists:reverse(SlotIndex)), - term_to_binary({SkipSlot, Bloom}, [{comprressed, ?COMPRESSION_LEVEL}]). 
+ SummBin = term_to_binary({SkipSlot, Bloom}, + [{compressed, ?COMPRESSION_LEVEL}]), + SummCRC = erlang:crc32(SummBin), + <>. + +read_table_summary(BinWithCheck) -> + <> = BinWithCheck, + CRCCheck = erlang:crc32(SummBin), + if + CRCCheck == SummCRC -> + % If not might it should be possible to rebuild from all the slots + binary_to_term(SummBin) + end. build_all_slots(KVList, BasePosition) -> - build_all_slots(KVList, BasePosition, [], 1, []). + L = length(KVList), + % The length is not a constant time command and the list may be large, + % but otherwise lenght must be called each iteration to avoid exception + % on split or sublist + SlotCount = L div ?SLOT_SIZE, + build_all_slots(KVList, SlotCount, BasePosition, [], 1, [], <<>>). -build_all_slots([], _Start, AllHashes, _SlotID, SlotIndex) -> - {SlotIndex, AllHashes}; -build_all_slots(KVList, StartPosition, AllHashes, SlotID, SlotIndex) -> - {SlotList, KVRem} = lists:split(?SLOT_SIZE, KVList), - {LastKey, _V} = lists:tail(SlotList), +build_all_slots([], _Count, _Start, AllHashes, _SlotID, SlotIndex, SlotsBin) -> + {SlotIndex, AllHashes, SlotsBin}; +build_all_slots(KVL, Count, Start, AllHashes, SlotID, SlotIndex, SlotsBin) -> + {SlotList, KVRem} = + case Count of + 0 -> + {lists:sublist(KVL, ?SLOT_SIZE), []}; + _N -> + lists:split(?SLOT_SIZE, KVL) + end, + {LastKey, _V} = lists:last(SlotList), ExtractHashFun = fun({K, V}, Acc) -> {_SQN, H} = leveled_codec:strip_to_seqnhashonly({K, V}), @@ -113,16 +136,19 @@ build_all_slots(KVList, StartPosition, AllHashes, SlotID, SlotIndex) -> end, HashList = lists:foldr(ExtractHashFun, [], SlotList), {SlotBin, Bloom} = build_slot(SlotList, HashList), - Length = byte_size(SlotBin), + SlotCRC = erlang:crc32(SlotBin), + Length = byte_size(SlotBin) + 4, SlotIndexV = #slot_index_value{slot_id = SlotID, bloom = Bloom, - start_position = StartPosition, + start_position = Start, length = Length}, build_all_slots(KVRem, - StartPosition + Length, + Count - 1, + Start + Length, HashList ++ 
AllHashes, SlotID + 1, - [{LastKey, SlotIndexV}|SlotIndex]). + [{LastKey, SlotIndexV}|SlotIndex], + <>). build_slot(KVList, HashList) -> @@ -150,9 +176,8 @@ all_from_slot(SlotBin) -> SkipList = binary_to_term(SlotBin), gb_trees:to_list(SkipList). - read_slot(_Handle, _Pos, _Length) -> - not_yet_implemented. + not_implemented. %%%============================================================================ @@ -225,5 +250,11 @@ simple_slotbin_test() -> io:format(user, "Slot flattened in ~w microseconds~n", [timer:now_diff(os:timestamp(), SW2)]). - + +simple_slotbinsummary_test() -> + KVList0 = generate_randomkeys(1, ?SLOT_SIZE * 8 + 100, 1, 4), + KVList1 = lists:ukeysort(1, KVList0), + {SlotIndex, AllHashes, SlotsBin} = build_all_slots(KVList1, 0), + _SummaryBin = build_table_summary(SlotIndex, AllHashes, 2). + -endif. \ No newline at end of file From 58d8e60994661ea00b9e3179993912e0755c34d2 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Sat, 24 Dec 2016 15:12:24 +0000 Subject: [PATCH 11/58] Some basic code layout work --- src/leveled_log.erl | 112 +++++++++++---- src/leveled_skiplist.erl | 50 +++++++ src/leveled_sst.erl | 289 +++++++++++++++++++++++++++++++++++--- src/leveled_tinybloom.erl | 1 + 4 files changed, 405 insertions(+), 47 deletions(-) diff --git a/src/leveled_log.erl b/src/leveled_log.erl index e06118b..635ecc8 100644 --- a/src/leveled_log.erl +++ b/src/leveled_log.erl @@ -11,11 +11,13 @@ log_timer/3, put_timing/4, head_timing/4, - get_timing/3]). + get_timing/3, + sst_timing/3]). --define(PUT_TIMING_LOGPOINT, 20000). --define(HEAD_TIMING_LOGPOINT, 160000). --define(GET_TIMING_LOGPOINT, 160000). +-define(PUT_LOGPOINT, 20000). +-define(HEAD_LOGPOINT, 160000). +-define(GET_LOGPOINT, 160000). +-define(SST_LOGPOINT, 200000). -define(LOG_LEVEL, [info, warn, error, critical]). -define(SAMPLE_RATE, 16#F). 
@@ -230,6 +232,15 @@ {"PM002", {info, "Completed dump of L0 cache to list of size ~w"}}, + {"SST01", + {info, "SST timing for result ~w is sample ~w total ~w and max ~w"}}, + {"SST02", + {error, "False result returned from SST with filename ~s as " + ++ "slot ~w has failed crc check"}}, + {"SST03", + {info, "Opening SST file with filename ~s keys ~w and slots ~w"}}, + {"SST04", + {info, "Exit called for reason ~w on filename ~s"}}, {"SFT01", {info, "Opened filename with name ~s"}}, @@ -333,14 +344,13 @@ log_timer(LogReference, Subs, StartTime) -> end. %% Make a log of put timings split out by actor - one log for every -%% PUT_TIMING_LOGPOINT puts +%% PUT_LOGPOINT puts put_timing(_Actor, undefined, T0, T1) -> {1, {T0, T1}, {T0, T1}}; -put_timing(Actor, {?PUT_TIMING_LOGPOINT, {Total0, Total1}, {Max0, Max1}}, - T0, T1) -> - RN = random:uniform(?HEAD_TIMING_LOGPOINT), - case RN > ?HEAD_TIMING_LOGPOINT div 2 of +put_timing(Actor, {?PUT_LOGPOINT, {Total0, Total1}, {Max0, Max1}}, T0, T1) -> + RN = random:uniform(?HEAD_LOGPOINT), + case RN > ?HEAD_LOGPOINT div 2 of true -> % log at the timing point less than half the time LogRef = @@ -349,7 +359,7 @@ put_timing(Actor, {?PUT_TIMING_LOGPOINT, {Total0, Total1}, {Max0, Max1}}, inker -> "I0019"; journal -> "CDB17" end, - log(LogRef, [?PUT_TIMING_LOGPOINT, Total0, Total1, Max0, Max1]), + log(LogRef, [?PUT_LOGPOINT, Total0, Total1, Max0, Max1]), put_timing(Actor, undefined, T0, T1); false -> % Log some other random time @@ -359,7 +369,7 @@ put_timing(_Actor, {N, {Total0, Total1}, {Max0, Max1}}, T0, T1) -> {N + 1, {Total0 + T0, Total1 + T1}, {max(Max0, T0), max(Max1, T1)}}. 
%% Make a log of penciller head timings split out by level and result - one -%% log for every HEAD_TIMING_LOGPOINT puts +%% log for every HEAD_LOGPOINT puts %% Returns a tuple of {Count, TimingDict} to be stored on the process state head_timing(undefined, SW, Level, R) -> T0 = timer:now_diff(os:timestamp(), SW), @@ -384,9 +394,9 @@ head_timing_int(undefined, T0, Level, R) -> dict:store(K, [0, 0, 0], Acc) end end, {1, lists:foldl(NewDFun, dict:new(), head_keylist())}; -head_timing_int({?HEAD_TIMING_LOGPOINT, HeadTimingD}, T0, Level, R) -> - RN = random:uniform(?HEAD_TIMING_LOGPOINT), - case RN > ?HEAD_TIMING_LOGPOINT div 2 of +head_timing_int({?HEAD_LOGPOINT, HeadTimingD}, T0, Level, R) -> + RN = random:uniform(?HEAD_LOGPOINT), + case RN > ?HEAD_LOGPOINT div 2 of true -> % log at the timing point less than half the time LogFun = fun(K) -> log("P0032", [K|dict:fetch(K, HeadTimingD)]) end, @@ -419,21 +429,62 @@ head_keylist() -> [not_present, found_lower, found_0, found_1, found_2]. +sst_timing(undefined, SW, TimerType) -> + T0 = timer:now_diff(os:timestamp(), SW), + gen_timing_int(undefined, + T0, + TimerType, + fun sst_keylist/0, + ?SST_LOGPOINT, + "SST01"); +sst_timing({N, SSTTimerD}, SW, TimerType) -> + case N band ?SAMPLE_RATE of + 0 -> + T0 = timer:now_diff(os:timestamp(), SW), + gen_timing_int({N, SSTTimerD}, + T0, + TimerType, + fun sst_keylist/0, + ?SST_LOGPOINT, + "SST01"); + _ -> + % Not to be sampled this time + {N + 1, SSTTimerD} + end. + +sst_keylist() -> + [summary_bloom, cache_entry, + slot_bloom, slot_crc_wonky, slot_lookup_miss, slot_lookup_hit]. 
+ get_timing(undefined, SW, TimerType) -> T0 = timer:now_diff(os:timestamp(), SW), - get_timing_int(undefined, T0, TimerType); + gen_timing_int(undefined, + T0, + TimerType, + fun get_keylist/0, + ?GET_LOGPOINT, + "B0014"); get_timing({N, GetTimerD}, SW, TimerType) -> case N band ?SAMPLE_RATE of 0 -> T0 = timer:now_diff(os:timestamp(), SW), - get_timing_int({N, GetTimerD}, T0, TimerType); + gen_timing_int({N, GetTimerD}, + T0, + TimerType, + fun get_keylist/0, + ?GET_LOGPOINT, + "B0014"); _ -> % Not to be sampled this time {N + 1, GetTimerD} end. -get_timing_int(undefined, T0, TimerType) -> +get_keylist() -> + [head_not_present, head_found, fetch]. + + +gen_timing_int(undefined, T0, TimerType, KeyListFun, _LogPoint, _LogRef) -> NewDFun = fun(K, Acc) -> case K of TimerType -> @@ -441,31 +492,32 @@ get_timing_int(undefined, T0, TimerType) -> _ -> dict:store(K, [0, 0, 0], Acc) end end, - {1, lists:foldl(NewDFun, dict:new(), get_keylist())}; -get_timing_int({?GET_TIMING_LOGPOINT, GetTimerD}, T0, TimerType) -> - RN = random:uniform(?GET_TIMING_LOGPOINT), - case RN > ?GET_TIMING_LOGPOINT div 2 of + {1, lists:foldl(NewDFun, dict:new(), KeyListFun())}; +gen_timing_int({LogPoint, TimerD}, T0, TimerType, KeyListFun, LogPoint, + LogRef) -> + RN = random:uniform(LogPoint), + case RN > LogPoint div 2 of true -> % log at the timing point less than half the time - LogFun = fun(K) -> log("B0014", [K|dict:fetch(K, GetTimerD)]) end, - lists:foreach(LogFun, get_keylist()), - get_timing_int(undefined, T0, TimerType); + LogFun = fun(K) -> log(LogRef, [K|dict:fetch(K, TimerD)]) end, + lists:foreach(LogFun, KeyListFun()), + gen_timing_int(undefined, T0, TimerType, + KeyListFun, LogPoint, LogRef); false -> % Log some other time - reset to RN not 0 to stagger logs out over % time between the vnodes - get_timing_int({RN, GetTimerD}, T0, TimerType) + gen_timing_int({RN, TimerD}, T0, TimerType, + KeyListFun, LogPoint, LogRef) end; -get_timing_int({N, GetTimerD}, T0, TimerType) -> - [Count0, 
Total0, Max0] = dict:fetch(TimerType, GetTimerD), +gen_timing_int({N, TimerD}, T0, TimerType, _KeyListFun, _LogPoint, _LogRef) -> + [Count0, Total0, Max0] = dict:fetch(TimerType, TimerD), {N + 1, dict:store(TimerType, [Count0 + 1, Total0 + T0, max(Max0, T0)], - GetTimerD)}. + TimerD)}. -get_keylist() -> - [head_not_present, head_found, fetch]. %%%============================================================================ %%% Test diff --git a/src/leveled_skiplist.erl b/src/leveled_skiplist.erl index 7fcc81a..83ac382 100644 --- a/src/leveled_skiplist.erl +++ b/src/leveled_skiplist.erl @@ -28,6 +28,7 @@ to_range/3, lookup/2, lookup/3, + key_above/2, empty/0, empty/1, size/1 @@ -122,6 +123,9 @@ to_range(SkipList, Start, End) -> to_list(SkipList) -> to_list(element(2, SkipList), ?LIST_HEIGHT). +key_above(SkipList, Key) -> + key_above(element(2, SkipList), Key, ?LIST_HEIGHT). + empty() -> empty(false). @@ -335,6 +339,36 @@ to_range(SkipList, Start, End, Level) -> {_Bool1, _Bool2, SubList, _PrevList} = R, SubList. +key_above(SkipList, Key, 0) -> + FindFun = fun({Mark, V}, Found) -> + case Found of + false -> + case Key =< Mark of + true -> + {Mark, V}; + false -> + false + end; + _ -> + Found + end + end, + lists:foldl(FindFun, false, SkipList); +key_above(SkipList, Key, Level) -> + FindFun = fun({Mark, SL}, Found) -> + case Found of + false -> + case Key =< Mark of + true -> + key_above(SL, Key, Level - 1); + false -> + false + end; + _ -> + Found + end + end, + lists:foldl(FindFun, false, SkipList). empty(SkipList, 1) -> [{?INFINITY_KEY, SkipList}]; @@ -645,6 +679,22 @@ skiplist_nolookup_test() -> KL), ?assertMatch(KLSorted, to_list(SkipList)). 
+skiplist_keybefore_test() -> + N = 128, + KL = generate_randomkeys(1, N, 1, N div 5), + SkipList = lists:foldl(fun({K, V}, Acc) -> + enter_nolookup(K, V, Acc) end, + empty(true), + KL), + KLSorted = lists:ukeysort(1, lists:reverse(KL)), + SW = os:timestamp(), + lists:foreach(fun({K, V}) -> + ?assertMatch({K, V}, key_above(SkipList, K)) end, + KLSorted), + io:format(user, "~nFinding self in keys above ~w microseconds for ~w finds~n", + [timer:now_diff(os:timestamp(), SW), N]). + + empty_skiplist_size_test() -> ?assertMatch(0, leveled_skiplist:size(empty(false))), ?assertMatch(0, leveled_skiplist:size(empty(true))). diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 9482030..b3243ee 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -45,7 +45,7 @@ %% key size at all levels. %% Slot Bloom has 8 bits per key - 0.03 fpr %% -%% All blooms are base don the DJ Bernstein magic hash which proved to give +%% All blooms are based on the DJ Bernstein magic hash which proved to give %% the predicted fpr in tests (unlike phash2 which has significantly higher %% fpr). Due to the cost of producing the magic hash, it is read from the %% value not reproduced each time. If the value is set to no_lookup no bloom @@ -55,6 +55,8 @@ -module(leveled_sst). +-behaviour(gen_fsm). + -include("include/leveled.hrl"). -define(SLOT_SIZE, 128). @@ -63,23 +65,234 @@ -include_lib("eunit/include/eunit.hrl"). +-export([init/1, + handle_sync_event/4, + handle_event/3, + handle_info/3, + terminate/3, + code_change/4, + starting/3, + reader/3]). + +-export([sst_new/3, + sst_open/1, + sst_get/2, + sst_get/3, + sst_close/1]). + +-export([generate_randomkeys/1]). + + + -record(slot_index_value, {slot_id :: integer(), bloom :: dict:dict(), start_position :: integer(), length :: integer()}). +-record(summary, {first_key :: tuple(), + last_key :: tuple(), + index :: list(), % leveled_skiplist + bloom :: tuple(), % leveled_tinybloom + size :: integer()}). 
+ +-record(state, {summary, + handle :: file:fd(), + sst_timings :: tuple(), + slot_lengths :: list(), + filename, + cache}). + + %%%============================================================================ %%% API %%%============================================================================ +sst_open(Filename) -> + {ok, Pid} = gen_fsm:start(?MODULE, [], []), + case gen_fsm:sync_send_event(Pid, {sst_open, Filename}, infinity) of + {ok, {SK, EK}} -> + {ok, Pid, {SK, EK}} + end. +sst_new(Filename, Level, KVList) -> + {ok, Pid} = gen_fsm:start(?MODULE, [], []), + case gen_fsm:sync_send_event(Pid, + {sst_new, Filename, Level, KVList}, + infinity) of + {ok, {SK, EK}} -> + {ok, Pid, {SK, EK}} + end. + +%sft_newlevelzero(Filename, Slots, FetchFun, Wait, Penciller) -> +% {ok, Pid} = gen_fsm:start(?MODULE, [], []), +% case Wait of +% true -> +% KL1 = leveled_pmem:to_list(Slots, FetchFun), +% Reply = gen_fsm:sync_send_event(Pid, +% {sft_new, +% Filename, +% 0, +% KL1}, +% infinity), +% {ok, Pid, Reply}; +% false -> +% gen_fsm:send_event(Pid, +% {sft_newlevelzero, +% Filename, +% Slots, +% FetchFun, +% Penciller}), +% {ok, Pid, noreply} +% end. + +sst_get(Pid, LedgerKey) -> + sst_get(Pid, LedgerKey, leveled_codec:magic_hash(LedgerKey)). + +sst_get(Pid, LedgerKey, Hash) -> + gen_fsm:sync_send_event(Pid, {get_kv, LedgerKey, Hash}, infinity). + +sst_close(Pid) -> + gen_fsm:sync_send_event(Pid, close, 2000). + + +%%%============================================================================ +%%% gen_server callbacks +%%%============================================================================ + +init([]) -> + {ok, starting, #state{}}. 
+ +starting({sft_open, Filename}, _From, State) -> + UpdState = read_file(Filename, State), + Summary = UpdState#state.summary, + {reply, + {ok, {Summary#summary.first_key, Summary#summary.last_key}}, + reader, + UpdState}; +starting({sft_new, Filename, Level, KVList}, _From, State) -> + {FirstKey, L, SlotIndex, AllHashes, SlotsBin} = build_all_slots(KVList), + SummaryBin = build_table_summary(SlotIndex, AllHashes, Level, FirstKey, L), + ok = write_file(Filename, SummaryBin, SlotsBin), + UpdState = read_file(Filename, State), + Summary = UpdState#state.summary, + {reply, + {ok, {Summary#summary.first_key, Summary#summary.last_key}}, + reader, + UpdState}. + +reader({get_kv, LedgerKey, Hash}, _From, State) -> + SW = os:timestamp(), + {Result, Stage, SlotID} = fetch(LedgerKey, Hash, State), + UpdTimings = leveled_log:sst_timing(State#state.sst_timings, SW, Stage), + case {Result, Stage} of + {not_present, slot_crc_wonky} -> + leveled_log:log("SST02", [State#state.filename, SlotID]), + {reply, Result, reader, State#state{sst_timings = UpdTimings}}; + {not_present, _} -> + {reply, Result, reader, State#state{sst_timings = UpdTimings}}; + {KV, slot_lookup_hit} -> + UpdCache = array:set(SlotID, KV, State#state.cache), + {reply, Result, reader, State#state{cache = UpdCache, + sst_timings = UpdTimings}} + end. + +handle_sync_event(_Msg, _From, StateName, State) -> + {reply, undefined, StateName, State}. + +handle_event(_Msg, StateName, State) -> + {next_state, StateName, State}. + +handle_info(_Msg, StateName, State) -> + {next_state, StateName, State}. + +terminate(Reason, _StateName, State) -> + leveled_log:log("SST04", [Reason, State#state.filename]). + +code_change(_OldVsn, StateName, State, _Extra) -> + {ok, StateName, State}. 
%%%============================================================================ %%% Internal Functions %%%============================================================================ -build_table_summary(SlotIndex, AllHashes, Level) -> +fetch(LedgerKey, Hash, State) -> + Summary = State#state.summary, + case leveled_tinybloom:check({hash, Hash}, Summary#summary.bloom) of + false -> + {not_present, summary_bloom, null}; + true -> + Slot = lookup_slot(LedgerKey, Summary#summary.index), + CacheEntry = array:get(Slot#slot_index_value.slot_id, + State#state.cache), + case CacheEntry of + {LedgerKey, CachedValue} -> + {{LedgerKey, CachedValue}, cache_entry}; + _ -> + SlotBloom = Slot#slot_index_value.bloom, + case is_check_slot_required({hash, Hash}, SlotBloom) of + false -> + {not_present, slot_bloom, null}; + true -> + SlotLook = lookup_in_slot(LedgerKey, + {pointer, + State#state.handle, + Slot}), + case SlotLook of + crc_wonky -> + {not_present, + slot_crc_wonky, + Slot#slot_index_value.slot_id}; + none -> + {not_present, + slot_lookup_miss, + null}; + KV -> + {KV, + slot_lookup_hit, + Slot#slot_index_value.slot_id} + end + end + end + end. + + +write_file(Filename, SummaryBin, SlotsBin) -> + SummaryLength = byte_size(SummaryBin), + SlotsLength = byte_size(SlotsBin), + file:write_file(Filename, + <>, + [raw]). + +read_file(Filename, State) -> + {Handle, SummaryBin} = open_reader(Filename), + Summary = read_table_summary(SummaryBin), + SlotLengthFetchFun = + fun({_K, V}, Acc) -> + [{V#slot_index_value.slot_id, + V#slot_index_value.length}|Acc] + end, + SlotLengths = lists:foldr(SlotLengthFetchFun, [], Summary#summary.index), + SlotCount = length(SlotLengths), + SkipL = leveled_skiplist:from_list(Summary#summary.index), + UpdSummary = Summary#summary{index = SkipL}, + leveled_log:log("SST03", [Filename, Summary#summary.size, SlotCount]), + State#state{summary = UpdSummary, + slot_lengths = SlotLengths, + handle = Handle, + cache = array:new({size, SlotCount})}. 
+ +open_reader(Filename) -> + {ok, Handle} = file:open(Filename, [binary, raw, read]), + {ok, Lengths} = file:pread(Handle, {bof, 0}, 8), + <> = Lengths, + {ok, SummaryBin} = file:pread(Handle, {cur, SlotsLength}, SummaryLength), + {Handle, SummaryBin}. + +build_table_summary(SlotIndex, AllHashes, Level, FirstKey, L) -> BloomSlots = case lists:keyfind(Level, 1, ?LEVEL_BLOOM_SLOTS) of {Level, N} -> @@ -90,9 +303,13 @@ build_table_summary(SlotIndex, AllHashes, Level) -> Bloom = lists:foldr(fun leveled_tinybloom:enter/2, leveled_tinybloom:empty(BloomSlots), AllHashes), - SkipSlot = leveled_skiplist:from_sortedlist(lists:reverse(SlotIndex)), - SummBin = term_to_binary({SkipSlot, Bloom}, - [{compressed, ?COMPRESSION_LEVEL}]), + [{LastKey, _LastV}|_Rest] = SlotIndex, + Summary = #summary{first_key = FirstKey, + last_key = LastKey, + size = L, + index = lists:reverse(SlotIndex), + bloom = Bloom}, + SummBin = term_to_binary(Summary, [{compressed, ?COMPRESSION_LEVEL}]), SummCRC = erlang:crc32(SummBin), <>. @@ -101,17 +318,25 @@ read_table_summary(BinWithCheck) -> CRCCheck = erlang:crc32(SummBin), if CRCCheck == SummCRC -> - % If not might it should be possible to rebuild from all the slots + % If not might it might be possible to rebuild from all the slots binary_to_term(SummBin) end. -build_all_slots(KVList, BasePosition) -> +build_all_slots(KVList) -> L = length(KVList), % The length is not a constant time command and the list may be large, - % but otherwise lenght must be called each iteration to avoid exception + % but otherwise length must be called each iteration to avoid exception % on split or sublist + [{FirstKey, _FirstV}|_Rest] = KVList, SlotCount = L div ?SLOT_SIZE, - build_all_slots(KVList, SlotCount, BasePosition, [], 1, [], <<>>). + {SlotIndex, AllHashes, SlotsBin} = build_all_slots(KVList, + SlotCount, + 8, + [], + 1, + [], + <<>>), + {FirstKey, L, SlotIndex, AllHashes, SlotsBin}. 
build_all_slots([], _Count, _Start, AllHashes, _SlotID, SlotIndex, SlotsBin) -> {SlotIndex, AllHashes, SlotsBin}; @@ -164,20 +389,38 @@ is_check_slot_required(_Hash, none) -> is_check_slot_required(Hash, Bloom) -> leveled_tinybloom:tiny_check(Hash, Bloom). -lookup_in_slot(Key, {pointer, Handle, Pos, Length}) -> - lookup_in_slot(Key, read_slot(Handle, Pos, Length)); +lookup_slot(Key, SkipList) -> + leveled_skiplist:key_above(SkipList, Key). + +lookup_in_slot(Key, {pointer, Handle, Slot}) -> + SlotBin = read_slot(Handle, Slot), + case SlotBin of + crc_wonky -> + crc_wonky; + _ -> + lookup_in_slot(Key, SlotBin) + end; lookup_in_slot(Key, SlotBin) -> Tree = binary_to_term(SlotBin), gb_trees:lookup(Key, Tree). -all_from_slot({pointer, Handle, Pos, Length}) -> - all_from_slot(read_slot(Handle, Pos, Length)); +all_from_slot({pointer, Handle, Slot}) -> + all_from_slot(read_slot(Handle, Slot)); all_from_slot(SlotBin) -> SkipList = binary_to_term(SlotBin), gb_trees:to_list(SkipList). -read_slot(_Handle, _Pos, _Length) -> - not_implemented. +read_slot(Handle, Slot) -> + {ok, SlotBin} = file:pread(Handle, + Slot#slot_index_value.start_position, + Slot#slot_index_value.length), + <> = SlotBin, + case erlang:crc32(Slot) of + SlotCRC -> + Slot; + _ -> + crc_wonky + end. %%%============================================================================ @@ -186,6 +429,13 @@ read_slot(_Handle, _Pos, _Length) -> -ifdef(TEST). +generate_randomkeys({Count, StartSQN}) -> + BucketNumber = random:uniform(1024), + generate_randomkeys(Count, StartSQN, [], BucketNumber, BucketNumber); +generate_randomkeys(Count) -> + BucketNumber = random:uniform(1024), + generate_randomkeys(Count, 0, [], BucketNumber, BucketNumber). 
+ generate_randomkeys(Seqn, Count, BucketRangeLow, BucketRangeHigh) -> generate_randomkeys(Seqn, Count, @@ -254,7 +504,12 @@ simple_slotbin_test() -> simple_slotbinsummary_test() -> KVList0 = generate_randomkeys(1, ?SLOT_SIZE * 8 + 100, 1, 4), KVList1 = lists:ukeysort(1, KVList0), - {SlotIndex, AllHashes, SlotsBin} = build_all_slots(KVList1, 0), - _SummaryBin = build_table_summary(SlotIndex, AllHashes, 2). + [{FirstKey, _V}|_Rest] = KVList1, + {SlotIndex, AllHashes, _SlotsBin} = build_all_slots(KVList1), + _SummaryBin = build_table_summary(SlotIndex, + AllHashes, + 2, + FirstKey, + length(KVList1)). -endif. \ No newline at end of file diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl index a0af8a5..fb9fc5d 100644 --- a/src/leveled_tinybloom.erl +++ b/src/leveled_tinybloom.erl @@ -82,6 +82,7 @@ tiny_enter({hash, Hash}, Bloom) -> AddFun = fun(Bit, Arr0) -> add_to_array(Bit, Arr0, 1024) end, lists:foldl(AddFun, Bloom, [Bit0, Bit1, Bit2]). + tiny_check({hash, Hash}, Bloom) -> {_Q, Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash), case getbit(Bit0, Bloom, 1024) of From 7a11e8b4905a6a0d171ecf3dbc57283d5cad11a1 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Sat, 24 Dec 2016 16:34:36 +0000 Subject: [PATCH 12/58] Some basic testing --- src/leveled_sst.erl | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index b3243ee..a66e360 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -277,7 +277,7 @@ read_file(Filename, State) -> end, SlotLengths = lists:foldr(SlotLengthFetchFun, [], Summary#summary.index), SlotCount = length(SlotLengths), - SkipL = leveled_skiplist:from_list(Summary#summary.index), + SkipL = leveled_skiplist:from_sortedlist(Summary#summary.index), UpdSummary = Summary#summary{index = SkipL}, leveled_log:log("SST03", [Filename, Summary#summary.size, SlotCount]), State#state{summary = UpdSummary, @@ -367,6 +367,7 @@ build_all_slots(KVL, 
Count, Start, AllHashes, SlotID, SlotIndex, SlotsBin) -> bloom = Bloom, start_position = Start, length = Length}, + io:format("slot_id ~w at ~w and length ~w~n", [SlotID, Start, Length]), build_all_slots(KVRem, Count - 1, Start + Length, @@ -390,7 +391,8 @@ is_check_slot_required(Hash, Bloom) -> leveled_tinybloom:tiny_check(Hash, Bloom). lookup_slot(Key, SkipList) -> - leveled_skiplist:key_above(SkipList, Key). + {_Mark, Slot} = leveled_skiplist:key_above(SkipList, Key), + Slot. lookup_in_slot(Key, {pointer, Handle, Slot}) -> SlotBin = read_slot(Handle, Slot), @@ -505,11 +507,33 @@ simple_slotbinsummary_test() -> KVList0 = generate_randomkeys(1, ?SLOT_SIZE * 8 + 100, 1, 4), KVList1 = lists:ukeysort(1, KVList0), [{FirstKey, _V}|_Rest] = KVList1, - {SlotIndex, AllHashes, _SlotsBin} = build_all_slots(KVList1), - _SummaryBin = build_table_summary(SlotIndex, + {FirstKey, _L, SlotIndex, AllHashes, SlotsBin} = build_all_slots(KVList1), + SummaryBin = build_table_summary(SlotIndex, AllHashes, 2, FirstKey, - length(KVList1)). + length(KVList1)), + Summary = read_table_summary(SummaryBin), + SummaryIndex = leveled_skiplist:from_sortedlist(Summary#summary.index), + FetchFun = + fun({Key, Value}) -> + Slot = lookup_slot(Key, SummaryIndex), + StartPos = Slot#slot_index_value.start_position, + Length = Slot#slot_index_value.length, + io:format("lookup slot id ~w from ~w length ~w~n", + [Slot#slot_index_value.slot_id, StartPos, Length]), + <<_Pre:StartPos/binary, + SlotBin:Length/binary, + _Post/binary>> = <<0:64/integer, SlotsBin/binary>>, + <> = SlotBin, + ?assertMatch(SlotCRC, erlang:crc32(SlotBinNoCRC)), + {value, V} = lookup_in_slot(Key, SlotBinNoCRC), + ?assertMatch(Value, V) + end, + SW = os:timestamp(), + lists:foreach(FetchFun, KVList1), + io:format(user, + "Checking for ~w keys in slots took ~w microseconds~n", + [length(KVList1), timer:now_diff(os:timestamp(), SW)]). -endif. 
\ No newline at end of file From 0d0ab326532c152671958f6c9315e98ed0075954 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Sat, 24 Dec 2016 17:48:31 +0000 Subject: [PATCH 13/58] Some end-to-end testing --- src/leveled_log.erl | 2 + src/leveled_sst.erl | 90 ++++++++++++++++++++++++++++++++++++--------- 2 files changed, 74 insertions(+), 18 deletions(-) diff --git a/src/leveled_log.erl b/src/leveled_log.erl index 635ecc8..c9a16cd 100644 --- a/src/leveled_log.erl +++ b/src/leveled_log.erl @@ -241,6 +241,8 @@ {info, "Opening SST file with filename ~s keys ~w and slots ~w"}}, {"SST04", {info, "Exit called for reason ~w on filename ~s"}}, + {"SST05", + {warn, "Rename rogue filename ~s to ~s"}}, {"SFT01", {info, "Opened filename with name ~s"}}, diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index a66e360..572dab8 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -62,6 +62,7 @@ -define(SLOT_SIZE, 128). -define(COMPRESSION_LEVEL, 1). -define(LEVEL_BLOOM_SLOTS, [{0, 64}, {1, 48}, {default, 32}]). +-define(DISCARD_EXT, ".discarded"). -include_lib("eunit/include/eunit.hrl"). @@ -162,18 +163,19 @@ sst_close(Pid) -> init([]) -> {ok, starting, #state{}}. 
-starting({sft_open, Filename}, _From, State) -> +starting({sst_open, Filename}, _From, State) -> UpdState = read_file(Filename, State), Summary = UpdState#state.summary, {reply, {ok, {Summary#summary.first_key, Summary#summary.last_key}}, reader, UpdState}; -starting({sft_new, Filename, Level, KVList}, _From, State) -> +starting({sst_new, Filename, Level, KVList}, _From, State) -> {FirstKey, L, SlotIndex, AllHashes, SlotsBin} = build_all_slots(KVList), SummaryBin = build_table_summary(SlotIndex, AllHashes, Level, FirstKey, L), - ok = write_file(Filename, SummaryBin, SlotsBin), - UpdState = read_file(Filename, State), + ActualFilename = write_file(Filename, SummaryBin, SlotsBin), + UpdState = read_file(ActualFilename, + State#state{filename=ActualFilename}), Summary = UpdState#state.summary, {reply, {ok, {Summary#summary.first_key, Summary#summary.last_key}}, @@ -193,8 +195,13 @@ reader({get_kv, LedgerKey, Hash}, _From, State) -> {KV, slot_lookup_hit} -> UpdCache = array:set(SlotID, KV, State#state.cache), {reply, Result, reader, State#state{cache = UpdCache, - sst_timings = UpdTimings}} - end. + sst_timings = UpdTimings}}; + _ -> + {reply, Result, reader, State#state{sst_timings = UpdTimings}} + end; +reader(close, _From, State) -> + ok = file:close(State#state.handle), + {stop, normal, ok, State}. handle_sync_event(_Msg, _From, StateName, State) -> {reply, undefined, StateName, State}. 
@@ -227,7 +234,7 @@ fetch(LedgerKey, Hash, State) -> State#state.cache), case CacheEntry of {LedgerKey, CachedValue} -> - {{LedgerKey, CachedValue}, cache_entry}; + {{LedgerKey, CachedValue}, cache_entry, null}; _ -> SlotBloom = Slot#slot_index_value.bloom, case is_check_slot_required({hash, Hash}, SlotBloom) of @@ -247,8 +254,8 @@ fetch(LedgerKey, Hash, State) -> {not_present, slot_lookup_miss, null}; - KV -> - {KV, + {value, V} -> + {{LedgerKey, V}, slot_lookup_hit, Slot#slot_index_value.slot_id} end @@ -260,12 +267,25 @@ fetch(LedgerKey, Hash, State) -> write_file(Filename, SummaryBin, SlotsBin) -> SummaryLength = byte_size(SummaryBin), SlotsLength = byte_size(SlotsBin), - file:write_file(Filename, + {PendingName, FinalName} = generate_filenames(Filename), + file:write_file(PendingName, <>, - [raw]). + [raw]), + case filelib:is_file(FinalName) of + true -> + AltName = filename:join(filename:dirname(FinalName), + filename:basename(FinalName)) + ++ ?DISCARD_EXT, + leveled_log:log("SST05", [FinalName, AltName]), + ok = file:rename(FinalName, AltName); + false -> + ok + end, + file:rename(PendingName, FinalName), + FinalName. read_file(Filename, State) -> {Handle, SummaryBin} = open_reader(Filename), @@ -283,13 +303,13 @@ read_file(Filename, State) -> State#state{summary = UpdSummary, slot_lengths = SlotLengths, handle = Handle, - cache = array:new({size, SlotCount})}. + cache = array:new({size, SlotCount + 1})}. open_reader(Filename) -> {ok, Handle} = file:open(Filename, [binary, raw, read]), - {ok, Lengths} = file:pread(Handle, {bof, 0}, 8), + {ok, Lengths} = file:pread(Handle, 0, 8), <> = Lengths, - {ok, SummaryBin} = file:pread(Handle, {cur, SlotsLength}, SummaryLength), + {ok, SummaryBin} = file:pread(Handle, SlotsLength + 8, SummaryLength), {Handle, SummaryBin}. 
build_table_summary(SlotIndex, AllHashes, Level, FirstKey, L) -> @@ -367,7 +387,6 @@ build_all_slots(KVL, Count, Start, AllHashes, SlotID, SlotIndex, SlotsBin) -> bloom = Bloom, start_position = Start, length = Length}, - io:format("slot_id ~w at ~w and length ~w~n", [SlotID, Start, Length]), build_all_slots(KVRem, Count - 1, Start + Length, @@ -416,15 +435,30 @@ read_slot(Handle, Slot) -> {ok, SlotBin} = file:pread(Handle, Slot#slot_index_value.start_position, Slot#slot_index_value.length), - <> = SlotBin, - case erlang:crc32(Slot) of + <> = SlotBin, + case erlang:crc32(SlotNoCRC) of SlotCRC -> - Slot; + SlotNoCRC; _ -> crc_wonky end. +generate_filenames(RootFilename) -> + Ext = filename:extension(RootFilename), + Components = filename:split(RootFilename), + case Ext of + [] -> + {filename:join(Components) ++ ".pnd", + filename:join(Components) ++ ".sst"}; + Ext -> + DN = filename:dirname(RootFilename), + FP_NOEXT = filename:basename(RootFilename, Ext), + {filename:join(DN, FP_NOEXT) ++ ".pnd", + filename:join(DN, FP_NOEXT) ++ ".sst"} + end. + + %%%============================================================================ %%% Test %%%============================================================================ @@ -536,4 +570,24 @@ simple_slotbinsummary_test() -> "Checking for ~w keys in slots took ~w microseconds~n", [length(KVList1), timer:now_diff(os:timestamp(), SW)]). 
+simple_persisted_test() -> + Filename = "../test/simple_test", + KVList0 = generate_randomkeys(1, ?SLOT_SIZE * 8 + 100, 1, 4), + KVList1 = lists:ukeysort(1, KVList0), + [{FirstKey, _FV}|_Rest] = KVList1, + {LastKey, _LV} = lists:last(KVList1), + {ok, Pid, {FirstKey, LastKey}} = sst_new(Filename, 1, KVList1), + SW = os:timestamp(), + lists:foreach(fun({K, V}) -> + ?assertMatch({K, V}, sst_get(Pid, K)), + ?assertMatch({K, V}, sst_get(Pid, K)) + end, + KVList1), + io:format(user, + "Checking for ~w keys (twice) in file with cache hit took ~w " + ++ "microseconds~n", + [length(KVList1), timer:now_diff(os:timestamp(), SW)]), + ok = sst_close(Pid), + ok = file:delete(Filename ++ ".sst"). + -endif. \ No newline at end of file From 85261063129df3ec344dcd485dc7d91511ba3c60 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Sat, 24 Dec 2016 17:59:07 +0000 Subject: [PATCH 14/58] Test for missing keys --- src/leveled_sst.erl | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 572dab8..708bf6a 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -577,7 +577,7 @@ simple_persisted_test() -> [{FirstKey, _FV}|_Rest] = KVList1, {LastKey, _LV} = lists:last(KVList1), {ok, Pid, {FirstKey, LastKey}} = sst_new(Filename, 1, KVList1), - SW = os:timestamp(), + SW1 = os:timestamp(), lists:foreach(fun({K, V}) -> ?assertMatch({K, V}, sst_get(Pid, K)), ?assertMatch({K, V}, sst_get(Pid, K)) @@ -586,7 +586,27 @@ simple_persisted_test() -> io:format(user, "Checking for ~w keys (twice) in file with cache hit took ~w " ++ "microseconds~n", - [length(KVList1), timer:now_diff(os:timestamp(), SW)]), + [length(KVList1), timer:now_diff(os:timestamp(), SW1)]), + KVList2 = generate_randomkeys(1, ?SLOT_SIZE * 8 + 100, 1, 4), + MapFun = + fun({K, V}, Acc) -> + In = lists:keymember(K, 1, KVList1), + case {K > FirstKey, LastKey > K, In} of + {true, true, false} -> + [{K, V}|Acc]; + _ -> + Acc + end + end, + KVList3 
= lists:foldl(MapFun, [], KVList2), + SW2 = os:timestamp(), + lists:foreach(fun({K, V}) -> + ?assertMatch(not_present, sst_get(Pid, K)) + end, + KVList3), + io:format(user, + "Checking for ~w missing keys took ~w microseconds~n", + [length(KVList3), timer:now_diff(os:timestamp(), SW2)]), ok = sst_close(Pid), ok = file:delete(Filename ++ ".sst"). From 480820e466ae0eacb6f1834ca5b05af122ee3847 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Sat, 24 Dec 2016 18:03:34 +0000 Subject: [PATCH 15/58] Add hash to missing key test --- src/leveled_sst.erl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 708bf6a..1388280 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -587,21 +587,21 @@ simple_persisted_test() -> "Checking for ~w keys (twice) in file with cache hit took ~w " ++ "microseconds~n", [length(KVList1), timer:now_diff(os:timestamp(), SW1)]), - KVList2 = generate_randomkeys(1, ?SLOT_SIZE * 8 + 100, 1, 4), + KVList2 = generate_randomkeys(1, ?SLOT_SIZE * 20 + 100, 1, 4), MapFun = fun({K, V}, Acc) -> In = lists:keymember(K, 1, KVList1), case {K > FirstKey, LastKey > K, In} of {true, true, false} -> - [{K, V}|Acc]; + [{K, leveled_codec:magic_hash(K), V}|Acc]; _ -> Acc end end, KVList3 = lists:foldl(MapFun, [], KVList2), SW2 = os:timestamp(), - lists:foreach(fun({K, V}) -> - ?assertMatch(not_present, sst_get(Pid, K)) + lists:foreach(fun({K, H, _V}) -> + ?assertMatch(not_present, sst_get(Pid, K, H)) end, KVList3), io:format(user, From 6e5f5d2d442ad9e3d664f3d6a30f09e55c627fab Mon Sep 17 00:00:00 2001 From: martinsumner Date: Sat, 24 Dec 2016 18:13:55 +0000 Subject: [PATCH 16/58] Alter ordering don't try the cache hit before checking for presence, only look in the cache if protecting a lookup from the persisted part --- src/leveled_sst.erl | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 
1388280..f132d00 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -230,17 +230,17 @@ fetch(LedgerKey, Hash, State) -> {not_present, summary_bloom, null}; true -> Slot = lookup_slot(LedgerKey, Summary#summary.index), - CacheEntry = array:get(Slot#slot_index_value.slot_id, + SlotBloom = Slot#slot_index_value.bloom, + case is_check_slot_required({hash, Hash}, SlotBloom) of + false -> + {not_present, slot_bloom, null}; + true -> + CacheEntry = array:get(Slot#slot_index_value.slot_id, State#state.cache), - case CacheEntry of - {LedgerKey, CachedValue} -> - {{LedgerKey, CachedValue}, cache_entry, null}; - _ -> - SlotBloom = Slot#slot_index_value.bloom, - case is_check_slot_required({hash, Hash}, SlotBloom) of - false -> - {not_present, slot_bloom, null}; - true -> + case CacheEntry of + {LedgerKey, CachedValue} -> + {{LedgerKey, CachedValue}, cache_entry, null}; + _ -> SlotLook = lookup_in_slot(LedgerKey, {pointer, State#state.handle, From cbad3753739b5ff20c342da0013044b7dd08d89d Mon Sep 17 00:00:00 2001 From: martinsumner Date: Wed, 28 Dec 2016 15:48:04 +0000 Subject: [PATCH 17/58] Refactoring of skiplist ranges and support for sst ranges the Skiplist range code was needlessly complicated. It may be faster than the new code, but the complexity delta cannot be support for such a small change. This was incovered whilst troubleshooting the initial kv range test. --- src/leveled_log.erl | 8 +- src/leveled_skiplist.erl | 137 ++++++++++-------------- src/leveled_sst.erl | 217 ++++++++++++++++++++++++++++++++++++-- src/leveled_tinybloom.erl | 11 +- 4 files changed, 270 insertions(+), 103 deletions(-) diff --git a/src/leveled_log.erl b/src/leveled_log.erl index c9a16cd..205962d 100644 --- a/src/leveled_log.erl +++ b/src/leveled_log.erl @@ -19,7 +19,7 @@ -define(GET_LOGPOINT, 160000). -define(SST_LOGPOINT, 200000). -define(LOG_LEVEL, [info, warn, error, critical]). --define(SAMPLE_RATE, 16#F). +-define(SAMPLE_RATE, 1). 
-define(LOGBASE, dict:from_list([ @@ -377,7 +377,7 @@ head_timing(undefined, SW, Level, R) -> T0 = timer:now_diff(os:timestamp(), SW), head_timing_int(undefined, T0, Level, R); head_timing({N, HeadTimingD}, SW, Level, R) -> - case N band ?SAMPLE_RATE of + case N band (?SAMPLE_RATE - 1) of 0 -> T0 = timer:now_diff(os:timestamp(), SW), head_timing_int({N, HeadTimingD}, T0, Level, R); @@ -440,7 +440,7 @@ sst_timing(undefined, SW, TimerType) -> ?SST_LOGPOINT, "SST01"); sst_timing({N, SSTTimerD}, SW, TimerType) -> - case N band ?SAMPLE_RATE of + case N band (?SAMPLE_RATE - 1) of 0 -> T0 = timer:now_diff(os:timestamp(), SW), gen_timing_int({N, SSTTimerD}, @@ -468,7 +468,7 @@ get_timing(undefined, SW, TimerType) -> ?GET_LOGPOINT, "B0014"); get_timing({N, GetTimerD}, SW, TimerType) -> - case N band ?SAMPLE_RATE of + case N band (?SAMPLE_RATE - 1) of 0 -> T0 = timer:now_diff(os:timestamp(), SW), gen_timing_int({N, GetTimerD}, diff --git a/src/leveled_skiplist.erl b/src/leveled_skiplist.erl index 83ac382..d03f0c1 100644 --- a/src/leveled_skiplist.erl +++ b/src/leveled_skiplist.erl @@ -266,78 +266,60 @@ to_list(SkipList, Level) -> [], SkipList). 
-to_range(SkipList, Start, End, 1) -> - R = lists:foldl(fun({Mark, SL}, {PassedStart, PassedEnd, Acc, PrevList}) -> - - case {PassedStart, PassedEnd} of - {true, true} -> - {true, true, Acc, null}; - {false, false} -> - case Start > Mark of - true -> - {false, false, Acc, SL}; - false -> - RHS = splitlist_start(Start, PrevList ++ SL), - case leveled_codec:endkey_passed(End, Mark) of - true -> - EL = splitlist_end(End, RHS), - {true, true, EL, null}; - false -> - {true, false, RHS, null} - end - end; - {true, false} -> - case leveled_codec:endkey_passed(End, Mark) of - true -> - EL = splitlist_end(End, SL), - {true, true, Acc ++ EL, null}; - false -> - {true, false, Acc ++ SL, null} - end - end end, - - {false, false, [], []}, - SkipList), - {_Bool1, _Bool2, SubList, _PrevList} = R, - SubList; -to_range(SkipList, Start, End, Level) -> - R = lists:foldl(fun({Mark, SL}, {PassedStart, PassedEnd, Acc, PrevList}) -> - - case {PassedStart, PassedEnd} of - {true, true} -> - {true, true, Acc, null}; - {false, false} -> - case Start > Mark of - true -> - {false, false, Acc, SL}; - false -> - SkipLRange = to_range(PrevList, - Start, End, - Level - 1) ++ - to_range(SL, - Start, End, - Level - 1), - case leveled_codec:endkey_passed(End, Mark) of - true -> - {true, true, SkipLRange, null}; - false -> - {true, false, SkipLRange, null} - end - end; - {true, false} -> - SkipLRange = to_range(SL, Start, End, Level - 1), - case leveled_codec:endkey_passed(End, Mark) of - true -> - {true, true, Acc ++ SkipLRange, null}; - false -> - {true, false, Acc ++ SkipLRange, null} - end - end end, - - {false, false, [], []}, - SkipList), - {_Bool1, _Bool2, SubList, _PrevList} = R, - SubList. + +to_range(SkipList, StartKey, EndKey, ListHeight) -> + to_range(SkipList, StartKey, EndKey, ListHeight, [], true). 
+ +to_range(SkipList, StartKey, EndKey, ListHeight, Acc, StartIncl) -> + SL = sublist_above(SkipList, StartKey, ListHeight, StartIncl), + case SL of + [] -> + Acc; + _ -> + {LK, _LV} = lists:last(SL), + case leveled_codec:endkey_passed(EndKey, LK) of + false -> + to_range(SkipList, + LK, + EndKey, + ListHeight, + Acc ++ SL, + false); + true -> + SplitFun = + fun({K, _V}) -> + not leveled_codec:endkey_passed(EndKey, K) end, + LHS = lists:takewhile(SplitFun, SL), + Acc ++ LHS + end + end. + +sublist_above(SkipList, StartKey, 0, StartIncl) -> + TestFun = + fun({K, _V}) -> + case StartIncl of + true -> + K < StartKey; + false -> + K =< StartKey + end end, + lists:dropwhile(TestFun, SkipList); +sublist_above(SkipList, StartKey, Level, StartIncl) -> + TestFun = + fun({K, _SL}) -> + case StartIncl of + true -> + K < StartKey; + false -> + K =< StartKey + end end, + RHS = lists:dropwhile(TestFun, SkipList), + case RHS of + [] -> + []; + [{_K, SL}|_Rest] -> + sublist_above(SL, StartKey, Level - 1, StartIncl) + end. key_above(SkipList, Key, 0) -> FindFun = fun({Mark, V}, Found) -> @@ -419,17 +401,6 @@ get_sublist(Key, SkipList) -> null, SkipList). -splitlist_start(StartKey, SL) -> - {_LHS, RHS} = lists:splitwith(fun({K, _V}) -> K < StartKey end, SL), - RHS. - -splitlist_end(EndKey, SL) -> - {LHS, _RHS} = lists:splitwith(fun({K, _V}) -> - not leveled_codec:endkey_passed(EndKey, K) - end, - SL), - LHS. - %%%============================================================================ %%% Test %%%============================================================================ diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index f132d00..fd2ffe5 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -79,6 +79,8 @@ sst_open/1, sst_get/2, sst_get/3, + sst_getkvrange/4, + sst_getslots/2, sst_close/1]). -export([generate_randomkeys/1]). 
@@ -152,9 +154,21 @@ sst_get(Pid, LedgerKey) -> sst_get(Pid, LedgerKey, Hash) -> gen_fsm:sync_send_event(Pid, {get_kv, LedgerKey, Hash}, infinity). +sst_getkvrange(Pid, StartKey, EndKey, ScanWidth) -> + gen_fsm:sync_send_event(Pid, + {get_kvrange, StartKey, EndKey, ScanWidth}, + infinity). + +sst_getslots(Pid, SlotList) -> + gen_fsm:sync_send_event(Pid, {get_slots, SlotList}, infinity). + sst_close(Pid) -> gen_fsm:sync_send_event(Pid, close, 2000). +%% Used in unit tests to force the printing of timings +sst_printtimings(Pid) -> + gen_fsm:sync_send_event(Pid, print_timings, 1000). + %%%============================================================================ %%% gen_server callbacks @@ -199,6 +213,23 @@ reader({get_kv, LedgerKey, Hash}, _From, State) -> _ -> {reply, Result, reader, State#state{sst_timings = UpdTimings}} end; +reader({get_kvrange, StartKey, EndKey, ScanWidth}, _From, State) -> + {reply, + fetch_range(StartKey, EndKey, ScanWidth, State), + reader, + State}; +reader({get_slots, SlotList}, _From, State) -> + Handle = State#state.handle, + FetchFun = + fun({pointer, S, SK, EK}, Acc) -> + Acc ++ trim_slot({pointer, Handle, S}, SK, EK) end, + {reply, + lists:foldl(FetchFun, [], SlotList), + reader, + State}; +reader(print_timings, _From, State) -> + io:format(user, "Timings of ~w~n", [State#state.sst_timings]), + {reply, ok, reader, State#state{sst_timings = undefined}}; reader(close, _From, State) -> ok = file:close(State#state.handle), {stop, normal, ok, State}. @@ -263,6 +294,76 @@ fetch(LedgerKey, Hash, State) -> end end. 
+fetch_range(StartKey, EndKey, ScanWidth, State) -> + Summary = State#state.summary, + Handle = State#state.handle, + {Slots, LTrim, RTrim} = lookup_slots(StartKey, + EndKey, + Summary#summary.index), + Self = self(), + SL = length(Slots), + ExpandedSlots = + case SL of + 0 -> + []; + 1 -> + [Slot] = Slots, + case {LTrim, RTrim} of + {true, true} -> + [{pointer, Self, Slot, StartKey, EndKey}]; + {true, false} -> + [{pointer, Self, Slot, StartKey, all}]; + {false, true} -> + [{pointer, Self, Slot, all, EndKey}]; + {false, false} -> + [{pointer, Self, Slot, all, all}] + end; + N -> + {LSlot, MidSlots, RSlot} = + case N of + 2 -> + [Slot1, Slot2] = Slots, + {Slot1, [], Slot2}; + N -> + [Slot1|_Rest] = Slots, + SlotN = lists:last(Slots), + {Slot1, lists:sublist(Slots, 2, N - 2), SlotN} + end, + MidSlotPointers = lists:map(fun(S) -> + {pointer, Self, S, all, all} + end, + MidSlots), + case {LTrim, RTrim} of + {true, true} -> + [{pointer, Self, LSlot, StartKey, all}] ++ + MidSlotPointers ++ + [{pointer, Self, RSlot, all, EndKey}]; + {true, false} -> + [{pointer, Self, LSlot, StartKey, all}] ++ + MidSlotPointers ++ + [{pointer, Self, RSlot, all, all}]; + {false, true} -> + [{pointer, Self, LSlot, all, all}] ++ + MidSlotPointers ++ + [{pointer, Self, RSlot, all, EndKey}]; + {false, false} -> + [{pointer, Self, LSlot, all, all}] ++ + MidSlotPointers ++ + [{pointer, Self, RSlot, all, all}] + end + end, + {SlotsToFetch, SlotsToPoint} = + case ScanWidth of + SW when SW >= SL -> + {ExpandedSlots, []}; + _ -> + lists:split(ScanWidth, ExpandedSlots) + end, + FetchFun = + fun({pointer, _Self, S, SK, EK}, Acc) -> + Acc ++ trim_slot({pointer, Handle, S}, SK, EK) end, + lists:foldl(FetchFun, [], SlotsToFetch) ++ SlotsToPoint. + write_file(Filename, SummaryBin, SlotsBin) -> SummaryLength = byte_size(SummaryBin), @@ -409,6 +510,34 @@ is_check_slot_required(_Hash, none) -> is_check_slot_required(Hash, Bloom) -> leveled_tinybloom:tiny_check(Hash, Bloom). 
+%% Returns a section from the summary index and two booleans to indicate if +%% the first slot needs trimming, or the last slot +lookup_slots(StartKey, EndKey, SkipList) -> + SlotsOnlyFun = fun({_K, V}) -> V end, + {KSL, LTrim, RTrim} = lookup_slots_int(StartKey, EndKey, SkipList), + {lists:map(SlotsOnlyFun, KSL), LTrim, RTrim}. + +lookup_slots_int(all, all, SkipList) -> + {leveled_skiplist:to_list(SkipList), false, false}; +lookup_slots_int(StartKey, all, SkipList) -> + L = leveled_skiplist:to_list(SkipList), + LTrimFun = fun({K, _V}) -> K < StartKey end, + {_LDrop, RKeep0} = lists:splitwith(LTrimFun, L), + [{FirstKey, _V}|_Rest] = RKeep0, + LTrim = FirstKey < StartKey, + {RKeep0, LTrim, false}; +lookup_slots_int(StartKey, EndKey, SkipList) -> + L0 = leveled_skiplist:to_range(SkipList, StartKey, EndKey), + {LastKey, _LastVal} = lists:last(L0), + case LastKey of + EndKey -> + {L0, true, false}; + _ -> + LTail = leveled_skiplist:key_above(SkipList, EndKey), + {L0 ++ [LTail], true, true} + end. + + lookup_slot(Key, SkipList) -> {_Mark, Slot} = leveled_skiplist:key_above(SkipList, Key), Slot. @@ -425,12 +554,6 @@ lookup_in_slot(Key, SlotBin) -> Tree = binary_to_term(SlotBin), gb_trees:lookup(Key, Tree). -all_from_slot({pointer, Handle, Slot}) -> - all_from_slot(read_slot(Handle, Slot)); -all_from_slot(SlotBin) -> - SkipList = binary_to_term(SlotBin), - gb_trees:to_list(SkipList). - read_slot(Handle, Slot) -> {ok, SlotBin} = file:pread(Handle, Slot#slot_index_value.start_position, @@ -443,6 +566,48 @@ read_slot(Handle, Slot) -> crc_wonky end. 
+trim_slot({pointer, Handle, Slot}, all, all) -> + case read_slot(Handle, Slot) of + crc_wonky -> + []; + SlotBin -> + trim_slot(SlotBin, all, all) + end; +trim_slot(SlotBinary, all, all) -> + Tree = binary_to_term(SlotBinary), + gb_trees:to_list(Tree); +trim_slot({pointer, Handle, Slot}, StartKey, EndKey) -> + case read_slot(Handle, Slot) of + crc_wonky -> + []; + SlotBin -> + trim_slot(SlotBin, StartKey, EndKey) + end; +trim_slot(SlotBinary, StartKey, EndKey) -> + Tree = binary_to_term(SlotBinary), + L = gb_trees:to_list(Tree), + LTrimFun = fun({K, _V}) -> + K < StartKey end, + RTrimFun = fun({K, _V}) -> + not leveled_codec:endkey_passed(EndKey, K) end, + LTrimL = + case StartKey of + all -> + L; + _ -> + {_LDrop, RKeep} = lists:splitwith(LTrimFun, L), + RKeep + end, + RTrimL = + case EndKey of + all -> + LTrimL; + _ -> + {LKeep, _RDrop} = lists:splitwith(RTrimFun, L), + LKeep + end, + RTrimL. + generate_filenames(RootFilename) -> Ext = filename:extension(RootFilename), @@ -490,7 +655,7 @@ generate_randomkeys(Seqn, Count, Acc, BucketLow, BRange) -> BRand = random:uniform(BRange), string:right(integer_to_list(BucketLow + BRand), 4, $0) end, - KNumber = string:right(integer_to_list(random:uniform(1000)), 4, $0), + KNumber = string:right(integer_to_list(random:uniform(1000)), 6, $0), LedgerKey = leveled_codec:to_ledgerkey("Bucket" ++ BNumber, "Key" ++ KNumber, o), @@ -532,7 +697,7 @@ simple_slotbin_test() -> io:format(user, "Slot checked for all keys in ~w microseconds~n", [timer:now_diff(os:timestamp(), SW1)]), SW2 = os:timestamp(), - ?assertMatch(KVList1, all_from_slot(SlotBin0)), + ?assertMatch(KVList1, trim_slot(SlotBin0, all, all)), io:format(user, "Slot flattened in ~w microseconds~n", [timer:now_diff(os:timestamp(), SW2)]). 
@@ -572,7 +737,7 @@ simple_slotbinsummary_test() -> simple_persisted_test() -> Filename = "../test/simple_test", - KVList0 = generate_randomkeys(1, ?SLOT_SIZE * 8 + 100, 1, 4), + KVList0 = generate_randomkeys(1, ?SLOT_SIZE * 16, 1, 20), KVList1 = lists:ukeysort(1, KVList0), [{FirstKey, _FV}|_Rest] = KVList1, {LastKey, _LV} = lists:last(KVList1), @@ -587,7 +752,8 @@ simple_persisted_test() -> "Checking for ~w keys (twice) in file with cache hit took ~w " ++ "microseconds~n", [length(KVList1), timer:now_diff(os:timestamp(), SW1)]), - KVList2 = generate_randomkeys(1, ?SLOT_SIZE * 20 + 100, 1, 4), + ok = sst_printtimings(Pid), + KVList2 = generate_randomkeys(1, ?SLOT_SIZE * 16, 1, 20), MapFun = fun({K, V}, Acc) -> In = lists:keymember(K, 1, KVList1), @@ -607,6 +773,37 @@ simple_persisted_test() -> io:format(user, "Checking for ~w missing keys took ~w microseconds~n", [length(KVList3), timer:now_diff(os:timestamp(), SW2)]), + ok = sst_printtimings(Pid), + FetchList1 = sst_getkvrange(Pid, all, all, 2), + FoldFun = fun(X, Acc) -> + case X of + {pointer, P, S, SK, EK} -> + Acc ++ sst_getslots(P, [{pointer, S, SK, EK}]); + _ -> + Acc ++ [X] + end end, + FetchedList1 = lists:foldl(FoldFun, [], FetchList1), + ?assertMatch(KVList1, FetchedList1), + + {TenthKey, _v10} = lists:nth(10, KVList1), + {Three000Key, _v300} = lists:nth(300, KVList1), + io:format("Looking for 291 elements between ~s ~s and ~s ~s~n", + [element(2, TenthKey), + element(3, TenthKey), + element(2, Three000Key), + element(3, Three000Key)]), + SubKVList1 = lists:sublist(KVList1, 10, 291), + SubKVList1L = length(SubKVList1), + FetchList2 = sst_getkvrange(Pid, TenthKey, Three000Key, 2), + FetchedList2 = lists:foldl(FoldFun, [], FetchList2), + io:format("Found elements between ~s ~s and ~s ~s~n", + [element(2, element(1, lists:nth(1, FetchedList2))), + element(3, element(1, lists:nth(1, FetchedList2))), + element(2, element(1, lists:last(FetchedList2))), + element(3, element(1, lists:last(FetchedList2)))]), + 
?assertMatch(SubKVList1L, length(FetchedList2)), + ?assertMatch(SubKVList1, FetchedList2), + ok = sst_close(Pid), ok = file:delete(Filename ++ ".sst"). diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl index fb9fc5d..5428917 100644 --- a/src/leveled_tinybloom.erl +++ b/src/leveled_tinybloom.erl @@ -78,13 +78,13 @@ tiny_empty() -> tiny_enter({hash, no_lookup}, Bloom) -> Bloom; tiny_enter({hash, Hash}, Bloom) -> - {_Q, Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash), + {Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash), AddFun = fun(Bit, Arr0) -> add_to_array(Bit, Arr0, 1024) end, lists:foldl(AddFun, Bloom, [Bit0, Bit1, Bit2]). tiny_check({hash, Hash}, Bloom) -> - {_Q, Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash), + {Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash), case getbit(Bit0, Bloom, 1024) of <<0:1>> -> false; @@ -115,11 +115,10 @@ split_hash(Hash) -> split_hash_for_tinybloom(Hash) -> % Tiny bloom can make k=3 from one hash - Q = Hash band 3, - H0 = (Hash bsr 2) band 1023, - H1 = (Hash bsr 12) band 1023, + H0 = Hash band 1023, + H1 = (Hash bsr 11) band 1023, H2 = (Hash bsr 22) band 1023, - {Q, H0, H1, H2}. + {H0, H1, H2}. add_to_array(Bit, BitArray, ArrayLength) -> RestLen = ArrayLength - Bit - 1, From 3716de1c82b7d9ea2263500649b25fbc72c96dff Mon Sep 17 00:00:00 2001 From: martinsumner Date: Wed, 28 Dec 2016 15:49:58 +0000 Subject: [PATCH 18/58] Revert back to sampling timing logs should be based on a sample --- src/leveled_log.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/leveled_log.erl b/src/leveled_log.erl index 205962d..8161409 100644 --- a/src/leveled_log.erl +++ b/src/leveled_log.erl @@ -19,7 +19,7 @@ -define(GET_LOGPOINT, 160000). -define(SST_LOGPOINT, 200000). -define(LOG_LEVEL, [info, warn, error, critical]). --define(SAMPLE_RATE, 1). +-define(SAMPLE_RATE, 15). 
-define(LOGBASE, dict:from_list([ From c664483f03b0fda4c38649f380b72ff00706e09e Mon Sep 17 00:00:00 2001 From: martinsumner Date: Wed, 28 Dec 2016 21:47:05 +0000 Subject: [PATCH 19/58] Add basic merge support No generates KV list first, and then creates a new SST --- src/leveled_pclerk.erl | 13 +- src/leveled_sst.erl | 292 +++++++++++++++++++++++++++++++++++------ 2 files changed, 254 insertions(+), 51 deletions(-) diff --git a/src/leveled_pclerk.erl b/src/leveled_pclerk.erl index b5f8e3f..a0f64d9 100644 --- a/src/leveled_pclerk.erl +++ b/src/leveled_pclerk.erl @@ -308,18 +308,11 @@ do_merge(KL1, KL2, {SrcLevel, IsB}, {Filepath, MSN}, FileCounter, OutList) -> [SrcLevel + 1, FileCounter])), leveled_log:log("PC012", [MSN, FileName]), TS1 = os:timestamp(), - LevelR = case IsB of - true -> - #level{level = SrcLevel + 1, - is_basement = true, - timestamp = leveled_codec:integer_now()}; - false -> - SrcLevel + 1 - end, - {ok, Pid, Reply} = leveled_sft:sft_new(FileName, + {ok, Pid, Reply} = leveled_sst:sst_new(FileName, KL1, KL2, - LevelR), + IsB, + SrcLevel + 1), case Reply of {{[], []}, null, _} -> leveled_log:log("PC013", [FileName]), diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index fd2ffe5..877e42c 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -59,9 +59,11 @@ -include("include/leveled.hrl"). +-define(MAX_SLOTS, 256). -define(SLOT_SIZE, 128). -define(COMPRESSION_LEVEL, 1). -define(LEVEL_BLOOM_SLOTS, [{0, 64}, {1, 48}, {default, 32}]). +-define(MERGE_SCANWIDTH, 16). -define(DISCARD_EXT, ".discarded"). -include_lib("eunit/include/eunit.hrl"). @@ -76,6 +78,8 @@ reader/3]). -export([sst_new/3, + sst_new/5, + sst_newlevelzero/4, sst_open/1, sst_get/2, sst_get/3, @@ -126,27 +130,25 @@ sst_new(Filename, Level, KVList) -> {ok, Pid, {SK, EK}} end. 
-%sft_newlevelzero(Filename, Slots, FetchFun, Wait, Penciller) -> -% {ok, Pid} = gen_fsm:start(?MODULE, [], []), -% case Wait of -% true -> -% KL1 = leveled_pmem:to_list(Slots, FetchFun), -% Reply = gen_fsm:sync_send_event(Pid, -% {sft_new, -% Filename, -% 0, -% KL1}, -% infinity), -% {ok, Pid, Reply}; -% false -> -% gen_fsm:send_event(Pid, -% {sft_newlevelzero, -% Filename, -% Slots, -% FetchFun, -% Penciller}), -% {ok, Pid, noreply} -% end. +sst_new(Filename, KL1, KL2, IsBasement, Level) -> + {{Rem1, Rem2}, MergedList} = merge_lists(KL1, KL2, {IsBasement, Level}), + {ok, Pid} = gen_fsm:start(?MODULE, [], []), + case gen_fsm:sync_send_event(Pid, + {sst_new, Filename, Level, MergedList}, + infinity) of + {ok, {SK, EK}} -> + {ok, Pid, {{Rem1, Rem2}, SK, EK}} + end. + +sst_newlevelzero(Filename, Slots, FetchFun, Penciller) -> + {ok, Pid} = gen_fsm:start(?MODULE, [], []), + gen_fsm:send_event(Pid, + {sst_newlevelzero, + Filename, + Slots, + FetchFun, + Penciller}), + {ok, Pid, noreply}. sst_get(Pid, LedgerKey) -> sst_get(Pid, LedgerKey, leveled_codec:magic_hash(LedgerKey)). @@ -161,7 +163,7 @@ sst_getkvrange(Pid, StartKey, EndKey, ScanWidth) -> sst_getslots(Pid, SlotList) -> gen_fsm:sync_send_event(Pid, {get_slots, SlotList}, infinity). - + sst_close(Pid) -> gen_fsm:sync_send_event(Pid, close, 2000). @@ -196,6 +198,20 @@ starting({sst_new, Filename, Level, KVList}, _From, State) -> reader, UpdState}. 
+starting({sst_newlevelzero, Filename, Slots, FetchFun, Penciller}, State) -> + KVList = leveled_pmem:to_list(Slots, FetchFun), + {FirstKey, L, SlotIndex, AllHashes, SlotsBin} = build_all_slots(KVList), + SummaryBin = build_table_summary(SlotIndex, AllHashes, 0, FirstKey, L), + ActualFilename = write_file(Filename, SummaryBin, SlotsBin), + UpdState = read_file(ActualFilename, + State#state{filename=ActualFilename}), + Summary = UpdState#state.summary, + leveled_penciller:pcl_confirml0complete(Penciller, + UpdState#state.filename, + Summary#summary.first_key, + Summary#summary.last_key), + {next_state, reader, UpdState}. + reader({get_kv, LedgerKey, Hash}, _From, State) -> SW = os:timestamp(), {Result, Stage, SlotID} = fetch(LedgerKey, Hash, State), @@ -219,14 +235,11 @@ reader({get_kvrange, StartKey, EndKey, ScanWidth}, _From, State) -> reader, State}; reader({get_slots, SlotList}, _From, State) -> - Handle = State#state.handle, - FetchFun = - fun({pointer, S, SK, EK}, Acc) -> - Acc ++ trim_slot({pointer, Handle, S}, SK, EK) end, - {reply, - lists:foldl(FetchFun, [], SlotList), - reader, - State}; + SlotBins = read_slots(State#state.handle, SlotList), + FoldFun = + fun({SlotBin, SK, EK}, Acc) -> + Acc ++ trim_slot(SlotBin, SK, EK) end, + {reply, lists:foldl(FoldFun, [], SlotBins), reader, State}; reader(print_timings, _From, State) -> io:format(user, "Timings of ~w~n", [State#state.sst_timings]), {reply, ok, reader, State#state{sst_timings = undefined}}; @@ -566,6 +579,36 @@ read_slot(Handle, Slot) -> crc_wonky end. +read_slots(Handle, SlotList) -> + [{pointer, FirstSlot, _SK, _EK}|_Rest] = SlotList, + {pointer, LastSlot, _SK, _EK} = lists:last(SlotList), + StartPos = FirstSlot#slot_index_value.start_position, + Length = LastSlot#slot_index_value.start_position + + LastSlot#slot_index_value.length + - StartPos, + {ok, MultiSlotBin} = file:pread(Handle, StartPos, Length), + read_off_binary(MultiSlotBin, SlotList, []). 
+ +read_off_binary(<<>>, [], SplitBins) -> + SplitBins; +read_off_binary(MultiSlotBin, [TopSlot|Rest], SplitBins) -> + {pointer, Slot, SK, EK} = TopSlot, + Length = Slot#slot_index_value.length - 4, + <> = MultiSlotBin, + case erlang:crc32(SlotBin) of + SlotCRC -> + read_off_binary(RestBin, + Rest, + SplitBins ++ [{SlotBin, SK, EK}]); + _ -> + read_off_binary(RestBin, + Rest, + SplitBins ++ []) + end. + + trim_slot({pointer, Handle, Slot}, all, all) -> case read_slot(Handle, Slot) of crc_wonky -> @@ -623,6 +666,120 @@ generate_filenames(RootFilename) -> filename:join(DN, FP_NOEXT) ++ ".sst"} end. +%%%============================================================================ +%%% Merge Functions +%%%============================================================================ + +%% functions for merging two KV lists with pointers + +%% Compare the keys at the head of the list, and either skip that "best" key or +%% identify as the next key. +%% +%% The logic needs to change if the file is in the basement level, as keys with +%% expired timestamps need not be written at this level +%% +%% The best key is considered to be the lowest key in erlang term order. If +%% there are matching keys then the highest sequence number must be chosen and +%% any lower sequence numbers should be compacted out of existence + +merge_lists(KeyList1, KeyList2, LevelInfo) -> + merge_lists(KeyList1, KeyList2, LevelInfo, [], ?MAX_SLOTS * ?SLOT_SIZE). 
+ +merge_lists([], [], _LevelR, MergedList, _MaxSize) -> + {{[], []}, lists:reverse(MergedList)}; +merge_lists(Rem1, Rem2, _LevelR, MergedList, 0) -> + {{Rem1, Rem2}, lists:reverse(MergedList)}; +merge_lists(KeyList1, KeyList2, {IsBasement, TS}, MergedList, MaxSize) -> + case key_dominates(KeyList1, KeyList2, {IsBasement, TS}) of + {{next_key, TopKey}, Rem1, Rem2} -> + merge_lists(Rem1, + Rem2, + {IsBasement, TS}, + [TopKey|MergedList], + MaxSize - 1); + {skipped_key, Rem1, Rem2} -> + merge_lists(Rem1, Rem2, {IsBasement, TS}, MergedList, MaxSize) + end. + +key_dominates(KL1, KL2, Level) -> + key_dominates_expanded(maybe_expand_pointer(KL1), + maybe_expand_pointer(KL2), + Level). + +key_dominates_expanded([H1|T1], [], Level) -> + case leveled_codec:maybe_reap_expiredkey(H1, Level) of + true -> + {skipped_key, T1, []}; + false -> + {{next_key, H1}, T1, []} + end; +key_dominates_expanded([], [H2|T2], Level) -> + case leveled_codec:maybe_reap_expiredkey(H2, Level) of + true -> + {skipped_key, [], T2}; + false -> + {{next_key, H2}, [], T2} + end; +key_dominates_expanded([H1|T1], [H2|T2], Level) -> + case leveled_codec:key_dominates(H1, H2) of + left_hand_first -> + case leveled_codec:maybe_reap_expiredkey(H1, Level) of + true -> + {skipped_key, T1, [H2|T2]}; + false -> + {{next_key, H1}, T1, [H2|T2]} + end; + right_hand_first -> + case leveled_codec:maybe_reap_expiredkey(H2, Level) of + true -> + {skipped_key, [H1|T1], T2}; + false -> + {{next_key, H2}, [H1|T1], T2} + end; + left_hand_dominant -> + {skipped_key, [H1|T1], T2}; + right_hand_dominant -> + {skipped_key, T1, [H2|T2]} + end. 
+ + +%% When a list is provided it may include a pointer to gain another batch of +%% entries from the same file, or a new batch of entries from another file +%% +%% This resultant list should include the Tail of any pointers added at the +%% end of the list + +maybe_expand_pointer([]) -> + []; +maybe_expand_pointer([{pointer, SFTPid, Slot, StartKey, all}|Tail]) -> + FoldFun = + fun(X, {Pointers, Remainder}) -> + case length(Pointers) of + L when L < ?MERGE_SCANWIDTH -> + case X of + {pointer, SFTPid, S, SK, EK} -> + {Pointers ++ [{pointer, S, SK, EK}], Remainder}; + _ -> + {Pointers, Remainder ++ [X]} + end; + _ -> + {Pointers, Remainder ++ [X]} + end + end, + InitAcc = {[{pointer, Slot, StartKey, all}], []}, + {AccPointers, AccTail} = lists:foldl(FoldFun, InitAcc, Tail), + SW = os:timestamp(), + ExpPointers = sst_getslots(SFTPid, AccPointers), + leveled_log:log_timer("SFT14", [SFTPid], SW), + lists:append(ExpPointers, AccTail); +maybe_expand_pointer([{next, SFTPid, StartKey}|Tail]) -> + ExpPointer = sst_getkvrange(SFTPid, StartKey, all, ?MERGE_SCANWIDTH), + maybe_expand_pointer(ExpPointer ++ Tail); +maybe_expand_pointer(List) -> + List. + + + %%%============================================================================ %%% Test @@ -671,6 +828,53 @@ generate_randomkeys(Seqn, Count, Acc, BucketLow, BRange) -> BRange). 
+merge_test() -> + N = 3000, + KVL1 = lists:ukeysort(1, generate_randomkeys(N + 1, N, 1, 20)), + KVL2 = lists:ukeysort(1, generate_randomkeys(1, N, 1, 20)), + KVL3 = lists:ukeymerge(1, KVL1, KVL2), + SW0 = os:timestamp(), + {ok, P1, {FK1, LK1}} = sst_new("../test/level1_src", 1, KVL1), + {ok, P2, {FK2, LK2}} = sst_new("../test/level2_src", 2, KVL2), + ExpFK1 = element(1, lists:nth(1, KVL1)), + ExpLK1 = element(1, lists:last(KVL1)), + ExpFK2 = element(1, lists:nth(1, KVL2)), + ExpLK2 = element(1, lists:last(KVL2)), + ?assertMatch(ExpFK1, FK1), + ?assertMatch(ExpFK2, FK2), + ?assertMatch(ExpLK1, LK1), + ?assertMatch(ExpLK2, LK2), + ML1 = [{next, P1, FK1}], + ML2 = [{next, P2, FK2}], + {ok, P3, {{Rem1, Rem2}, FK3, LK3}} = sst_new("../test/level2_merge", + ML1, + ML2, + false, + 2), + ?assertMatch([], Rem1), + ?assertMatch([], Rem2), + ?assertMatch(true, FK3 == min(FK1, FK2)), + ?assertMatch(true, LK3 == max(LK1, LK2)), + io:format(user, + "Created and merged two files of size ~w in ~w microseconds~n", + [N, timer:now_diff(os:timestamp(), SW0)]), + + SW1 = os:timestamp(), + lists:foreach(fun({K, V}) -> + ?assertMatch({K, V}, sst_get(P3, K)) + end, + KVL3), + io:format(user, + "Checked presence of all ~w objects in ~w microseconds~n", + [length(KVL3), timer:now_diff(os:timestamp(), SW1)]), + + ok = sst_close(P1), + ok = sst_close(P2), + ok = sst_close(P3), + ok = file:delete("../test/level1_src.sst"), + ok = file:delete("../test/level2_src.sst"), + ok = file:delete("../test/level2_merge.sst"). 
+ simple_slotbin_test() -> KVList0 = generate_randomkeys(1, ?SLOT_SIZE * 2, 1, 4), KVList1 = lists:sublist(lists:ukeysort(1, KVList0), 1, ?SLOT_SIZE), @@ -703,7 +907,7 @@ simple_slotbin_test() -> simple_slotbinsummary_test() -> - KVList0 = generate_randomkeys(1, ?SLOT_SIZE * 8 + 100, 1, 4), + KVList0 = generate_randomkeys(1, ?SLOT_SIZE * 16, 1, 20), KVList1 = lists:ukeysort(1, KVList0), [{FirstKey, _V}|_Rest] = KVList1, {FirstKey, _L, SlotIndex, AllHashes, SlotsBin} = build_all_slots(KVList1), @@ -787,23 +991,29 @@ simple_persisted_test() -> {TenthKey, _v10} = lists:nth(10, KVList1), {Three000Key, _v300} = lists:nth(300, KVList1), - io:format("Looking for 291 elements between ~s ~s and ~s ~s~n", - [element(2, TenthKey), - element(3, TenthKey), - element(2, Three000Key), - element(3, Three000Key)]), SubKVList1 = lists:sublist(KVList1, 10, 291), SubKVList1L = length(SubKVList1), FetchList2 = sst_getkvrange(Pid, TenthKey, Three000Key, 2), + ?assertMatch(pointer, element(1, lists:last(FetchList2))), FetchedList2 = lists:foldl(FoldFun, [], FetchList2), - io:format("Found elements between ~s ~s and ~s ~s~n", - [element(2, element(1, lists:nth(1, FetchedList2))), - element(3, element(1, lists:nth(1, FetchedList2))), - element(2, element(1, lists:last(FetchedList2))), - element(3, element(1, lists:last(FetchedList2)))]), ?assertMatch(SubKVList1L, length(FetchedList2)), ?assertMatch(SubKVList1, FetchedList2), + {Eight000Key, _v800} = lists:nth(800, KVList1), + SubKVListA1 = lists:sublist(KVList1, 10, 791), + SubKVListA1L = length(SubKVListA1), + FetchListA2 = sst_getkvrange(Pid, TenthKey, Eight000Key, 2), + ?assertMatch(pointer, element(1, lists:last(FetchListA2))), + FetchedListA2 = lists:foldl(FoldFun, [], FetchListA2), + ?assertMatch(SubKVListA1L, length(FetchedListA2)), + ?assertMatch(SubKVListA1, FetchedListA2), + + FetchListB2 = sst_getkvrange(Pid, TenthKey, Eight000Key, 4), + ?assertMatch(pointer, element(1, lists:last(FetchListB2))), + FetchedListB2 = 
lists:foldl(FoldFun, [], FetchListB2), + ?assertMatch(SubKVListA1L, length(FetchedListB2)), + ?assertMatch(SubKVListA1, FetchedListB2), + ok = sst_close(Pid), ok = file:delete(Filename ++ ".sst"). From dc28388c76e5e0dcdbaf2c451b2299eb5864c9a8 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Thu, 29 Dec 2016 02:07:14 +0000 Subject: [PATCH 20/58] Removed SFT Now moved over to SST on this branch --- src/leveled_bookie.erl | 5 +- src/leveled_log.erl | 13 +- src/leveled_pclerk.erl | 75 +- src/leveled_penciller.erl | 286 +++--- src/leveled_sft.erl | 2024 ------------------------------------- src/leveled_skiplist.erl | 20 +- src/leveled_sst.erl | 304 +++++- 7 files changed, 481 insertions(+), 2246 deletions(-) delete mode 100644 src/leveled_sft.erl diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl index 90a0b2a..d3c3f1f 100644 --- a/src/leveled_bookie.erl +++ b/src/leveled_bookie.erl @@ -1186,7 +1186,10 @@ hashtree_query_test() -> {hashtree_query, ?STD_TAG, false}), - ?assertMatch(KeyHashList, HTFolder2()), + L0 = length(KeyHashList), + HTR2 = HTFolder2(), + ?assertMatch(L0, length(HTR2)), + ?assertMatch(KeyHashList, HTR2), ok = book_close(Bookie2), reset_filestructure(). diff --git a/src/leveled_log.erl b/src/leveled_log.erl index 8161409..8ab798f 100644 --- a/src/leveled_log.erl +++ b/src/leveled_log.erl @@ -19,7 +19,7 @@ -define(GET_LOGPOINT, 160000). -define(SST_LOGPOINT, 200000). -define(LOG_LEVEL, [info, warn, error, critical]). --define(SAMPLE_RATE, 15). +-define(SAMPLE_RATE, 16). 
-define(LOGBASE, dict:from_list([ @@ -96,7 +96,7 @@ {info, "Response to push_mem of ~w with " ++ "L0 pending ~w and merge backlog ~w"}}, {"P0019", - {info, "Rolling level zero to filename ~s"}}, + {info, "Rolling level zero to filename ~s at ledger sqn ~w"}}, {"P0020", {info, "Work at Level ~w to be scheduled for ~w with ~w " ++ "queue items outstanding at all levels"}}, @@ -238,11 +238,18 @@ {error, "False result returned from SST with filename ~s as " ++ "slot ~w has failed crc check"}}, {"SST03", - {info, "Opening SST file with filename ~s keys ~w and slots ~w"}}, + {info, "Opening SST file with filename ~s keys ~w slots ~w and" + ++ " max sqn ~w"}}, {"SST04", {info, "Exit called for reason ~w on filename ~s"}}, {"SST05", {warn, "Rename rogue filename ~s to ~s"}}, + {"SST06", + {info, "File ~s has been set for delete"}}, + {"SST07", + {info, "Exit called and now clearing ~s"}}, + {"SST08", + {info, "Completed creation of ~s at level ~w with max sqn ~w"}}, {"SFT01", {info, "Opened filename with name ~s"}}, diff --git a/src/leveled_pclerk.erl b/src/leveled_pclerk.erl index a0f64d9..2f29920 100644 --- a/src/leveled_pclerk.erl +++ b/src/leveled_pclerk.erl @@ -9,7 +9,7 @@ %% %% -------- COMMITTING MANIFEST CHANGES --------- %% -%% Once the Penciller has taken a manifest change, the SFT file owners which no +%% Once the Penciller has taken a manifest change, the SST file owners which no %% longer form part of the manifest will be marked for delete. By marking for %% deletion, the owners will poll to confirm when it is safe for them to be %% deleted. @@ -225,7 +225,7 @@ merge(WI) -> mark_for_delete([], _Penciller) -> ok; mark_for_delete([Head|Tail], Penciller) -> - ok = leveled_sft:sft_setfordelete(Head#manifest_entry.owner, Penciller), + ok = leveled_sst:sst_setfordelete(Head#manifest_entry.owner, Penciller), mark_for_delete(Tail, Penciller). 
@@ -268,13 +268,13 @@ select_filetomerge(SrcLevel, Manifest) -> -%% Assumption is that there is a single SFT from a higher level that needs -%% to be merged into multiple SFTs at a lower level. This should create an -%% entirely new set of SFTs, and the calling process can then update the +%% Assumption is that there is a single SST from a higher level that needs +%% to be merged into multiple SSTs at a lower level. This should create an +%% entirely new set of SSTs, and the calling process can then update the %% manifest. %% %% Once the FileToMerge has been emptied, the remainder of the candidate list -%% needs to be placed in a remainder SFT that may be of a sub-optimal (small) +%% needs to be placed in a remainder SST that may be of a sub-optimal (small) %% size. This stops the need to perpetually roll over the whole level if the %% level consists of already full files. Some smartness may be required when %% selecting the candidate list so that small files just outside the candidate @@ -293,18 +293,22 @@ perform_merge({SrcPid, SrcFN}, CandidateList, LevelInfo, {Filepath, MSN}) -> PointerList = lists:map(fun(P) -> {next, P#manifest_entry.owner, all} end, CandidateList), + MaxSQN = leveled_sst:sst_getmaxsequencenumber(SrcPid), do_merge([{next, SrcPid, all}], PointerList, LevelInfo, {Filepath, MSN}, + MaxSQN, 0, []). 
-do_merge([], [], {SrcLevel, _IsB}, {_Filepath, MSN}, FileCounter, OutList) -> +do_merge([], [], {SrcLevel, _IsB}, {_Filepath, MSN}, _MaxSQN, + FileCounter, OutList) -> leveled_log:log("PC011", [MSN, SrcLevel, FileCounter]), OutList; -do_merge(KL1, KL2, {SrcLevel, IsB}, {Filepath, MSN}, FileCounter, OutList) -> - FileName = lists:flatten(io_lib:format(Filepath ++ "_~w_~w.sft", +do_merge(KL1, KL2, {SrcLevel, IsB}, {Filepath, MSN}, MaxSQN, + FileCounter, OutList) -> + FileName = lists:flatten(io_lib:format(Filepath ++ "_~w_~w.sst", [SrcLevel + 1, FileCounter])), leveled_log:log("PC012", [MSN, FileName]), TS1 = os:timestamp(), @@ -312,12 +316,13 @@ do_merge(KL1, KL2, {SrcLevel, IsB}, {Filepath, MSN}, FileCounter, OutList) -> KL1, KL2, IsB, - SrcLevel + 1), + SrcLevel + 1, + MaxSQN), case Reply of {{[], []}, null, _} -> leveled_log:log("PC013", [FileName]), leveled_log:log("PC014", [FileName]), - ok = leveled_sft:sft_clear(Pid), + ok = leveled_sst:sst_clear(Pid), OutList; {{KL1Rem, KL2Rem}, SmallestKey, HighestKey} -> ExtMan = lists:append(OutList, @@ -327,7 +332,7 @@ do_merge(KL1, KL2, {SrcLevel, IsB}, {Filepath, MSN}, FileCounter, OutList) -> filename=FileName}]), leveled_log:log_timer("PC015", [], TS1), do_merge(KL1Rem, KL2Rem, - {SrcLevel, IsB}, {Filepath, MSN}, + {SrcLevel, IsB}, {Filepath, MSN}, MaxSQN, FileCounter + 1, ExtMan) end. 
@@ -377,7 +382,7 @@ find_randomkeys(FList, Count, Source) -> KV1 = lists:nth(random:uniform(length(Source)), Source), K1 = leveled_codec:strip_to_keyonly(KV1), P1 = choose_pid_toquery(FList, K1), - FoundKV = leveled_sft:sft_get(P1, K1), + FoundKV = leveled_sst:sst_get(P1, K1), Found = leveled_codec:strip_to_keyonly(FoundKV), io:format("success finding ~w in ~w~n", [K1, P1]), ?assertMatch(K1, Found), @@ -386,21 +391,31 @@ find_randomkeys(FList, Count, Source) -> merge_file_test() -> KL1_L1 = lists:sort(generate_randomkeys(8000, 0, 1000)), - {ok, PidL1_1, _} = leveled_sft:sft_new("../test/KL1_L1.sft", - KL1_L1, [], 1), + {ok, PidL1_1, _} = leveled_sst:sst_new("../test/KL1_L1.sst", + 1, + KL1_L1, + undefined), KL1_L2 = lists:sort(generate_randomkeys(8000, 0, 250)), - {ok, PidL2_1, _} = leveled_sft:sft_new("../test/KL1_L2.sft", - KL1_L2, [], 2), + {ok, PidL2_1, _} = leveled_sst:sst_new("../test/KL1_L2.sst", + 2, + KL1_L2, + undefined), KL2_L2 = lists:sort(generate_randomkeys(8000, 250, 250)), - {ok, PidL2_2, _} = leveled_sft:sft_new("../test/KL2_L2.sft", - KL2_L2, [], 2), + {ok, PidL2_2, _} = leveled_sst:sst_new("../test/KL2_L2.sst", + 2, + KL2_L2, + undefined), KL3_L2 = lists:sort(generate_randomkeys(8000, 500, 250)), - {ok, PidL2_3, _} = leveled_sft:sft_new("../test/KL3_L2.sft", - KL3_L2, [], 2), + {ok, PidL2_3, _} = leveled_sst:sst_new("../test/KL3_L2.sst", + 2, + KL3_L2, + undefined), KL4_L2 = lists:sort(generate_randomkeys(8000, 750, 250)), - {ok, PidL2_4, _} = leveled_sft:sft_new("../test/KL4_L2.sft", - KL4_L2, [], 2), - Result = perform_merge({PidL1_1, "../test/KL1_L1.sft"}, + {ok, PidL2_4, _} = leveled_sst:sst_new("../test/KL4_L2.sst", + 2, + KL4_L2, + undefined), + Result = perform_merge({PidL1_1, "../test/KL1_L1.sst"}, [#manifest_entry{owner=PidL2_1}, #manifest_entry{owner=PidL2_2}, #manifest_entry{owner=PidL2_3}, @@ -422,13 +437,13 @@ merge_file_test() -> ok = find_randomkeys(Result, 50, KL3_L2), io:format("Finding keys in KL4_L2~n"), ok = 
find_randomkeys(Result, 50, KL4_L2), - leveled_sft:sft_clear(PidL1_1), - leveled_sft:sft_clear(PidL2_1), - leveled_sft:sft_clear(PidL2_2), - leveled_sft:sft_clear(PidL2_3), - leveled_sft:sft_clear(PidL2_4), + leveled_sst:sst_clear(PidL1_1), + leveled_sst:sst_clear(PidL2_1), + leveled_sst:sst_clear(PidL2_2), + leveled_sst:sst_clear(PidL2_3), + leveled_sst:sst_clear(PidL2_4), lists:foreach(fun(ManEntry) -> - leveled_sft:sft_clear(ManEntry#manifest_entry.owner) end, + leveled_sst:sst_clear(ManEntry#manifest_entry.owner) end, Result). select_merge_candidates_test() -> diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl index 0de9b2b..7f36325 100644 --- a/src/leveled_penciller.erl +++ b/src/leveled_penciller.erl @@ -22,17 +22,17 @@ %% %% The Ledger is divided into many levels %% - L0: New keys are received from the Bookie and merged into a single -%% gb_tree, until that tree is the size of a SFT file, and it is then persisted -%% as a SFT file at this level. L0 SFT files can be larger than the normal +%% gb_tree, until that tree is the size of a SST file, and it is then persisted +%% as a SST file at this level. L0 SST files can be larger than the normal %% maximum size - so we don't have to consider problems of either having more %% than one L0 file (and handling what happens on a crash between writing the %% files when the second may have overlapping sequence numbers), or having a %% remainder with overlapping in sequence numbers in memory after the file is %% written. Once the persistence is completed, the L0 tree can be erased. -%% There can be only one SFT file at Level 0, so the work to merge that file +%% There can be only one SST file at Level 0, so the work to merge that file %% to the lower level must be the highest priority, as otherwise writes to the %% ledger will stall, when there is next a need to persist. 
-%% - L1 TO L7: May contain multiple processes managing non-overlapping sft +%% - L1 TO L7: May contain multiple processes managing non-overlapping SST %% files. Compaction work should be sheduled if the number of files exceeds %% the target size of the level, where the target size is 8 ^ n. %% @@ -67,14 +67,14 @@ %% completed to merge the tree into the L0 tree. %% %% The Penciller MUST NOT accept a new PUSH if the Clerk has commenced the -%% conversion of the current L0 tree into a SFT file, but not completed this +%% conversion of the current L0 tree into a SST file, but not completed this %% change. The Penciller in this case returns the push, and the Bookie should %% continue to grow the cache before trying again. %% %% ---------- FETCH ---------- %% %% On request to fetch a key the Penciller should look first in the in-memory -%% L0 tree, then look in the SFT files Level by Level (including level 0), +%% L0 tree, then look in the SST files Level by Level (including level 0), %% consulting the Manifest to determine which file should be checked at each %% level. %% @@ -82,16 +82,16 @@ %% %% Iterators may request a snapshot of the database. A snapshot is a cloned %% Penciller seeded not from disk, but by the in-memory L0 gb_tree and the -%% in-memory manifest, allowing for direct reference for the SFT file processes. +%% in-memory manifest, allowing for direct reference for the SST file processes. %% %% Clones formed to support snapshots are registered by the Penciller, so that -%% SFT files valid at the point of the snapshot until either the iterator is +%% SST files valid at the point of the snapshot until either the iterator is %% completed or has timed out. %% %% ---------- ON STARTUP ---------- %% %% On Startup the Bookie with ask the Penciller to initiate the Ledger first. 
-%% To initiate the Ledger the must consult the manifest, and then start a SFT +%% To initiate the Ledger the must consult the manifest, and then start a SST %% management process for each file in the manifest. %% %% The penciller should then try and read any Level 0 file which has the @@ -103,14 +103,14 @@ %% ---------- ON SHUTDOWN ---------- %% %% On a controlled shutdown the Penciller should attempt to write any in-memory -%% ETS table to a L0 SFT file, assuming one is nto already pending. If one is +%% ETS table to a L0 SST file, assuming one is nto already pending. If one is %% already pending then the Penciller will not persist this part of the Ledger. %% %% ---------- FOLDER STRUCTURE ---------- %% %% The following folders are used by the Penciller %% $ROOT/ledger/ledger_manifest/ - used for keeping manifest files -%% $ROOT/ledger/ledger_files/ - containing individual SFT files +%% $ROOT/ledger/ledger_files/ - containing individual SST files %% %% In larger stores there could be a large number of files in the ledger_file %% folder - perhaps o(1000). It is assumed that modern file systems should @@ -120,7 +120,7 @@ %% %% The Penciller can have one and only one Clerk for performing compaction %% work. When the Clerk has requested and taken work, it should perform the -%5 compaction work starting the new SFT process to manage the new Ledger state +%5 compaction work starting the new SST process to manage the new Ledger state %% and then write a new manifest file that represents that state with using %% the next Manifest sequence number as the filename: %% - nonzero_.pnd @@ -130,14 +130,14 @@ %% %% On startup, the Penciller should look for the nonzero_*.crr file with the %% highest such manifest sequence number. This will be started as the -%% manifest, together with any _0_0.sft file found at that Manifest SQN. +%% manifest, together with any _0_0.sst file found at that Manifest SQN. 
%% Level zero files are not kept in the persisted manifest, and adding a L0 %% file does not advanced the Manifest SQN. %% %% The pace at which the store can accept updates will be dependent on the %% speed at which the Penciller's Clerk can merge files at lower levels plus %% the time it takes to merge from Level 0. As if a clerk has commenced -%% compaction work at a lower level and then immediately a L0 SFT file is +%% compaction work at a lower level and then immediately a L0 SST file is %% written the Penciller will need to wait for this compaction work to %% complete and the L0 file to be compacted before the ETS table can be %% allowed to again reach capacity @@ -145,7 +145,7 @@ %% The writing of L0 files do not require the involvement of the clerk. %% The L0 files are prompted directly by the penciller when the in-memory tree %% has reached capacity. This places the penciller in a levelzero_pending -%% state, and in this state it must return new pushes. Once the SFT file has +%% state, and in this state it must return new pushes. Once the SST file has %% been completed it will confirm completion to the penciller which can then %% revert the levelzero_pending state, add the file to the manifest and clear %% the current level zero in-memory view. 
@@ -399,10 +399,11 @@ handle_call({fetch_keys, StartKey, EndKey, AccFun, InitAcc, MaxKeys}, List -> List end, - SFTiter = initiate_rangequery_frommanifest(StartKey, + SSTiter = initiate_rangequery_frommanifest(StartKey, EndKey, State#state.manifest), - Acc = keyfolder({L0AsList, SFTiter}, + io:format("SSTiter on query ~w~n", [SSTiter]), + Acc = keyfolder({L0AsList, SSTiter}, {StartKey, EndKey}, {AccFun, InitAcc}, MaxKeys), @@ -456,7 +457,7 @@ handle_cast({confirm_delete, FileName}, State=#state{is_snapshot=Snap}) {true, Pid} -> UF1 = lists:keydelete(FileName, 1, State#state.unreferenced_files), leveled_log:log("P0005", [FileName]), - ok = leveled_sft:sft_deleteconfirmed(Pid), + ok = leveled_sst:sst_deleteconfirmed(Pid), {noreply, State#state{unreferenced_files=UF1}}; _ -> {noreply, State} @@ -525,7 +526,7 @@ terminate(Reason, State) -> leveled_log:log("P0009", []); {false, [], _N} -> L0Pid = roll_memory(UpdState, true), - ok = leveled_sft:sft_close(L0Pid); + ok = leveled_sst:sst_close(L0Pid); StatusTuple -> leveled_log:log("P0010", [StatusTuple]) end, @@ -533,7 +534,7 @@ terminate(Reason, State) -> % Tidy shutdown of individual files ok = close_files(0, UpdState#state.manifest), lists:foreach(fun({_FN, Pid, _SN}) -> - ok = leveled_sft:sft_close(Pid) end, + ok = leveled_sst:sst_close(Pid) end, UpdState#state.unreferenced_files), leveled_log:log("P0011", []), ok. 
@@ -608,14 +609,14 @@ start_from_file(PCLopts) -> leveled_log:log("P0014", [MaxSQN]), %% Find any L0 files - L0FN = filepath(RootPath, TopManSQN, new_merge_files) ++ "_0_0.sft", + L0FN = filepath(RootPath, TopManSQN, new_merge_files) ++ "_0_0.sst", case filelib:is_file(L0FN) of true -> leveled_log:log("P0015", [L0FN]), {ok, L0Pid, - {L0StartKey, L0EndKey}} = leveled_sft:sft_open(L0FN), - L0SQN = leveled_sft:sft_getmaxsequencenumber(L0Pid), + {L0StartKey, L0EndKey}} = leveled_sst:sst_open(L0FN), + L0SQN = leveled_sst:sst_getmaxsequencenumber(L0Pid), ManifestEntry = #manifest_entry{start_key=L0StartKey, end_key=L0EndKey, owner=L0Pid, @@ -696,7 +697,7 @@ update_levelzero(L0Size, {PushedTree, MinSQN, MaxSQN}, %% to an immediate return as expected. With 32K keys in the TreeList it could %% take around 35-40ms. %% -%% To avoid blocking this gen_server, the SFT file can request each item of the +%% To avoid blocking this gen_server, the SST file can request each item of the %% cache one at a time. 
%% %% The Wait is set to false to use a cast when calling this in normal operation @@ -704,25 +705,22 @@ update_levelzero(L0Size, {PushedTree, MinSQN, MaxSQN}, roll_memory(State, false) -> FileName = levelzero_filename(State), - leveled_log:log("P0019", [FileName]), - Opts = #sft_options{wait=false, penciller=self()}, + leveled_log:log("P0019", [FileName, State#state.ledger_sqn]), PCL = self(), FetchFun = fun(Slot) -> pcl_fetchlevelzero(PCL, Slot) end, - % FetchFun = fun(Slot) -> lists:nth(Slot, State#state.levelzero_cache) end, - R = leveled_sft:sft_newfroml0cache(FileName, + R = leveled_sst:sst_newlevelzero(FileName, length(State#state.levelzero_cache), FetchFun, - Opts), + PCL, + State#state.ledger_sqn), {ok, Constructor, _} = R, Constructor; roll_memory(State, true) -> FileName = levelzero_filename(State), - Opts = #sft_options{wait=true}, FetchFun = fun(Slot) -> lists:nth(Slot, State#state.levelzero_cache) end, - R = leveled_sft:sft_newfroml0cache(FileName, - length(State#state.levelzero_cache), - FetchFun, - Opts), + KVList = leveled_pmem:to_list(length(State#state.levelzero_cache), + FetchFun), + R = leveled_sst:sst_new(FileName, 0, KVList, State#state.ledger_sqn), {ok, Constructor, _} = R, Constructor. @@ -753,7 +751,7 @@ fetch_mem(Key, Hash, Manifest, L0Cache, none) -> L0Check = leveled_pmem:check_levelzero(Key, Hash, L0Cache), case L0Check of {false, not_found} -> - fetch(Key, Hash, Manifest, 0, fun timed_sft_get/3); + fetch(Key, Hash, Manifest, 0, fun timed_sst_get/3); {true, KV} -> {KV, 0} end; @@ -762,7 +760,7 @@ fetch_mem(Key, Hash, Manifest, L0Cache, L0Index) -> true -> fetch_mem(Key, Hash, Manifest, L0Cache, none); false -> - fetch(Key, Hash, Manifest, 0, fun timed_sft_get/3) + fetch(Key, Hash, Manifest, 0, fun timed_sst_get/3) end. fetch(_Key, _Hash, _Manifest, ?MAX_LEVELS + 1, _FetchFun) -> @@ -791,9 +789,9 @@ fetch(Key, Hash, Manifest, Level, FetchFun) -> end end. 
-timed_sft_get(PID, Key, Hash) -> +timed_sst_get(PID, Key, Hash) -> SW = os:timestamp(), - R = leveled_sft:sft_get(PID, Key, Hash), + R = leveled_sst:sst_get(PID, Key, Hash), T0 = timer:now_diff(os:timestamp(), SW), case {T0, R} of {T, R} when T < ?SLOW_FETCH -> @@ -880,7 +878,7 @@ close_files(?MAX_LEVELS - 1, _Manifest) -> close_files(Level, Manifest) -> LevelList = get_item(Level, Manifest, []), lists:foreach(fun(F) -> - ok = leveled_sft:sft_close(F#manifest_entry.owner) end, + ok = leveled_sst:sst_close(F#manifest_entry.owner) end, LevelList), close_files(Level + 1, Manifest). @@ -897,8 +895,8 @@ open_all_filesinmanifest({Manifest, TopSQN}, Level) -> %5 replace them LvlR = lists:foldl(fun(F, {FL, FL_SQN}) -> FN = F#manifest_entry.filename, - {ok, P, _Keys} = leveled_sft:sft_open(FN), - F_SQN = leveled_sft:sft_getmaxsequencenumber(P), + {ok, P, _Keys} = leveled_sst:sst_open(FN), + F_SQN = leveled_sst:sst_getmaxsequencenumber(P), {lists:append(FL, [F#manifest_entry{owner = P}]), max(FL_SQN, F_SQN)} @@ -932,24 +930,24 @@ initiate_rangequery_frommanifest(StartKey, EndKey, Manifest) -> C2 = leveled_codec:endkey_passed(EndKey, M#manifest_entry.start_key), not (C1 or C2) end, - lists:foldl(fun(L, AccL) -> - Level = get_item(L, Manifest, []), - FL = lists:foldl(fun(M, Acc) -> - case CompareFun(M) of - true -> - Acc ++ [{next_file, M}]; - false -> - Acc - end end, - [], - Level), - case FL of - [] -> AccL; - FL -> AccL ++ [{L, FL}] - end - end, - [], - lists:seq(0, ?MAX_LEVELS - 1)). + FoldFun = + fun(L, AccL) -> + Level = get_item(L, Manifest, []), + FL = lists:foldl(fun(M, Acc) -> + case CompareFun(M) of + true -> + Acc ++ [{next, M, StartKey}]; + false -> + Acc + end end, + [], + Level), + case FL of + [] -> AccL; + FL -> AccL ++ [{L, FL}] + end + end, + lists:foldl(FoldFun, [], lists:seq(0, ?MAX_LEVELS - 1)). 
%% Looks to find the best choice for the next key across the levels (other %% than in-memory table) @@ -960,22 +958,25 @@ find_nextkey(QueryArray, StartKey, EndKey) -> find_nextkey(QueryArray, 0, {null, null}, - {fun leveled_sft:sft_getkvrange/4, StartKey, EndKey, 1}). + StartKey, + EndKey, + 1). -find_nextkey(_QueryArray, LCnt, {null, null}, _QueryFunT) +find_nextkey(_QueryArray, LCnt, {null, null}, _StartKey, _EndKey, _Width) when LCnt > ?MAX_LEVELS -> % The array has been scanned wihtout finding a best key - must be % exhausted - respond to indicate no more keys to be found by the % iterator no_more_keys; -find_nextkey(QueryArray, LCnt, {BKL, BestKV}, _QueryFunT) +find_nextkey(QueryArray, LCnt, {BKL, BestKV}, _StartKey, _EndKey, _Width) when LCnt > ?MAX_LEVELS -> % All levels have been scanned, so need to remove the best result from % the array, and return that array along with the best key/sqn/status % combination {BKL, [BestKV|Tail]} = lists:keyfind(BKL, 1, QueryArray), {lists:keyreplace(BKL, 1, QueryArray, {BKL, Tail}), BestKV}; -find_nextkey(QueryArray, LCnt, {BestKeyLevel, BestKV}, QueryFunT) -> +find_nextkey(QueryArray, LCnt, {BestKeyLevel, BestKV}, + StartKey, EndKey, Width) -> % Get the next key at this level {NextKey, RestOfKeys} = case lists:keyfind(LCnt, 1, QueryArray) of false -> @@ -989,39 +990,46 @@ find_nextkey(QueryArray, LCnt, {BestKeyLevel, BestKV}, QueryFunT) -> case {NextKey, BestKeyLevel, BestKV} of {null, BKL, BKV} -> % There is no key at this level - go to the next level - find_nextkey(QueryArray, LCnt + 1, {BKL, BKV}, QueryFunT); - {{next_file, ManifestEntry}, BKL, BKV} -> + find_nextkey(QueryArray, + LCnt + 1, + {BKL, BKV}, + StartKey, EndKey, Width); + {{next, ManifestEntry, _SK}, BKL, BKV} -> % The first key at this level is pointer to a file - need to query % the file to expand this level out before proceeding Owner = ManifestEntry#manifest_entry.owner, - {QueryFun, StartKey, EndKey, ScanSize} = QueryFunT, - QueryResult = 
QueryFun(Owner, StartKey, EndKey, ScanSize), - NewEntry = {LCnt, QueryResult ++ RestOfKeys}, + Pointer = {next, Owner, StartKey, EndKey}, + UpdList = leveled_sst:expand_list_by_pointer(Pointer, + RestOfKeys, + Width), + NewEntry = {LCnt, UpdList}, % Need to loop around at this level (LCnt) as we have not yet % examined a real key at this level find_nextkey(lists:keyreplace(LCnt, 1, QueryArray, NewEntry), LCnt, {BKL, BKV}, - QueryFunT); - {{next, SFTpid, NewStartKey}, BKL, BKV} -> + StartKey, EndKey, Width); + {{pointer, SSTPid, Slot, PSK, PEK}, BKL, BKV} -> % The first key at this level is pointer within a file - need to % query the file to expand this level out before proceeding - {QueryFun, _StartKey, EndKey, ScanSize} = QueryFunT, - QueryResult = QueryFun(SFTpid, NewStartKey, EndKey, ScanSize), - NewEntry = {LCnt, QueryResult ++ RestOfKeys}, + Pointer = {pointer, SSTPid, Slot, PSK, PEK}, + UpdList = leveled_sst:expand_list_by_pointer(Pointer, + RestOfKeys, + Width), + NewEntry = {LCnt, UpdList}, % Need to loop around at this level (LCnt) as we have not yet % examined a real key at this level find_nextkey(lists:keyreplace(LCnt, 1, QueryArray, NewEntry), LCnt, {BKL, BKV}, - QueryFunT); + StartKey, EndKey, Width); {{Key, Val}, null, null} -> % No best key set - so can assume that this key is the best key, % and check the lower levels find_nextkey(QueryArray, LCnt + 1, {LCnt, {Key, Val}}, - QueryFunT); + StartKey, EndKey, Width); {{Key, Val}, _BKL, {BestKey, _BestVal}} when Key < BestKey -> % There is a real key and a best key to compare, and the real key % at this level is before the best key, and so is now the new best @@ -1030,7 +1038,7 @@ find_nextkey(QueryArray, LCnt, {BestKeyLevel, BestKV}, QueryFunT) -> find_nextkey(QueryArray, LCnt + 1, {LCnt, {Key, Val}}, - QueryFunT); + StartKey, EndKey, Width); {{Key, Val}, BKL, {BestKey, BestVal}} when Key == BestKey -> SQN = leveled_codec:strip_to_seqonly({Key, Val}), BestSQN = leveled_codec:strip_to_seqonly({BestKey, 
BestVal}), @@ -1041,7 +1049,7 @@ find_nextkey(QueryArray, LCnt, {BestKeyLevel, BestKV}, QueryFunT) -> find_nextkey(lists:keyreplace(LCnt, 1, QueryArray, NewEntry), LCnt + 1, {BKL, {BestKey, BestVal}}, - QueryFunT); + StartKey, EndKey, Width); SQN > BestSQN -> % There is a real key at the front of this level and it has % a higher SQN than the best key, so we should use this as @@ -1056,29 +1064,32 @@ find_nextkey(QueryArray, LCnt, {BestKeyLevel, BestKV}, QueryFunT) -> {BKL, BestTail}), LCnt + 1, {LCnt, {Key, Val}}, - QueryFunT) + StartKey, EndKey, Width) end; {_, BKL, BKV} -> % This is not the best key - find_nextkey(QueryArray, LCnt + 1, {BKL, BKV}, QueryFunT) + find_nextkey(QueryArray, + LCnt + 1, + {BKL, BKV}, + StartKey, EndKey, Width) end. -keyfolder(IMMiter, SFTiter, StartKey, EndKey, {AccFun, Acc}) -> - keyfolder({IMMiter, SFTiter}, {StartKey, EndKey}, {AccFun, Acc}, -1). +keyfolder(IMMiter, SSTiter, StartKey, EndKey, {AccFun, Acc}) -> + keyfolder({IMMiter, SSTiter}, {StartKey, EndKey}, {AccFun, Acc}, -1). 
keyfolder(_Iterators, _KeyRange, {_AccFun, Acc}, MaxKeys) when MaxKeys == 0 -> Acc; -keyfolder({[], SFTiter}, KeyRange, {AccFun, Acc}, MaxKeys) -> +keyfolder({[], SSTiter}, KeyRange, {AccFun, Acc}, MaxKeys) -> {StartKey, EndKey} = KeyRange, - case find_nextkey(SFTiter, StartKey, EndKey) of + case find_nextkey(SSTiter, StartKey, EndKey) of no_more_keys -> Acc; - {NxSFTiter, {SFTKey, SFTVal}} -> - Acc1 = AccFun(SFTKey, SFTVal, Acc), - keyfolder({[], NxSFTiter}, KeyRange, {AccFun, Acc1}, MaxKeys - 1) + {NxSSTiter, {SSTKey, SSTVal}} -> + Acc1 = AccFun(SSTKey, SSTVal, Acc), + keyfolder({[], NxSSTiter}, KeyRange, {AccFun, Acc1}, MaxKeys - 1) end; -keyfolder({[{IMMKey, IMMVal}|NxIMMiterator], SFTiterator}, KeyRange, +keyfolder({[{IMMKey, IMMVal}|NxIMMiterator], SSTiterator}, KeyRange, {AccFun, Acc}, MaxKeys) -> {StartKey, EndKey} = KeyRange, case {IMMKey < StartKey, leveled_codec:endkey_passed(EndKey, IMMKey)} of @@ -1087,7 +1098,7 @@ keyfolder({[{IMMKey, IMMVal}|NxIMMiterator], SFTiterator}, KeyRange, % Normally everything is pre-filterd, but the IMM iterator can % be re-used and so may be behind the StartKey if the StartKey has % advanced from the previous use - keyfolder({NxIMMiterator, SFTiterator}, + keyfolder({NxIMMiterator, SSTiterator}, KeyRange, {AccFun, Acc}, MaxKeys); @@ -1095,44 +1106,44 @@ keyfolder({[{IMMKey, IMMVal}|NxIMMiterator], SFTiterator}, KeyRange, % There are no more keys in-range in the in-memory % iterator, so take action as if this iterator is empty % (see above) - keyfolder({[], SFTiterator}, + keyfolder({[], SSTiterator}, KeyRange, {AccFun, Acc}, MaxKeys); {false, false} -> - case find_nextkey(SFTiterator, StartKey, EndKey) of + case find_nextkey(SSTiterator, StartKey, EndKey) of no_more_keys -> % No more keys in range in the persisted store, so use the % in-memory KV as the next Acc1 = AccFun(IMMKey, IMMVal, Acc), - keyfolder({NxIMMiterator, SFTiterator}, + keyfolder({NxIMMiterator, SSTiterator}, KeyRange, {AccFun, Acc1}, MaxKeys - 1); - 
{NxSFTiterator, {SFTKey, SFTVal}} -> + {NxSSTiterator, {SSTKey, SSTVal}} -> % There is a next key, so need to know which is the % next key between the two (and handle two keys % with different sequence numbers). case leveled_codec:key_dominates({IMMKey, IMMVal}, - {SFTKey, - SFTVal}) of + {SSTKey, + SSTVal}) of left_hand_first -> Acc1 = AccFun(IMMKey, IMMVal, Acc), - keyfolder({NxIMMiterator, SFTiterator}, + keyfolder({NxIMMiterator, SSTiterator}, KeyRange, {AccFun, Acc1}, MaxKeys - 1); right_hand_first -> - Acc1 = AccFun(SFTKey, SFTVal, Acc), + Acc1 = AccFun(SSTKey, SSTVal, Acc), keyfolder({[{IMMKey, IMMVal}|NxIMMiterator], - NxSFTiterator}, + NxSSTiterator}, KeyRange, {AccFun, Acc1}, MaxKeys - 1); left_hand_dominant -> Acc1 = AccFun(IMMKey, IMMVal, Acc), - keyfolder({NxIMMiterator, NxSFTiterator}, + keyfolder({NxIMMiterator, NxSSTiterator}, KeyRange, {AccFun, Acc1}, MaxKeys - 1) @@ -1286,6 +1297,27 @@ confirm_delete(Filename, UnreferencedFiles, RegisteredSnapshots) -> -ifdef(TEST). + +generate_randomkeys({Count, StartSQN}) -> + generate_randomkeys(Count, StartSQN, []); +generate_randomkeys(Count) -> + generate_randomkeys(Count, 0, []). + +generate_randomkeys(0, _SQN, Acc) -> + lists:reverse(Acc); +generate_randomkeys(Count, SQN, Acc) -> + K = {o, + lists:concat(["Bucket", random:uniform(1024)]), + lists:concat(["Key", random:uniform(1024)]), + null}, + RandKey = {K, + {SQN, + {active, infinity}, + leveled_codec:magic_hash(K), + null}}, + generate_randomkeys(Count - 1, SQN + 1, [RandKey|Acc]). + + clean_testdir(RootPath) -> clean_subdir(filepath(RootPath, manifest)), clean_subdir(filepath(RootPath, files)). @@ -1332,8 +1364,8 @@ compaction_work_assessment_test() -> ?assertMatch([{1, Manifest3, 1}], WorkQ3). 
confirm_delete_test() -> - Filename = 'test.sft', - UnreferencedFiles = [{'other.sft', dummy_owner, 15}, + Filename = 'test.sst', + UnreferencedFiles = [{'other.sst', dummy_owner, 15}, {Filename, dummy_owner, 10}], RegisteredIterators1 = [{dummy_pid, 16}, {dummy_pid, 12}], R1 = confirm_delete(Filename, UnreferencedFiles, RegisteredIterators1), @@ -1376,20 +1408,20 @@ simple_server_test() -> Key1_Pre = {{o,"Bucket0001", "Key0001", null}, {1, {active, infinity}, null}}, Key1 = add_missing_hash(Key1_Pre), - KL1 = leveled_sft:generate_randomkeys({1000, 2}), + KL1 = generate_randomkeys({1000, 2}), Key2_Pre = {{o,"Bucket0002", "Key0002", null}, {1002, {active, infinity}, null}}, Key2 = add_missing_hash(Key2_Pre), - KL2 = leveled_sft:generate_randomkeys({900, 1003}), + KL2 = generate_randomkeys({900, 1003}), % Keep below the max table size by having 900 not 1000 Key3_Pre = {{o,"Bucket0003", "Key0003", null}, {2003, {active, infinity}, null}}, Key3 = add_missing_hash(Key3_Pre), - KL3 = leveled_sft:generate_randomkeys({1000, 2004}), + KL3 = generate_randomkeys({1000, 2004}), Key4_Pre = {{o,"Bucket0004", "Key0004", null}, {3004, {active, infinity}, null}}, Key4 = add_missing_hash(Key4_Pre), - KL4 = leveled_sft:generate_randomkeys({1000, 3005}), + KL4 = generate_randomkeys({1000, 3005}), ok = maybe_pause_push(PCL, [Key1]), ?assertMatch(Key1, pcl_fetch(PCL, {o,"Bucket0001", "Key0001", null})), ok = maybe_pause_push(PCL, KL1), @@ -1464,7 +1496,7 @@ simple_server_test() -> Key1A_Pre = {{o,"Bucket0001", "Key0001", null}, {4005, {active, infinity}, null}}, Key1A = add_missing_hash(Key1A_Pre), - KL1A = leveled_sft:generate_randomkeys({2000, 4006}), + KL1A = generate_randomkeys({2000, 4006}), ok = maybe_pause_push(PCLr, [Key1A]), ok = maybe_pause_push(PCLr, KL1A), ?assertMatch(true, pcl_checksequencenumber(PclSnap, @@ -1528,17 +1560,16 @@ rangequery_manifest_test() -> end_key={o, "Bucket1", "K996", null}, filename="Z6"}}, Man = [{1, [E1, E2, E3]}, {2, [E4, E5, E6]}], - R1 = 
initiate_rangequery_frommanifest({o, "Bucket1", "K711", null}, - {o, "Bucket1", "K999", null}, - Man), - ?assertMatch([{1, [{next_file, E3}]}, - {2, [{next_file, E5}, {next_file, E6}]}], + SK1 = {o, "Bucket1", "K711", null}, + EK1 = {o, "Bucket1", "K999", null}, + R1 = initiate_rangequery_frommanifest(SK1, EK1, Man), + ?assertMatch([{1, [{next, E3, SK1}]}, + {2, [{next, E5, SK1}, {next, E6, SK1}]}], R1), - R2 = initiate_rangequery_frommanifest({i, "Bucket1", {"Idx1", "Fld8"}, null}, - {i, "Bucket1", {"Idx1", "Fld8"}, null}, - Man), - ?assertMatch([{1, [{next_file, E1}]}, {2, [{next_file, E5}]}], - R2), + SK2 = {i, "Bucket1", {"Idx1", "Fld8"}, null}, + EK2 = {i, "Bucket1", {"Idx1", "Fld8"}, null}, + R2 = initiate_rangequery_frommanifest(SK2, EK2, Man), + ?assertMatch([{1, [{next, E1, SK2}]}, {2, [{next, E5, SK2}]}], R2), R3 = initiate_rangequery_frommanifest({i, "Bucket1", {"Idx0", "Fld8"}, null}, {i, "Bucket1", {"Idx0", "Fld9"}, null}, Man), @@ -1693,17 +1724,18 @@ foldwithimm_simple_test() -> {{o, "Bucket1", "Key6"}, 7}], AccB). 
create_file_test() -> - Filename = "../test/new_file.sft", + Filename = "../test/new_file.sst", ok = file:write_file(Filename, term_to_binary("hello")), - KVL = lists:usort(leveled_sft:generate_randomkeys(10000)), + KVL = lists:usort(generate_randomkeys(10000)), Tree = leveled_skiplist:from_list(KVL), FetchFun = fun(Slot) -> lists:nth(Slot, [Tree]) end, {ok, SP, - noreply} = leveled_sft:sft_newfroml0cache(Filename, + noreply} = leveled_sst:sst_newlevelzero(Filename, 1, FetchFun, - #sft_options{wait=false}), + undefined, + 10000), lists:foreach(fun(X) -> case checkready(SP) of timeout -> @@ -1716,9 +1748,9 @@ create_file_test() -> io:format("StartKey ~w EndKey ~w~n", [StartKey, EndKey]), ?assertMatch({o, _, _, _}, StartKey), ?assertMatch({o, _, _, _}, EndKey), - ?assertMatch("../test/new_file.sft", SrcFN), - ok = leveled_sft:sft_clear(SP), - {ok, Bin} = file:read_file("../test/new_file.sft.discarded"), + ?assertMatch("../test/new_file.sst", SrcFN), + ok = leveled_sst:sst_clear(SP), + {ok, Bin} = file:read_file("../test/new_file.sst.discarded"), ?assertMatch("hello", binary_to_term(Bin)). 
commit_manifest_test() -> @@ -1735,14 +1767,14 @@ commit_manifest_test() -> ok = file:write_file(ManifestFP ++ "nonzero_1.pnd", term_to_binary("dummy data")), - L1_0 = [{1, [#manifest_entry{filename="1.sft"}]}], + L1_0 = [{1, [#manifest_entry{filename="1.sst"}]}], Resp_WI0 = Resp_WI#penciller_work{new_manifest=L1_0, unreferenced_files=[]}, {ok, State0} = commit_manifest_change(Resp_WI0, State), ?assertMatch(1, State0#state.manifest_sqn), ?assertMatch([], get_item(0, State0#state.manifest, [])), - L0Entry = [#manifest_entry{filename="0.sft"}], + L0Entry = [#manifest_entry{filename="0.sst"}], ManifestPlus = [{0, L0Entry}|State0#state.manifest], NxtSent_WI = #penciller_work{next_sqn=2, @@ -1756,7 +1788,7 @@ commit_manifest_test() -> ok = file:write_file(ManifestFP ++ "nonzero_2.pnd", term_to_binary("dummy data")), - L2_0 = [#manifest_entry{filename="2.sft"}], + L2_0 = [#manifest_entry{filename="2.sst"}], NxtResp_WI0 = NxtResp_WI#penciller_work{new_manifest=[{2, L2_0}], unreferenced_files=[]}, {ok, State2} = commit_manifest_change(NxtResp_WI0, State1), @@ -1777,7 +1809,7 @@ badmanifest_test() -> Key1_pre = {{o,"Bucket0001", "Key0001", null}, {1001, {active, infinity}, null}}, Key1 = add_missing_hash(Key1_pre), - KL1 = leveled_sft:generate_randomkeys({1000, 1}), + KL1 = generate_randomkeys({1000, 1}), ok = maybe_pause_push(PCL, KL1 ++ [Key1]), %% Added together, as split apart there will be a race between the close @@ -1798,7 +1830,7 @@ badmanifest_test() -> checkready(Pid) -> try - leveled_sft:sft_checkready(Pid) + leveled_sst:sst_checkready(Pid) catch exit:{timeout, _} -> timeout diff --git a/src/leveled_sft.erl b/src/leveled_sft.erl deleted file mode 100644 index e736a47..0000000 --- a/src/leveled_sft.erl +++ /dev/null @@ -1,2024 +0,0 @@ -%% This module provides functions for managing sft files - a modified version -%% of sst files, to be used in leveleddb. 
-%% -%% sft files are segment filtered tables in that they are guarded by a quick -%% access filter that checks for the presence of key by segment id, with the -%% segment id being a hash in the range 0 - 1024 * 1024 -%% -%% This filter has a dual purpose -%% - a memory efficient way of discovering non-presence with low false positive -%% rate -%% - to make searching for all keys by hashtree segment more efficient (a -%% specific change to optimise behaviour for use with the incremental refresh) -%% of riak hashtrees -%% -%% All keys are not equal in sft files, keys are only expected in a specific -%% series of formats -%% - {Tag, Bucket, Key, SubKey|null} - Object Keys -%% - {i, Bucket, {IndexName, IndexTerm}, Key} - Postings -%% The {Bucket, Key} part of all types of keys are hashed for segment filters. -%% For Postings the {Bucket, IndexName, IndexTerm} is also hashed. This -%% causes a false positive on lookup of a segment, but allows for the presence -%% of specific index terms to be checked -%% -%% The objects stored are a tuple of {Key, SequenceNumber, State, Value}, where -%% Key - as above -%% SequenceNumber - monotonically increasing counter of addition to the nursery -%% log -%% State - {active|tomb, ExpiryTimestamp | infinity} -%% Value - null (all postings) | [Object Metadata] (all object keys) -%% Keys should be unique in files. If more than two keys are candidate for -%% the same file the highest sequence number should be chosen. If the file -%% is at the basemenet level of a leveleddb database the objects with an -%% ExpiryTimestamp in the past should not be written, but at all other levels -%% keys should not be ignored because of a timestamp in the past. -%% tomb objects are written for deletions, and these tombstones may have an -%% Expirytimestamp which in effect is the time when the tombstone should be -%% reaped. 
-%% -%% sft files are broken into the following sections: -%% - Header (fixed width 80 bytes - containing pointers and metadata) -%% - Blocks (variable length) -%% - Slot Filter (variable length) -%% - Slot Index (variable length) -%% - Table Summary (variable length) -%% Each section should contain at the footer of the section a 4-byte CRC which -%% is to be checked only on the opening of the file -%% -%% The keys in the sft file are placed into the file in erlang term order. -%% There will normally be 256 slots of keys. The Slot Index is a gb_tree -%% acting as a helper to find the right slot to check when searching for a key -%% or range of keys. -%% The Key in the Slot Index is the Key at the start of the Slot. -%% The Value in the Slot Index is a record indicating: -%% - The starting position of the Slot within the Blocks (relative to the -%% starting position of the Blocks) -%% - The (relative) starting position of the Slot Filter for this Slot -%% - The number of blocks within the Slot -%% - The length of each of the Blocks within the Slot -%% -%% When checking for a Key in the sft file, the key should be hashed to the -%% segment, then the key should be looked-up in the Slot Index. The segment -%% ID can then be checked against the Slot Filter which will either return -%% not_present or [BlockIDs] -%% If a list of BlockIDs (normally of length 1) is returned the block should -%% be fetched using the starting position and length of the Block to find the -%% actual key (or not if the Slot Filter had returned a false positive) -%% -%% There will exist a Slot Filter for each entry in the Slot Index -%% The Slot Filter starts with some fixed length metadata -%% - 1 byte stating the expected number of keys in the block -%% - 1 byte stating the number of complete (i.e. 
containing the expected -%% number of keys) Blocks in the Slot -%% - 1 byte stating the number of keys in any incomplete Block (there can -%% only be 1 incomplete Block per Slot and it must be the last block) -%% - 3 bytes stating the largest segment ID in the Slot -%% - 1 byte stating the exponent used in the rice-encoding of the filter -%% The Filter itself is a rice-encoded list of Integers representing the -%% differences between the Segment IDs in the Slot with each entry being -%% appended by the minimal number of bits to represent the Block ID in which -%% an entry for that segment can be found. Where a segment exists more than -%% once then a 0 length will be used. -%% To use the filter code should roll over the filter incrementing the Segment -%% ID by each difference, and counting the keys by Block ID. This should -%% return one of: -%% mismatch - the final Segment Count didn't meet the largest Segment ID or -%% the per-block key counts don't add-up. There could have been a bit-flip, -%% so don't rely on the filter -%% no_match - everything added up but the counter never equalled the queried -%% Segment ID -%% {match, [BlockIDs]} - everything added up and the Segment may be -%% represented in the given blocks -%% -%% The makeup of a block -%% - A block is a list of 32 {Key, Value} pairs in Erlang term order -%% - The block is stored using standard compression in term_to_binary -%% May be improved by use of lz4 or schema-based binary_to_term -%% -%% The Table Summary may contain multiple summaries -%% The standard table summary contains: -%% - a count of keys by bucket and type of key (posting or object key) -%% - the total size of objects referred to by object keys -%% - the number of postings by index name -%% - the number of tombstones within the file -%% - the highest and lowest sequence number in the file -%% Summaries could be used for other summaries of table content in the future, -%% perhaps application-specific bloom filters - -%% The 56-byte 
header is made up of -%% - 1 byte version (major 5 bits, minor 3 bits) - default 0.1 -%% - 1 byte options (currently undefined) -%% - 1 byte Block Size - the expected number of keys in each block -%% - 1 byte Block Count - the expected number of blocks in each slot -%% - 2 byte Slot Count - the maximum number of slots in the file -%% - 6 bytes - spare -%% - 4 bytes - Blocks length -%% - 4 bytes - Slot Index length -%% - 4 bytes - Slot Filter length -%% - 4 bytes - Table summary length -%% - 24 bytes - spare -%% - 4 bytes - CRC32 -%% -%% The file body is written in the same order of events as the header (i.e. -%% Blocks first) -%% -%% Once open the file can be in the following states -%% - writing, the file is still being created -%% - available, the file may be read, but never again must be modified -%% - pending_deletion, the file can be closed and deleted once all outstanding -%% Snapshots have been started beyond a certain sequence number -%% -%% Level managers should only be aware of files in the available state. -%% Iterators may be aware of files in either available or pending_delete. -%% Level maintainers should control the file exclusively when in the writing -%% state, and send the event to trigger pending_delete with the a sequence -%% number equal to or higher than the number at the point it was no longer -%% active at any level. -%% -%% The format of the file is intended to support quick lookups, whilst -%% allowing for a new file to be written incrementally (so that all keys and -%% values need not be retained in memory) - perhaps n blocks at a time - - --module(leveled_sft). - --behaviour(gen_fsm). --include("include/leveled.hrl"). - --export([init/1, - handle_sync_event/4, - handle_event/3, - handle_info/3, - terminate/3, - code_change/4, - starting/2, - starting/3, - reader/3, - delete_pending/3, - delete_pending/2]). 
- --export([sft_new/4, - sft_newfroml0cache/4, - sft_open/1, - sft_get/2, - sft_get/3, - sft_getkvrange/4, - sft_close/1, - sft_clear/1, - sft_checkready/1, - sft_setfordelete/2, - sft_deleteconfirmed/1, - sft_getmaxsequencenumber/1]). - --export([generate_randomkeys/1]). - --include_lib("eunit/include/eunit.hrl"). - - --define(WORD_SIZE, 4). --define(DWORD_SIZE, 8). --define(CURRENT_VERSION, {0,1}). --define(SLOT_COUNT, 256). --define(SLOT_GROUPWRITE_COUNT, 16). --define(BLOCK_SIZE, 32). --define(BLOCK_COUNT, 4). --define(FOOTERPOS_HEADERPOS, 2). --define(MAX_SEG_HASH, 1048576). --define(DIVISOR_BITS, 13). --define(DIVISOR, 8092). --define(COMPRESSION_LEVEL, 1). --define(HEADER_LEN, 56). --define(ITERATOR_SCANWIDTH, 1). --define(MERGE_SCANWIDTH, 32). --define(BLOOM_WIDTH, 48). --define(DELETE_TIMEOUT, 10000). --define(MAX_KEYS, ?SLOT_COUNT * ?BLOCK_COUNT * ?BLOCK_SIZE). --define(DISCARD_EXT, ".discarded"). --define(WRITE_OPS, [binary, raw, read, write, delayed_write]). --define(READ_OPS, [binary, raw, read]). - --record(state, {version = ?CURRENT_VERSION :: tuple(), - slot_index :: list(), - next_position :: integer(), - smallest_sqn :: integer(), - highest_sqn :: integer(), - smallest_key :: string(), - highest_key :: string(), - slots_pointer :: integer(), - index_pointer :: integer(), - filter_pointer :: integer(), - summ_pointer :: integer(), - summ_length :: integer(), - filename = "not set" :: string(), - handle :: file:fd(), - background_complete = false :: boolean(), - oversized_file = false :: boolean(), - penciller :: pid(), - bloom}). - -%% Helper object when writing a file to keep track of various accumulators --record(writer, {slot_index = [] :: list(), - slot_binary = <<>> :: binary(), - bloom = leveled_tinybloom:empty(?BLOOM_WIDTH), - min_sqn = infinity :: integer()|infinity, - max_sqn = 0 :: integer(), - last_key = {last, null}}). 
- -%%%============================================================================ -%%% API -%%%============================================================================ - - -sft_new(Filename, KL1, KL2, LevelInfo) -> - LevelR = case is_integer(LevelInfo) of - true -> - #level{level=LevelInfo}; - _ -> - if - is_record(LevelInfo, level) -> - LevelInfo - end - end, - {ok, Pid} = gen_fsm:start(?MODULE, [], []), - Reply = gen_fsm:sync_send_event(Pid, - {sft_new, Filename, KL1, KL2, LevelR}, - infinity), - {ok, Pid, Reply}. - -sft_newfroml0cache(Filename, Slots, FetchFun, Options) -> - {ok, Pid} = gen_fsm:start(?MODULE, [], []), - case Options#sft_options.wait of - true -> - KL1 = leveled_pmem:to_list(Slots, FetchFun), - Reply = gen_fsm:sync_send_event(Pid, - {sft_new, - Filename, - KL1, - [], - #level{level=0}}, - infinity), - {ok, Pid, Reply}; - false -> - gen_fsm:send_event(Pid, - {sft_newfroml0cache, - Filename, - Slots, - FetchFun, - Options#sft_options.penciller}), - {ok, Pid, noreply} - end. - -sft_open(Filename) -> - {ok, Pid} = gen_fsm:start(?MODULE, [], []), - case gen_fsm:sync_send_event(Pid, {sft_open, Filename}, infinity) of - {ok, {SK, EK}} -> - {ok, Pid, {SK, EK}} - end. - -sft_setfordelete(Pid, Penciller) -> - gen_fsm:sync_send_event(Pid, {set_for_delete, Penciller}, infinity). - -sft_get(Pid, Key, Hash) -> - gen_fsm:sync_send_event(Pid, {get_kv, Key, Hash}, infinity). - -sft_get(Pid, Key) -> - sft_get(Pid, Key, leveled_codec:magic_hash(Key)). - -sft_getkvrange(Pid, StartKey, EndKey, ScanWidth) -> - gen_fsm:sync_send_event(Pid, - {get_kvrange, StartKey, EndKey, ScanWidth}, - infinity). - -sft_clear(Pid) -> - gen_fsm:sync_send_event(Pid, {set_for_delete, false}, infinity), - gen_fsm:sync_send_event(Pid, close, 1000). - -sft_close(Pid) -> - gen_fsm:sync_send_event(Pid, close, 1000). - -sft_deleteconfirmed(Pid) -> - gen_fsm:send_event(Pid, close). - -sft_checkready(Pid) -> - gen_fsm:sync_send_event(Pid, background_complete, 20). 
- -sft_getmaxsequencenumber(Pid) -> - gen_fsm:sync_send_event(Pid, get_maxsqn, infinity). - - - -%%%============================================================================ -%%% gen_server callbacks -%%%============================================================================ - -init([]) -> - {ok, starting, #state{}}. - -starting({sft_new, Filename, KL1, [], _LevelR=#level{level=L}}, _From, _State) - when L == 0 -> - {ok, State} = create_levelzero(KL1, Filename), - {reply, - {{[], []}, State#state.smallest_key, State#state.highest_key}, - reader, - State}; -starting({sft_new, Filename, KL1, KL2, LevelR}, _From, _State) -> - case create_file(Filename) of - {Handle, FileMD} -> - {ReadHandle, UpdFileMD, KeyRemainders} = complete_file(Handle, - FileMD, - KL1, KL2, - LevelR), - {reply, - {KeyRemainders, - UpdFileMD#state.smallest_key, - UpdFileMD#state.highest_key}, - reader, - UpdFileMD#state{handle=ReadHandle, filename=Filename}} - end; -starting({sft_open, Filename}, _From, _State) -> - {_Handle, FileMD} = open_file(#state{filename=Filename}), - leveled_log:log("SFT01", [Filename]), - {reply, - {ok, {FileMD#state.smallest_key, FileMD#state.highest_key}}, - reader, - FileMD}. - -starting({sft_newfroml0cache, Filename, Slots, FetchFun, PCL}, _State) -> - SW = os:timestamp(), - Inp1 = leveled_pmem:to_list(Slots, FetchFun), - {ok, State} = create_levelzero(Inp1, Filename), - leveled_log:log_timer("SFT03", [Filename], SW), - case PCL of - undefined -> - {next_state, reader, State}; - _ -> - leveled_penciller:pcl_confirml0complete(PCL, - State#state.filename, - State#state.smallest_key, - State#state.highest_key), - {next_state, reader, State} - end. 
- - -reader({get_kv, Key, Hash}, _From, State) -> - Reply = - case leveled_tinybloom:check({hash, Hash}, State#state.bloom) of - false -> - not_present; - true -> - fetch_keyvalue(State#state.handle, State, Key) - end, - {reply, Reply, reader, State}; -reader({get_kvrange, StartKey, EndKey, ScanWidth}, _From, State) -> - Reply = pointer_append_queryresults(fetch_range_kv(State#state.handle, - State, - StartKey, - EndKey, - ScanWidth), - self()), - {reply, Reply, reader, State}; -reader(get_maxsqn, _From, State) -> - {reply, State#state.highest_sqn, reader, State}; -reader({set_for_delete, Penciller}, _From, State) -> - leveled_log:log("SFT02", [State#state.filename]), - {reply, - ok, - delete_pending, - State#state{penciller=Penciller}, - ?DELETE_TIMEOUT}; -reader(background_complete, _From, State) -> - if - State#state.background_complete == true -> - {reply, - {ok, - State#state.filename, - State#state.smallest_key, - State#state.highest_key}, - reader, - State} - end; -reader(close, _From, State) -> - ok = file:close(State#state.handle), - {stop, normal, ok, State}. - -delete_pending({get_kv, Key, Hash}, _From, State) -> - Reply = - case leveled_tinybloom:check({hash, Hash}, State#state.bloom) of - false -> - not_present; - true -> - fetch_keyvalue(State#state.handle, State, Key) - end, - {reply, Reply, delete_pending, State, ?DELETE_TIMEOUT}; -delete_pending({get_kvrange, StartKey, EndKey, ScanWidth}, _From, State) -> - Reply = pointer_append_queryresults(fetch_range_kv(State#state.handle, - State, - StartKey, - EndKey, - ScanWidth), - self()), - {reply, Reply, delete_pending, State, ?DELETE_TIMEOUT}; -delete_pending(close, _From, State) -> - leveled_log:log("SFT06", [State#state.filename]), - ok = file:close(State#state.handle), - ok = file:delete(State#state.filename), - {stop, normal, ok, State}. 
- -delete_pending(timeout, State) -> - leveled_log:log("SFT05", [timeout, State#state.filename]), - ok = leveled_penciller:pcl_confirmdelete(State#state.penciller, - State#state.filename), - {next_state, delete_pending, State, ?DELETE_TIMEOUT}; -delete_pending(close, State) -> - leveled_log:log("SFT06", [State#state.filename]), - ok = file:close(State#state.handle), - ok = file:delete(State#state.filename), - {stop, normal, State}. - -handle_sync_event(_Msg, _From, StateName, State) -> - {reply, undefined, StateName, State}. - -handle_event(_Msg, StateName, State) -> - {next_state, StateName, State}. - -handle_info(_Msg, StateName, State) -> - {next_state, StateName, State}. - -terminate(Reason, _StateName, State) -> - leveled_log:log("SFT05", [Reason, State#state.filename]). - -code_change(_OldVsn, StateName, State, _Extra) -> - {ok, StateName, State}. - - - -%%%============================================================================ -%%% Internal functions -%%%============================================================================ - - -create_levelzero(ListForFile, Filename) -> - {TmpFilename, PrmFilename} = generate_filenames(Filename), - {Handle, FileMD} = create_file(TmpFilename), - InputSize = length(ListForFile), - leveled_log:log("SFT07", [InputSize]), - Rename = {true, TmpFilename, PrmFilename}, - {ReadHandle, - UpdFileMD, - {[], []}} = complete_file(Handle, FileMD, - ListForFile, [], - #level{level=0}, Rename), - {ok, - UpdFileMD#state{handle=ReadHandle, - filename=PrmFilename, - background_complete=true, - oversized_file=InputSize>?MAX_KEYS}}. 
- - -generate_filenames(RootFilename) -> - Ext = filename:extension(RootFilename), - Components = filename:split(RootFilename), - case Ext of - [] -> - {filename:join(Components) ++ ".pnd", - filename:join(Components) ++ ".sft"}; - Ext -> - %% This seems unnecessarily hard - DN = filename:dirname(RootFilename), - FP = lists:last(Components), - FP_NOEXT = lists:sublist(FP, 1, 1 + length(FP) - length(Ext)), - {DN ++ "/" ++ FP_NOEXT ++ "pnd", DN ++ "/" ++ FP_NOEXT ++ "sft"} - end. - - -%% Start a bare file with an initial header and no further details -%% Return the {Handle, metadata record} -create_file(FileName) when is_list(FileName) -> - leveled_log:log("SFT01", [FileName]), - ok = filelib:ensure_dir(FileName), - {ok, Handle} = file:open(FileName, ?WRITE_OPS), - Header = create_header(initial), - {ok, _} = file:position(Handle, bof), - ok = file:write(Handle, Header), - {ok, StartPos} = file:position(Handle, cur), - FileMD = #state{next_position=StartPos, filename=FileName}, - {Handle, FileMD}. - - -create_header(initial) -> - {Major, Minor} = ?CURRENT_VERSION, - Version = <>, - %% Not thought of any options - options are ignored - Options = <<0:8>>, - %% Settings are currently ignored - {BlSize, BlCount, SlCount} = {?BLOCK_COUNT, ?BLOCK_SIZE, ?SLOT_COUNT}, - Settings = <>, - {SpareO, SpareL} = {<<0:48>>, <<0:192>>}, - Lengths = <<0:32, 0:32, 0:32, 0:32>>, - H1 = <>, - CRC32 = erlang:crc32(H1), - <

>. - -%% Open a file returning a handle and metadata which can be used in fetch and -%% iterator requests -%% The handle should be read-only as these are immutable files, a file cannot -%% be opened for writing keys, it can only be created to write keys - -open_file(FileMD) -> - Filename = FileMD#state.filename, - {ok, Handle} = file:open(Filename, [binary, raw, read]), - {ok, HeaderLengths} = file:pread(Handle, 12, 16), - <> = HeaderLengths, - {ok, <>} = - file:pread(Handle, ?HEADER_LEN + Blen + Ilen + Flen, Slen), - {{LowSQN, HighSQN}, {LowKey, HighKey}, Bloom} = - case erlang:crc32(SummaryBin) of - SummaryCRC -> - binary_to_term(SummaryBin) - end, - {ok, SlotIndexBin} = file:pread(Handle, ?HEADER_LEN + Blen, Ilen), - SlotIndex = binary_to_term(SlotIndexBin), - {Handle, FileMD#state{slot_index=SlotIndex, - smallest_sqn=LowSQN, - highest_sqn=HighSQN, - smallest_key=LowKey, - highest_key=HighKey, - slots_pointer=?HEADER_LEN, - index_pointer=?HEADER_LEN + Blen, - filter_pointer=?HEADER_LEN + Blen + Ilen, - summ_pointer=?HEADER_LEN + Blen + Ilen + Flen, - summ_length=Slen, - handle=Handle, - bloom=Bloom}}. - -%% Take a file handle with a previously created header and complete it based on -%% the two key lists KL1 and KL2 -complete_file(Handle, FileMD, KL1, KL2, LevelR) -> - complete_file(Handle, FileMD, KL1, KL2, LevelR, false). - -complete_file(Handle, FileMD, KL1, KL2, LevelR, Rename) -> - {ok, KeyRemainders} = write_keys(Handle, - maybe_expand_pointer(KL1), - maybe_expand_pointer(KL2), - LevelR, - fun sftwrite_function/2, - #writer{}), - {ReadHandle, UpdFileMD} = case Rename of - false -> - open_file(FileMD); - {true, OldName, NewName} -> - ok = rename_file(OldName, NewName), - open_file(FileMD#state{filename=NewName}) - end, - {ReadHandle, UpdFileMD, KeyRemainders}. 
- -rename_file(OldName, NewName) -> - leveled_log:log("SFT08", [OldName, NewName]), - case filelib:is_file(NewName) of - true -> - leveled_log:log("SFT09", [NewName]), - AltName = filename:join(filename:dirname(NewName), - filename:basename(NewName)) - ++ ?DISCARD_EXT, - leveled_log:log("SFT10", [NewName, AltName]), - ok = file:rename(NewName, AltName); - false -> - ok - end, - file:rename(OldName, NewName). - - -%% Fetch a Key and Value from a file, returns -%% {value, KV} or not_present -%% The key must be pre-checked to ensure it is in the valid range for the file -%% A key out of range may fail - -fetch_keyvalue(Handle, FileMD, Key) -> - case get_nearestkey(FileMD#state.slot_index, Key) of - not_found -> - not_present; - {_NearestKey, {FilterLen, PointerF}, {LengthList, PointerB}} -> - FilterPointer = PointerF + FileMD#state.filter_pointer, - {ok, SegFilter} = file:pread(Handle, - FilterPointer, - FilterLen), - SegID = hash_for_segmentid({keyonly, Key}), - case check_for_segments(SegFilter, [SegID], true) of - {maybe_present, BlockList} -> - BlockPointer = PointerB + FileMD#state.slots_pointer, - fetch_keyvalue_fromblock(BlockList, - Key, - LengthList, - Handle, - BlockPointer); - not_present -> - not_present; - error_so_maybe_present -> - BlockPointer = PointerB + FileMD#state.slots_pointer, - fetch_keyvalue_fromblock(lists:seq(0,length(LengthList)), - Key, - LengthList, - Handle, - BlockPointer) - end - end. - -%% Fetches a range of keys returning a list of {Key, SeqN} tuples -fetch_range_keysonly(Handle, FileMD, StartKey, EndKey) -> - fetch_range(Handle, FileMD, StartKey, EndKey, fun acc_list_keysonly/2). - -fetch_range_keysonly(Handle, FileMD, StartKey, EndKey, ScanWidth) -> - fetch_range(Handle, FileMD, StartKey, EndKey, fun acc_list_keysonly/2, - ScanWidth). 
- -%% Fetches a range of keys returning the full tuple, including value -fetch_range_kv(Handle, FileMD, StartKey, EndKey, ScanWidth) -> - fetch_range(Handle, FileMD, StartKey, EndKey, fun acc_list_kv/2, - ScanWidth). - -acc_list_keysonly(null, empty) -> - []; -acc_list_keysonly(null, RList) -> - RList; -acc_list_keysonly(R, RList) when is_list(R) -> - lists:foldl(fun acc_list_keysonly/2, RList, R); -acc_list_keysonly(R, RList) -> - lists:append(RList, [leveled_codec:strip_to_keyseqstatusonly(R)]). - -acc_list_kv(null, empty) -> - []; -acc_list_kv(null, RList) -> - RList; -acc_list_kv(R, RList) when is_list(R) -> - RList ++ R; -acc_list_kv(R, RList) -> - lists:append(RList, [R]). - -%% Iterate keys, returning a batch of keys & values in a range -%% - the iterator can have a ScanWidth which is how many slots should be -%% scanned by the iterator before returning a result -%% - batches can be ended with a pointer to indicate there are potentially -%% further values in the range -%% - a list of functions can be provided, which should either return true -%% or false, and these can be used to filter the results from the query, -%% for example to ignore keys above a certain sequence number, to ignore -%% keys not matching a certain regular expression, or to ignore keys not -%% a member of a particular partition -%% - An Accumulator and an Accumulator function can be passed. The function -%% needs to handle being passed (KV, Acc) to add the current result to the -%% Accumulator. The functional should handle KV=null, Acc=empty to initiate -%% the accumulator, and KV=null to leave the Accumulator unchanged. -%% Flexibility with accumulators is such that keys-only can be returned rather -%% than keys and values, or other entirely different accumulators can be -%% used - e.g. counters, hash-lists to build bloom filters etc - -fetch_range(Handle, FileMD, StartKey, EndKey, AccFun) -> - fetch_range(Handle, FileMD, StartKey, EndKey, AccFun, ?ITERATOR_SCANWIDTH). 
- -fetch_range(Handle, FileMD, StartKey, EndKey, AccFun, ScanWidth) -> - fetch_range(Handle, FileMD, StartKey, EndKey, AccFun, ScanWidth, empty). - -fetch_range(_Handle, _FileMD, StartKey, _EndKey, _AccFun, 0, Acc) -> - {partial, Acc, StartKey}; -fetch_range(Handle, FileMD, StartKey, EndKey, AccFun, ScanWidth, Acc) -> - %% get_nearestkey gets the last key in the index <= StartKey, or the next - %% key along if {next, StartKey} is passed - case get_nearestkey(FileMD#state.slot_index, StartKey) of - {NearestKey, _Filter, {LengthList, PointerB}} -> - fetch_range(Handle, FileMD, StartKey, NearestKey, EndKey, - AccFun, ScanWidth, - LengthList, - 0, - PointerB + FileMD#state.slots_pointer, - AccFun(null, Acc)); - not_found -> - {complete, AccFun(null, Acc)} - end. - -fetch_range(Handle, FileMD, _StartKey, NearestKey, EndKey, - AccFun, ScanWidth, - LengthList, - BlockNumber, - _Pointer, - Acc) - when length(LengthList) == BlockNumber -> - %% Reached the end of the slot. Move the start key on one to scan a new slot - fetch_range(Handle, FileMD, {next, NearestKey}, EndKey, - AccFun, ScanWidth - 1, - Acc); -fetch_range(Handle, FileMD, StartKey, NearestKey, EndKey, - AccFun, ScanWidth, - LengthList, - BlockNumber, - Pointer, - Acc) -> - Block = fetch_block(Handle, LengthList, BlockNumber, Pointer), - Results = - case maybe_scan_entire_block(Block, StartKey, EndKey) of - true -> - {partial, AccFun(Block, Acc), StartKey}; - false -> - scan_block(Block, StartKey, EndKey, AccFun, Acc) - end, - case Results of - {partial, Acc1, StartKey} -> - %% Move on to the next block - fetch_range(Handle, FileMD, StartKey, NearestKey, EndKey, - AccFun, ScanWidth, - LengthList, - BlockNumber + 1, - Pointer, - Acc1); - {complete, Acc1} -> - {complete, Acc1} - end. 
- -scan_block([], StartKey, _EndKey, _AccFun, Acc) -> - {partial, Acc, StartKey}; -scan_block([HeadKV|T], StartKey, EndKey, AccFun, Acc) -> - K = leveled_codec:strip_to_keyonly(HeadKV), - case {StartKey > K, leveled_codec:endkey_passed(EndKey, K)} of - {true, _} when StartKey /= all -> - scan_block(T, StartKey, EndKey, AccFun, Acc); - {_, true} when EndKey /= all -> - {complete, Acc}; - _ -> - scan_block(T, StartKey, EndKey, AccFun, AccFun(HeadKV, Acc)) - end. - - -maybe_scan_entire_block([], _, _) -> - true; -maybe_scan_entire_block(_Block, all, all) -> - true; -maybe_scan_entire_block(Block, StartKey, all) -> - [FirstKey|_Tail] = Block, - leveled_codec:strip_to_keyonly(FirstKey) > StartKey; -maybe_scan_entire_block(Block, StartKey, EndKey) -> - [FirstKey|_Tail] = Block, - LastKey = leveled_codec:strip_to_keyonly(lists:last(Block)), - FromStart = leveled_codec:strip_to_keyonly(FirstKey) > StartKey, - ToEnd = leveled_codec:endkey_passed(EndKey, LastKey), - case {FromStart, ToEnd} of - {true, false} -> - true; - _ -> - false - end. - -fetch_keyvalue_fromblock([], _Key, _LengthList, _Handle, _StartOfSlot) -> - not_present; -fetch_keyvalue_fromblock([BlockNmb|T], Key, LengthList, Handle, StartOfSlot) -> - BlockToCheck = fetch_block(Handle, LengthList, BlockNmb, StartOfSlot), - Result = lists:keyfind(Key, 1, BlockToCheck), - case Result of - false -> - fetch_keyvalue_fromblock(T, Key, LengthList, Handle, StartOfSlot); - KV -> - KV - end. - -fetch_block(Handle, LengthList, BlockNmb, StartOfSlot) -> - Start = lists:sum(lists:sublist(LengthList, BlockNmb)), - Length = lists:nth(BlockNmb + 1, LengthList), - {ok, BlockToCheckBin} = file:pread(Handle, Start + StartOfSlot, Length), - binary_to_term(BlockToCheckBin). - -%% Need to deal with either Key or {next, Key} -get_nearestkey([H|_Tail], all) -> - H; -get_nearestkey(KVList, Key) -> - case Key of - {next, K} -> - get_nextkeyaftermatch(KVList, K, not_found); - _ -> - get_firstkeytomatch(KVList, Key, not_found) - end. 
- -get_firstkeytomatch([], _KeyToFind, PrevV) -> - PrevV; -get_firstkeytomatch([{K, FilterInfo, SlotInfo}|_T], KeyToFind, PrevV) - when K > KeyToFind -> - case PrevV of - not_found -> - {K, FilterInfo, SlotInfo}; - _ -> - PrevV - end; -get_firstkeytomatch([{K, FilterInfo, SlotInfo}|T], KeyToFind, _PrevV) -> - get_firstkeytomatch(T, KeyToFind, {K, FilterInfo, SlotInfo}). - -get_nextkeyaftermatch([], _KeyToFind, _PrevV) -> - not_found; -get_nextkeyaftermatch([{K, FilterInfo, SlotInfo}|T], KeyToFind, PrevV) - when K >= KeyToFind -> - case PrevV of - not_found -> - get_nextkeyaftermatch(T, KeyToFind, next); - next -> - {K, FilterInfo, SlotInfo} - end; -get_nextkeyaftermatch([_KTuple|T], KeyToFind, PrevV) -> - get_nextkeyaftermatch(T, KeyToFind, PrevV). - - -%% Take a file handle at the sart position (after creating the header) and then -%% write the Key lists to the file slot by slot. -%% -%% Slots are created then written in bulk to impove I/O efficiency. Slots will -%% be written in groups - -write_keys(Handle, KL1, KL2, LevelR, WriteFun, WriteState) -> - write_keys(Handle, KL1, KL2, LevelR, WriteFun, WriteState, {0, 0, []}). 
- -write_keys(Handle, KL1, KL2, LevelR, WriteFun, WState, - {SlotC, SlotT, SlotLists}) - when SlotC =:= ?SLOT_GROUPWRITE_COUNT -> - WState0 = lists:foldl(fun finalise_slot/2, WState, SlotLists), - Handle0 = WriteFun(slots, {Handle, WState0#writer.slot_binary}), - case maxslots_bylevel(SlotT, LevelR#level.level) of - reached -> - {complete_keywrite(Handle0, WState0, WriteFun), {KL1, KL2}}; - continue -> - write_keys(Handle0, KL1, KL2, LevelR, WriteFun, - WState0#writer{slot_binary = <<>>}, {0, SlotT, []}) - end; -write_keys(Handle, KL1, KL2, LevelR, WriteFun, WState, - {SlotC, SlotT, SlotLists}) -> - {Status, BlockKeyLists} = create_slot(KL1, KL2, LevelR), - case Status of - S when S == complete; S == partial -> - WState0 = - case BlockKeyLists of - [[]] -> - WState; - _ -> - lists:foldl(fun finalise_slot/2, - WState, - SlotLists ++ [BlockKeyLists]) - end, - Handle0 = WriteFun(slots, {Handle, WState0#writer.slot_binary}), - {complete_keywrite(Handle0, WState0, WriteFun), {[], []}}; - {full, KL1Rem, KL2Rem} -> - write_keys(Handle, KL1Rem, KL2Rem, LevelR, WriteFun, WState, - {SlotC + 1, SlotT + 1, SlotLists ++ [BlockKeyLists]}) - end. - - -complete_keywrite(Handle, WriteState, WriteFun) -> - FirstKey = - case length(WriteState#writer.slot_index) of - 0 -> - null; - _ -> - element(1, lists:nth(1, WriteState#writer.slot_index)) - end, - ConvSlotIndex = convert_slotindex(WriteState#writer.slot_index), - WriteFun(finalise, {Handle, - ConvSlotIndex, - {{WriteState#writer.min_sqn, WriteState#writer.max_sqn}, - {FirstKey, WriteState#writer.last_key}, - WriteState#writer.bloom}}). 
- -%% Take a slot index, and remove the SegFilters replacing with pointers -%% Return a tuple of the accumulated slot filters, and a pointer-based -%% slot-index - -convert_slotindex(SlotIndex) -> - SlotFun = fun({LowKey, SegFilter, LengthList}, - {FilterAcc, SlotIndexAcc, PointerF, PointerB}) -> - FilterOut = serialise_segment_filter(SegFilter), - FilterLen = byte_size(FilterOut), - {<>, - lists:append(SlotIndexAcc, [{LowKey, - {FilterLen, PointerF}, - {LengthList, PointerB}}]), - PointerF + FilterLen, - PointerB + lists:sum(LengthList)} end, - {SlotFilters, PointerIndex, _FLength, _BLength} = - lists:foldl(SlotFun, {<<>>, [], 0, 0}, SlotIndex), - {SlotFilters, PointerIndex}. - -sftwrite_function(slots, {Handle, SerialisedSlots}) -> - ok = file:write(Handle, SerialisedSlots), - Handle; -sftwrite_function(finalise, - {Handle, - {SlotFilters, PointerIndex}, - {SNExtremes, KeyExtremes, Bloom}}) -> - {ok, Position} = file:position(Handle, cur), - - BlocksLength = Position - ?HEADER_LEN, - Index = term_to_binary(PointerIndex), - IndexLength = byte_size(Index), - FilterLength = byte_size(SlotFilters), - Summary = term_to_binary({SNExtremes, KeyExtremes, Bloom}), - SummaryCRC = erlang:crc32(Summary), - SummaryLength = byte_size(Summary) + 4, - %% Write Index, Filter and Summary - ok = file:write(Handle, <>), - %% Write Lengths into header - ok = file:pwrite(Handle, 12, <>), - {ok, _Position} = file:position(Handle, bof), - ok = file:advise(Handle, - BlocksLength + IndexLength, - FilterLength, - will_need), - file:close(Handle). - -%% Level 0 files are of variable (infinite) size to avoid issues with having -%% any remainders when flushing from memory -maxslots_bylevel(_SlotTotal, 0) -> - continue; -maxslots_bylevel(SlotTotal, _Level) -> - case SlotTotal of - ?SLOT_COUNT -> - reached; - X when X < ?SLOT_COUNT -> - continue - end. - - - -%% Take two potentially overlapping lists of keys and produce a block size -%% list of keys in the correct order. 
Outputs: -%% - Status of -%% - - all_complete (no more keys and block is complete) -%% - - partial (no more keys and block is not complete) -%% - - {block_full, Rem1, Rem2} the block is complete but there is a remainder -%% of keys - -create_block(KeyList1, KeyList2, LevelR) -> - create_block(KeyList1, KeyList2, LevelR, []). - - -create_block([], [], _LevelR, BlockKeyList) - when length(BlockKeyList)==?BLOCK_SIZE -> - {all_complete, lists:reverse(BlockKeyList)}; -create_block([], [], _LevelR, BlockKeyList) -> - {partial, lists:reverse(BlockKeyList)}; -create_block(KeyList1, KeyList2, _LevelR, BlockKeyList) - when length(BlockKeyList)==?BLOCK_SIZE -> - {{block_full, KeyList1, KeyList2}, lists:reverse(BlockKeyList)}; -create_block(KeyList1, KeyList2, LevelR, BlockKeyList) -> - case key_dominates(KeyList1, KeyList2, - {LevelR#level.is_basement, LevelR#level.timestamp}) of - {{next_key, TopKey}, Rem1, Rem2} -> - create_block(Rem1, Rem2, LevelR, [TopKey|BlockKeyList]); - {skipped_key, Rem1, Rem2} -> - create_block(Rem1, Rem2, LevelR, BlockKeyList) - end. - -%% create_slot should simply output a list of BlockKeyLists no bigger than -%% the BlockCount, the the status (with key remianders if not complete) - -create_slot(KL1, KL2, LevelR) -> - create_slot(KL1, KL2, LevelR, ?BLOCK_COUNT, []). - -create_slot(KL1, KL2, LevelR, BlockCount, BlockKeyLists) -> - {Status, KeyList} = create_block(KL1, KL2, LevelR), - case {Status, BlockCount - 1} of - {partial, _N} -> - {partial, BlockKeyLists ++ [KeyList]}; - {all_complete, 0} -> - {complete, BlockKeyLists ++ [KeyList]}; - {all_complete, _N} -> - % From the perspective of the slot it is partially complete - {partial, BlockKeyLists ++ [KeyList]}; - {{block_full, KL1Rem, KL2Rem}, 0} -> - {{full, KL1Rem, KL2Rem}, BlockKeyLists ++ [KeyList]}; - {{block_full, KL1Rem, KL2Rem}, N} -> - create_slot(KL1Rem, KL2Rem, LevelR, N, BlockKeyLists ++ [KeyList]) - end. 
- - - -%% Fold over the List of BlockKeys updating the writer record -finalise_slot(BlockKeyLists, WriteState) -> - BlockFolder = - fun(KV, {AccMinSQN, AccMaxSQN, Bloom, SegmentIDList}) -> - {SQN, Hash} = leveled_codec:strip_to_seqnhashonly(KV), - {min(AccMinSQN, SQN), - max(AccMaxSQN, SQN), - leveled_tinybloom:enter({hash, Hash}, Bloom), - [hash_for_segmentid(KV)|SegmentIDList]} - end, - SlotFolder = - fun(BlockKeyList, - {MinSQN, MaxSQN, Bloom, SegLists, KVBinary, Lengths}) -> - {BlockMinSQN, BlockMaxSQN, UpdBloom, Segs} = - lists:foldr(BlockFolder, - {infinity, 0, Bloom, []}, - BlockKeyList), - SerialisedBlock = serialise_block(BlockKeyList), - {min(MinSQN, BlockMinSQN), - max(MaxSQN, BlockMaxSQN), - UpdBloom, - SegLists ++ [Segs], - <>, - Lengths ++ [byte_size(SerialisedBlock)]} - end, - - {SlotMinSQN, - SlotMaxSQN, - SlotUpdBloom, - SlotSegLists, - SlotBinary, - BlockLengths} = - lists:foldl(SlotFolder, - {WriteState#writer.min_sqn, - WriteState#writer.max_sqn, - WriteState#writer.bloom, - [], - WriteState#writer.slot_binary, - []}, - BlockKeyLists), - - FirstSlotKey = leveled_codec:strip_to_keyonly(lists:nth(1, - lists:nth(1, - BlockKeyLists))), - LastSlotKV = lists:last(lists:last(BlockKeyLists)), - SegFilter = generate_segment_filter(SlotSegLists), - UpdSlotIndex = lists:append(WriteState#writer.slot_index, - [{FirstSlotKey, SegFilter, BlockLengths}]), - - #writer{slot_index = UpdSlotIndex, - slot_binary = SlotBinary, - bloom = SlotUpdBloom, - min_sqn = SlotMinSQN, - max_sqn = SlotMaxSQN, - last_key = leveled_codec:strip_to_keyonly(LastSlotKV)}. - - -serialise_block(BlockKeyList) -> - term_to_binary(BlockKeyList, [{compressed, ?COMPRESSION_LEVEL}]). - - -%% Compare the keys at the head of the list, and either skip that "best" key or -%% identify as the next key. 
-%% -%% The logic needs to change if the file is in the basement level, as keys with -%% expired timestamps need not be written at this level -%% -%% The best key is considered to be the lowest key in erlang term order. If -%% there are matching keys then the highest sequence number must be chosen and -%% any lower sequence numbers should be compacted out of existence - - -key_dominates(KL1, KL2, Level) -> - key_dominates_expanded(maybe_expand_pointer(KL1), - maybe_expand_pointer(KL2), - Level). - -key_dominates_expanded([H1|T1], [], Level) -> - case leveled_codec:maybe_reap_expiredkey(H1, Level) of - true -> - {skipped_key, maybe_expand_pointer(T1), []}; - false -> - {{next_key, H1}, maybe_expand_pointer(T1), []} - end; -key_dominates_expanded([], [H2|T2], Level) -> - case leveled_codec:maybe_reap_expiredkey(H2, Level) of - true -> - {skipped_key, [], maybe_expand_pointer(T2)}; - false -> - {{next_key, H2}, [], maybe_expand_pointer(T2)} - end; -key_dominates_expanded([H1|T1], [H2|T2], Level) -> - case leveled_codec:key_dominates(H1, H2) of - left_hand_first -> - case leveled_codec:maybe_reap_expiredkey(H1, Level) of - true -> - {skipped_key, maybe_expand_pointer(T1), [H2|T2]}; - false -> - {{next_key, H1}, maybe_expand_pointer(T1), [H2|T2]} - end; - right_hand_first -> - case leveled_codec:maybe_reap_expiredkey(H2, Level) of - true -> - {skipped_key, [H1|T1], maybe_expand_pointer(T2)}; - false -> - {{next_key, H2}, [H1|T1], maybe_expand_pointer(T2)} - end; - left_hand_dominant -> - {skipped_key, [H1|T1], maybe_expand_pointer(T2)}; - right_hand_dominant -> - {skipped_key, maybe_expand_pointer(T1), [H2|T2]} - end. 
- - -%% When a list is provided it may include a pointer to gain another batch of -%% entries from the same file, or a new batch of entries from another file -%% -%% This resultant list should include the Tail of any pointers added at the -%% end of the list - -maybe_expand_pointer([]) -> - []; -maybe_expand_pointer([H|Tail]) -> - case H of - {next, SFTPid, StartKey} -> - %% io:format("Scanning further on PID ~w ~w~n", [SFTPid, StartKey]), - SW = os:timestamp(), - Acc = sft_getkvrange(SFTPid, StartKey, all, ?MERGE_SCANWIDTH), - leveled_log:log_timer("SFT14", [SFTPid], SW), - lists:append(Acc, Tail); - _ -> - [H|Tail] - end. - - -pointer_append_queryresults(Results, QueryPid) -> - case Results of - {complete, Acc} -> - Acc; - {partial, Acc, StartKey} -> - lists:append(Acc, [{next, QueryPid, StartKey}]) - end. - - -%% The Segment filter is a compressed filter representing the keys in a -%% given slot. The filter is delta-compressed list of integers using rice -%% encoding extended by the reference to each integer having an extra two bits -%% to indicate the block - there are four blocks in each slot. -%% -%% So each delta is represented as -%% - variable length exponent ending in 0, -%% with 0 representing the exponent of 0, -%% 10 -> 2 ^ 13, -%% 110 -> 2^14, -%% 1110 -> 2^15 etc -%% - 13-bit fixed length remainder -%% - 2-bit block number -%% This gives about 2-bytes per key, with a 1:8000 (approx) false positive -%% ratio (when checking the key by hashing to the segment ID) -%% -%% Before the delta list are three 20-bit integers representing the highest -%% integer in each block. Plus two bytes to indicate how many hashes -%% there are in the slot -%% -%% To check for the presence of a segment in a slot, roll over the deltas -%% keeping a running total overall and the current highest segment ID seen -%% per block. 
Roll all the way through even if matches are found or passed -%% over to confirm that the totals match the expected value (hence creating -%% a natural checksum) -%% -%% The end-result is a 260-byte check for the presence of a key in a slot -%% returning the block in which the segment can be found, which may also be -%% used directly for checking for the presence of segments. -%% -%% This is more space efficient than the equivalent bloom filter and avoids -%% the calculation of many hash functions. - -generate_segment_filter([SegL1]) -> - generate_segment_filter({SegL1, [], [], []}); -generate_segment_filter([SegL1, SegL2]) -> - generate_segment_filter({SegL1, SegL2, [], []}); -generate_segment_filter([SegL1, SegL2, SegL3]) -> - generate_segment_filter({SegL1, SegL2, SegL3, []}); -generate_segment_filter([SegL1, SegL2, SegL3, SegL4]) -> - generate_segment_filter({SegL1, SegL2, SegL3, SegL4}); -generate_segment_filter(SegLists) -> - generate_segment_filter(merge_seglists(SegLists), - [], - [{0, 0}, {0, 1}, {0, 2}, {0, 3}]). - -%% to generate the segment filter needs a sorted list of {Delta, Block} pairs -%% as DeltaList and a list of {TopHash, Block} pairs as TopHashes - -generate_segment_filter([], DeltaList, TopHashes) -> - {lists:reverse(DeltaList), TopHashes}; -generate_segment_filter([NextSeg|SegTail], DeltaList, TopHashes) -> - {TopHash, _} = lists:max(TopHashes), - {NextSegHash, NextSegBlock} = NextSeg, - DeltaList2 = [{NextSegHash - TopHash, NextSegBlock}|DeltaList], - TopHashes2 = lists:keyreplace(NextSegBlock, 2, TopHashes, - {NextSegHash, NextSegBlock}), - generate_segment_filter(SegTail, DeltaList2, TopHashes2). 
- - -serialise_segment_filter({DeltaList, TopHashes}) -> - TopHashesBin = lists:foldl(fun({X, _}, Acc) -> - <> end, - <<>>, TopHashes), - Length = length(DeltaList), - HeaderBin = <>, - {Divisor, Factor} = {?DIVISOR, ?DIVISOR_BITS}, - F = fun({Delta, Block}, Acc) -> - Exponent = buildexponent(Delta div Divisor), - Remainder = Delta rem Divisor, - Block2Bit = Block, - <> end, - pad_binary(lists:foldl(F, HeaderBin, DeltaList)). - - -pad_binary(BitString) -> - Pad = 8 - bit_size(BitString) rem 8, - case Pad of - 8 -> BitString; - _ -> <> - end. - -buildexponent(Exponent) -> - buildexponent(Exponent, <<0:1>>). - -buildexponent(0, OutputBits) -> - OutputBits; -buildexponent(Exponent, OutputBits) -> - buildexponent(Exponent - 1, <<1:1, OutputBits/bitstring>>). - -merge_seglists({SegList1, SegList2, SegList3, SegList4}) -> - Stage1 = lists:foldl(fun(X, Acc) -> [{X, 0}|Acc] end, [], SegList1), - Stage2 = lists:foldl(fun(X, Acc) -> [{X, 1}|Acc] end, Stage1, SegList2), - Stage3 = lists:foldl(fun(X, Acc) -> [{X, 2}|Acc] end, Stage2, SegList3), - Stage4 = lists:foldl(fun(X, Acc) -> [{X, 3}|Acc] end, Stage3, SegList4), - lists:sort(Stage4). - -hash_for_segmentid(KV) -> - erlang:phash2(leveled_codec:strip_to_keyonly(KV), ?MAX_SEG_HASH). 
- - -%% Check for a given list of segments in the filter, returning in normal -%% operations a TupleList of {SegmentID, [ListOFBlocks]} where the ListOfBlocks -%% are the block IDs which contain keys in that given segment -%% -%% If there is a failure - perhaps due to a bit flip of some sort an error -%% willl be returned (error_so_maybe_present) and all blocks should be checked -%% as the filter cannot be relied upon - -check_for_segments(SegFilter, SegmentList, CRCCheck) -> - case CRCCheck of - true -> - <> = SegFilter, - CheckSum = [T0, T1, T2, T3], - case safecheck_for_segments(SegRem, SegmentList, - [0, 0, 0, 0], - 0, Count, []) of - {error_so_maybe_present, Reason} -> - leveled_log:log("SFT11", [Reason]), - error_so_maybe_present; - {OutputCheck, BlockList} when OutputCheck == CheckSum, - BlockList == [] -> - not_present; - {OutputCheck, BlockList} when OutputCheck == CheckSum -> - {maybe_present, BlockList}; - {OutputCheck, _} -> - leveled_log:log("SFT12", [OutputCheck, CheckSum]), - error_so_maybe_present - end; - false -> - <<_:80/bitstring, Count:16/integer, SegRem/bitstring>> = SegFilter, - case quickcheck_for_segments(SegRem, SegmentList, - lists:max(SegmentList), - 0, Count, []) of - {error_so_maybe_present, Reason} -> - leveled_log:log("SFT13", [Reason]), - error_so_maybe_present; - BlockList when BlockList == [] -> - not_present; - BlockList -> - {maybe_present, BlockList} - end - end. 
- - -safecheck_for_segments(_, _, TopHashes, _, 0, BlockList) -> - {TopHashes, BlockList}; -safecheck_for_segments(Filter, SegmentList, TopHs, Acc, Count, BlockList) -> - case findexponent(Filter) of - {ok, Exp, FilterRem1} -> - case findremainder(FilterRem1, ?DIVISOR_BITS) of - {ok, Remainder, BlockID, FilterRem2} -> - {NextHash, BlockList2} = checkhash_forsegments(Acc, - Exp, - Remainder, - SegmentList, - BlockList, - BlockID), - TopHashes2 = setnth(BlockID, TopHs, NextHash), - safecheck_for_segments(FilterRem2, SegmentList, - TopHashes2, - NextHash, Count - 1, - BlockList2); - error -> - {error_so_maybe_present, "Remainder Check"} - end; - error -> - {error_so_maybe_present, "Exponent Check"} - end. - -quickcheck_for_segments(_, _, _, _, 0, BlockList) -> - BlockList; -quickcheck_for_segments(Filter, SegmentList, MaxSeg, Acc, Count, BlockList) -> - case findexponent(Filter) of - {ok, Exp, FilterRem1} -> - case findremainder(FilterRem1, ?DIVISOR_BITS) of - {ok, Remainder, BlockID, FilterRem2} -> - {NextHash, BlockList2} = checkhash_forsegments(Acc, - Exp, - Remainder, - SegmentList, - BlockList, - BlockID), - case NextHash > MaxSeg of - true -> - BlockList2; - false -> - quickcheck_for_segments(FilterRem2, SegmentList, - MaxSeg, - NextHash, Count - 1, - BlockList2) - end; - error -> - {error_so_maybe_present, "Remainder Check"} - end; - error -> - {error_so_maybe_present, "Exponent Check"} - end. - - -checkhash_forsegments(Acc, Exp, Remainder, SegmentList, BlockList, BlockID) -> - NextHash = Acc + ?DIVISOR * Exp + Remainder, - case lists:member(NextHash, SegmentList) of - true -> - {NextHash, [BlockID|BlockList]}; - false -> - {NextHash, BlockList} - end. - - -setnth(0, [_|Rest], New) -> [New|Rest]; -setnth(I, [E|Rest], New) -> [E|setnth(I-1, Rest, New)]. - - -findexponent(BitStr) -> - findexponent(BitStr, 0). - -findexponent(<<>>, _) -> - error; -findexponent(<>, Acc) -> - case H of - 1 -> findexponent(T, Acc + 1); - 0 -> {ok, Acc, T} - end. 
- - -findremainder(BitStr, Factor) -> - case BitStr of - <> -> - {ok, Remainder, BlockID, Tail}; - _ -> - error - end. - - - -%%%============================================================================ -%%% Test -%%%============================================================================ - - --ifdef(TEST). - -generate_randomkeys({Count, StartSQN}) -> - generate_randomkeys(Count, StartSQN, []); -generate_randomkeys(Count) -> - generate_randomkeys(Count, 0, []). - -generate_randomkeys(0, _SQN, Acc) -> - lists:reverse(Acc); -generate_randomkeys(Count, SQN, Acc) -> - K = {o, - lists:concat(["Bucket", random:uniform(1024)]), - lists:concat(["Key", random:uniform(1024)]), - null}, - RandKey = {K, - {SQN, - {active, infinity}, - leveled_codec:magic_hash(K), - null}}, - generate_randomkeys(Count - 1, SQN + 1, [RandKey|Acc]). - -generate_sequentialkeys(Count, Start) -> - generate_sequentialkeys(Count + Start, Start, []). - -generate_sequentialkeys(Target, Incr, Acc) when Incr =:= Target -> - Acc; -generate_sequentialkeys(Target, Incr, Acc) -> - KeyStr = string:right(integer_to_list(Incr), 8, $0), - K = {o, "BucketSeq", lists:concat(["Key", KeyStr]), null}, - NextKey = {K, - {5, - {active, infinity}, - leveled_codec:magic_hash(K), - null}}, - generate_sequentialkeys(Target, Incr + 1, [NextKey|Acc]). 
- -simple_create_block_test() -> - KeyList1 = [{{o, "Bucket1", "Key1", null}, - {1, {active, infinity}, no_lookup, null}}, - {{o, "Bucket1", "Key3", null}, - {2, {active, infinity}, no_lookup, null}}], - KeyList2 = [{{o, "Bucket1", "Key2", null}, - {3, {active, infinity}, no_lookup, null}}], - {Status, BlockKeyList} = create_block(KeyList1, - KeyList2, - #level{level=1}), - ?assertMatch(partial, Status), - [H1|T1] = BlockKeyList, - ?assertMatch({{o, "Bucket1", "Key1", null}, - {1, {active, infinity}, no_lookup, null}}, H1), - [H2|T2] = T1, - ?assertMatch({{o, "Bucket1", "Key2", null}, - {3, {active, infinity}, no_lookup, null}}, H2), - ?assertMatch([{{o, "Bucket1", "Key3", null}, - {2, {active, infinity}, no_lookup, null}}], T2). - -dominate_create_block_test() -> - KeyList1 = [{{o, "Bucket1", "Key1", null}, - {1, {active, infinity}, no_lookup, null}}, - {{o, "Bucket1", "Key2", null}, - {2, {active, infinity}, no_lookup, null}}], - KeyList2 = [{{o, "Bucket1", "Key2", null}, - {3, {tomb, infinity}, no_lookup, null}}], - {Status, BlockKeyList} = create_block(KeyList1, - KeyList2, - #level{level=1}), - ?assertMatch(partial, Status), - [K1, K2] = BlockKeyList, - ?assertMatch(K1, lists:nth(1, KeyList1)), - ?assertMatch(K2, lists:nth(1, KeyList2)). 
- -sample_keylist() -> - KeyList1 = - [{{o, "Bucket1", "Key1", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket1", "Key3", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket1", "Key5", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket1", "Key7", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket1", "Key9", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket2", "Key1", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket2", "Key3", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket2", "Key5", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket2", "Key7", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket2", "Key9", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket3", "Key1", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket3", "Key3", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket3", "Key5", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket3", "Key7", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket3", "Key9", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket4", "Key1", null}, {1, {active, infinity}, 0, null}}], - KeyList2 = - [{{o, "Bucket1", "Key2", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket1", "Key4", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket1", "Key6", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket1", "Key8", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket1", "Key9a", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket1", "Key9b", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket1", "Key9c", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket1", "Key9d", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket2", "Key2", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket2", "Key4", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket2", "Key6", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket2", "Key8", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket3", "Key2", null}, {1, 
{active, infinity}, 0, null}}, - {{o, "Bucket3", "Key4", null}, {3, {active, infinity}, 0, null}}, - {{o, "Bucket3", "Key6", null}, {2, {active, infinity}, 0, null}}, - {{o, "Bucket3", "Key8", null}, {1, {active, infinity}, 0, null}}], - {KeyList1, KeyList2}. - -alternating_create_block_test() -> - {KeyList1, KeyList2} = sample_keylist(), - {Status, BlockKeyList} = create_block(KeyList1, - KeyList2, - #level{level=1}), - BlockSize = length(BlockKeyList), - ?assertMatch(BlockSize, 32), - ?assertMatch(all_complete, Status), - K1 = lists:nth(1, BlockKeyList), - ?assertMatch(K1, {{o, "Bucket1", "Key1", null}, {1, {active, infinity}, 0, null}}), - K11 = lists:nth(11, BlockKeyList), - ?assertMatch(K11, {{o, "Bucket1", "Key9b", null}, {1, {active, infinity}, 0, null}}), - K32 = lists:nth(32, BlockKeyList), - ?assertMatch(K32, {{o, "Bucket4", "Key1", null}, {1, {active, infinity}, 0, null}}), - HKey = {{o, "Bucket1", "Key0", null}, {1, {active, infinity}, 0, null}}, - {Status2, _} = create_block([HKey|KeyList1], KeyList2, #level{level=1}), - ?assertMatch(block_full, element(1, Status2)). 
- - -merge_seglists_test() -> - SegList1 = [0, 100, 200], - SegList2 = [50, 200], - SegList3 = [75, 10000], - SegList4 = [], - MergedList = merge_seglists({SegList1, SegList2, - SegList3, SegList4}), - ?assertMatch(MergedList, [{0, 0}, {50, 1}, {75, 2}, {100, 0}, - {200, 0}, {200,1}, {10000,2}]), - SegTerm = generate_segment_filter({SegList1, SegList2, - SegList3, SegList4}), - ?assertMatch(SegTerm, {[{0, 0}, {50, 1}, {25, 2}, {25, 0}, - {100, 0}, {0, 1}, {9800, 2}], - [{200, 0}, {200, 1}, {10000, 2},{0, 3}]}), - SegBin = serialise_segment_filter(SegTerm), - ExpectedTopHashes = <<200:20, 200:20, 10000:20, 0:20>>, - ExpectedDeltas = <<0:1, 0:13, 0:2, - 0:1, 50:13, 1:2, - 0:1, 25:13, 2:2, - 0:1, 25:13, 0:2, - 0:1, 100:13, 0:2, - 0:1, 0:13, 1:2, - 2:2, 1708:13, 2:2>>, - ExpectedResult = <>, - ?assertMatch(SegBin, ExpectedResult), - R1 = check_for_segments(SegBin, [100], true), - ?assertMatch(R1,{maybe_present, [0]}), - R2 = check_for_segments(SegBin, [900], true), - ?assertMatch(R2, not_present), - R3 = check_for_segments(SegBin, [200], true), - ?assertMatch(R3, {maybe_present, [1,0]}), - R4 = check_for_segments(SegBin, [0,900], true), - ?assertMatch(R4, {maybe_present, [0]}), - R5 = check_for_segments(SegBin, [100], false), - ?assertMatch(R5, {maybe_present, [0]}), - R6 = check_for_segments(SegBin, [900], false), - ?assertMatch(R6, not_present), - R7 = check_for_segments(SegBin, [200], false), - ?assertMatch(R7, {maybe_present, [1,0]}), - R8 = check_for_segments(SegBin, [0,900], false), - ?assertMatch(R8, {maybe_present, [0]}), - R9 = check_for_segments(SegBin, [1024*1024 - 1], false), - ?assertMatch(R9, not_present), - io:format("Try corrupted bloom filter with flipped bit in " ++ - "penultimate delta~n"), - ExpectedDeltasFlippedBit = <<0:1, 0:13, 0:2, - 0:1, 50:13, 1:2, - 0:1, 25:13, 2:2, - 0:1, 25:13, 0:2, - 0:1, 100:13, 0:2, - 0:1, 0:13, 1:2, - 2:2, 1709:13, 2:2>>, - SegBin1 = <>, - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin1, [900], true)), 
- ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin1, [200], true)), - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin1, [0,900], true)), - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin1, [1024*1024 - 1], true)), - % This match is before the flipped bit, so still works without CRC check - ?assertMatch({maybe_present, [0]}, - check_for_segments(SegBin1, [0,900], false)), - io:format("Try corrupted bloom filter with flipped bit in " ++ - "final block's top hash~n"), - ExpectedTopHashesFlippedBit = <<200:20, 200:20, 10000:20, 1:20>>, - SegBin2 = <>, - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin2, [900], true)), - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin2, [200], true)), - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin2, [0,900], true)), - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin2, [1024*1024 - 1], true)), - % This match is before the flipped bit, so still works without CRC check - ?assertMatch({maybe_present, [0]}, - check_for_segments(SegBin2, [0,900], false)), - - ExpectedDeltasAll1s = <<4294967295:32/integer>>, - SegBin3 = <>, - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin3, [900], true)), - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin3, [200], true)), - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin3, [0,900], true)), - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin3, [1024*1024 - 1], true)), - % This is so badly mangled, the error gets detected event without CRC - % checking being enforced - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin3, [900], false)), - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin3, [200], false)), - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin3, [0,900], false)), - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin3, [1024*1024 - 1], false)), - - 
ExpectedDeltasNearlyAll1s = <<4294967287:32/integer>>, - SegBin4 = <>, - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin4, [900], true)), - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin4, [200], true)), - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin4, [0,900], true)), - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin4, [1024*1024 - 1], true)), - % This is so badly mangled, the error gets detected event without CRC - % checking being enforced - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin4, [900], false)), - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin4, [200], false)), - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin4, [0,900], false)), - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin4, [1024*1024 - 1], false)). - -createslot_stage1_test() -> - {KeyList1, KeyList2} = sample_keylist(), - {Status, BlockKeyLists} = create_slot(KeyList1, KeyList2, #level{level=1}), - WState = finalise_slot(BlockKeyLists, #writer{}), - - ?assertMatch({o, "Bucket4", "Key1", null}, WState#writer.last_key), - ?assertMatch(partial, Status), - - %% Writer state has the SlotIndex which includes the segment filter - SegFilter = element(2, lists:nth(1, WState#writer.slot_index)), - - R0 = check_for_segments(serialise_segment_filter(SegFilter), - [hash_for_segmentid({keyonly, {o, "Bucket1", "Key1", null}})], - true), - ?assertMatch({maybe_present, [0]}, R0), - R1 = check_for_segments(serialise_segment_filter(SegFilter), - [hash_for_segmentid({keyonly, {o, "Bucket1", "Key99", null}})], - true), - ?assertMatch(not_present, R1), - ?assertMatch(1, WState#writer.min_sqn), - ?assertMatch(3, WState#writer.max_sqn). 
- - -createslot_stage2_test() -> - {Status, BlockKeyLists} = create_slot(lists:sort(generate_randomkeys(100)), - lists:sort(generate_randomkeys(100)), - #level{level=1}), - WState = finalise_slot(BlockKeyLists, #writer{}), - LengthList = element(3, lists:nth(1, WState#writer.slot_index)), - - ?assertMatch(full, element(1, Status)), - Sum1 = lists:sum(LengthList), - Sum2 = byte_size(WState#writer.slot_binary), - ?assertMatch(Sum1, Sum2). - - -createslot_stage3_test() -> - {Status, BlockKeyLists} = create_slot(lists:sort(generate_sequentialkeys(100, 1)), - lists:sort(generate_sequentialkeys(100, 101)), - #level{level=1}), - WState = finalise_slot(BlockKeyLists, #writer{}), - {FirstKey, SegFilter, LengthList} = lists:nth(1, WState#writer.slot_index), - - ?assertMatch(full, element(1, Status)), - Sum1 = lists:sum(LengthList), - Sum2 = byte_size(WState#writer.slot_binary), - ?assertMatch(Sum1, Sum2), - ?assertMatch({o, "BucketSeq", "Key00000001", null}, FirstKey), - ?assertMatch({o, "BucketSeq", "Key00000128", null}, WState#writer.last_key), - ?assertMatch([], element(2, Status)), - Rem = length(element(3, Status)), - ?assertMatch(Rem, 72), - R0 = check_for_segments(serialise_segment_filter(SegFilter), - [hash_for_segmentid({keyonly, - {o, "BucketSeq", "Key00000100", null}})], - true), - ?assertMatch({maybe_present, [3]}, R0), - R1 = check_for_segments(serialise_segment_filter(SegFilter), - [hash_for_segmentid({keyonly, - {o, "Bucket1", "Key99", null}})], - true), - ?assertMatch(not_present, R1), - R2 = check_for_segments(serialise_segment_filter(SegFilter), - [hash_for_segmentid({keyonly, - {o, "BucketSeq", "Key00000040", null}})], - true), - ?assertMatch({maybe_present, [1]}, R2), - R3 = check_for_segments(serialise_segment_filter(SegFilter), - [hash_for_segmentid({keyonly, - {o, "BucketSeq", "Key00000004", null}})], - true), - ?assertMatch({maybe_present, [0]}, R3). 
- - -initial_create_header_test() -> - Output = create_header(initial), - ?assertMatch(?HEADER_LEN, byte_size(Output)). - -initial_create_file_test() -> - Filename = "../test/test1.sft", - {KL1, KL2} = sample_keylist(), - {Handle, FileMD} = create_file(Filename), - {UpdHandle, UpdFileMD, {[], []}} = complete_file(Handle, FileMD, - KL1, KL2, - #level{level=1}), - - io:format("Slot Index of UpdFileMD ~w~n", [UpdFileMD#state.slot_index]), - Result1 = fetch_keyvalue(UpdHandle, UpdFileMD, {o, "Bucket1", "Key8", null}), - ?assertMatch({{o, "Bucket1", "Key8", null}, - {1, {active, infinity}, 0, null}}, Result1), - Result2 = fetch_keyvalue(UpdHandle, UpdFileMD, {o, "Bucket1", "Key88", null}), - ?assertMatch(not_present, Result2), - ok = file:close(UpdHandle), - ok = file:delete(Filename). - -big_create_file_test() -> - Filename = "../test/bigtest1.sft", - {KL1, KL2} = {lists:sort(generate_randomkeys(2000)), - lists:sort(generate_randomkeys(40000))}, - {InitHandle, InitFileMD} = create_file(Filename), - {Handle, FileMD, {_KL1Rem, _KL2Rem}} = complete_file(InitHandle, - InitFileMD, - KL1, KL2, - #level{level=1}), - [{K1, {Sq1, St1, MH1, V1}}|_] = KL1, - [{K2, {Sq2, St2, MH2, V2}}|_] = KL2, - Result1 = fetch_keyvalue(Handle, FileMD, K1), - Result2 = fetch_keyvalue(Handle, FileMD, K2), - ?assertMatch({K1, {Sq1, St1, MH1, V1}}, Result1), - ?assertMatch({K2, {Sq2, St2, MH2, V2}}, Result2), - SubList = lists:sublist(KL2, 1000), - lists:foreach(fun(KV) -> - {Kn, _} = KV, - Rn = fetch_keyvalue(Handle, FileMD, Kn), - ?assertMatch({Kn, _}, Rn) - end, - SubList), - Result3 = fetch_keyvalue(Handle, - FileMD, - {o, "Bucket1024", "Key1024Alt", null}), - ?assertMatch(Result3, not_present), - ok = file:close(Handle), - ok = file:delete(Filename). 
- -initial_iterator_test() -> - Filename = "../test/test2.sft", - {KL1, KL2} = sample_keylist(), - {Handle, FileMD} = create_file(Filename), - {UpdHandle, UpdFileMD, {[], []}} = complete_file(Handle, FileMD, - KL1, KL2, - #level{level=1}), - Result1 = fetch_range_keysonly(UpdHandle, UpdFileMD, - {o, "Bucket1", "Key8", null}, - {o, "Bucket1", "Key9d", null}), - io:format("Result returned of ~w~n", [Result1]), - ?assertMatch({complete, - [{{o, "Bucket1", "Key8", null}, 1, {active, infinity}}, - {{o, "Bucket1", "Key9", null}, 1, {active, infinity}}, - {{o, "Bucket1", "Key9a", null}, 1, {active, infinity}}, - {{o, "Bucket1", "Key9b", null}, 1, {active, infinity}}, - {{o, "Bucket1", "Key9c", null}, 1, {active, infinity}}, - {{o, "Bucket1", "Key9d", null}, 1, {active, infinity}} - ]}, - Result1), - Result2 = fetch_range_keysonly(UpdHandle, UpdFileMD, - {o, "Bucket1", "Key8", null}, - {o, "Bucket1", "Key9b", null}), - ?assertMatch({complete, - [{{o, "Bucket1", "Key8", null}, 1, {active, infinity}}, - {{o, "Bucket1", "Key9", null}, 1, {active, infinity}}, - {{o, "Bucket1", "Key9a", null}, 1, {active, infinity}}, - {{o, "Bucket1", "Key9b", null}, 1, {active, infinity}} - ]}, - Result2), - Result3 = fetch_range_keysonly(UpdHandle, UpdFileMD, - {o, "Bucket3", "Key4", null}, - all), - {partial, RL3, _} = Result3, - ?assertMatch([{{o, "Bucket3", "Key4", null}, 3, {active, infinity}}, - {{o, "Bucket3", "Key5", null}, 1, {active, infinity}}, - {{o, "Bucket3", "Key6", null}, 2, {active, infinity}}, - {{o, "Bucket3", "Key7", null}, 1, {active, infinity}}, - {{o, "Bucket3", "Key8", null}, 1, {active, infinity}}, - {{o, "Bucket3", "Key9", null}, 1, {active, infinity}}, - {{o, "Bucket4", "Key1", null}, 1, {active, infinity}}], - RL3), - ok = file:close(UpdHandle), - ok = file:delete(Filename). 
- -key_dominates_test() -> - KV1 = {{o, "Bucket", "Key1", null}, {5, {active, infinity}, 0, []}}, - KV2 = {{o, "Bucket", "Key3", null}, {6, {active, infinity}, 0, []}}, - KV3 = {{o, "Bucket", "Key2", null}, {3, {active, infinity}, 0, []}}, - KV4 = {{o, "Bucket", "Key4", null}, {7, {active, infinity}, 0, []}}, - KV5 = {{o, "Bucket", "Key1", null}, {4, {active, infinity}, 0, []}}, - KV6 = {{o, "Bucket", "Key1", null}, {99, {tomb, 999}, 0, []}}, - KV7 = {{o, "Bucket", "Key1", null}, {99, tomb, 0, []}}, - KL1 = [KV1, KV2], - KL2 = [KV3, KV4], - ?assertMatch({{next_key, KV1}, [KV2], KL2}, - key_dominates(KL1, KL2, {undefined, 1})), - ?assertMatch({{next_key, KV1}, KL2, [KV2]}, - key_dominates(KL2, KL1, {undefined, 1})), - ?assertMatch({skipped_key, KL2, KL1}, - key_dominates([KV5|KL2], KL1, {undefined, 1})), - ?assertMatch({{next_key, KV1}, [KV2], []}, - key_dominates(KL1, [], {undefined, 1})), - ?assertMatch({skipped_key, [KV6|KL2], [KV2]}, - key_dominates([KV6|KL2], KL1, {undefined, 1})), - ?assertMatch({{next_key, KV6}, KL2, [KV2]}, - key_dominates([KV6|KL2], [KV2], {undefined, 1})), - ?assertMatch({skipped_key, [KV6|KL2], [KV2]}, - key_dominates([KV6|KL2], KL1, {true, 1})), - ?assertMatch({skipped_key, [KV6|KL2], [KV2]}, - key_dominates([KV6|KL2], KL1, {true, 1000})), - ?assertMatch({{next_key, KV6}, KL2, [KV2]}, - key_dominates([KV6|KL2], [KV2], {true, 1})), - ?assertMatch({skipped_key, KL2, [KV2]}, - key_dominates([KV6|KL2], [KV2], {true, 1000})), - ?assertMatch({skipped_key, [], []}, - key_dominates([KV6], [], {true, 1000})), - ?assertMatch({skipped_key, [], []}, - key_dominates([], [KV6], {true, 1000})), - ?assertMatch({{next_key, KV6}, [], []}, - key_dominates([KV6], [], {true, 1})), - ?assertMatch({{next_key, KV6}, [], []}, - key_dominates([], [KV6], {true, 1})), - ?assertMatch({skipped_key, [], []}, - key_dominates([KV7], [], {true, 1})), - ?assertMatch({skipped_key, [], []}, - key_dominates([], [KV7], {true, 1})), - ?assertMatch({skipped_key, [KV7|KL2], 
[KV2]}, - key_dominates([KV7|KL2], KL1, {undefined, 1})), - ?assertMatch({{next_key, KV7}, KL2, [KV2]}, - key_dominates([KV7|KL2], [KV2], {undefined, 1})), - ?assertMatch({skipped_key, [KV7|KL2], [KV2]}, - key_dominates([KV7|KL2], KL1, {true, 1})), - ?assertMatch({skipped_key, KL2, [KV2]}, - key_dominates([KV7|KL2], [KV2], {true, 1})). - - -corrupted_sft_test() -> - Filename = "../test/bigcorrupttest1.sft", - {KL1, KL2} = {lists:ukeysort(1, generate_randomkeys(2000)), []}, - {InitHandle, InitFileMD} = create_file(Filename), - {Handle, _FileMD, _Rems} = complete_file(InitHandle, - InitFileMD, - KL1, KL2, - #level{level=1}), - {ok, Lengths} = file:pread(Handle, 12, 12), - <> = Lengths, - ok = file:close(Handle), - - {ok, Corrupter} = file:open(Filename , [binary, raw, read, write]), - lists:foreach(fun(X) -> - case X * 5 of - Y when Y < FilterLength -> - Position = ?HEADER_LEN + X * 5 - + BlocksLength + IndexLength, - file:pwrite(Corrupter, - Position, - <<0:8/integer>>) - end - end, - lists:seq(1, 100)), - ok = file:close(Corrupter), - - {ok, SFTr, _KeyExtremes} = sft_open(Filename), - lists:foreach(fun({K, V}) -> - ?assertMatch({K, V}, sft_get(SFTr, K)) - end, - KL1), - ok = sft_clear(SFTr). 
- -big_iterator_test() -> - Filename = "../test/bigtest1.sft", - {KL1, KL2} = {lists:sort(generate_randomkeys(10000)), []}, - {InitHandle, InitFileMD} = create_file(Filename), - {Handle, FileMD, {KL1Rem, KL2Rem}} = complete_file(InitHandle, InitFileMD, - KL1, KL2, - #level{level=1}), - io:format("Remainder lengths are ~w and ~w ~n", [length(KL1Rem), - length(KL2Rem)]), - {complete, - Result1} = fetch_range_keysonly(Handle, - FileMD, - {o, "Bucket0000", "Key0000", null}, - {o, "Bucket9999", "Key9999", null}, - 256), - NumAddedKeys = 10000 - length(KL1Rem), - ?assertMatch(NumAddedKeys, length(Result1)), - {partial, - Result2, - _} = fetch_range_keysonly(Handle, - FileMD, - {o, "Bucket0000", "Key0000", null}, - {o, "Bucket9999", "Key9999", null}, - 32), - ?assertMatch(32 * 128, length(Result2)), - {partial, - Result3, - _} = fetch_range_keysonly(Handle, - FileMD, - {o, "Bucket0000", "Key0000", null}, - {o, "Bucket9999", "Key9999", null}, - 4), - ?assertMatch(4 * 128, length(Result3)), - ok = file:close(Handle), - ok = file:delete(Filename). 
- -hashclash_test() -> - Filename = "../test/hashclash.sft", - Key1 = {o, "Bucket", "Key838068", null}, - Key99 = {o, "Bucket", "Key898982", null}, - KeyNF = {o, "Bucket", "Key539122", null}, - ?assertMatch(4, hash_for_segmentid({keyonly, Key1})), - ?assertMatch(4, hash_for_segmentid({keyonly, Key99})), - ?assertMatch(4, hash_for_segmentid({keyonly, KeyNF})), - KeyList = lists:foldl(fun(X, Acc) -> - Key = {o, - "Bucket", - "Key8400" ++ integer_to_list(X), - null}, - Value = {X, - {active, infinity}, - leveled_codec:magic_hash(Key), - null}, - Acc ++ [{Key, Value}] end, - [], - lists:seq(10,98)), - KeyListToUse = [{Key1, - {1, - {active, infinity}, - leveled_codec:magic_hash(Key1), - null}}|KeyList] - ++ [{Key99, - {99, - {active, infinity}, - leveled_codec:magic_hash(Key99), - null}}], - {InitHandle, InitFileMD} = create_file(Filename), - {Handle, _FileMD, _Rem} = complete_file(InitHandle, InitFileMD, - KeyListToUse, [], - #level{level=1}), - ok = file:close(Handle), - {ok, SFTr, _KeyExtremes} = sft_open(Filename), - ?assertMatch({Key1, - {1, {active, infinity}, _, null}}, - sft_get(SFTr, Key1)), - ?assertMatch({Key99, - {99, {active, infinity}, _, null}}, - sft_get(SFTr, Key99)), - ?assertMatch(not_present, - sft_get(SFTr, KeyNF)), - - ok = sft_clear(SFTr). - -filename_test() -> - FN1 = "../tmp/filename", - FN2 = "../tmp/filename.pnd", - FN3 = "../tmp/subdir/file_name.pend", - ?assertMatch({"../tmp/filename.pnd", "../tmp/filename.sft"}, - generate_filenames(FN1)), - ?assertMatch({"../tmp/filename.pnd", "../tmp/filename.sft"}, - generate_filenames(FN2)), - ?assertMatch({"../tmp/subdir/file_name.pnd", - "../tmp/subdir/file_name.sft"}, - generate_filenames(FN3)). - -empty_file_test() -> - {ok, Pid, _Reply} = sft_new("../test/emptyfile.pnd", [], [], 1), - ?assertMatch(not_present, sft_get(Pid, "Key1")), - ?assertMatch([], sft_getkvrange(Pid, all, all, 16)), - ok = sft_clear(Pid). 
- - -nonsense_coverage_test() -> - {ok, Pid} = gen_fsm:start(?MODULE, [], []), - undefined = gen_fsm:sync_send_all_state_event(Pid, nonsense), - ok = gen_fsm:send_all_state_event(Pid, nonsense), - ?assertMatch({next_state, reader, #state{}}, handle_info(nonsense, - reader, - #state{})), - ?assertMatch({ok, reader, #state{}}, code_change(nonsense, - reader, - #state{}, - nonsense)). - --endif. \ No newline at end of file diff --git a/src/leveled_skiplist.erl b/src/leveled_skiplist.erl index d03f0c1..5829448 100644 --- a/src/leveled_skiplist.erl +++ b/src/leveled_skiplist.erl @@ -29,6 +29,7 @@ lookup/2, lookup/3, key_above/2, + key_above_notequals/2, empty/0, empty/1, size/1 @@ -123,8 +124,15 @@ to_range(SkipList, Start, End) -> to_list(SkipList) -> to_list(element(2, SkipList), ?LIST_HEIGHT). +%% If a mark is found that matches the key, will return that mark key_above(SkipList, Key) -> - key_above(element(2, SkipList), Key, ?LIST_HEIGHT). + TestFun = fun(Mark, K) -> Mark >= K end, + key_above(element(2, SkipList), Key, ?LIST_HEIGHT, TestFun). + +%% If a mark is found that matches the key, will return the next mark +key_above_notequals(SkipList, Key) -> + TestFun = fun(Mark, K) -> Mark > K end, + key_above(element(2, SkipList), Key, ?LIST_HEIGHT, TestFun). empty() -> empty(false). @@ -321,11 +329,11 @@ sublist_above(SkipList, StartKey, Level, StartIncl) -> sublist_above(SL, StartKey, Level - 1, StartIncl) end. 
-key_above(SkipList, Key, 0) -> +key_above(SkipList, Key, 0, TestFun) -> FindFun = fun({Mark, V}, Found) -> case Found of false -> - case Key =< Mark of + case TestFun(Mark, Key) of true -> {Mark, V}; false -> @@ -336,13 +344,13 @@ key_above(SkipList, Key, 0) -> end end, lists:foldl(FindFun, false, SkipList); -key_above(SkipList, Key, Level) -> +key_above(SkipList, Key, Level, TestFun) -> FindFun = fun({Mark, SL}, Found) -> case Found of false -> - case Key =< Mark of + case TestFun(Mark, Key) of true -> - key_above(SL, Key, Level - 1); + key_above(SL, Key, Level - 1, TestFun); false -> false end; diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 877e42c..ef699d6 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -65,6 +65,7 @@ -define(LEVEL_BLOOM_SLOTS, [{0, 64}, {1, 48}, {default, 32}]). -define(MERGE_SCANWIDTH, 16). -define(DISCARD_EXT, ".discarded"). +-define(DELETE_TIMEOUT, 10000). -include_lib("eunit/include/eunit.hrl"). @@ -74,21 +75,28 @@ handle_info/3, terminate/3, code_change/4, + starting/2, starting/3, - reader/3]). + reader/3, + delete_pending/2, + delete_pending/3]). --export([sst_new/3, - sst_new/5, - sst_newlevelzero/4, +-export([sst_new/4, + sst_new/6, + sst_newlevelzero/5, sst_open/1, sst_get/2, sst_get/3, sst_getkvrange/4, sst_getslots/2, + sst_getmaxsequencenumber/1, + sst_setfordelete/2, + sst_clear/1, + sst_checkready/1, + sst_deleteconfirmed/1, sst_close/1]). --export([generate_randomkeys/1]). - +-export([expand_list_by_pointer/3]). -record(slot_index_value, {slot_id :: integer(), @@ -100,12 +108,14 @@ last_key :: tuple(), index :: list(), % leveled_skiplist bloom :: tuple(), % leveled_tinybloom - size :: integer()}). + size :: integer(), + max_sqn :: integer()}). -record(state, {summary, handle :: file:fd(), sst_timings :: tuple(), slot_lengths :: list(), + penciller :: pid(), filename, cache}). @@ -121,33 +131,42 @@ sst_open(Filename) -> {ok, Pid, {SK, EK}} end. 
-sst_new(Filename, Level, KVList) -> +sst_new(Filename, Level, KVList, MaxSQN) -> {ok, Pid} = gen_fsm:start(?MODULE, [], []), case gen_fsm:sync_send_event(Pid, - {sst_new, Filename, Level, KVList}, + {sst_new, + Filename, + Level, + KVList, + MaxSQN}, infinity) of {ok, {SK, EK}} -> {ok, Pid, {SK, EK}} end. -sst_new(Filename, KL1, KL2, IsBasement, Level) -> +sst_new(Filename, KL1, KL2, IsBasement, Level, MaxSQN) -> {{Rem1, Rem2}, MergedList} = merge_lists(KL1, KL2, {IsBasement, Level}), {ok, Pid} = gen_fsm:start(?MODULE, [], []), case gen_fsm:sync_send_event(Pid, - {sst_new, Filename, Level, MergedList}, + {sst_new, + Filename, + Level, + MergedList, + MaxSQN}, infinity) of {ok, {SK, EK}} -> {ok, Pid, {{Rem1, Rem2}, SK, EK}} end. -sst_newlevelzero(Filename, Slots, FetchFun, Penciller) -> +sst_newlevelzero(Filename, Slots, FetchFun, Penciller, MaxSQN) -> {ok, Pid} = gen_fsm:start(?MODULE, [], []), gen_fsm:send_event(Pid, {sst_newlevelzero, Filename, Slots, FetchFun, - Penciller}), + Penciller, + MaxSQN}), {ok, Pid, noreply}. sst_get(Pid, LedgerKey) -> @@ -164,6 +183,24 @@ sst_getkvrange(Pid, StartKey, EndKey, ScanWidth) -> sst_getslots(Pid, SlotList) -> gen_fsm:sync_send_event(Pid, {get_slots, SlotList}, infinity). +sst_getmaxsequencenumber(Pid) -> + gen_fsm:sync_send_event(Pid, get_maxsequencenumber, infinity). + +sst_setfordelete(Pid, Penciller) -> + gen_fsm:sync_send_event(Pid, {set_for_delete, Penciller}, infinity). + +sst_clear(Pid) -> + gen_fsm:sync_send_event(Pid, {set_for_delete, false}, infinity), + gen_fsm:sync_send_event(Pid, close, 1000). + +sst_deleteconfirmed(Pid) -> + gen_fsm:send_event(Pid, close). + +sst_checkready(Pid) -> + %% Only used in test + gen_fsm:sync_send_event(Pid, background_complete, 100). + + sst_close(Pid) -> gen_fsm:sync_send_event(Pid, close, 2000). 
@@ -186,31 +223,48 @@ starting({sst_open, Filename}, _From, State) -> {ok, {Summary#summary.first_key, Summary#summary.last_key}}, reader, UpdState}; -starting({sst_new, Filename, Level, KVList}, _From, State) -> +starting({sst_new, Filename, Level, KVList, MaxSQN}, _From, State) -> {FirstKey, L, SlotIndex, AllHashes, SlotsBin} = build_all_slots(KVList), - SummaryBin = build_table_summary(SlotIndex, AllHashes, Level, FirstKey, L), + SummaryBin = build_table_summary(SlotIndex, + AllHashes, + Level, + FirstKey, + L, + MaxSQN), ActualFilename = write_file(Filename, SummaryBin, SlotsBin), - UpdState = read_file(ActualFilename, - State#state{filename=ActualFilename}), + UpdState = read_file(ActualFilename, State), Summary = UpdState#state.summary, + leveled_log:log("SST08", [ActualFilename, Level, Summary#summary.max_sqn]), {reply, {ok, {Summary#summary.first_key, Summary#summary.last_key}}, reader, UpdState}. -starting({sst_newlevelzero, Filename, Slots, FetchFun, Penciller}, State) -> +starting({sst_newlevelzero, Filename, Slots, FetchFun, Penciller, MaxSQN}, + State) -> KVList = leveled_pmem:to_list(Slots, FetchFun), {FirstKey, L, SlotIndex, AllHashes, SlotsBin} = build_all_slots(KVList), - SummaryBin = build_table_summary(SlotIndex, AllHashes, 0, FirstKey, L), + SummaryBin = build_table_summary(SlotIndex, + AllHashes, + 0, + FirstKey, + L, + MaxSQN), ActualFilename = write_file(Filename, SummaryBin, SlotsBin), - UpdState = read_file(ActualFilename, - State#state{filename=ActualFilename}), + UpdState = read_file(ActualFilename, State), Summary = UpdState#state.summary, - leveled_penciller:pcl_confirml0complete(Penciller, - UpdState#state.filename, - Summary#summary.first_key, - Summary#summary.last_key), - {next_state, reader, UpdState}. 
+ leveled_log:log("SST08", [ActualFilename, 0, Summary#summary.max_sqn]), + case Penciller of + undefined -> + {next_state, reader, UpdState}; + _ -> + leveled_penciller:pcl_confirml0complete(Penciller, + UpdState#state.filename, + Summary#summary.first_key, + Summary#summary.last_key), + {next_state, reader, UpdState} + end. + reader({get_kv, LedgerKey, Hash}, _From, State) -> SW = os:timestamp(), @@ -240,13 +294,70 @@ reader({get_slots, SlotList}, _From, State) -> fun({SlotBin, SK, EK}, Acc) -> Acc ++ trim_slot(SlotBin, SK, EK) end, {reply, lists:foldl(FoldFun, [], SlotBins), reader, State}; +reader(get_maxsequencenumber, _From, State) -> + Summary = State#state.summary, + {reply, Summary#summary.max_sqn, reader, State}; reader(print_timings, _From, State) -> io:format(user, "Timings of ~w~n", [State#state.sst_timings]), {reply, ok, reader, State#state{sst_timings = undefined}}; +reader({set_for_delete, Penciller}, _From, State) -> + leveled_log:log("SST06", [State#state.filename]), + {reply, + ok, + delete_pending, + State#state{penciller=Penciller}, + ?DELETE_TIMEOUT}; +reader(background_complete, _From, State) -> + Summary = State#state.summary, + {reply, + {ok, + State#state.filename, + Summary#summary.first_key, + Summary#summary.last_key}, + reader, + State}; reader(close, _From, State) -> ok = file:close(State#state.handle), {stop, normal, ok, State}. 
+ +delete_pending({get_kv, LedgerKey, Hash}, _From, State) -> + {Result, Stage, SlotID} = fetch(LedgerKey, Hash, State), + case {Result, Stage} of + {not_present, slot_crc_wonky} -> + leveled_log:log("SST02", [State#state.filename, SlotID]), + {reply, Result, reader, State, ?DELETE_TIMEOUT}; + {not_present, _} -> + {reply, Result, reader, State, ?DELETE_TIMEOUT}; + {KV, slot_lookup_hit} -> + UpdCache = array:set(SlotID, KV, State#state.cache), + UpdState = State#state{cache = UpdCache}, + {reply, Result, reader, UpdState, ?DELETE_TIMEOUT}; + _ -> + {reply, Result, reader, State, ?DELETE_TIMEOUT} + end; +delete_pending({get_kvrange, StartKey, EndKey, ScanWidth}, _From, State) -> + {reply, + fetch_range(StartKey, EndKey, ScanWidth, State), + reader, + State, + ?DELETE_TIMEOUT}; +delete_pending(close, _From, State) -> + leveled_log:log("SST07", [State#state.filename]), + ok = file:close(State#state.handle), + ok = file:delete(State#state.filename), + {stop, normal, ok, State}. + +delete_pending(timeout, State) -> + ok = leveled_penciller:pcl_confirmdelete(State#state.penciller, + State#state.filename), + {next_state, delete_pending, State, ?DELETE_TIMEOUT}; +delete_pending(close, State) -> + leveled_log:log("SST07", [State#state.filename]), + ok = file:close(State#state.handle), + ok = file:delete(State#state.filename), + {stop, normal, State}. + handle_sync_event(_Msg, _From, StateName, State) -> {reply, undefined, StateName, State}. @@ -413,10 +524,14 @@ read_file(Filename, State) -> SlotCount = length(SlotLengths), SkipL = leveled_skiplist:from_sortedlist(Summary#summary.index), UpdSummary = Summary#summary{index = SkipL}, - leveled_log:log("SST03", [Filename, Summary#summary.size, SlotCount]), + leveled_log:log("SST03", [Filename, + Summary#summary.size, + SlotCount, + Summary#summary.max_sqn]), State#state{summary = UpdSummary, slot_lengths = SlotLengths, handle = Handle, + filename = Filename, cache = array:new({size, SlotCount + 1})}. 
open_reader(Filename) -> @@ -426,7 +541,7 @@ open_reader(Filename) -> {ok, SummaryBin} = file:pread(Handle, SlotsLength + 8, SummaryLength), {Handle, SummaryBin}. -build_table_summary(SlotIndex, AllHashes, Level, FirstKey, L) -> +build_table_summary(SlotIndex, AllHashes, Level, FirstKey, L, MaxSQN) -> BloomSlots = case lists:keyfind(Level, 1, ?LEVEL_BLOOM_SLOTS) of {Level, N} -> @@ -442,7 +557,8 @@ build_table_summary(SlotIndex, AllHashes, Level, FirstKey, L) -> last_key = LastKey, size = L, index = lists:reverse(SlotIndex), - bloom = Bloom}, + bloom = Bloom, + max_sqn = MaxSQN}, SummBin = term_to_binary(Summary, [{compressed, ?COMPRESSION_LEVEL}]), SummCRC = erlang:crc32(SummBin), <>. @@ -546,8 +662,13 @@ lookup_slots_int(StartKey, EndKey, SkipList) -> EndKey -> {L0, true, false}; _ -> - LTail = leveled_skiplist:key_above(SkipList, EndKey), - {L0 ++ [LTail], true, true} + LTail = leveled_skiplist:key_above_notequals(SkipList, LastKey), + case LTail of + false -> + {L0, true, false}; + _ -> + {L0 ++ [LTail], true, true} + end end. @@ -751,13 +872,29 @@ key_dominates_expanded([H1|T1], [H2|T2], Level) -> maybe_expand_pointer([]) -> []; -maybe_expand_pointer([{pointer, SFTPid, Slot, StartKey, all}|Tail]) -> +maybe_expand_pointer([{pointer, SSTPid, Slot, StartKey, all}|Tail]) -> + expand_list_by_pointer({pointer, SSTPid, Slot, StartKey, all}, + Tail, + ?MERGE_SCANWIDTH); +maybe_expand_pointer([{next, SSTPid, StartKey}|Tail]) -> + expand_list_by_pointer({next, SSTPid, StartKey, all}, + Tail, + ?MERGE_SCANWIDTH); +maybe_expand_pointer(List) -> + List. 
+ + +expand_list_by_pointer({pointer, SSTPid, Slot, StartKey, EndKey}, Tail, 1) -> + AccPointers = [{pointer, Slot, StartKey, EndKey}], + ExpPointers = leveled_sst:sst_getslots(SSTPid, AccPointers), + lists:append(ExpPointers, Tail); +expand_list_by_pointer({pointer, SSTPid, Slot, StartKey, all}, Tail, Width) -> FoldFun = fun(X, {Pointers, Remainder}) -> case length(Pointers) of - L when L < ?MERGE_SCANWIDTH -> + L when L < Width -> case X of - {pointer, SFTPid, S, SK, EK} -> + {pointer, SSTPid, S, SK, EK} -> {Pointers ++ [{pointer, S, SK, EK}], Remainder}; _ -> {Pointers, Remainder ++ [X]} @@ -768,16 +905,11 @@ maybe_expand_pointer([{pointer, SFTPid, Slot, StartKey, all}|Tail]) -> end, InitAcc = {[{pointer, Slot, StartKey, all}], []}, {AccPointers, AccTail} = lists:foldl(FoldFun, InitAcc, Tail), - SW = os:timestamp(), - ExpPointers = sst_getslots(SFTPid, AccPointers), - leveled_log:log_timer("SFT14", [SFTPid], SW), + ExpPointers = leveled_sst:sst_getslots(SSTPid, AccPointers), lists:append(ExpPointers, AccTail); -maybe_expand_pointer([{next, SFTPid, StartKey}|Tail]) -> - ExpPointer = sst_getkvrange(SFTPid, StartKey, all, ?MERGE_SCANWIDTH), - maybe_expand_pointer(ExpPointer ++ Tail); -maybe_expand_pointer(List) -> - List. - +expand_list_by_pointer({next, SSTPid, StartKey, EndKey}, Tail, Width) -> + ExpPointer = leveled_sst:sst_getkvrange(SSTPid, StartKey, EndKey, Width), + ExpPointer ++ Tail. @@ -787,13 +919,6 @@ maybe_expand_pointer(List) -> -ifdef(TEST). -generate_randomkeys({Count, StartSQN}) -> - BucketNumber = random:uniform(1024), - generate_randomkeys(Count, StartSQN, [], BucketNumber, BucketNumber); -generate_randomkeys(Count) -> - BucketNumber = random:uniform(1024), - generate_randomkeys(Count, 0, [], BucketNumber, BucketNumber). 
- generate_randomkeys(Seqn, Count, BucketRangeLow, BucketRangeHigh) -> generate_randomkeys(Seqn, Count, @@ -834,8 +959,8 @@ merge_test() -> KVL2 = lists:ukeysort(1, generate_randomkeys(1, N, 1, 20)), KVL3 = lists:ukeymerge(1, KVL1, KVL2), SW0 = os:timestamp(), - {ok, P1, {FK1, LK1}} = sst_new("../test/level1_src", 1, KVL1), - {ok, P2, {FK2, LK2}} = sst_new("../test/level2_src", 2, KVL2), + {ok, P1, {FK1, LK1}} = sst_new("../test/level1_src", 1, KVL1, 6000), + {ok, P2, {FK2, LK2}} = sst_new("../test/level2_src", 2, KVL2, 3000), ExpFK1 = element(1, lists:nth(1, KVL1)), ExpLK1 = element(1, lists:last(KVL1)), ExpFK2 = element(1, lists:nth(1, KVL2)), @@ -850,7 +975,8 @@ merge_test() -> ML1, ML2, false, - 2), + 2, + N * 2), ?assertMatch([], Rem1), ?assertMatch([], Rem2), ?assertMatch(true, FK3 == min(FK1, FK2)), @@ -915,7 +1041,8 @@ simple_slotbinsummary_test() -> AllHashes, 2, FirstKey, - length(KVList1)), + length(KVList1), + undefined), Summary = read_table_summary(SummaryBin), SummaryIndex = leveled_skiplist:from_sortedlist(Summary#summary.index), FetchFun = @@ -945,7 +1072,10 @@ simple_persisted_test() -> KVList1 = lists:ukeysort(1, KVList0), [{FirstKey, _FV}|_Rest] = KVList1, {LastKey, _LV} = lists:last(KVList1), - {ok, Pid, {FirstKey, LastKey}} = sst_new(Filename, 1, KVList1), + {ok, Pid, {FirstKey, LastKey}} = sst_new(Filename, + 1, + KVList1, + length(KVList1)), SW1 = os:timestamp(), lists:foreach(fun({K, V}) -> ?assertMatch({K, V}, sst_get(Pid, K)), @@ -1014,7 +1144,71 @@ simple_persisted_test() -> ?assertMatch(SubKVListA1L, length(FetchedListB2)), ?assertMatch(SubKVListA1, FetchedListB2), + FetchListB3 = sst_getkvrange(Pid, + Eight000Key, + {o, null, null, null}, + 4), + FetchedListB3 = lists:foldl(FoldFun, [], FetchListB3), + SubKVListA3 = lists:nthtail(800 - 1, KVList1), + SubKVListA3L = length(SubKVListA3), + io:format("Length expected ~w~n", [SubKVListA3L]), + ?assertMatch(SubKVListA3L, length(FetchedListB3)), + ?assertMatch(SubKVListA3, FetchedListB3), + 
ok = sst_close(Pid), ok = file:delete(Filename ++ ".sst"). +key_dominates_test() -> + KV1 = {{o, "Bucket", "Key1", null}, {5, {active, infinity}, 0, []}}, + KV2 = {{o, "Bucket", "Key3", null}, {6, {active, infinity}, 0, []}}, + KV3 = {{o, "Bucket", "Key2", null}, {3, {active, infinity}, 0, []}}, + KV4 = {{o, "Bucket", "Key4", null}, {7, {active, infinity}, 0, []}}, + KV5 = {{o, "Bucket", "Key1", null}, {4, {active, infinity}, 0, []}}, + KV6 = {{o, "Bucket", "Key1", null}, {99, {tomb, 999}, 0, []}}, + KV7 = {{o, "Bucket", "Key1", null}, {99, tomb, 0, []}}, + KL1 = [KV1, KV2], + KL2 = [KV3, KV4], + ?assertMatch({{next_key, KV1}, [KV2], KL2}, + key_dominates(KL1, KL2, {undefined, 1})), + ?assertMatch({{next_key, KV1}, KL2, [KV2]}, + key_dominates(KL2, KL1, {undefined, 1})), + ?assertMatch({skipped_key, KL2, KL1}, + key_dominates([KV5|KL2], KL1, {undefined, 1})), + ?assertMatch({{next_key, KV1}, [KV2], []}, + key_dominates(KL1, [], {undefined, 1})), + ?assertMatch({skipped_key, [KV6|KL2], [KV2]}, + key_dominates([KV6|KL2], KL1, {undefined, 1})), + ?assertMatch({{next_key, KV6}, KL2, [KV2]}, + key_dominates([KV6|KL2], [KV2], {undefined, 1})), + ?assertMatch({skipped_key, [KV6|KL2], [KV2]}, + key_dominates([KV6|KL2], KL1, {true, 1})), + ?assertMatch({skipped_key, [KV6|KL2], [KV2]}, + key_dominates([KV6|KL2], KL1, {true, 1000})), + ?assertMatch({{next_key, KV6}, KL2, [KV2]}, + key_dominates([KV6|KL2], [KV2], {true, 1})), + ?assertMatch({skipped_key, KL2, [KV2]}, + key_dominates([KV6|KL2], [KV2], {true, 1000})), + ?assertMatch({skipped_key, [], []}, + key_dominates([KV6], [], {true, 1000})), + ?assertMatch({skipped_key, [], []}, + key_dominates([], [KV6], {true, 1000})), + ?assertMatch({{next_key, KV6}, [], []}, + key_dominates([KV6], [], {true, 1})), + ?assertMatch({{next_key, KV6}, [], []}, + key_dominates([], [KV6], {true, 1})), + ?assertMatch({skipped_key, [], []}, + key_dominates([KV7], [], {true, 1})), + ?assertMatch({skipped_key, [], []}, + key_dominates([], [KV7], 
{true, 1})), + ?assertMatch({skipped_key, [KV7|KL2], [KV2]}, + key_dominates([KV7|KL2], KL1, {undefined, 1})), + ?assertMatch({{next_key, KV7}, KL2, [KV2]}, + key_dominates([KV7|KL2], [KV2], {undefined, 1})), + ?assertMatch({skipped_key, [KV7|KL2], [KV2]}, + key_dominates([KV7|KL2], KL1, {true, 1})), + ?assertMatch({skipped_key, KL2, [KV2]}, + key_dominates([KV7|KL2], [KV2], {true, 1})). + + + -endif. \ No newline at end of file From 18f2b5660d157217d12841994dd2a9ec56dd5566 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Thu, 29 Dec 2016 02:31:10 +0000 Subject: [PATCH 21/58] Fix to ensure directory structure created --- src/leveled_penciller.erl | 5 ++++- src/leveled_sst.erl | 13 +++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl index 7f36325..fa98f9b 100644 --- a/src/leveled_penciller.erl +++ b/src/leveled_penciller.erl @@ -572,8 +572,11 @@ start_from_file(PCLopts) -> levelzero_index=leveled_pmem:new_index()}, %% Open manifest - ManifestPath = InitState#state.root_path ++ "/" ++ ?MANIFEST_FP ++ "/", + ManifestPath = filepath(InitState#state.root_path, manifest) ++ "/", + SSTPath = filepath(InitState#state.root_path, files) ++ "/", ok = filelib:ensure_dir(ManifestPath), + ok = filelib:ensure_dir(SSTPath), + {ok, Filenames} = file:list_dir(ManifestPath), CurrRegex = "nonzero_(?[0-9]+)\\." 
++ ?CURRENT_FILEX, ValidManSQNs = lists:foldl(fun(FN, Acc) -> diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index ef699d6..602fc2d 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -493,12 +493,13 @@ write_file(Filename, SummaryBin, SlotsBin) -> SummaryLength = byte_size(SummaryBin), SlotsLength = byte_size(SlotsBin), {PendingName, FinalName} = generate_filenames(Filename), - file:write_file(PendingName, - <>, - [raw]), + DirName = filename:dirname(PendingName), + ok = file:write_file(PendingName, + <>, + [raw]), case filelib:is_file(FinalName) of true -> AltName = filename:join(filename:dirname(FinalName), From 0c4d949c7fc5144f0b372d1711437b442d705f9a Mon Sep 17 00:00:00 2001 From: martinsumner Date: Thu, 29 Dec 2016 02:40:09 +0000 Subject: [PATCH 22/58] State mixup in FSM --- src/leveled_sst.erl | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 602fc2d..434b19c 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -326,20 +326,30 @@ delete_pending({get_kv, LedgerKey, Hash}, _From, State) -> case {Result, Stage} of {not_present, slot_crc_wonky} -> leveled_log:log("SST02", [State#state.filename, SlotID]), - {reply, Result, reader, State, ?DELETE_TIMEOUT}; + {reply, Result, delete_pending, State, ?DELETE_TIMEOUT}; {not_present, _} -> - {reply, Result, reader, State, ?DELETE_TIMEOUT}; + {reply, Result, delete_pending, State, ?DELETE_TIMEOUT}; {KV, slot_lookup_hit} -> UpdCache = array:set(SlotID, KV, State#state.cache), UpdState = State#state{cache = UpdCache}, - {reply, Result, reader, UpdState, ?DELETE_TIMEOUT}; + {reply, Result, delete_pending, UpdState, ?DELETE_TIMEOUT}; _ -> - {reply, Result, reader, State, ?DELETE_TIMEOUT} + {reply, Result, delete_pending, State, ?DELETE_TIMEOUT} end; delete_pending({get_kvrange, StartKey, EndKey, ScanWidth}, _From, State) -> {reply, fetch_range(StartKey, EndKey, ScanWidth, State), - reader, + delete_pending, + 
State, + ?DELETE_TIMEOUT}; +delete_pending({get_slots, SlotList}, _From, State) -> + SlotBins = read_slots(State#state.handle, SlotList), + FoldFun = + fun({SlotBin, SK, EK}, Acc) -> + Acc ++ trim_slot(SlotBin, SK, EK) end, + {reply, + lists:foldl(FoldFun, [], SlotBins), + delete_pending, State, ?DELETE_TIMEOUT}; delete_pending(close, _From, State) -> From a665b8ea4f3532344b7ea8b2e58139821e3b76db Mon Sep 17 00:00:00 2001 From: martinsumner Date: Thu, 29 Dec 2016 02:41:02 +0000 Subject: [PATCH 23/58] Tidy-up unused variable --- src/leveled_sst.erl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 434b19c..17c094e 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -503,7 +503,6 @@ write_file(Filename, SummaryBin, SlotsBin) -> SummaryLength = byte_size(SummaryBin), SlotsLength = byte_size(SlotsBin), {PendingName, FinalName} = generate_filenames(Filename), - DirName = filename:dirname(PendingName), ok = file:write_file(PendingName, < Date: Thu, 29 Dec 2016 02:47:21 +0000 Subject: [PATCH 24/58] Remove io:format from debugging --- src/leveled_penciller.erl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl index fa98f9b..031b0d2 100644 --- a/src/leveled_penciller.erl +++ b/src/leveled_penciller.erl @@ -402,7 +402,6 @@ handle_call({fetch_keys, StartKey, EndKey, AccFun, InitAcc, MaxKeys}, SSTiter = initiate_rangequery_frommanifest(StartKey, EndKey, State#state.manifest), - io:format("SSTiter on query ~w~n", [SSTiter]), Acc = keyfolder({L0AsList, SSTiter}, {StartKey, EndKey}, {AccFun, InitAcc}, From 3f3b36597a289e5be57b691d7832ce305f22cd2a Mon Sep 17 00:00:00 2001 From: martinsumner Date: Thu, 29 Dec 2016 02:55:28 +0000 Subject: [PATCH 25/58] Add timer for SST creation --- src/leveled_sst.erl | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 17c094e..b81346d 100644 --- a/src/leveled_sst.erl 
+++ b/src/leveled_sst.erl @@ -224,6 +224,7 @@ starting({sst_open, Filename}, _From, State) -> reader, UpdState}; starting({sst_new, Filename, Level, KVList, MaxSQN}, _From, State) -> + SW = os:timestamp(), {FirstKey, L, SlotIndex, AllHashes, SlotsBin} = build_all_slots(KVList), SummaryBin = build_table_summary(SlotIndex, AllHashes, @@ -234,7 +235,9 @@ starting({sst_new, Filename, Level, KVList, MaxSQN}, _From, State) -> ActualFilename = write_file(Filename, SummaryBin, SlotsBin), UpdState = read_file(ActualFilename, State), Summary = UpdState#state.summary, - leveled_log:log("SST08", [ActualFilename, Level, Summary#summary.max_sqn]), + leveled_log:log_timer("SST08", + [ActualFilename, Level, Summary#summary.max_sqn], + SW), {reply, {ok, {Summary#summary.first_key, Summary#summary.last_key}}, reader, @@ -242,6 +245,7 @@ starting({sst_new, Filename, Level, KVList, MaxSQN}, _From, State) -> starting({sst_newlevelzero, Filename, Slots, FetchFun, Penciller, MaxSQN}, State) -> + SW = os:timestamp(), KVList = leveled_pmem:to_list(Slots, FetchFun), {FirstKey, L, SlotIndex, AllHashes, SlotsBin} = build_all_slots(KVList), SummaryBin = build_table_summary(SlotIndex, @@ -253,7 +257,9 @@ starting({sst_newlevelzero, Filename, Slots, FetchFun, Penciller, MaxSQN}, ActualFilename = write_file(Filename, SummaryBin, SlotsBin), UpdState = read_file(ActualFilename, State), Summary = UpdState#state.summary, - leveled_log:log("SST08", [ActualFilename, 0, Summary#summary.max_sqn]), + leveled_log:log_timer("SST08", + [ActualFilename, 0, Summary#summary.max_sqn], + SW), case Penciller of undefined -> {next_state, reader, UpdState}; From 5b9e68df9910ebfdabcb331b33a07400c74073e7 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Thu, 29 Dec 2016 03:04:10 +0000 Subject: [PATCH 26/58] Add some crash protection for empty return from to_range Not clear though why it would occur. 
--- src/leveled_sst.erl | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index b81346d..a3f2ec1 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -672,18 +672,23 @@ lookup_slots_int(StartKey, all, SkipList) -> LTrim = FirstKey < StartKey, {RKeep0, LTrim, false}; lookup_slots_int(StartKey, EndKey, SkipList) -> - L0 = leveled_skiplist:to_range(SkipList, StartKey, EndKey), - {LastKey, _LastVal} = lists:last(L0), - case LastKey of - EndKey -> - {L0, true, false}; - _ -> - LTail = leveled_skiplist:key_above_notequals(SkipList, LastKey), - case LTail of - false -> + case leveled_skiplist:to_range(SkipList, StartKey, EndKey) of + [] -> + {[], false, false}; + L0 -> + {LastKey, _LastVal} = lists:last(L0), + case LastKey of + EndKey -> {L0, true, false}; _ -> - {L0 ++ [LTail], true, true} + LTail = leveled_skiplist:key_above_notequals(SkipList, + LastKey), + case LTail of + false -> + {L0, true, false}; + _ -> + {L0 ++ [LTail], true, true} + end end end. From 55386622f7d035e8babb2a0d72ae2e8f59e5fb63 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Thu, 29 Dec 2016 04:37:49 +0000 Subject: [PATCH 27/58] Fixed issues Two issues - when the key range falls in-between two marks in the summary, we didn't pick up any mark. then when trimming both right and left, the left trim was being discarded. --- src/leveled_skiplist.erl | 26 ++++++++++++++++++++++++++ src/leveled_sst.erl | 13 +++++++++++-- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/src/leveled_skiplist.erl b/src/leveled_skiplist.erl index 5829448..e8d8627 100644 --- a/src/leveled_skiplist.erl +++ b/src/leveled_skiplist.erl @@ -673,6 +673,32 @@ skiplist_keybefore_test() -> io:format(user, "~nFinding self in keys above ~w microseconds for ~w finds~n", [timer:now_diff(os:timestamp(), SW), N]). 
+skiplist_range_test() -> + N = 150, + KL = generate_randomkeys(1, N, 1, N div 5), + + KLSL1 = lists:sublist(lists:ukeysort(1, KL), 128), + SkipList1 = from_list(KLSL1), + {LastK1, V1} = lists:last(KLSL1), + R1 = to_range(SkipList1, LastK1, LastK1), + ?assertMatch([{LastK1, V1}], R1), + + KLSL2 = lists:sublist(lists:ukeysort(1, KL), 127), + SkipList2 = from_list(KLSL2), + {LastK2, V2} = lists:last(KLSL2), + R2 = to_range(SkipList2, LastK2, LastK2), + ?assertMatch([{LastK2, V2}], R2), + + KLSL3 = lists:sublist(lists:ukeysort(1, KL), 129), + SkipList3 = from_list(KLSL3), + {LastK3, V3} = lists:last(KLSL3), + R3 = to_range(SkipList3, LastK3, LastK3), + ?assertMatch([{LastK3, V3}], R3), + + {FirstK4, V4} = lists:nth(1, KLSL3), + R4 = to_range(SkipList3, FirstK4, FirstK4), + ?assertMatch([{FirstK4, V4}], R4). + empty_skiplist_size_test() -> ?assertMatch(0, leveled_skiplist:size(empty(false))), diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index a3f2ec1..3fc5ef3 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -674,7 +674,8 @@ lookup_slots_int(StartKey, all, SkipList) -> lookup_slots_int(StartKey, EndKey, SkipList) -> case leveled_skiplist:to_range(SkipList, StartKey, EndKey) of [] -> - {[], false, false}; + BestKey = leveled_skiplist:key_above(SkipList, StartKey), + {[BestKey], true, true}; L0 -> {LastKey, _LastVal} = lists:last(L0), case LastKey of @@ -788,7 +789,7 @@ trim_slot(SlotBinary, StartKey, EndKey) -> all -> LTrimL; _ -> - {LKeep, _RDrop} = lists:splitwith(RTrimFun, L), + {LKeep, _RDrop} = lists:splitwith(RTrimFun, LTrimL), LKeep end, RTrimL. 
@@ -1176,6 +1177,14 @@ simple_persisted_test() -> ?assertMatch(SubKVListA3L, length(FetchedListB3)), ?assertMatch(SubKVListA3, FetchedListB3), + io:format("Eight hundredth key ~w~n", [Eight000Key]), + FetchListB4 = sst_getkvrange(Pid, + Eight000Key, + Eight000Key, + 4), + FetchedListB4 = lists:foldl(FoldFun, [], FetchListB4), + ?assertMatch([{Eight000Key, _v800}], FetchedListB4), + ok = sst_close(Pid), ok = file:delete(Filename ++ ".sst"). From e01b310d2059fd77e6d357fdcd415661ee3b329f Mon Sep 17 00:00:00 2001 From: martinsumner Date: Thu, 29 Dec 2016 05:09:47 +0000 Subject: [PATCH 28/58] Handle production of empty file --- src/leveled_log.erl | 2 -- src/leveled_pclerk.erl | 4 +--- src/leveled_sst.erl | 25 +++++++++++++++---------- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/src/leveled_log.erl b/src/leveled_log.erl index 8ab798f..de73c23 100644 --- a/src/leveled_log.erl +++ b/src/leveled_log.erl @@ -152,8 +152,6 @@ {info, "File to be created as part of MSN=~w Filename=~s"}}, {"PC013", {warn, "Merge resulted in empty file ~s"}}, - {"PC014", - {info, "Empty file ~s to be cleared"}}, {"PC015", {info, "File created"}}, {"PC016", diff --git a/src/leveled_pclerk.erl b/src/leveled_pclerk.erl index 2f29920..1744a3d 100644 --- a/src/leveled_pclerk.erl +++ b/src/leveled_pclerk.erl @@ -319,10 +319,8 @@ do_merge(KL1, KL2, {SrcLevel, IsB}, {Filepath, MSN}, MaxSQN, SrcLevel + 1, MaxSQN), case Reply of - {{[], []}, null, _} -> + empty -> leveled_log:log("PC013", [FileName]), - leveled_log:log("PC014", [FileName]), - ok = leveled_sst:sst_clear(Pid), OutList; {{KL1Rem, KL2Rem}, SmallestKey, HighestKey} -> ExtMan = lists:append(OutList, diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 3fc5ef3..41fcbb6 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -146,16 +146,21 @@ sst_new(Filename, Level, KVList, MaxSQN) -> sst_new(Filename, KL1, KL2, IsBasement, Level, MaxSQN) -> {{Rem1, Rem2}, MergedList} = merge_lists(KL1, KL2, {IsBasement, 
Level}), - {ok, Pid} = gen_fsm:start(?MODULE, [], []), - case gen_fsm:sync_send_event(Pid, - {sst_new, - Filename, - Level, - MergedList, - MaxSQN}, - infinity) of - {ok, {SK, EK}} -> - {ok, Pid, {{Rem1, Rem2}, SK, EK}} + case MergedList of + [] -> + empty; + _ -> + {ok, Pid} = gen_fsm:start(?MODULE, [], []), + case gen_fsm:sync_send_event(Pid, + {sst_new, + Filename, + Level, + MergedList, + MaxSQN}, + infinity) of + {ok, {SK, EK}} -> + {ok, Pid, {{Rem1, Rem2}, SK, EK}} + end end. sst_newlevelzero(Filename, Slots, FetchFun, Penciller, MaxSQN) -> From 0c543ae3ecd4903a934f1c01b7586212645b5387 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Thu, 29 Dec 2016 05:10:11 +0000 Subject: [PATCH 29/58] Remove legacy logs --- src/leveled_log.erl | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/src/leveled_log.erl b/src/leveled_log.erl index de73c23..ba669fa 100644 --- a/src/leveled_log.erl +++ b/src/leveled_log.erl @@ -249,35 +249,6 @@ {"SST08", {info, "Completed creation of ~s at level ~w with max sqn ~w"}}, - {"SFT01", - {info, "Opened filename with name ~s"}}, - {"SFT02", - {info, "File ~s has been set for delete"}}, - {"SFT03", - {info, "File creation of L0 file ~s"}}, - {"SFT04", - {debug, "File ~s prompting for delete status check"}}, - {"SFT05", - {info, "Exit called for reason ~w on filename ~s"}}, - {"SFT06", - {info, "Exit called and now clearing ~s"}}, - {"SFT07", - {info, "Creating file with input of size ~w"}}, - {"SFT08", - {info, "Renaming file from ~s to ~s"}}, - {"SFT09", - {warn, "Filename ~s already exists"}}, - {"SFT10", - {warn, "Rename rogue filename ~s to ~s"}}, - {"SFT11", - {error, "Segment filter failed due to ~s"}}, - {"SFT12", - {error, "Segment filter failed due to CRC check ~w did not match ~w"}}, - {"SFT13", - {error, "Segment filter failed due to ~s"}}, - {"SFT14", - {debug, "Range fetch from SFT PID ~w"}}, - {"CDB01", {info, "Opening file for writing with filename ~s"}}, {"CDB02", From 
7049aaf5ca0fe6aeb2918a4fd5a0ea34ca4dff54 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Thu, 29 Dec 2016 09:35:58 +0000 Subject: [PATCH 30/58] Better attempt to handle empty file being generated --- src/leveled_pclerk.erl | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/src/leveled_pclerk.erl b/src/leveled_pclerk.erl index 1744a3d..9ccc791 100644 --- a/src/leveled_pclerk.erl +++ b/src/leveled_pclerk.erl @@ -312,26 +312,21 @@ do_merge(KL1, KL2, {SrcLevel, IsB}, {Filepath, MSN}, MaxSQN, [SrcLevel + 1, FileCounter])), leveled_log:log("PC012", [MSN, FileName]), TS1 = os:timestamp(), - {ok, Pid, Reply} = leveled_sst:sst_new(FileName, - KL1, - KL2, - IsB, - SrcLevel + 1, - MaxSQN), - case Reply of + case leveled_sst:sst_new(FileName, KL1, KL2, IsB, SrcLevel + 1, MaxSQN) of empty -> leveled_log:log("PC013", [FileName]), OutList; - {{KL1Rem, KL2Rem}, SmallestKey, HighestKey} -> - ExtMan = lists:append(OutList, - [#manifest_entry{start_key=SmallestKey, - end_key=HighestKey, - owner=Pid, - filename=FileName}]), - leveled_log:log_timer("PC015", [], TS1), - do_merge(KL1Rem, KL2Rem, - {SrcLevel, IsB}, {Filepath, MSN}, MaxSQN, - FileCounter + 1, ExtMan) + {ok, Pid, Reply} -> + {{KL1Rem, KL2Rem}, SmallestKey, HighestKey} = Reply, + ExtMan = lists:append(OutList, + [#manifest_entry{start_key=SmallestKey, + end_key=HighestKey, + owner=Pid, + filename=FileName}]), + leveled_log:log_timer("PC015", [], TS1), + do_merge(KL1Rem, KL2Rem, + {SrcLevel, IsB}, {Filepath, MSN}, MaxSQN, + FileCounter + 1, ExtMan) end. 
From afb28aa7d6f0fd423512b30c1280e59c0f927037 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Thu, 29 Dec 2016 10:21:57 +0000 Subject: [PATCH 31/58] Switch iterator scan width to macro And 4 seems a more reasonable number than 1 --- src/leveled_penciller.erl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl index 031b0d2..b99b3bd 100644 --- a/src/leveled_penciller.erl +++ b/src/leveled_penciller.erl @@ -203,6 +203,7 @@ -define(WORKQUEUE_BACKLOG_TOLERANCE, 4). -define(COIN_SIDECOUNT, 5). -define(SLOW_FETCH, 20000). +-define(ITERATOR_SCANWIDTH, 4). -record(state, {manifest = [] :: list(), manifest_sqn = 0 :: integer(), @@ -962,7 +963,7 @@ find_nextkey(QueryArray, StartKey, EndKey) -> {null, null}, StartKey, EndKey, - 1). + ?ITERATOR_SCANWIDTH). find_nextkey(_QueryArray, LCnt, {null, null}, _StartKey, _EndKey, _Width) when LCnt > ?MAX_LEVELS -> From 8f0bf8b892562bf7e2094556df1675c230e27c53 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Thu, 29 Dec 2016 10:34:53 +0000 Subject: [PATCH 32/58] Fix overlapping _ references --- src/leveled_sst.erl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 41fcbb6..bcdfa6c 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -728,8 +728,8 @@ read_slot(Handle, Slot) -> end. 
read_slots(Handle, SlotList) -> - [{pointer, FirstSlot, _SK, _EK}|_Rest] = SlotList, - {pointer, LastSlot, _SK, _EK} = lists:last(SlotList), + [{pointer, FirstSlot, _SK1, _EK1}|_Rest] = SlotList, + {pointer, LastSlot, _SKL, _EKL} = lists:last(SlotList), StartPos = FirstSlot#slot_index_value.start_position, Length = LastSlot#slot_index_value.start_position + LastSlot#slot_index_value.length From fb75a26497854d4a75c0c76d27f887aede344ca7 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Thu, 29 Dec 2016 10:46:12 +0000 Subject: [PATCH 33/58] Handle mismatch on expanding pointer Remove the nasty legacy of hard-coding for a scan width of 1 --- src/leveled_sst.erl | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index bcdfa6c..7078064 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -911,11 +911,7 @@ maybe_expand_pointer(List) -> List. -expand_list_by_pointer({pointer, SSTPid, Slot, StartKey, EndKey}, Tail, 1) -> - AccPointers = [{pointer, Slot, StartKey, EndKey}], - ExpPointers = leveled_sst:sst_getslots(SSTPid, AccPointers), - lists:append(ExpPointers, Tail); -expand_list_by_pointer({pointer, SSTPid, Slot, StartKey, all}, Tail, Width) -> +expand_list_by_pointer({pointer, SSTPid, Slot, StartKey, EndKey}, Tail, Width) -> FoldFun = fun(X, {Pointers, Remainder}) -> case length(Pointers) of @@ -930,7 +926,7 @@ expand_list_by_pointer({pointer, SSTPid, Slot, StartKey, all}, Tail, Width) -> {Pointers, Remainder ++ [X]} end end, - InitAcc = {[{pointer, Slot, StartKey, all}], []}, + InitAcc = {[{pointer, Slot, StartKey, EndKey}], []}, {AccPointers, AccTail} = lists:foldl(FoldFun, InitAcc, Tail), ExpPointers = leveled_sst:sst_getslots(SSTPid, AccPointers), lists:append(ExpPointers, AccTail); From 4784f8521aa0875f82511e4dbb06a1e12fb6f22e Mon Sep 17 00:00:00 2001 From: martinsumner Date: Thu, 29 Dec 2016 11:59:07 +0000 Subject: [PATCH 34/58] Entropy fiddle Try and increase efefctiveness of bloom by 
combing Magic Hash with phash2 --- src/leveled_sst.erl | 29 ++++++++++++++++++----------- src/leveled_tinybloom.erl | 35 +++++++++++++++++++---------------- 2 files changed, 37 insertions(+), 27 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 7078064..f84d43a 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -401,13 +401,14 @@ code_change(_OldVsn, StateName, State, _Extra) -> fetch(LedgerKey, Hash, State) -> Summary = State#state.summary, - case leveled_tinybloom:check({hash, Hash}, Summary#summary.bloom) of + case leveled_tinybloom:check({hash, Hash}, + Summary#summary.bloom) of false -> {not_present, summary_bloom, null}; true -> Slot = lookup_slot(LedgerKey, Summary#summary.index), SlotBloom = Slot#slot_index_value.bloom, - case is_check_slot_required({hash, Hash}, SlotBloom) of + case is_check_slot_required({hash, Hash}, LedgerKey, SlotBloom) of false -> {not_present, slot_bloom, null}; true -> @@ -570,7 +571,9 @@ build_table_summary(SlotIndex, AllHashes, Level, FirstKey, L, MaxSQN) -> false -> element(2, lists:keyfind(default, 1, ?LEVEL_BLOOM_SLOTS)) end, - Bloom = lists:foldr(fun leveled_tinybloom:enter/2, + BloomAddFun = + fun({H, _K}, Bloom) -> leveled_tinybloom:enter(H, Bloom) end, + Bloom = lists:foldr(BloomAddFun, leveled_tinybloom:empty(BloomSlots), AllHashes), [{LastKey, _LastV}|_Rest] = SlotIndex, @@ -627,7 +630,7 @@ build_all_slots(KVL, Count, Start, AllHashes, SlotID, SlotIndex, SlotsBin) -> no_lookup -> Acc; H -> - [{hash, H}|Acc] + [{{hash, H}, K}|Acc] end end, HashList = lists:foldr(ExtractHashFun, [], SlotList), @@ -649,16 +652,18 @@ build_all_slots(KVL, Count, Start, AllHashes, SlotID, SlotIndex, SlotsBin) -> build_slot(KVList, HashList) -> Tree = gb_trees:from_orddict(KVList), - Bloom = lists:foldr(fun leveled_tinybloom:tiny_enter/2, + BloomAddFun = + fun({H, K}, Bloom) -> leveled_tinybloom:tiny_enter(H, K, Bloom) end, + Bloom = lists:foldr(BloomAddFun, leveled_tinybloom:tiny_empty(), HashList), SlotBin = 
term_to_binary(Tree, [{compressed, ?COMPRESSION_LEVEL}]), {SlotBin, Bloom}. -is_check_slot_required(_Hash, none) -> +is_check_slot_required(_Hash, _Key, none) -> true; -is_check_slot_required(Hash, Bloom) -> - leveled_tinybloom:tiny_check(Hash, Bloom). +is_check_slot_required(Hash, Key, Bloom) -> + leveled_tinybloom:tiny_check(Hash, Key, Bloom). %% Returns a section from the summary index and two booleans to indicate if %% the first slot needs trimming, or the last slot @@ -1030,7 +1035,7 @@ simple_slotbin_test() -> ExtractHashFun = fun({K, V}) -> {_SQN, H} = leveled_codec:strip_to_seqnhashonly({K, V}), - {hash, H} end, + {{hash, H}, K} end, HashList = lists:map(ExtractHashFun, KVList1), SW0 = os:timestamp(), {SlotBin0, Bloom0} = build_slot(KVList1, HashList), @@ -1038,8 +1043,10 @@ simple_slotbin_test() -> [timer:now_diff(os:timestamp(), SW0), byte_size(SlotBin0)]), SW1 = os:timestamp(), - lists:foreach(fun(H) -> ?assertMatch(true, - is_check_slot_required(H, Bloom0)) + lists:foreach(fun({H, K}) -> ?assertMatch(true, + is_check_slot_required(H, + K, + Bloom0)) end, HashList), lists:foreach(fun({K, V}) -> diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl index 5428917..c03a5b5 100644 --- a/src/leveled_tinybloom.erl +++ b/src/leveled_tinybloom.erl @@ -20,8 +20,8 @@ enter/2, check/2, empty/1, - tiny_enter/2, - tiny_check/2, + tiny_enter/3, + tiny_check/3, tiny_empty/0 ]). @@ -75,16 +75,16 @@ check(Key, Bloom) -> tiny_empty() -> <<0:1024>>. -tiny_enter({hash, no_lookup}, Bloom) -> +tiny_enter({hash, no_lookup}, _Key, Bloom) -> Bloom; -tiny_enter({hash, Hash}, Bloom) -> - {Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash), +tiny_enter({hash, Hash}, Key, Bloom) -> + {Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash, Key), AddFun = fun(Bit, Arr0) -> add_to_array(Bit, Arr0, 1024) end, lists:foldl(AddFun, Bloom, [Bit0, Bit1, Bit2]). 
-tiny_check({hash, Hash}, Bloom) -> - {Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash), +tiny_check({hash, Hash}, Key, Bloom) -> + {Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash, Key), case getbit(Bit0, Bloom, 1024) of <<0:1>> -> false; @@ -113,8 +113,9 @@ split_hash(Hash) -> H2 = Hash bsr 20, {H0, H1, H2}. -split_hash_for_tinybloom(Hash) -> +split_hash_for_tinybloom(MagicHash, Key) -> % Tiny bloom can make k=3 from one hash + Hash = MagicHash bxor erlang:phash2(Key), H0 = Hash band 1023, H1 = (Hash bsr 11) band 1023, H2 = (Hash bsr 22) band 1023, @@ -194,8 +195,8 @@ simple_test() -> ?assertMatch(true, FP < (N div 4)). tiny_test() -> - N = 256, - K = 32, % more checks out then in K * checks + N = 128, + K = 64, % more checks out than in K * checks KLin = lists:map(fun(X) -> "Key_" ++ integer_to_list(X) ++ integer_to_list(random:uniform(100)) ++ @@ -211,27 +212,29 @@ tiny_test() -> lists:seq(1, N * K)), HashIn = lists:map(fun(X) -> - {hash, leveled_codec:magic_hash(X)} end, + {{hash, leveled_codec:magic_hash(X)}, X} end, KLin), HashOut = lists:map(fun(X) -> - {hash, leveled_codec:magic_hash(X)} end, + {{hash, leveled_codec:magic_hash(X)}, X} end, KLout), SW1 = os:timestamp(), - Bloom = lists:foldr(fun tiny_enter/2, tiny_empty(), HashIn), + Bloom = lists:foldr(fun({H0, K0}, B) -> tiny_enter(H0, K0, B) end, + tiny_empty(), + HashIn), io:format(user, "~nAdding ~w hashes to tiny bloom took ~w microseconds~n", [N, timer:now_diff(os:timestamp(), SW1)]), SW2 = os:timestamp(), - lists:foreach(fun(X) -> - ?assertMatch(true, tiny_check(X, Bloom)) end, HashIn), + lists:foreach(fun({H1, K1}) -> + ?assertMatch(true, tiny_check(H1, K1, Bloom)) end, HashIn), io:format(user, "~nChecking ~w hashes in tiny bloom took ~w microseconds~n", [N, timer:now_diff(os:timestamp(), SW2)]), SW3 = os:timestamp(), - FP = lists:foldr(fun(X, Acc) -> case tiny_check(X, Bloom) of + FP = lists:foldr(fun({H3, K3}, Acc) -> case tiny_check(H3, K3, Bloom) of true -> Acc + 1; false -> Acc end end, 
From a261d4793b199303198df558a40c171b2e8c4d13 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Thu, 29 Dec 2016 12:01:42 +0000 Subject: [PATCH 35/58] Increase test size Be able to read more into sample-based output --- src/leveled_sst.erl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index f84d43a..b4d0ce0 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -1098,7 +1098,7 @@ simple_slotbinsummary_test() -> simple_persisted_test() -> Filename = "../test/simple_test", - KVList0 = generate_randomkeys(1, ?SLOT_SIZE * 16, 1, 20), + KVList0 = generate_randomkeys(1, ?SLOT_SIZE * 32, 1, 20), KVList1 = lists:ukeysort(1, KVList0), [{FirstKey, _FV}|_Rest] = KVList1, {LastKey, _LV} = lists:last(KVList1), @@ -1117,7 +1117,7 @@ simple_persisted_test() -> ++ "microseconds~n", [length(KVList1), timer:now_diff(os:timestamp(), SW1)]), ok = sst_printtimings(Pid), - KVList2 = generate_randomkeys(1, ?SLOT_SIZE * 16, 1, 20), + KVList2 = generate_randomkeys(1, ?SLOT_SIZE * 32, 1, 20), MapFun = fun({K, V}, Acc) -> In = lists:keymember(K, 1, KVList1), From 41ee90a2ef1baafd42fb6b36249c7d2e1f85ef9c Mon Sep 17 00:00:00 2001 From: martinsumner Date: Thu, 29 Dec 2016 12:10:12 +0000 Subject: [PATCH 36/58] OTP16 compatability --- src/leveled_sst.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index b4d0ce0..f1466c4 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -100,7 +100,7 @@ -record(slot_index_value, {slot_id :: integer(), - bloom :: dict:dict(), + bloom, start_position :: integer(), length :: integer()}). 
From b855401696028e8792612fdbe338a8eae5fa87eb Mon Sep 17 00:00:00 2001 From: martinsumner Date: Thu, 29 Dec 2016 14:11:05 +0000 Subject: [PATCH 37/58] Experiment Want to experiemnt with different datatypes for the slot - maybe use a raw list but with a mini hashtree index like the CDB file --- src/leveled_sst.erl | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index f1466c4..f281ba1 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -981,6 +981,41 @@ generate_randomkeys(Seqn, Count, Acc, BucketLow, BRange) -> BRange). +experimental_test() -> + io:format(user, "~nExperimental timing test:~n", []), + N = 128, + KVL1 = lists:ukeysort(1, generate_randomkeys(1, N, 1, 2)), + ExtractHashFun = + fun({K, V}) -> + {_SQN, H} = leveled_codec:strip_to_seqnhashonly({K, V}), + {{hash, H}, K} end, + HashList = lists:map(ExtractHashFun, KVL1), + + SWA0 = os:timestamp(), + Tree = gb_trees:from_orddict(KVL1), + BloomAddFun = + fun({H, K}, Bloom) -> leveled_tinybloom:tiny_enter(H, K, Bloom) end, + _Bloom = lists:foldr(BloomAddFun, + leveled_tinybloom:tiny_empty(), + HashList), + SlotBin = term_to_binary(Tree, [{compressed, ?COMPRESSION_LEVEL}]), + io:format(user, + "Created slot in ~w microseconds~n", + [timer:now_diff(os:timestamp(), SWA0)]), + + % {TestK1, TestV1} = lists:nth(16, KVL1), + {TestK2, TestV2} = lists:nth(64, KVL1), + % {TestK3, TestV3} = lists:nth(96, KVL1), + SWA1 = os:timestamp(), + Slot0 = binary_to_term(SlotBin), + {value, TestV2} = gb_trees:lookup(TestK2, Slot0), + io:format(user, + "Looked in slot in ~w microseconds~n", + [timer:now_diff(os:timestamp(), SWA1)]). 
+ + + + merge_test() -> N = 3000, KVL1 = lists:ukeysort(1, generate_randomkeys(N + 1, N, 1, 20)), From b509e81cfd08c234eba89fb7c2eeefc7c7fef7fc Mon Sep 17 00:00:00 2001 From: martinsumner Date: Thu, 29 Dec 2016 14:14:09 +0000 Subject: [PATCH 38/58] Ongoing timing tests --- src/leveled_sst.erl | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index f281ba1..40b4a06 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -1003,18 +1003,23 @@ experimental_test() -> "Created slot in ~w microseconds~n", [timer:now_diff(os:timestamp(), SWA0)]), - % {TestK1, TestV1} = lists:nth(16, KVL1), + {TestK1, TestV1} = lists:nth(16, KVL1), {TestK2, TestV2} = lists:nth(64, KVL1), - % {TestK3, TestV3} = lists:nth(96, KVL1), + {TestK3, TestV3} = lists:nth(96, KVL1), + test_slot(SlotBin, TestK1, TestV1), + test_slot(SlotBin, TestK2, TestV2), + test_slot(SlotBin, TestK3, TestV3). + + + +test_slot(SlotBin, Key, Value) -> SWA1 = os:timestamp(), Slot0 = binary_to_term(SlotBin), - {value, TestV2} = gb_trees:lookup(TestK2, Slot0), + {value, Value} = gb_trees:lookup(Key, Slot0), io:format(user, "Looked in slot in ~w microseconds~n", [timer:now_diff(os:timestamp(), SWA1)]). 
- - merge_test() -> N = 3000, From c0d959beffa1a2485e699b97d2eba242f8519b2b Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Thu, 29 Dec 2016 22:22:13 +0000 Subject: [PATCH 39/58] Five alternatives explored --- src/leveled_sst.erl | 379 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 364 insertions(+), 15 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 40b4a06..713b0fa 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -983,41 +983,390 @@ generate_randomkeys(Seqn, Count, Acc, BucketLow, BRange) -> experimental_test() -> io:format(user, "~nExperimental timing test:~n", []), - N = 128, - KVL1 = lists:ukeysort(1, generate_randomkeys(1, N, 1, 2)), + N = 150, + KVL0 = lists:ukeysort(1, generate_randomkeys(1, N, 1, 4)), + KVL1 = lists:sublist(KVL0, 128), + KVL2 = generate_randomkeys(1, N, 1, 4), + KVLnot0 = lists:foldr(fun({K, V}, Acc) -> + case lists:keymember(K, 1, KVL1) of + true -> + Acc; + false -> + [{K, leveled_codec:magic_hash(K), V}|Acc] + end end, + [], + KVL2), + KVLnot1 = lists:sublist(KVLnot0, 8), + ExtractHashFun = fun({K, V}) -> {_SQN, H} = leveled_codec:strip_to_seqnhashonly({K, V}), {{hash, H}, K} end, - HashList = lists:map(ExtractHashFun, KVL1), - - SWA0 = os:timestamp(), - Tree = gb_trees:from_orddict(KVL1), BloomAddFun = fun({H, K}, Bloom) -> leveled_tinybloom:tiny_enter(H, K, Bloom) end, - _Bloom = lists:foldr(BloomAddFun, + AltHashFoldFun = + fun({K, V}, {HashLAcc, PosBAcc}) -> + {_SQN, H} = leveled_codec:strip_to_seqnhashonly({K, V}), + PosH = H band 65535, + {[{{hash, H}, K}|HashLAcc], <>} + end, + AltAltHashFoldFun = + fun({K, V}, {HashLAcc, PosBAcc, PosC}) -> + {_SQN, H} = leveled_codec:strip_to_seqnhashonly({K, V}), + Slot = H rem 256, + PosH = (H bsr 8) band 65535, + {[{{hash, H}, K}|HashLAcc], + [{Slot, <>}|PosBAcc], + PosC + 1} + end, + + + SWA0 = os:timestamp(), + HashList = lists:map(ExtractHashFun, KVL1), + Tree = gb_trees:from_orddict(KVL1), + _BloomA = lists:foldr(BloomAddFun, 
leveled_tinybloom:tiny_empty(), HashList), - SlotBin = term_to_binary(Tree, [{compressed, ?COMPRESSION_LEVEL}]), + SlotBinA = term_to_binary(Tree, [{compressed, ?COMPRESSION_LEVEL}]), io:format(user, "Created slot in ~w microseconds~n", [timer:now_diff(os:timestamp(), SWA0)]), - {TestK1, TestV1} = lists:nth(16, KVL1), - {TestK2, TestV2} = lists:nth(64, KVL1), - {TestK3, TestV3} = lists:nth(96, KVL1), - test_slot(SlotBin, TestK1, TestV1), - test_slot(SlotBin, TestK2, TestV2), - test_slot(SlotBin, TestK3, TestV3). + {TestK1, TestV1} = lists:nth(20, KVL1), + MH1 = leveled_codec:magic_hash(TestK1), + {TestK2, TestV2} = lists:nth(40, KVL1), + MH2 = leveled_codec:magic_hash(TestK2), + {TestK3, TestV3} = lists:nth(60, KVL1), + MH3 = leveled_codec:magic_hash(TestK3), + {TestK4, TestV4} = lists:nth(80, KVL1), + MH4 = leveled_codec:magic_hash(TestK4), + {TestK5, TestV5} = lists:nth(100, KVL1), + MH5 = leveled_codec:magic_hash(TestK5), + + test_slot(SlotBinA, TestK1, TestV1), + test_slot(SlotBinA, TestK2, TestV2), + test_slot(SlotBinA, TestK3, TestV3), + test_slot(SlotBinA, TestK4, TestV4), + test_slot(SlotBinA, TestK5, TestV5), + lists:foreach(fun({NotK, _H, _V}) -> + test_not_slot(SlotBinA, NotK) end, + KVLnot1), + + SWB0 = os:timestamp(), + {Alt1HashList, Pos1Bin} = lists:foldr(AltHashFoldFun, {[], <<>>}, KVL1), + _BloomB = lists:foldr(BloomAddFun, + leveled_tinybloom:tiny_empty(), + Alt1HashList), + Alt1SlotBinB = term_to_binary(KVL1, [{compressed, ?COMPRESSION_LEVEL}]), + io:format(user, + "Alt1-method created slot in ~w microseconds~n", + [timer:now_diff(os:timestamp(), SWB0)]), + + alt1_test_slot(Alt1SlotBinB, Pos1Bin, MH1, TestK1, TestV1), + alt1_test_slot(Alt1SlotBinB, Pos1Bin, MH2, TestK2, TestV2), + alt1_test_slot(Alt1SlotBinB, Pos1Bin, MH3, TestK3, TestV3), + alt1_test_slot(Alt1SlotBinB, Pos1Bin, MH4, TestK4, TestV4), + alt1_test_slot(Alt1SlotBinB, Pos1Bin, MH5, TestK5, TestV5), + lists:foreach(fun({NotK, NotMH, _V}) -> + alt1_test_not_slot(Alt1SlotBinB, Pos1Bin, 
NotMH, NotK) end, + KVLnot1), + + SWC0 = os:timestamp(), + {KVL1A, KVL1B} = lists:split(64, KVL1), + {Alt2HashListA, Pos2BinA} = lists:foldr(AltHashFoldFun, {[], <<>>}, KVL1A), + {Alt2HashList, Pos2BinB} = lists:foldr(AltHashFoldFun, {Alt2HashListA, <<>>}, KVL1B), + _BloomB = lists:foldr(BloomAddFun, + leveled_tinybloom:tiny_empty(), + Alt2HashList), + Alt2SlotBinB_A = term_to_binary(KVL1A, [{compressed, ?COMPRESSION_LEVEL}]), + Alt2SlotBinB_B = term_to_binary(KVL1B, [{compressed, ?COMPRESSION_LEVEL}]), + Alt2Tester = [{Alt2SlotBinB_A, Pos2BinA}, {Alt2SlotBinB_B, Pos2BinB}], + io:format(user, + "Alt2-method created slot in ~w microseconds~n", + [timer:now_diff(os:timestamp(), SWC0)]), + + alt2_test_slot(Alt2Tester, MH1, TestK1, TestV1, "Alt2"), + alt2_test_slot(Alt2Tester, MH2, TestK2, TestV2, "Alt2"), + alt2_test_slot(Alt2Tester, MH3, TestK3, TestV3, "Alt2"), + alt2_test_slot(Alt2Tester, MH4, TestK4, TestV4, "Alt2"), + alt2_test_slot(Alt2Tester, MH5, TestK5, TestV5, "Alt2"), + lists:foreach(fun({NotK, NotMH, _V}) -> + alt2_test_not_slot(Alt2Tester, NotMH, NotK, "Alt2") end, + KVLnot1), + + SWD0 = os:timestamp(), + {KVL1A32, KVL1_96} = lists:split(32, KVL1), + {KVL1B32, KVL1_64} = lists:split(32, KVL1_96), + {KVL1C32, KVL1D32} = lists:split(32, KVL1_64), + {Alt3HashListA, Pos3BinA} = lists:foldr(AltHashFoldFun, {[], <<>>}, KVL1A32), + {Alt3HashListB, Pos3BinB} = lists:foldr(AltHashFoldFun, {Alt3HashListA, <<>>}, KVL1B32), + {Alt3HashListC, Pos3BinC} = lists:foldr(AltHashFoldFun, {Alt3HashListB, <<>>}, KVL1C32), + {Alt3HashList, Pos3BinD} = lists:foldr(AltHashFoldFun, {Alt3HashListC, <<>>}, KVL1D32), + _BloomB = lists:foldr(BloomAddFun, + leveled_tinybloom:tiny_empty(), + Alt3HashList), + Alt3SlotBinB_A = term_to_binary(KVL1A32, [{compressed, ?COMPRESSION_LEVEL}]), + Alt3SlotBinB_B = term_to_binary(KVL1B32, [{compressed, ?COMPRESSION_LEVEL}]), + Alt3SlotBinB_C = term_to_binary(KVL1C32, [{compressed, ?COMPRESSION_LEVEL}]), + Alt3SlotBinB_D = term_to_binary(KVL1D32, 
[{compressed, ?COMPRESSION_LEVEL}]), + Alt3Tester = [{Alt3SlotBinB_A, Pos3BinA}, {Alt3SlotBinB_B, Pos3BinB}, + {Alt3SlotBinB_C, Pos3BinC}, {Alt3SlotBinB_D, Pos3BinD}], + io:format(user, + "Alt3-method created slot in ~w microseconds~n", + [timer:now_diff(os:timestamp(), SWD0)]), + + + alt2_test_slot(Alt3Tester, MH1, TestK1, TestV1, "Alt3"), + alt2_test_slot(Alt3Tester, MH2, TestK2, TestV2, "Alt3"), + alt2_test_slot(Alt3Tester, MH3, TestK3, TestV3, "Alt3"), + alt2_test_slot(Alt3Tester, MH4, TestK4, TestV4, "Alt3"), + alt2_test_slot(Alt3Tester, MH5, TestK5, TestV5, "Alt3"), + lists:foreach(fun({NotK, NotMH, _V}) -> + alt2_test_not_slot(Alt3Tester, NotMH, NotK, "Alt3") end, + KVLnot1), + + SWE0 = os:timestamp(), + {KVL1A32, KVL1_96} = lists:split(32, KVL1), + {KVL1B32, KVL1_64} = lists:split(32, KVL1_96), + {KVL1C32, KVL1D32} = lists:split(32, KVL1_64), + io:format("final block length ~w~n", [length(KVL1D32)]), + {Alt4HashListA, PosTLA, 32} = lists:foldl(AltAltHashFoldFun, {[], [], 0}, KVL1A32), + {Alt4HashListB, PosTLB, 64} = lists:foldl(AltAltHashFoldFun, {Alt3HashListA, PosTLA, 32}, KVL1B32), + {Alt4HashListC, PosTLC, 96} = lists:foldl(AltAltHashFoldFun, {Alt3HashListB, PosTLB, 64}, KVL1C32), + {Alt4HashList, PosTLall, 128} = lists:foldl(AltAltHashFoldFun, {Alt3HashListC, PosTLC, 96}, KVL1D32), + _BloomB = lists:foldr(BloomAddFun, + leveled_tinybloom:tiny_empty(), + Alt3HashList), + Alt3SlotBinB_A = term_to_binary(KVL1A32, [{compressed, ?COMPRESSION_LEVEL}]), + Alt3SlotBinB_B = term_to_binary(KVL1B32, [{compressed, ?COMPRESSION_LEVEL}]), + Alt3SlotBinB_C = term_to_binary(KVL1C32, [{compressed, ?COMPRESSION_LEVEL}]), + Alt3SlotBinB_D = term_to_binary(KVL1D32, [{compressed, ?COMPRESSION_LEVEL}]), + SortedSlotList = lists:keysort(1, PosTLall), + PosBinList = build_hashtree_binary(SortedSlotList, 256, 0, []), + Alt4PosBin = list_to_binary(PosBinList), + + + Alt4Tester = {Alt3SlotBinB_A, Alt3SlotBinB_B, Alt3SlotBinB_C, Alt3SlotBinB_D}, + io:format(user, + "Alt4-method 
created slot in ~w microseconds~n", + [timer:now_diff(os:timestamp(), SWE0)]), + + alt4_test_slot(Alt4Tester, Alt4PosBin, TestK1, MH1, TestV1, "Alt4"), + alt4_test_slot(Alt4Tester, Alt4PosBin, TestK2, MH2, TestV2, "Alt4"), + alt4_test_slot(Alt4Tester, Alt4PosBin, TestK3, MH3, TestV3, "Alt4"), + alt4_test_slot(Alt4Tester, Alt4PosBin, TestK4, MH4, TestV4, "Alt4"), + alt4_test_slot(Alt4Tester, Alt4PosBin, TestK5, MH5, TestV5, "Alt4"). + + +alt4_test_slot(Alt4Tester, PosBin, Key, Hash, Value, M) -> + io:format("looking for key ~s ~s~n", [element(2, Key), element(3, Key)]), + SW = os:timestamp(), + Slot = Hash rem 256, + PosH = (Hash bsr 8) band 65535, + PosList = find_pos_byslot(PosBin, Slot, PosH, []), + io:format("PosList ~w~n", [PosList]), + FindKVFun = + fun(Pos, Found) -> + case Found of + not_present -> + Block = (Pos div 32) + 1, + BlockPos = (Pos rem 32) + 1, + io:format("Looking Block ~w position ~w~n", [Block, BlockPos]), + CheckBlock = element(Block, Alt4Tester), + {K, V} = lists:nth(BlockPos, binary_to_term(CheckBlock)), + io:format("K of ~s ~s~n", [element(2, K), element(3, K)]), + case K of + Key -> + {value, V}; + _ -> + not_present + end; + _ -> + Found + end end, + {value, Value} = lists:foldl(FindKVFun, not_present, PosList), + io:format(user, + M ++ "-method found in slot in ~w microseconds~n", + [timer:now_diff(os:timestamp(), SW)]). + + +find_pos_byslot(PosBin, Slot, PosH, PosList) -> + Start = Slot * 3, + <<_LHS:Start/binary, Hash:16/integer, Pos:8/integer, _Rest/binary>> = PosBin, + case Hash of + PosH -> + find_pos_byslot(PosBin, (Slot + 1) rem 256, PosH, [Pos|PosList]); + 0 -> + PosList; + _ -> + find_pos_byslot(PosBin, (Slot + 1) rem 256, PosH, PosList) + end. 
+ + + +build_hashtree_binary([], IdxLen, SlotPos, Bin) -> + case SlotPos of + IdxLen -> + lists:reverse(Bin); + N when N < IdxLen -> + ZeroLen = (IdxLen - N) * 24, + lists:reverse([<<0:ZeroLen>>|Bin]) + end; +build_hashtree_binary([{TopSlot, TopBin}|SlotMapTail], IdxLen, SlotPos, Bin) -> + case TopSlot of + N when N > SlotPos -> + D = N - SlotPos, + Bridge = lists:duplicate(D, <<0:24>>) ++ Bin, + UpdBin = [<>|Bridge], + build_hashtree_binary(SlotMapTail, + IdxLen, + SlotPos + D + 1, + UpdBin); + N when N =< SlotPos, SlotPos < IdxLen -> + UpdBin = [<>|Bin], + build_hashtree_binary(SlotMapTail, + IdxLen, + SlotPos + 1, + UpdBin); + N when N < SlotPos, SlotPos == IdxLen -> + % Need to wrap round and put in the first empty slot from the + % beginning + Pos = find_firstzero(Bin, length(Bin)), + {LHS, [<<0:24>>|RHS]} = lists:split(Pos - 1, Bin), + UpdBin = lists:append(LHS, [TopBin|RHS]), + build_hashtree_binary(SlotMapTail, + IdxLen, + SlotPos, + UpdBin) + end. + +% Search from the tail of the list to find the first zero +find_firstzero(Bin, Pos) -> + case lists:nth(Pos, Bin) of + <<0:24>> -> + Pos; + _ -> + find_firstzero(Bin, Pos - 1) + end. + + + + +alt2_test_not_slot(TesterList, Hash, Key, M) -> + SWB1 = os:timestamp(), + not_present = alt2_test_slot_int(TesterList, Hash, Key, not_present), + io:format(user, + M ++ "-method missed in slot in ~w microseconds~n", + [timer:now_diff(os:timestamp(), SWB1)]). + + +alt2_test_slot(TesterList, Hash, Key, Value, M) -> + SWB1 = os:timestamp(), + {value, Value} = alt2_test_slot_int(TesterList, Hash, Key, not_present), + io:format(user, + M ++ "-method found in slot in ~w microseconds~n", + [timer:now_diff(os:timestamp(), SWB1)]). 
+ +alt2_test_slot_int([], _Hash, _Key, Result) -> + Result; +alt2_test_slot_int([{SlotBin, PosBin}|Rest], Hash, Key, not_present) -> + H2F = Hash band 65535, + PosL = posbin_finder(PosBin, H2F, [], 1), + case PosL of + [] -> + alt2_test_slot_int(Rest, Hash, Key, not_present); + _ -> + Slot = binary_to_term(SlotBin), + FindFun = + fun(P, Acc) -> + case Acc of + not_present -> + case lists:nth(P, Slot) of + {Key, V} -> + {value, V}; + _ -> + not_present + end; + _ -> + Acc + end end, + Out = lists:foldr(FindFun, not_present, PosL), + alt2_test_slot_int(Rest, Hash, Key, Out) + end; +alt2_test_slot_int(_Testers, _Hash, _Key, Result) -> + Result. + + +alt1_test_not_slot(SlotBin, PosBin, Hash, Key) -> + SWB1 = os:timestamp(), + not_present = alt1_test_slot_int(SlotBin, PosBin, Hash, Key), + io:format(user, + "Alt1-method missed in slot in ~w microseconds~n", + [timer:now_diff(os:timestamp(), SWB1)]). + + +alt1_test_slot(SlotBin, PosBin, Hash, Key, Value) -> + SWB1 = os:timestamp(), + {value, Value} = alt1_test_slot_int(SlotBin, PosBin, Hash, Key), + io:format(user, + "Alt1-method found in slot in ~w microseconds~n", + [timer:now_diff(os:timestamp(), SWB1)]). + +alt1_test_slot_int(SlotBin, PosBin, Hash, Key) -> + Slot0 = binary_to_term(SlotBin), + % io:format(user, + % "Alt-method fraction on b_to_t ~w microseconds~n", + % [timer:now_diff(os:timestamp(), SWB1)]), + + H2F = Hash band 65535, + PosL = posbin_finder(PosBin, H2F, [], 1), + FindFun = + fun(P, Acc) -> + case Acc of + not_present -> + case lists:nth(P, Slot0) of + {Key, V} -> + {value, V}; + _ -> + not_present + end; + _ -> + Acc + end end, + lists:foldr(FindFun, not_present, PosL). + + + +posbin_finder(<<>>, _Hash, FoundL, _PosC) -> + FoundL; +posbin_finder(<>, Hash, FoundL, PosC) -> + posbin_finder(Rest, Hash, [PosC|FoundL], PosC + 1); +posbin_finder(<<_Hash:16/integer, Rest/binary>>, Hash, FoundL, PosC) -> + posbin_finder(Rest, Hash, FoundL, PosC + 1). 
+ test_slot(SlotBin, Key, Value) -> SWA1 = os:timestamp(), Slot0 = binary_to_term(SlotBin), + % io:format(user, + % "Fraction on b_to_t ~w microseconds~n", + % [timer:now_diff(os:timestamp(), SWA1)]), {value, Value} = gb_trees:lookup(Key, Slot0), io:format(user, - "Looked in slot in ~w microseconds~n", + "Found in slot in ~w microseconds~n", + [timer:now_diff(os:timestamp(), SWA1)]). + +test_not_slot(SlotBin, Key) -> + SWA1 = os:timestamp(), + Slot0 = binary_to_term(SlotBin), + % io:format(user, + % "Fraction on b_to_t ~w microseconds~n", + % [timer:now_diff(os:timestamp(), SWA1)]), + none = gb_trees:lookup(Key, Slot0), + io:format(user, + "Missed in slot in ~w microseconds~n", [timer:now_diff(os:timestamp(), SWA1)]). From 2079fff7f8ae813ff5ddd16a05e4fa8c09c0ec51 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Mon, 2 Jan 2017 10:47:04 +0000 Subject: [PATCH 40/58] Switched to indexed blocks as slot implementation Prior to this refactor, the slot and been made up of four blocks with an external binary index. Although the form of the index has changed again, micro-benchmarking once again showed that this was a relatively efficient mechanism. --- src/leveled_log.erl | 3 +- src/leveled_sst.erl | 1316 ++++++++++++++++++++++--------------------- 2 files changed, 664 insertions(+), 655 deletions(-) diff --git a/src/leveled_log.erl b/src/leveled_log.erl index ba669fa..231dbce 100644 --- a/src/leveled_log.erl +++ b/src/leveled_log.erl @@ -431,8 +431,7 @@ sst_timing({N, SSTTimerD}, SW, TimerType) -> end. sst_keylist() -> - [summary_bloom, cache_entry, - slot_bloom, slot_crc_wonky, slot_lookup_miss, slot_lookup_hit]. + [summary_bloom, slot_cache, slot_bloom, slot_fetch]. get_timing(undefined, SW, TimerType) -> diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 713b0fa..f84a0d1 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -8,49 +8,58 @@ %% -------- Slots --------- %% %% The view is built from sublists referred to as slot. 
Each slot is up to 128
-%% keys and values in size. The slots are each themselves a gb_tree. The
-%% gb_tree is slightly slower than the skiplist at fetch time, and doesn't
-%% support directly the useful to_range function. However the from_orddict
-%% capability is much faster than from_sortedlist in skiplist, saving on CPU
-%% at sst build time:
+%% keys and values in size. Three strategies have been benchmarked for the
+%% slot: a skiplist, a gb-tree, and four blocks of flat lists with an index.
 %%
 %% Skiplist:
-%% build and serialise slot 3233 microseconds
+%% build and serialise slot - 3233 microseconds
 %% de-serialise and check * 128 - 14669 microseconds
 %% flatten back to list - 164 microseconds
 %%
 %% GBTree:
-%% build and serialise tree 1433 microseconds
+%% build and serialise tree - 1433 microseconds
 %% de-serialise and check * 128 - 15263 microseconds
 %% flatten back to list - 175 microseconds
 %%
-%% The performance advantage at lookup time is no negligible as the time to
-%% de-deserialise for each check is dominant. This time grows linearly with
-%% the size of the slot, wherease the serialisation time is relatively constant
-%% with growth. So bigger slots would be quicker to build, but the penalty for
-%% that speed is too high at lookup time.
+%% Indexed Blocks:
+%% build and serialise slot - 342 microseconds
+%% de-serialise and check * 128 - 6746 microseconds
+%% flatten back to list - 187 microseconds
+%%
+%% The negative side of using Indexed Blocks is the storage of the index. In
+%% the original implementation this was stored on fadvised disk (the index in
+%% this case was a rice-encoded view of which block the object is in). In this
+%% implementation it is cached in memory - requiring 2-bytes per key to be kept
+%% in memory.
 %%
 %% -------- Blooms ---------
 %%
-%% There are two different tiny blooms for each table. One is split by the
+%% There is a summary bloom for the table. 
the summary bloom is split by the %% first byte of the hash, and consists of two hashes (derived from the -%% remainder of the hash). This is the top bloom, and the size vaires by +%% remainder of the hash). This is the top bloom, and the size varies by %% level. %% Level 0 has 8 bits per key - 0.05 fpr %% Level 1 has 6 bits per key - 0.08 fpr %% Other Levels have 4 bits per key - 0.15 fpr %% -%% If this level is passed, then each slot has its own bloom based on the -%% same hash, but now split into three hashes and having a fixed 8 bit per -%% key size at all levels. -%% Slot Bloom has 8 bits per key - 0.03 fpr +%% With the indexed block implementation of the slot a second slot-level bloom +%% is unnecessary (as the index itself yields a 0.003 % fpr). %% -%% All blooms are based on the DJ Bernstein magic hash which proved to give -%% the predicted fpr in tests (unlike phash2 which has significantly higher -%% fpr). Due to the cost of producing the magic hash, it is read from the -%% value not reproduced each time. If the value is set to no_lookup no bloom -%% entry is added, and if all hashes are no_lookup in the slot then no bloom -%% is produced. +%% -------- Summary --------- +%% +%% Each file has a summary - which is the 128 keys at the top of each slot in +%% a skiplist, with some basic metadata about the slot stored as the value. +%% +%% The summary is stored seperately to the slots (wihtin the same file). +%% +%% -------- CRC Checks --------- +%% +%% Every attempt to either read a summary or a slot off disk will also include +%% a CRC check. If the CRC check fails non-presence is assumed (the data +%% within is assumed to be entirely lost). The data can be recovered by either +%% using a recoverable strategy in transaction log compaction, and triggering +%% the transaction log replay; or by using a higher level for of anti-entropy +%% (i.e. make Riak responsible). -module(leveled_sst). @@ -60,8 +69,9 @@ -include("include/leveled.hrl"). 
-define(MAX_SLOTS, 256). --define(SLOT_SIZE, 128). +-define(SLOT_SIZE, 128). % This is not configurable -define(COMPRESSION_LEVEL, 1). +-define(BINARY_SETTINGS, [{compressed, ?COMPRESSION_LEVEL}]). -define(LEVEL_BLOOM_SLOTS, [{0, 64}, {1, 48}, {default, 32}]). -define(MERGE_SCANWIDTH, 16). -define(DISCARD_EXT, ".discarded"). @@ -100,16 +110,15 @@ -record(slot_index_value, {slot_id :: integer(), - bloom, start_position :: integer(), length :: integer()}). -record(summary, {first_key :: tuple(), last_key :: tuple(), index :: list(), % leveled_skiplist - bloom :: tuple(), % leveled_tinybloom size :: integer(), - max_sqn :: integer()}). + max_sqn :: integer(), + bloom}). -record(state, {summary, handle :: file:fd(), @@ -117,7 +126,8 @@ slot_lengths :: list(), penciller :: pid(), filename, - cache}). + lastfetch_cache, + blockindex_cache}). %%%============================================================================ @@ -230,12 +240,17 @@ starting({sst_open, Filename}, _From, State) -> UpdState}; starting({sst_new, Filename, Level, KVList, MaxSQN}, _From, State) -> SW = os:timestamp(), - {FirstKey, L, SlotIndex, AllHashes, SlotsBin} = build_all_slots(KVList), + {FirstKey, + Length, + SlotIndex, + AllHashes, + BlockIndex, + SlotsBin} = build_all_slots(KVList), SummaryBin = build_table_summary(SlotIndex, AllHashes, Level, FirstKey, - L, + Length, MaxSQN), ActualFilename = write_file(Filename, SummaryBin, SlotsBin), UpdState = read_file(ActualFilename, State), @@ -246,18 +261,23 @@ starting({sst_new, Filename, Level, KVList, MaxSQN}, _From, State) -> {reply, {ok, {Summary#summary.first_key, Summary#summary.last_key}}, reader, - UpdState}. + UpdState#state{blockindex_cache = BlockIndex}}. 
starting({sst_newlevelzero, Filename, Slots, FetchFun, Penciller, MaxSQN}, State) -> SW = os:timestamp(), KVList = leveled_pmem:to_list(Slots, FetchFun), - {FirstKey, L, SlotIndex, AllHashes, SlotsBin} = build_all_slots(KVList), + {FirstKey, + Length, + SlotIndex, + AllHashes, + BlockIndex, + SlotsBin} = build_all_slots(KVList), SummaryBin = build_table_summary(SlotIndex, AllHashes, 0, FirstKey, - L, + Length, MaxSQN), ActualFilename = write_file(Filename, SummaryBin, SlotsBin), UpdState = read_file(ActualFilename, State), @@ -267,32 +287,29 @@ starting({sst_newlevelzero, Filename, Slots, FetchFun, Penciller, MaxSQN}, SW), case Penciller of undefined -> - {next_state, reader, UpdState}; + {next_state, reader, UpdState#state{blockindex_cache = BlockIndex}}; _ -> leveled_penciller:pcl_confirml0complete(Penciller, UpdState#state.filename, Summary#summary.first_key, Summary#summary.last_key), - {next_state, reader, UpdState} + {next_state, reader, UpdState#state{blockindex_cache = BlockIndex}} end. 
reader({get_kv, LedgerKey, Hash}, _From, State) -> SW = os:timestamp(), - {Result, Stage, SlotID} = fetch(LedgerKey, Hash, State), + {Result, Stage, SlotID, UpdState} = fetch(LedgerKey, Hash, State), UpdTimings = leveled_log:sst_timing(State#state.sst_timings, SW, Stage), case {Result, Stage} of - {not_present, slot_crc_wonky} -> - leveled_log:log("SST02", [State#state.filename, SlotID]), - {reply, Result, reader, State#state{sst_timings = UpdTimings}}; {not_present, _} -> - {reply, Result, reader, State#state{sst_timings = UpdTimings}}; + {reply, Result, reader, UpdState#state{sst_timings = UpdTimings}}; {KV, slot_lookup_hit} -> - UpdCache = array:set(SlotID, KV, State#state.cache), - {reply, Result, reader, State#state{cache = UpdCache, - sst_timings = UpdTimings}}; + UpdCache = array:set(SlotID - 1, KV, State#state.lastfetch_cache), + {reply, Result, reader, UpdState#state{lastfetch_cache = UpdCache, + sst_timings = UpdTimings}}; _ -> - {reply, Result, reader, State#state{sst_timings = UpdTimings}} + {reply, Result, reader, UpdState#state{sst_timings = UpdTimings}} end; reader({get_kvrange, StartKey, EndKey, ScanWidth}, _From, State) -> {reply, @@ -301,10 +318,11 @@ reader({get_kvrange, StartKey, EndKey, ScanWidth}, _From, State) -> State}; reader({get_slots, SlotList}, _From, State) -> SlotBins = read_slots(State#state.handle, SlotList), - FoldFun = + FetchFun = fun({SlotBin, SK, EK}, Acc) -> - Acc ++ trim_slot(SlotBin, SK, EK) end, - {reply, lists:foldl(FoldFun, [], SlotBins), reader, State}; + Acc ++ binaryslot_trimmedlist(SlotBin, SK, EK) + end, + {reply, lists:foldl(FetchFun, [], SlotBins), reader, State}; reader(get_maxsequencenumber, _From, State) -> Summary = State#state.summary, {reply, Summary#summary.max_sqn, reader, State}; @@ -333,16 +351,13 @@ reader(close, _From, State) -> delete_pending({get_kv, LedgerKey, Hash}, _From, State) -> - {Result, Stage, SlotID} = fetch(LedgerKey, Hash, State), + {Result, Stage, SlotID, UpdState} = fetch(LedgerKey, 
Hash, State), case {Result, Stage} of - {not_present, slot_crc_wonky} -> - leveled_log:log("SST02", [State#state.filename, SlotID]), - {reply, Result, delete_pending, State, ?DELETE_TIMEOUT}; {not_present, _} -> {reply, Result, delete_pending, State, ?DELETE_TIMEOUT}; {KV, slot_lookup_hit} -> - UpdCache = array:set(SlotID, KV, State#state.cache), - UpdState = State#state{cache = UpdCache}, + UpdCache = array:set(SlotID - 1, KV, State#state.lastfetch_cache), + UpdState = State#state{lastfetch_cache = UpdCache}, {reply, Result, delete_pending, UpdState, ?DELETE_TIMEOUT}; _ -> {reply, Result, delete_pending, State, ?DELETE_TIMEOUT} @@ -355,11 +370,12 @@ delete_pending({get_kvrange, StartKey, EndKey, ScanWidth}, _From, State) -> ?DELETE_TIMEOUT}; delete_pending({get_slots, SlotList}, _From, State) -> SlotBins = read_slots(State#state.handle, SlotList), - FoldFun = + FetchFun = fun({SlotBin, SK, EK}, Acc) -> - Acc ++ trim_slot(SlotBin, SK, EK) end, - {reply, - lists:foldl(FoldFun, [], SlotBins), + Acc ++ binaryslot_trimmedlist(SlotBin, SK, EK) + end, + {reply, + lists:foldl(FetchFun, [], SlotBins), delete_pending, State, ?DELETE_TIMEOUT}; @@ -404,42 +420,60 @@ fetch(LedgerKey, Hash, State) -> case leveled_tinybloom:check({hash, Hash}, Summary#summary.bloom) of false -> - {not_present, summary_bloom, null}; + {not_present, summary_bloom, null, State}; true -> Slot = lookup_slot(LedgerKey, Summary#summary.index), - SlotBloom = Slot#slot_index_value.bloom, - case is_check_slot_required({hash, Hash}, LedgerKey, SlotBloom) of - false -> - {not_present, slot_bloom, null}; - true -> - CacheEntry = array:get(Slot#slot_index_value.slot_id, - State#state.cache), - case CacheEntry of - {LedgerKey, CachedValue} -> - {{LedgerKey, CachedValue}, cache_entry, null}; + SlotID = Slot#slot_index_value.slot_id, + CachedBlockIdx = array:get(SlotID - 1, + State#state.blockindex_cache), + case CachedBlockIdx of + none -> + SlotBin = read_slot(State#state.handle, Slot), + {Result, BlockIdx} = 
binaryslot_get(SlotBin, + LedgerKey, + Hash, + none), + BlockIndexCache = array:set(SlotID - 1, + BlockIdx, + State#state.blockindex_cache), + {Result, + slot_fetch, + Slot#slot_index_value.slot_id, + State#state{blockindex_cache = BlockIndexCache}}; + _ -> + PosList = find_pos(CachedBlockIdx, + double_hash(Hash, LedgerKey), + [], + 0), + case PosList of + [] -> + {not_present, + slot_bloom, + SlotID, + State}; _ -> - SlotLook = lookup_in_slot(LedgerKey, - {pointer, - State#state.handle, - Slot}), - case SlotLook of - crc_wonky -> - {not_present, - slot_crc_wonky, - Slot#slot_index_value.slot_id}; - none -> - {not_present, - slot_lookup_miss, - null}; - {value, V} -> - {{LedgerKey, V}, - slot_lookup_hit, - Slot#slot_index_value.slot_id} + LastKV = array:get(SlotID - 1, + State#state.lastfetch_cache), + case LastKV of + {LedgerKey, _} -> + {LastKV, slot_cache, SlotID, State}; + _ -> + SlotBin = read_slot(State#state.handle, + Slot), + Result = binaryslot_get(SlotBin, + LedgerKey, + Hash, + {true, PosList}), + {element(1, Result), + slot_fetch, + SlotID, + State} end - end + end end end. + fetch_range(StartKey, EndKey, ScanWidth, State) -> Summary = State#state.summary, Handle = State#state.handle, @@ -505,10 +539,14 @@ fetch_range(StartKey, EndKey, ScanWidth, State) -> _ -> lists:split(ScanWidth, ExpandedSlots) end, + + SlotsToFetchBinList = read_slots(Handle, SlotsToFetch), + FetchFun = - fun({pointer, _Self, S, SK, EK}, Acc) -> - Acc ++ trim_slot({pointer, Handle, S}, SK, EK) end, - lists:foldl(FetchFun, [], SlotsToFetch) ++ SlotsToPoint. + fun({SlotBin, SK, EK}, Acc) -> + Acc ++ binaryslot_trimmedlist(SlotBin, SK, EK) + end, + lists:foldl(FetchFun, [], SlotsToFetchBinList) ++ SlotsToPoint. 
write_file(Filename, SummaryBin, SlotsBin) -> @@ -544,17 +582,29 @@ read_file(Filename, State) -> end, SlotLengths = lists:foldr(SlotLengthFetchFun, [], Summary#summary.index), SlotCount = length(SlotLengths), + UpdState = + case State#state.blockindex_cache of + undefined -> + BlockIndexCache = array:new([{size, SlotCount}, + {default, none}]), + LastFetchCache = array:new([{size, SlotCount}]), + State#state{blockindex_cache = BlockIndexCache, + lastfetch_cache = LastFetchCache}; + _ -> + LastFetchCache = array:new([{size, SlotCount}]), + State#state{lastfetch_cache = LastFetchCache} + end, + SkipL = leveled_skiplist:from_sortedlist(Summary#summary.index), UpdSummary = Summary#summary{index = SkipL}, leveled_log:log("SST03", [Filename, Summary#summary.size, SlotCount, Summary#summary.max_sqn]), - State#state{summary = UpdSummary, + UpdState#state{summary = UpdSummary, slot_lengths = SlotLengths, handle = Handle, - filename = Filename, - cache = array:new({size, SlotCount + 1})}. + filename = Filename}. open_reader(Filename) -> {ok, Handle} = file:open(Filename, [binary, raw, read]), @@ -583,7 +633,7 @@ build_table_summary(SlotIndex, AllHashes, Level, FirstKey, L, MaxSQN) -> index = lists:reverse(SlotIndex), bloom = Bloom, max_sqn = MaxSQN}, - SummBin = term_to_binary(Summary, [{compressed, ?COMPRESSION_LEVEL}]), + SummBin = term_to_binary(Summary, ?BINARY_SETTINGS), SummCRC = erlang:crc32(SummBin), <>. @@ -602,69 +652,45 @@ build_all_slots(KVList) -> % but otherwise length must be called each iteration to avoid exception % on split or sublist [{FirstKey, _FirstV}|_Rest] = KVList, - SlotCount = L div ?SLOT_SIZE, - {SlotIndex, AllHashes, SlotsBin} = build_all_slots(KVList, - SlotCount, - 8, - [], - 1, - [], - <<>>), - {FirstKey, L, SlotIndex, AllHashes, SlotsBin}. 
+ SlotCount = L div ?SLOT_SIZE + 1, + BuildResponse = build_all_slots(KVList, + SlotCount, + 8, + [], + 1, + [], + array:new([{size, SlotCount}, + {default, none}]), + <<>>), + {SlotIndex, AllHashes, BlockIndex, SlotsBin} = BuildResponse, + {FirstKey, L, SlotIndex, AllHashes, BlockIndex, SlotsBin}. -build_all_slots([], _Count, _Start, AllHashes, _SlotID, SlotIndex, SlotsBin) -> - {SlotIndex, AllHashes, SlotsBin}; -build_all_slots(KVL, Count, Start, AllHashes, SlotID, SlotIndex, SlotsBin) -> +build_all_slots([], _SC, _Pos, Hashes, _SlotID, SlotIdx, BlockIdxA, SlotsBin) -> + {SlotIdx, Hashes, BlockIdxA, SlotsBin}; +build_all_slots(KVL, SC, Pos, Hashes, SlotID, SlotIdx, BlockIdxA, SlotsBin) -> {SlotList, KVRem} = - case Count of - 0 -> + case SC of + 1 -> {lists:sublist(KVL, ?SLOT_SIZE), []}; _N -> lists:split(?SLOT_SIZE, KVL) end, {LastKey, _V} = lists:last(SlotList), - ExtractHashFun = - fun({K, V}, Acc) -> - {_SQN, H} = leveled_codec:strip_to_seqnhashonly({K, V}), - case H of - no_lookup -> - Acc; - H -> - [{{hash, H}, K}|Acc] - end - end, - HashList = lists:foldr(ExtractHashFun, [], SlotList), - {SlotBin, Bloom} = build_slot(SlotList, HashList), - SlotCRC = erlang:crc32(SlotBin), - Length = byte_size(SlotBin) + 4, + {HashList, BlockIndex, SlotBin} = generate_binary_slot(SlotList), + Length = byte_size(SlotBin), SlotIndexV = #slot_index_value{slot_id = SlotID, - bloom = Bloom, - start_position = Start, + start_position = Pos, length = Length}, build_all_slots(KVRem, - Count - 1, - Start + Length, - HashList ++ AllHashes, + SC - 1, + Pos + Length, + HashList ++ Hashes, SlotID + 1, - [{LastKey, SlotIndexV}|SlotIndex], - <>). + [{LastKey, SlotIndexV}|SlotIdx], + array:set(SlotID - 1, BlockIndex, BlockIdxA), + <>). 
-build_slot(KVList, HashList) -> - Tree = gb_trees:from_orddict(KVList), - BloomAddFun = - fun({H, K}, Bloom) -> leveled_tinybloom:tiny_enter(H, K, Bloom) end, - Bloom = lists:foldr(BloomAddFun, - leveled_tinybloom:tiny_empty(), - HashList), - SlotBin = term_to_binary(Tree, [{compressed, ?COMPRESSION_LEVEL}]), - {SlotBin, Bloom}. - -is_check_slot_required(_Hash, _Key, none) -> - true; -is_check_slot_required(Hash, Key, Bloom) -> - leveled_tinybloom:tiny_check(Hash, Key, Bloom). - %% Returns a section from the summary index and two booleans to indicate if %% the first slot needs trimming, or the last slot lookup_slots(StartKey, EndKey, SkipList) -> @@ -708,102 +734,45 @@ lookup_slot(Key, SkipList) -> {_Mark, Slot} = leveled_skiplist:key_above(SkipList, Key), Slot. -lookup_in_slot(Key, {pointer, Handle, Slot}) -> - SlotBin = read_slot(Handle, Slot), - case SlotBin of - crc_wonky -> - crc_wonky; - _ -> - lookup_in_slot(Key, SlotBin) - end; -lookup_in_slot(Key, SlotBin) -> - Tree = binary_to_term(SlotBin), - gb_trees:lookup(Key, Tree). - read_slot(Handle, Slot) -> {ok, SlotBin} = file:pread(Handle, Slot#slot_index_value.start_position, Slot#slot_index_value.length), - <> = SlotBin, - case erlang:crc32(SlotNoCRC) of - SlotCRC -> - SlotNoCRC; - _ -> - crc_wonky - end. + SlotBin. read_slots(Handle, SlotList) -> - [{pointer, FirstSlot, _SK1, _EK1}|_Rest] = SlotList, - {pointer, LastSlot, _SKL, _EKL} = lists:last(SlotList), - StartPos = FirstSlot#slot_index_value.start_position, - Length = LastSlot#slot_index_value.start_position - + LastSlot#slot_index_value.length - - StartPos, - {ok, MultiSlotBin} = file:pread(Handle, StartPos, Length), - read_off_binary(MultiSlotBin, SlotList, []). 
+ PointerMapFun = + fun(Pointer) -> + {Slot, SK, EK} = + case Pointer of + {pointer, _Pid, Slot0, SK0, EK0} -> + {Slot0, SK0, EK0}; + {pointer, Slot0, SK0, EK0} -> + {Slot0, SK0, EK0} + end, -read_off_binary(<<>>, [], SplitBins) -> - SplitBins; -read_off_binary(MultiSlotBin, [TopSlot|Rest], SplitBins) -> - {pointer, Slot, SK, EK} = TopSlot, - Length = Slot#slot_index_value.length - 4, - <> = MultiSlotBin, - case erlang:crc32(SlotBin) of - SlotCRC -> - read_off_binary(RestBin, - Rest, - SplitBins ++ [{SlotBin, SK, EK}]); - _ -> - read_off_binary(RestBin, - Rest, - SplitBins ++ []) - end. - - -trim_slot({pointer, Handle, Slot}, all, all) -> - case read_slot(Handle, Slot) of - crc_wonky -> - []; - SlotBin -> - trim_slot(SlotBin, all, all) - end; -trim_slot(SlotBinary, all, all) -> - Tree = binary_to_term(SlotBinary), - gb_trees:to_list(Tree); -trim_slot({pointer, Handle, Slot}, StartKey, EndKey) -> - case read_slot(Handle, Slot) of - crc_wonky -> - []; - SlotBin -> - trim_slot(SlotBin, StartKey, EndKey) - end; -trim_slot(SlotBinary, StartKey, EndKey) -> - Tree = binary_to_term(SlotBinary), - L = gb_trees:to_list(Tree), - LTrimFun = fun({K, _V}) -> - K < StartKey end, - RTrimFun = fun({K, _V}) -> - not leveled_codec:endkey_passed(EndKey, K) end, - LTrimL = - case StartKey of - all -> - L; - _ -> - {_LDrop, RKeep} = lists:splitwith(LTrimFun, L), - RKeep + {Slot#slot_index_value.start_position, + Slot#slot_index_value.length, + SK, + EK} end, - RTrimL = - case EndKey of - all -> - LTrimL; - _ -> - {LKeep, _RDrop} = lists:splitwith(RTrimFun, LTrimL), - LKeep - end, - RTrimL. 
+ + LengthList = lists:map(PointerMapFun, SlotList), + StartPos = element(1, lists:nth(1, LengthList)), + EndPos = element(1, lists:last(LengthList)) + + element(2, lists:last(LengthList)), + {ok, MultiSlotBin} = file:pread(Handle, StartPos, EndPos - StartPos), + BinSplitMapFun = + fun({SP, L, SK, EK}) -> + Start = SP - StartPos, + <<_Pre:Start/binary, + SlotBin:L/binary, + _Post/binary>> = MultiSlotBin, + {SlotBin, SK, EK} + end, + + lists:map(BinSplitMapFun, LengthList). generate_filenames(RootFilename) -> Ext = filename:extension(RootFilename), @@ -819,6 +788,331 @@ generate_filenames(RootFilename) -> filename:join(DN, FP_NOEXT) ++ ".sst"} end. + +%%%============================================================================ +%%% Slot Implementation +%%%============================================================================ + +%% Implementing a slot has gone through numerous iterations. One of the most +%% critical considerations has been the cost of the binary_to_term and +%% term_to_binary calls for different sizes of slots and different data types. +%% +%% Microbenchmarking indicated that flat lists were the fastest. However, the +%% lists need scanning at query time - and so give longer lookups. Bigger slots +%% did better at term_to_binary time. However term_to_binary is an often +%% repeated task, and this is better with smaller slots. +%% +%% The outcome has been to divide the slot into four small blocks to minimise +%% the binary_to_term time. A binary index is provided for the slot for all +%% Keys that are directly fetchable (i.e. standard keys not index keys). +%% +%% The division and use of a list saves about 100 microseconds per fetch when +%% compared to using a 128-member gb:tree. +%% +%% The binary index is cacheable and doubles as a not_present filter, as it is +%% based on a 17-bit hash (so 0.0039 fpr). 
+ + +generate_binary_slot(KVL) -> + + HashFoldFun = + fun({K, V}, {HashListAcc, PosBinAcc, NoHashCount}) -> + + {_SQN, H1} = leveled_codec:strip_to_seqnhashonly({K, V}), + case is_integer(H1) of + true -> + PosH1 = double_hash(H1, K), + case NoHashCount of + 0 -> + {[{{hash, H1}, K}|HashListAcc], + <<1:1/integer, + PosH1:15/integer, + PosBinAcc/binary>>, + 0}; + N -> + % The No Hash Count is an integer between 0 and 127 + % and so at read time should count NHC + 1 + NHC = N - 1, + {[{{hash, H1}, K}|HashListAcc], + <<1:1/integer, + PosH1:15/integer, + 0:1/integer, + NHC:7/integer, + PosBinAcc/binary>>, + 0} + end; + false -> + {HashListAcc, PosBinAcc, NoHashCount + 1} + end + + end, + + {HashList, PosBinIndex0, NHC} = lists:foldr(HashFoldFun, + {[], <<>>, 0}, + KVL), + PosBinIndex1 = + case NHC of + 0 -> + PosBinIndex0; + _ -> + N = NHC - 1, + <<0:1/integer, N:7/integer, PosBinIndex0/binary>> + end, + + + {B1, B2, B3, B4} = + case length(KVL) of + L when L =< 32 -> + {term_to_binary(KVL, ?BINARY_SETTINGS), + <<0:0>>, + <<0:0>>, + <<0:0>>}; + L when L =< 64 -> + {KVLA_32, KVLB_32} = lists:split(32, KVL), + {term_to_binary(KVLA_32, ?BINARY_SETTINGS), + term_to_binary(KVLB_32, ?BINARY_SETTINGS), + <<0:0>>, + <<0:0>>}; + L when L =< 96 -> + {KVLA_32, KVLB_64} = lists:split(32, KVL), + {KVLB_32, KVLC_32} = lists:split(32, KVLB_64), + {term_to_binary(KVLA_32, ?BINARY_SETTINGS), + term_to_binary(KVLB_32, ?BINARY_SETTINGS), + term_to_binary(KVLC_32, ?BINARY_SETTINGS), + <<0:0>>}; + L when L =< 128 -> + {KVLA_32, KVLB_96} = lists:split(32, KVL), + {KVLB_32, KVLC_64} = lists:split(32, KVLB_96), + {KVLC_32, KVLD_32} = lists:split(32, KVLC_64), + {term_to_binary(KVLA_32, ?BINARY_SETTINGS), + term_to_binary(KVLB_32, ?BINARY_SETTINGS), + term_to_binary(KVLC_32, ?BINARY_SETTINGS), + term_to_binary(KVLD_32, ?BINARY_SETTINGS)} + end, + + B1P = byte_size(PosBinIndex1), + B1L = byte_size(B1), + B2L = byte_size(B2), + B3L = byte_size(B3), + B4L = byte_size(B4), + Lengths = <>, + 
SlotBin = <>, + CRC32 = erlang:crc32(SlotBin), + FullBin = <>, + + {HashList, PosBinIndex1, FullBin}. + + +binaryslot_get(FullBin, Key, Hash, CachedPosLookup) -> + case crc_check_slot(FullBin) of + {Lengths, Rest} -> + B1P = element(1, Lengths), + case CachedPosLookup of + {true, PosList} -> + <<_PosBinIndex:B1P/binary, Blocks/binary>> = Rest, + {fetch_value(PosList, Lengths, Blocks, Key), none}; + none -> + <> = Rest, + PosList = find_pos(PosBinIndex, + double_hash(Hash, Key), + [], + 0), + {fetch_value(PosList, Lengths, Blocks, Key), PosBinIndex} + end; + crc_wonky -> + {not_present, none} + end. + +binaryslot_tolist(FullBin) -> + BlockFetchFun = + fun(Length, {Acc, Bin}) -> + case Length of + 0 -> + {Acc, Bin}; + _ -> + <> = Bin, + {Acc ++ binary_to_term(Block), Rest} + end + end, + + {Out, _Rem} = + case crc_check_slot(FullBin) of + {Lengths, RestBin} -> + {B1P, B1L, B2L, B3L, B4L} = Lengths, + <<_PosBinIndex:B1P/binary, Blocks/binary>> = RestBin, + lists:foldl(BlockFetchFun, {[], Blocks}, [B1L, B2L, B3L, B4L]); + crc_wonky -> + {[], <<>>} + end, + Out. 
+ + +binaryslot_trimmedlist(FullBin, all, all) -> + binaryslot_tolist(FullBin); +binaryslot_trimmedlist(FullBin, StartKey, EndKey) -> + LTrimFun = fun({K, _V}) -> K < StartKey end, + RTrimFun = fun({K, _V}) -> not leveled_codec:endkey_passed(EndKey, K) end, + BlockFetchFun = + fun(Length, {Acc, Bin}) -> + case Length of + 0 -> + {Acc, Bin}; + _ -> + <> = Bin, + BlockList = binary_to_term(Block), + {FirstKey, _FV} = lists:nth(1, BlockList), + {LastKey, _LV} = lists:last(BlockList), + TrimBools = trim_booleans(FirstKey, LastKey, + StartKey, EndKey), + case TrimBools of + {true, _, _, _} -> + {Acc, Rest}; + {false, true, _, _} -> + {Acc ++ BlockList, Rest}; + {false, false, true, false} -> + {_LDrop, RKeep} = lists:splitwith(LTrimFun, + BlockList), + {Acc ++ RKeep, Rest}; + {false, false, false, true} -> + {LKeep, _RDrop} = lists:splitwith(RTrimFun, + BlockList), + {Acc ++ LKeep, Rest}; + {false, false, true, true} -> + {_LDrop, RKeep} = lists:splitwith(LTrimFun, + BlockList), + {LKeep, _RDrop} = lists:splitwith(RTrimFun, RKeep), + {Acc ++ LKeep, Rest} + end + + end + end, + + {Out, _Rem} = + case crc_check_slot(FullBin) of + {Lengths, RestBin} -> + {B1P, B1L, B2L, B3L, B4L} = Lengths, + <<_PosBinIndex:B1P/binary, Blocks/binary>> = RestBin, + lists:foldl(BlockFetchFun, {[], Blocks}, [B1L, B2L, B3L, B4L]); + crc_wonky -> + {[], <<>>} + end, + Out. 
+ + +trim_booleans(FirstKey, _LastKey, StartKey, all) -> + FirstKeyPassedStart = FirstKey > StartKey, + case FirstKeyPassedStart of + true -> + {false, true, false, false}; + false -> + {false, false, true, false} + end; +trim_booleans(_FirstKey, LastKey, all, EndKey) -> + LastKeyPassedEnd = leveled_codec:endkey_passed(EndKey, LastKey), + case LastKeyPassedEnd of + true -> + {false, false, false, true}; + false -> + {false, true, false, false} + end; +trim_booleans(FirstKey, LastKey, StartKey, EndKey) -> + FirstKeyPassedStart = FirstKey > StartKey, + PreRange = LastKey < StartKey, + PostRange = leveled_codec:endkey_passed(EndKey, FirstKey), + OutOfRange = PreRange or PostRange, + LastKeyPassedEnd = leveled_codec:endkey_passed(EndKey, LastKey), + case OutOfRange of + true -> + {true, false, false, false}; + false -> + case {FirstKeyPassedStart, LastKeyPassedEnd} of + {true, false} -> + {false, true, false, false}; + {false, false} -> + {false, false, true, false}; + {true, true} -> + {false, false, false, true}; + {false, true} -> + {false, false, true, true} + end + end. + + + + +crc_check_slot(FullBin) -> + <> = FullBin, + case erlang:crc32(SlotBin) of + CRC32 -> + <> = SlotBin, + Lengths = {B1P, B1L, B2L, B3L, B4L}, + {Lengths, Rest}; + _ -> + crc_wonky + end. + +double_hash(Hash, Key) -> + H2 = erlang:phash2(Key), + (Hash bxor H2) band 32767. 
+ +fetch_value([], _Lengths, _Blocks, _Key) -> + not_present; +fetch_value([Pos|Rest], Lengths, Blocks, Key) -> + BlockNumber = (Pos div 32) + 1, + BlockPos = (Pos rem 32) + 1, + BlockL = + case BlockNumber of + 1 -> + B1L = element(2, Lengths), + <> = Blocks, + binary_to_term(Block); + 2 -> + B1L = element(2, Lengths), + B2L = element(3, Lengths), + <<_Pass:B1L/binary, Block:B2L/binary, _Rest/binary>> = Blocks, + binary_to_term(Block); + 3 -> + PreL = element(2, Lengths) + element(3, Lengths), + B3L = element(4, Lengths), + <<_Pass:PreL/binary, Block:B3L/binary, _Rest/binary>> = Blocks, + binary_to_term(Block); + 4 -> + {_B1P, B1L, B2L, B3L, B4L} = Lengths, + PreL = B1L + B2L + B3L, + <<_Pass:PreL/binary, Block:B4L/binary>> = Blocks, + binary_to_term(Block) + end, + + {K, V} = lists:nth(BlockPos, BlockL), + case K of + Key -> + {K, V}; + _ -> + fetch_value(Rest, Lengths, Blocks, Key) + end. + +find_pos(<<>>, _Hash, PosList, _Count) -> + PosList; +find_pos(<<1:1/integer, Hash:15/integer, T/binary>>, Hash, PosList, Count) -> + find_pos(T, Hash, PosList ++ [Count], Count + 1); +find_pos(<<1:1/integer, _Miss:15/integer, T/binary>>, Hash, PosList, Count) -> + find_pos(T, Hash, PosList, Count + 1); +find_pos(<<0:1/integer, NHC:7/integer, T/binary>>, Hash, PosList, Count) -> + find_pos(T, Hash, PosList, Count + NHC + 1). + + + %%%============================================================================ %%% Merge Functions %%%============================================================================ @@ -981,57 +1275,43 @@ generate_randomkeys(Seqn, Count, Acc, BucketLow, BRange) -> BRange). -experimental_test() -> - io:format(user, "~nExperimental timing test:~n", []), +generate_indexkeys(Count) -> + generate_indexkeys(Count, []). 
+ +generate_indexkeys(0, IndexList) -> + IndexList; +generate_indexkeys(Count, IndexList) -> + IndexSpecs = [{add, "t1_int", random:uniform(80000)}], + Changes = leveled_codec:convert_indexspecs(IndexSpecs, + "Bucket", + "Key" ++ integer_to_list(Count), + Count, + infinity), + generate_indexkeys(Count - 1, IndexList ++ Changes). + + +indexed_list_test() -> + io:format(user, "~nIndexed list timing test:~n", []), N = 150, KVL0 = lists:ukeysort(1, generate_randomkeys(1, N, 1, 4)), KVL1 = lists:sublist(KVL0, 128), - KVL2 = generate_randomkeys(1, N, 1, 4), - KVLnot0 = lists:foldr(fun({K, V}, Acc) -> - case lists:keymember(K, 1, KVL1) of - true -> - Acc; - false -> - [{K, leveled_codec:magic_hash(K), V}|Acc] - end end, - [], - KVL2), - KVLnot1 = lists:sublist(KVLnot0, 8), - ExtractHashFun = - fun({K, V}) -> - {_SQN, H} = leveled_codec:strip_to_seqnhashonly({K, V}), - {{hash, H}, K} end, - BloomAddFun = - fun({H, K}, Bloom) -> leveled_tinybloom:tiny_enter(H, K, Bloom) end, - AltHashFoldFun = - fun({K, V}, {HashLAcc, PosBAcc}) -> - {_SQN, H} = leveled_codec:strip_to_seqnhashonly({K, V}), - PosH = H band 65535, - {[{{hash, H}, K}|HashLAcc], <>} - end, - AltAltHashFoldFun = - fun({K, V}, {HashLAcc, PosBAcc, PosC}) -> - {_SQN, H} = leveled_codec:strip_to_seqnhashonly({K, V}), - Slot = H rem 256, - PosH = (H bsr 8) band 65535, - {[{{hash, H}, K}|HashLAcc], - [{Slot, <>}|PosBAcc], - PosC + 1} - end, + % BloomAddFun = + % fun({H, K}, {Bloom, Total, Max}) -> + % SW = os:timestamp(), + % Bloom0 = leveled_tinybloom:tiny_enter(H, K, Bloom), + % T0 = timer:now_diff(os:timestamp(), SW), + % {Bloom0, Total + T0, max(T0, Max)} - - SWA0 = os:timestamp(), - HashList = lists:map(ExtractHashFun, KVL1), - Tree = gb_trees:from_orddict(KVL1), - _BloomA = lists:foldr(BloomAddFun, - leveled_tinybloom:tiny_empty(), - HashList), - SlotBinA = term_to_binary(Tree, [{compressed, ?COMPRESSION_LEVEL}]), + % end, + + SW0 = os:timestamp(), + + {_HashList, _PosBinIndex1, FullBin} = 
generate_binary_slot(KVL1), io:format(user, - "Created slot in ~w microseconds~n", - [timer:now_diff(os:timestamp(), SWA0)]), - + "Indexed list created slot in ~w microseconds of size ~w~n", + [timer:now_diff(os:timestamp(), SW0), byte_size(FullBin)]), + {TestK1, TestV1} = lists:nth(20, KVL1), MH1 = leveled_codec:magic_hash(TestK1), {TestK2, TestV2} = lists:nth(40, KVL1), @@ -1043,331 +1323,125 @@ experimental_test() -> {TestK5, TestV5} = lists:nth(100, KVL1), MH5 = leveled_codec:magic_hash(TestK5), - test_slot(SlotBinA, TestK1, TestV1), - test_slot(SlotBinA, TestK2, TestV2), - test_slot(SlotBinA, TestK3, TestV3), - test_slot(SlotBinA, TestK4, TestV4), - test_slot(SlotBinA, TestK5, TestV5), - lists:foreach(fun({NotK, _H, _V}) -> - test_not_slot(SlotBinA, NotK) end, - KVLnot1), - - SWB0 = os:timestamp(), - {Alt1HashList, Pos1Bin} = lists:foldr(AltHashFoldFun, {[], <<>>}, KVL1), - _BloomB = lists:foldr(BloomAddFun, - leveled_tinybloom:tiny_empty(), - Alt1HashList), - Alt1SlotBinB = term_to_binary(KVL1, [{compressed, ?COMPRESSION_LEVEL}]), - io:format(user, - "Alt1-method created slot in ~w microseconds~n", - [timer:now_diff(os:timestamp(), SWB0)]), - - alt1_test_slot(Alt1SlotBinB, Pos1Bin, MH1, TestK1, TestV1), - alt1_test_slot(Alt1SlotBinB, Pos1Bin, MH2, TestK2, TestV2), - alt1_test_slot(Alt1SlotBinB, Pos1Bin, MH3, TestK3, TestV3), - alt1_test_slot(Alt1SlotBinB, Pos1Bin, MH4, TestK4, TestV4), - alt1_test_slot(Alt1SlotBinB, Pos1Bin, MH5, TestK5, TestV5), - lists:foreach(fun({NotK, NotMH, _V}) -> - alt1_test_not_slot(Alt1SlotBinB, Pos1Bin, NotMH, NotK) end, - KVLnot1), - - SWC0 = os:timestamp(), - {KVL1A, KVL1B} = lists:split(64, KVL1), - {Alt2HashListA, Pos2BinA} = lists:foldr(AltHashFoldFun, {[], <<>>}, KVL1A), - {Alt2HashList, Pos2BinB} = lists:foldr(AltHashFoldFun, {Alt2HashListA, <<>>}, KVL1B), - _BloomB = lists:foldr(BloomAddFun, - leveled_tinybloom:tiny_empty(), - Alt2HashList), - Alt2SlotBinB_A = term_to_binary(KVL1A, [{compressed, ?COMPRESSION_LEVEL}]), - 
Alt2SlotBinB_B = term_to_binary(KVL1B, [{compressed, ?COMPRESSION_LEVEL}]), - Alt2Tester = [{Alt2SlotBinB_A, Pos2BinA}, {Alt2SlotBinB_B, Pos2BinB}], - io:format(user, - "Alt2-method created slot in ~w microseconds~n", - [timer:now_diff(os:timestamp(), SWC0)]), - - - alt2_test_slot(Alt2Tester, MH1, TestK1, TestV1, "Alt2"), - alt2_test_slot(Alt2Tester, MH2, TestK2, TestV2, "Alt2"), - alt2_test_slot(Alt2Tester, MH3, TestK3, TestV3, "Alt2"), - alt2_test_slot(Alt2Tester, MH4, TestK4, TestV4, "Alt2"), - alt2_test_slot(Alt2Tester, MH5, TestK5, TestV5, "Alt2"), - lists:foreach(fun({NotK, NotMH, _V}) -> - alt2_test_not_slot(Alt2Tester, NotMH, NotK, "Alt2") end, - KVLnot1), - - SWD0 = os:timestamp(), - {KVL1A32, KVL1_96} = lists:split(32, KVL1), - {KVL1B32, KVL1_64} = lists:split(32, KVL1_96), - {KVL1C32, KVL1D32} = lists:split(32, KVL1_64), - {Alt3HashListA, Pos3BinA} = lists:foldr(AltHashFoldFun, {[], <<>>}, KVL1A32), - {Alt3HashListB, Pos3BinB} = lists:foldr(AltHashFoldFun, {Alt3HashListA, <<>>}, KVL1B32), - {Alt3HashListC, Pos3BinC} = lists:foldr(AltHashFoldFun, {Alt3HashListB, <<>>}, KVL1C32), - {Alt3HashList, Pos3BinD} = lists:foldr(AltHashFoldFun, {Alt3HashListC, <<>>}, KVL1D32), - _BloomB = lists:foldr(BloomAddFun, - leveled_tinybloom:tiny_empty(), - Alt3HashList), - Alt3SlotBinB_A = term_to_binary(KVL1A32, [{compressed, ?COMPRESSION_LEVEL}]), - Alt3SlotBinB_B = term_to_binary(KVL1B32, [{compressed, ?COMPRESSION_LEVEL}]), - Alt3SlotBinB_C = term_to_binary(KVL1C32, [{compressed, ?COMPRESSION_LEVEL}]), - Alt3SlotBinB_D = term_to_binary(KVL1D32, [{compressed, ?COMPRESSION_LEVEL}]), - Alt3Tester = [{Alt3SlotBinB_A, Pos3BinA}, {Alt3SlotBinB_B, Pos3BinB}, - {Alt3SlotBinB_C, Pos3BinC}, {Alt3SlotBinB_D, Pos3BinD}], - io:format(user, - "Alt3-method created slot in ~w microseconds~n", - [timer:now_diff(os:timestamp(), SWD0)]), - - - alt2_test_slot(Alt3Tester, MH1, TestK1, TestV1, "Alt3"), - alt2_test_slot(Alt3Tester, MH2, TestK2, TestV2, "Alt3"), - alt2_test_slot(Alt3Tester, 
MH3, TestK3, TestV3, "Alt3"), - alt2_test_slot(Alt3Tester, MH4, TestK4, TestV4, "Alt3"), - alt2_test_slot(Alt3Tester, MH5, TestK5, TestV5, "Alt3"), - lists:foreach(fun({NotK, NotMH, _V}) -> - alt2_test_not_slot(Alt3Tester, NotMH, NotK, "Alt3") end, - KVLnot1), - - SWE0 = os:timestamp(), - {KVL1A32, KVL1_96} = lists:split(32, KVL1), - {KVL1B32, KVL1_64} = lists:split(32, KVL1_96), - {KVL1C32, KVL1D32} = lists:split(32, KVL1_64), - io:format("final block length ~w~n", [length(KVL1D32)]), - {Alt4HashListA, PosTLA, 32} = lists:foldl(AltAltHashFoldFun, {[], [], 0}, KVL1A32), - {Alt4HashListB, PosTLB, 64} = lists:foldl(AltAltHashFoldFun, {Alt3HashListA, PosTLA, 32}, KVL1B32), - {Alt4HashListC, PosTLC, 96} = lists:foldl(AltAltHashFoldFun, {Alt3HashListB, PosTLB, 64}, KVL1C32), - {Alt4HashList, PosTLall, 128} = lists:foldl(AltAltHashFoldFun, {Alt3HashListC, PosTLC, 96}, KVL1D32), - _BloomB = lists:foldr(BloomAddFun, - leveled_tinybloom:tiny_empty(), - Alt3HashList), - Alt3SlotBinB_A = term_to_binary(KVL1A32, [{compressed, ?COMPRESSION_LEVEL}]), - Alt3SlotBinB_B = term_to_binary(KVL1B32, [{compressed, ?COMPRESSION_LEVEL}]), - Alt3SlotBinB_C = term_to_binary(KVL1C32, [{compressed, ?COMPRESSION_LEVEL}]), - Alt3SlotBinB_D = term_to_binary(KVL1D32, [{compressed, ?COMPRESSION_LEVEL}]), - SortedSlotList = lists:keysort(1, PosTLall), - PosBinList = build_hashtree_binary(SortedSlotList, 256, 0, []), - Alt4PosBin = list_to_binary(PosBinList), + test_binary_slot(FullBin, TestK1, MH1, {TestK1, TestV1}), + test_binary_slot(FullBin, TestK2, MH2, {TestK2, TestV2}), + test_binary_slot(FullBin, TestK3, MH3, {TestK3, TestV3}), + test_binary_slot(FullBin, TestK4, MH4, {TestK4, TestV4}), + test_binary_slot(FullBin, TestK5, MH5, {TestK5, TestV5}). 
- Alt4Tester = {Alt3SlotBinB_A, Alt3SlotBinB_B, Alt3SlotBinB_C, Alt3SlotBinB_D}, - io:format(user, - "Alt4-method created slot in ~w microseconds~n", - [timer:now_diff(os:timestamp(), SWE0)]), +indexed_list_mixedkeys_test() -> + KVL0 = lists:ukeysort(1, generate_randomkeys(1, 50, 1, 4)), + KVL1 = lists:sublist(KVL0, 33), + Keys = lists:ukeysort(1, generate_indexkeys(60) ++ KVL1), - alt4_test_slot(Alt4Tester, Alt4PosBin, TestK1, MH1, TestV1, "Alt4"), - alt4_test_slot(Alt4Tester, Alt4PosBin, TestK2, MH2, TestV2, "Alt4"), - alt4_test_slot(Alt4Tester, Alt4PosBin, TestK3, MH3, TestV3, "Alt4"), - alt4_test_slot(Alt4Tester, Alt4PosBin, TestK4, MH4, TestV4, "Alt4"), - alt4_test_slot(Alt4Tester, Alt4PosBin, TestK5, MH5, TestV5, "Alt4"). + {_HashList, _PosBinIndex1, FullBin} = generate_binary_slot(Keys), + {TestK1, TestV1} = lists:nth(4, KVL1), + MH1 = leveled_codec:magic_hash(TestK1), + {TestK2, TestV2} = lists:nth(8, KVL1), + MH2 = leveled_codec:magic_hash(TestK2), + {TestK3, TestV3} = lists:nth(12, KVL1), + MH3 = leveled_codec:magic_hash(TestK3), + {TestK4, TestV4} = lists:nth(16, KVL1), + MH4 = leveled_codec:magic_hash(TestK4), + {TestK5, TestV5} = lists:nth(20, KVL1), + MH5 = leveled_codec:magic_hash(TestK5), -alt4_test_slot(Alt4Tester, PosBin, Key, Hash, Value, M) -> - io:format("looking for key ~s ~s~n", [element(2, Key), element(3, Key)]), - SW = os:timestamp(), - Slot = Hash rem 256, - PosH = (Hash bsr 8) band 65535, - PosList = find_pos_byslot(PosBin, Slot, PosH, []), - io:format("PosList ~w~n", [PosList]), - FindKVFun = - fun(Pos, Found) -> - case Found of - not_present -> - Block = (Pos div 32) + 1, - BlockPos = (Pos rem 32) + 1, - io:format("Looking Block ~w position ~w~n", [Block, BlockPos]), - CheckBlock = element(Block, Alt4Tester), - {K, V} = lists:nth(BlockPos, binary_to_term(CheckBlock)), - io:format("K of ~s ~s~n", [element(2, K), element(3, K)]), - case K of - Key -> - {value, V}; - _ -> - not_present - end; - _ -> - Found - end end, - {value, Value} = 
lists:foldl(FindKVFun, not_present, PosList), - io:format(user, - M ++ "-method found in slot in ~w microseconds~n", - [timer:now_diff(os:timestamp(), SW)]). + test_binary_slot(FullBin, TestK1, MH1, {TestK1, TestV1}), + test_binary_slot(FullBin, TestK2, MH2, {TestK2, TestV2}), + test_binary_slot(FullBin, TestK3, MH3, {TestK3, TestV3}), + test_binary_slot(FullBin, TestK4, MH4, {TestK4, TestV4}), + test_binary_slot(FullBin, TestK5, MH5, {TestK5, TestV5}). - -find_pos_byslot(PosBin, Slot, PosH, PosList) -> - Start = Slot * 3, - <<_LHS:Start/binary, Hash:16/integer, Pos:8/integer, _Rest/binary>> = PosBin, - case Hash of - PosH -> - find_pos_byslot(PosBin, (Slot + 1) rem 256, PosH, [Pos|PosList]); - 0 -> - PosList; - _ -> - find_pos_byslot(PosBin, (Slot + 1) rem 256, PosH, PosList) - end. - - - -build_hashtree_binary([], IdxLen, SlotPos, Bin) -> - case SlotPos of - IdxLen -> - lists:reverse(Bin); - N when N < IdxLen -> - ZeroLen = (IdxLen - N) * 24, - lists:reverse([<<0:ZeroLen>>|Bin]) - end; -build_hashtree_binary([{TopSlot, TopBin}|SlotMapTail], IdxLen, SlotPos, Bin) -> - case TopSlot of - N when N > SlotPos -> - D = N - SlotPos, - Bridge = lists:duplicate(D, <<0:24>>) ++ Bin, - UpdBin = [<>|Bridge], - build_hashtree_binary(SlotMapTail, - IdxLen, - SlotPos + D + 1, - UpdBin); - N when N =< SlotPos, SlotPos < IdxLen -> - UpdBin = [<>|Bin], - build_hashtree_binary(SlotMapTail, - IdxLen, - SlotPos + 1, - UpdBin); - N when N < SlotPos, SlotPos == IdxLen -> - % Need to wrap round and put in the first empty slot from the - % beginning - Pos = find_firstzero(Bin, length(Bin)), - {LHS, [<<0:24>>|RHS]} = lists:split(Pos - 1, Bin), - UpdBin = lists:append(LHS, [TopBin|RHS]), - build_hashtree_binary(SlotMapTail, - IdxLen, - SlotPos, - UpdBin) - end. - -% Search from the tail of the list to find the first zero -find_firstzero(Bin, Pos) -> - case lists:nth(Pos, Bin) of - <<0:24>> -> - Pos; - _ -> - find_firstzero(Bin, Pos - 1) - end. 
- - - - -alt2_test_not_slot(TesterList, Hash, Key, M) -> - SWB1 = os:timestamp(), - not_present = alt2_test_slot_int(TesterList, Hash, Key, not_present), - io:format(user, - M ++ "-method missed in slot in ~w microseconds~n", - [timer:now_diff(os:timestamp(), SWB1)]). - - -alt2_test_slot(TesterList, Hash, Key, Value, M) -> - SWB1 = os:timestamp(), - {value, Value} = alt2_test_slot_int(TesterList, Hash, Key, not_present), - io:format(user, - M ++ "-method found in slot in ~w microseconds~n", - [timer:now_diff(os:timestamp(), SWB1)]). - -alt2_test_slot_int([], _Hash, _Key, Result) -> - Result; -alt2_test_slot_int([{SlotBin, PosBin}|Rest], Hash, Key, not_present) -> - H2F = Hash band 65535, - PosL = posbin_finder(PosBin, H2F, [], 1), - case PosL of - [] -> - alt2_test_slot_int(Rest, Hash, Key, not_present); - _ -> - Slot = binary_to_term(SlotBin), - FindFun = - fun(P, Acc) -> - case Acc of - not_present -> - case lists:nth(P, Slot) of - {Key, V} -> - {value, V}; - _ -> - not_present - end; - _ -> - Acc - end end, - Out = lists:foldr(FindFun, not_present, PosL), - alt2_test_slot_int(Rest, Hash, Key, Out) - end; -alt2_test_slot_int(_Testers, _Hash, _Key, Result) -> - Result. - - -alt1_test_not_slot(SlotBin, PosBin, Hash, Key) -> - SWB1 = os:timestamp(), - not_present = alt1_test_slot_int(SlotBin, PosBin, Hash, Key), - io:format(user, - "Alt1-method missed in slot in ~w microseconds~n", - [timer:now_diff(os:timestamp(), SWB1)]). - - -alt1_test_slot(SlotBin, PosBin, Hash, Key, Value) -> - SWB1 = os:timestamp(), - {value, Value} = alt1_test_slot_int(SlotBin, PosBin, Hash, Key), - io:format(user, - "Alt1-method found in slot in ~w microseconds~n", - [timer:now_diff(os:timestamp(), SWB1)]). 
- -alt1_test_slot_int(SlotBin, PosBin, Hash, Key) -> - Slot0 = binary_to_term(SlotBin), +indexed_list_allindexkeys_test() -> + Keys = lists:sublist(lists:ukeysort(1, generate_indexkeys(150)), 128), + {_HashList, PosBinIndex1, FullBin} = generate_binary_slot(Keys), + ?assertMatch(<<127:8/integer>>, PosBinIndex1), + % SW = os:timestamp(), + BinToList = binaryslot_tolist(FullBin), % io:format(user, - % "Alt-method fraction on b_to_t ~w microseconds~n", - % [timer:now_diff(os:timestamp(), SWB1)]), + % "Indexed list flattened in ~w microseconds ~n", + % [timer:now_diff(os:timestamp(), SW)]), + ?assertMatch(Keys, BinToList), + ?assertMatch(Keys, binaryslot_trimmedlist(FullBin, all, all)). - H2F = Hash band 65535, - PosL = posbin_finder(PosBin, H2F, [], 1), - FindFun = - fun(P, Acc) -> - case Acc of - not_present -> - case lists:nth(P, Slot0) of - {Key, V} -> - {value, V}; - _ -> - not_present - end; - _ -> - Acc - end end, - lists:foldr(FindFun, not_present, PosL). + +indexed_list_allindexkeys_trimmed_test() -> + Keys = lists:sublist(lists:ukeysort(1, generate_indexkeys(150)), 128), + {_HashList, PosBinIndex1, FullBin} = generate_binary_slot(Keys), + ?assertMatch(<<127:8/integer>>, PosBinIndex1), + ?assertMatch(Keys, binaryslot_trimmedlist(FullBin, + {i, + "Bucket", + {"t1_int", 0}, + null}, + {i, + "Bucket", + {"t1_int", 99999}, + null})), + {SK1, _} = lists:nth(10, Keys), + {EK1, _} = lists:nth(100, Keys), + R1 = lists:sublist(Keys, 10, 91), + O1 = binaryslot_trimmedlist(FullBin, SK1, EK1), + ?assertMatch(91, length(O1)), + ?assertMatch(R1, O1), + + {SK2, _} = lists:nth(10, Keys), + {EK2, _} = lists:nth(20, Keys), + R2 = lists:sublist(Keys, 10, 11), + O2 = binaryslot_trimmedlist(FullBin, SK2, EK2), + ?assertMatch(11, length(O2)), + ?assertMatch(R2, O2), + + {SK3, _} = lists:nth(127, Keys), + {EK3, _} = lists:nth(128, Keys), + R3 = lists:sublist(Keys, 127, 2), + O3 = binaryslot_trimmedlist(FullBin, SK3, EK3), + ?assertMatch(2, length(O3)), + ?assertMatch(R3, O3). 
-posbin_finder(<<>>, _Hash, FoundL, _PosC) -> - FoundL; -posbin_finder(<>, Hash, FoundL, PosC) -> - posbin_finder(Rest, Hash, [PosC|FoundL], PosC + 1); -posbin_finder(<<_Hash:16/integer, Rest/binary>>, Hash, FoundL, PosC) -> - posbin_finder(Rest, Hash, FoundL, PosC + 1). +indexed_list_mixedkeys_bitflip_test() -> + KVL0 = lists:ukeysort(1, generate_randomkeys(1, 50, 1, 4)), + KVL1 = lists:sublist(KVL0, 33), + Keys = lists:ukeysort(1, generate_indexkeys(60) ++ KVL1), + + {_HashList, _PosBinIndex1, FullBin} = generate_binary_slot(Keys), + L = byte_size(FullBin), + Byte1 = random:uniform(L), + <> = FullBin, + FullBin0 = + case A of + 0 -> + <>; + _ -> + <> + end, + + {TestK1, _TestV1} = lists:nth(20, KVL1), + MH1 = leveled_codec:magic_hash(TestK1), + + test_binary_slot(FullBin0, TestK1, MH1, not_present), + ToList = binaryslot_tolist(FullBin0), + ?assertMatch([], ToList), + + {SK1, _} = lists:nth(10, Keys), + {EK1, _} = lists:nth(50, Keys), + O1 = binaryslot_trimmedlist(FullBin0, SK1, EK1), + ?assertMatch(0, length(O1)), + ?assertMatch([], O1). -test_slot(SlotBin, Key, Value) -> - SWA1 = os:timestamp(), - Slot0 = binary_to_term(SlotBin), - % io:format(user, - % "Fraction on b_to_t ~w microseconds~n", - % [timer:now_diff(os:timestamp(), SWA1)]), - {value, Value} = gb_trees:lookup(Key, Slot0), - io:format(user, - "Found in slot in ~w microseconds~n", - [timer:now_diff(os:timestamp(), SWA1)]). +test_binary_slot(FullBin, Key, Hash, ExpectedValue) -> + % SW = os:timestamp(), + {ReturnedValue, _} = binaryslot_get(FullBin, Key, Hash, none), + ?assertMatch(ExpectedValue, ReturnedValue). + % io:format(user, "Fetch success in ~w microseconds ~n", + % [timer:now_diff(os:timestamp(), SW)]). 
-test_not_slot(SlotBin, Key) -> - SWA1 = os:timestamp(), - Slot0 = binary_to_term(SlotBin), - % io:format(user, - % "Fraction on b_to_t ~w microseconds~n", - % [timer:now_diff(os:timestamp(), SWA1)]), - none = gb_trees:lookup(Key, Slot0), - io:format(user, - "Missed in slot in ~w microseconds~n", - [timer:now_diff(os:timestamp(), SWA1)]). merge_test() -> @@ -1417,73 +1491,7 @@ merge_test() -> ok = file:delete("../test/level1_src.sst"), ok = file:delete("../test/level2_src.sst"), ok = file:delete("../test/level2_merge.sst"). - -simple_slotbin_test() -> - KVList0 = generate_randomkeys(1, ?SLOT_SIZE * 2, 1, 4), - KVList1 = lists:sublist(lists:ukeysort(1, KVList0), 1, ?SLOT_SIZE), - ExtractHashFun = - fun({K, V}) -> - {_SQN, H} = leveled_codec:strip_to_seqnhashonly({K, V}), - {{hash, H}, K} end, - HashList = lists:map(ExtractHashFun, KVList1), - SW0 = os:timestamp(), - {SlotBin0, Bloom0} = build_slot(KVList1, HashList), - io:format(user, "Slot built in ~w microseconds with size ~w~n", - [timer:now_diff(os:timestamp(), SW0), byte_size(SlotBin0)]), - SW1 = os:timestamp(), - lists:foreach(fun({H, K}) -> ?assertMatch(true, - is_check_slot_required(H, - K, - Bloom0)) - end, - HashList), - lists:foreach(fun({K, V}) -> - ?assertMatch({value, V}, - lookup_in_slot(K, SlotBin0)) - end, - KVList1), - io:format(user, "Slot checked for all keys in ~w microseconds~n", - [timer:now_diff(os:timestamp(), SW1)]), - SW2 = os:timestamp(), - ?assertMatch(KVList1, trim_slot(SlotBin0, all, all)), - io:format(user, "Slot flattened in ~w microseconds~n", - [timer:now_diff(os:timestamp(), SW2)]). 
- - -simple_slotbinsummary_test() -> - KVList0 = generate_randomkeys(1, ?SLOT_SIZE * 16, 1, 20), - KVList1 = lists:ukeysort(1, KVList0), - [{FirstKey, _V}|_Rest] = KVList1, - {FirstKey, _L, SlotIndex, AllHashes, SlotsBin} = build_all_slots(KVList1), - SummaryBin = build_table_summary(SlotIndex, - AllHashes, - 2, - FirstKey, - length(KVList1), - undefined), - Summary = read_table_summary(SummaryBin), - SummaryIndex = leveled_skiplist:from_sortedlist(Summary#summary.index), - FetchFun = - fun({Key, Value}) -> - Slot = lookup_slot(Key, SummaryIndex), - StartPos = Slot#slot_index_value.start_position, - Length = Slot#slot_index_value.length, - io:format("lookup slot id ~w from ~w length ~w~n", - [Slot#slot_index_value.slot_id, StartPos, Length]), - <<_Pre:StartPos/binary, - SlotBin:Length/binary, - _Post/binary>> = <<0:64/integer, SlotsBin/binary>>, - <> = SlotBin, - ?assertMatch(SlotCRC, erlang:crc32(SlotBinNoCRC)), - {value, V} = lookup_in_slot(Key, SlotBinNoCRC), - ?assertMatch(Value, V) - end, - SW = os:timestamp(), - lists:foreach(FetchFun, KVList1), - io:format(user, - "Checking for ~w keys in slots took ~w microseconds~n", - [length(KVList1), timer:now_diff(os:timestamp(), SW)]). 
simple_persisted_test() -> Filename = "../test/simple_test", @@ -1531,7 +1539,9 @@ simple_persisted_test() -> FoldFun = fun(X, Acc) -> case X of {pointer, P, S, SK, EK} -> - Acc ++ sst_getslots(P, [{pointer, S, SK, EK}]); + io:format("Get slot ~w with Acc at ~w~n", + [S, length(Acc)]), + Acc ++ sst_getslots(P, [{pointer, P, S, SK, EK}]); _ -> Acc ++ [X] end end, From 972aa850122a6d435cf261a1a56ff16acdac1271 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Mon, 2 Jan 2017 18:09:36 +0000 Subject: [PATCH 41/58] Try three hash tinybloom Improved fpr in three hash bloom - so examine performance --- src/leveled_tinybloom.erl | 152 +++++++++++--------------------------- 1 file changed, 45 insertions(+), 107 deletions(-) diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl index c03a5b5..1cfde77 100644 --- a/src/leveled_tinybloom.erl +++ b/src/leveled_tinybloom.erl @@ -19,10 +19,7 @@ -export([ enter/2, check/2, - empty/1, - tiny_enter/3, - tiny_check/3, - tiny_empty/0 + empty/1 ]). @@ -40,67 +37,45 @@ empty(Width) when Width =< 256 -> enter({hash, no_lookup}, Bloom) -> Bloom; enter({hash, Hash}, Bloom) -> - {H0, Bit1, Bit2} = split_hash(Hash), - Slot = H0 rem dict:size(Bloom), + {Slot0, Q, Bit1, Bit2, Bit3} = split_hash(Hash), + Slot = Slot0 rem dict:size(Bloom), BitArray0 = dict:fetch(Slot, Bloom), + {Pre, SplitArray0, Post} = split_array(BitArray0, Q), FoldFun = - fun(K, Arr) -> add_to_array(K, Arr, 4096) end, - BitArray1 = lists:foldl(FoldFun, - BitArray0, - lists:usort([Bit1, Bit2])), - dict:store(Slot, BitArray1, Bloom); + fun(Bit, Arr) -> add_to_array(Bit, Arr, 1024) end, + SplitArray1 = lists:foldl(FoldFun, + SplitArray0, + lists:usort([Bit1, Bit2, Bit3])), + dict:store(Slot, <
>, Bloom);
 enter(Key, Bloom) ->
     Hash = leveled_codec:magic_hash(Key),
     enter({hash, Hash}, Bloom).
 
 check({hash, Hash}, Bloom) ->
-    {H0, Bit1, Bit2} = split_hash(Hash),
-    Slot = H0 rem dict:size(Bloom),
+    {Slot0, Q, Bit1, Bit2, Bit3} = split_hash(Hash),
+    Slot = Slot0 rem dict:size(Bloom),
     BitArray = dict:fetch(Slot, Bloom),
-    case getbit(Bit1, BitArray, 4096) of
+    {_Pre, SplitArray, _Post} = split_array(BitArray, Q),
+    
+    case getbit(Bit1, SplitArray, 1024) of
         <<0:1>> ->
             false;
         <<1:1>> ->
-            case getbit(Bit2, BitArray, 4096) of
+            case getbit(Bit2, SplitArray, 1024) of
                 <<0:1>> ->
                     false;
                 <<1:1>> ->
-                    true
-            end
-    end;
-check(Key, Bloom) ->
-    Hash = leveled_codec:magic_hash(Key),
-    check({hash, Hash}, Bloom).
-
-tiny_empty() ->
-    <<0:1024>>.
-
-tiny_enter({hash, no_lookup}, _Key, Bloom) ->
-    Bloom;
-tiny_enter({hash, Hash}, Key, Bloom) ->
-    {Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash, Key),
-    AddFun = fun(Bit, Arr0) -> add_to_array(Bit, Arr0, 1024) end,
-    lists:foldl(AddFun, Bloom, [Bit0, Bit1, Bit2]).
-
-
-tiny_check({hash, Hash}, Key, Bloom) ->
-    {Bit0, Bit1, Bit2} = split_hash_for_tinybloom(Hash, Key),
-    case getbit(Bit0, Bloom, 1024) of
-        <<0:1>> ->
-            false;
-        <<1:1>> ->
-            case getbit(Bit1, Bloom, 1024) of
-                <<0:1>> ->
-                    false;
-                <<1:1>> ->
-                    case getbit(Bit2, Bloom, 1024) of
+                    case getbit(Bit3, SplitArray, 1024) of
                         <<0:1>> ->
                             false;
                         <<1:1>> ->
                             true
                     end
             end
-    end.
+    end;
+check(Key, Bloom) ->
+    Hash = leveled_codec:magic_hash(Key),
+    check({hash, Hash}, Bloom).
 
 
 %%%============================================================================
@@ -108,18 +83,32 @@ tiny_check({hash, Hash}, Key, Bloom) ->
 %%%============================================================================
 
 split_hash(Hash) ->
-    H0 = Hash band 255,
-    H1 = (Hash bsr 8) band 4095,
-    H2 = Hash bsr 20,
-    {H0, H1, H2}.
+    SlotH1 = Hash band 255,
+    SlotH2 = (Hash bsr 8) band 255,
+    SlotH3 = (Hash bsr 16) band 255,
+    SlotH4 = (Hash bsr 24) band 255,
+    Slot = (SlotH1 bxor SlotH2) bxor (SlotH3 bxor SlotH4),
+    Q1 = Hash band 3,
+    H1 = (Hash bsr 2) band 1023,
+    H2 = (Hash bsr 12) band 1023,
+    H3 = (Hash bsr 22) band 1023,
+    {Slot, Q1, H1, H2, H3}.
 
-split_hash_for_tinybloom(MagicHash, Key) ->
-    % Tiny bloom can make k=3 from one hash
-    Hash = MagicHash bxor erlang:phash2(Key),
-    H0 = Hash band 1023,
-    H1 = (Hash bsr 11) band 1023,
-    H2 = (Hash bsr 22) band 1023,
-    {H0, H1, H2}.
+split_array(Bin, Q) ->
+    case Q of
+        0 ->
+            <> = Bin,
+            {<<>>, ToUse, Post};
+        1 ->
+            <> = Bin,
+            {Pre, ToUse, Post};
+        2 ->
+            <> = Bin,
+            {Pre, ToUse, Post};
+        3 ->
+            <> = Bin,
+            {Pre, ToUse, <<>>}
+    end.
 
 add_to_array(Bit, BitArray, ArrayLength) ->
     RestLen = ArrayLength - Bit - 1,
@@ -194,57 +183,6 @@ simple_test() ->
                 [N, timer:now_diff(os:timestamp(), SW3), FP / N]),
     ?assertMatch(true, FP < (N div 4)).
 
-tiny_test() ->
-    N = 128,
-    K = 64, % more checks out than in K * checks
-    KLin = lists:map(fun(X) -> "Key_" ++
-                                integer_to_list(X) ++
-                                integer_to_list(random:uniform(100)) ++
-                                binary_to_list(crypto:rand_bytes(2))
-                                end,
-                        lists:seq(1, N)),
-    KLout = lists:map(fun(X) ->
-                            "NotKey_" ++
-                            integer_to_list(X) ++
-                            integer_to_list(random:uniform(100)) ++
-                            binary_to_list(crypto:rand_bytes(2))
-                            end,
-                        lists:seq(1, N * K)),
-    
-    HashIn = lists:map(fun(X) ->
-                            {{hash, leveled_codec:magic_hash(X)}, X} end,
-                            KLin),
-    HashOut = lists:map(fun(X) ->
-                            {{hash, leveled_codec:magic_hash(X)}, X} end,
-                            KLout),
-       
-    SW1 = os:timestamp(),
-    Bloom = lists:foldr(fun({H0, K0}, B) -> tiny_enter(H0, K0, B) end,
-                        tiny_empty(),
-                        HashIn),
-    io:format(user,
-                "~nAdding ~w hashes to tiny bloom took ~w microseconds~n",
-                [N, timer:now_diff(os:timestamp(), SW1)]),
-    
-    SW2 = os:timestamp(),
-    lists:foreach(fun({H1, K1}) ->
-                    ?assertMatch(true, tiny_check(H1, K1, Bloom)) end, HashIn),
-    io:format(user,
-                "~nChecking ~w hashes in tiny bloom took ~w microseconds~n",
-                [N, timer:now_diff(os:timestamp(), SW2)]),
-    
-    SW3 = os:timestamp(),
-    FP = lists:foldr(fun({H3, K3}, Acc) -> case tiny_check(H3, K3, Bloom) of
-                                        true -> Acc + 1;
-                                        false -> Acc
-                                    end end,
-                        0,
-                        HashOut),
-    io:format(user,
-                "~nChecking ~w hashes out of tiny bloom took ~w microseconds "
-                    ++ "with ~w false positive rate~n",
-                [N * K, timer:now_diff(os:timestamp(), SW3), FP / (N * K)]),
-    ?assertMatch(true, FP < ((N * K) div 8)).
 
 
 -endif.
\ No newline at end of file

From baa644383d9d45665f9d7d053f031c2b9afb48bd Mon Sep 17 00:00:00 2001
From: martinsumner 
Date: Mon, 2 Jan 2017 18:29:15 +0000
Subject: [PATCH 42/58] Make tinybloom size configurable

Allow the bloom size to vary depending on how many fetchable keys there
are - so there is no large bloom held if most of the keys are index
entries for example
---
 src/leveled_sst.erl       | 9 +++++----
 src/leveled_tinybloom.erl | 2 +-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl
index f84a0d1..2c1c25e 100644
--- a/src/leveled_sst.erl
+++ b/src/leveled_sst.erl
@@ -72,7 +72,7 @@
 -define(SLOT_SIZE, 128). % This is not configurable
 -define(COMPRESSION_LEVEL, 1).
 -define(BINARY_SETTINGS, [{compressed, ?COMPRESSION_LEVEL}]).
--define(LEVEL_BLOOM_SLOTS, [{0, 64}, {1, 48}, {default, 32}]).
+-define(LEVEL_BLOOM_BITS, [{0, 12}, {1, 10}, {2, 8}, {default, 6}]).
 -define(MERGE_SCANWIDTH, 16).
 -define(DISCARD_EXT, ".discarded").
 -define(DELETE_TIMEOUT, 10000).
@@ -614,13 +614,14 @@ open_reader(Filename) ->
     {Handle, SummaryBin}.
 
 build_table_summary(SlotIndex, AllHashes, Level, FirstKey, L, MaxSQN) ->
-    BloomSlots =
-        case lists:keyfind(Level, 1, ?LEVEL_BLOOM_SLOTS) of
+    BloomBits =
+        case lists:keyfind(Level, 1, ?LEVEL_BLOOM_BITS) of
             {Level, N} ->
                 N;
             false ->
-                element(2, lists:keyfind(default, 1, ?LEVEL_BLOOM_SLOTS))
+                element(2, lists:keyfind(default, 1, ?LEVEL_BLOOM_BITS))
         end,
+    BloomSlots = (length(AllHashes) * BloomBits) div 4096,
     BloomAddFun =
         fun({H, _K}, Bloom) -> leveled_tinybloom:enter(H, Bloom) end,
     Bloom = lists:foldr(BloomAddFun,
diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl
index 1cfde77..73982e5 100644
--- a/src/leveled_tinybloom.erl
+++ b/src/leveled_tinybloom.erl
@@ -133,7 +133,7 @@ getbit(Bit, BitArray, ArrayLength) ->
 
 simple_test() ->
     N = 4000,
-    W = 4,
+    W = 6,
     KLin = lists:map(fun(X) -> "Key_" ++
                                 integer_to_list(X) ++
                                 integer_to_list(random:uniform(100)) ++

From b3e189b012d0afcf5712731594086729fa284fe0 Mon Sep 17 00:00:00 2001
From: martinsumner 
Date: Mon, 2 Jan 2017 18:38:14 +0000
Subject: [PATCH 43/58] Protect against div by 0

Make sure that blooms are always at least 1 slot in size
---
 src/leveled_sst.erl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl
index 2c1c25e..f8b9466 100644
--- a/src/leveled_sst.erl
+++ b/src/leveled_sst.erl
@@ -621,7 +621,7 @@ build_table_summary(SlotIndex, AllHashes, Level, FirstKey, L, MaxSQN) ->
             false ->
                 element(2, lists:keyfind(default, 1, ?LEVEL_BLOOM_BITS))
         end,
-    BloomSlots = (length(AllHashes) * BloomBits) div 4096,
+    BloomSlots = max((length(AllHashes) * BloomBits) div 4096, 1),
     BloomAddFun =
         fun({H, _K}, Bloom) -> leveled_tinybloom:enter(H, Bloom) end,
     Bloom = lists:foldr(BloomAddFun,

From 31d43468068bfff108d29b28bfd8027bfe181dce Mon Sep 17 00:00:00 2001
From: martinsumner 
Date: Mon, 2 Jan 2017 18:54:19 +0000
Subject: [PATCH 44/58] Log improvements

Log a warning on bad CRC. Also, SST timing logs were not being seen, so
emit them more frequently
---
 src/leveled_log.erl | 4 +++-
 src/leveled_sst.erl | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/leveled_log.erl b/src/leveled_log.erl
index 231dbce..d57083c 100644
--- a/src/leveled_log.erl
+++ b/src/leveled_log.erl
@@ -17,7 +17,7 @@
 -define(PUT_LOGPOINT, 20000).
 -define(HEAD_LOGPOINT, 160000).
 -define(GET_LOGPOINT, 160000).
--define(SST_LOGPOINT, 200000).
+-define(SST_LOGPOINT, 20000).
 -define(LOG_LEVEL, [info, warn, error, critical]).
 -define(SAMPLE_RATE, 16).
 
@@ -248,6 +248,8 @@
         {info, "Exit called and now clearing ~s"}},
     {"SST08",
         {info, "Completed creation of ~s at level ~w with max sqn ~w"}},
+    {"SST09",
+        {warn, "Read request exposes slot with bad CRC"}},
     
     {"CDB01",
         {info, "Opening file for writing with filename ~s"}},
diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl
index f8b9466..f7903f7 100644
--- a/src/leveled_sst.erl
+++ b/src/leveled_sst.erl
@@ -1060,6 +1060,7 @@ crc_check_slot(FullBin) ->
             Lengths = {B1P, B1L, B2L, B3L, B4L},
             {Lengths, Rest};
         _ ->
+            leveled_log:log("SST09", []),
             crc_wonky
     end.
 

From 5b4c903d53a46e2ed859dbefa329347f5ba6a3cc Mon Sep 17 00:00:00 2001
From: martinsumner 
Date: Mon, 2 Jan 2017 20:02:49 +0000
Subject: [PATCH 45/58] Check before update on bloom

---
 src/leveled_sst.erl       |  2 +-
 src/leveled_tinybloom.erl | 21 +++++++++++++++++----
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl
index f7903f7..addabe5 100644
--- a/src/leveled_sst.erl
+++ b/src/leveled_sst.erl
@@ -72,7 +72,7 @@
 -define(SLOT_SIZE, 128). % This is not configurable
 -define(COMPRESSION_LEVEL, 1).
 -define(BINARY_SETTINGS, [{compressed, ?COMPRESSION_LEVEL}]).
--define(LEVEL_BLOOM_BITS, [{0, 12}, {1, 10}, {2, 8}, {default, 6}]).
+-define(LEVEL_BLOOM_BITS, [{0, 8}, {1, 10}, {2, 8}, {default, 6}]).
 -define(MERGE_SCANWIDTH, 16).
 -define(DISCARD_EXT, ".discarded").
 -define(DELETE_TIMEOUT, 10000).
diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl
index 73982e5..9d85b9a 100644
--- a/src/leveled_tinybloom.erl
+++ b/src/leveled_tinybloom.erl
@@ -51,6 +51,7 @@ enter(Key, Bloom) ->
     Hash = leveled_codec:magic_hash(Key),
     enter({hash, Hash}, Bloom).
 
+
 check({hash, Hash}, Bloom) ->
     {Slot0, Q, Bit1, Bit2, Bit3} = split_hash(Hash),
     Slot = Slot0 rem dict:size(Bloom),
@@ -83,16 +84,23 @@ check(Key, Bloom) ->
 %%%============================================================================
 
 split_hash(Hash) ->
+    Slot = split_for_slot(Hash),
+    {Q1, H1, H2, H3} = split_for_bits(Hash),
+    {Slot, Q1, H1, H2, H3}.
+
+split_for_slot(Hash) ->
     SlotH1 = Hash band 255,
     SlotH2 = (Hash bsr 8) band 255,
     SlotH3 = (Hash bsr 16) band 255,
     SlotH4 = (Hash bsr 24) band 255,
-    Slot = (SlotH1 bxor SlotH2) bxor (SlotH3 bxor SlotH4),
+    (SlotH1 bxor SlotH2) bxor (SlotH3 bxor SlotH4).
+
+split_for_bits(Hash) ->
     Q1 = Hash band 3,
     H1 = (Hash bsr 2) band 1023,
     H2 = (Hash bsr 12) band 1023,
     H3 = (Hash bsr 22) band 1023,
-    {Slot, Q1, H1, H2, H3}.
+    {Q1, H1, H2, H3}.
 
 split_array(Bin, Q) ->
     case Q of
@@ -113,9 +121,14 @@ split_array(Bin, Q) ->
 add_to_array(Bit, BitArray, ArrayLength) ->
     RestLen = ArrayLength - Bit - 1,
     <<Head:Bit/bitstring, B:1/integer, Rest:RestLen/bitstring>> = BitArray,
-    <<Head/bitstring, 1:1, Rest/bitstring>>.
+    case B of
+        0 ->
+            <<Head/bitstring, 1:1, Rest/bitstring>>;
+        1 ->
+            BitArray
+    end.
 
 getbit(Bit, BitArray, ArrayLength) ->
     RestLen = ArrayLength - Bit - 1,

From d28e5d639cdd7f52117b5622a1efb730b1ab7b53 Mon Sep 17 00:00:00 2001
From: martinsumner 
Date: Tue, 3 Jan 2017 09:12:41 +0000
Subject: [PATCH 46/58] Remove SST blooms

---
 src/leveled_sst.erl | 110 +++++++++++++++++---------------------------
 1 file changed, 41 insertions(+), 69 deletions(-)

diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl
index addabe5..2277249 100644
--- a/src/leveled_sst.erl
+++ b/src/leveled_sst.erl
@@ -72,7 +72,7 @@
 -define(SLOT_SIZE, 128). % This is not configurable
 -define(COMPRESSION_LEVEL, 1).
 -define(BINARY_SETTINGS, [{compressed, ?COMPRESSION_LEVEL}]).
--define(LEVEL_BLOOM_BITS, [{0, 8}, {1, 10}, {2, 8}, {default, 6}]).
+% -define(LEVEL_BLOOM_BITS, [{0, 8}, {1, 10}, {2, 8}, {default, 6}]).
 -define(MERGE_SCANWIDTH, 16).
 -define(DISCARD_EXT, ".discarded").
 -define(DELETE_TIMEOUT, 10000).
@@ -117,8 +117,7 @@
                         last_key :: tuple(),
                         index :: list(), % leveled_skiplist
                         size :: integer(),
-                        max_sqn :: integer(),
-                        bloom}).
+                        max_sqn :: integer()}).
 
 -record(state,      {summary,
                         handle :: file:fd(),
@@ -417,60 +416,47 @@ code_change(_OldVsn, StateName, State, _Extra) ->
 
 fetch(LedgerKey, Hash, State) ->
     Summary = State#state.summary,
-    case leveled_tinybloom:check({hash, Hash},
-                                    Summary#summary.bloom) of
-        false ->
-            {not_present, summary_bloom, null, State};
-        true ->
-            Slot = lookup_slot(LedgerKey, Summary#summary.index),
-            SlotID = Slot#slot_index_value.slot_id,
-            CachedBlockIdx = array:get(SlotID - 1, 
+    Slot = lookup_slot(LedgerKey, Summary#summary.index),
+    SlotID = Slot#slot_index_value.slot_id,
+    CachedBlockIdx = array:get(SlotID - 1, 
+                                State#state.blockindex_cache),
+    case CachedBlockIdx of 
+        none ->
+            SlotBin = read_slot(State#state.handle, Slot),
+            {Result, BlockIdx} = binaryslot_get(SlotBin, 
+                                                LedgerKey, 
+                                                Hash, 
+                                                none),
+            BlockIndexCache = array:set(SlotID - 1, 
+                                        BlockIdx,
                                         State#state.blockindex_cache),
-            case CachedBlockIdx of 
-                none ->
-                    SlotBin = read_slot(State#state.handle, Slot),
-                    {Result, BlockIdx} = binaryslot_get(SlotBin, 
-                                                        LedgerKey, 
-                                                        Hash, 
-                                                        none),
-                    BlockIndexCache = array:set(SlotID - 1, 
-                                                BlockIdx,
-                                                State#state.blockindex_cache),
-                    {Result, 
-                        slot_fetch, 
-                        Slot#slot_index_value.slot_id,
-                        State#state{blockindex_cache = BlockIndexCache}};
+            {Result, 
+                slot_fetch, 
+                Slot#slot_index_value.slot_id,
+                State#state{blockindex_cache = BlockIndexCache}};
+        _ ->
+            PosList = find_pos(CachedBlockIdx, 
+                                double_hash(Hash, LedgerKey), 
+                                [], 
+                                0),
+            case PosList of 
+                [] ->
+                    {not_present, slot_bloom,  SlotID, State};
                 _ ->
-                    PosList = find_pos(CachedBlockIdx, 
-                                        double_hash(Hash, LedgerKey), 
-                                        [], 
-                                        0),
-                    case PosList of 
-                        [] ->
-                            {not_present,
-                                slot_bloom, 
-                                SlotID,
-                                State};
+                    LastKV = array:get(SlotID - 1,
+                                        State#state.lastfetch_cache),
+                    case LastKV of 
+                        {LedgerKey, _} ->
+                            {LastKV, slot_cache, SlotID, State};
                         _ ->
-                            LastKV = array:get(SlotID - 1,
-                                                State#state.lastfetch_cache),
-                            case LastKV of 
-                                {LedgerKey, _} ->
-                                    {LastKV, slot_cache, SlotID, State};
-                                _ ->
-                                    SlotBin = read_slot(State#state.handle, 
-                                                        Slot),
-                                    Result = binaryslot_get(SlotBin, 
-                                                            LedgerKey, 
-                                                            Hash, 
-                                                            {true, PosList}),
-                                    {element(1, Result), 
-                                        slot_fetch,
-                                        SlotID,
-                                        State}
-                            end
-                    end 
-            end
+                            SlotBin = read_slot(State#state.handle, Slot),
+                            Result = binaryslot_get(SlotBin, 
+                                                    LedgerKey, 
+                                                    Hash, 
+                                                    {true, PosList}),
+                            {element(1, Result), slot_fetch, SlotID, State}
+                    end
+            end 
     end.
 
 
@@ -613,26 +599,12 @@ open_reader(Filename) ->
     {ok, SummaryBin} = file:pread(Handle, SlotsLength + 8, SummaryLength),
     {Handle, SummaryBin}.
 
-build_table_summary(SlotIndex, AllHashes, Level, FirstKey, L, MaxSQN) ->
-    BloomBits =
-        case lists:keyfind(Level, 1, ?LEVEL_BLOOM_BITS) of
-            {Level, N} ->
-                N;
-            false ->
-                element(2, lists:keyfind(default, 1, ?LEVEL_BLOOM_BITS))
-        end,
-    BloomSlots = max((length(AllHashes) * BloomBits) div 4096, 1),
-    BloomAddFun =
-        fun({H, _K}, Bloom) -> leveled_tinybloom:enter(H, Bloom) end,
-    Bloom = lists:foldr(BloomAddFun,
-                            leveled_tinybloom:empty(BloomSlots),
-                            AllHashes),
+build_table_summary(SlotIndex, _AllHashes, _Level, FirstKey, L, MaxSQN) ->
     [{LastKey, _LastV}|_Rest] = SlotIndex,
     Summary = #summary{first_key = FirstKey,
                         last_key = LastKey,
                         size = L,
                         index = lists:reverse(SlotIndex),
-                        bloom = Bloom,
                         max_sqn = MaxSQN},
     SummBin = term_to_binary(Summary, ?BINARY_SETTINGS),
     SummCRC = erlang:crc32(SummBin),

From b6ae0e1af51850492c421ca8fcfc99756de50ef7 Mon Sep 17 00:00:00 2001
From: martinsumner 
Date: Tue, 3 Jan 2017 13:03:59 +0000
Subject: [PATCH 47/58] Fix broken SST cache

---
 src/leveled_log.erl |  2 +-
 src/leveled_sst.erl | 17 +++++++++++++----
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/src/leveled_log.erl b/src/leveled_log.erl
index d57083c..5ee6dc0 100644
--- a/src/leveled_log.erl
+++ b/src/leveled_log.erl
@@ -433,7 +433,7 @@ sst_timing({N, SSTTimerD}, SW, TimerType) ->
     end.
 
 sst_keylist() ->
-    [summary_bloom, slot_cache, slot_bloom, slot_fetch].
+    [slot_cache, slot_bloom, slot_fetch].
 
 
 get_timing(undefined, SW, TimerType) ->
diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl
index 2277249..80e71e4 100644
--- a/src/leveled_sst.erl
+++ b/src/leveled_sst.erl
@@ -303,12 +303,12 @@ reader({get_kv, LedgerKey, Hash}, _From, State) ->
     case {Result, Stage} of
         {not_present, _} ->
             {reply, Result, reader, UpdState#state{sst_timings = UpdTimings}};
-        {KV, slot_lookup_hit} ->
+        {_KV, slot_cache} ->
+            {reply, Result, reader, UpdState#state{sst_timings = UpdTimings}};
+        {KV, _} ->
             UpdCache = array:set(SlotID - 1, KV, State#state.lastfetch_cache),
             {reply, Result, reader, UpdState#state{lastfetch_cache = UpdCache,
-                                                    sst_timings = UpdTimings}};
-        _ ->
-            {reply, Result, reader, UpdState#state{sst_timings = UpdTimings}}
+                                                    sst_timings = UpdTimings}}
     end;
 reader({get_kvrange, StartKey, EndKey, ScanWidth}, _From, State) ->
     {reply,
@@ -1477,6 +1477,15 @@ simple_persisted_test() ->
                                                 1,
                                                 KVList1,
                                                 length(KVList1)),
+    SW0 = os:timestamp(),
+    lists:foreach(fun({K, V}) ->
+                        ?assertMatch({K, V}, sst_get(Pid, K))
+                        end,
+                    KVList1),
+    io:format(user,
+                "Checking for ~w keys (once) in file with cache hit took ~w "
+                    ++ "microseconds~n",
+                [length(KVList1), timer:now_diff(os:timestamp(), SW0)]),
     SW1 = os:timestamp(),
     lists:foreach(fun({K, V}) ->
                         ?assertMatch({K, V}, sst_get(Pid, K)),

From e1d843a2ebfd493d0297dc524c28ee8d7a233dcf Mon Sep 17 00:00:00 2001
From: martinsumner 
Date: Tue, 3 Jan 2017 15:26:44 +0000
Subject: [PATCH 48/58] Remove lastfetch cache

It appears to have some benefit at lower levels, but overall has less
benefit at higher levels.  Probably not worth having unless it can be
controlled to go in at the basement only.
---
 src/leveled_sst.erl | 53 ++++++++++-----------------------------------
 1 file changed, 12 insertions(+), 41 deletions(-)

diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl
index 80e71e4..6c21463 100644
--- a/src/leveled_sst.erl
+++ b/src/leveled_sst.erl
@@ -125,7 +125,6 @@
                         slot_lengths :: list(),
                         penciller :: pid(),
                         filename,
-                        lastfetch_cache,
                         blockindex_cache}).
 
 
@@ -298,18 +297,9 @@ starting({sst_newlevelzero, Filename, Slots, FetchFun, Penciller, MaxSQN},
 
 reader({get_kv, LedgerKey, Hash}, _From, State) ->
     SW = os:timestamp(),
-    {Result, Stage, SlotID, UpdState} = fetch(LedgerKey, Hash, State),
+    {Result, Stage, _SlotID, UpdState} = fetch(LedgerKey, Hash, State),
     UpdTimings = leveled_log:sst_timing(State#state.sst_timings, SW, Stage),
-    case {Result, Stage} of
-        {not_present, _} ->
-            {reply, Result, reader, UpdState#state{sst_timings = UpdTimings}};
-        {_KV, slot_cache} ->
-            {reply, Result, reader, UpdState#state{sst_timings = UpdTimings}};
-        {KV, _} ->
-            UpdCache = array:set(SlotID - 1, KV, State#state.lastfetch_cache),
-            {reply, Result, reader, UpdState#state{lastfetch_cache = UpdCache,
-                                                    sst_timings = UpdTimings}}
-    end;
+    {reply, Result, reader, UpdState#state{sst_timings = UpdTimings}};
 reader({get_kvrange, StartKey, EndKey, ScanWidth}, _From, State) ->
     {reply,
         fetch_range(StartKey, EndKey, ScanWidth, State),
@@ -350,17 +340,8 @@ reader(close, _From, State) ->
 
 
 delete_pending({get_kv, LedgerKey, Hash}, _From, State) ->
-    {Result, Stage, SlotID, UpdState} = fetch(LedgerKey, Hash, State),
-    case {Result, Stage} of
-        {not_present, _} ->
-            {reply, Result, delete_pending, State, ?DELETE_TIMEOUT};
-        {KV, slot_lookup_hit} ->
-            UpdCache = array:set(SlotID - 1, KV, State#state.lastfetch_cache),
-            UpdState = State#state{lastfetch_cache = UpdCache},
-            {reply, Result, delete_pending, UpdState, ?DELETE_TIMEOUT};
-        _ ->
-            {reply, Result, delete_pending, State, ?DELETE_TIMEOUT}
-    end;
+    {Result, _Stage, _SlotID, UpdState} = fetch(LedgerKey, Hash, State),
+    {reply, Result, delete_pending, UpdState, ?DELETE_TIMEOUT};
 delete_pending({get_kvrange, StartKey, EndKey, ScanWidth}, _From, State) ->
     {reply,
         fetch_range(StartKey, EndKey, ScanWidth, State),
@@ -443,19 +424,12 @@ fetch(LedgerKey, Hash, State) ->
                 [] ->
                     {not_present, slot_bloom,  SlotID, State};
                 _ ->
-                    LastKV = array:get(SlotID - 1,
-                                        State#state.lastfetch_cache),
-                    case LastKV of 
-                        {LedgerKey, _} ->
-                            {LastKV, slot_cache, SlotID, State};
-                        _ ->
-                            SlotBin = read_slot(State#state.handle, Slot),
-                            Result = binaryslot_get(SlotBin, 
-                                                    LedgerKey, 
-                                                    Hash, 
-                                                    {true, PosList}),
-                            {element(1, Result), slot_fetch, SlotID, State}
-                    end
+                    SlotBin = read_slot(State#state.handle, Slot),
+                    Result = binaryslot_get(SlotBin, 
+                                            LedgerKey, 
+                                            Hash, 
+                                            {true, PosList}),
+                    {element(1, Result), slot_fetch, SlotID, State}
             end 
     end.
 
@@ -573,12 +547,9 @@ read_file(Filename, State) ->
             undefined ->
                 BlockIndexCache = array:new([{size, SlotCount}, 
                                                 {default, none}]),
-                LastFetchCache = array:new([{size, SlotCount}]),
-                State#state{blockindex_cache = BlockIndexCache,
-                                lastfetch_cache = LastFetchCache};
+                State#state{blockindex_cache = BlockIndexCache};
             _ ->
-                LastFetchCache = array:new([{size, SlotCount}]),
-                State#state{lastfetch_cache = LastFetchCache}
+                State
         end,
 
     SkipL = leveled_skiplist:from_sortedlist(Summary#summary.index),

From 70c6e52fa7031616cee4dfe465c5c894bd6952cc Mon Sep 17 00:00:00 2001
From: martinsumner 
Date: Tue, 3 Jan 2017 15:27:28 +0000
Subject: [PATCH 49/58] Remove logs for slot_cache

---
 src/leveled_log.erl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/leveled_log.erl b/src/leveled_log.erl
index 5ee6dc0..a8c94d9 100644
--- a/src/leveled_log.erl
+++ b/src/leveled_log.erl
@@ -433,7 +433,7 @@ sst_timing({N, SSTTimerD}, SW, TimerType) ->
     end.
 
 sst_keylist() ->
-    [slot_cache, slot_bloom, slot_fetch].
+    [slot_bloom, slot_fetch].
 
 
 get_timing(undefined, SW, TimerType) ->

From fba70edc94ee6e8c6f77c89706144cf4cf62a0ac Mon Sep 17 00:00:00 2001
From: Martin Sumner 
Date: Tue, 3 Jan 2017 17:08:36 +0000
Subject: [PATCH 50/58] Stop sort
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

sort probably doesn’t help
---
 src/leveled_tinybloom.erl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl
index 9d85b9a..5e86473 100644
--- a/src/leveled_tinybloom.erl
+++ b/src/leveled_tinybloom.erl
@@ -45,7 +45,7 @@ enter({hash, Hash}, Bloom) ->
         fun(Bit, Arr) -> add_to_array(Bit, Arr, 1024) end,
     SplitArray1 = lists:foldl(FoldFun,
                                 SplitArray0,
-                                lists:usort([Bit1, Bit2, Bit3])),
+                                [Bit1, Bit2, Bit3]),
     dict:store(Slot, <<Pre/bitstring, SplitArray1/bitstring,
                            Post/bitstring>>, Bloom);
 enter(Key, Bloom) ->
     Hash = leveled_codec:magic_hash(Key),

From c4ebaa9f57eb1a0e06a08bf5e3a2ad4ad45603b2 Mon Sep 17 00:00:00 2001
From: martinsumner 
Date: Tue, 3 Jan 2017 18:20:28 +0000
Subject: [PATCH 51/58] Tidy Up All Hashes

As we're no longer generating a summary bloom - no need to collect a big
list of hashes whilst building the sst file
---
 src/leveled_sst.erl | 46 ++++++++++++++++++---------------------------
 1 file changed, 18 insertions(+), 28 deletions(-)

diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl
index 6c21463..86fcd7d 100644
--- a/src/leveled_sst.erl
+++ b/src/leveled_sst.erl
@@ -241,11 +241,9 @@ starting({sst_new, Filename, Level, KVList, MaxSQN}, _From, State) ->
     {FirstKey, 
         Length, 
         SlotIndex, 
-        AllHashes, 
         BlockIndex, 
         SlotsBin} = build_all_slots(KVList),
     SummaryBin = build_table_summary(SlotIndex,
-                                        AllHashes,
                                         Level,
                                         FirstKey,
                                         Length,
@@ -268,11 +266,9 @@ starting({sst_newlevelzero, Filename, Slots, FetchFun, Penciller, MaxSQN},
     {FirstKey, 
         Length, 
         SlotIndex, 
-        AllHashes, 
         BlockIndex, 
         SlotsBin} = build_all_slots(KVList),
     SummaryBin = build_table_summary(SlotIndex,
-                                        AllHashes,
                                         0,
                                         FirstKey,
                                         Length,
@@ -570,7 +566,7 @@ open_reader(Filename) ->
     {ok, SummaryBin} = file:pread(Handle, SlotsLength + 8, SummaryLength),
     {Handle, SummaryBin}.
 
-build_table_summary(SlotIndex, _AllHashes, _Level, FirstKey, L, MaxSQN) ->
+build_table_summary(SlotIndex, _Level, FirstKey, L, MaxSQN) ->
     [{LastKey, _LastV}|_Rest] = SlotIndex,
     Summary = #summary{first_key = FirstKey,
                         last_key = LastKey,
@@ -600,18 +596,17 @@ build_all_slots(KVList) ->
     BuildResponse = build_all_slots(KVList,
                                     SlotCount,
                                     8,
-                                    [],
                                     1,
                                     [],
                                     array:new([{size, SlotCount}, 
                                                 {default, none}]),
                                     <<>>),
-    {SlotIndex, AllHashes, BlockIndex, SlotsBin} = BuildResponse,
-    {FirstKey, L, SlotIndex, AllHashes, BlockIndex, SlotsBin}.
+    {SlotIndex, BlockIndex, SlotsBin} = BuildResponse,
+    {FirstKey, L, SlotIndex, BlockIndex, SlotsBin}.
 
-build_all_slots([], _SC, _Pos, Hashes, _SlotID, SlotIdx, BlockIdxA, SlotsBin) ->
-    {SlotIdx, Hashes, BlockIdxA, SlotsBin};
-build_all_slots(KVL, SC, Pos, Hashes, SlotID, SlotIdx, BlockIdxA, SlotsBin) ->
+build_all_slots([], _SC, _Pos, _SlotID, SlotIdx, BlockIdxA, SlotsBin) ->
+    {SlotIdx, BlockIdxA, SlotsBin};
+build_all_slots(KVL, SC, Pos, SlotID, SlotIdx, BlockIdxA, SlotsBin) ->
     {SlotList, KVRem} =
         case SC of
             1 ->
@@ -620,7 +615,7 @@ build_all_slots(KVL, SC, Pos, Hashes, SlotID, SlotIdx, BlockIdxA, SlotsBin) ->
                 lists:split(?SLOT_SIZE, KVL)
         end,
     {LastKey, _V} = lists:last(SlotList),
-    {HashList, BlockIndex, SlotBin} = generate_binary_slot(SlotList),
+    {BlockIndex, SlotBin} = generate_binary_slot(SlotList),
     Length = byte_size(SlotBin),
     SlotIndexV = #slot_index_value{slot_id = SlotID,
                                     start_position = Pos,
@@ -628,7 +623,6 @@ build_all_slots(KVL, SC, Pos, Hashes, SlotID, SlotIdx, BlockIdxA, SlotsBin) ->
     build_all_slots(KVRem,
                     SC - 1,
                     Pos + Length,
-                    HashList ++ Hashes,
                     SlotID + 1,
                     [{LastKey, SlotIndexV}|SlotIdx],
                     array:set(SlotID - 1, BlockIndex, BlockIdxA),
@@ -760,7 +754,7 @@ generate_filenames(RootFilename) ->
 generate_binary_slot(KVL) ->
     
     HashFoldFun =
-        fun({K, V}, {HashListAcc, PosBinAcc, NoHashCount}) ->
+        fun({K, V}, {PosBinAcc, NoHashCount}) ->
             
             {_SQN, H1} = leveled_codec:strip_to_seqnhashonly({K, V}),
             case is_integer(H1) of 
@@ -768,8 +762,7 @@ generate_binary_slot(KVL) ->
                     PosH1 = double_hash(H1, K),
                     case NoHashCount of 
                         0 ->
-                            {[{{hash, H1}, K}|HashListAcc], 
-                                <<1:1/integer, 
+                            {<<1:1/integer, 
                                     PosH1:15/integer, 
                                     PosBinAcc/binary>>,
                                 0};
@@ -777,8 +770,7 @@ generate_binary_slot(KVL) ->
                             % The No Hash Count is an integer between 0 and 127
                             % and so at read time should count NHC + 1
                             NHC = N - 1,
-                            {[{{hash, H1}, K}|HashListAcc], 
-                                <<1:1/integer,
+                            {<<1:1/integer,
                                     PosH1:15/integer, 
                                     0:1/integer,
                                     NHC:7/integer, 
@@ -786,14 +778,12 @@ generate_binary_slot(KVL) ->
                                 0}
                     end;
                 false ->
-                    {HashListAcc, PosBinAcc, NoHashCount + 1}
+                    {PosBinAcc, NoHashCount + 1}
             end
          
          end,
 
-    {HashList, PosBinIndex0, NHC} = lists:foldr(HashFoldFun, 
-                                                {[], <<>>, 0}, 
-                                                KVL),
+    {PosBinIndex0, NHC} = lists:foldr(HashFoldFun, {<<>>, 0}, KVL),
     PosBinIndex1 = 
         case NHC of
             0 ->
@@ -850,7 +840,7 @@ generate_binary_slot(KVL) ->
     CRC32 = erlang:crc32(SlotBin),
     FullBin = <<CRC32:32/integer, SlotBin/binary>>,
 
-    {HashList, PosBinIndex1, FullBin}.
+    {PosBinIndex1, FullBin}.
 
 
 binaryslot_get(FullBin, Key, Hash, CachedPosLookup) ->
@@ -1252,7 +1242,7 @@ indexed_list_test() ->
 
     SW0 = os:timestamp(),
 
-    {_HashList, _PosBinIndex1, FullBin} = generate_binary_slot(KVL1),
+    {_PosBinIndex1, FullBin} = generate_binary_slot(KVL1),
     io:format(user,
                 "Indexed list created slot in ~w microseconds of size ~w~n",
                 [timer:now_diff(os:timestamp(), SW0), byte_size(FullBin)]),
@@ -1280,7 +1270,7 @@ indexed_list_mixedkeys_test() ->
     KVL1 = lists:sublist(KVL0, 33),
     Keys = lists:ukeysort(1, generate_indexkeys(60) ++ KVL1),
 
-    {_HashList, _PosBinIndex1, FullBin} = generate_binary_slot(Keys),
+    {PosBinIndex1, FullBin} = generate_binary_slot(Keys),
 
     {TestK1, TestV1} = lists:nth(4, KVL1),
     MH1 = leveled_codec:magic_hash(TestK1),
@@ -1301,7 +1291,7 @@ indexed_list_mixedkeys_test() ->
 
 indexed_list_allindexkeys_test() ->
     Keys = lists:sublist(lists:ukeysort(1, generate_indexkeys(150)), 128),
-    {_HashList, PosBinIndex1, FullBin} = generate_binary_slot(Keys),
+    {PosBinIndex1, FullBin} = generate_binary_slot(Keys),
     ?assertMatch(<<127:8/integer>>, PosBinIndex1),
     % SW = os:timestamp(),
     BinToList = binaryslot_tolist(FullBin),
@@ -1314,7 +1304,7 @@ indexed_list_allindexkeys_test() ->
 
 indexed_list_allindexkeys_trimmed_test() ->
     Keys = lists:sublist(lists:ukeysort(1, generate_indexkeys(150)), 128),
-    {_HashList, PosBinIndex1, FullBin} = generate_binary_slot(Keys),
+    {PosBinIndex1, FullBin} = generate_binary_slot(Keys),
     ?assertMatch(<<127:8/integer>>, PosBinIndex1),
     ?assertMatch(Keys, binaryslot_trimmedlist(FullBin, 
                                                 {i, 
@@ -1353,7 +1343,7 @@ indexed_list_mixedkeys_bitflip_test() ->
     KVL1 = lists:sublist(KVL0, 33),
     Keys = lists:ukeysort(1, generate_indexkeys(60) ++ KVL1),
 
-    {_HashList, _PosBinIndex1, FullBin} = generate_binary_slot(Keys),
+    {_PosBinIndex1, FullBin} = generate_binary_slot(Keys),
     L = byte_size(FullBin),
     Byte1 = random:uniform(L),
    <<PreB1:Byte1/binary, A:8/integer, PostB1/binary>> = FullBin,

From 2f3eb185480071615ec6de0388d29255c9705e18 Mon Sep 17 00:00:00 2001
From: martinsumner 
Date: Tue, 3 Jan 2017 18:26:54 +0000
Subject: [PATCH 52/58] Re-add usort

Change one thing at a time
---
 src/leveled_sst.erl       | 2 +-
 src/leveled_tinybloom.erl | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl
index 86fcd7d..6316d81 100644
--- a/src/leveled_sst.erl
+++ b/src/leveled_sst.erl
@@ -1270,7 +1270,7 @@ indexed_list_mixedkeys_test() ->
     KVL1 = lists:sublist(KVL0, 33),
     Keys = lists:ukeysort(1, generate_indexkeys(60) ++ KVL1),
 
-    {PosBinIndex1, FullBin} = generate_binary_slot(Keys),
+    {_PosBinIndex1, FullBin} = generate_binary_slot(Keys),
 
     {TestK1, TestV1} = lists:nth(4, KVL1),
     MH1 = leveled_codec:magic_hash(TestK1),
diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl
index 5e86473..9d85b9a 100644
--- a/src/leveled_tinybloom.erl
+++ b/src/leveled_tinybloom.erl
@@ -45,7 +45,7 @@ enter({hash, Hash}, Bloom) ->
         fun(Bit, Arr) -> add_to_array(Bit, Arr, 1024) end,
     SplitArray1 = lists:foldl(FoldFun,
                                 SplitArray0,
-                                [Bit1, Bit2, Bit3]),
+                                lists:usort([Bit1, Bit2, Bit3])),
     dict:store(Slot, <<Pre/bitstring, SplitArray1/bitstring,
                            Post/bitstring>>, Bloom);
 enter(Key, Bloom) ->
     Hash = leveled_codec:magic_hash(Key),

From be1d678d858da3b6097966c3f36e6002082ce32b Mon Sep 17 00:00:00 2001
From: Martin Sumner 
Date: Tue, 3 Jan 2017 23:43:43 +0000
Subject: [PATCH 53/58] Revert to two hash tiny bloom

---
 src/leveled_tinybloom.erl | 33 +++++++++++----------------------
 1 file changed, 11 insertions(+), 22 deletions(-)

diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl
index 9d85b9a..03c24bf 100644
--- a/src/leveled_tinybloom.erl
+++ b/src/leveled_tinybloom.erl
@@ -29,7 +29,6 @@
 %%% Bloom API
 %%%============================================================================
 
-
 empty(Width) when Width =< 256 ->
     FoldFun = fun(X, Acc) -> dict:store(X, <<0:4096>>, Acc) end,
     lists:foldl(FoldFun, dict:new(), lists:seq(0, Width - 1)).
@@ -37,7 +36,7 @@ empty(Width) when Width =< 256 ->
 enter({hash, no_lookup}, Bloom) ->
     Bloom;
 enter({hash, Hash}, Bloom) ->
-    {Slot0, Q, Bit1, Bit2, Bit3} = split_hash(Hash),
+    {Slot0, Q, Bit1, Bit2} = split_hash(Hash),
     Slot = Slot0 rem dict:size(Bloom),
     BitArray0 = dict:fetch(Slot, Bloom),
     {Pre, SplitArray0, Post} = split_array(BitArray0, Q),
@@ -45,7 +44,7 @@ enter({hash, Hash}, Bloom) ->
         fun(Bit, Arr) -> add_to_array(Bit, Arr, 1024) end,
     SplitArray1 = lists:foldl(FoldFun,
                                 SplitArray0,
-                                lists:usort([Bit1, Bit2, Bit3])),
+                                [Bit1, Bit2]),
     dict:store(Slot, <<Pre/bitstring, SplitArray1/bitstring,
                            Post/bitstring>>, Bloom);
 enter(Key, Bloom) ->
     Hash = leveled_codec:magic_hash(Key),
@@ -53,7 +52,7 @@ enter(Key, Bloom) ->
 
 
 check({hash, Hash}, Bloom) ->
-    {Slot0, Q, Bit1, Bit2, Bit3} = split_hash(Hash),
+    {Slot0, Q, Bit1, Bit2} = split_hash(Hash),
     Slot = Slot0 rem dict:size(Bloom),
     BitArray = dict:fetch(Slot, Bloom),
     {_Pre, SplitArray, _Post} = split_array(BitArray, Q),
@@ -66,12 +65,7 @@ check({hash, Hash}, Bloom) ->
                 <<0:1>> ->
                     false;
                 <<1:1>> ->
-                    case getbit(Bit3, SplitArray, 1024) of
-                        <<0:1>> ->
-                            false;
-                        <<1:1>> ->
-                            true
-                    end
+                    true
             end
     end;
 check(Key, Bloom) ->
@@ -85,22 +79,17 @@ check(Key, Bloom) ->
 
 split_hash(Hash) ->
     Slot = split_for_slot(Hash),
-    {Q1, H1, H2, H3} = split_for_bits(Hash),
-    {Slot, Q1, H1, H2, H3}.
+    {Q1, H1, H2} = split_for_bits(Hash),
+    {Slot, Q1, H1, H2}.
 
 split_for_slot(Hash) ->
-    SlotH1 = Hash band 255,
-    SlotH2 = (Hash bsr 8) band 255,
-    SlotH3 = (Hash bsr 16) band 255,
-    SlotH4 = (Hash bsr 24) band 255,
-    (SlotH1 bxor SlotH2) bxor (SlotH3 bxor SlotH4).
+    Hash band 255.
 
 split_for_bits(Hash) ->
-    Q1 = Hash band 3,
-    H1 = (Hash bsr 2) band 1023,
-    H2 = (Hash bsr 12) band 1023,
-    H3 = (Hash bsr 22) band 1023,
-    {Q1, H1, H2, H3}.
+    H1 = (Hash bsr 8) band 1023,
+    H2 = (Hash bsr 18) band 1023,
+    Q1 = (Hash bsr 28) band 3,
+    {Q1, H1, H2}.
 
 split_array(Bin, Q) ->
     case Q of

From 85aaccfe3197056d4ff6e817b67e280950b471ea Mon Sep 17 00:00:00 2001
From: Martin Sumner 
Date: Tue, 3 Jan 2017 23:53:57 +0000
Subject: [PATCH 54/58] Revert to non-split tinybloom

---
 src/leveled_tinybloom.erl | 45 +++++++++++----------------------------
 1 file changed, 13 insertions(+), 32 deletions(-)

diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl
index 03c24bf..e513ce5 100644
--- a/src/leveled_tinybloom.erl
+++ b/src/leveled_tinybloom.erl
@@ -36,32 +36,30 @@ empty(Width) when Width =< 256 ->
 enter({hash, no_lookup}, Bloom) ->
     Bloom;
 enter({hash, Hash}, Bloom) ->
-    {Slot0, Q, Bit1, Bit2} = split_hash(Hash),
+    {Slot0, Bit1, Bit2} = split_hash(Hash),
     Slot = Slot0 rem dict:size(Bloom),
     BitArray0 = dict:fetch(Slot, Bloom),
-    {Pre, SplitArray0, Post} = split_array(BitArray0, Q),
     FoldFun =
-        fun(Bit, Arr) -> add_to_array(Bit, Arr, 1024) end,
-    SplitArray1 = lists:foldl(FoldFun,
-                                SplitArray0,
+        fun(Bit, Arr) -> add_to_array(Bit, Arr, 4096) end,
+    BitArray1 = lists:foldl(FoldFun,
+                                BitArray0,
                                 [Bit1, Bit2]),
-    dict:store(Slot, <<Pre/bitstring, SplitArray1/bitstring,
-                            Post/bitstring>>, Bloom);
+    dict:store(Slot, <<BitArray1/bitstring>>, Bloom);
 enter(Key, Bloom) ->
     Hash = leveled_codec:magic_hash(Key),
     enter({hash, Hash}, Bloom).
 
 
 check({hash, Hash}, Bloom) ->
-    {Slot0, Q, Bit1, Bit2} = split_hash(Hash),
+    {Slot0, Bit1, Bit2} = split_hash(Hash),
     Slot = Slot0 rem dict:size(Bloom),
     BitArray = dict:fetch(Slot, Bloom),
-    {_Pre, SplitArray, _Post} = split_array(BitArray, Q),
     
-    case getbit(Bit1, SplitArray, 1024) of
+    case getbit(Bit1, BitArray, 4096) of
         <<0:1>> ->
             false;
         <<1:1>> ->
-            case getbit(Bit2, SplitArray, 1024) of
+            case getbit(Bit2, BitArray, 4096) of
                 <<0:1>> ->
                     false;
                 <<1:1>> ->
@@ -79,33 +77,16 @@ check(Key, Bloom) ->
 
 split_hash(Hash) ->
     Slot = split_for_slot(Hash),
-    {Q1, H1, H2} = split_for_bits(Hash),
-    {Slot, Q1, H1, H2}.
+    {H1, H2} = split_for_bits(Hash),
+    {Slot, H1, H2}.
 
 split_for_slot(Hash) ->
     Hash band 255.
 
 split_for_bits(Hash) ->
-    H1 = (Hash bsr 8) band 1023,
-    H2 = (Hash bsr 18) band 1023,
-    Q1 = (Hash bsr 28) band 3,
-    {Q1, H1, H2}.
-
-split_array(Bin, Q) ->
-    case Q of
-        0 ->
-            <<ToUse:1024/bitstring, Post/bitstring>> = Bin,
-            {<<>>, ToUse, Post};
-        1 ->
-            <<Pre:1024/bitstring, ToUse:1024/bitstring, Post/bitstring>> = Bin,
-            {Pre, ToUse, Post};
-        2 ->
-            <<Pre:2048/bitstring, ToUse:1024/bitstring, Post/bitstring>> = Bin,
-            {Pre, ToUse, Post};
-        3 ->
-            <<Pre:3072/bitstring, ToUse:1024/bitstring>> = Bin,
-            {Pre, ToUse, <<>>}
-    end.
+    H1 = (Hash bsr 8) band 4095,
+    H2 = (Hash bsr 20) band 4095,
+    {H1, H2}.
 
 add_to_array(Bit, BitArray, ArrayLength) ->
     RestLen = ArrayLength - Bit - 1,

From 8289c3b783379a97b3a0aabef73845e5d3f6b886 Mon Sep 17 00:00:00 2001
From: Martin Sumner 
Date: Wed, 4 Jan 2017 00:26:52 +0000
Subject: [PATCH 55/58] full reversion

---
 src/leveled_tinybloom.erl | 24 ++++++------------------
 1 file changed, 6 insertions(+), 18 deletions(-)

diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl
index e513ce5..2278c2a 100644
--- a/src/leveled_tinybloom.erl
+++ b/src/leveled_tinybloom.erl
@@ -43,7 +43,7 @@ enter({hash, Hash}, Bloom) ->
         fun(Bit, Arr) -> add_to_array(Bit, Arr, 4096) end,
     BitArray1 = lists:foldl(FoldFun,
                                 BitArray0,
-                                [Bit1, Bit2]),
+                                lists:usort([Bit1, Bit2])),
     dict:store(Slot, <>, Bloom);
 enter(Key, Bloom) ->
     Hash = leveled_codec:magic_hash(Key),
@@ -76,29 +76,17 @@ check(Key, Bloom) ->
 %%%============================================================================
 
 split_hash(Hash) ->
-    Slot = split_for_slot(Hash),
-    {H1, H2} = split_for_bits(Hash),
-    {Slot, H1, H2}.
-
-split_for_slot(Hash) ->
-    Hash band 255.
-
-split_for_bits(Hash) ->
+    H0 = Hash band 255,
     H1 = (Hash bsr 8) band 4095,
-    H2 = (Hash bsr 20) band 4095,
-    {H1, H2}.
+    H2 = Hash bsr 20,
+    {H0, H1, H2}.
 
 add_to_array(Bit, BitArray, ArrayLength) ->
     RestLen = ArrayLength - Bit - 1,
     <> = BitArray,
-    case B of
-        0 ->
-            <>;
-        1 ->
-            BitArray
-    end.
+    <>.
 
 getbit(Bit, BitArray, ArrayLength) ->
     RestLen = ArrayLength - Bit - 1,

From 7d95fa6bbc11782950f56735fd2f9e1a65912a0d Mon Sep 17 00:00:00 2001
From: martinsumner 
Date: Wed, 4 Jan 2017 14:26:11 +0000
Subject: [PATCH 56/58] Switch summary index

Simplify the summary index implementation
---
 src/leveled_sst.erl | 173 ++++++++++++++++++++++++++++----------------
 1 file changed, 109 insertions(+), 64 deletions(-)

diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl
index 6316d81..387828f 100644
--- a/src/leveled_sst.erl
+++ b/src/leveled_sst.erl
@@ -74,6 +74,7 @@
 -define(BINARY_SETTINGS, [{compressed, ?COMPRESSION_LEVEL}]).
 % -define(LEVEL_BLOOM_BITS, [{0, 8}, {1, 10}, {2, 8}, {default, 6}]).
 -define(MERGE_SCANWIDTH, 16).
+-define(INDEX_MARKER_WIDTH, 16).
 -define(DISCARD_EXT, ".discarded").
 -define(DELETE_TIMEOUT, 10000).
 
@@ -115,14 +116,13 @@
 
 -record(summary,    {first_key :: tuple(),
                         last_key :: tuple(),
-                        index :: list(), % leveled_skiplist
+                        index :: tuple(), 
                         size :: integer(),
                         max_sqn :: integer()}).
 
 -record(state,      {summary,
                         handle :: file:fd(),
                         sst_timings :: tuple(),
-                        slot_lengths :: list(),
                         penciller :: pid(),
                         filename,
                         blockindex_cache}).
@@ -530,14 +530,8 @@ write_file(Filename, SummaryBin, SlotsBin) ->
 
 read_file(Filename, State) ->
     {Handle, SummaryBin} = open_reader(Filename),
-    Summary = read_table_summary(SummaryBin),
-    SlotLengthFetchFun =
-        fun({_K, V}, Acc) ->
-                [{V#slot_index_value.slot_id,
-                    V#slot_index_value.length}|Acc]
-        end,
-    SlotLengths = lists:foldr(SlotLengthFetchFun, [], Summary#summary.index),
-    SlotCount = length(SlotLengths),
+    {Summary, SlotList} = read_table_summary(SummaryBin),
+    SlotCount = length(SlotList),
     UpdState = 
         case State#state.blockindex_cache of 
             undefined ->
@@ -547,17 +541,15 @@ read_file(Filename, State) ->
             _ ->
                 State
         end,
-
-    SkipL = leveled_skiplist:from_sortedlist(Summary#summary.index),
-    UpdSummary = Summary#summary{index = SkipL},
+    SlotIndex = from_list(SlotList),
+    UpdSummary = Summary#summary{index = SlotIndex},
     leveled_log:log("SST03", [Filename,
                                 Summary#summary.size,
                                 SlotCount,
                                 Summary#summary.max_sqn]),
     UpdState#state{summary = UpdSummary,
-                slot_lengths = SlotLengths,
-                handle = Handle,
-                filename = Filename}.
+                    handle = Handle,
+                    filename = Filename}.
 
 open_reader(Filename) ->
     {ok, Handle} = file:open(Filename, [binary, raw, read]),
@@ -566,14 +558,14 @@ open_reader(Filename) ->
     {ok, SummaryBin} = file:pread(Handle, SlotsLength + 8, SummaryLength),
     {Handle, SummaryBin}.
 
-build_table_summary(SlotIndex, _Level, FirstKey, L, MaxSQN) ->
-    [{LastKey, _LastV}|_Rest] = SlotIndex,
+build_table_summary(SlotList, _Level, FirstKey, L, MaxSQN) ->
+    [{LastKey, _LastV}|_Rest] = SlotList,
     Summary = #summary{first_key = FirstKey,
                         last_key = LastKey,
                         size = L,
-                        index = lists:reverse(SlotIndex),
                         max_sqn = MaxSQN},
-    SummBin = term_to_binary(Summary, ?BINARY_SETTINGS),
+    SummBin = term_to_binary({Summary, lists:reverse(SlotList)},
+                                ?BINARY_SETTINGS),
     SummCRC = erlang:crc32(SummBin),
     <>.
 
@@ -628,50 +620,6 @@ build_all_slots(KVL, SC, Pos, SlotID, SlotIdx, BlockIdxA, SlotsBin) ->
                     array:set(SlotID - 1, BlockIndex, BlockIdxA),
                     <>).
 
-
-%% Returns a section from the summary index and two booleans to indicate if
-%% the first slot needs trimming, or the last slot
-lookup_slots(StartKey, EndKey, SkipList) ->
-    SlotsOnlyFun = fun({_K, V}) -> V end,
-    {KSL, LTrim, RTrim} = lookup_slots_int(StartKey, EndKey, SkipList),
-    {lists:map(SlotsOnlyFun, KSL), LTrim, RTrim}.
-
-lookup_slots_int(all, all, SkipList) ->
-    {leveled_skiplist:to_list(SkipList), false, false};
-lookup_slots_int(StartKey, all, SkipList) ->
-    L = leveled_skiplist:to_list(SkipList),
-    LTrimFun = fun({K, _V}) -> K < StartKey end,
-    {_LDrop, RKeep0} = lists:splitwith(LTrimFun, L),
-    [{FirstKey, _V}|_Rest] = RKeep0,
-    LTrim = FirstKey < StartKey,
-    {RKeep0, LTrim, false};
-lookup_slots_int(StartKey, EndKey, SkipList) ->
-    case leveled_skiplist:to_range(SkipList, StartKey, EndKey) of
-        [] ->
-            BestKey = leveled_skiplist:key_above(SkipList, StartKey),
-            {[BestKey], true, true};
-        L0 ->
-            {LastKey, _LastVal} = lists:last(L0),
-            case LastKey of
-                EndKey ->
-                    {L0, true, false};
-                _ ->
-                    LTail = leveled_skiplist:key_above_notequals(SkipList,
-                                                                    LastKey),
-                    case LTail of
-                        false ->
-                            {L0, true, false};
-                        _ ->
-                            {L0 ++ [LTail], true, true}
-                    end
-            end
-    end.
-        
-
-lookup_slot(Key, SkipList) ->
-    {_Mark, Slot} = leveled_skiplist:key_above(SkipList, Key),
-    Slot.
-
 read_slot(Handle, Slot) ->
     {ok, SlotBin} = file:pread(Handle,
                                 Slot#slot_index_value.start_position,
@@ -727,6 +675,103 @@ generate_filenames(RootFilename) ->
     end.    
 
 
+%%%============================================================================
+%%% SlotIndex Implementation
+%%%============================================================================
+
+%% The Slot Index is stored as a flat (sorted) list of {Key, Slot} where Key
+%% is the last key within the slot.
+%%
+%% This implementation of the SlotIndex stores it as a tuple with the original
+%% list as the second element and a list of mark points as the first element
+%% containing every 16th key.  The Mark points are stored as {Mark, Index},
+%% where the Index corresponds with the nth point in the original list that the
+%% Mark occurs.
+
+from_list(SlotList) ->
+    L = length(SlotList),
+    MarkerList = set_marks(lists:reverse(SlotList),
+                            {?INDEX_MARKER_WIDTH,  L rem ?INDEX_MARKER_WIDTH},
+                            L,
+                            []),
+    {MarkerList, SlotList}.
+
+set_marks([], _MarkInfo, 0, MarkerList) ->
+    MarkerList;
+set_marks([{Key, _Slot}|Rest], {MarkerWidth, MarkPoint}, Count, MarkerList) ->
+    case Count rem MarkerWidth of
+        MarkPoint ->
+            set_marks(Rest,
+                        {MarkerWidth, MarkPoint},
+                        Count - 1,
+                        [{Key, Count}|MarkerList]);
+        _ ->
+            set_marks(Rest,
+                        {MarkerWidth, MarkPoint},
+                        Count - 1,
+                        MarkerList)
+    end.
+
+find_mark(Key, [{Mark, Pos}|_Rest]) when Mark >= Key ->
+    Pos;
+find_mark(Key, [_H|T]) ->
+    find_mark(Key, T).
+
+lookup_slot(Key, {MarkerList, SlotList}) ->
+    Pos = find_mark(Key, MarkerList),
+    SubList = lists:sublist(SlotList, max(1, Pos - ?INDEX_MARKER_WIDTH), Pos),
+    Slot = find_mark(Key, SubList),
+    Slot.
+
+%% Returns a section from the summary index and two booleans to indicate if
+%% the first slot needs trimming, or the last slot
+lookup_slots(StartKey, EndKey, {_MarkerList, SlotList}) ->
+    SlotsOnlyFun = fun({_K, V}) -> V end,
+    {KSL, LTrim, RTrim} = lookup_slots_int(StartKey, EndKey, SlotList),
+    {lists:map(SlotsOnlyFun, KSL), LTrim, RTrim}.
+
+lookup_slots_int(all, all, SlotList) ->
+    {SlotList, false, false};
+lookup_slots_int(StartKey, all, SlotList) ->
+    LTrimFun = fun({K, _V}) -> K < StartKey end,
+    {_LDrop, RKeep0} = lists:splitwith(LTrimFun, SlotList),
+    {RKeep0, true, false};
+lookup_slots_int(StartKey, EndKey, SlotList) ->
+    {RKeep, true, false} = lookup_slots_int(StartKey, all, SlotList),
+    [LeftMost|RKeep0] = RKeep,
+    {LeftMostK, LeftMostV} = LeftMost,
+    RTrimFun = fun({K, _V}) -> not leveled_codec:endkey_passed(EndKey, K) end,
+    case leveled_codec:endkey_passed(EndKey, LeftMostK) of
+        true ->
+            {[{LeftMostK, LeftMostV}],
+                true,
+                true};
+        false ->
+            case LeftMostK of
+                EndKey ->
+                    {[{LeftMostK, LeftMostV}],
+                        true,
+                        false};
+                _ ->
+                    {LKeep, RDisc} = lists:splitwith(RTrimFun, RKeep0),
+                    case RDisc of
+                        [] ->
+                            {[LeftMost|LKeep],
+                                true,
+                                true};
+                        [{RDiscK1, RDiscV1}|_Rest] when RDiscK1 == EndKey ->
+                            {[LeftMost|LKeep] ++ [{RDiscK1, RDiscV1}],
+                                true,
+                                false};
+                        [{RDiscK1, RDiscV1}|_Rest] ->
+                            {[LeftMost|LKeep] ++ [{RDiscK1, RDiscV1}],
+                                true,
+                                true}
+                    end
+            end
+    end.
+
+
 %%%============================================================================
 %%% Slot Implementation
 %%%============================================================================

From 6e8f8a9c86240e987bbf333c18e74a345cf8e554 Mon Sep 17 00:00:00 2001
From: martinsumner 
Date: Wed, 4 Jan 2017 17:19:27 +0000
Subject: [PATCH 57/58] Strip out extra stuff from skiplist

---
 src/leveled_skiplist.erl | 58 ----------------------------------------
 1 file changed, 58 deletions(-)

diff --git a/src/leveled_skiplist.erl b/src/leveled_skiplist.erl
index e8d8627..a5c3414 100644
--- a/src/leveled_skiplist.erl
+++ b/src/leveled_skiplist.erl
@@ -28,8 +28,6 @@
         to_range/3,
         lookup/2,
         lookup/3,
-        key_above/2,
-        key_above_notequals/2,
         empty/0,
         empty/1,
         size/1
@@ -124,16 +122,6 @@ to_range(SkipList, Start, End) ->
 to_list(SkipList) ->
     to_list(element(2, SkipList), ?LIST_HEIGHT).
 
-%% If a mark is found that matches the key, will return that mark
-key_above(SkipList, Key) ->
-    TestFun = fun(Mark, K) -> Mark >= K end,
-    key_above(element(2, SkipList), Key, ?LIST_HEIGHT, TestFun).
-
-%% If a mark is found that matches the key, will return the next mark
-key_above_notequals(SkipList, Key) ->
-    TestFun = fun(Mark, K) -> Mark > K end,
-    key_above(element(2, SkipList), Key, ?LIST_HEIGHT, TestFun).
-
 empty() ->
     empty(false).
 
@@ -329,37 +317,6 @@ sublist_above(SkipList, StartKey, Level, StartIncl) ->
             sublist_above(SL, StartKey, Level - 1, StartIncl)
     end.
 
-key_above(SkipList, Key, 0, TestFun) ->
-    FindFun = fun({Mark, V}, Found) ->
-                    case Found of
-                        false ->
-                            case TestFun(Mark, Key) of
-                                true ->
-                                    {Mark, V};
-                                false ->
-                                    false
-                            end;
-                        _ ->
-                            Found
-                    end
-                    end,
-    lists:foldl(FindFun, false, SkipList);
-key_above(SkipList, Key, Level, TestFun) ->
-    FindFun = fun({Mark, SL}, Found) ->
-                    case Found of
-                        false ->
-                            case TestFun(Mark, Key) of
-                                true ->
-                                    key_above(SL, Key, Level - 1, TestFun);
-                                false ->
-                                    false
-                            end;
-                        _ ->
-                            Found
-                    end
-                    end,
-    lists:foldl(FindFun, false, SkipList).
-
 empty(SkipList, 1) ->
     [{?INFINITY_KEY, SkipList}];
 empty(SkipList, Level) ->
@@ -658,21 +615,6 @@ skiplist_nolookup_test() ->
                         KL),
     ?assertMatch(KLSorted, to_list(SkipList)).
 
-skiplist_keybefore_test() ->
-    N = 128,
-    KL = generate_randomkeys(1, N, 1, N div 5),
-    SkipList = lists:foldl(fun({K, V}, Acc) ->
-                                enter_nolookup(K, V, Acc) end,
-                            empty(true),
-                            KL),
-    KLSorted = lists:ukeysort(1, lists:reverse(KL)),
-    SW = os:timestamp(),
-    lists:foreach(fun({K, V}) ->
-                        ?assertMatch({K, V}, key_above(SkipList, K)) end,
-                    KLSorted),
-    io:format(user, "~nFinding self in keys above ~w microseconds for ~w finds~n",
-                    [timer:now_diff(os:timestamp(), SW), N]).
-    
 skiplist_range_test() ->
     N = 150,
     KL = generate_randomkeys(1, N, 1, N div 5),

From 2f8ff640a99dd57fd66c71e883796b5361c5d4b0 Mon Sep 17 00:00:00 2001
From: martinsumner 
Date: Wed, 4 Jan 2017 21:36:59 +0000
Subject: [PATCH 58/58] Test coverage

Add some further unit tests to improve test coverage
---
 src/leveled_codec.erl     |  6 ---
 src/leveled_penciller.erl | 12 +-----
 src/leveled_sst.erl       | 89 ++++++++++++++++++++++++++++++---------
 3 files changed, 71 insertions(+), 36 deletions(-)

diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl
index b27f5b9..ab9ee27 100644
--- a/src/leveled_codec.erl
+++ b/src/leveled_codec.erl
@@ -37,7 +37,6 @@
         strip_to_keyonly/1,
         strip_to_seqonly/1,
         strip_to_statusonly/1,
-        strip_to_keyseqstatusonly/1,
         strip_to_keyseqonly/1,
         strip_to_seqnhashonly/1,
         striphead_to_details/1,
@@ -80,8 +79,6 @@ magic_hash({?RIAK_TAG, Bucket, Key, _SubKey}) ->
     magic_hash({Bucket, Key});
 magic_hash({?STD_TAG, Bucket, Key, _SubKey}) ->
     magic_hash({Bucket, Key});
-magic_hash({?IDX_TAG, _B, _Idx, _Key}) ->
-    no_lookup;
 magic_hash(AnyKey) ->
     BK = term_to_binary(AnyKey),
     H = 5381,
@@ -111,11 +108,8 @@ inker_reload_strategy(AltList) ->
                     ReloadStrategy0,
                     AltList).
 
-strip_to_keyonly({keyonly, K}) -> K;
 strip_to_keyonly({K, _V}) -> K.
 
-strip_to_keyseqstatusonly({K, {SeqN, St, _, _MD}}) -> {K, SeqN, St}.
-
 strip_to_statusonly({_, {_, St, _, _}}) -> St.
 
 strip_to_seqonly({_, {SeqN, _, _, _}}) -> SeqN.
diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl
index b99b3bd..853498b 100644
--- a/src/leveled_penciller.erl
+++ b/src/leveled_penciller.erl
@@ -172,7 +172,6 @@
         pcl_fetchkeys/5,
         pcl_fetchnextkey/5,
         pcl_checksequencenumber/3,
-        pcl_checksequencenumber/4,
         pcl_workforclerk/1,
         pcl_promptmanifestchange/2,
         pcl_confirml0complete/4,
@@ -281,9 +280,6 @@ pcl_checksequencenumber(Pid, Key, SQN) ->
             gen_server:call(Pid, {check_sqn, Key, Hash, SQN}, infinity)
     end.
 
-pcl_checksequencenumber(Pid, Key, Hash, SQN) ->
-    gen_server:call(Pid, {check_sqn, Key, Hash, SQN}, infinity).
-
 pcl_workforclerk(Pid) ->
     gen_server:call(Pid, work_for_clerk, infinity).
 
@@ -686,13 +682,7 @@ update_levelzero(L0Size, {PushedTree, MinSQN, MaxSQN},
                 _ ->
                     leveled_log:log_timer("P0031", [], SW),
                     UpdState
-            end;
-        
-        NewL0Size == L0Size ->
-            leveled_log:log_timer("P0031", [], SW),
-            State#state{levelzero_cache=L0Cache,
-                        levelzero_size=L0Size,
-                        ledger_sqn=LedgerSQN}
+            end
     end.
 
 
diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl
index 387828f..759b5cb 100644
--- a/src/leveled_sst.erl
+++ b/src/leveled_sst.erl
@@ -532,15 +532,8 @@ read_file(Filename, State) ->
     {Handle, SummaryBin} = open_reader(Filename),
     {Summary, SlotList} = read_table_summary(SummaryBin),
     SlotCount = length(SlotList),
-    UpdState = 
-        case State#state.blockindex_cache of 
-            undefined ->
-                BlockIndexCache = array:new([{size, SlotCount}, 
-                                                {default, none}]),
-                State#state{blockindex_cache = BlockIndexCache};
-            _ ->
-                State
-        end,
+    BlockIndexCache = array:new([{size, SlotCount}, {default, none}]),
+    UpdState = State#state{blockindex_cache = BlockIndexCache},
     SlotIndex = from_list(SlotList),
     UpdSummary = Summary#summary{index = SlotIndex},
     leveled_log:log("SST03", [Filename,
@@ -1231,14 +1224,8 @@ generate_randomkeys(Seqn, Count, BucketRangeLow, BucketRangeHigh) ->
 generate_randomkeys(_Seqn, 0, Acc, _BucketLow, _BucketHigh) ->
     Acc;
 generate_randomkeys(Seqn, Count, Acc, BucketLow, BRange) ->
-    BNumber =
-        case BRange of
-            0 ->
-                string:right(integer_to_list(BucketLow), 4, $0);
-            _ ->
-                BRand = random:uniform(BRange),
-                string:right(integer_to_list(BucketLow + BRand), 4, $0)
-        end,
+    BRand = random:uniform(BRange),
+    BNumber = string:right(integer_to_list(BucketLow + BRand), 4, $0),
     KNumber = string:right(integer_to_list(random:uniform(1000)), 6, $0),
     LedgerKey = leveled_codec:to_ledgerkey("Bucket" ++ BNumber,
                                             "Key" ++ KNumber,
@@ -1334,6 +1321,20 @@ indexed_list_mixedkeys_test() ->
     test_binary_slot(FullBin, TestK4, MH4, {TestK4, TestV4}),
     test_binary_slot(FullBin, TestK5, MH5, {TestK5, TestV5}).
 
+indexed_list_mixedkeys2_test() ->
+    KVL0 = lists:ukeysort(1, generate_randomkeys(1, 50, 1, 4)),
+    KVL1 = lists:sublist(KVL0, 33),
+    IdxKeys1 = lists:ukeysort(1, generate_indexkeys(30)),
+    IdxKeys2 = lists:ukeysort(1, generate_indexkeys(30)),
+    % this isn't actually ordered correctly
+    Keys = IdxKeys1 ++ KVL1 ++ IdxKeys2,
+    {_PosBinIndex1, FullBin} = generate_binary_slot(Keys),
+    lists:foreach(fun({K, V}) ->
+                        MH = leveled_codec:magic_hash(K),
+                        test_binary_slot(FullBin, K, MH, {K, V})
+                        end,
+                    KVL1).
+
 indexed_list_allindexkeys_test() ->
     Keys = lists:sublist(lists:ukeysort(1, generate_indexkeys(150)), 128),
     {PosBinIndex1, FullBin} = generate_binary_slot(Keys),
@@ -1387,7 +1388,6 @@ indexed_list_mixedkeys_bitflip_test() ->
     KVL0 = lists:ukeysort(1, generate_randomkeys(1, 50, 1, 4)),
     KVL1 = lists:sublist(KVL0, 33),
     Keys = lists:ukeysort(1, generate_indexkeys(60) ++ KVL1),
-
     {_PosBinIndex1, FullBin} = generate_binary_slot(Keys),
     L = byte_size(FullBin),
     Byte1 = random:uniform(L),
@@ -1473,6 +1473,48 @@ merge_test() ->
     ok = file:delete("../test/level2_merge.sst").
     
 
+simple_persisted_range_test() ->
+    Filename = "../test/simple_test",
+    KVList0 = generate_randomkeys(1, ?SLOT_SIZE * 16, 1, 20),
+    KVList1 = lists:ukeysort(1, KVList0),
+    [{FirstKey, _FV}|_Rest] = KVList1,
+    {LastKey, _LV} = lists:last(KVList1),
+    {ok, Pid, {FirstKey, LastKey}} = sst_new(Filename,
+                                                1,
+                                                KVList1,
+                                                length(KVList1)),
+    
+    {o, B, K, null} = LastKey,
+    SK1 = {o, B, K, 0},
+    EK1 = {o, B, K, 1},
+    FetchListA1 = sst_getkvrange(Pid, SK1, EK1, 1),
+    ?assertMatch([], FetchListA1),
+    
+    SK2 = element(1, lists:nth(127, KVList1)),
+    SK3 = element(1, lists:nth(128, KVList1)),
+    SK4 = element(1, lists:nth(129, KVList1)),
+    SK5 = element(1, lists:nth(130, KVList1)),
+    
+    EK2 = element(1, lists:nth(255, KVList1)),
+    EK3 = element(1, lists:nth(256, KVList1)),
+    EK4 = element(1, lists:nth(257, KVList1)),
+    EK5 = element(1, lists:nth(258, KVList1)),
+    
+    TestFun =
+        fun({SK, EK}) ->
+            FetchList = sst_getkvrange(Pid, SK, EK, 4),
+            ?assertMatch(SK, element(1, lists:nth(1, FetchList))),
+            ?assertMatch(EK, element(1, lists:last(FetchList)))
+        end,
+    
+    TL2 = lists:map(fun(EK) -> {SK2, EK} end, [EK2, EK3, EK4, EK5]),
+    TL3 = lists:map(fun(EK) -> {SK3, EK} end, [EK2, EK3, EK4, EK5]),
+    TL4 = lists:map(fun(EK) -> {SK4, EK} end, [EK2, EK3, EK4, EK5]),
+    TL5 = lists:map(fun(EK) -> {SK5, EK} end, [EK2, EK3, EK4, EK5]),
+    lists:foreach(TestFun, TL2 ++ TL3 ++ TL4 ++ TL5).                
+    
+    
+
 simple_persisted_test() ->
     Filename = "../test/simple_test",
     KVList0 = generate_randomkeys(1, ?SLOT_SIZE * 32, 1, 20),
@@ -1635,6 +1677,15 @@ key_dominates_test() ->
     ?assertMatch({skipped_key, KL2, [KV2]},
                     key_dominates([KV7|KL2], [KV2], {true, 1})).
 
-
+nonsense_coverage_test() ->
+    {ok, Pid} = gen_fsm:start(?MODULE, [], []),
+    ok = gen_fsm:send_all_state_event(Pid, nonsense),
+    ?assertMatch({next_state, reader, #state{}}, handle_info(nonsense,
+                                                                reader,
+                                                                #state{})),
+    ?assertMatch({ok, reader, #state{}}, code_change(nonsense,
+                                                        reader,
+                                                        #state{},
+                                                        nonsense)).
 
 -endif.
\ No newline at end of file