diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl index 90a0b2a..d3c3f1f 100644 --- a/src/leveled_bookie.erl +++ b/src/leveled_bookie.erl @@ -1186,7 +1186,10 @@ hashtree_query_test() -> {hashtree_query, ?STD_TAG, false}), - ?assertMatch(KeyHashList, HTFolder2()), + L0 = length(KeyHashList), + HTR2 = HTFolder2(), + ?assertMatch(L0, length(HTR2)), + ?assertMatch(KeyHashList, HTR2), ok = book_close(Bookie2), reset_filestructure(). diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl index b27f5b9..ab9ee27 100644 --- a/src/leveled_codec.erl +++ b/src/leveled_codec.erl @@ -37,7 +37,6 @@ strip_to_keyonly/1, strip_to_seqonly/1, strip_to_statusonly/1, - strip_to_keyseqstatusonly/1, strip_to_keyseqonly/1, strip_to_seqnhashonly/1, striphead_to_details/1, @@ -80,8 +79,6 @@ magic_hash({?RIAK_TAG, Bucket, Key, _SubKey}) -> magic_hash({Bucket, Key}); magic_hash({?STD_TAG, Bucket, Key, _SubKey}) -> magic_hash({Bucket, Key}); -magic_hash({?IDX_TAG, _B, _Idx, _Key}) -> - no_lookup; magic_hash(AnyKey) -> BK = term_to_binary(AnyKey), H = 5381, @@ -111,11 +108,8 @@ inker_reload_strategy(AltList) -> ReloadStrategy0, AltList). -strip_to_keyonly({keyonly, K}) -> K; strip_to_keyonly({K, _V}) -> K. -strip_to_keyseqstatusonly({K, {SeqN, St, _, _MD}}) -> {K, SeqN, St}. - strip_to_statusonly({_, {_, St, _, _}}) -> St. strip_to_seqonly({_, {SeqN, _, _, _}}) -> SeqN. diff --git a/src/leveled_log.erl b/src/leveled_log.erl index e06118b..a8c94d9 100644 --- a/src/leveled_log.erl +++ b/src/leveled_log.erl @@ -11,13 +11,15 @@ log_timer/3, put_timing/4, head_timing/4, - get_timing/3]). + get_timing/3, + sst_timing/3]). --define(PUT_TIMING_LOGPOINT, 20000). --define(HEAD_TIMING_LOGPOINT, 160000). --define(GET_TIMING_LOGPOINT, 160000). +-define(PUT_LOGPOINT, 20000). +-define(HEAD_LOGPOINT, 160000). +-define(GET_LOGPOINT, 160000). +-define(SST_LOGPOINT, 20000). -define(LOG_LEVEL, [info, warn, error, critical]). --define(SAMPLE_RATE, 16#F). +-define(SAMPLE_RATE, 16). 
-define(LOGBASE, dict:from_list([ @@ -94,7 +96,7 @@ {info, "Response to push_mem of ~w with " ++ "L0 pending ~w and merge backlog ~w"}}, {"P0019", - {info, "Rolling level zero to filename ~s"}}, + {info, "Rolling level zero to filename ~s at ledger sqn ~w"}}, {"P0020", {info, "Work at Level ~w to be scheduled for ~w with ~w " ++ "queue items outstanding at all levels"}}, @@ -150,8 +152,6 @@ {info, "File to be created as part of MSN=~w Filename=~s"}}, {"PC013", {warn, "Merge resulted in empty file ~s"}}, - {"PC014", - {info, "Empty file ~s to be cleared"}}, {"PC015", {info, "File created"}}, {"PC016", @@ -230,35 +230,26 @@ {"PM002", {info, "Completed dump of L0 cache to list of size ~w"}}, - - {"SFT01", - {info, "Opened filename with name ~s"}}, - {"SFT02", - {info, "File ~s has been set for delete"}}, - {"SFT03", - {info, "File creation of L0 file ~s"}}, - {"SFT04", - {debug, "File ~s prompting for delete status check"}}, - {"SFT05", + {"SST01", + {info, "SST timing for result ~w is sample ~w total ~w and max ~w"}}, + {"SST02", + {error, "False result returned from SST with filename ~s as " + ++ "slot ~w has failed crc check"}}, + {"SST03", + {info, "Opening SST file with filename ~s keys ~w slots ~w and" + ++ " max sqn ~w"}}, + {"SST04", {info, "Exit called for reason ~w on filename ~s"}}, - {"SFT06", - {info, "Exit called and now clearing ~s"}}, - {"SFT07", - {info, "Creating file with input of size ~w"}}, - {"SFT08", - {info, "Renaming file from ~s to ~s"}}, - {"SFT09", - {warn, "Filename ~s already exists"}}, - {"SFT10", + {"SST05", {warn, "Rename rogue filename ~s to ~s"}}, - {"SFT11", - {error, "Segment filter failed due to ~s"}}, - {"SFT12", - {error, "Segment filter failed due to CRC check ~w did not match ~w"}}, - {"SFT13", - {error, "Segment filter failed due to ~s"}}, - {"SFT14", - {debug, "Range fetch from SFT PID ~w"}}, + {"SST06", + {info, "File ~s has been set for delete"}}, + {"SST07", + {info, "Exit called and now clearing ~s"}}, + {"SST08", + {info, "Completed creation of ~s at level ~w with max sqn ~w"}}, + {"SST09", + {warn, "Read request exposes slot with bad CRC"}}, {"CDB01", {info, "Opening file for writing with filename ~s"}}, @@ -333,14 +324,13 @@ log_timer(LogReference, Subs, StartTime) -> end. %% Make a log of put timings split out by actor - one log for every -%% PUT_TIMING_LOGPOINT puts +%% PUT_LOGPOINT puts put_timing(_Actor, undefined, T0, T1) -> {1, {T0, T1}, {T0, T1}}; -put_timing(Actor, {?PUT_TIMING_LOGPOINT, {Total0, Total1}, {Max0, Max1}}, - T0, T1) -> - RN = random:uniform(?HEAD_TIMING_LOGPOINT), - case RN > ?HEAD_TIMING_LOGPOINT div 2 of +put_timing(Actor, {?PUT_LOGPOINT, {Total0, Total1}, {Max0, Max1}}, T0, T1) -> + RN = random:uniform(?HEAD_LOGPOINT), + case RN > ?HEAD_LOGPOINT div 2 of true -> % log at the timing point less than half the time LogRef = @@ -349,7 +339,7 @@ put_timing(Actor, {?PUT_TIMING_LOGPOINT, {Total0, Total1}, {Max0, Max1}}, inker -> "I0019"; journal -> "CDB17" end, - log(LogRef, [?PUT_TIMING_LOGPOINT, Total0, Total1, Max0, Max1]), + log(LogRef, [?PUT_LOGPOINT, Total0, Total1, Max0, Max1]), put_timing(Actor, undefined, T0, T1); false -> % Log some other random time @@ -359,13 +349,13 @@ put_timing(_Actor, {N, {Total0, Total1}, {Max0, Max1}}, T0, T1) -> {N + 1, {Total0 + T0, Total1 + T1}, {max(Max0, T0), max(Max1, T1)}}. 
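The put timing accumulator threads a {Count, {TotalA, TotalB}, {MaxA, MaxB}} tuple through the calling process's state, starting from undefined. A minimal sketch of how one of the named actors (here the bookie, whose log reference is "B0012") might thread it through a loop; do_put/1 and the put_timing state field are hypothetical stand-ins:

```erlang
%% Hedged sketch - do_put/1 and the put_timing field are illustrative.
%% T0 and T1 are the two timing components the actor measures (microseconds).
handle_put(Obj, State) ->
    {T0, T1} = do_put(Obj),
    %% put_timing/4 either accumulates, or at the PUT_LOGPOINT logs the
    %% totals/maxima (as "B0012" for the bookie) and restarts the count
    Timings = leveled_log:put_timing(bookie, State#state.put_timing, T0, T1),
    State#state{put_timing = Timings}.
```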
%% Make a log of penciller head timings split out by level and result - one -%% log for every HEAD_TIMING_LOGPOINT puts +%% log for every HEAD_LOGPOINT puts %% Returns a tuple of {Count, TimingDict} to be stored on the process state head_timing(undefined, SW, Level, R) -> T0 = timer:now_diff(os:timestamp(), SW), head_timing_int(undefined, T0, Level, R); head_timing({N, HeadTimingD}, SW, Level, R) -> - case N band ?SAMPLE_RATE of + case N band (?SAMPLE_RATE - 1) of 0 -> T0 = timer:now_diff(os:timestamp(), SW), head_timing_int({N, HeadTimingD}, T0, Level, R); @@ -384,9 +374,9 @@ head_timing_int(undefined, T0, Level, R) -> dict:store(K, [0, 0, 0], Acc) end end, {1, lists:foldl(NewDFun, dict:new(), head_keylist())}; -head_timing_int({?HEAD_TIMING_LOGPOINT, HeadTimingD}, T0, Level, R) -> - RN = random:uniform(?HEAD_TIMING_LOGPOINT), - case RN > ?HEAD_TIMING_LOGPOINT div 2 of +head_timing_int({?HEAD_LOGPOINT, HeadTimingD}, T0, Level, R) -> + RN = random:uniform(?HEAD_LOGPOINT), + case RN > ?HEAD_LOGPOINT div 2 of true -> % log at the timing point less than half the time LogFun = fun(K) -> log("P0032", [K|dict:fetch(K, HeadTimingD)]) end, @@ -419,21 +409,61 @@ head_keylist() -> [not_present, found_lower, found_0, found_1, found_2]. +sst_timing(undefined, SW, TimerType) -> + T0 = timer:now_diff(os:timestamp(), SW), + gen_timing_int(undefined, + T0, + TimerType, + fun sst_keylist/0, + ?SST_LOGPOINT, + "SST01"); +sst_timing({N, SSTTimerD}, SW, TimerType) -> + case N band (?SAMPLE_RATE - 1) of + 0 -> + T0 = timer:now_diff(os:timestamp(), SW), + gen_timing_int({N, SSTTimerD}, + T0, + TimerType, + fun sst_keylist/0, + ?SST_LOGPOINT, + "SST01"); + _ -> + % Not to be sampled this time + {N + 1, SSTTimerD} + end. + +sst_keylist() -> + [slot_bloom, slot_fetch]. + get_timing(undefined, SW, TimerType) -> T0 = timer:now_diff(os:timestamp(), SW), - get_timing_int(undefined, T0, TimerType); + gen_timing_int(undefined, + T0, + TimerType, + fun get_keylist/0, + ?GET_LOGPOINT, + "B0014"); get_timing({N, GetTimerD}, SW, TimerType) -> - case N band ?SAMPLE_RATE of + case N band (?SAMPLE_RATE - 1) of 0 -> T0 = timer:now_diff(os:timestamp(), SW), - get_timing_int({N, GetTimerD}, T0, TimerType); + gen_timing_int({N, GetTimerD}, + T0, + TimerType, + fun get_keylist/0, + ?GET_LOGPOINT, + "B0014"); _ -> % Not to be sampled this time {N + 1, GetTimerD} end. -get_timing_int(undefined, T0, TimerType) -> +get_keylist() -> + [head_not_present, head_found, fetch]. 
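Note why ?SAMPLE_RATE could move from 16#F to 16: the guard now masks with ?SAMPLE_RATE - 1, and for a power of two, N band (Rate - 1) equals N rem Rate, so one call in sixteen is sampled. An illustrative eunit check of that equivalence (not part of the patch):

```erlang
sample_rate_mask_test() ->
    %% with ?SAMPLE_RATE = 16 the mask is 2#1111, so N band (16 - 1) == 0
    %% holds for every 16th call
    Sampled = [N || N <- lists:seq(1, 64), N band (16 - 1) == 0],
    ?assertMatch([16, 32, 48, 64], Sampled),
    %% band with (Rate - 1) is rem Rate whenever Rate is a power of two
    true = lists:all(fun(N) -> N band (16 - 1) == N rem 16 end,
                     lists:seq(0, 1000)).
```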
+ + +gen_timing_int(undefined, T0, TimerType, KeyListFun, _LogPoint, _LogRef) -> NewDFun = fun(K, Acc) -> case K of TimerType -> @@ -441,31 +471,32 @@ get_timing_int(undefined, T0, TimerType) -> _ -> dict:store(K, [0, 0, 0], Acc) end end, - {1, lists:foldl(NewDFun, dict:new(), get_keylist())}; -get_timing_int({?GET_TIMING_LOGPOINT, GetTimerD}, T0, TimerType) -> - RN = random:uniform(?GET_TIMING_LOGPOINT), - case RN > ?GET_TIMING_LOGPOINT div 2 of + {1, lists:foldl(NewDFun, dict:new(), KeyListFun())}; +gen_timing_int({LogPoint, TimerD}, T0, TimerType, KeyListFun, LogPoint, + LogRef) -> + RN = random:uniform(LogPoint), + case RN > LogPoint div 2 of true -> % log at the timing point less than half the time - LogFun = fun(K) -> log("B0014", [K|dict:fetch(K, GetTimerD)]) end, - lists:foreach(LogFun, get_keylist()), - get_timing_int(undefined, T0, TimerType); + LogFun = fun(K) -> log(LogRef, [K|dict:fetch(K, TimerD)]) end, + lists:foreach(LogFun, KeyListFun()), + gen_timing_int(undefined, T0, TimerType, + KeyListFun, LogPoint, LogRef); false -> % Log some other time - reset to RN not 0 to stagger logs out over % time between the vnodes - get_timing_int({RN, GetTimerD}, T0, TimerType) + gen_timing_int({RN, TimerD}, T0, TimerType, + KeyListFun, LogPoint, LogRef) end; -get_timing_int({N, GetTimerD}, T0, TimerType) -> - [Count0, Total0, Max0] = dict:fetch(TimerType, GetTimerD), +gen_timing_int({N, TimerD}, T0, TimerType, _KeyListFun, _LogPoint, _LogRef) -> + [Count0, Total0, Max0] = dict:fetch(TimerType, TimerD), {N + 1, dict:store(TimerType, [Count0 + 1, Total0 + T0, max(Max0, T0)], - GetTimerD)}. + TimerD)}. -get_keylist() -> - [head_not_present, head_found, fetch]. %%%============================================================================ %%% Test diff --git a/src/leveled_pclerk.erl b/src/leveled_pclerk.erl index b5f8e3f..9ccc791 100644 --- a/src/leveled_pclerk.erl +++ b/src/leveled_pclerk.erl @@ -9,7 +9,7 @@ %% %% -------- COMMITTING MANIFEST CHANGES --------- %% -%% Once the Penciller has taken a manifest change, the SFT file owners which no +%% Once the Penciller has taken a manifest change, the SST file owners which no %% longer form part of the manifest will be marked for delete. By marking for %% deletion, the owners will poll to confirm when it is safe for them to be %% deleted. @@ -225,7 +225,7 @@ merge(WI) -> mark_for_delete([], _Penciller) -> ok; mark_for_delete([Head|Tail], Penciller) -> - ok = leveled_sft:sft_setfordelete(Head#manifest_entry.owner, Penciller), + ok = leveled_sst:sst_setfordelete(Head#manifest_entry.owner, Penciller), mark_for_delete(Tail, Penciller). @@ -268,13 +268,13 @@ select_filetomerge(SrcLevel, Manifest) -> -%% Assumption is that there is a single SFT from a higher level that needs -%% to be merged into multiple SFTs at a lower level. This should create an -%% entirely new set of SFTs, and the calling process can then update the +%% Assumption is that there is a single SST from a higher level that needs +%% to be merged into multiple SSTs at a lower level. This should create an +%% entirely new set of SSTs, and the calling process can then update the %% manifest. %% %% Once the FileToMerge has been emptied, the remainder of the candidate list -%% needs to be placed in a remainder SFT that may be of a sub-optimal (small) +%% needs to be placed in a remainder SST that may be of a sub-optimal (small) %% size. This stops the need to perpetually roll over the whole level if the %% level consists of already full files. 
Some smartness may be required when %% selecting the candidate list so that small files just outside the candidate @@ -293,49 +293,40 @@ perform_merge({SrcPid, SrcFN}, CandidateList, LevelInfo, {Filepath, MSN}) -> PointerList = lists:map(fun(P) -> {next, P#manifest_entry.owner, all} end, CandidateList), + MaxSQN = leveled_sst:sst_getmaxsequencenumber(SrcPid), do_merge([{next, SrcPid, all}], PointerList, LevelInfo, {Filepath, MSN}, + MaxSQN, 0, []). -do_merge([], [], {SrcLevel, _IsB}, {_Filepath, MSN}, FileCounter, OutList) -> +do_merge([], [], {SrcLevel, _IsB}, {_Filepath, MSN}, _MaxSQN, + FileCounter, OutList) -> leveled_log:log("PC011", [MSN, SrcLevel, FileCounter]), OutList; -do_merge(KL1, KL2, {SrcLevel, IsB}, {Filepath, MSN}, FileCounter, OutList) -> - FileName = lists:flatten(io_lib:format(Filepath ++ "_~w_~w.sft", +do_merge(KL1, KL2, {SrcLevel, IsB}, {Filepath, MSN}, MaxSQN, + FileCounter, OutList) -> + FileName = lists:flatten(io_lib:format(Filepath ++ "_~w_~w.sst", [SrcLevel + 1, FileCounter])), leveled_log:log("PC012", [MSN, FileName]), TS1 = os:timestamp(), - LevelR = case IsB of - true -> - #level{level = SrcLevel + 1, - is_basement = true, - timestamp = leveled_codec:integer_now()}; - false -> - SrcLevel + 1 - end, - {ok, Pid, Reply} = leveled_sft:sft_new(FileName, - KL1, - KL2, - LevelR), - case Reply of - {{[], []}, null, _} -> + case leveled_sst:sst_new(FileName, KL1, KL2, IsB, SrcLevel + 1, MaxSQN) of + empty -> leveled_log:log("PC013", [FileName]), - leveled_log:log("PC014", [FileName]), - ok = leveled_sft:sft_clear(Pid), OutList; - {{KL1Rem, KL2Rem}, SmallestKey, HighestKey} -> - ExtMan = lists:append(OutList, - [#manifest_entry{start_key=SmallestKey, - end_key=HighestKey, - owner=Pid, - filename=FileName}]), - leveled_log:log_timer("PC015", [], TS1), - do_merge(KL1Rem, KL2Rem, - {SrcLevel, IsB}, {Filepath, MSN}, - FileCounter + 1, ExtMan) + {ok, Pid, Reply} -> + {{KL1Rem, KL2Rem}, SmallestKey, HighestKey} = Reply, + ExtMan = lists:append(OutList, + [#manifest_entry{start_key=SmallestKey, + end_key=HighestKey, + owner=Pid, + filename=FileName}]), + leveled_log:log_timer("PC015", [], TS1), + do_merge(KL1Rem, KL2Rem, + {SrcLevel, IsB}, {Filepath, MSN}, MaxSQN, + FileCounter + 1, ExtMan) end. 
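The merge output files are now named for their destination level and a per-merge file counter, with the .sst extension. A quick shell illustration of the format string used above (the Filepath root here is hypothetical):

```erlang
1> Filepath = "ledger_files/nonzero_12".   % illustrative filepath root
2> lists:flatten(io_lib:format(Filepath ++ "_~w_~w.sst", [2, 0])).
"ledger_files/nonzero_12_2_0.sst"
```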
@@ -384,7 +375,7 @@ find_randomkeys(FList, Count, Source) -> KV1 = lists:nth(random:uniform(length(Source)), Source), K1 = leveled_codec:strip_to_keyonly(KV1), P1 = choose_pid_toquery(FList, K1), - FoundKV = leveled_sft:sft_get(P1, K1), + FoundKV = leveled_sst:sst_get(P1, K1), Found = leveled_codec:strip_to_keyonly(FoundKV), io:format("success finding ~w in ~w~n", [K1, P1]), ?assertMatch(K1, Found), @@ -393,21 +384,31 @@ merge_file_test() -> KL1_L1 = lists:sort(generate_randomkeys(8000, 0, 1000)), - {ok, PidL1_1, _} = leveled_sft:sft_new("../test/KL1_L1.sft", - KL1_L1, [], 1), + {ok, PidL1_1, _} = leveled_sst:sst_new("../test/KL1_L1.sst", + 1, + KL1_L1, + undefined), KL1_L2 = lists:sort(generate_randomkeys(8000, 0, 250)), - {ok, PidL2_1, _} = leveled_sft:sft_new("../test/KL1_L2.sft", - KL1_L2, [], 2), + {ok, PidL2_1, _} = leveled_sst:sst_new("../test/KL1_L2.sst", + 2, + KL1_L2, + undefined), KL2_L2 = lists:sort(generate_randomkeys(8000, 250, 250)), - {ok, PidL2_2, _} = leveled_sft:sft_new("../test/KL2_L2.sft", - KL2_L2, [], 2), + {ok, PidL2_2, _} = leveled_sst:sst_new("../test/KL2_L2.sst", + 2, + KL2_L2, + undefined), KL3_L2 = lists:sort(generate_randomkeys(8000, 500, 250)), - {ok, PidL2_3, _} = leveled_sft:sft_new("../test/KL3_L2.sft", - KL3_L2, [], 2), + {ok, PidL2_3, _} = leveled_sst:sst_new("../test/KL3_L2.sst", + 2, + KL3_L2, + undefined), KL4_L2 = lists:sort(generate_randomkeys(8000, 750, 250)), - {ok, PidL2_4, _} = leveled_sft:sft_new("../test/KL4_L2.sft", - KL4_L2, [], 2), - Result = perform_merge({PidL1_1, "../test/KL1_L1.sft"}, + {ok, PidL2_4, _} = leveled_sst:sst_new("../test/KL4_L2.sst", + 2, + KL4_L2, + undefined), + Result = perform_merge({PidL1_1, "../test/KL1_L1.sst"}, [#manifest_entry{owner=PidL2_1}, #manifest_entry{owner=PidL2_2}, #manifest_entry{owner=PidL2_3}, @@ -429,13 +430,13 @@ merge_file_test() -> ok = find_randomkeys(Result, 50, KL3_L2), io:format("Finding keys in KL4_L2~n"), ok = find_randomkeys(Result, 50, KL4_L2), - leveled_sft:sft_clear(PidL1_1), - leveled_sft:sft_clear(PidL2_1), - leveled_sft:sft_clear(PidL2_2), - leveled_sft:sft_clear(PidL2_3), - leveled_sft:sft_clear(PidL2_4), + leveled_sst:sst_clear(PidL1_1), + leveled_sst:sst_clear(PidL2_1), + leveled_sst:sst_clear(PidL2_2), + leveled_sst:sst_clear(PidL2_3), + leveled_sst:sst_clear(PidL2_4), lists:foreach(fun(ManEntry) -> - leveled_sft:sft_clear(ManEntry#manifest_entry.owner) end, + leveled_sst:sst_clear(ManEntry#manifest_entry.owner) end, Result). select_merge_candidates_test() -> diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl index 0de9b2b..853498b 100644 --- a/src/leveled_penciller.erl +++ b/src/leveled_penciller.erl @@ -22,17 +22,17 @@ %% %% The Ledger is divided into many levels %% - L0: New keys are received from the Bookie and merged into a single -%% gb_tree, until that tree is the size of a SFT file, and it is then persisted -%% as a SFT file at this level. L0 SFT files can be larger than the normal +%% gb_tree, until that tree is the size of an SST file, and it is then persisted +%% as an SST file at this level. L0 SST files can be larger than the normal %% maximum size - so we don't have to consider problems of either having more %% than one L0 file (and handling what happens on a crash between writing the %% files when the second may have overlapping sequence numbers), or having a %% remainder with overlapping sequence numbers in memory after the file is %% written. Once the persistence is completed, the L0 tree can be erased.
-%% There can be only one SFT file at Level 0, so the work to merge that file +%% There can be only one SST file at Level 0, so the work to merge that file %% to the lower level must be the highest priority, as otherwise writes to the %% ledger will stall, when there is next a need to persist. -%% - L1 TO L7: May contain multiple processes managing non-overlapping sft +%% - L1 TO L7: May contain multiple processes managing non-overlapping SST %% files. Compaction work should be scheduled if the number of files exceeds %% the target size of the level, where the target size is 8 ^ n. %% @@ -67,14 +67,14 @@ %% completed to merge the tree into the L0 tree. %% %% The Penciller MUST NOT accept a new PUSH if the Clerk has commenced the -%% conversion of the current L0 tree into a SFT file, but not completed this +%% conversion of the current L0 tree into an SST file, but not completed this %% change. The Penciller in this case returns the push, and the Bookie should %% continue to grow the cache before trying again. %% %% ---------- FETCH ---------- %% %% On request to fetch a key the Penciller should look first in the in-memory -%% L0 tree, then look in the SFT files Level by Level (including level 0), +%% L0 tree, then look in the SST files Level by Level (including level 0), %% consulting the Manifest to determine which file should be checked at each %% level. %% @@ -82,16 +82,16 @@ %% %% Iterators may request a snapshot of the database. A snapshot is a cloned %% Penciller seeded not from disk, but by the in-memory L0 gb_tree and the -%% in-memory manifest, allowing for direct reference for the SFT file processes. +%% in-memory manifest, allowing for direct reference for the SST file processes. %% %% Clones formed to support snapshots are registered by the Penciller, so that -%% SFT files valid at the point of the snapshot until either the iterator is +%% SST files valid at the point of the snapshot remain until either the iterator is %% completed or has timed out. %% %% ---------- ON STARTUP ---------- %% %% On Startup the Bookie will ask the Penciller to initiate the Ledger first. -%% To initiate the Ledger the must consult the manifest, and then start a SFT +%% To initiate the Ledger it must consult the manifest, and then start an SST %% management process for each file in the manifest. %% %% The penciller should then try and read any Level 0 file which has the @@ -103,14 +103,14 @@ %% ---------- ON SHUTDOWN ---------- %% %% On a controlled shutdown the Penciller should attempt to write any in-memory -%% ETS table to a L0 SFT file, assuming one is nto already pending. If one is +%% ETS table to an L0 SST file, assuming one is not already pending. If one is %% already pending then the Penciller will not persist this part of the Ledger. %% %% ---------- FOLDER STRUCTURE ---------- %% %% The following folders are used by the Penciller %% $ROOT/ledger/ledger_manifest/ - used for keeping manifest files -%% $ROOT/ledger/ledger_files/ - containing individual SFT files +%% $ROOT/ledger/ledger_files/ - containing individual SST files %% %% In larger stores there could be a large number of files in the ledger_file %% folder - perhaps o(1000). It is assumed that modern file systems should @@ -120,7 +120,7 @@ %% %% The Penciller can have one and only one Clerk for performing compaction %% work.
When the Clerk has requested and taken work, it should perform the -%5 compaction work starting the new SFT process to manage the new Ledger state +%% compaction work starting the new SST process to manage the new Ledger state %% and then write a new manifest file that represents that state using %% the next Manifest sequence number as the filename: %% - nonzero_.pnd @@ -130,14 +130,14 @@ %% %% On startup, the Penciller should look for the nonzero_*.crr file with the %% highest such manifest sequence number. This will be started as the -%% manifest, together with any _0_0.sft file found at that Manifest SQN. +%% manifest, together with any _0_0.sst file found at that Manifest SQN. %% Level zero files are not kept in the persisted manifest, and adding an L0 %% file does not advance the Manifest SQN. %% %% The pace at which the store can accept updates will be dependent on the %% speed at which the Penciller's Clerk can merge files at lower levels plus %% the time it takes to merge from Level 0. If a clerk has commenced -%% compaction work at a lower level and then immediately a L0 SFT file is +%% compaction work at a lower level and then immediately an L0 SST file is %% written, the Penciller will need to wait for this compaction work to %% complete and the L0 file to be compacted before the ETS table can be %% allowed to again reach capacity. @@ -145,7 +145,7 @@ %% The writing of L0 files does not require the involvement of the clerk. %% The L0 files are prompted directly by the penciller when the in-memory tree %% has reached capacity. This places the penciller in a levelzero_pending -%% state, and in this state it must return new pushes. Once the SFT file has +%% state, and in this state it must return new pushes. Once the SST file has %% been completed it will confirm completion to the penciller which can then %% revert the levelzero_pending state, add the file to the manifest and clear %% the current level zero in-memory view. @@ -172,7 +172,6 @@ pcl_fetchkeys/5, pcl_fetchnextkey/5, pcl_checksequencenumber/3, - pcl_checksequencenumber/4, pcl_workforclerk/1, pcl_promptmanifestchange/2, pcl_confirml0complete/4, @@ -203,6 +202,7 @@ -define(WORKQUEUE_BACKLOG_TOLERANCE, 4). -define(COIN_SIDECOUNT, 5). -define(SLOW_FETCH, 20000). +-define(ITERATOR_SCANWIDTH, 4). -record(state, {manifest = [] :: list(), manifest_sqn = 0 :: integer(), @@ -280,9 +280,6 @@ pcl_checksequencenumber(Pid, Key, SQN) -> gen_server:call(Pid, {check_sqn, Key, Hash, SQN}, infinity) end. -pcl_checksequencenumber(Pid, Key, Hash, SQN) -> - gen_server:call(Pid, {check_sqn, Key, Hash, SQN}, infinity). - pcl_workforclerk(Pid) -> gen_server:call(Pid, work_for_clerk, infinity).
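With pcl_checksequencenumber/4 removed, callers can no longer pass a precomputed hash; the surviving /3 form (whose head is visible in the hunk context above) derives the hash itself before making the {check_sqn, Key, Hash, SQN} call. A hedged usage sketch, with the snapshot pid and key purely illustrative:

```erlang
%% callers now supply only Key and SQN; the hash is derived internally
Key = {o, "Bucket0001", "Key0001", null},
true = pcl_checksequencenumber(PclSnap, Key, 4005).
```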
@@ -399,10 +396,10 @@ handle_call({fetch_keys, StartKey, EndKey, AccFun, InitAcc, MaxKeys}, List -> List end, - SFTiter = initiate_rangequery_frommanifest(StartKey, + SSTiter = initiate_rangequery_frommanifest(StartKey, EndKey, State#state.manifest), - Acc = keyfolder({L0AsList, SFTiter}, + Acc = keyfolder({L0AsList, SSTiter}, {StartKey, EndKey}, {AccFun, InitAcc}, MaxKeys), @@ -456,7 +453,7 @@ handle_cast({confirm_delete, FileName}, State=#state{is_snapshot=Snap}) {true, Pid} -> UF1 = lists:keydelete(FileName, 1, State#state.unreferenced_files), leveled_log:log("P0005", [FileName]), - ok = leveled_sft:sft_deleteconfirmed(Pid), + ok = leveled_sst:sst_deleteconfirmed(Pid), {noreply, State#state{unreferenced_files=UF1}}; _ -> {noreply, State} @@ -525,7 +522,7 @@ terminate(Reason, State) -> leveled_log:log("P0009", []); {false, [], _N} -> L0Pid = roll_memory(UpdState, true), - ok = leveled_sft:sft_close(L0Pid); + ok = leveled_sst:sst_close(L0Pid); StatusTuple -> leveled_log:log("P0010", [StatusTuple]) end, @@ -533,7 +530,7 @@ terminate(Reason, State) -> % Tidy shutdown of individual files ok = close_files(0, UpdState#state.manifest), lists:foreach(fun({_FN, Pid, _SN}) -> - ok = leveled_sft:sft_close(Pid) end, + ok = leveled_sst:sst_close(Pid) end, UpdState#state.unreferenced_files), leveled_log:log("P0011", []), ok. @@ -571,8 +568,11 @@ start_from_file(PCLopts) -> levelzero_index=leveled_pmem:new_index()}, %% Open manifest - ManifestPath = InitState#state.root_path ++ "/" ++ ?MANIFEST_FP ++ "/", + ManifestPath = filepath(InitState#state.root_path, manifest) ++ "/", + SSTPath = filepath(InitState#state.root_path, files) ++ "/", ok = filelib:ensure_dir(ManifestPath), + ok = filelib:ensure_dir(SSTPath), + {ok, Filenames} = file:list_dir(ManifestPath), CurrRegex = "nonzero_(?<MSN>[0-9]+)\\." ++ ?CURRENT_FILEX, ValidManSQNs = lists:foldl(fun(FN, Acc) -> @@ -608,14 +608,14 @@ start_from_file(PCLopts) -> leveled_log:log("P0014", [MaxSQN]), %% Find any L0 files - L0FN = filepath(RootPath, TopManSQN, new_merge_files) ++ "_0_0.sft", + L0FN = filepath(RootPath, TopManSQN, new_merge_files) ++ "_0_0.sst", case filelib:is_file(L0FN) of true -> leveled_log:log("P0015", [L0FN]), {ok, L0Pid, - {L0StartKey, L0EndKey}} = leveled_sft:sft_open(L0FN), - L0SQN = leveled_sft:sft_getmaxsequencenumber(L0Pid), + {L0StartKey, L0EndKey}} = leveled_sst:sst_open(L0FN), + L0SQN = leveled_sst:sst_getmaxsequencenumber(L0Pid), ManifestEntry = #manifest_entry{start_key=L0StartKey, end_key=L0EndKey, owner=L0Pid, @@ -682,13 +682,7 @@ update_levelzero(L0Size, {PushedTree, MinSQN, MaxSQN}, _ -> leveled_log:log_timer("P0031", [], SW), UpdState - end; - - NewL0Size == L0Size -> - leveled_log:log_timer("P0031", [], SW), - State#state{levelzero_cache=L0Cache, - levelzero_size=L0Size, - ledger_sqn=LedgerSQN} + end end. @@ -696,7 +690,7 @@ update_levelzero(L0Size, {PushedTree, MinSQN, MaxSQN}, %% to an immediate return as expected. With 32K keys in the TreeList it could %% take around 35-40ms. %% -%% To avoid blocking this gen_server, the SFT file can request each item of the +%% To avoid blocking this gen_server, the SST file can request each item of the %% cache one at a time.
%% %% The Wait is set to false to use a cast when calling this in normal operation @@ -704,25 +698,22 @@ update_levelzero(L0Size, {PushedTree, MinSQN, MaxSQN}, roll_memory(State, false) -> FileName = levelzero_filename(State), - leveled_log:log("P0019", [FileName]), - Opts = #sft_options{wait=false, penciller=self()}, + leveled_log:log("P0019", [FileName, State#state.ledger_sqn]), PCL = self(), FetchFun = fun(Slot) -> pcl_fetchlevelzero(PCL, Slot) end, - % FetchFun = fun(Slot) -> lists:nth(Slot, State#state.levelzero_cache) end, - R = leveled_sft:sft_newfroml0cache(FileName, + R = leveled_sst:sst_newlevelzero(FileName, length(State#state.levelzero_cache), FetchFun, - Opts), + PCL, + State#state.ledger_sqn), {ok, Constructor, _} = R, Constructor; roll_memory(State, true) -> FileName = levelzero_filename(State), - Opts = #sft_options{wait=true}, FetchFun = fun(Slot) -> lists:nth(Slot, State#state.levelzero_cache) end, - R = leveled_sft:sft_newfroml0cache(FileName, - length(State#state.levelzero_cache), - FetchFun, - Opts), + KVList = leveled_pmem:to_list(length(State#state.levelzero_cache), + FetchFun), + R = leveled_sst:sst_new(FileName, 0, KVList, State#state.ledger_sqn), {ok, Constructor, _} = R, Constructor. @@ -753,7 +744,7 @@ fetch_mem(Key, Hash, Manifest, L0Cache, none) -> L0Check = leveled_pmem:check_levelzero(Key, Hash, L0Cache), case L0Check of {false, not_found} -> - fetch(Key, Hash, Manifest, 0, fun timed_sft_get/3); + fetch(Key, Hash, Manifest, 0, fun timed_sst_get/3); {true, KV} -> {KV, 0} end; @@ -762,7 +753,7 @@ fetch_mem(Key, Hash, Manifest, L0Cache, L0Index) -> true -> fetch_mem(Key, Hash, Manifest, L0Cache, none); false -> - fetch(Key, Hash, Manifest, 0, fun timed_sft_get/3) + fetch(Key, Hash, Manifest, 0, fun timed_sst_get/3) end. fetch(_Key, _Hash, _Manifest, ?MAX_LEVELS + 1, _FetchFun) -> @@ -791,9 +782,9 @@ fetch(Key, Hash, Manifest, Level, FetchFun) -> end end. -timed_sft_get(PID, Key, Hash) -> +timed_sst_get(PID, Key, Hash) -> SW = os:timestamp(), - R = leveled_sft:sft_get(PID, Key, Hash), + R = leveled_sst:sst_get(PID, Key, Hash), T0 = timer:now_diff(os:timestamp(), SW), case {T0, R} of {T, R} when T < ?SLOW_FETCH -> @@ -880,7 +871,7 @@ close_files(?MAX_LEVELS - 1, _Manifest) -> close_files(Level, Manifest) -> LevelList = get_item(Level, Manifest, []), lists:foreach(fun(F) -> - ok = leveled_sft:sft_close(F#manifest_entry.owner) end, + ok = leveled_sst:sst_close(F#manifest_entry.owner) end, LevelList), close_files(Level + 1, Manifest). @@ -897,8 +888,8 @@ open_all_filesinmanifest({Manifest, TopSQN}, Level) -> %% replace them LvlR = lists:foldl(fun(F, {FL, FL_SQN}) -> FN = F#manifest_entry.filename, - {ok, P, _Keys} = leveled_sft:sft_open(FN), - F_SQN = leveled_sft:sft_getmaxsequencenumber(P), + {ok, P, _Keys} = leveled_sst:sst_open(FN), + F_SQN = leveled_sst:sst_getmaxsequencenumber(P), {lists:append(FL, [F#manifest_entry{owner = P}]), max(FL_SQN, F_SQN)} @@ -932,24 +923,24 @@ initiate_rangequery_frommanifest(StartKey, EndKey, Manifest) -> C2 = leveled_codec:endkey_passed(EndKey, M#manifest_entry.start_key), not (C1 or C2) end, - lists:foldl(fun(L, AccL) -> - Level = get_item(L, Manifest, []), - FL = lists:foldl(fun(M, Acc) -> - case CompareFun(M) of - true -> - Acc ++ [{next_file, M}]; - false -> - Acc - end end, - [], - Level), - case FL of - [] -> AccL; - FL -> AccL ++ [{L, FL}] - end - end, - [], - lists:seq(0, ?MAX_LEVELS - 1)).
+ FoldFun = + fun(L, AccL) -> + Level = get_item(L, Manifest, []), + FL = lists:foldl(fun(M, Acc) -> + case CompareFun(M) of + true -> + Acc ++ [{next, M, StartKey}]; + false -> + Acc + end end, + [], + Level), + case FL of + [] -> AccL; + FL -> AccL ++ [{L, FL}] + end + end, + lists:foldl(FoldFun, [], lists:seq(0, ?MAX_LEVELS - 1)). %% Looks to find the best choice for the next key across the levels (other %% than in-memory table) @@ -960,22 +951,25 @@ find_nextkey(QueryArray, StartKey, EndKey) -> find_nextkey(QueryArray, 0, {null, null}, - {fun leveled_sft:sft_getkvrange/4, StartKey, EndKey, 1}). + StartKey, + EndKey, + ?ITERATOR_SCANWIDTH). -find_nextkey(_QueryArray, LCnt, {null, null}, _QueryFunT) +find_nextkey(_QueryArray, LCnt, {null, null}, _StartKey, _EndKey, _Width) when LCnt > ?MAX_LEVELS -> % The array has been scanned without finding a best key - must be % exhausted - respond to indicate no more keys to be found by the % iterator no_more_keys; -find_nextkey(QueryArray, LCnt, {BKL, BestKV}, _QueryFunT) +find_nextkey(QueryArray, LCnt, {BKL, BestKV}, _StartKey, _EndKey, _Width) when LCnt > ?MAX_LEVELS -> % All levels have been scanned, so need to remove the best result from % the array, and return that array along with the best key/sqn/status % combination {BKL, [BestKV|Tail]} = lists:keyfind(BKL, 1, QueryArray), {lists:keyreplace(BKL, 1, QueryArray, {BKL, Tail}), BestKV}; -find_nextkey(QueryArray, LCnt, {BestKeyLevel, BestKV}, QueryFunT) -> +find_nextkey(QueryArray, LCnt, {BestKeyLevel, BestKV}, + StartKey, EndKey, Width) -> % Get the next key at this level {NextKey, RestOfKeys} = case lists:keyfind(LCnt, 1, QueryArray) of false -> @@ -989,39 +983,46 @@ find_nextkey(QueryArray, LCnt, {BestKeyLevel, BestKV}, QueryFunT) -> case {NextKey, BestKeyLevel, BestKV} of {null, BKL, BKV} -> % There is no key at this level - go to the next level - find_nextkey(QueryArray, LCnt + 1, {BKL, BKV}, QueryFunT); - {{next_file, ManifestEntry}, BKL, BKV} -> + find_nextkey(QueryArray, + LCnt + 1, + {BKL, BKV}, + StartKey, EndKey, Width); + {{next, ManifestEntry, _SK}, BKL, BKV} -> % The first key at this level is a pointer to a file - need to query % the file to expand this level out before proceeding Owner = ManifestEntry#manifest_entry.owner, - {QueryFun, StartKey, EndKey, ScanSize} = QueryFunT, - QueryResult = QueryFun(Owner, StartKey, EndKey, ScanSize), - NewEntry = {LCnt, QueryResult ++ RestOfKeys}, + Pointer = {next, Owner, StartKey, EndKey}, + UpdList = leveled_sst:expand_list_by_pointer(Pointer, + RestOfKeys, + Width), + NewEntry = {LCnt, UpdList}, % Need to loop around at this level (LCnt) as we have not yet % examined a real key at this level find_nextkey(lists:keyreplace(LCnt, 1, QueryArray, NewEntry), LCnt, {BKL, BKV}, - QueryFunT); - {{next, SFTpid, NewStartKey}, BKL, BKV} -> + StartKey, EndKey, Width); + {{pointer, SSTPid, Slot, PSK, PEK}, BKL, BKV} -> + % The first key at this level is a pointer within a file - need to % query the file to expand this level out before proceeding - {QueryFun, _StartKey, EndKey, ScanSize} = QueryFunT, - QueryResult = QueryFun(SFTpid, NewStartKey, EndKey, ScanSize), - NewEntry = {LCnt, QueryResult ++ RestOfKeys}, + Pointer = {pointer, SSTPid, Slot, PSK, PEK}, + UpdList = leveled_sst:expand_list_by_pointer(Pointer, + RestOfKeys, + Width), + NewEntry = {LCnt, UpdList}, % Need to loop around at this level (LCnt) as we have not yet % examined a real key at this level find_nextkey(lists:keyreplace(LCnt, 1, QueryArray, NewEntry), LCnt, {BKL, BKV},
StartKey, EndKey, Width); {{Key, Val}, null, null} -> % No best key set - so can assume that this key is the best key, % and check the lower levels find_nextkey(QueryArray, LCnt + 1, {LCnt, {Key, Val}}, - QueryFunT); + StartKey, EndKey, Width); {{Key, Val}, _BKL, {BestKey, _BestVal}} when Key < BestKey -> % There is a real key and a best key to compare, and the real key % at this level is before the best key, and so is now the new best @@ -1030,7 +1031,7 @@ find_nextkey(QueryArray, LCnt, {BestKeyLevel, BestKV}, QueryFunT) -> find_nextkey(QueryArray, LCnt + 1, {LCnt, {Key, Val}}, - QueryFunT); + StartKey, EndKey, Width); {{Key, Val}, BKL, {BestKey, BestVal}} when Key == BestKey -> SQN = leveled_codec:strip_to_seqonly({Key, Val}), BestSQN = leveled_codec:strip_to_seqonly({BestKey, BestVal}), @@ -1041,7 +1042,7 @@ find_nextkey(QueryArray, LCnt, {BestKeyLevel, BestKV}, QueryFunT) -> find_nextkey(lists:keyreplace(LCnt, 1, QueryArray, NewEntry), LCnt + 1, {BKL, {BestKey, BestVal}}, - QueryFunT); + StartKey, EndKey, Width); SQN > BestSQN -> % There is a real key at the front of this level and it has % a higher SQN than the best key, so we should use this as @@ -1056,29 +1057,32 @@ find_nextkey(QueryArray, LCnt, {BestKeyLevel, BestKV}, QueryFunT) -> {BKL, BestTail}), LCnt + 1, {LCnt, {Key, Val}}, - QueryFunT) + StartKey, EndKey, Width) end; {_, BKL, BKV} -> % This is not the best key - find_nextkey(QueryArray, LCnt + 1, {BKL, BKV}, QueryFunT) + find_nextkey(QueryArray, + LCnt + 1, + {BKL, BKV}, + StartKey, EndKey, Width) end. -keyfolder(IMMiter, SFTiter, StartKey, EndKey, {AccFun, Acc}) -> - keyfolder({IMMiter, SFTiter}, {StartKey, EndKey}, {AccFun, Acc}, -1). +keyfolder(IMMiter, SSTiter, StartKey, EndKey, {AccFun, Acc}) -> + keyfolder({IMMiter, SSTiter}, {StartKey, EndKey}, {AccFun, Acc}, -1). 
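find_nextkey/3 now carries the query range and scan width positionally instead of a query-function tuple, but its observable contract is unchanged: return the lowest in-range key across the levels (ties broken by SQN), together with the query array minus that key. An illustrative eunit-style walk over an already-expanded array (no file pointers; the hash field is stubbed as 0, and key/value shapes follow the generate_randomkeys helper below):

```erlang
find_nextkey_order_test() ->
    KV1 = {{o, "B1", "K05", null}, {4, {active, infinity}, 0, null}},
    KV2 = {{o, "B1", "K10", null}, {5, {active, infinity}, 0, null}},
    QueryArray = [{2, [KV2]}, {3, [KV1]}],
    SK = {o, "B1", "K00", null},
    EK = {o, "B1", "K99", null},
    %% K05 sorts before K10, so level 3 supplies the first result
    {Array1, BestKV1} = find_nextkey(QueryArray, SK, EK),
    ?assertMatch(KV1, BestKV1),
    %% the second call drains the remaining level 2 key
    {Array2, BestKV2} = find_nextkey(Array1, SK, EK),
    ?assertMatch(KV2, BestKV2),
    ?assertMatch(no_more_keys, find_nextkey(Array2, SK, EK)).
```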
keyfolder(_Iterators, _KeyRange, {_AccFun, Acc}, MaxKeys) when MaxKeys == 0 -> Acc; -keyfolder({[], SFTiter}, KeyRange, {AccFun, Acc}, MaxKeys) -> +keyfolder({[], SSTiter}, KeyRange, {AccFun, Acc}, MaxKeys) -> {StartKey, EndKey} = KeyRange, - case find_nextkey(SFTiter, StartKey, EndKey) of + case find_nextkey(SSTiter, StartKey, EndKey) of no_more_keys -> Acc; - {NxSFTiter, {SFTKey, SFTVal}} -> - Acc1 = AccFun(SFTKey, SFTVal, Acc), - keyfolder({[], NxSFTiter}, KeyRange, {AccFun, Acc1}, MaxKeys - 1) + {NxSSTiter, {SSTKey, SSTVal}} -> + Acc1 = AccFun(SSTKey, SSTVal, Acc), + keyfolder({[], NxSSTiter}, KeyRange, {AccFun, Acc1}, MaxKeys - 1) end; -keyfolder({[{IMMKey, IMMVal}|NxIMMiterator], SFTiterator}, KeyRange, +keyfolder({[{IMMKey, IMMVal}|NxIMMiterator], SSTiterator}, KeyRange, {AccFun, Acc}, MaxKeys) -> {StartKey, EndKey} = KeyRange, case {IMMKey < StartKey, leveled_codec:endkey_passed(EndKey, IMMKey)} of @@ -1087,7 +1091,7 @@ keyfolder({[{IMMKey, IMMVal}|NxIMMiterator], SFTiterator}, KeyRange, % Normally everything is pre-filtered, but the IMM iterator can % be re-used and so may be behind the StartKey if the StartKey has % advanced from the previous use - keyfolder({NxIMMiterator, SFTiterator}, + keyfolder({NxIMMiterator, SSTiterator}, KeyRange, {AccFun, Acc}, MaxKeys); @@ -1095,44 +1099,44 @@ keyfolder({[{IMMKey, IMMVal}|NxIMMiterator], SFTiterator}, KeyRange, % There are no more keys in-range in the in-memory % iterator, so take action as if this iterator is empty % (see above) - keyfolder({[], SFTiterator}, + keyfolder({[], SSTiterator}, KeyRange, {AccFun, Acc}, MaxKeys); {false, false} -> - case find_nextkey(SFTiterator, StartKey, EndKey) of + case find_nextkey(SSTiterator, StartKey, EndKey) of no_more_keys -> % No more keys in range in the persisted store, so use the % in-memory KV as the next Acc1 = AccFun(IMMKey, IMMVal, Acc), - keyfolder({NxIMMiterator, SFTiterator}, + keyfolder({NxIMMiterator, SSTiterator}, KeyRange, {AccFun, Acc1}, MaxKeys - 1); - {NxSFTiterator, {SFTKey, SFTVal}} -> + {NxSSTiterator, {SSTKey, SSTVal}} -> % There is a next key, so need to know which is the % next key between the two (and handle two keys % with different sequence numbers). case leveled_codec:key_dominates({IMMKey, IMMVal}, - {SFTKey, - SFTVal}) of + {SSTKey, + SSTVal}) of left_hand_first -> Acc1 = AccFun(IMMKey, IMMVal, Acc), - keyfolder({NxIMMiterator, SFTiterator}, + keyfolder({NxIMMiterator, SSTiterator}, KeyRange, {AccFun, Acc1}, MaxKeys - 1); right_hand_first -> - Acc1 = AccFun(SFTKey, SFTVal, Acc), + Acc1 = AccFun(SSTKey, SSTVal, Acc), keyfolder({[{IMMKey, IMMVal}|NxIMMiterator], - NxSFTiterator}, + NxSSTiterator}, KeyRange, {AccFun, Acc1}, MaxKeys - 1); left_hand_dominant -> Acc1 = AccFun(IMMKey, IMMVal, Acc), - keyfolder({NxIMMiterator, NxSFTiterator}, + keyfolder({NxIMMiterator, NxSSTiterator}, KeyRange, {AccFun, Acc1}, MaxKeys - 1) @@ -1286,6 +1290,27 @@ confirm_delete(Filename, UnreferencedFiles, RegisteredSnapshots) -> -ifdef(TEST). + +generate_randomkeys({Count, StartSQN}) -> + generate_randomkeys(Count, StartSQN, []); +generate_randomkeys(Count) -> + generate_randomkeys(Count, 0, []). + +generate_randomkeys(0, _SQN, Acc) -> + lists:reverse(Acc); +generate_randomkeys(Count, SQN, Acc) -> + K = {o, + lists:concat(["Bucket", random:uniform(1024)]), + lists:concat(["Key", random:uniform(1024)]), + null}, + RandKey = {K, + {SQN, + {active, infinity}, + leveled_codec:magic_hash(K), + null}}, + generate_randomkeys(Count - 1, SQN + 1, [RandKey|Acc]).
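This helper replaces the leveled_sft:generate_randomkeys/1 calls the penciller tests made before this patch, and each generated value carries the magic_hash field the lookup path expects. Usage as in the tests below:

```erlang
%% 1000 random ledger keys with sequence numbers 2..1001
KL1 = generate_randomkeys({1000, 2}),
1000 = length(KL1),
{_K, {2, {active, infinity}, _Hash, null}} = hd(KL1).
```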
+ + clean_testdir(RootPath) -> clean_subdir(filepath(RootPath, manifest)), clean_subdir(filepath(RootPath, files)). @@ -1332,8 +1357,8 @@ compaction_work_assessment_test() -> ?assertMatch([{1, Manifest3, 1}], WorkQ3). confirm_delete_test() -> - Filename = 'test.sft', - UnreferencedFiles = [{'other.sft', dummy_owner, 15}, + Filename = 'test.sst', + UnreferencedFiles = [{'other.sst', dummy_owner, 15}, {Filename, dummy_owner, 10}], RegisteredIterators1 = [{dummy_pid, 16}, {dummy_pid, 12}], R1 = confirm_delete(Filename, UnreferencedFiles, RegisteredIterators1), @@ -1376,20 +1401,20 @@ simple_server_test() -> Key1_Pre = {{o,"Bucket0001", "Key0001", null}, {1, {active, infinity}, null}}, Key1 = add_missing_hash(Key1_Pre), - KL1 = leveled_sft:generate_randomkeys({1000, 2}), + KL1 = generate_randomkeys({1000, 2}), Key2_Pre = {{o,"Bucket0002", "Key0002", null}, {1002, {active, infinity}, null}}, Key2 = add_missing_hash(Key2_Pre), - KL2 = leveled_sft:generate_randomkeys({900, 1003}), + KL2 = generate_randomkeys({900, 1003}), % Keep below the max table size by having 900 not 1000 Key3_Pre = {{o,"Bucket0003", "Key0003", null}, {2003, {active, infinity}, null}}, Key3 = add_missing_hash(Key3_Pre), - KL3 = leveled_sft:generate_randomkeys({1000, 2004}), + KL3 = generate_randomkeys({1000, 2004}), Key4_Pre = {{o,"Bucket0004", "Key0004", null}, {3004, {active, infinity}, null}}, Key4 = add_missing_hash(Key4_Pre), - KL4 = leveled_sft:generate_randomkeys({1000, 3005}), + KL4 = generate_randomkeys({1000, 3005}), ok = maybe_pause_push(PCL, [Key1]), ?assertMatch(Key1, pcl_fetch(PCL, {o,"Bucket0001", "Key0001", null})), ok = maybe_pause_push(PCL, KL1), @@ -1464,7 +1489,7 @@ simple_server_test() -> Key1A_Pre = {{o,"Bucket0001", "Key0001", null}, {4005, {active, infinity}, null}}, Key1A = add_missing_hash(Key1A_Pre), - KL1A = leveled_sft:generate_randomkeys({2000, 4006}), + KL1A = generate_randomkeys({2000, 4006}), ok = maybe_pause_push(PCLr, [Key1A]), ok = maybe_pause_push(PCLr, KL1A), ?assertMatch(true, pcl_checksequencenumber(PclSnap, @@ -1528,17 +1553,16 @@ rangequery_manifest_test() -> end_key={o, "Bucket1", "K996", null}, filename="Z6"}}, Man = [{1, [E1, E2, E3]}, {2, [E4, E5, E6]}], - R1 = initiate_rangequery_frommanifest({o, "Bucket1", "K711", null}, - {o, "Bucket1", "K999", null}, - Man), - ?assertMatch([{1, [{next_file, E3}]}, - {2, [{next_file, E5}, {next_file, E6}]}], + SK1 = {o, "Bucket1", "K711", null}, + EK1 = {o, "Bucket1", "K999", null}, + R1 = initiate_rangequery_frommanifest(SK1, EK1, Man), + ?assertMatch([{1, [{next, E3, SK1}]}, + {2, [{next, E5, SK1}, {next, E6, SK1}]}], R1), - R2 = initiate_rangequery_frommanifest({i, "Bucket1", {"Idx1", "Fld8"}, null}, - {i, "Bucket1", {"Idx1", "Fld8"}, null}, - Man), - ?assertMatch([{1, [{next_file, E1}]}, {2, [{next_file, E5}]}], - R2), + SK2 = {i, "Bucket1", {"Idx1", "Fld8"}, null}, + EK2 = {i, "Bucket1", {"Idx1", "Fld8"}, null}, + R2 = initiate_rangequery_frommanifest(SK2, EK2, Man), + ?assertMatch([{1, [{next, E1, SK2}]}, {2, [{next, E5, SK2}]}], R2), R3 = initiate_rangequery_frommanifest({i, "Bucket1", {"Idx0", "Fld8"}, null}, {i, "Bucket1", {"Idx0", "Fld9"}, null}, Man), @@ -1693,17 +1717,18 @@ foldwithimm_simple_test() -> {{o, "Bucket1", "Key6"}, 7}], AccB). 
create_file_test() -> - Filename = "../test/new_file.sft", + Filename = "../test/new_file.sst", ok = file:write_file(Filename, term_to_binary("hello")), - KVL = lists:usort(leveled_sft:generate_randomkeys(10000)), + KVL = lists:usort(generate_randomkeys(10000)), Tree = leveled_skiplist:from_list(KVL), FetchFun = fun(Slot) -> lists:nth(Slot, [Tree]) end, {ok, SP, - noreply} = leveled_sft:sft_newfroml0cache(Filename, + noreply} = leveled_sst:sst_newlevelzero(Filename, 1, FetchFun, - #sft_options{wait=false}), + undefined, + 10000), lists:foreach(fun(X) -> case checkready(SP) of timeout -> @@ -1716,9 +1741,9 @@ create_file_test() -> io:format("StartKey ~w EndKey ~w~n", [StartKey, EndKey]), ?assertMatch({o, _, _, _}, StartKey), ?assertMatch({o, _, _, _}, EndKey), - ?assertMatch("../test/new_file.sft", SrcFN), - ok = leveled_sft:sft_clear(SP), - {ok, Bin} = file:read_file("../test/new_file.sft.discarded"), + ?assertMatch("../test/new_file.sst", SrcFN), + ok = leveled_sst:sst_clear(SP), + {ok, Bin} = file:read_file("../test/new_file.sst.discarded"), ?assertMatch("hello", binary_to_term(Bin)). commit_manifest_test() -> @@ -1735,14 +1760,14 @@ commit_manifest_test() -> ok = file:write_file(ManifestFP ++ "nonzero_1.pnd", term_to_binary("dummy data")), - L1_0 = [{1, [#manifest_entry{filename="1.sft"}]}], + L1_0 = [{1, [#manifest_entry{filename="1.sst"}]}], Resp_WI0 = Resp_WI#penciller_work{new_manifest=L1_0, unreferenced_files=[]}, {ok, State0} = commit_manifest_change(Resp_WI0, State), ?assertMatch(1, State0#state.manifest_sqn), ?assertMatch([], get_item(0, State0#state.manifest, [])), - L0Entry = [#manifest_entry{filename="0.sft"}], + L0Entry = [#manifest_entry{filename="0.sst"}], ManifestPlus = [{0, L0Entry}|State0#state.manifest], NxtSent_WI = #penciller_work{next_sqn=2, @@ -1756,7 +1781,7 @@ commit_manifest_test() -> ok = file:write_file(ManifestFP ++ "nonzero_2.pnd", term_to_binary("dummy data")), - L2_0 = [#manifest_entry{filename="2.sft"}], + L2_0 = [#manifest_entry{filename="2.sst"}], NxtResp_WI0 = NxtResp_WI#penciller_work{new_manifest=[{2, L2_0}], unreferenced_files=[]}, {ok, State2} = commit_manifest_change(NxtResp_WI0, State1), @@ -1777,7 +1802,7 @@ badmanifest_test() -> Key1_pre = {{o,"Bucket0001", "Key0001", null}, {1001, {active, infinity}, null}}, Key1 = add_missing_hash(Key1_pre), - KL1 = leveled_sft:generate_randomkeys({1000, 1}), + KL1 = generate_randomkeys({1000, 1}), ok = maybe_pause_push(PCL, KL1 ++ [Key1]), %% Added together, as split apart there will be a race between the close @@ -1798,7 +1823,7 @@ badmanifest_test() -> checkready(Pid) -> try - leveled_sft:sft_checkready(Pid) + leveled_sst:sst_checkready(Pid) catch exit:{timeout, _} -> timeout diff --git a/src/leveled_sft.erl b/src/leveled_sft.erl deleted file mode 100644 index e736a47..0000000 --- a/src/leveled_sft.erl +++ /dev/null @@ -1,2024 +0,0 @@ -%% This module provides functions for managing sft files - a modified version -%% of sst files, to be used in leveleddb. 
-%% -%% sft files are segment filtered tables in that they are guarded by a quick -%% access filter that checks for the presence of key by segment id, with the -%% segment id being a hash in the range 0 - 1024 * 1024 -%% -%% This filter has a dual purpose -%% - a memory efficient way of discovering non-presence with low false positive -%% rate -%% - to make searching for all keys by hashtree segment more efficient (a -%% specific change to optimise behaviour for use with the incremental refresh) -%% of riak hashtrees -%% -%% All keys are not equal in sft files, keys are only expected in a specific -%% series of formats -%% - {Tag, Bucket, Key, SubKey|null} - Object Keys -%% - {i, Bucket, {IndexName, IndexTerm}, Key} - Postings -%% The {Bucket, Key} part of all types of keys are hashed for segment filters. -%% For Postings the {Bucket, IndexName, IndexTerm} is also hashed. This -%% causes a false positive on lookup of a segment, but allows for the presence -%% of specific index terms to be checked -%% -%% The objects stored are a tuple of {Key, SequenceNumber, State, Value}, where -%% Key - as above -%% SequenceNumber - monotonically increasing counter of addition to the nursery -%% log -%% State - {active|tomb, ExpiryTimestamp | infinity} -%% Value - null (all postings) | [Object Metadata] (all object keys) -%% Keys should be unique in files. If more than two keys are candidate for -%% the same file the highest sequence number should be chosen. If the file -%% is at the basemenet level of a leveleddb database the objects with an -%% ExpiryTimestamp in the past should not be written, but at all other levels -%% keys should not be ignored because of a timestamp in the past. -%% tomb objects are written for deletions, and these tombstones may have an -%% Expirytimestamp which in effect is the time when the tombstone should be -%% reaped. -%% -%% sft files are broken into the following sections: -%% - Header (fixed width 80 bytes - containing pointers and metadata) -%% - Blocks (variable length) -%% - Slot Filter (variable length) -%% - Slot Index (variable length) -%% - Table Summary (variable length) -%% Each section should contain at the footer of the section a 4-byte CRC which -%% is to be checked only on the opening of the file -%% -%% The keys in the sft file are placed into the file in erlang term order. -%% There will normally be 256 slots of keys. The Slot Index is a gb_tree -%% acting as a helper to find the right slot to check when searching for a key -%% or range of keys. -%% The Key in the Slot Index is the Key at the start of the Slot. -%% The Value in the Slot Index is a record indicating: -%% - The starting position of the Slot within the Blocks (relative to the -%% starting position of the Blocks) -%% - The (relative) starting position of the Slot Filter for this Slot -%% - The number of blocks within the Slot -%% - The length of each of the Blocks within the Slot -%% -%% When checking for a Key in the sft file, the key should be hashed to the -%% segment, then the key should be looked-up in the Slot Index. 
The segment -%% ID can then be checked against the Slot Filter which will either return -%% not_present or [BlockIDs] -%% If a list of BlockIDs (normally of length 1) is returned the block should -%% be fetched using the starting position and length of the Block to find the -%% actual key (or not if the Slot Filter had returned a false positive) -%% -%% There will exist a Slot Filter for each entry in the Slot Index -%% The Slot Filter starts with some fixed length metadata -%% - 1 byte stating the expected number of keys in the block -%% - 1 byte stating the number of complete (i.e. containing the expected -%% number of keys) Blocks in the Slot -%% - 1 byte stating the number of keys in any incomplete Block (there can -%% only be 1 incomplete Block per Slot and it must be the last block) -%% - 3 bytes stating the largest segment ID in the Slot -%% - 1 byte stating the exponent used in the rice-encoding of the filter -%% The Filter itself is a rice-encoded list of Integers representing the -%% differences between the Segment IDs in the Slot with each entry being -%% appended by the minimal number of bits to represent the Block ID in which -%% an entry for that segment can be found. Where a segment exists more than -%% once then a 0 length will be used. -%% To use the filter code should roll over the filter incrementing the Segment -%% ID by each difference, and counting the keys by Block ID. This should -%% return one of: -%% mismatch - the final Segment Count didn't meet the largest Segment ID or -%% the per-block key counts don't add-up. There could have been a bit-flip, -%% so don't rely on the filter -%% no_match - everything added up but the counter never equalled the queried -%% Segment ID -%% {match, [BlockIDs]} - everything added up and the Segment may be -%% represented in the given blocks -%% -%% The makeup of a block -%% - A block is a list of 32 {Key, Value} pairs in Erlang term order -%% - The block is stored using standard compression in term_to_binary -%% May be improved by use of lz4 or schema-based binary_to_term -%% -%% The Table Summary may contain multiple summaries -%% The standard table summary contains: -%% - a count of keys by bucket and type of key (posting or object key) -%% - the total size of objects referred to by object keys -%% - the number of postings by index name -%% - the number of tombstones within the file -%% - the highest and lowest sequence number in the file -%% Summaries could be used for other summaries of table content in the future, -%% perhaps application-specific bloom filters - -%% The 56-byte header is made up of -%% - 1 byte version (major 5 bits, minor 3 bits) - default 0.1 -%% - 1 byte options (currently undefined) -%% - 1 byte Block Size - the expected number of keys in each block -%% - 1 byte Block Count - the expected number of blocks in each slot -%% - 2 byte Slot Count - the maximum number of slots in the file -%% - 6 bytes - spare -%% - 4 bytes - Blocks length -%% - 4 bytes - Slot Index length -%% - 4 bytes - Slot Filter length -%% - 4 bytes - Table summary length -%% - 24 bytes - spare -%% - 4 bytes - CRC32 -%% -%% The file body is written in the same order of events as the header (i.e. 
-%% Blocks first) -%% -%% Once open the file can be in the following states -%% - writing, the file is still being created -%% - available, the file may be read, but never again must be modified -%% - pending_deletion, the file can be closed and deleted once all outstanding -%% Snapshots have been started beyond a certain sequence number -%% -%% Level managers should only be aware of files in the available state. -%% Iterators may be aware of files in either available or pending_delete. -%% Level maintainers should control the file exclusively when in the writing -%% state, and send the event to trigger pending_delete with the a sequence -%% number equal to or higher than the number at the point it was no longer -%% active at any level. -%% -%% The format of the file is intended to support quick lookups, whilst -%% allowing for a new file to be written incrementally (so that all keys and -%% values need not be retained in memory) - perhaps n blocks at a time - - --module(leveled_sft). - --behaviour(gen_fsm). --include("include/leveled.hrl"). - --export([init/1, - handle_sync_event/4, - handle_event/3, - handle_info/3, - terminate/3, - code_change/4, - starting/2, - starting/3, - reader/3, - delete_pending/3, - delete_pending/2]). - --export([sft_new/4, - sft_newfroml0cache/4, - sft_open/1, - sft_get/2, - sft_get/3, - sft_getkvrange/4, - sft_close/1, - sft_clear/1, - sft_checkready/1, - sft_setfordelete/2, - sft_deleteconfirmed/1, - sft_getmaxsequencenumber/1]). - --export([generate_randomkeys/1]). - --include_lib("eunit/include/eunit.hrl"). - - --define(WORD_SIZE, 4). --define(DWORD_SIZE, 8). --define(CURRENT_VERSION, {0,1}). --define(SLOT_COUNT, 256). --define(SLOT_GROUPWRITE_COUNT, 16). --define(BLOCK_SIZE, 32). --define(BLOCK_COUNT, 4). --define(FOOTERPOS_HEADERPOS, 2). --define(MAX_SEG_HASH, 1048576). --define(DIVISOR_BITS, 13). --define(DIVISOR, 8092). --define(COMPRESSION_LEVEL, 1). --define(HEADER_LEN, 56). --define(ITERATOR_SCANWIDTH, 1). --define(MERGE_SCANWIDTH, 32). --define(BLOOM_WIDTH, 48). --define(DELETE_TIMEOUT, 10000). --define(MAX_KEYS, ?SLOT_COUNT * ?BLOCK_COUNT * ?BLOCK_SIZE). --define(DISCARD_EXT, ".discarded"). --define(WRITE_OPS, [binary, raw, read, write, delayed_write]). --define(READ_OPS, [binary, raw, read]). - --record(state, {version = ?CURRENT_VERSION :: tuple(), - slot_index :: list(), - next_position :: integer(), - smallest_sqn :: integer(), - highest_sqn :: integer(), - smallest_key :: string(), - highest_key :: string(), - slots_pointer :: integer(), - index_pointer :: integer(), - filter_pointer :: integer(), - summ_pointer :: integer(), - summ_length :: integer(), - filename = "not set" :: string(), - handle :: file:fd(), - background_complete = false :: boolean(), - oversized_file = false :: boolean(), - penciller :: pid(), - bloom}). - -%% Helper object when writing a file to keep track of various accumulators --record(writer, {slot_index = [] :: list(), - slot_binary = <<>> :: binary(), - bloom = leveled_tinybloom:empty(?BLOOM_WIDTH), - min_sqn = infinity :: integer()|infinity, - max_sqn = 0 :: integer(), - last_key = {last, null}}). 
- -%%%============================================================================ -%%% API -%%%============================================================================ - - -sft_new(Filename, KL1, KL2, LevelInfo) -> - LevelR = case is_integer(LevelInfo) of - true -> - #level{level=LevelInfo}; - _ -> - if - is_record(LevelInfo, level) -> - LevelInfo - end - end, - {ok, Pid} = gen_fsm:start(?MODULE, [], []), - Reply = gen_fsm:sync_send_event(Pid, - {sft_new, Filename, KL1, KL2, LevelR}, - infinity), - {ok, Pid, Reply}. - -sft_newfroml0cache(Filename, Slots, FetchFun, Options) -> - {ok, Pid} = gen_fsm:start(?MODULE, [], []), - case Options#sft_options.wait of - true -> - KL1 = leveled_pmem:to_list(Slots, FetchFun), - Reply = gen_fsm:sync_send_event(Pid, - {sft_new, - Filename, - KL1, - [], - #level{level=0}}, - infinity), - {ok, Pid, Reply}; - false -> - gen_fsm:send_event(Pid, - {sft_newfroml0cache, - Filename, - Slots, - FetchFun, - Options#sft_options.penciller}), - {ok, Pid, noreply} - end. - -sft_open(Filename) -> - {ok, Pid} = gen_fsm:start(?MODULE, [], []), - case gen_fsm:sync_send_event(Pid, {sft_open, Filename}, infinity) of - {ok, {SK, EK}} -> - {ok, Pid, {SK, EK}} - end. - -sft_setfordelete(Pid, Penciller) -> - gen_fsm:sync_send_event(Pid, {set_for_delete, Penciller}, infinity). - -sft_get(Pid, Key, Hash) -> - gen_fsm:sync_send_event(Pid, {get_kv, Key, Hash}, infinity). - -sft_get(Pid, Key) -> - sft_get(Pid, Key, leveled_codec:magic_hash(Key)). - -sft_getkvrange(Pid, StartKey, EndKey, ScanWidth) -> - gen_fsm:sync_send_event(Pid, - {get_kvrange, StartKey, EndKey, ScanWidth}, - infinity). - -sft_clear(Pid) -> - gen_fsm:sync_send_event(Pid, {set_for_delete, false}, infinity), - gen_fsm:sync_send_event(Pid, close, 1000). - -sft_close(Pid) -> - gen_fsm:sync_send_event(Pid, close, 1000). - -sft_deleteconfirmed(Pid) -> - gen_fsm:send_event(Pid, close). - -sft_checkready(Pid) -> - gen_fsm:sync_send_event(Pid, background_complete, 20). - -sft_getmaxsequencenumber(Pid) -> - gen_fsm:sync_send_event(Pid, get_maxsqn, infinity). - - - -%%%============================================================================ -%%% gen_server callbacks -%%%============================================================================ - -init([]) -> - {ok, starting, #state{}}. - -starting({sft_new, Filename, KL1, [], _LevelR=#level{level=L}}, _From, _State) - when L == 0 -> - {ok, State} = create_levelzero(KL1, Filename), - {reply, - {{[], []}, State#state.smallest_key, State#state.highest_key}, - reader, - State}; -starting({sft_new, Filename, KL1, KL2, LevelR}, _From, _State) -> - case create_file(Filename) of - {Handle, FileMD} -> - {ReadHandle, UpdFileMD, KeyRemainders} = complete_file(Handle, - FileMD, - KL1, KL2, - LevelR), - {reply, - {KeyRemainders, - UpdFileMD#state.smallest_key, - UpdFileMD#state.highest_key}, - reader, - UpdFileMD#state{handle=ReadHandle, filename=Filename}} - end; -starting({sft_open, Filename}, _From, _State) -> - {_Handle, FileMD} = open_file(#state{filename=Filename}), - leveled_log:log("SFT01", [Filename]), - {reply, - {ok, {FileMD#state.smallest_key, FileMD#state.highest_key}}, - reader, - FileMD}. 
- -starting({sft_newfroml0cache, Filename, Slots, FetchFun, PCL}, _State) -> - SW = os:timestamp(), - Inp1 = leveled_pmem:to_list(Slots, FetchFun), - {ok, State} = create_levelzero(Inp1, Filename), - leveled_log:log_timer("SFT03", [Filename], SW), - case PCL of - undefined -> - {next_state, reader, State}; - _ -> - leveled_penciller:pcl_confirml0complete(PCL, - State#state.filename, - State#state.smallest_key, - State#state.highest_key), - {next_state, reader, State} - end. - - -reader({get_kv, Key, Hash}, _From, State) -> - Reply = - case leveled_tinybloom:check({hash, Hash}, State#state.bloom) of - false -> - not_present; - true -> - fetch_keyvalue(State#state.handle, State, Key) - end, - {reply, Reply, reader, State}; -reader({get_kvrange, StartKey, EndKey, ScanWidth}, _From, State) -> - Reply = pointer_append_queryresults(fetch_range_kv(State#state.handle, - State, - StartKey, - EndKey, - ScanWidth), - self()), - {reply, Reply, reader, State}; -reader(get_maxsqn, _From, State) -> - {reply, State#state.highest_sqn, reader, State}; -reader({set_for_delete, Penciller}, _From, State) -> - leveled_log:log("SFT02", [State#state.filename]), - {reply, - ok, - delete_pending, - State#state{penciller=Penciller}, - ?DELETE_TIMEOUT}; -reader(background_complete, _From, State) -> - if - State#state.background_complete == true -> - {reply, - {ok, - State#state.filename, - State#state.smallest_key, - State#state.highest_key}, - reader, - State} - end; -reader(close, _From, State) -> - ok = file:close(State#state.handle), - {stop, normal, ok, State}. - -delete_pending({get_kv, Key, Hash}, _From, State) -> - Reply = - case leveled_tinybloom:check({hash, Hash}, State#state.bloom) of - false -> - not_present; - true -> - fetch_keyvalue(State#state.handle, State, Key) - end, - {reply, Reply, delete_pending, State, ?DELETE_TIMEOUT}; -delete_pending({get_kvrange, StartKey, EndKey, ScanWidth}, _From, State) -> - Reply = pointer_append_queryresults(fetch_range_kv(State#state.handle, - State, - StartKey, - EndKey, - ScanWidth), - self()), - {reply, Reply, delete_pending, State, ?DELETE_TIMEOUT}; -delete_pending(close, _From, State) -> - leveled_log:log("SFT06", [State#state.filename]), - ok = file:close(State#state.handle), - ok = file:delete(State#state.filename), - {stop, normal, ok, State}. - -delete_pending(timeout, State) -> - leveled_log:log("SFT05", [timeout, State#state.filename]), - ok = leveled_penciller:pcl_confirmdelete(State#state.penciller, - State#state.filename), - {next_state, delete_pending, State, ?DELETE_TIMEOUT}; -delete_pending(close, State) -> - leveled_log:log("SFT06", [State#state.filename]), - ok = file:close(State#state.handle), - ok = file:delete(State#state.filename), - {stop, normal, State}. - -handle_sync_event(_Msg, _From, StateName, State) -> - {reply, undefined, StateName, State}. - -handle_event(_Msg, StateName, State) -> - {next_state, StateName, State}. - -handle_info(_Msg, StateName, State) -> - {next_state, StateName, State}. - -terminate(Reason, _StateName, State) -> - leveled_log:log("SFT05", [Reason, State#state.filename]). - -code_change(_OldVsn, StateName, State, _Extra) -> - {ok, StateName, State}. 
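The state functions above implement the file lifecycle described in the module header. As a schematic (a reconstruction from the clauses above, not part of the original source):

%% starting --(sft_new/sft_open completes)--> reader
%% reader --({set_for_delete, Penciller})--> delete_pending
%% delete_pending --(timeout)--> leveled_penciller:pcl_confirmdelete/2, then
%%                               wait for another ?DELETE_TIMEOUT
%% delete_pending --(close)--> file closed and deleted, FSM stops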
- - - -%%%============================================================================ -%%% Internal functions -%%%============================================================================ - - -create_levelzero(ListForFile, Filename) -> - {TmpFilename, PrmFilename} = generate_filenames(Filename), - {Handle, FileMD} = create_file(TmpFilename), - InputSize = length(ListForFile), - leveled_log:log("SFT07", [InputSize]), - Rename = {true, TmpFilename, PrmFilename}, - {ReadHandle, - UpdFileMD, - {[], []}} = complete_file(Handle, FileMD, - ListForFile, [], - #level{level=0}, Rename), - {ok, - UpdFileMD#state{handle=ReadHandle, - filename=PrmFilename, - background_complete=true, - oversized_file=InputSize>?MAX_KEYS}}. - - -generate_filenames(RootFilename) -> - Ext = filename:extension(RootFilename), - Components = filename:split(RootFilename), - case Ext of - [] -> - {filename:join(Components) ++ ".pnd", - filename:join(Components) ++ ".sft"}; - Ext -> - %% This seems unnecessarily hard - DN = filename:dirname(RootFilename), - FP = lists:last(Components), - FP_NOEXT = lists:sublist(FP, 1, 1 + length(FP) - length(Ext)), - {DN ++ "/" ++ FP_NOEXT ++ "pnd", DN ++ "/" ++ FP_NOEXT ++ "sft"} - end. - - -%% Start a bare file with an initial header and no further details -%% Return the {Handle, metadata record} -create_file(FileName) when is_list(FileName) -> - leveled_log:log("SFT01", [FileName]), - ok = filelib:ensure_dir(FileName), - {ok, Handle} = file:open(FileName, ?WRITE_OPS), - Header = create_header(initial), - {ok, _} = file:position(Handle, bof), - ok = file:write(Handle, Header), - {ok, StartPos} = file:position(Handle, cur), - FileMD = #state{next_position=StartPos, filename=FileName}, - {Handle, FileMD}. - - -create_header(initial) -> - {Major, Minor} = ?CURRENT_VERSION, - Version = <>, - %% Not thought of any options - options are ignored - Options = <<0:8>>, - %% Settings are currently ignored - {BlSize, BlCount, SlCount} = {?BLOCK_COUNT, ?BLOCK_SIZE, ?SLOT_COUNT}, - Settings = <>, - {SpareO, SpareL} = {<<0:48>>, <<0:192>>}, - Lengths = <<0:32, 0:32, 0:32, 0:32>>, - H1 = <>, - CRC32 = erlang:crc32(H1), - <

>. - -%% Open a file returning a handle and metadata which can be used in fetch and -%% iterator requests -%% The handle should be read-only as these are immutable files, a file cannot -%% be opened for writing keys, it can only be created to write keys - -open_file(FileMD) -> - Filename = FileMD#state.filename, - {ok, Handle} = file:open(Filename, [binary, raw, read]), - {ok, HeaderLengths} = file:pread(Handle, 12, 16), - <> = HeaderLengths, - {ok, <>} = - file:pread(Handle, ?HEADER_LEN + Blen + Ilen + Flen, Slen), - {{LowSQN, HighSQN}, {LowKey, HighKey}, Bloom} = - case erlang:crc32(SummaryBin) of - SummaryCRC -> - binary_to_term(SummaryBin) - end, - {ok, SlotIndexBin} = file:pread(Handle, ?HEADER_LEN + Blen, Ilen), - SlotIndex = binary_to_term(SlotIndexBin), - {Handle, FileMD#state{slot_index=SlotIndex, - smallest_sqn=LowSQN, - highest_sqn=HighSQN, - smallest_key=LowKey, - highest_key=HighKey, - slots_pointer=?HEADER_LEN, - index_pointer=?HEADER_LEN + Blen, - filter_pointer=?HEADER_LEN + Blen + Ilen, - summ_pointer=?HEADER_LEN + Blen + Ilen + Flen, - summ_length=Slen, - handle=Handle, - bloom=Bloom}}. - -%% Take a file handle with a previously created header and complete it based on -%% the two key lists KL1 and KL2 -complete_file(Handle, FileMD, KL1, KL2, LevelR) -> - complete_file(Handle, FileMD, KL1, KL2, LevelR, false). - -complete_file(Handle, FileMD, KL1, KL2, LevelR, Rename) -> - {ok, KeyRemainders} = write_keys(Handle, - maybe_expand_pointer(KL1), - maybe_expand_pointer(KL2), - LevelR, - fun sftwrite_function/2, - #writer{}), - {ReadHandle, UpdFileMD} = case Rename of - false -> - open_file(FileMD); - {true, OldName, NewName} -> - ok = rename_file(OldName, NewName), - open_file(FileMD#state{filename=NewName}) - end, - {ReadHandle, UpdFileMD, KeyRemainders}. - -rename_file(OldName, NewName) -> - leveled_log:log("SFT08", [OldName, NewName]), - case filelib:is_file(NewName) of - true -> - leveled_log:log("SFT09", [NewName]), - AltName = filename:join(filename:dirname(NewName), - filename:basename(NewName)) - ++ ?DISCARD_EXT, - leveled_log:log("SFT10", [NewName, AltName]), - ok = file:rename(NewName, AltName); - false -> - ok - end, - file:rename(OldName, NewName). - - -%% Fetch a Key and Value from a file, returns -%% {value, KV} or not_present -%% The key must be pre-checked to ensure it is in the valid range for the file -%% A key out of range may fail - -fetch_keyvalue(Handle, FileMD, Key) -> - case get_nearestkey(FileMD#state.slot_index, Key) of - not_found -> - not_present; - {_NearestKey, {FilterLen, PointerF}, {LengthList, PointerB}} -> - FilterPointer = PointerF + FileMD#state.filter_pointer, - {ok, SegFilter} = file:pread(Handle, - FilterPointer, - FilterLen), - SegID = hash_for_segmentid({keyonly, Key}), - case check_for_segments(SegFilter, [SegID], true) of - {maybe_present, BlockList} -> - BlockPointer = PointerB + FileMD#state.slots_pointer, - fetch_keyvalue_fromblock(BlockList, - Key, - LengthList, - Handle, - BlockPointer); - not_present -> - not_present; - error_so_maybe_present -> - BlockPointer = PointerB + FileMD#state.slots_pointer, - fetch_keyvalue_fromblock(lists:seq(0,length(LengthList)), - Key, - LengthList, - Handle, - BlockPointer) - end - end. - -%% Fetches a range of keys returning a list of {Key, SeqN} tuples -fetch_range_keysonly(Handle, FileMD, StartKey, EndKey) -> - fetch_range(Handle, FileMD, StartKey, EndKey, fun acc_list_keysonly/2). 
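The binary match expressions in create_header and open_file appear elided (<>) above, but the pointer arithmetic still fixes the on-disk layout. A sketch of that layout - a reconstruction from the surrounding reads, not the verbatim original:

%% [Header: ?HEADER_LEN (56) bytes][Blocks: Blen][Index: Ilen][Filter: Flen][Summary: Slen]
%% The four lengths are read back as 32-bit integers from byte offset 12,
%% i.e. something like:
%%   <<Blen:32/integer, Ilen:32/integer, Flen:32/integer, Slen:32/integer>> =
%%       HeaderLengths
%% The summary is stored with a leading 4-byte CRC (hence the "+ 4" in
%% SummaryLength within sftwrite_function below), which open_file verifies
%% before calling binary_to_term.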
- -fetch_range_keysonly(Handle, FileMD, StartKey, EndKey, ScanWidth) -> - fetch_range(Handle, FileMD, StartKey, EndKey, fun acc_list_keysonly/2, - ScanWidth). - -%% Fetches a range of keys returning the full tuple, including value -fetch_range_kv(Handle, FileMD, StartKey, EndKey, ScanWidth) -> - fetch_range(Handle, FileMD, StartKey, EndKey, fun acc_list_kv/2, - ScanWidth). - -acc_list_keysonly(null, empty) -> - []; -acc_list_keysonly(null, RList) -> - RList; -acc_list_keysonly(R, RList) when is_list(R) -> - lists:foldl(fun acc_list_keysonly/2, RList, R); -acc_list_keysonly(R, RList) -> - lists:append(RList, [leveled_codec:strip_to_keyseqstatusonly(R)]). - -acc_list_kv(null, empty) -> - []; -acc_list_kv(null, RList) -> - RList; -acc_list_kv(R, RList) when is_list(R) -> - RList ++ R; -acc_list_kv(R, RList) -> - lists:append(RList, [R]). - -%% Iterate keys, returning a batch of keys & values in a range -%% - the iterator can have a ScanWidth which is how many slots should be -%% scanned by the iterator before returning a result -%% - batches can be ended with a pointer to indicate there are potentially -%% further values in the range -%% - a list of functions can be provided, which should either return true -%% or false, and these can be used to filter the results from the query, -%% for example to ignore keys above a certain sequence number, to ignore -%% keys not matching a certain regular expression, or to ignore keys not -%% a member of a particular partition -%% - An Accumulator and an Accumulator function can be passed. The function -%% needs to handle being passed (KV, Acc) to add the current result to the -%% Accumulator. The functional should handle KV=null, Acc=empty to initiate -%% the accumulator, and KV=null to leave the Accumulator unchanged. -%% Flexibility with accumulators is such that keys-only can be returned rather -%% than keys and values, or other entirely different accumulators can be -%% used - e.g. counters, hash-lists to build bloom filters etc - -fetch_range(Handle, FileMD, StartKey, EndKey, AccFun) -> - fetch_range(Handle, FileMD, StartKey, EndKey, AccFun, ?ITERATOR_SCANWIDTH). - -fetch_range(Handle, FileMD, StartKey, EndKey, AccFun, ScanWidth) -> - fetch_range(Handle, FileMD, StartKey, EndKey, AccFun, ScanWidth, empty). - -fetch_range(_Handle, _FileMD, StartKey, _EndKey, _AccFun, 0, Acc) -> - {partial, Acc, StartKey}; -fetch_range(Handle, FileMD, StartKey, EndKey, AccFun, ScanWidth, Acc) -> - %% get_nearestkey gets the last key in the index <= StartKey, or the next - %% key along if {next, StartKey} is passed - case get_nearestkey(FileMD#state.slot_index, StartKey) of - {NearestKey, _Filter, {LengthList, PointerB}} -> - fetch_range(Handle, FileMD, StartKey, NearestKey, EndKey, - AccFun, ScanWidth, - LengthList, - 0, - PointerB + FileMD#state.slots_pointer, - AccFun(null, Acc)); - not_found -> - {complete, AccFun(null, Acc)} - end. - -fetch_range(Handle, FileMD, _StartKey, NearestKey, EndKey, - AccFun, ScanWidth, - LengthList, - BlockNumber, - _Pointer, - Acc) - when length(LengthList) == BlockNumber -> - %% Reached the end of the slot. 
Move the start key on one to scan a new slot - fetch_range(Handle, FileMD, {next, NearestKey}, EndKey, - AccFun, ScanWidth - 1, - Acc); -fetch_range(Handle, FileMD, StartKey, NearestKey, EndKey, - AccFun, ScanWidth, - LengthList, - BlockNumber, - Pointer, - Acc) -> - Block = fetch_block(Handle, LengthList, BlockNumber, Pointer), - Results = - case maybe_scan_entire_block(Block, StartKey, EndKey) of - true -> - {partial, AccFun(Block, Acc), StartKey}; - false -> - scan_block(Block, StartKey, EndKey, AccFun, Acc) - end, - case Results of - {partial, Acc1, StartKey} -> - %% Move on to the next block - fetch_range(Handle, FileMD, StartKey, NearestKey, EndKey, - AccFun, ScanWidth, - LengthList, - BlockNumber + 1, - Pointer, - Acc1); - {complete, Acc1} -> - {complete, Acc1} - end. - -scan_block([], StartKey, _EndKey, _AccFun, Acc) -> - {partial, Acc, StartKey}; -scan_block([HeadKV|T], StartKey, EndKey, AccFun, Acc) -> - K = leveled_codec:strip_to_keyonly(HeadKV), - case {StartKey > K, leveled_codec:endkey_passed(EndKey, K)} of - {true, _} when StartKey /= all -> - scan_block(T, StartKey, EndKey, AccFun, Acc); - {_, true} when EndKey /= all -> - {complete, Acc}; - _ -> - scan_block(T, StartKey, EndKey, AccFun, AccFun(HeadKV, Acc)) - end. - - -maybe_scan_entire_block([], _, _) -> - true; -maybe_scan_entire_block(_Block, all, all) -> - true; -maybe_scan_entire_block(Block, StartKey, all) -> - [FirstKey|_Tail] = Block, - leveled_codec:strip_to_keyonly(FirstKey) > StartKey; -maybe_scan_entire_block(Block, StartKey, EndKey) -> - [FirstKey|_Tail] = Block, - LastKey = leveled_codec:strip_to_keyonly(lists:last(Block)), - FromStart = leveled_codec:strip_to_keyonly(FirstKey) > StartKey, - ToEnd = leveled_codec:endkey_passed(EndKey, LastKey), - case {FromStart, ToEnd} of - {true, false} -> - true; - _ -> - false - end. - -fetch_keyvalue_fromblock([], _Key, _LengthList, _Handle, _StartOfSlot) -> - not_present; -fetch_keyvalue_fromblock([BlockNmb|T], Key, LengthList, Handle, StartOfSlot) -> - BlockToCheck = fetch_block(Handle, LengthList, BlockNmb, StartOfSlot), - Result = lists:keyfind(Key, 1, BlockToCheck), - case Result of - false -> - fetch_keyvalue_fromblock(T, Key, LengthList, Handle, StartOfSlot); - KV -> - KV - end. - -fetch_block(Handle, LengthList, BlockNmb, StartOfSlot) -> - Start = lists:sum(lists:sublist(LengthList, BlockNmb)), - Length = lists:nth(BlockNmb + 1, LengthList), - {ok, BlockToCheckBin} = file:pread(Handle, Start + StartOfSlot, Length), - binary_to_term(BlockToCheckBin). - -%% Need to deal with either Key or {next, Key} -get_nearestkey([H|_Tail], all) -> - H; -get_nearestkey(KVList, Key) -> - case Key of - {next, K} -> - get_nextkeyaftermatch(KVList, K, not_found); - _ -> - get_firstkeytomatch(KVList, Key, not_found) - end. - -get_firstkeytomatch([], _KeyToFind, PrevV) -> - PrevV; -get_firstkeytomatch([{K, FilterInfo, SlotInfo}|_T], KeyToFind, PrevV) - when K > KeyToFind -> - case PrevV of - not_found -> - {K, FilterInfo, SlotInfo}; - _ -> - PrevV - end; -get_firstkeytomatch([{K, FilterInfo, SlotInfo}|T], KeyToFind, _PrevV) -> - get_firstkeytomatch(T, KeyToFind, {K, FilterInfo, SlotInfo}). - -get_nextkeyaftermatch([], _KeyToFind, _PrevV) -> - not_found; -get_nextkeyaftermatch([{K, FilterInfo, SlotInfo}|T], KeyToFind, PrevV) - when K >= KeyToFind -> - case PrevV of - not_found -> - get_nextkeyaftermatch(T, KeyToFind, next); - next -> - {K, FilterInfo, SlotInfo} - end; -get_nextkeyaftermatch([_KTuple|T], KeyToFind, PrevV) -> - get_nextkeyaftermatch(T, KeyToFind, PrevV). 
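The accumulator contract described above fetch_range (initialise on {null, empty}; leave unchanged on null; otherwise fold the result in, which may be a single KV or a whole block) supports accumulators other than key lists. A hypothetical counting accumulator honouring the same contract, illustrative only:

CountFun =
    fun(null, empty) -> 0;                      % initiate the accumulator
       (null, Count) -> Count;                  % leave unchanged
       (Block, Count) when is_list(Block) ->    % a whole block of results
            Count + length(Block);
       (_KV, Count) -> Count + 1                % a single result
    end.
%% CountFun can then be passed as the AccFun argument to fetch_range/6.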
- - -%% Take a file handle at the sart position (after creating the header) and then -%% write the Key lists to the file slot by slot. -%% -%% Slots are created then written in bulk to impove I/O efficiency. Slots will -%% be written in groups - -write_keys(Handle, KL1, KL2, LevelR, WriteFun, WriteState) -> - write_keys(Handle, KL1, KL2, LevelR, WriteFun, WriteState, {0, 0, []}). - -write_keys(Handle, KL1, KL2, LevelR, WriteFun, WState, - {SlotC, SlotT, SlotLists}) - when SlotC =:= ?SLOT_GROUPWRITE_COUNT -> - WState0 = lists:foldl(fun finalise_slot/2, WState, SlotLists), - Handle0 = WriteFun(slots, {Handle, WState0#writer.slot_binary}), - case maxslots_bylevel(SlotT, LevelR#level.level) of - reached -> - {complete_keywrite(Handle0, WState0, WriteFun), {KL1, KL2}}; - continue -> - write_keys(Handle0, KL1, KL2, LevelR, WriteFun, - WState0#writer{slot_binary = <<>>}, {0, SlotT, []}) - end; -write_keys(Handle, KL1, KL2, LevelR, WriteFun, WState, - {SlotC, SlotT, SlotLists}) -> - {Status, BlockKeyLists} = create_slot(KL1, KL2, LevelR), - case Status of - S when S == complete; S == partial -> - WState0 = - case BlockKeyLists of - [[]] -> - WState; - _ -> - lists:foldl(fun finalise_slot/2, - WState, - SlotLists ++ [BlockKeyLists]) - end, - Handle0 = WriteFun(slots, {Handle, WState0#writer.slot_binary}), - {complete_keywrite(Handle0, WState0, WriteFun), {[], []}}; - {full, KL1Rem, KL2Rem} -> - write_keys(Handle, KL1Rem, KL2Rem, LevelR, WriteFun, WState, - {SlotC + 1, SlotT + 1, SlotLists ++ [BlockKeyLists]}) - end. - - -complete_keywrite(Handle, WriteState, WriteFun) -> - FirstKey = - case length(WriteState#writer.slot_index) of - 0 -> - null; - _ -> - element(1, lists:nth(1, WriteState#writer.slot_index)) - end, - ConvSlotIndex = convert_slotindex(WriteState#writer.slot_index), - WriteFun(finalise, {Handle, - ConvSlotIndex, - {{WriteState#writer.min_sqn, WriteState#writer.max_sqn}, - {FirstKey, WriteState#writer.last_key}, - WriteState#writer.bloom}}). - -%% Take a slot index, and remove the SegFilters replacing with pointers -%% Return a tuple of the accumulated slot filters, and a pointer-based -%% slot-index - -convert_slotindex(SlotIndex) -> - SlotFun = fun({LowKey, SegFilter, LengthList}, - {FilterAcc, SlotIndexAcc, PointerF, PointerB}) -> - FilterOut = serialise_segment_filter(SegFilter), - FilterLen = byte_size(FilterOut), - {<>, - lists:append(SlotIndexAcc, [{LowKey, - {FilterLen, PointerF}, - {LengthList, PointerB}}]), - PointerF + FilterLen, - PointerB + lists:sum(LengthList)} end, - {SlotFilters, PointerIndex, _FLength, _BLength} = - lists:foldl(SlotFun, {<<>>, [], 0, 0}, SlotIndex), - {SlotFilters, PointerIndex}. - -sftwrite_function(slots, {Handle, SerialisedSlots}) -> - ok = file:write(Handle, SerialisedSlots), - Handle; -sftwrite_function(finalise, - {Handle, - {SlotFilters, PointerIndex}, - {SNExtremes, KeyExtremes, Bloom}}) -> - {ok, Position} = file:position(Handle, cur), - - BlocksLength = Position - ?HEADER_LEN, - Index = term_to_binary(PointerIndex), - IndexLength = byte_size(Index), - FilterLength = byte_size(SlotFilters), - Summary = term_to_binary({SNExtremes, KeyExtremes, Bloom}), - SummaryCRC = erlang:crc32(Summary), - SummaryLength = byte_size(Summary) + 4, - %% Write Index, Filter and Summary - ok = file:write(Handle, <>), - %% Write Lengths into header - ok = file:pwrite(Handle, 12, <>), - {ok, _Position} = file:position(Handle, bof), - ok = file:advise(Handle, - BlocksLength + IndexLength, - FilterLength, - will_need), - file:close(Handle). 
- -%% Level 0 files are of variable (infinite) size to avoid issues with having -%% any remainders when flushing from memory -maxslots_bylevel(_SlotTotal, 0) -> - continue; -maxslots_bylevel(SlotTotal, _Level) -> - case SlotTotal of - ?SLOT_COUNT -> - reached; - X when X < ?SLOT_COUNT -> - continue - end. - - - -%% Take two potentially overlapping lists of keys and produce a block size -%% list of keys in the correct order. Outputs: -%% - Status of -%% - - all_complete (no more keys and block is complete) -%% - - partial (no more keys and block is not complete) -%% - - {block_full, Rem1, Rem2} the block is complete but there is a remainder -%% of keys - -create_block(KeyList1, KeyList2, LevelR) -> - create_block(KeyList1, KeyList2, LevelR, []). - - -create_block([], [], _LevelR, BlockKeyList) - when length(BlockKeyList)==?BLOCK_SIZE -> - {all_complete, lists:reverse(BlockKeyList)}; -create_block([], [], _LevelR, BlockKeyList) -> - {partial, lists:reverse(BlockKeyList)}; -create_block(KeyList1, KeyList2, _LevelR, BlockKeyList) - when length(BlockKeyList)==?BLOCK_SIZE -> - {{block_full, KeyList1, KeyList2}, lists:reverse(BlockKeyList)}; -create_block(KeyList1, KeyList2, LevelR, BlockKeyList) -> - case key_dominates(KeyList1, KeyList2, - {LevelR#level.is_basement, LevelR#level.timestamp}) of - {{next_key, TopKey}, Rem1, Rem2} -> - create_block(Rem1, Rem2, LevelR, [TopKey|BlockKeyList]); - {skipped_key, Rem1, Rem2} -> - create_block(Rem1, Rem2, LevelR, BlockKeyList) - end. - -%% create_slot should simply output a list of BlockKeyLists no bigger than -%% the BlockCount, the the status (with key remianders if not complete) - -create_slot(KL1, KL2, LevelR) -> - create_slot(KL1, KL2, LevelR, ?BLOCK_COUNT, []). - -create_slot(KL1, KL2, LevelR, BlockCount, BlockKeyLists) -> - {Status, KeyList} = create_block(KL1, KL2, LevelR), - case {Status, BlockCount - 1} of - {partial, _N} -> - {partial, BlockKeyLists ++ [KeyList]}; - {all_complete, 0} -> - {complete, BlockKeyLists ++ [KeyList]}; - {all_complete, _N} -> - % From the perspective of the slot it is partially complete - {partial, BlockKeyLists ++ [KeyList]}; - {{block_full, KL1Rem, KL2Rem}, 0} -> - {{full, KL1Rem, KL2Rem}, BlockKeyLists ++ [KeyList]}; - {{block_full, KL1Rem, KL2Rem}, N} -> - create_slot(KL1Rem, KL2Rem, LevelR, N, BlockKeyLists ++ [KeyList]) - end. 
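A summary of how the block and slot status values above compose (derived directly from the clauses):

%% create_block: all_complete | partial | {block_full, Rem1, Rem2}
%% create_slot:  complete           - all_complete on the slot's final block
%%               partial            - keys exhausted before the final block
%%                                    fills
%%               {full, Rem1, Rem2} - block_full on the final block, with the
%%                                    key remainders handed back to write_keys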
- - - -%% Fold over the List of BlockKeys updating the writer record -finalise_slot(BlockKeyLists, WriteState) -> - BlockFolder = - fun(KV, {AccMinSQN, AccMaxSQN, Bloom, SegmentIDList}) -> - {SQN, Hash} = leveled_codec:strip_to_seqnhashonly(KV), - {min(AccMinSQN, SQN), - max(AccMaxSQN, SQN), - leveled_tinybloom:enter({hash, Hash}, Bloom), - [hash_for_segmentid(KV)|SegmentIDList]} - end, - SlotFolder = - fun(BlockKeyList, - {MinSQN, MaxSQN, Bloom, SegLists, KVBinary, Lengths}) -> - {BlockMinSQN, BlockMaxSQN, UpdBloom, Segs} = - lists:foldr(BlockFolder, - {infinity, 0, Bloom, []}, - BlockKeyList), - SerialisedBlock = serialise_block(BlockKeyList), - {min(MinSQN, BlockMinSQN), - max(MaxSQN, BlockMaxSQN), - UpdBloom, - SegLists ++ [Segs], - <>, - Lengths ++ [byte_size(SerialisedBlock)]} - end, - - {SlotMinSQN, - SlotMaxSQN, - SlotUpdBloom, - SlotSegLists, - SlotBinary, - BlockLengths} = - lists:foldl(SlotFolder, - {WriteState#writer.min_sqn, - WriteState#writer.max_sqn, - WriteState#writer.bloom, - [], - WriteState#writer.slot_binary, - []}, - BlockKeyLists), - - FirstSlotKey = leveled_codec:strip_to_keyonly(lists:nth(1, - lists:nth(1, - BlockKeyLists))), - LastSlotKV = lists:last(lists:last(BlockKeyLists)), - SegFilter = generate_segment_filter(SlotSegLists), - UpdSlotIndex = lists:append(WriteState#writer.slot_index, - [{FirstSlotKey, SegFilter, BlockLengths}]), - - #writer{slot_index = UpdSlotIndex, - slot_binary = SlotBinary, - bloom = SlotUpdBloom, - min_sqn = SlotMinSQN, - max_sqn = SlotMaxSQN, - last_key = leveled_codec:strip_to_keyonly(LastSlotKV)}. - - -serialise_block(BlockKeyList) -> - term_to_binary(BlockKeyList, [{compressed, ?COMPRESSION_LEVEL}]). - - -%% Compare the keys at the head of the list, and either skip that "best" key or -%% identify as the next key. -%% -%% The logic needs to change if the file is in the basement level, as keys with -%% expired timestamps need not be written at this level -%% -%% The best key is considered to be the lowest key in erlang term order. If -%% there are matching keys then the highest sequence number must be chosen and -%% any lower sequence numbers should be compacted out of existence - - -key_dominates(KL1, KL2, Level) -> - key_dominates_expanded(maybe_expand_pointer(KL1), - maybe_expand_pointer(KL2), - Level). - -key_dominates_expanded([H1|T1], [], Level) -> - case leveled_codec:maybe_reap_expiredkey(H1, Level) of - true -> - {skipped_key, maybe_expand_pointer(T1), []}; - false -> - {{next_key, H1}, maybe_expand_pointer(T1), []} - end; -key_dominates_expanded([], [H2|T2], Level) -> - case leveled_codec:maybe_reap_expiredkey(H2, Level) of - true -> - {skipped_key, [], maybe_expand_pointer(T2)}; - false -> - {{next_key, H2}, [], maybe_expand_pointer(T2)} - end; -key_dominates_expanded([H1|T1], [H2|T2], Level) -> - case leveled_codec:key_dominates(H1, H2) of - left_hand_first -> - case leveled_codec:maybe_reap_expiredkey(H1, Level) of - true -> - {skipped_key, maybe_expand_pointer(T1), [H2|T2]}; - false -> - {{next_key, H1}, maybe_expand_pointer(T1), [H2|T2]} - end; - right_hand_first -> - case leveled_codec:maybe_reap_expiredkey(H2, Level) of - true -> - {skipped_key, [H1|T1], maybe_expand_pointer(T2)}; - false -> - {{next_key, H2}, [H1|T1], maybe_expand_pointer(T2)} - end; - left_hand_dominant -> - {skipped_key, [H1|T1], maybe_expand_pointer(T2)}; - right_hand_dominant -> - {skipped_key, maybe_expand_pointer(T1), [H2|T2]} - end. 
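As an illustration of the dominance rule above, grounded in key_dominates_test further down:

%% KV1 and KV5 share the key {o, "Bucket", "Key1", null} at SQN 5 and SQN 4
%% respectively, so:
%%   key_dominates([KV5|KL2], KL1, {undefined, 1}) -> {skipped_key, KL2, KL1}
%% the SQN 4 version is skipped (compacted out of existence), leaving the
%% SQN 5 version to be emitted as a next_key on a later comparison.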
- - -%% When a list is provided it may include a pointer to gain another batch of -%% entries from the same file, or a new batch of entries from another file -%% -%% This resultant list should include the Tail of any pointers added at the -%% end of the list - -maybe_expand_pointer([]) -> - []; -maybe_expand_pointer([H|Tail]) -> - case H of - {next, SFTPid, StartKey} -> - %% io:format("Scanning further on PID ~w ~w~n", [SFTPid, StartKey]), - SW = os:timestamp(), - Acc = sft_getkvrange(SFTPid, StartKey, all, ?MERGE_SCANWIDTH), - leveled_log:log_timer("SFT14", [SFTPid], SW), - lists:append(Acc, Tail); - _ -> - [H|Tail] - end. - - -pointer_append_queryresults(Results, QueryPid) -> - case Results of - {complete, Acc} -> - Acc; - {partial, Acc, StartKey} -> - lists:append(Acc, [{next, QueryPid, StartKey}]) - end. - - -%% The Segment filter is a compressed filter representing the keys in a -%% given slot. The filter is delta-compressed list of integers using rice -%% encoding extended by the reference to each integer having an extra two bits -%% to indicate the block - there are four blocks in each slot. -%% -%% So each delta is represented as -%% - variable length exponent ending in 0, -%% with 0 representing the exponent of 0, -%% 10 -> 2 ^ 13, -%% 110 -> 2^14, -%% 1110 -> 2^15 etc -%% - 13-bit fixed length remainder -%% - 2-bit block number -%% This gives about 2-bytes per key, with a 1:8000 (approx) false positive -%% ratio (when checking the key by hashing to the segment ID) -%% -%% Before the delta list are three 20-bit integers representing the highest -%% integer in each block. Plus two bytes to indicate how many hashes -%% there are in the slot -%% -%% To check for the presence of a segment in a slot, roll over the deltas -%% keeping a running total overall and the current highest segment ID seen -%% per block. Roll all the way through even if matches are found or passed -%% over to confirm that the totals match the expected value (hence creating -%% a natural checksum) -%% -%% The end-result is a 260-byte check for the presence of a key in a slot -%% returning the block in which the segment can be found, which may also be -%% used directly for checking for the presence of segments. -%% -%% This is more space efficient than the equivalent bloom filter and avoids -%% the calculation of many hash functions. - -generate_segment_filter([SegL1]) -> - generate_segment_filter({SegL1, [], [], []}); -generate_segment_filter([SegL1, SegL2]) -> - generate_segment_filter({SegL1, SegL2, [], []}); -generate_segment_filter([SegL1, SegL2, SegL3]) -> - generate_segment_filter({SegL1, SegL2, SegL3, []}); -generate_segment_filter([SegL1, SegL2, SegL3, SegL4]) -> - generate_segment_filter({SegL1, SegL2, SegL3, SegL4}); -generate_segment_filter(SegLists) -> - generate_segment_filter(merge_seglists(SegLists), - [], - [{0, 0}, {0, 1}, {0, 2}, {0, 3}]). - -%% to generate the segment filter needs a sorted list of {Delta, Block} pairs -%% as DeltaList and a list of {TopHash, Block} pairs as TopHashes - -generate_segment_filter([], DeltaList, TopHashes) -> - {lists:reverse(DeltaList), TopHashes}; -generate_segment_filter([NextSeg|SegTail], DeltaList, TopHashes) -> - {TopHash, _} = lists:max(TopHashes), - {NextSegHash, NextSegBlock} = NextSeg, - DeltaList2 = [{NextSegHash - TopHash, NextSegBlock}|DeltaList], - TopHashes2 = lists:keyreplace(NextSegBlock, 2, TopHashes, - {NextSegHash, NextSegBlock}), - generate_segment_filter(SegTail, DeltaList2, TopHashes2). 
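A worked example of the delta encoding described above, using the ?DIVISOR of 8092 and figures that reappear in merge_seglists_test below:

%% Delta 9800 in block 2:
%%   exponent  = 9800 div 8092 = 1    -> buildexponent/1 emits the bits 10
%%   remainder = 9800 rem 8092 = 1708 -> emitted as 1708:13
%%   block     = 2                    -> emitted as 2:2
%% giving <<2:2, 1708:13, 2:2>>. A delta below 8092 costs only a single 0
%% exponent bit, e.g. delta 50 in block 1 serialises as <<0:1, 50:13, 1:2>>.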
- - -serialise_segment_filter({DeltaList, TopHashes}) -> - TopHashesBin = lists:foldl(fun({X, _}, Acc) -> - <> end, - <<>>, TopHashes), - Length = length(DeltaList), - HeaderBin = <>, - {Divisor, Factor} = {?DIVISOR, ?DIVISOR_BITS}, - F = fun({Delta, Block}, Acc) -> - Exponent = buildexponent(Delta div Divisor), - Remainder = Delta rem Divisor, - Block2Bit = Block, - <> end, - pad_binary(lists:foldl(F, HeaderBin, DeltaList)). - - -pad_binary(BitString) -> - Pad = 8 - bit_size(BitString) rem 8, - case Pad of - 8 -> BitString; - _ -> <> - end. - -buildexponent(Exponent) -> - buildexponent(Exponent, <<0:1>>). - -buildexponent(0, OutputBits) -> - OutputBits; -buildexponent(Exponent, OutputBits) -> - buildexponent(Exponent - 1, <<1:1, OutputBits/bitstring>>). - -merge_seglists({SegList1, SegList2, SegList3, SegList4}) -> - Stage1 = lists:foldl(fun(X, Acc) -> [{X, 0}|Acc] end, [], SegList1), - Stage2 = lists:foldl(fun(X, Acc) -> [{X, 1}|Acc] end, Stage1, SegList2), - Stage3 = lists:foldl(fun(X, Acc) -> [{X, 2}|Acc] end, Stage2, SegList3), - Stage4 = lists:foldl(fun(X, Acc) -> [{X, 3}|Acc] end, Stage3, SegList4), - lists:sort(Stage4). - -hash_for_segmentid(KV) -> - erlang:phash2(leveled_codec:strip_to_keyonly(KV), ?MAX_SEG_HASH). - - -%% Check for a given list of segments in the filter, returning in normal -%% operations a TupleList of {SegmentID, [ListOFBlocks]} where the ListOfBlocks -%% are the block IDs which contain keys in that given segment -%% -%% If there is a failure - perhaps due to a bit flip of some sort an error -%% willl be returned (error_so_maybe_present) and all blocks should be checked -%% as the filter cannot be relied upon - -check_for_segments(SegFilter, SegmentList, CRCCheck) -> - case CRCCheck of - true -> - <> = SegFilter, - CheckSum = [T0, T1, T2, T3], - case safecheck_for_segments(SegRem, SegmentList, - [0, 0, 0, 0], - 0, Count, []) of - {error_so_maybe_present, Reason} -> - leveled_log:log("SFT11", [Reason]), - error_so_maybe_present; - {OutputCheck, BlockList} when OutputCheck == CheckSum, - BlockList == [] -> - not_present; - {OutputCheck, BlockList} when OutputCheck == CheckSum -> - {maybe_present, BlockList}; - {OutputCheck, _} -> - leveled_log:log("SFT12", [OutputCheck, CheckSum]), - error_so_maybe_present - end; - false -> - <<_:80/bitstring, Count:16/integer, SegRem/bitstring>> = SegFilter, - case quickcheck_for_segments(SegRem, SegmentList, - lists:max(SegmentList), - 0, Count, []) of - {error_so_maybe_present, Reason} -> - leveled_log:log("SFT13", [Reason]), - error_so_maybe_present; - BlockList when BlockList == [] -> - not_present; - BlockList -> - {maybe_present, BlockList} - end - end. - - -safecheck_for_segments(_, _, TopHashes, _, 0, BlockList) -> - {TopHashes, BlockList}; -safecheck_for_segments(Filter, SegmentList, TopHs, Acc, Count, BlockList) -> - case findexponent(Filter) of - {ok, Exp, FilterRem1} -> - case findremainder(FilterRem1, ?DIVISOR_BITS) of - {ok, Remainder, BlockID, FilterRem2} -> - {NextHash, BlockList2} = checkhash_forsegments(Acc, - Exp, - Remainder, - SegmentList, - BlockList, - BlockID), - TopHashes2 = setnth(BlockID, TopHs, NextHash), - safecheck_for_segments(FilterRem2, SegmentList, - TopHashes2, - NextHash, Count - 1, - BlockList2); - error -> - {error_so_maybe_present, "Remainder Check"} - end; - error -> - {error_so_maybe_present, "Exponent Check"} - end. 
- -quickcheck_for_segments(_, _, _, _, 0, BlockList) -> - BlockList; -quickcheck_for_segments(Filter, SegmentList, MaxSeg, Acc, Count, BlockList) -> - case findexponent(Filter) of - {ok, Exp, FilterRem1} -> - case findremainder(FilterRem1, ?DIVISOR_BITS) of - {ok, Remainder, BlockID, FilterRem2} -> - {NextHash, BlockList2} = checkhash_forsegments(Acc, - Exp, - Remainder, - SegmentList, - BlockList, - BlockID), - case NextHash > MaxSeg of - true -> - BlockList2; - false -> - quickcheck_for_segments(FilterRem2, SegmentList, - MaxSeg, - NextHash, Count - 1, - BlockList2) - end; - error -> - {error_so_maybe_present, "Remainder Check"} - end; - error -> - {error_so_maybe_present, "Exponent Check"} - end. - - -checkhash_forsegments(Acc, Exp, Remainder, SegmentList, BlockList, BlockID) -> - NextHash = Acc + ?DIVISOR * Exp + Remainder, - case lists:member(NextHash, SegmentList) of - true -> - {NextHash, [BlockID|BlockList]}; - false -> - {NextHash, BlockList} - end. - - -setnth(0, [_|Rest], New) -> [New|Rest]; -setnth(I, [E|Rest], New) -> [E|setnth(I-1, Rest, New)]. - - -findexponent(BitStr) -> - findexponent(BitStr, 0). - -findexponent(<<>>, _) -> - error; -findexponent(<>, Acc) -> - case H of - 1 -> findexponent(T, Acc + 1); - 0 -> {ok, Acc, T} - end. - - -findremainder(BitStr, Factor) -> - case BitStr of - <> -> - {ok, Remainder, BlockID, Tail}; - _ -> - error - end. - - - -%%%============================================================================ -%%% Test -%%%============================================================================ - - --ifdef(TEST). - -generate_randomkeys({Count, StartSQN}) -> - generate_randomkeys(Count, StartSQN, []); -generate_randomkeys(Count) -> - generate_randomkeys(Count, 0, []). - -generate_randomkeys(0, _SQN, Acc) -> - lists:reverse(Acc); -generate_randomkeys(Count, SQN, Acc) -> - K = {o, - lists:concat(["Bucket", random:uniform(1024)]), - lists:concat(["Key", random:uniform(1024)]), - null}, - RandKey = {K, - {SQN, - {active, infinity}, - leveled_codec:magic_hash(K), - null}}, - generate_randomkeys(Count - 1, SQN + 1, [RandKey|Acc]). - -generate_sequentialkeys(Count, Start) -> - generate_sequentialkeys(Count + Start, Start, []). - -generate_sequentialkeys(Target, Incr, Acc) when Incr =:= Target -> - Acc; -generate_sequentialkeys(Target, Incr, Acc) -> - KeyStr = string:right(integer_to_list(Incr), 8, $0), - K = {o, "BucketSeq", lists:concat(["Key", KeyStr]), null}, - NextKey = {K, - {5, - {active, infinity}, - leveled_codec:magic_hash(K), - null}}, - generate_sequentialkeys(Target, Incr + 1, [NextKey|Acc]). - -simple_create_block_test() -> - KeyList1 = [{{o, "Bucket1", "Key1", null}, - {1, {active, infinity}, no_lookup, null}}, - {{o, "Bucket1", "Key3", null}, - {2, {active, infinity}, no_lookup, null}}], - KeyList2 = [{{o, "Bucket1", "Key2", null}, - {3, {active, infinity}, no_lookup, null}}], - {Status, BlockKeyList} = create_block(KeyList1, - KeyList2, - #level{level=1}), - ?assertMatch(partial, Status), - [H1|T1] = BlockKeyList, - ?assertMatch({{o, "Bucket1", "Key1", null}, - {1, {active, infinity}, no_lookup, null}}, H1), - [H2|T2] = T1, - ?assertMatch({{o, "Bucket1", "Key2", null}, - {3, {active, infinity}, no_lookup, null}}, H2), - ?assertMatch([{{o, "Bucket1", "Key3", null}, - {2, {active, infinity}, no_lookup, null}}], T2). 
- -dominate_create_block_test() -> - KeyList1 = [{{o, "Bucket1", "Key1", null}, - {1, {active, infinity}, no_lookup, null}}, - {{o, "Bucket1", "Key2", null}, - {2, {active, infinity}, no_lookup, null}}], - KeyList2 = [{{o, "Bucket1", "Key2", null}, - {3, {tomb, infinity}, no_lookup, null}}], - {Status, BlockKeyList} = create_block(KeyList1, - KeyList2, - #level{level=1}), - ?assertMatch(partial, Status), - [K1, K2] = BlockKeyList, - ?assertMatch(K1, lists:nth(1, KeyList1)), - ?assertMatch(K2, lists:nth(1, KeyList2)). - -sample_keylist() -> - KeyList1 = - [{{o, "Bucket1", "Key1", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket1", "Key3", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket1", "Key5", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket1", "Key7", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket1", "Key9", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket2", "Key1", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket2", "Key3", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket2", "Key5", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket2", "Key7", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket2", "Key9", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket3", "Key1", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket3", "Key3", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket3", "Key5", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket3", "Key7", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket3", "Key9", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket4", "Key1", null}, {1, {active, infinity}, 0, null}}], - KeyList2 = - [{{o, "Bucket1", "Key2", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket1", "Key4", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket1", "Key6", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket1", "Key8", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket1", "Key9a", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket1", "Key9b", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket1", "Key9c", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket1", "Key9d", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket2", "Key2", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket2", "Key4", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket2", "Key6", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket2", "Key8", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket3", "Key2", null}, {1, {active, infinity}, 0, null}}, - {{o, "Bucket3", "Key4", null}, {3, {active, infinity}, 0, null}}, - {{o, "Bucket3", "Key6", null}, {2, {active, infinity}, 0, null}}, - {{o, "Bucket3", "Key8", null}, {1, {active, infinity}, 0, null}}], - {KeyList1, KeyList2}. 
- -alternating_create_block_test() -> - {KeyList1, KeyList2} = sample_keylist(), - {Status, BlockKeyList} = create_block(KeyList1, - KeyList2, - #level{level=1}), - BlockSize = length(BlockKeyList), - ?assertMatch(BlockSize, 32), - ?assertMatch(all_complete, Status), - K1 = lists:nth(1, BlockKeyList), - ?assertMatch(K1, {{o, "Bucket1", "Key1", null}, {1, {active, infinity}, 0, null}}), - K11 = lists:nth(11, BlockKeyList), - ?assertMatch(K11, {{o, "Bucket1", "Key9b", null}, {1, {active, infinity}, 0, null}}), - K32 = lists:nth(32, BlockKeyList), - ?assertMatch(K32, {{o, "Bucket4", "Key1", null}, {1, {active, infinity}, 0, null}}), - HKey = {{o, "Bucket1", "Key0", null}, {1, {active, infinity}, 0, null}}, - {Status2, _} = create_block([HKey|KeyList1], KeyList2, #level{level=1}), - ?assertMatch(block_full, element(1, Status2)). - - -merge_seglists_test() -> - SegList1 = [0, 100, 200], - SegList2 = [50, 200], - SegList3 = [75, 10000], - SegList4 = [], - MergedList = merge_seglists({SegList1, SegList2, - SegList3, SegList4}), - ?assertMatch(MergedList, [{0, 0}, {50, 1}, {75, 2}, {100, 0}, - {200, 0}, {200,1}, {10000,2}]), - SegTerm = generate_segment_filter({SegList1, SegList2, - SegList3, SegList4}), - ?assertMatch(SegTerm, {[{0, 0}, {50, 1}, {25, 2}, {25, 0}, - {100, 0}, {0, 1}, {9800, 2}], - [{200, 0}, {200, 1}, {10000, 2},{0, 3}]}), - SegBin = serialise_segment_filter(SegTerm), - ExpectedTopHashes = <<200:20, 200:20, 10000:20, 0:20>>, - ExpectedDeltas = <<0:1, 0:13, 0:2, - 0:1, 50:13, 1:2, - 0:1, 25:13, 2:2, - 0:1, 25:13, 0:2, - 0:1, 100:13, 0:2, - 0:1, 0:13, 1:2, - 2:2, 1708:13, 2:2>>, - ExpectedResult = <>, - ?assertMatch(SegBin, ExpectedResult), - R1 = check_for_segments(SegBin, [100], true), - ?assertMatch(R1,{maybe_present, [0]}), - R2 = check_for_segments(SegBin, [900], true), - ?assertMatch(R2, not_present), - R3 = check_for_segments(SegBin, [200], true), - ?assertMatch(R3, {maybe_present, [1,0]}), - R4 = check_for_segments(SegBin, [0,900], true), - ?assertMatch(R4, {maybe_present, [0]}), - R5 = check_for_segments(SegBin, [100], false), - ?assertMatch(R5, {maybe_present, [0]}), - R6 = check_for_segments(SegBin, [900], false), - ?assertMatch(R6, not_present), - R7 = check_for_segments(SegBin, [200], false), - ?assertMatch(R7, {maybe_present, [1,0]}), - R8 = check_for_segments(SegBin, [0,900], false), - ?assertMatch(R8, {maybe_present, [0]}), - R9 = check_for_segments(SegBin, [1024*1024 - 1], false), - ?assertMatch(R9, not_present), - io:format("Try corrupted bloom filter with flipped bit in " ++ - "penultimate delta~n"), - ExpectedDeltasFlippedBit = <<0:1, 0:13, 0:2, - 0:1, 50:13, 1:2, - 0:1, 25:13, 2:2, - 0:1, 25:13, 0:2, - 0:1, 100:13, 0:2, - 0:1, 0:13, 1:2, - 2:2, 1709:13, 2:2>>, - SegBin1 = <>, - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin1, [900], true)), - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin1, [200], true)), - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin1, [0,900], true)), - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin1, [1024*1024 - 1], true)), - % This match is before the flipped bit, so still works without CRC check - ?assertMatch({maybe_present, [0]}, - check_for_segments(SegBin1, [0,900], false)), - io:format("Try corrupted bloom filter with flipped bit in " ++ - "final block's top hash~n"), - ExpectedTopHashesFlippedBit = <<200:20, 200:20, 10000:20, 1:20>>, - SegBin2 = <>, - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin2, [900], true)), - 
?assertMatch(error_so_maybe_present, - check_for_segments(SegBin2, [200], true)), - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin2, [0,900], true)), - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin2, [1024*1024 - 1], true)), - % This match is before the flipped bit, so still works without CRC check - ?assertMatch({maybe_present, [0]}, - check_for_segments(SegBin2, [0,900], false)), - - ExpectedDeltasAll1s = <<4294967295:32/integer>>, - SegBin3 = <>, - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin3, [900], true)), - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin3, [200], true)), - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin3, [0,900], true)), - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin3, [1024*1024 - 1], true)), - % This is so badly mangled, the error gets detected event without CRC - % checking being enforced - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin3, [900], false)), - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin3, [200], false)), - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin3, [0,900], false)), - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin3, [1024*1024 - 1], false)), - - ExpectedDeltasNearlyAll1s = <<4294967287:32/integer>>, - SegBin4 = <>, - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin4, [900], true)), - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin4, [200], true)), - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin4, [0,900], true)), - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin4, [1024*1024 - 1], true)), - % This is so badly mangled, the error gets detected event without CRC - % checking being enforced - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin4, [900], false)), - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin4, [200], false)), - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin4, [0,900], false)), - ?assertMatch(error_so_maybe_present, - check_for_segments(SegBin4, [1024*1024 - 1], false)). - -createslot_stage1_test() -> - {KeyList1, KeyList2} = sample_keylist(), - {Status, BlockKeyLists} = create_slot(KeyList1, KeyList2, #level{level=1}), - WState = finalise_slot(BlockKeyLists, #writer{}), - - ?assertMatch({o, "Bucket4", "Key1", null}, WState#writer.last_key), - ?assertMatch(partial, Status), - - %% Writer state has the SlotIndex which includes the segment filter - SegFilter = element(2, lists:nth(1, WState#writer.slot_index)), - - R0 = check_for_segments(serialise_segment_filter(SegFilter), - [hash_for_segmentid({keyonly, {o, "Bucket1", "Key1", null}})], - true), - ?assertMatch({maybe_present, [0]}, R0), - R1 = check_for_segments(serialise_segment_filter(SegFilter), - [hash_for_segmentid({keyonly, {o, "Bucket1", "Key99", null}})], - true), - ?assertMatch(not_present, R1), - ?assertMatch(1, WState#writer.min_sqn), - ?assertMatch(3, WState#writer.max_sqn). - - -createslot_stage2_test() -> - {Status, BlockKeyLists} = create_slot(lists:sort(generate_randomkeys(100)), - lists:sort(generate_randomkeys(100)), - #level{level=1}), - WState = finalise_slot(BlockKeyLists, #writer{}), - LengthList = element(3, lists:nth(1, WState#writer.slot_index)), - - ?assertMatch(full, element(1, Status)), - Sum1 = lists:sum(LengthList), - Sum2 = byte_size(WState#writer.slot_binary), - ?assertMatch(Sum1, Sum2). 
- - -createslot_stage3_test() -> - {Status, BlockKeyLists} = create_slot(lists:sort(generate_sequentialkeys(100, 1)), - lists:sort(generate_sequentialkeys(100, 101)), - #level{level=1}), - WState = finalise_slot(BlockKeyLists, #writer{}), - {FirstKey, SegFilter, LengthList} = lists:nth(1, WState#writer.slot_index), - - ?assertMatch(full, element(1, Status)), - Sum1 = lists:sum(LengthList), - Sum2 = byte_size(WState#writer.slot_binary), - ?assertMatch(Sum1, Sum2), - ?assertMatch({o, "BucketSeq", "Key00000001", null}, FirstKey), - ?assertMatch({o, "BucketSeq", "Key00000128", null}, WState#writer.last_key), - ?assertMatch([], element(2, Status)), - Rem = length(element(3, Status)), - ?assertMatch(Rem, 72), - R0 = check_for_segments(serialise_segment_filter(SegFilter), - [hash_for_segmentid({keyonly, - {o, "BucketSeq", "Key00000100", null}})], - true), - ?assertMatch({maybe_present, [3]}, R0), - R1 = check_for_segments(serialise_segment_filter(SegFilter), - [hash_for_segmentid({keyonly, - {o, "Bucket1", "Key99", null}})], - true), - ?assertMatch(not_present, R1), - R2 = check_for_segments(serialise_segment_filter(SegFilter), - [hash_for_segmentid({keyonly, - {o, "BucketSeq", "Key00000040", null}})], - true), - ?assertMatch({maybe_present, [1]}, R2), - R3 = check_for_segments(serialise_segment_filter(SegFilter), - [hash_for_segmentid({keyonly, - {o, "BucketSeq", "Key00000004", null}})], - true), - ?assertMatch({maybe_present, [0]}, R3). - - -initial_create_header_test() -> - Output = create_header(initial), - ?assertMatch(?HEADER_LEN, byte_size(Output)). - -initial_create_file_test() -> - Filename = "../test/test1.sft", - {KL1, KL2} = sample_keylist(), - {Handle, FileMD} = create_file(Filename), - {UpdHandle, UpdFileMD, {[], []}} = complete_file(Handle, FileMD, - KL1, KL2, - #level{level=1}), - - io:format("Slot Index of UpdFileMD ~w~n", [UpdFileMD#state.slot_index]), - Result1 = fetch_keyvalue(UpdHandle, UpdFileMD, {o, "Bucket1", "Key8", null}), - ?assertMatch({{o, "Bucket1", "Key8", null}, - {1, {active, infinity}, 0, null}}, Result1), - Result2 = fetch_keyvalue(UpdHandle, UpdFileMD, {o, "Bucket1", "Key88", null}), - ?assertMatch(not_present, Result2), - ok = file:close(UpdHandle), - ok = file:delete(Filename). - -big_create_file_test() -> - Filename = "../test/bigtest1.sft", - {KL1, KL2} = {lists:sort(generate_randomkeys(2000)), - lists:sort(generate_randomkeys(40000))}, - {InitHandle, InitFileMD} = create_file(Filename), - {Handle, FileMD, {_KL1Rem, _KL2Rem}} = complete_file(InitHandle, - InitFileMD, - KL1, KL2, - #level{level=1}), - [{K1, {Sq1, St1, MH1, V1}}|_] = KL1, - [{K2, {Sq2, St2, MH2, V2}}|_] = KL2, - Result1 = fetch_keyvalue(Handle, FileMD, K1), - Result2 = fetch_keyvalue(Handle, FileMD, K2), - ?assertMatch({K1, {Sq1, St1, MH1, V1}}, Result1), - ?assertMatch({K2, {Sq2, St2, MH2, V2}}, Result2), - SubList = lists:sublist(KL2, 1000), - lists:foreach(fun(KV) -> - {Kn, _} = KV, - Rn = fetch_keyvalue(Handle, FileMD, Kn), - ?assertMatch({Kn, _}, Rn) - end, - SubList), - Result3 = fetch_keyvalue(Handle, - FileMD, - {o, "Bucket1024", "Key1024Alt", null}), - ?assertMatch(Result3, not_present), - ok = file:close(Handle), - ok = file:delete(Filename). 
- -initial_iterator_test() -> - Filename = "../test/test2.sft", - {KL1, KL2} = sample_keylist(), - {Handle, FileMD} = create_file(Filename), - {UpdHandle, UpdFileMD, {[], []}} = complete_file(Handle, FileMD, - KL1, KL2, - #level{level=1}), - Result1 = fetch_range_keysonly(UpdHandle, UpdFileMD, - {o, "Bucket1", "Key8", null}, - {o, "Bucket1", "Key9d", null}), - io:format("Result returned of ~w~n", [Result1]), - ?assertMatch({complete, - [{{o, "Bucket1", "Key8", null}, 1, {active, infinity}}, - {{o, "Bucket1", "Key9", null}, 1, {active, infinity}}, - {{o, "Bucket1", "Key9a", null}, 1, {active, infinity}}, - {{o, "Bucket1", "Key9b", null}, 1, {active, infinity}}, - {{o, "Bucket1", "Key9c", null}, 1, {active, infinity}}, - {{o, "Bucket1", "Key9d", null}, 1, {active, infinity}} - ]}, - Result1), - Result2 = fetch_range_keysonly(UpdHandle, UpdFileMD, - {o, "Bucket1", "Key8", null}, - {o, "Bucket1", "Key9b", null}), - ?assertMatch({complete, - [{{o, "Bucket1", "Key8", null}, 1, {active, infinity}}, - {{o, "Bucket1", "Key9", null}, 1, {active, infinity}}, - {{o, "Bucket1", "Key9a", null}, 1, {active, infinity}}, - {{o, "Bucket1", "Key9b", null}, 1, {active, infinity}} - ]}, - Result2), - Result3 = fetch_range_keysonly(UpdHandle, UpdFileMD, - {o, "Bucket3", "Key4", null}, - all), - {partial, RL3, _} = Result3, - ?assertMatch([{{o, "Bucket3", "Key4", null}, 3, {active, infinity}}, - {{o, "Bucket3", "Key5", null}, 1, {active, infinity}}, - {{o, "Bucket3", "Key6", null}, 2, {active, infinity}}, - {{o, "Bucket3", "Key7", null}, 1, {active, infinity}}, - {{o, "Bucket3", "Key8", null}, 1, {active, infinity}}, - {{o, "Bucket3", "Key9", null}, 1, {active, infinity}}, - {{o, "Bucket4", "Key1", null}, 1, {active, infinity}}], - RL3), - ok = file:close(UpdHandle), - ok = file:delete(Filename). 
- -key_dominates_test() -> - KV1 = {{o, "Bucket", "Key1", null}, {5, {active, infinity}, 0, []}}, - KV2 = {{o, "Bucket", "Key3", null}, {6, {active, infinity}, 0, []}}, - KV3 = {{o, "Bucket", "Key2", null}, {3, {active, infinity}, 0, []}}, - KV4 = {{o, "Bucket", "Key4", null}, {7, {active, infinity}, 0, []}}, - KV5 = {{o, "Bucket", "Key1", null}, {4, {active, infinity}, 0, []}}, - KV6 = {{o, "Bucket", "Key1", null}, {99, {tomb, 999}, 0, []}}, - KV7 = {{o, "Bucket", "Key1", null}, {99, tomb, 0, []}}, - KL1 = [KV1, KV2], - KL2 = [KV3, KV4], - ?assertMatch({{next_key, KV1}, [KV2], KL2}, - key_dominates(KL1, KL2, {undefined, 1})), - ?assertMatch({{next_key, KV1}, KL2, [KV2]}, - key_dominates(KL2, KL1, {undefined, 1})), - ?assertMatch({skipped_key, KL2, KL1}, - key_dominates([KV5|KL2], KL1, {undefined, 1})), - ?assertMatch({{next_key, KV1}, [KV2], []}, - key_dominates(KL1, [], {undefined, 1})), - ?assertMatch({skipped_key, [KV6|KL2], [KV2]}, - key_dominates([KV6|KL2], KL1, {undefined, 1})), - ?assertMatch({{next_key, KV6}, KL2, [KV2]}, - key_dominates([KV6|KL2], [KV2], {undefined, 1})), - ?assertMatch({skipped_key, [KV6|KL2], [KV2]}, - key_dominates([KV6|KL2], KL1, {true, 1})), - ?assertMatch({skipped_key, [KV6|KL2], [KV2]}, - key_dominates([KV6|KL2], KL1, {true, 1000})), - ?assertMatch({{next_key, KV6}, KL2, [KV2]}, - key_dominates([KV6|KL2], [KV2], {true, 1})), - ?assertMatch({skipped_key, KL2, [KV2]}, - key_dominates([KV6|KL2], [KV2], {true, 1000})), - ?assertMatch({skipped_key, [], []}, - key_dominates([KV6], [], {true, 1000})), - ?assertMatch({skipped_key, [], []}, - key_dominates([], [KV6], {true, 1000})), - ?assertMatch({{next_key, KV6}, [], []}, - key_dominates([KV6], [], {true, 1})), - ?assertMatch({{next_key, KV6}, [], []}, - key_dominates([], [KV6], {true, 1})), - ?assertMatch({skipped_key, [], []}, - key_dominates([KV7], [], {true, 1})), - ?assertMatch({skipped_key, [], []}, - key_dominates([], [KV7], {true, 1})), - ?assertMatch({skipped_key, [KV7|KL2], [KV2]}, - key_dominates([KV7|KL2], KL1, {undefined, 1})), - ?assertMatch({{next_key, KV7}, KL2, [KV2]}, - key_dominates([KV7|KL2], [KV2], {undefined, 1})), - ?assertMatch({skipped_key, [KV7|KL2], [KV2]}, - key_dominates([KV7|KL2], KL1, {true, 1})), - ?assertMatch({skipped_key, KL2, [KV2]}, - key_dominates([KV7|KL2], [KV2], {true, 1})). - - -corrupted_sft_test() -> - Filename = "../test/bigcorrupttest1.sft", - {KL1, KL2} = {lists:ukeysort(1, generate_randomkeys(2000)), []}, - {InitHandle, InitFileMD} = create_file(Filename), - {Handle, _FileMD, _Rems} = complete_file(InitHandle, - InitFileMD, - KL1, KL2, - #level{level=1}), - {ok, Lengths} = file:pread(Handle, 12, 12), - <> = Lengths, - ok = file:close(Handle), - - {ok, Corrupter} = file:open(Filename , [binary, raw, read, write]), - lists:foreach(fun(X) -> - case X * 5 of - Y when Y < FilterLength -> - Position = ?HEADER_LEN + X * 5 - + BlocksLength + IndexLength, - file:pwrite(Corrupter, - Position, - <<0:8/integer>>) - end - end, - lists:seq(1, 100)), - ok = file:close(Corrupter), - - {ok, SFTr, _KeyExtremes} = sft_open(Filename), - lists:foreach(fun({K, V}) -> - ?assertMatch({K, V}, sft_get(SFTr, K)) - end, - KL1), - ok = sft_clear(SFTr). 
- -big_iterator_test() -> - Filename = "../test/bigtest1.sft", - {KL1, KL2} = {lists:sort(generate_randomkeys(10000)), []}, - {InitHandle, InitFileMD} = create_file(Filename), - {Handle, FileMD, {KL1Rem, KL2Rem}} = complete_file(InitHandle, InitFileMD, - KL1, KL2, - #level{level=1}), - io:format("Remainder lengths are ~w and ~w ~n", [length(KL1Rem), - length(KL2Rem)]), - {complete, - Result1} = fetch_range_keysonly(Handle, - FileMD, - {o, "Bucket0000", "Key0000", null}, - {o, "Bucket9999", "Key9999", null}, - 256), - NumAddedKeys = 10000 - length(KL1Rem), - ?assertMatch(NumAddedKeys, length(Result1)), - {partial, - Result2, - _} = fetch_range_keysonly(Handle, - FileMD, - {o, "Bucket0000", "Key0000", null}, - {o, "Bucket9999", "Key9999", null}, - 32), - ?assertMatch(32 * 128, length(Result2)), - {partial, - Result3, - _} = fetch_range_keysonly(Handle, - FileMD, - {o, "Bucket0000", "Key0000", null}, - {o, "Bucket9999", "Key9999", null}, - 4), - ?assertMatch(4 * 128, length(Result3)), - ok = file:close(Handle), - ok = file:delete(Filename). - -hashclash_test() -> - Filename = "../test/hashclash.sft", - Key1 = {o, "Bucket", "Key838068", null}, - Key99 = {o, "Bucket", "Key898982", null}, - KeyNF = {o, "Bucket", "Key539122", null}, - ?assertMatch(4, hash_for_segmentid({keyonly, Key1})), - ?assertMatch(4, hash_for_segmentid({keyonly, Key99})), - ?assertMatch(4, hash_for_segmentid({keyonly, KeyNF})), - KeyList = lists:foldl(fun(X, Acc) -> - Key = {o, - "Bucket", - "Key8400" ++ integer_to_list(X), - null}, - Value = {X, - {active, infinity}, - leveled_codec:magic_hash(Key), - null}, - Acc ++ [{Key, Value}] end, - [], - lists:seq(10,98)), - KeyListToUse = [{Key1, - {1, - {active, infinity}, - leveled_codec:magic_hash(Key1), - null}}|KeyList] - ++ [{Key99, - {99, - {active, infinity}, - leveled_codec:magic_hash(Key99), - null}}], - {InitHandle, InitFileMD} = create_file(Filename), - {Handle, _FileMD, _Rem} = complete_file(InitHandle, InitFileMD, - KeyListToUse, [], - #level{level=1}), - ok = file:close(Handle), - {ok, SFTr, _KeyExtremes} = sft_open(Filename), - ?assertMatch({Key1, - {1, {active, infinity}, _, null}}, - sft_get(SFTr, Key1)), - ?assertMatch({Key99, - {99, {active, infinity}, _, null}}, - sft_get(SFTr, Key99)), - ?assertMatch(not_present, - sft_get(SFTr, KeyNF)), - - ok = sft_clear(SFTr). - -filename_test() -> - FN1 = "../tmp/filename", - FN2 = "../tmp/filename.pnd", - FN3 = "../tmp/subdir/file_name.pend", - ?assertMatch({"../tmp/filename.pnd", "../tmp/filename.sft"}, - generate_filenames(FN1)), - ?assertMatch({"../tmp/filename.pnd", "../tmp/filename.sft"}, - generate_filenames(FN2)), - ?assertMatch({"../tmp/subdir/file_name.pnd", - "../tmp/subdir/file_name.sft"}, - generate_filenames(FN3)). - -empty_file_test() -> - {ok, Pid, _Reply} = sft_new("../test/emptyfile.pnd", [], [], 1), - ?assertMatch(not_present, sft_get(Pid, "Key1")), - ?assertMatch([], sft_getkvrange(Pid, all, all, 16)), - ok = sft_clear(Pid). - - -nonsense_coverage_test() -> - {ok, Pid} = gen_fsm:start(?MODULE, [], []), - undefined = gen_fsm:sync_send_all_state_event(Pid, nonsense), - ok = gen_fsm:send_all_state_event(Pid, nonsense), - ?assertMatch({next_state, reader, #state{}}, handle_info(nonsense, - reader, - #state{})), - ?assertMatch({ok, reader, #state{}}, code_change(nonsense, - reader, - #state{}, - nonsense)). - --endif. 
\ No newline at end of file diff --git a/src/leveled_skiplist.erl b/src/leveled_skiplist.erl index 7fcc81a..a5c3414 100644 --- a/src/leveled_skiplist.erl +++ b/src/leveled_skiplist.erl @@ -262,79 +262,60 @@ to_list(SkipList, Level) -> [], SkipList). -to_range(SkipList, Start, End, 1) -> - R = lists:foldl(fun({Mark, SL}, {PassedStart, PassedEnd, Acc, PrevList}) -> - - case {PassedStart, PassedEnd} of - {true, true} -> - {true, true, Acc, null}; - {false, false} -> - case Start > Mark of - true -> - {false, false, Acc, SL}; - false -> - RHS = splitlist_start(Start, PrevList ++ SL), - case leveled_codec:endkey_passed(End, Mark) of - true -> - EL = splitlist_end(End, RHS), - {true, true, EL, null}; - false -> - {true, false, RHS, null} - end - end; - {true, false} -> - case leveled_codec:endkey_passed(End, Mark) of - true -> - EL = splitlist_end(End, SL), - {true, true, Acc ++ EL, null}; - false -> - {true, false, Acc ++ SL, null} - end - end end, - - {false, false, [], []}, - SkipList), - {_Bool1, _Bool2, SubList, _PrevList} = R, - SubList; -to_range(SkipList, Start, End, Level) -> - R = lists:foldl(fun({Mark, SL}, {PassedStart, PassedEnd, Acc, PrevList}) -> - - case {PassedStart, PassedEnd} of - {true, true} -> - {true, true, Acc, null}; - {false, false} -> - case Start > Mark of - true -> - {false, false, Acc, SL}; - false -> - SkipLRange = to_range(PrevList, - Start, End, - Level - 1) ++ - to_range(SL, - Start, End, - Level - 1), - case leveled_codec:endkey_passed(End, Mark) of - true -> - {true, true, SkipLRange, null}; - false -> - {true, false, SkipLRange, null} - end - end; - {true, false} -> - SkipLRange = to_range(SL, Start, End, Level - 1), - case leveled_codec:endkey_passed(End, Mark) of - true -> - {true, true, Acc ++ SkipLRange, null}; - false -> - {true, false, Acc ++ SkipLRange, null} - end - end end, - - {false, false, [], []}, - SkipList), - {_Bool1, _Bool2, SubList, _PrevList} = R, - SubList. +to_range(SkipList, StartKey, EndKey, ListHeight) -> + to_range(SkipList, StartKey, EndKey, ListHeight, [], true). + +to_range(SkipList, StartKey, EndKey, ListHeight, Acc, StartIncl) -> + SL = sublist_above(SkipList, StartKey, ListHeight, StartIncl), + case SL of + [] -> + Acc; + _ -> + {LK, _LV} = lists:last(SL), + case leveled_codec:endkey_passed(EndKey, LK) of + false -> + to_range(SkipList, + LK, + EndKey, + ListHeight, + Acc ++ SL, + false); + true -> + SplitFun = + fun({K, _V}) -> + not leveled_codec:endkey_passed(EndKey, K) end, + LHS = lists:takewhile(SplitFun, SL), + Acc ++ LHS + end + end. + +sublist_above(SkipList, StartKey, 0, StartIncl) -> + TestFun = + fun({K, _V}) -> + case StartIncl of + true -> + K < StartKey; + false -> + K =< StartKey + end end, + lists:dropwhile(TestFun, SkipList); +sublist_above(SkipList, StartKey, Level, StartIncl) -> + TestFun = + fun({K, _SL}) -> + case StartIncl of + true -> + K < StartKey; + false -> + K =< StartKey + end end, + RHS = lists:dropwhile(TestFun, SkipList), + case RHS of + [] -> + []; + [{_K, SL}|_Rest] -> + sublist_above(SL, StartKey, Level - 1, StartIncl) + end. empty(SkipList, 1) -> [{?INFINITY_KEY, SkipList}]; @@ -385,17 +366,6 @@ get_sublist(Key, SkipList) -> null, SkipList). -splitlist_start(StartKey, SL) -> - {_LHS, RHS} = lists:splitwith(fun({K, _V}) -> K < StartKey end, SL), - RHS. - -splitlist_end(EndKey, SL) -> - {LHS, _RHS} = lists:splitwith(fun({K, _V}) -> - not leveled_codec:endkey_passed(EndKey, K) - end, - SL), - LHS. 
-
 %%%============================================================================
 %%% Test
 %%%============================================================================
@@ -645,6 +615,33 @@ skiplist_nolookup_test() ->
                   KL),
     ?assertMatch(KLSorted, to_list(SkipList)).
 
+skiplist_range_test() ->
+    N = 150,
+    KL = generate_randomkeys(1, N, 1, N div 5),
+
+    KLSL1 = lists:sublist(lists:ukeysort(1, KL), 128),
+    SkipList1 = from_list(KLSL1),
+    {LastK1, V1} = lists:last(KLSL1),
+    R1 = to_range(SkipList1, LastK1, LastK1),
+    ?assertMatch([{LastK1, V1}], R1),
+
+    KLSL2 = lists:sublist(lists:ukeysort(1, KL), 127),
+    SkipList2 = from_list(KLSL2),
+    {LastK2, V2} = lists:last(KLSL2),
+    R2 = to_range(SkipList2, LastK2, LastK2),
+    ?assertMatch([{LastK2, V2}], R2),
+
+    KLSL3 = lists:sublist(lists:ukeysort(1, KL), 129),
+    SkipList3 = from_list(KLSL3),
+    {LastK3, V3} = lists:last(KLSL3),
+    R3 = to_range(SkipList3, LastK3, LastK3),
+    ?assertMatch([{LastK3, V3}], R3),
+
+    {FirstK4, V4} = lists:nth(1, KLSL3),
+    R4 = to_range(SkipList3, FirstK4, FirstK4),
+    ?assertMatch([{FirstK4, V4}], R4).
+
+
 empty_skiplist_size_test() ->
     ?assertMatch(0, leveled_skiplist:size(empty(false))),
     ?assertMatch(0, leveled_skiplist:size(empty(true))).
diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl
new file mode 100644
index 0000000..759b5cb
--- /dev/null
+++ b/src/leveled_sst.erl
@@ -0,0 +1,1691 @@
+%% -------- SST (Variant) ---------
+%%
+%% An FSM module intended to wrap a persisted, ordered view of Keys and Values
+%%
+%% The persisted view is built from a list (which may be created by merging
+%% multiple lists).  The list is built first, then the view is created in bulk.
+%%
+%% -------- Slots ---------
+%%
+%% The view is built from sublists referred to as slots.  Each slot is up to
+%% 128 keys and values in size.  Three strategies have been benchmarked for
+%% the slot: a skiplist, a gb-tree, and four blocks of flat lists with an
+%% index.
+%%
+%% Skiplist:
+%% build and serialise slot - 3233 microseconds
+%% de-serialise and check * 128 - 14669 microseconds
+%% flatten back to list - 164 microseconds
+%%
+%% GBTree:
+%% build and serialise tree - 1433 microseconds
+%% de-serialise and check * 128 - 15263 microseconds
+%% flatten back to list - 175 microseconds
+%%
+%% Indexed Blocks:
+%% build and serialise slot - 342 microseconds
+%% de-serialise and check * 128 - 6746 microseconds
+%% flatten back to list - 187 microseconds
+%%
+%% The negative side of using Indexed Blocks is the storage of the index.  In
+%% the original implementation this was stored on fadvised disk (the index in
+%% this case was a rice-encoded view of which block the object is in).  In
+%% this implementation it is cached in memory - requiring 2 bytes per key to
+%% be kept in memory.
+%%
+%% -------- Blooms ---------
+%%
+%% There is a summary bloom for the table.  The summary bloom is split by the
+%% first byte of the hash, and consists of two hashes (derived from the
+%% remainder of the hash).  This is the top bloom, and the size varies by
+%% level.
+%% Level 0 has 8 bits per key - 0.05 fpr
+%% Level 1 has 6 bits per key - 0.08 fpr
+%% Other Levels have 4 bits per key - 0.15 fpr
+%%
+%% With the indexed block implementation of the slot a second slot-level bloom
+%% is unnecessary (as the index itself yields a 0.003% fpr).
+%%
+%% -------- Summary ---------
+%%
+%% Each file has a summary - the keys at the top of each slot kept in a
+%% skiplist, with some basic metadata about the slot stored as the value.
+%%
+%% The summary is stored separately to the slots (within the same file).
+%%
+%% -------- CRC Checks ---------
+%%
+%% Every attempt to either read a summary or a slot off disk will also include
+%% a CRC check.  If the CRC check fails non-presence is assumed (the data
+%% within is assumed to be entirely lost).  The data can be recovered by
+%% either using a recoverable strategy in transaction log compaction, and
+%% triggering the transaction log replay; or by using a higher level form of
+%% anti-entropy (i.e. make Riak responsible).


+-module(leveled_sst).
+
+-behaviour(gen_fsm).
+
+-include("include/leveled.hrl").
+
+-define(MAX_SLOTS, 256).
+-define(SLOT_SIZE, 128). % This is not configurable
+-define(COMPRESSION_LEVEL, 1).
+-define(BINARY_SETTINGS, [{compressed, ?COMPRESSION_LEVEL}]).
+% -define(LEVEL_BLOOM_BITS, [{0, 8}, {1, 10}, {2, 8}, {default, 6}]).
+-define(MERGE_SCANWIDTH, 16).
+-define(INDEX_MARKER_WIDTH, 16).
+-define(DISCARD_EXT, ".discarded").
+-define(DELETE_TIMEOUT, 10000).
+
+-include_lib("eunit/include/eunit.hrl").
+
+-export([init/1,
+        handle_sync_event/4,
+        handle_event/3,
+        handle_info/3,
+        terminate/3,
+        code_change/4,
+        starting/2,
+        starting/3,
+        reader/3,
+        delete_pending/2,
+        delete_pending/3]).
+
+-export([sst_new/4,
+        sst_new/6,
+        sst_newlevelzero/5,
+        sst_open/1,
+        sst_get/2,
+        sst_get/3,
+        sst_getkvrange/4,
+        sst_getslots/2,
+        sst_getmaxsequencenumber/1,
+        sst_setfordelete/2,
+        sst_clear/1,
+        sst_checkready/1,
+        sst_deleteconfirmed/1,
+        sst_close/1]).
+
+-export([expand_list_by_pointer/3]).
+
+
+-record(slot_index_value, {slot_id :: integer(),
+                            start_position :: integer(),
+                            length :: integer()}).
+
+-record(summary, {first_key :: tuple(),
+                    last_key :: tuple(),
+                    index :: tuple(),
+                    size :: integer(),
+                    max_sqn :: integer()}).
+
+-record(state, {summary,
+                handle :: file:fd(),
+                sst_timings :: tuple(),
+                penciller :: pid(),
+                filename,
+                blockindex_cache}).
+
+
+%%%============================================================================
+%%% API
+%%%============================================================================
+
+sst_open(Filename) ->
+    {ok, Pid} = gen_fsm:start(?MODULE, [], []),
+    case gen_fsm:sync_send_event(Pid, {sst_open, Filename}, infinity) of
+        {ok, {SK, EK}} ->
+            {ok, Pid, {SK, EK}}
+    end.
+
+sst_new(Filename, Level, KVList, MaxSQN) ->
+    {ok, Pid} = gen_fsm:start(?MODULE, [], []),
+    case gen_fsm:sync_send_event(Pid,
+                                    {sst_new,
+                                        Filename,
+                                        Level,
+                                        KVList,
+                                        MaxSQN},
+                                    infinity) of
+        {ok, {SK, EK}} ->
+            {ok, Pid, {SK, EK}}
+    end.
+
+sst_new(Filename, KL1, KL2, IsBasement, Level, MaxSQN) ->
+    {{Rem1, Rem2}, MergedList} = merge_lists(KL1, KL2, {IsBasement, Level}),
+    case MergedList of
+        [] ->
+            empty;
+        _ ->
+            {ok, Pid} = gen_fsm:start(?MODULE, [], []),
+            case gen_fsm:sync_send_event(Pid,
+                                            {sst_new,
+                                                Filename,
+                                                Level,
+                                                MergedList,
+                                                MaxSQN},
+                                            infinity) of
+                {ok, {SK, EK}} ->
+                    {ok, Pid, {{Rem1, Rem2}, SK, EK}}
+            end
+    end.
+
+sst_newlevelzero(Filename, Slots, FetchFun, Penciller, MaxSQN) ->
+    {ok, Pid} = gen_fsm:start(?MODULE, [], []),
+    gen_fsm:send_event(Pid,
+                        {sst_newlevelzero,
+                            Filename,
+                            Slots,
+                            FetchFun,
+                            Penciller,
+                            MaxSQN}),
+    {ok, Pid, noreply}.
+
+sst_get(Pid, LedgerKey) ->
+    sst_get(Pid, LedgerKey, leveled_codec:magic_hash(LedgerKey)).
+
+sst_get(Pid, LedgerKey, Hash) ->
+    gen_fsm:sync_send_event(Pid, {get_kv, LedgerKey, Hash}, infinity).
+
+sst_getkvrange(Pid, StartKey, EndKey, ScanWidth) ->
+    gen_fsm:sync_send_event(Pid,
+                            {get_kvrange, StartKey, EndKey, ScanWidth},
+                            infinity).
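A minimal lifecycle sketch for the API above (illustrative only - the filename, KVList and MaxSQN are assumed to be supplied by a penciller-like caller, with KVList sorted and de-duplicated):

    {ok, Pid, {FirstKey, LastKey}} =
        leveled_sst:sst_new("../test/new_file", 1, KVList, MaxSQN),
    %% Point fetch returns {Key, Value} or not_present
    {FirstKey, _V} = leveled_sst:sst_get(Pid, FirstKey),
    %% Range fetch may end in {pointer, ...} tuples for lazy expansion
    KVsAndPointers = leveled_sst:sst_getkvrange(Pid, FirstKey, LastKey, 16),
    ok = leveled_sst:sst_close(Pid).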
+
+sst_getslots(Pid, SlotList) ->
+    gen_fsm:sync_send_event(Pid, {get_slots, SlotList}, infinity).
+
+sst_getmaxsequencenumber(Pid) ->
+    gen_fsm:sync_send_event(Pid, get_maxsequencenumber, infinity).
+
+sst_setfordelete(Pid, Penciller) ->
+    gen_fsm:sync_send_event(Pid, {set_for_delete, Penciller}, infinity).
+
+sst_clear(Pid) ->
+    gen_fsm:sync_send_event(Pid, {set_for_delete, false}, infinity),
+    gen_fsm:sync_send_event(Pid, close, 1000).
+
+sst_deleteconfirmed(Pid) ->
+    gen_fsm:send_event(Pid, close).
+
+sst_checkready(Pid) ->
+    %% Only used in test
+    gen_fsm:sync_send_event(Pid, background_complete, 100).
+
+
+sst_close(Pid) ->
+    gen_fsm:sync_send_event(Pid, close, 2000).
+
+%% Used in unit tests to force the printing of timings
+sst_printtimings(Pid) ->
+    gen_fsm:sync_send_event(Pid, print_timings, 1000).
+
+
+%%%============================================================================
+%%% gen_fsm callbacks
+%%%============================================================================
+
+init([]) ->
+    {ok, starting, #state{}}.
+
+starting({sst_open, Filename}, _From, State) ->
+    UpdState = read_file(Filename, State),
+    Summary = UpdState#state.summary,
+    {reply,
+        {ok, {Summary#summary.first_key, Summary#summary.last_key}},
+        reader,
+        UpdState};
+starting({sst_new, Filename, Level, KVList, MaxSQN}, _From, State) ->
+    SW = os:timestamp(),
+    {FirstKey,
+        Length,
+        SlotIndex,
+        BlockIndex,
+        SlotsBin} = build_all_slots(KVList),
+    SummaryBin = build_table_summary(SlotIndex,
+                                        Level,
+                                        FirstKey,
+                                        Length,
+                                        MaxSQN),
+    ActualFilename = write_file(Filename, SummaryBin, SlotsBin),
+    UpdState = read_file(ActualFilename, State),
+    Summary = UpdState#state.summary,
+    leveled_log:log_timer("SST08",
+                            [ActualFilename, Level, Summary#summary.max_sqn],
+                            SW),
+    {reply,
+        {ok, {Summary#summary.first_key, Summary#summary.last_key}},
+        reader,
+        UpdState#state{blockindex_cache = BlockIndex}}.
+
+starting({sst_newlevelzero, Filename, Slots, FetchFun, Penciller, MaxSQN},
+                                                                    State) ->
+    SW = os:timestamp(),
+    KVList = leveled_pmem:to_list(Slots, FetchFun),
+    {FirstKey,
+        Length,
+        SlotIndex,
+        BlockIndex,
+        SlotsBin} = build_all_slots(KVList),
+    SummaryBin = build_table_summary(SlotIndex,
+                                        0,
+                                        FirstKey,
+                                        Length,
+                                        MaxSQN),
+    ActualFilename = write_file(Filename, SummaryBin, SlotsBin),
+    UpdState = read_file(ActualFilename, State),
+    Summary = UpdState#state.summary,
+    leveled_log:log_timer("SST08",
+                            [ActualFilename, 0, Summary#summary.max_sqn],
+                            SW),
+    case Penciller of
+        undefined ->
+            {next_state, reader, UpdState#state{blockindex_cache = BlockIndex}};
+        _ ->
+            leveled_penciller:pcl_confirml0complete(Penciller,
+                                                    UpdState#state.filename,
+                                                    Summary#summary.first_key,
+                                                    Summary#summary.last_key),
+            {next_state, reader, UpdState#state{blockindex_cache = BlockIndex}}
+    end.
+ + +reader({get_kv, LedgerKey, Hash}, _From, State) -> + SW = os:timestamp(), + {Result, Stage, _SlotID, UpdState} = fetch(LedgerKey, Hash, State), + UpdTimings = leveled_log:sst_timing(State#state.sst_timings, SW, Stage), + {reply, Result, reader, UpdState#state{sst_timings = UpdTimings}}; +reader({get_kvrange, StartKey, EndKey, ScanWidth}, _From, State) -> + {reply, + fetch_range(StartKey, EndKey, ScanWidth, State), + reader, + State}; +reader({get_slots, SlotList}, _From, State) -> + SlotBins = read_slots(State#state.handle, SlotList), + FetchFun = + fun({SlotBin, SK, EK}, Acc) -> + Acc ++ binaryslot_trimmedlist(SlotBin, SK, EK) + end, + {reply, lists:foldl(FetchFun, [], SlotBins), reader, State}; +reader(get_maxsequencenumber, _From, State) -> + Summary = State#state.summary, + {reply, Summary#summary.max_sqn, reader, State}; +reader(print_timings, _From, State) -> + io:format(user, "Timings of ~w~n", [State#state.sst_timings]), + {reply, ok, reader, State#state{sst_timings = undefined}}; +reader({set_for_delete, Penciller}, _From, State) -> + leveled_log:log("SST06", [State#state.filename]), + {reply, + ok, + delete_pending, + State#state{penciller=Penciller}, + ?DELETE_TIMEOUT}; +reader(background_complete, _From, State) -> + Summary = State#state.summary, + {reply, + {ok, + State#state.filename, + Summary#summary.first_key, + Summary#summary.last_key}, + reader, + State}; +reader(close, _From, State) -> + ok = file:close(State#state.handle), + {stop, normal, ok, State}. + + +delete_pending({get_kv, LedgerKey, Hash}, _From, State) -> + {Result, _Stage, _SlotID, UpdState} = fetch(LedgerKey, Hash, State), + {reply, Result, delete_pending, UpdState, ?DELETE_TIMEOUT}; +delete_pending({get_kvrange, StartKey, EndKey, ScanWidth}, _From, State) -> + {reply, + fetch_range(StartKey, EndKey, ScanWidth, State), + delete_pending, + State, + ?DELETE_TIMEOUT}; +delete_pending({get_slots, SlotList}, _From, State) -> + SlotBins = read_slots(State#state.handle, SlotList), + FetchFun = + fun({SlotBin, SK, EK}, Acc) -> + Acc ++ binaryslot_trimmedlist(SlotBin, SK, EK) + end, + {reply, + lists:foldl(FetchFun, [], SlotBins), + delete_pending, + State, + ?DELETE_TIMEOUT}; +delete_pending(close, _From, State) -> + leveled_log:log("SST07", [State#state.filename]), + ok = file:close(State#state.handle), + ok = file:delete(State#state.filename), + {stop, normal, ok, State}. + +delete_pending(timeout, State) -> + ok = leveled_penciller:pcl_confirmdelete(State#state.penciller, + State#state.filename), + {next_state, delete_pending, State, ?DELETE_TIMEOUT}; +delete_pending(close, State) -> + leveled_log:log("SST07", [State#state.filename]), + ok = file:close(State#state.handle), + ok = file:delete(State#state.filename), + {stop, normal, State}. + +handle_sync_event(_Msg, _From, StateName, State) -> + {reply, undefined, StateName, State}. + +handle_event(_Msg, StateName, State) -> + {next_state, StateName, State}. + +handle_info(_Msg, StateName, State) -> + {next_state, StateName, State}. + +terminate(Reason, _StateName, State) -> + leveled_log:log("SST04", [Reason, State#state.filename]). + +code_change(_OldVsn, StateName, State, _Extra) -> + {ok, StateName, State}. 
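The clauses above imply a simple lifecycle for the FSM; as a sketch (derived from the handlers, not stated explicitly in the patch):

    starting --(sst_open | sst_new | sst_newlevelzero)--> reader
    reader --({set_for_delete, Penciller})--> delete_pending
    delete_pending --(timeout)--> delete_pending (re-prompts the penciller)
    delete_pending --(close)--> stop (file closed and deleted)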
+
+
+%%%============================================================================
+%%% Internal Functions
+%%%============================================================================
+
+fetch(LedgerKey, Hash, State) ->
+    Summary = State#state.summary,
+    Slot = lookup_slot(LedgerKey, Summary#summary.index),
+    SlotID = Slot#slot_index_value.slot_id,
+    CachedBlockIdx = array:get(SlotID - 1,
+                                State#state.blockindex_cache),
+    case CachedBlockIdx of
+        none ->
+            SlotBin = read_slot(State#state.handle, Slot),
+            {Result, BlockIdx} = binaryslot_get(SlotBin,
+                                                LedgerKey,
+                                                Hash,
+                                                none),
+            BlockIndexCache = array:set(SlotID - 1,
+                                        BlockIdx,
+                                        State#state.blockindex_cache),
+            {Result,
+                slot_fetch,
+                Slot#slot_index_value.slot_id,
+                State#state{blockindex_cache = BlockIndexCache}};
+        _ ->
+            PosList = find_pos(CachedBlockIdx,
+                                double_hash(Hash, LedgerKey),
+                                [],
+                                0),
+            case PosList of
+                [] ->
+                    {not_present, slot_bloom, SlotID, State};
+                _ ->
+                    SlotBin = read_slot(State#state.handle, Slot),
+                    Result = binaryslot_get(SlotBin,
+                                            LedgerKey,
+                                            Hash,
+                                            {true, PosList}),
+                    {element(1, Result), slot_fetch, SlotID, State}
+            end
+    end.
+
+
+fetch_range(StartKey, EndKey, ScanWidth, State) ->
+    Summary = State#state.summary,
+    Handle = State#state.handle,
+    {Slots, LTrim, RTrim} = lookup_slots(StartKey,
+                                            EndKey,
+                                            Summary#summary.index),
+    Self = self(),
+    SL = length(Slots),
+    ExpandedSlots =
+        case SL of
+            0 ->
+                [];
+            1 ->
+                [Slot] = Slots,
+                case {LTrim, RTrim} of
+                    {true, true} ->
+                        [{pointer, Self, Slot, StartKey, EndKey}];
+                    {true, false} ->
+                        [{pointer, Self, Slot, StartKey, all}];
+                    {false, true} ->
+                        [{pointer, Self, Slot, all, EndKey}];
+                    {false, false} ->
+                        [{pointer, Self, Slot, all, all}]
+                end;
+            N ->
+                {LSlot, MidSlots, RSlot} =
+                    case N of
+                        2 ->
+                            [Slot1, Slot2] = Slots,
+                            {Slot1, [], Slot2};
+                        N ->
+                            [Slot1|_Rest] = Slots,
+                            SlotN = lists:last(Slots),
+                            {Slot1, lists:sublist(Slots, 2, N - 2), SlotN}
+                    end,
+                MidSlotPointers = lists:map(fun(S) ->
+                                                {pointer, Self, S, all, all}
+                                            end,
+                                            MidSlots),
+                case {LTrim, RTrim} of
+                    {true, true} ->
+                        [{pointer, Self, LSlot, StartKey, all}] ++
+                            MidSlotPointers ++
+                            [{pointer, Self, RSlot, all, EndKey}];
+                    {true, false} ->
+                        [{pointer, Self, LSlot, StartKey, all}] ++
+                            MidSlotPointers ++
+                            [{pointer, Self, RSlot, all, all}];
+                    {false, true} ->
+                        [{pointer, Self, LSlot, all, all}] ++
+                            MidSlotPointers ++
+                            [{pointer, Self, RSlot, all, EndKey}];
+                    {false, false} ->
+                        [{pointer, Self, LSlot, all, all}] ++
+                            MidSlotPointers ++
+                            [{pointer, Self, RSlot, all, all}]
+                end
+        end,
+    {SlotsToFetch, SlotsToPoint} =
+        case ScanWidth of
+            SW when SW >= SL ->
+                {ExpandedSlots, []};
+            _ ->
+                lists:split(ScanWidth, ExpandedSlots)
+        end,
+
+    SlotsToFetchBinList = read_slots(Handle, SlotsToFetch),
+
+    FetchFun =
+        fun({SlotBin, SK, EK}, Acc) ->
+            Acc ++ binaryslot_trimmedlist(SlotBin, SK, EK)
+        end,
+    lists:foldl(FetchFun, [], SlotsToFetchBinList) ++ SlotsToPoint.
+
+
+write_file(Filename, SummaryBin, SlotsBin) ->
+    SummaryLength = byte_size(SummaryBin),
+    SlotsLength = byte_size(SlotsBin),
+    {PendingName, FinalName} = generate_filenames(Filename),
+    ok = file:write_file(PendingName,
+                            <<SlotsLength:32/integer,
+                                SummaryLength:32/integer,
+                                SlotsBin/binary,
+                                SummaryBin/binary>>,
+                            [raw]),
+    case filelib:is_file(FinalName) of
+        true ->
+            AltName = filename:join(filename:dirname(FinalName),
+                                    filename:basename(FinalName))
+                        ++ ?DISCARD_EXT,
+            leveled_log:log("SST05", [FinalName, AltName]),
+            ok = file:rename(FinalName, AltName);
+        false ->
+            ok
+    end,
+    file:rename(PendingName, FinalName),
+    FinalName.
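Taken together, write_file/3 above and open_reader/1 below imply the following on-disk layout (a reading of the code, not a separately documented format):

    <<SlotsLength:32/integer,    % byte 0 - total length of the slot binaries
      SummaryLength:32/integer,  % byte 4 - length of the CRC-prefixed summary
      SlotsBin/binary,           % from byte 8 - each slot carries its own CRC
      SummaryBin/binary>>        % from byte 8 + SlotsLength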
+
+read_file(Filename, State) ->
+    {Handle, SummaryBin} = open_reader(Filename),
+    {Summary, SlotList} = read_table_summary(SummaryBin),
+    SlotCount = length(SlotList),
+    BlockIndexCache = array:new([{size, SlotCount}, {default, none}]),
+    UpdState = State#state{blockindex_cache = BlockIndexCache},
+    SlotIndex = from_list(SlotList),
+    UpdSummary = Summary#summary{index = SlotIndex},
+    leveled_log:log("SST03", [Filename,
+                                Summary#summary.size,
+                                SlotCount,
+                                Summary#summary.max_sqn]),
+    UpdState#state{summary = UpdSummary,
+                    handle = Handle,
+                    filename = Filename}.
+
+open_reader(Filename) ->
+    {ok, Handle} = file:open(Filename, [binary, raw, read]),
+    {ok, Lengths} = file:pread(Handle, 0, 8),
+    <<SlotsLength:32/integer, SummaryLength:32/integer>> = Lengths,
+    {ok, SummaryBin} = file:pread(Handle, SlotsLength + 8, SummaryLength),
+    {Handle, SummaryBin}.
+
+build_table_summary(SlotList, _Level, FirstKey, L, MaxSQN) ->
+    [{LastKey, _LastV}|_Rest] = SlotList,
+    Summary = #summary{first_key = FirstKey,
+                        last_key = LastKey,
+                        size = L,
+                        max_sqn = MaxSQN},
+    SummBin = term_to_binary({Summary, lists:reverse(SlotList)},
+                                ?BINARY_SETTINGS),
+    SummCRC = erlang:crc32(SummBin),
+    <<SummCRC:32/integer, SummBin/binary>>.
+
+read_table_summary(BinWithCheck) ->
+    <<SummCRC:32/integer, SummBin/binary>> = BinWithCheck,
+    CRCCheck = erlang:crc32(SummBin),
+    if
+        CRCCheck == SummCRC ->
+            % If not, it might be possible to rebuild from all the slots
+            binary_to_term(SummBin)
+    end.
+
+build_all_slots(KVList) ->
+    L = length(KVList),
+    % The length is not a constant-time operation and the list may be long,
+    % but otherwise length would need to be called on each iteration to avoid
+    % an exception on split or sublist
+    [{FirstKey, _FirstV}|_Rest] = KVList,
+    SlotCount = L div ?SLOT_SIZE + 1,
+    BuildResponse = build_all_slots(KVList,
+                                    SlotCount,
+                                    8,
+                                    1,
+                                    [],
+                                    array:new([{size, SlotCount},
+                                                {default, none}]),
+                                    <<>>),
+    {SlotIndex, BlockIndex, SlotsBin} = BuildResponse,
+    {FirstKey, L, SlotIndex, BlockIndex, SlotsBin}.
+
+build_all_slots([], _SC, _Pos, _SlotID, SlotIdx, BlockIdxA, SlotsBin) ->
+    {SlotIdx, BlockIdxA, SlotsBin};
+build_all_slots(KVL, SC, Pos, SlotID, SlotIdx, BlockIdxA, SlotsBin) ->
+    {SlotList, KVRem} =
+        case SC of
+            1 ->
+                {lists:sublist(KVL, ?SLOT_SIZE), []};
+            _N ->
+                lists:split(?SLOT_SIZE, KVL)
+        end,
+    {LastKey, _V} = lists:last(SlotList),
+    {BlockIndex, SlotBin} = generate_binary_slot(SlotList),
+    Length = byte_size(SlotBin),
+    SlotIndexV = #slot_index_value{slot_id = SlotID,
+                                    start_position = Pos,
+                                    length = Length},
+    build_all_slots(KVRem,
+                    SC - 1,
+                    Pos + Length,
+                    SlotID + 1,
+                    [{LastKey, SlotIndexV}|SlotIdx],
+                    array:set(SlotID - 1, BlockIndex, BlockIdxA),
+                    <<SlotsBin/binary, SlotBin/binary>>).
+
+read_slot(Handle, Slot) ->
+    {ok, SlotBin} = file:pread(Handle,
+                                Slot#slot_index_value.start_position,
+                                Slot#slot_index_value.length),
+    SlotBin.
+
+read_slots(Handle, SlotList) ->
+    PointerMapFun =
+        fun(Pointer) ->
+            {Slot, SK, EK} =
+                case Pointer of
+                    {pointer, _Pid, Slot0, SK0, EK0} ->
+                        {Slot0, SK0, EK0};
+                    {pointer, Slot0, SK0, EK0} ->
+                        {Slot0, SK0, EK0}
+                end,
+
+            {Slot#slot_index_value.start_position,
+                Slot#slot_index_value.length,
+                SK,
+                EK}
+        end,
+
+    LengthList = lists:map(PointerMapFun, SlotList),
+    StartPos = element(1, lists:nth(1, LengthList)),
+    EndPos = element(1, lists:last(LengthList))
+                + element(2, lists:last(LengthList)),
+    {ok, MultiSlotBin} = file:pread(Handle, StartPos, EndPos - StartPos),
+
+    BinSplitMapFun =
+        fun({SP, L, SK, EK}) ->
+                Start = SP - StartPos,
+                <<_Pre:Start/binary,
+                    SlotBin:L/binary,
+                    _Post/binary>> = MultiSlotBin,
+                {SlotBin, SK, EK}
+        end,
+
+    lists:map(BinSplitMapFun, LengthList).
+
+generate_filenames(RootFilename) ->
+    Ext = filename:extension(RootFilename),
+    Components = filename:split(RootFilename),
+    case Ext of
+        [] ->
+            {filename:join(Components) ++ ".pnd",
+                filename:join(Components) ++ ".sst"};
+        Ext ->
+            DN = filename:dirname(RootFilename),
+            FP_NOEXT = filename:basename(RootFilename, Ext),
+            {filename:join(DN, FP_NOEXT) ++ ".pnd",
+                filename:join(DN, FP_NOEXT) ++ ".sst"}
+    end.
+
+
+%%%============================================================================
+%%% SlotIndex Implementation
+%%%============================================================================
+
+%% The Slot Index is stored as a flat (sorted) list of {Key, Slot} where Key
+%% is the last key within the slot.
+%%
+%% This implementation of the SlotIndex stores it as a tuple with the original
+%% list as the second element and a list of mark points as the first element
+%% containing every 16th key.  The mark points are stored as {Mark, Index},
+%% where the Index corresponds with the nth point in the original list at
+%% which the Mark occurs.
+
+from_list(SlotList) ->
+    L = length(SlotList),
+    MarkerList = set_marks(lists:reverse(SlotList),
+                            {?INDEX_MARKER_WIDTH, L rem ?INDEX_MARKER_WIDTH},
+                            L,
+                            []),
+    {MarkerList, SlotList}.
+
+set_marks([], _MarkInfo, 0, MarkerList) ->
+    MarkerList;
+set_marks([{Key, _Slot}|Rest], {MarkerWidth, MarkPoint}, Count, MarkerList) ->
+    case Count rem MarkerWidth of
+        MarkPoint ->
+            set_marks(Rest,
+                        {MarkerWidth, MarkPoint},
+                        Count - 1,
+                        [{Key, Count}|MarkerList]);
+        _ ->
+            set_marks(Rest,
+                        {MarkerWidth, MarkPoint},
+                        Count - 1,
+                        MarkerList)
+    end.
+
+find_mark(Key, [{Mark, Pos}|_Rest]) when Mark >= Key ->
+    Pos;
+find_mark(Key, [_H|T]) ->
+    find_mark(Key, T).
+
+lookup_slot(Key, {MarkerList, SlotList}) ->
+    Pos = find_mark(Key, MarkerList),
+    SubList = lists:sublist(SlotList, max(1, Pos - ?INDEX_MARKER_WIDTH), Pos),
+    Slot = find_mark(Key, SubList),
+    Slot.
+
+%% Returns a section from the summary index and two booleans to indicate if
+%% the first slot needs trimming, or the last slot
+lookup_slots(StartKey, EndKey, {_MarkerList, SlotList}) ->
+    SlotsOnlyFun = fun({_K, V}) -> V end,
+    {KSL, LTrim, RTrim} = lookup_slots_int(StartKey, EndKey, SlotList),
+    {lists:map(SlotsOnlyFun, KSL), LTrim, RTrim}.
+
+lookup_slots_int(all, all, SlotList) ->
+    {SlotList, false, false};
+lookup_slots_int(StartKey, all, SlotList) ->
+    LTrimFun = fun({K, _V}) -> K < StartKey end,
+    {_LDrop, RKeep0} = lists:splitwith(LTrimFun, SlotList),
+    {RKeep0, true, false};
+lookup_slots_int(StartKey, EndKey, SlotList) ->
+    {RKeep, true, false} = lookup_slots_int(StartKey, all, SlotList),
+    [LeftMost|RKeep0] = RKeep,
+    {LeftMostK, LeftMostV} = LeftMost,
+    RTrimFun = fun({K, _V}) -> not leveled_codec:endkey_passed(EndKey, K) end,
+    case leveled_codec:endkey_passed(EndKey, LeftMostK) of
+        true ->
+            {[{LeftMostK, LeftMostV}],
+                true,
+                true};
+        false ->
+            case LeftMostK of
+                EndKey ->
+                    {[{LeftMostK, LeftMostV}],
+                        true,
+                        false};
+                _ ->
+                    {LKeep, RDisc} = lists:splitwith(RTrimFun, RKeep0),
+                    case RDisc of
+                        [] ->
+                            {[LeftMost|LKeep],
+                                true,
+                                true};
+                        [{RDiscK1, RDiscV1}|_Rest] when RDiscK1 == EndKey ->
+                            {[LeftMost|LKeep] ++ [{RDiscK1, RDiscV1}],
+                                true,
+                                false};
+                        [{RDiscK1, RDiscV1}|_Rest] ->
+                            {[LeftMost|LKeep] ++ [{RDiscK1, RDiscV1}],
+                                true,
+                                true}
+                    end
+            end
+    end.
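A worked sketch of the marker structure, using a hypothetical 40-slot index with the ?INDEX_MARKER_WIDTH of 16 (integer keys stand in for ledger keys). As L rem 16 is used as the mark point, the last entry is always marked:

    SlotList = [{N, slot_meta} || N <- lists:seq(1, 40)],
    {MarkerList, SlotList} = from_list(SlotList),
    %% Marks fall where Count rem 16 == 40 rem 16, i.e. at 8, 24 and 40
    [{8, 8}, {24, 24}, {40, 40}] = MarkerList.
    %% lookup_slot/2 then scans at most ?INDEX_MARKER_WIDTH entries of the
    %% sublist ending at the matched mark, rather than the whole list.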
+
+
+%%%============================================================================
+%%% Slot Implementation
+%%%============================================================================
+
+%% Implementing a slot has gone through numerous iterations.  One of the most
+%% critical considerations has been the cost of the binary_to_term and
+%% term_to_binary calls for different sizes of slots and different data types.
+%%
+%% Microbenchmarking indicated that flat lists were the fastest.  However, the
+%% lists need scanning at query time - and so give longer lookups.  Bigger
+%% slots did better at term_to_binary time.  However term_to_binary is an
+%% often repeated task, and this is better with smaller slots.
+%%
+%% The outcome has been to divide the slot into four small blocks to minimise
+%% the binary_to_term time.  A binary index is provided for the slot for all
+%% Keys that are directly fetchable (i.e. standard keys not index keys).
+%%
+%% The division and use of a list saves about 100 microseconds per fetch when
+%% compared to using a 128-member gb:tree.
+%%
+%% The binary index is cacheable and doubles as a not_present filter, as it is
+%% based on a 15-bit hash (so 0.0039 fpr).
+
+
+generate_binary_slot(KVL) ->
+
+    HashFoldFun =
+        fun({K, V}, {PosBinAcc, NoHashCount}) ->
+
+            {_SQN, H1} = leveled_codec:strip_to_seqnhashonly({K, V}),
+            case is_integer(H1) of
+                true ->
+                    PosH1 = double_hash(H1, K),
+                    case NoHashCount of
+                        0 ->
+                            {<<1:1/integer,
+                                    PosH1:15/integer,
+                                    PosBinAcc/binary>>,
+                                0};
+                        N ->
+                            % The No Hash Count is an integer between 0 and 127
+                            % and so at read time should count NHC + 1
+                            NHC = N - 1,
+                            {<<1:1/integer,
+                                    PosH1:15/integer,
+                                    0:1/integer,
+                                    NHC:7/integer,
+                                    PosBinAcc/binary>>,
+                                0}
+                    end;
+                false ->
+                    {PosBinAcc, NoHashCount + 1}
+            end
+
+        end,
+
+    {PosBinIndex0, NHC} = lists:foldr(HashFoldFun, {<<>>, 0}, KVL),
+    PosBinIndex1 =
+        case NHC of
+            0 ->
+                PosBinIndex0;
+            _ ->
+                N = NHC - 1,
+                <<0:1/integer, N:7/integer, PosBinIndex0/binary>>
+        end,
+
+
+    {B1, B2, B3, B4} =
+        case length(KVL) of
+            L when L =< 32 ->
+                {term_to_binary(KVL, ?BINARY_SETTINGS),
+                    <<0:0>>,
+                    <<0:0>>,
+                    <<0:0>>};
+            L when L =< 64 ->
+                {KVLA_32, KVLB_32} = lists:split(32, KVL),
+                {term_to_binary(KVLA_32, ?BINARY_SETTINGS),
+                    term_to_binary(KVLB_32, ?BINARY_SETTINGS),
+                    <<0:0>>,
+                    <<0:0>>};
+            L when L =< 96 ->
+                {KVLA_32, KVLB_64} = lists:split(32, KVL),
+                {KVLB_32, KVLC_32} = lists:split(32, KVLB_64),
+                {term_to_binary(KVLA_32, ?BINARY_SETTINGS),
+                    term_to_binary(KVLB_32, ?BINARY_SETTINGS),
+                    term_to_binary(KVLC_32, ?BINARY_SETTINGS),
+                    <<0:0>>};
+            L when L =< 128 ->
+                {KVLA_32, KVLB_96} = lists:split(32, KVL),
+                {KVLB_32, KVLC_64} = lists:split(32, KVLB_96),
+                {KVLC_32, KVLD_32} = lists:split(32, KVLC_64),
+                {term_to_binary(KVLA_32, ?BINARY_SETTINGS),
+                    term_to_binary(KVLB_32, ?BINARY_SETTINGS),
+                    term_to_binary(KVLC_32, ?BINARY_SETTINGS),
+                    term_to_binary(KVLD_32, ?BINARY_SETTINGS)}
+        end,
+
+    B1P = byte_size(PosBinIndex1),
+    B1L = byte_size(B1),
+    B2L = byte_size(B2),
+    B3L = byte_size(B3),
+    B4L = byte_size(B4),
+    Lengths = <<B1P:32/integer,
+                B1L:32/integer,
+                B2L:32/integer,
+                B3L:32/integer,
+                B4L:32/integer>>,
+    SlotBin = <<Lengths/binary,
+                PosBinIndex1/binary,
+                B1/binary, B2/binary, B3/binary, B4/binary>>,
+    CRC32 = erlang:crc32(SlotBin),
+    FullBin = <<CRC32:32/integer, SlotBin/binary>>,
+
+    {PosBinIndex1, FullBin}.
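The slot binary built above, as unpacked by crc_check_slot/1 and fetch_value/4 below, is therefore laid out as:

    <<CRC32:32/integer,        % CRC of everything that follows
      B1P:32/integer,          % byte length of the position index
      B1L:32/integer, B2L:32/integer, B3L:32/integer, B4L:32/integer,
      PosBinIndex/binary,      % one 16-bit entry per directly-fetchable key
      B1/binary, B2/binary,    % four blocks, each of up to 32 KVs
      B3/binary, B4/binary>>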
+
+
+binaryslot_get(FullBin, Key, Hash, CachedPosLookup) ->
+    case crc_check_slot(FullBin) of
+        {Lengths, Rest} ->
+            B1P = element(1, Lengths),
+            case CachedPosLookup of
+                {true, PosList} ->
+                    <<_PosBinIndex:B1P/binary, Blocks/binary>> = Rest,
+                    {fetch_value(PosList, Lengths, Blocks, Key), none};
+                none ->
+                    <<PosBinIndex:B1P/binary, Blocks/binary>> = Rest,
+                    PosList = find_pos(PosBinIndex,
+                                        double_hash(Hash, Key),
+                                        [],
+                                        0),
+                    {fetch_value(PosList, Lengths, Blocks, Key), PosBinIndex}
+            end;
+        crc_wonky ->
+            {not_present, none}
+    end.
+
+binaryslot_tolist(FullBin) ->
+    BlockFetchFun =
+        fun(Length, {Acc, Bin}) ->
+            case Length of
+                0 ->
+                    {Acc, Bin};
+                _ ->
+                    <<Block:Length/binary, Rest/binary>> = Bin,
+                    {Acc ++ binary_to_term(Block), Rest}
+            end
+        end,
+
+    {Out, _Rem} =
+        case crc_check_slot(FullBin) of
+            {Lengths, RestBin} ->
+                {B1P, B1L, B2L, B3L, B4L} = Lengths,
+                <<_PosBinIndex:B1P/binary, Blocks/binary>> = RestBin,
+                lists:foldl(BlockFetchFun, {[], Blocks}, [B1L, B2L, B3L, B4L]);
+            crc_wonky ->
+                {[], <<>>}
+        end,
+    Out.
+
+
+binaryslot_trimmedlist(FullBin, all, all) ->
+    binaryslot_tolist(FullBin);
+binaryslot_trimmedlist(FullBin, StartKey, EndKey) ->
+    LTrimFun = fun({K, _V}) -> K < StartKey end,
+    RTrimFun = fun({K, _V}) -> not leveled_codec:endkey_passed(EndKey, K) end,
+    BlockFetchFun =
+        fun(Length, {Acc, Bin}) ->
+            case Length of
+                0 ->
+                    {Acc, Bin};
+                _ ->
+                    <<Block:Length/binary, Rest/binary>> = Bin,
+                    BlockList = binary_to_term(Block),
+                    {FirstKey, _FV} = lists:nth(1, BlockList),
+                    {LastKey, _LV} = lists:last(BlockList),
+                    TrimBools = trim_booleans(FirstKey, LastKey,
+                                                StartKey, EndKey),
+                    case TrimBools of
+                        {true, _, _, _} ->
+                            {Acc, Rest};
+                        {false, true, _, _} ->
+                            {Acc ++ BlockList, Rest};
+                        {false, false, true, false} ->
+                            {_LDrop, RKeep} = lists:splitwith(LTrimFun,
+                                                                BlockList),
+                            {Acc ++ RKeep, Rest};
+                        {false, false, false, true} ->
+                            {LKeep, _RDrop} = lists:splitwith(RTrimFun,
+                                                                BlockList),
+                            {Acc ++ LKeep, Rest};
+                        {false, false, true, true} ->
+                            {_LDrop, RKeep} = lists:splitwith(LTrimFun,
+                                                                BlockList),
+                            {LKeep, _RDrop} = lists:splitwith(RTrimFun, RKeep),
+                            {Acc ++ LKeep, Rest}
+                    end
+
+            end
+        end,
+
+    {Out, _Rem} =
+        case crc_check_slot(FullBin) of
+            {Lengths, RestBin} ->
+                {B1P, B1L, B2L, B3L, B4L} = Lengths,
+                <<_PosBinIndex:B1P/binary, Blocks/binary>> = RestBin,
+                lists:foldl(BlockFetchFun, {[], Blocks}, [B1L, B2L, B3L, B4L]);
+            crc_wonky ->
+                {[], <<>>}
+        end,
+    Out.
+
+
+trim_booleans(FirstKey, _LastKey, StartKey, all) ->
+    FirstKeyPassedStart = FirstKey > StartKey,
+    case FirstKeyPassedStart of
+        true ->
+            {false, true, false, false};
+        false ->
+            {false, false, true, false}
+    end;
+trim_booleans(_FirstKey, LastKey, all, EndKey) ->
+    LastKeyPassedEnd = leveled_codec:endkey_passed(EndKey, LastKey),
+    case LastKeyPassedEnd of
+        true ->
+            {false, false, false, true};
+        false ->
+            {false, true, false, false}
+    end;
+trim_booleans(FirstKey, LastKey, StartKey, EndKey) ->
+    FirstKeyPassedStart = FirstKey > StartKey,
+    PreRange = LastKey < StartKey,
+    PostRange = leveled_codec:endkey_passed(EndKey, FirstKey),
+    OutOfRange = PreRange or PostRange,
+    LastKeyPassedEnd = leveled_codec:endkey_passed(EndKey, LastKey),
+    case OutOfRange of
+        true ->
+            {true, false, false, false};
+        false ->
+            case {FirstKeyPassedStart, LastKeyPassedEnd} of
+                {true, false} ->
+                    {false, true, false, false};
+                {false, false} ->
+                    {false, false, true, false};
+                {true, true} ->
+                    {false, false, false, true};
+                {false, true} ->
+                    {false, false, true, true}
+            end
+    end.
+
+
+
+
+crc_check_slot(FullBin) ->
+    <<CRC32:32/integer, SlotBin/binary>> = FullBin,
+    case erlang:crc32(SlotBin) of
+        CRC32 ->
+            <<B1P:32/integer,
+                B1L:32/integer,
+                B2L:32/integer,
+                B3L:32/integer,
+                B4L:32/integer,
+                Rest/binary>> = SlotBin,
+            Lengths = {B1P, B1L, B2L, B3L, B4L},
+            {Lengths, Rest};
+        _ ->
+            leveled_log:log("SST09", []),
+            crc_wonky
+    end.
+
+double_hash(Hash, Key) ->
+    H2 = erlang:phash2(Key),
+    (Hash bxor H2) band 32767.
+
+fetch_value([], _Lengths, _Blocks, _Key) ->
+    not_present;
+fetch_value([Pos|Rest], Lengths, Blocks, Key) ->
+    BlockNumber = (Pos div 32) + 1,
+    BlockPos = (Pos rem 32) + 1,
+    BlockL =
+        case BlockNumber of
+            1 ->
+                B1L = element(2, Lengths),
+                <<Block:B1L/binary, _Rest/binary>> = Blocks,
+                binary_to_term(Block);
+            2 ->
+                B1L = element(2, Lengths),
+                B2L = element(3, Lengths),
+                <<_Pass:B1L/binary, Block:B2L/binary, _Rest/binary>> = Blocks,
+                binary_to_term(Block);
+            3 ->
+                PreL = element(2, Lengths) + element(3, Lengths),
+                B3L = element(4, Lengths),
+                <<_Pass:PreL/binary, Block:B3L/binary, _Rest/binary>> = Blocks,
+                binary_to_term(Block);
+            4 ->
+                {_B1P, B1L, B2L, B3L, B4L} = Lengths,
+                PreL = B1L + B2L + B3L,
+                <<_Pass:PreL/binary, Block:B4L/binary>> = Blocks,
+                binary_to_term(Block)
+        end,
+
+    {K, V} = lists:nth(BlockPos, BlockL),
+    case K of
+        Key ->
+            {K, V};
+        _ ->
+            fetch_value(Rest, Lengths, Blocks, Key)
+    end.
+
+find_pos(<<>>, _Hash, PosList, _Count) ->
+    PosList;
+find_pos(<<1:1/integer, Hash:15/integer, T/binary>>, Hash, PosList, Count) ->
+    find_pos(T, Hash, PosList ++ [Count], Count + 1);
+find_pos(<<1:1/integer, _Miss:15/integer, T/binary>>, Hash, PosList, Count) ->
+    find_pos(T, Hash, PosList, Count + 1);
+find_pos(<<0:1/integer, NHC:7/integer, T/binary>>, Hash, PosList, Count) ->
+    find_pos(T, Hash, PosList, Count + NHC + 1).
+
+
+
+%%%============================================================================
+%%% Merge Functions
+%%%============================================================================
+
+%% functions for merging two KV lists with pointers
+
+%% Compare the keys at the head of the list, and either skip that "best" key
+%% or identify it as the next key.
+%%
+%% The logic needs to change if the file is in the basement level, as keys
+%% with expired timestamps need not be written at this level
+%%
+%% The best key is considered to be the lowest key in erlang term order.  If
+%% there are matching keys then the highest sequence number must be chosen
+%% and any lower sequence numbers should be compacted out of existence
+
+merge_lists(KeyList1, KeyList2, LevelInfo) ->
+    merge_lists(KeyList1, KeyList2, LevelInfo, [], ?MAX_SLOTS * ?SLOT_SIZE).
+
+merge_lists([], [], _LevelR, MergedList, _MaxSize) ->
+    {{[], []}, lists:reverse(MergedList)};
+merge_lists(Rem1, Rem2, _LevelR, MergedList, 0) ->
+    {{Rem1, Rem2}, lists:reverse(MergedList)};
+merge_lists(KeyList1, KeyList2, {IsBasement, TS}, MergedList, MaxSize) ->
+    case key_dominates(KeyList1, KeyList2, {IsBasement, TS}) of
+        {{next_key, TopKey}, Rem1, Rem2} ->
+            merge_lists(Rem1,
+                        Rem2,
+                        {IsBasement, TS},
+                        [TopKey|MergedList],
+                        MaxSize - 1);
+        {skipped_key, Rem1, Rem2} ->
+            merge_lists(Rem1, Rem2, {IsBasement, TS}, MergedList, MaxSize)
+    end.
+
+key_dominates(KL1, KL2, Level) ->
+    key_dominates_expanded(maybe_expand_pointer(KL1),
+                            maybe_expand_pointer(KL2),
+                            Level).
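A worked sketch of the position index consumed by find_pos/4, for a hypothetical slot [K0, I1, I2, K3, K4] in which I1 and I2 are index keys with no lookup hash:

    <<1:1, Hash0:15,    % K0 is directly fetchable at position 0
      0:1, 1:7,         % skip NHC + 1 = 2 unhashed positions (I1, I2)
      1:1, Hash3:15,    % K3 at position 3
      1:1, Hash4:15>>   % K4 at position 4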
+ +key_dominates_expanded([H1|T1], [], Level) -> + case leveled_codec:maybe_reap_expiredkey(H1, Level) of + true -> + {skipped_key, T1, []}; + false -> + {{next_key, H1}, T1, []} + end; +key_dominates_expanded([], [H2|T2], Level) -> + case leveled_codec:maybe_reap_expiredkey(H2, Level) of + true -> + {skipped_key, [], T2}; + false -> + {{next_key, H2}, [], T2} + end; +key_dominates_expanded([H1|T1], [H2|T2], Level) -> + case leveled_codec:key_dominates(H1, H2) of + left_hand_first -> + case leveled_codec:maybe_reap_expiredkey(H1, Level) of + true -> + {skipped_key, T1, [H2|T2]}; + false -> + {{next_key, H1}, T1, [H2|T2]} + end; + right_hand_first -> + case leveled_codec:maybe_reap_expiredkey(H2, Level) of + true -> + {skipped_key, [H1|T1], T2}; + false -> + {{next_key, H2}, [H1|T1], T2} + end; + left_hand_dominant -> + {skipped_key, [H1|T1], T2}; + right_hand_dominant -> + {skipped_key, T1, [H2|T2]} + end. + + +%% When a list is provided it may include a pointer to gain another batch of +%% entries from the same file, or a new batch of entries from another file +%% +%% This resultant list should include the Tail of any pointers added at the +%% end of the list + +maybe_expand_pointer([]) -> + []; +maybe_expand_pointer([{pointer, SSTPid, Slot, StartKey, all}|Tail]) -> + expand_list_by_pointer({pointer, SSTPid, Slot, StartKey, all}, + Tail, + ?MERGE_SCANWIDTH); +maybe_expand_pointer([{next, SSTPid, StartKey}|Tail]) -> + expand_list_by_pointer({next, SSTPid, StartKey, all}, + Tail, + ?MERGE_SCANWIDTH); +maybe_expand_pointer(List) -> + List. + + +expand_list_by_pointer({pointer, SSTPid, Slot, StartKey, EndKey}, Tail, Width) -> + FoldFun = + fun(X, {Pointers, Remainder}) -> + case length(Pointers) of + L when L < Width -> + case X of + {pointer, SSTPid, S, SK, EK} -> + {Pointers ++ [{pointer, S, SK, EK}], Remainder}; + _ -> + {Pointers, Remainder ++ [X]} + end; + _ -> + {Pointers, Remainder ++ [X]} + end + end, + InitAcc = {[{pointer, Slot, StartKey, EndKey}], []}, + {AccPointers, AccTail} = lists:foldl(FoldFun, InitAcc, Tail), + ExpPointers = leveled_sst:sst_getslots(SSTPid, AccPointers), + lists:append(ExpPointers, AccTail); +expand_list_by_pointer({next, SSTPid, StartKey, EndKey}, Tail, Width) -> + ExpPointer = leveled_sst:sst_getkvrange(SSTPid, StartKey, EndKey, Width), + ExpPointer ++ Tail. + + + +%%%============================================================================ +%%% Test +%%%============================================================================ + +-ifdef(TEST). + +generate_randomkeys(Seqn, Count, BucketRangeLow, BucketRangeHigh) -> + generate_randomkeys(Seqn, + Count, + [], + BucketRangeLow, + BucketRangeHigh). + +generate_randomkeys(_Seqn, 0, Acc, _BucketLow, _BucketHigh) -> + Acc; +generate_randomkeys(Seqn, Count, Acc, BucketLow, BRange) -> + BRand = random:uniform(BRange), + BNumber = string:right(integer_to_list(BucketLow + BRand), 4, $0), + KNumber = string:right(integer_to_list(random:uniform(1000)), 6, $0), + LedgerKey = leveled_codec:to_ledgerkey("Bucket" ++ BNumber, + "Key" ++ KNumber, + o), + {_B, _K, KV} = leveled_codec:generate_ledgerkv(LedgerKey, + Seqn, + crypto:rand_bytes(64), + 64, + infinity), + generate_randomkeys(Seqn + 1, + Count - 1, + [KV|Acc], + BucketLow, + BRange). + + +generate_indexkeys(Count) -> + generate_indexkeys(Count, []). 
+ +generate_indexkeys(0, IndexList) -> + IndexList; +generate_indexkeys(Count, IndexList) -> + IndexSpecs = [{add, "t1_int", random:uniform(80000)}], + Changes = leveled_codec:convert_indexspecs(IndexSpecs, + "Bucket", + "Key" ++ integer_to_list(Count), + Count, + infinity), + generate_indexkeys(Count - 1, IndexList ++ Changes). + + +indexed_list_test() -> + io:format(user, "~nIndexed list timing test:~n", []), + N = 150, + KVL0 = lists:ukeysort(1, generate_randomkeys(1, N, 1, 4)), + KVL1 = lists:sublist(KVL0, 128), + + % BloomAddFun = + % fun({H, K}, {Bloom, Total, Max}) -> + % SW = os:timestamp(), + % Bloom0 = leveled_tinybloom:tiny_enter(H, K, Bloom), + % T0 = timer:now_diff(os:timestamp(), SW), + % {Bloom0, Total + T0, max(T0, Max)} + + % end, + + SW0 = os:timestamp(), + + {_PosBinIndex1, FullBin} = generate_binary_slot(KVL1), + io:format(user, + "Indexed list created slot in ~w microseconds of size ~w~n", + [timer:now_diff(os:timestamp(), SW0), byte_size(FullBin)]), + + {TestK1, TestV1} = lists:nth(20, KVL1), + MH1 = leveled_codec:magic_hash(TestK1), + {TestK2, TestV2} = lists:nth(40, KVL1), + MH2 = leveled_codec:magic_hash(TestK2), + {TestK3, TestV3} = lists:nth(60, KVL1), + MH3 = leveled_codec:magic_hash(TestK3), + {TestK4, TestV4} = lists:nth(80, KVL1), + MH4 = leveled_codec:magic_hash(TestK4), + {TestK5, TestV5} = lists:nth(100, KVL1), + MH5 = leveled_codec:magic_hash(TestK5), + + test_binary_slot(FullBin, TestK1, MH1, {TestK1, TestV1}), + test_binary_slot(FullBin, TestK2, MH2, {TestK2, TestV2}), + test_binary_slot(FullBin, TestK3, MH3, {TestK3, TestV3}), + test_binary_slot(FullBin, TestK4, MH4, {TestK4, TestV4}), + test_binary_slot(FullBin, TestK5, MH5, {TestK5, TestV5}). + + +indexed_list_mixedkeys_test() -> + KVL0 = lists:ukeysort(1, generate_randomkeys(1, 50, 1, 4)), + KVL1 = lists:sublist(KVL0, 33), + Keys = lists:ukeysort(1, generate_indexkeys(60) ++ KVL1), + + {_PosBinIndex1, FullBin} = generate_binary_slot(Keys), + + {TestK1, TestV1} = lists:nth(4, KVL1), + MH1 = leveled_codec:magic_hash(TestK1), + {TestK2, TestV2} = lists:nth(8, KVL1), + MH2 = leveled_codec:magic_hash(TestK2), + {TestK3, TestV3} = lists:nth(12, KVL1), + MH3 = leveled_codec:magic_hash(TestK3), + {TestK4, TestV4} = lists:nth(16, KVL1), + MH4 = leveled_codec:magic_hash(TestK4), + {TestK5, TestV5} = lists:nth(20, KVL1), + MH5 = leveled_codec:magic_hash(TestK5), + + test_binary_slot(FullBin, TestK1, MH1, {TestK1, TestV1}), + test_binary_slot(FullBin, TestK2, MH2, {TestK2, TestV2}), + test_binary_slot(FullBin, TestK3, MH3, {TestK3, TestV3}), + test_binary_slot(FullBin, TestK4, MH4, {TestK4, TestV4}), + test_binary_slot(FullBin, TestK5, MH5, {TestK5, TestV5}). + +indexed_list_mixedkeys2_test() -> + KVL0 = lists:ukeysort(1, generate_randomkeys(1, 50, 1, 4)), + KVL1 = lists:sublist(KVL0, 33), + IdxKeys1 = lists:ukeysort(1, generate_indexkeys(30)), + IdxKeys2 = lists:ukeysort(1, generate_indexkeys(30)), + % this isn't actually ordered correctly + Keys = IdxKeys1 ++ KVL1 ++ IdxKeys2, + {_PosBinIndex1, FullBin} = generate_binary_slot(Keys), + lists:foreach(fun({K, V}) -> + MH = leveled_codec:magic_hash(K), + test_binary_slot(FullBin, K, MH, {K, V}) + end, + KVL1). 
+
+indexed_list_allindexkeys_test() ->
+    Keys = lists:sublist(lists:ukeysort(1, generate_indexkeys(150)), 128),
+    {PosBinIndex1, FullBin} = generate_binary_slot(Keys),
+    ?assertMatch(<<127:8/integer>>, PosBinIndex1),
+    % SW = os:timestamp(),
+    BinToList = binaryslot_tolist(FullBin),
+    % io:format(user,
+    %            "Indexed list flattened in ~w microseconds ~n",
+    %            [timer:now_diff(os:timestamp(), SW)]),
+    ?assertMatch(Keys, BinToList),
+    ?assertMatch(Keys, binaryslot_trimmedlist(FullBin, all, all)).
+
+
+indexed_list_allindexkeys_trimmed_test() ->
+    Keys = lists:sublist(lists:ukeysort(1, generate_indexkeys(150)), 128),
+    {PosBinIndex1, FullBin} = generate_binary_slot(Keys),
+    ?assertMatch(<<127:8/integer>>, PosBinIndex1),
+    ?assertMatch(Keys, binaryslot_trimmedlist(FullBin,
+                                                {i,
+                                                    "Bucket",
+                                                    {"t1_int", 0},
+                                                    null},
+                                                {i,
+                                                    "Bucket",
+                                                    {"t1_int", 99999},
+                                                    null})),
+
+    {SK1, _} = lists:nth(10, Keys),
+    {EK1, _} = lists:nth(100, Keys),
+    R1 = lists:sublist(Keys, 10, 91),
+    O1 = binaryslot_trimmedlist(FullBin, SK1, EK1),
+    ?assertMatch(91, length(O1)),
+    ?assertMatch(R1, O1),
+
+    {SK2, _} = lists:nth(10, Keys),
+    {EK2, _} = lists:nth(20, Keys),
+    R2 = lists:sublist(Keys, 10, 11),
+    O2 = binaryslot_trimmedlist(FullBin, SK2, EK2),
+    ?assertMatch(11, length(O2)),
+    ?assertMatch(R2, O2),
+
+    {SK3, _} = lists:nth(127, Keys),
+    {EK3, _} = lists:nth(128, Keys),
+    R3 = lists:sublist(Keys, 127, 2),
+    O3 = binaryslot_trimmedlist(FullBin, SK3, EK3),
+    ?assertMatch(2, length(O3)),
+    ?assertMatch(R3, O3).
+
+
+indexed_list_mixedkeys_bitflip_test() ->
+    KVL0 = lists:ukeysort(1, generate_randomkeys(1, 50, 1, 4)),
+    KVL1 = lists:sublist(KVL0, 33),
+    Keys = lists:ukeysort(1, generate_indexkeys(60) ++ KVL1),
+    {_PosBinIndex1, FullBin} = generate_binary_slot(Keys),
+    L = byte_size(FullBin),
+    Byte1 = random:uniform(L) - 1,
+    <<PreB1:Byte1/binary, A:8/integer, PostByte1/binary>> = FullBin,
+    FullBin0 =
+        case A of
+            0 ->
+                <<PreB1:Byte1/binary, 255:8/integer, PostByte1/binary>>;
+            _ ->
+                <<PreB1:Byte1/binary, 0:8/integer, PostByte1/binary>>
+        end,
+
+    {TestK1, _TestV1} = lists:nth(20, KVL1),
+    MH1 = leveled_codec:magic_hash(TestK1),
+
+    test_binary_slot(FullBin0, TestK1, MH1, not_present),
+    ToList = binaryslot_tolist(FullBin0),
+    ?assertMatch([], ToList),
+
+    {SK1, _} = lists:nth(10, Keys),
+    {EK1, _} = lists:nth(50, Keys),
+    O1 = binaryslot_trimmedlist(FullBin0, SK1, EK1),
+    ?assertMatch(0, length(O1)),
+    ?assertMatch([], O1).
+
+
+
+test_binary_slot(FullBin, Key, Hash, ExpectedValue) ->
+    % SW = os:timestamp(),
+    {ReturnedValue, _} = binaryslot_get(FullBin, Key, Hash, none),
+    ?assertMatch(ExpectedValue, ReturnedValue).
+    % io:format(user, "Fetch success in ~w microseconds ~n",
+    %            [timer:now_diff(os:timestamp(), SW)]).
+ + + +merge_test() -> + N = 3000, + KVL1 = lists:ukeysort(1, generate_randomkeys(N + 1, N, 1, 20)), + KVL2 = lists:ukeysort(1, generate_randomkeys(1, N, 1, 20)), + KVL3 = lists:ukeymerge(1, KVL1, KVL2), + SW0 = os:timestamp(), + {ok, P1, {FK1, LK1}} = sst_new("../test/level1_src", 1, KVL1, 6000), + {ok, P2, {FK2, LK2}} = sst_new("../test/level2_src", 2, KVL2, 3000), + ExpFK1 = element(1, lists:nth(1, KVL1)), + ExpLK1 = element(1, lists:last(KVL1)), + ExpFK2 = element(1, lists:nth(1, KVL2)), + ExpLK2 = element(1, lists:last(KVL2)), + ?assertMatch(ExpFK1, FK1), + ?assertMatch(ExpFK2, FK2), + ?assertMatch(ExpLK1, LK1), + ?assertMatch(ExpLK2, LK2), + ML1 = [{next, P1, FK1}], + ML2 = [{next, P2, FK2}], + {ok, P3, {{Rem1, Rem2}, FK3, LK3}} = sst_new("../test/level2_merge", + ML1, + ML2, + false, + 2, + N * 2), + ?assertMatch([], Rem1), + ?assertMatch([], Rem2), + ?assertMatch(true, FK3 == min(FK1, FK2)), + ?assertMatch(true, LK3 == max(LK1, LK2)), + io:format(user, + "Created and merged two files of size ~w in ~w microseconds~n", + [N, timer:now_diff(os:timestamp(), SW0)]), + + SW1 = os:timestamp(), + lists:foreach(fun({K, V}) -> + ?assertMatch({K, V}, sst_get(P3, K)) + end, + KVL3), + io:format(user, + "Checked presence of all ~w objects in ~w microseconds~n", + [length(KVL3), timer:now_diff(os:timestamp(), SW1)]), + + ok = sst_close(P1), + ok = sst_close(P2), + ok = sst_close(P3), + ok = file:delete("../test/level1_src.sst"), + ok = file:delete("../test/level2_src.sst"), + ok = file:delete("../test/level2_merge.sst"). + + +simple_persisted_range_test() -> + Filename = "../test/simple_test", + KVList0 = generate_randomkeys(1, ?SLOT_SIZE * 16, 1, 20), + KVList1 = lists:ukeysort(1, KVList0), + [{FirstKey, _FV}|_Rest] = KVList1, + {LastKey, _LV} = lists:last(KVList1), + {ok, Pid, {FirstKey, LastKey}} = sst_new(Filename, + 1, + KVList1, + length(KVList1)), + + {o, B, K, null} = LastKey, + SK1 = {o, B, K, 0}, + EK1 = {o, B, K, 1}, + FetchListA1 = sst_getkvrange(Pid, SK1, EK1, 1), + ?assertMatch([], FetchListA1), + + SK2 = element(1, lists:nth(127, KVList1)), + SK3 = element(1, lists:nth(128, KVList1)), + SK4 = element(1, lists:nth(129, KVList1)), + SK5 = element(1, lists:nth(130, KVList1)), + + EK2 = element(1, lists:nth(255, KVList1)), + EK3 = element(1, lists:nth(256, KVList1)), + EK4 = element(1, lists:nth(257, KVList1)), + EK5 = element(1, lists:nth(258, KVList1)), + + TestFun = + fun({SK, EK}) -> + FetchList = sst_getkvrange(Pid, SK, EK, 4), + ?assertMatch(SK, element(1, lists:nth(1, FetchList))), + ?assertMatch(EK, element(1, lists:last(FetchList))) + end, + + TL2 = lists:map(fun(EK) -> {SK2, EK} end, [EK2, EK3, EK4, EK5]), + TL3 = lists:map(fun(EK) -> {SK3, EK} end, [EK2, EK3, EK4, EK5]), + TL4 = lists:map(fun(EK) -> {SK4, EK} end, [EK2, EK3, EK4, EK5]), + TL5 = lists:map(fun(EK) -> {SK5, EK} end, [EK2, EK3, EK4, EK5]), + lists:foreach(TestFun, TL2 ++ TL3 ++ TL4 ++ TL5). 
+ + + +simple_persisted_test() -> + Filename = "../test/simple_test", + KVList0 = generate_randomkeys(1, ?SLOT_SIZE * 32, 1, 20), + KVList1 = lists:ukeysort(1, KVList0), + [{FirstKey, _FV}|_Rest] = KVList1, + {LastKey, _LV} = lists:last(KVList1), + {ok, Pid, {FirstKey, LastKey}} = sst_new(Filename, + 1, + KVList1, + length(KVList1)), + SW0 = os:timestamp(), + lists:foreach(fun({K, V}) -> + ?assertMatch({K, V}, sst_get(Pid, K)) + end, + KVList1), + io:format(user, + "Checking for ~w keys (once) in file with cache hit took ~w " + ++ "microseconds~n", + [length(KVList1), timer:now_diff(os:timestamp(), SW0)]), + SW1 = os:timestamp(), + lists:foreach(fun({K, V}) -> + ?assertMatch({K, V}, sst_get(Pid, K)), + ?assertMatch({K, V}, sst_get(Pid, K)) + end, + KVList1), + io:format(user, + "Checking for ~w keys (twice) in file with cache hit took ~w " + ++ "microseconds~n", + [length(KVList1), timer:now_diff(os:timestamp(), SW1)]), + ok = sst_printtimings(Pid), + KVList2 = generate_randomkeys(1, ?SLOT_SIZE * 32, 1, 20), + MapFun = + fun({K, V}, Acc) -> + In = lists:keymember(K, 1, KVList1), + case {K > FirstKey, LastKey > K, In} of + {true, true, false} -> + [{K, leveled_codec:magic_hash(K), V}|Acc]; + _ -> + Acc + end + end, + KVList3 = lists:foldl(MapFun, [], KVList2), + SW2 = os:timestamp(), + lists:foreach(fun({K, H, _V}) -> + ?assertMatch(not_present, sst_get(Pid, K, H)) + end, + KVList3), + io:format(user, + "Checking for ~w missing keys took ~w microseconds~n", + [length(KVList3), timer:now_diff(os:timestamp(), SW2)]), + ok = sst_printtimings(Pid), + FetchList1 = sst_getkvrange(Pid, all, all, 2), + FoldFun = fun(X, Acc) -> + case X of + {pointer, P, S, SK, EK} -> + io:format("Get slot ~w with Acc at ~w~n", + [S, length(Acc)]), + Acc ++ sst_getslots(P, [{pointer, P, S, SK, EK}]); + _ -> + Acc ++ [X] + end end, + FetchedList1 = lists:foldl(FoldFun, [], FetchList1), + ?assertMatch(KVList1, FetchedList1), + + {TenthKey, _v10} = lists:nth(10, KVList1), + {Three000Key, _v300} = lists:nth(300, KVList1), + SubKVList1 = lists:sublist(KVList1, 10, 291), + SubKVList1L = length(SubKVList1), + FetchList2 = sst_getkvrange(Pid, TenthKey, Three000Key, 2), + ?assertMatch(pointer, element(1, lists:last(FetchList2))), + FetchedList2 = lists:foldl(FoldFun, [], FetchList2), + ?assertMatch(SubKVList1L, length(FetchedList2)), + ?assertMatch(SubKVList1, FetchedList2), + + {Eight000Key, _v800} = lists:nth(800, KVList1), + SubKVListA1 = lists:sublist(KVList1, 10, 791), + SubKVListA1L = length(SubKVListA1), + FetchListA2 = sst_getkvrange(Pid, TenthKey, Eight000Key, 2), + ?assertMatch(pointer, element(1, lists:last(FetchListA2))), + FetchedListA2 = lists:foldl(FoldFun, [], FetchListA2), + ?assertMatch(SubKVListA1L, length(FetchedListA2)), + ?assertMatch(SubKVListA1, FetchedListA2), + + FetchListB2 = sst_getkvrange(Pid, TenthKey, Eight000Key, 4), + ?assertMatch(pointer, element(1, lists:last(FetchListB2))), + FetchedListB2 = lists:foldl(FoldFun, [], FetchListB2), + ?assertMatch(SubKVListA1L, length(FetchedListB2)), + ?assertMatch(SubKVListA1, FetchedListB2), + + FetchListB3 = sst_getkvrange(Pid, + Eight000Key, + {o, null, null, null}, + 4), + FetchedListB3 = lists:foldl(FoldFun, [], FetchListB3), + SubKVListA3 = lists:nthtail(800 - 1, KVList1), + SubKVListA3L = length(SubKVListA3), + io:format("Length expected ~w~n", [SubKVListA3L]), + ?assertMatch(SubKVListA3L, length(FetchedListB3)), + ?assertMatch(SubKVListA3, FetchedListB3), + + io:format("Eight hundredth key ~w~n", [Eight000Key]), + FetchListB4 = sst_getkvrange(Pid, + 
Eight000Key,
+                                 Eight000Key,
+                                 4),
+    FetchedListB4 = lists:foldl(FoldFun, [], FetchListB4),
+    ?assertMatch([{Eight000Key, _v800}], FetchedListB4),
+
+    ok = sst_close(Pid),
+    ok = file:delete(Filename ++ ".sst").
+
+key_dominates_test() ->
+    KV1 = {{o, "Bucket", "Key1", null}, {5, {active, infinity}, 0, []}},
+    KV2 = {{o, "Bucket", "Key3", null}, {6, {active, infinity}, 0, []}},
+    KV3 = {{o, "Bucket", "Key2", null}, {3, {active, infinity}, 0, []}},
+    KV4 = {{o, "Bucket", "Key4", null}, {7, {active, infinity}, 0, []}},
+    KV5 = {{o, "Bucket", "Key1", null}, {4, {active, infinity}, 0, []}},
+    KV6 = {{o, "Bucket", "Key1", null}, {99, {tomb, 999}, 0, []}},
+    KV7 = {{o, "Bucket", "Key1", null}, {99, tomb, 0, []}},
+    KL1 = [KV1, KV2],
+    KL2 = [KV3, KV4],
+    ?assertMatch({{next_key, KV1}, [KV2], KL2},
+                    key_dominates(KL1, KL2, {undefined, 1})),
+    ?assertMatch({{next_key, KV1}, KL2, [KV2]},
+                    key_dominates(KL2, KL1, {undefined, 1})),
+    ?assertMatch({skipped_key, KL2, KL1},
+                    key_dominates([KV5|KL2], KL1, {undefined, 1})),
+    ?assertMatch({{next_key, KV1}, [KV2], []},
+                    key_dominates(KL1, [], {undefined, 1})),
+    ?assertMatch({skipped_key, [KV6|KL2], [KV2]},
+                    key_dominates([KV6|KL2], KL1, {undefined, 1})),
+    ?assertMatch({{next_key, KV6}, KL2, [KV2]},
+                    key_dominates([KV6|KL2], [KV2], {undefined, 1})),
+    ?assertMatch({skipped_key, [KV6|KL2], [KV2]},
+                    key_dominates([KV6|KL2], KL1, {true, 1})),
+    ?assertMatch({skipped_key, [KV6|KL2], [KV2]},
+                    key_dominates([KV6|KL2], KL1, {true, 1000})),
+    ?assertMatch({{next_key, KV6}, KL2, [KV2]},
+                    key_dominates([KV6|KL2], [KV2], {true, 1})),
+    ?assertMatch({skipped_key, KL2, [KV2]},
+                    key_dominates([KV6|KL2], [KV2], {true, 1000})),
+    ?assertMatch({skipped_key, [], []},
+                    key_dominates([KV6], [], {true, 1000})),
+    ?assertMatch({skipped_key, [], []},
+                    key_dominates([], [KV6], {true, 1000})),
+    ?assertMatch({{next_key, KV6}, [], []},
+                    key_dominates([KV6], [], {true, 1})),
+    ?assertMatch({{next_key, KV6}, [], []},
+                    key_dominates([], [KV6], {true, 1})),
+    ?assertMatch({skipped_key, [], []},
+                    key_dominates([KV7], [], {true, 1})),
+    ?assertMatch({skipped_key, [], []},
+                    key_dominates([], [KV7], {true, 1})),
+    ?assertMatch({skipped_key, [KV7|KL2], [KV2]},
+                    key_dominates([KV7|KL2], KL1, {undefined, 1})),
+    ?assertMatch({{next_key, KV7}, KL2, [KV2]},
+                    key_dominates([KV7|KL2], [KV2], {undefined, 1})),
+    ?assertMatch({skipped_key, [KV7|KL2], [KV2]},
+                    key_dominates([KV7|KL2], KL1, {true, 1})),
+    ?assertMatch({skipped_key, KL2, [KV2]},
+                    key_dominates([KV7|KL2], [KV2], {true, 1})).
+
+nonsense_coverage_test() ->
+    {ok, Pid} = gen_fsm:start(?MODULE, [], []),
+    ok = gen_fsm:send_all_state_event(Pid, nonsense),
+    ?assertMatch({next_state, reader, #state{}}, handle_info(nonsense,
+                                                                reader,
+                                                                #state{})),
+    ?assertMatch({ok, reader, #state{}}, code_change(nonsense,
+                                                        reader,
+                                                        #state{},
+                                                        nonsense)).
+
+-endif.
\ No newline at end of file
diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl
index f9212ad..2278c2a 100644
--- a/src/leveled_tinybloom.erl
+++ b/src/leveled_tinybloom.erl
@@ -2,7 +2,7 @@
 %%
 %% For sheltering relatively expensive lookups with a probabilistic check
 %%
-%% Uses multiple 256 byte blooms. Can sensibly hold up to 1000 keys per array.
+%% Uses multiple 512 byte blooms. Can sensibly hold up to 1000 keys per array.
 %% Even at 1000 keys should still offer only a 20% false positive
 %%
 %% Restricted to no more than 256 arrays - so can't handle more than 250K keys
@@ -22,13 +22,13 @@
         empty/1
         ]).
 
+
 -include_lib("eunit/include/eunit.hrl").
 %%%============================================================================
 %%% Bloom API
 %%%============================================================================
-
 empty(Width) when Width =< 256 ->
     FoldFun = fun(X, Acc) -> dict:store(X, <<0:4096>>, Acc) end,
     lists:foldl(FoldFun, dict:new(), lists:seq(0, Width - 1)).
@@ -36,26 +36,30 @@ empty(Width) when Width =< 256 ->
 enter({hash, no_lookup}, Bloom) ->
     Bloom;
 enter({hash, Hash}, Bloom) ->
-    {H0, Bit1, Bit2} = split_hash(Hash),
-    Slot = H0 rem dict:size(Bloom),
+    {Slot0, Bit1, Bit2} = split_hash(Hash),
+    Slot = Slot0 rem dict:size(Bloom),
     BitArray0 = dict:fetch(Slot, Bloom),
-    BitArray1 = lists:foldl(fun add_to_array/2,
+    FoldFun =
+        fun(Bit, Arr) -> add_to_array(Bit, Arr, 4096) end,
+    BitArray1 = lists:foldl(FoldFun,
                             BitArray0,
                             lists:usort([Bit1, Bit2])),
-    dict:store(Slot, BitArray1, Bloom);
+    dict:store(Slot, <<BitArray1/binary>>, Bloom);
 enter(Key, Bloom) ->
     Hash = leveled_codec:magic_hash(Key),
     enter({hash, Hash}, Bloom).
 
+
 check({hash, Hash}, Bloom) ->
-    {H0, Bit1, Bit2} = split_hash(Hash),
-    Slot = H0 rem dict:size(Bloom),
+    {Slot0, Bit1, Bit2} = split_hash(Hash),
+    Slot = Slot0 rem dict:size(Bloom),
     BitArray = dict:fetch(Slot, Bloom),
-    case getbit(Bit1, BitArray) of
+
+    case getbit(Bit1, BitArray, 4096) of
         <<0:1>> ->
             false;
         <<1:1>> ->
-            case getbit(Bit2, BitArray) of
+            case getbit(Bit2, BitArray, 4096) of
                 <<0:1>> ->
                     false;
                 <<1:1>> ->
@@ -66,6 +70,7 @@ check(Key, Bloom) ->
     Hash = leveled_codec:magic_hash(Key),
     check({hash, Hash}, Bloom).
 
+
 %%%============================================================================
 %%% Internal Functions
 %%%============================================================================
@@ -76,15 +81,15 @@ split_hash(Hash) ->
     H2 = Hash bsr 20,
     {H0, H1, H2}.
 
-add_to_array(Bit, BitArray) ->
-    RestLen = 4096 - Bit - 1,
+add_to_array(Bit, BitArray, ArrayLength) ->
+    RestLen = ArrayLength - Bit - 1,
     <<Head:Bit/bitstring,
         B:1/bitstring,
         Rest:RestLen/bitstring>> = BitArray,
     <<Head/bitstring,
         1:1,
        Rest/bitstring>>.
 
-getbit(Bit, BitArray) ->
-    RestLen = 4096 - Bit - 1,
+getbit(Bit, BitArray, ArrayLength) ->
+    RestLen = ArrayLength - Bit - 1,
     <<_Head:Bit/bitstring,
        B:1/bitstring,
        _Rest:RestLen/bitstring>> = BitArray,
@@ -99,7 +104,7 @@ getbit(Bit, BitArray) ->
 simple_test() ->
     N = 4000,
-    W = 4,
+    W = 6,
     KLin = lists:map(fun(X) -> "Key_" ++
                         integer_to_list(X) ++
                         integer_to_list(random:uniform(100)) ++
@@ -148,6 +153,7 @@ simple_test() ->
                 "with ~w false positive rate~n",
                 [N, timer:now_diff(os:timestamp(), SW3), FP / N]),
     ?assertMatch(true, FP < (N div 4)).
-
+
+
 -endif.
\ No newline at end of file
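A usage sketch for the widened bloom (illustrative only; enter/2 and check/2 are assumed to be exported alongside empty/1):

    Hash = leveled_codec:magic_hash({o, "Bucket", "Key", null}),
    Bloom0 = leveled_tinybloom:empty(6),
    Bloom1 = leveled_tinybloom:enter({hash, Hash}, Bloom0),
    true = leveled_tinybloom:check({hash, Hash}, Bloom1).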