%% -------- PENCILLER's CLERK ---------
%%
%% The Penciller's clerk is responsible for compaction work within the Ledger.
%%
%% The Clerk will periodically poll the Penciller to check that there is no
%% work at level zero pending completion, and if not the Clerk will examine
%% the manifest to see if further merge work is necessary.
%%
%% -------- COMMITTING MANIFEST CHANGES ---------
%%
%% Once the Penciller has taken a manifest change, the SST file owners which no
%% longer form part of the manifest will be marked for delete. By marking for
%% deletion, the owners will poll to confirm when it is safe for them to be
%% deleted.
%%
%% It is imperative that the file is not marked for deletion until it is
%% certain that the manifest change has been committed. Some uncollected
%% garbage is considered acceptable.
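%%
%% A sketch of the expected interaction, as assumed from the code below (the
%% Penciller-side calls are not shown in this module):
%%
%%   {ok, Clerk} = leveled_pclerk:clerk_new(PencillerPid),
%%   ok = leveled_pclerk:clerk_prompt(Clerk),
%%   %% The Clerk polls leveled_penciller:pcl_workforclerk/1 for a
%%   %% #penciller_work{} item, merges, and prompts the Penciller with the
%%   %% proposed change.  The Penciller later collects and confirms the
%%   %% pending change:
%%   case leveled_pclerk:clerk_manifestchange(Clerk, return, true) of
%%       {ok, WI} ->
%%           ok = leveled_pclerk:clerk_manifestchange(Clerk, confirm, true);
%%       no_change ->
%%           ok
%%   end.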
%%
-module(leveled_pclerk).
-behaviour(gen_server).
-include("include/leveled.hrl").
-export([init/1,
        handle_call/3,
        handle_cast/2,
        handle_info/2,
        terminate/2,
        clerk_new/1,
        clerk_prompt/1,
        clerk_close/1,
        clerk_manifestchange/3,
        code_change/3]).
-include_lib("eunit/include/eunit.hrl").
-define(MAX_TIMEOUT, 2000).
-define(MIN_TIMEOUT, 50).
-record(state, {owner :: pid(),
                change_pending=false :: boolean(),
                work_item :: #penciller_work{}|null}).
%%%============================================================================
%%% API
%%%============================================================================
clerk_new(Owner) ->
    {ok, Pid} = gen_server:start(?MODULE, [], []),
    ok = gen_server:call(Pid, {register, Owner}, infinity),
    leveled_log:log("PC001", [Pid, Owner]),
    {ok, Pid}.

clerk_manifestchange(Pid, Action, Closing) ->
    gen_server:call(Pid, {manifest_change, Action, Closing}, infinity).
clerk_prompt(Pid) ->
gen_server:cast(Pid, prompt).
clerk_close(Pid) ->
gen_server:cast(Pid, close).
%%%============================================================================
%%% gen_server callbacks
%%%============================================================================
init([]) ->
{ok, #state{}}.
handle_call({register, Owner}, _From, State) ->
    {reply, ok, State#state{owner=Owner}, ?MIN_TIMEOUT};
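%% The clauses below are a reconstruction - clerk_manifestchange/3 is
%% exported but had no matching callback.  The protocol is assumed from the
%% module comments: 'return' hands any pending work item back to the
%% Penciller; 'confirm' acknowledges that the manifest change has been
%% committed, so the files replaced by the merge (held in the assumed
%% unreferenced_files field of #penciller_work) can be marked for delete.
handle_call({manifest_change, return, true}, _From, State) ->
    case State#state.change_pending of
        true ->
            WI = State#state.work_item,
            {reply, {ok, WI}, State};
        false ->
            {reply, no_change, State}
    end;
handle_call({manifest_change, confirm, Closing}, From, State) ->
    WI = State#state.work_item,
    ok = mark_for_delete(WI#penciller_work.unreferenced_files,
                            State#state.owner),
    case Closing of
        true ->
            {stop, normal, ok, State};
        false ->
            gen_server:reply(From, ok),
            {noreply,
                State#state{work_item=null, change_pending=false},
                ?MIN_TIMEOUT}
    end.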
handle_cast(prompt, State) ->
{noreply, State, ?MIN_TIMEOUT};
handle_cast(close, State) ->
    {stop, normal, State}.
handle_info(timeout, State=#state{change_pending=Pnd}) when Pnd == false ->
    case requestandhandle_work(State) of
        {false, Timeout} ->
            {noreply, State, Timeout};
        {true, WI} ->
            % No timeout now as will wait for call to return manifest
            % change
            {noreply,
                State#state{change_pending=true, work_item=WI}}
    end;
handle_info(timeout, State) ->
    % A manifest change is still pending confirmation - do not request
    % further work
    {noreply, State}.
terminate(Reason, _State) ->
leveled_log:log("PC005", [self(), Reason]).
code_change(_OldVsn, State, _Extra) ->
{ok, State}.
%%%============================================================================
%%% Internal functions
%%%============================================================================
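%% Poll the Penciller for work.  If work is available, merge the selected
%% file into the level below, then prompt the Penciller with the proposed
%% manifest change.  Returns {false, Timeout} when no work is available, or
%% {true, WorkItem} once a change has been proposed.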
requestandhandle_work(State) ->
    case leveled_penciller:pcl_workforclerk(State#state.owner) of
        false ->
            leveled_log:log("PC006", []),
            {false, ?MAX_TIMEOUT};
        WI ->
            {NewManifest, FilesToDelete} = merge(WI),
            % new_manifest and unreferenced_files are assumed fields of the
            % #penciller_work record defined in leveled.hrl
            UpdWI = WI#penciller_work{new_manifest=NewManifest,
                                        unreferenced_files=FilesToDelete},
            leveled_log:log("PC007", []),
            % Assumes the Penciller exposes pcl_promptmanifestchange/2; the
            % change is confirmed later via clerk_manifestchange/3
            ok = leveled_penciller:pcl_promptmanifestchange(State#state.owner,
                                                            UpdWI),
            {true, UpdWI}
    end.
merge(WI) ->
    % src_level and manifest are assumed fields of #penciller_work
    SrcLevel = WI#penciller_work.src_level,
    {SrcF, UpdMFest1} = select_filetomerge(SrcLevel,
                                            WI#penciller_work.manifest),
    SinkFiles = get_item(SrcLevel + 1, UpdMFest1, []),
    {Candidates, Others} = check_for_merge_candidates(SrcF, SinkFiles),
%% TODO:
%% Need to work out if this is the top level
%% And then tell merge process to create files at the top level
%% Which will include the reaping of expired tombstones
leveled_log:log("PC008", [SrcLevel, length(Candidates)]),
MergedFiles = case length(Candidates) of
0 ->
%% If no overlapping candidates, a manifest change only is required
%%
%% TODO: need to think still about simply renaming when at
%% lower level
leveled_log:log("PC009",
[SrcF#manifest_entry.filename, SrcLevel + 1]),
[SrcF];
_ ->
perform_merge({SrcF#manifest_entry.owner,
SrcF#manifest_entry.filename},
Candidates,
{SrcLevel, WI#penciller_work.target_is_basement},
{WI#penciller_work.ledger_filepath,
WI#penciller_work.next_sqn})
end,
NewLevel = lists:sort(lists:append(MergedFiles, Others)),
UpdMFest2 = lists:keystore(SrcLevel + 1,
1,
UpdMFest1,
{SrcLevel + 1, NewLevel}),
ok = filelib:ensure_dir(WI#penciller_work.manifest_file),
{ok, Handle} = file:open(WI#penciller_work.manifest_file,
[binary, raw, write]),
ok = file:write(Handle, term_to_binary(UpdMFest2)),
ok = file:close(Handle),
case lists:member(SrcF, MergedFiles) of
true ->
{UpdMFest2, Candidates};
false ->
%% Can rub out src file as it is not part of output
{UpdMFest2, Candidates ++ [SrcF]}
end.
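%% Instruct each SST file no longer referenced by the manifest to set itself
%% for delete; the file owner will then poll the Penciller to confirm when it
%% is safe to be deleted.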
mark_for_delete([], _Penciller) ->
ok;
mark_for_delete([Head|Tail], Penciller) ->
ok = leveled_sst:sst_setfordelete(Head#manifest_entry.owner, Penciller),
mark_for_delete(Tail, Penciller).
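%% Partition the sink level into {Candidates, Others}, where Candidates are
%% the files whose key range overlaps with the source file being merged.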
check_for_merge_candidates(SrcF, SinkFiles) ->
lists:partition(fun(Ref) ->
case {Ref#manifest_entry.start_key,
Ref#manifest_entry.end_key} of
{_, EK} when SrcF#manifest_entry.start_key > EK ->
false;
{SK, _} when SrcF#manifest_entry.end_key < SK ->
false;
_ ->
true
end end,
SinkFiles).
%% An algorithm for discovering which files to merge ....
%% We can find the most optimal file:
%% - The one with the most overlapping data below?
%% - The one that overlaps with the fewest files below?
%% - The smallest file?
%% We could try and be fair in some way (merge oldest first)
%% Ultimately, there is a lack of certainty that being fair or optimal is
%% genuinely better - eventually every file has to be compacted.
%%
%% Hence, the initial implementation is to select files to merge at random
select_filetomerge(SrcLevel, Manifest) ->
    {SrcLevel, LevelManifest} = lists:keyfind(SrcLevel, 1, Manifest),
    FN = lists:nth(random:uniform(length(LevelManifest)),
                    LevelManifest),
    NewLevelManifest = lists:delete(FN, LevelManifest),
    NewManifest = lists:keyreplace(SrcLevel,
                                    1,
                                    Manifest,
                                    {SrcLevel, NewLevelManifest}),
    {FN, NewManifest}.
%% Assumption is that there is a single SST from a higher level that needs
%% to be merged into multiple SSTs at a lower level. This should create an
%% entirely new set of SSTs, and the calling process can then update the
%% manifest.
%%
%% Once the FileToMerge has been emptied, the remainder of the candidate list
%% needs to be placed in a remainder SST that may be of a sub-optimal (small)
%% size. This stops the need to perpetually roll over the whole level if the
%% level consists of already full files. Some smartness may be required when
%% selecting the candidate list so that small files just outside the candidate
%% list are included, to avoid a proliferation of small files.
%%
%% FileToMerge should be a tuple of {Pid, FileName} where the Pid is the Pid
%% of the gen_server leveled_sst process representing the file.
%%
%% CandidateList should be a list of #manifest_entry records for the
%% overlapping gen_server leveled_sst processes at the level below, sorted by
%% StartKey.
%%
%% LevelInfo is a {SrcLevel, TargetIsBasement} tuple - the new files are
%% created at SrcLevel + 1.
perform_merge({SrcPid, SrcFN}, CandidateList, LevelInfo, {Filepath, MSN}) ->
leveled_log:log("PC010", [SrcFN, MSN]),
PointerList = lists:map(fun(P) ->
{next, P#manifest_entry.owner, all} end,
CandidateList),
MaxSQN = leveled_sst:sst_getmaxsequencenumber(SrcPid),
do_merge([{next, SrcPid, all}],
PointerList,
LevelInfo,
{Filepath, MSN},
MaxSQN,
0,
[]).
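%% Repeatedly build new SST files at the level below, consuming the source
%% and sink key lists until both are exhausted, accumulating manifest entries
%% for each new file.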
do_merge([], [], {SrcLevel, _IsB}, {_Filepath, MSN}, _MaxSQN,
FileCounter, OutList) ->
leveled_log:log("PC011", [MSN, SrcLevel, FileCounter]),
OutList;
do_merge(KL1, KL2, {SrcLevel, IsB}, {Filepath, MSN}, MaxSQN,
FileCounter, OutList) ->
FileName = lists:flatten(io_lib:format(Filepath ++ "_~w_~w.sst",
[SrcLevel + 1, FileCounter])),
leveled_log:log("PC012", [MSN, FileName]),
TS1 = os:timestamp(),
case leveled_sst:sst_new(FileName, KL1, KL2, IsB, SrcLevel + 1, MaxSQN) of
empty ->
leveled_log:log("PC013", [FileName]),
OutList;
{ok, Pid, Reply} ->
{{KL1Rem, KL2Rem}, SmallestKey, HighestKey} = Reply,
ExtMan = lists:append(OutList,
[#manifest_entry{start_key=SmallestKey,
end_key=HighestKey,
owner=Pid,
filename=FileName}]),
leveled_log:log_timer("PC015", [], TS1),
do_merge(KL1Rem, KL2Rem,
{SrcLevel, IsB}, {Filepath, MSN}, MaxSQN,
FileCounter + 1, ExtMan)
end.
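%% Fetch the value for a given key from a {Key, Value} list, returning a
%% default if the key is not present - used to fetch a level's file list
%% from the manifest.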
get_item(Index, List, Default) ->
case lists:keysearch(Index, 1, List) of
{value, {Index, Value}} ->
Value;
false ->
Default
end.
%%%============================================================================
%%% Test
%%%============================================================================
-ifdef(TEST).
generate_randomkeys(Count, BucketRangeLow, BucketRangeHigh) ->
generate_randomkeys(Count, [], BucketRangeLow, BucketRangeHigh).
generate_randomkeys(0, Acc, _BucketLow, _BucketHigh) ->
Acc;
generate_randomkeys(Count, Acc, BucketLow, BRange) ->
BNumber = string:right(integer_to_list(BucketLow + random:uniform(BRange)),
4, $0),
KNumber = string:right(integer_to_list(random:uniform(1000)), 4, $0),
K = {o, "Bucket" ++ BNumber, "Key" ++ KNumber},
RandKey = {K, {Count + 1,
{active, infinity},
leveled_codec:magic_hash(K),
null}},
generate_randomkeys(Count - 1, [RandKey|Acc], BucketLow, BRange).
choose_pid_toquery([ManEntry|_T], Key) when
Key >= ManEntry#manifest_entry.start_key,
ManEntry#manifest_entry.end_key >= Key ->
ManEntry#manifest_entry.owner;
choose_pid_toquery([_H|T], Key) ->
choose_pid_toquery(T, Key).
find_randomkeys(_FList, 0, _Source) ->
ok;
find_randomkeys(FList, Count, Source) ->
KV1 = lists:nth(random:uniform(length(Source)), Source),
K1 = leveled_codec:strip_to_keyonly(KV1),
P1 = choose_pid_toquery(FList, K1),
FoundKV = leveled_sst:sst_get(P1, K1),
Found = leveled_codec:strip_to_keyonly(FoundKV),
io:format("success finding ~w in ~w~n", [K1, P1]),
?assertMatch(K1, Found),
find_randomkeys(FList, Count - 1, Source).
merge_file_test() ->
KL1_L1 = lists:sort(generate_randomkeys(8000, 0, 1000)),
{ok, PidL1_1, _} = leveled_sst:sst_new("../test/KL1_L1.sst",
1,
KL1_L1,
undefined),
KL1_L2 = lists:sort(generate_randomkeys(8000, 0, 250)),
{ok, PidL2_1, _} = leveled_sst:sst_new("../test/KL1_L2.sst",
2,
KL1_L2,
undefined),
KL2_L2 = lists:sort(generate_randomkeys(8000, 250, 250)),
{ok, PidL2_2, _} = leveled_sst:sst_new("../test/KL2_L2.sst",
2,
KL2_L2,
undefined),
KL3_L2 = lists:sort(generate_randomkeys(8000, 500, 250)),
{ok, PidL2_3, _} = leveled_sst:sst_new("../test/KL3_L2.sst",
2,
KL3_L2,
undefined),
KL4_L2 = lists:sort(generate_randomkeys(8000, 750, 250)),
{ok, PidL2_4, _} = leveled_sst:sst_new("../test/KL4_L2.sst",
2,
KL4_L2,
undefined),
Result = perform_merge({PidL1_1, "../test/KL1_L1.sst"},
[#manifest_entry{owner=PidL2_1},
#manifest_entry{owner=PidL2_2},
#manifest_entry{owner=PidL2_3},
#manifest_entry{owner=PidL2_4}],
{2, false}, {"../test/", 99}),
lists:foreach(fun(ManEntry) ->
{o, B1, K1} = ManEntry#manifest_entry.start_key,
{o, B2, K2} = ManEntry#manifest_entry.end_key,
io:format("Result of ~s ~s and ~s ~s with Pid ~w~n",
[B1, K1, B2, K2, ManEntry#manifest_entry.owner]) end,
Result),
io:format("Finding keys in KL1_L1~n"),
ok = find_randomkeys(Result, 50, KL1_L1),
io:format("Finding keys in KL1_L2~n"),
ok = find_randomkeys(Result, 50, KL1_L2),
io:format("Finding keys in KL2_L2~n"),
ok = find_randomkeys(Result, 50, KL2_L2),
io:format("Finding keys in KL3_L2~n"),
ok = find_randomkeys(Result, 50, KL3_L2),
io:format("Finding keys in KL4_L2~n"),
ok = find_randomkeys(Result, 50, KL4_L2),
leveled_sst:sst_clear(PidL1_1),
leveled_sst:sst_clear(PidL2_1),
leveled_sst:sst_clear(PidL2_2),
leveled_sst:sst_clear(PidL2_3),
leveled_sst:sst_clear(PidL2_4),
lists:foreach(fun(ManEntry) ->
leveled_sst:sst_clear(ManEntry#manifest_entry.owner) end,
Result).
select_merge_candidates_test() ->
Sink1 = #manifest_entry{start_key = {o, "Bucket", "Key1"},
end_key = {o, "Bucket", "Key20000"}},
Sink2 = #manifest_entry{start_key = {o, "Bucket", "Key20001"},
end_key = {o, "Bucket1", "Key1"}},
Src1 = #manifest_entry{start_key = {o, "Bucket", "Key40001"},
end_key = {o, "Bucket", "Key60000"}},
{Candidates, Others} = check_for_merge_candidates(Src1, [Sink1, Sink2]),
?assertMatch([Sink2], Candidates),
?assertMatch([Sink1], Others).
select_merge_file_test() ->
L0 = [{{o, "B1", "K1"}, {o, "B3", "K3"}, dummy_pid}],
L1 = [{{o, "B1", "K1"}, {o, "B2", "K2"}, dummy_pid},
{{o, "B2", "K3"}, {o, "B4", "K4"}, dummy_pid}],
Manifest = [{0, L0}, {1, L1}],
{FileRef, NewManifest} = select_filetomerge(0, Manifest),
?assertMatch(FileRef, {{o, "B1", "K1"}, {o, "B3", "K3"}, dummy_pid}),
?assertMatch(NewManifest, [{0, []}, {1, L1}]).
coverage_cheat_test() ->
{ok, _State1} = code_change(null, #state{}, null).
-endif.