Close in stages - waiting for releases (#411)
* Close in stages - waiting for releases Have a consistent approach to closing the inker and the penciller - so that the close can be interrupted by releasing of snapshots. Then any unreleased snapshots are closed before shutdown - with a 10s pause to give queries a short opportunity to finish. This should address some issues, primarily seen (but very rarely) in test whereby post-rebuild destruction of parallel AAE keystores cause the crashing of aae_folds. The primary benefit is to stop an attempt to release a snapshot that has in fact already finished does not cause a crash of the database on normal stop. this was primarily an issue when shutdown is delayed by an ongoing journal compaction job. * Boost default test budget for EQC * Update test to use correct type * Update following review Avoid filtering out exited PIDs when closing snapshots by catching the exit exception when the Pid is down
This commit is contained in:
parent
bc87273c76
commit
7a5cf251b3
6 changed files with 256 additions and 88 deletions
|
@ -222,6 +222,11 @@
|
|||
-define(TIMING_SAMPLECOUNTDOWN, 10000).
|
||||
-define(TIMING_SAMPLESIZE, 100).
|
||||
-define(OPEN_LASTMOD_RANGE, {0, infinity}).
|
||||
-define(SHUTDOWN_PAUSE, 10000).
|
||||
% How long to wait for snapshots to be released on shutdown
|
||||
% before forcing closure of snapshots
|
||||
% 10s may not be long enough for all snapshots, but avoids crashes of
|
||||
% short-lived queries racing with the shutdown
|
||||
|
||||
-record(state, {manifest ::
|
||||
leveled_pmanifest:manifest() | undefined | redacted,
|
||||
|
@ -548,7 +553,19 @@ pcl_persistedsqn(Pid) ->
|
|||
%% @doc
|
||||
%% Close the penciller neatly, trying to persist to disk anything in the memory
|
||||
pcl_close(Pid) ->
|
||||
gen_server:call(Pid, close, 60000).
|
||||
gen_server:call(Pid, close, infinity).
|
||||
|
||||
-spec pcl_snapclose(pid()) -> ok.
|
||||
%% @doc
|
||||
%% Specifically to be used when closing snpashots on shutdown, will handle a
|
||||
%% scenario where a snapshot has already exited
|
||||
pcl_snapclose(Pid) ->
|
||||
try
|
||||
pcl_close(Pid)
|
||||
catch
|
||||
exit:{noproc, _CallDetails} ->
|
||||
ok
|
||||
end.
|
||||
|
||||
-spec pcl_doom(pid()) -> {ok, list()}.
|
||||
%% @doc
|
||||
|
@ -556,7 +573,7 @@ pcl_close(Pid) ->
|
|||
%% Return a list of filepaths from where files exist for this penciller (should
|
||||
%% the calling process which to erase the store).
|
||||
pcl_doom(Pid) ->
|
||||
gen_server:call(Pid, doom, 60000).
|
||||
gen_server:call(Pid, doom, infinity).
|
||||
|
||||
-spec pcl_checkbloomtest(pid(), tuple()) -> boolean().
|
||||
%% @doc
|
||||
|
@ -888,7 +905,7 @@ handle_call({register_snapshot, Snapshot, Query, BookiesMem, LongRunning},
|
|||
handle_call(close, _From, State=#state{is_snapshot=Snap}) when Snap == true ->
|
||||
ok = pcl_releasesnapshot(State#state.source_penciller, self()),
|
||||
{stop, normal, ok, State};
|
||||
handle_call(close, _From, State) ->
|
||||
handle_call(close, From, State) ->
|
||||
% Level 0 files lie outside of the manifest, and so if there is no L0
|
||||
% file present it is safe to write the current contents of memory. If
|
||||
% there is a L0 file present - then the memory can be dropped (it is
|
||||
|
@ -917,17 +934,13 @@ handle_call(close, _From, State) ->
|
|||
false ->
|
||||
leveled_log:log(p0010, [State#state.levelzero_size])
|
||||
end,
|
||||
shutdown_manifest(State#state.manifest, State#state.levelzero_constructor),
|
||||
{stop, normal, ok, State};
|
||||
handle_call(doom, _From, State) ->
|
||||
gen_server:cast(self(), {maybe_defer_shutdown, close, From}),
|
||||
{noreply, State};
|
||||
handle_call(doom, From, State) ->
|
||||
leveled_log:log(p0030, []),
|
||||
ok = leveled_pclerk:clerk_close(State#state.clerk),
|
||||
|
||||
shutdown_manifest(State#state.manifest, State#state.levelzero_constructor),
|
||||
|
||||
ManifestFP = State#state.root_path ++ "/" ++ ?MANIFEST_FP ++ "/",
|
||||
FilesFP = State#state.root_path ++ "/" ++ ?FILES_FP ++ "/",
|
||||
{stop, normal, {ok, [ManifestFP, FilesFP]}, State};
|
||||
gen_server:cast(self(), {maybe_defer_shutdown, doom, From}),
|
||||
{noreply, State};
|
||||
handle_call({checkbloom_fortest, Key, Hash}, _From, State) ->
|
||||
Manifest = State#state.manifest,
|
||||
FoldFun =
|
||||
|
@ -977,8 +990,8 @@ handle_cast({manifest_change, Manifest}, State) ->
|
|||
work_ongoing=false}}
|
||||
end;
|
||||
handle_cast({release_snapshot, Snapshot}, State) ->
|
||||
Manifest0 = leveled_pmanifest:release_snapshot(State#state.manifest,
|
||||
Snapshot),
|
||||
Manifest0 =
|
||||
leveled_pmanifest:release_snapshot(State#state.manifest, Snapshot),
|
||||
leveled_log:log(p0003, [Snapshot]),
|
||||
{noreply, State#state{manifest=Manifest0}};
|
||||
handle_cast({confirm_delete, PDFN, FilePid}, State=#state{is_snapshot=Snap})
|
||||
|
@ -1138,7 +1151,34 @@ handle_cast({remove_logs, ForcedLogs}, State) ->
|
|||
ok = leveled_log:remove_forcedlogs(ForcedLogs),
|
||||
SSTopts = State#state.sst_options,
|
||||
SSTopts0 = SSTopts#sst_options{log_options = leveled_log:get_opts()},
|
||||
{noreply, State#state{sst_options = SSTopts0}}.
|
||||
{noreply, State#state{sst_options = SSTopts0}};
|
||||
handle_cast({maybe_defer_shutdown, ShutdownType, From}, State) ->
|
||||
case length(leveled_pmanifest:snapshot_pids(State#state.manifest)) of
|
||||
0 ->
|
||||
ok;
|
||||
N ->
|
||||
% Whilst this process sleeps, then any remaining snapshots may
|
||||
% release and have their release messages queued before the
|
||||
% complete_shutdown cast is sent
|
||||
leveled_log:log(p0042, [N]),
|
||||
timer:sleep(?SHUTDOWN_PAUSE)
|
||||
end,
|
||||
gen_server:cast(self(), {complete_shutdown, ShutdownType, From}),
|
||||
{noreply, State};
|
||||
handle_cast({complete_shutdown, ShutdownType, From}, State) ->
|
||||
lists:foreach(
|
||||
fun(Snap) -> ok = pcl_snapclose(Snap) end,
|
||||
leveled_pmanifest:snapshot_pids(State#state.manifest)),
|
||||
shutdown_manifest(State#state.manifest, State#state.levelzero_constructor),
|
||||
case ShutdownType of
|
||||
doom ->
|
||||
ManifestFP = State#state.root_path ++ "/" ++ ?MANIFEST_FP ++ "/",
|
||||
FilesFP = State#state.root_path ++ "/" ++ ?FILES_FP ++ "/",
|
||||
gen_server:reply(From, {ok, [ManifestFP, FilesFP]});
|
||||
close ->
|
||||
gen_server:reply(From, ok)
|
||||
end,
|
||||
{stop, normal, State}.
|
||||
|
||||
|
||||
%% handle the bookie stopping and stop this snapshot
|
||||
|
@ -1177,8 +1217,8 @@ sst_rootpath(RootPath) ->
|
|||
FP.
|
||||
|
||||
sst_filename(ManSQN, Level, Count) ->
|
||||
lists:flatten(io_lib:format("./~w_~w_~w" ++ ?SST_FILEX,
|
||||
[ManSQN, Level, Count])).
|
||||
lists:flatten(
|
||||
io_lib:format("./~w_~w_~w" ++ ?SST_FILEX, [ManSQN, Level, Count])).
|
||||
|
||||
|
||||
%%%============================================================================
|
||||
|
@ -2010,6 +2050,34 @@ format_status_test() ->
|
|||
?assertMatch(redacted, ST#state.levelzero_astree),
|
||||
clean_testdir(RootPath).
|
||||
|
||||
close_no_crash_test_() ->
|
||||
{timeout, 60, fun close_no_crash_tester/0}.
|
||||
|
||||
close_no_crash_tester() ->
|
||||
RootPath = "test/test_area/ledger_close",
|
||||
clean_testdir(RootPath),
|
||||
{ok, PCL} =
|
||||
pcl_start(
|
||||
#penciller_options{
|
||||
root_path=RootPath,
|
||||
max_inmemory_tablesize=1000,
|
||||
sst_options=#sst_options{}}),
|
||||
{ok, PclSnap} =
|
||||
pcl_snapstart(
|
||||
#penciller_options{
|
||||
start_snapshot = true,
|
||||
snapshot_query = undefined,
|
||||
bookies_mem = {empty_cache, empty_index, 1, 1},
|
||||
source_penciller = PCL,
|
||||
snapshot_longrunning = true,
|
||||
bookies_pid = self()
|
||||
}
|
||||
),
|
||||
exit(PclSnap, kill),
|
||||
ok = pcl_close(PCL),
|
||||
clean_testdir(RootPath).
|
||||
|
||||
|
||||
simple_server_test() ->
|
||||
RootPath = "test/test_area/ledger",
|
||||
clean_testdir(RootPath),
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue