Close in stages - waiting for releases (#411)

* Close in stages - waiting for releases

Have a consistent approach to closing the inker and the penciller - so that the close can be interrupted by releasing of snapshots.  Then any unreleased snapshots are closed before shutdown - with a 10s pause to give queries a short opportunity to finish.

This should address some issues, primarily seen (but very rarely) in tests, whereby post-rebuild destruction of parallel AAE keystores causes aae_folds to crash.

The primary benefit is that an attempt to release a snapshot that has in fact already finished no longer causes a crash of the database on normal stop.  This was primarily an issue when shutdown is delayed by an ongoing journal compaction job.

* Boost default test budget for EQC

* Update test to use correct type

* Update following review

Avoid filtering out exited PIDs when closing snapshots by catching the exit exception when the Pid is down
This commit is contained in:
Martin Sumner 2023-10-03 18:30:40 +01:00
parent bc87273c76
commit 7a5cf251b3
6 changed files with 256 additions and 88 deletions

View file

@ -133,6 +133,11 @@
-define(JOURNAL_FILEX, "cdb").
-define(PENDING_FILEX, "pnd").
-define(TEST_KC, {[], infinity}).
-define(SHUTDOWN_PAUSE, 10000).
% How long to wait for snapshots to be released on shutdown
% before forcing closure of snapshots
% 10s may not be long enough for all snapshots, but avoids crashes of
% short-lived queries racing with the shutdown
-record(state, {manifest = [] :: list(),
manifest_sqn = 0 :: integer(),
@ -281,6 +286,18 @@ ink_confirmdelete(Pid, ManSQN, CDBpid) ->
ink_close(Pid) ->
gen_server:call(Pid, close, infinity).
-spec ink_snapclose(pid()) -> ok.
%% @doc
%% Specifically to be used when closing snapshots on shutdown.  Closes the
%% snapshot via ink_close/1, but tolerates the snapshot process having gone
%% away of its own accord:
%% - noproc: the snapshot had already exited before the call was made;
%% - normal: the snapshot stopped normally whilst this call was still waiting
%%   for a reply (e.g. another close request was handled first).
%% In both cases the snapshot is closed, so ok is returned.  Any other exit
%% is not caught, and will propagate to the caller.
ink_snapclose(Pid) ->
    try
        ink_close(Pid)
    catch
        exit:{noproc, _CallDetails} ->
            ok;
        exit:{normal, _CallDetails} ->
            ok
    end.
-spec ink_doom(pid()) -> {ok, [{string(), string(), string(), string()}]}.
%% @doc
%% Test function used to close a file, and return all file paths (potentially
@ -654,33 +671,23 @@ handle_call({check_sqn, LedgerSQN}, _From, State) ->
end;
handle_call(get_journalsqn, _From, State) ->
{reply, {ok, State#state.journal_sqn}, State};
handle_call(close, _From, State) ->
case State#state.is_snapshot of
true ->
ok = ink_releasesnapshot(State#state.source_inker, self());
false ->
leveled_log:log(i0005, [close]),
leveled_log:log(
i0006, [State#state.journal_sqn, State#state.manifest_sqn]),
ok = leveled_iclerk:clerk_stop(State#state.clerk),
shutdown_snapshots(State#state.registered_snapshots),
shutdown_manifest(State#state.manifest)
end,
handle_call(close, _From, State=#state{is_snapshot=Snap}) when Snap == true ->
ok = ink_releasesnapshot(State#state.source_inker, self()),
{stop, normal, ok, State};
handle_call(doom, _From, State) ->
FPs = [filepath(State#state.root_path, journal_dir),
filepath(State#state.root_path, manifest_dir),
filepath(State#state.root_path, journal_compact_dir),
filepath(State#state.root_path, journal_waste_dir)],
leveled_log:log(i0018, []),
leveled_log:log(i0005, [doom]),
handle_call(ShutdownType, From, State)
when ShutdownType == close; ShutdownType == doom ->
case ShutdownType of
doom ->
leveled_log:log(i0018, []);
_ ->
ok
end,
leveled_log:log(i0005, [ShutdownType]),
leveled_log:log(
i0006, [State#state.journal_sqn, State#state.manifest_sqn]),
ok = leveled_iclerk:clerk_stop(State#state.clerk),
shutdown_snapshots(State#state.registered_snapshots),
shutdown_manifest(State#state.manifest),
{stop, normal, {ok, FPs}, State}.
gen_server:cast(self(), {maybe_defer_shutdown, ShutdownType, From}),
{noreply, State}.
handle_cast({clerk_complete, ManifestSnippet, FilesToDelete}, State) ->
@ -766,8 +773,39 @@ handle_cast({remove_logs, ForcedLogs}, State) ->
ok = leveled_log:remove_forcedlogs(ForcedLogs),
CDBopts = State#state.cdb_options,
CDBopts0 = CDBopts#cdb_options{log_options = leveled_log:get_opts()},
{noreply, State#state{cdb_options = CDBopts0}}.
{noreply, State#state{cdb_options = CDBopts0}};
handle_cast({maybe_defer_shutdown, ShutdownType, From}, State) ->
    % First stage of a close/doom: if any snapshots are still registered,
    % pause before moving on to the stage that closes them.
    case State#state.registered_snapshots of
        [] ->
            ok;
        Unreleased ->
            % The sleep blocks this process, so any release messages sent by
            % the remaining snapshots in the meantime will be queued ahead
            % of the complete_shutdown cast sent below
            leveled_log:log(i0026, [length(Unreleased)]),
            timer:sleep(?SHUTDOWN_PAUSE)
    end,
    gen_server:cast(self(), {complete_shutdown, ShutdownType, From}),
    {noreply, State};
handle_cast({complete_shutdown, ShutdownType, From}, State) ->
    % Final stage of a close/doom: close any snapshots that are still
    % registered, shut down the manifest, reply to the caller that asked
    % for the shutdown, and then stop.
    SnapPids =
        [element(1, Snapshot) || Snapshot <- State#state.registered_snapshots],
    lists:foreach(fun(SnapPid) -> ok = ink_snapclose(SnapPid) end, SnapPids),
    shutdown_manifest(State#state.manifest),
    Reply =
        case ShutdownType of
            doom ->
                % A doom replies with the file paths used by this inker
                RootPath = State#state.root_path,
                {ok,
                    [filepath(RootPath, journal_dir),
                        filepath(RootPath, manifest_dir),
                        filepath(RootPath, journal_compact_dir),
                        filepath(RootPath, journal_waste_dir)]};
            close ->
                ok
        end,
    gen_server:reply(From, Reply),
    {stop, normal, State}.
%% handle the bookie stopping and stop this snapshot
handle_info({'DOWN', BookieMonRef, process, _BookiePid, _Info},
@ -789,6 +827,7 @@ code_change(_OldVsn, State, _Extra) ->
%%% Internal functions
%%%============================================================================
-spec start_from_file(inker_options()) -> {ok, ink_state()}.
%% @doc
%% Start an Inker from the state on disk (i.e. not a snapshot).
@ -854,13 +893,6 @@ start_from_file(InkOpts) ->
clerk = Clerk}}.
-spec shutdown_snapshots(list(registered_snapshot())) -> ok.
%% @doc
%% Close each registered snapshot ahead of closing the store.  Every entry
%% must be a three-element tuple, the first element of which is passed to
%% ink_close/1; each close is required to return ok.
shutdown_snapshots(Snapshots) ->
    CloseSnapFun =
        fun({SnapPid, _TS, _SQN}) ->
            ok = ink_close(SnapPid)
        end,
    lists:foreach(CloseSnapFun, Snapshots).
-spec shutdown_manifest(leveled_imanifest:manifest()) -> ok.
%% @doc
%% Shutdown all files in the manifest
@ -1603,4 +1635,28 @@ loop() ->
ok
end.
%% EUnit generator - runs close_no_crash_tester/0 with a 60 second timeout
close_no_crash_test_() ->
    {timeout, 60, fun() -> close_no_crash_tester() end}.
close_no_crash_tester() ->
    % Closing the inker must return ok even though a snapshot started
    % against it has been killed (not closed) before the close is requested
    RootPath = "test/test_area/journal",
    build_dummy_journal(),
    InkOpts =
        #inker_options{
            root_path = RootPath,
            cdb_options = #cdb_options{max_size=300000, binary_mode=true},
            compression_method = native,
            compress_on_receipt = true},
    {ok, Inker} = ink_start(InkOpts),
    {ok, InkSnap} =
        ink_snapstart(
            #inker_options{
                start_snapshot = true,
                bookies_pid = self(),
                source_inker = Inker}),
    exit(InkSnap, kill),
    ok = ink_close(Inker),
    clean_testdir(RootPath).
-endif.