Mas d31 i410looptoclose (#421)

* Mas i410 looptoclose (#420)

* Stop waiting full SHUTDOWN_PAUSE

Previously, if there was a snapshot outstanding at shutdown time, there was a wait of the full SHUTDOWN_PAUSE to give the snapshot time to close down.

This causes an issue in kv_index_tictactree when a rebuild completes while an exchange is in flight - the aae_controller becomes blocked for the full shutdown pause whilst it waits for the replaced key store to be closed.

This change is to loop within the shutdown pause, so that if the snapshot supporting the exchange is closed, the paused bookie can close more quickly (unblocking the controller).

Without this fix, there are intermittent issues in kv_index_tictactree's mockvnode_SUITE tests.

* Address test reliability

Be a bit clearer with waiting round seconds. This was intermittently failing on QR4 previously (but QR5, 1s later, was always OK).

* Update iterator_SUITE.erl

* Refine test assertion

At Stage C there might be 0 files left, in which case equality with Stage D result is ok.
This commit is contained in:
Martin Sumner 2023-11-10 15:04:47 +00:00 committed by GitHub
parent d544db5461
commit 6223b801f3
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 88 additions and 27 deletions

View file

@ -133,6 +133,7 @@
-define(JOURNAL_FILEX, "cdb").
-define(PENDING_FILEX, "pnd").
-define(TEST_KC, {[], infinity}).
-define(SHUTDOWN_LOOPS, 10).
-define(SHUTDOWN_PAUSE, 10000).
% How long to wait for snapshots to be released on shutdown
% before forcing closure of snapshots
@ -154,7 +155,8 @@
compression_method = native :: lz4|native|none,
compress_on_receipt = false :: boolean(),
snap_timeout :: pos_integer() | undefined, % in seconds
source_inker :: pid() | undefined}).
source_inker :: pid() | undefined,
shutdown_loops = ?SHUTDOWN_LOOPS :: non_neg_integer()}).
-type inker_options() :: #inker_options{}.
@ -786,16 +788,25 @@ handle_cast({remove_logs, ForcedLogs}, State) ->
handle_cast({maybe_defer_shutdown, ShutdownType, From}, State) ->
case length(State#state.registered_snapshots) of
0 ->
ok;
gen_server:cast(self(), {complete_shutdown, ShutdownType, From}),
{noreply, State};
N ->
% Whilst this process sleeps, then any remaining snapshots may
% release and have their release messages queued before the
% complete_shutdown cast is sent
leveled_log:log(i0026, [N]),
timer:sleep(?SHUTDOWN_PAUSE)
end,
gen_server:cast(self(), {complete_shutdown, ShutdownType, From}),
{noreply, State};
case State#state.shutdown_loops of
LoopCount when LoopCount > 0 ->
leveled_log:log(i0026, [N]),
timer:sleep(?SHUTDOWN_PAUSE div ?SHUTDOWN_LOOPS),
gen_server:cast(
self(), {maybe_defer_shutdown, ShutdownType, From}),
{noreply, State#state{shutdown_loops = LoopCount - 1}};
0 ->
gen_server:cast(
self(), {complete_shutdown, ShutdownType, From}),
{noreply, State}
end
end;
handle_cast({complete_shutdown, ShutdownType, From}, State) ->
lists:foreach(
fun(SnapPid) -> ok = ink_snapclose(SnapPid) end,