Close in stages - waiting for releases (#411)

* Close in stages - waiting for releases

Have a consistent approach to closing the inker and the penciller - so that the close can be interrupted by releasing of snapshots.  Then any unreleased snapshots are closed before shutdown - with a 10s pause to give queries a short opportunity to finish.

This should address some issues, primarily seen (but very rarely) in tests, whereby post-rebuild destruction of parallel AAE keystores causes the crashing of aae_folds.

The primary benefit is that an attempt to release a snapshot that has in fact already finished does not cause a crash of the database on normal stop.  This was primarily an issue when shutdown was delayed by an ongoing journal compaction job.

* Boost default test budget for EQC

* Update test to use correct type

* Update following review

Rather than filtering out exited PIDs when closing snapshots, catch the exit exception raised when the Pid is down.
This commit is contained in:
Martin Sumner 2023-10-03 18:30:40 +01:00
parent bc87273c76
commit 7a5cf251b3
6 changed files with 256 additions and 88 deletions

View file

@ -452,7 +452,7 @@ fetchput_snapshot(_Config) ->
% Now loads lots of new objects
GenList = [20002, 40002, 60002, 80002, 100002],
GenList = [20002, 40002, 60002, 80002, 100002, 120002, 140002, 160002],
CLs2 = testutil:load_objects(20000, GenList, Bookie2, TestObject,
fun testutil:generate_smallobjects/2),
io:format("Loaded significant numbers of new objects~n"),
@ -508,7 +508,6 @@ fetchput_snapshot(_Config) ->
testutil:check_forlist(Bookie2, lists:nth(length(CLs3), CLs3)),
testutil:check_forlist(Bookie2, lists:nth(1, CLs3)),
{ok, FNsC} = file:list_dir(RootPath ++ "/ledger/ledger_files"),
io:format("FNsA ~w FNsB ~w FNsC ~w~n",
[length(FNsA), length(FNsB), length(FNsC)]),
@ -523,9 +522,35 @@ fetchput_snapshot(_Config) ->
{B1Size, B1Count} = testutil:check_bucket_stats(Bookie2, "Bucket1"),
{BSize, BCount} = testutil:check_bucket_stats(Bookie2, "Bucket"),
true = BSize > 0,
true = BCount == 120000,
true = BCount == 180000,
io:format("Shutdown with overhanging snapshot~n"),
ok = leveled_bookie:book_close(Bookie2),
{ok, SnpPCL1, SnpJrnl1} =
leveled_bookie:book_snapshot(Bookie2, store, undefined, true),
{ok, SnpPCL2, SnpJrnl2} =
leveled_bookie:book_snapshot(Bookie2, store, undefined, true),
TestPid = self(),
spawn(
fun() ->
ok = leveled_bookie:book_close(Bookie2),
TestPid ! ok
end),
timer:sleep(5000),
ok = leveled_penciller:pcl_close(SnpPCL1),
ok = leveled_inker:ink_close(SnpJrnl1),
true = is_process_alive(SnpPCL2),
true = is_process_alive(SnpJrnl2),
io:format("Time for close to complete is 2 * 10s~n"),
io:format("Both Inker and Penciller will have snapshot delay~n"),
receive ok -> ok end,
false = is_process_alive(SnpPCL2),
false = is_process_alive(SnpJrnl2),
testutil:reset_filestructure().
@ -628,7 +653,9 @@ load_and_count(JournalSize, BookiesMemSize, PencillerMemSize) ->
ok = leveled_bookie:book_close(Bookie1),
{ok, Bookie2} = leveled_bookie:book_start(StartOpts1),
{_, 300000} = testutil:check_bucket_stats(Bookie2, "Bucket"),
ok = leveled_bookie:book_close(Bookie2),
ManifestFP =
leveled_pmanifest:filepath(filename:join(RootPath, ?LEDGER_FP),
manifest),
@ -691,11 +718,13 @@ load_and_count_withdelete(_Config) ->
lists:seq(1, 20)),
not_found = testutil:book_riakget(Bookie1, BucketD, KeyD),
ok = leveled_bookie:book_close(Bookie1),
{ok, Bookie2} = leveled_bookie:book_start(StartOpts1),
testutil:check_formissingobject(Bookie2, BucketD, KeyD),
testutil:check_formissingobject(Bookie2, "Bookie1", "MissingKey0123"),
{_BSize, 0} = testutil:check_bucket_stats(Bookie2, BucketD),
ok = leveled_bookie:book_close(Bookie2),
testutil:reset_filestructure().

View file

@ -41,7 +41,8 @@
start_opts = []
}).
-define(NUMTESTS, 1000).
-define(NUMTESTS, 10000).
-define(TIME_BUDGET, 300).
-define(QC_OUT(P),
eqc:on_output(fun(Str, Args) ->
io:format(user, Str, Args) end, P)).
@ -49,7 +50,12 @@
-type state() :: #state{}.
eqc_test_() ->
{timeout, 60, ?_assertEqual(true, eqc:quickcheck(eqc:testing_time(50, ?QC_OUT(prop_db()))))}.
{timeout,
?TIME_BUDGET + 10,
?_assertEqual(
true,
eqc:quickcheck(
eqc:testing_time(?TIME_BUDGET, ?QC_OUT(prop_db()))))}.
run() ->
run(?NUMTESTS).