From adb78f5c5a0db2112fbdf586083e64f398015f72 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Thu, 9 Feb 2017 14:33:39 +0000 Subject: [PATCH 1/4] Resolve pclerk crash Need to add extra logging to understand why pclerk crashes in some volume tests. Penciller's Clerk <0.813.0> shutdown now complete for reason {badarg,[{dict,fetch,[63,{dict,0,16,16,8,80,48,{[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]},{{[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]}}}],[{file,[100,105,99,116,46,101,114,108]},{line,126}]},{leveled_pclerk,handle_cast,2,[{file,[115,114,99,47,108,101,118,101,108,101,100,95,112,99,108,101,114,107,46,101,114,108]},{line,96}]},{gen_server,handle_msg,5,[{file,[103,101,110,95,115,101,114,118,101,114,46,101,114,108]},{line,604}]},{proc_lib,init_p_do_apply,3,[{file,[112,114,111,99,95,108,105,98,46,101,114,108]},{line,239}]}]} Should only be prompted after prompt deletions had bene updated. Perhaps a race whereby somehow it is prompted again after it is emptied. --- src/leveled_log.erl | 8 +++++++- src/leveled_pclerk.erl | 25 ++++++++++++++++++++++--- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/src/leveled_log.erl b/src/leveled_log.erl index 26b15bf..d9e0504 100644 --- a/src/leveled_log.erl +++ b/src/leveled_log.erl @@ -168,7 +168,13 @@ {info, "Saved manifest file"}}, {"PC019", {debug, "After ~s level ~w is ~w"}}, - + {"PC020", + {warn, "Empty prompt deletions at ManifestSQN=~w"}}, + {"PC021", + {info, "Prompting deletions at ManifestSQN=~w"}}, + {"PC022", + {info, "Storing reference to deletions at ManifestSQN=~w"}}, + {"I0001", {info, "Unexpected failure to fetch value for Key=~w SQN=~w " ++ "with reason ~w"}}, diff --git a/src/leveled_pclerk.erl b/src/leveled_pclerk.erl index 5055a01..98b2c3b 100644 --- a/src/leveled_pclerk.erl +++ b/src/leveled_pclerk.erl @@ -91,12 +91,13 @@ handle_cast(prompt, State) -> handle_cast({push_work, Work}, State) -> {ManifestSQN, Deletions} = handle_work(Work, State), PDs = dict:store(ManifestSQN, Deletions, State#state.pending_deletions), + leveled_log:log("PC022", [ManifestSQN]), {noreply, State#state{pending_deletions = PDs}, ?MAX_TIMEOUT}; handle_cast({prompt_deletions, ManifestSQN}, State) -> - Deletions = dict:fetch(ManifestSQN, State#state.pending_deletions), + {Deletions, UpdD} = return_deletions(ManifestSQN, + State#state.pending_deletions), ok = notify_deletions(Deletions, State#state.owner), - UpdDeletions = dict:erase(ManifestSQN, State#state.pending_deletions), - {noreply, State#state{pending_deletions = UpdDeletions}, ?MIN_TIMEOUT}. + {noreply, State#state{pending_deletions = UpdD}, ?MIN_TIMEOUT}. handle_info(timeout, State) -> request_work(State), @@ -223,12 +224,30 @@ do_merge(KL1, KL2, SinkLevel, SinkB, RP, NewSQN, MaxSQN, Additions) -> end. +return_deletions(ManifestSQN, PendingDeletionD) -> + case dict:is_empty(PendingDeletionD) of + true -> + leveled_log:log("PC020", [ManifestSQN]), + {[], PendingDeletionD}; + false -> + leveled_log:log("PC021", [ManifestSQN]), + {dict:fetch(ManifestSQN, PendingDeletionD), + dict:erase(ManifestSQN, PendingDeletionD)} + end. + %%%============================================================================ %%% Test %%%============================================================================ -ifdef(TEST). +return_deletions_test() -> + % During volume tests there would occasionaly be a deletion prompt with + % an empty pending deletions dictionary. Don't understand why this would + % happen - so we check here that at least it does not kill the clerk + R = {[], dict:new()}, + ?assertMatch(R, return_deletions(20, dict:new())). + generate_randomkeys(Count, BucketRangeLow, BucketRangeHigh) -> generate_randomkeys(Count, [], BucketRangeLow, BucketRangeHigh). From 05b2d8faafe58019a4786575cd753cac86cf8bb3 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Thu, 9 Feb 2017 21:12:01 +0000 Subject: [PATCH 2/4] is_empty not in OTP16 bloody OTP16 strikes again - is_empty isn't there --- src/leveled_pclerk.erl | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/leveled_pclerk.erl b/src/leveled_pclerk.erl index 98b2c3b..bf356fc 100644 --- a/src/leveled_pclerk.erl +++ b/src/leveled_pclerk.erl @@ -225,14 +225,13 @@ do_merge(KL1, KL2, SinkLevel, SinkB, RP, NewSQN, MaxSQN, Additions) -> return_deletions(ManifestSQN, PendingDeletionD) -> - case dict:is_empty(PendingDeletionD) of - true -> - leveled_log:log("PC020", [ManifestSQN]), - {[], PendingDeletionD}; - false -> + case dict:find(ManifestSQN, PendingDeletionD) of + {ok, PendingDeletions} -> leveled_log:log("PC021", [ManifestSQN]), - {dict:fetch(ManifestSQN, PendingDeletionD), - dict:erase(ManifestSQN, PendingDeletionD)} + {PendingDeletions, dict:erase(ManifestSQN, PendingDeletionD)}; + error -> + leveled_log:log("PC020", [ManifestSQN]), + {[], PendingDeletionD} end. %%%============================================================================ From 793977b76cb0487a4f45ac3a2a51fcf1e8237a72 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Thu, 9 Feb 2017 23:41:28 +0000 Subject: [PATCH 3/4] Prevent double work ongoing This was the cause of the pclerk crash. The concurrent work would lead to the attempt to re-fetch the prompted deletions after they had been deleted. --- src/leveled_penciller.erl | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl index 57b522c..2ac05e4 100644 --- a/src/leveled_penciller.erl +++ b/src/leveled_penciller.erl @@ -511,10 +511,19 @@ handle_cast({levelzero_complete, FN, StartKey, EndKey}, State) -> manifest=UpdMan, persisted_sqn=State#state.ledger_sqn}}; handle_cast(work_for_clerk, State) -> - case State#state.levelzero_pending of - true -> - {noreply, State}; - false -> + case {State#state.levelzero_pending, State#state.work_ongoing} of + {false, false} -> + % TODO - as part of supervision tree and retry work: + % Need to check for work_ongoing as well as levelzero_pending as + % there may be a race that could lead to the clerk doing the same + % thing twice. + % + % This has implications though if we auto-restart the pclerk in the + % future, without altering this state - it may never be able to + % request work due to ongoing work that crashed the previous clerk + % + % Perhaps the pclerk should not be restarted because of this, and + % the failure should ripple up {WL, WC} = leveled_pmanifest:check_for_work(State#state.manifest, ?LEVEL_SCALEFACTOR), case WC of @@ -534,7 +543,9 @@ handle_cast(work_for_clerk, State) -> {TL, State#state.manifest}), {noreply, State#state{work_backlog=false, work_ongoing=true}} - end + end; + _ -> + {noreply, State} end. From 8077c70486ed0a7f2900c148024bd1d1a5d5719a Mon Sep 17 00:00:00 2001 From: martinsumner Date: Thu, 9 Feb 2017 23:49:12 +0000 Subject: [PATCH 4/4] More conservative approach to ongoing work monitoring As per comments though - if we auto-restart pclerk in the future this will have to be re-considered. Perhaps a re-starting pclerk should force some reset of this boolean on startup perhaps by making a different work_for_clerk if in a virgin state. --- src/leveled_pclerk.erl | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/src/leveled_pclerk.erl b/src/leveled_pclerk.erl index bf356fc..6e95b53 100644 --- a/src/leveled_pclerk.erl +++ b/src/leveled_pclerk.erl @@ -225,14 +225,16 @@ do_merge(KL1, KL2, SinkLevel, SinkB, RP, NewSQN, MaxSQN, Additions) -> return_deletions(ManifestSQN, PendingDeletionD) -> - case dict:find(ManifestSQN, PendingDeletionD) of - {ok, PendingDeletions} -> - leveled_log:log("PC021", [ManifestSQN]), - {PendingDeletions, dict:erase(ManifestSQN, PendingDeletionD)}; - error -> - leveled_log:log("PC020", [ManifestSQN]), - {[], PendingDeletionD} - end. + % The returning of deletions had been seperated out as a failure to fetch + % here had caased crashes of the clerk. The root cause of the failure to + % fetch was the same clerk being asked to do the same work twice - and this + % should be blocked now by the ongoing_work boolean in the Penciller + % LoopData + % + % So this is now allowed to crash again + PendingDeletions = dict:fetch(ManifestSQN, PendingDeletionD), + leveled_log:log("PC021", [ManifestSQN]), + {PendingDeletions, dict:erase(ManifestSQN, PendingDeletionD)}. %%%============================================================================ %%% Test @@ -240,13 +242,6 @@ return_deletions(ManifestSQN, PendingDeletionD) -> -ifdef(TEST). -return_deletions_test() -> - % During volume tests there would occasionaly be a deletion prompt with - % an empty pending deletions dictionary. Don't understand why this would - % happen - so we check here that at least it does not kill the clerk - R = {[], dict:new()}, - ?assertMatch(R, return_deletions(20, dict:new())). - generate_randomkeys(Count, BucketRangeLow, BucketRangeHigh) -> generate_randomkeys(Count, [], BucketRangeLow, BucketRangeHigh).