diff --git a/src/leveled_pclerk.erl b/src/leveled_pclerk.erl
index bf356fc..6e95b53 100644
--- a/src/leveled_pclerk.erl
+++ b/src/leveled_pclerk.erl
@@ -225,14 +225,16 @@ do_merge(KL1, KL2, SinkLevel, SinkB, RP, NewSQN, MaxSQN, Additions) ->
 
 return_deletions(ManifestSQN, PendingDeletionD) ->
-    case dict:find(ManifestSQN, PendingDeletionD) of
-        {ok, PendingDeletions} ->
-            leveled_log:log("PC021", [ManifestSQN]),
-            {PendingDeletions, dict:erase(ManifestSQN, PendingDeletionD)};
-        error ->
-            leveled_log:log("PC020", [ManifestSQN]),
-            {[], PendingDeletionD}
-    end.
+    % The returning of deletions had been separated out, as a failure to fetch
+    % here had caused crashes of the clerk. The root cause of the failure to
+    % fetch was the same clerk being asked to do the same work twice - and this
+    % should now be blocked by the work_ongoing boolean in the Penciller
+    % LoopData.
+    %
+    % So this is now allowed to crash again.
+    PendingDeletions = dict:fetch(ManifestSQN, PendingDeletionD),
+    leveled_log:log("PC021", [ManifestSQN]),
+    {PendingDeletions, dict:erase(ManifestSQN, PendingDeletionD)}.
 
 %%%============================================================================
 %%% Test
 %%%============================================================================
@@ -240,13 +242,6 @@ return_deletions(ManifestSQN, PendingDeletionD) ->
 
 -ifdef(TEST).
 
-return_deletions_test() ->
-    % During volume tests there would occasionaly be a deletion prompt with
-    % an empty pending deletions dictionary. Don't understand why this would
-    % happen - so we check here that at least it does not kill the clerk
-    R = {[], dict:new()},
-    ?assertMatch(R, return_deletions(20, dict:new())).
-
 generate_randomkeys(Count, BucketRangeLow, BucketRangeHigh) ->
     generate_randomkeys(Count, [], BucketRangeLow, BucketRangeHigh).
 
diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl
index 57b522c..2ac05e4 100644
--- a/src/leveled_penciller.erl
+++ b/src/leveled_penciller.erl
@@ -511,10 +511,19 @@ handle_cast({levelzero_complete, FN, StartKey, EndKey}, State) ->
                           manifest=UpdMan,
                           persisted_sqn=State#state.ledger_sqn}};
 handle_cast(work_for_clerk, State) ->
-    case State#state.levelzero_pending of
-        true ->
-            {noreply, State};
-        false ->
+    case {State#state.levelzero_pending, State#state.work_ongoing} of
+        {false, false} ->
+            % TODO - as part of supervision tree and retry work:
+            % Need to check for work_ongoing as well as levelzero_pending, as
+            % there may be a race that could lead to the clerk doing the same
+            % thing twice.
+            %
+            % This has implications though if we auto-restart the pclerk in the
+            % future, without altering this state - it may never be able to
+            % request work due to ongoing work that crashed the previous clerk.
+            %
+            % Perhaps the pclerk should not be restarted because of this, and
+            % the failure should ripple up.
             {WL, WC} = leveled_pmanifest:check_for_work(State#state.manifest,
                                                         ?LEVEL_SCALEFACTOR),
             case WC of
@@ -534,7 +543,9 @@ handle_cast(work_for_clerk, State) ->
                                                    {TL, State#state.manifest}),
                     {noreply, State#state{work_backlog=false,
                                           work_ongoing=true}}
-            end
+            end;
+        _ ->
+            {noreply, State}
     end.
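
Note (not part of the patch): a minimal EUnit sketch of the new return_deletions/2 contract, which could sit inside leveled_pclerk's existing -ifdef(TEST). section in place of the deleted return_deletions_test/0. The test name and the dummy deletion entry are illustrative assumptions; the crash assertion relies on dict:fetch/2 raising error:badarg for a missing key, which is the failure now deliberately allowed to bring the clerk down.

return_deletions_contract_test() ->
    % Happy path - deletions for a known ManifestSQN are returned and the
    % entry is erased from the pending deletions dictionary
    D0 = dict:store(20, [dummy_manifest_entry], dict:new()),
    {Dels, D1} = return_deletions(20, D0),
    ?assertMatch([dummy_manifest_entry], Dels),
    ?assertMatch(error, dict:find(20, D1)),
    % Unknown ManifestSQN - dict:fetch/2 errors, so the clerk crashes rather
    % than silently returning an empty deletion list as it did before
    ?assertError(badarg, return_deletions(99, dict:new())).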