Merge pull request #23 from martinsumner/mas-pclerkcrash-3
Mas pclerkcrash 3
commit 5ae93ecb17
2 changed files with 26 additions and 20 deletions
@@ -225,14 +225,16 @@ do_merge(KL1, KL2, SinkLevel, SinkB, RP, NewSQN, MaxSQN, Additions) ->
 
 return_deletions(ManifestSQN, PendingDeletionD) ->
-    case dict:find(ManifestSQN, PendingDeletionD) of
-        {ok, PendingDeletions} ->
+    % The returning of deletions had been separated out as a failure to fetch
+    % here had caused crashes of the clerk. The root cause of the failure to
+    % fetch was the same clerk being asked to do the same work twice - and this
+    % should be blocked now by the ongoing_work boolean in the Penciller
+    % LoopData
+    %
+    % So this is now allowed to crash again
+    PendingDeletions = dict:fetch(ManifestSQN, PendingDeletionD),
     leveled_log:log("PC021", [ManifestSQN]),
-            {PendingDeletions, dict:erase(ManifestSQN, PendingDeletionD)};
-        error ->
-            leveled_log:log("PC020", [ManifestSQN]),
-            {[], PendingDeletionD}
-    end.
+    {PendingDeletions, dict:erase(ManifestSQN, PendingDeletionD)}.
 
 %%%============================================================================
 %%% Test
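The change above leans on the difference between the two dict lookups: dict:find/2 returns the atom error for a missing key, whereas dict:fetch/2 raises an exception (badarg) and so takes the calling process down. A minimal sketch of that distinction, illustrative only and not part of the commit:

-module(fetch_vs_find_sketch).
-export([demo/0]).

demo() ->
    D = dict:store(5, ["a_file"], dict:new()),
    %% find/2 is tolerant: an unknown ManifestSQN simply returns error
    error = dict:find(6, D),
    {ok, Files} = dict:find(5, D),
    %% fetch/2 assumes the key is present and raises badarg otherwise -
    %% this is what now lets the clerk crash on an unexpected deletion prompt
    Files = dict:fetch(5, D),
    ok.

Under the old code the error branch logged PC020 and returned {[], PendingDeletionD}; under the new code the same situation crashes the clerk, on the expectation that the Penciller's work_ongoing flag now prevents the duplicate prompt that used to cause it.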
@@ -240,13 +242,6 @@ return_deletions(ManifestSQN, PendingDeletionD) ->
 
 -ifdef(TEST).
 
-return_deletions_test() ->
-    % During volume tests there would occasionally be a deletion prompt with
-    % an empty pending deletions dictionary. Don't understand why this would
-    % happen - so we check here that at least it does not kill the clerk
-    R = {[], dict:new()},
-    ?assertMatch(R, return_deletions(20, dict:new())).
-
 generate_randomkeys(Count, BucketRangeLow, BucketRangeHigh) ->
     generate_randomkeys(Count, [], BucketRangeLow, BucketRangeHigh).
 
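The removed test asserted the old, tolerant behaviour (an empty pending-deletions dictionary returned {[], PendingDeletionD}), which no longer holds. If coverage of the new behaviour were wanted, a hypothetical replacement inside the same -ifdef(TEST) section could assert the crash instead, assuming eunit's assert macros are available as they already are for ?assertMatch:

return_deletions_crash_test() ->
    %% With dict:fetch/2 a deletion prompt for an unknown ManifestSQN
    %% should now raise badarg rather than be absorbed
    ?assertException(error, badarg, return_deletions(20, dict:new())).

This is a sketch only; the commit itself simply drops the test.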
@@ -511,10 +511,19 @@ handle_cast({levelzero_complete, FN, StartKey, EndKey}, State) ->
                             manifest=UpdMan,
                             persisted_sqn=State#state.ledger_sqn}};
 handle_cast(work_for_clerk, State) ->
-    case State#state.levelzero_pending of
-        true ->
-            {noreply, State};
-        false ->
+    case {State#state.levelzero_pending, State#state.work_ongoing} of
+        {false, false} ->
+            % TODO - as part of supervision tree and retry work:
+            % Need to check for work_ongoing as well as levelzero_pending as
+            % there may be a race that could lead to the clerk doing the same
+            % thing twice.
+            %
+            % This has implications though if we auto-restart the pclerk in the
+            % future, without altering this state - it may never be able to
+            % request work due to ongoing work that crashed the previous clerk
+            %
+            % Perhaps the pclerk should not be restarted because of this, and
+            % the failure should ripple up
             {WL, WC} = leveled_pmanifest:check_for_work(State#state.manifest,
                                                         ?LEVEL_SCALEFACTOR),
             case WC of
@@ -534,7 +543,9 @@ handle_cast(work_for_clerk, State) ->
                                                    {TL, State#state.manifest}),
                     {noreply,
+                        State#state{work_backlog=false, work_ongoing=true}}
-            end
+            end;
+        _ ->
+            {noreply, State}
     end.
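The Penciller side of the fix is the guard in work_for_clerk: work is only handed out when neither an L0 persist nor an earlier merge is in flight, and work_ongoing is set to true at the point the work is pushed to the clerk. A simplified sketch of that gate, using a hypothetical state record rather than the Penciller's real loop state:

-module(work_gate_sketch).
-export([new/0, maybe_hand_out_work/1]).

-record(sketch_state, {levelzero_pending = false :: boolean(),
                       work_ongoing = false :: boolean()}).

new() -> #sketch_state{}.

maybe_hand_out_work(State) ->
    case {State#sketch_state.levelzero_pending,
            State#sketch_state.work_ongoing} of
        {false, false} ->
            %% Safe to prompt the clerk; setting work_ongoing means a second
            %% work_for_clerk cast cannot schedule the same merge twice
            {hand_out_work, State#sketch_state{work_ongoing = true}};
        _ ->
            %% Either an L0 persist or an earlier merge is still in flight
            {no_work, State}
    end.

As the TODO in the diff notes, an auto-restarted pclerk could find work_ongoing still set from the work that crashed its predecessor and so never be offered further work, which is why the comment suggests letting the failure ripple up rather than restarting the clerk in isolation.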