Merge pull request #23 from martinsumner/mas-pclerkcrash-3

Mas pclerkcrash 3
martinsumner 2017-02-10 00:33:39 +00:00 committed by GitHub
commit 5ae93ecb17
2 changed files with 26 additions and 20 deletions

src/leveled_pclerk.erl

@@ -225,14 +225,16 @@ do_merge(KL1, KL2, SinkLevel, SinkB, RP, NewSQN, MaxSQN, Additions) ->
return_deletions(ManifestSQN, PendingDeletionD) ->
-    case dict:find(ManifestSQN, PendingDeletionD) of
-        {ok, PendingDeletions} ->
-            leveled_log:log("PC021", [ManifestSQN]),
-            {PendingDeletions, dict:erase(ManifestSQN, PendingDeletionD)};
-        error ->
-            leveled_log:log("PC020", [ManifestSQN]),
-            {[], PendingDeletionD}
-    end.
+    % The returning of deletions had been separated out as a failure to fetch
+    % here had caused crashes of the clerk. The root cause of the failure to
+    % fetch was the same clerk being asked to do the same work twice - and this
+    % should be blocked now by the ongoing_work boolean in the Penciller
+    % LoopData
+    %
+    % So this is now allowed to crash again
+    PendingDeletions = dict:fetch(ManifestSQN, PendingDeletionD),
+    leveled_log:log("PC021", [ManifestSQN]),
+    {PendingDeletions, dict:erase(ManifestSQN, PendingDeletionD)}.
%%%============================================================================
%%% Test
@@ -240,13 +242,6 @@ return_deletions(ManifestSQN, PendingDeletionD) ->
-ifdef(TEST).
-return_deletions_test() ->
-    % During volume tests there would occasionally be a deletion prompt with
-    % an empty pending deletions dictionary. Don't understand why this would
-    % happen - so we check here that at least it does not kill the clerk
-    R = {[], dict:new()},
-    ?assertMatch(R, return_deletions(20, dict:new())).
generate_randomkeys(Count, BucketRangeLow, BucketRangeHigh) ->
    generate_randomkeys(Count, [], BucketRangeLow, BucketRangeHigh).
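
The functional change above is the swap from dict:find/2 to dict:fetch/2: a missing ManifestSQN is no longer absorbed and logged as PC020, it now raises and takes the clerk down, which is the "allowed to crash again" behaviour described in the new comment. Below is a minimal EUnit sketch of that difference in the OTP dict API; it is not part of this commit, and the module and test names are illustrative only.

-module(fetch_vs_find_sketch).
-include_lib("eunit/include/eunit.hrl").

fetch_vs_find_test() ->
    D = dict:store(1, [pending_file], dict:new()),
    % dict:find/2 returns a tagged result, so the old clause could absorb a
    % missing ManifestSQN and carry on with an empty deletion list.
    ?assertMatch(error, dict:find(2, D)),
    % dict:fetch/2 raises badarg for a missing key, so the clerk now crashes
    % if asked for a ManifestSQN it holds no pending deletions for.
    ?assertMatch([pending_file], dict:fetch(1, D)),
    ?assertError(badarg, dict:fetch(2, D)).

Run with eunit:test(fetch_vs_find_sketch). This is also why return_deletions_test/0, which asserted the old absorb-and-continue behaviour, is deleted in the same file.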

src/leveled_penciller.erl

@@ -511,10 +511,19 @@ handle_cast({levelzero_complete, FN, StartKey, EndKey}, State) ->
                          manifest=UpdMan,
                          persisted_sqn=State#state.ledger_sqn}};
handle_cast(work_for_clerk, State) ->
-    case State#state.levelzero_pending of
-        true ->
-            {noreply, State};
-        false ->
+    case {State#state.levelzero_pending, State#state.work_ongoing} of
+        {false, false} ->
+            % TODO - as part of supervision tree and retry work:
+            % Need to check for work_ongoing as well as levelzero_pending as
+            % there may be a race that could lead to the clerk doing the same
+            % thing twice.
+            %
+            % This has implications though if we auto-restart the pclerk in the
+            % future, without altering this state - it may never be able to
+            % request work due to ongoing work that crashed the previous clerk
+            %
+            % Perhaps the pclerk should not be restarted because of this, and
+            % the failure should ripple up
            {WL, WC} = leveled_pmanifest:check_for_work(State#state.manifest,
                                                        ?LEVEL_SCALEFACTOR),
            case WC of
@@ -534,7 +543,9 @@ handle_cast(work_for_clerk, State) ->
                                                   {TL, State#state.manifest}),
                    {noreply,
                        State#state{work_backlog=false, work_ongoing=true}}
-            end
+            end;
+        _ ->
+            {noreply, State}
    end.
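
The penciller change widens the guard in handle_cast(work_for_clerk, ...) from a single levelzero_pending check to a match on {levelzero_pending, work_ongoing}, so a merge is only pushed to the clerk when no L0 persist and no previously dispatched job is in flight. Below is a stripped-down sketch of that gating pattern; it is not the real leveled_penciller, and the state record, the work_complete message and the dispatch_work/0 placeholder are illustrative stand-ins.

-module(work_gate_sketch).
-behaviour(gen_server).
-export([init/1, handle_call/3, handle_cast/2, handle_info/2]).

-record(state, {levelzero_pending = false :: boolean(),
                work_ongoing = false :: boolean()}).

init([]) -> {ok, #state{}}.

handle_call(_Msg, _From, State) -> {reply, ok, State}.

handle_cast(work_for_clerk, State) ->
    case {State#state.levelzero_pending, State#state.work_ongoing} of
        {false, false} ->
            % Only dispatch when no L0 file is being persisted and no earlier
            % job is still in flight - this is what stops the clerk being
            % asked to do the same work twice.
            ok = dispatch_work(),
            {noreply, State#state{work_ongoing = true}};
        _ ->
            % Something is already in progress, so do nothing.  If the clerk
            % crashed mid-job and were auto-restarted without clearing
            % work_ongoing, this clause would starve it - the risk the TODO
            % above flags as a reason to let the failure ripple up instead.
            {noreply, State}
    end;
handle_cast(work_complete, State) ->
    % Stand-in for however the real Penciller learns the clerk has finished
    % and resets the work_ongoing flag.
    {noreply, State#state{work_ongoing = false}}.

handle_info(_Info, State) -> {noreply, State}.

dispatch_work() ->
    % Stands in for leveled_pclerk:clerk_push/2 in the real module.
    ok.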