Change score assessment

Wans't actually calculating scores correctly previously.
2018-09-26 12:56:28 +01:00 · 2018-09-26 12:56:28 +01:00 · d9229c9782
commit d9229c9782
parent 908f762a48
1 changed files with 68 additions and 108 deletions
--- a/src/leveled_iclerk.erl
+++ b/src/leveled_iclerk.erl
@ -118,6 +118,7 @@
                    compaction_perc :: float() | undefined}).

 -type iclerk_options() :: #iclerk_options{}.
+-type candidate():: #candidate{}.

 %%%============================================================================
 %%% API
@ -229,9 +230,9 @@ handle_cast({compact, Checker, InitiateFun, CloseFun, FilterFun, Inker, _TO},
        {MaxRunLength, 
            State#state.maxrunlength_compactionperc, 
            State#state.singlefile_compactionperc},
-    BestRun0 = assess_candidates(Candidates, ScoreParams),
-    case score_run(BestRun0, ScoreParams) of
-        Score when Score > 0.0 ->
+    {BestRun0, Score} = assess_candidates(Candidates, ScoreParams),
+    case Score > 0.0 of
+        true ->
            BestRun1 = sort_run(BestRun0),
            print_compaction_run(BestRun1, ScoreParams),
            ManifestSlice = compact_files(BestRun1,
@ -257,7 +258,7 @@ handle_cast({compact, Checker, InitiateFun, CloseFun, FilterFun, Inker, _TO},
                    ok = CloseFun(FilterServer),
                    {noreply, State}
            end;
-        Score ->
+        false ->
            leveled_log:log("IC003", [Score]),
            ok = leveled_inker:ink_compactioncomplete(Inker),
            ok = CloseFun(FilterServer),
@ -457,54 +458,42 @@ fetch_inbatches(PositionList, BatchSize, CDB, CheckedList) ->
    fetch_inbatches(Tail, BatchSize, CDB, CheckedList ++ KL_List).


+-spec assess_candidates(list(candidate()), tuple()) 
+                                            -> {list(candidate()), float()}.
+%% @doc
+%% For each run length we need to assess all the runs of candidates
+%% The best scoring run should emerge as the winner
+%% Although this requires many loops over the list of the candidate, as the
+%% file scores have already been calculated the cost per loop should not be
+%% a high burden.  Reducing the maximum run length, will reduce the cost of
+%% this exercise should be a problem
 assess_candidates(AllCandidates, Params) ->
-    NaiveBestRun = assess_candidates(AllCandidates, Params, [], []),
-    MaxRunLength = element(1, Params),
-    case length(AllCandidates) of
-        L when L > MaxRunLength, MaxRunLength > 1 ->
-            %% Assess with different offsets from the start
-            AssessFold = 
-                fun(Counter, BestRun) ->
-                    SubList = lists:nthtail(Counter, AllCandidates),
-                    assess_candidates(SubList, Params, [], BestRun)
-                end,
-
-            lists:foldl(AssessFold, 
-                            NaiveBestRun, 
-                            lists:seq(1, MaxRunLength - 1));
-        _ ->
-            NaiveBestRun
-    end.
-
-assess_candidates([], _Params, _CurrentRun0, BestAssessment) ->
-    BestAssessment;
-assess_candidates([HeadC|Tail], Params, CurrentRun0, BestAssessment) ->
-    CurrentRun1 = choose_best_assessment(CurrentRun0 ++ [HeadC],
-                                            [HeadC],
-                                            Params),
-    assess_candidates(Tail,
-                        Params,
-                        CurrentRun1,
-                        choose_best_assessment(CurrentRun1,
-                                                BestAssessment,
-                                                Params)).
+    MaxRunLength = min(element(1, Params), length(AllCandidates)),
+    NaiveBestRun = lists:sublist(AllCandidates, MaxRunLength),
+    FoldFun =
+        fun(RunLength, Best) ->
+            assess_for_runlength(RunLength, AllCandidates, Params, Best)
+        end,
+    
+    % Check all run lengths to find the best candidate.  Reverse the list of
+    % run lengths, so that longer runs win on equality of score
+    lists:foldl(FoldFun, 
+                {NaiveBestRun, score_run(NaiveBestRun, Params)}, 
+                lists:reverse(lists:seq(1, MaxRunLength))).


-choose_best_assessment(RunToAssess, BestRun, Params) ->
-    {MaxRunLength, _MR_CT, _SF_CT} = Params,
-    case length(RunToAssess) of
-        LR1 when LR1 > MaxRunLength ->
-            BestRun;
-        _ ->
-            AssessScore = score_run(RunToAssess, Params),
-            BestScore = score_run(BestRun, Params),
-            if
-                AssessScore > BestScore ->
-                    RunToAssess;
-                true ->
-                    BestRun
+assess_for_runlength(RunLength, AllCandidates, Params, Best) ->
+    NumberOfRuns = 1 + length(AllCandidates) - RunLength,
+    FoldFun =
+        fun(Offset, {BestRun, BestScore}) ->
+            Run = lists:sublist(AllCandidates, Offset, RunLength),
+            Score = score_run(Run, Params),
+            case Score > BestScore of
+                true -> {Run, Score};
+                false -> {BestRun, BestScore}
            end
-    end.    
+        end,
+    lists:foldl(FoldFun, Best, lists:seq(1, NumberOfRuns)).


 score_run([], _Params) ->
@ -740,22 +729,6 @@ simple_score_test() ->
    Run3 = [#candidate{compaction_perc = 100.0}],
    ?assertMatch(-60.0, score_run(Run3, {4, 70.0, 40.0})).

-score_compare_test() ->
-    Run1 = [#candidate{compaction_perc = 55.0},
-            #candidate{compaction_perc = 55.0},
-            #candidate{compaction_perc = 56.0},
-            #candidate{compaction_perc = 50.0}],
-    ?assertMatch(16.0, score_run(Run1, {4, 70.0, 40.0})),
-    Run2 = [#candidate{compaction_perc = 55.0}],
-    ?assertMatch(Run1, 
-                    choose_best_assessment(Run1, 
-                                            Run2, 
-                                            {4, 60.0, 40.0})),
-    ?assertMatch(Run2, 
-                    choose_best_assessment(Run1 ++ Run2, 
-                                            Run2, 
-                                            {4, 60.0, 40.0})).
-
 file_gc_test() ->
    State = #state{waste_path="test/waste/",
                    waste_retention_period=1},
@ -771,6 +744,11 @@ file_gc_test() ->
    {ok, ClearedJournals2} = file:list_dir(State#state.waste_path),
    ?assertMatch([], ClearedJournals2).

+
+check_bestrun(CandidateList, Params) ->
+    {BestRun, _Score} = assess_candidates(CandidateList, Params),
+    lists:map(fun(C) -> C#candidate.filename end, BestRun).
+
 find_bestrun_test() ->
 %% Tests dependent on these defaults
 %% -define(MAX_COMPACTION_RUN, 4).
@ -778,53 +756,35 @@ find_bestrun_test() ->
 %% -define(MAXRUNLENGTH_COMPACTION_TARGET, 60.0).
 %% Tested first with blocks significant as no back-tracking
    Params = {4, 60.0, 40.0},
-    Block1 = [#candidate{compaction_perc = 55.0},
-                #candidate{compaction_perc = 65.0},
-                #candidate{compaction_perc = 42.0},
-                #candidate{compaction_perc = 50.0}],
-    Block2 = [#candidate{compaction_perc = 38.0},
-                #candidate{compaction_perc = 75.0},
-                #candidate{compaction_perc = 75.0},
-                #candidate{compaction_perc = 45.0}],
-    Block3 = [#candidate{compaction_perc = 70.0},
-                #candidate{compaction_perc = 100.0},
-                #candidate{compaction_perc = 100.0},
-                #candidate{compaction_perc = 100.0}],
-    Block4 = [#candidate{compaction_perc = 55.0},
-                #candidate{compaction_perc = 56.0},
-                #candidate{compaction_perc = 57.0},
-                #candidate{compaction_perc = 40.0}],
-    Block5 = [#candidate{compaction_perc = 60.0},
-                #candidate{compaction_perc = 60.0}],
+    Block1 = [#candidate{compaction_perc = 55.0, filename = "a"},
+                #candidate{compaction_perc = 65.0, filename = "b"},
+                #candidate{compaction_perc = 42.0, filename = "c"},
+                #candidate{compaction_perc = 50.0, filename = "d"}],
+    Block2 = [#candidate{compaction_perc = 38.0, filename = "e"},
+                #candidate{compaction_perc = 75.0, filename = "f"},
+                #candidate{compaction_perc = 75.0, filename = "g"},
+                #candidate{compaction_perc = 45.0, filename = "h"}],
+    Block3 = [#candidate{compaction_perc = 70.0, filename = "i"},
+                #candidate{compaction_perc = 100.0, filename = "j"},
+                #candidate{compaction_perc = 100.0, filename = "k"},
+                #candidate{compaction_perc = 100.0, filename = "l"}],
+    Block4 = [#candidate{compaction_perc = 55.0, filename = "m"},
+                #candidate{compaction_perc = 56.0, filename = "n"},
+                #candidate{compaction_perc = 57.0, filename = "o"},
+                #candidate{compaction_perc = 40.0, filename = "p"}],
+    Block5 = [#candidate{compaction_perc = 60.0, filename = "q"},
+                #candidate{compaction_perc = 60.0, filename = "r"}],
    CList0 = Block1 ++ Block2 ++ Block3 ++ Block4 ++ Block5,
-    ?assertMatch(Block4, assess_candidates(CList0, Params, [], [])),
-    CList1 = CList0 ++ [#candidate{compaction_perc = 20.0}],
-    ?assertMatch([#candidate{compaction_perc = 20.0}],
-                    assess_candidates(CList1, Params, [], [])),
+    ?assertMatch(["b", "c", "d", "e"], check_bestrun(CList0, Params)),
+    CList1 = CList0 ++ [#candidate{compaction_perc = 20.0, filename="s"}],
+    ?assertMatch(["s"], check_bestrun(CList1, Params)),
    CList2 = Block4 ++ Block3 ++ Block2 ++ Block1 ++ Block5,
-    ?assertMatch(Block4, assess_candidates(CList2, Params, [], [])),
+    ?assertMatch(["h", "a", "b", "c"], check_bestrun(CList2, Params)),
    CList3 = Block5 ++ Block1 ++ Block2 ++ Block3 ++ Block4,
-    ?assertMatch([#candidate{compaction_perc = 42.0},
-                        #candidate{compaction_perc = 50.0},
-                        #candidate{compaction_perc = 38.0}],
-                    assess_candidates(CList3, Params)),
-    %% Now do some back-tracking to get a genuinely optimal solution without
-    %% needing to re-order
-    ?assertMatch([#candidate{compaction_perc = 42.0},
-                        #candidate{compaction_perc = 50.0},
-                        #candidate{compaction_perc = 38.0}],
-                    assess_candidates(CList0, Params)),
-    ?assertMatch([#candidate{compaction_perc = 42.0},
-                        #candidate{compaction_perc = 50.0},
-                        #candidate{compaction_perc = 38.0}],
-                    assess_candidates(CList0, setelement(1, Params, 5))),
-    ?assertMatch([#candidate{compaction_perc = 42.0},
-                        #candidate{compaction_perc = 50.0},
-                        #candidate{compaction_perc = 38.0},
-                        #candidate{compaction_perc = 75.0},
-                        #candidate{compaction_perc = 75.0},
-                        #candidate{compaction_perc = 45.0}], 
-                    assess_candidates(CList0, setelement(1, Params, 6))).
+    ?assertMatch(["b", "c", "d", "e"],check_bestrun(CList3, Params)).
+
+handle_emptycandidatelist_test() ->
+    ?assertMatch({[], 0.0}, assess_candidates([], {4, 60.0, 40.0})).

 test_ledgerkey(Key) ->
    {o, "Bucket", Key, null}.