Improve the quality of scoring

Move the average towards the current score when not scoring on each run. Score from more keys to get a better score (the overhead of scoring is now better managed by setting score_onein, rather than by reducing the sample size).
Martin Sumner 2020-11-27 20:03:44 +00:00
parent bcc331da10
commit 00823584ec
2 changed files with 23 additions and 7 deletions
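
To make the new decision logic concrete before reading the diff, here is a minimal sketch of the behaviour in plain Erlang. The module name score_sketch, the function choose_score/3 and the CurrentScoreFun callback are illustrative only, and the stdlib rand module stands in for leveled's leveled_rand wrapper; the real scoring call is check_single_file/7 as shown in the diff below.

    %% A sketch of the scoring decision, not leveled's actual API.
    %% CachedScore may be undefined if no score has been cached yet.
    -module(score_sketch).
    -export([choose_score/3]).

    choose_score(CachedScore, ScoreOneIn, CurrentScoreFun) ->
        UseNewScore = rand:uniform(ScoreOneIn) == 1,
        case {CachedScore, UseNewScore, ScoreOneIn} of
            {undefined, _, _} ->
                % Nothing cached, so the file must be scored this run
                CurrentScoreFun();
            {_, _, 1} ->
                % score_onein of 1 disables caching: score every run
                CurrentScoreFun();
            {_, true, _} ->
                % A scoring run: roll the average towards the current
                % score instead of replacing the cached value outright
                (CurrentScoreFun() + CachedScore) / 2;
            {_, false, _} ->
                % A non-scoring run: reuse the cached score
                CachedScore
        end.

Because each scoring run halves the gap between the cached and the current score, a stale cached score converges towards the true score geometrically rather than jumping, which is what reduces one-off compactions of individual files when surrounding journals are still using cached scores.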

View file

@@ -97,7 +97,7 @@
 -define(JOURNAL_FILEX, "cdb").
 -define(PENDING_FILEX, "pnd").
--define(SAMPLE_SIZE, 100).
+-define(SAMPLE_SIZE, 192).
 -define(BATCH_SIZE, 32).
 -define(BATCHES_TO_CHECK, 8).
 -define(CRC_SIZE, 4).
@@ -331,9 +331,11 @@ handle_cast({score_filelist, [Entry|Tail]}, State) ->
     ScoringState = State#state.scoring_state,
     CpctPerc =
         case {leveled_cdb:cdb_getcachedscore(JournalP, os:timestamp()),
-                leveled_rand:uniform(State#state.score_onein) == 1} of
-            {CachedScore, UseNewScore}
-                    when CachedScore == undefined; UseNewScore ->
+                leveled_rand:uniform(State#state.score_onein) == 1,
+                State#state.score_onein} of
+            {CachedScore, _UseNewScore, ScoreOneIn}
+                    when CachedScore == undefined; ScoreOneIn == 1 ->
+                % If caches are not used, always use the current score
                 check_single_file(JournalP,
                                     ScoringState#scoring_state.filter_fun,
                                     ScoringState#scoring_state.filter_server,
@@ -341,7 +343,21 @@ handle_cast({score_filelist, [Entry|Tail]}, State) ->
                                     ?SAMPLE_SIZE,
                                     ?BATCH_SIZE,
                                     State#state.reload_strategy);
-            {CachedScore, false} ->
+            {CachedScore, true, _ScoreOneIn} ->
+                % If caches are used roll the score towards the current score
+                % Expectation is that this will reduce instances of individual
+                % files being compacted when a run is missed due to cached
+                % scores being used in surrounding journals
+                NewScore =
+                    check_single_file(JournalP,
+                                        ScoringState#scoring_state.filter_fun,
+                                        ScoringState#scoring_state.filter_server,
+                                        ScoringState#scoring_state.max_sqn,
+                                        ?SAMPLE_SIZE,
+                                        ?BATCH_SIZE,
+                                        State#state.reload_strategy),
+                (NewScore + CachedScore) / 2;
+            {CachedScore, false, _ScoreOneIn} ->
                 CachedScore
         end,
     ok = leveled_cdb:cdb_putcachedscore(JournalP, CpctPerc),
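
Taken together with the ?SAMPLE_SIZE change above, the per-run cost works out as follows: each scoring pass now samples 192 keys instead of 100, but passes remain gated by score_onein. Purely for illustration, if score_onein were configured as 8, the amortised sampling cost moves from roughly 100 / 8 = 12.5 to 192 / 8 = 24 keys per file per run - still far cheaper than scoring on every run, while each pass that does happen draws on nearly twice the sample for a more reliable score.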

View file

@@ -850,7 +850,7 @@ code_change(_OldVsn, StateName, State, _Extra) ->
 %% @doc
 %% Expand a list of pointers, maybe ending up with a list of keys and values
 %% with a tail of pointers
-%% By defauls will not have a segment filter, or a low last_modified_date, but
+%% By default will not have a segment filter, or a low last_modified_date, but
 %% they can be used. Range checking a last modified date must still be made on
 %% the output - at this stage the low last_modified_date has been used to bulk
 %% skip those slots not containing any information over the low last modified
@@ -1867,7 +1867,7 @@ read_slots(Handle, SlotList, {SegList, LowLastMod, BlockIndexCache},
     % List of segments passed so only {K, V} pairs matching those segments
     % should be returned. This required the {K, V} pair to have been added
     % with the appropriate hash - if the pair were added with no_lookup as
-    % the hash value this will fial unexpectedly.
+    % the hash value this will fail unexpectedly.
     BinMapFun =
         fun(Pointer, Acc) ->
             {SP, _L, ID, SK, EK} = pointer_mapfun(Pointer),