Improve the quality of scoring

Move the average towards the current score when not scoring on each run. Score from more keys to get a better score (the overhead of scoring is now better managed by setting score_onein, rather than by reducing the sample size).
Martin Sumner 2020-11-27 20:03:44 +00:00
parent bcc331da10
commit 00823584ec
2 changed files with 23 additions and 7 deletions
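
To make the new decision logic concrete before reading the diff, here is a minimal sketch of the behaviour in plain Erlang. The module name score_sketch, the function choose_score/3 and the CurrentScoreFun callback are illustrative only, and the stdlib rand module stands in for leveled's leveled_rand wrapper; the real scoring call is check_single_file/7 as shown in the diff below.

    %% A sketch of the scoring decision, not leveled's actual API.
    %% CachedScore may be undefined if no score has been cached yet.
    -module(score_sketch).
    -export([choose_score/3]).

    choose_score(CachedScore, ScoreOneIn, CurrentScoreFun) ->
        UseNewScore = rand:uniform(ScoreOneIn) == 1,
        case {CachedScore, UseNewScore, ScoreOneIn} of
            {undefined, _, _} ->
                % Nothing cached, so the file must be scored this run
                CurrentScoreFun();
            {_, _, 1} ->
                % score_onein of 1 disables caching: score every run
                CurrentScoreFun();
            {_, true, _} ->
                % A scoring run: roll the average towards the current
                % score instead of replacing the cached value outright
                (CurrentScoreFun() + CachedScore) / 2;
            {_, false, _} ->
                % A non-scoring run: reuse the cached score
                CachedScore
        end.

Because each scoring run halves the gap between the cached and the current score, a stale cached score converges towards the true score geometrically rather than jumping, which is what reduces one-off compactions of individual files when surrounding journals are still using cached scores.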

View file

@@ -97,7 +97,7 @@
 -define(JOURNAL_FILEX, "cdb").
 -define(PENDING_FILEX, "pnd").
--define(SAMPLE_SIZE, 100).
+-define(SAMPLE_SIZE, 192).
 -define(BATCH_SIZE, 32).
 -define(BATCHES_TO_CHECK, 8).
 -define(CRC_SIZE, 4).
@@ -331,9 +331,11 @@ handle_cast({score_filelist, [Entry|Tail]}, State) ->
     ScoringState = State#state.scoring_state,
     CpctPerc =
         case {leveled_cdb:cdb_getcachedscore(JournalP, os:timestamp()),
-                leveled_rand:uniform(State#state.score_onein) == 1} of
-            {CachedScore, UseNewScore}
-                    when CachedScore == undefined; UseNewScore ->
+                leveled_rand:uniform(State#state.score_onein) == 1,
+                State#state.score_onein} of
+            {CachedScore, _UseNewScore, ScoreOneIn}
+                    when CachedScore == undefined; ScoreOneIn == 1 ->
+                % If caches are not used, always use the current score
                 check_single_file(JournalP,
                                     ScoringState#scoring_state.filter_fun,
                                     ScoringState#scoring_state.filter_server,
@@ -341,7 +343,21 @@ handle_cast({score_filelist, [Entry|Tail]}, State) ->
                                     ?SAMPLE_SIZE,
                                     ?BATCH_SIZE,
                                     State#state.reload_strategy);
-            {CachedScore, false} ->
+            {CachedScore, true, _ScoreOneIn} ->
+                % If caches are used roll the score towards the current score
+                % Expectation is that this will reduce instances of individual
+                % files being compacted when a run is missed due to cached
+                % scores being used in surrounding journals
+                NewScore =
+                    check_single_file(JournalP,
+                                        ScoringState#scoring_state.filter_fun,
+                                        ScoringState#scoring_state.filter_server,
+                                        ScoringState#scoring_state.max_sqn,
+                                        ?SAMPLE_SIZE,
+                                        ?BATCH_SIZE,
+                                        State#state.reload_strategy),
+                (NewScore + CachedScore) / 2;
+            {CachedScore, false, _ScoreOneIn} ->
                 CachedScore
         end,
     ok = leveled_cdb:cdb_putcachedscore(JournalP, CpctPerc),
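
Taken together with the ?SAMPLE_SIZE change above, the per-run cost works out as follows: each scoring pass now samples 192 keys instead of 100, but passes remain gated by score_onein. Purely for illustration, if score_onein were configured as 8, the amortised sampling cost moves from roughly 100 / 8 = 12.5 to 192 / 8 = 24 keys per file per run - still far cheaper than scoring on every run, while each pass that does happen draws on nearly twice the sample for a more reliable score.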

View file

@@ -850,7 +850,7 @@ code_change(_OldVsn, StateName, State, _Extra) ->
 %% @doc
 %% Expand a list of pointers, maybe ending up with a list of keys and values
 %% with a tail of pointers
-%% By defauls will not have a segment filter, or a low last_modified_date, but
+%% By default will not have a segment filter, or a low last_modified_date, but
 %% they can be used. Range checking a last modified date must still be made on
 %% the output - at this stage the low last_modified_date has been used to bulk
 %% skip those slots not containing any information over the low last modified
@@ -1867,7 +1867,7 @@ read_slots(Handle, SlotList, {SegList, LowLastMod, BlockIndexCache},
     % List of segments passed so only {K, V} pairs matching those segments
     % should be returned. This required the {K, V} pair to have been added
     % with the appropriate hash - if the pair were added with no_lookup as
-    % the hash value this will fial unexpectedly.
+    % the hash value this will fail unexpectedly.
     BinMapFun =
         fun(Pointer, Acc) ->
             {SP, _L, ID, SK, EK} = pointer_mapfun(Pointer),