Boost docs
Try and make this easier to follow with docs and specs
parent 979c65f0af
commit 7885f93ba6
1 changed file with 47 additions and 6 deletions
@@ -119,6 +119,18 @@
-type iclerk_options() :: #iclerk_options{}.
-type candidate() :: #candidate{}.
-type score_parameters() :: {integer(), float(), float()}.
% Score parameters are a tuple of:
% - maximum run length; how long a run of consecutive files can be for
% one compaction run
% - maximum run compaction target; the percentage of space which may be
% retained by a compaction run of the maximum length for the run to be
% worthwhile of compaction (released space is 100.0 - target, e.g. a target
% of 70.0 means that at least 30.0% should be released)
% - single_file compaction target; the percentage of space which may be
% retained by a compaction run of a single file for the run to be
% worthwhile of compaction (released space is 100.0 - target, e.g. a target
% of 70.0 means that at least 30.0% should be released)

%%%============================================================================
%%% API
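As a concrete illustration of the score_parameters() shape described above, the tuple below can be pasted into an Erlang shell. The values simply reuse the figures from the worked example in the assess_candidates/2 @doc further down this diff; they are not defaults taken from the module.

%% Illustrative values only: a maximum run length of 4, a 70.0% retention
%% target for a run of maximum length, and a stricter 50.0% retention target
%% for a single file.
ScoreParams = {4, 70.0, 50.0},
{MaxRunLength, MaxRunTarget, SingleFileTarget} = ScoreParams.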
@@ -462,15 +474,31 @@ fetch_inbatches(PositionList, BatchSize, CDB, CheckedList) ->
    fetch_inbatches(Tail, BatchSize, CDB, CheckedList ++ KL_List).


-spec assess_candidates(list(candidate()), tuple())
-spec assess_candidates(list(candidate()), score_parameters())
        -> {list(candidate()), float()}.
%% @doc
%% For each run length we need to assess all the runs of candidates
%% The best scoring run should emerge as the winner
%% For each run length we need to assess all the possible runs of candidates,
%% to determine which is the best score - to be put forward as the best
%% candidate run for compaction.
%%
%% Although this requires many loops over the list of candidates, as the
%% file scores have already been calculated the cost per loop should not be
%% a high burden. Reducing the maximum run length will reduce the cost of
%% this exercise should be a problem
%% this exercise, should it become a problem.
%%
%% The score parameters are used to produce the score of the compaction run,
%% with a higher score being better. The parameters are the maximum run
%% length and the compaction targets (for max run length and single file).
%% The score of an individual file is the approximate percentage of the space
%% that would be retained after compaction (e.g. 100 less the percentage of
%% space wasted by historic objects).
%%
%% So a file score of 60% indicates that 40% of the space would be
%% reclaimed following compaction. A single file target of 50% would not be
%% met for this file. However, if there are 4 consecutive files scoring 60%,
%% and the maximum run length is 4, and the maximum run length compaction
%% target is 70% - then this run of four files would be a viable candidate
%% for compaction.
assess_candidates(AllCandidates, Params) ->
    MaxRunLength = min(element(1, Params), length(AllCandidates)),
    NaiveBestRun = lists:sublist(AllCandidates, MaxRunLength),
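To make the arithmetic of that worked example explicit, here is the "deduct the average file score from a target" rule (as documented for score_run/2 further down this diff) evaluated by hand in the shell - an illustration only, not a call into the module itself:

1> 50.0 - 60.0.                                     % single file vs 50.0 target
-10.0
2> 70.0 - lists:sum([60.0, 60.0, 60.0, 60.0]) / 4.  % run of 4 vs 70.0 target
10.0

The single file scores negative and is rejected; the run of four scores positive and is a viable candidate.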
@@ -478,7 +506,6 @@ assess_candidates(AllCandidates, Params) ->
        fun(RunLength, Best) ->
            assess_for_runlength(RunLength, AllCandidates, Params, Best)
        end,

    % Check all run lengths to find the best candidate. Reverse the list of
    % run lengths, so that longer runs win on equality of score
    lists:foldl(FoldFun,
@@ -486,6 +513,14 @@ assess_candidates(AllCandidates, Params) ->
                lists:reverse(lists:seq(1, MaxRunLength))).


-spec assess_for_runlength(integer(), list(candidate()), score_parameters(),
                            {list(candidate()), float()})
        -> {list(candidate()), float()}.
%% @doc
%% For a given run length, calculate the scores for all consecutive runs of
%% files, comparing the score with the best run which has been seen so far.
%% The best is a tuple of the actual run of candidates, along with the score
%% achieved for that run.
assess_for_runlength(RunLength, AllCandidates, Params, Best) ->
    NumberOfRuns = 1 + length(AllCandidates) - RunLength,
    FoldFun =
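As a sketch of what "all consecutive runs" means here: with 6 candidates and a run length of 4 there are 1 + 6 - 4 = 3 windows to score. They can be enumerated as below; lists:sublist/3 is used purely for illustration, since the fold body that actually builds each run is elided from this hunk.

1> AllCandidates = [c1, c2, c3, c4, c5, c6], RunLength = 4.
4
2> NumberOfRuns = 1 + length(AllCandidates) - RunLength.
3
3> [lists:sublist(AllCandidates, Ix, RunLength) || Ix <- lists:seq(1, NumberOfRuns)].
[[c1,c2,c3,c4],[c2,c3,c4,c5],[c3,c4,c5,c6]]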
@@ -500,6 +535,12 @@ assess_for_runlength(RunLength, AllCandidates, Params, Best) ->
    lists:foldl(FoldFun, Best, lists:seq(1, NumberOfRuns)).


-spec score_run(list(candidate()), score_parameters()) -> float().
%% @doc
%% Score a run. Calculate the average score across all the files in the run,
%% and deduct that from a target score. Good candidate runs for compaction
%% have larger (positive) scores. Bad candidate runs for compaction have
%% negative scores.
score_run([], _Params) ->
    0.0;
score_run(Run, {MaxRunLength, MR_CT, SF_CT}) ->
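The hunk ends before the body of score_run/2, so the following is a rough sketch only of a scoring function consistent with the @doc above: average the per-file scores and deduct that from a per-run-length target. Taking the scores as a plain list of floats, and moving the target linearly from the single-file target up to the max-run target as the run gets longer, are both assumptions made for illustration, not details shown in this diff.

%% Sketch under assumptions: FileScores is the list of per-file scores
%% (percent of space retained) for the run; the target is assumed to scale
%% linearly between SF_CT and MR_CT with run length.
sketch_score_run([], _Params) ->
    0.0;
sketch_score_run(FileScores, {MaxRunLength, MR_CT, SF_CT}) ->
    RunLength = length(FileScores),
    TargetIncr =
        case MaxRunLength of
            1 -> 0.0;
            _ -> (MR_CT - SF_CT) / (MaxRunLength - 1)
        end,
    Target = SF_CT + TargetIncr * (RunLength - 1),
    Target - lists:sum(FileScores) / RunLength.

With the example figures from the @doc, sketch_score_run([60.0, 60.0, 60.0, 60.0], {4, 70.0, 50.0}) comes out positive (around +10) and sketch_score_run([60.0], {4, 70.0, 50.0}) comes out negative (around -10), matching the viability described there.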