From b4c79caf7a83e280ea5481a19e14572ec4be203c Mon Sep 17 00:00:00 2001
From: Martin Sumner <martin.sumner@adaptip.co.uk>
Date: Fri, 27 Nov 2020 02:35:27 +0000
Subject: [PATCH 1/9] Allow for caching of compaction scores

Potentially reduce the overheads of scoring each file on every run.

The change also alters the default thresholds for compaction to favour longer runs (which will tend towards greater storage efficiency).
---
 docs/STARTUP_OPTIONS.md            |  3 +-
 include/leveled.hrl                |  2 +
 priv/leveled.schema                |  9 +++-
 src/leveled_bookie.erl             | 11 ++++-
 src/leveled_cdb.erl                | 37 ++++++++++++--
 src/leveled_iclerk.erl             | 23 +++++++--
 src/leveled_inker.erl              |  4 +-
 test/end_to_end/basic_SUITE.erl    |  2 +-
 test/end_to_end/recovery_SUITE.erl | 79 +++++++++++++++++++++++++++++-
 9 files changed, 153 insertions(+), 17 deletions(-)

diff --git a/docs/STARTUP_OPTIONS.md b/docs/STARTUP_OPTIONS.md
index c802f73..4dad87e 100644
--- a/docs/STARTUP_OPTIONS.md
+++ b/docs/STARTUP_OPTIONS.md
@@ -106,8 +106,9 @@ The `compaction_runs_perday` indicates for the leveled store how many times eahc
 
 The `compaction_low_hour` and `compaction_high_hour` are the hours of the day which support the compaction window - set to 0 and 23 respectively if compaction is required to be a continuous process.
 
-The `max_run_length` controls how many files can be compacted in a single compaction run.  The scoring of files and runs is controlled through `maxrunlength_compactionpercentage` and `singlefile_compactionpercentage`.
+The `max_run_length` controls how many files can be compacted in a single compaction run.  The scoring of files and runs is controlled through `maxrunlength_compactionpercentage` and `singlefile_compactionpercentage`.  The `singlefile_compactionpercentage` is an acceptable compaction score for a file to be eligible for compaction on its own, where as the `maxrunlength_compactionpercentage` is the score required for a run of the `max_run_length` to be considered eligible.  The higher the `maxrunlength_compactionpercentage` and the lower the `singlefile_compactionpercentage` - the more likely a longer run will be chosen over a shorter run.
 
+The `journalcompaction_scoreonein` option controls how frequently a file will be scored.  If this is set to one, then each and every file will be scored each and every compaction run.  If this is set to an integer greater than one ('n'), then on average any given file will only be score on one in 'n' runs.  On other runs. a cached score for the file will be used.  On startup all files will be scored on the first run.  As journals get very large, and where frequent comapction is required due to mutating objects, this can save significant resource.
 
 ## Snapshot Timeouts
 
diff --git a/include/leveled.hrl b/include/leveled.hrl
index 761b9a6..9db7941 100644
--- a/include/leveled.hrl
+++ b/include/leveled.hrl
@@ -69,6 +69,7 @@
                         max_run_length,
                         singlefile_compactionperc :: float()|undefined,
                         maxrunlength_compactionperc :: float()|undefined,
+                        score_onein = 1 :: pos_integer(),
                         snaptimeout_long :: pos_integer() | undefined}).
 
 -record(penciller_options,
@@ -94,4 +95,5 @@
                          compression_method = native :: lz4|native,
                          singlefile_compactionperc :: float()|undefined,
                          maxrunlength_compactionperc :: float()|undefined,
+                         score_onein = 1 :: pos_integer(),
                          reload_strategy = [] :: list()}).
diff --git a/priv/leveled.schema b/priv/leveled.schema
index 7c259dc..ab7300b 100644
--- a/priv/leveled.schema
+++ b/priv/leveled.schema
@@ -100,6 +100,13 @@
   {datatype, integer}
 ]}.
 
+%% @doc The number of times per day to score an individual file for compaction
+{mapping, "leveled.compaction_scores_perday", "leveled.compaction_scores_perday", [
+  {default, 1},
+  {datatype, integer},
+  hidden
+]}.
+
 %% @doc Compaction Low Hour
 %% The hour of the day in which journal compaction can start.  Use Low hour 
 %% of 0 and High hour of 23 to have no compaction window (i.e. always compact 
@@ -143,7 +150,7 @@
 %% then it is a candidate (e.g. in default case if 50% of space would be
 %% recovered)
 {mapping, "leveled.singlefile_compactionpercentage", "leveled.singlefile_compactionpercentage", [
-  {default, 50.0},
+  {default, 30.0},
   {datatype, float},
   hidden
 ]}.
diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl
index e1a2728..b2d9629 100644
--- a/src/leveled_bookie.erl
+++ b/src/leveled_bookie.erl
@@ -140,8 +140,9 @@
                 {head_only, false},
                 {waste_retention_period, undefined},
                 {max_run_length, undefined},
-                {singlefile_compactionpercentage, 50.0},
+                {singlefile_compactionpercentage, 30.0},
                 {maxrunlength_compactionpercentage, 70.0},
+                {journalcompaction_scoreonein, 1},
                 {reload_strategy, []},
                 {max_pencillercachesize, ?MAX_PCL_CACHE_SIZE},
                 {ledger_preloadpagecache_level, ?SST_PAGECACHELEVEL_LOOKUP},
@@ -292,6 +293,11 @@
             % a run of max_run_length, before that run can be a compaction 
             % candidate.  For runs between 1 and max_run_length, a 
             % proportionate score is calculated
+        {journalcompaction_scoreonein, pos_integer()} |
+            % When scoring for compaction, the probability (1 in x) that any
+            % given file will be scored this run.  If not scored, a cached
+            % score will be used, and the cached score is the average of the
+            % latest score and the rolling average of previous scores
         {reload_strategy, list()} |
             % The reload_strategy is exposed as an option as currently no firm
             % decision has been made about how recovery from failure should
@@ -1757,6 +1763,8 @@ set_options(Opts) ->
     
     MaxSSTSlots = proplists:get_value(max_sstslots, Opts),
 
+    ScoreOneIn = proplists:get_value(journalcompaction_scoreonein, Opts),
+
     {#inker_options{root_path = JournalFP,
                         reload_strategy = ReloadStrategy,
                         max_run_length = proplists:get_value(max_run_length, Opts),
@@ -1766,6 +1774,7 @@ set_options(Opts) ->
                         snaptimeout_long = SnapTimeoutLong,
                         compression_method = CompressionMethod,
                         compress_on_receipt = CompressOnReceipt,
+                        score_onein = ScoreOneIn,
                         cdb_options = 
                             #cdb_options{max_size=MaxJournalSize,
                                         max_count=MaxJournalCount,
diff --git a/src/leveled_cdb.erl b/src/leveled_cdb.erl
index 3de322b..d91d2b7 100644
--- a/src/leveled_cdb.erl
+++ b/src/leveled_cdb.erl
@@ -113,7 +113,9 @@
             cdb_deletepending/1,
             cdb_deletepending/3,
             cdb_isrolling/1,
-            cdb_clerkcomplete/1]).
+            cdb_clerkcomplete/1,
+            cdb_getcachedscore/1,
+            cdb_putcachedscore/2]).
 
 -export([finished_rolling/1,
             hashtable_calc/2]).
@@ -152,7 +154,8 @@
                 timings = no_timing :: cdb_timings(),
                 timings_countdown = 0 :: integer(),
                 log_options = leveled_log:get_opts()
-                    :: leveled_log:log_options()}).
+                    :: leveled_log:log_options(),
+                cached_score :: float()|undefined}).
 
 -record(cdb_timings, {sample_count = 0 :: integer(),
                         sample_cyclecount = 0 :: integer(),
@@ -164,6 +167,9 @@
 -type cdb_timings() :: no_timing|#cdb_timings{}.
 -type hashtable_index() :: tuple().
 -type file_location() :: integer()|eof.
+-type filter_fun() ::
+        fun((any(), binary(), integer(), any(), fun((binary()) -> any())) ->
+            {stop|loop, any()}).
 
 
 
@@ -369,7 +375,7 @@ cdb_deletepending(Pid) ->
 cdb_deletepending(Pid, ManSQN, Inker) ->
     gen_fsm:send_event(Pid, {delete_pending, ManSQN, Inker}).
 
--spec cdb_scan(pid(), fun(), any(), integer()|undefined) ->
+-spec cdb_scan(pid(), filter_fun(), any(), integer()|undefined) ->
                                                     {integer()|eof, any()}.
 %% @doc
 %% cdb_scan returns {LastPosition, Acc}.  Use LastPosition as StartPosiiton to
@@ -424,6 +430,20 @@ cdb_isrolling(Pid) ->
 cdb_clerkcomplete(Pid) ->
     gen_fsm:send_all_state_event(Pid, clerk_complete).
 
+-spec cdb_getcachedscore(pid()) -> undefined|float().
+%% @doc
+%% Return the cached score for a CDB file
+cdb_getcachedscore(Pid) ->
+    gen_fsm:sync_send_all_state_event(Pid, get_cachedscore, infinity).
+
+
+-spec cdb_putcachedscore(pid(), float()) -> ok.
+%% @doc
+%% Store a compaction score in the cache for a CDB file
+cdb_putcachedscore(Pid, Score) ->
+    gen_fsm:sync_send_all_state_event(Pid, {put_cachedscore, Score}, infinity).
+
+
 
 %%%============================================================================
 %%% gen_server callbacks
@@ -829,6 +849,10 @@ handle_sync_event(cdb_filename, _From, StateName, State) ->
     {reply, State#state.filename, StateName, State};
 handle_sync_event(cdb_isrolling, _From, StateName, State) ->
     {reply, StateName == rolling, StateName, State};
+handle_sync_event(get_cachedscore, _From, StateName, State) ->
+    {reply, State#state.cached_score, StateName, State};
+handle_sync_event({put_cachedscore, Score}, _From, StateName, State) ->
+    {reply, ok, StateName, State#state{cached_score = Score}};
 handle_sync_event(cdb_close, _From, delete_pending, State) ->
     leveled_log:log("CDB05", 
                         [State#state.filename, delete_pending, cdb_close]),
@@ -836,8 +860,7 @@ handle_sync_event(cdb_close, _From, delete_pending, State) ->
                         State#state.filename, 
                         State#state.waste_path),
     {stop, normal, ok, State};
-handle_sync_event(cdb_close, _From, StateName, State) ->
-    leveled_log:log("CDB05", [State#state.filename, StateName, cdb_close]),
+handle_sync_event(cdb_close, _From, _StateName, State) ->
     file:close(State#state.handle),
     {stop, normal, ok, State}.
 
@@ -2396,6 +2419,10 @@ get_keys_byposition_manykeys_test_to() ->
     SampleList3 = cdb_getpositions(P2, KeyCount + 1),
     ?assertMatch(KeyCount, length(SampleList3)),
     
+    ?assertMatch(undefined, cdb_getcachedscore(P2)),
+    ok = cdb_putcachedscore(P2, 80.0),
+    ?assertMatch(80.0, cdb_getcachedscore(P2)),
+
     ok = cdb_close(P2),
     ok = file:delete(F2).
 
diff --git a/src/leveled_iclerk.erl b/src/leveled_iclerk.erl
index 9f1256d..c2c34d1 100644
--- a/src/leveled_iclerk.erl
+++ b/src/leveled_iclerk.erl
@@ -117,7 +117,8 @@
                 maxrunlength_compactionperc = ?MAXRUNLENGTH_COMPACTION_TARGET ::float(),
                 compression_method = native :: lz4|native,
                 scored_files = [] :: list(candidate()),
-                scoring_state :: scoring_state()|undefined}).
+                scoring_state :: scoring_state()|undefined,
+                score_onein = 1 :: pos_integer()}).
 
 -record(candidate, {low_sqn :: integer() | undefined,
                     filename :: string() | undefined,
@@ -270,7 +271,7 @@ init([LogOpts, IClerkOpts]) ->
             MRLCP when is_float(MRLCP) ->
                 MRLCP
         end,
-
+    
     {ok, #state{max_run_length = MRL,
                         inker = IClerkOpts#iclerk_options.inker,
                         cdb_options = CDBopts,
@@ -280,7 +281,10 @@ init([LogOpts, IClerkOpts]) ->
                         singlefile_compactionperc = SFL_CompPerc,
                         maxrunlength_compactionperc = MRL_CompPerc,
                         compression_method = 
-                            IClerkOpts#iclerk_options.compression_method}}.
+                            IClerkOpts#iclerk_options.compression_method,
+                        score_onein = 
+                            IClerkOpts#iclerk_options.score_onein
+                        }}.
 
 handle_call(stop, _From, State) ->
     case State#state.scoring_state of
@@ -325,13 +329,22 @@ handle_cast({score_filelist, [Entry|Tail]}, State) ->
     Candidates = State#state.scored_files,
     {LowSQN, FN, JournalP, _LK} = Entry,
     ScoringState = State#state.scoring_state,
-    CpctPerc = check_single_file(JournalP,
+    CpctPerc =
+        case {leveled_cdb:cdb_getcachedscore(JournalP),
+                leveled_rand:uniform(State#state.score_onein) == 1} of
+            {CachedScore, UseNewScore} 
+                    when CachedScore == undefined; UseNewScore ->
+                check_single_file(JournalP,
                                     ScoringState#scoring_state.filter_fun,
                                     ScoringState#scoring_state.filter_server,
                                     ScoringState#scoring_state.max_sqn,
                                     ?SAMPLE_SIZE,
                                     ?BATCH_SIZE,
-                                    State#state.reload_strategy),
+                                    State#state.reload_strategy);
+            {CachedScore, false} ->
+                CachedScore
+        end,
+    ok = leveled_cdb:cdb_putcachedscore(JournalP, CpctPerc),
     Candidate =
         #candidate{low_sqn = LowSQN,
                     filename = FN,
diff --git a/src/leveled_inker.erl b/src/leveled_inker.erl
index 432006c..8391007 100644
--- a/src/leveled_inker.erl
+++ b/src/leveled_inker.erl
@@ -806,6 +806,7 @@ start_from_file(InkOpts) ->
     PressMethod = InkOpts#inker_options.compression_method,
     PressOnReceipt = InkOpts#inker_options.compress_on_receipt,
     SnapTimeout = InkOpts#inker_options.snaptimeout_long,
+    ScoreOneIn = InkOpts#inker_options.score_onein,
 
     IClerkOpts = 
         #iclerk_options{inker = self(),
@@ -815,7 +816,8 @@ start_from_file(InkOpts) ->
                             compression_method = PressMethod,
                             max_run_length = MRL,
                             singlefile_compactionperc = SFL_CompactPerc,
-                            maxrunlength_compactionperc = MRL_CompactPerc},
+                            maxrunlength_compactionperc = MRL_CompactPerc,
+                            score_onein = ScoreOneIn},
     
     {ok, Clerk} = leveled_iclerk:clerk_new(IClerkOpts),
     
diff --git a/test/end_to_end/basic_SUITE.erl b/test/end_to_end/basic_SUITE.erl
index 2495eb8..ac1a5ea 100644
--- a/test/end_to_end/basic_SUITE.erl
+++ b/test/end_to_end/basic_SUITE.erl
@@ -299,7 +299,7 @@ journal_compaction_tester(Restart, WRP) ->
     end,
     ok = leveled_penciller:pcl_close(PclClone),
     ok = leveled_inker:ink_close(InkClone),
-    % Snapshot released so deletes shoudl occur at next timeout
+    % Snapshot released so deletes should occur at next timeout
     case WRP of 
         undefined ->
             timer:sleep(10100); % wait for delete_pending timeout
diff --git a/test/end_to_end/recovery_SUITE.erl b/test/end_to_end/recovery_SUITE.erl
index 44d9418..162900d 100644
--- a/test/end_to_end/recovery_SUITE.erl
+++ b/test/end_to_end/recovery_SUITE.erl
@@ -16,7 +16,8 @@
             journal_compaction_bustedjournal/1,
             close_duringcompaction/1,
             allkeydelta_journal_multicompact/1,
-            recompact_keydeltas/1
+            recompact_keydeltas/1,
+            simple_cachescoring/1
             ]).
 
 all() -> [
@@ -33,7 +34,8 @@ all() -> [
             close_duringcompaction,
             allkeydelta_journal_multicompact,
             recompact_keydeltas,
-            stdtag_recalc
+            stdtag_recalc,
+            simple_cachescoring
             ].
 
 
@@ -555,6 +557,79 @@ aae_missingjournal(_Config) ->
     ok = leveled_bookie:book_close(Bookie2),
     testutil:reset_filestructure().
 
+simple_cachescoring(_Config) ->
+    RootPath = testutil:reset_filestructure(),
+    StartOpts = [{root_path, RootPath},
+                    {max_journalobjectcount, 2000},
+                    {sync_strategy, testutil:sync_strategy()}],
+    {ok, Bookie1} =
+        leveled_bookie:book_start(StartOpts ++
+                                    [{journalcompaction_scoreonein, 8}]),
+    {TestObject, TestSpec} = testutil:generate_testobject(),
+    ok = testutil:book_riakput(Bookie1, TestObject, TestSpec),
+    testutil:check_forobject(Bookie1, TestObject),
+    GenList = [2, 32002, 64002, 96002],
+    _CLs = testutil:load_objects(32000, GenList, Bookie1, TestObject,
+                                fun testutil:generate_objects/2),
+    
+    F = fun leveled_bookie:book_islastcompactionpending/1,
+    WaitForCompaction =
+        fun(B) -> 
+            fun(X, Pending) ->
+                case X of 
+                    1 ->
+                        leveled_bookie:book_compactjournal(B, 30000);
+                    _ ->
+                        ok
+                end,
+                case Pending of
+                    false ->
+                        false;
+                    true ->
+                        io:format("Loop ~w waiting for journal "
+                            ++ "compaction to complete~n", [X]),
+                        timer:sleep(100),
+                        F(B)
+                end
+            end
+        end,
+    io:format("Scoring for first time - every file should need scoring~n"),
+    Args1 = [WaitForCompaction(Bookie1), true, lists:seq(1, 300)],
+    {TC0, false} = timer:tc(lists, foldl, Args1),
+    io:format("Score four more times with cached scoring~n"),
+    {TC1, false} = timer:tc(lists, foldl, Args1),
+    {TC2, false} = timer:tc(lists, foldl, Args1),
+    {TC3, false} = timer:tc(lists, foldl, Args1),
+    {TC4, false} = timer:tc(lists, foldl, Args1),
+    
+    ok = leveled_bookie:book_close(Bookie1),
+    {ok, Bookie2} =
+        leveled_bookie:book_start(StartOpts),
+    io:format("Re-opened bookie withour caching - re-compare compaction time~n"),
+    io:format("Scoring for first time - every file should need scoring~n"),
+    Args2 = [WaitForCompaction(Bookie2), true, lists:seq(1, 300)],
+    {TN0, false} = timer:tc(lists, foldl, Args2),
+    io:format("Score four more times with cached scoring~n"),
+    {TN1, false} = timer:tc(lists, foldl, Args2),
+    {TN2, false} = timer:tc(lists, foldl, Args2),
+    {TN3, false} = timer:tc(lists, foldl, Args2),
+    {TN4, false} = timer:tc(lists, foldl, Args2),
+    
+    AvgSecondRunCache = (TC1 + TC2 +TC3 + TC4) div 4000,
+    AvgSecondRunNoCache = (TN1 + TN2 +TN3 + TN4) div 4000,
+
+    io:format("With caching ~w first run ~w average other runs~n",
+                [TC0 div 1000, AvgSecondRunCache]),
+    io:format("Without caching ~w first run ~w average other runs~n",
+                [TN0 div 1000, AvgSecondRunNoCache]),
+    true = (TC0 > AvgSecondRunCache),
+    true = (TC0/AvgSecondRunCache) > (TN0/AvgSecondRunNoCache),
+    ok = leveled_bookie:book_close(Bookie2),
+
+    io:format("Exit having proven simply that caching score is faster~n"),
+    testutil:reset_filestructure().
+
+
 aae_bustedjournal(_Config) ->
     RootPath = testutil:reset_filestructure(),
     StartOpts = [{root_path, RootPath},

From 0690136ab20ad3089eaa32ce2883ccaef39e268f Mon Sep 17 00:00:00 2001
From: Martin Sumner <martin.sumner@adaptip.co.uk>
Date: Fri, 27 Nov 2020 03:01:38 +0000
Subject: [PATCH 2/9] Clarify how the new option will be controlled in Riak

---
 docs/STARTUP_OPTIONS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/STARTUP_OPTIONS.md b/docs/STARTUP_OPTIONS.md
index 4dad87e..d09ebd3 100644
--- a/docs/STARTUP_OPTIONS.md
+++ b/docs/STARTUP_OPTIONS.md
@@ -108,7 +108,7 @@ The `compaction_low_hour` and `compaction_high_hour` are the hours of the day wh
 
 The `max_run_length` controls how many files can be compacted in a single compaction run.  The scoring of files and runs is controlled through `maxrunlength_compactionpercentage` and `singlefile_compactionpercentage`.  The `singlefile_compactionpercentage` is an acceptable compaction score for a file to be eligible for compaction on its own, where as the `maxrunlength_compactionpercentage` is the score required for a run of the `max_run_length` to be considered eligible.  The higher the `maxrunlength_compactionpercentage` and the lower the `singlefile_compactionpercentage` - the more likely a longer run will be chosen over a shorter run.
 
-The `journalcompaction_scoreonein` option controls how frequently a file will be scored.  If this is set to one, then each and every file will be scored each and every compaction run.  If this is set to an integer greater than one ('n'), then on average any given file will only be score on one in 'n' runs.  On other runs. a cached score for the file will be used.  On startup all files will be scored on the first run.  As journals get very large, and where frequent comapction is required due to mutating objects, this can save significant resource.
+The `journalcompaction_scoreonein` option controls how frequently a file will be scored.  If this is set to one, then each and every file will be scored each and every compaction run.  If this is set to an integer greater than one ('n'), then on average any given file will only be scored on one in 'n' runs.  On other runs, a cached score for the file will be used.  On startup all files will be scored on the first run.  As journals get very large, and where frequent compaction is required due to mutating objects, this can save significant resource.  In Riak, this option is controlled via `leveled.compaction_scores_perday`, with the number of `leveled.compaction_runs_perday` being divided by this to produce the `journalcompaction_scoreonein`.  By default each file will only be scored once per day.
 
 ## Snapshot Timeouts
 

From be562c85cb77d38c82f65d58bea58ee2526f5688 Mon Sep 17 00:00:00 2001
From: Martin Sumner <martin.sumner@adaptip.co.uk>
Date: Fri, 27 Nov 2020 03:12:35 +0000
Subject: [PATCH 3/9] Don't hide option

---
 priv/leveled.schema | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/priv/leveled.schema b/priv/leveled.schema
index ab7300b..8e85925 100644
--- a/priv/leveled.schema
+++ b/priv/leveled.schema
@@ -103,8 +103,7 @@
 %% @doc The number of times per day to score an individual file for compaction
 {mapping, "leveled.compaction_scores_perday", "leveled.compaction_scores_perday", [
   {default, 1},
-  {datatype, integer},
-  hidden
+  {datatype, integer}
 ]}.
 
 %% @doc Compaction Low Hour

From bcc331da1050b0ffbcca47fc81022de366601433 Mon Sep 17 00:00:00 2001
From: Martin Sumner <martin.sumner@adaptip.co.uk>
Date: Fri, 27 Nov 2020 13:56:47 +0000
Subject: [PATCH 4/9] Set max limit of 24 hours on cached score

---
 src/leveled_cdb.erl    | 42 ++++++++++++++++++++++++++++++++----------
 src/leveled_iclerk.erl |  2 +-
 2 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/src/leveled_cdb.erl b/src/leveled_cdb.erl
index d91d2b7..d6f5e7f 100644
--- a/src/leveled_cdb.erl
+++ b/src/leveled_cdb.erl
@@ -114,7 +114,7 @@
             cdb_deletepending/3,
             cdb_isrolling/1,
             cdb_clerkcomplete/1,
-            cdb_getcachedscore/1,
+            cdb_getcachedscore/2,
             cdb_putcachedscore/2]).
 
 -export([finished_rolling/1,
@@ -135,6 +135,8 @@
 -define(GETPOS_FACTOR, 8).
 -define(MAX_OBJECT_SIZE, 1000000000). 
     % 1GB but really should be much smaller than this
+-define(MEGA, 1000000).
+-define(CACHE_LIFE, 86400).
 
 -record(state, {hashtree,
                 last_position :: integer() | undefined,
@@ -155,7 +157,7 @@
                 timings_countdown = 0 :: integer(),
                 log_options = leveled_log:get_opts()
                     :: leveled_log:log_options(),
-                cached_score :: float()|undefined}).
+                cached_score :: {float(), erlang:timestamp()}|undefined}).
 
 -record(cdb_timings, {sample_count = 0 :: integer(),
                         sample_cyclecount = 0 :: integer(),
@@ -430,11 +432,11 @@ cdb_isrolling(Pid) ->
 cdb_clerkcomplete(Pid) ->
     gen_fsm:send_all_state_event(Pid, clerk_complete).
 
--spec cdb_getcachedscore(pid()) -> undefined|float().
+-spec cdb_getcachedscore(pid(), erlang:timestamp()) -> undefined|float().
 %% @doc
 %% Return the cached score for a CDB file
-cdb_getcachedscore(Pid) ->
-    gen_fsm:sync_send_all_state_event(Pid, get_cachedscore, infinity).
+cdb_getcachedscore(Pid, Now) ->
+    gen_fsm:sync_send_all_state_event(Pid, {get_cachedscore, Now}, infinity).
 
 
 -spec cdb_putcachedscore(pid(), float()) -> ok.
@@ -849,10 +851,24 @@ handle_sync_event(cdb_filename, _From, StateName, State) ->
     {reply, State#state.filename, StateName, State};
 handle_sync_event(cdb_isrolling, _From, StateName, State) ->
     {reply, StateName == rolling, StateName, State};
-handle_sync_event(get_cachedscore, _From, StateName, State) ->
-    {reply, State#state.cached_score, StateName, State};
+handle_sync_event({get_cachedscore, {NowMega, NowSecs, _}},
+                                                    _From, StateName, State) ->
+    ScoreToReturn =
+        case State#state.cached_score of
+            undefined ->
+                undefined;
+            {Score, {CacheMega, CacheSecs, _}} ->
+                case (NowMega * ?MEGA + NowSecs) >
+                        (CacheMega * ?MEGA + CacheSecs + ?CACHE_LIFE) of
+                    true ->
+                        undefined;
+                    false ->
+                        Score
+                end
+        end,
+    {reply, ScoreToReturn, StateName, State};
 handle_sync_event({put_cachedscore, Score}, _From, StateName, State) ->
-    {reply, ok, StateName, State#state{cached_score = Score}};
+    {reply, ok, StateName, State#state{cached_score = {Score,os:timestamp()}}};
 handle_sync_event(cdb_close, _From, delete_pending, State) ->
     leveled_log:log("CDB05", 
                         [State#state.filename, delete_pending, cdb_close]),
@@ -2419,9 +2435,15 @@ get_keys_byposition_manykeys_test_to() ->
     SampleList3 = cdb_getpositions(P2, KeyCount + 1),
     ?assertMatch(KeyCount, length(SampleList3)),
     
-    ?assertMatch(undefined, cdb_getcachedscore(P2)),
+    ?assertMatch(undefined, cdb_getcachedscore(P2, os:timestamp())),
     ok = cdb_putcachedscore(P2, 80.0),
-    ?assertMatch(80.0, cdb_getcachedscore(P2)),
+    ?assertMatch(80.0, cdb_getcachedscore(P2, os:timestamp())),
+    timer:sleep(1000),
+    {NowMega, NowSecs, _} = Now = os:timestamp(),
+    ?assertMatch(80.0, cdb_getcachedscore(P2, Now)),
+    FutureEpoch = NowMega * ?MEGA + NowSecs + ?CACHE_LIFE,
+    Future = {FutureEpoch div ?MEGA, FutureEpoch rem ?MEGA, 0},
+    ?assertMatch(undefined, cdb_getcachedscore(P2, Future)),
 
     ok = cdb_close(P2),
     ok = file:delete(F2).
diff --git a/src/leveled_iclerk.erl b/src/leveled_iclerk.erl
index c2c34d1..c2a1f8a 100644
--- a/src/leveled_iclerk.erl
+++ b/src/leveled_iclerk.erl
@@ -330,7 +330,7 @@ handle_cast({score_filelist, [Entry|Tail]}, State) ->
     {LowSQN, FN, JournalP, _LK} = Entry,
     ScoringState = State#state.scoring_state,
     CpctPerc =
-        case {leveled_cdb:cdb_getcachedscore(JournalP),
+        case {leveled_cdb:cdb_getcachedscore(JournalP, os:timestamp()),
                 leveled_rand:uniform(State#state.score_onein) == 1} of
             {CachedScore, UseNewScore} 
                     when CachedScore == undefined; UseNewScore ->

From 00823584eca37c030950d516f16ea1f14958a8a3 Mon Sep 17 00:00:00 2001
From: Martin Sumner <martin.sumner@adaptip.co.uk>
Date: Fri, 27 Nov 2020 20:03:44 +0000
Subject: [PATCH 5/9] Improve the quality of score

Move the average towards the current score if not scoring each run.   Score from more keys to get a better score (as overheads of scoring are now better sorted by setting score_onein rather than by reducing the sample size).
---
 src/leveled_iclerk.erl | 26 +++++++++++++++++++++-----
 src/leveled_sst.erl    |  4 ++--
 2 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/src/leveled_iclerk.erl b/src/leveled_iclerk.erl
index c2a1f8a..f7cec00 100644
--- a/src/leveled_iclerk.erl
+++ b/src/leveled_iclerk.erl
@@ -97,7 +97,7 @@
 
 -define(JOURNAL_FILEX, "cdb").
 -define(PENDING_FILEX, "pnd").
--define(SAMPLE_SIZE, 100).
+-define(SAMPLE_SIZE, 192).
 -define(BATCH_SIZE, 32).
 -define(BATCHES_TO_CHECK, 8).
 -define(CRC_SIZE, 4).
@@ -331,9 +331,11 @@ handle_cast({score_filelist, [Entry|Tail]}, State) ->
     ScoringState = State#state.scoring_state,
     CpctPerc =
         case {leveled_cdb:cdb_getcachedscore(JournalP, os:timestamp()),
-                leveled_rand:uniform(State#state.score_onein) == 1} of
-            {CachedScore, UseNewScore} 
-                    when CachedScore == undefined; UseNewScore ->
+                leveled_rand:uniform(State#state.score_onein) == 1,
+                State#state.score_onein} of
+            {CachedScore, _UseNewScore, ScoreOneIn} 
+                    when CachedScore == undefined; ScoreOneIn == 1 ->
+                % If caches are not used, always use the current score
                 check_single_file(JournalP,
                                     ScoringState#scoring_state.filter_fun,
                                     ScoringState#scoring_state.filter_server,
@@ -341,7 +343,21 @@ handle_cast({score_filelist, [Entry|Tail]}, State) ->
                                     ?SAMPLE_SIZE,
                                     ?BATCH_SIZE,
                                     State#state.reload_strategy);
-            {CachedScore, false} ->
+            {CachedScore, true, _ScoreOneIn} ->
+                % If caches are used roll the score towards the current score
+                % Expectation is that this will reduce instances of individual
+                % files being compacted when a run is missed due to cached
+                % scores being used in surrounding journals
+                NewScore = 
+                    check_single_file(JournalP,
+                                    ScoringState#scoring_state.filter_fun,
+                                    ScoringState#scoring_state.filter_server,
+                                    ScoringState#scoring_state.max_sqn,
+                                    ?SAMPLE_SIZE,
+                                    ?BATCH_SIZE,
+                                    State#state.reload_strategy),
+                (NewScore + CachedScore) / 2;
+            {CachedScore, false, _ScoreOneIn} ->
                 CachedScore
         end,
     ok = leveled_cdb:cdb_putcachedscore(JournalP, CpctPerc),
diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl
index 09b95f6..2f513ca 100644
--- a/src/leveled_sst.erl
+++ b/src/leveled_sst.erl
@@ -850,7 +850,7 @@ code_change(_OldVsn, StateName, State, _Extra) ->
 %% @doc
 %% Expand a list of pointers, maybe ending up with a list of keys and values
 %% with a tail of pointers
-%% By defauls will not have a segment filter, or a low last_modified_date, but
+%% By default will not have a segment filter, or a low last_modified_date, but
 %% they can be used. Range checking a last modified date must still be made on
 %% the output - at this stage the low last_modified_date has been used to bulk
 %% skip those slots not containing any information over the low last modified
@@ -1867,7 +1867,7 @@ read_slots(Handle, SlotList, {SegList, LowLastMod, BlockIndexCache},
     % List of segments passed so only {K, V} pairs matching those segments
     % should be returned.  This required the {K, V} pair to have been added 
     % with the appropriate hash - if the pair were added with no_lookup as 
-    % the hash value this will fial unexpectedly.
+    % the hash value this will fail unexpectedly.
     BinMapFun = 
         fun(Pointer, Acc) ->
             {SP, _L, ID, SK, EK} = pointer_mapfun(Pointer),

From 80e6920d6c94aacfce233ae0716eb3680175b1b9 Mon Sep 17 00:00:00 2001
From: Martin Sumner <martin.sumner@adaptip.co.uk>
Date: Sun, 29 Nov 2020 15:43:29 +0000
Subject: [PATCH 6/9] Standardise retention decision

Use the same function to decide for both scoring and compaction - and avoid the situation where something is scored for compaction, but doesn't change (which was the case previously with tombstones that were still in the ledger).
---
 src/leveled_iclerk.erl | 145 ++++++++++++++++++++++-------------------
 src/leveled_inker.erl  |   8 +++
 2 files changed, 86 insertions(+), 67 deletions(-)

diff --git a/src/leveled_iclerk.erl b/src/leveled_iclerk.erl
index f7cec00..760914e 100644
--- a/src/leveled_iclerk.erl
+++ b/src/leveled_iclerk.erl
@@ -125,10 +125,10 @@
                     journal :: pid() | undefined,
                     compaction_perc :: float() | undefined}).
 
--record(scoring_state, {filter_fun :: fun(),
-                        filter_server :: pid(),
+-record(scoring_state, {filter_fun :: leveled_inker:filterfun(),
+                        filter_server :: leveled_inker:filterserver(),
                         max_sqn :: non_neg_integer(),
-                        close_fun :: fun(),
+                        close_fun :: leveled_inker:filterclosefun(),
                         start_time :: erlang:timestamp()}).
 
 -type iclerk_options() :: #iclerk_options{}.
@@ -166,8 +166,11 @@
 clerk_new(InkerClerkOpts) ->
     gen_server:start_link(?MODULE, [leveled_log:get_opts(), InkerClerkOpts], []).
 
--spec clerk_compact(pid(), pid(), 
-                    fun(), fun(), fun(),  
+-spec clerk_compact(pid(),
+                    pid(), 
+                    leveled_inker:filterinitfun(),
+                    leveled_inker:filterclosefun(),
+                    leveled_inker:filterfun(),  
                     list()) -> ok.
 %% @doc
 %% Trigger a compaction for this clerk if the threshold of data recovery has 
@@ -538,7 +541,10 @@ schedule_compaction(CompactionHours, RunsPerDay, CurrentTS) ->
 %%% Internal functions
 %%%============================================================================
 
--spec check_single_file(pid(), fun(), any(), non_neg_integer(),
+-spec check_single_file(pid(),
+                        leveled_inker:filterfun(),
+                        leveled_inker:filterserver(),
+                        leveled_codec:sqn(),
                         non_neg_integer(), non_neg_integer(),
                         leveled_codec:compaction_strategy()) ->
                             float().
@@ -578,44 +584,31 @@ safely_log_filescore(PositionList, FN, Score, SW) ->
     leveled_log:log_timer("IC004", [Score, AvgJump, FN], SW).
 
 -spec size_comparison_score(list(key_size() | corrupted_test_key_size()),
-                                    fun(),
-                                    any(),
-                                    non_neg_integer(),
+                                    leveled_inker:filterfun(),
+                                    leveled_inker:filterserver(),
+                                    leveled_codec:sqn(),
                                     leveled_codec:compaction_strategy()) ->
                                         float().
 size_comparison_score(KeySizeList,
                         FilterFun, FilterServer, MaxSQN,
-                        RS) ->
+                        ReloadStrategy) ->
     FoldFunForSizeCompare =
         fun(KS, {ActSize, RplSize}) ->
             case KS of
                 {{SQN, Type, PK}, Size} ->
-                    IsJournalEntry =
-                        leveled_codec:is_full_journalentry({SQN, Type, PK}),
-                    case IsJournalEntry of
-                        false ->
-                            TS = leveled_codec:get_tagstrategy(PK, RS),
-                            % If the strategy is to retain key deltas, then
-                            % scoring must reflect that.  Key deltas are
-                            % possible even if strategy does not allow as
-                            % there is support for changing strategy from
-                            % retain to recalc
-                            case TS of
-                                retain ->
-                                    {ActSize + Size - ?CRC_SIZE, RplSize};
-                                _ ->
-                                    {ActSize, RplSize + Size - ?CRC_SIZE}
-                            end;
+                    ToRetain =
+                        to_retain({SQN, Type, PK},
+                                    FilterFun,
+                                    FilterServer,
+                                    MaxSQN,
+                                    ReloadStrategy),
+                    case ToRetain of
                         true ->
-                            Check = FilterFun(FilterServer, PK, SQN),
-                            case {Check, SQN > MaxSQN} of
-                                {current, _} ->
-                                    {ActSize + Size - ?CRC_SIZE, RplSize};
-                                {_, true} ->
-                                    {ActSize + Size - ?CRC_SIZE, RplSize};
-                                _ ->
-                                    {ActSize, RplSize + Size - ?CRC_SIZE}
-                            end
+                            {ActSize + Size - ?CRC_SIZE, RplSize};
+                        convert ->
+                            {ActSize, RplSize  + Size - ?CRC_SIZE};
+                        false ->
+                            {ActSize, RplSize + Size - ?CRC_SIZE}
                     end;
                 _ ->
                     % There is a key which is not in expected format
@@ -839,53 +832,71 @@ split_positions_into_batches(Positions, Journal, Batches) ->
 %% if it contains index entries.  The hot_backup approach is also not safe with
 %% a `recovr` strategy.  The recovr strategy assumes faults in the ledger will
 %% be resolved via application-level anti-entropy
-filter_output(KVCs, FilterFun, FilterServer, MaxSQN, ReloadStrategy) ->
+filter_output(KVCs, FilterFun, FilterServer, MaxSQN, Strategy) ->
     FoldFun =
-        filter_output_fun(FilterFun, FilterServer, MaxSQN, ReloadStrategy),
+        filter_output_fun(FilterFun, FilterServer, MaxSQN, Strategy),
     lists:reverse(lists:foldl(FoldFun, [], KVCs)).
 
 
-filter_output_fun(FilterFun, FilterServer, MaxSQN, ReloadStrategy) ->
+filter_output_fun(FilterFun, FilterServer, MaxSQN, Strategy) ->
     fun(KVC0, Acc) ->
         case KVC0 of
             {_InkKey, crc_wonky, false} ->
                 % Bad entry, disregard, don't check
                 Acc;
             {JK, JV, _Check} ->
-                {SQN, LK} =
-                    leveled_codec:from_journalkey(JK),
-                CompactStrategy =
-                    leveled_codec:get_tagstrategy(LK, ReloadStrategy),
-                IsJournalEntry =
-                    leveled_codec:is_full_journalentry(JK),
-                case {CompactStrategy, IsJournalEntry} of
-                    {retain, false} ->
+                ToRetain =
+                    to_retain(JK, FilterFun, FilterServer, MaxSQN, Strategy),
+                case ToRetain of
+                    true ->
                         [KVC0|Acc];
-                    _ ->
-                        KeyCurrent = FilterFun(FilterServer, LK, SQN),
-                        IsInMemory = SQN > MaxSQN,
-                        case {KeyCurrent, IsInMemory, CompactStrategy} of
-                            {KC, InMem, _} when KC == current; InMem ->
-                                % This entry may still be required
-                                % regardless of strategy
-                                [KVC0|Acc];
-                            {_, _, retain} ->
-                                % If we have a retain strategy, it can't be
-                                % discarded - but the value part is no
-                                % longer required as this version has been
-                                % replaced
-                                {JK0, JV0} =
-                                    leveled_codec:revert_to_keydeltas(JK, JV),
-                                [{JK0, JV0, null}|Acc];
-                            {_, _, _} ->
-                                % This is out of date and not retained so
-                                % discard
-                                Acc
-                        end
+                    convert ->
+                        {JK0, JV0} =
+                            leveled_codec:revert_to_keydeltas(JK, JV),
+                        [{JK0, JV0, null}|Acc];
+                    false ->
+                        Acc
                 end
         end
     end.
 
+-spec to_retain(leveled_codec:journal_key(),
+                leveled_inker:filterfun(),
+                leveled_inker:filterserver(),
+                leveled_codec:sqn(),
+                leveled_codec:compaction_strategy()) -> boolean()|convert.
+%% @doc
+%% Decide the fate of a journal key at compaction - true (keep unchanged),
+%% convert (keep as key deltas only) or false (discard).
+to_retain(JournalKey, FilterFun, FilterServer, MaxSQN, ReloadStrategy) ->
+    {SQN, LK} = leveled_codec:from_journalkey(JournalKey),
+    CompactStrategy = leveled_codec:get_tagstrategy(LK, ReloadStrategy),
+    IsJournalEntry = leveled_codec:is_full_journalentry(JournalKey),
+    case {CompactStrategy, IsJournalEntry} of
+        {retain, false} ->
+            % Not a full journal entry, and the strategy is retain - so keep
+            true;
+        _ ->
+            KeyCurrent = FilterFun(FilterServer, LK, SQN),
+            IsInMemory = SQN > MaxSQN,
+            case {KeyCurrent, IsInMemory, CompactStrategy} of
+                {KC, InMem, _} when KC == current; InMem ->
+                    % This entry may still be required regardless of strategy
+                    true;
+                {_, _, retain} ->
+                    % If we have a retain strategy, it can't be
+                    % discarded - but the value part is no
+                    % longer required as this version has been
+                    % replaced
+                    convert;
+                {_, _, _} ->
+                    % This is out of date and not retained so
+                    % discard
+                    false
+            end
+    end.
+
+
 write_values([], _CDBopts, Journal0, ManSlice0, _PressMethod) ->
     {Journal0, ManSlice0};
 write_values(KVCList, CDBopts, Journal0, ManSlice0, PressMethod) ->
diff --git a/src/leveled_inker.erl b/src/leveled_inker.erl
index 8391007..4eb34c6 100644
--- a/src/leveled_inker.erl
+++ b/src/leveled_inker.erl
@@ -157,6 +157,14 @@
 -type inker_options() :: #inker_options{}.
 -type ink_state() :: #state{}.
 -type registered_snapshot() :: {pid(), os:timestamp(), integer()}.
+-type filterserver() :: pid()|list(tuple()).
+-type filterfun() ::
+    fun((filterserver(), leveled_codec:ledger_key(), leveled_codec:sqn()) ->
+            current|replaced|missing).
+-type filterclosefun() :: fun((filterserver()) -> ok).
+-type filterinitfun() :: fun((pid()) -> {filterserver(), leveled_codec:sqn()}).
+
+-export_type([filterserver/0, filterfun/0, filterclosefun/0, filterinitfun/0]).
 
 %%%============================================================================
 %%% API

From a210aa6846144ba459838839fce60c661f470ba4 Mon Sep 17 00:00:00 2001
From: Martin Sumner <martin.sumner@adaptip.co.uk>
Date: Wed, 2 Dec 2020 13:29:50 +0000
Subject: [PATCH 7/9] Promote cache when scanning

When scanning over a leveled store with a helper (e.g. segment filter and last modified date range), applying the filter will speed up the query when the block index cache is available to get_slots.

If it is not available, previously the leveled_sst did not then promote the cache after it had accessed the underlying blocks.

Now the code does this; and once a block index entry has been cached for every slot, it also extracts the largest last modified date, so that sst files older than the passed-in date can be immediately dismissed
---
 priv/leveled.schema            |   2 +-
 src/leveled_sst.erl            | 278 ++++++++++++++++++++++++---------
 test/end_to_end/riak_SUITE.erl |  68 ++++++--
 3 files changed, 262 insertions(+), 86 deletions(-)

diff --git a/priv/leveled.schema b/priv/leveled.schema
index 8e85925..5087548 100644
--- a/priv/leveled.schema
+++ b/priv/leveled.schema
@@ -146,7 +146,7 @@
 %% @doc Target Percentage for Single File
 %% What is the target score for a run of a single file, to qualify for 
 %% compaction.  If less than this percentage would be retained after compaction
-%% then it is a candidate (e.g. in default case if 50% of space would be
+%% then it is a candidate (e.g. in default case if 70% of space would be
 %% recovered)
 {mapping, "leveled.singlefile_compactionpercentage", "leveled.singlefile_compactionpercentage", [
   {default, 30.0},
diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl
index 2f513ca..c42005b 100644
--- a/src/leveled_sst.erl
+++ b/src/leveled_sst.erl
@@ -181,6 +181,8 @@
         :: {binary(), binary(), list(integer()), leveled_codec:ledger_key()}.
 -type sst_summary()
         :: #summary{}.
+-type blockindex_cache()
+        :: any().  % An array but OTP 16 types
 
 %% yield_blockquery is used to determine if the work necessary to process a
 %% range query beyond the fetching the slot should be managed from within
@@ -196,7 +198,7 @@
                     root_path,
                     filename,
                     yield_blockquery = false :: boolean(),
-                    blockindex_cache,
+                    blockindex_cache :: blockindex_cache()|undefined,
                     compression_method = native :: press_method(),
                     index_moddate = ?INDEX_MODDATE :: boolean(),
                     timings = no_timing :: sst_timings(),
@@ -207,7 +209,8 @@
                     deferred_startup_tuple :: tuple()|undefined,
                     level :: non_neg_integer()|undefined,
                     tomb_count = not_counted
-                            :: non_neg_integer()|not_counted}).
+                            :: non_neg_integer()|not_counted,
+                    high_modified_date :: non_neg_integer()|undefined}).
 
 -record(sst_timings, 
                 {sample_count = 0 :: integer(),
@@ -526,8 +529,14 @@ starting({sst_new,
     SW = os:timestamp(),
     leveled_log:save(OptsSST#sst_options.log_options),
     PressMethod = OptsSST#sst_options.press_method,
-    {Length, SlotIndex, BlockIndex, SlotsBin, Bloom} = 
+    {Length, SlotIndex, BlockEntries, SlotsBin, Bloom} = 
         build_all_slots(SlotList),
+    {BlockIndex, HighModDate} =
+        update_blockindex_cache(true,
+                                BlockEntries,
+                                new_blockindex_cache(Length),
+                                undefined,
+                                IdxModDate),
     SummaryBin = 
         build_table_summary(SlotIndex, Level, FirstKey, Length,
                             MaxSQN, Bloom, CountOfTombs),
@@ -550,6 +559,7 @@ starting({sst_new,
         {ok, {Summary#summary.first_key, Summary#summary.last_key}, Bloom},
         reader,
         UpdState#state{blockindex_cache = BlockIndex,
+                        high_modified_date = HighModDate,
                         starting_pid = StartingPID,
                         level = Level}};
 starting({sst_newlevelzero, RootPath, Filename,
@@ -583,8 +593,14 @@ starting(complete_l0startup, State) ->
     Time1 = timer:now_diff(os:timestamp(), SW1),
 
     SW2 = os:timestamp(),
-    {SlotCount, SlotIndex, BlockIndex, SlotsBin,Bloom} =
+    {SlotCount, SlotIndex, BlockEntries, SlotsBin,Bloom} =
         build_all_slots(SlotList),
+    {BlockIndex, HighModDate} =
+        update_blockindex_cache(true,
+                                BlockEntries,
+                                new_blockindex_cache(SlotCount),
+                                undefined,
+                                IdxModDate),
     Time2 = timer:now_diff(os:timestamp(), SW2),
     
     SW3 = os:timestamp(),
@@ -616,19 +632,19 @@ starting(complete_l0startup, State) ->
 
     case Penciller of
         undefined ->
-            {next_state, 
-                reader, 
-                UpdState#state{blockindex_cache = BlockIndex}};
+            ok;
         _ ->
             leveled_penciller:pcl_confirml0complete(Penciller,
                                                     UpdState#state.filename,
                                                     Summary#summary.first_key,
                                                     Summary#summary.last_key,
                                                     Bloom),
-            {next_state, 
-                reader, 
-                UpdState#state{blockindex_cache = BlockIndex}}
-    end;
+            ok
+    end,
+    {next_state,
+        reader,
+        UpdState#state{blockindex_cache = BlockIndex,
+                        high_modified_date = HighModDate}};
 starting({sst_returnslot, FetchedSlot, FetchFun, SlotCount}, State) ->
     Self = self(),
     FetchedSlots = 
@@ -673,13 +689,10 @@ reader({get_kv, LedgerKey, Hash}, _From, State) ->
                                             timings_countdown = CountDown}};
 reader({get_kvrange, StartKey, EndKey, ScanWidth, SegList, LowLastMod},
                                                             _From, State) ->
-    {SlotsToFetchBinList, SlotsToPoint} = fetch_range(StartKey,
-                                                        EndKey,
-                                                        ScanWidth,
-                                                        SegList,
-                                                        LowLastMod,
-                                                        State),
-    
+    {NeedBlockIdx, SlotsToFetchBinList, SlotsToPoint} =
+        fetch_range(StartKey, EndKey, ScanWidth,
+                            SegList, LowLastMod,
+                            State),
     PressMethod = State#state.compression_method,
     IdxModDate = State#state.index_moddate,
     
@@ -694,34 +707,47 @@ reader({get_kvrange, StartKey, EndKey, ScanWidth, SegList, LowLastMod},
                 reader,
                 State};
         false ->
-            {L, BIC} = 
+            {L, FoundBIC} = 
                 binaryslot_reader(SlotsToFetchBinList, 
-                                    PressMethod, IdxModDate, SegList),
-            FoldFun = 
-                fun(CacheEntry, Cache) ->
-                    case CacheEntry of
-                        {_ID, none} ->
-                            Cache;
-                        {ID, Header} ->
-                            array:set(ID - 1, binary:copy(Header), Cache)
-                    end
-                end,
-            BlockIdxC0 = lists:foldl(FoldFun, State#state.blockindex_cache, BIC),
+                                    PressMethod,
+                                    IdxModDate,
+                                    SegList),
+            {BlockIdxC0, HighModDate} =
+                update_blockindex_cache(NeedBlockIdx,
+                                        FoundBIC,
+                                        State#state.blockindex_cache,
+                                        State#state.high_modified_date,
+                                        State#state.index_moddate),
             {reply, 
                 L ++ SlotsToPoint, 
                 reader, 
-                State#state{blockindex_cache = BlockIdxC0}}
+                State#state{blockindex_cache = BlockIdxC0,
+                            high_modified_date = HighModDate}}
     end;
 reader({get_slots, SlotList, SegList, LowLastMod}, _From, State) ->
     PressMethod = State#state.compression_method,
     IdxModDate = State#state.index_moddate,
-    SlotBins = 
-        read_slots(State#state.handle, 
-                    SlotList, 
-                    {SegList, LowLastMod, State#state.blockindex_cache},
-                    State#state.compression_method,
-                    State#state.index_moddate),
-    {reply, {SlotBins, PressMethod, IdxModDate}, reader, State};
+    ReadNeeded =
+        check_modified(State#state.high_modified_date,
+                        LowLastMod,
+                        State#state.index_moddate),
+    {NeedBlockIdx, SlotBins} = 
+        case ReadNeeded of
+            true ->
+                read_slots(State#state.handle, 
+                            SlotList, 
+                            {SegList,
+                                LowLastMod,
+                                State#state.blockindex_cache},
+                            State#state.compression_method,
+                            State#state.index_moddate);
+            false ->
+                {false, []}
+        end,
+    {reply,
+        {NeedBlockIdx, SlotBins, PressMethod, IdxModDate},
+        reader,
+        State};
 reader(get_maxsequencenumber, _From, State) ->
     Summary = State#state.summary,
     {reply, Summary#summary.max_sqn, reader, State};
@@ -759,12 +785,8 @@ delete_pending({get_kv, LedgerKey, Hash}, _From, State) ->
     {reply, Result, delete_pending, UpdState, ?DELETE_TIMEOUT};
 delete_pending({get_kvrange, StartKey, EndKey, ScanWidth, SegList, LowLastMod},
                                                             _From, State) ->
-    {SlotsToFetchBinList, SlotsToPoint} = fetch_range(StartKey,
-                                                        EndKey,
-                                                        ScanWidth,
-                                                        SegList,
-                                                        LowLastMod,
-                                                        State),
+    {_NeedBlockIdx, SlotsToFetchBinList, SlotsToPoint} =
+        fetch_range(StartKey, EndKey, ScanWidth, SegList, LowLastMod, State),
     % Always yield as about to clear and de-reference
     PressMethod = State#state.compression_method,
     IdxModDate = State#state.index_moddate,
@@ -776,14 +798,14 @@ delete_pending({get_kvrange, StartKey, EndKey, ScanWidth, SegList, LowLastMod},
 delete_pending({get_slots, SlotList, SegList, LowLastMod}, _From, State) ->
     PressMethod = State#state.compression_method,
     IdxModDate = State#state.index_moddate,
-    SlotBins = 
+    {_NeedBlockIdx, SlotBins} = 
         read_slots(State#state.handle, 
                     SlotList, 
                     {SegList, LowLastMod, State#state.blockindex_cache},
                     PressMethod,
                     IdxModDate),
     {reply, 
-        {SlotBins, PressMethod, IdxModDate}, 
+        {false, SlotBins, PressMethod, IdxModDate}, 
         delete_pending, 
         State, 
         ?DELETE_TIMEOUT};
@@ -815,8 +837,17 @@ delete_pending(close, State) ->
 handle_sync_event(_Msg, _From, StateName, State) ->
     {reply, undefined, StateName, State}.
 
-handle_event(_Msg, StateName, State) ->
-    {next_state, StateName, State}.
+handle_event({update_blockindex_cache, BIC}, StateName, State) ->
+    {BlockIndexCache, HighModDate} =
+        update_blockindex_cache(true,
+                                BIC,
+                                State#state.blockindex_cache,
+                                State#state.high_modified_date,
+                                State#state.index_moddate),
+    {next_state,
+        StateName,
+        State#state{blockindex_cache = BlockIndexCache,
+                    high_modified_date = HighModDate}}.
 
 handle_info(tidyup_after_startup, delete_pending, State) ->
     % No need to GC, this file is to be shutdown.  This message may have
@@ -983,11 +1014,17 @@ sst_getslots(Pid, SlotList) ->
 %% of the object, if the object is to be covered by the query
 sst_getfilteredslots(Pid, SlotList, SegList, LowLastMod) ->
     SegL0 = tune_seglist(SegList),
-    {SlotBins, PressMethod, IdxModDate} = 
+    {NeedBlockIdx, SlotBins, PressMethod, IdxModDate} = 
         gen_fsm:sync_send_event(Pid, 
                                 {get_slots, SlotList, SegL0, LowLastMod},
                                 infinity),
-    {L, _BIC} = binaryslot_reader(SlotBins, PressMethod, IdxModDate, SegL0),
+    {L, BIC} = binaryslot_reader(SlotBins, PressMethod, IdxModDate, SegL0),
+    case NeedBlockIdx of
+        true ->
+            gen_fsm:send_all_state_event(Pid, {update_blockindex_cache, BIC});
+        false ->
+            ok
+    end,
     L.
 
 
@@ -1065,6 +1102,62 @@ tune_seglist(SegList) ->
 %%% Internal Functions
 %%%============================================================================
 
+-spec new_blockindex_cache(pos_integer()) -> blockindex_cache().
+new_blockindex_cache(Size) ->
+    array:new([{size, Size}, {default, none}]).
+
+-spec update_blockindex_cache(boolean(),
+                                list({integer(), binary()|none}),
+                                blockindex_cache(),
+                                non_neg_integer()|undefined,
+                                boolean()) ->
+                                    {blockindex_cache(),
+                                        non_neg_integer()|undefined}.
+%% @doc
+%% Add the binary headers in Entries to the cache, skipping other entries. If
+%% IdxModDate, return the highest modified date once all slots are cached.
+%% A no-op unless Needed and no high modified date has yet been found.
+update_blockindex_cache(Needed, Entries, BIC, HighModDate, IdxModDate)
+                            when Needed, HighModDate == undefined ->
+    FoldFun =
+        fun({ID, Header}, Cache) when is_binary(Header) ->
+                array:set(ID - 1, binary:copy(Header), Cache);
+            (_, Cache) ->
+                Cache
+        end,
+    BlockIdxC0 = lists:foldl(FoldFun, BIC, Entries),
+    Size = array:size(BlockIdxC0),
+    BestModDates =
+        case IdxModDate of
+            true ->
+                ModDateFold =
+                    fun(_ID, Header, Acc) when is_binary(Header) ->
+                        [element(2, extract_header(Header, IdxModDate))|Acc]
+                    end,
+                array:sparse_foldl(ModDateFold, [], BlockIdxC0);
+            false ->
+                []
+        end,
+    BestModDate =
+        case length(BestModDates) of
+            Size ->
+                lists:max(BestModDates);
+            _ ->
+                undefined
+        end,
+    {BlockIdxC0, BestModDate};
+update_blockindex_cache(_Needed, _Entries, BIC, HighModDate, _IdxModDate) ->
+    {BIC, HighModDate}.
+
+-spec check_modified(non_neg_integer()|undefined,
+                        non_neg_integer(),
+                        boolean())  -> boolean().
+check_modified(HighLastModifiedInSST, LowModDate, true)
+                when is_integer(HighLastModifiedInSST) ->
+    LowModDate =< HighLastModifiedInSST;
+check_modified(_, _, _) ->
+    true.
+
 -spec fetch(tuple(), 
             {integer(), integer()}|integer(), 
             sst_state(), sst_timings()) 
@@ -1093,14 +1186,17 @@ fetch(LedgerKey, Hash, State, Timings0) ->
             SlotBin = read_slot(State#state.handle, Slot),
             {Result, Header} = 
                 binaryslot_get(SlotBin, LedgerKey, Hash, PressMethod, IdxModDate),
-            BlockIndexCache = 
-                array:set(SlotID - 1,
-                            binary:copy(Header),
-                            State#state.blockindex_cache),
+            {BlockIndexCache, HighModDate} =
+                update_blockindex_cache(true,
+                                        [{SlotID, Header}],
+                                        State#state.blockindex_cache,
+                                        State#state.high_modified_date,
+                                        State#state.index_moddate),
             {_SW3, Timings3} = 
                 update_timings(SW2, Timings2, noncached_block, false),
             {Result, 
-                State#state{blockindex_cache = BlockIndexCache}, 
+                State#state{blockindex_cache = BlockIndexCache,
+                            high_modified_date = HighModDate}, 
                 Timings3};
         {BlockLengths, _LMD, PosBin} ->
             PosList = find_pos(PosBin, extract_hash(Hash), [], 0),
@@ -1150,7 +1246,8 @@ fetch(LedgerKey, Hash, State, Timings0) ->
 
 -spec fetch_range(tuple(), tuple(), integer(),
                     leveled_codec:segment_list(), non_neg_integer(), 
-                    sst_state()) -> {list(), list()}.
+                    sst_state()) ->
+                        {boolean(), list(), list()}.
 %% @doc
 %% Fetch the contents of the SST file for a given key range.  This will 
 %% pre-fetch some results, and append pointers for additional results.
@@ -1209,13 +1306,13 @@ fetch_range(StartKey, EndKey, ScanWidth, SegList, LowLastMod, State) ->
                 lists:split(ScanWidth, ExpandedSlots)
         end,
 
-    SlotsToFetchBinList = 
+    {NeededBlockIdx, SlotsToFetchBinList} = 
         read_slots(Handle, 
                     SlotsToFetch, 
                     {SegList, LowLastMod, State#state.blockindex_cache},
                     State#state.compression_method,
                     State#state.index_moddate),
-    {SlotsToFetchBinList, SlotsToPoint}.
+    {NeededBlockIdx, SlotsToFetchBinList, SlotsToPoint}.
 
 -spec compress_level(integer(), press_method()) -> press_method().
 %% @doc
@@ -1258,8 +1355,7 @@ read_file(Filename, State, LoadPageCache) ->
     UpdState0 = imp_fileversion(FileVersion, State),
     {Summary, Bloom, SlotList, TombCount} =
         read_table_summary(SummaryBin, UpdState0#state.tomb_count),
-    BlockIndexCache = array:new([{size, Summary#summary.size},
-                                    {default, none}]),
+    BlockIndexCache = new_blockindex_cache(Summary#summary.size),
     UpdState1 = UpdState0#state{blockindex_cache = BlockIndexCache},
     SlotIndex = from_list(SlotList),
     UpdSummary = Summary#summary{index = SlotIndex},
@@ -1389,8 +1485,7 @@ build_all_slots(SlotList) ->
                             9,
                             1,
                             [],
-                            array:new([{size, SlotCount}, 
-                                        {default, none}]),
+                            [],
                             <<>>,
                             []),
     Bloom = leveled_ebloom:create_bloom(HashLists),
@@ -1410,7 +1505,7 @@ build_all_slots([SlotD|Rest], Pos, SlotID,
                     Pos + Length,
                     SlotID + 1,
                     [{LastKey, SlotIndexV}|SlotIdxAcc],
-                    array:set(SlotID - 1, BlockIdx, BlockIdxAcc),
+                    [{SlotID, BlockIdx}|BlockIdxAcc],
                     <<SlotBinAcc/binary, SlotBin/binary>>,
                     lists:append(HashLists, HashList)).
 
@@ -1842,7 +1937,8 @@ binarysplit_mapfun(MultiSlotBin, StartPos) ->
 
 -spec read_slots(file:io_device(), list(), 
                     {false|list(), non_neg_integer(), binary()},
-                    press_method(), boolean()) -> list(binaryslot_element()).
+                    press_method(), boolean()) -> 
+                        {boolean(), list(binaryslot_element())}.
 %% @doc
 %% The reading of sots will return a list of either 2-tuples containing 
 %% {K, V} pairs - or 3-tuples containing {Binary, SK, EK}.  The 3 tuples 
@@ -1861,7 +1957,7 @@ read_slots(Handle, SlotList, {false, 0, _BlockIndexCache},
                 _PressMethod, _IdxModDate) ->
     % No list of segments passed or useful Low LastModified Date
     % Just read slots in SlotList
-    read_slotlist(SlotList, Handle);
+    {false, read_slotlist(SlotList, Handle)};
 read_slots(Handle, SlotList, {SegList, LowLastMod, BlockIndexCache}, 
                 PressMethod, IdxModDate) ->
     % List of segments passed so only {K, V} pairs matching those segments
@@ -1869,7 +1965,7 @@ read_slots(Handle, SlotList, {SegList, LowLastMod, BlockIndexCache},
     % with the appropriate hash - if the pair were added with no_lookup as 
     % the hash value this will fail unexpectedly.
     BinMapFun = 
-        fun(Pointer, Acc) ->
+        fun(Pointer, {NeededBlockIdx, Acc}) ->
             {SP, _L, ID, SK, EK} = pointer_mapfun(Pointer),
             CachedHeader = array:get(ID - 1, BlockIndexCache),
             case extract_header(CachedHeader, IdxModDate) of
@@ -1877,7 +1973,7 @@ read_slots(Handle, SlotList, {SegList, LowLastMod, BlockIndexCache},
                     % If there is an attempt to use the seg list query and the
                     % index block cache isn't cached for any part this may be 
                     % slower as each slot will be read in turn
-                    Acc ++ read_slotlist([Pointer], Handle);
+                    {true, Acc ++ read_slotlist([Pointer], Handle)};
                 {BlockLengths, LMD, BlockIdx} ->
                     % If there is a BlockIndex cached then we can use it to 
                     % check to see if any of the expected segments are 
@@ -1894,12 +1990,14 @@ read_slots(Handle, SlotList, {SegList, LowLastMod, BlockIndexCache},
                             % LowLastMod date passed in the query - therefore
                             % there are no interesting modifications in this
                             % slot - it is all too old
-                            Acc;
+                            {NeededBlockIdx, Acc};
                         false ->
                             case SegList of
                                 false ->
                                     % Need all the slot now
-                                    Acc ++ read_slotlist([Pointer], Handle);
+                                    {NeededBlockIdx,
+                                        Acc ++
+                                            read_slotlist([Pointer], Handle)};
                                 _SL ->
                                     % Need to find just the right keys
                                     PositionList = 
@@ -1920,12 +2018,13 @@ read_slots(Handle, SlotList, {SegList, LowLastMod, BlockIndexCache},
                                     % to be filtered
                                     FilterFun =
                                         fun(KV) -> in_range(KV, SK, EK) end,
-                                    Acc ++ lists:filter(FilterFun, KVL)
+                                    {NeededBlockIdx,
+                                        Acc ++ lists:filter(FilterFun, KVL)}
                             end
                     end
             end 
         end,
-    lists:foldl(BinMapFun, [], SlotList).
+    lists:foldl(BinMapFun, {false, []}, SlotList).
 
 
 -spec in_range(leveled_codec:ledger_kv(),
@@ -2015,7 +2114,7 @@ read_length_list(Handle, LengthList) ->
 
 
 -spec extract_header(binary()|none, boolean()) ->
-                                        {binary(), integer(), binary()}|none.
+                                {binary(), non_neg_integer(), binary()}|none.
 %% @doc
 %% Helper for extracting the binaries from the header ignoring the missing LMD
 %% if LMD is not indexed
@@ -3657,8 +3756,6 @@ key_dominates_test() ->
                     key_dominates([KV7|KL2], [KV2], {true, 1})).
 
 nonsense_coverage_test() ->
-    {ok, Pid} = gen_fsm:start_link(?MODULE, [], []),
-    ok = gen_fsm:send_all_state_event(Pid, nonsense),
     ?assertMatch({ok, reader, #state{}}, code_change(nonsense,
                                                         reader,
                                                         #state{},
@@ -3861,6 +3958,39 @@ corrupted_block_fetch_tester(PressMethod) ->
     ExpectedMisses = element(2, ?LOOK_BLOCKSIZE),
     ?assertMatch(ExpectedMisses, MissCount).
 
+block_index_cache_test() ->
+    {Mega, Sec, _} = os:timestamp(),
+    Now = Mega * 1000000 + Sec,
+    EntriesTS =
+        lists:map(fun(I) ->
+                        TS = Now - I + 1,
+                        {I, <<0:160/integer, TS:32/integer, 0:32/integer>>}
+                    end,
+                    lists:seq(1, 8)),
+    EntriesNoTS = 
+        lists:map(fun(I) ->
+                        {I, <<0:160/integer, 0:32/integer>>}
+                    end,
+                    lists:seq(1, 8)),
+    HeaderTS = <<0:160/integer, Now:32/integer, 0:32/integer>>,
+    HeaderNoTS = <<0:192>>,
+    BIC = array:new([{size, 8}, {default, none}]),
+    {BIC0, undefined} =
+        update_blockindex_cache(false, EntriesNoTS, BIC, undefined, false),
+    {BIC1, undefined} =
+        update_blockindex_cache(false, EntriesTS, BIC, undefined, true),
+    {BIC2, undefined} =
+        update_blockindex_cache(true, EntriesNoTS, BIC, undefined, false),
+    {BIC3, LMD3} =
+        update_blockindex_cache(true, EntriesTS, BIC, undefined, true),
+    
+    ?assertMatch(none, array:get(0, BIC0)),
+    ?assertMatch(none, array:get(0, BIC1)),
+    ?assertMatch(HeaderNoTS, array:get(0, BIC2)),
+    ?assertMatch(HeaderTS, array:get(0, BIC3)),
+    ?assertMatch(Now, LMD3).
+
+
 
 receive_fun() ->
     receive
diff --git a/test/end_to_end/riak_SUITE.erl b/test/end_to_end/riak_SUITE.erl
index 17b02d0..e8cb06e 100644
--- a/test/end_to_end/riak_SUITE.erl
+++ b/test/end_to_end/riak_SUITE.erl
@@ -258,6 +258,13 @@ fetchclocks_modifiedbetween(_Config) ->
     {ok, Bookie1A} = leveled_bookie:book_start(StartOpts1A),
     {ok, Bookie1B} = leveled_bookie:book_start(StartOpts1B),
 
+    ObjList0 = 
+        testutil:generate_objects(100000, 
+                                    {fixed_binary, 1}, [],
+                                    leveled_rand:rand_bytes(32),
+                                    fun() -> [] end,
+                                    <<"BaselineB">>),
+
     ObjL1StartTS = testutil:convert_to_seconds(os:timestamp()),
     ObjList1 = 
         testutil:generate_objects(20000, 
@@ -331,6 +338,7 @@ fetchclocks_modifiedbetween(_Config) ->
     testutil:riakload(Bookie1A, ObjList4),
     testutil:riakload(Bookie1A, ObjList6),
 
+    testutil:riakload(Bookie1B, ObjList0),
     testutil:riakload(Bookie1B, ObjList4),
     testutil:riakload(Bookie1B, ObjList5),
     testutil:riakload(Bookie1B, ObjList1),
@@ -412,7 +420,7 @@ fetchclocks_modifiedbetween(_Config) ->
             fun(_B, K, V, {LK, AccC}) ->
                 % Value is proxy_object?  Can we get the metadata and
                 % read the last modified date?  The do a non-accelerated
-                % fold to chekc that it is slower
+                % fold to check that it is slower
                 {proxy_object, MDBin, _Size, _Fetcher} = binary_to_term(V),
                 LMDTS = testutil:get_lastmodified(MDBin),
                 LMD = testutil:convert_to_seconds(LMDTS),
@@ -458,13 +466,20 @@ fetchclocks_modifiedbetween(_Config) ->
     true = NoFilterTime > PlusFilterTime,
 
     SimpleCountFun =
-        fun(_B, _K, _V, AccC) -> AccC + 1 end,
+        fun(BucketList) ->
+            fun(B, _K, _V, AccC) -> 
+                case lists:member(B, BucketList) of
+                    true -> AccC + 1;
+                    false -> AccC
+                end
+            end
+        end,
 
     {async, R4A_MultiBucketRunner} = 
         leveled_bookie:book_headfold(Bookie1A,
                                         ?RIAK_TAG,
                                         {bucket_list, [<<"B0">>, <<"B2">>]},
-                                        {SimpleCountFun, 0},
+                                        {SimpleCountFun([<<"B0">>, <<"B2">>]), 0},
                                         false,
                                         true,
                                         false,
@@ -482,7 +497,7 @@ fetchclocks_modifiedbetween(_Config) ->
                                         {bucket_list, [<<"B2">>, <<"B0">>]},
                                             % Reverse the buckets in the bucket
                                             % list
-                                        {SimpleCountFun, 0},
+                                        {SimpleCountFun([<<"B0">>, <<"B2">>]), 0},
                                         false,
                                         true,
                                         false,
@@ -495,10 +510,10 @@ fetchclocks_modifiedbetween(_Config) ->
 
     {async, R5B_MultiBucketRunner} = 
         leveled_bookie:book_headfold(Bookie1B,
-                                            % Same query - other bookie
                                         ?RIAK_TAG,
-                                        {bucket_list, [<<"B2">>, <<"B0">>]},
-                                        {SimpleCountFun, 0},
+                                        {bucket_list,
+                                            [<<"BaselineB">>, <<"B2">>, <<"B0">>]},
+                                        {SimpleCountFun([<<"B0">>, <<"B2">>]), 0},
                                         false,
                                         true,
                                         false,
@@ -506,7 +521,7 @@ fetchclocks_modifiedbetween(_Config) ->
                                         false),
     R5B_MultiBucket = R5B_MultiBucketRunner(),
     io:format("R5B_MultiBucket ~w ~n", [R5B_MultiBucket]),
-    true = R5A_MultiBucket == 37000,
+    true = R5B_MultiBucket == 37000,
 
     testutil:update_some_objects(Bookie1A, ObjList1, 1000),
     R6A_PlusFilter = lists:foldl(FoldRangesFun(Bookie1A, 
@@ -523,7 +538,7 @@ fetchclocks_modifiedbetween(_Config) ->
         leveled_bookie:book_headfold(Bookie1A,
                                         ?RIAK_TAG,
                                         {bucket_list, [<<"B1">>, <<"B2">>]},
-                                        {SimpleCountFun, 0},
+                                        {SimpleCountFun([<<"B1">>, <<"B2">>]), 0},
                                         false,
                                         true,
                                         false,
@@ -537,7 +552,7 @@ fetchclocks_modifiedbetween(_Config) ->
         leveled_bookie:book_headfold(Bookie1A,
                                         ?RIAK_TAG,
                                         {bucket_list, [<<"B1">>, <<"B2">>]},
-                                        {SimpleCountFun, 0},
+                                        {SimpleCountFun([<<"B1">>, <<"B2">>]), 0},
                                         false,
                                         true,
                                         false,
@@ -547,8 +562,39 @@ fetchclocks_modifiedbetween(_Config) ->
     io:format("R8A_MultiBucket ~w ~n", [R8A_MultiBucket]),
     true = R8A_MultiBucket == {0, 5000},
 
+    ok = leveled_bookie:book_close(Bookie1B),
+    io:format("Double query to generate index cache and use~n"),
+    {ok, Bookie1BS} = leveled_bookie:book_start(StartOpts1B),
+    {async, R5B_MultiBucketRunner0} = 
+        leveled_bookie:book_headfold(Bookie1BS,
+                                        ?RIAK_TAG,
+                                        all,
+                                        {SimpleCountFun([<<"B0">>, <<"B2">>]), 0},
+                                        false,
+                                        true,
+                                        false,
+                                        {ObjL4StartTS, ObjL6EndTS},
+                                        false),
+    R5B_MultiBucket0 = R5B_MultiBucketRunner0(),
+    io:format("R5B_MultiBucket ~w ~n", [R5B_MultiBucket0]),
+    true = R5B_MultiBucket0 == 37000,
+    {async, R5B_MultiBucketRunner1} = 
+        leveled_bookie:book_headfold(Bookie1BS,
+                                        ?RIAK_TAG,
+                                        all,
+                                        {SimpleCountFun([<<"B0">>, <<"B2">>]), 0},
+                                        false,
+                                        true,
+                                        false,
+                                        {ObjL4StartTS, ObjL6EndTS},
+                                        false),
+    R5B_MultiBucket1 = R5B_MultiBucketRunner1(),
+    io:format("R5B_MultiBucket ~w ~n", [R5B_MultiBucket1]),
+    true = R5B_MultiBucket1 == 37000,
+
+
     ok = leveled_bookie:book_destroy(Bookie1A),
-    ok = leveled_bookie:book_destroy(Bookie1B).
+    ok = leveled_bookie:book_destroy(Bookie1BS).
     
 
 

From f3f574de02a0e4f46b1836350e34b2d51a224fd7 Mon Sep 17 00:00:00 2001
From: Martin Sumner <martin.sumner@adaptip.co.uk>
Date: Thu, 3 Dec 2020 13:37:22 +0000
Subject: [PATCH 8/9] Switch to checking on get_kvrange

In production scale testing, placing the check_modified call on get_kvrange rather than get_slots made the performance difference.

It should help in get_slots as well, but it was not possible to reliably get coverage in tests with this.  So for now, it will be left off until a proper test can be constructed which demonstrates any benefits.
---
 src/leveled_sst.erl            |  36 +++++------
 test/end_to_end/riak_SUITE.erl | 113 +++++++++++++++++++++++++++++++--
 2 files changed, 125 insertions(+), 24 deletions(-)

diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl
index c42005b..6fc2c8a 100644
--- a/src/leveled_sst.erl
+++ b/src/leveled_sst.erl
@@ -689,10 +689,19 @@ reader({get_kv, LedgerKey, Hash}, _From, State) ->
                                             timings_countdown = CountDown}};
 reader({get_kvrange, StartKey, EndKey, ScanWidth, SegList, LowLastMod},
                                                             _From, State) ->
+    ReadNeeded =
+        check_modified(State#state.high_modified_date,
+                        LowLastMod,
+                        State#state.index_moddate),
     {NeedBlockIdx, SlotsToFetchBinList, SlotsToPoint} =
-        fetch_range(StartKey, EndKey, ScanWidth,
+        case ReadNeeded of
+            true -> 
+                fetch_range(StartKey, EndKey, ScanWidth,
                             SegList, LowLastMod,
-                            State),
+                            State);
+            false ->
+                {false, [], []}
+        end,
     PressMethod = State#state.compression_method,
     IdxModDate = State#state.index_moddate,
     
@@ -727,23 +736,14 @@ reader({get_kvrange, StartKey, EndKey, ScanWidth, SegList, LowLastMod},
 reader({get_slots, SlotList, SegList, LowLastMod}, _From, State) ->
     PressMethod = State#state.compression_method,
     IdxModDate = State#state.index_moddate,
-    ReadNeeded =
-        check_modified(State#state.high_modified_date,
-                        LowLastMod,
-                        State#state.index_moddate),
     {NeedBlockIdx, SlotBins} = 
-        case ReadNeeded of
-            true ->
-                read_slots(State#state.handle, 
-                            SlotList, 
-                            {SegList,
-                                LowLastMod,
-                                State#state.blockindex_cache},
-                            State#state.compression_method,
-                            State#state.index_moddate);
-            false ->
-                {false, []}
-        end,
+        read_slots(State#state.handle, 
+                        SlotList, 
+                        {SegList,
+                            LowLastMod,
+                            State#state.blockindex_cache},
+                        State#state.compression_method,
+                        State#state.index_moddate),
     {reply,
         {NeedBlockIdx, SlotBins, PressMethod, IdxModDate},
         reader,
diff --git a/test/end_to_end/riak_SUITE.erl b/test/end_to_end/riak_SUITE.erl
index e8cb06e..c4881fd 100644
--- a/test/end_to_end/riak_SUITE.erl
+++ b/test/end_to_end/riak_SUITE.erl
@@ -320,7 +320,7 @@ fetchclocks_modifiedbetween(_Config) ->
     _ObjL5EndTS = testutil:convert_to_seconds(os:timestamp()),
     timer:sleep(1000),
 
-    _ObjL6StartTS = testutil:convert_to_seconds(os:timestamp()),
+    ObjL6StartTS = testutil:convert_to_seconds(os:timestamp()),
     ObjList6 = 
         testutil:generate_objects(7000, 
                                     {fixed_binary, 1}, [],
@@ -563,8 +563,33 @@ fetchclocks_modifiedbetween(_Config) ->
     true = R8A_MultiBucket == {0, 5000},
 
     ok = leveled_bookie:book_close(Bookie1B),
+
     io:format("Double query to generate index cache and use~n"),
     {ok, Bookie1BS} = leveled_bookie:book_start(StartOpts1B),
+    
+    TooLate = testutil:convert_to_seconds(os:timestamp()),
+
+    lmdrange_tester(Bookie1BS, SimpleCountFun,
+                    ObjL4StartTS, ObjL6StartTS, ObjL6EndTS, TooLate),
+
+    io:format("Push tested keys down levels with new objects~n"),
+    ObjList7 = 
+        testutil:generate_objects(200000, 
+                                    {fixed_binary, 1}, [],
+                                    leveled_rand:rand_bytes(32),
+                                    fun() -> [] end,
+                                    <<"B1.9">>),
+    testutil:riakload(Bookie1BS, ObjList7),
+
+    lmdrange_tester(Bookie1BS, SimpleCountFun,
+                    ObjL4StartTS, ObjL6StartTS, ObjL6EndTS, TooLate),
+
+    ok = leveled_bookie:book_destroy(Bookie1A),
+    ok = leveled_bookie:book_destroy(Bookie1BS).
+    
+
+lmdrange_tester(Bookie1BS, SimpleCountFun,
+                ObjL4StartTS, ObjL6StartTS, ObjL6EndTS, TooLate) ->
     {async, R5B_MultiBucketRunner0} = 
         leveled_bookie:book_headfold(Bookie1BS,
                                         ?RIAK_TAG,
@@ -576,7 +601,7 @@ fetchclocks_modifiedbetween(_Config) ->
                                         {ObjL4StartTS, ObjL6EndTS},
                                         false),
     R5B_MultiBucket0 = R5B_MultiBucketRunner0(),
-    io:format("R5B_MultiBucket ~w ~n", [R5B_MultiBucket0]),
+    io:format("R5B_MultiBucket0 ~w ~n", [R5B_MultiBucket0]),
     true = R5B_MultiBucket0 == 37000,
     {async, R5B_MultiBucketRunner1} = 
         leveled_bookie:book_headfold(Bookie1BS,
@@ -589,13 +614,89 @@ fetchclocks_modifiedbetween(_Config) ->
                                         {ObjL4StartTS, ObjL6EndTS},
                                         false),
     R5B_MultiBucket1 = R5B_MultiBucketRunner1(),
-    io:format("R5B_MultiBucket ~w ~n", [R5B_MultiBucket1]),
+    io:format("R5B_MultiBucket1 ~w ~n", [R5B_MultiBucket1]),
     true = R5B_MultiBucket1 == 37000,
+    SimpleMinMaxFun = 
+        fun(B, K, _V, Acc) ->
+            case lists:keyfind(B, 1, Acc) of
+                {B, MinK, MaxK} ->
+                    lists:ukeysort(1, [{B, min(K, MinK), max(K, MaxK)}|Acc]);
+                false ->
+                    lists:ukeysort(1, [{B, K, K}|Acc])
+            end
+        end,
+    {async, R5B_MultiBucketRunner2} = 
+        leveled_bookie:book_headfold(Bookie1BS,
+                                        ?RIAK_TAG,
+                                        {bucket_list, [<<"B0">>, <<"B2">>]},
+                                        {SimpleMinMaxFun, []},
+                                        false,
+                                        true,
+                                        false,
+                                        {ObjL4StartTS, ObjL6EndTS},
+                                        false),
+    [{<<"B0">>, MinB0K, MaxB0K}, {<<"B2">>, MinB2K, MaxB2K}] =
+        R5B_MultiBucketRunner2(),
+    io:format("Found Min and Max Keys~n"),
+    io:format("B ~s MinK ~s MaxK ~s~n", [<<"B0">>, MinB0K, MaxB0K]),
+    io:format("B ~s MinK ~s MaxK ~s~n", [<<"B2">>, MinB2K, MaxB2K]),
+    {async, R5B_MultiBucketRunner3a} = 
+        leveled_bookie:book_headfold(Bookie1BS,
+                                        ?RIAK_TAG,
+                                        {range, <<"B0">>, {MinB0K, MaxB0K}},
+                                        {SimpleCountFun([<<"B0">>]), 0},
+                                        false,
+                                        true,
+                                        false,
+                                        {ObjL4StartTS, ObjL6EndTS},
+                                        false),
+    {async, R5B_MultiBucketRunner3b} = 
+        leveled_bookie:book_headfold(Bookie1BS,
+                                        ?RIAK_TAG,
+                                        {range, <<"B2">>, {MinB2K, MaxB2K}},
+                                        {SimpleCountFun([<<"B2">>]), 0},
+                                        false,
+                                        true,
+                                        false,
+                                        {ObjL4StartTS, ObjL6EndTS},
+                                        false),
+    R5B_MultiBucket3a = R5B_MultiBucketRunner3a(),
+    io:format("R5B_MultiBucket3a ~w ~n", [R5B_MultiBucket3a]),
+    R5B_MultiBucket3b = R5B_MultiBucketRunner3b(),
+    io:format("R5B_MultiBucket3b ~w ~n", [R5B_MultiBucket3b]),
+    true = (R5B_MultiBucket3a + R5B_MultiBucket3b) == 37000,
 
+    io:format("Query outside of time range~n"),
+    {async, R5B_MultiBucketRunner4} = 
+        leveled_bookie:book_headfold(Bookie1BS,
+                                        ?RIAK_TAG,
+                                        all,
+                                        {SimpleCountFun([<<"B0">>, <<"B2">>]), 0},
+                                        false,
+                                        true,
+                                        false,
+                                        {ObjL6EndTS,
+                                            TooLate},
+                                        false),
+    R5B_MultiBucket4 = R5B_MultiBucketRunner4(),
+    io:format("R5B_MultiBucket4 ~w ~n", [R5B_MultiBucket4]),
+    true = R5B_MultiBucket4 == 0,
 
-    ok = leveled_bookie:book_destroy(Bookie1A),
-    ok = leveled_bookie:book_destroy(Bookie1BS).
-    
+    io:format("Query with one foot inside of time range~n"),
+    {async, R5B_MultiBucketRunner5} = 
+        leveled_bookie:book_headfold(Bookie1BS,
+                                        ?RIAK_TAG,
+                                        all,
+                                        {SimpleCountFun([<<"B0">>, <<"B2">>]), 0},
+                                        false,
+                                        true,
+                                        false,
+                                        {ObjL6StartTS,
+                                            TooLate},
+                                        false),
+    R5B_MultiBucket5 = R5B_MultiBucketRunner5(),
+    io:format("R5B_MultiBucket5 ~w ~n", [R5B_MultiBucket5]),
+    true = R5B_MultiBucket5 == 7000.
 
 
 crossbucket_aae(_Config) ->

From 7bf67563ef2272bf9f641e751c941af0a6ad8cd3 Mon Sep 17 00:00:00 2001
From: Martin Sumner <martin.sumner@adaptip.co.uk>
Date: Fri, 4 Dec 2020 14:31:47 +0000
Subject: [PATCH 9/9] Update src/leveled_iclerk.erl

Co-authored-by: Thomas Arts <thomas.arts@quviq.com>
---
 src/leveled_iclerk.erl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/leveled_iclerk.erl b/src/leveled_iclerk.erl
index 760914e..d7edc73 100644
--- a/src/leveled_iclerk.erl
+++ b/src/leveled_iclerk.erl
@@ -606,7 +606,7 @@ size_comparison_score(KeySizeList,
                         true ->
                             {ActSize + Size - ?CRC_SIZE, RplSize};
                         convert ->
-                            {ActSize, RplSize  + Size - ?CRC_SIZE};
+                            {ActSize, RplSize + Size - ?CRC_SIZE};
                         false ->
                             {ActSize, RplSize + Size - ?CRC_SIZE}
                     end;