Query don't copy (#380)

* Query don't copy Queries the manifest to avoid copying the whole manifest when taking a snapshot of a penciller to run a query. Change the logging of fold setup in the Bookie to record the actual snapshot time (rather than the uninteresting and fast returning of the function which will request the snapshot). A little tidy to avoid duplicating the ?MAX_LEVELS macro. * Clarify log is of snapshot time not fold time * Updates after review
2022-10-11 13:45:55 +01:00 · 2022-10-11 13:45:55 +01:00 · d09f5c778b
commit d09f5c778b
parent 28d3701f6e
5 changed files with 175 additions and 117 deletions
--- a/include/leveled.hrl
+++ b/include/leveled.hrl
@ -24,6 +24,9 @@

 -define(CACHE_TYPE, skpl).

+-define(MAX_LEVELS, 8).
+%% Should equal the length of the LEVEL_SCALEFACTOR
+

 -record(level,
                        {level :: integer(),
--- a/src/leveled_bookie.erl
+++ b/src/leveled_bookie.erl
@ -178,12 +178,12 @@

                put_countdown = 0 :: integer(),
                get_countdown = 0 :: integer(),
-                fold_countdown = 0 :: integer(),
+                snapshot_countdown = 0 :: integer(),
                head_countdown = 0 :: integer(),
                cache_ratio = {0, 0, 0} :: cache_ratio(),
                get_timings = no_timing :: get_timings(),
                put_timings = no_timing :: put_timings(),
-                fold_timings = no_timing :: fold_timings(),
+                snapshot_timings = no_timing :: snapshot_timings(),
                head_timings = no_timing :: head_timings()}).


@ -201,8 +201,9 @@
                        ink_time = 0 :: integer(),
                        total_size = 0 :: integer()}).

-record(fold_timings, {sample_count = 0 :: integer(),
-                        setup_time = 0 :: integer()}).
+-record(snapshot_timings, {sample_count = 0 :: integer(),
+                            bookie_time = 0 :: integer(),
+                            pcl_time = 0 :: integer()}).


 -type book_state() :: #state{}.
@ -210,9 +211,11 @@
 -type ledger_cache() :: #ledger_cache{}.
 -type get_timings() :: no_timing|#get_timings{}.
 -type put_timings() :: no_timing|#put_timings{}.
-type fold_timings() :: no_timing|#fold_timings{}.
+-type snapshot_timings() :: no_timing|#snapshot_timings{}.
 -type head_timings() :: no_timing|#head_timings{}.
-type timing_types() :: head|get|put|fold.
+-type timings() ::
+    put_timings()|get_timings()|snapshot_timings()|head_timings().
+-type timing_types() :: head|get|put|snapshot.
 -type cache_ratio() ::
    {non_neg_integer(), non_neg_integer(), non_neg_integer()}.

@ -1460,26 +1463,27 @@ handle_call({snapshot, SnapType, Query, LongRunning}, _From, State) ->
    % Snapshot the store, specifying if the snapshot should be long running 
    % (i.e. will the snapshot be queued or be required for an extended period 
    % e.g. many minutes)
-    Reply = snapshot_store(State, SnapType, Query, LongRunning),
-    {reply, Reply, State};
+    {ok, PclSnap, InkSnap, Timings} =
+        snapshot_store(State, SnapType, Query, LongRunning),
+    {UpdTimings, CountDown} =
+        update_statetimings(snapshot, Timings, State#state.snapshot_countdown),
+    {reply,
+        {ok, PclSnap, InkSnap},
+        State#state{
+            snapshot_timings = UpdTimings,
+            snapshot_countdown = CountDown}};
 handle_call(log_settings, _From, State) ->
    {reply, leveled_log:return_settings(), State};
 handle_call({return_runner, QueryType}, _From, State) ->
-    SW = os:timestamp(),
    Runner = get_runner(State, QueryType),
-    {_SW, Timings1} = 
-        update_timings(SW, {fold, setup}, State#state.fold_timings),
-    {Timings, CountDown} = 
-        update_statetimings(fold, Timings1, State#state.fold_countdown),
-    {reply, Runner, State#state{fold_timings = Timings, 
-                                fold_countdown = CountDown}};
+    {reply, Runner, State};
 handle_call({compact_journal, Timeout}, _From, State)
                                        when State#state.head_only == false ->
    case leveled_inker:ink_compactionpending(State#state.inker) of
        true ->
            {reply, {busy, undefined}, State};
        false ->
-            {ok, PclSnap, null} =
+            {ok, PclSnap, null, _Timings} =
                snapshot_store(State, ledger, undefined, true),
            R = leveled_inker:ink_compactjournal(State#state.inker,
                                                    PclSnap,
@ -1609,9 +1613,13 @@ loadqueue_ledgercache(Cache) ->
    Cache#ledger_cache{load_queue = [], loader = T}.

 -spec snapshot_store(ledger_cache(), 
-                        pid(), null|pid(), store|ledger, 
-                        undefined|tuple(), undefined|boolean()) ->
-                                                {ok, pid(), pid()|null}.
+                        pid(),
+                        null|pid(),
+                        snapshot_timings(),
+                        store|ledger, 
+                        undefined|tuple(),
+                        undefined|boolean()) ->
+                            {ok, pid(), pid()|null, snapshot_timings()}.
 %% @doc 
 %% Allow all a snapshot to be created from part of the store, preferably
 %% passing in a query filter so that all of the LoopState does not need to
@ -1626,38 +1634,49 @@ loadqueue_ledgercache(Cache) ->
 %% setup, assuming the range is a small subset of the overall key space).  If 
 %% lookup is required but the range isn't defined then 'undefined' should be 
 %% passed as the query
-snapshot_store(LedgerCache, Penciller, Inker, SnapType, Query, LongRunning) ->
+snapshot_store(
+        LedgerCache, Penciller, Inker, Timings, SnapType, Query, LongRunning) ->
+    TS0 = os:timestamp(),
    LedgerCacheReady = readycache_forsnapshot(LedgerCache, Query),
    BookiesMem = {LedgerCacheReady#ledger_cache.loader,
                    LedgerCacheReady#ledger_cache.index,
                    LedgerCacheReady#ledger_cache.min_sqn,
                    LedgerCacheReady#ledger_cache.max_sqn},
-    PCLopts = #penciller_options{start_snapshot = true,
+    PCLopts = 
+        #penciller_options{start_snapshot = true,
                            source_penciller = Penciller,
                            snapshot_query = Query,
                            snapshot_longrunning = LongRunning,
 				            bookies_pid = self(),
                            bookies_mem = BookiesMem},
+    {TS1, Timings1} = update_timings(TS0, {snapshot, bookie}, Timings), 
    {ok, LedgerSnapshot} = leveled_penciller:pcl_snapstart(PCLopts),
+    {_TS2, Timings2} = update_timings(TS1, {snapshot, pcl}, Timings1),
    case SnapType of
        store ->
            InkerOpts = #inker_options{start_snapshot=true,
                                       bookies_pid = self(),
                                       source_inker=Inker},
            {ok, JournalSnapshot} = leveled_inker:ink_snapstart(InkerOpts),
-            {ok, LedgerSnapshot, JournalSnapshot};
+            {ok, LedgerSnapshot, JournalSnapshot, Timings2};
        ledger ->
-            {ok, LedgerSnapshot, null}
+            {ok, LedgerSnapshot, null, Timings2}
    end.

+snapshot_store(LedgerCache, Penciller, Inker, SnapType, Query, LongRunning) ->
+    snapshot_store(
+        LedgerCache, Penciller, Inker, no_timing, SnapType, Query, LongRunning).
+
 snapshot_store(State, SnapType, Query, LongRunning) ->
    snapshot_store(State#state.ledger_cache,
                    State#state.penciller,
                    State#state.inker,
+                    State#state.snapshot_timings,
                    SnapType,
                    Query,
                    LongRunning).

+
 -spec fetch_value(pid(), leveled_codec:journal_ref()) -> not_present|any().
 %% @doc
 %% Fetch a value from the Journal
@ -1822,7 +1841,8 @@ set_options(Opts) ->
 return_snapfun(State, SnapType, Query, LongRunning, SnapPreFold) ->
    case SnapPreFold of
        true ->
-            {ok, LS, JS} = snapshot_store(State, SnapType, Query, LongRunning),
+            {ok, LS, JS, _Timings} =
+                snapshot_store(State, SnapType, Query, LongRunning),
            fun() -> {ok, LS, JS} end;
        false ->
            Self = self(),
@ -2457,12 +2477,8 @@ delete_path(DirPath) ->
 %%% Timing Functions
 %%%============================================================================

-spec update_statetimings(timing_types(), 
-                    put_timings()|get_timings()|fold_timings()|head_timings(), 
-                    integer()) 
-                    -> 
-                    {put_timings()|get_timings()|fold_timings()|head_timings(), 
-                    integer()}.
+-spec update_statetimings(timing_types(), timings(), integer()) -> 
+                    {timings(), integer()}.
 %% @doc
 %%
 %% The timings state is either in countdown to the next set of samples of
@ -2478,8 +2494,8 @@ update_statetimings(put, no_timing, 0) ->
    {#put_timings{}, 0};
 update_statetimings(get, no_timing, 0) ->
    {#get_timings{}, 0};
-update_statetimings(fold, no_timing, 0) ->
-    {#fold_timings{}, 0};
+update_statetimings(snapshot, no_timing, 0) ->
+    {#snapshot_timings{}, 0};
 update_statetimings(head, Timings, 0) ->
    case Timings#head_timings.sample_count of 
        SC when SC >= ?TIMING_SAMPLESIZE ->
@ -2504,12 +2520,12 @@ update_statetimings(get, Timings, 0) ->
        _SC ->
            {Timings, 0}
    end;
-update_statetimings(fold, Timings, 0) ->
-    case Timings#fold_timings.sample_count of 
-        SC when SC >= (?TIMING_SAMPLESIZE div 10) ->
-            log_timings(fold, Timings),
+update_statetimings(snapshot, Timings, 0) ->
+    case Timings#snapshot_timings.sample_count of 
+        SC when SC >= ?TIMING_SAMPLESIZE ->
+            log_timings(snapshot, Timings),
            {no_timing, 
-                leveled_rand:uniform(2 * (?TIMING_SAMPLECOUNTDOWN div 10))};
+                leveled_rand:uniform(2 * ?TIMING_SAMPLECOUNTDOWN)};
        _SC ->
            {Timings, 0}
    end;
@ -2531,15 +2547,17 @@ log_timings(get, Timings) ->
                                Timings#get_timings.head_time,
                                Timings#get_timings.body_time,
                                Timings#get_timings.fetch_count]);
-log_timings(fold, Timings) ->    
-    leveled_log:log("B0017", [Timings#fold_timings.sample_count, 
-                                Timings#fold_timings.setup_time]).
+log_timings(snapshot, Timings) ->    
+    leveled_log:log("B0017", [Timings#snapshot_timings.sample_count, 
+                                Timings#snapshot_timings.bookie_time,
+                                Timings#snapshot_timings.pcl_time]).


 update_timings(_SW, _Stage, no_timing) ->
    {no_timing, no_timing};
 update_timings(SW, {head, Stage}, Timings) ->
-    Timer = timer:now_diff(os:timestamp(), SW),
+    NextSW = os:timestamp(), 
+    Timer = timer:now_diff(NextSW, SW),
    Timings0 = 
        case Stage of 
            pcl ->
@ -2550,9 +2568,10 @@ update_timings(SW, {head, Stage}, Timings) ->
                CNT = Timings#head_timings.sample_count + 1,
                Timings#head_timings{buildhead_time = BHT, sample_count = CNT}
        end,
-    {os:timestamp(), Timings0};
+    {NextSW, Timings0};
 update_timings(SW, {put, Stage}, Timings) ->
-    Timer = timer:now_diff(os:timestamp(), SW),
+    NextSW = os:timestamp(),
+    Timer = timer:now_diff(NextSW, SW),
    Timings0 = 
        case Stage of 
            {inker, ObjectSize} ->
@ -2564,24 +2583,32 @@ update_timings(SW, {put, Stage}, Timings) ->
                CNT = Timings#put_timings.sample_count + 1,
                Timings#put_timings{mem_time = PCT, sample_count = CNT}
        end,
-    {os:timestamp(), Timings0};
+    {NextSW, Timings0};
 update_timings(SW, {get, head}, Timings) ->
-    Timer = timer:now_diff(os:timestamp(), SW),
+    NextSW = os:timestamp(), 
+    Timer = timer:now_diff(NextSW, SW),
    GHT = Timings#get_timings.head_time + Timer,
    CNT = Timings#get_timings.sample_count + 1,
    Timings0 = Timings#get_timings{head_time = GHT, sample_count = CNT},
-    {os:timestamp(), Timings0};
+    {NextSW, Timings0};
 update_timings(SW, {get, body}, Timings) ->
    Timer = timer:now_diff(os:timestamp(), SW),
    GBT = Timings#get_timings.body_time + Timer,
    FCNT = Timings#get_timings.fetch_count + 1,
    Timings0 = Timings#get_timings{body_time = GBT, fetch_count = FCNT},
    {no_timing, Timings0};
-update_timings(SW, {fold, setup}, Timings) ->
-    Timer = timer:now_diff(os:timestamp(), SW),
-    FST = Timings#fold_timings.setup_time + Timer,
-    CNT = Timings#fold_timings.sample_count + 1,
-    Timings0 = Timings#fold_timings{setup_time = FST, sample_count = CNT},
+update_timings(SW, {snapshot, bookie}, Timings) ->
+    NextSW = os:timestamp(), 
+    Timer = timer:now_diff(NextSW, SW),
+    BST = Timings#snapshot_timings.bookie_time + Timer,
+    CNT = Timings#snapshot_timings.sample_count + 1,
+    Timings0 = Timings#snapshot_timings{bookie_time = BST, sample_count = CNT},
+    {NextSW, Timings0};
+update_timings(SW, {snapshot, pcl}, Timings) ->
+    NextSW = os:timestamp(), 
+    Timer = timer:now_diff(NextSW, SW),
+    PST = Timings#snapshot_timings.pcl_time + Timer,
+    Timings0 = Timings#snapshot_timings{pcl_time = PST},
    {no_timing, Timings0}.


--- a/src/leveled_log.erl
+++ b/src/leveled_log.erl
@ -70,7 +70,7 @@
        {info, "Get timing with sample_count=~w and head_time=~w body_time=~w"
                ++ " with fetch_count=~w"}},
    {"B0017",
-        {info, "Fold timing with sample_count=~w and setup_time=~w"}},
+        {info, "Snapshot timing with sample_count=~w and bookie_time=~w pcl_time=~w"}},
    {"B0018",
        {info, "Positive HEAD responses timed with sample_count=~w and "
                ++ " pcl_time=~w rsp_time=~w"}},
--- a/src/leveled_penciller.erl
+++ b/src/leveled_penciller.erl
@ -206,23 +206,6 @@

 -include_lib("eunit/include/eunit.hrl").

-define(LEVEL_SCALEFACTOR, 
-            [{0, 0}, 
-                {1, 4}, {2, 16}, {3, 64}, % Factor of 4
-                {4, 384}, {5, 2304}, % Factor of 6 
-                {6, 18432}, % Factor of 8 
-                {7, infinity}]).
-            % As an alternative to going up by a factor of 8 at each level, 
-            % increase by a factor of 4 at young levels - to make early  
-            % compaction jobs shorter.
-            %  
-            % There are 32K keys per files => with 4096 files there are 100M
-            % keys supported,
-            
-            % 600M keys is supported before hitting the infinite level.  
-            % At o(10) trillion keys behaviour may become increasingly 
-            % difficult to predict.
-define(MAX_LEVELS, 8).
 -define(MAX_WORK_WAIT, 300).
 -define(MANIFEST_FP, "ledger_manifest").
 -define(FILES_FP, "ledger_files").
@ -230,7 +213,6 @@
 -define(PENDING_FILEX, "pnd").
 -define(SST_FILEX, ".sst").
 -define(ARCHIVE_FILEX, ".bak").
-define(MEMTABLE, mem).
 -define(SUPER_MAX_TABLE_SIZE, 40000).
 -define(PROMPT_WAIT_ONL0, 5).
 -define(WORKQUEUE_BACKLOG_TOLERANCE, 4).
@ -243,6 +225,14 @@

 -record(state, {manifest ::
                    leveled_pmanifest:manifest() | undefined | redacted,
+                query_manifest :: 
+                    {list(),
+                        leveled_codec:ledger_key(),
+                        leveled_codec:ledger_key()} | undefined,
+                    % Slimmed down version of the manifest containing part
+                    % related to  specific query, and the StartKey/EndKey
+                    % used to extract this part
+
                persisted_sqn = 0 :: integer(), % The highest SQN persisted
                
                ledger_sqn = 0 :: integer(), % The highest SQN added to L0
@ -809,21 +799,17 @@ handle_call({fetch_keys,
    
    %% Rename any reference to loop state that may be used by the function
    %% to be returned - https://github.com/martinsumner/leveled/issues/326
-    Manifest = State#state.manifest,
+    SSTiter =
+        case State#state.query_manifest of
+            undefined ->
+                leveled_pmanifest:query_manifest(
+                    State#state.manifest, StartKey, EndKey);
+            {QueryManifest, StartKeyQM, EndKeyQM}
+                    when StartKey >= StartKeyQM, EndKey =< EndKeyQM ->
+                QueryManifest
+        end,    
    SnapshotTime = State#state.snapshot_time,
    
-    SetupFoldFun =
-        fun(Level, Acc) ->
-            Pointers = leveled_pmanifest:range_lookup(Manifest,
-                                                        Level,
-                                                        StartKey,
-                                                        EndKey),
-            case Pointers of
-                [] -> Acc;
-                PL -> Acc ++ [{Level, PL}]
-            end
-        end,
-    SSTiter = lists:foldl(SetupFoldFun, [], lists:seq(0, ?MAX_LEVELS - 1)),
    Folder = 
        fun() -> 
            keyfolder({FilteredL0, SSTiter},
@ -867,7 +853,7 @@ handle_call({register_snapshot, Snapshot, Query, BookiesMem, LongRunning},
                BookieIncrTree
        end,

-    CloneState = 
+    {CloneState, ManifestClone, QueryManifest} = 
        case Query of
            no_lookup ->
                {UpdMaxSQN, UpdSize, L0Cache} =
@ -875,10 +861,12 @@ handle_call({register_snapshot, Snapshot, Query, BookiesMem, LongRunning},
                                                {LM1Cache, MinSQN, MaxSQN},
                                                State#state.ledger_sqn,
                                                State#state.levelzero_cache),
-                #state{levelzero_cache = L0Cache,
+                {#state{levelzero_cache = L0Cache,
                        ledger_sqn = UpdMaxSQN,
                        levelzero_size = UpdSize,
-                        persisted_sqn = State#state.persisted_sqn};
+                        persisted_sqn = State#state.persisted_sqn},
+                    leveled_pmanifest:copy_manifest(State#state.manifest),
+                    undefined};
            {StartKey, EndKey} ->
                SW = os:timestamp(),
                L0AsTree =
@ -889,10 +877,15 @@ handle_call({register_snapshot, Snapshot, Query, BookiesMem, LongRunning},
                leveled_log:log_randomtimer("P0037",
                                            [State#state.levelzero_size],
                                            SW,
-                                            0.1),
-                #state{levelzero_astree = L0AsTree,
+                                            0.01),
+                {#state{levelzero_astree = L0AsTree,
                        ledger_sqn = MaxSQN,
-                        persisted_sqn = State#state.persisted_sqn};
+                        persisted_sqn = State#state.persisted_sqn},
+                    undefined,
+                    {leveled_pmanifest:query_manifest(
+                        State#state.manifest, StartKey, EndKey),
+                        StartKey,
+                        EndKey}};
            undefined ->
                {UpdMaxSQN, UpdSize, L0Cache} =
                    leveled_pmem:add_to_cache(State#state.levelzero_size,
@ -908,18 +901,20 @@ handle_call({register_snapshot, Snapshot, Query, BookiesMem, LongRunning},
                                                        State#state.levelzero_index,
                                                        length(L0Cache))
                    end,
-                #state{levelzero_cache = L0Cache,
+                {#state{levelzero_cache = L0Cache,
                        levelzero_index = L0Index,
                        levelzero_size = UpdSize,
                        ledger_sqn = UpdMaxSQN,
-                        persisted_sqn = State#state.persisted_sqn}
+                        persisted_sqn = State#state.persisted_sqn},
+                    leveled_pmanifest:copy_manifest(State#state.manifest),
+                    undefined}
        end,
-    ManifestClone = leveled_pmanifest:copy_manifest(State#state.manifest),
    {reply,
        {ok,
            CloneState#state{snapshot_fully_loaded = true,
                                snapshot_time = leveled_util:integer_now(),
-                                manifest=ManifestClone}},
+                                manifest = ManifestClone,
+                                query_manifest = QueryManifest}},
        State#state{manifest = Manifest0}};
 handle_call(close, _From, State=#state{is_snapshot=Snap}) when Snap == true ->
    ok = pcl_releasesnapshot(State#state.source_penciller, self()),
@ -980,8 +975,7 @@ handle_call({checkbloom_fortest, Key, Hash}, _From, State) ->
        end,
    {reply, lists:foldl(FoldFun, false, lists:seq(0, ?MAX_LEVELS)), State};
 handle_call(check_for_work, _From, State) ->
-    {_WL, WC} = leveled_pmanifest:check_for_work(State#state.manifest,
-                                                    ?LEVEL_SCALEFACTOR),
+    {_WL, WC} = leveled_pmanifest:check_for_work(State#state.manifest),
    {reply, WC > 0, State};
 handle_call(persisted_sqn, _From, State) ->
    {reply, State#state.persisted_sqn, State}.
@ -1101,8 +1095,7 @@ handle_cast(work_for_clerk, State) ->
            %
            % Perhaps the pclerk should not be restarted because of this, and
            % the failure should ripple up
-            {WL, WC} = leveled_pmanifest:check_for_work(State#state.manifest,
-                                                        ?LEVEL_SCALEFACTOR),
+            {WL, WC} = leveled_pmanifest:check_for_work(State#state.manifest),
            case WC of
                0 ->
                    {noreply, State#state{work_backlog=false}};
@ -2216,7 +2209,7 @@ simple_server_test() ->
    ?assertMatch(Key3, pcl_fetch(PCLr, {o,"Bucket0003", "Key0003", null})),
    ?assertMatch(Key4, pcl_fetch(PCLr, {o,"Bucket0004", "Key0004", null})),
    
-    {ok, PclSnap, null} = 
+    {ok, PclSnap, null, _} = 
        leveled_bookie:snapshot_store(leveled_bookie:empty_ledgercache(),
                                        PCLr,
                                        null,
@ -2271,7 +2264,7 @@ simple_server_test() ->
                                                1)),
    ok = pcl_close(PclSnap),
     
-    {ok, PclSnap2, null} = 
+    {ok, PclSnap2, null, _} = 
        leveled_bookie:snapshot_store(leveled_bookie:empty_ledgercache(),
                                        PCLr,
                                        null,
@ -2561,13 +2554,11 @@ handle_down_test() ->
 loop() ->
    receive
        {snap, PCLr, TestPid} ->
-            Res = leveled_bookie:snapshot_store(leveled_bookie:empty_ledgercache(),
-                                          PCLr,
-                                          null,
-                                          ledger,
-                                          undefined,
-                                          false),
-            TestPid ! {self(), Res},
+            {ok, Snap, null, _Timings} =
+                leveled_bookie:snapshot_store(
+                    leveled_bookie:empty_ledgercache(),
+                    PCLr, null, ledger, undefined, false),
+            TestPid ! {self(), {ok, Snap, null}},
            loop();
        stop ->
            ok
--- a/src/leveled_pmanifest.erl
+++ b/src/leveled_pmanifest.erl
@ -26,6 +26,7 @@
        load_manifest/3,
        close_manifest/2,
        save_manifest/2,
+        query_manifest/3,
        get_manifest_sqn/1,
        key_lookup/3,
        range_lookup/4,
@ -40,7 +41,7 @@
        merge_snapshot/2,
        ready_to_delete/2,
        clear_pending/3,
-        check_for_work/2,
+        check_for_work/1,
        is_basement/2,
        levelzero_present/1,
        check_bloom/3,
@ -56,7 +57,27 @@
 -define(MANIFEST_FILEX, "man").
 -define(PENDING_FILEX, "pnd").
 -define(MANIFEST_FP, "ledger_manifest").
-define(MAX_LEVELS, 8).
+-define(LEVEL_SCALEFACTOR, 
+            [{0, 0}, 
+                {1, 4}, {2, 16}, {3, 64}, % Factor of 4
+                {4, 384}, {5, 2304}, % Factor of 6 
+                {6, 18432}, % Factor of 8 
+                {7, infinity}]).
+            % As an alternative to going up by a factor of 8 at each level, 
+            % increase by a factor of 4 at young levels - to make early  
+            % compaction jobs shorter.
+            %  
+            % There are 32K keys per files => with 4096 files there are 100M
+            % keys supported,
+            
+            % 600M keys is supported before hitting the infinite level.  
+            % At o(10) trillion keys behaviour may become increasingly 
+            % difficult to predict.
+
+-if(length(?LEVEL_SCALEFACTOR) /= ?MAX_LEVELS).
+-error("length ?LEVEL_SCALEFACTOR differs from ?MAX_LEVELS").
+-endif.
+
 -define(TREE_TYPE, idxt).
 -define(TREE_WIDTH, 8).
 -define(PHANTOM_PID, r2d_fail).
@ -403,6 +424,22 @@ key_lookup(Manifest, LevelIdx, Key) ->
                                Key)
    end.

+-spec query_manifest(
+    manifest(),
+    leveled_codec:ledger_key(),
+    leveled_codec:ledger_key()) -> list().
+query_manifest(Manifest, StartKey, EndKey) ->
+    SetupFoldFun =
+        fun(Level, Acc) ->
+            Pointers =
+                range_lookup(Manifest, Level, StartKey, EndKey),
+            case Pointers of
+                [] -> Acc;
+                PL -> Acc ++ [{Level, PL}]
+            end
+        end,
+    lists:foldl(SetupFoldFun, [], lists:seq(0, ?MAX_LEVELS - 1)).
+
 -spec range_lookup(manifest(), 
                    integer(), 
                    leveled_codec:ledger_key(), 
@ -576,7 +613,7 @@ clear_pending(Manifest, [FN|RestFN], MaybeRelease) ->
        RestFN,
        MaybeRelease).

-spec check_for_work(manifest(), list()) -> {list(), integer()}.
+-spec check_for_work(manifest()) -> {list(), integer()}.
 %% @doc
 %% Check for compaction work in the manifest - look at levels which contain
 %% more files in the threshold.
@ -588,7 +625,7 @@ clear_pending(Manifest, [FN|RestFN], MaybeRelease) ->
 %%
 %% Return a list of levels which are over-sized as well as the total items
 %% across the manifest which are beyond the size (the total work outstanding).
-check_for_work(Manifest, Thresholds) ->
+check_for_work(Manifest) ->
    CheckLevelFun =
        fun({LevelIdx, MaxCount}, {AccL, AccC}) ->
            case LevelIdx > Manifest#manifest.basement of
@ -605,7 +642,7 @@ check_for_work(Manifest, Thresholds) ->
                    end
            end
        end,
-    lists:foldr(CheckLevelFun, {[], 0}, Thresholds).    
+    lists:foldr(CheckLevelFun, {[], 0}, ?LEVEL_SCALEFACTOR).    

 -spec is_basement(manifest(), integer()) -> boolean().
 %% @doc