Query don't copy (#380)

* Query don't copy

Queries the manifest to avoid copying the whole manifest when taking a snapshot of a penciller to run a query.

Change the logging of fold setup in the Bookie to record the actual snapshot time (rather than the uninteresting and fast-returning call to the function which will request the snapshot).

A little tidy to avoid duplicating the ?MAX_LEVELS macro.

* Clarify log is of snapshot time not fold time

* Updates after review
This commit is contained in:
Martin Sumner 2022-10-11 13:45:55 +01:00
parent 28d3701f6e
commit d09f5c778b
5 changed files with 175 additions and 117 deletions

View file

@ -24,6 +24,9 @@
-define(CACHE_TYPE, skpl). -define(CACHE_TYPE, skpl).
-define(MAX_LEVELS, 8).
%% Should equal the length of the LEVEL_SCALEFACTOR
-record(level, -record(level,
{level :: integer(), {level :: integer(),

View file

@ -178,12 +178,12 @@
put_countdown = 0 :: integer(), put_countdown = 0 :: integer(),
get_countdown = 0 :: integer(), get_countdown = 0 :: integer(),
fold_countdown = 0 :: integer(), snapshot_countdown = 0 :: integer(),
head_countdown = 0 :: integer(), head_countdown = 0 :: integer(),
cache_ratio = {0, 0, 0} :: cache_ratio(), cache_ratio = {0, 0, 0} :: cache_ratio(),
get_timings = no_timing :: get_timings(), get_timings = no_timing :: get_timings(),
put_timings = no_timing :: put_timings(), put_timings = no_timing :: put_timings(),
fold_timings = no_timing :: fold_timings(), snapshot_timings = no_timing :: snapshot_timings(),
head_timings = no_timing :: head_timings()}). head_timings = no_timing :: head_timings()}).
@ -201,8 +201,9 @@
ink_time = 0 :: integer(), ink_time = 0 :: integer(),
total_size = 0 :: integer()}). total_size = 0 :: integer()}).
-record(fold_timings, {sample_count = 0 :: integer(), -record(snapshot_timings, {sample_count = 0 :: integer(),
setup_time = 0 :: integer()}). bookie_time = 0 :: integer(),
pcl_time = 0 :: integer()}).
-type book_state() :: #state{}. -type book_state() :: #state{}.
@ -210,9 +211,11 @@
-type ledger_cache() :: #ledger_cache{}. -type ledger_cache() :: #ledger_cache{}.
-type get_timings() :: no_timing|#get_timings{}. -type get_timings() :: no_timing|#get_timings{}.
-type put_timings() :: no_timing|#put_timings{}. -type put_timings() :: no_timing|#put_timings{}.
-type fold_timings() :: no_timing|#fold_timings{}. -type snapshot_timings() :: no_timing|#snapshot_timings{}.
-type head_timings() :: no_timing|#head_timings{}. -type head_timings() :: no_timing|#head_timings{}.
-type timing_types() :: head|get|put|fold. -type timings() ::
put_timings()|get_timings()|snapshot_timings()|head_timings().
-type timing_types() :: head|get|put|snapshot.
-type cache_ratio() :: -type cache_ratio() ::
{non_neg_integer(), non_neg_integer(), non_neg_integer()}. {non_neg_integer(), non_neg_integer(), non_neg_integer()}.
@ -1460,26 +1463,27 @@ handle_call({snapshot, SnapType, Query, LongRunning}, _From, State) ->
% Snapshot the store, specifying if the snapshot should be long running % Snapshot the store, specifying if the snapshot should be long running
% (i.e. will the snapshot be queued or be required for an extended period % (i.e. will the snapshot be queued or be required for an extended period
% e.g. many minutes) % e.g. many minutes)
Reply = snapshot_store(State, SnapType, Query, LongRunning), {ok, PclSnap, InkSnap, Timings} =
{reply, Reply, State}; snapshot_store(State, SnapType, Query, LongRunning),
{UpdTimings, CountDown} =
update_statetimings(snapshot, Timings, State#state.snapshot_countdown),
{reply,
{ok, PclSnap, InkSnap},
State#state{
snapshot_timings = UpdTimings,
snapshot_countdown = CountDown}};
handle_call(log_settings, _From, State) -> handle_call(log_settings, _From, State) ->
{reply, leveled_log:return_settings(), State}; {reply, leveled_log:return_settings(), State};
handle_call({return_runner, QueryType}, _From, State) -> handle_call({return_runner, QueryType}, _From, State) ->
SW = os:timestamp(),
Runner = get_runner(State, QueryType), Runner = get_runner(State, QueryType),
{_SW, Timings1} = {reply, Runner, State};
update_timings(SW, {fold, setup}, State#state.fold_timings),
{Timings, CountDown} =
update_statetimings(fold, Timings1, State#state.fold_countdown),
{reply, Runner, State#state{fold_timings = Timings,
fold_countdown = CountDown}};
handle_call({compact_journal, Timeout}, _From, State) handle_call({compact_journal, Timeout}, _From, State)
when State#state.head_only == false -> when State#state.head_only == false ->
case leveled_inker:ink_compactionpending(State#state.inker) of case leveled_inker:ink_compactionpending(State#state.inker) of
true -> true ->
{reply, {busy, undefined}, State}; {reply, {busy, undefined}, State};
false -> false ->
{ok, PclSnap, null} = {ok, PclSnap, null, _Timings} =
snapshot_store(State, ledger, undefined, true), snapshot_store(State, ledger, undefined, true),
R = leveled_inker:ink_compactjournal(State#state.inker, R = leveled_inker:ink_compactjournal(State#state.inker,
PclSnap, PclSnap,
@ -1609,9 +1613,13 @@ loadqueue_ledgercache(Cache) ->
Cache#ledger_cache{load_queue = [], loader = T}. Cache#ledger_cache{load_queue = [], loader = T}.
-spec snapshot_store(ledger_cache(), -spec snapshot_store(ledger_cache(),
pid(), null|pid(), store|ledger, pid(),
undefined|tuple(), undefined|boolean()) -> null|pid(),
{ok, pid(), pid()|null}. snapshot_timings(),
store|ledger,
undefined|tuple(),
undefined|boolean()) ->
{ok, pid(), pid()|null, snapshot_timings()}.
%% @doc %% @doc
%% Allow all a snapshot to be created from part of the store, preferably %% Allow all a snapshot to be created from part of the store, preferably
%% passing in a query filter so that all of the LoopState does not need to %% passing in a query filter so that all of the LoopState does not need to
@ -1626,38 +1634,49 @@ loadqueue_ledgercache(Cache) ->
%% setup, assuming the range is a small subset of the overall key space). If %% setup, assuming the range is a small subset of the overall key space). If
%% lookup is required but the range isn't defined then 'undefined' should be %% lookup is required but the range isn't defined then 'undefined' should be
%% passed as the query %% passed as the query
snapshot_store(LedgerCache, Penciller, Inker, SnapType, Query, LongRunning) -> snapshot_store(
LedgerCache, Penciller, Inker, Timings, SnapType, Query, LongRunning) ->
TS0 = os:timestamp(),
LedgerCacheReady = readycache_forsnapshot(LedgerCache, Query), LedgerCacheReady = readycache_forsnapshot(LedgerCache, Query),
BookiesMem = {LedgerCacheReady#ledger_cache.loader, BookiesMem = {LedgerCacheReady#ledger_cache.loader,
LedgerCacheReady#ledger_cache.index, LedgerCacheReady#ledger_cache.index,
LedgerCacheReady#ledger_cache.min_sqn, LedgerCacheReady#ledger_cache.min_sqn,
LedgerCacheReady#ledger_cache.max_sqn}, LedgerCacheReady#ledger_cache.max_sqn},
PCLopts = #penciller_options{start_snapshot = true, PCLopts =
source_penciller = Penciller, #penciller_options{start_snapshot = true,
snapshot_query = Query, source_penciller = Penciller,
snapshot_longrunning = LongRunning, snapshot_query = Query,
bookies_pid = self(), snapshot_longrunning = LongRunning,
bookies_mem = BookiesMem}, bookies_pid = self(),
bookies_mem = BookiesMem},
{TS1, Timings1} = update_timings(TS0, {snapshot, bookie}, Timings),
{ok, LedgerSnapshot} = leveled_penciller:pcl_snapstart(PCLopts), {ok, LedgerSnapshot} = leveled_penciller:pcl_snapstart(PCLopts),
{_TS2, Timings2} = update_timings(TS1, {snapshot, pcl}, Timings1),
case SnapType of case SnapType of
store -> store ->
InkerOpts = #inker_options{start_snapshot=true, InkerOpts = #inker_options{start_snapshot=true,
bookies_pid = self(), bookies_pid = self(),
source_inker=Inker}, source_inker=Inker},
{ok, JournalSnapshot} = leveled_inker:ink_snapstart(InkerOpts), {ok, JournalSnapshot} = leveled_inker:ink_snapstart(InkerOpts),
{ok, LedgerSnapshot, JournalSnapshot}; {ok, LedgerSnapshot, JournalSnapshot, Timings2};
ledger -> ledger ->
{ok, LedgerSnapshot, null} {ok, LedgerSnapshot, null, Timings2}
end. end.
snapshot_store(LedgerCache, Penciller, Inker, SnapType, Query, LongRunning) ->
snapshot_store(
LedgerCache, Penciller, Inker, no_timing, SnapType, Query, LongRunning).
snapshot_store(State, SnapType, Query, LongRunning) -> snapshot_store(State, SnapType, Query, LongRunning) ->
snapshot_store(State#state.ledger_cache, snapshot_store(State#state.ledger_cache,
State#state.penciller, State#state.penciller,
State#state.inker, State#state.inker,
State#state.snapshot_timings,
SnapType, SnapType,
Query, Query,
LongRunning). LongRunning).
-spec fetch_value(pid(), leveled_codec:journal_ref()) -> not_present|any(). -spec fetch_value(pid(), leveled_codec:journal_ref()) -> not_present|any().
%% @doc %% @doc
%% Fetch a value from the Journal %% Fetch a value from the Journal
@ -1822,7 +1841,8 @@ set_options(Opts) ->
return_snapfun(State, SnapType, Query, LongRunning, SnapPreFold) -> return_snapfun(State, SnapType, Query, LongRunning, SnapPreFold) ->
case SnapPreFold of case SnapPreFold of
true -> true ->
{ok, LS, JS} = snapshot_store(State, SnapType, Query, LongRunning), {ok, LS, JS, _Timings} =
snapshot_store(State, SnapType, Query, LongRunning),
fun() -> {ok, LS, JS} end; fun() -> {ok, LS, JS} end;
false -> false ->
Self = self(), Self = self(),
@ -2457,12 +2477,8 @@ delete_path(DirPath) ->
%%% Timing Functions %%% Timing Functions
%%%============================================================================ %%%============================================================================
-spec update_statetimings(timing_types(), -spec update_statetimings(timing_types(), timings(), integer()) ->
put_timings()|get_timings()|fold_timings()|head_timings(), {timings(), integer()}.
integer())
->
{put_timings()|get_timings()|fold_timings()|head_timings(),
integer()}.
%% @doc %% @doc
%% %%
%% The timings state is either in countdown to the next set of samples of %% The timings state is either in countdown to the next set of samples of
@ -2478,8 +2494,8 @@ update_statetimings(put, no_timing, 0) ->
{#put_timings{}, 0}; {#put_timings{}, 0};
update_statetimings(get, no_timing, 0) -> update_statetimings(get, no_timing, 0) ->
{#get_timings{}, 0}; {#get_timings{}, 0};
update_statetimings(fold, no_timing, 0) -> update_statetimings(snapshot, no_timing, 0) ->
{#fold_timings{}, 0}; {#snapshot_timings{}, 0};
update_statetimings(head, Timings, 0) -> update_statetimings(head, Timings, 0) ->
case Timings#head_timings.sample_count of case Timings#head_timings.sample_count of
SC when SC >= ?TIMING_SAMPLESIZE -> SC when SC >= ?TIMING_SAMPLESIZE ->
@ -2504,12 +2520,12 @@ update_statetimings(get, Timings, 0) ->
_SC -> _SC ->
{Timings, 0} {Timings, 0}
end; end;
update_statetimings(fold, Timings, 0) -> update_statetimings(snapshot, Timings, 0) ->
case Timings#fold_timings.sample_count of case Timings#snapshot_timings.sample_count of
SC when SC >= (?TIMING_SAMPLESIZE div 10) -> SC when SC >= ?TIMING_SAMPLESIZE ->
log_timings(fold, Timings), log_timings(snapshot, Timings),
{no_timing, {no_timing,
leveled_rand:uniform(2 * (?TIMING_SAMPLECOUNTDOWN div 10))}; leveled_rand:uniform(2 * ?TIMING_SAMPLECOUNTDOWN)};
_SC -> _SC ->
{Timings, 0} {Timings, 0}
end; end;
@ -2531,15 +2547,17 @@ log_timings(get, Timings) ->
Timings#get_timings.head_time, Timings#get_timings.head_time,
Timings#get_timings.body_time, Timings#get_timings.body_time,
Timings#get_timings.fetch_count]); Timings#get_timings.fetch_count]);
log_timings(fold, Timings) -> log_timings(snapshot, Timings) ->
leveled_log:log("B0017", [Timings#fold_timings.sample_count, leveled_log:log("B0017", [Timings#snapshot_timings.sample_count,
Timings#fold_timings.setup_time]). Timings#snapshot_timings.bookie_time,
Timings#snapshot_timings.pcl_time]).
update_timings(_SW, _Stage, no_timing) -> update_timings(_SW, _Stage, no_timing) ->
{no_timing, no_timing}; {no_timing, no_timing};
update_timings(SW, {head, Stage}, Timings) -> update_timings(SW, {head, Stage}, Timings) ->
Timer = timer:now_diff(os:timestamp(), SW), NextSW = os:timestamp(),
Timer = timer:now_diff(NextSW, SW),
Timings0 = Timings0 =
case Stage of case Stage of
pcl -> pcl ->
@ -2550,9 +2568,10 @@ update_timings(SW, {head, Stage}, Timings) ->
CNT = Timings#head_timings.sample_count + 1, CNT = Timings#head_timings.sample_count + 1,
Timings#head_timings{buildhead_time = BHT, sample_count = CNT} Timings#head_timings{buildhead_time = BHT, sample_count = CNT}
end, end,
{os:timestamp(), Timings0}; {NextSW, Timings0};
update_timings(SW, {put, Stage}, Timings) -> update_timings(SW, {put, Stage}, Timings) ->
Timer = timer:now_diff(os:timestamp(), SW), NextSW = os:timestamp(),
Timer = timer:now_diff(NextSW, SW),
Timings0 = Timings0 =
case Stage of case Stage of
{inker, ObjectSize} -> {inker, ObjectSize} ->
@ -2564,24 +2583,32 @@ update_timings(SW, {put, Stage}, Timings) ->
CNT = Timings#put_timings.sample_count + 1, CNT = Timings#put_timings.sample_count + 1,
Timings#put_timings{mem_time = PCT, sample_count = CNT} Timings#put_timings{mem_time = PCT, sample_count = CNT}
end, end,
{os:timestamp(), Timings0}; {NextSW, Timings0};
update_timings(SW, {get, head}, Timings) -> update_timings(SW, {get, head}, Timings) ->
Timer = timer:now_diff(os:timestamp(), SW), NextSW = os:timestamp(),
Timer = timer:now_diff(NextSW, SW),
GHT = Timings#get_timings.head_time + Timer, GHT = Timings#get_timings.head_time + Timer,
CNT = Timings#get_timings.sample_count + 1, CNT = Timings#get_timings.sample_count + 1,
Timings0 = Timings#get_timings{head_time = GHT, sample_count = CNT}, Timings0 = Timings#get_timings{head_time = GHT, sample_count = CNT},
{os:timestamp(), Timings0}; {NextSW, Timings0};
update_timings(SW, {get, body}, Timings) -> update_timings(SW, {get, body}, Timings) ->
Timer = timer:now_diff(os:timestamp(), SW), Timer = timer:now_diff(os:timestamp(), SW),
GBT = Timings#get_timings.body_time + Timer, GBT = Timings#get_timings.body_time + Timer,
FCNT = Timings#get_timings.fetch_count + 1, FCNT = Timings#get_timings.fetch_count + 1,
Timings0 = Timings#get_timings{body_time = GBT, fetch_count = FCNT}, Timings0 = Timings#get_timings{body_time = GBT, fetch_count = FCNT},
{no_timing, Timings0}; {no_timing, Timings0};
update_timings(SW, {fold, setup}, Timings) -> update_timings(SW, {snapshot, bookie}, Timings) ->
Timer = timer:now_diff(os:timestamp(), SW), NextSW = os:timestamp(),
FST = Timings#fold_timings.setup_time + Timer, Timer = timer:now_diff(NextSW, SW),
CNT = Timings#fold_timings.sample_count + 1, BST = Timings#snapshot_timings.bookie_time + Timer,
Timings0 = Timings#fold_timings{setup_time = FST, sample_count = CNT}, CNT = Timings#snapshot_timings.sample_count + 1,
Timings0 = Timings#snapshot_timings{bookie_time = BST, sample_count = CNT},
{NextSW, Timings0};
update_timings(SW, {snapshot, pcl}, Timings) ->
NextSW = os:timestamp(),
Timer = timer:now_diff(NextSW, SW),
PST = Timings#snapshot_timings.pcl_time + Timer,
Timings0 = Timings#snapshot_timings{pcl_time = PST},
{no_timing, Timings0}. {no_timing, Timings0}.

View file

@ -70,7 +70,7 @@
{info, "Get timing with sample_count=~w and head_time=~w body_time=~w" {info, "Get timing with sample_count=~w and head_time=~w body_time=~w"
++ " with fetch_count=~w"}}, ++ " with fetch_count=~w"}},
{"B0017", {"B0017",
{info, "Fold timing with sample_count=~w and setup_time=~w"}}, {info, "Snapshot timing with sample_count=~w and bookie_time=~w pcl_time=~w"}},
{"B0018", {"B0018",
{info, "Positive HEAD responses timed with sample_count=~w and " {info, "Positive HEAD responses timed with sample_count=~w and "
++ " pcl_time=~w rsp_time=~w"}}, ++ " pcl_time=~w rsp_time=~w"}},

View file

@ -206,23 +206,6 @@
-include_lib("eunit/include/eunit.hrl"). -include_lib("eunit/include/eunit.hrl").
-define(LEVEL_SCALEFACTOR,
[{0, 0},
{1, 4}, {2, 16}, {3, 64}, % Factor of 4
{4, 384}, {5, 2304}, % Factor of 6
{6, 18432}, % Factor of 8
{7, infinity}]).
% As an alternative to going up by a factor of 8 at each level,
% increase by a factor of 4 at young levels - to make early
% compaction jobs shorter.
%
% There are 32K keys per files => with 4096 files there are 100M
% keys supported,
% 600M keys is supported before hitting the infinite level.
% At o(10) trillion keys behaviour may become increasingly
% difficult to predict.
-define(MAX_LEVELS, 8).
-define(MAX_WORK_WAIT, 300). -define(MAX_WORK_WAIT, 300).
-define(MANIFEST_FP, "ledger_manifest"). -define(MANIFEST_FP, "ledger_manifest").
-define(FILES_FP, "ledger_files"). -define(FILES_FP, "ledger_files").
@ -230,7 +213,6 @@
-define(PENDING_FILEX, "pnd"). -define(PENDING_FILEX, "pnd").
-define(SST_FILEX, ".sst"). -define(SST_FILEX, ".sst").
-define(ARCHIVE_FILEX, ".bak"). -define(ARCHIVE_FILEX, ".bak").
-define(MEMTABLE, mem).
-define(SUPER_MAX_TABLE_SIZE, 40000). -define(SUPER_MAX_TABLE_SIZE, 40000).
-define(PROMPT_WAIT_ONL0, 5). -define(PROMPT_WAIT_ONL0, 5).
-define(WORKQUEUE_BACKLOG_TOLERANCE, 4). -define(WORKQUEUE_BACKLOG_TOLERANCE, 4).
@ -243,6 +225,14 @@
-record(state, {manifest :: -record(state, {manifest ::
leveled_pmanifest:manifest() | undefined | redacted, leveled_pmanifest:manifest() | undefined | redacted,
query_manifest ::
{list(),
leveled_codec:ledger_key(),
leveled_codec:ledger_key()} | undefined,
% Slimmed down version of the manifest containing part
% related to specific query, and the StartKey/EndKey
% used to extract this part
persisted_sqn = 0 :: integer(), % The highest SQN persisted persisted_sqn = 0 :: integer(), % The highest SQN persisted
ledger_sqn = 0 :: integer(), % The highest SQN added to L0 ledger_sqn = 0 :: integer(), % The highest SQN added to L0
@ -809,21 +799,17 @@ handle_call({fetch_keys,
%% Rename any reference to loop state that may be used by the function %% Rename any reference to loop state that may be used by the function
%% to be returned - https://github.com/martinsumner/leveled/issues/326 %% to be returned - https://github.com/martinsumner/leveled/issues/326
Manifest = State#state.manifest, SSTiter =
case State#state.query_manifest of
undefined ->
leveled_pmanifest:query_manifest(
State#state.manifest, StartKey, EndKey);
{QueryManifest, StartKeyQM, EndKeyQM}
when StartKey >= StartKeyQM, EndKey =< EndKeyQM ->
QueryManifest
end,
SnapshotTime = State#state.snapshot_time, SnapshotTime = State#state.snapshot_time,
SetupFoldFun =
fun(Level, Acc) ->
Pointers = leveled_pmanifest:range_lookup(Manifest,
Level,
StartKey,
EndKey),
case Pointers of
[] -> Acc;
PL -> Acc ++ [{Level, PL}]
end
end,
SSTiter = lists:foldl(SetupFoldFun, [], lists:seq(0, ?MAX_LEVELS - 1)),
Folder = Folder =
fun() -> fun() ->
keyfolder({FilteredL0, SSTiter}, keyfolder({FilteredL0, SSTiter},
@ -867,7 +853,7 @@ handle_call({register_snapshot, Snapshot, Query, BookiesMem, LongRunning},
BookieIncrTree BookieIncrTree
end, end,
CloneState = {CloneState, ManifestClone, QueryManifest} =
case Query of case Query of
no_lookup -> no_lookup ->
{UpdMaxSQN, UpdSize, L0Cache} = {UpdMaxSQN, UpdSize, L0Cache} =
@ -875,10 +861,12 @@ handle_call({register_snapshot, Snapshot, Query, BookiesMem, LongRunning},
{LM1Cache, MinSQN, MaxSQN}, {LM1Cache, MinSQN, MaxSQN},
State#state.ledger_sqn, State#state.ledger_sqn,
State#state.levelzero_cache), State#state.levelzero_cache),
#state{levelzero_cache = L0Cache, {#state{levelzero_cache = L0Cache,
ledger_sqn = UpdMaxSQN, ledger_sqn = UpdMaxSQN,
levelzero_size = UpdSize, levelzero_size = UpdSize,
persisted_sqn = State#state.persisted_sqn}; persisted_sqn = State#state.persisted_sqn},
leveled_pmanifest:copy_manifest(State#state.manifest),
undefined};
{StartKey, EndKey} -> {StartKey, EndKey} ->
SW = os:timestamp(), SW = os:timestamp(),
L0AsTree = L0AsTree =
@ -889,10 +877,15 @@ handle_call({register_snapshot, Snapshot, Query, BookiesMem, LongRunning},
leveled_log:log_randomtimer("P0037", leveled_log:log_randomtimer("P0037",
[State#state.levelzero_size], [State#state.levelzero_size],
SW, SW,
0.1), 0.01),
#state{levelzero_astree = L0AsTree, {#state{levelzero_astree = L0AsTree,
ledger_sqn = MaxSQN, ledger_sqn = MaxSQN,
persisted_sqn = State#state.persisted_sqn}; persisted_sqn = State#state.persisted_sqn},
undefined,
{leveled_pmanifest:query_manifest(
State#state.manifest, StartKey, EndKey),
StartKey,
EndKey}};
undefined -> undefined ->
{UpdMaxSQN, UpdSize, L0Cache} = {UpdMaxSQN, UpdSize, L0Cache} =
leveled_pmem:add_to_cache(State#state.levelzero_size, leveled_pmem:add_to_cache(State#state.levelzero_size,
@ -908,18 +901,20 @@ handle_call({register_snapshot, Snapshot, Query, BookiesMem, LongRunning},
State#state.levelzero_index, State#state.levelzero_index,
length(L0Cache)) length(L0Cache))
end, end,
#state{levelzero_cache = L0Cache, {#state{levelzero_cache = L0Cache,
levelzero_index = L0Index, levelzero_index = L0Index,
levelzero_size = UpdSize, levelzero_size = UpdSize,
ledger_sqn = UpdMaxSQN, ledger_sqn = UpdMaxSQN,
persisted_sqn = State#state.persisted_sqn} persisted_sqn = State#state.persisted_sqn},
leveled_pmanifest:copy_manifest(State#state.manifest),
undefined}
end, end,
ManifestClone = leveled_pmanifest:copy_manifest(State#state.manifest),
{reply, {reply,
{ok, {ok,
CloneState#state{snapshot_fully_loaded=true, CloneState#state{snapshot_fully_loaded = true,
snapshot_time = leveled_util:integer_now(), snapshot_time = leveled_util:integer_now(),
manifest=ManifestClone}}, manifest = ManifestClone,
query_manifest = QueryManifest}},
State#state{manifest = Manifest0}}; State#state{manifest = Manifest0}};
handle_call(close, _From, State=#state{is_snapshot=Snap}) when Snap == true -> handle_call(close, _From, State=#state{is_snapshot=Snap}) when Snap == true ->
ok = pcl_releasesnapshot(State#state.source_penciller, self()), ok = pcl_releasesnapshot(State#state.source_penciller, self()),
@ -980,8 +975,7 @@ handle_call({checkbloom_fortest, Key, Hash}, _From, State) ->
end, end,
{reply, lists:foldl(FoldFun, false, lists:seq(0, ?MAX_LEVELS)), State}; {reply, lists:foldl(FoldFun, false, lists:seq(0, ?MAX_LEVELS)), State};
handle_call(check_for_work, _From, State) -> handle_call(check_for_work, _From, State) ->
{_WL, WC} = leveled_pmanifest:check_for_work(State#state.manifest, {_WL, WC} = leveled_pmanifest:check_for_work(State#state.manifest),
?LEVEL_SCALEFACTOR),
{reply, WC > 0, State}; {reply, WC > 0, State};
handle_call(persisted_sqn, _From, State) -> handle_call(persisted_sqn, _From, State) ->
{reply, State#state.persisted_sqn, State}. {reply, State#state.persisted_sqn, State}.
@ -1101,8 +1095,7 @@ handle_cast(work_for_clerk, State) ->
% %
% Perhaps the pclerk should not be restarted because of this, and % Perhaps the pclerk should not be restarted because of this, and
% the failure should ripple up % the failure should ripple up
{WL, WC} = leveled_pmanifest:check_for_work(State#state.manifest, {WL, WC} = leveled_pmanifest:check_for_work(State#state.manifest),
?LEVEL_SCALEFACTOR),
case WC of case WC of
0 -> 0 ->
{noreply, State#state{work_backlog=false}}; {noreply, State#state{work_backlog=false}};
@ -2216,7 +2209,7 @@ simple_server_test() ->
?assertMatch(Key3, pcl_fetch(PCLr, {o,"Bucket0003", "Key0003", null})), ?assertMatch(Key3, pcl_fetch(PCLr, {o,"Bucket0003", "Key0003", null})),
?assertMatch(Key4, pcl_fetch(PCLr, {o,"Bucket0004", "Key0004", null})), ?assertMatch(Key4, pcl_fetch(PCLr, {o,"Bucket0004", "Key0004", null})),
{ok, PclSnap, null} = {ok, PclSnap, null, _} =
leveled_bookie:snapshot_store(leveled_bookie:empty_ledgercache(), leveled_bookie:snapshot_store(leveled_bookie:empty_ledgercache(),
PCLr, PCLr,
null, null,
@ -2271,7 +2264,7 @@ simple_server_test() ->
1)), 1)),
ok = pcl_close(PclSnap), ok = pcl_close(PclSnap),
{ok, PclSnap2, null} = {ok, PclSnap2, null, _} =
leveled_bookie:snapshot_store(leveled_bookie:empty_ledgercache(), leveled_bookie:snapshot_store(leveled_bookie:empty_ledgercache(),
PCLr, PCLr,
null, null,
@ -2561,13 +2554,11 @@ handle_down_test() ->
loop() -> loop() ->
receive receive
{snap, PCLr, TestPid} -> {snap, PCLr, TestPid} ->
Res = leveled_bookie:snapshot_store(leveled_bookie:empty_ledgercache(), {ok, Snap, null, _Timings} =
PCLr, leveled_bookie:snapshot_store(
null, leveled_bookie:empty_ledgercache(),
ledger, PCLr, null, ledger, undefined, false),
undefined, TestPid ! {self(), {ok, Snap, null}},
false),
TestPid ! {self(), Res},
loop(); loop();
stop -> stop ->
ok ok

View file

@ -26,6 +26,7 @@
load_manifest/3, load_manifest/3,
close_manifest/2, close_manifest/2,
save_manifest/2, save_manifest/2,
query_manifest/3,
get_manifest_sqn/1, get_manifest_sqn/1,
key_lookup/3, key_lookup/3,
range_lookup/4, range_lookup/4,
@ -40,7 +41,7 @@
merge_snapshot/2, merge_snapshot/2,
ready_to_delete/2, ready_to_delete/2,
clear_pending/3, clear_pending/3,
check_for_work/2, check_for_work/1,
is_basement/2, is_basement/2,
levelzero_present/1, levelzero_present/1,
check_bloom/3, check_bloom/3,
@ -56,7 +57,27 @@
-define(MANIFEST_FILEX, "man"). -define(MANIFEST_FILEX, "man").
-define(PENDING_FILEX, "pnd"). -define(PENDING_FILEX, "pnd").
-define(MANIFEST_FP, "ledger_manifest"). -define(MANIFEST_FP, "ledger_manifest").
-define(MAX_LEVELS, 8). -define(LEVEL_SCALEFACTOR,
[{0, 0},
{1, 4}, {2, 16}, {3, 64}, % Factor of 4
{4, 384}, {5, 2304}, % Factor of 6
{6, 18432}, % Factor of 8
{7, infinity}]).
% As an alternative to going up by a factor of 8 at each level,
% increase by a factor of 4 at young levels - to make early
% compaction jobs shorter.
%
% There are 32K keys per files => with 4096 files there are 100M
% keys supported,
% 600M keys is supported before hitting the infinite level.
% At o(10) trillion keys behaviour may become increasingly
% difficult to predict.
-if(length(?LEVEL_SCALEFACTOR) /= ?MAX_LEVELS).
-error("length ?LEVEL_SCALEFACTOR differs from ?MAX_LEVELS").
-endif.
-define(TREE_TYPE, idxt). -define(TREE_TYPE, idxt).
-define(TREE_WIDTH, 8). -define(TREE_WIDTH, 8).
-define(PHANTOM_PID, r2d_fail). -define(PHANTOM_PID, r2d_fail).
@ -403,6 +424,22 @@ key_lookup(Manifest, LevelIdx, Key) ->
Key) Key)
end. end.
-spec query_manifest(
manifest(),
leveled_codec:ledger_key(),
leveled_codec:ledger_key()) -> list().
query_manifest(Manifest, StartKey, EndKey) ->
SetupFoldFun =
fun(Level, Acc) ->
Pointers =
range_lookup(Manifest, Level, StartKey, EndKey),
case Pointers of
[] -> Acc;
PL -> Acc ++ [{Level, PL}]
end
end,
lists:foldl(SetupFoldFun, [], lists:seq(0, ?MAX_LEVELS - 1)).
-spec range_lookup(manifest(), -spec range_lookup(manifest(),
integer(), integer(),
leveled_codec:ledger_key(), leveled_codec:ledger_key(),
@ -576,7 +613,7 @@ clear_pending(Manifest, [FN|RestFN], MaybeRelease) ->
RestFN, RestFN,
MaybeRelease). MaybeRelease).
-spec check_for_work(manifest(), list()) -> {list(), integer()}. -spec check_for_work(manifest()) -> {list(), integer()}.
%% @doc %% @doc
%% Check for compaction work in the manifest - look at levels which contain %% Check for compaction work in the manifest - look at levels which contain
%% more files in the threshold. %% more files in the threshold.
@ -588,7 +625,7 @@ clear_pending(Manifest, [FN|RestFN], MaybeRelease) ->
%% %%
%% Return a list of levels which are over-sized as well as the total items %% Return a list of levels which are over-sized as well as the total items
%% across the manifest which are beyond the size (the total work outstanding). %% across the manifest which are beyond the size (the total work outstanding).
check_for_work(Manifest, Thresholds) -> check_for_work(Manifest) ->
CheckLevelFun = CheckLevelFun =
fun({LevelIdx, MaxCount}, {AccL, AccC}) -> fun({LevelIdx, MaxCount}, {AccL, AccC}) ->
case LevelIdx > Manifest#manifest.basement of case LevelIdx > Manifest#manifest.basement of
@ -605,7 +642,7 @@ check_for_work(Manifest, Thresholds) ->
end end
end end
end, end,
lists:foldr(CheckLevelFun, {[], 0}, Thresholds). lists:foldr(CheckLevelFun, {[], 0}, ?LEVEL_SCALEFACTOR).
-spec is_basement(manifest(), integer()) -> boolean(). -spec is_basement(manifest(), integer()) -> boolean().
%% @doc %% @doc