Query don't copy (#380)
* Query don't copy

  Queries the manifest to avoid copying the whole manifest when taking a
  snapshot of a penciller to run a query. Change the logging of fold setup
  in the Bookie to record the actual snapshot time (rather than the
  uninteresting and fast-returning call to the function which will request
  the snapshot). A little tidy to avoid duplicating the ?MAX_LEVELS macro.

* Clarify log is of snapshot time not fold time

* Updates after review
parent 28d3701f6e
commit d09f5c778b
5 changed files with 175 additions and 117 deletions
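The heart of the change is the new leveled_pmanifest:query_manifest/3 (see the src/leveled_pmanifest.erl hunks below): rather than deep-copying every level of the manifest into the snapshot clone via copy_manifest/1, a range query now extracts only the manifest entries whose files overlap the query's StartKey and EndKey. The following is a minimal, self-contained sketch of the idea; the module name and the Level -> [{FileStartKey, FileEndKey, FileRef}] map shape are illustrative assumptions, not the leveled internals (the real manifest is a #manifest{} record read via range_lookup/4).

%% Sketch only (hypothetical module, illustrative data shape).
-module(query_dont_copy).
-export([query_manifest/3]).

-define(MAX_LEVELS, 8).

query_manifest(Levels, StartKey, EndKey) ->
    SetupFoldFun =
        fun(Level, Acc) ->
            %% Keep only the files in this level whose key range
            %% overlaps the query range [StartKey, EndKey]
            Pointers =
                [F || {SK, EK, _Ref} = F <- maps:get(Level, Levels, []),
                        SK =< EndKey, EK >= StartKey],
            case Pointers of
                [] -> Acc;
                PL -> Acc ++ [{Level, PL}]
            end
        end,
    %% The result is a query-specific slice of the manifest, cheap to
    %% hand to a snapshot in place of a full copy of every level
    lists:foldl(SetupFoldFun, [], lists:seq(0, ?MAX_LEVELS - 1)).

The penciller keeps this slimmed-down view in the new query_manifest field of its snapshot state, so a fetch_keys fold over a covered range never needs the full manifest; the snapshot reply timing is then sampled in the Bookie under the renamed snapshot_timings record.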
src/leveled_bookie.erl

@@ -24,6 +24,9 @@
 
 -define(CACHE_TYPE, skpl).
 
+-define(MAX_LEVELS, 8).
+%% Should equal the length of the LEVEL_SCALEFACTOR
+
 
 -record(level,
         {level :: integer(),
@@ -178,12 +178,12 @@
 
                 put_countdown = 0 :: integer(),
                 get_countdown = 0 :: integer(),
-                fold_countdown = 0 :: integer(),
+                snapshot_countdown = 0 :: integer(),
                 head_countdown = 0 :: integer(),
                 cache_ratio = {0, 0, 0} :: cache_ratio(),
                 get_timings = no_timing :: get_timings(),
                 put_timings = no_timing :: put_timings(),
-                fold_timings = no_timing :: fold_timings(),
+                snapshot_timings = no_timing :: snapshot_timings(),
                 head_timings = no_timing :: head_timings()}).
 
 
@@ -201,8 +201,9 @@
                        ink_time = 0 :: integer(),
                        total_size = 0 :: integer()}).
 
--record(fold_timings, {sample_count = 0 :: integer(),
-                       setup_time = 0 :: integer()}).
+-record(snapshot_timings, {sample_count = 0 :: integer(),
+                           bookie_time = 0 :: integer(),
+                           pcl_time = 0 :: integer()}).
 
 
 -type book_state() :: #state{}.
@@ -210,9 +211,11 @@
 -type ledger_cache() :: #ledger_cache{}.
 -type get_timings() :: no_timing|#get_timings{}.
 -type put_timings() :: no_timing|#put_timings{}.
--type fold_timings() :: no_timing|#fold_timings{}.
+-type snapshot_timings() :: no_timing|#snapshot_timings{}.
 -type head_timings() :: no_timing|#head_timings{}.
--type timing_types() :: head|get|put|fold.
+-type timings() ::
+    put_timings()|get_timings()|snapshot_timings()|head_timings().
+-type timing_types() :: head|get|put|snapshot.
 -type cache_ratio() ::
         {non_neg_integer(), non_neg_integer(), non_neg_integer()}.
 
@@ -1460,26 +1463,27 @@ handle_call({snapshot, SnapType, Query, LongRunning}, _From, State) ->
     % Snapshot the store, specifying if the snapshot should be long running
     % (i.e. will the snapshot be queued or be required for an extended period
     % e.g. many minutes)
-    Reply = snapshot_store(State, SnapType, Query, LongRunning),
-    {reply, Reply, State};
+    {ok, PclSnap, InkSnap, Timings} =
+        snapshot_store(State, SnapType, Query, LongRunning),
+    {UpdTimings, CountDown} =
+        update_statetimings(snapshot, Timings, State#state.snapshot_countdown),
+    {reply,
+        {ok, PclSnap, InkSnap},
+        State#state{
+            snapshot_timings = UpdTimings,
+            snapshot_countdown = CountDown}};
 handle_call(log_settings, _From, State) ->
     {reply, leveled_log:return_settings(), State};
 handle_call({return_runner, QueryType}, _From, State) ->
-    SW = os:timestamp(),
     Runner = get_runner(State, QueryType),
-    {_SW, Timings1} =
-        update_timings(SW, {fold, setup}, State#state.fold_timings),
-    {Timings, CountDown} =
-        update_statetimings(fold, Timings1, State#state.fold_countdown),
-    {reply, Runner, State#state{fold_timings = Timings,
-                                fold_countdown = CountDown}};
+    {reply, Runner, State};
 handle_call({compact_journal, Timeout}, _From, State)
         when State#state.head_only == false ->
     case leveled_inker:ink_compactionpending(State#state.inker) of
         true ->
             {reply, {busy, undefined}, State};
         false ->
-            {ok, PclSnap, null} =
+            {ok, PclSnap, null, _Timings} =
                 snapshot_store(State, ledger, undefined, true),
             R = leveled_inker:ink_compactjournal(State#state.inker,
                                                  PclSnap,
@@ -1609,9 +1613,13 @@ loadqueue_ledgercache(Cache) ->
     Cache#ledger_cache{load_queue = [], loader = T}.
 
 -spec snapshot_store(ledger_cache(),
-                        pid(), null|pid(), store|ledger,
-                        undefined|tuple(), undefined|boolean()) ->
-                            {ok, pid(), pid()|null}.
+                        pid(),
+                        null|pid(),
+                        snapshot_timings(),
+                        store|ledger,
+                        undefined|tuple(),
+                        undefined|boolean()) ->
+                            {ok, pid(), pid()|null, snapshot_timings()}.
 %% @doc
 %% Allow all a snapshot to be created from part of the store, preferably
 %% passing in a query filter so that all of the LoopState does not need to
@@ -1626,38 +1634,49 @@ loadqueue_ledgercache(Cache) ->
 %% setup, assuming the range is a small subset of the overall key space). If
 %% lookup is required but the range isn't defined then 'undefined' should be
 %% passed as the query
-snapshot_store(LedgerCache, Penciller, Inker, SnapType, Query, LongRunning) ->
+snapshot_store(
+        LedgerCache, Penciller, Inker, Timings, SnapType, Query, LongRunning) ->
+    TS0 = os:timestamp(),
     LedgerCacheReady = readycache_forsnapshot(LedgerCache, Query),
     BookiesMem = {LedgerCacheReady#ledger_cache.loader,
                     LedgerCacheReady#ledger_cache.index,
                     LedgerCacheReady#ledger_cache.min_sqn,
                     LedgerCacheReady#ledger_cache.max_sqn},
-    PCLopts = #penciller_options{start_snapshot = true,
+    PCLopts =
+        #penciller_options{start_snapshot = true,
                             source_penciller = Penciller,
                             snapshot_query = Query,
                             snapshot_longrunning = LongRunning,
                             bookies_pid = self(),
                             bookies_mem = BookiesMem},
+    {TS1, Timings1} = update_timings(TS0, {snapshot, bookie}, Timings),
     {ok, LedgerSnapshot} = leveled_penciller:pcl_snapstart(PCLopts),
+    {_TS2, Timings2} = update_timings(TS1, {snapshot, pcl}, Timings1),
     case SnapType of
         store ->
             InkerOpts = #inker_options{start_snapshot=true,
                                         bookies_pid = self(),
                                         source_inker=Inker},
             {ok, JournalSnapshot} = leveled_inker:ink_snapstart(InkerOpts),
-            {ok, LedgerSnapshot, JournalSnapshot};
+            {ok, LedgerSnapshot, JournalSnapshot, Timings2};
         ledger ->
-            {ok, LedgerSnapshot, null}
+            {ok, LedgerSnapshot, null, Timings2}
     end.
 
+snapshot_store(LedgerCache, Penciller, Inker, SnapType, Query, LongRunning) ->
+    snapshot_store(
+        LedgerCache, Penciller, Inker, no_timing, SnapType, Query, LongRunning).
+
 snapshot_store(State, SnapType, Query, LongRunning) ->
     snapshot_store(State#state.ledger_cache,
                     State#state.penciller,
                     State#state.inker,
+                    State#state.snapshot_timings,
                     SnapType,
                     Query,
                     LongRunning).
 
 
 -spec fetch_value(pid(), leveled_codec:journal_ref()) -> not_present|any().
 %% @doc
 %% Fetch a value from the Journal
@@ -1822,7 +1841,8 @@ set_options(Opts) ->
 return_snapfun(State, SnapType, Query, LongRunning, SnapPreFold) ->
     case SnapPreFold of
         true ->
-            {ok, LS, JS} = snapshot_store(State, SnapType, Query, LongRunning),
+            {ok, LS, JS, _Timings} =
+                snapshot_store(State, SnapType, Query, LongRunning),
             fun() -> {ok, LS, JS} end;
         false ->
             Self = self(),
@@ -2457,12 +2477,8 @@ delete_path(DirPath) ->
 %%% Timing Functions
 %%%============================================================================
 
--spec update_statetimings(timing_types(),
-                    put_timings()|get_timings()|fold_timings()|head_timings(),
-                    integer())
-                        ->
-                    {put_timings()|get_timings()|fold_timings()|head_timings(),
-                    integer()}.
+-spec update_statetimings(timing_types(), timings(), integer()) ->
+    {timings(), integer()}.
 %% @doc
 %%
 %% The timings state is either in countdown to the next set of samples of
@@ -2478,8 +2494,8 @@ update_statetimings(put, no_timing, 0) ->
     {#put_timings{}, 0};
 update_statetimings(get, no_timing, 0) ->
     {#get_timings{}, 0};
-update_statetimings(fold, no_timing, 0) ->
-    {#fold_timings{}, 0};
+update_statetimings(snapshot, no_timing, 0) ->
+    {#snapshot_timings{}, 0};
 update_statetimings(head, Timings, 0) ->
     case Timings#head_timings.sample_count of
         SC when SC >= ?TIMING_SAMPLESIZE ->
@@ -2504,12 +2520,12 @@ update_statetimings(get, Timings, 0) ->
         _SC ->
             {Timings, 0}
     end;
-update_statetimings(fold, Timings, 0) ->
-    case Timings#fold_timings.sample_count of
-        SC when SC >= (?TIMING_SAMPLESIZE div 10) ->
-            log_timings(fold, Timings),
+update_statetimings(snapshot, Timings, 0) ->
+    case Timings#snapshot_timings.sample_count of
+        SC when SC >= ?TIMING_SAMPLESIZE ->
+            log_timings(snapshot, Timings),
             {no_timing,
-                leveled_rand:uniform(2 * (?TIMING_SAMPLECOUNTDOWN div 10))};
+                leveled_rand:uniform(2 * ?TIMING_SAMPLECOUNTDOWN)};
         _SC ->
             {Timings, 0}
     end;
@@ -2531,15 +2547,17 @@ log_timings(get, Timings) ->
                                 Timings#get_timings.head_time,
                                 Timings#get_timings.body_time,
                                 Timings#get_timings.fetch_count]);
-log_timings(fold, Timings) ->
-    leveled_log:log("B0017", [Timings#fold_timings.sample_count,
-                                Timings#fold_timings.setup_time]).
+log_timings(snapshot, Timings) ->
+    leveled_log:log("B0017", [Timings#snapshot_timings.sample_count,
+                                Timings#snapshot_timings.bookie_time,
+                                Timings#snapshot_timings.pcl_time]).
 
 
 update_timings(_SW, _Stage, no_timing) ->
     {no_timing, no_timing};
 update_timings(SW, {head, Stage}, Timings) ->
-    Timer = timer:now_diff(os:timestamp(), SW),
+    NextSW = os:timestamp(),
+    Timer = timer:now_diff(NextSW, SW),
     Timings0 =
         case Stage of
             pcl ->
@@ -2550,9 +2568,10 @@ update_timings(SW, {head, Stage}, Timings) ->
             CNT = Timings#head_timings.sample_count + 1,
             Timings#head_timings{buildhead_time = BHT, sample_count = CNT}
         end,
-    {os:timestamp(), Timings0};
+    {NextSW, Timings0};
 update_timings(SW, {put, Stage}, Timings) ->
-    Timer = timer:now_diff(os:timestamp(), SW),
+    NextSW = os:timestamp(),
+    Timer = timer:now_diff(NextSW, SW),
     Timings0 =
         case Stage of
             {inker, ObjectSize} ->
@@ -2564,24 +2583,32 @@ update_timings(SW, {put, Stage}, Timings) ->
             CNT = Timings#put_timings.sample_count + 1,
             Timings#put_timings{mem_time = PCT, sample_count = CNT}
         end,
-    {os:timestamp(), Timings0};
+    {NextSW, Timings0};
 update_timings(SW, {get, head}, Timings) ->
-    Timer = timer:now_diff(os:timestamp(), SW),
+    NextSW = os:timestamp(),
+    Timer = timer:now_diff(NextSW, SW),
     GHT = Timings#get_timings.head_time + Timer,
     CNT = Timings#get_timings.sample_count + 1,
     Timings0 = Timings#get_timings{head_time = GHT, sample_count = CNT},
-    {os:timestamp(), Timings0};
+    {NextSW, Timings0};
 update_timings(SW, {get, body}, Timings) ->
     Timer = timer:now_diff(os:timestamp(), SW),
     GBT = Timings#get_timings.body_time + Timer,
     FCNT = Timings#get_timings.fetch_count + 1,
     Timings0 = Timings#get_timings{body_time = GBT, fetch_count = FCNT},
     {no_timing, Timings0};
-update_timings(SW, {fold, setup}, Timings) ->
-    Timer = timer:now_diff(os:timestamp(), SW),
-    FST = Timings#fold_timings.setup_time + Timer,
-    CNT = Timings#fold_timings.sample_count + 1,
-    Timings0 = Timings#fold_timings{setup_time = FST, sample_count = CNT},
+update_timings(SW, {snapshot, bookie}, Timings) ->
+    NextSW = os:timestamp(),
+    Timer = timer:now_diff(NextSW, SW),
+    BST = Timings#snapshot_timings.bookie_time + Timer,
+    CNT = Timings#snapshot_timings.sample_count + 1,
+    Timings0 = Timings#snapshot_timings{bookie_time = BST, sample_count = CNT},
+    {NextSW, Timings0};
+update_timings(SW, {snapshot, pcl}, Timings) ->
+    NextSW = os:timestamp(),
+    Timer = timer:now_diff(NextSW, SW),
+    PST = Timings#snapshot_timings.pcl_time + Timer,
+    Timings0 = Timings#snapshot_timings{pcl_time = PST},
     {no_timing, Timings0}.
 
 
src/leveled_log.erl

@@ -70,7 +70,7 @@
     {info, "Get timing with sample_count=~w and head_time=~w body_time=~w"
             ++ " with fetch_count=~w"}},
 {"B0017",
-    {info, "Fold timing with sample_count=~w and setup_time=~w"}},
+    {info, "Snapshot timing with sample_count=~w and bookie_time=~w pcl_time=~w"}},
 {"B0018",
     {info, "Positive HEAD responses timed with sample_count=~w and "
             ++ " pcl_time=~w rsp_time=~w"}},
src/leveled_penciller.erl

@@ -206,23 +206,6 @@
 
 -include_lib("eunit/include/eunit.hrl").
 
--define(LEVEL_SCALEFACTOR,
-            [{0, 0},
-                {1, 4}, {2, 16}, {3, 64}, % Factor of 4
-                {4, 384}, {5, 2304}, % Factor of 6
-                {6, 18432}, % Factor of 8
-                {7, infinity}]).
-            % As an alternative to going up by a factor of 8 at each level,
-            % increase by a factor of 4 at young levels - to make early
-            % compaction jobs shorter.
-            %
-            % There are 32K keys per files => with 4096 files there are 100M
-            % keys supported,
-
-            % 600M keys is supported before hitting the infinite level.
-            % At o(10) trillion keys behaviour may become increasingly
-            % difficult to predict.
--define(MAX_LEVELS, 8).
 -define(MAX_WORK_WAIT, 300).
 -define(MANIFEST_FP, "ledger_manifest").
 -define(FILES_FP, "ledger_files").
@@ -230,7 +213,6 @@
 -define(PENDING_FILEX, "pnd").
 -define(SST_FILEX, ".sst").
 -define(ARCHIVE_FILEX, ".bak").
--define(MEMTABLE, mem).
 -define(SUPER_MAX_TABLE_SIZE, 40000).
 -define(PROMPT_WAIT_ONL0, 5).
 -define(WORKQUEUE_BACKLOG_TOLERANCE, 4).
@@ -243,6 +225,14 @@
 
 -record(state, {manifest ::
                     leveled_pmanifest:manifest() | undefined | redacted,
+                query_manifest ::
+                    {list(),
+                        leveled_codec:ledger_key(),
+                        leveled_codec:ledger_key()} | undefined,
+                    % Slimmed down version of the manifest containing part
+                    % related to specific query, and the StartKey/EndKey
+                    % used to extract this part
+
                 persisted_sqn = 0 :: integer(), % The highest SQN persisted
 
                 ledger_sqn = 0 :: integer(), % The highest SQN added to L0
@@ -809,21 +799,17 @@ handle_call({fetch_keys,
 
     %% Rename any reference to loop state that may be used by the function
     %% to be returned - https://github.com/martinsumner/leveled/issues/326
-    Manifest = State#state.manifest,
+    SSTiter =
+        case State#state.query_manifest of
+            undefined ->
+                leveled_pmanifest:query_manifest(
+                    State#state.manifest, StartKey, EndKey);
+            {QueryManifest, StartKeyQM, EndKeyQM}
+                    when StartKey >= StartKeyQM, EndKey =< EndKeyQM ->
+                QueryManifest
+        end,
     SnapshotTime = State#state.snapshot_time,
 
-    SetupFoldFun =
-        fun(Level, Acc) ->
-            Pointers = leveled_pmanifest:range_lookup(Manifest,
-                                                        Level,
-                                                        StartKey,
-                                                        EndKey),
-            case Pointers of
-                [] -> Acc;
-                PL -> Acc ++ [{Level, PL}]
-            end
-        end,
-    SSTiter = lists:foldl(SetupFoldFun, [], lists:seq(0, ?MAX_LEVELS - 1)),
     Folder =
         fun() ->
             keyfolder({FilteredL0, SSTiter},
@@ -867,7 +853,7 @@ handle_call({register_snapshot, Snapshot, Query, BookiesMem, LongRunning},
                 BookieIncrTree
         end,
 
-    CloneState =
+    {CloneState, ManifestClone, QueryManifest} =
        case Query of
            no_lookup ->
                {UpdMaxSQN, UpdSize, L0Cache} =
@@ -875,10 +861,12 @@ handle_call({register_snapshot, Snapshot, Query, BookiesMem, LongRunning},
                                             {LM1Cache, MinSQN, MaxSQN},
                                             State#state.ledger_sqn,
                                             State#state.levelzero_cache),
-                #state{levelzero_cache = L0Cache,
+                {#state{levelzero_cache = L0Cache,
                         ledger_sqn = UpdMaxSQN,
                         levelzero_size = UpdSize,
-                        persisted_sqn = State#state.persisted_sqn};
+                        persisted_sqn = State#state.persisted_sqn},
+                    leveled_pmanifest:copy_manifest(State#state.manifest),
+                    undefined};
             {StartKey, EndKey} ->
                 SW = os:timestamp(),
                 L0AsTree =
@@ -889,10 +877,15 @@ handle_call({register_snapshot, Snapshot, Query, BookiesMem, LongRunning},
                 leveled_log:log_randomtimer("P0037",
                                             [State#state.levelzero_size],
                                             SW,
-                                            0.1),
-                #state{levelzero_astree = L0AsTree,
+                                            0.01),
+                {#state{levelzero_astree = L0AsTree,
                         ledger_sqn = MaxSQN,
-                        persisted_sqn = State#state.persisted_sqn};
+                        persisted_sqn = State#state.persisted_sqn},
+                    undefined,
+                    {leveled_pmanifest:query_manifest(
+                            State#state.manifest, StartKey, EndKey),
+                        StartKey,
+                        EndKey}};
             undefined ->
                 {UpdMaxSQN, UpdSize, L0Cache} =
                     leveled_pmem:add_to_cache(State#state.levelzero_size,
@@ -908,18 +901,20 @@ handle_call({register_snapshot, Snapshot, Query, BookiesMem, LongRunning},
                                               State#state.levelzero_index,
                                               length(L0Cache))
                 end,
-                #state{levelzero_cache = L0Cache,
+                {#state{levelzero_cache = L0Cache,
                         levelzero_index = L0Index,
                         levelzero_size = UpdSize,
                         ledger_sqn = UpdMaxSQN,
-                        persisted_sqn = State#state.persisted_sqn}
+                        persisted_sqn = State#state.persisted_sqn},
+                    leveled_pmanifest:copy_manifest(State#state.manifest),
+                    undefined}
         end,
-    ManifestClone = leveled_pmanifest:copy_manifest(State#state.manifest),
     {reply,
         {ok,
-            CloneState#state{snapshot_fully_loaded=true,
+            CloneState#state{snapshot_fully_loaded = true,
                              snapshot_time = leveled_util:integer_now(),
-                             manifest=ManifestClone}},
+                             manifest = ManifestClone,
+                             query_manifest = QueryManifest}},
     State#state{manifest = Manifest0}};
 handle_call(close, _From, State=#state{is_snapshot=Snap}) when Snap == true ->
     ok = pcl_releasesnapshot(State#state.source_penciller, self()),
@@ -980,8 +975,7 @@ handle_call({checkbloom_fortest, Key, Hash}, _From, State) ->
     end,
     {reply, lists:foldl(FoldFun, false, lists:seq(0, ?MAX_LEVELS)), State};
 handle_call(check_for_work, _From, State) ->
-    {_WL, WC} = leveled_pmanifest:check_for_work(State#state.manifest,
-                                                 ?LEVEL_SCALEFACTOR),
+    {_WL, WC} = leveled_pmanifest:check_for_work(State#state.manifest),
     {reply, WC > 0, State};
 handle_call(persisted_sqn, _From, State) ->
     {reply, State#state.persisted_sqn, State}.
@@ -1101,8 +1095,7 @@ handle_cast(work_for_clerk, State) ->
     %
     % Perhaps the pclerk should not be restarted because of this, and
     % the failure should ripple up
-    {WL, WC} = leveled_pmanifest:check_for_work(State#state.manifest,
-                                                ?LEVEL_SCALEFACTOR),
+    {WL, WC} = leveled_pmanifest:check_for_work(State#state.manifest),
     case WC of
         0 ->
             {noreply, State#state{work_backlog=false}};
@@ -2216,7 +2209,7 @@ simple_server_test() ->
     ?assertMatch(Key3, pcl_fetch(PCLr, {o,"Bucket0003", "Key0003", null})),
     ?assertMatch(Key4, pcl_fetch(PCLr, {o,"Bucket0004", "Key0004", null})),
 
-    {ok, PclSnap, null} =
+    {ok, PclSnap, null, _} =
         leveled_bookie:snapshot_store(leveled_bookie:empty_ledgercache(),
                                       PCLr,
                                       null,
@@ -2271,7 +2264,7 @@ simple_server_test() ->
                 1)),
     ok = pcl_close(PclSnap),
 
-    {ok, PclSnap2, null} =
+    {ok, PclSnap2, null, _} =
         leveled_bookie:snapshot_store(leveled_bookie:empty_ledgercache(),
                                       PCLr,
                                       null,
@@ -2561,13 +2554,11 @@ handle_down_test() ->
 loop() ->
     receive
         {snap, PCLr, TestPid} ->
-            Res = leveled_bookie:snapshot_store(leveled_bookie:empty_ledgercache(),
-                                                PCLr,
-                                                null,
-                                                ledger,
-                                                undefined,
-                                                false),
-            TestPid ! {self(), Res},
+            {ok, Snap, null, _Timings} =
+                leveled_bookie:snapshot_store(
+                    leveled_bookie:empty_ledgercache(),
+                    PCLr, null, ledger, undefined, false),
+            TestPid ! {self(), {ok, Snap, null}},
             loop();
         stop ->
             ok
src/leveled_pmanifest.erl

@@ -26,6 +26,7 @@
         load_manifest/3,
         close_manifest/2,
         save_manifest/2,
+        query_manifest/3,
         get_manifest_sqn/1,
         key_lookup/3,
         range_lookup/4,
@@ -40,7 +41,7 @@
         merge_snapshot/2,
         ready_to_delete/2,
         clear_pending/3,
-        check_for_work/2,
+        check_for_work/1,
         is_basement/2,
         levelzero_present/1,
         check_bloom/3,
@@ -56,7 +57,27 @@
 -define(MANIFEST_FILEX, "man").
 -define(PENDING_FILEX, "pnd").
 -define(MANIFEST_FP, "ledger_manifest").
 -define(MAX_LEVELS, 8).
+-define(LEVEL_SCALEFACTOR,
+            [{0, 0},
+                {1, 4}, {2, 16}, {3, 64}, % Factor of 4
+                {4, 384}, {5, 2304}, % Factor of 6
+                {6, 18432}, % Factor of 8
+                {7, infinity}]).
+            % As an alternative to going up by a factor of 8 at each level,
+            % increase by a factor of 4 at young levels - to make early
+            % compaction jobs shorter.
+            %
+            % There are 32K keys per files => with 4096 files there are 100M
+            % keys supported,
+
+            % 600M keys is supported before hitting the infinite level.
+            % At o(10) trillion keys behaviour may become increasingly
+            % difficult to predict.
+
+-if(length(?LEVEL_SCALEFACTOR) /= ?MAX_LEVELS).
+-error("length ?LEVEL_SCALEFACTOR differs from ?MAX_LEVELS").
+-endif.
 -define(TREE_TYPE, idxt).
 -define(TREE_WIDTH, 8).
 -define(PHANTOM_PID, r2d_fail).
@@ -403,6 +424,22 @@ key_lookup(Manifest, LevelIdx, Key) ->
                         Key)
     end.
 
+-spec query_manifest(
+    manifest(),
+    leveled_codec:ledger_key(),
+    leveled_codec:ledger_key()) -> list().
+query_manifest(Manifest, StartKey, EndKey) ->
+    SetupFoldFun =
+        fun(Level, Acc) ->
+            Pointers =
+                range_lookup(Manifest, Level, StartKey, EndKey),
+            case Pointers of
+                [] -> Acc;
+                PL -> Acc ++ [{Level, PL}]
+            end
+        end,
+    lists:foldl(SetupFoldFun, [], lists:seq(0, ?MAX_LEVELS - 1)).
+
 -spec range_lookup(manifest(),
                     integer(),
                     leveled_codec:ledger_key(),
@@ -576,7 +613,7 @@ clear_pending(Manifest, [FN|RestFN], MaybeRelease) ->
                   RestFN,
                   MaybeRelease).
 
--spec check_for_work(manifest(), list()) -> {list(), integer()}.
+-spec check_for_work(manifest()) -> {list(), integer()}.
 %% @doc
 %% Check for compaction work in the manifest - look at levels which contain
 %% more files in the threshold.
@@ -588,7 +625,7 @@ clear_pending(Manifest, [FN|RestFN], MaybeRelease) ->
 %%
 %% Return a list of levels which are over-sized as well as the total items
 %% across the manifest which are beyond the size (the total work outstanding).
-check_for_work(Manifest, Thresholds) ->
+check_for_work(Manifest) ->
     CheckLevelFun =
         fun({LevelIdx, MaxCount}, {AccL, AccC}) ->
             case LevelIdx > Manifest#manifest.basement of
@@ -605,7 +642,7 @@ check_for_work(Manifest, Thresholds) ->
             end
         end
     end,
-    lists:foldr(CheckLevelFun, {[], 0}, Thresholds).
+    lists:foldr(CheckLevelFun, {[], 0}, ?LEVEL_SCALEFACTOR).
 
 -spec is_basement(manifest(), integer()) -> boolean().
 %% @doc