Merge pull request #133 from martinsumner/mas-i61-dialyzertidy
Mas i61 dialyzertidy
commit 931129d25f
9 changed files with 368 additions and 85 deletions
|
@ -107,7 +107,7 @@ A new database can be started by running
|
|||
{ok, Bookie} = leveled_bookie:book_start(RootPath, LedgerCacheSize, JournalSize, SyncStrategy)
|
||||
```
|
||||
|
||||
This will start a new Bookie. It will start and look for existing data files, under the RootPath, and start empty if none exist. A LedgerCacheSize of `2000`, a JournalSize of `500000000` (500MB) and a SyncStrategy of `none` should work OK.
|
||||
This will start a new Bookie. It will start and look for existing data files, under the RootPath, and start empty if none exist. A LedgerCacheSize of `2000`, a JournalSize of `500000000` (500MB) and a SyncStrategy of `none` should work OK. Further information on startup options can be found [here](docs/STARTUP_OPTIONS.md).
|
||||
|
||||
The book_start method should respond once startup is complete. The [leveled_bookie module](src/leveled_bookie.erl) includes the full API for external use of the store.
|
||||
|
||||
|
|
72
docs/STARTUP_OPTIONS.md
Normal file
72
docs/STARTUP_OPTIONS.md
Normal file
|
@ -0,0 +1,72 @@
|
|||
# Starting Leveled
|
||||
|
||||
There are a number of options that can be passed in when starting Leveled; this is an explainer of these options and what they do. The options are passed as a list of `{option_name, Option}` tuples on startup.
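As a minimal sketch (assuming the proplist form of `leveled_bookie:book_start/1`; the path and values shown are illustrative assumptions, not recommendations), a store might be started like this:

```
%% Minimal sketch: start a Bookie with an options list.
%% Option names are those described in this document; the root path and
%% values are illustrative assumptions only.
StartOpts = [{root_path, "/tmp/leveled_data"},
             {cache_size, 2500},
             {max_journalsize, 1000000000},
             {sync_strategy, none}],
{ok, Bookie} = leveled_bookie:book_start(StartOpts).
```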
|
||||
|
||||
## Head Only
|
||||
|
||||
Starting with `{head_only, true}` (defaults to `false`) will start Leveled in a special mode. In this mode Leveled works a lot more like LevelDB, in that the Journal is just a buffer of recent writes to be used to recover objects on startup. The actual object value is now stored in the LSM tree itself rather than in the Journal.
|
||||
|
||||
Objects need to be put into the Leveled store using `book_mput/2` or `book_mput/3` when running in head_only mode.
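A hedged sketch of a head_only batch insert (the `add` action, bucket, key and value names here are assumptions for illustration; the object-spec shape follows the `book_mput` documentation in the `leveled_bookie` changes below):

```
%% Sketch: insert a small batch of object specs in head_only mode.
%% Bookie is the pid returned from leveled_bookie:book_start/1.
%% ObjectSpecs take the form {ObjectOp, Bucket, Key, SubKey, Value}, and the
%% Value is stored in the HEAD of the object in the Ledger.
ObjectSpecs = [{add, <<"bucket1">>, <<"key1">>, null, <<"value1">>},
               {add, <<"bucket1">>, <<"key2">>, null, <<"value2">>}],
case leveled_bookie:book_mput(Bookie, ObjectSpecs) of
    ok -> ok;
    pause -> timer:sleep(100)   % back off briefly when the store signals pause
end.
```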
|
||||
|
||||
This mode was specifically added to support Leveled's use as a dedicated aae_store in the `kv_index_tictactree` library. It may be extensible for other uses where objects are small.
|
||||
|
||||
There is currently no support for running leveled so that it handles both `head_only` objects stored entirely in the Ledger and other objects stored as normal, split between the Journal and the Ledger. Setting `head_only` fundamentally changes the way the store works.
|
||||
|
||||
## Max Journal Size
|
||||
|
||||
The maximum size of an individual Journal file can be set using `{max_journalsize, integer()}`, which sets the size in bytes. The default value is 1,000,000,000 (~1GB), and the maximum size cannot exceed `2^32` bytes.
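The configured maximum is not used exactly: as shown in the `set_options/1` code later in this diff, a per-instance jitter (derived from the `?JOURNAL_SIZE_JITTER` value of 20) is subtracted so that multiple stores do not all roll their journal files at the same size. A worked sketch:

```
%% Worked sketch of the journal size jitter, mirroring set_options/1 below
%% (the result is also capped at ?ABSOLUTEMAX_JOURNALSIZE).
MaxJournalSize0 = 1000000000,                          % configured size in bytes
JournalSizeJitter = MaxJournalSize0 div (100 div 20),  % 20% jitter band
MaxJournalSize = MaxJournalSize0
                    - erlang:phash2(self()) rem JournalSizeJitter.
%% Each instance therefore gets a slightly different effective maximum.
```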
|
||||
|
||||
If objects are small, lookups within a Journal may be faster when each individual journal file is also smaller.
|
||||
|
||||
## Ledger Cache Size
|
||||
|
||||
The option `{cache_size, integer()}` is the number of ledger objects that should be cached by the Bookie actor before being pushed to the Ledger. Note these are ledger objects (so they do not normally contain the actual object value, but do include index changes as separate objects). The default value is 2500.
|
||||
|
||||
## Penciller Cache Size
|
||||
|
||||
The option `{max_pencillercachesize, integer()}` sets the approximate number of objects that should be kept in the penciller's memory before it flushes that memory to disk. Note that when this limit is reached, the persist may be delayed by some random jitter to prevent coordination between multiple stores in the same cluster.
|
||||
|
||||
The default number of objects is 28,000. A smaller number may be required if there is a particular shortage of memory. Note that these are just Ledger objects (so the actual values are not stored in memory as part of this cache).
|
||||
|
||||
## File Write Sync Strategy
|
||||
|
||||
The sync strategy can be set as `{sync_strategy, sync|riak_sync|none}`. This controls whether each write must be flushed to disk before the write is acknowledged. If `none` is set, flushing to disk is left in the hands of the operating system. `riak_sync` is a deprecated option (it relates to the lack of a sync flag in OTP 16, and prompts the flush after the write, rather than as part of the write operation).
|
||||
|
||||
The default is `sync`. Note that without solid-state drives and/or flash-backed write caches, this option will have a significant impact on performance.
|
||||
|
||||
## Waste Retention Period
|
||||
|
||||
The waste retention period can be used to keep old journal files, which have already been compacted, for that period. This might be useful if there is a desire to back up a machine so that it can be restored to a particular point in time (by clearing the ledger and reverting the inker manifest).
|
||||
|
||||
The retention period can be set using `{waste_retention_period, integer()}`, where the value is the period in seconds. If left as `undefined`, all files will be garbage collected on compaction, and no waste will be retained.
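For example (a minimal sketch; the 24-hour value is an assumption, not a recommendation):

```
%% Sketch: keep compacted journal files for 24 hours (86,400 seconds)
%% as part of the startup options list.
{waste_retention_period, 86400}
```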
|
||||
|
||||
## Reload Strategy
|
||||
|
||||
The purpose of the reload strategy is to define the behaviour at Journal compaction when a replaced record is found, in order to manage how the Ledger can later be reloaded from the Journal.
|
||||
|
||||
By default nothing is compacted from the Journal if the SQN of the Journal entry is greater than the largest sequence number which has been persisted in the Ledger. So when an object is compacted in the Journal (as it has been replaced), it should not need to be replayed from the Journal into the Ledger in the future - as it, and all its related key changes, have already been persisted to the Ledger.
|
||||
|
||||
However, what if the Ledger had been erased? This could happen due to some corruption, or perhaps because only the Journal is to be backed up. As the object has been replaced, the value is not required - however the KeyChanges may be required (such as indexes which are built incrementally across a series of object changes). In this case the Key Changes would need to be retained, so that the indexes in the Ledger could be correctly rebuilt.
|
||||
|
||||
There are three potential strategies:
|
||||
|
||||
`skip` - don't worry about this scenario, require the Ledger to be backed up;
|
||||
`retain` - discard the object itself on compaction but keep the key changes;
|
||||
`recalc` - recalculate the indexes on reload by comparing the information on the object with the current state of the Ledger (as would be required by the PUT process when comparing IndexSpecs at PUT time).
|
||||
|
||||
There is no code for `recalc` at present; it is simply a logical possibility. To set a reload strategy there should be an entry like `{reload_strategy, [{TagName, skip|retain}]}`. By default tags are pre-set to `retain`. If there is no need to handle a corrupted Ledger, then all tags could be set to `skip`.
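As a hedged illustration (the `o_rkv` tag is the object tag used for Riak objects elsewhere in this change set; whether `skip` is appropriate depends entirely on the backup approach):

```
%% Sketch: if the Ledger is separately backed up, the o_rkv tag can be set
%% to skip, so replaced objects are fully removed at journal compaction.
ReloadOpts = [{root_path, "/tmp/leveled_data"},
              {reload_strategy, [{o_rkv, skip}]}],
{ok, Bookie} = leveled_bookie:book_start(ReloadOpts).
```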
|
||||
|
||||
|
||||
## Compression Method
|
||||
|
||||
Compression method can be set to `native` or `lz4` (i.e. `{compression_method, native|lz4}`). Native compression will use the compress option in Erlang's native `term_to_binary/2` function, whereas lz4 compression will use a NIF'd LZ4 library.
|
||||
|
||||
This is the compression used both when writing an object to the Journal and when writing a block of keys to the Ledger. There is a throughput advantage of around 2-5% associated with using `lz4` compression.
|
||||
|
||||
## Compression Point
|
||||
|
||||
Compression point can be set using `{compression_point, on_receipt|on_compact}`. This refers only to compression in the Journal; key blocks are always compressed in the Ledger. The option is whether to accept additional PUT latency by compressing objects as they are received, or to defer compressing objects in the Journal until they are re-written as part of a compaction (which may never happen).
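A hedged example combining the two compression options (per the code comments later in this diff, `on_compact` is discouraged when using `lz4`, so this sketch compresses on receipt):

```
%% Sketch: lz4 block compression, compressing journal entries as they are
%% received (accepting some extra PUT latency in exchange for smaller files).
CompressionOpts = [{compression_method, lz4},
                   {compression_point, on_receipt}],
{ok, Bookie} = leveled_bookie:book_start([{root_path, "/tmp/leveled_data"}
                                            | CompressionOpts]).
```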
|
||||
|
||||
## Root Path
|
||||
|
||||
The root path is the name of the folder in which the database has been (or should be) persisted.
|
|
@ -83,6 +83,7 @@
|
|||
-define(SNAPSHOT_TIMEOUT, 300000).
|
||||
-define(CACHE_SIZE_JITTER, 25).
|
||||
-define(JOURNAL_SIZE_JITTER, 20).
|
||||
-define(ABSOLUTEMAX_JOURNALSIZE, 4000000000).
|
||||
-define(LONG_RUNNING, 80000).
|
||||
-define(RECENT_AAE, false).
|
||||
-define(COMPRESSION_METHOD, lz4).
|
||||
|
@ -198,6 +199,8 @@ book_start(RootPath, LedgerCacheSize, JournalSize, SyncStrategy) ->
|
|||
%% - compression_method
|
||||
%% - compression_point
|
||||
%%
|
||||
%% For full description of options see ../docs/STARTUP_OPTIONS.md
|
||||
%%
|
||||
%% Both of the first two options relate to compaction in the Journal. The
|
||||
%% retain_strategy determines if a skinny record of the object should be
|
||||
%% retained following compaction, and how that should be used when recovering
|
||||
|
@ -323,7 +326,7 @@ book_put(Pid, Bucket, Key, Object, IndexSpecs, Tag, TTL) ->
|
|||
-spec book_mput(pid(), list(tuple())) -> ok|pause.
|
||||
%% @doc
|
||||
%%
|
||||
%% When the store is being run in head_only mode, batches fo object specs may
|
||||
%% When the store is being run in head_only mode, batches of object specs may
|
||||
%% be inserted in to the store using book_mput/2. ObjectSpecs should be
|
||||
%% of the form {ObjectOp, Bucket, Key, SubKey, Value}. The Value will be
|
||||
%% stored within the HEAD of the object (in the Ledger), so the full object
|
||||
|
@ -336,7 +339,7 @@ book_mput(Pid, ObjectSpecs) ->
|
|||
-spec book_mput(pid(), list(tuple()), infinity|integer()) -> ok|pause.
|
||||
%% @doc
|
||||
%%
|
||||
%% When the store is being run in head_only mode, batches fo object specs may
|
||||
%% When the store is being run in head_only mode, batches of object specs may
|
||||
%% be inserted in to the store using book_mput/2. ObjectSpecs should be
|
||||
%% of the form {action, Bucket, Key, SubKey, Value}. The Value will be
|
||||
%% stored within the HEAD of the object (in the Ledger), so the full object
|
||||
|
@ -868,6 +871,101 @@ fetch_value(Inker, {Key, SQN}) ->
|
|||
%%% Internal functions
|
||||
%%%============================================================================
|
||||
|
||||
-spec startup(#inker_options{}, #penciller_options{}, book_state())
|
||||
-> {pid(), pid()}.
|
||||
%% @doc
|
||||
%% Startup the Inker and the Penciller, and prompt the loading of the Penciller
|
||||
%% from the Inker. The Penciller may be shutdown without the latest data
|
||||
%% having been persisted: and so the Inker must be able to update the Penciller
|
||||
%% on startup with anything that happened but wasn't flushed to disk.
|
||||
startup(InkerOpts, PencillerOpts, State) ->
|
||||
{ok, Inker} = leveled_inker:ink_start(InkerOpts),
|
||||
{ok, Penciller} = leveled_penciller:pcl_start(PencillerOpts),
|
||||
LedgerSQN = leveled_penciller:pcl_getstartupsequencenumber(Penciller),
|
||||
leveled_log:log("B0005", [LedgerSQN]),
|
||||
ok = leveled_inker:ink_loadpcl(Inker,
|
||||
LedgerSQN + 1,
|
||||
get_loadfun(State),
|
||||
Penciller),
|
||||
{Inker, Penciller}.
|
||||
|
||||
|
||||
-spec set_options(list()) -> {#inker_options{}, #penciller_options{}}.
|
||||
%% @doc
|
||||
%% Take the passed in property list of operations and extract out any relevant
|
||||
%% options to the Inker or the Penciller
|
||||
set_options(Opts) ->
|
||||
MaxJournalSize0 =
|
||||
min(?ABSOLUTEMAX_JOURNALSIZE,
|
||||
get_opt(max_journalsize, Opts, 1000000000)),
|
||||
JournalSizeJitter = MaxJournalSize0 div (100 div ?JOURNAL_SIZE_JITTER),
|
||||
MaxJournalSize =
|
||||
min(?ABSOLUTEMAX_JOURNALSIZE,
|
||||
MaxJournalSize0 - erlang:phash2(self()) rem JournalSizeJitter),
|
||||
|
||||
SyncStrat = get_opt(sync_strategy, Opts, sync),
|
||||
WRP = get_opt(waste_retention_period, Opts),
|
||||
|
||||
AltStrategy = get_opt(reload_strategy, Opts, []),
|
||||
ReloadStrategy = leveled_codec:inker_reload_strategy(AltStrategy),
|
||||
|
||||
PCLL0CacheSize = get_opt(max_pencillercachesize, Opts),
|
||||
RootPath = get_opt(root_path, Opts),
|
||||
|
||||
JournalFP = RootPath ++ "/" ++ ?JOURNAL_FP,
|
||||
LedgerFP = RootPath ++ "/" ++ ?LEDGER_FP,
|
||||
ok = filelib:ensure_dir(JournalFP),
|
||||
ok = filelib:ensure_dir(LedgerFP),
|
||||
|
||||
CompressionMethod =
|
||||
case get_opt(compression_method, Opts, ?COMPRESSION_METHOD) of
|
||||
native ->
|
||||
% Note native compression will have reduced performance
|
||||
% https://github.com/martinsumner/leveled/issues/95
|
||||
native;
|
||||
lz4 ->
|
||||
% Must include lz4 library in rebar.config
|
||||
lz4
|
||||
end,
|
||||
CompressOnReceipt =
|
||||
case get_opt(compression_point, Opts, ?COMPRESSION_POINT) of
|
||||
on_receipt ->
|
||||
% Note this will add measurable delay to PUT time
|
||||
% https://github.com/martinsumner/leveled/issues/95
|
||||
true;
|
||||
on_compact ->
|
||||
% If using lz4 this is not recommended
|
||||
false
|
||||
end,
|
||||
|
||||
{#inker_options{root_path = JournalFP,
|
||||
reload_strategy = ReloadStrategy,
|
||||
max_run_length = get_opt(max_run_length, Opts),
|
||||
waste_retention_period = WRP,
|
||||
compression_method = CompressionMethod,
|
||||
compress_on_receipt = CompressOnReceipt,
|
||||
cdb_options =
|
||||
#cdb_options{max_size=MaxJournalSize,
|
||||
binary_mode=true,
|
||||
sync_strategy=SyncStrat}},
|
||||
#penciller_options{root_path = LedgerFP,
|
||||
max_inmemory_tablesize = PCLL0CacheSize,
|
||||
levelzero_cointoss = true,
|
||||
compression_method = CompressionMethod}}.
|
||||
|
||||
|
||||
-spec return_snapfun(book_state(), store|ledger,
|
||||
tuple()|no_lookup|undefined,
|
||||
boolean(), boolean()) -> fun().
|
||||
%% @doc
|
||||
%% Generates a function from which a snapshot can be created. The primary
|
||||
%% factor here is the SnapPreFold boolean. If this is true then the snapshot
|
||||
%% will be taken before the Fold function is returned. If SnapPreFold is
|
||||
%% false then the snapshot will be taken when the Fold function is called.
|
||||
%%
|
||||
%% SnapPreFold is to be used when the intention is to queue the fold, and so
|
||||
%% calling of the fold may be delayed, but it is still desired that the fold
|
||||
%% represent the point in time that the query was requested.
|
||||
return_snapfun(State, SnapType, Query, LongRunning, SnapPreFold) ->
|
||||
case SnapPreFold of
|
||||
true ->
|
||||
|
@ -883,6 +981,13 @@ return_snapfun(State, SnapType, Query, LongRunning, SnapPreFold) ->
|
|||
fun() -> book_snapshot(Self, SnapType, Query, LongRunning) end
|
||||
end.
|
||||
|
||||
-spec snaptype_by_presence(boolean()) -> store|ledger.
|
||||
%% @doc
|
||||
%% Folds that traverse over object heads may also either require to return
|
||||
%% the object, or at least confirm the object is present in the Ledger. This
|
||||
%% is achieved by enabling presence - and this will change the type of
|
||||
%% snapshot to one that covers the whole store (i.e. both ledger and journal),
|
||||
%% rather than just the ledger.
|
||||
snaptype_by_presence(true) ->
|
||||
store;
|
||||
snaptype_by_presence(false) ->
|
||||
|
@ -918,7 +1023,6 @@ get_runner(State, {keylist, Tag, Bucket, FoldAccT}) ->
|
|||
get_runner(State, {keylist, Tag, Bucket, KeyRange, FoldAccT}) ->
|
||||
SnapFun = return_snapfun(State, ledger, no_lookup, true, true),
|
||||
leveled_runner:bucketkey_query(SnapFun, Tag, Bucket, KeyRange, FoldAccT);
|
||||
|
||||
%% Set of runners for object or metadata folds
|
||||
get_runner(State,
|
||||
{foldheads_allkeys,
|
||||
|
@ -1078,6 +1182,11 @@ return_ledger_keyrange(Tag, Bucket, KeyRange) ->
|
|||
{StartKey, EndKey, SnapQuery}.
|
||||
|
||||
|
||||
-spec maybe_longrunning(erlang:timestamp(), atom()) -> ok.
|
||||
%% @doc
|
||||
%% Check the length of time an operation (named by Aspect) has taken, and
|
||||
%% see if it has crossed the long running threshold. If so log to indicate
|
||||
%% a long running event has occurred.
|
||||
maybe_longrunning(SW, Aspect) ->
|
||||
case timer:now_diff(os:timestamp(), SW) of
|
||||
N when N > ?LONG_RUNNING ->
|
||||
|
@ -1086,6 +1195,12 @@ maybe_longrunning(SW, Aspect) ->
|
|||
ok
|
||||
end.
|
||||
|
||||
-spec readycache_forsnapshot(ledger_cache(), tuple()|no_lookup|undefined)
|
||||
-> ledger_cache().
|
||||
%% @doc
|
||||
%% Strip the ledger cache back to only the relevant information needed in
|
||||
%% the query, and to make the cache a snapshot (and so not subject to changes
|
||||
%% such as additions to the ets table)
|
||||
readycache_forsnapshot(LedgerCache, {StartKey, EndKey}) ->
|
||||
{KL, MinSQN, MaxSQN} = scan_table(LedgerCache#ledger_cache.mem,
|
||||
StartKey,
|
||||
|
@ -1127,6 +1242,13 @@ readycache_forsnapshot(LedgerCache, Query) ->
|
|||
max_sqn=LedgerCache#ledger_cache.max_sqn}
|
||||
end.
|
||||
|
||||
-spec scan_table(ets:tab(), tuple(), tuple()) ->
|
||||
{list(), non_neg_integer()|infinity, non_neg_integer()}.
|
||||
%% @doc
|
||||
%% Query the ETS table to find a range of keys (start inclusive). Should also
|
||||
%% return the minimum and maximum sequence number found in the query. This
|
||||
%% is just then used as a safety check when loading these results into the
|
||||
%% penciller snapshot
|
||||
scan_table(Table, StartKey, EndKey) ->
|
||||
case ets:lookup(Table, StartKey) of
|
||||
[] ->
|
||||
|
@ -1158,74 +1280,11 @@ scan_table(Table, StartKey, EndKey, Acc, MinSQN, MaxSQN) ->
|
|||
end.
|
||||
|
||||
|
||||
set_options(Opts) ->
|
||||
MaxJournalSize0 = get_opt(max_journalsize, Opts, 10000000000),
|
||||
JournalSizeJitter = MaxJournalSize0 div (100 div ?JOURNAL_SIZE_JITTER),
|
||||
MaxJournalSize = MaxJournalSize0 -
|
||||
erlang:phash2(self()) rem JournalSizeJitter,
|
||||
|
||||
SyncStrat = get_opt(sync_strategy, Opts, sync),
|
||||
WRP = get_opt(waste_retention_period, Opts),
|
||||
|
||||
AltStrategy = get_opt(reload_strategy, Opts, []),
|
||||
ReloadStrategy = leveled_codec:inker_reload_strategy(AltStrategy),
|
||||
|
||||
PCLL0CacheSize = get_opt(max_pencillercachesize, Opts),
|
||||
RootPath = get_opt(root_path, Opts),
|
||||
|
||||
JournalFP = RootPath ++ "/" ++ ?JOURNAL_FP,
|
||||
LedgerFP = RootPath ++ "/" ++ ?LEDGER_FP,
|
||||
ok = filelib:ensure_dir(JournalFP),
|
||||
ok = filelib:ensure_dir(LedgerFP),
|
||||
|
||||
CompressionMethod =
|
||||
case get_opt(compression_method, Opts, ?COMPRESSION_METHOD) of
|
||||
native ->
|
||||
% Note native compression will have reduced performance
|
||||
% https://github.com/martinsumner/leveled/issues/95
|
||||
native;
|
||||
lz4 ->
|
||||
% Must include lz4 library in rebar.config
|
||||
lz4
|
||||
end,
|
||||
CompressOnReceipt =
|
||||
case get_opt(compression_point, Opts, ?COMPRESSION_POINT) of
|
||||
on_receipt ->
|
||||
% Note this will add measurable delay to PUT time
|
||||
% https://github.com/martinsumner/leveled/issues/95
|
||||
true;
|
||||
on_compact ->
|
||||
% If using lz4 this is not recommended
|
||||
false
|
||||
end,
|
||||
|
||||
{#inker_options{root_path = JournalFP,
|
||||
reload_strategy = ReloadStrategy,
|
||||
max_run_length = get_opt(max_run_length, Opts),
|
||||
waste_retention_period = WRP,
|
||||
compression_method = CompressionMethod,
|
||||
compress_on_receipt = CompressOnReceipt,
|
||||
cdb_options =
|
||||
#cdb_options{max_size=MaxJournalSize,
|
||||
binary_mode=true,
|
||||
sync_strategy=SyncStrat}},
|
||||
#penciller_options{root_path = LedgerFP,
|
||||
max_inmemory_tablesize = PCLL0CacheSize,
|
||||
levelzero_cointoss = true,
|
||||
compression_method = CompressionMethod}}.
|
||||
|
||||
startup(InkerOpts, PencillerOpts, State) ->
|
||||
{ok, Inker} = leveled_inker:ink_start(InkerOpts),
|
||||
{ok, Penciller} = leveled_penciller:pcl_start(PencillerOpts),
|
||||
LedgerSQN = leveled_penciller:pcl_getstartupsequencenumber(Penciller),
|
||||
leveled_log:log("B0005", [LedgerSQN]),
|
||||
ok = leveled_inker:ink_loadpcl(Inker,
|
||||
LedgerSQN + 1,
|
||||
get_loadfun(State),
|
||||
Penciller),
|
||||
{Inker, Penciller}.
|
||||
|
||||
|
||||
-spec fetch_head(tuple(), pid(), ledger_cache()) -> not_present|tuple().
|
||||
%% @doc
|
||||
%% Fetch only the head of the object from the Ledger (or the bookie's recent
|
||||
%% ledger cache if it has just been updated). not_present is returned if the
|
||||
%% Key is not found
|
||||
fetch_head(Key, Penciller, LedgerCache) ->
|
||||
SW = os:timestamp(),
|
||||
CacheResult =
|
||||
|
@ -1282,6 +1341,13 @@ preparefor_ledgercache(_InkTag,
|
|||
{KeyH, SQN, KeyChanges}.
|
||||
|
||||
|
||||
-spec addto_ledgercache({integer()|no_lookup,
|
||||
integer(), list()}, ledger_cache())
|
||||
-> ledger_cache().
|
||||
%% @doc
|
||||
%% Add a set of changes associated with a single sequence number (journal
|
||||
%% update) and key to the ledger cache. If the changes are not to be looked
|
||||
%% up directly, then they will not be indexed to accelerate lookup
|
||||
addto_ledgercache({H, SQN, KeyChanges}, Cache) ->
|
||||
ets:insert(Cache#ledger_cache.mem, KeyChanges),
|
||||
UpdIndex = leveled_pmem:prepare_for_index(Cache#ledger_cache.index, H),
|
||||
|
@ -1289,6 +1355,15 @@ addto_ledgercache({H, SQN, KeyChanges}, Cache) ->
|
|||
min_sqn=min(SQN, Cache#ledger_cache.min_sqn),
|
||||
max_sqn=max(SQN, Cache#ledger_cache.max_sqn)}.
|
||||
|
||||
-spec addto_ledgercache({integer()|no_lookup,
|
||||
integer(), list()}, ledger_cache(), loader)
|
||||
-> ledger_cache().
|
||||
%% @doc
|
||||
%% Add a set of changes associated with a single sequence number (journal
|
||||
%% update) to the ledger cache. This is used explicitly when loading the
|
||||
%% ledger from the Journal (i.e. at startup) - and in this case the ETS insert
|
||||
%% can be bypassed, as all changes will be flushed to the Penciller before the
|
||||
%% load is complete.
|
||||
addto_ledgercache({H, SQN, KeyChanges}, Cache, loader) ->
|
||||
UpdQ = KeyChanges ++ Cache#ledger_cache.load_queue,
|
||||
UpdIndex = leveled_pmem:prepare_for_index(Cache#ledger_cache.index, H),
|
||||
|
@ -1298,6 +1373,18 @@ addto_ledgercache({H, SQN, KeyChanges}, Cache, loader) ->
|
|||
max_sqn=max(SQN, Cache#ledger_cache.max_sqn)}.
|
||||
|
||||
|
||||
-spec maybepush_ledgercache(integer(), ledger_cache(), pid())
|
||||
-> {ok|returned, ledger_cache()}.
|
||||
%% @doc
|
||||
%% Following an update to the ledger cache, check if this is now big enough to be
|
||||
%% pushed down to the Penciller. There is some random jittering here, to
|
||||
%% prevent coordination across leveled instances (e.g. when running in Riak).
|
||||
%%
|
||||
%% The penciller may be too busy, as the LSM tree is backed up with merge
|
||||
%% activity. In this case the update is not made and 'returned' not ok is set
|
||||
%% in the reply. Try again later when it isn't busy (and also potentially
|
||||
%% implement a slow_offer state to slow down the pace at which PUTs are being
|
||||
%% received)
|
||||
maybepush_ledgercache(MaxCacheSize, Cache, Penciller) ->
|
||||
Tab = Cache#ledger_cache.mem,
|
||||
CacheSize = ets:info(Tab, size),
|
||||
|
@ -1321,7 +1408,10 @@ maybepush_ledgercache(MaxCacheSize, Cache, Penciller) ->
|
|||
{ok, Cache}
|
||||
end.
|
||||
|
||||
|
||||
-spec maybe_withjitter(integer(), integer()) -> boolean().
|
||||
%% @doc
|
||||
%% Push down randomly, but the closer to the maximum size, the more likely a
|
||||
%% push should be
|
||||
maybe_withjitter(CacheSize, MaxCacheSize) ->
|
||||
if
|
||||
CacheSize > MaxCacheSize ->
|
||||
|
@ -1337,6 +1427,10 @@ maybe_withjitter(CacheSize, MaxCacheSize) ->
|
|||
end.
|
||||
|
||||
|
||||
-spec get_loadfun(book_state()) -> fun().
|
||||
%% @doc
|
||||
%% The LoadFun will be used by the Inker when walking across the Journal to
|
||||
%% load the Penciller at startup
|
||||
get_loadfun(State) ->
|
||||
PrepareFun =
|
||||
fun(Tag, PK, SQN, Obj, VS, IdxSpecs) ->
|
||||
|
|
|
@ -21,11 +21,13 @@
|
|||
-define(BAND_MASK, ?INTEGER_SIZE - 1).
|
||||
|
||||
|
||||
-type bloom() :: binary().
|
||||
|
||||
%%%============================================================================
|
||||
%%% API
|
||||
%%%============================================================================
|
||||
|
||||
-spec create_bloom(list(integer())) -> binary().
|
||||
-spec create_bloom(list(integer())) -> bloom().
|
||||
%% @doc
|
||||
%% Create a binary bloom filter from a list of hashes
|
||||
create_bloom(HashList) ->
|
||||
|
@ -51,7 +53,7 @@ create_bloom(HashList) ->
|
|||
end.
|
||||
|
||||
|
||||
-spec check_hash(integer(), binary()) -> boolean().
|
||||
-spec check_hash(integer(), bloom()) -> boolean().
|
||||
%% @doc
|
||||
%% Check for the presence of a given hash within a bloom
|
||||
check_hash(_Hash, <<>>) ->
|
||||
|
|
|
@ -145,6 +145,7 @@
|
|||
|
||||
|
||||
-type inker_options() :: #inker_options{}.
|
||||
-type ink_state() :: #state{}.
|
||||
|
||||
|
||||
%%%============================================================================
|
||||
|
@ -579,6 +580,9 @@ code_change(_OldVsn, State, _Extra) ->
|
|||
%%% Internal functions
|
||||
%%%============================================================================
|
||||
|
||||
-spec start_from_file(inker_options()) -> {ok, ink_state()}.
|
||||
%% @doc
|
||||
%% Start an Inker from the state on disk (i.e. not a snapshot).
|
||||
start_from_file(InkOpts) ->
|
||||
% Setting the correct CDB options is important when starting the inker, in
|
||||
% particular for waste retention which is determined by the CDB options
|
||||
|
@ -646,6 +650,9 @@ shutdown_manifest(Manifest) ->
|
|||
ManAsList = leveled_imanifest:to_list(Manifest),
|
||||
close_allmanifest(ManAsList).
|
||||
|
||||
-spec get_cdbopts(inker_options()) -> #cdb_options{}.
|
||||
%% @doc
|
||||
%% Extract the options for the individual Journal files from the Inker options
|
||||
get_cdbopts(InkOpts)->
|
||||
CDBopts = InkOpts#inker_options.cdb_options,
|
||||
WasteFP =
|
||||
|
@ -664,6 +671,14 @@ get_cdbopts(InkOpts)->
|
|||
CDBopts#cdb_options{waste_path = WasteFP}.
|
||||
|
||||
|
||||
-spec put_object(tuple(), any(), list(), ink_state())
|
||||
-> {ok|rolling, ink_state(), integer()}.
|
||||
%% @doc
|
||||
%% Add the object to the current journal if it fits. If it doesn't fit, a new
|
||||
%% journal must be started, and the old journal is set to "roll" into a read
|
||||
%% only Journal.
|
||||
%% The reply contains the byte_size of the object, using the size calculated
|
||||
%% to store the object.
|
||||
put_object(LedgerKey, Object, KeyChanges, State) ->
|
||||
NewSQN = State#state.journal_sqn + 1,
|
||||
ActiveJournal = State#state.active_journaldb,
|
||||
|
@ -710,6 +725,12 @@ put_object(LedgerKey, Object, KeyChanges, State) ->
|
|||
end.
|
||||
|
||||
|
||||
-spec get_object(tuple(), integer(), leveled_imanifest:manifest()) -> any().
|
||||
%% @doc
|
||||
%% Find the SQN in the manifest and then fetch the object from the Journal,
|
||||
%% in the manifest. If the fetch is in response to a user GET request then
|
||||
%% the KeyChanges are irrelevant, so no need to process them. In this case
|
||||
%% the KeyChanges are processed (as ToIgnoreKeyChanges will be set to false).
|
||||
get_object(LedgerKey, SQN, Manifest) ->
|
||||
get_object(LedgerKey, SQN, Manifest, false).
|
||||
|
||||
|
@ -720,12 +741,28 @@ get_object(LedgerKey, SQN, Manifest, ToIgnoreKeyChanges) ->
|
|||
Obj = leveled_cdb:cdb_get(JournalP, InkerKey),
|
||||
leveled_codec:from_inkerkv(Obj, ToIgnoreKeyChanges).
|
||||
|
||||
|
||||
-spec key_check(tuple(), integer(), leveled_imanifest:manifest())
|
||||
-> missing|probably.
|
||||
%% @doc
|
||||
%% Checks for the presence of the key at that SQN within the journal,
|
||||
%% avoiding the cost of actually reading the object from disk.
|
||||
%% a KeyCheck is not absolute proof of the existence of the object - there
|
||||
%% could be a hash collision, or the on-disk object could be corrupted. So
|
||||
%% the positive answer is 'probably' not 'true'
|
||||
key_check(LedgerKey, SQN, Manifest) ->
|
||||
JournalP = leveled_imanifest:find_entry(SQN, Manifest),
|
||||
{InkerKey, _V, true} =
|
||||
leveled_codec:to_inkerkv(LedgerKey, SQN, to_fetch),
|
||||
leveled_cdb:cdb_keycheck(JournalP, InkerKey).
|
||||
|
||||
|
||||
-spec build_manifest(list(), list(), #cdb_options{}) ->
|
||||
{leveled_imanifest:manifest(), integer(), integer(), pid()}.
|
||||
%% @doc
|
||||
%% Selects the correct manifest to open, and then starts a process for each
|
||||
%% file in the manifest, storing the PID for that process within the manifest.
|
||||
%% Opens an active journal if one is not present.
|
||||
build_manifest(ManifestFilenames,
|
||||
RootPath,
|
||||
CDBopts) ->
|
||||
|
@ -779,6 +816,10 @@ build_manifest(ManifestFilenames,
|
|||
{OpenManifest, UpdManifestSQN, JournalSQN, ActiveJournal}.
|
||||
|
||||
|
||||
-spec close_allmanifest(list()) -> ok.
|
||||
%% @doc
|
||||
%% Close every file in the manifest. Will cause deletion of any delete_pending
|
||||
%% files.
|
||||
close_allmanifest([]) ->
|
||||
ok;
|
||||
close_allmanifest([H|ManifestT]) ->
|
||||
|
@ -787,6 +828,11 @@ close_allmanifest([H|ManifestT]) ->
|
|||
close_allmanifest(ManifestT).
|
||||
|
||||
|
||||
-spec open_all_manifest(leveled_imanifest:manifest(), list(), #cdb_options{})
|
||||
-> leveled_imanifest:manifest().
|
||||
%% @doc
|
||||
%% Open all the files in the manifest, updating the manifest with the PIDs
|
||||
%% of the opened files
|
||||
open_all_manifest([], RootPath, CDBOpts) ->
|
||||
leveled_log:log("I0011", []),
|
||||
leveled_imanifest:add_entry([],
|
||||
|
|
|
@ -943,7 +943,10 @@ sst_filename(ManSQN, Level, Count) ->
|
|||
%%% Internal functions
|
||||
%%%============================================================================
|
||||
|
||||
|
||||
-spec start_from_file(penciller_options()) -> {ok, pcl_state()}.
|
||||
%% @doc
|
||||
%% Normal start of a penciller (i.e. not a snapshot), needs to read the
|
||||
%% filesystem and reconstruct the ledger from the files that it finds
|
||||
start_from_file(PCLopts) ->
|
||||
RootPath = PCLopts#penciller_options.root_path,
|
||||
MaxTableSize =
|
||||
|
@ -1035,6 +1038,12 @@ shutdown_manifest(Manifest)->
|
|||
leveled_pmanifest:close_manifest(Manifest, EntryCloseFun).
|
||||
|
||||
|
||||
-spec archive_files(list(), list()) -> ok.
|
||||
%% @doc
|
||||
%% Archive any sst files in the folder that have not been used to build the
|
||||
%% ledger at startup. They may not have been deleted as expected, so this saves
|
||||
%% them off as non-SST files to make it easier for an admin to garbage collect
|
||||
%% these files
|
||||
archive_files(RootPath, UsedFileList) ->
|
||||
{ok, AllFiles} = file:list_dir(sst_rootpath(RootPath)),
|
||||
FileCheckFun =
|
||||
|
@ -1066,9 +1075,20 @@ archive_files(RootPath, UsedFileList) ->
|
|||
ok.
|
||||
|
||||
|
||||
-spec update_levelzero(integer(), tuple(), integer(), list(), pcl_state())
|
||||
-> pcl_state().
|
||||
%% @doc
|
||||
%% Update the in-memory cache of recent changes for the penciller. This is
|
||||
%% the level zero at the top of the tree.
|
||||
%% Once the update is made, there needs to be a decision to potentially roll
|
||||
%% the level-zero memory to an on-disk level zero sst file. This can only
|
||||
%% happen when the cache has exceeded the size threshold (with some jitter
|
||||
%% to prevent coordination across multiple leveled instances), and when there
|
||||
%% is no level zero file already present, and when there is no manifest change
|
||||
%% pending.
|
||||
update_levelzero(L0Size, {PushedTree, PushedIdx, MinSQN, MaxSQN},
|
||||
LedgerSQN, L0Cache, State) ->
|
||||
SW = os:timestamp(),
|
||||
SW = os:timestamp(), % Time this for logging purposes
|
||||
Update = leveled_pmem:add_to_cache(L0Size,
|
||||
{PushedTree, MinSQN, MaxSQN},
|
||||
LedgerSQN,
|
||||
|
@ -1116,7 +1136,12 @@ update_levelzero(L0Size, {PushedTree, PushedIdx, MinSQN, MaxSQN},
|
|||
end
|
||||
end.
|
||||
|
||||
|
||||
-spec roll_memory(pcl_state(), boolean())
|
||||
-> {pid(), leveled_ebloom:bloom()|none}.
|
||||
%% @doc
|
||||
%% Roll the in-memory cache into a L0 file. If this is done synchronously,
|
||||
%% will return a bloom representing the contents of the file.
|
||||
%%
|
||||
%% Casting a large object (the levelzero cache) to the gen_server did not lead
|
||||
%% to an immediate return as expected. With 32K keys in the TreeList it could
|
||||
%% take around 35-40ms.
|
||||
|
@ -1126,7 +1151,6 @@ update_levelzero(L0Size, {PushedTree, PushedIdx, MinSQN, MaxSQN},
|
|||
%%
|
||||
%% The Wait is set to false to use a cast when calling this in normal operation
|
||||
%% whereas the Wait of true is used at shutdown
|
||||
|
||||
roll_memory(State, false) ->
|
||||
ManSQN = leveled_pmanifest:get_manifest_sqn(State#state.manifest) + 1,
|
||||
RootPath = sst_rootpath(State#state.root_path),
|
||||
|
@ -1159,12 +1183,33 @@ roll_memory(State, true) ->
|
|||
{ok, Constructor, _, Bloom} = R,
|
||||
{Constructor, Bloom}.
|
||||
|
||||
|
||||
-spec timed_fetch_mem(tuple(), {integer(), integer()},
|
||||
leveled_pmanifest:manifest(), list(),
|
||||
leveled_pmem:index_array(), pcl_timings())
|
||||
-> {tuple(), pcl_timings()}.
|
||||
%% @doc
|
||||
%% Fetch the result from the penciller, starting by looking in the memory,
|
||||
%% and if it is not found looking down level by level through the LSM tree.
|
||||
%%
|
||||
%% This allows for the request to be timed, and the timing result to be added
|
||||
%% to the aggregate timings - so that timings per level can be logged and
|
||||
%% the cost of requests dropping levels can be monitored.
|
||||
%%
|
||||
%% the result tuple includes the level at which the result was found.
|
||||
timed_fetch_mem(Key, Hash, Manifest, L0Cache, L0Index, Timings) ->
|
||||
SW = os:timestamp(),
|
||||
{R, Level} = fetch_mem(Key, Hash, Manifest, L0Cache, L0Index),
|
||||
UpdTimings = update_timings(SW, Timings, R, Level),
|
||||
{R, UpdTimings}.
|
||||
|
||||
|
||||
-spec plain_fetch_mem(tuple(), {integer(), integer()},
|
||||
leveled_pmanifest:manifest(), list(),
|
||||
leveled_pmem:index_array()) -> not_present|tuple().
|
||||
%% @doc
|
||||
%% Fetch the result from the penciller, starting by looking in the memory,
|
||||
%% and if it is not found looking down level by level through the LSM tree.
|
||||
plain_fetch_mem(Key, Hash, Manifest, L0Cache, L0Index) ->
|
||||
R = fetch_mem(Key, Hash, Manifest, L0Cache, L0Index),
|
||||
element(1, R).
|
||||
|
@ -1179,6 +1224,14 @@ fetch_mem(Key, Hash, Manifest, L0Cache, L0Index) ->
|
|||
{KV, memory}
|
||||
end.
|
||||
|
||||
-spec fetch(tuple(), {integer(), integer()},
|
||||
leveled_pmanifest:manifest(), integer(),
|
||||
fun()) -> {tuple()|not_present, integer()|basement}.
|
||||
%% @doc
|
||||
%% Fetch from the persisted portion of the LSM tree, checking each level in
|
||||
%% turn until a match is found.
|
||||
%% Levels can be skipped by checking the bloom for the relevant file at that
|
||||
%% level.
|
||||
fetch(_Key, _Hash, _Manifest, ?MAX_LEVELS + 1, _FetchFun) ->
|
||||
{not_present, basement};
|
||||
fetch(Key, Hash, Manifest, Level, FetchFun) ->
|
||||
|
@ -1217,6 +1270,12 @@ log_slowfetch(T0, R, PID, Level, FetchTolerance) ->
|
|||
R
|
||||
end.
|
||||
|
||||
|
||||
-spec compare_to_sqn(tuple()|not_present, integer()) -> boolean().
|
||||
%% @doc
|
||||
%% Check to see if the SQN in the penciller is after the SQN expected for an
|
||||
%% object (used to allow the journal to check compaction status from a cache
|
||||
%% of the ledger - objects with a more recent sequence number can be compacted).
|
||||
compare_to_sqn(Obj, SQN) ->
|
||||
case Obj of
|
||||
not_present ->
|
||||
|
|
|
@ -523,7 +523,7 @@ space_clear_ondelete(_Config) ->
|
|||
no_check,
|
||||
G2),
|
||||
|
||||
FoldKeysFun = fun(B, K, Acc) -> Acc ++ [{B, K}] end,
|
||||
FoldKeysFun = fun(B, K, Acc) -> [{B, K}|Acc] end,
|
||||
AllKeyQuery = {keylist, o_rkv, {FoldKeysFun, []}},
|
||||
{async, F1} = leveled_bookie:book_returnfolder(Book1, AllKeyQuery),
|
||||
SW1 = os:timestamp(),
|
||||
|
@ -548,7 +548,12 @@ space_clear_ondelete(_Config) ->
|
|||
?RIAK_TAG,
|
||||
FoldObjectsFun,
|
||||
false}),
|
||||
% This query does not Snap PreFold - and so will not prevent
|
||||
% pending deletes from prompting actual deletes
|
||||
|
||||
{async, KF1} = leveled_bookie:book_returnfolder(Book1, AllKeyQuery),
|
||||
% This query does Snap PreFold, and so will prevent deletes from
|
||||
% the ledger
|
||||
|
||||
% Delete the keys
|
||||
SW2 = os:timestamp(),
|
||||
|
@ -581,9 +586,14 @@ space_clear_ondelete(_Config) ->
|
|||
io:format("Waiting for journal deletes - blocked~n"),
|
||||
timer:sleep(20000),
|
||||
|
||||
% for this query snapshot is made at fold time
|
||||
io:format("Sleep over - Fold Objects query ~n"),
|
||||
% for this query snapshot is made at fold time, and so the results are
|
||||
% empty
|
||||
true = length(HTreeF1()) == 0,
|
||||
|
||||
% This query uses a genuine async fold on a snapshot made at request time
|
||||
% and so the results should be non-empty
|
||||
io:format("Now Query 2 - Fold Keys query~n"),
|
||||
true = length(KF1()) == 80000,
|
||||
|
||||
io:format("Waiting for journal deletes - unblocked~n"),
|
||||
|
|
|
@ -175,7 +175,7 @@ aae_bustedjournal(_Config) ->
|
|||
testutil:corrupt_journal(RootPath, HeadF, 1000, 2048, 1000),
|
||||
{ok, Bookie2} = leveled_bookie:book_start(StartOpts),
|
||||
|
||||
FoldKeysFun = fun(B, K, Acc) -> Acc ++ [{B, K}] end,
|
||||
FoldKeysFun = fun(B, K, Acc) -> [{B, K}|Acc] end,
|
||||
AllKeyQuery = {keylist, o_rkv, {FoldKeysFun, []}},
|
||||
{async, KeyF} = leveled_bookie:book_returnfolder(Bookie2, AllKeyQuery),
|
||||
KeyList = KeyF(),
|
||||
|
|
|
@ -596,12 +596,12 @@ get_randomdate() ->
|
|||
[Year, Month, Day, Hour, Minute, Second])).
|
||||
|
||||
|
||||
foldkeysfun(_Bucket, Item, Acc) -> Acc ++ [Item].
|
||||
foldkeysfun(_Bucket, Item, Acc) -> [Item|Acc].
|
||||
|
||||
foldkeysfun_returnbucket(Bucket, {Term, Key}, Acc) ->
|
||||
Acc ++ [{Term, {Bucket, Key}}];
|
||||
[{Term, {Bucket, Key}}|Acc];
|
||||
foldkeysfun_returnbucket(Bucket, Key, Acc) ->
|
||||
Acc ++ [{Bucket, Key}].
|
||||
[{Bucket, Key}|Acc].
|
||||
|
||||
check_indexed_objects(Book, B, KSpecL, V) ->
|
||||
% Check all objects match, return what should be the results of an all
|
||||
|
|