diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..26dc633 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +*.beam +/.eunit +/_build \ No newline at end of file diff --git a/include/leveled.hrl b/include/leveled.hrl new file mode 100644 index 0000000..e685a39 --- /dev/null +++ b/include/leveled.hrl @@ -0,0 +1,84 @@ + +%% Tag to be used on standard Riak KV objects +-define(RIAK_TAG, o_rkv). +%% Tag to be used on K/V objects for non-Riak purposes +-define(STD_TAG, o). +%% Tag used for secondary index keys +-define(IDX_TAG, i). + +%% Inker key type used for 'normal' objects +-define(INKT_STND, stnd). +%% Inker key type used for objects which contain no value, only key changes +%% This is used currently for objects formed under a 'retain' strategy on Inker +%% compaction, but could be used for special set-type objects +-define(INKT_KEYD, keyd). +%% Inker key type used for tombstones +-define(INKT_TOMB, tomb). + +-record(sft_options, + {wait = true :: boolean(), + expire_tombstones = false :: boolean(), + penciller :: pid()}). + +-record(penciller_work, + {next_sqn :: integer(), + clerk :: pid(), + src_level :: integer(), + manifest :: list(), + start_time :: tuple(), + ledger_filepath :: string(), + manifest_file :: string(), + new_manifest :: list(), + unreferenced_files :: list(), + target_is_basement = false ::boolean()}). + +-record(level, + {level :: integer(), + is_basement = false :: boolean(), + timestamp :: integer()}). + +-record(manifest_entry, + {start_key :: tuple(), + end_key :: tuple(), + owner :: pid(), + filename :: string()}). + +-record(cdb_options, + {max_size :: integer(), + file_path :: string(), + binary_mode = false :: boolean()}). + +-record(inker_options, + {cdb_max_size :: integer(), + root_path :: string(), + cdb_options :: #cdb_options{}, + start_snapshot = false :: boolean(), + source_inker :: pid(), + reload_strategy = [] :: list(), + max_run_length}). + +-record(penciller_options, + {root_path :: string(), + max_inmemory_tablesize :: integer(), + start_snapshot = false :: boolean(), + source_penciller :: pid()}). + +-record(iclerk_options, + {inker :: pid(), + max_run_length :: integer(), + cdb_options :: #cdb_options{}, + reload_strategy = [] :: list()}). + +-record(r_content, { + metadata, + value :: term() + }). + +-record(r_object, { + bucket, + key, + contents :: [#r_content{}], + vclock, + updatemetadata=dict:store(clean, true, dict:new()), + updatevalue :: term()}). + \ No newline at end of file diff --git a/rebar.lock b/rebar.lock new file mode 100644 index 0000000..57afcca --- /dev/null +++ b/rebar.lock @@ -0,0 +1 @@ +[]. diff --git a/src/eleveleddb.app.src b/src/eleveleddb.app.src new file mode 100644 index 0000000..37b004d --- /dev/null +++ b/src/eleveleddb.app.src @@ -0,0 +1,12 @@ +{application, eleveleddb, + [ + {description, "Key Value store based on LSM-Tree and designed for larger values"}, + {vsn, "1"}, + {registered, []}, + {applications, [ + kernel, + stdlib + ]}, + {mod, { eleveleddb_app, []}}, + {env, [{root_path, "test"}]} + ]}. diff --git a/src/eleveleddb_app.erl b/src/eleveleddb_app.erl new file mode 100644 index 0000000..18f7546 --- /dev/null +++ b/src/eleveleddb_app.erl @@ -0,0 +1,16 @@ +-module(eleveleddb_app). + +-behaviour(application). + +%% Application callbacks +-export([start/2, stop/1]). 
+
+%% ===================================================================
+%% Application callbacks
+%% ===================================================================
+
+start(_StartType, _StartArgs) ->
+    eleveleddb_sup:start_link().
+
+stop(_State) ->
+    ok.
diff --git a/src/eleveleddb_sup.erl b/src/eleveleddb_sup.erl
new file mode 100644
index 0000000..391aca9
--- /dev/null
+++ b/src/eleveleddb_sup.erl
@@ -0,0 +1,27 @@
+-module(eleveleddb_sup).
+
+-behaviour(supervisor).
+
+%% API
+-export([start_link/0]).
+
+%% Supervisor callbacks
+-export([init/1]).
+
+%% Helper macro for declaring children of supervisor
+-define(CHILD(I, Type), {I, {I, start_link, []}, permanent, 5000, Type, [I]}).
+
+%% ===================================================================
+%% API functions
+%% ===================================================================
+
+start_link() ->
+    supervisor:start_link({local, leveled_bookie}, ?MODULE, []).
+
+%% ===================================================================
+%% Supervisor callbacks
+%% ===================================================================
+
+init([]) ->
+    {ok, { {one_for_one, 5, 10}, []} }.
+
diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl
new file mode 100644
index 0000000..0958ee9
--- /dev/null
+++ b/src/leveled_bookie.erl
@@ -0,0 +1,1188 @@
+%% -------- Overview ---------
+%%
+%% The eleveleddb store is based on the LSM-tree, similar to leveldb, except that:
+%% - Keys, Metadata and Values are not persisted together - the Keys and
+%% Metadata are kept in a tree-based ledger, whereas the values are stored
+%% only in a sequential Journal.
+%% - Different file formats are used for the Journal (based on constant
+%% database), and the Ledger (sft, based on sst)
+%% - It is not intended to be general purpose, but be specifically suited for
+%% use as a Riak backend in specific circumstances (relatively large values,
+%% and frequent use of iterators)
+%% - The Journal is an extended nursery log in leveldb terms. It is keyed
+%% on the sequence number of the write
+%% - The ledger is a merge tree, where the key is the actual object key, and
+%% the value is the metadata of the object including the sequence number
+%%
+%%
+%% -------- The actors ---------
+%%
+%% The store is fronted by a Bookie, who takes support from different actors:
+%% - An Inker who persists new data into the journal, and returns items from
+%% the journal based on sequence number
+%% - A Penciller who periodically redraws the ledger, which associates keys with
+%% sequence numbers and other metadata, as well as secondary keys (for index
+%% queries)
+%% - One or more Clerks, who may be used by either the inker or the penciller
+%% to fulfill background tasks
+%%
+%% Both the Inker and the Penciller maintain a manifest of the files which
+%% represent the current state of the Journal and the Ledger respectively.
+%% For the Inker the manifest maps ranges of sequence numbers to cdb files.
+%% For the Penciller the manifest maps key ranges to files at each level of
+%% the Ledger.
+%%
+%% -------- PUT --------
+%%
+%% A PUT request consists of
+%% - A Primary Key and a Value
+%% - IndexSpecs - a set of secondary key changes associated with the
+%% transaction
+%%
+%% The Bookie takes the put request and passes it first to the Inker to add
+%% the request to the Journal.
+%%
+%% The inker will pass the PK/Value/IndexSpecs to the current (append only)
+%% CDB journal file to persist the change. The call should return either 'ok'
+%% or 'roll'. 'roll' indicates that the CDB file has insufficient capacity for
+%% this write.
+%%
+%% (Note that storing the IndexSpecs will create some duplication with the
+%% Metadata wrapped up within the Object value. This Value and the IndexSpecs
+%% are compressed before storage, so this should provide some mitigation for
+%% the duplication).
+%%
+%% In response to a 'roll', the inker should:
+%% - start a new active journal file with an open_write_request, and then;
+%% - call to PUT the object in this file;
+%% - reply to the bookie, but then in the background
+%% - close the previously active journal file (writing the hashtree), and move
+%% it to the historic journal
+%%
+%% The inker will also return the SQN which the change has been made at, as
+%% well as the object size on disk within the Journal.
+%%
+%% Once the object has been persisted to the Journal, the Ledger can be updated.
+%% The Ledger is updated by the Bookie applying a function (extract_metadata/4)
+%% to the Value to return the Object Metadata, a function to generate a hash
+%% of the Value and also taking the Primary Key, the IndexSpecs, the Sequence
+%% Number in the Journal and the Object Size (returned from the Inker).
+%%
+%% The Bookie should generate a series of ledger key changes from this
+%% information, using a function passed in at startup. For Riak this will be
+%% of the form:
+%%    {{o_rkv, Bucket, Key, SubKey|null},
+%%        SQN,
+%%        {Hash, Size, {Riak_Metadata}},
+%%        {active, TS}|{tomb, TS}} or
+%%    {{i, Bucket, {IndexTerm, IndexField}, Key},
+%%        SQN,
+%%        null,
+%%        {active, TS}|{tomb, TS}}
+%%
+%% Recent Ledger changes are retained initially in the Bookie's memory (in a
+%% small generally balanced tree). Periodically, the current table is pushed to
+%% the Penciller for eventual persistence, and a new table is started.
+%%
+%% This completes the non-deferrable work associated with a PUT.
+%%
+%% -------- Snapshots (Key & Metadata Only) --------
+%%
+%% If there is a snapshot request (e.g. to iterate over the keys) the Bookie
+%% may request a clone of the Penciller, or the Penciller and the Inker.
+%%
+%% The clone is seeded with the manifest. The clone should be registered with
+%% the real Inker/Penciller, so that the real Inker/Penciller may prevent the
+%% deletion of files still in use by a snapshot clone.
+%%
+%% Iterators should de-register themselves from the Penciller on completion.
+%% Iterators should be automatically released after a timeout period. A file
+%% can only be deleted from the Ledger if it is no longer in the manifest, and
+%% there are no registered iterators from before the point the file was
+%% removed from the manifest.
+%%
+%% -------- Special Ops --------
+%%
+%% e.g. Get all for SegmentID/Partition
+%%
+%%
+%%
+%% -------- On Startup --------
+%%
+%% On startup the Bookie must restart both the Inker to load the Journal, and
+%% the Penciller to load the Ledger. Once the Penciller has started, the
+%% Bookie should request the highest sequence number in the Ledger, and then
+%% try to rebuild any missing information from the Journal.
+%%
+%% To rebuild the Ledger it requests the Inker to scan over the files from
+%% the sequence number and re-generate the Ledger changes - pushing the changes
+%% directly back into the Ledger.
+
+
+
+-module(leveled_bookie).
+
+-behaviour(gen_server).
+
+-include("include/leveled.hrl").
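+
+%% As an illustration of the ledger key changes described above (all values
+%% here are hypothetical), a Riak PUT of Bucket "B1" and Key "K1" at SQN 7
+%% with one index addition might generate a pair of changes of the form:
+%%    {{o_rkv, "B1", "K1", null}, 7, {Hash, Size, {Riak_Metadata}}, {active, TS}}
+%%    {{i, "B1", {IndexTerm, IndexField}, "K1"}, 7, null, {active, TS}}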
+ +-export([init/1, + handle_call/3, + handle_cast/2, + handle_info/2, + terminate/2, + code_change/3, + book_start/1, + book_start/3, + book_riakput/3, + book_riakdelete/4, + book_riakget/3, + book_riakhead/3, + book_put/5, + book_put/6, + book_tempput/7, + book_delete/4, + book_get/3, + book_head/3, + book_returnfolder/2, + book_snapshotstore/3, + book_snapshotledger/3, + book_compactjournal/2, + book_islastcompactionpending/1, + book_close/1]). + +-export([get_opt/2, + get_opt/3]). + +-include_lib("eunit/include/eunit.hrl"). + +-define(CACHE_SIZE, 2000). +-define(JOURNAL_FP, "journal"). +-define(LEDGER_FP, "ledger"). +-define(SNAPSHOT_TIMEOUT, 300000). +-define(CHECKJOURNAL_PROB, 0.2). +-define(SLOWOFFER_DELAY, 5). + +-record(state, {inker :: pid(), + penciller :: pid(), + cache_size :: integer(), + ledger_cache :: gb_trees:tree(), + is_snapshot :: boolean(), + slow_offer = false :: boolean()}). + + + +%%%============================================================================ +%%% API +%%%============================================================================ + +book_start(RootPath, LedgerCacheSize, JournalSize) -> + book_start([{root_path, RootPath}, + {cache_size, LedgerCacheSize}, + {max_journalsize, JournalSize}]). + +book_start(Opts) -> + gen_server:start(?MODULE, [Opts], []). + +book_riakput(Pid, RiakObject, IndexSpecs) -> + {Bucket, Key} = leveled_codec:riakto_keydetails(RiakObject), + book_put(Pid, Bucket, Key, RiakObject, IndexSpecs, ?RIAK_TAG). + +book_tempput(Pid, Bucket, Key, Object, IndexSpecs, Tag, TTL) when is_integer(TTL) -> + book_put(Pid, Bucket, Key, Object, IndexSpecs, Tag, TTL). + +book_put(Pid, Bucket, Key, Object, IndexSpecs) -> + book_put(Pid, Bucket, Key, Object, IndexSpecs, ?STD_TAG). + +book_put(Pid, Bucket, Key, Object, IndexSpecs, Tag) -> + book_put(Pid, Bucket, Key, Object, IndexSpecs, Tag, infinity). + +book_riakdelete(Pid, Bucket, Key, IndexSpecs) -> + book_put(Pid, Bucket, Key, delete, IndexSpecs, ?RIAK_TAG). + +book_delete(Pid, Bucket, Key, IndexSpecs) -> + book_put(Pid, Bucket, Key, delete, IndexSpecs, ?STD_TAG). + +book_riakget(Pid, Bucket, Key) -> + book_get(Pid, Bucket, Key, ?RIAK_TAG). + +book_get(Pid, Bucket, Key) -> + book_get(Pid, Bucket, Key, ?STD_TAG). + +book_riakhead(Pid, Bucket, Key) -> + book_head(Pid, Bucket, Key, ?RIAK_TAG). + +book_head(Pid, Bucket, Key) -> + book_head(Pid, Bucket, Key, ?STD_TAG). + +book_put(Pid, Bucket, Key, Object, IndexSpecs, Tag, TTL) -> + gen_server:call(Pid, + {put, Bucket, Key, Object, IndexSpecs, Tag, TTL}, + infinity). + +book_get(Pid, Bucket, Key, Tag) -> + gen_server:call(Pid, {get, Bucket, Key, Tag}, infinity). + +book_head(Pid, Bucket, Key, Tag) -> + gen_server:call(Pid, {head, Bucket, Key, Tag}, infinity). + +book_returnfolder(Pid, FolderType) -> + gen_server:call(Pid, {return_folder, FolderType}, infinity). + +book_snapshotstore(Pid, Requestor, Timeout) -> + gen_server:call(Pid, {snapshot, Requestor, store, Timeout}, infinity). + +book_snapshotledger(Pid, Requestor, Timeout) -> + gen_server:call(Pid, {snapshot, Requestor, ledger, Timeout}, infinity). + +book_compactjournal(Pid, Timeout) -> + gen_server:call(Pid, {compact_journal, Timeout}, infinity). + +book_islastcompactionpending(Pid) -> + gen_server:call(Pid, confirm_compact, infinity). + +book_close(Pid) -> + gen_server:call(Pid, close, infinity). 
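+
+%% A minimal usage sketch of the API above (the path and values are
+%% illustrative assumptions only):
+%%
+%%    {ok, Bookie} = book_start([{root_path, "/tmp/leveled"},
+%%                                {cache_size, 2000}]),
+%%    ok = book_put(Bookie, "Bucket1", "Key1", <<"Value1">>, []),
+%%    {ok, <<"Value1">>} = book_get(Bookie, "Bucket1", "Key1"),
+%%    {async, KeyFolder} = book_returnfolder(Bookie, {keylist, ?STD_TAG}),
+%%    KeyList = KeyFolder(),
+%%    ok = book_close(Bookie).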
+ +%%%============================================================================ +%%% gen_server callbacks +%%%============================================================================ + +init([Opts]) -> + case get_opt(snapshot_bookie, Opts) of + undefined -> + % Start from file not snapshot + {InkerOpts, PencillerOpts} = set_options(Opts), + {Inker, Penciller} = startup(InkerOpts, PencillerOpts), + CacheSize = get_opt(cache_size, Opts, ?CACHE_SIZE), + leveled_log:log("B0001", [Inker, Penciller]), + {ok, #state{inker=Inker, + penciller=Penciller, + cache_size=CacheSize, + ledger_cache=gb_trees:empty(), + is_snapshot=false}}; + Bookie -> + {ok, + {Penciller, LedgerCache}, + Inker} = book_snapshotstore(Bookie, self(), ?SNAPSHOT_TIMEOUT), + ok = leveled_penciller:pcl_loadsnapshot(Penciller, + gb_trees:empty()), + leveled_log:log("B0002", [Inker, Penciller]), + {ok, #state{penciller=Penciller, + inker=Inker, + ledger_cache=LedgerCache, + is_snapshot=true}} + end. + + +handle_call({put, Bucket, Key, Object, IndexSpecs, Tag, TTL}, From, State) -> + LedgerKey = leveled_codec:to_ledgerkey(Bucket, Key, Tag), + {ok, SQN, ObjSize} = leveled_inker:ink_put(State#state.inker, + LedgerKey, + Object, + {IndexSpecs, TTL}), + Changes = preparefor_ledgercache(no_type_assigned, + LedgerKey, + SQN, + Object, + ObjSize, + {IndexSpecs, TTL}), + Cache0 = addto_ledgercache(Changes, State#state.ledger_cache), + % If the previous push to memory was returned then punish this PUT with a + % delay. If the back-pressure in the Penciller continues, these delays + % will beocme more frequent + case State#state.slow_offer of + true -> + timer:sleep(?SLOWOFFER_DELAY); + false -> + ok + end, + gen_server:reply(From, ok), + case maybepush_ledgercache(State#state.cache_size, + Cache0, + State#state.penciller) of + {ok, NewCache} -> + {noreply, State#state{ledger_cache=NewCache, slow_offer=false}}; + {returned, NewCache} -> + {noreply, State#state{ledger_cache=NewCache, slow_offer=true}} + end; +handle_call({get, Bucket, Key, Tag}, _From, State) -> + LedgerKey = leveled_codec:to_ledgerkey(Bucket, Key, Tag), + case fetch_head(LedgerKey, + State#state.penciller, + State#state.ledger_cache) of + not_present -> + {reply, not_found, State}; + Head -> + {Seqn, Status, _MD} = leveled_codec:striphead_to_details(Head), + case Status of + tomb -> + {reply, not_found, State}; + {active, TS} -> + Active = TS >= leveled_codec:integer_now(), + case {Active, + fetch_value(LedgerKey, Seqn, State#state.inker)} of + {_, not_present} -> + {reply, not_found, State}; + {true, Object} -> + {reply, {ok, Object}, State}; + _ -> + {reply, not_found, State} + end + end + end; +handle_call({head, Bucket, Key, Tag}, _From, State) -> + LedgerKey = leveled_codec:to_ledgerkey(Bucket, Key, Tag), + case fetch_head(LedgerKey, + State#state.penciller, + State#state.ledger_cache) of + not_present -> + {reply, not_found, State}; + Head -> + {_Seqn, Status, MD} = leveled_codec:striphead_to_details(Head), + case Status of + tomb -> + {reply, not_found, State}; + {active, TS} -> + case TS >= leveled_codec:integer_now() of + true -> + OMD = leveled_codec:build_metadata_object(LedgerKey, MD), + {reply, {ok, OMD}, State}; + false -> + {reply, not_found, State} + end + end + end; +handle_call({snapshot, _Requestor, SnapType, _Timeout}, _From, State) -> + Reply = snapshot_store(State, SnapType), + {reply, Reply, State}; +handle_call({return_folder, FolderType}, _From, State) -> + case FolderType of + {bucket_stats, Bucket} -> + {reply, + bucket_stats(State, Bucket, 
?STD_TAG), + State}; + {riakbucket_stats, Bucket} -> + {reply, + bucket_stats(State, Bucket, ?RIAK_TAG), + State}; + {index_query, + Bucket, + {IdxField, StartValue, EndValue}, + {ReturnTerms, TermRegex}} -> + {reply, + index_query(State, + Bucket, + {IdxField, StartValue, EndValue}, + {ReturnTerms, TermRegex}), + State}; + {keylist, Tag} -> + {reply, + allkey_query(State, Tag), + State}; + {hashtree_query, Tag, JournalCheck} -> + {reply, + hashtree_query(State, Tag, JournalCheck), + State}; + {foldobjects_allkeys, Tag, FoldObjectsFun} -> + {reply, + foldobjects_allkeys(State, Tag, FoldObjectsFun), + State}; + {foldobjects_bybucket, Tag, Bucket, FoldObjectsFun} -> + {reply, + foldobjects_bybucket(State, Tag, Bucket, FoldObjectsFun), + State}; + {foldobjects_byindex, + Tag, + Bucket, + {Field, FromTerm, ToTerm}, + FoldObjectsFun} -> + {reply, + foldobjects_byindex(State, + Tag, Bucket, + Field, FromTerm, ToTerm, + FoldObjectsFun), + State} + + end; +handle_call({compact_journal, Timeout}, _From, State) -> + ok = leveled_inker:ink_compactjournal(State#state.inker, + self(), + Timeout), + {reply, ok, State}; +handle_call(confirm_compact, _From, State) -> + {reply, leveled_inker:ink_compactionpending(State#state.inker), State}; +handle_call(close, _From, State) -> + {stop, normal, ok, State}. + +handle_cast(_Msg, State) -> + {noreply, State}. + +handle_info(_Info, State) -> + {noreply, State}. + +terminate(Reason, State) -> + leveled_log:log("B0003", [Reason]), + ok = leveled_inker:ink_close(State#state.inker), + ok = leveled_penciller:pcl_close(State#state.penciller). + +code_change(_OldVsn, State, _Extra) -> + {ok, State}. + + +%%%============================================================================ +%%% Internal functions +%%%============================================================================ + +bucket_stats(State, Bucket, Tag) -> + {ok, + {LedgerSnapshot, LedgerCache}, + _JournalSnapshot} = snapshot_store(State, ledger), + Folder = fun() -> + leveled_log:log("B0004", [gb_trees:size(LedgerCache)]), + ok = leveled_penciller:pcl_loadsnapshot(LedgerSnapshot, + LedgerCache), + StartKey = leveled_codec:to_ledgerkey(Bucket, null, Tag), + EndKey = leveled_codec:to_ledgerkey(Bucket, null, Tag), + AccFun = accumulate_size(), + Acc = leveled_penciller:pcl_fetchkeys(LedgerSnapshot, + StartKey, + EndKey, + AccFun, + {0, 0}), + ok = leveled_penciller:pcl_close(LedgerSnapshot), + Acc + end, + {async, Folder}. + +index_query(State, + Bucket, + {IdxField, StartValue, EndValue}, + {ReturnTerms, TermRegex}) -> + {ok, + {LedgerSnapshot, LedgerCache}, + _JournalSnapshot} = snapshot_store(State, ledger), + Folder = fun() -> + leveled_log:log("B0004", [gb_trees:size(LedgerCache)]), + ok = leveled_penciller:pcl_loadsnapshot(LedgerSnapshot, + LedgerCache), + StartKey = leveled_codec:to_ledgerkey(Bucket, null, ?IDX_TAG, + IdxField, StartValue), + EndKey = leveled_codec:to_ledgerkey(Bucket, null, ?IDX_TAG, + IdxField, EndValue), + AddFun = case ReturnTerms of + true -> + fun add_terms/3; + _ -> + fun add_keys/3 + end, + AccFun = accumulate_index(TermRegex, AddFun), + Acc = leveled_penciller:pcl_fetchkeys(LedgerSnapshot, + StartKey, + EndKey, + AccFun, + []), + ok = leveled_penciller:pcl_close(LedgerSnapshot), + Acc + end, + {async, Folder}. 
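+
+%% The closure returned by index_query/4 is run by the caller of
+%% book_returnfolder/2.  For example (bucket, field and terms are
+%% illustrative, following the form used in the tests below):
+%%
+%%    {async, IdxFolder} =
+%%        book_returnfolder(Bookie,
+%%                            {index_query,
+%%                                "Bucket",
+%%                                {"idx1_bin", "f1", "f9"},
+%%                                {true, undefined}}),
+%%    TermKeyList = IdxFolder().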
+ + +hashtree_query(State, Tag, JournalCheck) -> + SnapType = case JournalCheck of + false -> + ledger; + check_presence -> + store + end, + {ok, + {LedgerSnapshot, LedgerCache}, + JournalSnapshot} = snapshot_store(State, SnapType), + Folder = fun() -> + leveled_log:log("B0004", [gb_trees:size(LedgerCache)]), + ok = leveled_penciller:pcl_loadsnapshot(LedgerSnapshot, + LedgerCache), + StartKey = leveled_codec:to_ledgerkey(null, null, Tag), + EndKey = leveled_codec:to_ledgerkey(null, null, Tag), + AccFun = accumulate_hashes(JournalCheck, JournalSnapshot), + Acc = leveled_penciller:pcl_fetchkeys(LedgerSnapshot, + StartKey, + EndKey, + AccFun, + []), + ok = leveled_penciller:pcl_close(LedgerSnapshot), + case JournalCheck of + false -> + ok; + check_presence -> + leveled_inker:ink_close(JournalSnapshot) + end, + Acc + end, + {async, Folder}. + + +foldobjects_allkeys(State, Tag, FoldObjectsFun) -> + StartKey = leveled_codec:to_ledgerkey(null, null, Tag), + EndKey = leveled_codec:to_ledgerkey(null, null, Tag), + foldobjects(State, Tag, StartKey, EndKey, FoldObjectsFun). + +foldobjects_bybucket(State, Tag, Bucket, FoldObjectsFun) -> + StartKey = leveled_codec:to_ledgerkey(Bucket, null, Tag), + EndKey = leveled_codec:to_ledgerkey(Bucket, null, Tag), + foldobjects(State, Tag, StartKey, EndKey, FoldObjectsFun). + +foldobjects_byindex(State, Tag, Bucket, Field, FromTerm, ToTerm, FoldObjectsFun) -> + StartKey = leveled_codec:to_ledgerkey(Bucket, null, ?IDX_TAG, Field, + FromTerm), + EndKey = leveled_codec:to_ledgerkey(Bucket, null, ?IDX_TAG, Field, + ToTerm), + foldobjects(State, Tag, StartKey, EndKey, FoldObjectsFun). + +foldobjects(State, Tag, StartKey, EndKey, FoldObjectsFun) -> + {ok, + {LedgerSnapshot, LedgerCache}, + JournalSnapshot} = snapshot_store(State, store), + {FoldFun, InitAcc} = case is_tuple(FoldObjectsFun) of + true -> + FoldObjectsFun; + false -> + {FoldObjectsFun, []} + end, + Folder = fun() -> + leveled_log:log("B0004", [gb_trees:size(LedgerCache)]), + ok = leveled_penciller:pcl_loadsnapshot(LedgerSnapshot, + LedgerCache), + AccFun = accumulate_objects(FoldFun, JournalSnapshot, Tag), + Acc = leveled_penciller:pcl_fetchkeys(LedgerSnapshot, + StartKey, + EndKey, + AccFun, + InitAcc), + ok = leveled_penciller:pcl_close(LedgerSnapshot), + ok = leveled_inker:ink_close(JournalSnapshot), + Acc + end, + {async, Folder}. + + +allkey_query(State, Tag) -> + {ok, + {LedgerSnapshot, LedgerCache}, + _JournalSnapshot} = snapshot_store(State, ledger), + Folder = fun() -> + leveled_log:log("B0004", [gb_trees:size(LedgerCache)]), + ok = leveled_penciller:pcl_loadsnapshot(LedgerSnapshot, + LedgerCache), + SK = leveled_codec:to_ledgerkey(null, null, Tag), + EK = leveled_codec:to_ledgerkey(null, null, Tag), + AccFun = accumulate_keys(), + Acc = leveled_penciller:pcl_fetchkeys(LedgerSnapshot, + SK, + EK, + AccFun, + []), + ok = leveled_penciller:pcl_close(LedgerSnapshot), + lists:reverse(Acc) + end, + {async, Folder}. + + +snapshot_store(State, SnapType) -> + PCLopts = #penciller_options{start_snapshot=true, + source_penciller=State#state.penciller}, + {ok, LedgerSnapshot} = leveled_penciller:pcl_start(PCLopts), + case SnapType of + store -> + InkerOpts = #inker_options{start_snapshot=true, + source_inker=State#state.inker}, + {ok, JournalSnapshot} = leveled_inker:ink_start(InkerOpts), + {ok, {LedgerSnapshot, State#state.ledger_cache}, + JournalSnapshot}; + ledger -> + {ok, {LedgerSnapshot, State#state.ledger_cache}, + null} + end. 
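+
+%% Object folds follow the same pattern, with the fold fun applied to each
+%% Bucket, Key and Journal Value, e.g. (illustrative only):
+%%
+%%    FoldObjectsFun =
+%%        fun(B, K, V, Acc) -> [{B, K, erlang:phash2(term_to_binary(V))}|Acc] end,
+%%    {async, ObjFolder} =
+%%        book_returnfolder(Bookie,
+%%                            {foldobjects_allkeys, ?STD_TAG, FoldObjectsFun}),
+%%    Objects = ObjFolder().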
+ +set_options(Opts) -> + MaxJournalSize = get_opt(max_journalsize, Opts, 10000000000), + + AltStrategy = get_opt(reload_strategy, Opts, []), + ReloadStrategy = leveled_codec:inker_reload_strategy(AltStrategy), + + PCLL0CacheSize = get_opt(max_pencillercachesize, Opts), + RootPath = get_opt(root_path, Opts), + JournalFP = RootPath ++ "/" ++ ?JOURNAL_FP, + LedgerFP = RootPath ++ "/" ++ ?LEDGER_FP, + ok =filelib:ensure_dir(JournalFP), + ok =filelib:ensure_dir(LedgerFP), + + {#inker_options{root_path = JournalFP, + reload_strategy = ReloadStrategy, + max_run_length = get_opt(max_run_length, Opts), + cdb_options = #cdb_options{max_size=MaxJournalSize, + binary_mode=true}}, + #penciller_options{root_path = LedgerFP, + max_inmemory_tablesize = PCLL0CacheSize}}. + +startup(InkerOpts, PencillerOpts) -> + {ok, Inker} = leveled_inker:ink_start(InkerOpts), + {ok, Penciller} = leveled_penciller:pcl_start(PencillerOpts), + LedgerSQN = leveled_penciller:pcl_getstartupsequencenumber(Penciller), + leveled_log:log("B0005", [LedgerSQN]), + ok = leveled_inker:ink_loadpcl(Inker, + LedgerSQN + 1, + fun load_fun/5, + Penciller), + {Inker, Penciller}. + + +fetch_head(Key, Penciller, LedgerCache) -> + case gb_trees:lookup(Key, LedgerCache) of + {value, Head} -> + Head; + none -> + case leveled_penciller:pcl_fetch(Penciller, Key) of + {Key, Head} -> + Head; + not_present -> + not_present + end + end. + +fetch_value(Key, SQN, Inker) -> + case leveled_inker:ink_fetch(Inker, Key, SQN) of + {ok, Value} -> + Value; + not_present -> + not_present + end. + + +accumulate_size() -> + Now = leveled_codec:integer_now(), + AccFun = fun(Key, Value, {Size, Count}) -> + case leveled_codec:is_active(Key, Value, Now) of + true -> + {Size + leveled_codec:get_size(Key, Value), + Count + 1}; + false -> + {Size, Count} + end + end, + AccFun. + +accumulate_hashes(JournalCheck, InkerClone) -> + Now = leveled_codec:integer_now(), + AccFun = fun(LK, V, KHList) -> + case leveled_codec:is_active(LK, V, Now) of + true -> + {B, K, H} = leveled_codec:get_keyandhash(LK, V), + Check = random:uniform() < ?CHECKJOURNAL_PROB, + case {JournalCheck, Check} of + {check_presence, true} -> + case check_presence(LK, V, InkerClone) of + true -> + [{B, K, H}|KHList]; + false -> + KHList + end; + _ -> + [{B, K, H}|KHList] + end; + false -> + KHList + end + end, + AccFun. + +accumulate_objects(FoldObjectsFun, InkerClone, Tag) -> + Now = leveled_codec:integer_now(), + AccFun = fun(LK, V, Acc) -> + case leveled_codec:is_active(LK, V, Now) of + true -> + SQN = leveled_codec:strip_to_seqonly({LK, V}), + {B, K} = case leveled_codec:from_ledgerkey(LK) of + {B0, K0} -> {B0, K0}; + {B0, K0, _T0} -> {B0, K0} + end, + QK = leveled_codec:to_ledgerkey(B, K, Tag), + R = leveled_inker:ink_fetch(InkerClone, QK, SQN), + case R of + {ok, Value} -> + FoldObjectsFun(B, K, Value, Acc); + not_present -> + Acc + end; + false -> + Acc + end + end, + AccFun. + + + + +check_presence(Key, Value, InkerClone) -> + {LedgerKey, SQN} = leveled_codec:strip_to_keyseqonly({Key, Value}), + case leveled_inker:ink_keycheck(InkerClone, LedgerKey, SQN) of + probably -> + true; + missing -> + false + end. + +accumulate_keys() -> + Now = leveled_codec:integer_now(), + AccFun = fun(Key, Value, KeyList) -> + case leveled_codec:is_active(Key, Value, Now) of + true -> + [leveled_codec:from_ledgerkey(Key)|KeyList]; + false -> + KeyList + end + end, + AccFun. + +add_keys(ObjKey, _IdxValue, Acc) -> + Acc ++ [ObjKey]. + +add_terms(ObjKey, IdxValue, Acc) -> + Acc ++ [{IdxValue, ObjKey}]. 
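+
+%% add_keys/3 and add_terms/3 above accumulate results from index entries;
+%% those entries are created from the IndexSpecs passed on PUT.  A
+%% hypothetical spec, following the form used in the tests at the end of this
+%% module:
+%%
+%%    Spec = [{add, "idx1_bin", "f3"}],
+%%    ok = book_put(Bookie, "Bucket", "Key3", <<"Value3">>, Spec, ?STD_TAG),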
+ +accumulate_index(TermRe, AddFun) -> + Now = leveled_codec:integer_now(), + case TermRe of + undefined -> + fun(Key, Value, Acc) -> + case leveled_codec:is_active(Key, Value, Now) of + true -> + {_Bucket, + ObjKey, + IdxValue} = leveled_codec:from_ledgerkey(Key), + AddFun(ObjKey, IdxValue, Acc); + false -> + Acc + end end; + TermRe -> + fun(Key, Value, Acc) -> + case leveled_codec:is_active(Key, Value, Now) of + true -> + {_Bucket, + ObjKey, + IdxValue} = leveled_codec:from_ledgerkey(Key), + case re:run(IdxValue, TermRe) of + nomatch -> + Acc; + _ -> + AddFun(ObjKey, IdxValue, Acc) + end; + false -> + Acc + end end + end. + + +preparefor_ledgercache(?INKT_KEYD, + LedgerKey, SQN, _Obj, _Size, {IndexSpecs, TTL}) -> + {Bucket, Key} = leveled_codec:from_ledgerkey(LedgerKey), + leveled_codec:convert_indexspecs(IndexSpecs, Bucket, Key, SQN, TTL); +preparefor_ledgercache(_Type, LedgerKey, SQN, Obj, Size, {IndexSpecs, TTL}) -> + {Bucket, Key, PrimaryChange} = leveled_codec:generate_ledgerkv(LedgerKey, + SQN, + Obj, + Size, + TTL), + [PrimaryChange] ++ leveled_codec:convert_indexspecs(IndexSpecs, + Bucket, + Key, + SQN, + TTL). + + +addto_ledgercache(Changes, Cache) -> + lists:foldl(fun({K, V}, Acc) -> gb_trees:enter(K, V, Acc) end, + Cache, + Changes). + +maybepush_ledgercache(MaxCacheSize, Cache, Penciller) -> + CacheSize = gb_trees:size(Cache), + TimeToPush = maybe_withjitter(CacheSize, MaxCacheSize), + if + TimeToPush -> + case leveled_penciller:pcl_pushmem(Penciller, Cache) of + ok -> + {ok, gb_trees:empty()}; + returned -> + {returned, Cache} + end; + true -> + {ok, Cache} + end. + + +maybe_withjitter(CacheSize, MaxCacheSize) -> + if + CacheSize > MaxCacheSize -> + R = random:uniform(7 * MaxCacheSize), + if + (CacheSize - MaxCacheSize) > R -> + true; + true -> + false + end; + true -> + false + end. + + + +load_fun(KeyInLedger, ValueInLedger, _Position, Acc0, ExtractFun) -> + {MinSQN, MaxSQN, OutputTree} = Acc0, + {SQN, Type, PK} = KeyInLedger, + % VBin may already be a term + {VBin, VSize} = ExtractFun(ValueInLedger), + {Obj, IndexSpecs} = leveled_codec:split_inkvalue(VBin), + case SQN of + SQN when SQN < MinSQN -> + {loop, Acc0}; + SQN when SQN < MaxSQN -> + Changes = preparefor_ledgercache(Type, PK, SQN, + Obj, VSize, IndexSpecs), + {loop, {MinSQN, MaxSQN, addto_ledgercache(Changes, OutputTree)}}; + MaxSQN -> + leveled_log:log("B0006", [SQN]), + Changes = preparefor_ledgercache(Type, PK, SQN, + Obj, VSize, IndexSpecs), + {stop, {MinSQN, MaxSQN, addto_ledgercache(Changes, OutputTree)}}; + SQN when SQN > MaxSQN -> + leveled_log:log("B0007", [MaxSQN, SQN]), + {stop, Acc0} + end. + + +get_opt(Key, Opts) -> + get_opt(Key, Opts, undefined). + +get_opt(Key, Opts, Default) -> + case proplists:get_value(Key, Opts) of + undefined -> + case application:get_env(?MODULE, Key) of + {ok, Value} -> + Value; + undefined -> + Default + end; + Value -> + Value + end. + + +%%%============================================================================ +%%% Test +%%%============================================================================ + +-ifdef(TEST). + +reset_filestructure() -> + RootPath = "../test", + leveled_inker:clean_testdir(RootPath ++ "/" ++ ?JOURNAL_FP), + leveled_penciller:clean_testdir(RootPath ++ "/" ++ ?LEDGER_FP), + RootPath. + + +generate_multiple_objects(Count, KeyNumber) -> + generate_multiple_objects(Count, KeyNumber, []). 
+ +generate_multiple_objects(0, _KeyNumber, ObjL) -> + ObjL; +generate_multiple_objects(Count, KeyNumber, ObjL) -> + Key = "Key" ++ integer_to_list(KeyNumber), + Value = crypto:rand_bytes(256), + IndexSpec = [{add, "idx1_bin", "f" ++ integer_to_list(KeyNumber rem 10)}], + generate_multiple_objects(Count - 1, + KeyNumber + 1, + ObjL ++ [{Key, Value, IndexSpec}]). + + +generate_multiple_robjects(Count, KeyNumber) -> + generate_multiple_robjects(Count, KeyNumber, []). + +generate_multiple_robjects(0, _KeyNumber, ObjL) -> + ObjL; +generate_multiple_robjects(Count, KeyNumber, ObjL) -> + Obj = {"Bucket", + "Key" ++ integer_to_list(KeyNumber), + crypto:rand_bytes(1024), + [], + [{"MDK", "MDV" ++ integer_to_list(KeyNumber)}, + {"MDK2", "MDV" ++ integer_to_list(KeyNumber)}]}, + {B1, K1, V1, Spec1, MD} = Obj, + Content = #r_content{metadata=MD, value=V1}, + Obj1 = #r_object{bucket=B1, key=K1, contents=[Content], vclock=[{'a',1}]}, + generate_multiple_robjects(Count - 1, KeyNumber + 1, ObjL ++ [{Obj1, Spec1}]). + + +single_key_test() -> + RootPath = reset_filestructure(), + {ok, Bookie1} = book_start([{root_path, RootPath}]), + {B1, K1, V1, Spec1, MD} = {"Bucket1", + "Key1", + "Value1", + [], + {"MDK1", "MDV1"}}, + Content = #r_content{metadata=MD, value=V1}, + Object = #r_object{bucket=B1, key=K1, contents=[Content], vclock=[{'a',1}]}, + ok = book_riakput(Bookie1, Object, Spec1), + {ok, F1} = book_riakget(Bookie1, B1, K1), + ?assertMatch(F1, Object), + ok = book_close(Bookie1), + {ok, Bookie2} = book_start([{root_path, RootPath}]), + {ok, F2} = book_riakget(Bookie2, B1, K1), + ?assertMatch(F2, Object), + ok = book_close(Bookie2), + reset_filestructure(). + +multi_key_test() -> + RootPath = reset_filestructure(), + {ok, Bookie1} = book_start([{root_path, RootPath}]), + {B1, K1, V1, Spec1, MD1} = {"Bucket", + "Key1", + "Value1", + [], + {"MDK1", "MDV1"}}, + C1 = #r_content{metadata=MD1, value=V1}, + Obj1 = #r_object{bucket=B1, key=K1, contents=[C1], vclock=[{'a',1}]}, + {B2, K2, V2, Spec2, MD2} = {"Bucket", + "Key2", + "Value2", + [], + {"MDK2", "MDV2"}}, + C2 = #r_content{metadata=MD2, value=V2}, + Obj2 = #r_object{bucket=B2, key=K2, contents=[C2], vclock=[{'a',1}]}, + ok = book_riakput(Bookie1, Obj1, Spec1), + ObjL1 = generate_multiple_robjects(100, 3), + SW1 = os:timestamp(), + lists:foreach(fun({O, S}) -> ok = book_riakput(Bookie1, O, S) end, ObjL1), + io:format("PUT of 100 objects completed in ~w microseconds~n", + [timer:now_diff(os:timestamp(),SW1)]), + ok = book_riakput(Bookie1, Obj2, Spec2), + {ok, F1A} = book_riakget(Bookie1, B1, K1), + ?assertMatch(F1A, Obj1), + {ok, F2A} = book_riakget(Bookie1, B2, K2), + ?assertMatch(F2A, Obj2), + ObjL2 = generate_multiple_robjects(100, 103), + SW2 = os:timestamp(), + lists:foreach(fun({O, S}) -> ok = book_riakput(Bookie1, O, S) end, ObjL2), + io:format("PUT of 100 objects completed in ~w microseconds~n", + [timer:now_diff(os:timestamp(),SW2)]), + {ok, F1B} = book_riakget(Bookie1, B1, K1), + ?assertMatch(F1B, Obj1), + {ok, F2B} = book_riakget(Bookie1, B2, K2), + ?assertMatch(F2B, Obj2), + ok = book_close(Bookie1), + % Now reopen the file, and confirm that a fetch is still possible + {ok, Bookie2} = book_start([{root_path, RootPath}]), + {ok, F1C} = book_riakget(Bookie2, B1, K1), + ?assertMatch(F1C, Obj1), + {ok, F2C} = book_riakget(Bookie2, B2, K2), + ?assertMatch(F2C, Obj2), + ObjL3 = generate_multiple_robjects(100, 203), + SW3 = os:timestamp(), + lists:foreach(fun({O, S}) -> ok = book_riakput(Bookie2, O, S) end, ObjL3), + io:format("PUT of 100 objects 
completed in ~w microseconds~n", + [timer:now_diff(os:timestamp(),SW3)]), + {ok, F1D} = book_riakget(Bookie2, B1, K1), + ?assertMatch(F1D, Obj1), + {ok, F2D} = book_riakget(Bookie2, B2, K2), + ?assertMatch(F2D, Obj2), + ok = book_close(Bookie2), + reset_filestructure(). + +ttl_test() -> + RootPath = reset_filestructure(), + {ok, Bookie1} = book_start([{root_path, RootPath}]), + ObjL1 = generate_multiple_objects(100, 1), + % Put in all the objects with a TTL in the future + Future = leveled_codec:integer_now() + 300, + lists:foreach(fun({K, V, S}) -> ok = book_tempput(Bookie1, + "Bucket", K, V, S, + ?STD_TAG, + Future) end, + ObjL1), + lists:foreach(fun({K, V, _S}) -> + {ok, V} = book_get(Bookie1, "Bucket", K, ?STD_TAG) + end, + ObjL1), + lists:foreach(fun({K, _V, _S}) -> + {ok, _} = book_head(Bookie1, "Bucket", K, ?STD_TAG) + end, + ObjL1), + + ObjL2 = generate_multiple_objects(100, 101), + Past = leveled_codec:integer_now() - 300, + lists:foreach(fun({K, V, S}) -> ok = book_tempput(Bookie1, + "Bucket", K, V, S, + ?STD_TAG, + Past) end, + ObjL2), + lists:foreach(fun({K, _V, _S}) -> + not_found = book_get(Bookie1, "Bucket", K, ?STD_TAG) + end, + ObjL2), + lists:foreach(fun({K, _V, _S}) -> + not_found = book_head(Bookie1, "Bucket", K, ?STD_TAG) + end, + ObjL2), + + {async, BucketFolder} = book_returnfolder(Bookie1, + {bucket_stats, "Bucket"}), + {_Size, Count} = BucketFolder(), + ?assertMatch(100, Count), + {async, + IndexFolder} = book_returnfolder(Bookie1, + {index_query, + "Bucket", + {"idx1_bin", "f8", "f9"}, + {false, undefined}}), + KeyList = IndexFolder(), + ?assertMatch(20, length(KeyList)), + + {ok, Regex} = re:compile("f8"), + {async, + IndexFolderTR} = book_returnfolder(Bookie1, + {index_query, + "Bucket", + {"idx1_bin", "f8", "f9"}, + {true, Regex}}), + TermKeyList = IndexFolderTR(), + ?assertMatch(10, length(TermKeyList)), + + ok = book_close(Bookie1), + {ok, Bookie2} = book_start([{root_path, RootPath}]), + + {async, + IndexFolderTR2} = book_returnfolder(Bookie2, + {index_query, + "Bucket", + {"idx1_bin", "f7", "f9"}, + {false, Regex}}), + KeyList2 = IndexFolderTR2(), + ?assertMatch(10, length(KeyList2)), + + lists:foreach(fun({K, _V, _S}) -> + not_found = book_get(Bookie2, "Bucket", K, ?STD_TAG) + end, + ObjL2), + lists:foreach(fun({K, _V, _S}) -> + not_found = book_head(Bookie2, "Bucket", K, ?STD_TAG) + end, + ObjL2), + + ok = book_close(Bookie2), + reset_filestructure(). 
+ +hashtree_query_test() -> + RootPath = reset_filestructure(), + {ok, Bookie1} = book_start([{root_path, RootPath}, + {max_journalsize, 1000000}, + {cache_size, 500}]), + ObjL1 = generate_multiple_objects(1200, 1), + % Put in all the objects with a TTL in the future + Future = leveled_codec:integer_now() + 300, + lists:foreach(fun({K, V, S}) -> ok = book_tempput(Bookie1, + "Bucket", K, V, S, + ?STD_TAG, + Future) end, + ObjL1), + ObjL2 = generate_multiple_objects(20, 1201), + % Put in a few objects with a TTL in the past + Past = leveled_codec:integer_now() - 300, + lists:foreach(fun({K, V, S}) -> ok = book_tempput(Bookie1, + "Bucket", K, V, S, + ?STD_TAG, + Past) end, + ObjL2), + % Scan the store for the Bucket, Keys and Hashes + {async, HTFolder} = book_returnfolder(Bookie1, + {hashtree_query, + ?STD_TAG, + false}), + KeyHashList = HTFolder(), + lists:foreach(fun({B, _K, H}) -> + ?assertMatch("Bucket", B), + ?assertMatch(true, is_integer(H)) + end, + KeyHashList), + ?assertMatch(1200, length(KeyHashList)), + ok = book_close(Bookie1), + {ok, Bookie2} = book_start([{root_path, RootPath}, + {max_journalsize, 200000}, + {cache_size, 500}]), + {async, HTFolder2} = book_returnfolder(Bookie2, + {hashtree_query, + ?STD_TAG, + false}), + ?assertMatch(KeyHashList, HTFolder2()), + ok = book_close(Bookie2), + reset_filestructure(). + +hashtree_query_withjournalcheck_test() -> + RootPath = reset_filestructure(), + {ok, Bookie1} = book_start([{root_path, RootPath}, + {max_journalsize, 1000000}, + {cache_size, 500}]), + ObjL1 = generate_multiple_objects(800, 1), + % Put in all the objects with a TTL in the future + Future = leveled_codec:integer_now() + 300, + lists:foreach(fun({K, V, S}) -> ok = book_tempput(Bookie1, + "Bucket", K, V, S, + ?STD_TAG, + Future) end, + ObjL1), + {async, HTFolder1} = book_returnfolder(Bookie1, + {hashtree_query, + ?STD_TAG, + false}), + KeyHashList = HTFolder1(), + {async, HTFolder2} = book_returnfolder(Bookie1, + {hashtree_query, + ?STD_TAG, + check_presence}), + ?assertMatch(KeyHashList, HTFolder2()), + ok = book_close(Bookie1), + reset_filestructure(). + +foldobjects_vs_hashtree_test() -> + RootPath = reset_filestructure(), + {ok, Bookie1} = book_start([{root_path, RootPath}, + {max_journalsize, 1000000}, + {cache_size, 500}]), + ObjL1 = generate_multiple_objects(800, 1), + % Put in all the objects with a TTL in the future + Future = leveled_codec:integer_now() + 300, + lists:foreach(fun({K, V, S}) -> ok = book_tempput(Bookie1, + "Bucket", K, V, S, + ?STD_TAG, + Future) end, + ObjL1), + {async, HTFolder1} = book_returnfolder(Bookie1, + {hashtree_query, + ?STD_TAG, + false}), + KeyHashList1 = lists:usort(HTFolder1()), + io:format("First item ~w~n", [lists:nth(1, KeyHashList1)]), + FoldObjectsFun = fun(B, K, V, Acc) -> + [{B, K, erlang:phash2(term_to_binary(V))}|Acc] end, + {async, HTFolder2} = book_returnfolder(Bookie1, + {foldobjects_allkeys, + ?STD_TAG, + FoldObjectsFun}), + KeyHashList2 = HTFolder2(), + ?assertMatch(KeyHashList1, lists:usort(KeyHashList2)), + + ok = book_close(Bookie1), + reset_filestructure(). + + +-endif. diff --git a/src/leveled_cdb.erl b/src/leveled_cdb.erl new file mode 100644 index 0000000..783f545 --- /dev/null +++ b/src/leveled_cdb.erl @@ -0,0 +1,1809 @@ +%% +%% This is a modified version of the cdb module provided by Tom Whitcomb. 
+%% +%% - https://github.com/thomaswhitcomb/erlang-cdb +%% +%% The primary differences are: +%% - Support for incrementally writing a CDB file while keeping the hash table +%% in memory +%% - The ability to scan a database and accumulate all the Key, Values to +%% rebuild in-memory tables on startup +%% - The ability to scan a database in blocks of sequence numbers +%% +%% This is to be used in eleveledb, and in this context: +%% - Keys will be a combinatio of the PrimaryKey and the Sequence Number +%% - Values will be a serialised version on the whole object, and the +%% IndexChanges associated with the transaction +%% Where the IndexChanges are all the Key changes required to be added to the +%% ledger to complete the changes (the addition of postings and tombstones). +%% +%% This module provides functions to create and query a CDB (constant database). +%% A CDB implements a two-level hashtable which provides fast {key,value} +%% lookups that remain fairly constant in speed regardless of the CDBs size. +%% +%% The first level in the CDB occupies the first 255 doublewords in the file. +%% Each doubleword slot contains two values. The first is a file pointer to +%% the primary hashtable (at the end of the file) and the second value is the +%% number of entries in the hashtable. The first level table of 255 entries +%% is indexed with the lower eight bits of the hash of the input key. +%% +%% Following the 255 doublewords are the {key,value} tuples. The tuples are +%% packed in the file without regard to word boundaries. Each {key,value} +%% tuple is represented with a four byte key length, a four byte value length, +%% the actual key value followed by the actual value. +%% +%% Following the {key,value} tuples are the primary hash tables. There are +%% at most 255 hash tables. Each hash table is referenced by one of the 255 +%% doubleword entries at the top of the file. For efficiency reasons, each +%% hash table is allocated twice the number of entries that it will need. +%% Each entry in the hash table is a doubleword. +%% The first word is the corresponding hash value and the second word is a +%% file pointer to the actual {key,value} tuple higher in the file. +%% +%% + + +-module(leveled_cdb). + +-behaviour(gen_fsm). +-include("include/leveled.hrl"). + +-export([init/1, + handle_sync_event/4, + handle_event/3, + handle_info/3, + terminate/3, + code_change/4, + starting/3, + writer/3, + writer/2, + rolling/2, + rolling/3, + reader/3, + reader/2, + delete_pending/3, + delete_pending/2]). + +-export([cdb_open_writer/1, + cdb_open_writer/2, + cdb_open_reader/1, + cdb_get/2, + cdb_put/3, + cdb_mput/2, + cdb_getpositions/2, + cdb_directfetch/3, + cdb_lastkey/1, + cdb_firstkey/1, + cdb_filename/1, + cdb_keycheck/2, + cdb_scan/4, + cdb_close/1, + cdb_complete/1, + cdb_roll/1, + cdb_returnhashtable/3, + cdb_destroy/1, + cdb_deletepending/1, + cdb_deletepending/3, + hashtable_calc/2]). + +-include_lib("eunit/include/eunit.hrl"). + +-define(DWORD_SIZE, 8). +-define(WORD_SIZE, 4). +-define(MAX_FILE_SIZE, 3221225472). +-define(BINARY_MODE, false). +-define(BASE_POSITION, 2048). +-define(WRITE_OPS, [binary, raw, read, write]). +-define(PENDING_ROLL_WAIT, 30). +-define(DELETE_TIMEOUT, 10000). + +-record(state, {hashtree, + last_position :: integer(), + last_key = empty, + hash_index = [] :: list(), + filename :: string(), + handle :: file:fd(), + max_size :: integer(), + binary_mode = false :: boolean(), + delete_point = 0 :: integer(), + inker :: pid(), + deferred_delete = false :: boolean()}). 
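+
+%% To find a key in a completed file, the lower eight bits of the key's hash
+%% select a top-level slot; that slot holds the position of the relevant hash
+%% table and its entry count, and the hash table is then probed for the key.
+%% A rough sketch of the first step (hash/1 and read_next_2_integers/1 are
+%% defined later in this module; the band is an assumption matching the
+%% "lower eight bits" description above):
+%%
+%%    Index = hash(Key) band 255,
+%%    {ok, _} = file:position(Handle, {bof, ?DWORD_SIZE * Index}),
+%%    {HashTablePos, Count} = read_next_2_integers(Handle),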
+ + +%%%============================================================================ +%%% API +%%%============================================================================ + +cdb_open_writer(Filename) -> + %% No options passed + cdb_open_writer(Filename, #cdb_options{binary_mode=true}). + +cdb_open_writer(Filename, Opts) -> + {ok, Pid} = gen_fsm:start(?MODULE, [Opts], []), + ok = gen_fsm:sync_send_event(Pid, {open_writer, Filename}, infinity), + {ok, Pid}. + +cdb_open_reader(Filename) -> + cdb_open_reader(Filename, #cdb_options{binary_mode=true}). + +cdb_open_reader(Filename, Opts) -> + {ok, Pid} = gen_fsm:start(?MODULE, [Opts], []), + ok = gen_fsm:sync_send_event(Pid, {open_reader, Filename}, infinity), + {ok, Pid}. + +cdb_get(Pid, Key) -> + gen_fsm:sync_send_event(Pid, {get_kv, Key}, infinity). + +cdb_put(Pid, Key, Value) -> + gen_fsm:sync_send_event(Pid, {put_kv, Key, Value}, infinity). + +cdb_mput(Pid, KVList) -> + gen_fsm:sync_send_event(Pid, {mput_kv, KVList}, infinity). + +%% SampleSize can be an integer or the atom all +cdb_getpositions(Pid, SampleSize) -> + gen_fsm:sync_send_event(Pid, {get_positions, SampleSize}, infinity). + +%% Info can be key_only, key_size (size being the size of the value) or +%% key_value_check (with the check part indicating if the CRC is correct for +%% the value) +cdb_directfetch(Pid, PositionList, Info) -> + gen_fsm:sync_send_event(Pid, {direct_fetch, PositionList, Info}, infinity). + +cdb_close(Pid) -> + cdb_close(Pid, ?PENDING_ROLL_WAIT). + +cdb_close(Pid, WaitsLeft) -> + if + WaitsLeft > 0 -> + case gen_fsm:sync_send_all_state_event(Pid, cdb_close, infinity) of + pending_roll -> + timer:sleep(1), + cdb_close(Pid, WaitsLeft - 1); + R -> + R + end; + true -> + gen_fsm:sync_send_event(Pid, cdb_kill, infinity) + end. + +cdb_complete(Pid) -> + gen_fsm:sync_send_event(Pid, cdb_complete, infinity). + +cdb_roll(Pid) -> + gen_fsm:send_event(Pid, cdb_roll). + +cdb_returnhashtable(Pid, IndexList, HashTreeBin) -> + gen_fsm:sync_send_event(Pid, {return_hashtable, IndexList, HashTreeBin}, infinity). + +cdb_destroy(Pid) -> + gen_fsm:send_event(Pid, destroy). + +cdb_deletepending(Pid) -> + cdb_deletepending(Pid, 0, no_poll). + +cdb_deletepending(Pid, ManSQN, Inker) -> + gen_fsm:send_event(Pid, {delete_pending, ManSQN, Inker}). + +%% cdb_scan returns {LastPosition, Acc}. Use LastPosition as StartPosiiton to +%% continue from that point (calling function has to protect against) double +%% counting. +%% +%% LastPosition could be the atom complete when the last key processed was at +%% the end of the file. last_key must be defined in LoopState. + +cdb_scan(Pid, FilterFun, InitAcc, StartPosition) -> + gen_fsm:sync_send_all_state_event(Pid, + {cdb_scan, + FilterFun, + InitAcc, + StartPosition}, + infinity). + +%% Get the last key to be added to the file (which will have the highest +%% sequence number) +cdb_lastkey(Pid) -> + gen_fsm:sync_send_all_state_event(Pid, cdb_lastkey, infinity). + +cdb_firstkey(Pid) -> + gen_fsm:sync_send_all_state_event(Pid, cdb_firstkey, infinity). + +%% Get the filename of the database +cdb_filename(Pid) -> + gen_fsm:sync_send_all_state_event(Pid, cdb_filename, infinity). + +%% Check to see if the key is probably present, will return either +%% probably or missing. Does not do a definitive check +cdb_keycheck(Pid, Key) -> + gen_fsm:sync_send_event(Pid, {key_check, Key}, infinity). 
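+
+%% A minimal usage sketch of the API above (the filename is an illustrative
+%% assumption, and cdb_put may instead return roll if the file is full):
+%%
+%%    {ok, W} = cdb_open_writer("test/journal1.pnd"),
+%%    ok = cdb_put(W, <<"Key1">>, <<"Value1">>),
+%%    {ok, FN} = cdb_complete(W),
+%%    {ok, R} = cdb_open_reader(FN),
+%%    {<<"Key1">>, <<"Value1">>} = cdb_get(R, <<"Key1">>),
+%%    probably = cdb_keycheck(R, <<"Key1">>),
+%%    ok = cdb_close(R).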
+ +%%%============================================================================ +%%% gen_server callbacks +%%%============================================================================ + +init([Opts]) -> + MaxSize = case Opts#cdb_options.max_size of + undefined -> + ?MAX_FILE_SIZE; + M -> + M + end, + {ok, + starting, + #state{max_size=MaxSize, binary_mode=Opts#cdb_options.binary_mode}}. + +starting({open_writer, Filename}, _From, State) -> + leveled_log:log("CDB01", [Filename]), + {LastPosition, HashTree, LastKey} = open_active_file(Filename), + {ok, Handle} = file:open(Filename, [sync | ?WRITE_OPS]), + {reply, ok, writer, State#state{handle=Handle, + last_position=LastPosition, + last_key=LastKey, + filename=Filename, + hashtree=HashTree}}; +starting({open_reader, Filename}, _From, State) -> + leveled_log:log("CDB02", [Filename]), + {Handle, Index, LastKey} = open_for_readonly(Filename), + {reply, ok, reader, State#state{handle=Handle, + last_key=LastKey, + filename=Filename, + hash_index=Index}}. + +writer({get_kv, Key}, _From, State) -> + {reply, + get_mem(Key, State#state.handle, State#state.hashtree), + writer, + State}; +writer({key_check, Key}, _From, State) -> + {reply, + get_mem(Key, State#state.handle, State#state.hashtree, loose_presence), + writer, + State}; +writer({put_kv, Key, Value}, _From, State) -> + Result = put(State#state.handle, + Key, + Value, + {State#state.last_position, State#state.hashtree}, + State#state.binary_mode, + State#state.max_size), + case Result of + roll -> + %% Key and value could not be written + {reply, roll, writer, State}; + {UpdHandle, NewPosition, HashTree} -> + {reply, ok, writer, State#state{handle=UpdHandle, + last_position=NewPosition, + last_key=Key, + hashtree=HashTree}} + end; +writer({mput_kv, []}, _From, State) -> + {reply, ok, writer, State}; +writer({mput_kv, KVList}, _From, State) -> + Result = mput(State#state.handle, + KVList, + {State#state.last_position, State#state.hashtree}, + State#state.binary_mode, + State#state.max_size), + case Result of + roll -> + %% Keys and values could not be written + {reply, roll, writer, State}; + {UpdHandle, NewPosition, HashTree, LastKey} -> + {reply, ok, writer, State#state{handle=UpdHandle, + last_position=NewPosition, + last_key=LastKey, + hashtree=HashTree}} + end; +writer(cdb_complete, _From, State) -> + NewName = determine_new_filename(State#state.filename), + ok = close_file(State#state.handle, + State#state.hashtree, + State#state.last_position), + ok = rename_for_read(State#state.filename, NewName), + {stop, normal, {ok, NewName}, State}. + +writer(cdb_roll, State) -> + ok = leveled_iclerk:clerk_hashtablecalc(State#state.hashtree, + State#state.last_position, + self()), + {next_state, rolling, State}. 
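+
+%% A writer replies roll to a put when the write would take the file over its
+%% maximum size.  The owning process (the inker in this store) is then
+%% expected to open a new active file for the write and ask this one to roll
+%% in the background, along these lines (NextFilename is hypothetical):
+%%
+%%    case cdb_put(OldPid, Key, Value) of
+%%        roll ->
+%%            ok = cdb_roll(OldPid),
+%%            {ok, NewPid} = cdb_open_writer(NextFilename),
+%%            ok = cdb_put(NewPid, Key, Value);
+%%        ok ->
+%%            ok
+%%    end,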
+ + +rolling({get_kv, Key}, _From, State) -> + {reply, + get_mem(Key, State#state.handle, State#state.hashtree), + rolling, + State}; +rolling({key_check, Key}, _From, State) -> + {reply, + get_mem(Key, State#state.handle, State#state.hashtree, loose_presence), + rolling, + State}; +rolling({get_positions, _SampleSize}, _From, State) -> + {reply, [], rolling, State}; +rolling({return_hashtable, IndexList, HashTreeBin}, _From, State) -> + Handle = State#state.handle, + {ok, BasePos} = file:position(Handle, State#state.last_position), + NewName = determine_new_filename(State#state.filename), + ok = perform_write_hash_tables(Handle, HashTreeBin, BasePos), + ok = write_top_index_table(Handle, BasePos, IndexList), + file:close(Handle), + ok = rename_for_read(State#state.filename, NewName), + leveled_log:log("CDB03", [NewName]), + {NewHandle, Index, LastKey} = open_for_readonly(NewName), + case State#state.deferred_delete of + true -> + {reply, ok, delete_pending, State#state{handle=NewHandle, + last_key=LastKey, + filename=NewName, + hash_index=Index}}; + false -> + {reply, ok, reader, State#state{handle=NewHandle, + last_key=LastKey, + filename=NewName, + hash_index=Index}} + end; +rolling(cdb_kill, _From, State) -> + {stop, killed, ok, State}. + + +rolling({delete_pending, ManSQN, Inker}, State) -> + {next_state, + rolling, + State#state{delete_point=ManSQN, inker=Inker, deferred_delete=true}}. + +reader({get_kv, Key}, _From, State) -> + {reply, + get_withcache(State#state.handle, Key, State#state.hash_index), + reader, + State}; +reader({key_check, Key}, _From, State) -> + {reply, + get_withcache(State#state.handle, + Key, + State#state.hash_index, + loose_presence), + reader, + State}; +reader({get_positions, SampleSize}, _From, State) -> + case SampleSize of + all -> + {reply, + scan_index(State#state.handle, + State#state.hash_index, + {fun scan_index_returnpositions/4, []}), + reader, + State}; + _ -> + SeededL = lists:map(fun(X) -> {random:uniform(), X} end, + State#state.hash_index), + SortedL = lists:keysort(1, SeededL), + RandomisedHashIndex = lists:map(fun({_R, X}) -> X end, SortedL), + {reply, + scan_index_forsample(State#state.handle, + RandomisedHashIndex, + fun scan_index_returnpositions/4, + [], + SampleSize), + reader, + State} + end; +reader({direct_fetch, PositionList, Info}, _From, State) -> + H = State#state.handle, + case Info of + key_only -> + KeyList = lists:map(fun(P) -> + extract_key(H, P) end, + PositionList), + {reply, KeyList, reader, State}; + key_size -> + KeySizeList = lists:map(fun(P) -> + extract_key_size(H, P) end, + PositionList), + {reply, KeySizeList, reader, State}; + key_value_check -> + KVCList = lists:map(fun(P) -> + extract_key_value_check(H, P) end, + PositionList), + {reply, KVCList, reader, State} + end; +reader(cdb_complete, _From, State) -> + ok = file:close(State#state.handle), + {stop, normal, {ok, State#state.filename}, State#state{handle=undefined}}. + + +reader({delete_pending, 0, no_poll}, State) -> + {next_state, + delete_pending, + State#state{delete_point=0}}; +reader({delete_pending, ManSQN, Inker}, State) -> + {next_state, + delete_pending, + State#state{delete_point=ManSQN, inker=Inker}, + ?DELETE_TIMEOUT}. 
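+
+%% In the reader state a caller can sample positions and then fetch details
+%% for just those positions, e.g. (the sample size is illustrative):
+%%
+%%    PositionList = cdb_getpositions(Reader, 100),
+%%    KeySizeList = cdb_directfetch(Reader, PositionList, key_size),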
+ + +delete_pending({get_kv, Key}, _From, State) -> + {reply, + get_withcache(State#state.handle, Key, State#state.hash_index), + delete_pending, + State, + ?DELETE_TIMEOUT}; +delete_pending({key_check, Key}, _From, State) -> + {reply, + get_withcache(State#state.handle, + Key, + State#state.hash_index, + loose_presence), + delete_pending, + State, + ?DELETE_TIMEOUT}. + +delete_pending(timeout, State) -> + case State#state.delete_point of + 0 -> + {next_state, delete_pending, State}; + ManSQN -> + case is_process_alive(State#state.inker) of + true -> + case leveled_inker:ink_confirmdelete(State#state.inker, + ManSQN) of + true -> + leveled_log:log("CDB04", [State#state.filename, + ManSQN]), + {stop, normal, State}; + false -> + {next_state, + delete_pending, + State, + ?DELETE_TIMEOUT} + end; + false -> + {stop, normal, State} + end + end; +delete_pending(destroy, State) -> + ok = file:close(State#state.handle), + ok = file:delete(State#state.filename), + {stop, normal, State}. + + +handle_sync_event({cdb_scan, FilterFun, Acc, StartPos}, + _From, + StateName, + State) -> + {ok, StartPos0} = case StartPos of + undefined -> + file:position(State#state.handle, + ?BASE_POSITION); + StartPos -> + {ok, StartPos} + end, + case check_last_key(State#state.last_key) of + ok -> + {LastPosition, Acc2} = scan_over_file(State#state.handle, + StartPos0, + FilterFun, + Acc, + State#state.last_key), + {reply, {LastPosition, Acc2}, StateName, State}; + empty -> + {reply, {eof, Acc}, StateName, State} + end; +handle_sync_event(cdb_lastkey, _From, StateName, State) -> + {reply, State#state.last_key, StateName, State}; +handle_sync_event(cdb_firstkey, _From, StateName, State) -> + {ok, EOFPos} = file:position(State#state.handle, eof), + FirstKey = case EOFPos of + ?BASE_POSITION -> + empty; + _ -> + extract_key(State#state.handle, ?BASE_POSITION) + end, + {reply, FirstKey, StateName, State}; +handle_sync_event(cdb_filename, _From, StateName, State) -> + {reply, State#state.filename, StateName, State}; +handle_sync_event(cdb_close, _From, rolling, State) -> + {reply, pending_roll, rolling, State}; +handle_sync_event(cdb_close, _From, _StateName, State) -> + ok = file:close(State#state.handle), + {stop, normal, ok, State#state{handle=undefined}}. + +handle_event(_Msg, StateName, State) -> + {next_state, StateName, State}. + +handle_info(_Msg, StateName, State) -> + {next_state, StateName, State}. + +terminate(Reason, StateName, State) -> + leveled_log:log("CDB05", [State#state.filename, Reason]), + case {State#state.handle, StateName} of + {undefined, _} -> + ok; + {Handle, delete_pending} -> + file:close(Handle), + file:delete(State#state.filename); + {Handle, _} -> + file:close(Handle) + end. + +code_change(_OldVsn, StateName, State, _Extra) -> + {ok, StateName, State}. + +%%%============================================================================ +%%% Internal functions +%%%============================================================================ + + +%% from_dict(FileName,ListOfKeyValueTuples) +%% Given a filename and a dictionary, create a cdb +%% using the key value pairs from the dict. +from_dict(FileName,Dict) -> + KeyValueList = dict:to_list(Dict), + create(FileName, KeyValueList). 
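+
+%% For example (the file name is an illustrative assumption):
+%%
+%%    D = dict:store(<<"K1">>, <<"V1">>, dict:new()),
+%%    ok = from_dict("test/from_dict_example.cdb", D),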
+ +%% +%% create(FileName,ListOfKeyValueTuples) -> ok +%% Given a filename and a list of {key,value} tuples, +%% this function creates a CDB +%% +create(FileName,KeyValueList) -> + {ok, Handle} = file:open(FileName, ?WRITE_OPS), + {ok, _} = file:position(Handle, {bof, ?BASE_POSITION}), + {BasePos, HashTree} = write_key_value_pairs(Handle, KeyValueList), + close_file(Handle, HashTree, BasePos). + + +%% Open an active file - one for which it is assumed the hash tables have not +%% yet been written +%% +%% Needs to scan over file to incrementally produce the hash list, starting at +%% the end of the top index table. +%% +%% Should return a dictionary keyed by index containing a list of {Hash, Pos} +%% tuples as the write_key_value_pairs function, and the current position, and +%% the file handle +open_active_file(FileName) when is_list(FileName) -> + {ok, Handle} = file:open(FileName, ?WRITE_OPS), + {ok, Position} = file:position(Handle, {bof, 256*?DWORD_SIZE}), + {LastPosition, {HashTree, LastKey}} = startup_scan_over_file(Handle, + Position), + case file:position(Handle, eof) of + {ok, LastPosition} -> + ok = file:close(Handle); + {ok, EndPosition} -> + leveled_log:log("CDB06", [LastPosition, EndPosition]), + {ok, _LastPosition} = file:position(Handle, LastPosition), + ok = file:truncate(Handle), + ok = file:close(Handle) + end, + {LastPosition, HashTree, LastKey}. + +%% put(Handle, Key, Value, {LastPosition, HashDict}) -> {NewPosition, KeyDict} +%% Append to an active file a new key/value pair returning an updated +%% dictionary of Keys and positions. Returns an updated Position +%% +put(FileName, + Key, + Value, + {LastPosition, HashTree}, + BinaryMode, + MaxSize) when is_list(FileName) -> + {ok, Handle} = file:open(FileName, ?WRITE_OPS), + put(Handle, Key, Value, {LastPosition, HashTree}, BinaryMode, MaxSize); +put(Handle, Key, Value, {LastPosition, HashTree}, BinaryMode, MaxSize) -> + Bin = key_value_to_record({Key, Value}, BinaryMode), + PotentialNewSize = LastPosition + byte_size(Bin), + if + PotentialNewSize > MaxSize -> + roll; + true -> + ok = file:pwrite(Handle, LastPosition, Bin), + {Handle, + PotentialNewSize, + put_hashtree(Key, LastPosition, HashTree)} + end. + +mput(Handle, KVList, {LastPosition, HashTree0}, BinaryMode, MaxSize) -> + {KPList, Bin, LastKey} = multi_key_value_to_record(KVList, + BinaryMode, + LastPosition), + PotentialNewSize = LastPosition + byte_size(Bin), + if + PotentialNewSize > MaxSize -> + roll; + true -> + ok = file:pwrite(Handle, LastPosition, Bin), + HashTree1 = lists:foldl(fun({K, P}, Acc) -> + put_hashtree(K, P, Acc) + end, + HashTree0, + KPList), + {Handle, PotentialNewSize, HashTree1, LastKey} + end. + +%% Should not be used for non-test PUTs by the inker - as the Max File Size +%% should be taken from the startup options not the default +put(FileName, Key, Value, {LastPosition, HashTree}) -> + put(FileName, Key, Value, {LastPosition, HashTree}, + ?BINARY_MODE, ?MAX_FILE_SIZE). + +%% +%% get(FileName,Key) -> {key,value} +%% Given a filename and a key, returns a key and value tuple. +%% + + +get_withcache(Handle, Key, Cache) -> + get(Handle, Key, Cache, true). + +get_withcache(Handle, Key, Cache, QuickCheck) -> + get(Handle, Key, Cache, QuickCheck). + +get(FileNameOrHandle, Key) -> + get(FileNameOrHandle, Key, no_cache, true). 
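+
+%% An active (not yet completed) file can be reopened and appended to, with
+%% reads served from the in-memory hash tree.  A sketch using the test-only
+%% put/4 above and get_mem/3 below (the file name is illustrative, and the
+%% file is assumed to stay within the default maximum size):
+%%
+%%    {LastPos, HashTree, _LastKey} = open_active_file("test/active.pnd"),
+%%    {Handle, _NewPos, HashTree0} =
+%%        put("test/active.pnd", <<"K1">>, <<"V1">>, {LastPos, HashTree}),
+%%    {<<"K1">>, <<"V1">>} = get_mem(<<"K1">>, Handle, HashTree0),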
+ +get(FileName, Key, Cache, QuickCheck) when is_list(FileName) -> + {ok, Handle} = file:open(FileName,[binary, raw, read]), + get(Handle, Key, Cache, QuickCheck); +get(Handle, Key, Cache, QuickCheck) when is_tuple(Handle) -> + Hash = hash(Key), + Index = hash_to_index(Hash), + {HashTable, Count} = get_index(Handle, Index, Cache), + % If the count is 0 for that index - key must be missing + case Count of + 0 -> + missing; + _ -> + % Get starting slot in hashtable + {ok, FirstHashPosition} = file:position(Handle, {bof, HashTable}), + Slot = hash_to_slot(Hash, Count), + {ok, _} = file:position(Handle, {cur, Slot * ?DWORD_SIZE}), + LastHashPosition = HashTable + ((Count-1) * ?DWORD_SIZE), + LocList = lists:seq(FirstHashPosition, + LastHashPosition, + ?DWORD_SIZE), + % Split list around starting slot. + {L1, L2} = lists:split(Slot, LocList), + search_hash_table(Handle, + lists:append(L2, L1), + Hash, + Key, + QuickCheck) + end. + +get_index(Handle, Index, no_cache) -> + {ok,_} = file:position(Handle, {bof, ?DWORD_SIZE * Index}), + % Get location of hashtable and number of entries in the hash + read_next_2_integers(Handle); +get_index(_Handle, Index, Cache) -> + {Index, {Pointer, Count}} = lists:keyfind(Index, 1, Cache), + {Pointer, Count}. + +%% Get a Key/Value pair from an active CDB file (with no hash table written) +%% This requires a key dictionary to be passed in (mapping keys to positions) +%% Will return {Key, Value} or missing +get_mem(Key, FNOrHandle, HashTree) -> + get_mem(Key, FNOrHandle, HashTree, true). + +get_mem(Key, Filename, HashTree, QuickCheck) when is_list(Filename) -> + {ok, Handle} = file:open(Filename, [binary, raw, read]), + get_mem(Key, Handle, HashTree, QuickCheck); +get_mem(Key, Handle, HashTree, QuickCheck) -> + ListToCheck = get_hashtree(Key, HashTree), + case {QuickCheck, ListToCheck} of + {loose_presence, []} -> + missing; + {loose_presence, _L} -> + probably; + _ -> + extract_kvpair(Handle, ListToCheck, Key) + end. + +%% Get the next key at a position in the file (or the first key if no position +%% is passed). Will return both a key and the next position +get_nextkey(Filename) when is_list(Filename) -> + {ok, Handle} = file:open(Filename, [binary, raw, read]), + get_nextkey(Handle); +get_nextkey(Handle) -> + {ok, _} = file:position(Handle, bof), + {FirstHashPosition, _} = read_next_2_integers(Handle), + get_nextkey(Handle, {256 * ?DWORD_SIZE, FirstHashPosition}). + +get_nextkey(Handle, {Position, FirstHashPosition}) -> + {ok, Position} = file:position(Handle, Position), + case read_next_2_integers(Handle) of + {KeyLength, ValueLength} -> + NextKey = read_next_term(Handle, KeyLength), + NextPosition = Position + KeyLength + ValueLength + ?DWORD_SIZE, + case NextPosition of + FirstHashPosition -> + {NextKey, nomorekeys}; + _ -> + {NextKey, Handle, {NextPosition, FirstHashPosition}} + end; + eof -> + nomorekeys +end. + +hashtable_calc(HashTree, StartPos) -> + Seq = lists:seq(0, 255), + SWC = os:timestamp(), + {IndexList, HashTreeBin} = write_hash_tables(Seq, + HashTree, + StartPos, + [], + <<>>), + leveled_log:log_timer("CDB07", [], SWC), + {IndexList, HashTreeBin}. + +%%%%%%%%%%%%%%%%%%%% +%% Internal functions +%%%%%%%%%%%%%%%%%%%% + +determine_new_filename(Filename) -> + filename:rootname(Filename, ".pnd") ++ ".cdb". + +rename_for_read(Filename, NewName) -> + %% Rename file + leveled_log:log("CDB08", [Filename, NewName, filelib:is_file(NewName)]), + file:rename(Filename, NewName). 
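+%% Example (illustrative): determine_new_filename/1 simply swaps the pending
+%% extension for the completed one, e.g.
+%%   determine_new_filename("0001_journal.pnd") -> "0001_journal.cdb"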
+ +open_for_readonly(Filename) -> + {ok, Handle} = file:open(Filename, [binary, raw, read]), + Index = load_index(Handle), + LastKey = find_lastkey(Handle, Index), + {Handle, Index, LastKey}. + +load_index(Handle) -> + Index = lists:seq(0, 255), + lists:map(fun(X) -> + file:position(Handle, {bof, ?DWORD_SIZE * X}), + {HashTablePos, Count} = read_next_2_integers(Handle), + {X, {HashTablePos, Count}} end, + Index). + +%% Function to find the LastKey in the file +find_lastkey(Handle, IndexCache) -> + {LastPosition, TotalKeys} = scan_index(Handle, + IndexCache, + {fun scan_index_findlast/4, + {0, 0}}), + case TotalKeys of + 0 -> + empty; + _ -> + {ok, _} = file:position(Handle, LastPosition), + {KeyLength, _ValueLength} = read_next_2_integers(Handle), + read_next_term(Handle, KeyLength) + end. + + +scan_index(Handle, IndexCache, {ScanFun, InitAcc}) -> + lists:foldl(fun({_X, {Pos, Count}}, Acc) -> + ScanFun(Handle, Pos, Count, Acc) + end, + InitAcc, + IndexCache). + +scan_index_forsample(_Handle, [], _ScanFun, Acc, SampleSize) -> + lists:sublist(Acc, SampleSize); +scan_index_forsample(Handle, [CacheEntry|Tail], ScanFun, Acc, SampleSize) -> + case length(Acc) of + L when L >= SampleSize -> + lists:sublist(Acc, SampleSize); + _ -> + {_X, {Pos, Count}} = CacheEntry, + scan_index_forsample(Handle, + Tail, + ScanFun, + ScanFun(Handle, Pos, Count, Acc), + SampleSize) + end. + + +scan_index_findlast(Handle, Position, Count, {LastPosition, TotalKeys}) -> + {ok, _} = file:position(Handle, Position), + MaxPos = lists:foldl(fun({_Hash, HPos}, MaxPos) -> max(HPos, MaxPos) end, + LastPosition, + read_next_n_integerpairs(Handle, Count)), + {MaxPos, TotalKeys + Count}. + +scan_index_returnpositions(Handle, Position, Count, PosList0) -> + {ok, _} = file:position(Handle, Position), + lists:foldl(fun({Hash, HPosition}, PosList) -> + case Hash of + 0 -> PosList; + _ -> PosList ++ [HPosition] + end end, + PosList0, + read_next_n_integerpairs(Handle, Count)). + + +%% Take an active file and write the hash details necessary to close that +%% file and roll a new active file if requested. +%% +%% Base Pos should be at the end of the KV pairs written (the position for) +%% the hash tables +close_file(Handle, HashTree, BasePos) -> + {ok, BasePos} = file:position(Handle, BasePos), + IndexList = write_hash_tables(Handle, HashTree), + ok = write_top_index_table(Handle, BasePos, IndexList), + file:close(Handle). + + +%% Fetch a list of positions by passing a key to the HashTree +get_hashtree(Key, HashTree) -> + Hash = hash(Key), + Index = hash_to_index(Hash), + Tree = array:get(Index, HashTree), + case gb_trees:lookup(Hash, Tree) of + {value, List} -> + List; + _ -> + [] + end. + +%% Add to hash tree - this is an array of 256 gb_trees that contains the Hash +%% and position of objects which have been added to an open CDB file +put_hashtree(Key, Position, HashTree) -> + Hash = hash(Key), + Index = hash_to_index(Hash), + Tree = array:get(Index, HashTree), + case gb_trees:lookup(Hash, Tree) of + none -> + array:set(Index, gb_trees:insert(Hash, [Position], Tree), HashTree); + {value, L} -> + array:set(Index, gb_trees:update(Hash, [Position|L], Tree), HashTree) + end. 
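+%% Illustrative example (not part of the original module): the in-memory index
+%% used while a file is active is an array of 256 gb_trees, as created in
+%% startup_scan_over_file/2 below; adding and fetching a position works as:
+%%   HashTree0 = array:new(256, {default, gb_trees:empty()}),
+%%   HashTree1 = put_hashtree("K1", 2048, HashTree0),
+%%   [2048] = get_hashtree("K1", HashTree1).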
+ +%% Function to extract a Key-Value pair given a file handle and a position +%% Will confirm that the key matches and do a CRC check +extract_kvpair(_, [], _) -> + missing; +extract_kvpair(Handle, [Position|Rest], Key) -> + {ok, _} = file:position(Handle, Position), + {KeyLength, ValueLength} = read_next_2_integers(Handle), + case safe_read_next_term(Handle, KeyLength) of + Key -> % If same key as passed in, then found! + case read_next_term(Handle, ValueLength, crc) of + {false, _} -> + crc_wonky; + {_, Value} -> + {Key,Value} + end; + _ -> + extract_kvpair(Handle, Rest, Key) + end. + +extract_key(Handle, Position) -> + {ok, _} = file:position(Handle, Position), + {KeyLength, _ValueLength} = read_next_2_integers(Handle), + read_next_term(Handle, KeyLength). + +extract_key_size(Handle, Position) -> + {ok, _} = file:position(Handle, Position), + {KeyLength, ValueLength} = read_next_2_integers(Handle), + {read_next_term(Handle, KeyLength), ValueLength}. + +extract_key_value_check(Handle, Position) -> + {ok, _} = file:position(Handle, Position), + {KeyLength, ValueLength} = read_next_2_integers(Handle), + K = read_next_term(Handle, KeyLength), + {Check, V} = read_next_term(Handle, ValueLength, crc), + {K, V, Check}. + +%% Scan through the file until there is a failure to crc check an input, and +%% at that point return the position and the key dictionary scanned so far +startup_scan_over_file(Handle, Position) -> + HashTree = array:new(256, {default, gb_trees:empty()}), + scan_over_file(Handle, + Position, + fun startup_filter/5, + {HashTree, empty}, + empty). + +%% Specific filter to be used at startup to build a hashtree for an incomplete +%% cdb file, and returns at the end the hashtree and the final Key seen in the +%% journal + +startup_filter(Key, ValueAsBin, Position, {Hashtree, LastKey}, _ExtractFun) -> + case crccheck_value(ValueAsBin) of + true -> + {loop, {put_hashtree(Key, Position, Hashtree), Key}}; + false -> + {stop, {Hashtree, LastKey}} + end. + + +%% Scan for key changes - scan over file returning applying FilterFun +%% The FilterFun should accept as input: +%% - Key, ValueBin, Position, Accumulator, Fun (to extract values from Binary) +%% -> outputting a new Accumulator and a loop|stop instruction as a tuple +%% i.e. {loop, Acc} or {stop, Acc} + +scan_over_file(Handle, Position, FilterFun, Output, LastKey) -> + case saferead_keyvalue(Handle) of + false -> + leveled_log:log("CDB09", [Position]), + {Position, Output}; + {Key, ValueAsBin, KeyLength, ValueLength} -> + NewPosition = case Key of + LastKey -> + eof; + _ -> + Position + KeyLength + ValueLength + + ?DWORD_SIZE + end, + case FilterFun(Key, + ValueAsBin, + Position, + Output, + fun extract_valueandsize/1) of + {stop, UpdOutput} -> + {Position, UpdOutput}; + {loop, UpdOutput} -> + case NewPosition of + eof -> + {eof, UpdOutput}; + _ -> + scan_over_file(Handle, + NewPosition, + FilterFun, + UpdOutput, + LastKey) + end + end + end. + +%% Confirm that the last key has been defined and set to a non-default value + +check_last_key(LastKey) -> + case LastKey of + empty -> empty; + _ -> ok + end. 
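+%% Illustrative sketch (not part of the original module): a FilterFun for
+%% scan_over_file/5 receives Key, ValueBin, Position, the accumulator and an
+%% extract fun, and must return {loop, Acc} or {stop, Acc}. A scan that simply
+%% counts entries in an open file (given its Handle and LastKey) would be:
+%%   CountFun = fun(_K, _VBin, _Pos, Acc, _ExtractFun) -> {loop, Acc + 1} end,
+%%   {_EndPos, Count} = scan_over_file(Handle, ?BASE_POSITION, CountFun, 0, LastKey).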
+ +%% Read the Key/Value at this point, returning {ok, Key, Value} +%% catch expected exceptions associated with file corruption (or end) and +%% return eof +saferead_keyvalue(Handle) -> + case read_next_2_integers(Handle) of + eof -> + false; + {KeyL, ValueL} -> + case safe_read_next_term(Handle, KeyL) of + {error, _} -> + false; + eof -> + false; + false -> + false; + Key -> + case file:read(Handle, ValueL) of + eof -> + false; + {ok, Value} -> + case crccheck_value(Value) of + true -> + {Key, Value, KeyL, ValueL}; + false -> + false + end + end + end + end. + + +safe_read_next_term(Handle, Length) -> + try read_next_term(Handle, Length) of + Term -> + Term + catch + error:badarg -> + false + end. + +%% The first four bytes of the value are the crc check +crccheck_value(Value) when byte_size(Value) >4 -> + << Hash:32/integer, Tail/bitstring>> = Value, + case calc_crc(Tail) of + Hash -> + true; + _ -> + leveled_log:log("CDB10", []), + false + end; +crccheck_value(_) -> + leveled_log:log("CDB11", []), + false. + +%% Run a crc check filling out any values which don't fit on byte boundary +calc_crc(Value) -> + case bit_size(Value) rem 8 of + 0 -> + erlang:crc32(Value); + N -> + M = 8 - N, + erlang:crc32(<>) + end. + +read_next_term(Handle, Length) -> + case file:read(Handle, Length) of + {ok, Bin} -> + binary_to_term(Bin); + ReadError -> + ReadError + end. + +%% Read next string where the string has a CRC prepended - stripping the crc +%% and checking if requested +read_next_term(Handle, Length, crc) -> + {ok, <>} = file:read(Handle, Length), + case calc_crc(Bin) of + CRC -> + {true, binary_to_term(Bin)}; + _ -> + {false, crc_wonky} + end. + +%% Extract value and size from binary containing CRC +extract_valueandsize(ValueAsBin) -> + <<_CRC:32/integer, Bin/binary>> = ValueAsBin, + {binary_to_term(Bin), byte_size(Bin)}. + + +%% Used for reading lengths +%% Note that the endian_flip is required to make the file format compatible +%% with CDB +read_next_2_integers(Handle) -> + case file:read(Handle,?DWORD_SIZE) of + {ok, <>} -> + {endian_flip(Int1), endian_flip(Int2)}; + ReadError -> + ReadError + end. + +read_next_n_integerpairs(Handle, NumberOfPairs) -> + {ok, Block} = file:read(Handle, ?DWORD_SIZE * NumberOfPairs), + read_integerpairs(Block, []). + +read_integerpairs(<<>>, Pairs) -> + Pairs; +read_integerpairs(<>, Pairs) -> + read_integerpairs(<>, + Pairs ++ [{endian_flip(Int1), + endian_flip(Int2)}]). + +%% Seach the hash table for the matching hash and key. Be prepared for +%% multiple keys to have the same hash value. +%% +%% There are three possible values of CRCCheck: +%% true - check the CRC before returning key & value +%% false - don't check the CRC before returning key & value +%% loose_presence - confirm that the hash of the key is present + +search_hash_table(_Handle, [], _Hash, _Key, _QuickCheck) -> + missing; +search_hash_table(Handle, [Entry|RestOfEntries], Hash, Key, QuickCheck) -> + {ok, _} = file:position(Handle, Entry), + {StoredHash, DataLoc} = read_next_2_integers(Handle), + case StoredHash of + Hash -> + KV = case QuickCheck of + loose_presence -> + probably; + _ -> + extract_kvpair(Handle, [DataLoc], Key) + end, + case KV of + missing -> + search_hash_table(Handle, + RestOfEntries, + Hash, + Key, + QuickCheck); + _ -> + KV + end; + 0 -> + % Hash is 0 so key must be missing as 0 found before Hash matched + missing; + _ -> + search_hash_table(Handle, RestOfEntries, Hash, Key, QuickCheck) + end. + +% Write Key and Value tuples into the CDB. 
Each tuple consists of a
+% 4 byte key length, a 4 byte value length, the actual key followed
+% by the value.
+%
+% Returns the new cursor position and a hash tree, keyed by the least
+% significant 8 bits of each hash, with the values being a list of the hash
+% and the position of the key/value binary in the file.
+write_key_value_pairs(Handle, KeyValueList) ->
+    {ok, Position} = file:position(Handle, cur),
+    HashTree = array:new(256, {default, gb_trees:empty()}),
+    write_key_value_pairs(Handle, KeyValueList, {Position, HashTree}).
+
+write_key_value_pairs(_, [], Acc) ->
+    Acc;
+write_key_value_pairs(Handle, [HeadPair|TailList], Acc) ->
+    {Key, Value} = HeadPair,
+    {Handle, NewPosition, HashTree} = put(Handle, Key, Value, Acc),
+    write_key_value_pairs(Handle, TailList, {NewPosition, HashTree}).
+
+%% Write the actual hashtables at the bottom of the file.  Each hash table
+%% entry is a doubleword in length.  The first word is the hash value
+%% corresponding to a key and the second word is a file pointer to the
+%% corresponding {key,value} tuple.
+write_hash_tables(Handle, HashTree) ->
+    {ok, StartPos} = file:position(Handle, cur),
+    {IndexList, HashTreeBin} = hashtable_calc(HashTree, StartPos),
+    ok = perform_write_hash_tables(Handle, HashTreeBin, StartPos),
+    IndexList.
+
+perform_write_hash_tables(Handle, HashTreeBin, StartPos) ->
+    SWW = os:timestamp(),
+    ok = file:write(Handle, HashTreeBin),
+    {ok, EndPos} = file:position(Handle, cur),
+    ok = file:advise(Handle, StartPos, EndPos - StartPos, will_need),
+    leveled_log:log_timer("CDB12", [], SWW),
+    ok.
+
+
+write_hash_tables([], _HashTree, _CurrPos, IndexList, HashTreeBin) ->
+    {IndexList, HashTreeBin};
+write_hash_tables([Index|Rest], HashTree, CurrPos, IndexList, HashTreeBin) ->
+    Tree = array:get(Index, HashTree),
+    case gb_trees:keys(Tree) of
+        [] ->
+            write_hash_tables(Rest, HashTree, CurrPos, IndexList, HashTreeBin);
+        _ ->
+            HashList = gb_trees:to_list(Tree),
+            BinList = build_binaryhashlist(HashList, []),
+            IndexLength = length(BinList) * 2,
+            SlotList = lists:duplicate(IndexLength, <<0:32, 0:32>>),
+
+            Fn = fun({Hash, Binary}, AccSlotList) ->
+                    Slot1 = find_open_slot(AccSlotList, Hash),
+                    {L1, [<<0:32, 0:32>>|L2]} = lists:split(Slot1, AccSlotList),
+                    lists:append(L1, [Binary|L2])
+                end,
+
+            NewSlotList = lists:foldl(Fn, SlotList, BinList),
+            NewSlotBin = lists:foldl(fun(X, Acc) ->
+                                            <<Acc/binary, X/binary>> end,
+                                        HashTreeBin,
+                                        NewSlotList),
+            write_hash_tables(Rest,
+                                HashTree,
+                                CurrPos + length(NewSlotList) * ?DWORD_SIZE,
+                                [{Index, CurrPos, IndexLength}|IndexList],
+                                NewSlotBin)
+    end.
+
+%% The list created from the original HashTree may have duplicate positions
+%% e.g. {Key, [Value1, Value2]}.  Before any writing is done it is necessary
+%% to know the actual number of hashes - or the Slot may not be sized correctly
+%%
+%% This function creates {Hash, Binary} pairs on a list where there is a unique
+%% entry for every Key/Value
+build_binaryhashlist([], BinList) ->
+    BinList;
+build_binaryhashlist([{Hash, [Position|TailP]}|TailKV], BinList) ->
+    HashLE = endian_flip(Hash),
+    PosLE = endian_flip(Position),
+    NewBin = <<HashLE:32/integer, PosLE:32/integer>>,
+    case TailP of
+        [] ->
+            build_binaryhashlist(TailKV,
+                                    [{Hash, NewBin}|BinList]);
+        _ ->
+            build_binaryhashlist([{Hash, TailP}|TailKV],
+                                    [{Hash, NewBin}|BinList])
+    end.
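+%% Illustrative example (not part of the original module): a hash that maps to
+%% two positions contributes two doubleword entries, which is why IndexLength
+%% above is sized on the expanded list rather than on distinct hashes:
+%%   build_binaryhashlist([{Hash, [4096, 1024]}], [])
+%% returns one {Hash, <<HashLE:32/integer, PosLE:32/integer>>} pair per
+%% position, with both words endian-flipped ready for the on-disk table.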
+
+%% Slot is zero based because it comes from a REM
+find_open_slot(List, Hash) ->
+    Len = length(List),
+    Slot = hash_to_slot(Hash, Len),
+    Seq = lists:seq(1, Len),
+    {CL1, CL2} = lists:split(Slot, Seq),
+    {L1, L2} = lists:split(Slot, List),
+    find_open_slot1(lists:append(CL2, CL1), lists:append(L2, L1)).
+
+find_open_slot1([Slot|_RestOfSlots], [<<0:32,0:32>>|_RestOfEntries]) ->
+    Slot - 1;
+find_open_slot1([_|RestOfSlots], [_|RestOfEntries]) ->
+    find_open_slot1(RestOfSlots, RestOfEntries).
+
+
+%% Write the top-most 256 doubleword entries.  The first word is the
+%% file pointer to a hashtable and the second word is the number of entries
+%% in the hash table.
+%% The List passed in should be made up of {Index, Position, Count} tuples.
+write_top_index_table(Handle, BasePos, List) ->
+    % Fold function to find any missing index tuples, and add a replacement
+    % in this case with a count of 0.  Also orders the list by index
+    FnMakeIndex = fun(I) ->
+        case lists:keysearch(I, 1, List) of
+            {value, Tuple} ->
+                Tuple;
+            false ->
+                {I, BasePos, 0}
+        end
+    end,
+    % Fold function to write the index entries
+    FnWriteIndex = fun({_Index, Pos, Count}, {AccBin, CurrPos}) ->
+        case Count == 0 of
+            true ->
+                PosLE = endian_flip(CurrPos),
+                NextPos = CurrPos;
+            false ->
+                PosLE = endian_flip(Pos),
+                NextPos = Pos + (Count * ?DWORD_SIZE)
+        end,
+        CountLE = endian_flip(Count),
+        {<<AccBin/binary, PosLE:32/integer, CountLE:32/integer>>, NextPos}
+    end,
+
+    Seq = lists:seq(0, 255),
+    CompleteList = lists:keysort(1, lists:map(FnMakeIndex, Seq)),
+    {IndexBin, _Pos} = lists:foldl(FnWriteIndex,
+                                    {<<>>, BasePos},
+                                    CompleteList),
+    {ok, _} = file:position(Handle, 0),
+    ok = file:write(Handle, IndexBin),
+    ok = file:advise(Handle, 0, ?DWORD_SIZE * 256, will_need),
+    ok.
+
+%% To make this compatible with the original Bernstein format, this endian flip
+%% and the use of the standard hash function are required.
+%%
+%% Hash function contains mysterious constants, some explanation here as to
+%% what they are -
+%% http://stackoverflow.com/questions/10696223/reason-for-5381-number-in-djb-hash-function
+
+endian_flip(Int) ->
+    <<X:32/unsigned-little-integer>> = <<Int:32>>,
+    X.
+
+hash(Key) ->
+    BK = term_to_binary(Key),
+    H = 5381,
+    hash1(H, BK) band 16#FFFFFFFF.
+
+hash1(H, <<>>) ->
+    H;
+hash1(H, <<B:8/integer, Rest/bitstring>>) ->
+    H1 = H * 33,
+    H2 = H1 bxor B,
+    hash1(H2, Rest).
+
+% Get the least significant 8 bits from the hash.
+hash_to_index(Hash) ->
+    Hash band 255.
+
+hash_to_slot(Hash, L) ->
+    (Hash bsr 8) rem L.
+
+%% Create a binary of the LengthKeyLengthValue, adding a CRC check
+%% at the front of the value
+key_value_to_record({Key, Value}, BinaryMode) ->
+    BK = term_to_binary(Key),
+    BV = case BinaryMode of
+                true ->
+                    Value;
+                false ->
+                    term_to_binary(Value)
+            end,
+    LK = byte_size(BK),
+    LV = byte_size(BV),
+    LK_FL = endian_flip(LK),
+    LV_FL = endian_flip(LV + 4),
+    CRC = calc_crc(BV),
+    <<LK_FL:32, LV_FL:32, BK/binary, CRC:32/integer, BV/binary>>.
+
+
+multi_key_value_to_record(KVList, BinaryMode, LastPosition) ->
+    lists:foldl(fun({K, V}, {KPosL, Bin, _LK}) ->
+                        Bin0 = key_value_to_record({K, V}, BinaryMode),
+                        {[{K, byte_size(Bin) + LastPosition}|KPosL],
+                            <<Bin/binary, Bin0/binary>>,
+                            K} end,
+                    {[], <<>>, empty},
+                    KVList).
+
+
+%%%%%%%%%%%%%%%%
+% T E S T
+%%%%%%%%%%%%%%%
+-ifdef(TEST).
+
+%%
+%% dump(FileName) -> List
+%% Given a file name, this function returns a list
+%% of {key,value} tuples from the CDB.
+%% + +dump(FileName) -> + {ok, Handle} = file:open(FileName, [binary, raw, read]), + Fn = fun(Index, Acc) -> + {ok, _} = file:position(Handle, ?DWORD_SIZE * Index), + {_, Count} = read_next_2_integers(Handle), + Acc + Count + end, + NumberOfPairs = lists:foldl(Fn, 0, lists:seq(0,255)) bsr 1, + io:format("Count of keys in db is ~w~n", [NumberOfPairs]), + {ok, _} = file:position(Handle, {bof, 2048}), + Fn1 = fun(_I,Acc) -> + {KL,VL} = read_next_2_integers(Handle), + Key = read_next_term(Handle, KL), + case read_next_term(Handle, VL, crc) of + {_, Value} -> + {ok, CurrLoc} = file:position(Handle, cur), + Return = + case get(Handle, Key) of + {Key,Value} -> {Key ,Value}; + X -> {wonky, X} + end + end, + {ok, _} = file:position(Handle, CurrLoc), + [Return | Acc] + end, + lists:foldr(Fn1, [], lists:seq(0, NumberOfPairs-1)). + +%% +%% to_dict(FileName) +%% Given a filename returns a dict containing +%% the key value pairs from the dict. +%% +%% @spec to_dict(filename()) -> dictionary() +%% where +%% filename() = string(), +%% dictionary() = dict() +%% +to_dict(FileName) -> + KeyValueList = dump(FileName), + dict:from_list(KeyValueList). + + + + +write_key_value_pairs_1_test() -> + {ok,Handle} = file:open("../test/test.cdb",[write]), + {_, HashTree} = write_key_value_pairs(Handle, + [{"key1","value1"}, + {"key2","value2"}]), + Hash1 = hash("key1"), + Index1 = hash_to_index(Hash1), + Hash2 = hash("key2"), + Index2 = hash_to_index(Hash2), + R0 = array:new(256, {default, gb_trees:empty()}), + R1 = array:set(Index1, + gb_trees:insert(Hash1, + [0], + array:get(Index1, R0)), + R0), + R2 = array:set(Index2, + gb_trees:insert(Hash2, + [30], + array:get(Index2, R1)), + R1), + io:format("HashTree is ~w~n", [HashTree]), + io:format("Expected HashTree is ~w~n", [R2]), + ?assertMatch(R2, HashTree), + ok = file:delete("../test/test.cdb"). + + +write_hash_tables_1_test() -> + {ok, Handle} = file:open("../test/testx.cdb", [write]), + R0 = array:new(256, {default, gb_trees:empty()}), + R1 = array:set(64, + gb_trees:insert(6383014720, + [18], + array:get(64, R0)), + R0), + R2 = array:set(67, + gb_trees:insert(6383014723, + [0], + array:get(67, R1)), + R1), + Result = write_hash_tables(Handle, R2), + io:format("write hash tables result of ~w ~n", [Result]), + ?assertMatch(Result,[{67,16,2},{64,0,2}]), + ok = file:delete("../test/testx.cdb"). + +find_open_slot_1_test() -> + List = [<<1:32,1:32>>,<<0:32,0:32>>,<<1:32,1:32>>,<<1:32,1:32>>], + Slot = find_open_slot(List,0), + ?assertMatch(Slot,1). + +find_open_slot_2_test() -> + List = [<<0:32,0:32>>,<<0:32,0:32>>,<<1:32,1:32>>,<<1:32,1:32>>], + Slot = find_open_slot(List,0), + ?assertMatch(Slot,0). + +find_open_slot_3_test() -> + List = [<<1:32,1:32>>,<<1:32,1:32>>,<<1:32,1:32>>,<<0:32,0:32>>], + Slot = find_open_slot(List,2), + ?assertMatch(Slot,3). + +find_open_slot_4_test() -> + List = [<<0:32,0:32>>,<<1:32,1:32>>,<<1:32,1:32>>,<<1:32,1:32>>], + Slot = find_open_slot(List,1), + ?assertMatch(Slot,0). + +find_open_slot_5_test() -> + List = [<<1:32,1:32>>,<<1:32,1:32>>,<<0:32,0:32>>,<<1:32,1:32>>], + Slot = find_open_slot(List,3), + ?assertMatch(Slot,2). + +full_1_test() -> + List1 = lists:sort([{"key1","value1"},{"key2","value2"}]), + create("../test/simple.cdb", + lists:sort([{"key1","value1"},{"key2","value2"}])), + List2 = lists:sort(dump("../test/simple.cdb")), + ?assertMatch(List1,List2), + ok = file:delete("../test/simple.cdb"). 
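+%% Note on the literal 2048 used in dump/1 above: it corresponds to
+%% 256 * ?DWORD_SIZE (assuming the usual 8-byte CDB doubleword), i.e. the first
+%% byte after the top index table, matching the offset written as
+%% 256 * ?DWORD_SIZE in open_active_file/1 and get_nextkey/1.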
+ +full_2_test() -> + List1 = lists:sort([{lists:flatten(io_lib:format("~s~p",[Prefix,Plug])), + lists:flatten(io_lib:format("value~p",[Plug]))} + || Plug <- lists:seq(1,200), + Prefix <- ["dsd","so39ds","oe9%#*(","020dkslsldclsldowlslf%$#", + "tiep4||","qweq"]]), + create("../test/full.cdb",List1), + List2 = lists:sort(dump("../test/full.cdb")), + ?assertMatch(List1,List2), + ok = file:delete("../test/full.cdb"). + +from_dict_test() -> + D = dict:new(), + D1 = dict:store("a","b",D), + D2 = dict:store("c","d",D1), + ok = from_dict("../test/from_dict_test.cdb",D2), + io:format("Store created ~n", []), + KVP = lists:sort(dump("../test/from_dict_test.cdb")), + D3 = lists:sort(dict:to_list(D2)), + io:format("KVP is ~w~n", [KVP]), + io:format("D3 is ~w~n", [D3]), + ?assertMatch(KVP, D3), + ok = file:delete("../test/from_dict_test.cdb"). + +to_dict_test() -> + D = dict:new(), + D1 = dict:store("a","b",D), + D2 = dict:store("c","d",D1), + ok = from_dict("../test/from_dict_test1.cdb",D2), + Dict = to_dict("../test/from_dict_test1.cdb"), + D3 = lists:sort(dict:to_list(D2)), + D4 = lists:sort(dict:to_list(Dict)), + ?assertMatch(D4,D3), + ok = file:delete("../test/from_dict_test1.cdb"). + +crccheck_emptyvalue_test() -> + ?assertMatch(false, crccheck_value(<<>>)). + +crccheck_shortvalue_test() -> + Value = <<128,128,32>>, + ?assertMatch(false, crccheck_value(Value)). + +crccheck_justshortvalue_test() -> + Value = <<128,128,32,64>>, + ?assertMatch(false, crccheck_value(Value)). + +crccheck_correctvalue_test() -> + Value = term_to_binary("some text as value"), + Hash = erlang:crc32(Value), + ValueOnDisk = <>, + ?assertMatch(true, crccheck_value(ValueOnDisk)). + +crccheck_wronghash_test() -> + Value = term_to_binary("some text as value"), + Hash = erlang:crc32(Value) + 1, + ValueOnDisk = <>, + ?assertMatch(false, crccheck_value(ValueOnDisk)). + +crccheck_truncatedvalue_test() -> + Value = term_to_binary("some text as value"), + Hash = erlang:crc32(Value), + ValueOnDisk = <>, + Size = bit_size(ValueOnDisk) - 1, + <> = ValueOnDisk, + ?assertMatch(false, crccheck_value(TruncatedValue)). + +activewrite_singlewrite_test() -> + Key = "0002", + Value = "some text as new value", + InitialD = dict:new(), + InitialD1 = dict:store("0001", "Initial value", InitialD), + ok = from_dict("../test/test_mem.cdb", InitialD1), + io:format("New db file created ~n", []), + {LastPosition, KeyDict, _} = open_active_file("../test/test_mem.cdb"), + io:format("File opened as new active file " + "with LastPosition=~w ~n", [LastPosition]), + {_, _, UpdKeyDict} = put("../test/test_mem.cdb", + Key, Value, + {LastPosition, KeyDict}), + io:format("New key and value added to active file ~n", []), + ?assertMatch({Key, Value}, + get_mem(Key, "../test/test_mem.cdb", + UpdKeyDict)), + ?assertMatch(probably, + get_mem(Key, "../test/test_mem.cdb", + UpdKeyDict, + loose_presence)), + ?assertMatch(missing, + get_mem("not_present", "../test/test_mem.cdb", + UpdKeyDict, + loose_presence)), + ok = file:delete("../test/test_mem.cdb"). 
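+%% Illustrative example (not part of the original module): values are framed on
+%% disk as <<CRC:32/integer, ValueBin/binary>>, so a well-formed value can be
+%% built and verified in the same way as the crccheck tests above:
+%%   V = term_to_binary({some, value}),
+%%   true = crccheck_value(<<(erlang:crc32(V)):32/integer, V/binary>>).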
+ +search_hash_table_findinslot_test() -> + Key1 = "key1", % this is in slot 3 if count is 8 + D = dict:from_list([{Key1, "value1"}, {"K2", "V2"}, {"K3", "V3"}, + {"K4", "V4"}, {"K5", "V5"}, {"K6", "V6"}, {"K7", "V7"}, + {"K8", "V8"}]), + ok = from_dict("../test/hashtable1_test.cdb",D), + {ok, Handle} = file:open("../test/hashtable1_test.cdb", + [binary, raw, read, write]), + Hash = hash(Key1), + Index = hash_to_index(Hash), + {ok, _} = file:position(Handle, {bof, ?DWORD_SIZE*Index}), + {HashTable, Count} = read_next_2_integers(Handle), + io:format("Count of ~w~n", [Count]), + {ok, FirstHashPosition} = file:position(Handle, {bof, HashTable}), + Slot = hash_to_slot(Hash, Count), + io:format("Slot of ~w~n", [Slot]), + {ok, _} = file:position(Handle, {cur, Slot * ?DWORD_SIZE}), + {ReadH3, ReadP3} = read_next_2_integers(Handle), + {ReadH4, ReadP4} = read_next_2_integers(Handle), + io:format("Slot 1 has Hash ~w Position ~w~n", [ReadH3, ReadP3]), + io:format("Slot 2 has Hash ~w Position ~w~n", [ReadH4, ReadP4]), + ?assertMatch(0, ReadH4), + ?assertMatch({"key1", "value1"}, get(Handle, Key1)), + ?assertMatch(probably, get(Handle, Key1, no_cache, loose_presence)), + ?assertMatch(missing, get(Handle, "Key99", no_cache, loose_presence)), + {ok, _} = file:position(Handle, FirstHashPosition), + FlipH3 = endian_flip(ReadH3), + FlipP3 = endian_flip(ReadP3), + RBin = <>, + io:format("Replacement binary of ~w~n", [RBin]), + {ok, OldBin} = file:pread(Handle, + FirstHashPosition + (Slot -1) * ?DWORD_SIZE, 16), + io:format("Bin to be replaced is ~w ~n", [OldBin]), + ok = file:pwrite(Handle, + FirstHashPosition + (Slot -1) * ?DWORD_SIZE, + RBin), + ok = file:close(Handle), + io:format("Find key following change to hash table~n"), + ?assertMatch(missing, get("../test/hashtable1_test.cdb", Key1)), + ok = file:delete("../test/hashtable1_test.cdb"). + +getnextkey_inclemptyvalue_test() -> + L = [{"K9", "V9"}, {"K2", "V2"}, {"K3", ""}, + {"K4", "V4"}, {"K5", "V5"}, {"K6", "V6"}, {"K7", "V7"}, + {"K8", "V8"}, {"K1", "V1"}], + ok = create("../test/hashtable2_test.cdb", L), + {FirstKey, Handle, P1} = get_nextkey("../test/hashtable2_test.cdb"), + io:format("Next position details of ~w~n", [P1]), + ?assertMatch("K9", FirstKey), + {SecondKey, Handle, P2} = get_nextkey(Handle, P1), + ?assertMatch("K2", SecondKey), + {ThirdKeyNoValue, Handle, P3} = get_nextkey(Handle, P2), + ?assertMatch("K3", ThirdKeyNoValue), + {_, Handle, P4} = get_nextkey(Handle, P3), + {_, Handle, P5} = get_nextkey(Handle, P4), + {_, Handle, P6} = get_nextkey(Handle, P5), + {_, Handle, P7} = get_nextkey(Handle, P6), + {_, Handle, P8} = get_nextkey(Handle, P7), + {LastKey, nomorekeys} = get_nextkey(Handle, P8), + ?assertMatch("K1", LastKey), + ok = file:delete("../test/hashtable2_test.cdb"). + +newactivefile_test() -> + {LastPosition, _, _} = open_active_file("../test/activefile_test.cdb"), + ?assertMatch(256 * ?DWORD_SIZE, LastPosition), + Response = get_nextkey("../test/activefile_test.cdb"), + ?assertMatch(nomorekeys, Response), + ok = file:delete("../test/activefile_test.cdb"). 
+ +emptyvalue_fromdict_test() -> + D = dict:new(), + D1 = dict:store("K1", "V1", D), + D2 = dict:store("K2", "", D1), + D3 = dict:store("K3", "V3", D2), + D4 = dict:store("K4", "", D3), + ok = from_dict("../test/from_dict_test_ev.cdb",D4), + io:format("Store created ~n", []), + KVP = lists:sort(dump("../test/from_dict_test_ev.cdb")), + D_Result = lists:sort(dict:to_list(D4)), + io:format("KVP is ~w~n", [KVP]), + io:format("D_Result is ~w~n", [D_Result]), + ?assertMatch(KVP, D_Result), + ok = file:delete("../test/from_dict_test_ev.cdb"). + +find_lastkey_test() -> + {ok, P1} = cdb_open_writer("../test/lastkey.pnd", + #cdb_options{binary_mode=false}), + ok = cdb_put(P1, "Key1", "Value1"), + ok = cdb_put(P1, "Key3", "Value3"), + ok = cdb_put(P1, "Key2", "Value2"), + ?assertMatch("Key2", cdb_lastkey(P1)), + ?assertMatch("Key1", cdb_firstkey(P1)), + probably = cdb_keycheck(P1, "Key2"), + ok = cdb_close(P1), + {ok, P2} = cdb_open_writer("../test/lastkey.pnd", + #cdb_options{binary_mode=false}), + ?assertMatch("Key2", cdb_lastkey(P2)), + probably = cdb_keycheck(P2, "Key2"), + {ok, F2} = cdb_complete(P2), + {ok, P3} = cdb_open_reader(F2), + ?assertMatch("Key2", cdb_lastkey(P3)), + {ok, _FN} = cdb_complete(P3), + {ok, P4} = cdb_open_reader(F2), + ?assertMatch("Key2", cdb_lastkey(P4)), + ok = cdb_close(P4), + ok = file:delete("../test/lastkey.cdb"). + +get_keys_byposition_simple_test() -> + {ok, P1} = cdb_open_writer("../test/poskey.pnd", + #cdb_options{binary_mode=false}), + ok = cdb_put(P1, "Key1", "Value1"), + ok = cdb_put(P1, "Key3", "Value3"), + ok = cdb_put(P1, "Key2", "Value2"), + KeyList = ["Key1", "Key2", "Key3"], + {ok, F2} = cdb_complete(P1), + {ok, P2} = cdb_open_reader(F2, #cdb_options{binary_mode=false}), + PositionList = cdb_getpositions(P2, all), + io:format("Position list of ~w~n", [PositionList]), + ?assertMatch(3, length(PositionList)), + R1 = cdb_directfetch(P2, PositionList, key_only), + ?assertMatch(3, length(R1)), + lists:foreach(fun(Key) -> + Check = lists:member(Key, KeyList), + ?assertMatch(Check, true) end, + R1), + R2 = cdb_directfetch(P2, PositionList, key_size), + ?assertMatch(3, length(R2)), + lists:foreach(fun({Key, _Size}) -> + Check = lists:member(Key, KeyList), + ?assertMatch(Check, true) end, + R2), + R3 = cdb_directfetch(P2, PositionList, key_value_check), + ?assertMatch(3, length(R3)), + lists:foreach(fun({Key, Value, Check}) -> + ?assertMatch(Check, true), + {K, V} = cdb_get(P2, Key), + ?assertMatch(K, Key), + ?assertMatch(V, Value) end, + R3), + ok = cdb_close(P2), + ok = file:delete(F2). + +generate_sequentialkeys(0, KVList) -> + lists:reverse(KVList); +generate_sequentialkeys(Count, KVList) -> + KV = {"Key" ++ integer_to_list(Count), "Value" ++ integer_to_list(Count)}, + generate_sequentialkeys(Count - 1, KVList ++ [KV]). 
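+%% Example (illustrative): generate_sequentialkeys(3, []) returns the keys in
+%% ascending order, i.e. [{"Key1","Value1"}, {"Key2","Value2"}, {"Key3","Value3"}].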
+ +get_keys_byposition_manykeys_test() -> + KeyCount = 1024, + {ok, P1} = cdb_open_writer("../test/poskeymany.pnd", + #cdb_options{binary_mode=false}), + KVList = generate_sequentialkeys(KeyCount, []), + lists:foreach(fun({K, V}) -> cdb_put(P1, K, V) end, KVList), + SW1 = os:timestamp(), + {ok, F2} = cdb_complete(P1), + SW2 = os:timestamp(), + io:format("CDB completed in ~w microseconds~n", + [timer:now_diff(SW2, SW1)]), + {ok, P2} = cdb_open_reader(F2, #cdb_options{binary_mode=false}), + SW3 = os:timestamp(), + io:format("CDB opened for read in ~w microseconds~n", + [timer:now_diff(SW3, SW2)]), + PositionList = cdb_getpositions(P2, all), + io:format("Positions fetched in ~w microseconds~n", + [timer:now_diff(os:timestamp(), SW3)]), + L1 = length(PositionList), + ?assertMatch(L1, KeyCount), + + SampleList1 = cdb_getpositions(P2, 10), + ?assertMatch(10, length(SampleList1)), + SampleList2 = cdb_getpositions(P2, KeyCount), + ?assertMatch(KeyCount, length(SampleList2)), + SampleList3 = cdb_getpositions(P2, KeyCount + 1), + ?assertMatch(KeyCount, length(SampleList3)), + + ok = cdb_close(P2), + ok = file:delete(F2). + + +nokeys_test() -> + {ok, P1} = cdb_open_writer("../test/nohash_emptyfile.pnd", + #cdb_options{binary_mode=false}), + {ok, F2} = cdb_complete(P1), + {ok, P2} = cdb_open_reader(F2, #cdb_options{binary_mode=false}), + io:format("FirstKey is ~s~n", [cdb_firstkey(P2)]), + io:format("LastKey is ~s~n", [cdb_lastkey(P2)]), + ?assertMatch(empty, cdb_firstkey(P2)), + ?assertMatch(empty, cdb_lastkey(P2)), + ok = cdb_close(P2), + ok = file:delete(F2). + +mput_test() -> + KeyCount = 1024, + {ok, P1} = cdb_open_writer("../test/nohash_keysinfile.pnd", + #cdb_options{binary_mode=false}), + KVList = generate_sequentialkeys(KeyCount, []), + ok = cdb_mput(P1, KVList), + ?assertMatch({"Key1", "Value1"}, cdb_get(P1, "Key1")), + ?assertMatch({"Key1024", "Value1024"}, cdb_get(P1, "Key1024")), + ?assertMatch(missing, cdb_get(P1, "Key1025")), + ?assertMatch(missing, cdb_get(P1, "Key1026")), + {ok, F2} = cdb_complete(P1), + {ok, P2} = cdb_open_reader(F2, #cdb_options{binary_mode=false}), + ?assertMatch("Key1", cdb_firstkey(P2)), + ?assertMatch("Key1024", cdb_lastkey(P2)), + ?assertMatch({"Key1", "Value1"}, cdb_get(P2, "Key1")), + ?assertMatch({"Key1024", "Value1024"}, cdb_get(P2, "Key1024")), + ?assertMatch(missing, cdb_get(P2, "Key1025")), + ?assertMatch(missing, cdb_get(P2, "Key1026")), + ok = cdb_close(P2), + ok = file:delete(F2). + +state_test() -> + {ok, P1} = cdb_open_writer("../test/state_test.pnd", + #cdb_options{binary_mode=false}), + KVList = generate_sequentialkeys(1000, []), + ok = cdb_mput(P1, KVList), + ?assertMatch(probably, cdb_keycheck(P1, "Key1")), + ?assertMatch({"Key1", "Value1"}, cdb_get(P1, "Key1")), + ok = cdb_roll(P1), + ?assertMatch(probably, cdb_keycheck(P1, "Key1")), + ?assertMatch({"Key1", "Value1"}, cdb_get(P1, "Key1")), + ok = cdb_deletepending(P1), + ?assertMatch(probably, cdb_keycheck(P1, "Key1")), + ?assertMatch({"Key1", "Value1"}, cdb_get(P1, "Key1")), + timer:sleep(500), + ?assertMatch(probably, cdb_keycheck(P1, "Key1")), + ?assertMatch({"Key1", "Value1"}, cdb_get(P1, "Key1")), + ok = cdb_close(P1). 
+ +corruptfile_test() -> + file:delete("../test/corrupt_test.pnd"), + {ok, P1} = cdb_open_writer("../test/corrupt_test.pnd", + #cdb_options{binary_mode=false}), + KVList = generate_sequentialkeys(100, []), + ok = cdb_mput(P1, []), % Not relevant to this test, but needs testing + lists:foreach(fun({K, V}) -> cdb_put(P1, K, V) end, KVList), + ?assertMatch(probably, cdb_keycheck(P1, "Key1")), + ?assertMatch({"Key1", "Value1"}, cdb_get(P1, "Key1")), + ?assertMatch({"Key100", "Value100"}, cdb_get(P1, "Key100")), + ok = cdb_close(P1), + lists:foreach(fun(Offset) -> corrupt_testfile_at_offset(Offset) end, + lists:seq(1, 40)), + ok = file:delete("../test/corrupt_test.pnd"). + +corrupt_testfile_at_offset(Offset) -> + {ok, F1} = file:open("../test/corrupt_test.pnd", ?WRITE_OPS), + {ok, EofPos} = file:position(F1, eof), + file:position(F1, EofPos - Offset), + ok = file:truncate(F1), + ok = file:close(F1), + {ok, P2} = cdb_open_writer("../test/corrupt_test.pnd", + #cdb_options{binary_mode=false}), + ?assertMatch(probably, cdb_keycheck(P2, "Key1")), + ?assertMatch({"Key1", "Value1"}, cdb_get(P2, "Key1")), + ?assertMatch(missing, cdb_get(P2, "Key100")), + ok = cdb_put(P2, "Key100", "Value100"), + ?assertMatch({"Key100", "Value100"}, cdb_get(P2, "Key100")), + ok = cdb_close(P2). + +-endif. diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl new file mode 100644 index 0000000..3c0e598 --- /dev/null +++ b/src/leveled_codec.erl @@ -0,0 +1,400 @@ +%% -------- Key Codec --------- +%% +%% Functions for manipulating keys and values within leveled. +%% +%% +%% Within the LEDGER: +%% Keys are of the form - +%% {Tag, Bucket, Key, SubKey|null} +%% Values are of the form +%% {SQN, Status, MD} +%% +%% Within the JOURNAL: +%% Keys are of the form - +%% {SQN, LedgerKey} +%% Values are of the form +%% {Object, IndexSpecs} (as a binary) +%% +%% IndexSpecs are of the form of a Ledger Key/Value +%% +%% Tags need to be set during PUT operations and each Tag used must be +%% supported in an extract_metadata and a build_metadata_object function clause +%% +%% Currently the only tags supported are: +%% - o (standard objects) +%% - o_rkv (riak objects) +%% - i (index entries) + + +-module(leveled_codec). + +-include("include/leveled.hrl"). + +-include_lib("eunit/include/eunit.hrl"). + +-export([ + inker_reload_strategy/1, + strip_to_keyonly/1, + strip_to_seqonly/1, + strip_to_statusonly/1, + strip_to_keyseqstatusonly/1, + strip_to_keyseqonly/1, + striphead_to_details/1, + is_active/3, + endkey_passed/2, + key_dominates/2, + maybe_reap_expiredkey/2, + print_key/1, + to_ledgerkey/3, + to_ledgerkey/5, + from_ledgerkey/1, + to_inkerkv/4, + from_inkerkv/1, + from_journalkey/1, + compact_inkerkvc/2, + split_inkvalue/1, + check_forinkertype/2, + create_value_for_journal/1, + build_metadata_object/2, + generate_ledgerkv/5, + get_size/2, + get_keyandhash/2, + convert_indexspecs/5, + riakto_keydetails/1, + generate_uuid/0, + integer_now/0]). + + +%% Credit to +%% https://github.com/afiskon/erlang-uuid-v4/blob/master/src/uuid.erl +generate_uuid() -> + <> = crypto:rand_bytes(16), + io_lib:format("~8.16.0b-~4.16.0b-4~3.16.0b-~4.16.0b-~12.16.0b", + [A, B, C band 16#0fff, D band 16#3fff bor 16#8000, E]). + +inker_reload_strategy(AltList) -> + ReloadStrategy0 = [{?RIAK_TAG, retain}, {?STD_TAG, retain}], + lists:foldl(fun({X, Y}, SList) -> + lists:keyreplace(X, 1, SList, {X, Y}) + end, + ReloadStrategy0, + AltList). + +strip_to_keyonly({keyonly, K}) -> K; +strip_to_keyonly({K, _V}) -> K. 
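+%% Illustrative example (not part of the original module): the default reload
+%% strategy from inker_reload_strategy/1 above retains key changes for both
+%% tags, and an override only replaces the matching entry, e.g.
+%%   inker_reload_strategy([{?STD_TAG, recovr}]) ->
+%%       [{?RIAK_TAG, retain}, {?STD_TAG, recovr}]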
+ +strip_to_keyseqstatusonly({K, {SeqN, St, _MD}}) -> {K, SeqN, St}. + +strip_to_statusonly({_, {_, St, _}}) -> St. + +strip_to_seqonly({_, {SeqN, _, _}}) -> SeqN. + +strip_to_keyseqonly({LK, {SeqN, _, _}}) -> {LK, SeqN}. + +striphead_to_details({SeqN, St, MD}) -> {SeqN, St, MD}. + +key_dominates(LeftKey, RightKey) -> + case {LeftKey, RightKey} of + {{LK, _LVAL}, {RK, _RVAL}} when LK < RK -> + left_hand_first; + {{LK, _LVAL}, {RK, _RVAL}} when RK < LK -> + right_hand_first; + {{LK, {LSN, _LST, _LMD}}, {RK, {RSN, _RST, _RMD}}} + when LK == RK, LSN >= RSN -> + left_hand_dominant; + {{LK, {LSN, _LST, _LMD}}, {RK, {RSN, _RST, _RMD}}} + when LK == RK, LSN < RSN -> + right_hand_dominant + end. + + +maybe_reap_expiredkey(KV, LevelD) -> + Status = strip_to_statusonly(KV), + maybe_reap(Status, LevelD). + +maybe_reap({_, infinity}, _) -> + false; % key is not set to expire +maybe_reap({_, TS}, {true, CurrTS}) when CurrTS > TS -> + true; % basement and ready to expire +maybe_reap(tomb, {true, _CurrTS}) -> + true; % always expire in basement +maybe_reap(_, _) -> + false. + +is_active(Key, Value, Now) -> + case strip_to_statusonly({Key, Value}) of + {active, infinity} -> + true; + tomb -> + false; + {active, TS} when TS >= Now -> + true; + {active, _TS} -> + false + end. + +from_ledgerkey({Tag, Bucket, {_IdxField, IdxValue}, Key}) + when Tag == ?IDX_TAG -> + {Bucket, Key, IdxValue}; +from_ledgerkey({_Tag, Bucket, Key, null}) -> + {Bucket, Key}. + +to_ledgerkey(Bucket, Key, Tag, Field, Value) when Tag == ?IDX_TAG -> + {?IDX_TAG, Bucket, {Field, Value}, Key}. + +to_ledgerkey(Bucket, Key, Tag) -> + {Tag, Bucket, Key, null}. + +%% Return the Key, Value and Hash Option for this object. The hash option +%% indicates whether the key would ever be looked up directly, and so if it +%% requires an entry in the hash table +to_inkerkv(LedgerKey, SQN, to_fetch, null) -> + {{SQN, ?INKT_STND, LedgerKey}, null, true}; +to_inkerkv(LedgerKey, SQN, Object, KeyChanges) -> + InkerType = check_forinkertype(LedgerKey, Object), + Value = create_value_for_journal({Object, KeyChanges}), + {{SQN, InkerType, LedgerKey}, Value}. + +%% Used when fetching objects, so only handles standard, hashable entries +from_inkerkv(Object) -> + case Object of + {{SQN, ?INKT_STND, PK}, Bin} when is_binary(Bin) -> + {{SQN, PK}, binary_to_term(Bin)}; + {{SQN, ?INKT_STND, PK}, Term} -> + {{SQN, PK}, Term}; + _ -> + Object + end. + +from_journalkey({SQN, _Type, LedgerKey}) -> + {SQN, LedgerKey}. + +compact_inkerkvc({_InkerKey, crc_wonky, false}, _Strategy) -> + skip; +compact_inkerkvc({{_SQN, ?INKT_TOMB, _LK}, _V, _CrcCheck}, _Strategy) -> + skip; +compact_inkerkvc({{SQN, ?INKT_KEYD, LK}, V, CrcCheck}, Strategy) -> + {Tag, _, _, _} = LK, + {Tag, TagStrat} = lists:keyfind(Tag, 1, Strategy), + case TagStrat of + retain -> + {retain, {{SQN, ?INKT_KEYD, LK}, V, CrcCheck}}; + TagStrat -> + {TagStrat, null} + end; +compact_inkerkvc({{SQN, ?INKT_STND, LK}, V, CrcCheck}, Strategy) -> + {Tag, _, _, _} = LK, + {Tag, TagStrat} = lists:keyfind(Tag, 1, Strategy), + case TagStrat of + retain -> + {_V, KeyDeltas} = split_inkvalue(V), + {TagStrat, {{SQN, ?INKT_KEYD, LK}, {null, KeyDeltas}, CrcCheck}}; + TagStrat -> + {TagStrat, null} + end. + +split_inkvalue(VBin) -> + case is_binary(VBin) of + true -> + binary_to_term(VBin); + false -> + VBin + end. + +check_forinkertype(_LedgerKey, delete) -> + ?INKT_TOMB; +check_forinkertype(_LedgerKey, _Object) -> + ?INKT_STND. 
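+%% Illustrative examples (not part of the original module), using the ledger
+%% key and value shapes described in the module header comment:
+%%   {o, <<"B">>, <<"K">>, null} = to_ledgerkey(<<"B">>, <<"K">>, ?STD_TAG),
+%%   {<<"B">>, <<"K">>} = from_ledgerkey({?STD_TAG, <<"B">>, <<"K">>, null}),
+%%   left_hand_dominant =
+%%       key_dominates({{?STD_TAG, <<"B">>, <<"K">>, null}, {7, {active, infinity}, null}},
+%%                     {{?STD_TAG, <<"B">>, <<"K">>, null}, {3, tomb, null}}).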
+ +create_value_for_journal(Value) -> + case Value of + {Object, KeyChanges} -> + term_to_binary({Object, KeyChanges}, [compressed]); + Value when is_binary(Value) -> + Value + end. + + + +hash(Obj) -> + erlang:phash2(term_to_binary(Obj)). + +% Return a tuple of strings to ease the printing of keys to logs +print_key(Key) -> + {A_STR, B_TERM, C_TERM} = case Key of + {?STD_TAG, B, K, _SK} -> + {"Object", B, K}; + {?RIAK_TAG, B, K, _SK} -> + {"RiakObject", B, K}; + {?IDX_TAG, B, {F, _V}, _K} -> + {"Index", B, F} + end, + B_STR = turn_to_string(B_TERM), + C_STR = turn_to_string(C_TERM), + {A_STR, B_STR, C_STR}. + +turn_to_string(Item) -> + if + is_binary(Item) == true -> + binary_to_list(Item); + is_integer(Item) == true -> + integer_to_list(Item); + is_list(Item) == true -> + Item; + true -> + [Output] = io_lib:format("~w", [Item]), + Output + end. + + +% Compare a key against a query key, only comparing elements that are non-null +% in the Query key. This is used for comparing against end keys in queries. +endkey_passed({EK1, null, null, null}, {CK1, _, _, _}) -> + EK1 < CK1; +endkey_passed({EK1, EK2, null, null}, {CK1, CK2, _, _}) -> + {EK1, EK2} < {CK1, CK2}; +endkey_passed({EK1, EK2, EK3, null}, {CK1, CK2, CK3, _}) -> + {EK1, EK2, EK3} < {CK1, CK2, CK3}; +endkey_passed(EndKey, CheckingKey) -> + EndKey < CheckingKey. + +convert_indexspecs(IndexSpecs, Bucket, Key, SQN, TTL) -> + lists:map(fun({IndexOp, IdxField, IdxValue}) -> + Status = case IndexOp of + add -> + {active, TTL}; + remove -> + %% TODO: timestamps for delayed reaping + tomb + end, + {to_ledgerkey(Bucket, Key, ?IDX_TAG, + IdxField, IdxValue), + {SQN, Status, null}} + end, + IndexSpecs). + +generate_ledgerkv(PrimaryKey, SQN, Obj, Size, TS) -> + {Tag, Bucket, Key, _} = PrimaryKey, + Status = case Obj of + delete -> + tomb; + _ -> + {active, TS} + end, + {Bucket, + Key, + {PrimaryKey, {SQN, Status, extract_metadata(Obj, Size, Tag)}}}. + + +integer_now() -> + integer_time(os:timestamp()). + +integer_time(TS) -> + DT = calendar:now_to_universal_time(TS), + calendar:datetime_to_gregorian_seconds(DT). + +extract_metadata(Obj, Size, ?RIAK_TAG) -> + riak_extract_metadata(Obj, Size); +extract_metadata(Obj, Size, ?STD_TAG) -> + {hash(Obj), Size}. + +get_size(PK, Value) -> + {Tag, _Bucket, _Key, _} = PK, + {_, _, MD} = Value, + case Tag of + ?RIAK_TAG -> + {_RMD, _VC, _Hash, Size} = MD, + Size; + ?STD_TAG -> + {_Hash, Size} = MD, + Size + end. + +get_keyandhash(LK, Value) -> + {Tag, Bucket, Key, _} = LK, + {_, _, MD} = Value, + case Tag of + ?RIAK_TAG -> + {_RMD, _VC, Hash, _Size} = MD, + {Bucket, Key, Hash}; + ?STD_TAG -> + {Hash, _Size} = MD, + {Bucket, Key, Hash} + end. + + +build_metadata_object(PrimaryKey, MD) -> + {Tag, Bucket, Key, null} = PrimaryKey, + case Tag of + ?RIAK_TAG -> + riak_metadata_object(Bucket, Key, MD); + ?STD_TAG -> + MD + end. + + + + +riak_metadata_object(Bucket, Key, MD) -> + {RMD, VC, _Hash, _Size} = MD, + Contents = lists:foldl(fun(X, Acc) -> Acc ++ [#r_content{metadata=X}] end, + [], + RMD), + #r_object{contents=Contents, bucket=Bucket, key=Key, vclock=VC}. + +riak_extract_metadata(delete, Size) -> + {delete, null, null, Size}; +riak_extract_metadata(Obj, Size) -> + {get_metadatas(Obj), vclock(Obj), riak_hash(Obj), Size}. + +riak_hash(Obj=#r_object{}) -> + Vclock = vclock(Obj), + UpdObj = set_vclock(Obj, lists:sort(Vclock)), + erlang:phash2(term_to_binary(UpdObj)). + +riakto_keydetails(Object) -> + {Object#r_object.bucket, Object#r_object.key}. 
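+%% Illustrative examples (not part of the original module); endkey_passed/2
+%% returns false while the checked key is still inside the query range:
+%%   {"Object", "B1", "K1"} = print_key({?STD_TAG, <<"B1">>, <<"K1">>, null}),
+%%   false = endkey_passed({?STD_TAG, <<"B1">>, null, null},
+%%                         {?STD_TAG, <<"B1">>, <<"K1">>, null}).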
+ +get_metadatas(#r_object{contents=Contents}) -> + [Content#r_content.metadata || Content <- Contents]. + +set_vclock(Object=#r_object{}, VClock) -> Object#r_object{vclock=VClock}. + +vclock(#r_object{vclock=VClock}) -> VClock. + + + + + +%%%============================================================================ +%%% Test +%%%============================================================================ + +-ifdef(TEST). + + +indexspecs_test() -> + IndexSpecs = [{add, "t1_int", 456}, + {add, "t1_bin", "adbc123"}, + {remove, "t1_bin", "abdc456"}], + Changes = convert_indexspecs(IndexSpecs, "Bucket", "Key2", 1, infinity), + ?assertMatch({{i, "Bucket", {"t1_int", 456}, "Key2"}, + {1, {active, infinity}, null}}, lists:nth(1, Changes)), + ?assertMatch({{i, "Bucket", {"t1_bin", "adbc123"}, "Key2"}, + {1, {active, infinity}, null}}, lists:nth(2, Changes)), + ?assertMatch({{i, "Bucket", {"t1_bin", "abdc456"}, "Key2"}, + {1, tomb, null}}, lists:nth(3, Changes)). + +endkey_passed_test() -> + TestKey = {i, null, null, null}, + K1 = {i, 123, {"a", "b"}, <<>>}, + K2 = {o, 123, {"a", "b"}, <<>>}, + ?assertMatch(false, endkey_passed(TestKey, K1)), + ?assertMatch(true, endkey_passed(TestKey, K2)). + +stringcheck_test() -> + ?assertMatch("Bucket", turn_to_string("Bucket")), + ?assertMatch("Bucket", turn_to_string(<<"Bucket">>)), + ?assertMatch("bucket", turn_to_string(bucket)). + +-endif. \ No newline at end of file diff --git a/src/leveled_iclerk.erl b/src/leveled_iclerk.erl new file mode 100644 index 0000000..5c69362 --- /dev/null +++ b/src/leveled_iclerk.erl @@ -0,0 +1,818 @@ +%% -------- Inker's Clerk --------- +%% +%% The Inker's clerk runs compaction jobs on behalf of the Inker, informing the +%% Inker of any manifest changes when complete. +%% +%% -------- Value Compaction --------- +%% +%% Compaction requires the Inker to have four different types of keys +%% * stnd - A standard key of the form {SQN, stnd, LedgerKey} which maps to a +%% value of {Object, KeyDeltas} +%% * tomb - A tombstone for a LedgerKey {SQN, tomb, LedgerKey} +%% * keyd - An object containing key deltas only of the form +%% {SQN, keyd, LedgerKey} which maps to a value of {KeyDeltas} +%% +%% Each LedgerKey has a Tag, and for each Tag there should be a compaction +%% strategy, which will be set to one of the following: +%% * retain - KeyDeltas must be retained permanently, only values can be +%% compacted (if replaced or not_present in the ledger) +%% * recalc - The full object can be removed through comapction (if replaced or +%% not_present in the ledger), as each object with that tag can have the Key +%% Deltas recreated by passing into an assigned recalc function {LedgerKey, +%% SQN, Object, KeyDeltas, PencillerSnapshot} +%% * recovr - At compaction time this is equivalent to recalc, only KeyDeltas +%% are lost when reloading the Ledger from the Journal, and it is assumed that +%% those deltas will be resolved through external anti-entropy (e.g. read +%% repair or AAE) - or alternatively the risk of loss of persisted data from +%% the ledger is accepted for this data type +%% +%% During the compaction process for the Journal, the file chosen for +%% compaction is scanned in SQN order, and a FilterFun is passed (which will +%% normally perform a check against a snapshot of the persisted part of the +%% Ledger). If the given key is of type stnd, and this object is no longer the +%% active object under the LedgerKey, then the object can be compacted out of +%% the journal. 
This will lead to either its removal (if the strategy for the
+%% Tag is recovr or recalc), or its replacement with a KeyDelta object.
+%%
+%% Tombstones cannot be reaped through this compaction process.
+%%
+%% Currently, KeyDeltas are also reaped if the LedgerKey has been updated and
+%% the Tag has a recovr strategy.  This may be the case when KeyDeltas are used
+%% as a way of directly representing a change, and where anti-entropy can
+%% recover from a loss.
+%%
+%% -------- Tombstone Reaping ---------
+%%
+%% Value compaction does not remove tombstones from the database, and so a
+%% separate compaction job is required for this.
+%%
+%% Tombstones can only be reaped for Tags set to recovr or recalc.
+%%
+%% The tombstone reaping process should select a file to compact, and then
+%% take that file and discover the LedgerKeys of all reapable tombstones.
+%% The ledger should then be scanned from SQN 0 looking for unreaped objects
+%% before the tombstone.  If no such objects exist for that tombstone, it can
+%% now be reaped as part of the compaction job.
+%%
+%% Other tombstones cannot be reaped, as otherwise on loading a ledger an old
+%% version of the object may re-emerge.
+
+-module(leveled_iclerk).
+
+-behaviour(gen_server).
+
+-include("include/leveled.hrl").
+
+-export([init/1,
+        handle_call/3,
+        handle_cast/2,
+        handle_info/2,
+        terminate/2,
+        clerk_new/1,
+        clerk_compact/6,
+        clerk_hashtablecalc/3,
+        clerk_stop/1,
+        code_change/3]).
+
+-include_lib("eunit/include/eunit.hrl").
+
+-define(JOURNAL_FILEX, "cdb").
+-define(PENDING_FILEX, "pnd").
+-define(SAMPLE_SIZE, 200).
+-define(BATCH_SIZE, 32).
+-define(BATCHES_TO_CHECK, 8).
+%% How many consecutive files to compact in one run
+-define(MAX_COMPACTION_RUN, 4).
+%% Sliding scale to allow preference of longer runs up to maximum
+-define(SINGLEFILE_COMPACTION_TARGET, 60.0).
+-define(MAXRUN_COMPACTION_TARGET, 80.0).
+-define(CRC_SIZE, 4).
+-define(DEFAULT_RELOAD_STRATEGY, leveled_codec:inker_reload_strategy([])).
+
+-record(state, {inker :: pid(),
+                max_run_length :: integer(),
+                cdb_options,
+                reload_strategy = ?DEFAULT_RELOAD_STRATEGY :: list()}).
+
+-record(candidate, {low_sqn :: integer(),
+                    filename :: string(),
+                    journal :: pid(),
+                    compaction_perc :: float()}).
+
+
+%%%============================================================================
+%%% API
+%%%============================================================================
+
+clerk_new(InkerClerkOpts) ->
+    gen_server:start(?MODULE, [InkerClerkOpts], []).
+
+clerk_compact(Pid, Checker, InitiateFun, FilterFun, Inker, Timeout) ->
+    gen_server:cast(Pid,
+                    {compact,
+                        Checker,
+                        InitiateFun,
+                        FilterFun,
+                        Inker,
+                        Timeout}).
+
+clerk_hashtablecalc(HashTree, StartPos, CDBpid) ->
+    {ok, Clerk} = gen_server:start(?MODULE, [#iclerk_options{}], []),
+    gen_server:cast(Clerk, {hashtable_calc, HashTree, StartPos, CDBpid}).
+
+clerk_stop(Pid) ->
+    gen_server:cast(Pid, stop).
+ +%%%============================================================================ +%%% gen_server callbacks +%%%============================================================================ + +init([IClerkOpts]) -> + ReloadStrategy = IClerkOpts#iclerk_options.reload_strategy, + case IClerkOpts#iclerk_options.max_run_length of + undefined -> + {ok, #state{max_run_length = ?MAX_COMPACTION_RUN, + inker = IClerkOpts#iclerk_options.inker, + cdb_options = IClerkOpts#iclerk_options.cdb_options, + reload_strategy = ReloadStrategy}}; + MRL -> + {ok, #state{max_run_length = MRL, + inker = IClerkOpts#iclerk_options.inker, + cdb_options = IClerkOpts#iclerk_options.cdb_options, + reload_strategy = ReloadStrategy}} + end. + +handle_call(_Msg, _From, State) -> + {reply, not_supported, State}. + +handle_cast({compact, Checker, InitiateFun, FilterFun, Inker, _Timeout}, + State) -> + % Need to fetch manifest at start rather than have it be passed in + % Don't want to process a queued call waiting on an old manifest + [_Active|Manifest] = leveled_inker:ink_getmanifest(Inker), + MaxRunLength = State#state.max_run_length, + {FilterServer, MaxSQN} = InitiateFun(Checker), + CDBopts = State#state.cdb_options, + FP = CDBopts#cdb_options.file_path, + ok = filelib:ensure_dir(FP), + + Candidates = scan_all_files(Manifest, FilterFun, FilterServer, MaxSQN), + BestRun0 = assess_candidates(Candidates, MaxRunLength), + case score_run(BestRun0, MaxRunLength) of + Score when Score > 0.0 -> + BestRun1 = sort_run(BestRun0), + print_compaction_run(BestRun1, MaxRunLength), + {ManifestSlice, + PromptDelete} = compact_files(BestRun1, + CDBopts, + FilterFun, + FilterServer, + MaxSQN, + State#state.reload_strategy), + FilesToDelete = lists:map(fun(C) -> + {C#candidate.low_sqn, + C#candidate.filename, + C#candidate.journal} + end, + BestRun1), + leveled_log:log("IC002", [length(FilesToDelete)]), + case is_process_alive(Inker) of + true -> + update_inker(Inker, + ManifestSlice, + FilesToDelete, + PromptDelete), + {noreply, State}; + false -> + leveled_log:log("IC001", []), + {stop, normal, State} + end; + Score -> + leveled_log:log("IC003", [Score]), + ok = leveled_inker:ink_compactioncomplete(Inker), + {noreply, State} + end; +handle_cast({hashtable_calc, HashTree, StartPos, CDBpid}, State) -> + {IndexList, HashTreeBin} = leveled_cdb:hashtable_calc(HashTree, StartPos), + ok = leveled_cdb:cdb_returnhashtable(CDBpid, IndexList, HashTreeBin), + {stop, normal, State}; +handle_cast(stop, State) -> + {stop, normal, State}. + +handle_info(_Info, State) -> + {noreply, State}. + +terminate(_Reason, _State) -> + ok. + +code_change(_OldVsn, State, _Extra) -> + {ok, State}. 
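+%% Illustrative sketch (not part of the original module): the Checker,
+%% InitiateFun and FilterFun passed in via clerk_compact/6 are expected to
+%% behave roughly as below - InitiateFun turns the Checker into a
+%% {FilterServer, MaxSQN} pair, and FilterFun then reports whether a key at a
+%% given SQN is still the current version.  The helper names used here are
+%% hypothetical placeholders, not part of any real API:
+%%   InitiateFun = fun(Checker) ->
+%%                     {snapshot_ledger(Checker), current_journal_sqn(Checker)}
+%%                 end,
+%%   FilterFun = fun(FilterServer, LedgerKey, SQN) ->
+%%                   is_current_in_ledger(FilterServer, LedgerKey, SQN)
+%%               end.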
+ + +%%%============================================================================ +%%% Internal functions +%%%============================================================================ + + +check_single_file(CDB, FilterFun, FilterServer, MaxSQN, SampleSize, BatchSize) -> + FN = leveled_cdb:cdb_filename(CDB), + PositionList = leveled_cdb:cdb_getpositions(CDB, SampleSize), + KeySizeList = fetch_inbatches(PositionList, BatchSize, CDB, []), + R0 = lists:foldl(fun(KS, {ActSize, RplSize}) -> + {{SQN, _Type, PK}, Size} = KS, + Check = FilterFun(FilterServer, PK, SQN), + case {Check, SQN > MaxSQN} of + {true, _} -> + {ActSize + Size - ?CRC_SIZE, RplSize}; + {false, true} -> + {ActSize + Size - ?CRC_SIZE, RplSize}; + _ -> + {ActSize, RplSize + Size - ?CRC_SIZE} + end end, + {0, 0}, + KeySizeList), + {ActiveSize, ReplacedSize} = R0, + Score = case ActiveSize + ReplacedSize of + 0 -> + 100.0; + _ -> + 100 * ActiveSize / (ActiveSize + ReplacedSize) + end, + leveled_log:log("IC004", [FN, Score]), + Score. + +scan_all_files(Manifest, FilterFun, FilterServer, MaxSQN) -> + scan_all_files(Manifest, FilterFun, FilterServer, MaxSQN, []). + +scan_all_files([], _FilterFun, _FilterServer, _MaxSQN, CandidateList) -> + CandidateList; +scan_all_files([Entry|Tail], FilterFun, FilterServer, MaxSQN, CandidateList) -> + {LowSQN, FN, JournalP} = Entry, + CpctPerc = check_single_file(JournalP, + FilterFun, + FilterServer, + MaxSQN, + ?SAMPLE_SIZE, + ?BATCH_SIZE), + scan_all_files(Tail, + FilterFun, + FilterServer, + MaxSQN, + CandidateList ++ + [#candidate{low_sqn = LowSQN, + filename = FN, + journal = JournalP, + compaction_perc = CpctPerc}]). + +fetch_inbatches([], _BatchSize, _CDB, CheckedList) -> + CheckedList; +fetch_inbatches(PositionList, BatchSize, CDB, CheckedList) -> + {Batch, Tail} = if + length(PositionList) >= BatchSize -> + lists:split(BatchSize, PositionList); + true -> + {PositionList, []} + end, + KL_List = leveled_cdb:cdb_directfetch(CDB, Batch, key_size), + fetch_inbatches(Tail, BatchSize, CDB, CheckedList ++ KL_List). + +assess_candidates(AllCandidates, MaxRunLength) -> + NaiveBestRun = assess_candidates(AllCandidates, MaxRunLength, [], []), + case length(AllCandidates) of + L when L > MaxRunLength, MaxRunLength > 1 -> + %% Assess with different offsets from the start + SqL = lists:seq(1, MaxRunLength - 1), + lists:foldl(fun(Counter, BestRun) -> + SubList = lists:nthtail(Counter, + AllCandidates), + assess_candidates(SubList, + MaxRunLength, + [], + BestRun) + end, + NaiveBestRun, + SqL); + _ -> + NaiveBestRun + end. + +assess_candidates([], _MaxRunLength, _CurrentRun0, BestAssessment) -> + BestAssessment; +assess_candidates([HeadC|Tail], MaxRunLength, CurrentRun0, BestAssessment) -> + CurrentRun1 = choose_best_assessment(CurrentRun0 ++ [HeadC], + [HeadC], + MaxRunLength), + assess_candidates(Tail, + MaxRunLength, + CurrentRun1, + choose_best_assessment(CurrentRun1, + BestAssessment, + MaxRunLength)). + + +choose_best_assessment(RunToAssess, BestRun, MaxRunLength) -> + case length(RunToAssess) of + LR1 when LR1 > MaxRunLength -> + BestRun; + _ -> + AssessScore = score_run(RunToAssess, MaxRunLength), + BestScore = score_run(BestRun, MaxRunLength), + if + AssessScore > BestScore -> + RunToAssess; + true -> + BestRun + end + end. 
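+%% Example (illustrative): with a MaxRunLength of 3 and candidate files
+%% [A, B, C, D], the passes above in effect score consecutive runs of up to
+%% three files ([A], [A,B], [A,B,C], [B,C,D] and so on) and keep whichever
+%% run scores best under score_run/2.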
+ +score_run([], _MaxRunLength) -> + 0.0; +score_run(Run, MaxRunLength) -> + TargetIncr = case MaxRunLength of + 1 -> + 0.0; + MaxRunSize -> + (?MAXRUN_COMPACTION_TARGET + - ?SINGLEFILE_COMPACTION_TARGET) + / (MaxRunSize - 1) + end, + Target = ?SINGLEFILE_COMPACTION_TARGET + TargetIncr * (length(Run) - 1), + RunTotal = lists:foldl(fun(Cand, Acc) -> + Acc + Cand#candidate.compaction_perc end, + 0.0, + Run), + Target - RunTotal / length(Run). + + +print_compaction_run(BestRun, MaxRunLength) -> + leveled_log:log("IC005", [length(BestRun), + score_run(BestRun, MaxRunLength)]), + lists:foreach(fun(File) -> + leveled_log:log("IC006", [File#candidate.filename]) + end, + BestRun). + +sort_run(RunOfFiles) -> + CompareFun = fun(Cand1, Cand2) -> + Cand1#candidate.low_sqn =< Cand2#candidate.low_sqn end, + lists:sort(CompareFun, RunOfFiles). + +update_inker(Inker, ManifestSlice, FilesToDelete, PromptDelete) -> + {ok, ManSQN} = leveled_inker:ink_updatemanifest(Inker, + ManifestSlice, + FilesToDelete), + ok = leveled_inker:ink_compactioncomplete(Inker), + leveled_log:log("IC007", []), + case PromptDelete of + true -> + lists:foreach(fun({_SQN, _FN, J2D}) -> + leveled_cdb:cdb_deletepending(J2D, + ManSQN, + Inker) + end, + FilesToDelete), + ok; + false -> + ok + end. + +compact_files(BestRun, CDBopts, FilterFun, FilterServer, MaxSQN, RStrategy) -> + BatchesOfPositions = get_all_positions(BestRun, []), + compact_files(BatchesOfPositions, + CDBopts, + null, + FilterFun, + FilterServer, + MaxSQN, + RStrategy, + [], + true). + + +compact_files([], _CDBopts, null, _FilterFun, _FilterServer, _MaxSQN, + _RStrategy, ManSlice0, PromptDelete0) -> + {ManSlice0, PromptDelete0}; +compact_files([], _CDBopts, ActiveJournal0, _FilterFun, _FilterServer, _MaxSQN, + _RStrategy, ManSlice0, PromptDelete0) -> + ManSlice1 = ManSlice0 ++ generate_manifest_entry(ActiveJournal0), + {ManSlice1, PromptDelete0}; +compact_files([Batch|T], CDBopts, ActiveJournal0, + FilterFun, FilterServer, MaxSQN, + RStrategy, ManSlice0, PromptDelete0) -> + {SrcJournal, PositionList} = Batch, + KVCs0 = leveled_cdb:cdb_directfetch(SrcJournal, + PositionList, + key_value_check), + R0 = filter_output(KVCs0, + FilterFun, + FilterServer, + MaxSQN, + RStrategy), + {KVCs1, PromptDelete1} = R0, + PromptDelete2 = case {PromptDelete0, PromptDelete1} of + {true, true} -> + true; + _ -> + false + end, + {ActiveJournal1, ManSlice1} = write_values(KVCs1, + CDBopts, + ActiveJournal0, + ManSlice0), + compact_files(T, CDBopts, ActiveJournal1, FilterFun, FilterServer, MaxSQN, + RStrategy, ManSlice1, PromptDelete2). + +get_all_positions([], PositionBatches) -> + PositionBatches; +get_all_positions([HeadRef|RestOfBest], PositionBatches) -> + SrcJournal = HeadRef#candidate.journal, + Positions = leveled_cdb:cdb_getpositions(SrcJournal, all), + leveled_log:log("IC008", [HeadRef#candidate.filename, length(Positions)]), + Batches = split_positions_into_batches(lists:sort(Positions), + SrcJournal, + []), + get_all_positions(RestOfBest, PositionBatches ++ Batches). + +split_positions_into_batches([], _Journal, Batches) -> + Batches; +split_positions_into_batches(Positions, Journal, Batches) -> + {ThisBatch, Tail} = if + length(Positions) > ?BATCH_SIZE -> + lists:split(?BATCH_SIZE, Positions); + true -> + {Positions, []} + end, + split_positions_into_batches(Tail, + Journal, + Batches ++ [{Journal, ThisBatch}]). 
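+
+%% A worked example of score_run/2 above, assuming the default targets of
+%% 60.0 (?SINGLEFILE_COMPACTION_TARGET) and 80.0 (?MAXRUN_COMPACTION_TARGET)
+%% with a maximum run length of 4, as noted in find_bestrun_test/0 and
+%% exercised by simple_score_test/0 below:
+%%
+%%   Run = [#candidate{compaction_perc = 75.0},
+%%          #candidate{compaction_perc = 75.0},
+%%          #candidate{compaction_perc = 76.0},
+%%          #candidate{compaction_perc = 70.0}],
+%%   %% Target = 60.0 + ((80.0 - 60.0) / (4 - 1)) * (4 - 1) = 80.0
+%%   %% Mean compaction_perc = (75.0 + 75.0 + 76.0 + 70.0) / 4 = 74.0
+%%   6.0 = score_run(Run, 4).
+%%
+%% Only runs with a positive score are compacted (see the compact cast in the
+%% gen_server callbacks above).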
+ + +filter_output(KVCs, FilterFun, FilterServer, MaxSQN, ReloadStrategy) -> + lists:foldl(fun(KVC0, {Acc, PromptDelete}) -> + R = leveled_codec:compact_inkerkvc(KVC0, ReloadStrategy), + case R of + skip -> + {Acc, PromptDelete}; + {TStrat, KVC1} -> + {K, _V, CrcCheck} = KVC0, + {SQN, LedgerKey} = leveled_codec:from_journalkey(K), + KeyValid = FilterFun(FilterServer, LedgerKey, SQN), + case {KeyValid, CrcCheck, SQN > MaxSQN, TStrat} of + {true, true, _, _} -> + {Acc ++ [KVC0], PromptDelete}; + {false, true, true, _} -> + {Acc ++ [KVC0], PromptDelete}; + {false, true, false, retain} -> + {Acc ++ [KVC1], PromptDelete}; + {false, true, false, _} -> + {Acc, PromptDelete} + end + end + end, + {[], true}, + KVCs). + + +write_values([], _CDBopts, Journal0, ManSlice0) -> + {Journal0, ManSlice0}; +write_values(KVCList, CDBopts, Journal0, ManSlice0) -> + KVList = lists:map(fun({K, V, _C}) -> + {K, leveled_codec:create_value_for_journal(V)} + end, + KVCList), + {ok, Journal1} = case Journal0 of + null -> + {TK, _TV} = lists:nth(1, KVList), + {SQN, _LK} = leveled_codec:from_journalkey(TK), + FP = CDBopts#cdb_options.file_path, + FN = leveled_inker:filepath(FP, + SQN, + compact_journal), + leveled_log:log("IC009", [FN]), + leveled_cdb:cdb_open_writer(FN, + CDBopts); + _ -> + {ok, Journal0} + end, + R = leveled_cdb:cdb_mput(Journal1, KVList), + case R of + ok -> + {Journal1, ManSlice0}; + roll -> + ManSlice1 = ManSlice0 ++ generate_manifest_entry(Journal1), + write_values(KVCList, CDBopts, null, ManSlice1) + end. + + +generate_manifest_entry(ActiveJournal) -> + {ok, NewFN} = leveled_cdb:cdb_complete(ActiveJournal), + {ok, PidR} = leveled_cdb:cdb_open_reader(NewFN), + {StartSQN, _Type, _PK} = leveled_cdb:cdb_firstkey(PidR), + [{StartSQN, NewFN, PidR}]. + + + + + + + +%%%============================================================================ +%%% Test +%%%============================================================================ + + +-ifdef(TEST). + +simple_score_test() -> + Run1 = [#candidate{compaction_perc = 75.0}, + #candidate{compaction_perc = 75.0}, + #candidate{compaction_perc = 76.0}, + #candidate{compaction_perc = 70.0}], + ?assertMatch(6.0, score_run(Run1, 4)), + Run2 = [#candidate{compaction_perc = 75.0}], + ?assertMatch(-15.0, score_run(Run2, 4)), + ?assertMatch(0.0, score_run([], 4)), + Run3 = [#candidate{compaction_perc = 100.0}], + ?assertMatch(-40.0, score_run(Run3, 4)). + +score_compare_test() -> + Run1 = [#candidate{compaction_perc = 75.0}, + #candidate{compaction_perc = 75.0}, + #candidate{compaction_perc = 76.0}, + #candidate{compaction_perc = 70.0}], + ?assertMatch(6.0, score_run(Run1, 4)), + Run2 = [#candidate{compaction_perc = 75.0}], + ?assertMatch(Run1, choose_best_assessment(Run1, Run2, 4)), + ?assertMatch(Run2, choose_best_assessment(Run1 ++ Run2, Run2, 4)). + +find_bestrun_test() -> +%% Tests dependent on these defaults +%% -define(MAX_COMPACTION_RUN, 4). +%% -define(SINGLEFILE_COMPACTION_TARGET, 60.0). +%% -define(MAXRUN_COMPACTION_TARGET, 80.0). 
+%% Tested first with blocks significant as no back-tracking + Block1 = [#candidate{compaction_perc = 75.0}, + #candidate{compaction_perc = 85.0}, + #candidate{compaction_perc = 62.0}, + #candidate{compaction_perc = 70.0}], + Block2 = [#candidate{compaction_perc = 58.0}, + #candidate{compaction_perc = 95.0}, + #candidate{compaction_perc = 95.0}, + #candidate{compaction_perc = 65.0}], + Block3 = [#candidate{compaction_perc = 90.0}, + #candidate{compaction_perc = 100.0}, + #candidate{compaction_perc = 100.0}, + #candidate{compaction_perc = 100.0}], + Block4 = [#candidate{compaction_perc = 75.0}, + #candidate{compaction_perc = 76.0}, + #candidate{compaction_perc = 76.0}, + #candidate{compaction_perc = 60.0}], + Block5 = [#candidate{compaction_perc = 80.0}, + #candidate{compaction_perc = 80.0}], + CList0 = Block1 ++ Block2 ++ Block3 ++ Block4 ++ Block5, + ?assertMatch(Block4, assess_candidates(CList0, 4, [], [])), + CList1 = CList0 ++ [#candidate{compaction_perc = 20.0}], + ?assertMatch([#candidate{compaction_perc = 20.0}], + assess_candidates(CList1, 4, [], [])), + CList2 = Block4 ++ Block3 ++ Block2 ++ Block1 ++ Block5, + ?assertMatch(Block4, assess_candidates(CList2, 4, [], [])), + CList3 = Block5 ++ Block1 ++ Block2 ++ Block3 ++ Block4, + ?assertMatch([#candidate{compaction_perc = 62.0}, + #candidate{compaction_perc = 70.0}, + #candidate{compaction_perc = 58.0}], + assess_candidates(CList3, 4, [], [])), + %% Now do some back-tracking to get a genuinely optimal solution without + %% needing to re-order + ?assertMatch([#candidate{compaction_perc = 62.0}, + #candidate{compaction_perc = 70.0}, + #candidate{compaction_perc = 58.0}], + assess_candidates(CList0, 4)), + ?assertMatch([#candidate{compaction_perc = 62.0}, + #candidate{compaction_perc = 70.0}, + #candidate{compaction_perc = 58.0}], + assess_candidates(CList0, 5)), + ?assertMatch([#candidate{compaction_perc = 62.0}, + #candidate{compaction_perc = 70.0}, + #candidate{compaction_perc = 58.0}, + #candidate{compaction_perc = 95.0}, + #candidate{compaction_perc = 95.0}, + #candidate{compaction_perc = 65.0}], + assess_candidates(CList0, 6)). + +test_ledgerkey(Key) -> + {o, "Bucket", Key, null}. + +test_inkerkv(SQN, Key, V, IdxSpecs) -> + {{SQN, ?INKT_STND, test_ledgerkey(Key)}, term_to_binary({V, IdxSpecs})}. + +fetch_testcdb(RP) -> + FN1 = leveled_inker:filepath(RP, 1, new_journal), + {ok, CDB1} = leveled_cdb:cdb_open_writer(FN1, #cdb_options{}), + {K1, V1} = test_inkerkv(1, "Key1", "Value1", []), + {K2, V2} = test_inkerkv(2, "Key2", "Value2", []), + {K3, V3} = test_inkerkv(3, "Key3", "Value3", []), + {K4, V4} = test_inkerkv(4, "Key1", "Value4", []), + {K5, V5} = test_inkerkv(5, "Key1", "Value5", []), + {K6, V6} = test_inkerkv(6, "Key1", "Value6", []), + {K7, V7} = test_inkerkv(7, "Key1", "Value7", []), + {K8, V8} = test_inkerkv(8, "Key1", "Value8", []), + ok = leveled_cdb:cdb_put(CDB1, K1, V1), + ok = leveled_cdb:cdb_put(CDB1, K2, V2), + ok = leveled_cdb:cdb_put(CDB1, K3, V3), + ok = leveled_cdb:cdb_put(CDB1, K4, V4), + ok = leveled_cdb:cdb_put(CDB1, K5, V5), + ok = leveled_cdb:cdb_put(CDB1, K6, V6), + ok = leveled_cdb:cdb_put(CDB1, K7, V7), + ok = leveled_cdb:cdb_put(CDB1, K8, V8), + {ok, FN2} = leveled_cdb:cdb_complete(CDB1), + leveled_cdb:cdb_open_reader(FN2). 
+ +check_single_file_test() -> + RP = "../test/journal", + {ok, CDB} = fetch_testcdb(RP), + LedgerSrv1 = [{8, {o, "Bucket", "Key1", null}}, + {2, {o, "Bucket", "Key2", null}}, + {3, {o, "Bucket", "Key3", null}}], + LedgerFun1 = fun(Srv, Key, ObjSQN) -> + case lists:keyfind(ObjSQN, 1, Srv) of + {ObjSQN, Key} -> + true; + _ -> + false + end end, + Score1 = check_single_file(CDB, LedgerFun1, LedgerSrv1, 9, 8, 4), + ?assertMatch(37.5, Score1), + LedgerFun2 = fun(_Srv, _Key, _ObjSQN) -> true end, + Score2 = check_single_file(CDB, LedgerFun2, LedgerSrv1, 9, 8, 4), + ?assertMatch(100.0, Score2), + Score3 = check_single_file(CDB, LedgerFun1, LedgerSrv1, 9, 8, 3), + ?assertMatch(37.5, Score3), + Score4 = check_single_file(CDB, LedgerFun1, LedgerSrv1, 4, 8, 4), + ?assertMatch(75.0, Score4), + ok = leveled_cdb:cdb_deletepending(CDB), + ok = leveled_cdb:cdb_destroy(CDB). + + +compact_single_file_setup() -> + RP = "../test/journal", + {ok, CDB} = fetch_testcdb(RP), + Candidate = #candidate{journal = CDB, + low_sqn = 1, + filename = "test", + compaction_perc = 37.5}, + LedgerSrv1 = [{8, {o, "Bucket", "Key1", null}}, + {2, {o, "Bucket", "Key2", null}}, + {3, {o, "Bucket", "Key3", null}}], + LedgerFun1 = fun(Srv, Key, ObjSQN) -> + case lists:keyfind(ObjSQN, 1, Srv) of + {ObjSQN, Key} -> + true; + _ -> + false + end end, + CompactFP = leveled_inker:filepath(RP, journal_compact_dir), + ok = filelib:ensure_dir(CompactFP), + {Candidate, LedgerSrv1, LedgerFun1, CompactFP, CDB}. + +compact_single_file_recovr_test() -> + {Candidate, + LedgerSrv1, + LedgerFun1, + CompactFP, + CDB} = compact_single_file_setup(), + R1 = compact_files([Candidate], + #cdb_options{file_path=CompactFP}, + LedgerFun1, + LedgerSrv1, + 9, + [{?STD_TAG, recovr}]), + {ManSlice1, PromptDelete1} = R1, + ?assertMatch(true, PromptDelete1), + [{LowSQN, FN, PidR}] = ManSlice1, + io:format("FN of ~s~n", [FN]), + ?assertMatch(2, LowSQN), + ?assertMatch(probably, + leveled_cdb:cdb_keycheck(PidR, + {8, + stnd, + test_ledgerkey("Key1")})), + ?assertMatch(missing, leveled_cdb:cdb_get(PidR, + {7, + stnd, + test_ledgerkey("Key1")})), + ?assertMatch(missing, leveled_cdb:cdb_get(PidR, + {1, + stnd, + test_ledgerkey("Key1")})), + {_RK1, RV1} = leveled_cdb:cdb_get(PidR, + {2, + stnd, + test_ledgerkey("Key2")}), + ?assertMatch({"Value2", []}, binary_to_term(RV1)), + ok = leveled_cdb:cdb_deletepending(CDB), + ok = leveled_cdb:cdb_destroy(CDB). + + +compact_single_file_retain_test() -> + {Candidate, + LedgerSrv1, + LedgerFun1, + CompactFP, + CDB} = compact_single_file_setup(), + R1 = compact_files([Candidate], + #cdb_options{file_path=CompactFP}, + LedgerFun1, + LedgerSrv1, + 9, + [{?STD_TAG, retain}]), + {ManSlice1, PromptDelete1} = R1, + ?assertMatch(true, PromptDelete1), + [{LowSQN, FN, PidR}] = ManSlice1, + io:format("FN of ~s~n", [FN]), + ?assertMatch(1, LowSQN), + ?assertMatch(probably, + leveled_cdb:cdb_keycheck(PidR, + {8, + stnd, + test_ledgerkey("Key1")})), + ?assertMatch(missing, leveled_cdb:cdb_get(PidR, + {7, + stnd, + test_ledgerkey("Key1")})), + ?assertMatch(missing, leveled_cdb:cdb_get(PidR, + {1, + stnd, + test_ledgerkey("Key1")})), + {_RK1, RV1} = leveled_cdb:cdb_get(PidR, + {2, + stnd, + test_ledgerkey("Key2")}), + ?assertMatch({"Value2", []}, binary_to_term(RV1)), + ok = leveled_cdb:cdb_deletepending(CDB), + ok = leveled_cdb:cdb_destroy(CDB). 
+ +compact_empty_file_test() -> + RP = "../test/journal", + FN1 = leveled_inker:filepath(RP, 1, new_journal), + CDBopts = #cdb_options{binary_mode=true}, + {ok, CDB1} = leveled_cdb:cdb_open_writer(FN1, CDBopts), + ok = leveled_cdb:cdb_put(CDB1, {1, stnd, test_ledgerkey("Key1")}, <<>>), + {ok, FN2} = leveled_cdb:cdb_complete(CDB1), + {ok, CDB2} = leveled_cdb:cdb_open_reader(FN2), + LedgerSrv1 = [{8, {o, "Bucket", "Key1", null}}, + {2, {o, "Bucket", "Key2", null}}, + {3, {o, "Bucket", "Key3", null}}], + LedgerFun1 = fun(_Srv, _Key, _ObjSQN) -> false end, + Score1 = check_single_file(CDB2, LedgerFun1, LedgerSrv1, 9, 8, 4), + ?assertMatch(100.0, Score1). + +compare_candidate_test() -> + Candidate1 = #candidate{low_sqn=1}, + Candidate2 = #candidate{low_sqn=2}, + Candidate3 = #candidate{low_sqn=3}, + Candidate4 = #candidate{low_sqn=4}, + ?assertMatch([Candidate1, Candidate2, Candidate3, Candidate4], + sort_run([Candidate3, Candidate2, Candidate4, Candidate1])). + +compact_singlefile_totwosmallfiles_test() -> + RP = "../test/journal", + CP = "../test/journal/journal_file/post_compact/", + ok = filelib:ensure_dir(CP), + FN1 = leveled_inker:filepath(RP, 1, new_journal), + CDBoptsLarge = #cdb_options{binary_mode=true, max_size=30000000}, + {ok, CDB1} = leveled_cdb:cdb_open_writer(FN1, CDBoptsLarge), + lists:foreach(fun(X) -> + LK = test_ledgerkey("Key" ++ integer_to_list(X)), + Value = term_to_binary({crypto:rand_bytes(1024), []}), + ok = leveled_cdb:cdb_put(CDB1, + {X, ?INKT_STND, LK}, + Value) + end, + lists:seq(1, 1000)), + {ok, NewName} = leveled_cdb:cdb_complete(CDB1), + {ok, CDBr} = leveled_cdb:cdb_open_reader(NewName), + CDBoptsSmall = #cdb_options{binary_mode=true, max_size=400000, file_path=CP}, + BestRun1 = [#candidate{low_sqn=1, + filename=leveled_cdb:cdb_filename(CDBr), + journal=CDBr, + compaction_perc=50.0}], + FakeFilterFun = fun(_FS, _LK, SQN) -> SQN rem 2 == 0 end, + + {ManifestSlice, PromptDelete} = compact_files(BestRun1, + CDBoptsSmall, + FakeFilterFun, + null, + 900, + [{?STD_TAG, recovr}]), + ?assertMatch(2, length(ManifestSlice)), + ?assertMatch(true, PromptDelete), + lists:foreach(fun({_SQN, _FN, CDB}) -> + ok = leveled_cdb:cdb_deletepending(CDB), + ok = leveled_cdb:cdb_destroy(CDB) + end, + ManifestSlice), + ok = leveled_cdb:cdb_deletepending(CDBr), + ok = leveled_cdb:cdb_destroy(CDBr). + + +-endif. \ No newline at end of file diff --git a/src/leveled_inker.erl b/src/leveled_inker.erl new file mode 100644 index 0000000..5689274 --- /dev/null +++ b/src/leveled_inker.erl @@ -0,0 +1,916 @@ +%% -------- Inker --------- +%% +%% The Inker is responsible for managing access and updates to the Journal. +%% +%% The Inker maintains a manifest of what files make up the Journal, and which +%% file is the current append-only nursery log to accept new PUTs into the +%% Journal. The Inker also marshals GET requests to the appropriate database +%% file within the Journal (routed by sequence number). The Inker is also +%% responsible for scheduling compaction work to be carried out by the Inker's +%% clerk. +%% +%% -------- Journal Files --------- +%% +%% The Journal is a series of files originally named as _ +%% where the sequence number is the first object sequence number (key) within +%% the given database file. The files will be named *.cdb at the point they +%% have been made immutable (through a rename operation). Prior to this, they +%% will originally start out as a *.pnd file. +%% +%% At some stage in the future compacted versions of old journal cdb files may +%% be produced. 
These files will be named -.cdb, and once +%% the manifest is updated the original _.cdb (or +%% _.cdb) files they replace will be erased. +%% +%% The current Journal is made up of a set of files referenced in the manifest. +%% No PUTs are made to files which are not in the manifest. +%% +%% The Journal is ordered by sequence number from front to back both within +%% and across files. +%% +%% On startup the Inker should open the manifest with the highest sequence +%% number, and this will contain the list of filenames that make up the +%% non-recent part of the Journal. All the filenames should then be opened. +%% How they are opened depends on the file extension: +%% +%% - If the file extension is *.cdb the file is opened read only +%% - If the file extension is *.pnd and the file is not the most recent in the +%% manifest, then the file should be completed bfore being opened read-only +%% - If the file extension is *.pnd the file is opened for writing +%% +%% -------- Manifest Files --------- +%% +%% The manifest is just saved as a straight term_to_binary blob, with a +%% filename ordered by the Manifest SQN. The Manifest is first saved with a +%% *.pnd extension, and then renamed to one with a *.man extension. +%% +%% On startup the *.man manifest file with the highest manifest sequence +%% number should be used. +%% +%% -------- Objects --------- +%% +%% From the perspective of the Inker, objects to store are made up of: +%% - An Inker Key formed from +%% - A sequence number (assigned by the Inker) +%% - An Inker key type (stnd, tomb or keyd) +%% - A Ledger Key (as an Erlang term) +%% - A value formed from +%% - An object (an Erlang term) which should be null for tomb types, and +%% maybe null for keyd types +%% - A set of Key Deltas associated with the change (which may be an +%% empty list ) +%% +%% Note that only the Inker key type of stnd is directly fetchable, other +%% key types are to be found only in scans and so can be added without being +%% entered into the hashtree +%% +%% -------- Compaction --------- +%% +%% Compaction is a process whereby an Inker's clerk will: +%% - Request a view of the current Inker manifest and a snaphot of the Ledger +%% - Test all files within the Journal to find the approximate comapction +%% potential percentage (the volume of the Journal that has been replaced) +%% - Attempts to find the optimal "run" of files to compact +%% - Compacts those files in the run, by rolling over the files re-writing +%% to a new Journal if and only if the Key is still present in the Ledger (or +%% the sequence number of the Key is higher than the SQN of the snapshot) +%% - Requests the Inker update the manifest with the new changes +%% - Instructs the files to destroy themselves when they are next closed +%% +%% TODO: how to instruct the files to close is tbd +%% + + +-module(leveled_inker). + +-behaviour(gen_server). + +-include("include/leveled.hrl"). + +-export([init/1, + handle_call/3, + handle_cast/2, + handle_info/2, + terminate/2, + code_change/3, + ink_start/1, + ink_put/4, + ink_get/3, + ink_fetch/3, + ink_keycheck/3, + ink_loadpcl/4, + ink_registersnapshot/2, + ink_confirmdelete/2, + ink_compactjournal/3, + ink_compactioncomplete/1, + ink_compactionpending/1, + ink_getmanifest/1, + ink_updatemanifest/3, + ink_print_manifest/1, + ink_close/1, + build_dummy_journal/0, + simple_manifest_reader/2, + clean_testdir/1, + filepath/2, + filepath/3]). + +-include_lib("eunit/include/eunit.hrl"). + +-define(MANIFEST_FP, "journal_manifest"). 
+-define(FILES_FP, "journal_files"). +-define(COMPACT_FP, "post_compact"). +-define(JOURNAL_FILEX, "cdb"). +-define(MANIFEST_FILEX, "man"). +-define(PENDING_FILEX, "pnd"). +-define(LOADING_PAUSE, 1000). +-define(LOADING_BATCH, 1000). + +-record(state, {manifest = [] :: list(), + manifest_sqn = 0 :: integer(), + journal_sqn = 0 :: integer(), + active_journaldb :: pid(), + pending_removals = [] :: list(), + registered_snapshots = [] :: list(), + root_path :: string(), + cdb_options :: #cdb_options{}, + clerk :: pid(), + compaction_pending = false :: boolean(), + is_snapshot = false :: boolean(), + source_inker :: pid()}). + + +%%%============================================================================ +%%% API +%%%============================================================================ + +ink_start(InkerOpts) -> + gen_server:start(?MODULE, [InkerOpts], []). + +ink_put(Pid, PrimaryKey, Object, KeyChanges) -> + gen_server:call(Pid, {put, PrimaryKey, Object, KeyChanges}, infinity). + +ink_get(Pid, PrimaryKey, SQN) -> + gen_server:call(Pid, {get, PrimaryKey, SQN}, infinity). + +ink_fetch(Pid, PrimaryKey, SQN) -> + gen_server:call(Pid, {fetch, PrimaryKey, SQN}, infinity). + +ink_keycheck(Pid, PrimaryKey, SQN) -> + gen_server:call(Pid, {key_check, PrimaryKey, SQN}, infinity). + +ink_registersnapshot(Pid, Requestor) -> + gen_server:call(Pid, {register_snapshot, Requestor}, infinity). + +ink_releasesnapshot(Pid, Snapshot) -> + gen_server:call(Pid, {release_snapshot, Snapshot}, infinity). + +ink_confirmdelete(Pid, ManSQN) -> + gen_server:call(Pid, {confirm_delete, ManSQN}, 1000). + +ink_close(Pid) -> + gen_server:call(Pid, close, infinity). + +ink_loadpcl(Pid, MinSQN, FilterFun, Penciller) -> + gen_server:call(Pid, {load_pcl, MinSQN, FilterFun, Penciller}, infinity). + +ink_compactjournal(Pid, Bookie, Timeout) -> + CheckerInitiateFun = fun initiate_penciller_snapshot/1, + CheckerFilterFun = fun leveled_penciller:pcl_checksequencenumber/3, + gen_server:call(Pid, + {compact, + Bookie, + CheckerInitiateFun, + CheckerFilterFun, + Timeout}, + infinity). + +%% Allows the Checker to be overriden in test, use something other than a +%% penciller +ink_compactjournal(Pid, Checker, InitiateFun, FilterFun, Timeout) -> + gen_server:call(Pid, + {compact, + Checker, + InitiateFun, + FilterFun, + Timeout}, + infinity). + +ink_compactioncomplete(Pid) -> + gen_server:call(Pid, compaction_complete, infinity). + +ink_compactionpending(Pid) -> + gen_server:call(Pid, compaction_pending, infinity). + +ink_getmanifest(Pid) -> + gen_server:call(Pid, get_manifest, infinity). + +ink_updatemanifest(Pid, ManifestSnippet, DeletedFiles) -> + gen_server:call(Pid, + {update_manifest, + ManifestSnippet, + DeletedFiles}, + infinity). + +ink_print_manifest(Pid) -> + gen_server:call(Pid, print_manifest, infinity). + +%%%============================================================================ +%%% gen_server callbacks +%%%============================================================================ + +init([InkerOpts]) -> + case {InkerOpts#inker_options.root_path, + InkerOpts#inker_options.start_snapshot} of + {undefined, true} -> + SrcInker = InkerOpts#inker_options.source_inker, + {Manifest, + ActiveJournalDB} = ink_registersnapshot(SrcInker, self()), + {ok, #state{manifest=Manifest, + active_journaldb=ActiveJournalDB, + source_inker=SrcInker, + is_snapshot=true}}; + %% Need to do something about timeout + {_RootPath, false} -> + start_from_file(InkerOpts) + end. 
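+
+%% For illustration, the 5-arity ink_compactjournal above is exercised in the
+%% unit tests with a plain list standing in for the Penciller, e.g.
+%% (simplified from compact_journal_test/0 below):
+%%
+%%   ok = ink_compactjournal(Ink,
+%%                           Checker,
+%%                           fun(X) -> {X, 55} end,
+%%                           fun(L, K, SQN) -> lists:member({SQN, K}, L) end,
+%%                           5000)
+%%
+%% where Checker is a list of {SQN, LedgerKey} pairs, the InitiateFun returns
+%% {FilterServer, MaxSQN}, and the FilterFun reports whether a key at a given
+%% SQN is still current.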
+ + +handle_call({put, Key, Object, KeyChanges}, _From, State) -> + case put_object(Key, Object, KeyChanges, State) of + {ok, UpdState, ObjSize} -> + {reply, {ok, UpdState#state.journal_sqn, ObjSize}, UpdState}; + {rolling, UpdState, ObjSize} -> + ok = leveled_cdb:cdb_roll(State#state.active_journaldb), + {reply, {ok, UpdState#state.journal_sqn, ObjSize}, UpdState} + end; +handle_call({fetch, Key, SQN}, _From, State) -> + case get_object(Key, SQN, State#state.manifest) of + {{SQN, Key}, {Value, _IndexSpecs}} -> + {reply, {ok, Value}, State}; + Other -> + leveled_log:log("I0001", [Key, SQN, Other]), + {reply, not_present, State} + end; +handle_call({get, Key, SQN}, _From, State) -> + {reply, get_object(Key, SQN, State#state.manifest), State}; +handle_call({key_check, Key, SQN}, _From, State) -> + {reply, key_check(Key, SQN, State#state.manifest), State}; +handle_call({load_pcl, StartSQN, FilterFun, Penciller}, _From, State) -> + Manifest = lists:reverse(State#state.manifest), + Reply = load_from_sequence(StartSQN, FilterFun, Penciller, Manifest), + {reply, Reply, State}; +handle_call({register_snapshot, Requestor}, _From , State) -> + Rs = [{Requestor, + State#state.manifest_sqn}|State#state.registered_snapshots], + leveled_log:log("I0002", [Requestor, State#state.manifest_sqn]), + {reply, {State#state.manifest, + State#state.active_journaldb}, + State#state{registered_snapshots=Rs}}; +handle_call({release_snapshot, Snapshot}, _From , State) -> + Rs = lists:keydelete(Snapshot, 1, State#state.registered_snapshots), + leveled_log:log("I0003", [Snapshot]), + leveled_log:log("I0004", [length(Rs)]), + {reply, ok, State#state{registered_snapshots=Rs}}; +handle_call({confirm_delete, ManSQN}, _From, State) -> + Reply = lists:foldl(fun({_R, SnapSQN}, Bool) -> + case SnapSQN >= ManSQN of + true -> + Bool; + false -> + false + end end, + true, + State#state.registered_snapshots), + {reply, Reply, State}; +handle_call(get_manifest, _From, State) -> + {reply, State#state.manifest, State}; +handle_call({update_manifest, + ManifestSnippet, + DeletedFiles}, _From, State) -> + Man0 = lists:foldl(fun(ManEntry, AccMan) -> + remove_from_manifest(AccMan, ManEntry) + end, + State#state.manifest, + DeletedFiles), + Man1 = lists:foldl(fun(ManEntry, AccMan) -> + add_to_manifest(AccMan, ManEntry) end, + Man0, + ManifestSnippet), + NewManifestSQN = State#state.manifest_sqn + 1, + manifest_printer(Man1), + simple_manifest_writer(Man1, NewManifestSQN, State#state.root_path), + {reply, + {ok, NewManifestSQN}, + State#state{manifest=Man1, + manifest_sqn=NewManifestSQN, + pending_removals=DeletedFiles}}; +handle_call(print_manifest, _From, State) -> + manifest_printer(State#state.manifest), + {reply, ok, State}; +handle_call({compact, + Checker, + InitiateFun, + FilterFun, + Timeout}, + _From, State) -> + leveled_iclerk:clerk_compact(State#state.clerk, + Checker, + InitiateFun, + FilterFun, + self(), + Timeout), + {reply, ok, State#state{compaction_pending=true}}; +handle_call(compaction_complete, _From, State) -> + {reply, ok, State#state{compaction_pending=false}}; +handle_call(compaction_pending, _From, State) -> + {reply, State#state.compaction_pending, State}; +handle_call(close, _From, State) -> + {stop, normal, ok, State}. + +handle_cast(_Msg, State) -> + {noreply, State}. + +handle_info(_Info, State) -> + {noreply, State}. 
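+
+%% Note on the {rolling, ...} branch above: by the time put_object/4 returns
+%% rolling, the PUT has already been written to a newly started journal file
+%% and the manifest re-written; the previous active journal (still referenced
+%% in the old State) is then instructed to roll towards its immutable *.cdb
+%% form, as described in the module overview.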
+ +terminate(Reason, State) -> + case State#state.is_snapshot of + true -> + ok = ink_releasesnapshot(State#state.source_inker, self()); + false -> + leveled_log:log("I0005", [Reason]), + leveled_log:log("I0006", [State#state.journal_sqn, + State#state.manifest_sqn]), + leveled_iclerk:clerk_stop(State#state.clerk), + lists:foreach(fun({Snap, _SQN}) -> ok = ink_close(Snap) end, + State#state.registered_snapshots), + leveled_log:log("I0007", []), + manifest_printer(State#state.manifest), + ok = close_allmanifest(State#state.manifest) + end. + +code_change(_OldVsn, State, _Extra) -> + {ok, State}. + + +%%%============================================================================ +%%% Internal functions +%%%============================================================================ + +start_from_file(InkOpts) -> + RootPath = InkOpts#inker_options.root_path, + CDBopts = InkOpts#inker_options.cdb_options, + JournalFP = filepath(RootPath, journal_dir), + filelib:ensure_dir(JournalFP), + CompactFP = filepath(RootPath, journal_compact_dir), + filelib:ensure_dir(CompactFP), + + ManifestFP = filepath(RootPath, manifest_dir), + ok = filelib:ensure_dir(ManifestFP), + {ok, ManifestFilenames} = file:list_dir(ManifestFP), + + IClerkCDBOpts = CDBopts#cdb_options{file_path = CompactFP}, + ReloadStrategy = InkOpts#inker_options.reload_strategy, + MRL = InkOpts#inker_options.max_run_length, + IClerkOpts = #iclerk_options{inker = self(), + cdb_options=IClerkCDBOpts, + reload_strategy = ReloadStrategy, + max_run_length = MRL}, + {ok, Clerk} = leveled_iclerk:clerk_new(IClerkOpts), + + {Manifest, + ManifestSQN, + JournalSQN, + ActiveJournal} = build_manifest(ManifestFilenames, + RootPath, + CDBopts), + {ok, #state{manifest = Manifest, + manifest_sqn = ManifestSQN, + journal_sqn = JournalSQN, + active_journaldb = ActiveJournal, + root_path = RootPath, + cdb_options = CDBopts, + clerk = Clerk}}. + + +put_object(LedgerKey, Object, KeyChanges, State) -> + NewSQN = State#state.journal_sqn + 1, + {JournalKey, JournalBin} = leveled_codec:to_inkerkv(LedgerKey, + NewSQN, + Object, + KeyChanges), + case leveled_cdb:cdb_put(State#state.active_journaldb, + JournalKey, + JournalBin) of + ok -> + {ok, State#state{journal_sqn=NewSQN}, byte_size(JournalBin)}; + roll -> + SW = os:timestamp(), + CDBopts = State#state.cdb_options, + ManEntry = start_new_activejournal(NewSQN, + State#state.root_path, + CDBopts), + {_, _, NewJournalP} = ManEntry, + NewManifest = add_to_manifest(State#state.manifest, ManEntry), + ok = simple_manifest_writer(NewManifest, + State#state.manifest_sqn + 1, + State#state.root_path), + ok = leveled_cdb:cdb_put(NewJournalP, + JournalKey, + JournalBin), + leveled_log:log_timer("I0008", [], SW), + {rolling, + State#state{journal_sqn=NewSQN, + manifest=NewManifest, + manifest_sqn = State#state.manifest_sqn + 1, + active_journaldb=NewJournalP}, + byte_size(JournalBin)} + end. + + +get_object(LedgerKey, SQN, Manifest) -> + JournalP = find_in_manifest(SQN, Manifest), + {InkerKey, _V, true} = leveled_codec:to_inkerkv(LedgerKey, + SQN, + to_fetch, + null), + Obj = leveled_cdb:cdb_get(JournalP, InkerKey), + leveled_codec:from_inkerkv(Obj). + +key_check(LedgerKey, SQN, Manifest) -> + JournalP = find_in_manifest(SQN, Manifest), + {InkerKey, _V, true} = leveled_codec:to_inkerkv(LedgerKey, + SQN, + to_fetch, + null), + leveled_cdb:cdb_keycheck(JournalP, InkerKey). 
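+
+%% As a reminder of the on-disk format handled above: each Journal entry is
+%% keyed by {SQN, InkerKeyType, LedgerKey} and (broadly, via leveled_codec)
+%% the value is a binary of the {Object, KeyChanges} pair, e.g. the dummy
+%% journal in the tests below writes
+%%
+%%   leveled_cdb:cdb_put(J1, {1, stnd, K1}, term_to_binary({V1, []}))
+%%
+%% so get_object/3 finds the right CDB file by SQN and then decodes the
+%% fetched binary back to {Object, KeyChanges} with from_inkerkv.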
+
+build_manifest(ManifestFilenames,
+                RootPath,
+                CDBopts) ->
+    % Find the manifest with the highest Manifest sequence number
+    % Open it and read it to get the current Confirmed Manifest
+    ManifestRegex = "(?<MSQN>[0-9]+)\\." ++ ?MANIFEST_FILEX,
+    ValidManSQNs = sequencenumbers_fromfilenames(ManifestFilenames,
+                                                    ManifestRegex,
+                                                    'MSQN'),
+    {Manifest,
+        ManifestSQN} = case length(ValidManSQNs) of
+                            0 ->
+                                {[], 1};
+                            _ ->
+                                PersistedManSQN = lists:max(ValidManSQNs),
+                                M1 = simple_manifest_reader(PersistedManSQN,
+                                                            RootPath),
+                                {M1, PersistedManSQN}
+                        end,
+
+    % Open the manifest files, completing if necessary and ensure there is
+    % a valid active journal at the head of the manifest
+    OpenManifest = open_all_manifest(Manifest, RootPath, CDBopts),
+    {ActiveLowSQN, _FN, ActiveJournal} = lists:nth(1, OpenManifest),
+    JournalSQN = case leveled_cdb:cdb_lastkey(ActiveJournal) of
+                        empty ->
+                            ActiveLowSQN;
+                        {JSQN, _Type, _LastKey} ->
+                            JSQN
+                    end,
+
+    % Update the manifest if it has been changed by the process of loading
+    % the manifest (must also increment the manifest SQN).
+    UpdManifestSQN = if
+                            length(OpenManifest) > length(Manifest) ->
+                                leveled_log:log("I0009", []),
+                                manifest_printer(OpenManifest),
+                                simple_manifest_writer(OpenManifest,
+                                                        ManifestSQN + 1,
+                                                        RootPath),
+                                ManifestSQN + 1;
+                            true ->
+                                leveled_log:log("I0010", []),
+                                manifest_printer(OpenManifest),
+                                ManifestSQN
+                    end,
+    {OpenManifest, UpdManifestSQN, JournalSQN, ActiveJournal}.
+
+
+close_allmanifest([]) ->
+    ok;
+close_allmanifest([H|ManifestT]) ->
+    {_, _, Pid} = H,
+    ok = leveled_cdb:cdb_close(Pid),
+    close_allmanifest(ManifestT).
+
+
+open_all_manifest([], RootPath, CDBOpts) ->
+    leveled_log:log("I0011", []),
+    add_to_manifest([], start_new_activejournal(1, RootPath, CDBOpts));
+open_all_manifest(Man0, RootPath, CDBOpts) ->
+    Man1 = lists:reverse(lists:sort(Man0)),
+    [{HeadSQN, HeadFN}|ManifestTail] = Man1,
+    CompleteHeadFN = HeadFN ++ "." ++ ?JOURNAL_FILEX,
+    PendingHeadFN = HeadFN ++ "." ++ ?PENDING_FILEX,
+    Man2 = case filelib:is_file(CompleteHeadFN) of
+                true ->
+                    leveled_log:log("I0012", [HeadFN]),
+                    {ok, HeadR} = leveled_cdb:cdb_open_reader(CompleteHeadFN),
+                    {LastSQN, _Type, _PK} = leveled_cdb:cdb_lastkey(HeadR),
+                    add_to_manifest(add_to_manifest(ManifestTail,
+                                                    {HeadSQN, HeadFN, HeadR}),
+                                        start_new_activejournal(LastSQN + 1,
+                                                                RootPath,
+                                                                CDBOpts));
+                false ->
+                    {ok, HeadW} = leveled_cdb:cdb_open_writer(PendingHeadFN,
+                                                                CDBOpts),
+                    add_to_manifest(ManifestTail, {HeadSQN, HeadFN, HeadW})
+            end,
+    lists:map(fun(ManEntry) ->
+                case ManEntry of
+                    {LowSQN, FN} ->
+                        CFN = FN ++ "." ++ ?JOURNAL_FILEX,
+                        PFN = FN ++ "." ++ ?PENDING_FILEX,
+                        case filelib:is_file(CFN) of
+                            true ->
+                                {ok,
+                                    Pid} = leveled_cdb:cdb_open_reader(CFN),
+                                {LowSQN, FN, Pid};
+                            false ->
+                                W = leveled_cdb:cdb_open_writer(PFN, CDBOpts),
+                                {ok, Pid} = W,
+                                ok = leveled_cdb:cdb_roll(Pid),
+                                {LowSQN, FN, Pid}
+                        end;
+                    _ ->
+                        ManEntry
+                end end,
+                Man2).
+
+
+start_new_activejournal(SQN, RootPath, CDBOpts) ->
+    Filename = filepath(RootPath, SQN, new_journal),
+    {ok, PidW} = leveled_cdb:cdb_open_writer(Filename, CDBOpts),
+    {SQN, Filename, PidW}.
+
+add_to_manifest(Manifest, Entry) ->
+    {SQN, FN, PidR} = Entry,
+    StrippedName = filename:rootname(FN),
+    lists:reverse(lists:sort([{SQN, StrippedName, PidR}|Manifest])).
+
+remove_from_manifest(Manifest, Entry) ->
+    {SQN, FN, _PidR} = Entry,
+    leveled_log:log("I0013", [FN]),
+    lists:keydelete(SQN, 1, Manifest).
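+
+%% An illustrative (hypothetical) in-memory manifest, as maintained by
+%% add_to_manifest/2 above - reverse sorted so that the most recent journal
+%% file is at the head, with file extensions stripped from the names:
+%%
+%%   [{13, "test/journal_files/13_<GUID>", PidC},
+%%    {7,  "test/journal_files/7_<GUID>",  PidB},
+%%    {1,  "test/journal_files/1_<GUID>",  PidA}]
+%%
+%% Only the {SQN, Filename} pairs are persisted by simple_manifest_writer/3;
+%% the Pids are re-established by open_all_manifest/3 at startup.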
+ +find_in_manifest(SQN, [{LowSQN, _FN, Pid}|_Tail]) when SQN >= LowSQN -> + Pid; +find_in_manifest(SQN, [_Head|Tail]) -> + find_in_manifest(SQN, Tail). + + + +%% Scan between sequence numbers applying FilterFun to each entry where +%% FilterFun{K, V, Acc} -> Penciller Key List +%% Load the output for the CDB file into the Penciller. + +load_from_sequence(_MinSQN, _FilterFun, _Penciller, []) -> + ok; +load_from_sequence(MinSQN, FilterFun, Penciller, [{LowSQN, FN, Pid}|Rest]) + when LowSQN >= MinSQN -> + load_between_sequence(MinSQN, + MinSQN + ?LOADING_BATCH, + FilterFun, + Penciller, + Pid, + undefined, + FN, + Rest); +load_from_sequence(MinSQN, FilterFun, Penciller, [{_LowSQN, FN, Pid}|Rest]) -> + case Rest of + [] -> + load_between_sequence(MinSQN, + MinSQN + ?LOADING_BATCH, + FilterFun, + Penciller, + Pid, + undefined, + FN, + Rest); + [{NextSQN, _NxtFN, _NxtPid}|_Rest] when NextSQN > MinSQN -> + load_between_sequence(MinSQN, + MinSQN + ?LOADING_BATCH, + FilterFun, + Penciller, + Pid, + undefined, + FN, + Rest); + _ -> + load_from_sequence(MinSQN, FilterFun, Penciller, Rest) + end. + + + +load_between_sequence(MinSQN, MaxSQN, FilterFun, Penciller, + CDBpid, StartPos, FN, Rest) -> + leveled_log:log("I0014", [FN, MinSQN]), + InitAcc = {MinSQN, MaxSQN, gb_trees:empty()}, + Res = case leveled_cdb:cdb_scan(CDBpid, FilterFun, InitAcc, StartPos) of + {eof, {AccMinSQN, _AccMaxSQN, AccKL}} -> + ok = push_to_penciller(Penciller, AccKL), + {ok, AccMinSQN}; + {LastPosition, {_AccMinSQN, _AccMaxSQN, AccKL}} -> + ok = push_to_penciller(Penciller, AccKL), + NextSQN = MaxSQN + 1, + load_between_sequence(NextSQN, + NextSQN + ?LOADING_BATCH, + FilterFun, + Penciller, + CDBpid, + LastPosition, + FN, + Rest) + end, + case Res of + {ok, LMSQN} -> + load_from_sequence(LMSQN, FilterFun, Penciller, Rest); + ok -> + ok + end. + +push_to_penciller(Penciller, KeyTree) -> + % The push to penciller must start as a tree to correctly de-duplicate + % the list by order before becoming a de-duplicated list for loading + R = leveled_penciller:pcl_pushmem(Penciller, KeyTree), + case R of + returned -> + timer:sleep(?LOADING_PAUSE), + push_to_penciller(Penciller, KeyTree); + ok -> + ok + end. + + +sequencenumbers_fromfilenames(Filenames, Regex, IntName) -> + lists:foldl(fun(FN, Acc) -> + case re:run(FN, + Regex, + [{capture, [IntName], list}]) of + nomatch -> + Acc; + {match, [Int]} when is_list(Int) -> + Acc ++ [list_to_integer(Int)] + end end, + [], + Filenames). + + +filepath(RootPath, journal_dir) -> + RootPath ++ "/" ++ ?FILES_FP ++ "/"; +filepath(RootPath, manifest_dir) -> + RootPath ++ "/" ++ ?MANIFEST_FP ++ "/"; +filepath(RootPath, journal_compact_dir) -> + filepath(RootPath, journal_dir) ++ "/" ++ ?COMPACT_FP ++ "/". + +filepath(RootPath, NewSQN, new_journal) -> + filename:join(filepath(RootPath, journal_dir), + integer_to_list(NewSQN) ++ "_" + ++ leveled_codec:generate_uuid() + ++ "." ++ ?PENDING_FILEX); +filepath(CompactFilePath, NewSQN, compact_journal) -> + filename:join(CompactFilePath, + integer_to_list(NewSQN) ++ "_" + ++ leveled_codec:generate_uuid() + ++ "." ++ ?PENDING_FILEX). + + +simple_manifest_reader(SQN, RootPath) -> + ManifestPath = filepath(RootPath, manifest_dir), + leveled_log:log("I0015", [ManifestPath, SQN]), + {ok, MBin} = file:read_file(filename:join(ManifestPath, + integer_to_list(SQN) + ++ ".man")), + binary_to_term(MBin). 
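+
+%% By way of example of the path handling above (with a hypothetical GUID),
+%% for a RootPath of "test" and a new journal at SQN 5:
+%%
+%%   filepath("test", manifest_dir)   -> "test/journal_manifest/"
+%%   filepath("test", journal_dir)    -> "test/journal_files/"
+%%   filepath("test", 5, new_journal) -> "test/journal_files/5_<GUID>.pnd"
+%%
+%% The *.pnd file is later renamed to *.cdb once it has been rolled and
+%% completed, as described in the module overview.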
+ + +simple_manifest_writer(Manifest, ManSQN, RootPath) -> + ManPath = filepath(RootPath, manifest_dir), + NewFN = filename:join(ManPath, + integer_to_list(ManSQN) ++ "." ++ ?MANIFEST_FILEX), + TmpFN = filename:join(ManPath, + integer_to_list(ManSQN) ++ "." ++ ?PENDING_FILEX), + MBin = term_to_binary(lists:map(fun({SQN, FN, _PID}) -> {SQN, FN} end, + Manifest), [compressed]), + case filelib:is_file(NewFN) of + false -> + leveled_log:log("I0016", [ManSQN]), + ok = file:write_file(TmpFN, MBin), + ok = file:rename(TmpFN, NewFN), + ok + end. + +manifest_printer(Manifest) -> + lists:foreach(fun({SQN, FN, _PID}) -> + leveled_log:log("I0017", [SQN, FN]) end, + Manifest). + +initiate_penciller_snapshot(Bookie) -> + {ok, + {LedgerSnap, LedgerCache}, + _} = leveled_bookie:book_snapshotledger(Bookie, self(), undefined), + ok = leveled_penciller:pcl_loadsnapshot(LedgerSnap, LedgerCache), + MaxSQN = leveled_penciller:pcl_getstartupsequencenumber(LedgerSnap), + {LedgerSnap, MaxSQN}. + +%%%============================================================================ +%%% Test +%%%============================================================================ + +-ifdef(TEST). + +build_dummy_journal() -> + F = fun(X) -> X end, + build_dummy_journal(F). + +build_dummy_journal(KeyConvertF) -> + RootPath = "../test/journal", + clean_testdir(RootPath), + JournalFP = filepath(RootPath, journal_dir), + ManifestFP = filepath(RootPath, manifest_dir), + ok = filelib:ensure_dir(RootPath), + ok = filelib:ensure_dir(JournalFP), + ok = filelib:ensure_dir(ManifestFP), + F1 = filename:join(JournalFP, "nursery_1.pnd"), + {ok, J1} = leveled_cdb:cdb_open_writer(F1), + {K1, V1} = {KeyConvertF("Key1"), "TestValue1"}, + {K2, V2} = {KeyConvertF("Key2"), "TestValue2"}, + ok = leveled_cdb:cdb_put(J1, {1, stnd, K1}, term_to_binary({V1, []})), + ok = leveled_cdb:cdb_put(J1, {2, stnd, K2}, term_to_binary({V2, []})), + ok = leveled_cdb:cdb_roll(J1), + _LK = leveled_cdb:cdb_lastkey(J1), + ok = leveled_cdb:cdb_close(J1), + F2 = filename:join(JournalFP, "nursery_3.pnd"), + {ok, J2} = leveled_cdb:cdb_open_writer(F2), + {K1, V3} = {KeyConvertF("Key1"), "TestValue3"}, + {K4, V4} = {KeyConvertF("Key4"), "TestValue4"}, + ok = leveled_cdb:cdb_put(J2, {3, stnd, K1}, term_to_binary({V3, []})), + ok = leveled_cdb:cdb_put(J2, {4, stnd, K4}, term_to_binary({V4, []})), + ok = leveled_cdb:cdb_close(J2), + Manifest = [{1, "../test/journal/journal_files/nursery_1"}, + {3, "../test/journal/journal_files/nursery_3"}], + ManifestBin = term_to_binary(Manifest), + {ok, MF1} = file:open(filename:join(ManifestFP, "1.man"), + [binary, raw, read, write]), + ok = file:write(MF1, ManifestBin), + ok = file:close(MF1). + + +clean_testdir(RootPath) -> + clean_subdir(filepath(RootPath, journal_dir)), + clean_subdir(filepath(RootPath, journal_compact_dir)), + clean_subdir(filepath(RootPath, manifest_dir)). + +clean_subdir(DirPath) -> + ok = filelib:ensure_dir(DirPath), + {ok, Files} = file:list_dir(DirPath), + lists:foreach(fun(FN) -> + File = filename:join(DirPath, FN), + case file:delete(File) of + ok -> io:format("Success deleting ~s~n", [File]); + _ -> io:format("Error deleting ~s~n", [File]) + end + end, + Files). 
+ + +simple_inker_test() -> + RootPath = "../test/journal", + build_dummy_journal(), + CDBopts = #cdb_options{max_size=300000}, + {ok, Ink1} = ink_start(#inker_options{root_path=RootPath, + cdb_options=CDBopts}), + Obj1 = ink_get(Ink1, "Key1", 1), + ?assertMatch({{1, "Key1"}, {"TestValue1", []}}, Obj1), + Obj2 = ink_get(Ink1, "Key4", 4), + ?assertMatch({{4, "Key4"}, {"TestValue4", []}}, Obj2), + ink_close(Ink1), + clean_testdir(RootPath). + +simple_inker_completeactivejournal_test() -> + RootPath = "../test/journal", + build_dummy_journal(), + CDBopts = #cdb_options{max_size=300000}, + JournalFP = filepath(RootPath, journal_dir), + F2 = filename:join(JournalFP, "nursery_3.pnd"), + {ok, PidW} = leveled_cdb:cdb_open_writer(F2), + {ok, _F2} = leveled_cdb:cdb_complete(PidW), + F1 = filename:join(JournalFP, "nursery_1.cdb"), + F1r = filename:join(JournalFP, "nursery_1.pnd"), + ok = file:rename(F1, F1r), + {ok, Ink1} = ink_start(#inker_options{root_path=RootPath, + cdb_options=CDBopts}), + Obj1 = ink_get(Ink1, "Key1", 1), + ?assertMatch({{1, "Key1"}, {"TestValue1", []}}, Obj1), + Obj2 = ink_get(Ink1, "Key4", 4), + ?assertMatch({{4, "Key4"}, {"TestValue4", []}}, Obj2), + ink_close(Ink1), + clean_testdir(RootPath). + +test_ledgerkey(Key) -> + {o, "Bucket", Key, null}. + +compact_journal_test() -> + RootPath = "../test/journal", + build_dummy_journal(fun test_ledgerkey/1), + CDBopts = #cdb_options{max_size=300000}, + RStrategy = [{?STD_TAG, recovr}], + {ok, Ink1} = ink_start(#inker_options{root_path=RootPath, + cdb_options=CDBopts, + reload_strategy=RStrategy}), + {ok, NewSQN1, _ObjSize} = ink_put(Ink1, + test_ledgerkey("KeyAA"), + "TestValueAA", []), + ?assertMatch(NewSQN1, 5), + ok = ink_print_manifest(Ink1), + R0 = ink_get(Ink1, test_ledgerkey("KeyAA"), 5), + ?assertMatch(R0, {{5, test_ledgerkey("KeyAA")}, {"TestValueAA", []}}), + FunnyLoop = lists:seq(1, 48), + Checker = lists:map(fun(X) -> + PK = "KeyZ" ++ integer_to_list(X), + {ok, SQN, _} = ink_put(Ink1, + test_ledgerkey(PK), + crypto:rand_bytes(10000), + []), + {SQN, test_ledgerkey(PK)} + end, + FunnyLoop), + {ok, NewSQN2, _ObjSize} = ink_put(Ink1, + test_ledgerkey("KeyBB"), + "TestValueBB", []), + ?assertMatch(NewSQN2, 54), + ActualManifest = ink_getmanifest(Ink1), + ok = ink_print_manifest(Ink1), + ?assertMatch(3, length(ActualManifest)), + ok = ink_compactjournal(Ink1, + Checker, + fun(X) -> {X, 55} end, + fun(L, K, SQN) -> lists:member({SQN, K}, L) end, + 5000), + timer:sleep(1000), + CompactedManifest1 = ink_getmanifest(Ink1), + ?assertMatch(2, length(CompactedManifest1)), + Checker2 = lists:sublist(Checker, 16), + ok = ink_compactjournal(Ink1, + Checker2, + fun(X) -> {X, 55} end, + fun(L, K, SQN) -> lists:member({SQN, K}, L) end, + 5000), + timer:sleep(1000), + CompactedManifest2 = ink_getmanifest(Ink1), + R = lists:foldl(fun({_SQN, FN, _P}, Acc) -> + case string:str(FN, "post_compact") of + N when N > 0 -> + true; + 0 -> + Acc + end end, + false, + CompactedManifest2), + ?assertMatch(true, R), + ?assertMatch(2, length(CompactedManifest2)), + ink_close(Ink1), + clean_testdir(RootPath). 
+ +empty_manifest_test() -> + RootPath = "../test/journal", + clean_testdir(RootPath), + CDBopts = #cdb_options{max_size=300000}, + {ok, Ink1} = ink_start(#inker_options{root_path=RootPath, + cdb_options=CDBopts}), + ?assertMatch(not_present, ink_fetch(Ink1, "Key1", 1)), + ok = ink_compactjournal(Ink1, + [], + fun(X) -> {X, 55} end, + fun(L, K, SQN) -> lists:member({SQN, K}, L) end, + 5000), + timer:sleep(1000), + ?assertMatch(1, length(ink_getmanifest(Ink1))), + ok = ink_close(Ink1), + + % Add pending manifest to be ignored + FN = filepath(RootPath, manifest_dir) ++ "999.pnd", + ok = file:write_file(FN, term_to_binary("Hello")), + + {ok, Ink2} = ink_start(#inker_options{root_path=RootPath, + cdb_options=CDBopts}), + ?assertMatch(not_present, ink_fetch(Ink2, "Key1", 1)), + {ok, SQN, Size} = ink_put(Ink2, "Key1", "Value1", []), + ?assertMatch(2, SQN), + ?assertMatch(true, Size > 0), + {ok, V} = ink_fetch(Ink2, "Key1", 2), + ?assertMatch("Value1", V), + ink_close(Ink2), + clean_testdir(RootPath). + + +-endif. \ No newline at end of file diff --git a/src/leveled_log.erl b/src/leveled_log.erl new file mode 100644 index 0000000..f1779fc --- /dev/null +++ b/src/leveled_log.erl @@ -0,0 +1,293 @@ +%% Module to abstract from choice of logger, and allow use of logReferences +%% for fast lookup + +-module(leveled_log). + +-include("include/leveled.hrl"). + +-include_lib("eunit/include/eunit.hrl"). + +-export([log/2, + log_timer/3]). + +-define(LOG_LEVEL, [info, warn, error, critical]). +-define(LOGBASE, dict:from_list([ + + {"G0001", + {info, "Generic log point"}}, + {"D0001", + {debug, "Generic debug log"}}, + + {"B0001", + {info, "Bookie starting with Ink ~w Pcl ~w"}}, + {"B0002", + {info, "Snapshot starting with Ink ~w Pcl ~w"}}, + {"B0003", + {info, "Bookie closing for reason ~w"}}, + {"B0004", + {info, "Length of increment in snapshot is ~w"}}, + {"B0005", + {info, "LedgerSQN=~w at startup"}}, + {"B0006", + {info, "Reached end of load batch with SQN ~w"}}, + {"B0007", + {info, "Skipping as exceeded MaxSQN ~w with SQN ~w"}}, + + {"P0001", + {info, "Ledger snapshot ~w registered"}}, + {"P0003", + {info, "Ledger snapshot ~w released"}}, + {"P0004", + {info, "Remaining ledger snapshots are ~w"}}, + {"P0005", + {info, "Delete confirmed as file ~s is removed from " ++ + "unreferenced files"}}, + {"P0006", + {info, "Orphaned reply after timeout on L0 file write ~s"}}, + {"P0007", + {debug, "Sent release message for cloned Penciller following close for " + ++ "reason ~w"}}, + {"P0008", + {info, "Penciller closing for reason ~w"}}, + {"P0009", + {info, "Level 0 cache empty at close of Penciller"}}, + {"P0010", + {info, "No level zero action on close of Penciller"}}, + {"P0011", + {info, "Shutdown complete for Penciller"}}, + {"P0012", + {info, "Store to be started based on manifest sequence number of ~w"}}, + {"P0013", + {warn, "Seqence number of 0 indicates no valid manifest"}}, + {"P0014", + {info, "Maximum sequence number of ~w found in nonzero levels"}}, + {"P0015", + {info, "L0 file found ~s"}}, + {"P0016", + {info, "L0 file had maximum sequence number of ~w"}}, + {"P0017", + {info, "No L0 file found"}}, + {"P0018", + {info, "Respone to push_mem of ~w ~s"}}, + {"P0019", + {info, "Rolling level zero to filename ~s"}}, + {"P0020", + {info, "Work at Level ~w to be scheduled for ~w with ~w " + ++ "queue items outstanding at all levels"}}, + {"P0021", + {info, "Allocation of work blocked as L0 pending"}}, + {"P0022", + {info, "Manifest at Level ~w"}}, + {"P0023", + {info, "Manifest entry of startkey ~s 
~s ~s endkey ~s ~s ~s " + ++ "filename=~s~n"}}, + {"P0024", + {info, "Outstanding compaction work items of ~w at level ~w"}}, + {"P0025", + {info, "Merge to sqn ~w from Level ~w completed"}}, + {"P0026", + {info, "Merge has been commmitted at sequence number ~w"}}, + {"P0027", + {info, "Rename of manifest from ~s ~w to ~s ~w"}}, + {"P0028", + {info, "Adding cleared file ~s to deletion list"}}, + {"P0029", + {info, "L0 completion confirmed and will transition to not pending"}}, + + {"PC001", + {info, "Penciller's clerk ~w started with owner ~w"}}, + {"PC002", + {info, "Request for manifest change from clerk on closing"}}, + {"PC003", + {info, "Confirmation of manifest change on closing"}}, + {"PC004", + {info, "Prompted confirmation of manifest change"}}, + {"PC005", + {info, "Penciller's Clerk ~w shutdown now complete for reason ~w"}}, + {"PC006", + {info, "Work prompted but none needed"}}, + {"PC007", + {info, "Clerk prompting Penciller regarding manifest change"}}, + {"PC008", + {info, "Merge from level ~w to merge into ~w files below"}}, + {"PC009", + {info, "File ~s to simply switch levels to level ~w"}}, + {"PC010", + {info, "Merge to be commenced for FileToMerge=~s with MSN=~w"}}, + {"PC011", + {info, "Merge completed with MSN=~w Level=~w and FileCounter=~w"}}, + {"PC012", + {info, "File to be created as part of MSN=~w Filename=~s"}}, + {"PC013", + {warn, "Merge resulted in empty file ~s"}}, + {"PC014", + {info, "Empty file ~s to be cleared"}}, + {"PC015", + {info, "File created"}}, + + {"I0001", + {info, "Unexpected failure to fetch value for Key=~w SQN=~w " + ++ "with reason ~w"}}, + {"I0002", + {info, "Journal snapshot ~w registered at SQN ~w"}}, + {"I0003", + {info, "Journal snapshot ~w released"}}, + {"I0004", + {info, "Remaining number of journal snapshots is ~w"}}, + {"I0005", + {info, "Inker closing journal for reason ~w"}}, + {"I0006", + {info, "Close triggered with journal_sqn=~w and manifest_sqn=~w"}}, + {"I0007", + {info, "Inker manifest when closing is:"}}, + {"I0008", + {info, "Put to new active journal required roll and manifest write"}}, + {"I0009", + {info, "Updated manifest on startup:"}}, + {"I0010", + {info, "Unchanged manifest on startup:"}}, + {"I0011", + {info, "Manifest is empty, starting from manifest SQN 1"}}, + {"I0012", + {info, "Head manifest entry ~s is complete so new active journal " + ++ "required"}}, + {"I0013", + {info, "File ~s to be removed from manifest"}}, + {"I0014", + {info, "On startup oading from filename ~s from SQN ~w"}}, + {"I0015", + {info, "Opening manifest file at ~s with SQN ~w"}}, + {"I0016", + {info, "Writing new version of manifest for manifestSQN=~w"}}, + {"I0017", + {info, "At SQN=~w journal has filename ~s"}}, + + {"IC001", + {info, "Inker no longer alive so Clerk to abandon work " + ++ "leaving garbage"}}, + {"IC002", + {info, "Clerk updating Inker as compaction complete of ~w files"}}, + {"IC003", + {info, "No compaction run as highest score=~w"}}, + {"IC004", + {info, "Score for filename ~s is ~w"}}, + {"IC005", + {info, "Compaction to be performed on ~w files with score of ~w"}}, + {"IC006", + {info, "Filename ~s is part of compaction run"}}, + {"IC007", + {info, "Clerk has completed compaction process"}}, + {"IC008", + {info, "Compaction source ~s has yielded ~w positions"}}, + {"IC009", + {info, "Generate journal for compaction with filename ~s"}}, + + {"PM001", + {info, "Indexed new cache entry with total L0 cache size now ~w"}}, + {"PM002", + {info, "Completed dump of L0 cache to list of size ~w"}}, + + + {"SFT01", + 
{info, "Opened filename with name ~s"}}, + {"SFT02", + {info, "File ~s has been set for delete"}}, + {"SFT03", + {info, "File creation of L0 file ~s"}}, + {"SFT04", + {info, "File ~s prompting for delete status check"}}, + {"SFT05", + {info, "Exit called for reason ~w on filename ~s"}}, + {"SFT06", + {info, "Exit called and now clearing ~s"}}, + {"SFT07", + {info, "Creating file with input of size ~w"}}, + {"SFT08", + {info, "Renaming file from ~s to ~s"}}, + {"SFT09", + {warn, "Filename ~s already exists"}}, + {"SFT10", + {warn, "Rename rogue filename ~s to ~s"}}, + {"SFT11", + {error, "Segment filter failed due to ~s"}}, + {"SFT12", + {error, "Segment filter failed due to CRC check ~w did not match ~w"}}, + {"SFT13", + {error, "Segment filter failed due to ~s"}}, + + + {"CDB01", + {info, "Opening file for writing with filename ~s"}}, + {"CDB02", + {info, "Opening file for reading with filename ~s"}}, + {"CDB03", + {info, "Re-opening file for reading with filename ~s"}}, + {"CDB04", + {info, "Deletion confirmed for file ~s at ManifestSQN ~w"}}, + {"CDB05", + {info, "Closing of filename ~s for Reason ~w"}}, + {"CDB06", + {info, "File to be truncated at last position of ~w with end of " + ++ "file at ~w"}}, + {"CDB07", + {info, "Hashtree computed"}}, + {"CDB08", + {info, "Renaming file from ~s to ~s for which existence is ~w"}}, + {"CDB09", + {info, "Failure to read Key/Value at Position ~w in scan"}}, + {"CDB10", + {info, "CRC check failed due to mismatch"}}, + {"CDB11", + {info, "CRC check failed due to size"}}, + {"CDB12", + {inof, "HashTree written"}} + + ])). + + +log(LogReference, Subs) -> + {ok, {LogLevel, LogText}} = dict:find(LogReference, ?LOGBASE), + case lists:member(LogLevel, ?LOG_LEVEL) of + true -> + io:format(LogReference ++ " ~w " ++ LogText ++ "~n", + [self()|Subs]); + false -> + ok + end. + + +log_timer(LogReference, Subs, StartTime) -> + {ok, {LogLevel, LogText}} = dict:find(LogReference, ?LOGBASE), + case lists:member(LogLevel, ?LOG_LEVEL) of + true -> + MicroS = timer:now_diff(os:timestamp(), StartTime), + {Unit, Time} = case MicroS of + MicroS when MicroS < 1000 -> + {"microsec", MicroS}; + MicroS -> + {"ms", MicroS div 1000} + end, + io:format(LogReference ++ " ~w " ++ LogText + ++ " with time taken ~w " ++ Unit ++ "~n", + [self()|Subs] ++ [Time]); + false -> + ok + end. + + + + + +%%%============================================================================ +%%% Test +%%%============================================================================ + + + +-ifdef(TEST). + +log_test() -> + log("D0001", []), + log_timer("D0001", [], os:timestamp()). + +-endif. \ No newline at end of file diff --git a/src/leveled_pclerk.erl b/src/leveled_pclerk.erl new file mode 100644 index 0000000..b50c384 --- /dev/null +++ b/src/leveled_pclerk.erl @@ -0,0 +1,462 @@ +%% -------- PENCILLER's CLERK --------- +%% +%% The Penciller's clerk is responsible for compaction work within the Ledger. +%% +%% The Clerk will periodically poll the Penciller to see if there is work for +%% it to complete, except if the Clerk has informed the Penciller that it has +%% readied a manifest change to be committed - in which case it will wait to +%% be called by the Penciller. +%% +%% -------- COMMITTING MANIFEST CHANGES --------- +%% +%% Once the Penciller has taken a manifest change, the SFT file owners which no +%% longer form part of the manifest will be marked for delete. By marking for +%% deletion, the owners will poll to confirm when it is safe for them to be +%% deleted. 
+%% +%% It is imperative that the file is not marked for deletion until it is +%% certain that the manifest change has been committed. Some uncollected +%% garbage is considered acceptable. +%% +%% The process of committing a manifest change is as follows: +%% +%% A - The Clerk completes a merge, and casts a prompt to the Penciller with +%% a work item describing the change +%% +%% B - The Penciller commits the change to disk, and then calls the Clerk to +%% confirm the manifest change +%% +%% C - The Clerk replies immediately to acknowledge this call, then marks the +%% removed files for deletion +%% +%% Shutdown < A/B - If the Penciller starts the shutdown process before the +%% merge is complete, in the shutdown the Penciller will call a request for the +%% manifest change which will pick up the pending change. It will then confirm +%% the change, and now the Clerk will mark the files for delete before it +%% replies to the Penciller so it can complete the shutdown process (which will +%% prompt erasing of the removed files). +%% +%% The clerk will not request work on timeout if the committing of a manifest +%% change is pending confirmation. +%% +%% -------- TIMEOUTS --------- +%% +%% The Penciller may prompt the Clerk to callback soon (i.e. reduce the +%% Timeout) if it has urgent work ready (i.e. it has written a L0 file). +%% +%% There will also be a natural quick timeout once the committing of a manifest +%% change has occurred. +%% + +-module(leveled_pclerk). + +-behaviour(gen_server). + +-include("include/leveled.hrl"). + +-export([init/1, + handle_call/3, + handle_cast/2, + handle_info/2, + terminate/2, + clerk_new/1, + clerk_prompt/1, + clerk_manifestchange/3, + code_change/3]). + +-include_lib("eunit/include/eunit.hrl"). + +-define(MAX_TIMEOUT, 2000). +-define(MIN_TIMEOUT, 50). + +-record(state, {owner :: pid(), + change_pending=false :: boolean(), + work_item :: #penciller_work{}|null}). + +%%%============================================================================ +%%% API +%%%============================================================================ + +clerk_new(Owner) -> + {ok, Pid} = gen_server:start(?MODULE, [], []), + ok = gen_server:call(Pid, {register, Owner}, infinity), + leveled_log:log("PC001", [Pid, Owner]), + {ok, Pid}. + +clerk_manifestchange(Pid, Action, Closing) -> + gen_server:call(Pid, {manifest_change, Action, Closing}, infinity). + +clerk_prompt(Pid) -> + gen_server:cast(Pid, prompt). + + + +%%%============================================================================ +%%% gen_server callbacks +%%%============================================================================ + +init([]) -> + {ok, #state{}}. 
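+
+%% Mapping the commit protocol described above onto the callbacks below: the
+%% merge is performed from handle_info(timeout, ...), the Penciller is then
+%% prompted via pcl_promptmanifestchange, and the {manifest_change, confirm, _}
+%% call is what finally triggers mark_for_delete/2 on the replaced files.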
+ +handle_call({register, Owner}, _From, State) -> + {reply, + ok, + State#state{owner=Owner}, + ?MIN_TIMEOUT}; +handle_call({manifest_change, return, true}, _From, State) -> + leveled_log:log("PC002", []), + case State#state.change_pending of + true -> + WI = State#state.work_item, + {reply, {ok, WI}, State}; + false -> + {stop, normal, no_change, State} + end; +handle_call({manifest_change, confirm, Closing}, From, State) -> + case Closing of + true -> + leveled_log:log("PC003", []), + WI = State#state.work_item, + ok = mark_for_delete(WI#penciller_work.unreferenced_files, + State#state.owner), + {stop, normal, ok, State}; + false -> + leveled_log:log("PC004", []), + gen_server:reply(From, ok), + WI = State#state.work_item, + ok = mark_for_delete(WI#penciller_work.unreferenced_files, + State#state.owner), + {noreply, + State#state{work_item=null, change_pending=false}, + ?MIN_TIMEOUT} + end. + +handle_cast(prompt, State) -> + {noreply, State, ?MIN_TIMEOUT}. + +handle_info(timeout, State=#state{change_pending=Pnd}) when Pnd == false -> + case requestandhandle_work(State) of + {false, Timeout} -> + {noreply, State, Timeout}; + {true, WI} -> + % No timeout now as will wait for call to return manifest + % change + {noreply, + State#state{change_pending=true, work_item=WI}} + end. + + +terminate(Reason, _State) -> + leveled_log:log("PC005", [self(), Reason]). + +code_change(_OldVsn, State, _Extra) -> + {ok, State}. + + +%%%============================================================================ +%%% Internal functions +%%%============================================================================ + +requestandhandle_work(State) -> + case leveled_penciller:pcl_workforclerk(State#state.owner) of + none -> + leveled_log:log("PC006", []), + {false, ?MAX_TIMEOUT}; + WI -> + {NewManifest, FilesToDelete} = merge(WI), + UpdWI = WI#penciller_work{new_manifest=NewManifest, + unreferenced_files=FilesToDelete}, + leveled_log:log("PC007", []), + ok = leveled_penciller:pcl_promptmanifestchange(State#state.owner, + UpdWI), + {true, UpdWI} + end. 
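+
+%% Note that requestandhandle_work/1 is only reached from the timeout clause
+%% above, so the clerk is purely poll driven: it polls the Penciller every
+%% ?MAX_TIMEOUT ms when idle, after ?MIN_TIMEOUT ms when prompted, and not at
+%% all while a manifest change is pending confirmation.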
+ + +merge(WI) -> + SrcLevel = WI#penciller_work.src_level, + {SrcF, UpdMFest1} = select_filetomerge(SrcLevel, + WI#penciller_work.manifest), + SinkFiles = get_item(SrcLevel + 1, UpdMFest1, []), + {Candidates, Others} = check_for_merge_candidates(SrcF, SinkFiles), + %% TODO: + %% Need to work out if this is the top level + %% And then tell merge process to create files at the top level + %% Which will include the reaping of expired tombstones + leveled_log:log("PC008", [SrcLevel, length(Candidates)]), + + MergedFiles = case length(Candidates) of + 0 -> + %% If no overlapping candiates, manifest change only required + %% + %% TODO: need to think still about simply renaming when at + %% lower level + leveled_log:log("PC009", + [SrcF#manifest_entry.filename, SrcLevel + 1]), + [SrcF]; + _ -> + perform_merge({SrcF#manifest_entry.owner, + SrcF#manifest_entry.filename}, + Candidates, + {SrcLevel, WI#penciller_work.target_is_basement}, + {WI#penciller_work.ledger_filepath, + WI#penciller_work.next_sqn}) + end, + NewLevel = lists:sort(lists:append(MergedFiles, Others)), + UpdMFest2 = lists:keystore(SrcLevel + 1, + 1, + UpdMFest1, + {SrcLevel + 1, NewLevel}), + + ok = filelib:ensure_dir(WI#penciller_work.manifest_file), + {ok, Handle} = file:open(WI#penciller_work.manifest_file, + [binary, raw, write]), + ok = file:write(Handle, term_to_binary(UpdMFest2)), + ok = file:close(Handle), + case lists:member(SrcF, MergedFiles) of + true -> + {UpdMFest2, Candidates}; + false -> + %% Can rub out src file as it is not part of output + {UpdMFest2, Candidates ++ [SrcF]} + end. + + +mark_for_delete([], _Penciller) -> + ok; +mark_for_delete([Head|Tail], Penciller) -> + ok = leveled_sft:sft_setfordelete(Head#manifest_entry.owner, Penciller), + mark_for_delete(Tail, Penciller). + + +check_for_merge_candidates(SrcF, SinkFiles) -> + lists:partition(fun(Ref) -> + case {Ref#manifest_entry.start_key, + Ref#manifest_entry.end_key} of + {_, EK} when SrcF#manifest_entry.start_key > EK -> + false; + {SK, _} when SrcF#manifest_entry.end_key < SK -> + false; + _ -> + true + end end, + SinkFiles). + + +%% An algorithm for discovering which files to merge .... +%% We can find the most optimal file: +%% - The one with the most overlapping data below? +%% - The one that overlaps with the fewest files below? +%% - The smallest file? +%% We could try and be fair in some way (merge oldest first) +%% Ultimately, there is a lack of certainty that being fair or optimal is +%% genuinely better - eventually every file has to be compacted. +%% +%% Hence, the initial implementation is to select files to merge at random + +select_filetomerge(SrcLevel, Manifest) -> + {SrcLevel, LevelManifest} = lists:keyfind(SrcLevel, 1, Manifest), + Selected = lists:nth(random:uniform(length(LevelManifest)), + LevelManifest), + UpdManifest = lists:keyreplace(SrcLevel, + 1, + Manifest, + {SrcLevel, + lists:delete(Selected, + LevelManifest)}), + {Selected, UpdManifest}. + + + +%% Assumption is that there is a single SFT from a higher level that needs +%% to be merged into multiple SFTs at a lower level. This should create an +%% entirely new set of SFTs, and the calling process can then update the +%% manifest. +%% +%% Once the FileToMerge has been emptied, the remainder of the candidate list +%% needs to be placed in a remainder SFT that may be of a sub-optimal (small) +%% size. This stops the need to perpetually roll over the whole level if the +%% level consists of already full files. 
Some smartness may be required when +%% selecting the candidate list so that small files just outside the candidate +%% list be included to avoid a proliferation of small files. +%% +%% FileToMerge should be a tuple of {FileName, Pid} where the Pid is the Pid of +%% the gen_server leveled_sft process representing the file. +%% +%% CandidateList should be a list of {StartKey, EndKey, Pid} tuples +%% representing different gen_server leveled_sft processes, sorted by StartKey. +%% +%% The level is the level which the new files should be created at. + +perform_merge({SrcPid, SrcFN}, CandidateList, LevelInfo, {Filepath, MSN}) -> + leveled_log:log("PC010", [SrcFN, MSN]), + PointerList = lists:map(fun(P) -> + {next, P#manifest_entry.owner, all} end, + CandidateList), + do_merge([{next, SrcPid, all}], + PointerList, + LevelInfo, + {Filepath, MSN}, + 0, + []). + +do_merge([], [], {SrcLevel, _IsB}, {_Filepath, MSN}, FileCounter, OutList) -> + leveled_log:log("PC011", [MSN, SrcLevel, FileCounter]), + OutList; +do_merge(KL1, KL2, {SrcLevel, IsB}, {Filepath, MSN}, FileCounter, OutList) -> + FileName = lists:flatten(io_lib:format(Filepath ++ "_~w_~w.sft", + [SrcLevel + 1, FileCounter])), + leveled_log:log("PC012", [MSN, FileName]), + TS1 = os:timestamp(), + LevelR = case IsB of + true -> + #level{level = SrcLevel + 1, + is_basement = true, + timestamp = leveled_codec:integer_now()}; + false -> + SrcLevel + 1 + end, + {ok, Pid, Reply} = leveled_sft:sft_new(FileName, + KL1, + KL2, + LevelR), + case Reply of + {{[], []}, null, _} -> + leveled_log:log("PC013", [FileName]), + leveled_log:log("PC014", [FileName]), + ok = leveled_sft:sft_clear(Pid), + OutList; + {{KL1Rem, KL2Rem}, SmallestKey, HighestKey} -> + ExtMan = lists:append(OutList, + [#manifest_entry{start_key=SmallestKey, + end_key=HighestKey, + owner=Pid, + filename=FileName}]), + leveled_log:log_timer("PC015", [], TS1), + do_merge(KL1Rem, KL2Rem, + {SrcLevel, IsB}, {Filepath, MSN}, + FileCounter + 1, ExtMan) + end. + + +get_item(Index, List, Default) -> + case lists:keysearch(Index, 1, List) of + {value, {Index, Value}} -> + Value; + false -> + Default + end. + + +%%%============================================================================ +%%% Test +%%%============================================================================ + +-ifdef(TEST). + +generate_randomkeys(Count, BucketRangeLow, BucketRangeHigh) -> + generate_randomkeys(Count, [], BucketRangeLow, BucketRangeHigh). + +generate_randomkeys(0, Acc, _BucketLow, _BucketHigh) -> + Acc; +generate_randomkeys(Count, Acc, BucketLow, BRange) -> + BNumber = string:right(integer_to_list(BucketLow + random:uniform(BRange)), + 4, $0), + KNumber = string:right(integer_to_list(random:uniform(1000)), 4, $0), + RandKey = {{o, + "Bucket" ++ BNumber, + "Key" ++ KNumber}, + {Count + 1, + {active, infinity}, null}}, + generate_randomkeys(Count - 1, [RandKey|Acc], BucketLow, BRange). + +choose_pid_toquery([ManEntry|_T], Key) when + Key >= ManEntry#manifest_entry.start_key, + ManEntry#manifest_entry.end_key >= Key -> + ManEntry#manifest_entry.owner; +choose_pid_toquery([_H|T], Key) -> + choose_pid_toquery(T, Key). 
+ + +find_randomkeys(_FList, 0, _Source) -> + ok; +find_randomkeys(FList, Count, Source) -> + KV1 = lists:nth(random:uniform(length(Source)), Source), + K1 = leveled_codec:strip_to_keyonly(KV1), + P1 = choose_pid_toquery(FList, K1), + FoundKV = leveled_sft:sft_get(P1, K1), + Found = leveled_codec:strip_to_keyonly(FoundKV), + io:format("success finding ~w in ~w~n", [K1, P1]), + ?assertMatch(K1, Found), + find_randomkeys(FList, Count - 1, Source). + + +merge_file_test() -> + KL1_L1 = lists:sort(generate_randomkeys(16000, 0, 1000)), + {ok, PidL1_1, _} = leveled_sft:sft_new("../test/KL1_L1.sft", + KL1_L1, [], 1), + KL1_L2 = lists:sort(generate_randomkeys(16000, 0, 250)), + {ok, PidL2_1, _} = leveled_sft:sft_new("../test/KL1_L2.sft", + KL1_L2, [], 2), + KL2_L2 = lists:sort(generate_randomkeys(16000, 250, 250)), + {ok, PidL2_2, _} = leveled_sft:sft_new("../test/KL2_L2.sft", + KL2_L2, [], 2), + KL3_L2 = lists:sort(generate_randomkeys(16000, 500, 250)), + {ok, PidL2_3, _} = leveled_sft:sft_new("../test/KL3_L2.sft", + KL3_L2, [], 2), + KL4_L2 = lists:sort(generate_randomkeys(16000, 750, 250)), + {ok, PidL2_4, _} = leveled_sft:sft_new("../test/KL4_L2.sft", + KL4_L2, [], 2), + Result = perform_merge({PidL1_1, "../test/KL1_L1.sft"}, + [#manifest_entry{owner=PidL2_1}, + #manifest_entry{owner=PidL2_2}, + #manifest_entry{owner=PidL2_3}, + #manifest_entry{owner=PidL2_4}], + {2, false}, {"../test/", 99}), + lists:foreach(fun(ManEntry) -> + {o, B1, K1} = ManEntry#manifest_entry.start_key, + {o, B2, K2} = ManEntry#manifest_entry.end_key, + io:format("Result of ~s ~s and ~s ~s with Pid ~w~n", + [B1, K1, B2, K2, ManEntry#manifest_entry.owner]) end, + Result), + io:format("Finding keys in KL1_L1~n"), + ok = find_randomkeys(Result, 50, KL1_L1), + io:format("Finding keys in KL1_L2~n"), + ok = find_randomkeys(Result, 50, KL1_L2), + io:format("Finding keys in KL2_L2~n"), + ok = find_randomkeys(Result, 50, KL2_L2), + io:format("Finding keys in KL3_L2~n"), + ok = find_randomkeys(Result, 50, KL3_L2), + io:format("Finding keys in KL4_L2~n"), + ok = find_randomkeys(Result, 50, KL4_L2), + leveled_sft:sft_clear(PidL1_1), + leveled_sft:sft_clear(PidL2_1), + leveled_sft:sft_clear(PidL2_2), + leveled_sft:sft_clear(PidL2_3), + leveled_sft:sft_clear(PidL2_4), + lists:foreach(fun(ManEntry) -> + leveled_sft:sft_clear(ManEntry#manifest_entry.owner) end, + Result). + +select_merge_candidates_test() -> + Sink1 = #manifest_entry{start_key = {o, "Bucket", "Key1"}, + end_key = {o, "Bucket", "Key20000"}}, + Sink2 = #manifest_entry{start_key = {o, "Bucket", "Key20001"}, + end_key = {o, "Bucket1", "Key1"}}, + Src1 = #manifest_entry{start_key = {o, "Bucket", "Key40001"}, + end_key = {o, "Bucket", "Key60000"}}, + {Candidates, Others} = check_for_merge_candidates(Src1, [Sink1, Sink2]), + ?assertMatch([Sink2], Candidates), + ?assertMatch([Sink1], Others). + + +select_merge_file_test() -> + L0 = [{{o, "B1", "K1"}, {o, "B3", "K3"}, dummy_pid}], + L1 = [{{o, "B1", "K1"}, {o, "B2", "K2"}, dummy_pid}, + {{o, "B2", "K3"}, {o, "B4", "K4"}, dummy_pid}], + Manifest = [{0, L0}, {1, L1}], + {FileRef, NewManifest} = select_filetomerge(0, Manifest), + ?assertMatch(FileRef, {{o, "B1", "K1"}, {o, "B3", "K3"}, dummy_pid}), + ?assertMatch(NewManifest, [{0, []}, {1, L1}]). + +-endif. 
diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl new file mode 100644 index 0000000..287f122 --- /dev/null +++ b/src/leveled_penciller.erl @@ -0,0 +1,1662 @@ +%% -------- PENCILLER --------- +%% +%% The penciller is responsible for writing and re-writing the ledger - a +%% persisted, ordered view of non-recent Keys and Metadata which have been +%% added to the store. +%% - The penciller maintains a manifest of all the files within the current +%% Ledger. +%% - The Penciller provides re-write (compaction) work to be managed by +%% the Penciller's Clerk +%% - The Penciller can be cloned and maintains a register of clones who have +%% requested snapshots of the Ledger +%% - It accepts new dumps (in the form of a gb_tree) from the Bookie, and +%% calls the Bookie once the process of pencilling this data in the Ledger is +%% complete - and the Bookie is free to forget about the data +%% - The Penciller's persistence of the ledger may not be reliable, in that it +%% may lose data but only in sequence from a particular sequence number. On +%% startup the Penciller will inform the Bookie of the highest sequence number +%% it has, and the Bookie should load any missing data from that point out of +%% the journal. +%% +%% -------- LEDGER --------- +%% +%% The Ledger is divided into many levels +%% - L0: New keys are received from the Bookie and merged into a single +%% gb_tree, until that tree is the size of a SFT file, and it is then persisted +%% as a SFT file at this level. L0 SFT files can be larger than the normal +%% maximum size - so we don't have to consider problems of either having more +%% than one L0 file (and handling what happens on a crash between writing the +%% files when the second may have overlapping sequence numbers), or having a +%% remainder with overlapping sequence numbers in memory after the file is +%% written. Once the persistence is completed, the L0 tree can be erased. +%% There can be only one SFT file at Level 0, so the work to merge that file +%% to the lower level must be the highest priority, as otherwise writes to the +%% ledger will stall, when there is next a need to persist. +%% - L1 TO L7: May contain multiple processes managing non-overlapping sft +%% files. Compaction work should be scheduled if the number of files exceeds +%% the target size of the level, where the target size is 8 ^ n. +%% - L Minus 1: Used to cache the last ledger cache push for use in queries +%% whilst the Penciller awaits a callback from the roll_clerk with the new +%% merged L0 file containing the L-1 updates. +%% +%% +%% The most recent revision of a Key can be found by checking each level until +%% the key is found. To check a level the correct file must be sought from the +%% manifest for that level, and then a call is made to that file. If the Key +%% is not present then every level should be checked. +%% +%% If a compaction change takes the size of a level beyond the target size, +%% then compaction work for that level + 1 should be added to the compaction +%% work queue. +%% Compaction work is fetched by the Penciller's Clerk because: +%% - it has timed out due to a period of inactivity +%% - it has been triggered by a cast to indicate the arrival of high +%% priority compaction work +%% The Penciller's Clerk (which performs compaction work) will always call +%% the Penciller to find out the highest priority work currently required +%% whenever it has either completed work, or a timeout has occurred since it +%% was informed there was no work to do.
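+%%
+%% As a worked illustration of the 8 ^ n sizing rule above (a sketch only -
+%% the actual per-level targets used by this module are held in the
+%% LEVEL_SCALEFACTOR macro defined further down):
+%%
+%%   1> [{Level, trunc(math:pow(8, Level))} || Level <- lists:seq(1, 6)].
+%%   [{1,8},{2,64},{3,512},{4,4096},{5,32768},{6,262144}]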
+%% +%% When the clerk picks work it will take the current manifest, and the +%% Penciller assumes the manifest sequence number is to be incremented. +%% When the clerk has completed the work it can request that the manifest +%% change be committed by the Penciller. The commit is made through changing +%% the filename of the new manifest - so the Penciller is not held up by the +%% process of writing a file, just altering file system metadata. +%% +%% The manifest is locked by a clerk taking work, or by there being a need to +%% write a file to Level 0. If the manifest is locked, then new keys can still +%% be added in memory - however, the response to that push will be to "pause", +%% that is to say the Penciller will ask the Bookie to slow down. +%% +%% ---------- PUSH ---------- +%% +%% The Penciller must support the PUSH of a dump of keys from the Bookie. The +%% call to PUSH should be immediately acknowledged, and then work should be +%% completed to merge the tree into the L0 tree (with the tree being cached as +%% a Level -1 tree so as not to block reads whilst it waits). +%% +%% The Penciller MUST NOT accept a new PUSH if the Clerk has commenced the +%% conversion of the current ETS table into a SFT file, but not completed this +%% change. The Penciller in this case returns the push, and the Bookie should +%% continue to grow the cache before trying again. +%% +%% ---------- FETCH ---------- +%% +%% On request to fetch a key the Penciller should look first in the L0 ETS +%% table, and then look in the SFT files Level by Level, consulting the +%% Manifest to determine which file should be checked at each level. +%% +%% ---------- SNAPSHOT ---------- +%% +%% Iterators may request a snapshot of the database. A snapshot is a cloned +%% Penciller seeded not from disk, but by the in-memory L0 gb_tree and the +%% in-memory manifest, allowing for direct reference to the SFT file processes. +%% +%% Clones formed to support snapshots are registered by the Penciller, so that +%% SFT files valid at the point of the snapshot are retained until either the +%% iterator is completed or has timed out. +%% +%% ---------- ON STARTUP ---------- +%% +%% On Startup the Bookie will ask the Penciller to initiate the Ledger first. +%% To initiate the Ledger it must consult the manifest, and then start a SFT +%% management process for each file in the manifest. +%% +%% The penciller should then try and read any Level 0 file which has the +%% manifest sequence number one higher than the last stored in the manifest. +%% +%% The Bookie will ask the Inker for any Keys seen beyond that sequence number +%% before the startup of the overall store can be completed. +%% +%% ---------- ON SHUTDOWN ---------- +%% +%% On a controlled shutdown the Penciller should attempt to write any in-memory +%% ETS table to a L0 SFT file, assuming one is not already pending. If one is +%% already pending then the Penciller will not persist this part of the Ledger. +%% +%% ---------- FOLDER STRUCTURE ---------- +%% +%% The following folders are used by the Penciller +%% $ROOT/ledger/ledger_manifest/ - used for keeping manifest files +%% $ROOT/ledger/ledger_files/ - containing individual SFT files +%% +%% In larger stores there could be a large number of files in the ledger_files +%% folder - perhaps o(1000). It is assumed that modern file systems should +%% handle this efficiently. +%% +%% ---------- COMPACTION & MANIFEST UPDATES ---------- +%% +%% The Penciller can have one and only one Clerk for performing compaction +%% work.
When the Clerk has requested and taken work, it should perform the +%% compaction work, starting the new SFT process to manage the new Ledger state +%% and then write a new manifest file that represents that state, using +%% the next Manifest sequence number as the filename: +%% - nonzero_<ManifestSQN>.pnd +%% +%% The Penciller on accepting the change should rename the manifest file to - +%% - nonzero_<ManifestSQN>.crr +%% +%% On startup, the Penciller should look for the nonzero_*.crr file with the +%% highest such manifest sequence number. This will be started as the +%% manifest, together with any <ManifestSQN>_0_0.sft file found at that Manifest SQN. +%% Level zero files are not kept in the persisted manifest, and adding a L0 +%% file does not advance the Manifest SQN. +%% +%% The pace at which the store can accept updates will be dependent on the +%% speed at which the Penciller's Clerk can merge files at lower levels plus +%% the time it takes to merge from Level 0. If a clerk has commenced +%% compaction work at a lower level and then immediately a L0 SFT file is +%% written, the Penciller will need to wait for this compaction work to +%% complete and the L0 file to be compacted before the ETS table can be +%% allowed to again reach capacity. +%% +%% The writing of L0 files does not require the involvement of the clerk. +%% The L0 files are prompted directly by the penciller when the in-memory tree +%% has reached capacity. When there is a next push into memory the Penciller +%% calls to check that the file is now active (which may pause the acceptance +%% of the push if the write is ongoing), and if so it can clear the L0 tree +%% and build a new tree from an empty tree and the keys from the latest push. +%% +%% Only a single L0 file may exist at any one moment in time. If pushes are +%% received when memory is over the maximum size, the pushes must be kept in +%% memory. +%% +%% 1 - A L0 file is prompted to be created at ManifestSQN n +%% 2 - The next push to memory will be stalled until the L0 write is reported +%% as completed (as the memory needs to be flushed) +%% 3 - The completion of the L0 file will cause a prompt to be cast to the +%% clerk for it to look for work +%% 4 - On completion of the merge (of the L0 file into L1, as this will be the +%% highest priority work), the clerk will create a new manifest file at +%% manifest SQN n+1 +%% 5 - The clerk will prompt the penciller about the change, and the Penciller +%% will then commit the change (by renaming the manifest file to be active, and +%% advancing the in-memory state of the manifest and manifest SQN) +%% 6 - The Penciller having committed the change will cast back to the Clerk +%% to inform the Clerk that the change has been committed, and so it can carry +%% on requesting new work +%% 7 - If the Penciller now receives a Push to over the max size, a new L0 file +%% can now be created with the ManifestSQN of n+1 +%% +%% ---------- NOTES ON THE (NON) USE OF ETS ---------- +%% +%% Insertion into ETS is very fast, and so using ETS does not slow the PUT +%% path. However, an ETS table is mutable, so it does complicate the +%% snapshotting of the Ledger. +%% +%% Originally the solution had used an ETS table for insertion speed as the L0 +%% cache. Insertion speed was an order of magnitude faster than gb_trees. +%% Resolving the issue of trying to have fast start-up snapshots, though, led to +%% keeping a separate set of trees alongside the ETS table to be used by +%% snapshots.
+%% +%% The next strategy was to perform the expensive operation (merging the +%% Ledger cache into the Level0 cache), within a dedicated Penciller's clerk, +%% known as the roll_clerk. This may take 30-40ms, but during this period +%% the Penciller will keep a Level -1 cache of the unmerged elements which +%% it will wipe once the roll_clerk returns with an updated L0 cache. +%% +%% This was still a bit complicated, and did a lot of processing in +%% making updates to the large L0 cache - which will have created a lot of GC +%% effort. The processing was inefficient. +%% +%% The current approach is to simply append each new tree pushed to a list, and +%% use an array of hashes to index for the presence of objects in the list. +%% When attempting to iterate, the caches are all merged for the range relevant +%% to the given iterator only. The main downside to the approach is that the +%% Penciller can no longer accurately measure the size of the L0 cache (as it +%% cannot determine how many replacements there are in the Cache), so it may +%% prematurely write a smaller than necessary L0 file. + +-module(leveled_penciller). + +-behaviour(gen_server). + +-include("include/leveled.hrl"). + +-export([init/1, + handle_call/3, + handle_cast/2, + handle_info/2, + terminate/2, + code_change/3, + pcl_start/1, + pcl_pushmem/2, + pcl_fetchlevelzero/2, + pcl_fetch/2, + pcl_fetchkeys/5, + pcl_checksequencenumber/3, + pcl_workforclerk/1, + pcl_promptmanifestchange/2, + pcl_confirml0complete/4, + pcl_confirmdelete/2, + pcl_close/1, + pcl_registersnapshot/2, + pcl_releasesnapshot/2, + pcl_loadsnapshot/2, + pcl_getstartupsequencenumber/1, + clean_testdir/1]). + +-include_lib("eunit/include/eunit.hrl"). + +-define(LEVEL_SCALEFACTOR, [{0, 0}, {1, 8}, {2, 64}, {3, 512}, + {4, 4096}, {5, 32768}, {6, 262144}, + {7, infinity}]). +-define(MAX_LEVELS, 8). +-define(MAX_WORK_WAIT, 300). +-define(MANIFEST_FP, "ledger_manifest"). +-define(FILES_FP, "ledger_files"). +-define(CURRENT_FILEX, "crr"). +-define(PENDING_FILEX, "pnd"). +-define(MEMTABLE, mem). +-define(MAX_TABLESIZE, 32000). +-define(PROMPT_WAIT_ONL0, 5). +-define(WORKQUEUE_BACKLOG_TOLERANCE, 4). + + +-record(state, {manifest = [] :: list(), + manifest_sqn = 0 :: integer(), + ledger_sqn = 0 :: integer(), % The highest SQN added to L0 + persisted_sqn = 0 :: integer(), % The highest SQN persisted + registered_snapshots = [] :: list(), + unreferenced_files = [] :: list(), + root_path = "../test" :: string(), + + clerk :: pid(), + + levelzero_pending = false :: boolean(), + levelzero_constructor :: pid(), + levelzero_cache = [] :: list(), % a list of gb_trees + levelzero_index :: erlang:array(), + levelzero_size = 0 :: integer(), + levelzero_maxcachesize :: integer(), + + is_snapshot = false :: boolean(), + snapshot_fully_loaded = false :: boolean(), + source_penciller :: pid(), + + ongoing_work = [] :: list(), + work_backlog = false :: boolean()}). + + +%%%============================================================================ +%%% API +%%%============================================================================ + + +pcl_start(PCLopts) -> + gen_server:start(?MODULE, [PCLopts], []). + +pcl_pushmem(Pid, DumpList) -> + %% Bookie to dump memory onto penciller + gen_server:call(Pid, {push_mem, DumpList}, infinity). + +pcl_fetchlevelzero(Pid, Slot) -> + %% Timeout to cause crash of L0 file when it can't get the close signal + %% as it is deadlocked making this call.
+ %% + %% If the timeout gets hit outside of close scenario the Penciller will + %% be stuck in L0 pending + gen_server:call(Pid, {fetch_levelzero, Slot}, 10000). + +pcl_fetch(Pid, Key) -> + gen_server:call(Pid, {fetch, Key}, infinity). + +pcl_fetchkeys(Pid, StartKey, EndKey, AccFun, InitAcc) -> + gen_server:call(Pid, + {fetch_keys, StartKey, EndKey, AccFun, InitAcc}, + infinity). + +pcl_checksequencenumber(Pid, Key, SQN) -> + gen_server:call(Pid, {check_sqn, Key, SQN}, infinity). + +pcl_workforclerk(Pid) -> + gen_server:call(Pid, work_for_clerk, infinity). + +pcl_promptmanifestchange(Pid, WI) -> + gen_server:cast(Pid, {manifest_change, WI}). + +pcl_confirml0complete(Pid, FN, StartKey, EndKey) -> + gen_server:cast(Pid, {levelzero_complete, FN, StartKey, EndKey}). + +pcl_confirmdelete(Pid, FileName) -> + gen_server:cast(Pid, {confirm_delete, FileName}). + +pcl_getstartupsequencenumber(Pid) -> + gen_server:call(Pid, get_startup_sqn, infinity). + +pcl_registersnapshot(Pid, Snapshot) -> + gen_server:call(Pid, {register_snapshot, Snapshot}, infinity). + +pcl_releasesnapshot(Pid, Snapshot) -> + gen_server:cast(Pid, {release_snapshot, Snapshot}). + +pcl_loadsnapshot(Pid, Increment) -> + gen_server:call(Pid, {load_snapshot, Increment}, infinity). + + +pcl_close(Pid) -> + gen_server:call(Pid, close, 60000). + + +%%%============================================================================ +%%% gen_server callbacks +%%%============================================================================ + +init([PCLopts]) -> + case {PCLopts#penciller_options.root_path, + PCLopts#penciller_options.start_snapshot} of + {undefined, true} -> + SrcPenciller = PCLopts#penciller_options.source_penciller, + {ok, State} = pcl_registersnapshot(SrcPenciller, self()), + leveled_log:log("P0001", [self()]), + {ok, State#state{is_snapshot=true, source_penciller=SrcPenciller}}; + %% Need to do something about timeout + {_RootPath, false} -> + start_from_file(PCLopts) + end. + + +handle_call({push_mem, PushedTree}, From, State=#state{is_snapshot=Snap}) + when Snap == false -> + % The push_mem process is as follows: + % + % 1 - Receive a gb_tree containing the latest Key/Value pairs (note that + % we mean value from the perspective of the Ledger, not the full value + % stored in the Inker) + % + % 2 - Check to see if there is a levelzero file pending. If so, the + % update must be returned. If not the update can be accepted + % + % 3 - The Penciller can now reply to the Bookie to show if the push has + % been accepted + % + % 4 - Update the cache: + % a) Append the cache to the list + % b) Add hashes for all the elements to the index + % + % Check the approximate size of the cache. If it is over the maximum size, + % trigger a backgroun L0 file write and update state of levelzero_pending. 
+ case {State#state.levelzero_pending, State#state.work_backlog} of + {true, _} -> + leveled_log:log("P0018", [returned, "L-0 persist pending"]), + {reply, returned, State}; + {false, true} -> + leveled_log:log("P0018", [returned, "Merge tree work backlog"]), + {reply, returned, State}; + {false, false} -> + leveled_log:log("P0018", [ok, "L0 memory updated"]), + gen_server:reply(From, ok), + {noreply, update_levelzero(State#state.levelzero_index, + State#state.levelzero_size, + PushedTree, + State#state.ledger_sqn, + State#state.levelzero_cache, + State)} + end; +handle_call({fetch, Key}, _From, State) -> + {reply, + fetch_mem(Key, + State#state.manifest, + State#state.levelzero_index, + State#state.levelzero_cache), + State}; +handle_call({check_sqn, Key, SQN}, _From, State) -> + {reply, + compare_to_sqn(fetch_mem(Key, + State#state.manifest, + State#state.levelzero_index, + State#state.levelzero_cache), + SQN), + State}; +handle_call({fetch_keys, StartKey, EndKey, AccFun, InitAcc}, + _From, + State=#state{snapshot_fully_loaded=Ready}) + when Ready == true -> + L0AsTree = leveled_pmem:merge_trees(StartKey, + EndKey, + State#state.levelzero_cache, + gb_trees:empty()), + L0iter = gb_trees:iterator(L0AsTree), + SFTiter = initiate_rangequery_frommanifest(StartKey, + EndKey, + State#state.manifest), + Acc = keyfolder(L0iter, SFTiter, StartKey, EndKey, {AccFun, InitAcc}), + {reply, Acc, State}; +handle_call(work_for_clerk, From, State) -> + {UpdState, Work} = return_work(State, From), + {reply, Work, UpdState}; +handle_call(get_startup_sqn, _From, State) -> + {reply, State#state.persisted_sqn, State}; +handle_call({register_snapshot, Snapshot}, _From, State) -> + Rs = [{Snapshot, State#state.manifest_sqn}|State#state.registered_snapshots], + {reply, {ok, State}, State#state{registered_snapshots = Rs}}; +handle_call({load_snapshot, BookieIncrTree}, _From, State) -> + L0D = leveled_pmem:add_to_index(State#state.levelzero_index, + State#state.levelzero_size, + BookieIncrTree, + State#state.ledger_sqn, + State#state.levelzero_cache), + {LedgerSQN, L0Size, L0Index, L0Cache} = L0D, + {reply, ok, State#state{levelzero_cache=L0Cache, + levelzero_index=L0Index, + levelzero_size=L0Size, + ledger_sqn=LedgerSQN, + snapshot_fully_loaded=true}}; +handle_call({fetch_levelzero, Slot}, _From, State) -> + {reply, lists:nth(Slot, State#state.levelzero_cache), State}; +handle_call(close, _From, State) -> + {stop, normal, ok, State}. 
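+
+%% A minimal sketch (hypothetical helper, not part of this module's API) of
+%% how a caller is expected to react to the 'returned' response described in
+%% the push_mem notes above: back off briefly and retry the push once the L0
+%% persist or the merge backlog has had a chance to clear.
+
+example_push_with_backoff(Penciller, LedgerTree) ->
+    case pcl_pushmem(Penciller, LedgerTree) of
+        ok ->
+            ok;
+        returned ->
+            timer:sleep(50),
+            example_push_with_backoff(Penciller, LedgerTree)
+    end.
+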
+ + +handle_cast({manifest_change, WI}, State) -> + {ok, UpdState} = commit_manifest_change(WI, State), + ok = leveled_pclerk:clerk_manifestchange(State#state.clerk, + confirm, + false), + {noreply, UpdState}; +handle_cast({release_snapshot, Snapshot}, State) -> + Rs = lists:keydelete(Snapshot, 1, State#state.registered_snapshots), + leveled_log:log("P0003", [Snapshot]), + leveled_log:log("P0004", [Rs]), + {noreply, State#state{registered_snapshots=Rs}}; +handle_cast({confirm_delete, FileName}, State=#state{is_snapshot=Snap}) + when Snap == false -> + Reply = confirm_delete(FileName, + State#state.unreferenced_files, + State#state.registered_snapshots), + case Reply of + {true, Pid} -> + UF1 = lists:keydelete(FileName, 1, State#state.unreferenced_files), + leveled_log:log("P0005", [FileName]), + ok = leveled_sft:sft_deleteconfirmed(Pid), + {noreply, State#state{unreferenced_files=UF1}}; + _ -> + {noreply, State} + end; +handle_cast({levelzero_complete, FN, StartKey, EndKey}, State) -> + leveled_log:log("P0029", []), + ManEntry = #manifest_entry{start_key=StartKey, + end_key=EndKey, + owner=State#state.levelzero_constructor, + filename=FN}, + UpdMan = lists:keystore(0, 1, State#state.manifest, {0, [ManEntry]}), + % Prompt clerk to ask about work - do this for every L0 roll + ok = leveled_pclerk:clerk_prompt(State#state.clerk), + {noreply, State#state{levelzero_cache=[], + levelzero_pending=false, + levelzero_constructor=undefined, + levelzero_index=leveled_pmem:new_index(), + levelzero_size=0, + manifest=UpdMan, + persisted_sqn=State#state.ledger_sqn}}. + + +handle_info(_Info, State) -> + {noreply, State}. + +terminate(Reason, State=#state{is_snapshot=Snap}) when Snap == true -> + ok = pcl_releasesnapshot(State#state.source_penciller, self()), + leveled_log:log("P0007", [Reason]), + ok; +terminate(Reason, State) -> + %% When a Penciller shuts down it isn't safe to try and manage the safe + %% finishing of any outstanding work. The last committed manifest will + %% be used. + %% + %% Level 0 files lie outside of the manifest, and so if there is no L0 + %% file present it is safe to write the current contents of memory. If + %% there is a L0 file present - then the memory can be dropped (it is + %% recoverable from the ledger, and there should not be a lot to recover + %% as presumably the ETS table has been recently flushed, hence the presence + %% of a L0 file). + %% + %% The penciller should close each file in the unreferenced files, and + %% then each file in the manifest, and cast a close on the clerk.
+ %% The cast may not succeed as the clerk could be synchronously calling + %% the penciller looking for a manifest commit + %% + leveled_log:log("P0008", [Reason]), + MC = leveled_pclerk:clerk_manifestchange(State#state.clerk, + return, + true), + UpdState = case MC of + {ok, WI} -> + {ok, NewState} = commit_manifest_change(WI, State), + Clerk = State#state.clerk, + ok = leveled_pclerk:clerk_manifestchange(Clerk, + confirm, + true), + NewState; + no_change -> + State + end, + case {UpdState#state.levelzero_pending, + get_item(0, UpdState#state.manifest, []), + UpdState#state.levelzero_size} of + {true, [], _} -> + ok = leveled_sft:sft_close(UpdState#state.levelzero_constructor); + {false, [], 0} -> + leveled_log:log("P0009", []); + {false, [], _N} -> + L0Pid = roll_memory(UpdState, true), + ok = leveled_sft:sft_close(L0Pid); + _ -> + leveled_log:log("P0010", []) + end, + + % Tidy shutdown of individual files + ok = close_files(0, UpdState#state.manifest), + lists:foreach(fun({_FN, Pid, _SN}) -> + ok = leveled_sft:sft_close(Pid) end, + UpdState#state.unreferenced_files), + leveled_log:log("P0011", []), + ok. + + +code_change(_OldVsn, State, _Extra) -> + {ok, State}. + + +%%%============================================================================ +%%% Internal functions +%%%============================================================================ + +start_from_file(PCLopts) -> + RootPath = PCLopts#penciller_options.root_path, + MaxTableSize = case PCLopts#penciller_options.max_inmemory_tablesize of + undefined -> + ?MAX_TABLESIZE; + M -> + M + end, + + {ok, MergeClerk} = leveled_pclerk:clerk_new(self()), + InitState = #state{clerk=MergeClerk, + root_path=RootPath, + levelzero_index = leveled_pmem:new_index(), + levelzero_maxcachesize=MaxTableSize}, + + %% Open manifest + ManifestPath = InitState#state.root_path ++ "/" ++ ?MANIFEST_FP ++ "/", + ok = filelib:ensure_dir(ManifestPath), + {ok, Filenames} = file:list_dir(ManifestPath), + CurrRegex = "nonzero_(?<MSN>[0-9]+)\\."
++ ?CURRENT_FILEX, + ValidManSQNs = lists:foldl(fun(FN, Acc) -> + case re:run(FN, + CurrRegex, + [{capture, ['MSN'], list}]) of + nomatch -> + Acc; + {match, [Int]} when is_list(Int) -> + Acc ++ [list_to_integer(Int)] + end + end, + [], + Filenames), + TopManSQN = lists:foldl(fun(X, MaxSQN) -> max(X, MaxSQN) end, + 0, + ValidManSQNs), + leveled_log:log("P0012", [TopManSQN]), + ManUpdate = case TopManSQN of + 0 -> + leveled_log:log("P0013", []), + {[], 0}; + _ -> + CurrManFile = filepath(InitState#state.root_path, + TopManSQN, + current_manifest), + {ok, Bin} = file:read_file(CurrManFile), + Manifest = binary_to_term(Bin), + open_all_filesinmanifest(Manifest) + end, + + {UpdManifest, MaxSQN} = ManUpdate, + leveled_log:log("P0014", [MaxSQN]), + + %% Find any L0 files + L0FN = filepath(RootPath, TopManSQN, new_merge_files) ++ "_0_0.sft", + case filelib:is_file(L0FN) of + true -> + leveled_log:log("P0015", [L0FN]), + {ok, + L0Pid, + {L0StartKey, L0EndKey}} = leveled_sft:sft_open(L0FN), + L0SQN = leveled_sft:sft_getmaxsequencenumber(L0Pid), + ManifestEntry = #manifest_entry{start_key=L0StartKey, + end_key=L0EndKey, + owner=L0Pid, + filename=L0FN}, + UpdManifest2 = lists:keystore(0, + 1, + UpdManifest, + {0, [ManifestEntry]}), + leveled_log:log("P0016", [L0SQN]), + LedgerSQN = max(MaxSQN, L0SQN), + {ok, + InitState#state{manifest=UpdManifest2, + manifest_sqn=TopManSQN, + ledger_sqn=LedgerSQN, + persisted_sqn=LedgerSQN}}; + false -> + leveled_log:log("P0017", []), + {ok, + InitState#state{manifest=UpdManifest, + manifest_sqn=TopManSQN, + ledger_sqn=MaxSQN, + persisted_sqn=MaxSQN}} + end. + + + +update_levelzero(L0Index, L0Size, PushedTree, LedgerSQN, L0Cache, State) -> + Update = leveled_pmem:add_to_index(L0Index, + L0Size, + PushedTree, + LedgerSQN, + L0Cache), + {MaxSQN, NewL0Size, UpdL0Index, UpdL0Cache} = Update, + if + MaxSQN >= LedgerSQN -> + UpdState = State#state{levelzero_cache=UpdL0Cache, + levelzero_index=UpdL0Index, + levelzero_size=NewL0Size, + ledger_sqn=MaxSQN}, + CacheTooBig = NewL0Size > State#state.levelzero_maxcachesize, + Level0Free = length(get_item(0, State#state.manifest, [])) == 0, + case {CacheTooBig, Level0Free} of + {true, true} -> + L0Constructor = roll_memory(UpdState, false), + UpdState#state{levelzero_pending=true, + levelzero_constructor=L0Constructor}; + _ -> + UpdState + end; + NewL0Size == L0Size -> + State#state{levelzero_cache=L0Cache, + levelzero_index=L0Index, + levelzero_size=L0Size, + ledger_sqn=LedgerSQN} + end. + + +%% Casting a large object (the levelzero cache) to the gen_server did not lead +%% to an immediate return as expected. With 32K keys in the TreeList it could +%% take around 35-40ms. +%% +%% To avoid blocking this gen_server, the SFT file can request each item of the +%% cache one at a time. 
+%% +%% The Wait is set to false to use a cast when calling this in normal operation +%% where as the Wait of true is used at shutdown + +roll_memory(State, false) -> + FileName = levelzero_filename(State), + leveled_log:log("P0019", [FileName]), + Opts = #sft_options{wait=false, penciller=self()}, + PCL = self(), + FetchFun = fun(Slot) -> pcl_fetchlevelzero(PCL, Slot) end, + % FetchFun = fun(Slot) -> lists:nth(Slot, State#state.levelzero_cache) end, + R = leveled_sft:sft_newfroml0cache(FileName, + length(State#state.levelzero_cache), + FetchFun, + Opts), + {ok, Constructor, _} = R, + Constructor; +roll_memory(State, true) -> + FileName = levelzero_filename(State), + Opts = #sft_options{wait=true}, + FetchFun = fun(Slot) -> lists:nth(Slot, State#state.levelzero_cache) end, + R = leveled_sft:sft_newfroml0cache(FileName, + length(State#state.levelzero_cache), + FetchFun, + Opts), + {ok, Constructor, _} = R, + Constructor. + +levelzero_filename(State) -> + MSN = State#state.manifest_sqn, + FileName = State#state.root_path + ++ "/" ++ ?FILES_FP ++ "/" + ++ integer_to_list(MSN) ++ "_0_0", + FileName. + + +fetch_mem(Key, Manifest, L0Index, L0Cache) -> + L0Check = leveled_pmem:check_levelzero(Key, L0Index, L0Cache), + case L0Check of + {false, not_found} -> + fetch(Key, Manifest, 0, fun leveled_sft:sft_get/2); + {true, KV} -> + KV + end. + +fetch(_Key, _Manifest, ?MAX_LEVELS + 1, _FetchFun) -> + not_present; +fetch(Key, Manifest, Level, FetchFun) -> + LevelManifest = get_item(Level, Manifest, []), + case lists:foldl(fun(File, Acc) -> + case Acc of + not_present when + Key >= File#manifest_entry.start_key, + File#manifest_entry.end_key >= Key -> + File#manifest_entry.owner; + PidFound -> + PidFound + end end, + not_present, + LevelManifest) of + not_present -> + fetch(Key, Manifest, Level + 1, FetchFun); + FileToCheck -> + case FetchFun(FileToCheck, Key) of + not_present -> + fetch(Key, Manifest, Level + 1, FetchFun); + ObjectFound -> + ObjectFound + end + end. + + +compare_to_sqn(Obj, SQN) -> + case Obj of + not_present -> + false; + Obj -> + SQNToCompare = leveled_codec:strip_to_seqonly(Obj), + if + SQNToCompare > SQN -> + false; + true -> + true + end + end. 
+ + +%% Work out what the current work queue should be +%% +%% The work queue should have a lower level work at the front, and no work +%% should be added to the queue if a compaction worker has already been asked +%% to look at work at that level +%% +%% The full queue is calculated for logging purposes only + +return_work(State, From) -> + {WorkQ, BasementL} = assess_workqueue([], 0, State#state.manifest, 0), + case length(WorkQ) of + L when L > 0 -> + Excess = lists:foldl(fun({_, _, OH}, Acc) -> Acc+OH end, 0, WorkQ), + [{SrcLevel, Manifest, _Overhead}|_OtherWork] = WorkQ, + leveled_log:log("P0020", [SrcLevel, From, Excess]), + IsBasement = if + SrcLevel + 1 == BasementL -> + true; + true -> + false + end, + Backlog = Excess >= ?WORKQUEUE_BACKLOG_TOLERANCE, + case State#state.levelzero_pending of + true -> + % Once the L0 file is completed there will be more work + % - so don't be busy doing other work now + leveled_log:log("P0021", []), + {State#state{work_backlog=Backlog}, none}; + false -> + %% No work currently outstanding + %% Can allocate work + NextSQN = State#state.manifest_sqn + 1, + FP = filepath(State#state.root_path, + NextSQN, + new_merge_files), + ManFile = filepath(State#state.root_path, + NextSQN, + pending_manifest), + WI = #penciller_work{next_sqn=NextSQN, + clerk=From, + src_level=SrcLevel, + manifest=Manifest, + start_time = os:timestamp(), + ledger_filepath = FP, + manifest_file = ManFile, + target_is_basement = IsBasement}, + {State#state{ongoing_work=[WI], work_backlog=Backlog}, WI} + end; + _ -> + {State#state{work_backlog=false}, none} + end. + + +close_files(?MAX_LEVELS - 1, _Manifest) -> + ok; +close_files(Level, Manifest) -> + LevelList = get_item(Level, Manifest, []), + lists:foreach(fun(F) -> + ok = leveled_sft:sft_close(F#manifest_entry.owner) end, + LevelList), + close_files(Level + 1, Manifest). + + +open_all_filesinmanifest(Manifest) -> + open_all_filesinmanifest({Manifest, 0}, 0). + +open_all_filesinmanifest(Result, ?MAX_LEVELS - 1) -> + Result; +open_all_filesinmanifest({Manifest, TopSQN}, Level) -> + LevelList = get_item(Level, Manifest, []), + %% The Pids in the saved manifest related to now closed references + %% Need to roll over the manifest at this level starting new processes to + %5 replace them + LvlR = lists:foldl(fun(F, {FL, FL_SQN}) -> + FN = F#manifest_entry.filename, + {ok, P, _Keys} = leveled_sft:sft_open(FN), + F_SQN = leveled_sft:sft_getmaxsequencenumber(P), + {lists:append(FL, + [F#manifest_entry{owner = P}]), + max(FL_SQN, F_SQN)} + end, + {[], 0}, + LevelList), + %% Result is tuple of revised file list for this level in manifest, and + %% the maximum sequence number seen at this level + {LvlFL, LvlSQN} = LvlR, + UpdManifest = lists:keystore(Level, 1, Manifest, {Level, LvlFL}), + open_all_filesinmanifest({UpdManifest, max(TopSQN, LvlSQN)}, Level + 1). + +print_manifest(Manifest) -> + lists:foreach(fun(L) -> + leveled_log:log("P0022", [L]), + Level = get_item(L, Manifest, []), + lists:foreach(fun print_manifest_entry/1, Level) + end, + lists:seq(0, ?MAX_LEVELS - 1)), + ok. + +print_manifest_entry(Entry) -> + {S1, S2, S3} = leveled_codec:print_key(Entry#manifest_entry.start_key), + {E1, E2, E3} = leveled_codec:print_key(Entry#manifest_entry.end_key), + leveled_log:log("P0023", + [S1, S2, S3, E1, E2, E3, Entry#manifest_entry.filename]). 
+ +initiate_rangequery_frommanifest(StartKey, EndKey, Manifest) -> + CompareFun = fun(M) -> + C1 = StartKey > M#manifest_entry.end_key, + C2 = leveled_codec:endkey_passed(EndKey, + M#manifest_entry.start_key), + not (C1 or C2) end, + lists:foldl(fun(L, AccL) -> + Level = get_item(L, Manifest, []), + FL = lists:foldl(fun(M, Acc) -> + case CompareFun(M) of + true -> + Acc ++ [{next_file, M}]; + false -> + Acc + end end, + [], + Level), + case FL of + [] -> AccL; + FL -> AccL ++ [{L, FL}] + end + end, + [], + lists:seq(0, ?MAX_LEVELS - 1)). + +%% Looks to find the best choice for the next key across the levels (other +%% than in-memory table) +%% In finding the best choice, the next key in a given level may be a next +%% block or next file pointer which will need to be expanded + +find_nextkey(QueryArray, StartKey, EndKey) -> + find_nextkey(QueryArray, + 0, + {null, null}, + {fun leveled_sft:sft_getkvrange/4, StartKey, EndKey, 1}). + +find_nextkey(_QueryArray, LCnt, {null, null}, _QueryFunT) + when LCnt > ?MAX_LEVELS -> + % The array has been scanned wihtout finding a best key - must be + % exhausted - respond to indicate no more keys to be found by the + % iterator + no_more_keys; +find_nextkey(QueryArray, LCnt, {BKL, BestKV}, _QueryFunT) + when LCnt > ?MAX_LEVELS -> + % All levels have been scanned, so need to remove the best result from + % the array, and return that array along with the best key/sqn/status + % combination + {BKL, [BestKV|Tail]} = lists:keyfind(BKL, 1, QueryArray), + {lists:keyreplace(BKL, 1, QueryArray, {BKL, Tail}), BestKV}; +find_nextkey(QueryArray, LCnt, {BestKeyLevel, BestKV}, QueryFunT) -> + % Get the next key at this level + {NextKey, RestOfKeys} = case lists:keyfind(LCnt, 1, QueryArray) of + false -> + {null, null}; + {LCnt, []} -> + {null, null}; + {LCnt, [NK|ROfKs]} -> + {NK, ROfKs} + end, + % Compare the next key at this level with the best key + case {NextKey, BestKeyLevel, BestKV} of + {null, BKL, BKV} -> + % There is no key at this level - go to the next level + find_nextkey(QueryArray, LCnt + 1, {BKL, BKV}, QueryFunT); + {{next_file, ManifestEntry}, BKL, BKV} -> + % The first key at this level is pointer to a file - need to query + % the file to expand this level out before proceeding + Owner = ManifestEntry#manifest_entry.owner, + {QueryFun, StartKey, EndKey, ScanSize} = QueryFunT, + QueryResult = QueryFun(Owner, StartKey, EndKey, ScanSize), + NewEntry = {LCnt, QueryResult ++ RestOfKeys}, + % Need to loop around at this level (LCnt) as we have not yet + % examined a real key at this level + find_nextkey(lists:keyreplace(LCnt, 1, QueryArray, NewEntry), + LCnt, + {BKL, BKV}, + QueryFunT); + {{next, SFTpid, NewStartKey}, BKL, BKV} -> + % The first key at this level is pointer within a file - need to + % query the file to expand this level out before proceeding + {QueryFun, _StartKey, EndKey, ScanSize} = QueryFunT, + QueryResult = QueryFun(SFTpid, NewStartKey, EndKey, ScanSize), + NewEntry = {LCnt, QueryResult ++ RestOfKeys}, + % Need to loop around at this level (LCnt) as we have not yet + % examined a real key at this level + find_nextkey(lists:keyreplace(LCnt, 1, QueryArray, NewEntry), + LCnt, + {BKL, BKV}, + QueryFunT); + {{Key, Val}, null, null} -> + % No best key set - so can assume that this key is the best key, + % and check the lower levels + find_nextkey(QueryArray, + LCnt + 1, + {LCnt, {Key, Val}}, + QueryFunT); + {{Key, Val}, _BKL, {BestKey, _BestVal}} when Key < BestKey -> + % There is a real key and a best key to compare, and the real key + % at 
this level is before the best key, and so is now the new best + % key + % The QueryArray is not modified until we have checked all levels + find_nextkey(QueryArray, + LCnt + 1, + {LCnt, {Key, Val}}, + QueryFunT); + {{Key, Val}, BKL, {BestKey, BestVal}} when Key == BestKey -> + SQN = leveled_codec:strip_to_seqonly({Key, Val}), + BestSQN = leveled_codec:strip_to_seqonly({BestKey, BestVal}), + if + SQN =< BestSQN -> + % This is a dominated key, so we need to skip over it + NewEntry = {LCnt, RestOfKeys}, + find_nextkey(lists:keyreplace(LCnt, 1, QueryArray, NewEntry), + LCnt + 1, + {BKL, {BestKey, BestVal}}, + QueryFunT); + SQN > BestSQN -> + % There is a real key at the front of this level and it has + % a higher SQN than the best key, so we should use this as + % the best key + % But we also need to remove the dominated key from the + % lower level in the query array + OldBestEntry = lists:keyfind(BKL, 1, QueryArray), + {BKL, [{BestKey, BestVal}|BestTail]} = OldBestEntry, + find_nextkey(lists:keyreplace(BKL, + 1, + QueryArray, + {BKL, BestTail}), + LCnt + 1, + {LCnt, {Key, Val}}, + QueryFunT) + end; + {_, BKL, BKV} -> + % This is not the best key + find_nextkey(QueryArray, LCnt + 1, {BKL, BKV}, QueryFunT) + end. + + +keyfolder(null, SFTiterator, StartKey, EndKey, {AccFun, Acc}) -> + case find_nextkey(SFTiterator, StartKey, EndKey) of + no_more_keys -> + Acc; + {NxtSFTiterator, {SFTKey, SFTVal}} -> + Acc1 = AccFun(SFTKey, SFTVal, Acc), + keyfolder(null, NxtSFTiterator, StartKey, EndKey, {AccFun, Acc1}) + end; +keyfolder(IMMiterator, SFTiterator, StartKey, EndKey, {AccFun, Acc}) -> + case gb_trees:next(IMMiterator) of + none -> + % There are no more keys in the in-memory iterator, so now + % iterate only over the remaining keys in the SFT iterator + keyfolder(null, SFTiterator, StartKey, EndKey, {AccFun, Acc}); + {IMMKey, IMMVal, NxtIMMiterator} -> + case leveled_codec:endkey_passed(EndKey, IMMKey) of + true -> + % There are no more keys in-range in the in-memory + % iterator, so take action as if this iterator is empty + % (see above) + keyfolder(null, SFTiterator, + StartKey, EndKey, {AccFun, Acc}); + false -> + case find_nextkey(SFTiterator, StartKey, EndKey) of + no_more_keys -> + % No more keys in range in the persisted store, so use the + % in-memory KV as the next + Acc1 = AccFun(IMMKey, IMMVal, Acc), + keyfolder(NxtIMMiterator, SFTiterator, + StartKey, EndKey, {AccFun, Acc1}); + {NxtSFTiterator, {SFTKey, SFTVal}} -> + % There is a next key, so need to know which is the + % next key between the two (and handle two keys + % with different sequence numbers). + case leveled_codec:key_dominates({IMMKey, + IMMVal}, + {SFTKey, + SFTVal}) of + left_hand_first -> + Acc1 = AccFun(IMMKey, IMMVal, Acc), + keyfolder(NxtIMMiterator, SFTiterator, + StartKey, EndKey, + {AccFun, Acc1}); + right_hand_first -> + Acc1 = AccFun(SFTKey, SFTVal, Acc), + keyfolder(IMMiterator, NxtSFTiterator, + StartKey, EndKey, + {AccFun, Acc1}); + left_hand_dominant -> + Acc1 = AccFun(IMMKey, IMMVal, Acc), + keyfolder(NxtIMMiterator, NxtSFTiterator, + StartKey, EndKey, + {AccFun, Acc1}) + end + end + end + end. 
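+
+%% A minimal sketch (hypothetical, not part of the module) of the kind of
+%% AccFun the keyfolder above expects via pcl_fetchkeys/5: the fun is applied
+%% to every Key and Value in the range, in key order, threading an
+%% accumulator. Note that range queries are only served by a snapshot clone
+%% which has been loaded via pcl_loadsnapshot/2.
+
+example_count_keys(PclSnapshot, StartKey, EndKey) ->
+    AccFun = fun(_Key, _Value, Count) -> Count + 1 end,
+    pcl_fetchkeys(PclSnapshot, StartKey, EndKey, AccFun, 0).
+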
+ + +assess_workqueue(WorkQ, ?MAX_LEVELS - 1, _Man, BasementLevel) -> + {WorkQ, BasementLevel}; +assess_workqueue(WorkQ, LevelToAssess, Man, BasementLevel) -> + MaxFiles = get_item(LevelToAssess, ?LEVEL_SCALEFACTOR, 0), + case length(get_item(LevelToAssess, Man, [])) of + FileCount when FileCount > 0 -> + NewWQ = maybe_append_work(WorkQ, + LevelToAssess, + Man, + MaxFiles, + FileCount), + assess_workqueue(NewWQ, LevelToAssess + 1, Man, LevelToAssess); + 0 -> + assess_workqueue(WorkQ, LevelToAssess + 1, Man, BasementLevel) + end. + + +maybe_append_work(WorkQ, Level, Manifest, + MaxFiles, FileCount) + when FileCount > MaxFiles -> + Overhead = FileCount - MaxFiles, + leveled_log:log("P0024", [Overhead, Level]), + lists:append(WorkQ, [{Level, Manifest, Overhead}]); +maybe_append_work(WorkQ, _Level, _Manifest, + _MaxFiles, _FileCount) -> + WorkQ. + + +get_item(Index, List, Default) -> + case lists:keysearch(Index, 1, List) of + {value, {Index, Value}} -> + Value; + false -> + Default + end. + + +%% Request a manifest change +%% The clerk should have completed the work, and created a new manifest +%% and persisted the new view of the manifest +%% +%% To complete the change of manifest: +%% - the state of the manifest file needs to be changed from pending to current +%% - the list of unreferenced files needs to be updated on State +%% - the current manifest needs to be update don State +%% - the list of ongoing work needs to be cleared of this item + + +commit_manifest_change(ReturnedWorkItem, State) -> + NewMSN = State#state.manifest_sqn + 1, + [SentWorkItem] = State#state.ongoing_work, + RootPath = State#state.root_path, + UnreferencedFiles = State#state.unreferenced_files, + + if + NewMSN == SentWorkItem#penciller_work.next_sqn -> + WISrcLevel = SentWorkItem#penciller_work.src_level, + leveled_log:log_timer("P0025", + [SentWorkItem#penciller_work.next_sqn, + WISrcLevel], + SentWorkItem#penciller_work.start_time), + ok = rename_manifest_files(RootPath, NewMSN), + FilesToDelete = ReturnedWorkItem#penciller_work.unreferenced_files, + UnreferencedFilesUpd = update_deletions(FilesToDelete, + NewMSN, + UnreferencedFiles), + leveled_log:log("P0026", [NewMSN]), + NewManifest = ReturnedWorkItem#penciller_work.new_manifest, + + CurrL0 = get_item(0, State#state.manifest, []), + % If the work isn't L0 work, then we may have an uncommitted + % manifest change at L0 - so add this back into the Manifest loop + % state + RevisedManifest = case {WISrcLevel, CurrL0} of + {0, _} -> + NewManifest; + {_, []} -> + NewManifest; + {_, [L0ManEntry]} -> + lists:keystore(0, + 1, + NewManifest, + {0, [L0ManEntry]}) + end, + {ok, State#state{ongoing_work=[], + manifest_sqn=NewMSN, + manifest=RevisedManifest, + unreferenced_files=UnreferencedFilesUpd}} + end. + + +rename_manifest_files(RootPath, NewMSN) -> + OldFN = filepath(RootPath, NewMSN, pending_manifest), + NewFN = filepath(RootPath, NewMSN, current_manifest), + leveled_log:log("P0027", [OldFN, filelib:is_file(OldFN), + NewFN, filelib:is_file(NewFN)]), + ok = file:rename(OldFN,NewFN). + +filepath(RootPath, manifest) -> + RootPath ++ "/" ++ ?MANIFEST_FP; +filepath(RootPath, files) -> + RootPath ++ "/" ++ ?FILES_FP. + +filepath(RootPath, NewMSN, pending_manifest) -> + filepath(RootPath, manifest) ++ "/" ++ "nonzero_" + ++ integer_to_list(NewMSN) ++ "." ++ ?PENDING_FILEX; +filepath(RootPath, NewMSN, current_manifest) -> + filepath(RootPath, manifest) ++ "/" ++ "nonzero_" + ++ integer_to_list(NewMSN) ++ "." 
++ ?CURRENT_FILEX; +filepath(RootPath, NewMSN, new_merge_files) -> + filepath(RootPath, files) ++ "/" ++ integer_to_list(NewMSN). + +update_deletions([], _NewMSN, UnreferencedFiles) -> + UnreferencedFiles; +update_deletions([ClearedFile|Tail], MSN, UnreferencedFiles) -> + leveled_log:log("P0028", [ClearedFile#manifest_entry.filename]), + update_deletions(Tail, + MSN, + lists:append(UnreferencedFiles, + [{ClearedFile#manifest_entry.filename, + ClearedFile#manifest_entry.owner, + MSN}])). + +confirm_delete(Filename, UnreferencedFiles, RegisteredSnapshots) -> + case lists:keyfind(Filename, 1, UnreferencedFiles) of + {Filename, Pid, MSN} -> + LowSQN = lists:foldl(fun({_, SQN}, MinSQN) -> min(SQN, MinSQN) end, + infinity, + RegisteredSnapshots), + if + MSN >= LowSQN -> + false; + true -> + {true, Pid} + end + end. + + + +%%%============================================================================ +%%% Test +%%%============================================================================ + +-ifdef(TEST). + +clean_testdir(RootPath) -> + clean_subdir(filepath(RootPath, manifest)), + clean_subdir(filepath(RootPath, files)). + +clean_subdir(DirPath) -> + case filelib:is_dir(DirPath) of + true -> + {ok, Files} = file:list_dir(DirPath), + lists:foreach(fun(FN) -> + File = filename:join(DirPath, FN), + ok = file:delete(File), + io:format("Success deleting ~s~n", [File]) + end, + Files); + false -> + ok + end. + + +compaction_work_assessment_test() -> + L0 = [{{o, "B1", "K1", null}, {o, "B3", "K3", null}, dummy_pid}], + L1 = [{{o, "B1", "K1", null}, {o, "B2", "K2", null}, dummy_pid}, + {{o, "B2", "K3", null}, {o, "B4", "K4", null}, dummy_pid}], + Manifest = [{0, L0}, {1, L1}], + {WorkQ1, 1} = assess_workqueue([], 0, Manifest, 0), + ?assertMatch([{0, Manifest, 1}], WorkQ1), + L1Alt = lists:append(L1, + [{{o, "B5", "K0001", null}, {o, "B5", "K9999", null}, + dummy_pid}, + {{o, "B6", "K0001", null}, {o, "B6", "K9999", null}, + dummy_pid}, + {{o, "B7", "K0001", null}, {o, "B7", "K9999", null}, + dummy_pid}, + {{o, "B8", "K0001", null}, {o, "B8", "K9999", null}, + dummy_pid}, + {{o, "B9", "K0001", null}, {o, "B9", "K9999", null}, + dummy_pid}, + {{o, "BA", "K0001", null}, {o, "BA", "K9999", null}, + dummy_pid}, + {{o, "BB", "K0001", null}, {o, "BB", "K9999", null}, + dummy_pid}]), + Manifest3 = [{0, []}, {1, L1Alt}], + {WorkQ3, 1} = assess_workqueue([], 0, Manifest3, 0), + ?assertMatch([{1, Manifest3, 1}], WorkQ3). + +confirm_delete_test() -> + Filename = 'test.sft', + UnreferencedFiles = [{'other.sft', dummy_owner, 15}, + {Filename, dummy_owner, 10}], + RegisteredIterators1 = [{dummy_pid, 16}, {dummy_pid, 12}], + R1 = confirm_delete(Filename, UnreferencedFiles, RegisteredIterators1), + ?assertMatch(R1, {true, dummy_owner}), + RegisteredIterators2 = [{dummy_pid, 10}, {dummy_pid, 12}], + R2 = confirm_delete(Filename, UnreferencedFiles, RegisteredIterators2), + ?assertMatch(R2, false), + RegisteredIterators3 = [{dummy_pid, 9}, {dummy_pid, 12}], + R3 = confirm_delete(Filename, UnreferencedFiles, RegisteredIterators3), + ?assertMatch(R3, false). + + +maybe_pause_push(PCL, KL) -> + T0 = gb_trees:empty(), + T1 = lists:foldl(fun({K, V}, Acc) -> gb_trees:enter(K, V, Acc) end, + T0, + KL), + case pcl_pushmem(PCL, T1) of + returned -> + timer:sleep(50), + maybe_pause_push(PCL, KL); + ok -> + ok + end. 
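+
+%% An illustrative check (added as a sketch) of the manifest naming scheme
+%% described in the module comments: pending manifests are written as
+%% nonzero_<SQN>.pnd and become nonzero_<SQN>.crr once accepted.
+filepath_naming_test() ->
+    ?assertMatch("root/ledger_manifest/nonzero_3.pnd",
+                 filepath("root", 3, pending_manifest)),
+    ?assertMatch("root/ledger_manifest/nonzero_3.crr",
+                 filepath("root", 3, current_manifest)).
+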
+ +simple_server_test() -> + RootPath = "../test/ledger", + clean_testdir(RootPath), + {ok, PCL} = pcl_start(#penciller_options{root_path=RootPath, + max_inmemory_tablesize=1000}), + Key1 = {{o,"Bucket0001", "Key0001", null}, {1, {active, infinity}, null}}, + KL1 = leveled_sft:generate_randomkeys({1000, 2}), + Key2 = {{o,"Bucket0002", "Key0002", null}, {1002, {active, infinity}, null}}, + KL2 = leveled_sft:generate_randomkeys({1000, 1003}), + Key3 = {{o,"Bucket0003", "Key0003", null}, {2003, {active, infinity}, null}}, + KL3 = leveled_sft:generate_randomkeys({1000, 2004}), + Key4 = {{o,"Bucket0004", "Key0004", null}, {3004, {active, infinity}, null}}, + KL4 = leveled_sft:generate_randomkeys({1000, 3005}), + ok = maybe_pause_push(PCL, [Key1]), + ?assertMatch(Key1, pcl_fetch(PCL, {o,"Bucket0001", "Key0001", null})), + ok = maybe_pause_push(PCL, KL1), + ?assertMatch(Key1, pcl_fetch(PCL, {o,"Bucket0001", "Key0001", null})), + ok = maybe_pause_push(PCL, [Key2]), + ?assertMatch(Key1, pcl_fetch(PCL, {o,"Bucket0001", "Key0001", null})), + ?assertMatch(Key2, pcl_fetch(PCL, {o,"Bucket0002", "Key0002", null})), + + ok = maybe_pause_push(PCL, KL2), + ?assertMatch(Key2, pcl_fetch(PCL, {o,"Bucket0002", "Key0002", null})), + ok = maybe_pause_push(PCL, [Key3]), + + ?assertMatch(Key1, pcl_fetch(PCL, {o,"Bucket0001", "Key0001", null})), + ?assertMatch(Key2, pcl_fetch(PCL, {o,"Bucket0002", "Key0002", null})), + ?assertMatch(Key3, pcl_fetch(PCL, {o,"Bucket0003", "Key0003", null})), + ok = pcl_close(PCL), + + {ok, PCLr} = pcl_start(#penciller_options{root_path=RootPath, + max_inmemory_tablesize=1000}), + ?assertMatch(1001, pcl_getstartupsequencenumber(PCLr)), + ok = maybe_pause_push(PCLr, [Key2] ++ KL2 ++ [Key3]), + io:format("Back to starting position with lost data recovered~n"), + + ?assertMatch(Key1, pcl_fetch(PCLr, {o,"Bucket0001", "Key0001", null})), + ?assertMatch(Key2, pcl_fetch(PCLr, {o,"Bucket0002", "Key0002", null})), + ?assertMatch(Key3, pcl_fetch(PCLr, {o,"Bucket0003", "Key0003", null})), + ok = maybe_pause_push(PCLr, KL3), + ok = maybe_pause_push(PCLr, [Key4]), + ok = maybe_pause_push(PCLr, KL4), + ?assertMatch(Key1, pcl_fetch(PCLr, {o,"Bucket0001", "Key0001", null})), + ?assertMatch(Key2, pcl_fetch(PCLr, {o,"Bucket0002", "Key0002", null})), + ?assertMatch(Key3, pcl_fetch(PCLr, {o,"Bucket0003", "Key0003", null})), + ?assertMatch(Key4, pcl_fetch(PCLr, {o,"Bucket0004", "Key0004", null})), + SnapOpts = #penciller_options{start_snapshot = true, + source_penciller = PCLr}, + {ok, PclSnap} = pcl_start(SnapOpts), + ok = pcl_loadsnapshot(PclSnap, gb_trees:empty()), + ?assertMatch(Key1, pcl_fetch(PclSnap, {o,"Bucket0001", "Key0001", null})), + ?assertMatch(Key2, pcl_fetch(PclSnap, {o,"Bucket0002", "Key0002", null})), + ?assertMatch(Key3, pcl_fetch(PclSnap, {o,"Bucket0003", "Key0003", null})), + ?assertMatch(Key4, pcl_fetch(PclSnap, {o,"Bucket0004", "Key0004", null})), + ?assertMatch(true, pcl_checksequencenumber(PclSnap, + {o, + "Bucket0001", + "Key0001", + null}, + 1)), + ?assertMatch(true, pcl_checksequencenumber(PclSnap, + {o, + "Bucket0002", + "Key0002", + null}, + 1002)), + ?assertMatch(true, pcl_checksequencenumber(PclSnap, + {o, + "Bucket0003", + "Key0003", + null}, + 2003)), + ?assertMatch(true, pcl_checksequencenumber(PclSnap, + {o, + "Bucket0004", + "Key0004", + null}, + 3004)), + % Add some more keys and confirm that check sequence number still + % sees the old version in the previous snapshot, but will see the new version + % in a new snapshot + Key1A = {{o,"Bucket0001", "Key0001", null}, 
{4005, {active, infinity}, null}}, + KL1A = leveled_sft:generate_randomkeys({2000, 4006}), + ok = maybe_pause_push(PCLr, [Key1A]), + ok = maybe_pause_push(PCLr, KL1A), + ?assertMatch(true, pcl_checksequencenumber(PclSnap, + {o, + "Bucket0001", + "Key0001", + null}, + 1)), + ok = pcl_close(PclSnap), + + % Ignore a fake pending mnaifest on startup + ok = file:write_file(RootPath ++ "/" ++ ?MANIFEST_FP ++ "nonzero_99.pnd", + term_to_binary("Hello")), + + {ok, PclSnap2} = pcl_start(SnapOpts), + ok = pcl_loadsnapshot(PclSnap2, gb_trees:empty()), + ?assertMatch(false, pcl_checksequencenumber(PclSnap2, + {o, + "Bucket0001", + "Key0001", + null}, + 1)), + ?assertMatch(true, pcl_checksequencenumber(PclSnap2, + {o, + "Bucket0001", + "Key0001", + null}, + 4005)), + ?assertMatch(true, pcl_checksequencenumber(PclSnap2, + {o, + "Bucket0002", + "Key0002", + null}, + 1002)), + ok = pcl_close(PclSnap2), + ok = pcl_close(PCLr), + clean_testdir(RootPath). + + +rangequery_manifest_test() -> + {E1, + E2, + E3} = {#manifest_entry{start_key={i, "Bucket1", {"Idx1", "Fld1"}, "K8"}, + end_key={i, "Bucket1", {"Idx1", "Fld9"}, "K93"}, + filename="Z1"}, + #manifest_entry{start_key={i, "Bucket1", {"Idx1", "Fld9"}, "K97"}, + end_key={o, "Bucket1", "K71", null}, + filename="Z2"}, + #manifest_entry{start_key={o, "Bucket1", "K75", null}, + end_key={o, "Bucket1", "K993", null}, + filename="Z3"}}, + {E4, + E5, + E6} = {#manifest_entry{start_key={i, "Bucket1", {"Idx1", "Fld1"}, "K8"}, + end_key={i, "Bucket1", {"Idx1", "Fld7"}, "K93"}, + filename="Z4"}, + #manifest_entry{start_key={i, "Bucket1", {"Idx1", "Fld7"}, "K97"}, + end_key={o, "Bucket1", "K78", null}, + filename="Z5"}, + #manifest_entry{start_key={o, "Bucket1", "K81", null}, + end_key={o, "Bucket1", "K996", null}, + filename="Z6"}}, + Man = [{1, [E1, E2, E3]}, {2, [E4, E5, E6]}], + R1 = initiate_rangequery_frommanifest({o, "Bucket1", "K711", null}, + {o, "Bucket1", "K999", null}, + Man), + ?assertMatch([{1, [{next_file, E3}]}, + {2, [{next_file, E5}, {next_file, E6}]}], + R1), + R2 = initiate_rangequery_frommanifest({i, "Bucket1", {"Idx1", "Fld8"}, null}, + {i, "Bucket1", {"Idx1", "Fld8"}, null}, + Man), + ?assertMatch([{1, [{next_file, E1}]}, {2, [{next_file, E5}]}], + R2), + R3 = initiate_rangequery_frommanifest({i, "Bucket1", {"Idx0", "Fld8"}, null}, + {i, "Bucket1", {"Idx0", "Fld9"}, null}, + Man), + ?assertMatch([], R3). + +print_manifest_test() -> + M1 = #manifest_entry{start_key={i, "Bucket1", {<<"Idx1">>, "Fld1"}, "K8"}, + end_key={i, 4565, {"Idx1", "Fld9"}, "K93"}, + filename="Z1"}, + M2 = #manifest_entry{start_key={i, self(), {null, "Fld1"}, "K8"}, + end_key={i, <<200:32/integer>>, {"Idx1", "Fld9"}, "K93"}, + filename="Z1"}, + M3 = #manifest_entry{start_key={?STD_TAG, self(), {null, "Fld1"}, "K8"}, + end_key={?RIAK_TAG, <<200:32/integer>>, {"Idx1", "Fld9"}, "K93"}, + filename="Z1"}, + print_manifest([{1, [M1, M2, M3]}]). 
+ +simple_findnextkey_test() -> + QueryArray = [ + {2, [{{o, "Bucket1", "Key1"}, {5, {active, infinity}, null}}, + {{o, "Bucket1", "Key5"}, {4, {active, infinity}, null}}]}, + {3, [{{o, "Bucket1", "Key3"}, {3, {active, infinity}, null}}]}, + {5, [{{o, "Bucket1", "Key2"}, {2, {active, infinity}, null}}]} + ], + {Array2, KV1} = find_nextkey(QueryArray, + {o, "Bucket1", "Key0"}, + {o, "Bucket1", "Key5"}), + ?assertMatch({{o, "Bucket1", "Key1"}, {5, {active, infinity}, null}}, KV1), + {Array3, KV2} = find_nextkey(Array2, + {o, "Bucket1", "Key0"}, + {o, "Bucket1", "Key5"}), + ?assertMatch({{o, "Bucket1", "Key2"}, {2, {active, infinity}, null}}, KV2), + {Array4, KV3} = find_nextkey(Array3, + {o, "Bucket1", "Key0"}, + {o, "Bucket1", "Key5"}), + ?assertMatch({{o, "Bucket1", "Key3"}, {3, {active, infinity}, null}}, KV3), + {Array5, KV4} = find_nextkey(Array4, + {o, "Bucket1", "Key0"}, + {o, "Bucket1", "Key5"}), + ?assertMatch({{o, "Bucket1", "Key5"}, {4, {active, infinity}, null}}, KV4), + ER = find_nextkey(Array5, + {o, "Bucket1", "Key0"}, + {o, "Bucket1", "Key5"}), + ?assertMatch(no_more_keys, ER). + +sqnoverlap_findnextkey_test() -> + QueryArray = [ + {2, [{{o, "Bucket1", "Key1"}, {5, {active, infinity}, null}}, + {{o, "Bucket1", "Key5"}, {4, {active, infinity}, null}}]}, + {3, [{{o, "Bucket1", "Key3"}, {3, {active, infinity}, null}}]}, + {5, [{{o, "Bucket1", "Key5"}, {2, {active, infinity}, null}}]} + ], + {Array2, KV1} = find_nextkey(QueryArray, + {o, "Bucket1", "Key0"}, + {o, "Bucket1", "Key5"}), + ?assertMatch({{o, "Bucket1", "Key1"}, {5, {active, infinity}, null}}, KV1), + {Array3, KV2} = find_nextkey(Array2, + {o, "Bucket1", "Key0"}, + {o, "Bucket1", "Key5"}), + ?assertMatch({{o, "Bucket1", "Key3"}, {3, {active, infinity}, null}}, KV2), + {Array4, KV3} = find_nextkey(Array3, + {o, "Bucket1", "Key0"}, + {o, "Bucket1", "Key5"}), + ?assertMatch({{o, "Bucket1", "Key5"}, {4, {active, infinity}, null}}, KV3), + ER = find_nextkey(Array4, + {o, "Bucket1", "Key0"}, + {o, "Bucket1", "Key5"}), + ?assertMatch(no_more_keys, ER). + +sqnoverlap_otherway_findnextkey_test() -> + QueryArray = [ + {2, [{{o, "Bucket1", "Key1"}, {5, {active, infinity}, null}}, + {{o, "Bucket1", "Key5"}, {1, {active, infinity}, null}}]}, + {3, [{{o, "Bucket1", "Key3"}, {3, {active, infinity}, null}}]}, + {5, [{{o, "Bucket1", "Key5"}, {2, {active, infinity}, null}}]} + ], + {Array2, KV1} = find_nextkey(QueryArray, + {o, "Bucket1", "Key0"}, + {o, "Bucket1", "Key5"}), + ?assertMatch({{o, "Bucket1", "Key1"}, {5, {active, infinity}, null}}, KV1), + {Array3, KV2} = find_nextkey(Array2, + {o, "Bucket1", "Key0"}, + {o, "Bucket1", "Key5"}), + ?assertMatch({{o, "Bucket1", "Key3"}, {3, {active, infinity}, null}}, KV2), + {Array4, KV3} = find_nextkey(Array3, + {o, "Bucket1", "Key0"}, + {o, "Bucket1", "Key5"}), + ?assertMatch({{o, "Bucket1", "Key5"}, {2, {active, infinity}, null}}, KV3), + ER = find_nextkey(Array4, + {o, "Bucket1", "Key0"}, + {o, "Bucket1", "Key5"}), + ?assertMatch(no_more_keys, ER). 
+ +foldwithimm_simple_test() -> + QueryArray = [ + {2, [{{o, "Bucket1", "Key1"}, {5, {active, infinity}, null}}, + {{o, "Bucket1", "Key5"}, {1, {active, infinity}, null}}]}, + {3, [{{o, "Bucket1", "Key3"}, {3, {active, infinity}, null}}]}, + {5, [{{o, "Bucket1", "Key5"}, {2, {active, infinity}, null}}]} + ], + IMM0 = gb_trees:enter({o, "Bucket1", "Key6"}, + {7, {active, infinity}, null}, + gb_trees:empty()), + IMM1 = gb_trees:enter({o, "Bucket1", "Key1"}, + {8, {active, infinity}, null}, + IMM0), + IMM2 = gb_trees:enter({o, "Bucket1", "Key8"}, + {9, {active, infinity}, null}, + IMM1), + IMMiter = gb_trees:iterator_from({o, "Bucket1", "Key1"}, IMM2), + AccFun = fun(K, V, Acc) -> SQN = leveled_codec:strip_to_seqonly({K, V}), + Acc ++ [{K, SQN}] end, + Acc = keyfolder(IMMiter, + QueryArray, + {o, "Bucket1", "Key1"}, {o, "Bucket1", "Key6"}, + {AccFun, []}), + ?assertMatch([{{o, "Bucket1", "Key1"}, 8}, + {{o, "Bucket1", "Key3"}, 3}, + {{o, "Bucket1", "Key5"}, 2}, + {{o, "Bucket1", "Key6"}, 7}], Acc), + + IMM1A = gb_trees:enter({o, "Bucket1", "Key1"}, + {8, {active, infinity}, null}, + gb_trees:empty()), + IMMiterA = gb_trees:iterator_from({o, "Bucket1", "Key1"}, IMM1A), + AccA = keyfolder(IMMiterA, + QueryArray, + {o, "Bucket1", "Key1"}, {o, "Bucket1", "Key6"}, + {AccFun, []}), + ?assertMatch([{{o, "Bucket1", "Key1"}, 8}, + {{o, "Bucket1", "Key3"}, 3}, + {{o, "Bucket1", "Key5"}, 2}], AccA), + + IMM3 = gb_trees:enter({o, "Bucket1", "Key4"}, + {10, {active, infinity}, null}, + IMM2), + IMMiterB = gb_trees:iterator_from({o, "Bucket1", "Key1"}, IMM3), + AccB = keyfolder(IMMiterB, + QueryArray, + {o, "Bucket1", "Key1"}, {o, "Bucket1", "Key6"}, + {AccFun, []}), + ?assertMatch([{{o, "Bucket1", "Key1"}, 8}, + {{o, "Bucket1", "Key3"}, 3}, + {{o, "Bucket1", "Key4"}, 10}, + {{o, "Bucket1", "Key5"}, 2}, + {{o, "Bucket1", "Key6"}, 7}], AccB). + +create_file_test() -> + Filename = "../test/new_file.sft", + ok = file:write_file(Filename, term_to_binary("hello")), + KVL = lists:usort(leveled_sft:generate_randomkeys(10000)), + Tree = gb_trees:from_orddict(KVL), + FetchFun = fun(Slot) -> lists:nth(Slot, [Tree]) end, + {ok, + SP, + noreply} = leveled_sft:sft_newfroml0cache(Filename, + 1, + FetchFun, + #sft_options{wait=false}), + lists:foreach(fun(X) -> + case checkready(SP) of + timeout -> + timer:sleep(X); + _ -> + ok + end end, + [50, 50, 50, 50, 50]), + {ok, SrcFN, StartKey, EndKey} = checkready(SP), + io:format("StartKey ~w EndKey ~w~n", [StartKey, EndKey]), + ?assertMatch({o, _, _, _}, StartKey), + ?assertMatch({o, _, _, _}, EndKey), + ?assertMatch("../test/new_file.sft", SrcFN), + ok = leveled_sft:sft_clear(SP), + {ok, Bin} = file:read_file("../test/new_file.sft.discarded"), + ?assertMatch("hello", binary_to_term(Bin)). 
+ +coverage_test() -> + RootPath = "../test/ledger", + clean_testdir(RootPath), + {ok, PCL} = pcl_start(#penciller_options{root_path=RootPath, + max_inmemory_tablesize=1000}), + Key1 = {{o,"Bucket0001", "Key0001", null}, {1001, {active, infinity}, null}}, + KL1 = leveled_sft:generate_randomkeys({1000, 1}), + + ok = maybe_pause_push(PCL, KL1 ++ [Key1]), + %% Added together, as split apart there will be a race between the close + %% call to the penciller and the second fetch of the cache entry + ?assertMatch(Key1, pcl_fetch(PCL, {o,"Bucket0001", "Key0001", null})), + + ok = pcl_close(PCL), + + ManifestFP = filepath(RootPath, manifest), + ok = file:write_file(filename:join(ManifestFP, "yeszero_123.man"), term_to_binary("hello")), + {ok, PCLr} = pcl_start(#penciller_options{root_path=RootPath, + max_inmemory_tablesize=1000}), + ?assertMatch(Key1, pcl_fetch(PCLr, {o,"Bucket0001", "Key0001", null})), + ok = pcl_close(PCLr), + clean_testdir(RootPath). + + +checkready(Pid) -> + try + leveled_sft:sft_checkready(Pid) + catch + exit:{timeout, _} -> + timeout + end. + + +-endif. diff --git a/src/leveled_pmem.erl b/src/leveled_pmem.erl new file mode 100644 index 0000000..d12425b --- /dev/null +++ b/src/leveled_pmem.erl @@ -0,0 +1,282 @@ +%% -------- PENCILLER MEMORY --------- +%% +%% Module that provides functions for maintaining the L0 memory of the +%% Penciller. +%% +%% It is desirable that the L0Mem can efficiently handle the push of new trees +%% whilst maintaining the capability to quickly snapshot the memory for clones +%% of the Penciller. +%% +%% ETS tables are not used due to complications with managing their mutability, +%% as the database is snapshotted. +%% +%% An attempt was made to merge all trees into a single tree on push (in a +%% spawned process), but this proved to have an expensive impact as the tree +%% got larger. +%% +%% This approach is to keep a list of trees which have been received in the +%% order which they were received. There is then a fixed-size array of hashes +%% used to either point lookups at the right tree in the list, or inform the +%% requestor it is not present avoiding any lookups. +%% +%% Tests show this takes one third of the time at push (when compared to +%% merging to a single tree), and is an order of magnitude more efficient as +%% the tree reaches peak size. It is also an order of magnitude more +%% efficient to use the hash index when compared to looking through all the +%% trees. +%% +%% Total time for single_tree 217000 microseconds +%% Total time for array_tree 209000 microseconds +%% Total time for array_list 142000 microseconds +%% Total time for array_filter 69000 microseconds +%% List of 2000 checked without array - success count of 90 in 36000 microsecs +%% List of 2000 checked with array - success count of 90 in 1000 microsecs +%% +%% The trade-off taken with the approach is that the size of the L0Cache is +%% uncertain. The Size count is incremented if the hash is not already +%% present, so the size may be lower than the actual size due to hash +%% collisions + +-module(leveled_pmem). + +-include("include/leveled.hrl"). + +-export([ + add_to_index/5, + to_list/2, + new_index/0, + check_levelzero/3, + merge_trees/4 + ]). + +-include_lib("eunit/include/eunit.hrl"). + +-define(SLOT_WIDTH, {4096, 12}). 
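+
+%% As a sketch of the intended flow (illustrative only - see the API below and
+%% the unit tests at the end of this module for the authoritative usage):
+%%
+%%   L0Index0 = new_index(),
+%%   {MaxSQN, Size, L0Index1, Trees} =
+%%       add_to_index(L0Index0, 0, LevelMinus1Tree, LedgerSQN, []),
+%%   {true, {K, V}} = check_levelzero(K, L0Index1, Trees)
+%%
+%% where LevelMinus1Tree is a gb_tree of Ledger {Key, Value} pairs whose
+%% sequence numbers all exceed LedgerSQN. The hash index means a lookup only
+%% inspects trees whose slot entry matches the hash of K, rather than walking
+%% every tree in the list.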
+ + +%%%============================================================================ +%%% API +%%%============================================================================ + +add_to_index(L0Index, L0Size, LevelMinus1, LedgerSQN, TreeList) -> + SW = os:timestamp(), + SlotInTreeList = length(TreeList) + 1, + FoldFun = fun({K, V}, {AccMinSQN, AccMaxSQN, AccCount, HashIndex}) -> + SQN = leveled_codec:strip_to_seqonly({K, V}), + {Hash, Slot} = hash_to_slot(K), + L = array:get(Slot, HashIndex), + Count0 = case lists:keymember(Hash, 1, L) of + true -> + AccCount; + false -> + AccCount + 1 + end, + {min(SQN, AccMinSQN), + max(SQN, AccMaxSQN), + Count0, + array:set(Slot, [{Hash, SlotInTreeList}|L], HashIndex)} + end, + LM1List = gb_trees:to_list(LevelMinus1), + StartingT = {infinity, 0, L0Size, L0Index}, + {MinSQN, MaxSQN, NewL0Size, UpdL0Index} = lists:foldl(FoldFun, + StartingT, + LM1List), + leveled_log:log_timer("PM001", [NewL0Size], SW), + if + MinSQN > LedgerSQN -> + {MaxSQN, + NewL0Size, + UpdL0Index, + lists:append(TreeList, [LevelMinus1])} + end. + + +to_list(Slots, FetchFun) -> + SW = os:timestamp(), + SlotList = lists:reverse(lists:seq(1, Slots)), + FullList = lists:foldl(fun(Slot, Acc) -> + Tree = FetchFun(Slot), + L = gb_trees:to_list(Tree), + lists:ukeymerge(1, Acc, L) + end, + [], + SlotList), + leveled_log:log_timer("PM002", [length(FullList)], SW), + FullList. + + +new_index() -> + array:new(element(1, ?SLOT_WIDTH), [{default, []}, fixed]). + + +check_levelzero(Key, L0Index, TreeList) -> + {Hash, Slot} = hash_to_slot(Key), + CheckList = array:get(Slot, L0Index), + SlotList = lists:foldl(fun({H0, S0}, SL) -> + case H0 of + Hash -> + [S0|SL]; + _ -> + SL + end + end, + [], + CheckList), + lists:foldl(fun(SlotToCheck, {Found, KV}) -> + case Found of + true -> + {Found, KV}; + false -> + CheckTree = lists:nth(SlotToCheck, TreeList), + case gb_trees:lookup(Key, CheckTree) of + none -> + {Found, KV}; + {value, Value} -> + {true, {Key, Value}} + end + end + end, + {false, not_found}, + lists:reverse(lists:usort(SlotList))). + + +merge_trees(StartKey, EndKey, TreeList, LevelMinus1) -> + lists:foldl(fun(Tree, TreeAcc) -> + merge_nexttree(Tree, TreeAcc, StartKey, EndKey) end, + gb_trees:empty(), + lists:append(TreeList, [LevelMinus1])). + +%%%============================================================================ +%%% Internal Functions +%%%============================================================================ + + +hash_to_slot(Key) -> + H = erlang:phash2(Key), + {H bsr element(2, ?SLOT_WIDTH), H band (element(1, ?SLOT_WIDTH) - 1)}. + +merge_nexttree(Tree, TreeAcc, StartKey, EndKey) -> + Iter = gb_trees:iterator_from(StartKey, Tree), + merge_nexttree(Iter, TreeAcc, EndKey). + +merge_nexttree(Iter, TreeAcc, EndKey) -> + case gb_trees:next(Iter) of + none -> + TreeAcc; + {Key, Value, NewIter} -> + case leveled_codec:endkey_passed(EndKey, Key) of + true -> + TreeAcc; + false -> + merge_nexttree(NewIter, + gb_trees:enter(Key, Value, TreeAcc), + EndKey) + end + end. + +%%%============================================================================ +%%% Test +%%%============================================================================ + +-ifdef(TEST). + +generate_randomkeys(Seqn, Count, BucketRangeLow, BucketRangeHigh) -> + generate_randomkeys(Seqn, + Count, + gb_trees:empty(), + BucketRangeLow, + BucketRangeHigh). 
+ +generate_randomkeys(_Seqn, 0, Acc, _BucketLow, _BucketHigh) -> + Acc; +generate_randomkeys(Seqn, Count, Acc, BucketLow, BRange) -> + BNumber = string:right(integer_to_list(BucketLow + random:uniform(BRange)), + 4, $0), + KNumber = string:right(integer_to_list(random:uniform(1000)), 4, $0), + {K, V} = {{o, "Bucket" ++ BNumber, "Key" ++ KNumber, null}, + {Seqn, {active, infinity}, null}}, + generate_randomkeys(Seqn + 1, + Count - 1, + gb_trees:enter(K, V, Acc), + BucketLow, + BRange). + + +compare_method_test() -> + R = lists:foldl(fun(_X, {LedgerSQN, L0Size, L0Index, L0TreeList}) -> + LM1 = generate_randomkeys(LedgerSQN + 1, + 2000, 1, 500), + add_to_index(L0Index, L0Size, LM1, LedgerSQN, + L0TreeList) + end, + {0, 0, new_index(), []}, + lists:seq(1, 16)), + + {SQN, Size, Index, TreeList} = R, + ?assertMatch(32000, SQN), + ?assertMatch(true, Size =< 32000), + + TestList = gb_trees:to_list(generate_randomkeys(1, 2000, 1, 800)), + + S0 = lists:foldl(fun({Key, _V}, Acc) -> + R0 = lists:foldr(fun(Tree, {Found, KV}) -> + case Found of + true -> + {true, KV}; + false -> + L0 = gb_trees:lookup(Key, Tree), + case L0 of + none -> + {false, not_found}; + {value, Value} -> + {true, {Key, Value}} + end + end + end, + {false, not_found}, + TreeList), + [R0|Acc] + end, + [], + TestList), + + S1 = lists:foldl(fun({Key, _V}, Acc) -> + R0 = check_levelzero(Key, Index, TreeList), + [R0|Acc] + end, + [], + TestList), + + ?assertMatch(S0, S1), + + StartKey = {o, "Bucket0100", null, null}, + EndKey = {o, "Bucket0200", null, null}, + SWa = os:timestamp(), + FetchFun = fun(Slot) -> lists:nth(Slot, TreeList) end, + DumpList = to_list(length(TreeList), FetchFun), + Q0 = lists:foldl(fun({K, V}, Acc) -> + P = leveled_codec:endkey_passed(EndKey, K), + case {K, P} of + {K, false} when K >= StartKey -> + gb_trees:enter(K, V, Acc); + _ -> + Acc + end + end, + gb_trees:empty(), + DumpList), + Sz0 = gb_trees:size(Q0), + io:format("Crude method took ~w microseconds resulting in tree of " ++ + "size ~w~n", + [timer:now_diff(os:timestamp(), SWa), Sz0]), + SWb = os:timestamp(), + Q1 = merge_trees(StartKey, EndKey, TreeList, gb_trees:empty()), + Sz1 = gb_trees:size(Q1), + io:format("Merge method took ~w microseconds resulting in tree of " ++ + "size ~w~n", + [timer:now_diff(os:timestamp(), SWb), Sz1]), + ?assertMatch(Sz0, Sz1). + + + +-endif. \ No newline at end of file diff --git a/src/leveled_sft.erl b/src/leveled_sft.erl new file mode 100644 index 0000000..30f1e70 --- /dev/null +++ b/src/leveled_sft.erl @@ -0,0 +1,1921 @@ +%% This module provides functions for managing sft files - a modified version +%% of sst files, to be used in leveleddb. +%% +%% sft files are segment filtered tables in that they are guarded by a quick +%% access filter that checks for the presence of key by segment id, with the +%% segment id being a hash in the range 0 - 1024 * 1024 +%% +%% This filter has a dual purpose +%% - a memory efficient way of discovering non-presence with low false positive +%% rate +%% - to make searching for all keys by hashtree segment more efficient (a +%% specific change to optimise behaviour for use with the incremental refresh) +%% of riak hashtrees +%% +%% All keys are not equal in sft files, keys are only expected in a specific +%% series of formats +%% - {Tag, Bucket, Key, SubKey|null} - Object Keys +%% - {i, Bucket, {IndexName, IndexTerm}, Key} - Postings +%% The {Bucket, Key} part of all types of keys are hashed for segment filters. +%% For Postings the {Bucket, IndexName, IndexTerm} is also hashed. 
This +%% causes a false positive on lookup of a segment, but allows for the presence +%% of specific index terms to be checked +%% +%% The objects stored are a tuple of {Key, SequenceNumber, State, Value}, where +%% Key - as above +%% SequenceNumber - monotonically increasing counter of addition to the nursery +%% log +%% State - {active|tomb, ExpiryTimestamp | infinity} +%% Value - null (all postings) | [Object Metadata] (all object keys) +%% Keys should be unique in files. If more than two keys are candidate for +%% the same file the highest sequence number should be chosen. If the file +%% is at the basemenet level of a leveleddb database the objects with an +%% ExpiryTimestamp in the past should not be written, but at all other levels +%% keys should not be ignored because of a timestamp in the past. +%% tomb objects are written for deletions, and these tombstones may have an +%% Expirytimestamp which in effect is the time when the tombstone should be +%% reaped. +%% +%% sft files are broken into the following sections: +%% - Header (fixed width 80 bytes - containing pointers and metadata) +%% - Blocks (variable length) +%% - Slot Filter (variable length) +%% - Slot Index (variable length) +%% - Table Summary (variable length) +%% Each section should contain at the footer of the section a 4-byte CRC which +%% is to be checked only on the opening of the file +%% +%% The keys in the sft file are placed into the file in erlang term order. +%% There will normally be 256 slots of keys. The Slot Index is a gb_tree +%% acting as a helper to find the right slot to check when searching for a key +%% or range of keys. +%% The Key in the Slot Index is the Key at the start of the Slot. +%% The Value in the Slot Index is a record indicating: +%% - The starting position of the Slot within the Blocks (relative to the +%% starting position of the Blocks) +%% - The (relative) starting position of the Slot Filter for this Slot +%% - The number of blocks within the Slot +%% - The length of each of the Blocks within the Slot +%% +%% When checking for a Key in the sft file, the key should be hashed to the +%% segment, then the key should be looked-up in the Slot Index. The segment +%% ID can then be checked against the Slot Filter which will either return +%% not_present or [BlockIDs] +%% If a list of BlockIDs (normally of length 1) is returned the block should +%% be fetched using the starting position and length of the Block to find the +%% actual key (or not if the Slot Filter had returned a false positive) +%% +%% There will exist a Slot Filter for each entry in the Slot Index +%% The Slot Filter starts with some fixed length metadata +%% - 1 byte stating the expected number of keys in the block +%% - 1 byte stating the number of complete (i.e. containing the expected +%% number of keys) Blocks in the Slot +%% - 1 byte stating the number of keys in any incomplete Block (there can +%% only be 1 incomplete Block per Slot and it must be the last block) +%% - 3 bytes stating the largest segment ID in the Slot +%% - 1 byte stating the exponent used in the rice-encoding of the filter +%% The Filter itself is a rice-encoded list of Integers representing the +%% differences between the Segment IDs in the Slot with each entry being +%% appended by the minimal number of bits to represent the Block ID in which +%% an entry for that segment can be found. Where a segment exists more than +%% once then a 0 length will be used. 
+%% To use the filter code should roll over the filter incrementing the Segment +%% ID by each difference, and counting the keys by Block ID. This should +%% return one of: +%% mismatch - the final Segment Count didn't meet the largest Segment ID or +%% the per-block key counts don't add-up. There could have been a bit-flip, +%% so don't rely on the filter +%% no_match - everything added up but the counter never equalled the queried +%% Segment ID +%% {match, [BlockIDs]} - everything added up and the Segment may be +%% represented in the given blocks +%% +%% The makeup of a block +%% - A block is a list of 32 {Key, Value} pairs in Erlang term order +%% - The block is stored using standard compression in term_to_binary +%% May be improved by use of lz4 or schema-based binary_to_term +%% +%% The Table Summary may contain multiple summaries +%% The standard table summary contains: +%% - a count of keys by bucket and type of key (posting or object key) +%% - the total size of objects referred to by object keys +%% - the number of postings by index name +%% - the number of tombstones within the file +%% - the highest and lowest sequence number in the file +%% Summaries could be used for other summaries of table content in the future, +%% perhaps application-specific bloom filters + +%% The 56-byte header is made up of +%% - 1 byte version (major 5 bits, minor 3 bits) - default 0.1 +%% - 1 byte options (currently undefined) +%% - 1 byte Block Size - the expected number of keys in each block +%% - 1 byte Block Count - the expected number of blocks in each slot +%% - 2 byte Slot Count - the maximum number of slots in the file +%% - 6 bytes - spare +%% - 4 bytes - Blocks length +%% - 4 bytes - Slot Index length +%% - 4 bytes - Slot Filter length +%% - 4 bytes - Table summary length +%% - 24 bytes - spare +%% - 4 bytes - CRC32 +%% +%% The file body is written in the same order of events as the header (i.e. +%% Blocks first) +%% +%% Once open the file can be in the following states +%% - writing, the file is still being created +%% - available, the file may be read, but never again must be modified +%% - pending_deletion, the file can be closed and deleted once all outstanding +%% Snapshots have been started beyond a certain sequence number +%% +%% Level managers should only be aware of files in the available state. +%% Iterators may be aware of files in either available or pending_delete. +%% Level maintainers should control the file exclusively when in the writing +%% state, and send the event to trigger pending_delete with the a sequence +%% number equal to or higher than the number at the point it was no longer +%% active at any level. +%% +%% The format of the file is intended to support quick lookups, whilst +%% allowing for a new file to be written incrementally (so that all keys and +%% values need not be retained in memory) - perhaps n blocks at a time + + +-module(leveled_sft). + +-behaviour(gen_server). +-include("include/leveled.hrl"). + +-export([init/1, + handle_call/3, + handle_cast/2, + handle_info/2, + terminate/2, + code_change/3, + sft_new/4, + sft_newfroml0cache/4, + sft_open/1, + sft_get/2, + sft_getkvrange/4, + sft_close/1, + sft_clear/1, + sft_checkready/1, + sft_setfordelete/2, + sft_deleteconfirmed/1, + sft_getmaxsequencenumber/1, + generate_randomkeys/1]). + +-include_lib("eunit/include/eunit.hrl"). + + +-define(WORD_SIZE, 4). +-define(DWORD_SIZE, 8). +-define(CURRENT_VERSION, {0,1}). +-define(SLOT_COUNT, 256). +-define(SLOT_GROUPWRITE_COUNT, 32). +-define(BLOCK_SIZE, 32). 
+-define(BLOCK_COUNT, 4). +-define(FOOTERPOS_HEADERPOS, 2). +-define(MAX_SEG_HASH, 1048576). +-define(DIVISOR_BITS, 13). +-define(DIVISOR, 8092). +-define(COMPRESSION_LEVEL, 1). +-define(HEADER_LEN, 56). +-define(ITERATOR_SCANWIDTH, 1). +-define(MERGE_SCANWIDTH, 8). +-define(DELETE_TIMEOUT, 10000). +-define(MAX_KEYS, ?SLOT_COUNT * ?BLOCK_COUNT * ?BLOCK_SIZE). +-define(DISCARD_EXT, ".discarded"). + +-record(state, {version = ?CURRENT_VERSION :: tuple(), + slot_index :: list(), + next_position :: integer(), + smallest_sqn :: integer(), + highest_sqn :: integer(), + smallest_key :: string(), + highest_key :: string(), + slots_pointer :: integer(), + index_pointer :: integer(), + filter_pointer :: integer(), + summ_pointer :: integer(), + summ_length :: integer(), + filename = "not set" :: string(), + handle :: file:fd(), + background_complete = false :: boolean(), + oversized_file = false :: boolean(), + ready_for_delete = false ::boolean(), + penciller :: pid()}). + + +%%%============================================================================ +%%% API +%%%============================================================================ + + +sft_new(Filename, KL1, KL2, LevelInfo) -> + LevelR = case is_integer(LevelInfo) of + true -> + #level{level=LevelInfo}; + _ -> + if + is_record(LevelInfo, level) -> + LevelInfo + end + end, + {ok, Pid} = gen_server:start(?MODULE, [], []), + Reply = gen_server:call(Pid, + {sft_new, Filename, KL1, KL2, LevelR}, + infinity), + {ok, Pid, Reply}. + +sft_newfroml0cache(Filename, Slots, FetchFun, Options) -> + {ok, Pid} = gen_server:start(?MODULE, [], []), + case Options#sft_options.wait of + true -> + KL1 = leveled_pmem:to_list(Slots, FetchFun), + Reply = gen_server:call(Pid, + {sft_new, + Filename, + KL1, + [], + #level{level=0}}, + infinity), + {ok, Pid, Reply}; + false -> + gen_server:cast(Pid, + {sft_newfroml0cache, + Filename, + Slots, + FetchFun, + Options#sft_options.penciller}), + {ok, Pid, noreply} + end. + +sft_open(Filename) -> + {ok, Pid} = gen_server:start(?MODULE, [], []), + case gen_server:call(Pid, {sft_open, Filename}, infinity) of + {ok, {SK, EK}} -> + {ok, Pid, {SK, EK}} + end. + +sft_setfordelete(Pid, Penciller) -> + gen_server:call(Pid, {set_for_delete, Penciller}, infinity). + +sft_get(Pid, Key) -> + gen_server:call(Pid, {get_kv, Key}, infinity). + +sft_getkvrange(Pid, StartKey, EndKey, ScanWidth) -> + gen_server:call(Pid, {get_kvrange, StartKey, EndKey, ScanWidth}, infinity). + +sft_clear(Pid) -> + gen_server:call(Pid, clear, infinity). + +sft_close(Pid) -> + gen_server:call(Pid, close, 1000). + +sft_deleteconfirmed(Pid) -> + gen_server:cast(Pid, close). + +sft_checkready(Pid) -> + gen_server:call(Pid, background_complete, 20). + +sft_getmaxsequencenumber(Pid) -> + gen_server:call(Pid, get_maxsqn, infinity). + + + +%%%============================================================================ +%%% gen_server callbacks +%%%============================================================================ + +init([]) -> + {ok, #state{}}. 
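+
+%% A sketch of typical client usage of this API (illustrative only; in leveled
+%% these calls are expected to be made by the Penciller and its helpers):
+%%
+%%   {ok, Pid, {{KL1Rem, KL2Rem}, SmallestKey, HighestKey}} =
+%%       sft_new(Filename, KL1, KL2, Level),
+%%   Result = sft_get(Pid, Key),        % the {K, V} tuple, or not_present
+%%   Batch = sft_getkvrange(Pid, StartKey, all, ScanWidth),
+%%   ok = sft_close(Pid)
+%%
+%% A batch returned by sft_getkvrange/4 may end with a {next, Pid, StartKey}
+%% pointer when there are potentially further results in the range.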
+ +handle_call({sft_new, Filename, KL1, [], _LevelR=#level{level=L}}, + _From, + _State) when L == 0 -> + {ok, State} = create_levelzero(KL1, Filename), + {reply, + {{[], []}, + State#state.smallest_key, + State#state.highest_key}, + State}; +handle_call({sft_new, Filename, KL1, KL2, LevelR}, _From, _State) -> + case create_file(Filename) of + {Handle, FileMD} -> + {ReadHandle, UpdFileMD, KeyRemainders} = complete_file(Handle, + FileMD, + KL1, KL2, + LevelR), + {reply, {KeyRemainders, + UpdFileMD#state.smallest_key, + UpdFileMD#state.highest_key}, + UpdFileMD#state{handle=ReadHandle, filename=Filename}} + end; +handle_call({sft_open, Filename}, _From, _State) -> + {_Handle, FileMD} = open_file(#state{filename=Filename}), + leveled_log:log("SFT01", [Filename]), + {reply, + {ok, + {FileMD#state.smallest_key, FileMD#state.highest_key}}, + FileMD}; +handle_call({get_kv, Key}, _From, State) -> + Reply = fetch_keyvalue(State#state.handle, State, Key), + statecheck_onreply(Reply, State); +handle_call({get_kvrange, StartKey, EndKey, ScanWidth}, _From, State) -> + Reply = pointer_append_queryresults(fetch_range_kv(State#state.handle, + State, + StartKey, + EndKey, + ScanWidth), + self()), + statecheck_onreply(Reply, State); +handle_call(close, _From, State) -> + {stop, normal, ok, State}; +handle_call(clear, _From, State) -> + {stop, normal, ok, State#state{ready_for_delete=true}}; +handle_call(background_complete, _From, State) -> + if + State#state.background_complete == true -> + {reply, + {ok, + State#state.filename, + State#state.smallest_key, + State#state.highest_key}, + State} + end; +handle_call({set_for_delete, Penciller}, _From, State) -> + leveled_log:log("SFT02", [State#state.filename]), + {reply, + ok, + State#state{ready_for_delete=true, + penciller=Penciller}, + ?DELETE_TIMEOUT}; +handle_call(get_maxsqn, _From, State) -> + statecheck_onreply(State#state.highest_sqn, State). + +handle_cast({sft_newfroml0cache, Filename, Slots, FetchFun, PCL}, _State) -> + SW = os:timestamp(), + Inp1 = leveled_pmem:to_list(Slots, FetchFun), + {ok, State} = create_levelzero(Inp1, Filename), + leveled_log:log_timer("SFT03", [Filename], SW), + case PCL of + undefined -> + {noreply, State}; + _ -> + leveled_penciller:pcl_confirml0complete(PCL, + State#state.filename, + State#state.smallest_key, + State#state.highest_key), + {noreply, State} + end; +handle_cast(close, State) -> + {stop, normal, State}. + +handle_info(timeout, State) -> + if + State#state.ready_for_delete == true -> + leveled_log:log("SFT05", [timeout, State#state.filename]), + ok = leveled_penciller:pcl_confirmdelete(State#state.penciller, + State#state.filename), + {noreply, State, ?DELETE_TIMEOUT} + end. + +terminate(Reason, State) -> + leveled_log:log("SFT05", [Reason, State#state.filename]), + case State#state.ready_for_delete of + true -> + leveled_log:log("SFT06", [State#state.filename]), + ok = file:close(State#state.handle), + ok = file:delete(State#state.filename); + _ -> + ok = file:close(State#state.handle) + end. + +code_change(_OldVsn, State, _Extra) -> + {ok, State}. + + +statecheck_onreply(Reply, State) -> + case State#state.ready_for_delete of + true -> + {reply, Reply, State, ?DELETE_TIMEOUT}; + false -> + {reply, Reply, State} + end. 
+
+%%%============================================================================
+%%% Internal functions
+%%%============================================================================
+
+
+create_levelzero(ListForFile, Filename) ->
+    {TmpFilename, PrmFilename} = generate_filenames(Filename),
+    {Handle, FileMD} = create_file(TmpFilename),
+    InputSize = length(ListForFile),
+    leveled_log:log("SFT07", [InputSize]),
+    Rename = {true, TmpFilename, PrmFilename},
+    {ReadHandle,
+        UpdFileMD,
+        {[], []}} = complete_file(Handle, FileMD,
+                                    ListForFile, [],
+                                    #level{level=0}, Rename),
+    {ok,
+        UpdFileMD#state{handle=ReadHandle,
+                        filename=PrmFilename,
+                        background_complete=true,
+                        oversized_file=InputSize>?MAX_KEYS}}.
+
+
+generate_filenames(RootFilename) ->
+    Ext = filename:extension(RootFilename),
+    Components = filename:split(RootFilename),
+    case Ext of
+        [] ->
+            {filename:join(Components) ++ ".pnd",
+                filename:join(Components) ++ ".sft"};
+        Ext ->
+            %% This seems unnecessarily hard
+            DN = filename:dirname(RootFilename),
+            FP = lists:last(Components),
+            FP_NOEXT = lists:sublist(FP, 1, 1 + length(FP) - length(Ext)),
+            {DN ++ "/" ++ FP_NOEXT ++ "pnd", DN ++ "/" ++ FP_NOEXT ++ "sft"}
+    end.
+
+
+%% Start a bare file with an initial header and no further details
+%% Return the {Handle, metadata record}
+create_file(FileName) when is_list(FileName) ->
+    leveled_log:log("SFT01", [FileName]),
+    ok = filelib:ensure_dir(FileName),
+    {ok, Handle} = file:open(FileName, [binary, raw, read, write]),
+    Header = create_header(initial),
+    {ok, _} = file:position(Handle, bof),
+    ok = file:write(Handle, Header),
+    {ok, StartPos} = file:position(Handle, cur),
+    FileMD = #state{next_position=StartPos, filename=FileName},
+    {Handle, FileMD}.
+
+
+create_header(initial) ->
+    {Major, Minor} = ?CURRENT_VERSION,
+    Version = <<Major:5, Minor:3>>,
+    %% Not thought of any options - options are ignored
+    Options = <<0:8>>,
+    %% Settings are currently ignored
+    {BlSize, BlCount, SlCount} = {?BLOCK_SIZE, ?BLOCK_COUNT, ?SLOT_COUNT},
+    Settings = <<BlSize:8, BlCount:8, SlCount:16>>,
+    {SpareO, SpareL} = {<<0:48>>, <<0:192>>},
+    Lengths = <<0:32, 0:32, 0:32, 0:32>>,
+    H1 = <<Version/binary, Options/binary, Settings/binary, SpareO/binary,
+            Lengths/binary, SpareL/binary>>,
+    CRC32 = erlang:crc32(H1),
+    <<H1/binary, CRC32:32/integer>>.
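+
+%% As a cross-check on the layout above: the fixed fields written by
+%% create_header/1 occupy 1 (version) + 1 (options) + 4 (settings) + 6 (spare)
+%% = 12 bytes before the four 32-bit length words, which is why open_file/1
+%% below reads the lengths with file:pread(Handle, 12, 16). With the 24 spare
+%% bytes and the trailing CRC32 the full header should come to ?HEADER_LEN
+%% (56) bytes, i.e.
+%%
+%%   56 = byte_size(create_header(initial)).
+%%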
+
+%% Open a file returning a handle and metadata which can be used in fetch and
+%% iterator requests
+%% The handle should be read-only as these are immutable files, a file cannot
+%% be opened for writing keys, it can only be created to write keys
+
+open_file(FileMD) ->
+    Filename = FileMD#state.filename,
+    {ok, Handle} = file:open(Filename, [binary, raw, read]),
+    {ok, HeaderLengths} = file:pread(Handle, 12, 16),
+    <<Blen:32/integer,
+        Ilen:32/integer,
+        Flen:32/integer,
+        Slen:32/integer>> = HeaderLengths,
+    {ok, SummaryBin} = file:pread(Handle,
+                                    ?HEADER_LEN + Blen + Ilen + Flen, Slen),
+    {{LowSQN, HighSQN}, {LowKey, HighKey}} = binary_to_term(SummaryBin),
+    {ok, SlotIndexBin} = file:pread(Handle, ?HEADER_LEN + Blen, Ilen),
+    SlotIndex = binary_to_term(SlotIndexBin),
+    {Handle, FileMD#state{slot_index=SlotIndex,
+                            smallest_sqn=LowSQN,
+                            highest_sqn=HighSQN,
+                            smallest_key=LowKey,
+                            highest_key=HighKey,
+                            slots_pointer=?HEADER_LEN,
+                            index_pointer=?HEADER_LEN + Blen,
+                            filter_pointer=?HEADER_LEN + Blen + Ilen,
+                            summ_pointer=?HEADER_LEN + Blen + Ilen + Flen,
+                            summ_length=Slen,
+                            handle=Handle}}.
+
+%% Take a file handle with a previously created header and complete it based on
+%% the two key lists KL1 and KL2
+complete_file(Handle, FileMD, KL1, KL2, LevelR) ->
+    complete_file(Handle, FileMD, KL1, KL2, LevelR, false).
+
+complete_file(Handle, FileMD, KL1, KL2, LevelR, Rename) ->
+    {ok, KeyRemainders} = write_keys(Handle,
+                                        maybe_expand_pointer(KL1),
+                                        maybe_expand_pointer(KL2),
+                                        [], <<>>,
+                                        LevelR,
+                                        fun sftwrite_function/2),
+    {ReadHandle, UpdFileMD} = case Rename of
+        false ->
+            open_file(FileMD);
+        {true, OldName, NewName} ->
+            ok = rename_file(OldName, NewName),
+            open_file(FileMD#state{filename=NewName})
+    end,
+    {ReadHandle, UpdFileMD, KeyRemainders}.
+
+rename_file(OldName, NewName) ->
+    leveled_log:log("SFT08", [OldName, NewName]),
+    case filelib:is_file(NewName) of
+        true ->
+            leveled_log:log("SFT09", [NewName]),
+            AltName = filename:join(filename:dirname(NewName),
+                                    filename:basename(NewName))
+                        ++ ?DISCARD_EXT,
+            leveled_log:log("SFT10", [NewName, AltName]),
+            ok = file:rename(NewName, AltName);
+        false ->
+            ok
+    end,
+    file:rename(OldName, NewName).
+
+
+%% Fetch a Key and Value from a file, returns
+%% {value, KV} or not_present
+%% The key must be pre-checked to ensure it is in the valid range for the file
+%% A key out of range may fail
+
+fetch_keyvalue(Handle, FileMD, Key) ->
+    {_NearestKey, {FilterLen, PointerF},
+        {LengthList, PointerB}} = get_nearestkey(FileMD#state.slot_index, Key),
+    {ok, SegFilter} = file:pread(Handle,
+                                    PointerF + FileMD#state.filter_pointer,
+                                    FilterLen),
+    SegID = hash_for_segmentid({keyonly, Key}),
+    case check_for_segments(SegFilter, [SegID], true) of
+        {maybe_present, BlockList} ->
+            fetch_keyvalue_fromblock(BlockList,
+                                        Key,
+                                        LengthList,
+                                        Handle,
+                                        PointerB + FileMD#state.slots_pointer);
+        not_present ->
+            not_present;
+        error_so_maybe_present ->
+            fetch_keyvalue_fromblock(lists:seq(0, length(LengthList)),
+                                        Key,
+                                        LengthList,
+                                        Handle,
+                                        PointerB + FileMD#state.slots_pointer)
+    end.
+
+%% Fetches a range of keys returning a list of {Key, SeqN} tuples
+fetch_range_keysonly(Handle, FileMD, StartKey, EndKey) ->
+    fetch_range(Handle, FileMD, StartKey, EndKey, fun acc_list_keysonly/2).
+
+fetch_range_keysonly(Handle, FileMD, StartKey, EndKey, ScanWidth) ->
+    fetch_range(Handle, FileMD, StartKey, EndKey, fun acc_list_keysonly/2,
+                    ScanWidth).
+ +%% Fetches a range of keys returning the full tuple, including value +fetch_range_kv(Handle, FileMD, StartKey, EndKey, ScanWidth) -> + fetch_range(Handle, FileMD, StartKey, EndKey, fun acc_list_kv/2, + ScanWidth). + +acc_list_keysonly(null, empty) -> + []; +acc_list_keysonly(null, RList) -> + RList; +acc_list_keysonly(R, RList) -> + lists:append(RList, [leveled_codec:strip_to_keyseqstatusonly(R)]). + +acc_list_kv(null, empty) -> + []; +acc_list_kv(null, RList) -> + RList; +acc_list_kv(R, RList) -> + lists:append(RList, [R]). + +%% Iterate keys, returning a batch of keys & values in a range +%% - the iterator can have a ScanWidth which is how many slots should be +%% scanned by the iterator before returning a result +%% - batches can be ended with a pointer to indicate there are potentially +%% further values in the range +%% - a list of functions can be provided, which should either return true +%% or false, and these can be used to filter the results from the query, +%% for example to ignore keys above a certain sequence number, to ignore +%% keys not matching a certain regular expression, or to ignore keys not +%% a member of a particular partition +%% - An Accumulator and an Accumulator function can be passed. The function +%% needs to handle being passed (KV, Acc) to add the current result to the +%% Accumulator. The functional should handle KV=null, Acc=empty to initiate +%% the accumulator, and KV=null to leave the Accumulator unchanged. +%% Flexibility with accumulators is such that keys-only can be returned rather +%% than keys and values, or other entirely different accumulators can be +%% used - e.g. counters, hash-lists to build bloom filters etc + +fetch_range(Handle, FileMD, StartKey, EndKey, AccFun) -> + fetch_range(Handle, FileMD, StartKey, EndKey, AccFun, ?ITERATOR_SCANWIDTH). + +fetch_range(Handle, FileMD, StartKey, EndKey, AccFun, ScanWidth) -> + fetch_range(Handle, FileMD, StartKey, EndKey, AccFun, ScanWidth, empty). + +fetch_range(_Handle, _FileMD, StartKey, _EndKey, _AccFun, 0, Acc) -> + {partial, Acc, StartKey}; +fetch_range(Handle, FileMD, StartKey, EndKey, AccFun, ScanWidth, Acc) -> + %% get_nearestkey gets the last key in the index <= StartKey, or the next + %% key along if {next, StartKey} is passed + case get_nearestkey(FileMD#state.slot_index, StartKey) of + {NearestKey, _Filter, {LengthList, PointerB}} -> + fetch_range(Handle, FileMD, StartKey, NearestKey, EndKey, + AccFun, ScanWidth, + LengthList, + 0, + PointerB + FileMD#state.slots_pointer, + AccFun(null, Acc)); + not_found -> + {complete, AccFun(null, Acc)} + end. + +fetch_range(Handle, FileMD, _StartKey, NearestKey, EndKey, + AccFun, ScanWidth, + LengthList, + BlockNumber, + _Pointer, + Acc) + when length(LengthList) == BlockNumber -> + %% Reached the end of the slot. Move the start key on one to scan a new slot + fetch_range(Handle, FileMD, {next, NearestKey}, EndKey, + AccFun, ScanWidth - 1, + Acc); +fetch_range(Handle, FileMD, StartKey, NearestKey, EndKey, + AccFun, ScanWidth, + LengthList, + BlockNumber, + Pointer, + Acc) -> + Block = fetch_block(Handle, LengthList, BlockNumber, Pointer), + Results = scan_block(Block, StartKey, EndKey, AccFun, Acc), + case Results of + {partial, Acc1, StartKey} -> + %% Move on to the next block + fetch_range(Handle, FileMD, StartKey, NearestKey, EndKey, + AccFun, ScanWidth, + LengthList, + BlockNumber + 1, + Pointer, + Acc1); + {complete, Acc1} -> + {complete, Acc1} + end. 
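+
+%% For example, a counting accumulator could be passed as the AccFun described
+%% above (a sketch only, not used elsewhere in this module):
+%%
+%%   CountFun = fun(null, empty) -> 0;
+%%                 (null, Acc) -> Acc;
+%%                 (_KV, Acc) -> Acc + 1 end,
+%%   fetch_range(Handle, FileMD, all, all, CountFun)
+%%
+%% which would return {complete, Count}, or {partial, Count, NextStartKey}
+%% once the ScanWidth has been exhausted.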
+ +scan_block([], StartKey, _EndKey, _AccFun, Acc) -> + {partial, Acc, StartKey}; +scan_block([HeadKV|T], StartKey, EndKey, AccFun, Acc) -> + K = leveled_codec:strip_to_keyonly(HeadKV), + case {StartKey > K, leveled_codec:endkey_passed(EndKey, K)} of + {true, _} when StartKey /= all -> + scan_block(T, StartKey, EndKey, AccFun, Acc); + {_, true} when EndKey /= all -> + {complete, Acc}; + _ -> + scan_block(T, StartKey, EndKey, AccFun, AccFun(HeadKV, Acc)) + end. + + +fetch_keyvalue_fromblock([], _Key, _LengthList, _Handle, _StartOfSlot) -> + not_present; +fetch_keyvalue_fromblock([BlockNmb|T], Key, LengthList, Handle, StartOfSlot) -> + BlockToCheck = fetch_block(Handle, LengthList, BlockNmb, StartOfSlot), + Result = lists:keyfind(Key, 1, BlockToCheck), + case Result of + false -> + fetch_keyvalue_fromblock(T, Key, LengthList, Handle, StartOfSlot); + KV -> + KV + end. + +fetch_block(Handle, LengthList, BlockNmb, StartOfSlot) -> + Start = lists:sum(lists:sublist(LengthList, BlockNmb)), + Length = lists:nth(BlockNmb + 1, LengthList), + {ok, BlockToCheckBin} = file:pread(Handle, Start + StartOfSlot, Length), + binary_to_term(BlockToCheckBin). + +%% Need to deal with either Key or {next, Key} +get_nearestkey(KVList, all) -> + case KVList of + [] -> + not_found; + [H|_Tail] -> + H + end; +get_nearestkey(KVList, Key) -> + case Key of + {next, K} -> + get_nextkeyaftermatch(KVList, K, not_found); + _ -> + get_firstkeytomatch(KVList, Key, not_found) + end. + +get_firstkeytomatch([], _KeyToFind, PrevV) -> + PrevV; +get_firstkeytomatch([{K, FilterInfo, SlotInfo}|_T], KeyToFind, PrevV) + when K > KeyToFind -> + case PrevV of + not_found -> + {K, FilterInfo, SlotInfo}; + _ -> + PrevV + end; +get_firstkeytomatch([{K, FilterInfo, SlotInfo}|T], KeyToFind, _PrevV) -> + get_firstkeytomatch(T, KeyToFind, {K, FilterInfo, SlotInfo}). + +get_nextkeyaftermatch([], _KeyToFind, _PrevV) -> + not_found; +get_nextkeyaftermatch([{K, FilterInfo, SlotInfo}|T], KeyToFind, PrevV) + when K >= KeyToFind -> + case PrevV of + not_found -> + get_nextkeyaftermatch(T, KeyToFind, next); + next -> + {K, FilterInfo, SlotInfo} + end; +get_nextkeyaftermatch([_KTuple|T], KeyToFind, PrevV) -> + get_nextkeyaftermatch(T, KeyToFind, PrevV). + + +%% Take a file handle at the sart position (after creating the header) and then +%% write the Key lists to the file slot by slot. +%% +%% Slots are created then written in bulk to impove I/O efficiency. Slots will +%% be written in groups of 32 + +write_keys(Handle, + KL1, KL2, + SlotIndex, SerialisedSlots, + LevelR, WriteFun) -> + write_keys(Handle, + KL1, KL2, + {0, 0}, + SlotIndex, SerialisedSlots, + {infinity, 0}, null, {last, null}, + LevelR, WriteFun). 
+
+
+write_keys(Handle,
+            KL1, KL2,
+            {SlotCount, SlotTotal},
+            SlotIndex, SerialisedSlots,
+            {LSN, HSN}, LowKey, LastKey,
+            LevelR, WriteFun)
+                    when SlotCount =:= ?SLOT_GROUPWRITE_COUNT ->
+    UpdHandle = WriteFun(slots, {Handle, SerialisedSlots}),
+    case maxslots_bylevel(SlotTotal, LevelR#level.level) of
+        reached ->
+            {complete_keywrite(UpdHandle,
+                                SlotIndex,
+                                {LSN, HSN}, {LowKey, LastKey},
+                                WriteFun),
+                {KL1, KL2}};
+        continue ->
+            write_keys(UpdHandle,
+                        KL1, KL2,
+                        {0, SlotTotal},
+                        SlotIndex, <<>>,
+                        {LSN, HSN}, LowKey, LastKey,
+                        LevelR, WriteFun)
+    end;
+write_keys(Handle,
+            KL1, KL2,
+            {SlotCount, SlotTotal},
+            SlotIndex, SerialisedSlots,
+            {LSN, HSN}, LowKey, LastKey,
+            LevelR, WriteFun) ->
+    SlotOutput = create_slot(KL1, KL2, LevelR),
+    {{LowKey_Slot, SegFilter, SerialisedSlot, LengthList},
+        {{LSN_Slot, HSN_Slot}, LastKey_Slot, Status},
+        KL1rem, KL2rem} = SlotOutput,
+    UpdSlotIndex = lists:append(SlotIndex,
+                                [{LowKey_Slot, SegFilter, LengthList}]),
+    UpdSlots = <<SerialisedSlots/binary, SerialisedSlot/binary>>,
+    SNExtremes = {min(LSN_Slot, LSN), max(HSN_Slot, HSN)},
+    FinalKey = case LastKey_Slot of null -> LastKey; _ -> LastKey_Slot end,
+    FirstKey = case LowKey of null -> LowKey_Slot; _ -> LowKey end,
+    case Status of
+        partial ->
+            UpdHandle = WriteFun(slots, {Handle, UpdSlots}),
+            {complete_keywrite(UpdHandle,
+                                UpdSlotIndex,
+                                SNExtremes, {FirstKey, FinalKey},
+                                WriteFun),
+                {KL1rem, KL2rem}};
+        full ->
+            write_keys(Handle,
+                        KL1rem, KL2rem,
+                        {SlotCount + 1, SlotTotal + 1},
+                        UpdSlotIndex, UpdSlots,
+                        SNExtremes, FirstKey, FinalKey,
+                        LevelR, WriteFun);
+        complete ->
+            UpdHandle = WriteFun(slots, {Handle, UpdSlots}),
+            {complete_keywrite(UpdHandle,
+                                UpdSlotIndex,
+                                SNExtremes, {FirstKey, FinalKey},
+                                WriteFun),
+                {KL1rem, KL2rem}}
+    end.
+
+
+complete_keywrite(Handle, SlotIndex,
+                    SNExtremes, {FirstKey, FinalKey},
+                    WriteFun) ->
+    ConvSlotIndex = convert_slotindex(SlotIndex),
+    WriteFun(finalise, {Handle,
+                        ConvSlotIndex,
+                        SNExtremes,
+                        {FirstKey, FinalKey}}).
+
+
+%% Take a slot index, and remove the SegFilters replacing with pointers
+%% Return a tuple of the accumulated slot filters, and a pointer-based
+%% slot-index
+
+convert_slotindex(SlotIndex) ->
+    SlotFun = fun({LowKey, SegFilter, LengthList},
+                    {FilterAcc, SlotIndexAcc, PointerF, PointerB}) ->
+                    FilterOut = serialise_segment_filter(SegFilter),
+                    FilterLen = byte_size(FilterOut),
+                    {<<FilterAcc/binary, FilterOut/binary>>,
+                        lists:append(SlotIndexAcc, [{LowKey,
+                                                        {FilterLen, PointerF},
+                                                        {LengthList, PointerB}}]),
+                        PointerF + FilterLen,
+                        PointerB + lists:sum(LengthList)} end,
+    {SlotFilters, PointerIndex, _FLength, _BLength} = lists:foldl(SlotFun,
+                                                                    {<<>>, [], 0, 0},
+                                                                    SlotIndex),
+    {SlotFilters, PointerIndex}.
+
+sftwrite_function(slots, {Handle, SerialisedSlots}) ->
+    ok = file:write(Handle, SerialisedSlots),
+    Handle;
+sftwrite_function(finalise,
+                    {Handle,
+                        {SlotFilters, PointerIndex},
+                        SNExtremes,
+                        KeyExtremes}) ->
+    {ok, Position} = file:position(Handle, cur),
+
+    BlocksLength = Position - ?HEADER_LEN,
+    Index = term_to_binary(PointerIndex),
+    IndexLength = byte_size(Index),
+    FilterLength = byte_size(SlotFilters),
+    Summary = term_to_binary({SNExtremes, KeyExtremes}),
+    SummaryLength = byte_size(Summary),
+    %% Write Index, Filter and Summary
+    ok = file:write(Handle, <<Index/binary,
+                                SlotFilters/binary,
+                                Summary/binary>>),
+    %% Write Lengths into header
+    ok = file:pwrite(Handle, 12, <<BlocksLength:32/integer,
+                                    IndexLength:32/integer,
+                                    FilterLength:32/integer,
+                                    SummaryLength:32/integer>>),
+    {ok, _Position} = file:position(Handle, bof),
+    ok = file:advise(Handle,
+                        BlocksLength + IndexLength,
+                        FilterLength,
+                        will_need),
+    file:close(Handle).
+ +%% Level 0 files are of variable (infinite) size to avoid issues with having +%% any remainders when flushing from memory +maxslots_bylevel(_SlotTotal, 0) -> + continue; +maxslots_bylevel(SlotTotal, _Level) -> + case SlotTotal of + ?SLOT_COUNT -> + reached; + X when X < ?SLOT_COUNT -> + continue + end. + + + +%% Take two potentially overlapping lists of keys and output a Block, +%% together with: +%% - block status (full, partial) +%% - the lowest and highest sequence numbers in the block +%% - the list of segment IDs in the block +%% - the remainders of the lists +%% The Key lists must be sorted in key order. The last key in a list may be +%% a pointer to request more keys for the file (otherwise it is assumed there +%% are no more keys) +%% +%% Level also to be passed in +%% This is either an integer (to be ignored) of {floor, os:timestamp()} +%% if this is the basement level of the LevelDB database and expired keys +%% and tombstone should be reaped + + +%% Do we need to check here that KeyList1 and KeyList2 are not just a [pointer] +%% Otherwise the pointer will never be expanded +%% +%% Also this should return a partial block if the KeyLists have been exhausted +%% but the block is full + +create_block(KeyList1, KeyList2, LevelR) -> + create_block(KeyList1, KeyList2, [], {infinity, 0}, [], LevelR). + +create_block(KeyList1, KeyList2, + BlockKeyList, {LSN, HSN}, SegmentList, _LevelR) + when length(BlockKeyList)==?BLOCK_SIZE -> + case {KeyList1, KeyList2} of + {[], []} -> + {BlockKeyList, complete, {LSN, HSN}, SegmentList, [], []}; + _ -> + {BlockKeyList, full, {LSN, HSN}, SegmentList, KeyList1, KeyList2} + end; +create_block([], [], + BlockKeyList, {LSN, HSN}, SegmentList, _LevelR) -> + {BlockKeyList, partial, {LSN, HSN}, SegmentList, [], []}; +create_block(KeyList1, KeyList2, + BlockKeyList, {LSN, HSN}, SegmentList, LevelR) -> + case key_dominates(KeyList1, + KeyList2, + {LevelR#level.is_basement, LevelR#level.timestamp}) of + {{next_key, TopKey}, Rem1, Rem2} -> + {UpdLSN, UpdHSN} = update_sequencenumbers(TopKey, LSN, HSN), + NewBlockKeyList = lists:append(BlockKeyList, + [TopKey]), + NewSegmentList = lists:append(SegmentList, + [hash_for_segmentid(TopKey)]), + create_block(Rem1, Rem2, + NewBlockKeyList, {UpdLSN, UpdHSN}, + NewSegmentList, LevelR); + {skipped_key, Rem1, Rem2} -> + create_block(Rem1, Rem2, + BlockKeyList, {LSN, HSN}, + SegmentList, LevelR) + end. + + + +%% Should return an index entry in the Slot Index. Each entry consists of: +%% - Start Key +%% - SegmentIDFilter for the (will eventually be replaced with a pointer) +%% - Serialised Slot (will eventually be replaced with a pointer) +%% - Length for each Block within the Serialised Slot +%% Additional information will also be provided +%% - {Low Seq Number, High Seq Number} within the slot +%% - End Key +%% - Whether the slot is full or partially filled +%% - Remainder of any KeyLists used to make the slot + + +create_slot(KeyList1, KeyList2, Level) -> + create_slot(KeyList1, KeyList2, Level, ?BLOCK_COUNT, [], <<>>, [], + {null, infinity, 0, null, full}). 
+
+%% Keep adding blocks to the slot until either the block count is reached or
+%% there is a partial block
+
+create_slot(KL1, KL2, _, 0, SegLists, SerialisedSlot, LengthList,
+                                {LowKey, LSN, HSN, LastKey, Status}) ->
+    {{LowKey, generate_segment_filter(SegLists), SerialisedSlot, LengthList},
+        {{LSN, HSN}, LastKey, Status},
+        KL1, KL2};
+create_slot(KL1, KL2, _, _, SegLists, SerialisedSlot, LengthList,
+                                {LowKey, LSN, HSN, LastKey, partial}) ->
+    {{LowKey, generate_segment_filter(SegLists), SerialisedSlot, LengthList},
+        {{LSN, HSN}, LastKey, partial},
+        KL1, KL2};
+create_slot(KL1, KL2, _, _, SegLists, SerialisedSlot, LengthList,
+                                {LowKey, LSN, HSN, LastKey, complete}) ->
+    {{LowKey, generate_segment_filter(SegLists), SerialisedSlot, LengthList},
+        {{LSN, HSN}, LastKey, partial},
+        KL1, KL2};
+create_slot(KL1, KL2, LevelR, BlockCount, SegLists, SerialisedSlot, LengthList,
+                                {LowKey, LSN, HSN, LastKey, _Status}) ->
+    {BlockKeyList, Status,
+        {LSNb, HSNb},
+        SegmentList, KL1b, KL2b} = create_block(KL1, KL2, LevelR),
+    TrackingMetadata = case {LowKey, BlockKeyList} of
+        {null, []} ->
+            {null, LSN, HSN, LastKey, Status};
+        {null, _} ->
+            [NewLowKeyV|_] = BlockKeyList,
+            {leveled_codec:strip_to_keyonly(NewLowKeyV),
+                min(LSN, LSNb), max(HSN, HSNb),
+                leveled_codec:strip_to_keyonly(last(BlockKeyList,
+                                                    {last, LastKey})),
+                Status};
+        {_, _} ->
+            {LowKey,
+                min(LSN, LSNb), max(HSN, HSNb),
+                leveled_codec:strip_to_keyonly(last(BlockKeyList,
+                                                    {last, LastKey})),
+                Status}
+    end,
+    SerialisedBlock = serialise_block(BlockKeyList),
+    BlockLength = byte_size(SerialisedBlock),
+    SerialisedSlot2 = <<SerialisedSlot/binary, SerialisedBlock/binary>>,
+    create_slot(KL1b, KL2b, LevelR, BlockCount - 1, SegLists ++ [SegmentList],
+                    SerialisedSlot2, LengthList ++ [BlockLength],
+                    TrackingMetadata).
+
+
+last([], {last, LastKey}) -> {keyonly, LastKey};
+last([E|Es], PrevLast) -> last(E, Es, PrevLast).
+
+last(_, [E|Es], PrevLast) -> last(E, Es, PrevLast);
+last(E, [], _) -> E.
+
+serialise_block(BlockKeyList) ->
+    term_to_binary(BlockKeyList, [{compressed, ?COMPRESSION_LEVEL}]).
+
+
+%% Compare the keys at the head of the list, and either skip that "best" key or
+%% identify it as the next key.
+%%
+%% The logic needs to change if the file is in the basement level, as keys with
+%% expired timestamps need not be written at this level
+%%
+%% The best key is considered to be the lowest key in erlang term order. If
+%% there are matching keys then the highest sequence number must be chosen and
+%% any lower sequence numbers should be compacted out of existence
+
+
+key_dominates(KL1, KL2, Level) ->
+    key_dominates_expanded(maybe_expand_pointer(KL1),
+                            maybe_expand_pointer(KL2),
+                            Level).
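+
+%% For example (an illustrative sketch only): if both lists start with entries
+%% for the same Key, one at {5, {active, infinity}, null} and one at
+%% {3, {tomb, infinity}, null}, the entry with sequence number 5 should be
+%% treated as dominant and the older entry skipped, as exercised by
+%% dominate_create_block_test/0 below.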
+ +key_dominates_expanded([H1|T1], [], Level) -> + case leveled_codec:maybe_reap_expiredkey(H1, Level) of + true -> + {skipped_key, maybe_expand_pointer(T1), []}; + false -> + {{next_key, H1}, maybe_expand_pointer(T1), []} + end; +key_dominates_expanded([], [H2|T2], Level) -> + case leveled_codec:maybe_reap_expiredkey(H2, Level) of + true -> + {skipped_key, [], maybe_expand_pointer(T2)}; + false -> + {{next_key, H2}, [], maybe_expand_pointer(T2)} + end; +key_dominates_expanded([H1|T1], [H2|T2], Level) -> + case leveled_codec:key_dominates(H1, H2) of + left_hand_first -> + case leveled_codec:maybe_reap_expiredkey(H1, Level) of + true -> + {skipped_key, maybe_expand_pointer(T1), [H2|T2]}; + false -> + {{next_key, H1}, maybe_expand_pointer(T1), [H2|T2]} + end; + right_hand_first -> + case leveled_codec:maybe_reap_expiredkey(H2, Level) of + true -> + {skipped_key, [H1|T1], maybe_expand_pointer(T2)}; + false -> + {{next_key, H2}, [H1|T1], maybe_expand_pointer(T2)} + end; + left_hand_dominant -> + {skipped_key, [H1|T1], maybe_expand_pointer(T2)}; + right_hand_dominant -> + {skipped_key, maybe_expand_pointer(T1), [H2|T2]} + end. + + +%% When a list is provided it may include a pointer to gain another batch of +%% entries from the same file, or a new batch of entries from another file +%% +%% This resultant list should include the Tail of any pointers added at the +%% end of the list + +maybe_expand_pointer([]) -> + []; +maybe_expand_pointer([H|Tail]) -> + case H of + {next, SFTPid, StartKey} -> + %% io:format("Scanning further on PID ~w ~w~n", [SFTPid, StartKey]), + Acc = sft_getkvrange(SFTPid, StartKey, all, ?MERGE_SCANWIDTH), + lists:append(Acc, Tail); + _ -> + [H|Tail] + end. + + +pointer_append_queryresults(Results, QueryPid) -> + case Results of + {complete, Acc} -> + Acc; + {partial, Acc, StartKey} -> + lists:append(Acc, [{next, QueryPid, StartKey}]) + end. + + +%% Update the sequence numbers +update_sequencenumbers(Item, LSN, HSN) when is_tuple(Item) -> + update_sequencenumbers(leveled_codec:strip_to_seqonly(Item), LSN, HSN); +update_sequencenumbers(SN, infinity, 0) -> + {SN, SN}; +update_sequencenumbers(SN, LSN, HSN) when SN < LSN -> + {SN, HSN}; +update_sequencenumbers(SN, LSN, HSN) when SN > HSN -> + {LSN, SN}; +update_sequencenumbers(_SN, LSN, HSN) -> + {LSN, HSN}. + + +%% The Segment filter is a compressed filter representing the keys in a +%% given slot. The filter is delta-compressed list of integers using rice +%% encoding extended by the reference to each integer having an extra two bits +%% to indicate the block - there are four blocks in each slot. +%% +%% So each delta is represented as +%% - variable length exponent ending in 0, +%% with 0 representing the exponent of 0, +%% 10 -> 2 ^ 13, +%% 110 -> 2^14, +%% 1110 -> 2^15 etc +%% - 13-bit fixed length remainder +%% - 2-bit block number +%% This gives about 2-bytes per key, with a 1:8000 (approx) false positive +%% ratio (when checking the key by hashing to the segment ID) +%% +%% Before the delta list are three 20-bit integers representing the highest +%% integer in each block. Plus two bytes to indicate how many hashes +%% there are in the slot +%% +%% To check for the presence of a segment in a slot, roll over the deltas +%% keeping a running total overall and the current highest segment ID seen +%% per block. 
+%% Roll all the way through even if matches are found or passed over to
+%% confirm that the totals match the expected value (hence creating a natural
+%% checksum)
+%%
+%% The end-result is a 260-byte check for the presence of a key in a slot
+%% returning the block in which the segment can be found, which may also be
+%% used directly for checking for the presence of segments.
+%%
+%% This is more space efficient than the equivalent bloom filter and avoids
+%% the calculation of many hash functions.
+
+generate_segment_filter([SegL1]) ->
+    generate_segment_filter({SegL1, [], [], []});
+generate_segment_filter([SegL1, SegL2]) ->
+    generate_segment_filter({SegL1, SegL2, [], []});
+generate_segment_filter([SegL1, SegL2, SegL3]) ->
+    generate_segment_filter({SegL1, SegL2, SegL3, []});
+generate_segment_filter([SegL1, SegL2, SegL3, SegL4]) ->
+    generate_segment_filter({SegL1, SegL2, SegL3, SegL4});
+generate_segment_filter(SegLists) ->
+    generate_segment_filter(merge_seglists(SegLists),
+                                [],
+                                [{0, 0}, {0, 1}, {0, 2}, {0, 3}]).
+
+%% to generate the segment filter needs a sorted list of {Delta, Block} pairs
+%% as DeltaList and a list of {TopHash, Block} pairs as TopHashes
+
+generate_segment_filter([], DeltaList, TopHashes) ->
+    {lists:reverse(DeltaList), TopHashes};
+generate_segment_filter([NextSeg|SegTail], DeltaList, TopHashes) ->
+    {TopHash, _} = lists:max(TopHashes),
+    {NextSegHash, NextSegBlock} = NextSeg,
+    DeltaList2 = [{NextSegHash - TopHash, NextSegBlock}|DeltaList],
+    TopHashes2 = lists:keyreplace(NextSegBlock, 2, TopHashes,
+                                    {NextSegHash, NextSegBlock}),
+    generate_segment_filter(SegTail, DeltaList2, TopHashes2).
+
+
+serialise_segment_filter({DeltaList, TopHashes}) ->
+    TopHashesBin = lists:foldl(fun({X, _}, Acc) ->
+                                    <<Acc/bitstring, X:20>> end,
+                                <<>>, TopHashes),
+    Length = length(DeltaList),
+    HeaderBin = <<TopHashesBin/bitstring, Length:16/integer>>,
+    {Divisor, Factor} = {?DIVISOR, ?DIVISOR_BITS},
+    F = fun({Delta, Block}, Acc) ->
+            Exponent = buildexponent(Delta div Divisor),
+            Remainder = Delta rem Divisor,
+            Block2Bit = Block,
+            <<Acc/bitstring,
+                Exponent/bitstring,
+                Remainder:Factor/integer,
+                Block2Bit:2/integer>> end,
+    pad_binary(lists:foldl(F, HeaderBin, DeltaList)).
+
+
+pad_binary(BitString) ->
+    Pad = 8 - bit_size(BitString) rem 8,
+    case Pad of
+        8 -> BitString;
+        _ -> <<BitString/bitstring, 0:Pad/integer>>
+    end.
+
+buildexponent(Exponent) ->
+    buildexponent(Exponent, <<0:1>>).
+
+buildexponent(0, OutputBits) ->
+    OutputBits;
+buildexponent(Exponent, OutputBits) ->
+    buildexponent(Exponent - 1, <<1:1, OutputBits/bitstring>>).
+
+merge_seglists({SegList1, SegList2, SegList3, SegList4}) ->
+    Stage1 = lists:foldl(fun(X, Acc) -> [{X, 0}|Acc] end, [], SegList1),
+    Stage2 = lists:foldl(fun(X, Acc) -> [{X, 1}|Acc] end, Stage1, SegList2),
+    Stage3 = lists:foldl(fun(X, Acc) -> [{X, 2}|Acc] end, Stage2, SegList3),
+    Stage4 = lists:foldl(fun(X, Acc) -> [{X, 3}|Acc] end, Stage3, SegList4),
+    lists:sort(Stage4).
+
+hash_for_segmentid(KV) ->
+    erlang:phash2(leveled_codec:strip_to_keyonly(KV), ?MAX_SEG_HASH).
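+
+%% A worked example of the encoding above (illustrative figures only): with
+%% ?DIVISOR = 8092 and ?DIVISOR_BITS = 13, a delta of 20000 in block 1 should
+%% serialise as
+%%   exponent  20000 div 8092 = 2     -> unary bits 110
+%%   remainder 20000 rem 8092 = 3816  -> a 13-bit value
+%%   block                            -> the 2-bit value 01
+%% giving 18 bits for that entry, or a little over two bytes per key once the
+%% 80-bit top-hash prefix and 16-bit count are amortised across a full slot.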
+
+
+%% Check for a given list of segments in the filter, returning in normal
+%% operations a TupleList of {SegmentID, [ListOfBlocks]} where the ListOfBlocks
+%% are the block IDs which contain keys in that given segment
+%%
+%% If there is a failure - perhaps due to a bit flip of some sort - an error
+%% will be returned (error_so_maybe_present) and all blocks should be checked
+%% as the filter cannot be relied upon
+
+check_for_segments(SegFilter, SegmentList, CRCCheck) ->
+    case CRCCheck of
+        true ->
+            <<T0:20/integer, T1:20/integer, T2:20/integer, T3:20/integer,
+                Count:16/integer,
+                SegRem/bitstring>> = SegFilter,
+            CheckSum = [T0, T1, T2, T3],
+            case safecheck_for_segments(SegRem, SegmentList,
+                                            [0, 0, 0, 0],
+                                            0, Count, []) of
+                {error_so_maybe_present, Reason} ->
+                    leveled_log:log("SFT11", [Reason]),
+                    error_so_maybe_present;
+                {OutputCheck, BlockList} when OutputCheck == CheckSum,
+                                                BlockList == [] ->
+                    not_present;
+                {OutputCheck, BlockList} when OutputCheck == CheckSum ->
+                    {maybe_present, BlockList};
+                {OutputCheck, _} ->
+                    leveled_log:log("SFT12", [OutputCheck, CheckSum]),
+                    error_so_maybe_present
+            end;
+        false ->
+            <<_:80/bitstring, Count:16/integer, SegRem/bitstring>> = SegFilter,
+            case quickcheck_for_segments(SegRem, SegmentList,
+                                            lists:max(SegmentList),
+                                            0, Count, []) of
+                {error_so_maybe_present, Reason} ->
+                    leveled_log:log("SFT13", [Reason]),
+                    error_so_maybe_present;
+                BlockList when BlockList == [] ->
+                    not_present;
+                BlockList ->
+                    {maybe_present, BlockList}
+            end
+    end.
+
+
+safecheck_for_segments(_, _, TopHashes, _, 0, BlockList) ->
+    {TopHashes, BlockList};
+safecheck_for_segments(Filter, SegmentList, TopHs, Acc, Count, BlockList) ->
+    case findexponent(Filter) of
+        {ok, Exp, FilterRem1} ->
+            case findremainder(FilterRem1, ?DIVISOR_BITS) of
+                {ok, Remainder, BlockID, FilterRem2} ->
+                    {NextHash, BlockList2} = checkhash_forsegments(Acc,
+                                                                    Exp,
+                                                                    Remainder,
+                                                                    SegmentList,
+                                                                    BlockList,
+                                                                    BlockID),
+                    TopHashes2 = setnth(BlockID, TopHs, NextHash),
+                    safecheck_for_segments(FilterRem2, SegmentList,
+                                            TopHashes2,
+                                            NextHash, Count - 1,
+                                            BlockList2);
+                error ->
+                    {error_so_maybe_present, "Remainder Check"}
+            end;
+        error ->
+            {error_so_maybe_present, "Exponent Check"}
+    end.
+
+quickcheck_for_segments(_, _, _, _, 0, BlockList) ->
+    BlockList;
+quickcheck_for_segments(Filter, SegmentList, MaxSeg, Acc, Count, BlockList) ->
+    case findexponent(Filter) of
+        {ok, Exp, FilterRem1} ->
+            case findremainder(FilterRem1, ?DIVISOR_BITS) of
+                {ok, Remainder, BlockID, FilterRem2} ->
+                    {NextHash, BlockList2} = checkhash_forsegments(Acc,
+                                                                    Exp,
+                                                                    Remainder,
+                                                                    SegmentList,
+                                                                    BlockList,
+                                                                    BlockID),
+                    case NextHash > MaxSeg of
+                        true ->
+                            BlockList2;
+                        false ->
+                            quickcheck_for_segments(FilterRem2, SegmentList,
+                                                        MaxSeg,
+                                                        NextHash, Count - 1,
+                                                        BlockList2)
+                    end;
+                error ->
+                    {error_so_maybe_present, "Remainder Check"}
+            end;
+        error ->
+            {error_so_maybe_present, "Exponent Check"}
+    end.
+
+
+checkhash_forsegments(Acc, Exp, Remainder, SegmentList, BlockList, BlockID) ->
+    NextHash = Acc + ?DIVISOR * Exp + Remainder,
+    case lists:member(NextHash, SegmentList) of
+        true ->
+            {NextHash, [BlockID|BlockList]};
+        false ->
+            {NextHash, BlockList}
+    end.
+
+
+setnth(0, [_|Rest], New) -> [New|Rest];
+setnth(I, [E|Rest], New) -> [E|setnth(I-1, Rest, New)].
+
+
+findexponent(BitStr) ->
+    findexponent(BitStr, 0).
+
+findexponent(<<>>, _) ->
+    error;
+findexponent(<<H:1/integer, T/bitstring>>, Acc) ->
+    case H of
+        1 -> findexponent(T, Acc + 1);
+        0 -> {ok, Acc, T}
+    end.
+
+
+findremainder(BitStr, Factor) ->
+    case BitStr of
+        <<Remainder:Factor/integer, BlockID:2/integer, Tail/bitstring>> ->
+            {ok, Remainder, BlockID, Tail};
+        _ ->
+            error
+    end.
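+
+%% Illustrative usage sketch (not original code, the result atoms are
+%% assumptions): how a reader of a slot would consult the filter for one
+%% ledger key, mirroring the pattern used in the unit tests below - hash the
+%% key to a segment ID, serialise the filter, and check with CRC checking
+%% enabled.
+check_segment_example(SegFilter, LedgerKey) ->
+    SegHash = hash_for_segmentid({keyonly, LedgerKey}),
+    case check_for_segments(serialise_segment_filter(SegFilter),
+                            [SegHash],
+                            true) of
+        not_present ->
+            not_present;
+        {maybe_present, BlockList} ->
+            {read_blocks, BlockList};
+        error_so_maybe_present ->
+            read_all_blocks
+    end.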
+ + + +%%%============================================================================ +%%% Test +%%%============================================================================ + + +-ifdef(TEST). + +generate_randomkeys({Count, StartSQN}) -> + generate_randomkeys(Count, StartSQN, []); +generate_randomkeys(Count) -> + generate_randomkeys(Count, 0, []). + +generate_randomkeys(0, _SQN, Acc) -> + lists:reverse(Acc); +generate_randomkeys(Count, SQN, Acc) -> + RandKey = {{o, + lists:concat(["Bucket", random:uniform(1024)]), + lists:concat(["Key", random:uniform(1024)]), + null}, + {SQN, + {active, infinity}, null}}, + generate_randomkeys(Count - 1, SQN + 1, [RandKey|Acc]). + +generate_sequentialkeys(Count, Start) -> + generate_sequentialkeys(Count + Start, Start, []). + +generate_sequentialkeys(Target, Incr, Acc) when Incr =:= Target -> + Acc; +generate_sequentialkeys(Target, Incr, Acc) -> + KeyStr = string:right(integer_to_list(Incr), 8, $0), + NextKey = {{o, + "BucketSeq", + lists:concat(["Key", KeyStr]), + null}, + {5, + {active, infinity}, null}}, + generate_sequentialkeys(Target, Incr + 1, [NextKey|Acc]). + +simple_create_block_test() -> + KeyList1 = [{{o, "Bucket1", "Key1", null}, {1, {active, infinity}, null}}, + {{o, "Bucket1", "Key3", null}, {2, {active, infinity}, null}}], + KeyList2 = [{{o, "Bucket1", "Key2", null}, {3, {active, infinity}, null}}], + {MergedKeyList, ListStatus, SN, _, _, _} = create_block(KeyList1, + KeyList2, + #level{level=1}), + ?assertMatch(partial, ListStatus), + [H1|T1] = MergedKeyList, + ?assertMatch(H1, {{o, "Bucket1", "Key1", null}, {1, {active, infinity}, null}}), + [H2|T2] = T1, + ?assertMatch(H2, {{o, "Bucket1", "Key2", null}, {3, {active, infinity}, null}}), + ?assertMatch(T2, [{{o, "Bucket1", "Key3", null}, {2, {active, infinity}, null}}]), + ?assertMatch(SN, {1,3}). + +dominate_create_block_test() -> + KeyList1 = [{{o, "Bucket1", "Key1", null}, {1, {active, infinity}, null}}, + {{o, "Bucket1", "Key2", null}, {2, {active, infinity}, null}}], + KeyList2 = [{{o, "Bucket1", "Key2", null}, {3, {tomb, infinity}, null}}], + {MergedKeyList, ListStatus, SN, _, _, _} = create_block(KeyList1, + KeyList2, + #level{level=1}), + ?assertMatch(partial, ListStatus), + [K1, K2] = MergedKeyList, + ?assertMatch(K1, {{o, "Bucket1", "Key1", null}, {1, {active, infinity}, null}}), + ?assertMatch(K2, {{o, "Bucket1", "Key2", null}, {3, {tomb, infinity}, null}}), + ?assertMatch(SN, {1,3}). 
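+
+%% Sketch (not original code): the ledger key/value shape used by the
+%% generators above is {{Tag, Bucket, Key, null}, {SQN, Status, Metadata}},
+%% so the sequence number and status can be read with a simple match.
+ledger_kv_shape_example() ->
+    KV = {{o, "Bucket1", "Key1", null}, {1, {active, infinity}, null}},
+    {{o, Bucket, Key, null}, {SQN, Status, _MD}} = KV,
+    {Bucket, Key, SQN, Status}.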
+ +sample_keylist() -> + KeyList1 = [{{o, "Bucket1", "Key1", null}, {1, {active, infinity}, null}}, + {{o, "Bucket1", "Key3", null}, {1, {active, infinity}, null}}, + {{o, "Bucket1", "Key5", null}, {1, {active, infinity}, null}}, + {{o, "Bucket1", "Key7", null}, {1, {active, infinity}, null}}, + {{o, "Bucket1", "Key9", null}, {1, {active, infinity}, null}}, + {{o, "Bucket2", "Key1", null}, {1, {active, infinity}, null}}, + {{o, "Bucket2", "Key3", null}, {1, {active, infinity}, null}}, + {{o, "Bucket2", "Key5", null}, {1, {active, infinity}, null}}, + {{o, "Bucket2", "Key7", null}, {1, {active, infinity}, null}}, + {{o, "Bucket2", "Key9", null}, {1, {active, infinity}, null}}, + {{o, "Bucket3", "Key1", null}, {1, {active, infinity}, null}}, + {{o, "Bucket3", "Key3", null}, {1, {active, infinity}, null}}, + {{o, "Bucket3", "Key5", null}, {1, {active, infinity}, null}}, + {{o, "Bucket3", "Key7", null}, {1, {active, infinity}, null}}, + {{o, "Bucket3", "Key9", null}, {1, {active, infinity}, null}}, + {{o, "Bucket4", "Key1", null}, {1, {active, infinity}, null}}], + KeyList2 = [{{o, "Bucket1", "Key2", null}, {1, {active, infinity}, null}}, + {{o, "Bucket1", "Key4", null}, {1, {active, infinity}, null}}, + {{o, "Bucket1", "Key6", null}, {1, {active, infinity}, null}}, + {{o, "Bucket1", "Key8", null}, {1, {active, infinity}, null}}, + {{o, "Bucket1", "Key9a", null}, {1, {active, infinity}, null}}, + {{o, "Bucket1", "Key9b", null}, {1, {active, infinity}, null}}, + {{o, "Bucket1", "Key9c", null}, {1, {active, infinity}, null}}, + {{o, "Bucket1", "Key9d", null}, {1, {active, infinity}, null}}, + {{o, "Bucket2", "Key2", null}, {1, {active, infinity}, null}}, + {{o, "Bucket2", "Key4", null}, {1, {active, infinity}, null}}, + {{o, "Bucket2", "Key6", null}, {1, {active, infinity}, null}}, + {{o, "Bucket2", "Key8", null}, {1, {active, infinity}, null}}, + {{o, "Bucket3", "Key2", null}, {1, {active, infinity}, null}}, + {{o, "Bucket3", "Key4", null}, {3, {active, infinity}, null}}, + {{o, "Bucket3", "Key6", null}, {2, {active, infinity}, null}}, + {{o, "Bucket3", "Key8", null}, {1, {active, infinity}, null}}], + {KeyList1, KeyList2}. + +alternating_create_block_test() -> + {KeyList1, KeyList2} = sample_keylist(), + {MergedKeyList, ListStatus, _, _, _, _} = create_block(KeyList1, + KeyList2, + #level{level=1}), + BlockSize = length(MergedKeyList), + ?assertMatch(BlockSize, 32), + ?assertMatch(ListStatus, complete), + K1 = lists:nth(1, MergedKeyList), + ?assertMatch(K1, {{o, "Bucket1", "Key1", null}, {1, {active, infinity}, null}}), + K11 = lists:nth(11, MergedKeyList), + ?assertMatch(K11, {{o, "Bucket1", "Key9b", null}, {1, {active, infinity}, null}}), + K32 = lists:nth(32, MergedKeyList), + ?assertMatch(K32, {{o, "Bucket4", "Key1", null}, {1, {active, infinity}, null}}), + HKey = {{o, "Bucket1", "Key0", null}, {1, {active, infinity}, null}}, + {_, ListStatus2, _, _, _, _} = create_block([HKey|KeyList1], + KeyList2, + #level{level=1}), + ?assertMatch(ListStatus2, full). 
+
+
+merge_seglists_test() ->
+    SegList1 = [0, 100, 200],
+    SegList2 = [50, 200],
+    SegList3 = [75, 10000],
+    SegList4 = [],
+    MergedList = merge_seglists({SegList1, SegList2,
+                                    SegList3, SegList4}),
+    ?assertMatch(MergedList, [{0, 0}, {50, 1}, {75, 2}, {100, 0},
+                                {200, 0}, {200,1}, {10000,2}]),
+    SegTerm = generate_segment_filter({SegList1, SegList2,
+                                        SegList3, SegList4}),
+    ?assertMatch(SegTerm, {[{0, 0}, {50, 1}, {25, 2}, {25, 0},
+                                {100, 0}, {0, 1}, {9800, 2}],
+                            [{200, 0}, {200, 1}, {10000, 2},{0, 3}]}),
+    SegBin = serialise_segment_filter(SegTerm),
+    ExpectedTopHashes = <<200:20, 200:20, 10000:20, 0:20>>,
+    ExpectedDeltas = <<0:1, 0:13, 0:2,
+                        0:1, 50:13, 1:2,
+                        0:1, 25:13, 2:2,
+                        0:1, 25:13, 0:2,
+                        0:1, 100:13, 0:2,
+                        0:1, 0:13, 1:2,
+                        2:2, 1708:13, 2:2>>,
+    ExpectedResult = <<ExpectedTopHashes/bitstring,
+                        7:16/integer,
+                        ExpectedDeltas/bitstring,
+                        0:7/integer>>,
+    ?assertMatch(SegBin, ExpectedResult),
+    R1 = check_for_segments(SegBin, [100], true),
+    ?assertMatch(R1, {maybe_present, [0]}),
+    R2 = check_for_segments(SegBin, [900], true),
+    ?assertMatch(R2, not_present),
+    R3 = check_for_segments(SegBin, [200], true),
+    ?assertMatch(R3, {maybe_present, [1,0]}),
+    R4 = check_for_segments(SegBin, [0,900], true),
+    ?assertMatch(R4, {maybe_present, [0]}),
+    R5 = check_for_segments(SegBin, [100], false),
+    ?assertMatch(R5, {maybe_present, [0]}),
+    R6 = check_for_segments(SegBin, [900], false),
+    ?assertMatch(R6, not_present),
+    R7 = check_for_segments(SegBin, [200], false),
+    ?assertMatch(R7, {maybe_present, [1,0]}),
+    R8 = check_for_segments(SegBin, [0,900], false),
+    ?assertMatch(R8, {maybe_present, [0]}),
+    R9 = check_for_segments(SegBin, [1024*1024 - 1], false),
+    ?assertMatch(R9, not_present),
+    io:format("Try corrupted bloom filter with flipped bit in " ++
+                "penultimate delta~n"),
+    ExpectedDeltasFlippedBit = <<0:1, 0:13, 0:2,
+                                    0:1, 50:13, 1:2,
+                                    0:1, 25:13, 2:2,
+                                    0:1, 25:13, 0:2,
+                                    0:1, 100:13, 0:2,
+                                    0:1, 0:13, 1:2,
+                                    2:2, 1709:13, 2:2>>,
+    SegBin1 = <<ExpectedTopHashes/bitstring,
+                    7:16/integer,
+                    ExpectedDeltasFlippedBit/bitstring,
+                    0:7/integer>>,
+    ?assertMatch(error_so_maybe_present,
+                    check_for_segments(SegBin1, [900], true)),
+    ?assertMatch(error_so_maybe_present,
+                    check_for_segments(SegBin1, [200], true)),
+    ?assertMatch(error_so_maybe_present,
+                    check_for_segments(SegBin1, [0,900], true)),
+    ?assertMatch(error_so_maybe_present,
+                    check_for_segments(SegBin1, [1024*1024 - 1], true)),
+    % This match is before the flipped bit, so still works without CRC check
+    ?assertMatch({maybe_present, [0]},
+                    check_for_segments(SegBin1, [0,900], false)),
+    io:format("Try corrupted bloom filter with flipped bit in " ++
+                "final block's top hash~n"),
+    ExpectedTopHashesFlippedBit = <<200:20, 200:20, 10000:20, 1:20>>,
+    SegBin2 = <<ExpectedTopHashesFlippedBit/bitstring,
+                    7:16/integer,
+                    ExpectedDeltas/bitstring,
+                    0:7/integer>>,
+    ?assertMatch(error_so_maybe_present,
+                    check_for_segments(SegBin2, [900], true)),
+    ?assertMatch(error_so_maybe_present,
+                    check_for_segments(SegBin2, [200], true)),
+    ?assertMatch(error_so_maybe_present,
+                    check_for_segments(SegBin2, [0,900], true)),
+    ?assertMatch(error_so_maybe_present,
+                    check_for_segments(SegBin2, [1024*1024 - 1], true)),
+    % This match is before the flipped bit, so still works without CRC check
+    ?assertMatch({maybe_present, [0]},
+                    check_for_segments(SegBin2, [0,900], false)),
+
+    ExpectedDeltasAll1s = <<4294967295:32/integer>>,
+    SegBin3 = <<ExpectedTopHashes/bitstring,
+                    7:16/integer,
+                    ExpectedDeltasAll1s/bitstring>>,
+    ?assertMatch(error_so_maybe_present,
+                    check_for_segments(SegBin3, [900], true)),
+    ?assertMatch(error_so_maybe_present,
+                    check_for_segments(SegBin3, [200], true)),
+    ?assertMatch(error_so_maybe_present,
+                    check_for_segments(SegBin3, [0,900], true)),
+    ?assertMatch(error_so_maybe_present,
+                    check_for_segments(SegBin3, [1024*1024 - 1], true)),
+    % This is so badly mangled, the error gets detected even without CRC
+    % checking being enforced
+    ?assertMatch(error_so_maybe_present,
+                    check_for_segments(SegBin3, [900], false)),
+    ?assertMatch(error_so_maybe_present,
+                    check_for_segments(SegBin3, [200], false)),
+    ?assertMatch(error_so_maybe_present,
+                    check_for_segments(SegBin3, [0,900], false)),
+    ?assertMatch(error_so_maybe_present,
+                    check_for_segments(SegBin3, [1024*1024 - 1], false)),
+
+    ExpectedDeltasNearlyAll1s = <<4294967287:32/integer>>,
+    SegBin4 = <<ExpectedTopHashes/bitstring,
+                    7:16/integer,
+                    ExpectedDeltasNearlyAll1s/bitstring>>,
+    ?assertMatch(error_so_maybe_present,
+                    check_for_segments(SegBin4, [900], true)),
+    ?assertMatch(error_so_maybe_present,
+                    check_for_segments(SegBin4, [200], true)),
+    ?assertMatch(error_so_maybe_present,
+                    check_for_segments(SegBin4, [0,900], true)),
+    ?assertMatch(error_so_maybe_present,
+                    check_for_segments(SegBin4, [1024*1024 - 1], true)),
+    % This is so badly mangled, the error gets detected even without CRC
+    % checking being enforced
+    ?assertMatch(error_so_maybe_present,
+                    check_for_segments(SegBin4, [900], false)),
+    ?assertMatch(error_so_maybe_present,
+                    check_for_segments(SegBin4, [200], false)),
+    ?assertMatch(error_so_maybe_present,
+                    check_for_segments(SegBin4, [0,900], false)),
+    ?assertMatch(error_so_maybe_present,
+                    check_for_segments(SegBin4, [1024*1024 - 1], false)).
+
+createslot_stage1_test() ->
+    {KeyList1, KeyList2} = sample_keylist(),
+    Out = create_slot(KeyList1, KeyList2, #level{level=1}),
+    {{LowKey, SegFilter, _SerialisedSlot, _LengthList},
+        {{LSN, HSN}, LastKey, Status},
+        KL1, KL2} = Out,
+    ?assertMatch(LowKey, {o, "Bucket1", "Key1", null}),
+    ?assertMatch(LastKey, {o, "Bucket4", "Key1", null}),
+    ?assertMatch(Status, partial),
+    ?assertMatch(KL1, []),
+    ?assertMatch(KL2, []),
+    R0 = check_for_segments(serialise_segment_filter(SegFilter),
+            [hash_for_segmentid({keyonly, {o, "Bucket1", "Key1", null}})],
+            true),
+    ?assertMatch(R0, {maybe_present, [0]}),
+    R1 = check_for_segments(serialise_segment_filter(SegFilter),
+            [hash_for_segmentid({keyonly, {o, "Bucket1", "Key99", null}})],
+            true),
+    ?assertMatch(R1, not_present),
+    ?assertMatch(LSN, 1),
+    ?assertMatch(HSN, 3).
+
+createslot_stage2_test() ->
+    Out = create_slot(lists:sort(generate_randomkeys(100)),
+                        lists:sort(generate_randomkeys(100)),
+                        #level{level=1}),
+    {{_LowKey, _SegFilter, SerialisedSlot, LengthList},
+        {{_LSN, _HSN}, _LastKey, Status},
+        _KL1, _KL2} = Out,
+    ?assertMatch(Status, full),
+    Sum1 = lists:foldl(fun(X, Sum) -> Sum + X end, 0, LengthList),
+    Sum2 = byte_size(SerialisedSlot),
+    ?assertMatch(Sum1, Sum2).
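+
+%% Worked example (sketch, not original code): recompute the deltas asserted
+%% in merge_seglists_test/0 above - because the merged list is sorted, each
+%% delta is simply the segment hash minus the highest hash seen so far
+%% across all four blocks.
+delta_walkthrough_example() ->
+    Merged = [{0, 0}, {50, 1}, {75, 2}, {100, 0},
+                {200, 0}, {200, 1}, {10000, 2}],
+    {Deltas, _} =
+        lists:foldl(fun({Hash, Block}, {Acc, Top}) ->
+                            {[{Hash - Top, Block}|Acc], max(Hash, Top)} end,
+                    {[], 0},
+                    Merged),
+    [{0, 0}, {50, 1}, {25, 2}, {25, 0}, {100, 0}, {0, 1}, {9800, 2}] =
+        lists:reverse(Deltas),
+    ok.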
+ + +createslot_stage3_test() -> + Out = create_slot(lists:sort(generate_sequentialkeys(100, 1)), + lists:sort(generate_sequentialkeys(100, 101)), + #level{level=1}), + {{LowKey, SegFilter, SerialisedSlot, LengthList}, + {{_LSN, _HSN}, LastKey, Status}, + KL1, KL2} = Out, + ?assertMatch(Status, full), + Sum1 = lists:foldl(fun(X, Sum) -> Sum + X end, 0, LengthList), + Sum2 = byte_size(SerialisedSlot), + ?assertMatch(Sum1, Sum2), + ?assertMatch(LowKey, {o, "BucketSeq", "Key00000001", null}), + ?assertMatch(LastKey, {o, "BucketSeq", "Key00000128", null}), + ?assertMatch(KL1, []), + Rem = length(KL2), + ?assertMatch(Rem, 72), + R0 = check_for_segments(serialise_segment_filter(SegFilter), + [hash_for_segmentid({keyonly, + {o, "BucketSeq", "Key00000100", null}})], + true), + ?assertMatch(R0, {maybe_present, [3]}), + R1 = check_for_segments(serialise_segment_filter(SegFilter), + [hash_for_segmentid({keyonly, + {o, "Bucket1", "Key99", null}})], + true), + ?assertMatch(R1, not_present), + R2 = check_for_segments(serialise_segment_filter(SegFilter), + [hash_for_segmentid({keyonly, + {o, "BucketSeq", "Key00000040", null}})], + true), + ?assertMatch(R2, {maybe_present, [1]}), + R3 = check_for_segments(serialise_segment_filter(SegFilter), + [hash_for_segmentid({keyonly, + {o, "BucketSeq", "Key00000004", null}})], + true), + ?assertMatch(R3, {maybe_present, [0]}). + + + +testwrite_function(slots, {Handle, SerialisedSlots}) -> + lists:append(Handle, [SerialisedSlots]); +testwrite_function(finalise, {Handle, C_SlotIndex, SNExtremes, KeyExtremes}) -> + {Handle, C_SlotIndex, SNExtremes, KeyExtremes}. + +writekeys_stage1_test() -> + {KL1, KL2} = sample_keylist(), + {FunOut, {_KL1Rem, _KL2Rem}} = write_keys([], + KL1, KL2, + [], <<>>, + #level{level=1}, + fun testwrite_function/2), + {Handle, {_, PointerIndex}, SNExtremes, KeyExtremes} = FunOut, + ?assertMatch(SNExtremes, {1,3}), + ?assertMatch(KeyExtremes, {{o, "Bucket1", "Key1", null}, + {o, "Bucket4", "Key1", null}}), + [TopIndex|[]] = PointerIndex, + {TopKey, _SegFilter, {LengthList, _Total}} = TopIndex, + ?assertMatch(TopKey, {o, "Bucket1", "Key1", null}), + TotalLength = lists:foldl(fun(X, Acc) -> Acc + X end, + 0, LengthList), + ActualLength = lists:foldl(fun(X, Acc) -> Acc + byte_size(X) end, + 0, Handle), + ?assertMatch(TotalLength, ActualLength). + +initial_create_header_test() -> + Output = create_header(initial), + ?assertMatch(?HEADER_LEN, byte_size(Output)). + +initial_create_file_test() -> + Filename = "../test/test1.sft", + {KL1, KL2} = sample_keylist(), + {Handle, FileMD} = create_file(Filename), + {UpdHandle, UpdFileMD, {[], []}} = complete_file(Handle, FileMD, + KL1, KL2, + #level{level=1}), + Result1 = fetch_keyvalue(UpdHandle, UpdFileMD, {o, "Bucket1", "Key8", null}), + io:format("Result is ~w~n", [Result1]), + ?assertMatch(Result1, {{o, "Bucket1", "Key8", null}, + {1, {active, infinity}, null}}), + Result2 = fetch_keyvalue(UpdHandle, UpdFileMD, {o, "Bucket1", "Key88", null}), + io:format("Result is ~w~n", [Result2]), + ?assertMatch(Result2, not_present), + ok = file:close(UpdHandle), + ok = file:delete(Filename). 
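+
+%% Worked sizing sketch (inferred from the tests above, not original code):
+%% a block holds 32 keys (alternating_create_block_test/0) and a slot holds
+%% four blocks, so createslot_stage3_test/0 consumes 128 keys (last key
+%% "Key00000128") and hands back 72 of its 200 input keys.
+slot_capacity_example() ->
+    KeysPerBlock = 32,
+    BlocksPerSlot = 4,
+    KeysPerSlot = KeysPerBlock * BlocksPerSlot,
+    {KeysPerSlot, 200 - KeysPerSlot}.   % {128, 72}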
+ +big_create_file_test() -> + Filename = "../test/bigtest1.sft", + {KL1, KL2} = {lists:sort(generate_randomkeys(2000)), + lists:sort(generate_randomkeys(40000))}, + {InitHandle, InitFileMD} = create_file(Filename), + {Handle, FileMD, {_KL1Rem, _KL2Rem}} = complete_file(InitHandle, + InitFileMD, + KL1, KL2, + #level{level=1}), + [{K1, {Sq1, St1, V1}}|_] = KL1, + [{K2, {Sq2, St2, V2}}|_] = KL2, + Result1 = fetch_keyvalue(Handle, FileMD, K1), + Result2 = fetch_keyvalue(Handle, FileMD, K2), + ?assertMatch(Result1, {K1, {Sq1, St1, V1}}), + ?assertMatch(Result2, {K2, {Sq2, St2, V2}}), + SubList = lists:sublist(KL2, 1000), + FailedFinds = lists:foldl(fun(K, Acc) -> + {Kn, {_, _, _}} = K, + Rn = fetch_keyvalue(Handle, FileMD, Kn), + case Rn of + {Kn, {_, _, _}} -> + Acc; + _ -> + Acc + 1 + end + end, + 0, + SubList), + io:format("FailedFinds of ~w~n", [FailedFinds]), + ?assertMatch(FailedFinds, 0), + Result3 = fetch_keyvalue(Handle, + FileMD, + {o, "Bucket1024", "Key1024Alt", null}), + ?assertMatch(Result3, not_present), + ok = file:close(Handle), + ok = file:delete(Filename). + +initial_iterator_test() -> + Filename = "../test/test2.sft", + {KL1, KL2} = sample_keylist(), + {Handle, FileMD} = create_file(Filename), + {UpdHandle, UpdFileMD, {[], []}} = complete_file(Handle, FileMD, + KL1, KL2, + #level{level=1}), + Result1 = fetch_range_keysonly(UpdHandle, UpdFileMD, + {o, "Bucket1", "Key8", null}, + {o, "Bucket1", "Key9d", null}), + io:format("Result returned of ~w~n", [Result1]), + ?assertMatch({complete, + [{{o, "Bucket1", "Key8", null}, 1, {active, infinity}}, + {{o, "Bucket1", "Key9", null}, 1, {active, infinity}}, + {{o, "Bucket1", "Key9a", null}, 1, {active, infinity}}, + {{o, "Bucket1", "Key9b", null}, 1, {active, infinity}}, + {{o, "Bucket1", "Key9c", null}, 1, {active, infinity}}, + {{o, "Bucket1", "Key9d", null}, 1, {active, infinity}} + ]}, + Result1), + Result2 = fetch_range_keysonly(UpdHandle, UpdFileMD, + {o, "Bucket1", "Key8", null}, + {o, "Bucket1", "Key9b", null}), + ?assertMatch({complete, + [{{o, "Bucket1", "Key8", null}, 1, {active, infinity}}, + {{o, "Bucket1", "Key9", null}, 1, {active, infinity}}, + {{o, "Bucket1", "Key9a", null}, 1, {active, infinity}}, + {{o, "Bucket1", "Key9b", null}, 1, {active, infinity}} + ]}, + Result2), + Result3 = fetch_range_keysonly(UpdHandle, UpdFileMD, + {o, "Bucket3", "Key4", null}, + all), + {partial, RL3, _} = Result3, + ?assertMatch([{{o, "Bucket3", "Key4", null}, 3, {active, infinity}}, + {{o, "Bucket3", "Key5", null}, 1, {active, infinity}}, + {{o, "Bucket3", "Key6", null}, 2, {active, infinity}}, + {{o, "Bucket3", "Key7", null}, 1, {active, infinity}}, + {{o, "Bucket3", "Key8", null}, 1, {active, infinity}}, + {{o, "Bucket3", "Key9", null}, 1, {active, infinity}}, + {{o, "Bucket4", "Key1", null}, 1, {active, infinity}}], + RL3), + ok = file:close(UpdHandle), + ok = file:delete(Filename). 
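+
+%% Sketch (not original code): how a caller could combine
+%% pointer_append_queryresults/2 and maybe_expand_pointer/1, defined earlier
+%% in this module, to walk a query result that may end in a
+%% {next, Pid, StartKey} pointer. Assumes the SFT process behind the pointer
+%% is still running to serve further sft_getkvrange/4 calls.
+drain_query_example(Results, QueryPid) ->
+    drain_pointers(pointer_append_queryresults(Results, QueryPid), []).
+
+drain_pointers([], Acc) ->
+    lists:reverse(Acc);
+drain_pointers(KVList, Acc) ->
+    case maybe_expand_pointer(KVList) of
+        [] ->
+            lists:reverse(Acc);
+        [NextKV|Rest] ->
+            drain_pointers(Rest, [NextKV|Acc])
+    end.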
+ +key_dominates_test() -> + KV1 = {{o, "Bucket", "Key1", null}, {5, {active, infinity}, []}}, + KV2 = {{o, "Bucket", "Key3", null}, {6, {active, infinity}, []}}, + KV3 = {{o, "Bucket", "Key2", null}, {3, {active, infinity}, []}}, + KV4 = {{o, "Bucket", "Key4", null}, {7, {active, infinity}, []}}, + KV5 = {{o, "Bucket", "Key1", null}, {4, {active, infinity}, []}}, + KV6 = {{o, "Bucket", "Key1", null}, {99, {tomb, 999}, []}}, + KV7 = {{o, "Bucket", "Key1", null}, {99, tomb, []}}, + KL1 = [KV1, KV2], + KL2 = [KV3, KV4], + ?assertMatch({{next_key, KV1}, [KV2], KL2}, + key_dominates(KL1, KL2, {undefined, 1})), + ?assertMatch({{next_key, KV1}, KL2, [KV2]}, + key_dominates(KL2, KL1, {undefined, 1})), + ?assertMatch({skipped_key, KL2, KL1}, + key_dominates([KV5|KL2], KL1, {undefined, 1})), + ?assertMatch({{next_key, KV1}, [KV2], []}, + key_dominates(KL1, [], {undefined, 1})), + ?assertMatch({skipped_key, [KV6|KL2], [KV2]}, + key_dominates([KV6|KL2], KL1, {undefined, 1})), + ?assertMatch({{next_key, KV6}, KL2, [KV2]}, + key_dominates([KV6|KL2], [KV2], {undefined, 1})), + ?assertMatch({skipped_key, [KV6|KL2], [KV2]}, + key_dominates([KV6|KL2], KL1, {true, 1})), + ?assertMatch({skipped_key, [KV6|KL2], [KV2]}, + key_dominates([KV6|KL2], KL1, {true, 1000})), + ?assertMatch({{next_key, KV6}, KL2, [KV2]}, + key_dominates([KV6|KL2], [KV2], {true, 1})), + ?assertMatch({skipped_key, KL2, [KV2]}, + key_dominates([KV6|KL2], [KV2], {true, 1000})), + ?assertMatch({skipped_key, [], []}, + key_dominates([KV6], [], {true, 1000})), + ?assertMatch({skipped_key, [], []}, + key_dominates([], [KV6], {true, 1000})), + ?assertMatch({{next_key, KV6}, [], []}, + key_dominates([KV6], [], {true, 1})), + ?assertMatch({{next_key, KV6}, [], []}, + key_dominates([], [KV6], {true, 1})), + ?assertMatch({skipped_key, [], []}, + key_dominates([KV7], [], {true, 1})), + ?assertMatch({skipped_key, [], []}, + key_dominates([], [KV7], {true, 1})), + ?assertMatch({skipped_key, [KV7|KL2], [KV2]}, + key_dominates([KV7|KL2], KL1, {undefined, 1})), + ?assertMatch({{next_key, KV7}, KL2, [KV2]}, + key_dominates([KV7|KL2], [KV2], {undefined, 1})), + ?assertMatch({skipped_key, [KV7|KL2], [KV2]}, + key_dominates([KV7|KL2], KL1, {true, 1})), + ?assertMatch({skipped_key, KL2, [KV2]}, + key_dominates([KV7|KL2], [KV2], {true, 1})). + + +big_iterator_test() -> + Filename = "../test/bigtest1.sft", + {KL1, KL2} = {lists:sort(generate_randomkeys(10000)), []}, + {InitHandle, InitFileMD} = create_file(Filename), + {Handle, FileMD, {KL1Rem, KL2Rem}} = complete_file(InitHandle, InitFileMD, + KL1, KL2, + #level{level=1}), + io:format("Remainder lengths are ~w and ~w ~n", [length(KL1Rem), length(KL2Rem)]), + {complete, Result1} = fetch_range_keysonly(Handle, + FileMD, + {o, "Bucket0000", "Key0000", null}, + {o, "Bucket9999", "Key9999", null}, + 256), + NumFoundKeys1 = length(Result1), + NumAddedKeys = 10000 - length(KL1Rem), + ?assertMatch(NumFoundKeys1, NumAddedKeys), + {partial, Result2, _} = fetch_range_keysonly(Handle, + FileMD, + {o, "Bucket0000", "Key0000", null}, + {o, "Bucket9999", "Key9999", null}, + 32), + ?assertMatch(32 * 128, length(Result2)), + {partial, Result3, _} = fetch_range_keysonly(Handle, + FileMD, + {o, "Bucket0000", "Key0000", null}, + {o, "Bucket9999", "Key9999", null}, + 4), + ?assertMatch(4 * 128, length(Result3)), + ok = file:close(Handle), + ok = file:delete(Filename). 
+ +filename_test() -> + FN1 = "../tmp/filename", + FN2 = "../tmp/filename.pnd", + FN3 = "../tmp/subdir/file_name.pend", + ?assertMatch({"../tmp/filename.pnd", "../tmp/filename.sft"}, + generate_filenames(FN1)), + ?assertMatch({"../tmp/filename.pnd", "../tmp/filename.sft"}, + generate_filenames(FN2)), + ?assertMatch({"../tmp/subdir/file_name.pnd", + "../tmp/subdir/file_name.sft"}, + generate_filenames(FN3)). + +-endif. \ No newline at end of file diff --git a/test/end_to_end/basic_SUITE.erl b/test/end_to_end/basic_SUITE.erl new file mode 100644 index 0000000..23273de --- /dev/null +++ b/test/end_to_end/basic_SUITE.erl @@ -0,0 +1,544 @@ +-module(basic_SUITE). +-include_lib("common_test/include/ct.hrl"). +-include("include/leveled.hrl"). +-export([all/0]). +-export([simple_put_fetch_head_delete/1, + many_put_fetch_head/1, + journal_compaction/1, + fetchput_snapshot/1, + load_and_count/1, + load_and_count_withdelete/1, + space_clear_ondelete/1 + ]). + +all() -> [ + simple_put_fetch_head_delete, + many_put_fetch_head, + journal_compaction, + fetchput_snapshot, + load_and_count, + load_and_count_withdelete, + space_clear_ondelete + ]. + + +simple_put_fetch_head_delete(_Config) -> + RootPath = testutil:reset_filestructure(), + StartOpts1 = [{root_path, RootPath}], + {ok, Bookie1} = leveled_bookie:book_start(StartOpts1), + {TestObject, TestSpec} = testutil:generate_testobject(), + ok = leveled_bookie:book_riakput(Bookie1, TestObject, TestSpec), + testutil:check_forobject(Bookie1, TestObject), + testutil:check_formissingobject(Bookie1, "Bucket1", "Key2"), + ok = leveled_bookie:book_close(Bookie1), + StartOpts2 = [{root_path, RootPath}, + {max_journalsize, 3000000}], + {ok, Bookie2} = leveled_bookie:book_start(StartOpts2), + testutil:check_forobject(Bookie2, TestObject), + ObjList1 = testutil:generate_objects(5000, 2), + lists:foreach(fun({_RN, Obj, Spc}) -> + leveled_bookie:book_riakput(Bookie2, Obj, Spc) end, + ObjList1), + ChkList1 = lists:sublist(lists:sort(ObjList1), 100), + testutil:check_forlist(Bookie2, ChkList1), + testutil:check_forobject(Bookie2, TestObject), + testutil:check_formissingobject(Bookie2, "Bucket1", "Key2"), + ok = leveled_bookie:book_put(Bookie2, "Bucket1", "Key2", "Value2", + [{add, "Index1", "Term1"}]), + {ok, "Value2"} = leveled_bookie:book_get(Bookie2, "Bucket1", "Key2"), + {ok, {62888926, 56}} = leveled_bookie:book_head(Bookie2, + "Bucket1", + "Key2"), + testutil:check_formissingobject(Bookie2, "Bucket1", "Key2"), + ok = leveled_bookie:book_put(Bookie2, "Bucket1", "Key2", <<"Value2">>, + [{remove, "Index1", "Term1"}, + {add, "Index1", <<"Term2">>}]), + {ok, <<"Value2">>} = leveled_bookie:book_get(Bookie2, "Bucket1", "Key2"), + ok = leveled_bookie:book_close(Bookie2), + {ok, Bookie3} = leveled_bookie:book_start(StartOpts2), + {ok, <<"Value2">>} = leveled_bookie:book_get(Bookie3, "Bucket1", "Key2"), + ok = leveled_bookie:book_delete(Bookie3, "Bucket1", "Key2", + [{remove, "Index1", "Term1"}]), + not_found = leveled_bookie:book_get(Bookie3, "Bucket1", "Key2"), + not_found = leveled_bookie:book_head(Bookie3, "Bucket1", "Key2"), + ok = leveled_bookie:book_close(Bookie3), + {ok, Bookie4} = leveled_bookie:book_start(StartOpts2), + not_found = leveled_bookie:book_get(Bookie4, "Bucket1", "Key2"), + ok = leveled_bookie:book_close(Bookie4), + testutil:reset_filestructure(). 
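+
+%% Sketch distilled from simple_put_fetch_head_delete/1 above (assumption:
+%% only the options and return shapes exercised by that test), showing the
+%% minimal put/get/head/delete lifecycle against a bookie.
+basic_api_example(RootPath) ->
+    {ok, Bookie} = leveled_bookie:book_start([{root_path, RootPath}]),
+    ok = leveled_bookie:book_put(Bookie, "Bucket1", "Key2", <<"Value2">>,
+                                    [{add, "Index1", <<"Term2">>}]),
+    {ok, <<"Value2">>} = leveled_bookie:book_get(Bookie, "Bucket1", "Key2"),
+    ok = leveled_bookie:book_delete(Bookie, "Bucket1", "Key2",
+                                    [{remove, "Index1", <<"Term2">>}]),
+    not_found = leveled_bookie:book_get(Bookie, "Bucket1", "Key2"),
+    not_found = leveled_bookie:book_head(Bookie, "Bucket1", "Key2"),
+    ok = leveled_bookie:book_close(Bookie).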
+ +many_put_fetch_head(_Config) -> + RootPath = testutil:reset_filestructure(), + StartOpts1 = [{root_path, RootPath}, {max_pencillercachesize, 16000}], + {ok, Bookie1} = leveled_bookie:book_start(StartOpts1), + {TestObject, TestSpec} = testutil:generate_testobject(), + ok = leveled_bookie:book_riakput(Bookie1, TestObject, TestSpec), + testutil:check_forobject(Bookie1, TestObject), + ok = leveled_bookie:book_close(Bookie1), + StartOpts2 = [{root_path, RootPath}, + {max_journalsize, 1000000000}, + {max_pencillercachesize, 32000}], + {ok, Bookie2} = leveled_bookie:book_start(StartOpts2), + testutil:check_forobject(Bookie2, TestObject), + GenList = [2, 20002, 40002, 60002, 80002, + 100002, 120002, 140002, 160002, 180002], + CLs = testutil:load_objects(20000, GenList, Bookie2, TestObject, + fun testutil:generate_smallobjects/2), + CL1A = lists:nth(1, CLs), + ChkListFixed = lists:nth(length(CLs), CLs), + testutil:check_forlist(Bookie2, CL1A), + ObjList2A = testutil:generate_objects(5000, 2), + lists:foreach(fun({_RN, Obj, Spc}) -> + leveled_bookie:book_riakput(Bookie2, Obj, Spc) end, + ObjList2A), + ChkList2A = lists:sublist(lists:sort(ObjList2A), 1000), + testutil:check_forlist(Bookie2, ChkList2A), + testutil:check_forlist(Bookie2, ChkListFixed), + testutil:check_forobject(Bookie2, TestObject), + testutil:check_forlist(Bookie2, ChkList2A), + testutil:check_forlist(Bookie2, ChkListFixed), + testutil:check_forobject(Bookie2, TestObject), + ok = leveled_bookie:book_close(Bookie2), + {ok, Bookie3} = leveled_bookie:book_start(StartOpts2), + testutil:check_forlist(Bookie3, ChkList2A), + testutil:check_forobject(Bookie3, TestObject), + ok = leveled_bookie:book_close(Bookie3), + testutil:reset_filestructure(). + +journal_compaction(_Config) -> + RootPath = testutil:reset_filestructure(), + StartOpts1 = [{root_path, RootPath}, + {max_journalsize, 10000000}, + {max_run_length, 1}], + {ok, Bookie1} = leveled_bookie:book_start(StartOpts1), + ok = leveled_bookie:book_compactjournal(Bookie1, 30000), + {TestObject, TestSpec} = testutil:generate_testobject(), + ok = leveled_bookie:book_riakput(Bookie1, TestObject, TestSpec), + testutil:check_forobject(Bookie1, TestObject), + ObjList1 = testutil:generate_objects(20000, 2), + lists:foreach(fun({_RN, Obj, Spc}) -> + leveled_bookie:book_riakput(Bookie1, Obj, Spc) end, + ObjList1), + ChkList1 = lists:sublist(lists:sort(ObjList1), 10000), + testutil:check_forlist(Bookie1, ChkList1), + testutil:check_forobject(Bookie1, TestObject), + {B2, K2, V2, Spec2, MD} = {"Bucket1", + "Key1", + "Value1", + [], + {"MDK1", "MDV1"}}, + {TestObject2, TestSpec2} = testutil:generate_testobject(B2, K2, + V2, Spec2, MD), + ok = leveled_bookie:book_riakput(Bookie1, TestObject2, TestSpec2), + ok = leveled_bookie:book_compactjournal(Bookie1, 30000), + testutil:check_forlist(Bookie1, ChkList1), + testutil:check_forobject(Bookie1, TestObject), + testutil:check_forobject(Bookie1, TestObject2), + testutil:check_forlist(Bookie1, ChkList1), + testutil:check_forobject(Bookie1, TestObject), + testutil:check_forobject(Bookie1, TestObject2), + %% Delete some of the objects + ObjListD = testutil:generate_objects(10000, 2), + lists:foreach(fun({_R, O, _S}) -> + ok = leveled_bookie:book_riakdelete(Bookie1, + O#r_object.bucket, + O#r_object.key, + []) + end, + ObjListD), + + %% Now replace all the other objects + ObjList2 = testutil:generate_objects(40000, 10002), + lists:foreach(fun({_RN, Obj, Spc}) -> + leveled_bookie:book_riakput(Bookie1, Obj, Spc) end, + ObjList2), + ok = 
leveled_bookie:book_compactjournal(Bookie1, 30000), + + F = fun leveled_bookie:book_islastcompactionpending/1, + lists:foldl(fun(X, Pending) -> + case Pending of + false -> + false; + true -> + io:format("Loop ~w waiting for journal " + ++ "compaction to complete~n", [X]), + timer:sleep(20000), + F(Bookie1) + end end, + true, + lists:seq(1, 15)), + + ChkList3 = lists:sublist(lists:sort(ObjList2), 500), + testutil:check_forlist(Bookie1, ChkList3), + ok = leveled_bookie:book_close(Bookie1), + % Restart + {ok, Bookie2} = leveled_bookie:book_start(StartOpts1), + testutil:check_forobject(Bookie2, TestObject), + testutil:check_forlist(Bookie2, ChkList3), + ok = leveled_bookie:book_close(Bookie2), + testutil:reset_filestructure(10000). + + +fetchput_snapshot(_Config) -> + RootPath = testutil:reset_filestructure(), + StartOpts1 = [{root_path, RootPath}, {max_journalsize, 30000000}], + {ok, Bookie1} = leveled_bookie:book_start(StartOpts1), + {TestObject, TestSpec} = testutil:generate_testobject(), + ok = leveled_bookie:book_riakput(Bookie1, TestObject, TestSpec), + ObjList1 = testutil:generate_objects(5000, 2), + lists:foreach(fun({_RN, Obj, Spc}) -> + leveled_bookie:book_riakput(Bookie1, Obj, Spc) end, + ObjList1), + SnapOpts1 = [{snapshot_bookie, Bookie1}], + {ok, SnapBookie1} = leveled_bookie:book_start(SnapOpts1), + ChkList1 = lists:sublist(lists:sort(ObjList1), 100), + testutil:check_forlist(Bookie1, ChkList1), + testutil:check_forlist(SnapBookie1, ChkList1), + ok = leveled_bookie:book_close(SnapBookie1), + testutil:check_forlist(Bookie1, ChkList1), + ok = leveled_bookie:book_close(Bookie1), + io:format("Closed initial bookies~n"), + + {ok, Bookie2} = leveled_bookie:book_start(StartOpts1), + SnapOpts2 = [{snapshot_bookie, Bookie2}], + {ok, SnapBookie2} = leveled_bookie:book_start(SnapOpts2), + io:format("Bookies restarted~n"), + + testutil:check_forlist(Bookie2, ChkList1), + io:format("Check active bookie still contains original data~n"), + testutil:check_forlist(SnapBookie2, ChkList1), + io:format("Check snapshot still contains original data~n"), + + + ObjList2 = testutil:generate_objects(5000, 2), + lists:foreach(fun({_RN, Obj, Spc}) -> + leveled_bookie:book_riakput(Bookie2, Obj, Spc) end, + ObjList2), + io:format("Replacement objects put~n"), + + ChkList2 = lists:sublist(lists:sort(ObjList2), 100), + testutil:check_forlist(Bookie2, ChkList2), + testutil:check_forlist(SnapBookie2, ChkList1), + io:format("Checked for replacement objects in active bookie" ++ + ", old objects in snapshot~n"), + + ok = filelib:ensure_dir(RootPath ++ "/ledger/ledger_files"), + {ok, FNsA} = file:list_dir(RootPath ++ "/ledger/ledger_files"), + ObjList3 = testutil:generate_objects(15000, 5002), + lists:foreach(fun({_RN, Obj, Spc}) -> + leveled_bookie:book_riakput(Bookie2, Obj, Spc) end, + ObjList3), + ChkList3 = lists:sublist(lists:sort(ObjList3), 100), + testutil:check_forlist(Bookie2, ChkList3), + testutil:check_formissinglist(SnapBookie2, ChkList3), + GenList = [20002, 40002, 60002, 80002], + CLs2 = testutil:load_objects(20000, GenList, Bookie2, TestObject, + fun testutil:generate_smallobjects/2), + io:format("Loaded significant numbers of new objects~n"), + + testutil:check_forlist(Bookie2, lists:nth(length(CLs2), CLs2)), + io:format("Checked active bookie has new objects~n"), + + {ok, SnapBookie3} = leveled_bookie:book_start(SnapOpts2), + testutil:check_forlist(SnapBookie3, lists:nth(length(CLs2), CLs2)), + testutil:check_formissinglist(SnapBookie2, ChkList3), + testutil:check_formissinglist(SnapBookie2, 
lists:nth(length(CLs2), CLs2)), + testutil:check_forlist(Bookie2, ChkList2), + testutil:check_forlist(SnapBookie3, ChkList2), + testutil:check_forlist(SnapBookie2, ChkList1), + io:format("Started new snapshot and check for new objects~n"), + + CLs3 = testutil:load_objects(20000, GenList, Bookie2, TestObject, + fun testutil:generate_smallobjects/2), + testutil:check_forlist(Bookie2, lists:nth(length(CLs3), CLs3)), + testutil:check_forlist(Bookie2, lists:nth(1, CLs3)), + + io:format("Starting 15s sleep in which snap2 should block deletion~n"), + timer:sleep(15000), + {ok, FNsB} = file:list_dir(RootPath ++ "/ledger/ledger_files"), + ok = leveled_bookie:book_close(SnapBookie2), + io:format("Starting 15s sleep as snap2 close should unblock deletion~n"), + timer:sleep(15000), + io:format("Pause for deletion has ended~n"), + + testutil:check_forlist(Bookie2, lists:nth(length(CLs3), CLs3)), + ok = leveled_bookie:book_close(SnapBookie3), + io:format("Starting 15s sleep as snap3 close should unblock deletion~n"), + timer:sleep(15000), + io:format("Pause for deletion has ended~n"), + testutil:check_forlist(Bookie2, lists:nth(length(CLs3), CLs3)), + testutil:check_forlist(Bookie2, lists:nth(1, CLs3)), + {ok, FNsC} = file:list_dir(RootPath ++ "/ledger/ledger_files"), + true = length(FNsB) > length(FNsA), + true = length(FNsB) > length(FNsC), + + {B1Size, B1Count} = testutil:check_bucket_stats(Bookie2, "Bucket1"), + true = B1Size > 0, + true = B1Count == 1, + {B1Size, B1Count} = testutil:check_bucket_stats(Bookie2, "Bucket1"), + {BSize, BCount} = testutil:check_bucket_stats(Bookie2, "Bucket"), + true = BSize > 0, + true = BCount == 100000, + + ok = leveled_bookie:book_close(Bookie2), + testutil:reset_filestructure(). + + +load_and_count(_Config) -> + % Use artificially small files, and the load keys, counting they're all + % present + RootPath = testutil:reset_filestructure(), + StartOpts1 = [{root_path, RootPath}, {max_journalsize, 50000000}], + {ok, Bookie1} = leveled_bookie:book_start(StartOpts1), + {TestObject, TestSpec} = testutil:generate_testobject(), + ok = leveled_bookie:book_riakput(Bookie1, TestObject, TestSpec), + testutil:check_forobject(Bookie1, TestObject), + io:format("Loading initial small objects~n"), + G1 = fun testutil:generate_smallobjects/2, + lists:foldl(fun(_X, Acc) -> + testutil:load_objects(5000, + [Acc + 2], + Bookie1, + TestObject, + G1), + {_S, Count} = testutil:check_bucket_stats(Bookie1, + "Bucket"), + if + Acc + 5000 == Count -> + ok + end, + Acc + 5000 end, + 0, + lists:seq(1, 20)), + testutil:check_forobject(Bookie1, TestObject), + io:format("Loading larger compressible objects~n"), + G2 = fun testutil:generate_compressibleobjects/2, + lists:foldl(fun(_X, Acc) -> + testutil:load_objects(5000, + [Acc + 2], + Bookie1, + TestObject, + G2), + {_S, Count} = testutil:check_bucket_stats(Bookie1, + "Bucket"), + if + Acc + 5000 == Count -> + ok + end, + Acc + 5000 end, + 100000, + lists:seq(1, 20)), + testutil:check_forobject(Bookie1, TestObject), + io:format("Replacing small objects~n"), + lists:foldl(fun(_X, Acc) -> + testutil:load_objects(5000, + [Acc + 2], + Bookie1, + TestObject, + G1), + {_S, Count} = testutil:check_bucket_stats(Bookie1, + "Bucket"), + if + Count == 200000 -> + ok + end, + Acc + 5000 end, + 0, + lists:seq(1, 20)), + testutil:check_forobject(Bookie1, TestObject), + io:format("Loading more small objects~n"), + lists:foldl(fun(_X, Acc) -> + testutil:load_objects(5000, + [Acc + 2], + Bookie1, + TestObject, + G2), + {_S, Count} = 
testutil:check_bucket_stats(Bookie1, + "Bucket"), + if + Acc + 5000 == Count -> + ok + end, + Acc + 5000 end, + 200000, + lists:seq(1, 20)), + testutil:check_forobject(Bookie1, TestObject), + ok = leveled_bookie:book_close(Bookie1), + {ok, Bookie2} = leveled_bookie:book_start(StartOpts1), + {_, 300000} = testutil:check_bucket_stats(Bookie2, "Bucket"), + ok = leveled_bookie:book_close(Bookie2), + testutil:reset_filestructure(). + +load_and_count_withdelete(_Config) -> + RootPath = testutil:reset_filestructure(), + StartOpts1 = [{root_path, RootPath}, {max_journalsize, 50000000}], + {ok, Bookie1} = leveled_bookie:book_start(StartOpts1), + {TestObject, TestSpec} = testutil:generate_testobject(), + ok = leveled_bookie:book_riakput(Bookie1, TestObject, TestSpec), + testutil:check_forobject(Bookie1, TestObject), + io:format("Loading initial small objects~n"), + G1 = fun testutil:generate_smallobjects/2, + lists:foldl(fun(_X, Acc) -> + testutil:load_objects(5000, + [Acc + 2], + Bookie1, + TestObject, + G1), + {_S, Count} = testutil:check_bucket_stats(Bookie1, + "Bucket"), + if + Acc + 5000 == Count -> + ok + end, + Acc + 5000 end, + 0, + lists:seq(1, 20)), + testutil:check_forobject(Bookie1, TestObject), + {BucketD, KeyD} = leveled_codec:riakto_keydetails(TestObject), + {_, 1} = testutil:check_bucket_stats(Bookie1, BucketD), + ok = leveled_bookie:book_riakdelete(Bookie1, BucketD, KeyD, []), + not_found = leveled_bookie:book_riakget(Bookie1, BucketD, KeyD), + {_, 0} = testutil:check_bucket_stats(Bookie1, BucketD), + io:format("Loading larger compressible objects~n"), + G2 = fun testutil:generate_compressibleobjects/2, + lists:foldl(fun(_X, Acc) -> + testutil:load_objects(5000, + [Acc + 2], + Bookie1, + no_check, + G2), + {_S, Count} = testutil:check_bucket_stats(Bookie1, + "Bucket"), + if + Acc + 5000 == Count -> + ok + end, + Acc + 5000 end, + 100000, + lists:seq(1, 20)), + not_found = leveled_bookie:book_riakget(Bookie1, BucketD, KeyD), + ok = leveled_bookie:book_close(Bookie1), + {ok, Bookie2} = leveled_bookie:book_start(StartOpts1), + testutil:check_formissingobject(Bookie2, BucketD, KeyD), + {_BSize, 0} = testutil:check_bucket_stats(Bookie2, BucketD), + ok = leveled_bookie:book_close(Bookie2), + testutil:reset_filestructure(). 
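+
+%% Sketch (not original code, helper name is an assumption): the repeated
+%% load-then-count fold in load_and_count/1 and load_and_count_withdelete/1
+%% above could be factored per batch, crashing just as the inline version
+%% does if the bucket count does not reach the expected total.
+load_and_check_batch(Bookie, Acc, Gen, ExpectedCount) ->
+    testutil:load_objects(5000, [Acc + 2], Bookie, no_check, Gen),
+    {_S, Count} = testutil:check_bucket_stats(Bookie, "Bucket"),
+    if
+        Count == ExpectedCount ->
+            ok
+    end,
+    Acc + 5000.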
+ + +space_clear_ondelete(_Config) -> + RootPath = testutil:reset_filestructure(), + StartOpts1 = [{root_path, RootPath}, {max_journalsize, 20000000}], + {ok, Book1} = leveled_bookie:book_start(StartOpts1), + G2 = fun testutil:generate_compressibleobjects/2, + testutil:load_objects(20000, + [uuid, uuid, uuid, uuid], + Book1, + no_check, + G2), + + {async, F1} = leveled_bookie:book_returnfolder(Book1, {keylist, o_rkv}), + SW1 = os:timestamp(), + KL1 = F1(), + ok = case length(KL1) of + 80000 -> + io:format("Key list took ~w microseconds for 80K keys~n", + [timer:now_diff(os:timestamp(), SW1)]), + ok + end, + timer:sleep(10000), % Allow for any L0 file to be rolled + {ok, FNsA_L} = file:list_dir(RootPath ++ "/ledger/ledger_files"), + {ok, FNsA_J} = file:list_dir(RootPath ++ "/journal/journal_files"), + io:format("Bookie created ~w journal files and ~w ledger files~n", + [length(FNsA_J), length(FNsA_L)]), + + % Get an iterator to lock the inker during compaction + FoldObjectsFun = fun(B, K, V, Acc) -> [{B, K, testutil:riak_hash(V)}|Acc] + end, + {async, HTreeF1} = leveled_bookie:book_returnfolder(Book1, + {foldobjects_allkeys, + ?RIAK_TAG, + FoldObjectsFun}), + % Delete the keys + SW2 = os:timestamp(), + lists:foreach(fun({Bucket, Key}) -> + ok = leveled_bookie:book_riakdelete(Book1, + Bucket, + Key, + []) + end, + KL1), + io:format("Deletion took ~w microseconds for 80K keys~n", + [timer:now_diff(os:timestamp(), SW2)]), + + + + ok = leveled_bookie:book_compactjournal(Book1, 30000), + F = fun leveled_bookie:book_islastcompactionpending/1, + lists:foldl(fun(X, Pending) -> + case Pending of + false -> + false; + true -> + io:format("Loop ~w waiting for journal " + ++ "compaction to complete~n", [X]), + timer:sleep(20000), + F(Book1) + end end, + true, + lists:seq(1, 15)), + io:format("Waiting for journal deletes - blocked~n"), + timer:sleep(20000), + KeyHashList1 = HTreeF1(), + io:format("Key Hash List returned of length ~w~n", [length(KeyHashList1)]), + true = length(KeyHashList1) == 80000, + io:format("Waiting for journal deletes - unblocked~n"), + timer:sleep(20000), + {ok, FNsB_L} = file:list_dir(RootPath ++ "/ledger/ledger_files"), + {ok, FNsB_J} = file:list_dir(RootPath ++ "/journal/journal_files"), + {ok, FNsB_PC} = file:list_dir(RootPath + ++ "/journal/journal_files/post_compact"), + PointB_Journals = length(FNsB_J) + length(FNsB_PC), + io:format("Bookie has ~w journal files and ~w ledger files " ++ + "after deletes~n", + [PointB_Journals, length(FNsB_L)]), + + {async, F2} = leveled_bookie:book_returnfolder(Book1, {keylist, o_rkv}), + SW3 = os:timestamp(), + KL2 = F2(), + ok = case length(KL2) of + 0 -> + io:format("Key list took ~w microseconds for no keys~n", + [timer:now_diff(os:timestamp(), SW3)]), + ok + end, + ok = leveled_bookie:book_close(Book1), + + {ok, Book2} = leveled_bookie:book_start(StartOpts1), + {async, F3} = leveled_bookie:book_returnfolder(Book2, {keylist, o_rkv}), + SW4 = os:timestamp(), + KL3 = F3(), + ok = case length(KL3) of + 0 -> + io:format("Key list took ~w microseconds for no keys~n", + [timer:now_diff(os:timestamp(), SW4)]), + ok + end, + ok = leveled_bookie:book_close(Book2), + {ok, FNsC_L} = file:list_dir(RootPath ++ "/ledger/ledger_files"), + io:format("Bookie has ~w ledger files " ++ + "after close~n", [length(FNsC_L)]), + + {ok, Book3} = leveled_bookie:book_start(StartOpts1), + io:format("This should cause a final ledger merge event~n"), + io:format("Will require the penciller to resolve the issue of creating" ++ + " an empty file as all keys compact on 
merge~n"), + timer:sleep(12000), + ok = leveled_bookie:book_close(Book3), + {ok, FNsD_L} = file:list_dir(RootPath ++ "/ledger/ledger_files"), + io:format("Bookie has ~w ledger files " ++ + "after second close~n", [length(FNsD_L)]), + true = PointB_Journals < length(FNsA_J), + true = length(FNsD_L) < length(FNsA_L), + true = length(FNsD_L) < length(FNsB_L), + true = length(FNsD_L) < length(FNsC_L), + true = length(FNsD_L) == 0. \ No newline at end of file diff --git a/test/end_to_end/iterator_SUITE.erl b/test/end_to_end/iterator_SUITE.erl new file mode 100644 index 0000000..c52cee9 --- /dev/null +++ b/test/end_to_end/iterator_SUITE.erl @@ -0,0 +1,360 @@ +-module(iterator_SUITE). + +-include_lib("common_test/include/ct.hrl"). +-include("include/leveled.hrl"). + +-define(KEY_ONLY, {false, undefined}). + +-export([all/0]). +-export([small_load_with2i/1, + query_count/1, + rotating_objects/1]). + +all() -> [ + small_load_with2i, + query_count, + rotating_objects + ]. + + +small_load_with2i(_Config) -> + RootPath = testutil:reset_filestructure(), + StartOpts1 = [{root_path, RootPath}, + {max_journalsize, 5000000}], + % low journal size to make sure > 1 created + {ok, Bookie1} = leveled_bookie:book_start(StartOpts1), + {TestObject, TestSpec} = testutil:generate_testobject(), + ok = leveled_bookie:book_riakput(Bookie1, TestObject, TestSpec), + testutil:check_forobject(Bookie1, TestObject), + testutil:check_formissingobject(Bookie1, "Bucket1", "Key2"), + testutil:check_forobject(Bookie1, TestObject), + ObjectGen = testutil:get_compressiblevalue_andinteger(), + IndexGen = testutil:get_randomindexes_generator(8), + ObjL1 = testutil:generate_objects(10000, + uuid, + [], + ObjectGen, + IndexGen), + lists:foreach(fun({_RN, Obj, Spc}) -> + leveled_bookie:book_riakput(Bookie1, Obj, Spc) end, + ObjL1), + ChkList1 = lists:sublist(lists:sort(ObjL1), 100), + testutil:check_forlist(Bookie1, ChkList1), + testutil:check_forobject(Bookie1, TestObject), + + %% Delete the objects from the ChkList removing the indexes + lists:foreach(fun({_RN, Obj, Spc}) -> + DSpc = lists:map(fun({add, F, T}) -> {remove, F, T} + end, + Spc), + {B, K} = leveled_codec:riakto_keydetails(Obj), + leveled_bookie:book_riakdelete(Bookie1, B, K, DSpc) + end, + ChkList1), + %% Get the Buckets Keys and Hashes for the whole bucket + FoldObjectsFun = fun(B, K, V, Acc) -> [{B, K, testutil:riak_hash(V)}|Acc] + end, + {async, HTreeF1} = leveled_bookie:book_returnfolder(Bookie1, + {foldobjects_allkeys, + ?RIAK_TAG, + FoldObjectsFun}), + KeyHashList1 = HTreeF1(), + {async, HTreeF2} = leveled_bookie:book_returnfolder(Bookie1, + {foldobjects_bybucket, + ?RIAK_TAG, + "Bucket", + FoldObjectsFun}), + KeyHashList2 = HTreeF2(), + {async, HTreeF3} = leveled_bookie:book_returnfolder(Bookie1, + {foldobjects_byindex, + ?RIAK_TAG, + "Bucket", + {"idx1_bin", + "#", "~"}, + FoldObjectsFun}), + KeyHashList3 = HTreeF3(), + true = 9901 == length(KeyHashList1), % also includes the test object + true = 9900 == length(KeyHashList2), + true = 9900 == length(KeyHashList3), + + SumIntegerFun = fun(_B, _K, V, Acc) -> + [C] = V#r_object.contents, + {I, _Bin} = C#r_content.value, + Acc + I + end, + {async, Sum1} = leveled_bookie:book_returnfolder(Bookie1, + {foldobjects_bybucket, + ?RIAK_TAG, + "Bucket", + {SumIntegerFun, + 0}}), + Total1 = Sum1(), + true = Total1 > 100000, + + ok = leveled_bookie:book_close(Bookie1), + + {ok, Bookie2} = leveled_bookie:book_start(StartOpts1), + + {async, Sum2} = leveled_bookie:book_returnfolder(Bookie2, + {foldobjects_bybucket, + ?RIAK_TAG, + 
"Bucket", + {SumIntegerFun, + 0}}), + Total2 = Sum2(), + true = Total2 == Total1, + + ok = leveled_bookie:book_close(Bookie2), + testutil:reset_filestructure(). + + +query_count(_Config) -> + RootPath = testutil:reset_filestructure(), + {ok, Book1} = leveled_bookie:book_start(RootPath, 2000, 50000000), + {TestObject, TestSpec} = testutil:generate_testobject("Bucket", + "Key1", + "Value1", + [], + {"MDK1", "MDV1"}), + ok = leveled_bookie:book_riakput(Book1, TestObject, TestSpec), + testutil:check_forobject(Book1, TestObject), + testutil:check_formissingobject(Book1, "Bucket1", "Key2"), + testutil:check_forobject(Book1, TestObject), + lists:foreach(fun(_X) -> + V = testutil:get_compressiblevalue(), + Indexes = testutil:get_randomindexes_generator(8), + SW = os:timestamp(), + ObjL1 = testutil:generate_objects(10000, + uuid, + [], + V, + Indexes), + lists:foreach(fun({_RN, Obj, Spc}) -> + leveled_bookie:book_riakput(Book1, + Obj, + Spc) + end, + ObjL1), + io:format("Put of 10000 objects with 8 index entries " + ++ + "each completed in ~w microseconds~n", + [timer:now_diff(os:timestamp(), SW)]) + end, + lists:seq(1, 8)), + testutil:check_forobject(Book1, TestObject), + Total = lists:foldl(fun(X, Acc) -> + IdxF = "idx" ++ integer_to_list(X) ++ "_bin", + T = count_termsonindex("Bucket", + IdxF, + Book1, + ?KEY_ONLY), + io:format("~w terms found on index ~s~n", + [T, IdxF]), + Acc + T + end, + 0, + lists:seq(1, 8)), + ok = case Total of + 640000 -> + ok + end, + Index1Count = count_termsonindex("Bucket", + "idx1_bin", + Book1, + ?KEY_ONLY), + ok = leveled_bookie:book_close(Book1), + {ok, Book2} = leveled_bookie:book_start(RootPath, 1000, 50000000), + Index1Count = count_termsonindex("Bucket", + "idx1_bin", + Book2, + ?KEY_ONLY), + NameList = testutil:name_list(), + TotalNameByName = lists:foldl(fun({_X, Name}, Acc) -> + {ok, Regex} = re:compile("[0-9]+" ++ + Name), + SW = os:timestamp(), + T = count_termsonindex("Bucket", + "idx1_bin", + Book2, + {false, + Regex}), + TD = timer:now_diff(os:timestamp(), + SW), + io:format("~w terms found on " ++ + "index idx1 with a " ++ + "regex in ~w " ++ + "microseconds~n", + [T, TD]), + Acc + T + end, + 0, + NameList), + ok = case TotalNameByName of + Index1Count -> + ok + end, + {ok, RegMia} = re:compile("[0-9]+Mia"), + {async, + Mia2KFolder1} = leveled_bookie:book_returnfolder(Book2, + {index_query, + "Bucket", + {"idx2_bin", + "2000", + "2000~"}, + {false, + RegMia}}), + Mia2000Count1 = length(Mia2KFolder1()), + {async, + Mia2KFolder2} = leveled_bookie:book_returnfolder(Book2, + {index_query, + "Bucket", + {"idx2_bin", + "2000", + "2001"}, + {true, + undefined}}), + Mia2000Count2 = lists:foldl(fun({Term, _Key}, Acc) -> + case re:run(Term, RegMia) of + nomatch -> + Acc; + _ -> + Acc + 1 + end end, + 0, + Mia2KFolder2()), + ok = case Mia2000Count2 of + Mia2000Count1 when Mia2000Count1 > 0 -> + io:format("Mia2000 counts match at ~w~n", + [Mia2000Count1]), + ok + end, + {ok, RxMia2K} = re:compile("^2000[0-9]+Mia"), + {async, + Mia2KFolder3} = leveled_bookie:book_returnfolder(Book2, + {index_query, + "Bucket", + {"idx2_bin", + "1980", + "2100"}, + {false, + RxMia2K}}), + Mia2000Count1 = length(Mia2KFolder3()), + + V9 = testutil:get_compressiblevalue(), + Indexes9 = testutil:get_randomindexes_generator(8), + [{_RN, Obj9, Spc9}] = testutil:generate_objects(1, uuid, [], V9, Indexes9), + ok = leveled_bookie:book_riakput(Book2, Obj9, Spc9), + R9 = lists:map(fun({add, IdxF, IdxT}) -> + R = leveled_bookie:book_returnfolder(Book2, + {index_query, + "Bucket", + {IdxF, + 
IdxT, + IdxT}, + ?KEY_ONLY}), + {async, Fldr} = R, + case length(Fldr()) of + X when X > 0 -> + {IdxF, IdxT, X} + end + end, + Spc9), + Spc9Del = lists:map(fun({add, IdxF, IdxT}) -> {remove, IdxF, IdxT} end, + Spc9), + ok = leveled_bookie:book_riakput(Book2, Obj9, Spc9Del), + lists:foreach(fun({IdxF, IdxT, X}) -> + R = leveled_bookie:book_returnfolder(Book2, + {index_query, + "Bucket", + {IdxF, + IdxT, + IdxT}, + ?KEY_ONLY}), + {async, Fldr} = R, + case length(Fldr()) of + Y -> + Y = X - 1 + end + end, + R9), + ok = leveled_bookie:book_close(Book2), + {ok, Book3} = leveled_bookie:book_start(RootPath, 2000, 50000000), + lists:foreach(fun({IdxF, IdxT, X}) -> + R = leveled_bookie:book_returnfolder(Book3, + {index_query, + "Bucket", + {IdxF, + IdxT, + IdxT}, + ?KEY_ONLY}), + {async, Fldr} = R, + case length(Fldr()) of + Y -> + Y = X - 1 + end + end, + R9), + ok = leveled_bookie:book_riakput(Book3, Obj9, Spc9), + ok = leveled_bookie:book_close(Book3), + {ok, Book4} = leveled_bookie:book_start(RootPath, 2000, 50000000), + lists:foreach(fun({IdxF, IdxT, X}) -> + R = leveled_bookie:book_returnfolder(Book4, + {index_query, + "Bucket", + {IdxF, + IdxT, + IdxT}, + ?KEY_ONLY}), + {async, Fldr} = R, + case length(Fldr()) of + X -> + ok + end + end, + R9), + testutil:check_forobject(Book4, TestObject), + ok = leveled_bookie:book_close(Book4), + testutil:reset_filestructure(). + + + +count_termsonindex(Bucket, IdxField, Book, QType) -> + lists:foldl(fun(X, Acc) -> + SW = os:timestamp(), + ST = integer_to_list(X), + ET = ST ++ "~", + R = leveled_bookie:book_returnfolder(Book, + {index_query, + Bucket, + {IdxField, + ST, + ET}, + QType}), + {async, Folder} = R, + Items = length(Folder()), + io:format("2i query from term ~s on index ~s took " ++ + "~w microseconds~n", + [ST, + IdxField, + timer:now_diff(os:timestamp(), SW)]), + Acc + Items + end, + 0, + lists:seq(190, 221)). + + +rotating_objects(_Config) -> + RootPath = testutil:reset_filestructure(), + ok = testutil:rotating_object_check(RootPath, "Bucket1", 10), + ok = testutil:rotating_object_check(RootPath, "Bucket2", 200), + ok = testutil:rotating_object_check(RootPath, "Bucket3", 800), + ok = testutil:rotating_object_check(RootPath, "Bucket4", 1600), + ok = testutil:rotating_object_check(RootPath, "Bucket5", 3200), + ok = testutil:rotating_object_check(RootPath, "Bucket6", 9600), + testutil:reset_filestructure(). + + + + + + diff --git a/test/end_to_end/recovery_SUITE.erl b/test/end_to_end/recovery_SUITE.erl new file mode 100644 index 0000000..de58e4c --- /dev/null +++ b/test/end_to_end/recovery_SUITE.erl @@ -0,0 +1,314 @@ +-module(recovery_SUITE). +-include_lib("common_test/include/ct.hrl"). +-include("include/leveled.hrl"). +-export([all/0]). +-export([retain_strategy/1, + aae_bustedjournal/1, + journal_compaction_bustedjournal/1 + ]). + +all() -> [ + retain_strategy, + aae_bustedjournal, + journal_compaction_bustedjournal + ]. 
+ +retain_strategy(_Config) -> + RootPath = testutil:reset_filestructure(), + BookOpts = [{root_path, RootPath}, + {cache_size, 1000}, + {max_journalsize, 5000000}, + {reload_strategy, [{?RIAK_TAG, retain}]}], + BookOptsAlt = [{root_path, RootPath}, + {cache_size, 1000}, + {max_journalsize, 100000}, + {reload_strategy, [{?RIAK_TAG, retain}]}, + {max_run_length, 8}], + {ok, Spcl3, LastV3} = rotating_object_check(BookOpts, "Bucket3", 800), + ok = restart_from_blankledger(BookOpts, [{"Bucket3", Spcl3, LastV3}]), + {ok, Spcl4, LastV4} = rotating_object_check(BookOpts, "Bucket4", 1600), + ok = restart_from_blankledger(BookOpts, [{"Bucket3", Spcl3, LastV3}, + {"Bucket4", Spcl4, LastV4}]), + {ok, Spcl5, LastV5} = rotating_object_check(BookOpts, "Bucket5", 3200), + ok = restart_from_blankledger(BookOptsAlt, [{"Bucket3", Spcl3, LastV3}, + {"Bucket5", Spcl5, LastV5}]), + {ok, Spcl6, LastV6} = rotating_object_check(BookOpts, "Bucket6", 6400), + ok = restart_from_blankledger(BookOpts, [{"Bucket3", Spcl3, LastV3}, + {"Bucket4", Spcl4, LastV4}, + {"Bucket5", Spcl5, LastV5}, + {"Bucket6", Spcl6, LastV6}]), + testutil:reset_filestructure(). + + + +aae_bustedjournal(_Config) -> + RootPath = testutil:reset_filestructure(), + StartOpts = [{root_path, RootPath}, + {max_journalsize, 20000000}], + {ok, Bookie1} = leveled_bookie:book_start(StartOpts), + {TestObject, TestSpec} = testutil:generate_testobject(), + ok = leveled_bookie:book_riakput(Bookie1, TestObject, TestSpec), + testutil:check_forobject(Bookie1, TestObject), + GenList = [2], + _CLs = testutil:load_objects(20000, GenList, Bookie1, TestObject, + fun testutil:generate_objects/2), + ok = leveled_bookie:book_close(Bookie1), + CDBFiles = testutil:find_journals(RootPath), + [HeadF|_Rest] = CDBFiles, + io:format("Selected Journal for corruption of ~s~n", [HeadF]), + testutil:corrupt_journal(RootPath, HeadF, 1000, 2048, 1000), + {ok, Bookie2} = leveled_bookie:book_start(StartOpts), + + {async, KeyF} = leveled_bookie:book_returnfolder(Bookie2, + {keylist, ?RIAK_TAG}), + KeyList = KeyF(), + 20001 = length(KeyList), + HeadCount = lists:foldl(fun({B, K}, Acc) -> + case leveled_bookie:book_riakhead(Bookie2, + B, + K) of + {ok, _} -> Acc + 1; + not_found -> Acc + end + end, + 0, + KeyList), + 20001 = HeadCount, + GetCount = lists:foldl(fun({B, K}, Acc) -> + case leveled_bookie:book_riakget(Bookie2, + B, + K) of + {ok, _} -> Acc + 1; + not_found -> Acc + end + end, + 0, + KeyList), + true = GetCount > 19000, + true = GetCount < HeadCount, + + {async, HashTreeF1} = leveled_bookie:book_returnfolder(Bookie2, + {hashtree_query, + ?RIAK_TAG, + false}), + KeyHashList1 = HashTreeF1(), + 20001 = length(KeyHashList1), + {async, HashTreeF2} = leveled_bookie:book_returnfolder(Bookie2, + {hashtree_query, + ?RIAK_TAG, + check_presence}), + KeyHashList2 = HashTreeF2(), + % The file is still there, and the hashtree is not corrupted + KeyHashList2 = KeyHashList1, + % Will need to remove the file or corrupt the hashtree to get presence to + % fail + + FoldObjectsFun = fun(B, K, V, Acc) -> [{B, K, testutil:riak_hash(V)}|Acc] + end, + SW = os:timestamp(), + {async, HashTreeF3} = leveled_bookie:book_returnfolder(Bookie2, + {foldobjects_allkeys, + ?RIAK_TAG, + FoldObjectsFun}), + KeyHashList3 = HashTreeF3(), + + true = length(KeyHashList3) > 19000, + true = length(KeyHashList3) < HeadCount, + Delta = length(lists:subtract(KeyHashList1, KeyHashList3)), + true = Delta < 1001, + io:format("Fetch of hashtree using fold objects took ~w microseconds" ++ + " and found a Delta of ~w and an 
objects count of ~w~n", + [timer:now_diff(os:timestamp(), SW), + Delta, + length(KeyHashList3)]), + + ok = leveled_bookie:book_close(Bookie2), + {ok, BytesCopied} = testutil:restore_file(RootPath, HeadF), + io:format("File restored is of size ~w~n", [BytesCopied]), + {ok, Bookie3} = leveled_bookie:book_start(StartOpts), + + SW4 = os:timestamp(), + {async, HashTreeF4} = leveled_bookie:book_returnfolder(Bookie3, + {foldobjects_allkeys, + ?RIAK_TAG, + FoldObjectsFun}), + KeyHashList4 = HashTreeF4(), + + true = length(KeyHashList4) == 20001, + io:format("Fetch of hashtree using fold objects took ~w microseconds" ++ + " and found an object count of ~w~n", + [timer:now_diff(os:timestamp(), SW4), length(KeyHashList4)]), + + ok = leveled_bookie:book_close(Bookie3), + testutil:corrupt_journal(RootPath, HeadF, 500, BytesCopied - 8000, 14), + + {ok, Bookie4} = leveled_bookie:book_start(StartOpts), + + SW5 = os:timestamp(), + {async, HashTreeF5} = leveled_bookie:book_returnfolder(Bookie4, + {foldobjects_allkeys, + ?RIAK_TAG, + FoldObjectsFun}), + KeyHashList5 = HashTreeF5(), + + true = length(KeyHashList5) > 19000, + true = length(KeyHashList5) < HeadCount, + Delta5 = length(lists:subtract(KeyHashList1, KeyHashList5)), + true = Delta5 < 1001, + io:format("Fetch of hashtree using fold objects took ~w microseconds" ++ + " and found a Delta of ~w and an objects count of ~w~n", + [timer:now_diff(os:timestamp(), SW5), + Delta5, + length(KeyHashList5)]), + + {async, HashTreeF6} = leveled_bookie:book_returnfolder(Bookie4, + {hashtree_query, + ?RIAK_TAG, + check_presence}), + KeyHashList6 = HashTreeF6(), + true = length(KeyHashList6) > 19000, + true = length(KeyHashList6) < HeadCount, + + ok = leveled_bookie:book_close(Bookie4), + + testutil:restore_topending(RootPath, HeadF), + + {ok, Bookie5} = leveled_bookie:book_start(StartOpts), + + SW6 = os:timestamp(), + {async, HashTreeF7} = leveled_bookie:book_returnfolder(Bookie5, + {foldobjects_allkeys, + ?RIAK_TAG, + FoldObjectsFun}), + KeyHashList7 = HashTreeF7(), + + true = length(KeyHashList7) == 20001, + io:format("Fetch of hashtree using fold objects took ~w microseconds" ++ + " and found an object count of ~w~n", + [timer:now_diff(os:timestamp(), SW6), length(KeyHashList7)]), + + ok = leveled_bookie:book_close(Bookie5), + testutil:reset_filestructure(). 
+ + +journal_compaction_bustedjournal(_Config) -> + % Simply confirms that none of this causes a crash + RootPath = testutil:reset_filestructure(), + StartOpts1 = [{root_path, RootPath}, + {max_journalsize, 10000000}, + {max_run_length, 10}], + {ok, Bookie1} = leveled_bookie:book_start(StartOpts1), + {TestObject, TestSpec} = testutil:generate_testobject(), + ok = leveled_bookie:book_riakput(Bookie1, TestObject, TestSpec), + testutil:check_forobject(Bookie1, TestObject), + ObjList1 = testutil:generate_objects(50000, 2), + lists:foreach(fun({_RN, Obj, Spc}) -> + leveled_bookie:book_riakput(Bookie1, Obj, Spc) end, + ObjList1), + %% Now replace all the objects + ObjList2 = testutil:generate_objects(50000, 2), + lists:foreach(fun({_RN, Obj, Spc}) -> + leveled_bookie:book_riakput(Bookie1, Obj, Spc) end, + ObjList2), + ok = leveled_bookie:book_close(Bookie1), + + CDBFiles = testutil:find_journals(RootPath), + lists:foreach(fun(FN) -> + testutil:corrupt_journal(RootPath, FN, 100, 2048, 1000) + end, + CDBFiles), + + {ok, Bookie2} = leveled_bookie:book_start(StartOpts1), + + ok = leveled_bookie:book_compactjournal(Bookie2, 30000), + F = fun leveled_bookie:book_islastcompactionpending/1, + lists:foldl(fun(X, Pending) -> + case Pending of + false -> + false; + true -> + io:format("Loop ~w waiting for journal " + ++ "compaction to complete~n", [X]), + timer:sleep(20000), + F(Bookie2) + end end, + true, + lists:seq(1, 15)), + + ok = leveled_bookie:book_close(Bookie2), + testutil:reset_filestructure(10000). + + +rotating_object_check(BookOpts, B, NumberOfObjects) -> + {ok, Book1} = leveled_bookie:book_start(BookOpts), + {KSpcL1, V1} = testutil:put_indexed_objects(Book1, B, NumberOfObjects), + ok = testutil:check_indexed_objects(Book1, + B, + KSpcL1, + V1), + {KSpcL2, V2} = testutil:put_altered_indexed_objects(Book1, + B, + KSpcL1, + false), + ok = testutil:check_indexed_objects(Book1, + B, + KSpcL1 ++ KSpcL2, + V2), + {KSpcL3, V3} = testutil:put_altered_indexed_objects(Book1, + B, + KSpcL2, + false), + ok = leveled_bookie:book_close(Book1), + {ok, Book2} = leveled_bookie:book_start(BookOpts), + ok = testutil:check_indexed_objects(Book2, + B, + KSpcL1 ++ KSpcL2 ++ KSpcL3, + V3), + {KSpcL4, V4} = testutil:put_altered_indexed_objects(Book2, + B, + KSpcL3, + false), + io:format("Bucket complete - checking index before compaction~n"), + ok = testutil:check_indexed_objects(Book2, + B, + KSpcL1 ++ KSpcL2 ++ KSpcL3 ++ KSpcL4, + V4), + + ok = leveled_bookie:book_compactjournal(Book2, 30000), + F = fun leveled_bookie:book_islastcompactionpending/1, + lists:foldl(fun(X, Pending) -> + case Pending of + false -> + false; + true -> + io:format("Loop ~w waiting for journal " + ++ "compaction to complete~n", [X]), + timer:sleep(20000), + F(Book2) + end end, + true, + lists:seq(1, 15)), + io:format("Waiting for journal deletes~n"), + timer:sleep(20000), + + io:format("Checking index following compaction~n"), + ok = testutil:check_indexed_objects(Book2, + B, + KSpcL1 ++ KSpcL2 ++ KSpcL3 ++ KSpcL4, + V4), + + ok = leveled_bookie:book_close(Book2), + {ok, KSpcL1 ++ KSpcL2 ++ KSpcL3 ++ KSpcL4, V4}. + + +restart_from_blankledger(BookOpts, B_SpcL) -> + leveled_penciller:clean_testdir(proplists:get_value(root_path, BookOpts) ++ + "/ledger"), + {ok, Book1} = leveled_bookie:book_start(BookOpts), + io:format("Checking index following restart~n"), + lists:foreach(fun({B, SpcL, V}) -> + ok = testutil:check_indexed_objects(Book1, B, SpcL, V) + end, + B_SpcL), + ok = leveled_bookie:book_close(Book1), + ok. 
\ No newline at end of file diff --git a/test/end_to_end/testutil.erl b/test/end_to_end/testutil.erl new file mode 100644 index 0000000..755cf88 --- /dev/null +++ b/test/end_to_end/testutil.erl @@ -0,0 +1,440 @@ +-module(testutil). + +-include("../include/leveled.hrl"). + +-export([reset_filestructure/0, + reset_filestructure/1, + check_bucket_stats/2, + check_forlist/2, + check_forlist/3, + check_formissinglist/2, + check_forobject/2, + check_formissingobject/3, + generate_testobject/0, + generate_testobject/5, + generate_compressibleobjects/2, + generate_smallobjects/2, + generate_objects/2, + generate_objects/5, + generate_objects/6, + set_object/5, + get_key/1, + get_value/1, + get_compressiblevalue/0, + get_compressiblevalue_andinteger/0, + get_randomindexes_generator/1, + name_list/0, + load_objects/5, + put_indexed_objects/3, + put_altered_indexed_objects/3, + put_altered_indexed_objects/4, + check_indexed_objects/4, + rotating_object_check/3, + corrupt_journal/5, + restore_file/2, + restore_topending/2, + find_journals/1, + riak_hash/1]). + +-define(RETURN_TERMS, {true, undefined}). + + +reset_filestructure() -> + reset_filestructure(0). + +reset_filestructure(Wait) -> + io:format("Waiting ~w ms to give a chance for all file closes " ++ + "to complete~n", [Wait]), + timer:sleep(Wait), + RootPath = "test", + filelib:ensure_dir(RootPath ++ "/journal/"), + filelib:ensure_dir(RootPath ++ "/ledger/"), + leveled_inker:clean_testdir(RootPath ++ "/journal"), + leveled_penciller:clean_testdir(RootPath ++ "/ledger"), + RootPath. + + + +check_bucket_stats(Bookie, Bucket) -> + FoldSW1 = os:timestamp(), + io:format("Checking bucket size~n"), + {async, Folder1} = leveled_bookie:book_returnfolder(Bookie, + {riakbucket_stats, + Bucket}), + {B1Size, B1Count} = Folder1(), + io:format("Bucket fold completed in ~w microseconds~n", + [timer:now_diff(os:timestamp(), FoldSW1)]), + io:format("Bucket ~s has size ~w and count ~w~n", + [Bucket, B1Size, B1Count]), + {B1Size, B1Count}. + + +check_forlist(Bookie, ChkList) -> + check_forlist(Bookie, ChkList, false). + +check_forlist(Bookie, ChkList, Log) -> + SW = os:timestamp(), + lists:foreach(fun({_RN, Obj, _Spc}) -> + if + Log == true -> + io:format("Fetching Key ~s~n", [Obj#r_object.key]); + true -> + ok + end, + R = leveled_bookie:book_riakget(Bookie, + Obj#r_object.bucket, + Obj#r_object.key), + ok = case R of + {ok, Obj} -> + ok; + not_found -> + io:format("Object not found for key ~s~n", + [Obj#r_object.key]), + error + end + end, + ChkList), + io:format("Fetch check took ~w microseconds checking list of length ~w~n", + [timer:now_diff(os:timestamp(), SW), length(ChkList)]). + +check_formissinglist(Bookie, ChkList) -> + SW = os:timestamp(), + lists:foreach(fun({_RN, Obj, _Spc}) -> + R = leveled_bookie:book_riakget(Bookie, + Obj#r_object.bucket, + Obj#r_object.key), + R = not_found end, + ChkList), + io:format("Miss check took ~w microseconds checking list of length ~w~n", + [timer:now_diff(os:timestamp(), SW), length(ChkList)]). + +check_forobject(Bookie, TestObject) -> + {ok, TestObject} = leveled_bookie:book_riakget(Bookie, + TestObject#r_object.bucket, + TestObject#r_object.key), + {ok, HeadObject} = leveled_bookie:book_riakhead(Bookie, + TestObject#r_object.bucket, + TestObject#r_object.key), + ok = case {HeadObject#r_object.bucket, + HeadObject#r_object.key, + HeadObject#r_object.vclock} of + {B1, K1, VC1} when B1 == TestObject#r_object.bucket, + K1 == TestObject#r_object.key, + VC1 == TestObject#r_object.vclock -> + ok + end. 
+ +check_formissingobject(Bookie, Bucket, Key) -> + not_found = leveled_bookie:book_riakget(Bookie, Bucket, Key), + not_found = leveled_bookie:book_riakhead(Bookie, Bucket, Key). + + +generate_testobject() -> + {B1, K1, V1, Spec1, MD} = {"Bucket1", + "Key1", + "Value1", + [], + {"MDK1", "MDV1"}}, + generate_testobject(B1, K1, V1, Spec1, MD). + +generate_testobject(B, K, V, Spec, MD) -> + Content = #r_content{metadata=MD, value=V}, + {#r_object{bucket=B, key=K, contents=[Content], vclock=[{'a',1}]}, + Spec}. + + +generate_compressibleobjects(Count, KeyNumber) -> + V = get_compressiblevalue(), + generate_objects(Count, KeyNumber, [], V). + + +get_compressiblevalue_andinteger() -> + {random:uniform(1000), get_compressiblevalue()}. + +get_compressiblevalue() -> + S1 = "111111111111111", + S2 = "222222222222222", + S3 = "333333333333333", + S4 = "aaaaaaaaaaaaaaa", + S5 = "AAAAAAAAAAAAAAA", + S6 = "GGGGGGGGGGGGGGG", + S7 = "===============", + S8 = "...............", + Selector = [{1, S1}, {2, S2}, {3, S3}, {4, S4}, + {5, S5}, {6, S6}, {7, S7}, {8, S8}], + L = lists:seq(1, 1024), + lists:foldl(fun(_X, Acc) -> + {_, Str} = lists:keyfind(random:uniform(8), 1, Selector), + Acc ++ Str end, + "", + L). + +generate_smallobjects(Count, KeyNumber) -> + generate_objects(Count, KeyNumber, [], crypto:rand_bytes(512)). + +generate_objects(Count, KeyNumber) -> + generate_objects(Count, KeyNumber, [], crypto:rand_bytes(4096)). + + +generate_objects(Count, KeyNumber, ObjL, Value) -> + generate_objects(Count, KeyNumber, ObjL, Value, fun() -> [] end). + +generate_objects(Count, KeyNumber, ObjL, Value, IndexGen) -> + generate_objects(Count, KeyNumber, ObjL, Value, IndexGen, "Bucket"). + +generate_objects(0, _KeyNumber, ObjL, _Value, _IndexGen, _Bucket) -> + ObjL; +generate_objects(Count, uuid, ObjL, Value, IndexGen, Bucket) -> + {Obj1, Spec1} = set_object(Bucket, + leveled_codec:generate_uuid(), + Value, + IndexGen), + generate_objects(Count - 1, + uuid, + ObjL ++ [{random:uniform(), Obj1, Spec1}], + Value, + IndexGen, + Bucket); +generate_objects(Count, KeyNumber, ObjL, Value, IndexGen, Bucket) -> + {Obj1, Spec1} = set_object(Bucket, + "Key" ++ integer_to_list(KeyNumber), + Value, + IndexGen), + generate_objects(Count - 1, + KeyNumber + 1, + ObjL ++ [{random:uniform(), Obj1, Spec1}], + Value, + IndexGen, + Bucket). + +set_object(Bucket, Key, Value, IndexGen) -> + set_object(Bucket, Key, Value, IndexGen, []). + +set_object(Bucket, Key, Value, IndexGen, Indexes2Remove) -> + Obj = {Bucket, + Key, + Value, + IndexGen() ++ lists:map(fun({add, IdxF, IdxV}) -> + {remove, IdxF, IdxV} end, + Indexes2Remove), + [{"MDK", "MDV" ++ Key}, + {"MDK2", "MDV" ++ Key}]}, + {B1, K1, V1, Spec1, MD} = Obj, + Content = #r_content{metadata=MD, value=V1}, + {#r_object{bucket=B1, key=K1, contents=[Content], vclock=[{'a',1}]}, + Spec1}. + +get_key(Object) -> + Object#r_object.key. + +get_value(Object) -> + [Content] = Object#r_object.contents, + Content#r_content.value. + +load_objects(ChunkSize, GenList, Bookie, TestObject, Generator) -> + lists:map(fun(KN) -> + ObjListA = Generator(ChunkSize, KN), + StartWatchA = os:timestamp(), + lists:foreach(fun({_RN, Obj, Spc}) -> + leveled_bookie:book_riakput(Bookie, Obj, Spc) + end, + ObjListA), + Time = timer:now_diff(os:timestamp(), StartWatchA), + io:format("~w objects loaded in ~w seconds~n", + [ChunkSize, Time/1000000]), + if + TestObject == no_check -> + ok; + true -> + check_forobject(Bookie, TestObject) + end, + lists:sublist(ObjListA, 1000) end, + GenList). 
+ + +get_randomindexes_generator(Count) -> + Generator = fun() -> + lists:map(fun(X) -> + {add, + "idx" ++ integer_to_list(X) ++ "_bin", + get_randomdate() ++ get_randomname()} end, + lists:seq(1, Count)) + end, + Generator. + +name_list() -> + [{1, "Sophia"}, {2, "Emma"}, {3, "Olivia"}, {4, "Ava"}, + {5, "Isabella"}, {6, "Mia"}, {7, "Zoe"}, {8, "Lily"}, + {9, "Emily"}, {10, "Madelyn"}, {11, "Madison"}, {12, "Chloe"}, + {13, "Charlotte"}, {14, "Aubrey"}, {15, "Avery"}, + {16, "Abigail"}]. + +get_randomname() -> + NameList = name_list(), + N = random:uniform(16), + {N, Name} = lists:keyfind(N, 1, NameList), + Name. + +get_randomdate() -> + LowTime = 60000000000, + HighTime = 70000000000, + RandPoint = LowTime + random:uniform(HighTime - LowTime), + Date = calendar:gregorian_seconds_to_datetime(RandPoint), + {{Year, Month, Day}, {Hour, Minute, Second}} = Date, + lists:flatten(io_lib:format("~4..0w~2..0w~2..0w~2..0w~2..0w~2..0w", + [Year, Month, Day, Hour, Minute, Second])). + + +check_indexed_objects(Book, B, KSpecL, V) -> + % Check all objects match, return what should be the results of an all + % index query + IdxR = lists:map(fun({K, Spc}) -> + {ok, O} = leveled_bookie:book_riakget(Book, B, K), + V = testutil:get_value(O), + {add, + "idx1_bin", + IdxVal} = lists:keyfind(add, 1, Spc), + {IdxVal, K} end, + KSpecL), + % Check the all index query matches expectations + R = leveled_bookie:book_returnfolder(Book, + {index_query, + B, + {"idx1_bin", + "0", + "~"}, + ?RETURN_TERMS}), + SW = os:timestamp(), + {async, Fldr} = R, + QR0 = Fldr(), + io:format("Query match found of length ~w in ~w microseconds " ++ + "expected ~w ~n", + [length(QR0), + timer:now_diff(os:timestamp(), SW), + length(IdxR)]), + QR = lists:sort(QR0), + ER = lists:sort(IdxR), + + ok = if + ER == QR -> + ok + end, + ok. + + +put_indexed_objects(Book, Bucket, Count) -> + V = testutil:get_compressiblevalue(), + IndexGen = testutil:get_randomindexes_generator(1), + SW = os:timestamp(), + ObjL1 = testutil:generate_objects(Count, + uuid, + [], + V, + IndexGen, + Bucket), + KSpecL = lists:map(fun({_RN, Obj, Spc}) -> + leveled_bookie:book_riakput(Book, + Obj, + Spc), + {testutil:get_key(Obj), Spc} + end, + ObjL1), + io:format("Put of ~w objects with ~w index entries " + ++ + "each completed in ~w microseconds~n", + [Count, 1, timer:now_diff(os:timestamp(), SW)]), + {KSpecL, V}. + + +put_altered_indexed_objects(Book, Bucket, KSpecL) -> + put_altered_indexed_objects(Book, Bucket, KSpecL, true). + +put_altered_indexed_objects(Book, Bucket, KSpecL, RemoveOld2i) -> + IndexGen = testutil:get_randomindexes_generator(1), + V = testutil:get_compressiblevalue(), + RplKSpecL = lists:map(fun({K, Spc}) -> + AddSpc = if + RemoveOld2i == true -> + [lists:keyfind(add, 1, Spc)]; + RemoveOld2i == false -> + [] + end, + {O, AltSpc} = testutil:set_object(Bucket, + K, + V, + IndexGen, + AddSpc), + ok = leveled_bookie:book_riakput(Book, + O, + AltSpc), + {K, AltSpc} end, + KSpecL), + {RplKSpecL, V}. 
+ +rotating_object_check(RootPath, B, NumberOfObjects) -> + BookOpts = [{root_path, RootPath}, + {cache_size, 1000}, + {max_journalsize, 5000000}], + {ok, Book1} = leveled_bookie:book_start(BookOpts), + {KSpcL1, V1} = testutil:put_indexed_objects(Book1, B, NumberOfObjects), + ok = testutil:check_indexed_objects(Book1, B, KSpcL1, V1), + {KSpcL2, V2} = testutil:put_altered_indexed_objects(Book1, B, KSpcL1), + ok = testutil:check_indexed_objects(Book1, B, KSpcL2, V2), + {KSpcL3, V3} = testutil:put_altered_indexed_objects(Book1, B, KSpcL2), + ok = leveled_bookie:book_close(Book1), + {ok, Book2} = leveled_bookie:book_start(BookOpts), + ok = testutil:check_indexed_objects(Book2, B, KSpcL3, V3), + {KSpcL4, V4} = testutil:put_altered_indexed_objects(Book2, B, KSpcL3), + ok = testutil:check_indexed_objects(Book2, B, KSpcL4, V4), + ok = leveled_bookie:book_close(Book2), + ok. + +corrupt_journal(RootPath, FileName, Corruptions, BasePosition, GapSize) -> + OriginalPath = RootPath ++ "/journal/journal_files/" ++ FileName, + BackupPath = RootPath ++ "/journal/journal_files/" ++ + filename:basename(FileName, ".cdb") ++ ".bak", + {ok, _BytesCopied} = file:copy(OriginalPath, BackupPath), + {ok, Handle} = file:open(OriginalPath, [binary, raw, read, write]), + lists:foreach(fun(X) -> + Position = X * GapSize + BasePosition, + ok = file:pwrite(Handle, Position, <<0:8/integer>>) + end, + lists:seq(1, Corruptions)), + ok = file:close(Handle). + + +restore_file(RootPath, FileName) -> + OriginalPath = RootPath ++ "/journal/journal_files/" ++ FileName, + BackupPath = RootPath ++ "/journal/journal_files/" ++ + filename:basename(FileName, ".cdb") ++ ".bak", + file:copy(BackupPath, OriginalPath). + +restore_topending(RootPath, FileName) -> + OriginalPath = RootPath ++ "/journal/journal_files/" ++ FileName, + PndPath = RootPath ++ "/journal/journal_files/" ++ + filename:basename(FileName, ".cdb") ++ ".pnd", + ok = file:rename(OriginalPath, PndPath), + false = filelib:is_file(OriginalPath). + +find_journals(RootPath) -> + {ok, FNsA_J} = file:list_dir(RootPath ++ "/journal/journal_files"), + {ok, Regex} = re:compile(".*\.cdb"), + CDBFiles = lists:foldl(fun(FN, Acc) -> case re:run(FN, Regex) of + nomatch -> + Acc; + _ -> + [FN|Acc] + end + end, + [], + FNsA_J), + CDBFiles. + + +riak_hash(Obj=#r_object{}) -> + Vclock = vclock(Obj), + UpdObj = set_vclock(Obj, lists:sort(Vclock)), + erlang:phash2(term_to_binary(UpdObj)). + +set_vclock(Object=#r_object{}, VClock) -> Object#r_object{vclock=VClock}. +vclock(#r_object{vclock=VClock}) -> VClock. diff --git a/test/lookup_test.erl b/test/lookup_test.erl new file mode 100644 index 0000000..8afe7a4 --- /dev/null +++ b/test/lookup_test.erl @@ -0,0 +1,323 @@ +-module(lookup_test). + +-export([go_dict/1, + go_ets/1, + go_gbtree/1, + go_arrayofdict/1, + go_arrayofgbtree/1, + go_arrayofdict_withcache/1, + create_blocks/3, + size_testblocks/1, + test_testblocks/2]). + +-define(CACHE_SIZE, 512). + +hash(Key) -> + H = 5381, + hash1(H,Key) band 16#FFFFFFFF. + +hash1(H,[]) ->H; +hash1(H,[B|Rest]) -> + H1 = H * 33, + H2 = H1 bxor B, + hash1(H2,Rest). + +% Get the least significant 8 bits from the hash. +hash_to_index(Hash) -> + Hash band 255. + + +%% +%% Timings (microseconds): +%% +%% go_dict(200000) : 1569894 +%% go_dict(1000000) : 17191365 +%% go_dict(5000000) : forever + +go_dict(N) -> + go_dict(dict:new(), N, N). 
+ +go_dict(_, 0, _) -> + {erlang:memory(), statistics(garbage_collection)}; +go_dict(D, N, M) -> + % Lookup a random key - which may not be present + LookupKey = lists:concat(["key-", random:uniform(M)]), + LookupHash = hash(LookupKey), + dict:find(LookupHash, D), + + % Add a new key - which may be present so value to be appended + Key = lists:concat(["key-", N]), + Hash = hash(Key), + case dict:find(Hash, D) of + error -> + go_dict(dict:store(Hash, [N], D), N-1, M); + {ok, List} -> + go_dict(dict:store(Hash, [N|List], D), N-1, M) + end. + + + +%% +%% Timings (microseconds): +%% +%% go_ets(200000) : 609119 +%% go_ets(1000000) : 3520757 +%% go_ets(5000000) : 19974562 + +go_ets(N) -> + go_ets(ets:new(ets_test, [private, bag]), N, N). + +go_ets(_, 0, _) -> + {erlang:memory(), statistics(garbage_collection)}; +go_ets(Ets, N, M) -> + % Lookup a random key - which may not be present + LookupKey = lists:concat(["key-", random:uniform(M)]), + LookupHash = hash(LookupKey), + ets:lookup(Ets, LookupHash), + + % Add a new key - which may be present so value to be appended + Key = lists:concat(["key-", N]), + Hash = hash(Key), + ets:insert(Ets, {Hash, N}), + go_ets(Ets, N - 1, M). + +%% +%% Timings (microseconds): +%% +%% go_gbtree(200000) : 1393936 +%% go_gbtree(1000000) : 8430997 +%% go_gbtree(5000000) : 45630810 + +go_gbtree(N) -> + go_gbtree(gb_trees:empty(), N, N). + +go_gbtree(_, 0, _) -> + {erlang:memory(), statistics(garbage_collection)}; +go_gbtree(Tree, N, M) -> + % Lookup a random key - which may not be present + LookupKey = lists:concat(["key-", random:uniform(M)]), + LookupHash = hash(LookupKey), + gb_trees:lookup(LookupHash, Tree), + + % Add a new key - which may be present so value to be appended + Key = lists:concat(["key-", N]), + Hash = hash(Key), + case gb_trees:lookup(Hash, Tree) of + none -> + go_gbtree(gb_trees:insert(Hash, [N], Tree), N - 1, M); + {value, List} -> + go_gbtree(gb_trees:update(Hash, [N|List], Tree), N - 1, M) + end. + + +%% +%% Timings (microseconds): +%% +%% go_arrayofdict(200000) : 1266931 +%% go_arrayofdict(1000000) : 7387219 +%% go_arrayofdict(5000000) : 49511484 + +go_arrayofdict(N) -> + go_arrayofdict(array:new(256, {default, dict:new()}), N, N). + +go_arrayofdict(_, 0, _) -> + % dict:to_list(array:get(0, Array)), + % dict:to_list(array:get(1, Array)), + % dict:to_list(array:get(2, Array)), + % dict:to_list(array:get(3, Array)), + % dict:to_list(array:get(4, Array)), + % dict:to_list(array:get(5, Array)), + % dict:to_list(array:get(6, Array)), + % dict:to_list(array:get(7, Array)), + % dict:to_list(array:get(8, Array)), + % dict:to_list(array:get(9, Array)), + {erlang:memory(), statistics(garbage_collection)}; +go_arrayofdict(Array, N, M) -> + % Lookup a random key - which may not be present + LookupKey = lists:concat(["key-", random:uniform(M)]), + LookupHash = hash(LookupKey), + LookupIndex = hash_to_index(LookupHash), + dict:find(LookupHash, array:get(LookupIndex, Array)), + + % Add a new key - which may be present so value to be appended + Key = lists:concat(["key-", N]), + Hash = hash(Key), + Index = hash_to_index(Hash), + D = array:get(Index, Array), + case dict:find(Hash, D) of + error -> + go_arrayofdict(array:set(Index, + dict:store(Hash, [N], D), Array), N-1, M); + {ok, List} -> + go_arrayofdict(array:set(Index, + dict:store(Hash, [N|List], D), Array), N-1, M) + end.
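+
+%% Each array-based variant shards keys across 256 sub-structures using the
+%% least significant 8 bits of the hash. A minimal illustration reusing
+%% hash/1 and hash_to_index/1 from this module (the function name below is
+%% illustrative only):
+
+index_for_key(N) ->
+    % The band 255 in hash_to_index/1 means the result is always in 0..255
+    hash_to_index(hash(lists:concat(["key-", N]))).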
+ +%% +%% Timings (microseconds): +%% +%% go_arrayofgbtree(200000) : 1176224 +%% go_arrayofgbtree(1000000) : 7480653 +%% go_arrayofgbtree(5000000) : 41266701 + +go_arrayofgbtree(N) -> + go_arrayofgbtree(array:new(256, {default, gb_trees:empty()}), N, N). + +go_arrayofgbtree(_, 0, _) -> + % gb_trees:to_list(array:get(0, Array)), + % gb_trees:to_list(array:get(1, Array)), + % gb_trees:to_list(array:get(2, Array)), + % gb_trees:to_list(array:get(3, Array)), + % gb_trees:to_list(array:get(4, Array)), + % gb_trees:to_list(array:get(5, Array)), + % gb_trees:to_list(array:get(6, Array)), + % gb_trees:to_list(array:get(7, Array)), + % gb_trees:to_list(array:get(8, Array)), + % gb_trees:to_list(array:get(9, Array)), + {erlang:memory(), statistics(garbage_collection)}; +go_arrayofgbtree(Array, N, M) -> + % Lookup a random key - which may not be present + LookupKey = lists:concat(["key-", random:uniform(M)]), + LookupHash = hash(LookupKey), + LookupIndex = hash_to_index(LookupHash), + gb_trees:lookup(LookupHash, array:get(LookupIndex, Array)), + + % Add a new key - which may be present so value to be appended + Key = lists:concat(["key-", N]), + Hash = hash(Key), + Index = hash_to_index(Hash), + Tree = array:get(Index, Array), + case gb_trees:lookup(Hash, Tree) of + none -> + go_arrayofgbtree(array:set(Index, + gb_trees:insert(Hash, [N], Tree), Array), N - 1, M); + {value, List} -> + go_arrayofgbtree(array:set(Index, + gb_trees:update(Hash, [N|List], Tree), Array), N - 1, M) + end. + + +%% +%% Timings (microseconds): +%% +%% go_arrayofdict_withcache(200000) : 1432951 +%% go_arrayofdict_withcache(1000000) : 9140169 +%% go_arrayofdict_withcache(5000000) : 59435511 + +go_arrayofdict_withcache(N) -> + go_arrayofdict_withcache({array:new(256, {default, dict:new()}), + array:new(256, {default, dict:new()})}, N, N). + +go_arrayofdict_withcache(_, 0, _) -> + {erlang:memory(), statistics(garbage_collection)}; +go_arrayofdict_withcache({MArray, CArray}, N, M) -> + % Lookup a random key - which may not be present + LookupKey = lists:concat(["key-", random:uniform(M)]), + LookupHash = hash(LookupKey), + LookupIndex = hash_to_index(LookupHash), + dict:find(LookupHash, array:get(LookupIndex, CArray)), + dict:find(LookupHash, array:get(LookupIndex, MArray)), + + % Add a new key - which may be present so value to be appended + Key = lists:concat(["key-", N]), + Hash = hash(Key), + Index = hash_to_index(Hash), + Cache = array:get(Index, CArray), + case dict:find(Hash, Cache) of + error -> + UpdCache = dict:store(Hash, [N], Cache); + {ok, _} -> + UpdCache = dict:append(Hash, N, Cache) + end, + case dict:size(UpdCache) of + ?CACHE_SIZE -> + UpdCArray = array:set(Index, dict:new(), CArray), + UpdMArray = array:set(Index, dict:merge(fun merge_values/3, UpdCache, array:get(Index, MArray)), MArray), + go_arrayofdict_withcache({UpdMArray, UpdCArray}, N - 1, M); + _ -> + UpdCArray = array:set(Index, UpdCache, CArray), + go_arrayofdict_withcache({MArray, UpdCArray}, N - 1, M) + end. + + + +merge_values(_, Value1, Value2) -> + lists:append(Value1, Value2). + + +%% Some functions for testing options compressing term_to_binary + +create_block(N, BlockType) -> + case BlockType of + keylist -> + create_block(N, BlockType, []); + keygbtree -> + create_block(N, BlockType, gb_trees:empty()) + end. 
+ +create_block(0, _, KeyStruct) -> + KeyStruct; +create_block(N, BlockType, KeyStruct) -> + Bucket = <<"pdsRecord">>, + case N of + 20 -> + Key = lists:concat(["key-20-special"]); + _ -> + Key = lists:concat(["key-", N, "-", random:uniform(1000)]) + end, + SequenceNumber = random:uniform(1000000000), + Indexes = [{<<"DateOfBirth_int">>, random:uniform(10000)}, {<<"index1_bin">>, lists:concat([random:uniform(1000), "SomeCommonText"])}, {<<"index2_bin">>, <<"RepetitionRepetitionRepetition">>}], + case BlockType of + keylist -> + Term = {o, Bucket, Key, {Indexes, SequenceNumber}}, + create_block(N-1, BlockType, [Term|KeyStruct]); + keygbtree -> + create_block(N-1, BlockType, gb_trees:insert({o, Bucket, Key}, {Indexes, SequenceNumber}, KeyStruct)) + end. + + +create_blocks(N, Compression, BlockType) -> + create_blocks(N, Compression, BlockType, 10000, []). + +create_blocks(_, _, _, 0, BlockList) -> + BlockList; +create_blocks(N, Compression, BlockType, TestLoops, BlockList) -> + NewBlock = term_to_binary(create_block(N, BlockType), [{compressed, Compression}]), + create_blocks(N, Compression, BlockType, TestLoops - 1, [NewBlock|BlockList]). + +size_testblocks(BlockList) -> + size_testblocks(BlockList,0). + +size_testblocks([], Acc) -> + Acc; +size_testblocks([H|T], Acc) -> + size_testblocks(T, Acc + byte_size(H)). + +test_testblocks([], _) -> + true; +test_testblocks([H|T], BlockType) -> + Block = binary_to_term(H), + case findkey("key-20-special", Block, BlockType) of + true -> + test_testblocks(T, BlockType); + not_found -> + false + end. + +findkey(_, [], keylist) -> + not_found; +findkey(Key, [H|T], keylist) -> + case H of + {o, <<"pdsRecord">>, Key, _} -> + true; + _ -> + findkey(Key,T, keylist) + end; +findkey(Key, Tree, keygbtree) -> + case gb_trees:lookup({o, <<"pdsRecord">>, Key}, Tree) of + none -> + not_found; + _ -> + true + end. + \ No newline at end of file diff --git a/test/rice_test.erl b/test/rice_test.erl new file mode 100644 index 0000000..1bbb43f --- /dev/null +++ b/test/rice_test.erl @@ -0,0 +1,59 @@ +%% Test performance and accuracy of rice-encoded bloom filters +%% +%% Calling check_negative(2048, 1000000) should return about 122 false +%% positives in around 11 seconds, with a size below 4KB +%% +%% The equivalent positive check is check_positive(2048, 488) and this +%% should take around 6 seconds. +%% +%% So a bloom filter with 2048 members should support O(100K) checks per second +%% on a modern CPU, whilst requiring 2 bytes per member. + +-module(rice_test). + +-export([check_positive/2, check_negative/2, calc_hash/2]). + + + +check_positive(KeyCount, LoopCount) -> + KeyList = produce_keylist(KeyCount), + Bloom = leveled_rice:create_bloom(KeyList), + check_positive(KeyList, Bloom, LoopCount). + +check_positive(_, Bloom, 0) -> + {ok, byte_size(Bloom)}; +check_positive(KeyList, Bloom, LoopCount) -> + true = leveled_rice:check_keys(KeyList, Bloom), + check_positive(KeyList, Bloom, LoopCount - 1). + + +produce_keylist(KeyCount) -> + KeyPrefix = lists:concat(["PositiveKey-", random:uniform(KeyCount)]), + produce_keylist(KeyCount, [], KeyPrefix). + +produce_keylist(0, KeyList, _) -> + KeyList; +produce_keylist(KeyCount, KeyList, KeyPrefix) -> + Key = lists:concat([KeyPrefix, KeyCount]), + produce_keylist(KeyCount - 1, [Key|KeyList], KeyPrefix). + + +check_negative(KeyCount, CheckCount) -> + KeyList = produce_keylist(KeyCount), + Bloom = leveled_rice:create_bloom(KeyList), + check_negative(Bloom, CheckCount, 0).
+ +check_negative(Bloom, 0, FalsePos) -> + {byte_size(Bloom), FalsePos}; +check_negative(Bloom, CheckCount, FalsePos) -> + Key = lists:concat(["NegativeKey-", CheckCount, random:uniform(CheckCount)]), + case leveled_rice:check_key(Key, Bloom) of + true -> check_negative(Bloom, CheckCount - 1, FalsePos + 1); + false -> check_negative(Bloom, CheckCount - 1, FalsePos) + end. + +calc_hash(_, 0) -> + ok; +calc_hash(Key, Count) -> + erlang:phash2(lists:concat([Key, Count, "sometxt"])), + calc_hash(Key, Count - 1).
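+
+%% A possible convenience wrapper tying the two checks together, using the
+%% example parameters from the module comment above; the function name and
+%% output format are illustrative only.
+
+run_checks() ->
+    {ok, PosBloomSize} = check_positive(2048, 488),
+    {NegBloomSize, FalsePositives} = check_negative(2048, 1000000),
+    io:format("Bloom size ~w bytes with ~w false positives from 1m checks~n",
+                [NegBloomSize, FalsePositives]),
+    {PosBloomSize, NegBloomSize, FalsePositives}.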