diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl new file mode 100644 index 0000000..321c7f8 --- /dev/null +++ b/src/leveled_bookie.erl @@ -0,0 +1,104 @@ +%% -------- Overview --------- +%% +%% The eleveleddb is based on the LSM-tree similar to leveldb, except that: +%% - Keys, Metadata and Values are not persisted together - the Keys and +%% Metadata are kept in a tree-based ledger, whereas the values are stored +%% only in a sequential Journal. +%% - Different file formats are used for Journal (based on constant +%% database), and the ledger (sft, based on sst) +%% - It is not intended to be general purpose, but be specifically suited for +%% use as a Riak backend in specific circumstances (relatively large values, +%% and frequent use of iterators) +%% - The Journal is an extended nursery log in leveldb terms. It is keyed +%% on the sequence number of the write +%% - The ledger is a LSM tree, where the key is the actual object key, and +%% the value is the metadata of the object including the sequence number +%% +%% +%% -------- The actors --------- +%% +%% The store is fronted by a Bookie, who takes support from different actors: +%% - An Inker who persists new data into the journal, and returns items from +%% the journal based on sequence number +%% - A Penciller who periodically redraws the ledger +%% - One or more Clerks, who may be used by either the inker or the penciller +%% to fulfill background tasks +%% +%% Both the Inker and the Penciller maintain a manifest of the files which +%% represent the current state of the Journal and the Ledger respectively. +%% For the Inker the manifest maps ranges of sequence numbers to cdb files. +%% For the Penciller the manifest maps key ranges to files at each level of +%% the Ledger. 
+%% +%% -------- PUT -------- +%% +%% A PUT request consists of +%% - A primary Key +%% - Metadata associated with the primary key (2i, vector clock, object size) +%% - A value +%% - A set of secondary key changes which should be made as part of the commit +%% +%% The Bookie takes the PUT request and passes it first to the Inker to add +%% the request to the Journal. +%% +%% The inker will pass the request to the current (append only) CDB journal +%% file to persist the change. The call should return either 'ok' or 'roll'. +%% 'roll' indicates that the CDB file has insufficient capacity for +%% this write. + +%% In response to a 'roll', the inker should: +%% - start a new active journal file with an open_write_request, and then; +%% - call to PUT the object in this file; +%% - reply to the bookie, but then in the background +%% - close the previously active journal file (writing the hashtree), and move +%% it to the historic journal +%% +%% Once the object has been persisted to the Journal, the Key and Metadata can +%% be added to the ledger. Initially this will be added to the Bookie's +%% in-memory view of recent changes only. +%% +%% The Bookie's memory consists of up to two in-memory ets tables +%% - the 'cmem' (current in-memory table) which is always subject to potential +%% change; +%% - the 'imem' (the immutable in-memory table) which is awaiting persistence +%% to the disk-based lsm-tree by the Penciller. +%% +%% The key and metadata should be written to the cmem store if it has +%% sufficient capacity, but this potentially should include the secondary key +%% changes which have been made as part of the transaction. +%% +%% If there is insufficient space in the cmem, the cmem should be converted +%% into the imem, and a new cmem be created. This requires the previous imem +%% to have been cleared from state due to compaction into the persisted Ledger +%% by the Penciller - otherwise the PUT is blocked. 
On creation of an imem, +%% the compaction process for that imem by the Penciller should be triggered. +%% +%% This completes the non-deferrable work associated with a PUT +%% +%% -------- Snapshots (Key & Metadata Only) -------- +%% +%% If there is a snapshot request (e.g. to iterate over the keys) the Bookie +%% must first produce a tree representing the results of the request which are +%% present in its in-memory view of the ledger. The Bookie then requests +%% a copy of the current Ledger manifest from the Penciller, and the Penciller +%% should register the interest of the iterator at the manifest sequence number at the time +%% of the request. +%% +%% Iterators should de-register themselves from the Penciller on completion. +%% Iterators should be automatically released after a timeout period. A file +%% can only be deleted from the Ledger if it is no longer in the manifest, and +%% there are no registered iterators from before the point the file was +%% removed from the manifest. +%% +%% Snapshots may be non-recent, if recency is unimportant. Non-recent +%% snapshots do not require the Bookie to return the results of the in-memory +%% table, the Penciller alone can be asked. +%% +%% -------- Special Ops -------- +%% +%% e.g. Get all for SegmentID/Partition +%% + + +-module(leveled_bookie). + diff --git a/src/leveled_cdb.erl b/src/leveled_cdb.erl index ff52acf..e2c266f 100644 --- a/src/leveled_cdb.erl +++ b/src/leveled_cdb.erl @@ -58,17 +58,7 @@ cdb_open_reader/1, cdb_get/2, cdb_put/3, - from_dict/2, - create/2, - dump/1, - get/2, - get_mem/3, - put/4, - open_active_file/1, - get_nextkey/1, - get_nextkey/2, - fold/3, - fold_keys/3]). + cdb_close/1]). -include_lib("eunit/include/eunit.hrl"). @@ -114,9 +104,9 @@ cdb_get(Pid, Key) -> cdb_put(Pid, Key, Value) -> gen_server:call(Pid, {cdb_put, Key, Value}, infinity). -% -%cdb_close(Pid) -> -% gen_server:call(Pid, cdb_close, infinity). + +cdb_close(Pid) -> + gen_server:call(Pid, cdb_close, infinity). 
%%%============================================================================ @@ -159,18 +149,30 @@ handle_call({cdb_put, Key, Value}, _From, State) -> Result = put(State#state.handle, Key, Value, {State#state.last_position, State#state.hashtree}), - {UpdHandle, NewPosition, HashTree} = Result, - {reply, - ok, - State#state{handle=UpdHandle, - last_position=NewPosition, - hashtree=HashTree}}; + case Result of + roll -> + {reply, roll, State}; + {UpdHandle, NewPosition, HashTree} -> + {reply, ok, State#state{handle=UpdHandle, + last_position=NewPosition, + hashtree=HashTree}} + end; false -> {reply, {error, read_only}, State} - end. - + end; +handle_call(cdb_close, _From, State) -> + case State#state.writer of + true -> + ok = close_file(State#state.handle, + State#state.hashtree, + State#state.last_position); + false -> + ok = file:close(State#state.handle) + end, + {stop, normal, ok, State}. + handle_cast(_Msg, State) -> {noreply, State}. @@ -178,8 +180,8 @@ handle_cast(_Msg, State) -> handle_info(_Info, State) -> {noreply, State}. -terminate(_Reason, _State) -> - ok. +terminate(_Reason, State) -> + file:close(State#state.handle). code_change(_OldVsn, State, _Extra) -> {ok, State}. 
@@ -264,8 +266,8 @@ open_active_file(FileName) when is_list(FileName) -> ok = file:close(Handle); {ok, _} -> LogDetails = [LastPosition, file:position(Handle, eof)], - io:format("File to be truncated at last position of" - "~w with end of file at ~w~n", LogDetails), + io:format("File to be truncated at last position of ~w " + "with end of file at ~w~n", LogDetails), {ok, LastPosition} = file:position(Handle, LastPosition), ok = file:truncate(Handle), ok = file:close(Handle) diff --git a/src/leveled_housekeeping.erl b/src/leveled_clerk.erl similarity index 99% rename from src/leveled_housekeeping.erl rename to src/leveled_clerk.erl index 3d6b8c1..807a254 100644 --- a/src/leveled_housekeeping.erl +++ b/src/leveled_clerk.erl @@ -2,7 +2,7 @@ %% level and cleaning out of old files across a level --module(leveled_housekeeping). +-module(leveled_clerk). -export([merge_file/3, perform_merge/3]). diff --git a/src/leveled_inker.erl b/src/leveled_inker.erl new file mode 100644 index 0000000..c192433 --- /dev/null +++ b/src/leveled_inker.erl @@ -0,0 +1,8 @@ +%% -------- Inker --------- +%% +%% +%% +%% -------- Ledger --------- +%% +%% + diff --git a/src/leveled_concierge.erl b/src/leveled_penciller.erl similarity index 72% rename from src/leveled_concierge.erl rename to src/leveled_penciller.erl index 8abb892..2ac3384 100644 --- a/src/leveled_concierge.erl +++ b/src/leveled_penciller.erl @@ -1,87 +1,28 @@ -%% -------- Overview --------- +%% -------- Penciller --------- %% -%% The eleveleddb is based on the LSM-tree similar to leveldb, except that: -%% - Values are kept seperately to Keys & Metadata -%% - Different file formats are used for value store (based on constant -%% database), and key store (based on sst) -%% - It is not intended to be general purpose, but be specifically suited for -%% use as a Riak backend in specific circumstances (relatively large values, -%% and frequent use of iterators) -%% - The Value store is an extended nursery log in leveldb terms. 
It is keyed -%% on the sequence number of the write -%% - The Key Store is a LSM tree, where the key is the actaul object key, and -%% the value is the metadata of the object including the sequence number +%% The penciller is responsible for writing and re-writing the ledger - a +%% persisted, ordered view of non-recent Keys and Metadata which have been +%% added to the store. +%% - The penciller maintains a manifest of all the files within the current +%% Ledger. +%% - The Penciller queues re-write (compaction) work up to be managed by Clerks +%% - The Penciller maintains a register of iterators who have requested +%% snapshots of the Ledger +%% - The Penciller accepts new dumps (in the form of immutable ets tables) from the +%% Bookie, and calls the Bookie once the process of pencilling this data in +%% the Ledger is complete - and the Bookie is free to forget about the data %% -%% -------- Concierge & Manifest --------- +%% -------- Ledger --------- %% -%% The concierge is responsible for opening up the store, and keeps a manifest -%% of where items can be found. The manifest keeps a mapping of: -%% - Sequence Number ranges and the PID of the Value Store file that contains -%% that range -%% - Key ranges to PID mappings for each leval of the KeyStore -%% -%% -------- GET -------- -%% -%% A GET request for Key and Metadata requires a lookup in the KeyStore only. -%% - The concierge should consult the manifest for the lowest level to find -%% the PID which may contain the Key -%% - The concierge should ask the file owner if the Key is present, if not -%% present lower levels should be consulted until the objetc is found -%% -%% If a value is required, when the Key/Metadata has been fetched from the -%% KeyStore, the sequence number should be tkane, and matched in the ValueStore -%% manifest to find the right value. -%% -%% For recent PUTs the Key/Metadata is added into memory, and there is an -%% in-memory hash table for the entries in the most recent ValueStore CDB. 
-%% -%% -------- PUT -------- -%% -%% A PUT request must be persisted to the open (and append only) CDB file which -%% acts as a transaction log to persist the change. The Key & Metadata needs -%% also to be placed in memory. -%% -%% Once the CDB file is full, the managing process should be requested to -%% complete the lookup hash, and a new CDB file be started. -%% -%% Once the in-memory -%% -%% -------- Snapshots (Key Only) -------- -%% -%% If there is a iterator/snapshot request, the concierge will simply handoff a -%% copy of the manifest, and register the interest of the iterator at the -%% manifest sequence number at the time of the request. Iterators should -%% de-register themselves from the manager on completion. Iterators should be -%% automatically release after a timeout period. A file can be deleted if -%% there are no registered iterators from before the point the file was -%% removed from the manifest. -%% -%% -------- Snapshots (Key & Value) -------- -%% -%% -%% -%% -------- Special Ops -------- -%% -%% e.g. Get all for SegmentID/Partition -%% -%% -------- KeyStore --------- -%% -%% The concierge is responsible for controlling access to the store and -%% maintaining both an in-memory view and a persisted state of all the sft -%% files in use across the store. -%% -%% The store is divided into many levels -%% L0: May contain one, and only one sft file PID which is the most recent file -%% added to the top of the store. Access to the store will be stalled when a -%% second file is added whilst one still remains at this level. The target -%% size of L0 is therefore 0. +%% The Ledger is divided into many levels %% L1 - Ln: May contain multiple non-overlapping PIDs managing sft files. %% Compaction work should be sheduled if the number of files exceeds the target %% size of the level, where the target size is 8 ^ n. %% %% The most recent revision of a Key can be found by checking each level until -%% the key is found. 
To check a level the write file must be sought from the -%% manifest for that level, and then a call is made to that level. +%% the key is found. To check a level the correct file must be sought from the +%% manifest for that level, and then a call is made to that file. If the Key +%% is not present then every level should be checked. %% %% If a compaction change takes the size of a level beyond the target size, %% then compaction work for that level + 1 should be added to the compaction @@ -93,20 +34,19 @@ %% The compaction worker will always call the level manager to find out the %% highest priority work currently in the queue before proceeding. %% -%% When the compaction worker picks work off the queue it will take the current -%% manifest for the level and level - 1. The compaction worker will choose -%% which file to compact from level - 1, and once the compaction is complete -%% will call to the manager with the new version of the manifest to be written. +%% When the clerk picks work off the queue it will take the current manifest +%% for the level and level - 1. The clerk will choose which file to compact +%% from level - 1, and once the compaction is complete will call to the +%% Penciller with the new version of the manifest to be written. +%% %% Once the new version of the manifest had been persisted, the state of any %% deleted files will be changed to pending deletion. In pending deletion they -%% will call the manifets manager on a timeout to confirm that they are no -%% longer in use (by any iterators). +%% will call the Penciller on a timeout to confirm that they are no longer in +%% use (by any iterators). %% - - --module(leveled_concierge). +-module(leveled_penciller). %% -behaviour(gen_server). @@ -315,4 +255,3 @@ compaction_work_assessment_test() -> OngoingWork3 = lists:append(OngoingWork2, [{1, dummy_pid, os:timestamp()}]), WorkQ5 = assess_workqueue([], 0, [{0, []}, {1, L1Alt}], OngoingWork3), ?assertMatch(WorkQ5, []). 
- diff --git a/src/leveled_sft.erl b/src/leveled_sft.erl index ef1fd9d..04e08ba 100644 --- a/src/leveled_sft.erl +++ b/src/leveled_sft.erl @@ -150,10 +150,6 @@ handle_info/2, terminate/2, code_change/3, - speedtest_check_forsegment/4, - generate_randomsegfilter/1, - generate_randomkeys/1, - strip_to_keyonly/1, sft_new/4, sft_open/1, sft_get/2, @@ -1296,7 +1292,10 @@ generate_sequentialkeys(Target, Incr, Acc) -> {active, infinity}, null}, generate_sequentialkeys(Target, Incr + 1, [NextKey|Acc]). - +dummy_test() -> + R = speedtest_check_forsegment(a, 0, b, c), + ?assertMatch(R, true), + _ = generate_randomsegfilter(8). simple_create_block_test() -> KeyList1 = [{{o, "Bucket1", "Key1"}, 1, {active, infinity}, null},