Re-naming

Naming things is hard. This change renames things based on the
Bookie/Inker/Penciller terminology.
martinsumner 2016-08-02 13:44:48 +01:00
parent 04da891272
commit 2bdb5fba6c
6 changed files with 170 additions and 118 deletions

104
src/leveled_bookie.erl Normal file

@@ -0,0 +1,104 @@
%% -------- Overview ---------
%%
%% The eleveleddb is based on an LSM-tree, similar to leveldb, except that:
%% - Keys, Metadata and Values are not persisted together - the Keys and
%% Metadata are kept in a tree-based ledger, whereas the values are stored
%% only in a sequential Journal.
%% - Different file formats are used for the Journal (based on constant
%% database) and the Ledger (sft, based on sst).
%% - It is not intended to be general purpose, but to be specifically suited
%% for use as a Riak backend in specific circumstances (relatively large
%% values, and frequent use of iterators).
%% - The Journal is an extended nursery log in leveldb terms. It is keyed
%% on the sequence number of the write.
%% - The Ledger is an LSM tree, where the key is the actual object key, and
%% the value is the metadata of the object, including the sequence number.
%%
%%
%% -------- The actors ---------
%%
%% The store is fronted by a Bookie, who takes support from different actors:
%% - An Inker who persists new data into the Journal, and returns items from
%% the Journal based on sequence number
%% - A Penciller who periodically redraws the Ledger
%% - One or more Clerks, who may be used by either the Inker or the Penciller
%% to fulfill background tasks
%%
%% Both the Inker and the Penciller maintain a manifest of the files which
%% represent the current state of the Journal and the Ledger respectively.
%% For the Inker the manifest maps ranges of sequence numbers to cdb files.
%% For the Penciller the manifest maps key ranges to files at each level of
%% the Ledger.
%%
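%% As a minimal sketch (the function and variable names here are assumptions
%% for illustration, not the actual implementation), a lookup against an
%% Inker-style manifest of sequence number ranges might look like:
%%
%%   find_in_manifest(SQN, [{{MinSQN, MaxSQN}, CdbPid}|_Rest])
%%           when SQN >= MinSQN, SQN =< MaxSQN ->
%%       CdbPid;
%%   find_in_manifest(SQN, [_Entry|Rest]) ->
%%       find_in_manifest(SQN, Rest).
%%
%% The Penciller's manifest is equivalent in spirit, but maps key ranges to
%% files within each level of the Ledger.
%%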
%% -------- PUT --------
%%
%% A PUT request consists of
%% - A primary Key
%% - Metadata associated with the primary key (2i, vector clock, object size)
%% - A value
%% - A set of secondary key changes which should be made as part of the commit
%%
%% The Bookie takes the PUT request and passes it first to the Inker to add
%% the request to the Journal.
%%
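%% As a sketch of that sequencing (bookie_put/5, inker_put/4 and
%% cache_keychanges/3 are hypothetical names used for illustration only):
%%
%%   bookie_put(State, PrimaryKey, Metadata, Value, KeyChanges) ->
%%       %% persist the value to the Journal first, receiving a sequence number
%%       {ok, SQN} = inker_put(State#state.inker,
%%                               PrimaryKey, Value, KeyChanges),
%%       %% then add the Key and Metadata to the in-memory view of the Ledger
%%       ok = cache_keychanges(State#state.cmem, PrimaryKey, {SQN, Metadata}).
%%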
%% The Inker will pass the request to the current (append only) CDB journal
%% file to persist the change. The call should return either 'ok' or 'roll'.
%% 'roll' indicates that the CDB file has insufficient capacity for
%% this write.
%% In response to a 'roll', the Inker should (as sketched after this list):
%% - start a new active journal file with an open_write_request, and then;
%% - call to PUT the object in this file;
%% - reply to the bookie, but then in the background
%% - close the previously active journal file (writing the hashtree), and move
%% it to the historic journal
%%
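%% In sketch form, using the cdb_put/3 call shown in the cdb module below
%% (start_new_activejournal/1 and close_journalfile/1 are assumed helper
%% names, not the actual implementation):
%%
%%   case leveled_cdb:cdb_put(ActiveJournal, {SQN, PrimaryKey}, Object) of
%%       ok ->
%%           {ok, ActiveJournal};
%%       roll ->
%%           {ok, NewJournal} = start_new_activejournal(State),
%%           ok = leveled_cdb:cdb_put(NewJournal, {SQN, PrimaryKey}, Object),
%%           %% the reply to the Bookie can be sent now; closing the old
%%           %% file (and writing its hashtree) may happen in the background
%%           ok = close_journalfile(ActiveJournal),
%%           {ok, NewJournal}
%%   end
%%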
%% Once the object has been persisted to the Journal, the Key and Metadata can
%% be added to the ledger. Initially this will be added to the Bookie's
%% in-memory view of recent changes only.
%%
%% The Bookie's memory consists of up to two in-memory ets tables:
%% - the 'cmem' (current in-memory table) which is always subject to potential
%% change;
%% - the 'imem' (the immutable in-memory table) which is awaiting persistence
%% to the disk-based lsm-tree by the Penciller.
%%
%% The Key and Metadata should be written to the cmem store if it has
%% sufficient capacity; this write should potentially also include the
%% secondary key changes which have been made as part of the transaction.
%%
%% If there is insufficient space in the cmem, the cmem should be converted
%% into the imem, and a new cmem be created. This requires the previous imem
%% to have been cleared from state due to compaction into the persisted Ledger
%% by the Penciller - otherwise the PUT is blocked. On creation of an imem,
%% the compaction process for that imem by the Penciller should be triggered.
%%
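%% A sketch of that capacity check (the state fields, ?MAX_CMEM_SIZE and
%% penciller_push/2 are assumptions for illustration):
%%
%%   maybe_roll_memory(State) ->
%%       case ets:info(State#state.cmem, size) < ?MAX_CMEM_SIZE of
%%           true ->
%%               {ok, State};
%%           false when State#state.imem == undefined ->
%%               %% promote the cmem to imem, open a fresh cmem, and prompt
%%               %% the Penciller to begin persisting the new imem
%%               ok = penciller_push(State#state.penciller, State#state.cmem),
%%               {ok, State#state{imem=State#state.cmem,
%%                                   cmem=ets:new(cmem, [ordered_set])}};
%%           false ->
%%               %% previous imem still awaiting persistence - block the PUT
%%               {blocked, State}
%%       end.
%%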
%% This completes the non-deferrable work associated with a PUT.
%%
%% -------- Snapshots (Key & Metadata Only) --------
%%
%% If there is a snapshot request (e.g. to iterate over the keys) the Bookie
%% must first produce a tree representing the results of the request which are
%% present in its in-memory view of the ledger. The Bookie then requests
%% a copy of the current Ledger manifest from the Penciller, and the Penciller
%% should register the interest of the iterator at the manifest sequence
%% number at the time of the request.
%%
%% Iterators should de-register themselves from the Penciller on completion.
%% Iterators should be automatically released after a timeout period. A file
%% can only be deleted from the Ledger if it is no longer in the manifest, and
%% there are no registered iterators from before the point the file was
%% removed from the manifest.
%%
%% Snapshots may be non-recent, if recency is unimportant. Non-recent
%% snapshots do not require the Bookie to return the results of the in-memory
%% table; the Penciller alone can be asked.
%%
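%% In outline (the registration calls are assumed names; the lifecycle is as
%% described above):
%%
%%   {ok, Manifest, ManifestSQN} =
%%       penciller_register_snapshot(Penciller, self()),
%%   %% ... fold over the Ledger using the frozen Manifest ...
%%   ok = penciller_release_snapshot(Penciller, self(), ManifestSQN)
%%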
%% -------- Special Ops --------
%%
%% e.g. Get all for SegmentID/Partition
%%
-module(leveled_bookie).

src/leveled_cdb.erl

@@ -58,17 +58,7 @@
cdb_open_reader/1,
cdb_get/2,
cdb_put/3,
from_dict/2,
create/2,
dump/1,
get/2,
get_mem/3,
put/4,
open_active_file/1,
get_nextkey/1,
get_nextkey/2,
fold/3,
fold_keys/3]).
cdb_close/1]).
-include_lib("eunit/include/eunit.hrl").
@@ -114,9 +104,9 @@ cdb_get(Pid, Key) ->
cdb_put(Pid, Key, Value) ->
gen_server:call(Pid, {cdb_put, Key, Value}, infinity).
%
%cdb_close(Pid) ->
% gen_server:call(Pid, cdb_close, infinity).
cdb_close(Pid) ->
gen_server:call(Pid, cdb_close, infinity).
%%%============================================================================
@@ -159,18 +149,30 @@ handle_call({cdb_put, Key, Value}, _From, State) ->
Result = put(State#state.handle,
Key, Value,
{State#state.last_position, State#state.hashtree}),
{UpdHandle, NewPosition, HashTree} = Result,
{reply,
ok,
State#state{handle=UpdHandle,
last_position=NewPosition,
hashtree=HashTree}};
case Result of
roll ->
{reply, roll, State};
{UpdHandle, NewPosition, HashTree} ->
{reply, ok, State#state{handle=UpdHandle,
last_position=NewPosition,
hashtree=HashTree}}
end;
false ->
{reply,
{error, read_only},
State}
end.
end;
handle_call(cdb_close, _From, State) ->
case State#state.writer of
true ->
ok = close_file(State#state.handle,
State#state.hashtree,
State#state.last_position);
false ->
ok = file:close(State#state.handle)
end,
{stop, normal, ok, State}.
handle_cast(_Msg, State) ->
{noreply, State}.
@@ -178,8 +180,8 @@ handle_cast(_Msg, State) ->
handle_info(_Info, State) ->
{noreply, State}.
terminate(_Reason, _State) ->
ok.
terminate(_Reason, State) ->
file:close(State#state.handle).
code_change(_OldVsn, State, _Extra) ->
{ok, State}.
@@ -264,8 +266,8 @@ open_active_file(FileName) when is_list(FileName) ->
ok = file:close(Handle);
{ok, _} ->
LogDetails = [LastPosition, file:position(Handle, eof)],
io:format("File to be truncated at last position of"
"~w with end of file at ~w~n", LogDetails),
io:format("File to be truncated at last position of ~w "
"with end of file at ~w~n", LogDetails),
{ok, LastPosition} = file:position(Handle, LastPosition),
ok = file:truncate(Handle),
ok = file:close(Handle)

src/leveled_clerk.erl

@@ -2,7 +2,7 @@
%% level and cleaning out of old files across a level
-module(leveled_housekeeping).
-module(leveled_clerk).
-export([merge_file/3, perform_merge/3]).

8
src/leveled_inker.erl Normal file

@@ -0,0 +1,8 @@
%% -------- Inker ---------
%%
%%
%%
%% -------- Ledger ---------
%%
%%

src/leveled_penciller.erl

@@ -1,87 +1,28 @@
%% -------- Overview ---------
%% -------- Penciller ---------
%%
%% The eleveleddb is based on an LSM-tree, similar to leveldb, except that:
%% - Values are kept separately to Keys & Metadata
%% - Different file formats are used for value store (based on constant
%% database), and key store (based on sst)
%% - It is not intended to be general purpose, but be specifically suited for
%% use as a Riak backend in specific circumstances (relatively large values,
%% and frequent use of iterators)
%% - The Value store is an extended nursery log in leveldb terms. It is keyed
%% on the sequence number of the write
%% - The Key Store is an LSM tree, where the key is the actual object key, and
%% the value is the metadata of the object including the sequence number
%% The Penciller is responsible for writing and re-writing the ledger - a
%% persisted, ordered view of non-recent Keys and Metadata which have been
%% added to the store.
%% - The Penciller maintains a manifest of all the files within the current
%% Ledger.
%% - The Penciller queues re-write (compaction) work up to be managed by Clerks
%% - The Penciller maintains a register of iterators who have requested
%% snapshots of the Ledger
%% - It accepts new dumps (in the form of immutable ets tables) from the
%% Bookie, and calls the Bookie once the process of pencilling this data in
%% the Ledger is complete - and the Bookie is free to forget about the data
%%
%% -------- Concierge & Manifest ---------
%% -------- Ledger ---------
%%
%% The concierge is responsible for opening up the store, and keeps a manifest
%% of where items can be found. The manifest keeps a mapping of:
%% - Sequence Number ranges and the PID of the Value Store file that contains
%% that range
%% - Key ranges to PID mappings for each level of the KeyStore
%%
%% -------- GET --------
%%
%% A GET request for Key and Metadata requires a lookup in the KeyStore only.
%% - The concierge should consult the manifest for the lowest level to find
%% the PID which may contain the Key
%% - The concierge should ask the file owner if the Key is present, if not
%% present lower levels should be consulted until the object is found
%%
%% If a value is required, when the Key/Metadata has been fetched from the
%% KeyStore, the sequence number should be taken, and matched in the ValueStore
%% manifest to find the right value.
%%
%% For recent PUTs the Key/Metadata is added into memory, and there is an
%% in-memory hash table for the entries in the most recent ValueStore CDB.
%%
%% -------- PUT --------
%%
%% A PUT request must be persisted to the open (and append only) CDB file which
%% acts as a transaction log to persist the change. The Key & Metadata needs
%% also to be placed in memory.
%%
%% Once the CDB file is full, the managing process should be requested to
%% complete the lookup hash, and a new CDB file be started.
%%
%% Once the in-memory
%%
%% -------- Snapshots (Key Only) --------
%%
%% If there is an iterator/snapshot request, the concierge will simply hand off
%% copy of the manifest, and register the interest of the iterator at the
%% manifest sequence number at the time of the request. Iterators should
%% de-register themselves from the manager on completion. Iterators should be
%% automatically released after a timeout period. A file can be deleted if
%% there are no registered iterators from before the point the file was
%% removed from the manifest.
%%
%% -------- Snapshots (Key & Value) --------
%%
%%
%%
%% -------- Special Ops --------
%%
%% e.g. Get all for SegmentID/Partition
%%
%% -------- KeyStore ---------
%%
%% The concierge is responsible for controlling access to the store and
%% maintaining both an in-memory view and a persisted state of all the sft
%% files in use across the store.
%%
%% The store is divided into many levels
%% L0: May contain one, and only one sft file PID which is the most recent file
%% added to the top of the store. Access to the store will be stalled when a
%% second file is added whilst one still remains at this level. The target
%% size of L0 is therefore 0.
%% The Ledger is divided into many levels
%% L1 - Ln: May contain multiple non-overlapping PIDs managing sft files.
%% Compaction work should be scheduled if the number of files exceeds the target
%% size of the level, where the target size is 8 ^ n.
%%
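%% The target-size rule in sketch form (target_size/1 and over_target/2 are
%% illustrative helpers, with L0 special-cased to zero as described above):
%%
%%   target_size(0) -> 0;
%%   target_size(Level) -> round(math:pow(8, Level)).
%%
%%   over_target(Level, FileCount) -> FileCount > target_size(Level).
%%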
%% The most recent revision of a Key can be found by checking each level until
%% the key is found. To check a level the write file must be sought from the
%% manifest for that level, and then a call is made to that level.
%% the key is found. To check a level the correct file must be sought from the
%% manifest for that level, and then a call is made to that file. If the Key
%% is not present then every level should be checked.
%%
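%% A sketch of that walk (find_file/3 and ?MAX_LEVELS are assumed for
%% illustration; sft_get/2 is taken from the sft export list later in this
%% change):
%%
%%   fetch(_Key, _Manifest, Level) when Level > ?MAX_LEVELS ->
%%       not_present;
%%   fetch(Key, Manifest, Level) ->
%%       case find_file(Manifest, Level, Key) of
%%           no_file ->
%%               fetch(Key, Manifest, Level + 1);
%%           FilePid ->
%%               case leveled_sft:sft_get(FilePid, Key) of
%%                   not_present -> fetch(Key, Manifest, Level + 1);
%%                   KeyAndMetadata -> KeyAndMetadata
%%               end
%%       end.
%%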
%% If a compaction change takes the size of a level beyond the target size,
%% then compaction work for that level + 1 should be added to the compaction
@@ -93,20 +34,19 @@
%% The compaction worker will always call the level manager to find out the
%% highest priority work currently in the queue before proceeding.
%%
%% When the compaction worker picks work off the queue it will take the current
%% manifest for the level and level - 1. The compaction worker will choose
%% which file to compact from level - 1, and once the compaction is complete
%% will call to the manager with the new version of the manifest to be written.
%% When the clerk picks work off the queue it will take the current manifest
%% for the level and level - 1. The clerk will choose which file to compact
%% from level - 1, and once the compaction is complete will call to the
%% Penciller with the new version of the manifest to be written.
%%
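%% In sketch form (the penciller calls and choose_file/2 are assumed names;
%% perform_merge/3 is exported from the clerk module, though its argument
%% shapes here are assumptions):
%%
%%   clerk_cycle(Penciller) ->
%%       {Level, Manifests} = penciller_highest_priority_work(Penciller),
%%       %% choose a file from level - 1 to merge down into the level
%%       FileToMerge = choose_file(Level - 1, Manifests),
%%       NewManifest = perform_merge(FileToMerge, Level, Manifests),
%%       ok = penciller_manifest_change(Penciller, NewManifest).
%%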
%% Once the new version of the manifest has been persisted, the state of any
%% deleted files will be changed to pending deletion. In pending deletion they
%% will call the manifest manager on a timeout to confirm that they are no
%% longer in use (by any iterators).
%% will call the Penciller on a timeout to confirm that they are no longer in
%% use (by any iterators).
%%
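%% A sketch of the pending-deletion wait a file process might run (the
%% confirmation call and timeout macro are assumed names):
%%
%%   pending_delete_loop(Penciller, FileName) ->
%%       receive
%%       after ?DELETE_TIMEOUT ->
%%           case penciller_confirm_delete(Penciller, FileName) of
%%               true -> ok;  %% out of the manifest, no old iterators
%%               false -> pending_delete_loop(Penciller, FileName)
%%           end
%%       end.
%%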
-module(leveled_concierge).
-module(leveled_penciller).
%% -behaviour(gen_server).
@@ -315,4 +255,3 @@ compaction_work_assessment_test() ->
OngoingWork3 = lists:append(OngoingWork2, [{1, dummy_pid, os:timestamp()}]),
WorkQ5 = assess_workqueue([], 0, [{0, []}, {1, L1Alt}], OngoingWork3),
?assertMatch(WorkQ5, []).

src/leveled_sft.erl

@@ -150,10 +150,6 @@
handle_info/2,
terminate/2,
code_change/3,
speedtest_check_forsegment/4,
generate_randomsegfilter/1,
generate_randomkeys/1,
strip_to_keyonly/1,
sft_new/4,
sft_open/1,
sft_get/2,
@@ -1296,7 +1292,10 @@ generate_sequentialkeys(Target, Incr, Acc) ->
{active, infinity}, null},
generate_sequentialkeys(Target, Incr + 1, [NextKey|Acc]).
dummy_test() ->
R = speedtest_check_forsegment(a, 0, b, c),
?assertMatch(R, true),
_ = generate_randomsegfilter(8).
simple_create_block_test() ->
KeyList1 = [{{o, "Bucket1", "Key1"}, 1, {active, infinity}, null},