Re-naming
Naming things is hard. This change renames things based on the Bookie/Inker/Penciller terminology
This commit is contained in:
parent
04da891272
commit
2bdb5fba6c
6 changed files with 170 additions and 118 deletions
104
src/leveled_bookie.erl
Normal file
104
src/leveled_bookie.erl
Normal file
|
@ -0,0 +1,104 @@
|
||||||
|
%% -------- Overview ---------
|
||||||
|
%%
|
||||||
|
%% The eleveleddb is based on the LSM-tree similar to leveldb, except that:
|
||||||
|
%% - Keys, Metadata and Values are not persisted together - the Keys and
|
||||||
|
%% Metadata are kept in a tree-based ledger, whereas the values are stored
|
||||||
|
%% only in a sequential Journal.
|
||||||
|
%% - Different file formats are used for Journal (based on constant
|
||||||
|
%% database), and the ledger (sft, based on sst)
|
||||||
|
%% - It is not intended to be general purpose, but be specifically suited for
|
||||||
|
%% use as a Riak backend in specific circumstances (relatively large values,
|
||||||
|
%% and frequent use of iterators)
|
||||||
|
%% - The Journal is an extended nursery log in leveldb terms. It is keyed
|
||||||
|
%% on the sequence number of the write
|
||||||
|
%% - The ledger is an LSM tree, where the key is the actual object key, and
|
||||||
|
%% the value is the metadata of the object including the sequence number
|
||||||
|
%%
|
||||||
|
%%
|
||||||
|
%% -------- The actors ---------
|
||||||
|
%%
|
||||||
|
%% The store is fronted by a Bookie, who takes support from different actors:
|
||||||
|
%% - An Inker who persists new data into the journal, and returns items from
|
||||||
|
%% the journal based on sequence number
|
||||||
|
%% - A Penciller who periodically redraws the ledger
|
||||||
|
%% - One or more Clerks, who may be used by either the inker or the penciller
|
||||||
|
%% to fulfill background tasks
|
||||||
|
%%
|
||||||
|
%% Both the Inker and the Penciller maintain a manifest of the files which
|
||||||
|
%% represent the current state of the Journal and the Ledger respectively.
|
||||||
|
%% For the Inker the manifest maps ranges of sequence numbers to cdb files.
|
||||||
|
%% For the Penciller the manifest maps key ranges to files at each level of
|
||||||
|
%% the Ledger.
|
||||||
|
%%
|
||||||
|
%% -------- PUT --------
|
||||||
|
%%
|
||||||
|
%% A PUT request consists of
|
||||||
|
%% - A primary Key
|
||||||
|
%% - Metadata associated with the primary key (2i, vector clock, object size)
|
||||||
|
%% - A value
|
||||||
|
%% - A set of secondary key changes which should be made as part of the commit
|
||||||
|
%%
|
||||||
|
%% The Bookie takes the PUT request and passes it first to the Inker to add
|
||||||
|
%% the request to the journal.
|
||||||
|
%%
|
||||||
|
%% The inker will pass the request to the current (append only) CDB journal
|
||||||
|
%% file to persist the change. The call should return either 'ok' or 'roll'.
|
||||||
|
%% 'roll' indicates that the CDB file has insufficient capacity for
|
||||||
|
%% this write.
|
||||||
|
|
||||||
|
%% In response to a 'roll', the inker should:
|
||||||
|
%% - start a new active journal file with an open_write_request, and then;
|
||||||
|
%% - call to PUT the object in this file;
|
||||||
|
%% - reply to the bookie, but then in the background
|
||||||
|
%% - close the previously active journal file (writing the hashtree), and move
|
||||||
|
%% it to the historic journal
|
||||||
|
%%
|
||||||
|
%% Once the object has been persisted to the Journal, the Key and Metadata can
|
||||||
|
%% be added to the ledger. Initially this will be added to the Bookie's
|
||||||
|
%% in-memory view of recent changes only.
|
||||||
|
%%
|
||||||
|
%% The Bookie's memory consists of up to two in-memory ets tables
|
||||||
|
%% - the 'cmem' (current in-memory table) which is always subject to potential
|
||||||
|
%% change;
|
||||||
|
%% - the 'imem' (the immutable in-memory table) which is awaiting persistence
|
||||||
|
%% to the disk-based lsm-tree by the Penciller.
|
||||||
|
%%
|
||||||
|
%% The key and metadata should be written to the cmem store if it has
|
||||||
|
%% sufficient capacity, but this potentially should include the secondary key
|
||||||
|
%% changes which have been made as part of the transaction.
|
||||||
|
%%
|
||||||
|
%% If there is insufficient space in the cmem, the cmem should be converted
|
||||||
|
%% into the imem, and a new cmem be created. This requires the previous imem
|
||||||
|
%% to have been cleared from state due to compaction into the persisted Ledger
|
||||||
|
%% by the Penciller - otherwise the PUT is blocked. On creation of an imem,
|
||||||
|
%% the compaction process for that imem by the Penciller should be triggered.
|
||||||
|
%%
|
||||||
|
%% This completes the non-deferrable work associated with a PUT
|
||||||
|
%%
|
||||||
|
%% -------- Snapshots (Key & Metadata Only) --------
|
||||||
|
%%
|
||||||
|
%% If there is a snapshot request (e.g. to iterate over the keys) the Bookie
|
||||||
|
%% must first produce a tree representing the results of the request which are
|
||||||
|
%% present in its in-memory view of the ledger. The Bookie then requests
|
||||||
|
%% a copy of the current Ledger manifest from the Penciller, and the Penciller
|
||||||
|
%% should register the interest of the iterator at the manifest sequence number at the time
|
||||||
|
%% of the request.
|
||||||
|
%%
|
||||||
|
%% Iterators should de-register themselves from the Penciller on completion.
|
||||||
|
%% Iterators should be automatically released after a timeout period. A file
|
||||||
|
%% can only be deleted from the Ledger if it is no longer in the manifest, and
|
||||||
|
%% there are no registered iterators from before the point the file was
|
||||||
|
%% removed from the manifest.
|
||||||
|
%%
|
||||||
|
%% Snapshots may be non-recent, if recency is unimportant. Non-recent
|
||||||
|
%% snapshots do not require the Bookie to return the results of the in-memory
|
||||||
|
%% table, the Penciller alone can be asked.
|
||||||
|
%%
|
||||||
|
%% -------- Special Ops --------
|
||||||
|
%%
|
||||||
|
%% e.g. Get all for SegmentID/Partition
|
||||||
|
%%
|
||||||
|
|
||||||
|
|
||||||
|
-module(leveled_bookie).
|
||||||
|
|
|
@ -58,17 +58,7 @@
|
||||||
cdb_open_reader/1,
|
cdb_open_reader/1,
|
||||||
cdb_get/2,
|
cdb_get/2,
|
||||||
cdb_put/3,
|
cdb_put/3,
|
||||||
from_dict/2,
|
cdb_close/1]).
|
||||||
create/2,
|
|
||||||
dump/1,
|
|
||||||
get/2,
|
|
||||||
get_mem/3,
|
|
||||||
put/4,
|
|
||||||
open_active_file/1,
|
|
||||||
get_nextkey/1,
|
|
||||||
get_nextkey/2,
|
|
||||||
fold/3,
|
|
||||||
fold_keys/3]).
|
|
||||||
|
|
||||||
-include_lib("eunit/include/eunit.hrl").
|
-include_lib("eunit/include/eunit.hrl").
|
||||||
|
|
||||||
|
@ -114,9 +104,9 @@ cdb_get(Pid, Key) ->
|
||||||
|
|
||||||
cdb_put(Pid, Key, Value) ->
|
cdb_put(Pid, Key, Value) ->
|
||||||
gen_server:call(Pid, {cdb_put, Key, Value}, infinity).
|
gen_server:call(Pid, {cdb_put, Key, Value}, infinity).
|
||||||
%
|
|
||||||
%cdb_close(Pid) ->
|
cdb_close(Pid) ->
|
||||||
% gen_server:call(Pid, cdb_close, infinity).
|
gen_server:call(Pid, cdb_close, infinity).
|
||||||
|
|
||||||
|
|
||||||
%%%============================================================================
|
%%%============================================================================
|
||||||
|
@ -159,17 +149,29 @@ handle_call({cdb_put, Key, Value}, _From, State) ->
|
||||||
Result = put(State#state.handle,
|
Result = put(State#state.handle,
|
||||||
Key, Value,
|
Key, Value,
|
||||||
{State#state.last_position, State#state.hashtree}),
|
{State#state.last_position, State#state.hashtree}),
|
||||||
{UpdHandle, NewPosition, HashTree} = Result,
|
case Result of
|
||||||
{reply,
|
roll ->
|
||||||
ok,
|
{reply, roll, State};
|
||||||
State#state{handle=UpdHandle,
|
{UpdHandle, NewPosition, HashTree} ->
|
||||||
last_position=NewPosition,
|
{reply, ok, State#state{handle=UpdHandle,
|
||||||
hashtree=HashTree}};
|
last_position=NewPosition,
|
||||||
|
hashtree=HashTree}}
|
||||||
|
end;
|
||||||
false ->
|
false ->
|
||||||
{reply,
|
{reply,
|
||||||
{error, read_only},
|
{error, read_only},
|
||||||
State}
|
State}
|
||||||
end.
|
end;
|
||||||
|
handle_call(cdb_close, _From, State) ->
|
||||||
|
case State#state.writer of
|
||||||
|
true ->
|
||||||
|
ok = close_file(State#state.handle,
|
||||||
|
State#state.hashtree,
|
||||||
|
State#state.last_position);
|
||||||
|
false ->
|
||||||
|
ok = file:close(State#state.handle)
|
||||||
|
end,
|
||||||
|
{stop, normal, ok, State}.
|
||||||
|
|
||||||
|
|
||||||
handle_cast(_Msg, State) ->
|
handle_cast(_Msg, State) ->
|
||||||
|
@ -178,8 +180,8 @@ handle_cast(_Msg, State) ->
|
||||||
handle_info(_Info, State) ->
|
handle_info(_Info, State) ->
|
||||||
{noreply, State}.
|
{noreply, State}.
|
||||||
|
|
||||||
terminate(_Reason, _State) ->
|
terminate(_Reason, State) ->
|
||||||
ok.
|
file:close(State#state.handle).
|
||||||
|
|
||||||
code_change(_OldVsn, State, _Extra) ->
|
code_change(_OldVsn, State, _Extra) ->
|
||||||
{ok, State}.
|
{ok, State}.
|
||||||
|
@ -264,8 +266,8 @@ open_active_file(FileName) when is_list(FileName) ->
|
||||||
ok = file:close(Handle);
|
ok = file:close(Handle);
|
||||||
{ok, _} ->
|
{ok, _} ->
|
||||||
LogDetails = [LastPosition, file:position(Handle, eof)],
|
LogDetails = [LastPosition, file:position(Handle, eof)],
|
||||||
io:format("File to be truncated at last position of"
|
io:format("File to be truncated at last position of ~w "
|
||||||
"~w with end of file at ~w~n", LogDetails),
|
"with end of file at ~w~n", LogDetails),
|
||||||
{ok, LastPosition} = file:position(Handle, LastPosition),
|
{ok, LastPosition} = file:position(Handle, LastPosition),
|
||||||
ok = file:truncate(Handle),
|
ok = file:truncate(Handle),
|
||||||
ok = file:close(Handle)
|
ok = file:close(Handle)
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
%% level and cleaning out of old files across a level
|
%% level and cleaning out of old files across a level
|
||||||
|
|
||||||
|
|
||||||
-module(leveled_housekeeping).
|
-module(leveled_clerk).
|
||||||
|
|
||||||
-export([merge_file/3, perform_merge/3]).
|
-export([merge_file/3, perform_merge/3]).
|
||||||
|
|
8
src/leveled_inker.erl
Normal file
8
src/leveled_inker.erl
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
%% -------- Inker ---------
|
||||||
|
%%
|
||||||
|
%%
|
||||||
|
%%
|
||||||
|
%% -------- Ledger ---------
|
||||||
|
%%
|
||||||
|
%%
|
||||||
|
|
|
@ -1,87 +1,28 @@
|
||||||
%% -------- Overview ---------
|
%% -------- Penciller ---------
|
||||||
%%
|
%%
|
||||||
%% The eleveleddb is based on the LSM-tree similar to leveldb, except that:
|
%% The penciller is responsible for writing and re-writing the ledger - a
|
||||||
%% - Values are kept seperately to Keys & Metadata
|
%% persisted, ordered view of non-recent Keys and Metadata which have been
|
||||||
%% - Different file formats are used for value store (based on constant
|
%% added to the store.
|
||||||
%% database), and key store (based on sst)
|
%% - The penciller maintains a manifest of all the files within the current
|
||||||
%% - It is not intended to be general purpose, but be specifically suited for
|
%% Ledger.
|
||||||
%% use as a Riak backend in specific circumstances (relatively large values,
|
%% - The Penciller queues re-write (compaction) work up to be managed by Clerks
|
||||||
%% and frequent use of iterators)
|
%% - The Penciller maintains a register of iterators who have requested
|
||||||
%% - The Value store is an extended nursery log in leveldb terms. It is keyed
|
%% snapshots of the Ledger
|
||||||
%% on the sequence number of the write
|
%% - The Penciller accepts new dumps (in the form of immutable ets tables) from the
|
||||||
%% - The Key Store is a LSM tree, where the key is the actaul object key, and
|
%% Bookie, and calls the Bookie once the process of pencilling this data in
|
||||||
%% the value is the metadata of the object including the sequence number
|
%% the Ledger is complete - and the Bookie is free to forget about the data
|
||||||
%%
|
%%
|
||||||
%% -------- Concierge & Manifest ---------
|
%% -------- Ledger ---------
|
||||||
%%
|
%%
|
||||||
%% The concierge is responsible for opening up the store, and keeps a manifest
|
%% The Ledger is divided into many levels
|
||||||
%% of where items can be found. The manifest keeps a mapping of:
|
|
||||||
%% - Sequence Number ranges and the PID of the Value Store file that contains
|
|
||||||
%% that range
|
|
||||||
%% - Key ranges to PID mappings for each level of the KeyStore
|
|
||||||
%%
|
|
||||||
%% -------- GET --------
|
|
||||||
%%
|
|
||||||
%% A GET request for Key and Metadata requires a lookup in the KeyStore only.
|
|
||||||
%% - The concierge should consult the manifest for the lowest level to find
|
|
||||||
%% the PID which may contain the Key
|
|
||||||
%% - The concierge should ask the file owner if the Key is present, if not
|
|
||||||
%% present lower levels should be consulted until the object is found
|
|
||||||
%%
|
|
||||||
%% If a value is required, when the Key/Metadata has been fetched from the
|
|
||||||
%% KeyStore, the sequence number should be taken, and matched in the ValueStore
|
|
||||||
%% manifest to find the right value.
|
|
||||||
%%
|
|
||||||
%% For recent PUTs the Key/Metadata is added into memory, and there is an
|
|
||||||
%% in-memory hash table for the entries in the most recent ValueStore CDB.
|
|
||||||
%%
|
|
||||||
%% -------- PUT --------
|
|
||||||
%%
|
|
||||||
%% A PUT request must be persisted to the open (and append only) CDB file which
|
|
||||||
%% acts as a transaction log to persist the change. The Key & Metadata needs
|
|
||||||
%% also to be placed in memory.
|
|
||||||
%%
|
|
||||||
%% Once the CDB file is full, the managing process should be requested to
|
|
||||||
%% complete the lookup hash, and a new CDB file be started.
|
|
||||||
%%
|
|
||||||
%% Once the in-memory
|
|
||||||
%%
|
|
||||||
%% -------- Snapshots (Key Only) --------
|
|
||||||
%%
|
|
||||||
%% If there is a iterator/snapshot request, the concierge will simply handoff a
|
|
||||||
%% copy of the manifest, and register the interest of the iterator at the
|
|
||||||
%% manifest sequence number at the time of the request. Iterators should
|
|
||||||
%% de-register themselves from the manager on completion. Iterators should be
|
|
||||||
%% automatically release after a timeout period. A file can be deleted if
|
|
||||||
%% there are no registered iterators from before the point the file was
|
|
||||||
%% removed from the manifest.
|
|
||||||
%%
|
|
||||||
%% -------- Snapshots (Key & Value) --------
|
|
||||||
%%
|
|
||||||
%%
|
|
||||||
%%
|
|
||||||
%% -------- Special Ops --------
|
|
||||||
%%
|
|
||||||
%% e.g. Get all for SegmentID/Partition
|
|
||||||
%%
|
|
||||||
%% -------- KeyStore ---------
|
|
||||||
%%
|
|
||||||
%% The concierge is responsible for controlling access to the store and
|
|
||||||
%% maintaining both an in-memory view and a persisted state of all the sft
|
|
||||||
%% files in use across the store.
|
|
||||||
%%
|
|
||||||
%% The store is divided into many levels
|
|
||||||
%% L0: May contain one, and only one sft file PID which is the most recent file
|
|
||||||
%% added to the top of the store. Access to the store will be stalled when a
|
|
||||||
%% second file is added whilst one still remains at this level. The target
|
|
||||||
%% size of L0 is therefore 0.
|
|
||||||
%% L1 - Ln: May contain multiple non-overlapping PIDs managing sft files.
|
%% L1 - Ln: May contain multiple non-overlapping PIDs managing sft files.
|
||||||
%% Compaction work should be scheduled if the number of files exceeds the target
|
%% Compaction work should be scheduled if the number of files exceeds the target
|
||||||
%% size of the level, where the target size is 8 ^ n.
|
%% size of the level, where the target size is 8 ^ n.
|
||||||
%%
|
%%
|
||||||
%% The most recent revision of a Key can be found by checking each level until
|
%% The most recent revision of a Key can be found by checking each level until
|
||||||
%% the key is found. To check a level the write file must be sought from the
|
%% the key is found. To check a level the correct file must be sought from the
|
||||||
%% manifest for that level, and then a call is made to that level.
|
%% manifest for that level, and then a call is made to that file. If the Key
|
||||||
|
%% is not present then every level should be checked.
|
||||||
%%
|
%%
|
||||||
%% If a compaction change takes the size of a level beyond the target size,
|
%% If a compaction change takes the size of a level beyond the target size,
|
||||||
%% then compaction work for that level + 1 should be added to the compaction
|
%% then compaction work for that level + 1 should be added to the compaction
|
||||||
|
@ -93,20 +34,19 @@
|
||||||
%% The compaction worker will always call the level manager to find out the
|
%% The compaction worker will always call the level manager to find out the
|
||||||
%% highest priority work currently in the queue before proceeding.
|
%% highest priority work currently in the queue before proceeding.
|
||||||
%%
|
%%
|
||||||
%% When the compaction worker picks work off the queue it will take the current
|
%% When the clerk picks work off the queue it will take the current manifest
|
||||||
%% manifest for the level and level - 1. The compaction worker will choose
|
%% for the level and level - 1. The clerk will choose which file to compact
|
||||||
%% which file to compact from level - 1, and once the compaction is complete
|
%% from level - 1, and once the compaction is complete will call to the
|
||||||
%% will call to the manager with the new version of the manifest to be written.
|
%% Penciller with the new version of the manifest to be written.
|
||||||
|
%%
|
||||||
%% Once the new version of the manifest had been persisted, the state of any
|
%% Once the new version of the manifest had been persisted, the state of any
|
||||||
%% deleted files will be changed to pending deletion. In pending deletion they
|
%% deleted files will be changed to pending deletion. In pending deletion they
|
||||||
%% will call the manifest manager on a timeout to confirm that they are no
|
%% will call the Penciller on a timeout to confirm that they are no longer in
|
||||||
%% longer in use (by any iterators).
|
%% use (by any iterators).
|
||||||
%%
|
%%
|
||||||
|
|
||||||
|
|
||||||
|
-module(leveled_penciller).
|
||||||
|
|
||||||
-module(leveled_concierge).
|
|
||||||
|
|
||||||
%% -behaviour(gen_server).
|
%% -behaviour(gen_server).
|
||||||
|
|
||||||
|
@ -315,4 +255,3 @@ compaction_work_assessment_test() ->
|
||||||
OngoingWork3 = lists:append(OngoingWork2, [{1, dummy_pid, os:timestamp()}]),
|
OngoingWork3 = lists:append(OngoingWork2, [{1, dummy_pid, os:timestamp()}]),
|
||||||
WorkQ5 = assess_workqueue([], 0, [{0, []}, {1, L1Alt}], OngoingWork3),
|
WorkQ5 = assess_workqueue([], 0, [{0, []}, {1, L1Alt}], OngoingWork3),
|
||||||
?assertMatch(WorkQ5, []).
|
?assertMatch(WorkQ5, []).
|
||||||
|
|
|
@ -150,10 +150,6 @@
|
||||||
handle_info/2,
|
handle_info/2,
|
||||||
terminate/2,
|
terminate/2,
|
||||||
code_change/3,
|
code_change/3,
|
||||||
speedtest_check_forsegment/4,
|
|
||||||
generate_randomsegfilter/1,
|
|
||||||
generate_randomkeys/1,
|
|
||||||
strip_to_keyonly/1,
|
|
||||||
sft_new/4,
|
sft_new/4,
|
||||||
sft_open/1,
|
sft_open/1,
|
||||||
sft_get/2,
|
sft_get/2,
|
||||||
|
@ -1296,7 +1292,10 @@ generate_sequentialkeys(Target, Incr, Acc) ->
|
||||||
{active, infinity}, null},
|
{active, infinity}, null},
|
||||||
generate_sequentialkeys(Target, Incr + 1, [NextKey|Acc]).
|
generate_sequentialkeys(Target, Incr + 1, [NextKey|Acc]).
|
||||||
|
|
||||||
|
dummy_test() ->
|
||||||
|
R = speedtest_check_forsegment(a, 0, b, c),
|
||||||
|
?assertMatch(R, true),
|
||||||
|
_ = generate_randomsegfilter(8).
|
||||||
|
|
||||||
simple_create_block_test() ->
|
simple_create_block_test() ->
|
||||||
KeyList1 = [{{o, "Bucket1", "Key1"}, 1, {active, infinity}, null},
|
KeyList1 = [{{o, "Bucket1", "Key1"}, 1, {active, infinity}, null},
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue