WIP - First draft of Bookie code
First draft of untested bookie code
parent 86666b1cb6
commit e73a5bbf31
6 changed files with 536 additions and 216 deletions

@@ -33,19 +33,23 @@
 %% -------- PUT --------
 %%
 %% A PUT request consists of
-%% - A primary Key
-%% - Metadata associated with the primary key (2i, vector clock, object size)
-%% - A value
-%% - A set of secondary key changes which should be made as part of the commit
+%% - A Primary Key and a Value
+%% - IndexSpecs - a set of secondary key changes associated with the
+%% transaction
 %%
 %% The Bookie takes the PUT request and passes it first to the Inker to add
 %% the request to the ledger.
 %%
-%% The inker will pass the request to the current (append only) CDB journal
-%% file to persist the change. The call should return either 'ok' or 'roll'.
-%% 'roll' indicates that the CDB file has insufficient capacity for
+%% The inker will pass the PK/Value/IndexSpecs to the current (append only)
+%% CDB journal file to persist the change. The call should return either 'ok'
+%% or 'roll'. 'roll' indicates that the CDB file has insufficient capacity for
 %% this write.
+%%
+%% (Note that storing the IndexSpecs will create some duplication with the
+%% Metadata wrapped up within the Object value. This Value and the IndexSpecs
+%% are compressed before storage, so this should provide some mitigation for
+%% the duplication).
+%%
 %% In response to a 'roll', the inker should:
 %% - start a new active journal file with an open_write_request, and then;
 %% - call to PUT the object in this file;
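
For illustration only (not part of this commit): a PUT through the API defined
below, assuming {o, Bucket, Key} primary keys and {add, IndexField, IndexValue}
index changes as handled by convert_indexspecs/3:

    example_put(Bookie) ->
        PrimaryKey = {o, <<"Bucket1">>, <<"Key1">>},
        Object = <<"SomeValue">>,
        IndexSpecs = [{add, <<"idx1_bin">>, <<"someterm">>}],
        ok = leveled_bookie:book_put(Bookie, PrimaryKey, Object, IndexSpecs).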

@@ -53,13 +57,31 @@
 %% - close the previously active journal file (writing the hashtree), and move
 %% it to the historic journal
 %%
-%% Once the object has been persisted to the Journal, the Key with Metadata
-%% and the keychanges can be added to the ledger. Initially this will be
-%% added to the Bookie's in-memory view of recent changes only.
-%% The inker will also return the SQN which the change has been made at, as
-%% well as the object size on disk within the Journal.
-%%
-%% The Bookie's memory consists of an in-memory ets table. Periodically, the
-%% current table is pushed to the Penciller for eventual persistence, and a
-%% new table is started.
+%% Once the object has been persisted to the Journal, the Ledger can be updated.
+%% The Ledger is updated by the Bookie applying a function (passed in at
+%% startup) to the Value to return the Object Metadata, a function to generate
+%% a hash of the Value, and also taking the Primary Key, the IndexSpecs, the
+%% Sequence Number in the Journal and the Object Size (returned from the
+%% Inker).
+%%
+%% The Bookie should generate a series of ledger key changes from this
+%% information, using a function passed in at startup. For Riak this will be
+%% of the form:
+%%   {{o, Bucket, Key},
+%%      SQN,
+%%      {Hash, Size, {Riak_Metadata}},
+%%      {active, TS}|{tomb, TS}} or
+%%   {{i, Bucket, IndexField, IndexTerm, Key},
+%%      SQN,
+%%      null,
+%%      {active, TS}|{tomb, TS}}
+%%
+%% Recent Ledger changes are retained initially in the Bookie's memory (in an
+%% in-memory ets table). Periodically, the current table is pushed to the
+%% Penciller for eventual persistence, and a new table is started.
+%%
+%% This completes the non-deferrable work associated with a PUT
+%%
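
As a worked illustration (hypothetical values, following the tuples actually
built by preparefor_ledgercache/5 and convert_indexspecs/3 later in this file),
a PUT of {o, <<"Bucket1">>, <<"Key1">>} at SQN 73 with one index addition
produces the ledger changes:

    [{{o, <<"Bucket1">>, <<"Key1">>},
        {73, {active, infinity}, {MDs, VClock, Hash, Size}}},
     {{i, <<"Bucket1">>, <<"idx1_bin">>, <<"someterm">>, <<"Key1">>},
        {73, {active, infinity}, null}}]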
|
@@ -86,6 +108,18 @@
 %%
 %% e.g. Get all for SegmentID/Partition
 %%
+%%
+%%
+%% -------- On Startup --------
+%%
+%% On startup the Bookie must restart both the Inker to load the Journal, and
+%% the Penciller to load the Ledger. Once the Penciller has started, the
+%% Bookie should request the highest sequence number in the Ledger, and then
+%% try to rebuild any missing information from the Journal.
+%%
+%% To rebuild the Ledger it requests the Inker to scan over the files from
+%% the sequence number and re-generate the Ledger changes.
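
A sketch of that sequence using the functions defined later in this commit
(startup/2 below implements it):

    {ok, Inker} = leveled_inker:ink_start(InkerOpts),
    {ok, Penciller} = leveled_penciller:pcl_start(PencillerOpts),
    LedgerSQN = leveled_penciller:pcl_getstartupsequencenumber(Penciller),
    ok = leveled_inker:ink_loadpcl(Inker, LedgerSQN, fun load_fun/4, Penciller).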

 -module(leveled_bookie).

@@ -99,22 +133,28 @@
         handle_cast/2,
         handle_info/2,
         terminate/2,
-        code_change/3]).
+        code_change/3,
+        book_start/1,
+        book_put/4,
+        book_get/2,
+        book_head/2,
+        strip_to_keyonly/1,
+        strip_to_keyseqonly/1,
+        strip_to_seqonly/1,
+        strip_to_statusonly/1,
+        strip_to_details/1]).

 -include_lib("eunit/include/eunit.hrl").

 -define(CACHE_SIZE, 1000).

 -record(state, {inker :: pid(),
-                penciller :: pid()}).
-
--record(item, {primary_key :: term(),
-               contents :: list(),
-               metadatas :: list(),
-               vclock,
-               hash :: integer(),
-               size :: integer(),
-               key_changes :: list()}).
+                penciller :: pid(),
+                metadata_extractor :: function(),
+                indexspec_converter :: function(),
+                cache_size :: integer(),
+                back_pressure :: boolean(),
+                ledger_cache :: gb_trees:tree()}).

@@ -122,7 +162,17 @@
 %%% API
 %%%============================================================================

+book_start(Opts) ->
+    gen_server:start(?MODULE, [Opts], []).
+
+book_put(Pid, PrimaryKey, Object, IndexSpecs) ->
+    gen_server:call(Pid, {put, PrimaryKey, Object, IndexSpecs}, infinity).
+
+book_get(Pid, PrimaryKey) ->
+    gen_server:call(Pid, {get, PrimaryKey}, infinity).
+
+book_head(Pid, PrimaryKey) ->
+    gen_server:call(Pid, {head, PrimaryKey}, infinity).
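
A minimal usage sketch for the read side (assumed example, not from this
commit; Bookie is a pid returned by book_start/1):

    {ok, Object} = leveled_bookie:book_get(Bookie, {o, <<"Bucket1">>, <<"Key1">>}),
    {ok, MD} = leveled_bookie:book_head(Bookie, {o, <<"Bucket1">>, <<"Key1">>}).

book_get fetches the object back from the Journal via the Inker, while
book_head answers from the Ledger metadata alone, avoiding the Journal read.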

 %%%============================================================================
 %%% gen_server callbacks

@@ -131,11 +181,81 @@
 init([Opts]) ->
     {InkerOpts, PencillerOpts} = set_options(Opts),
     {Inker, Penciller} = startup(InkerOpts, PencillerOpts),
-    {ok, #state{inker=Inker, penciller=Penciller}}.
+    Extractor = if
+                    Opts#bookie_options.metadata_extractor == undefined ->
+                        fun extract_metadata/2;
+                    true ->
+                        Opts#bookie_options.metadata_extractor
+                end,
+    Converter = if
+                    Opts#bookie_options.indexspec_converter == undefined ->
+                        fun convert_indexspecs/3;
+                    true ->
+                        Opts#bookie_options.indexspec_converter
+                end,
+    CacheSize = if
+                    Opts#bookie_options.cache_size == undefined ->
+                        ?CACHE_SIZE;
+                    true ->
+                        Opts#bookie_options.cache_size
+                end,
+    {ok, #state{inker=Inker,
+                penciller=Penciller,
+                metadata_extractor=Extractor,
+                indexspec_converter=Converter,
+                cache_size=CacheSize,
+                ledger_cache=gb_trees:empty()}}.

-handle_call(_, _From, State) ->
-    {reply, ok, State}.
+handle_call({put, PrimaryKey, Object, IndexSpecs}, From, State) ->
+    {ok, SQN, ObjSize} = leveled_inker:ink_put(State#state.inker,
+                                               PrimaryKey,
+                                               Object,
+                                               IndexSpecs),
+    Changes = preparefor_ledgercache(PrimaryKey,
+                                     SQN,
+                                     Object,
+                                     ObjSize,
+                                     IndexSpecs),
+    Cache0 = addto_ledgercache(Changes, State#state.ledger_cache),
+    gen_server:reply(From, ok),
+    case maybepush_ledgercache(State#state.cache_size,
+                               Cache0,
+                               State#state.penciller) of
+        {ok, NewCache} ->
+            {noreply, State#state{ledger_cache=NewCache, back_pressure=false}};
+        {pause, NewCache} ->
+            {noreply, State#state{ledger_cache=NewCache, back_pressure=true}}
+    end;
+handle_call({get, Key}, _From, State) ->
+    case fetch_head(Key, State#state.penciller, State#state.ledger_cache) of
+        not_present ->
+            {reply, not_found, State};
+        Head ->
+            {Key, Seqn, Status} = strip_to_details(Head),
+            case Status of
+                {tomb, _} ->
+                    {reply, not_found, State};
+                {active, _} ->
+                    case fetch_value(Key, Seqn, State#state.inker) of
+                        not_present ->
+                            {reply, not_found, State};
+                        Object ->
+                            {reply, {ok, Object}, State}
+                    end
+            end
+    end;
+handle_call({head, Key}, _From, State) ->
+    case fetch_head(Key, State#state.penciller, State#state.ledger_cache) of
+        not_present ->
+            {reply, not_found, State};
+        Head ->
+            {Key, _Seqn, Status} = strip_to_details(Head),
+            case Status of
+                {tomb, _} ->
+                    {reply, not_found, State};
+                {active, _} ->
+                    MD = strip_to_mdonly(Head),
+                    {reply, {ok, MD}, State}
+            end
+    end.

 handle_cast(_Msg, State) ->
     {noreply, State}.

@@ -154,18 +274,137 @@ code_change(_OldVsn, State, _Extra) ->
 %%% Internal functions
 %%%============================================================================

-set_options(_Opts) ->
-    {#inker_options{}, #penciller_options{}}.
+set_options(Opts) ->
+    {#inker_options{root_path=Opts#bookie_options.root_path},
+     #penciller_options{root_path=Opts#bookie_options.root_path}}.
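
The #bookie_options, #inker_options and #penciller_options records are not
defined in this file, so presumably come from a shared header. A sketch of the
#bookie_options fields this module relies on (types assumed):

    -record(bookie_options, {root_path :: string(),
                             metadata_extractor :: function(),
                             indexspec_converter :: function(),
                             cache_size :: integer()}).

The #r_object and #r_content records used further down are similarly assumed
to come from a Riak header.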
 startup(InkerOpts, PencillerOpts) ->
     {ok, Inker} = leveled_inker:ink_start(InkerOpts),
     {ok, Penciller} = leveled_penciller:pcl_start(PencillerOpts),
     LedgerSQN = leveled_penciller:pcl_getstartupsequencenumber(Penciller),
-    KeyChanges = leveled_inker:ink_fetchkeychangesfrom(Inker, LedgerSQN),
-    ok = leveled_penciller:pcl_pushmem(Penciller, KeyChanges),
+    ok = leveled_inker:ink_loadpcl(Inker, LedgerSQN, fun load_fun/4, Penciller),
     {Inker, Penciller}.
+fetch_head(Key, Penciller, Cache) ->
+    case gb_trees:lookup(Key, Cache) of
+        {value, Head} ->
+            Head;
+        none ->
+            case leveled_penciller:pcl_fetch(Penciller, Key) of
+                {Key, Head} ->
+                    Head;
+                not_present ->
+                    not_present
+            end
+    end.
+
+fetch_value(Key, SQN, Inker) ->
+    case leveled_inker:ink_fetch(Inker, Key, SQN) of
+        {ok, Value} ->
+            Value;
+        not_present ->
+            not_present
+    end.
+
+%% Format of an entry within the ledger is
+%% {PrimaryKey, {SQN, Status, Metadata}}
+
+strip_to_keyonly({keyonly, K}) -> K;
+strip_to_keyonly({K, _V}) -> K.
+
+strip_to_keyseqonly({K, {SeqN, _, _}}) -> {K, SeqN}.
+
+strip_to_statusonly({_, {_, St, _}}) -> St.
+
+strip_to_seqonly({_, {SeqN, _, _}}) -> SeqN.
+
+strip_to_details({K, {SeqN, St, _}}) -> {K, SeqN, St}.
+
+strip_to_mdonly({_, {_, _, MD}}) -> MD.
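
An illustrative eunit check of these accessors against an entry of the shape
produced by preparefor_ledgercache/5 (hypothetical values, not a test from
this commit):

    strip_to_accessors_test() ->
        Entry = {{o, <<"B">>, <<"K">>}, {1, {active, infinity}, null}},
        ?assertMatch({o, <<"B">>, <<"K">>}, strip_to_keyonly(Entry)),
        ?assertMatch(1, strip_to_seqonly(Entry)),
        ?assertMatch({active, infinity}, strip_to_statusonly(Entry)).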
+get_metadatas(#r_object{contents=Contents}) ->
+    [Content#r_content.metadata || Content <- Contents].
+
+set_vclock(Object=#r_object{}, VClock) -> Object#r_object{vclock=VClock}.
+
+vclock(#r_object{vclock=VClock}) -> VClock.
+
+to_binary(v0, Obj) ->
+    term_to_binary(Obj).
+
+hash(Obj=#r_object{}) ->
+    Vclock = vclock(Obj),
+    UpdObj = set_vclock(Obj, lists:sort(Vclock)),
+    erlang:phash2(to_binary(v0, UpdObj)).
+
+extract_metadata(Obj, Size) ->
+    {get_metadatas(Obj), vclock(Obj), hash(Obj), Size}.
+
+convert_indexspecs(IndexSpecs, SQN, PrimaryKey) ->
+    lists:map(fun({IndexOp, IndexField, IndexValue}) ->
+                    Status = case IndexOp of
+                                add ->
+                                    %% TODO: timestamp support
+                                    {active, infinity};
+                                remove ->
+                                    %% TODO: timestamps for delayed reaping
+                                    {tomb, infinity}
+                             end,
+                    {o, B, K} = PrimaryKey,
+                    {{i, B, IndexField, IndexValue, K},
+                        {SQN, Status, null}}
+                  end,
+              IndexSpecs).
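
For example (hypothetical values), convert_indexspecs([{add, <<"idx1_bin">>,
<<"someterm">>}], 73, {o, <<"B">>, <<"K">>}) returns:

    [{{i, <<"B">>, <<"idx1_bin">>, <<"someterm">>, <<"K">>},
        {73, {active, infinity}, null}}]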
+preparefor_ledgercache(PK, SQN, Obj, Size, IndexSpecs) ->
+    PrimaryChange = {PK,
+                        {SQN,
+                            {active, infinity},
+                            extract_metadata(Obj, Size)}},
+    SecChanges = convert_indexspecs(IndexSpecs, SQN, PK),
+    [PrimaryChange] ++ SecChanges.
+
+addto_ledgercache(Changes, Cache) ->
+    lists:foldl(fun({K, V}, Acc) -> gb_trees:enter(K, V, Acc) end,
+                Cache,
+                Changes).
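
Note that gb_trees:enter/3 inserts or updates, so re-PUTting a key before the
cache is pushed simply replaces its pending ledger entry. A quick sketch
(hypothetical values):

    Cache1 = addto_ledgercache([{PK, {1, {active, infinity}, MD1}}], gb_trees:empty()),
    Cache2 = addto_ledgercache([{PK, {2, {active, infinity}, MD2}}], Cache1),
    %% gb_trees:lookup(PK, Cache2) =:= {value, {2, {active, infinity}, MD2}}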
+maybepush_ledgercache(MaxCacheSize, Cache, Penciller) ->
+    CacheSize = gb_trees:size(Cache),
+    if
+        CacheSize > MaxCacheSize ->
+            case leveled_penciller:pcl_pushmem(Penciller,
+                                               gb_trees:to_list(Cache)) of
+                ok ->
+                    {ok, gb_trees:empty()};
+                pause ->
+                    {pause, gb_trees:empty()};
+                refused ->
+                    {ok, Cache}
+            end;
+        true ->
+            {ok, Cache}
+    end.
+load_fun(KeyInLedger, ValueInLedger, _Position, Acc0) ->
+    {MinSQN, MaxSQN, Output} = Acc0,
+    {SQN, PK} = KeyInLedger,
+    {Obj, IndexSpecs} = ValueInLedger,
+    case SQN of
+        SQN when SQN < MinSQN ->
+            {loop, Acc0};
+        SQN when SQN =< MaxSQN ->
+            %% TODO - get correct size in a more efficient manner
+            %% Need to have compressed size
+            Size = byte_size(term_to_binary(ValueInLedger, [compressed])),
+            Changes = preparefor_ledgercache(PK, SQN, Obj, Size, IndexSpecs),
+            {loop, {MinSQN, MaxSQN, Output ++ Changes}};
+        SQN when SQN > MaxSQN ->
+            {stop, Acc0}
+    end.
 %%%============================================================================
 %%% Test