Add ability to control journal size by object count

This helps when there are files with large numbers of key deltas (and hence small values), where otherwise the object count per journal file may get out of control.
Martin Sumner 2019-07-25 09:45:23 +01:00
parent 0ba7b3347e
commit dab9652f6c
6 changed files with 157 additions and 62 deletions
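The new max_journalobjectcount option sits alongside the existing max_journalsize byte limit: a journal file is rolled when either threshold is reached. A minimal usage sketch (option names are as added in this commit; the root path and chosen values are illustrative):

    %% Roll journal files at ~50K objects even if the 1GB byte limit is
    %% nowhere near reached - useful when objects are mostly small deltas.
    {ok, Bookie} =
        leveled_bookie:book_start([{root_path, "/tmp/leveled_demo"},
                                   {max_journalsize, 1000000000},
                                   {max_journalobjectcount, 50000}]),
    ok = leveled_bookie:book_close(Bookie).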

@@ -38,7 +38,8 @@
          bloom :: binary() | none | undefined}).
 
 -record(cdb_options,
-        {max_size :: integer() | undefined,
+        {max_size :: pos_integer() | undefined,
+         max_count :: pos_integer() | undefined,
          file_path :: string() | undefined,
          waste_path :: string() | undefined,
          binary_mode = false :: boolean(),
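Both caps travel to each journal file through this record. A hypothetical constructor sketch, assuming the amended leveled.hrl is included (the function name is illustrative; the values are the leveled_bookie defaults from this commit):

    -include("leveled.hrl").

    %% Hypothetical helper showing the two limits carried together.
    journal_cdb_opts() ->
        #cdb_options{max_size = 1000000000,    % bytes
                     max_count = 200000,       % objects
                     binary_mode = true}.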

@@ -63,12 +63,27 @@
 ]}.
 
 %% @doc The approximate size (in bytes) when a Journal file should be rolled.
-%% Normally keep this as around the size of o(100K) objects. Default is 500MB
+%% Normally keep this as around the size of o(100K) objects. Default is 1GB.
+%% Note that on startup an actual maximum size will be chosen which varies by
+%% a random factor from this point - to avoid coordination of roll events
+%% across vnodes.
 {mapping, "leveled.journal_size", "leveled.journal_size", [
   {default, 1000000000},
   {datatype, integer}
 ]}.
 
+%% @doc The approximate count of objects when a Journal file should be rolled.
+%% This time the limit is measured in object count; a file will be rolled if
+%% either the object count or the journal size limit is reached. Default 200K.
+%% Note that on startup an actual maximum count will be chosen which varies by
+%% a random factor from this point - to avoid coordination of roll events
+%% across vnodes.
+{mapping, "leveled.journal_objectcount", "leveled.journal_objectcount", [
+  {default, 200000},
+  {datatype, integer}
+]}.
+
 %% @doc The number of journal compactions per vnode per day
 %% The higher the value, the more compaction runs, and the sooner space is
 %% recovered. But each run has a cost
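To see why the count cap matters for small values, a back-of-envelope check with illustrative figures only (the ~100 byte average is an assumption, not from the commit):

    %% Suppose key-delta objects average ~100 bytes on disk. The 1GB byte
    %% limit alone would then allow ~10M objects per journal file:
    1000000000 div 100.                 % 10000000
    %% whereas the default 200K count cap rolls the file 50x sooner:
    (1000000000 div 100) div 200000.    % 50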

@@ -57,11 +57,24 @@
 %% @doc The approximate size (in bytes) when a Journal file should be rolled.
-%% Normally keep this as around the size of o(100K) objects. Default is 500MB
+%% Normally keep this as around the size of o(100K) objects. Default is 1GB.
+%% Note that on startup an actual maximum size will be chosen which varies by
+%% a random factor from this point - to avoid coordination of roll events
+%% across vnodes.
 {mapping, "multi_backend.$name.leveled.journal_size", "riak_kv.multi_backend", [
-  {default, 500000000},
-  {datatype, integer},
-  hidden
+  {default, 1000000000},
+  {datatype, integer}
 ]}.
 
+%% @doc The approximate count of objects when a Journal file should be rolled.
+%% This time the limit is measured in object count; a file will be rolled if
+%% either the object count or the journal size limit is reached. Default 200K.
+%% Note that on startup an actual maximum count will be chosen which varies by
+%% a random factor from this point - to avoid coordination of roll events
+%% across vnodes.
+{mapping, "multi_backend.$name.leveled.journal_objectcount", "riak_kv.multi_backend", [
+  {default, 200000},
+  {datatype, integer}
+]}.
+
 %% @doc The number of journal compactions per vnode per day

@@ -134,6 +134,7 @@
      {snapshot_bookie, undefined},
      {cache_size, ?CACHE_SIZE},
      {max_journalsize, 1000000000},
+     {max_journalobjectcount, 200000},
      {max_sstslots, 256},
      {sync_strategy, none},
      {head_only, false},
@@ -230,8 +231,12 @@
         % configured values
         % The minimum value is 100 - any lower value will be ignored
     {max_journalsize, pos_integer()} |
-        % The maximum size of a journal file in bytes. The abolute
+        % The maximum size of a journal file in bytes. The absolute
         % maximum must be 4GB due to 4 byte file pointers being used
+    {max_journalobjectcount, pos_integer()} |
+        % The maximum size of the journal measured by object count. The
+        % journal must remain within the limits set by both this figure and
+        % the max_journalsize
     {max_sstslots, pos_integer()} |
         % The maximum number of slots in a SST file. All testing is done
         % at a size of 256 (except for Quickcheck tests}, altering this
@@ -1644,6 +1649,11 @@ set_options(Opts) ->
     MaxJournalSize =
         min(?ABSOLUTEMAX_JOURNALSIZE,
             MaxJournalSize0 - erlang:phash2(self()) rem JournalSizeJitter),
+    MaxJournalCount0 =
+        proplists:get_value(max_journalobjectcount, Opts),
+    JournalCountJitter = MaxJournalCount0 div (100 div ?JOURNAL_SIZE_JITTER),
+    MaxJournalCount =
+        MaxJournalCount0 - erlang:phash2(self()) rem JournalCountJitter,
 
     SyncStrat = proplists:get_value(sync_strategy, Opts),
     WRP = proplists:get_value(waste_retention_period, Opts),
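The jitter keeps vnodes from rolling (and then compacting) in lockstep. A worked example of the calculation above, assuming the pre-existing ?JOURNAL_SIZE_JITTER macro is 20, i.e. up to a 20% reduction (the macro's value is not shown in this diff):

    %% With the default count of 200000:
    JournalCountJitter = 200000 div (100 div 20).   % 40000
    %% erlang:phash2(self()) rem 40000 lies in [0, 39999], so each vnode
    %% ends up with a maximum count somewhere in (160000, 200000],
    %% staggering roll events across vnodes.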
@@ -1697,6 +1707,7 @@ set_options(Opts) ->
                 compress_on_receipt = CompressOnReceipt,
                 cdb_options =
                     #cdb_options{max_size=MaxJournalSize,
+                                 max_count=MaxJournalCount,
                                  binary_mode=true,
                                  sync_strategy=SyncStrat,
                                  log_options=leveled_log:get_opts()}},

@@ -138,10 +138,12 @@
 -record(state, {hashtree,
                 last_position :: integer() | undefined,
                 last_key = empty,
+                current_count = 0 :: non_neg_integer(),
                 hash_index = {} :: tuple(),
                 filename :: string() | undefined,
                 handle :: file:fd() | undefined,
-                max_size :: integer() | undefined,
+                max_size :: pos_integer() | undefined,
+                max_count :: pos_integer() | undefined,
                 binary_mode = false :: boolean(),
                 delete_point = 0 :: integer(),
                 inker :: pid() | undefined,
@@ -425,15 +427,24 @@ cdb_clerkcomplete(Pid) ->
 %%%============================================================================
 
 init([Opts]) ->
-    MaxSize = case Opts#cdb_options.max_size of
-                    undefined ->
-                        ?MAX_FILE_SIZE;
-                    M ->
-                        M
-                end,
+    MaxSize =
+        case Opts#cdb_options.max_size of
+            undefined ->
+                ?MAX_FILE_SIZE;
+            MS ->
+                MS
+        end,
+    MaxCount =
+        case Opts#cdb_options.max_count of
+            undefined ->
+                ?MAX_FILE_SIZE div 1000;
+            MC ->
+                MC
+        end,
     {ok,
     starting,
     #state{max_size=MaxSize,
+           max_count=MaxCount,
            binary_mode=Opts#cdb_options.binary_mode,
            waste_path=Opts#cdb_options.waste_path,
            sync_strategy=Opts#cdb_options.sync_strategy,
@@ -447,6 +458,7 @@ starting({open_writer, Filename}, _From, State) ->
     leveled_log:log("CDB13", [WriteOps]),
     {ok, Handle} = file:open(Filename, WriteOps),
     State0 = State#state{handle=Handle,
+                         current_count = size_hashtree(HashTree),
                          sync_strategy = UpdStrategy,
                          last_position=LastPosition,
                          last_key=LastKey,
@@ -490,47 +502,63 @@ writer({key_check, Key}, _From, State) ->
         writer,
         State};
 writer({put_kv, Key, Value}, _From, State) ->
-    Result = put(State#state.handle,
-                 Key,
-                 Value,
-                 {State#state.last_position, State#state.hashtree},
-                 State#state.binary_mode,
-                 State#state.max_size,
-                 State#state.last_key == empty),
-    case Result of
-        roll ->
-            %% Key and value could not be written
+    NewCount = State#state.current_count + 1,
+    case NewCount >= State#state.max_count of
+        true ->
             {reply, roll, writer, State};
-        {UpdHandle, NewPosition, HashTree} ->
-            ok =
-                case State#state.sync_strategy of
-                    riak_sync ->
-                        file:datasync(UpdHandle);
-                    _ ->
-                        ok
-                end,
-            {reply, ok, writer, State#state{handle=UpdHandle,
-                                            last_position=NewPosition,
-                                            last_key=Key,
-                                            hashtree=HashTree}}
+        false ->
+            Result = put(State#state.handle,
+                         Key,
+                         Value,
+                         {State#state.last_position, State#state.hashtree},
+                         State#state.binary_mode,
+                         State#state.max_size,
+                         State#state.last_key == empty),
+            case Result of
+                roll ->
+                    %% Key and value could not be written
+                    {reply, roll, writer, State};
+                {UpdHandle, NewPosition, HashTree} ->
+                    ok =
+                        case State#state.sync_strategy of
+                            riak_sync ->
+                                file:datasync(UpdHandle);
+                            _ ->
+                                ok
+                        end,
+                    {reply, ok, writer, State#state{handle=UpdHandle,
+                                                    current_count=NewCount,
+                                                    last_position=NewPosition,
+                                                    last_key=Key,
+                                                    hashtree=HashTree}}
+            end
     end;
 writer({mput_kv, []}, _From, State) ->
     {reply, ok, writer, State};
 writer({mput_kv, KVList}, _From, State) ->
-    Result = mput(State#state.handle,
-                  KVList,
-                  {State#state.last_position, State#state.hashtree},
-                  State#state.binary_mode,
-                  State#state.max_size),
-    case Result of
-        roll ->
-            %% Keys and values could not be written
+    NewCount = State#state.current_count + length(KVList),
+    TooMany = NewCount >= State#state.max_count,
+    NotEmpty = State#state.current_count > 0,
+    case (TooMany and NotEmpty) of
+        true ->
             {reply, roll, writer, State};
-        {UpdHandle, NewPosition, HashTree, LastKey} ->
-            {reply, ok, writer, State#state{handle=UpdHandle,
-                                            last_position=NewPosition,
-                                            last_key=LastKey,
-                                            hashtree=HashTree}}
+        false ->
+            Result = mput(State#state.handle,
+                          KVList,
+                          {State#state.last_position, State#state.hashtree},
+                          State#state.binary_mode,
+                          State#state.max_size),
+            case Result of
+                roll ->
+                    %% Keys and values could not be written
+                    {reply, roll, writer, State};
+                {UpdHandle, NewPosition, HashTree, LastKey} ->
+                    {reply, ok, writer, State#state{handle=UpdHandle,
+                                                    current_count=NewCount,
+                                                    last_position=NewPosition,
+                                                    last_key=LastKey,
+                                                    hashtree=HashTree}}
+            end
     end;
 writer(cdb_complete, _From, State) ->
     NewName = determine_new_filename(State#state.filename),
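The two clauses guard slightly differently, and the difference matters. A distilled restatement of the logic above (should_roll/3 is a hypothetical helper for illustration, not part of the commit):

    %% put_kv adds one object and rolls as soon as the count limit would be
    %% breached. mput_kv additionally requires the file to be non-empty, so
    %% a single batch larger than max_count still lands in a fresh file
    %% rather than triggering a roll forever.
    should_roll(CurrentCount, Additions, MaxCount) ->
        NewCount = CurrentCount + Additions,
        (NewCount >= MaxCount)
            andalso (Additions =:= 1 orelse CurrentCount > 0).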
@@ -1775,6 +1803,9 @@ add_position_tohashtree(HashTree, Index, Hash, Position) ->
 new_hashtree() ->
     ets:new(hashtree, [ordered_set]).
 
+size_hashtree(HashTree) ->
+    ets:info(HashTree, size).
+
 to_list(HashTree, Index) ->
     to_list(HashTree, Index, {0, -1}, []).
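size_hashtree/1 is what lets open_writer re-seed current_count from a part-written file, since ets:info/2 with the size item returns the number of objects in the table. A quick shell check of that behaviour:

    T = ets:new(hashtree, [ordered_set]),
    true = ets:insert(T, [{1, a}, {2, b}, {3, c}]),
    3 = ets:info(T, size).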

@@ -603,11 +603,14 @@ allkeydelta_journal_multicompact(_Config) ->
     % Simply confirms that none of this causes a crash
     RootPath = testutil:reset_filestructure(),
     B = <<"test_bucket">>,
-    StartOpts1 = [{root_path, RootPath},
-                  {max_journalsize, 50000000},
-                  {max_run_length, 6},
-                  {sync_strategy, testutil:sync_strategy()}],
-    {ok, Bookie1} = leveled_bookie:book_start(StartOpts1),
+    StartOptsFun =
+        fun(JOC) ->
+            [{root_path, RootPath},
+             {max_journalobjectcount, JOC},
+             {max_run_length, 6},
+             {sync_strategy, testutil:sync_strategy()}]
+        end,
+    {ok, Bookie1} = leveled_bookie:book_start(StartOptsFun(16000)),
     {KSpcL1, _V1} = testutil:put_indexed_objects(Bookie1, B, 40000),
     {KSpcL2, V2} = testutil:put_altered_indexed_objects(Bookie1,
                                                         B,
@@ -633,26 +636,47 @@ allkeydelta_journal_multicompact(_Config) ->
     ok = leveled_bookie:book_close(Bookie1),
     leveled_penciller:clean_testdir(RootPath ++ "/ledger"),
-    {ok, Bookie2} = leveled_bookie:book_start(StartOpts1),
+    io:format("Restart without ledger~n"),
+    {ok, Bookie2} = leveled_bookie:book_start(StartOptsFun(24000)),
 
     ok = testutil:check_indexed_objects(Bookie2,
                                         B,
                                         KSpcL1 ++ KSpcL2,
                                         V2),
 
-    {KSpcL3, V3} = testutil:put_altered_indexed_objects(Bookie2,
-                                                        B,
-                                                        KSpcL2,
-                                                        false),
+    {KSpcL3, _V3} = testutil:put_altered_indexed_objects(Bookie2,
+                                                         B,
+                                                         KSpcL2,
+                                                         false),
     compact_and_wait(Bookie2, 0),
+    {ok, FileList3} =
+        file:list_dir(
+            filename:join(RootPath, "journal/journal_files/post_compact")),
+    io:format("Number of files after compaction ~w~n", [length(FileList3)]),
 
-    ok = testutil:check_indexed_objects(Bookie2,
-                                        B,
-                                        KSpcL1 ++ KSpcL2 ++ KSpcL3,
-                                        V3),
-
     ok = leveled_bookie:book_close(Bookie2),
+
+    io:format("Restart with smaller journal object count~n"),
+    {ok, Bookie3} = leveled_bookie:book_start(StartOptsFun(8000)),
+
+    {KSpcL4, V4} = testutil:put_altered_indexed_objects(Bookie3,
+                                                        B,
+                                                        KSpcL3,
+                                                        false),
+    compact_and_wait(Bookie3, 0),
+    ok = testutil:check_indexed_objects(Bookie3,
+                                        B,
+                                        KSpcL1 ++ KSpcL2 ++ KSpcL3 ++ KSpcL4,
+                                        V4),
+    {ok, FileList4} =
+        file:list_dir(
+            filename:join(RootPath, "journal/journal_files/post_compact")),
+    io:format("Number of files after compaction ~w~n", [length(FileList4)]),
+
+    true = length(FileList4) >= length(FileList3) + 4,
+
+    ok = leveled_bookie:book_close(Bookie3),
+
     testutil:reset_filestructure(10000).