WIP - Recent Modifications

Just some initial WIP code for this.  Will revisit this again after
exploring some ideas as to how to reduce the cost of the
get_keys_by_segment.

The overall idea is that there are trees of recent modifications, with
recent being some rolling time window made up of hourly blocks, and
recency being determined by the last-modified date on the object metadata
- which should be consistent across a cluster.

So if we were at 15:30 we would get the tree for 14:00 - 15:00 and the
tree for 15:00-16:00 from two different queries which cover the same
partitions and then compare.

Comparison may find differences, and we know what segment the difference
is in - but how to then find all keys in that segment which have been
modified in the period?  Three ways:

Do it inefficiently and infrequently using a fold_keys and a filter
(perhaps with SST files having a highest LMD in the metadata so that
they can be skipped).
Add a special index, where every entry has a TTL, and the Key is
{$segment, Segment, Bucket, Key}  so that a normal 2i query can be used.
Align hashing for segments with hashing for penciller lookup so that a
query over the actual keys can be optimised, skipping chunks of the
in-memory part, and chunks of the SST file.
This commit is contained in:
martinsumner 2017-06-26 13:26:08 +01:00
parent fde9af28dd
commit 9fca17d56a
2 changed files with 77 additions and 4 deletions

View file

@ -84,7 +84,12 @@
level2 :: any() % an array - but OTP compatibility level2 :: any() % an array - but OTP compatibility
}). }).
%% Collection of per-hour trees of recently modified keys.
%% trees - list of {{ModDate, ModHour}, Tree} entries, as built by
%%         add_recent_kv/5
%% size  - the size passed to new_tree/2 whenever add_recent_kv/5 has to
%%         create a tree for a new hour
-record(recenttrees, {trees :: list(),
size:: small|medium|large|xlarge
}).
-type tictactree() :: #tictactree{}. -type tictactree() :: #tictactree{}.
%% Type alias for the #recenttrees{} record.
-type recenttrees() :: #recenttrees{}.
%%%============================================================================ %%%============================================================================
%%% External functions %%% External functions
@ -95,7 +100,10 @@
%% Create a new tree, zeroed out. %% Create a new tree, zeroed out.
new_tree(TreeID) -> new_tree(TreeID) ->
new_tree(TreeID, small). new_tree(TreeID, small).
-spec new_tree(any(), small|medium|large|xlarge) -> tictactree().
%% @doc
%% Create a new tree, zeroed out. Specify the t-shirt size of the tree
new_tree(TreeID, Size) -> new_tree(TreeID, Size) ->
{BitWidth, Width, SegmentCount} = {BitWidth, Width, SegmentCount} =
case Size of case Size of
@ -121,10 +129,12 @@ new_tree(TreeID, Size) ->
level1 = Lv1Init, level1 = Lv1Init,
level2 = Lv2Init}. level2 = Lv2Init}.
-spec add_kv(tictactree(), tuple(), tuple(), fun()) -> tictactree(). -spec add_kv(tictactree(), tuple(), tuple(), fun((_,_) -> integer())) ->
tictactree().
%% @doc %% @doc
%% Add a Key and value to a tictactree using the HashFun to calculate the Hash %% Add a Key and value to a tictactree using the HashFun to calculate the Hash
%% based on that key and value %% based on that key and value (or extract the Hash if it is present within
%% the value).
add_kv(TicTacTree, Key, Value, HashFun) -> add_kv(TicTacTree, Key, Value, HashFun) ->
HashV = HashFun(Key, Value), HashV = HashFun(Key, Value),
SegChangeHash = erlang:phash2(Key, HashV), SegChangeHash = erlang:phash2(Key, HashV),
@ -236,10 +246,73 @@ merge_trees(TreeA, TreeB) ->
MergedTree#tictactree{level1 = NewLevel1, level2 = NewLevel2}. MergedTree#tictactree{level1 = NewLevel1, level2 = NewLevel2}.
-spec get_segment(tuple(), integer()) -> integer().
%% @doc
%% Map the key to a segment, by hashing the key with erlang:phash2/1 and
%% masking the hash with (SegmentCount - 1).
%% NOTE(review): the mask only spreads keys over every segment if
%% SegmentCount is a power of two - confirm against the sizes in new_tree/2.
get_segment(Key, SegmentCount) -> get_segment(Key, SegmentCount) ->
erlang:phash2(Key) band (SegmentCount - 1). erlang:phash2(Key) band (SegmentCount - 1).
-spec match_hour(tuple(), tuple(), fun((_,_) -> tuple()), tuple(), integer())
        -> {calendar:date(), calendar:hour()}|no_match.
%% @doc
%% Match the modified date of the object to an hour of day, where the hour of
%% the day is within a threshold.  Used for identifying recently added keys and
%% mapping those keys to the right tictac tree of recent additions.
%%
%% Key, Value  - passed unchanged to ModDateFun
%% ModDateFun  - must return a datetime tuple e.g. {{Y, Mo, D}, {H, Mi, S}}
%% Now         - a timestamp tuple as accepted by calendar:now_to_datetime/1
%%               (e.g. the result of os:timestamp/0)
%% HoursToKeep - the maximum age, in whole hours, which still matches
%%
%% Returns {ModDate, ModHour} when the hour of modification is no more than
%% HoursToKeep hours before the current hour, otherwise no_match.  Both times
%% are truncated to the hour before being compared.  A modification time in
%% a later hour than Now yields a negative age and so always matches.
%%
%% NOTE(review): calendar:now_to_datetime/1 yields UTC, so ModDateFun is
%% presumably expected to return UTC as well - confirm against callers.
match_hour(Key, Value, ModDateFun, Now, HoursToKeep) ->
    {ModDate, {ModHr, _ModMin, _ModSec}} = ModDateFun(Key, Value),
    {NowDate, {NowHr, _NowMin, _NowSec}} = calendar:now_to_datetime(Now),
    % Compare in gregorian seconds rather than with the obsolete
    % calendar:time_difference/2.  Both values are on an hour boundary, so
    % the division by 3600 is exact.
    ModHourSecs =
        calendar:datetime_to_gregorian_seconds({ModDate, {ModHr, 0, 0}}),
    NowHourSecs =
        calendar:datetime_to_gregorian_seconds({NowDate, {NowHr, 0, 0}}),
    HoursAgo = (NowHourSecs - ModHourSecs) div 3600,
    case HoursToKeep >= HoursAgo of
        true ->
            {ModDate, ModHr};
        false ->
            no_match
    end.
-spec add_recent_kv(tuple(), tuple(),
                    fun((_,_) -> integer()),
                    integer(), recenttrees()) -> recenttrees().
%% @doc
%% Add a recently modified key and value to the appropriate tree of recent
%% keys and values.
%%
%% {ModDate, ModHour} - the hour the object was modified in, used as the key
%%                      of the tree to update
%% {Key, Value}       - the object to add, passed on to add_kv/4
%% HashFun            - passed on to add_kv/4 to hash the key and value
%% HoursToKeep        - the maximum number of hourly trees to retain
%% RecentTrees        - the current collection of hourly trees
%%
%% If a tree already exists for the hour the key is added to it.  Otherwise
%% a new tree is created for that hour, and if that takes the number of trees
%% over HoursToKeep the tree with the oldest {Date, Hour} key is dropped.
%% Note that this may be the tree just created, if it is older than all of
%% the existing ones.
add_recent_kv({ModDate, ModHour}, {Key, Value},
                HashFun, HoursToKeep, RecentTrees) ->
    HourKey = {ModDate, ModHour},
    Trees = RecentTrees#recenttrees.trees,
    case lists:keyfind(HourKey, 1, Trees) of
        {HourKey, Tree0} ->
            Tree1 = add_kv(Tree0, Key, Value, HashFun),
            RT1 = lists:keyreplace(HourKey, 1, Trees, {HourKey, Tree1}),
            RecentTrees#recenttrees{trees = RT1};
        false ->
            % lists:keyfind/3 returns false (not not_found) on a miss
            NT0 = new_tree(recent, RecentTrees#recenttrees.size),
            NT1 = add_kv(NT0, Key, Value, HashFun),
            RT0 = [{HourKey, NT1}|Trees],
            case length(RT0) > HoursToKeep of
                true ->
                    % {Date, Hour} tuples sort chronologically under
                    % Erlang term ordering, so the minimum is the oldest
                    OldestK = lists:min([K || {K, _Tree} <- RT0]),
                    RT1 = lists:keydelete(OldestK, 1, RT0),
                    RecentTrees#recenttrees{trees = RT1};
                false ->
                    RecentTrees#recenttrees{trees = RT0}
            end
    end.
%%%============================================================================ %%%============================================================================
%%% Internal functions %%% Internal functions
%%%============================================================================ %%%============================================================================

View file

@ -67,7 +67,7 @@ many_put_compare(_Config) ->
{ok, Bookie3} = leveled_bookie:book_start(StartOpts3), {ok, Bookie3} = leveled_bookie:book_start(StartOpts3),
lists:foreach(fun(ObjL) -> testutil:riakload(Bookie3, ObjL) end, CLs), lists:foreach(fun(ObjL) -> testutil:riakload(Bookie3, ObjL) end, CLs),
% Now run a tictac query against both stores to see th extent to which % Now run a tictac query against both stores to see the extent to which
% state between stores is consistent % state between stores is consistent
TicTacQ = {tictactree_obj, TicTacQ = {tictactree_obj,