From 9fca17d56a90151bdc4c0da4d5b4a930bb50d486 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Mon, 26 Jun 2017 13:26:08 +0100 Subject: [PATCH] WIP - Recent Modifications Just some initial WIP code for this. Will revisit this again after exploring some ideas as to how to reduce the cost of the get_keys_by_segment. The overall idea is that there are trees of recent modifications, with recent being some rolling time window made up of hourly blocks, and recency being determined by the last-modified date on the object metadata - which should be consistent across a cluster. So if we were at 15:30 we would get the tree for 14:00 - 15:00 and the tree for 15:00-16:00 from two different queries which cover the same partitions and then compare. Comparison may find differences, and we know what segment the difference is in - but how to then find all keys in that segment which have been modified in the period? Three ways: Do it inefficiently and infrequently using a fold_keys and a filter (perhaps with SST files having a highest LMD in the metadata so that they can be skipped). Add a special index, where every entry has a TTL, and the Key is {$segment, Segment, Bucket, Key} so that a normal 2i query can be used. Align hashing for segments with hashing for penciller lookup so that a query over the actual keys can be optimised skipping chunks of the in-memory part, and chunks of the SST file --- src/leveled_tictac.erl | 79 ++++++++++++++++++++++++++++++-- test/end_to_end/tictac_SUITE.erl | 2 +- 2 files changed, 77 insertions(+), 4 deletions(-) diff --git a/src/leveled_tictac.erl b/src/leveled_tictac.erl index 963521a..36ece72 100644 --- a/src/leveled_tictac.erl +++ b/src/leveled_tictac.erl @@ -84,7 +84,12 @@ level2 :: any() % an array - but OTP compatibility }). +-record(recenttrees, {trees :: list(), + size:: small|medium|large|xlarge + }). + -type tictactree() :: #tictactree{}. +-type recenttrees() :: #recenttrees{}. 
%%%============================================================================ %%% External functions @@ -95,7 +100,10 @@ %% Create a new tree, zeroed out. new_tree(TreeID) -> new_tree(TreeID, small). - + +-spec new_tree(any(), small|medium|large|xlarge) -> tictactree(). +%% @doc +%% Create a new tree, zeroed out. Specify the t-shirt size of the tree new_tree(TreeID, Size) -> {BitWidth, Width, SegmentCount} = case Size of @@ -121,10 +129,12 @@ new_tree(TreeID, Size) -> level1 = Lv1Init, level2 = Lv2Init}. --spec add_kv(tictactree(), tuple(), tuple(), fun()) -> tictactree(). +-spec add_kv(tictactree(), tuple(), tuple(), fun((_,_) -> integer())) -> + tictactree(). %% @doc %% Add a Key and value to a tictactree using the HashFun to calculate the Hash -%% based on that key and value +%% based on that key and value (or extract the Hash if it is present within +%% the value). add_kv(TicTacTree, Key, Value, HashFun) -> HashV = HashFun(Key, Value), SegChangeHash = erlang:phash2(Key, HashV), @@ -236,10 +246,73 @@ merge_trees(TreeA, TreeB) -> MergedTree#tictactree{level1 = NewLevel1, level2 = NewLevel2}. +-spec get_segment(tuple(), integer()) -> integer(). +%% @doc +%% Map the key to a segment. get_segment(Key, SegmentCount) -> erlang:phash2(Key) band (SegmentCount - 1). +-spec match_hour(tuple(), tuple(), fun((_,_) -> tuple()), tuple(), integer()) + -> {calendar:date(), integer()}|no_match. +%% @doc +%% Match the modified date of the object to an hour of day, where the hour of +%% the day is within a threshold. Used for identifying recently added keys and +%% mapping those keys to the right tictac tree of recent additions +%% +%% The ModDateFun must return a datetime tuple e.g. 
{{Y, M, D}, {H, M, S}}
+match_hour(Key, Value, ModDateFun, Now, HoursToKeep) ->
+    {ModDate, {ModHr, _ModMin, _ModSec}} = ModDateFun(Key, Value),
+    {NowDate, {NowHr, _NowMin, _NowSec}} = calendar:now_to_datetime(Now),
+    % Compare whole hours using gregorian seconds (time_difference/2 is
+    % obsolete).  A modified hour later than Now gives a negative hour count,
+    % which is within the threshold for any non-negative HoursToKeep.
+    ModSecs = calendar:datetime_to_gregorian_seconds({ModDate, {ModHr, 0, 0}}),
+    NowSecs = calendar:datetime_to_gregorian_seconds({NowDate, {NowHr, 0, 0}}),
+    case HoursToKeep >= (NowSecs - ModSecs) div 3600 of
+        true ->
+            {ModDate, ModHr};
+        false ->
+            no_match
+    end.
+
+-spec add_recent_kv(tuple(), tuple(),
+                    fun((_,_) -> integer()),
+                    integer(), recenttrees()) -> recenttrees().
+%% @doc
+%% Add a recently modified key and value to the appropriate tree of recent
+%% keys and values.  The first argument is the {ModDate, ModHour} pair which
+%% identifies the hourly tree (the form returned by match_hour/5).  If there
+%% is no tree for that hour a new one is started, and if this takes the
+%% number of trees above HoursToKeep the tree for the oldest hour is
+%% removed.
+add_recent_kv({ModDate, ModHour}, {Key, Value},
+                HashFun, HoursToKeep, RecentTrees) ->
+    TreeKey = {ModDate, ModHour},
+    Trees = RecentTrees#recenttrees.trees,
+    case lists:keyfind(TreeKey, 1, Trees) of
+        {TreeKey, Tree0} ->
+            Tree1 = add_kv(Tree0, Key, Value, HashFun),
+            RT1 = lists:keyreplace(TreeKey, 1, Trees, {TreeKey, Tree1}),
+            RecentTrees#recenttrees{trees = RT1};
+        false ->
+            % lists:keyfind/3 returns false when there is no match, so
+            % this is the first key seen for this hour
+            NT0 = new_tree(recent, RecentTrees#recenttrees.size),
+            NT1 = add_kv(NT0, Key, Value, HashFun),
+            RT0 = [{TreeKey, NT1}|Trees],
+            case length(RT0) > HoursToKeep of
+                true ->
+                    % {Date, Hour} keys sort chronologically in term
+                    % order, so the smallest key is the oldest hour
+                    OldestK = lists:min([K || {K, _Tree} <- RT0]),
+                    RT1 = lists:keydelete(OldestK, 1, RT0),
+                    RecentTrees#recenttrees{trees = RT1};
+                false ->
+                    RecentTrees#recenttrees{trees = RT0}
+            end
+    end.
+ %%%============================================================================ %%% Internal functions %%%============================================================================ diff --git a/test/end_to_end/tictac_SUITE.erl b/test/end_to_end/tictac_SUITE.erl index 2c2b3f8..d034463 100644 --- a/test/end_to_end/tictac_SUITE.erl +++ b/test/end_to_end/tictac_SUITE.erl @@ -67,7 +67,7 @@ many_put_compare(_Config) -> {ok, Bookie3} = leveled_bookie:book_start(StartOpts3), lists:foreach(fun(ObjL) -> testutil:riakload(Bookie3, ObjL) end, CLs), - % Now run a tictac query against both stores to see th extent to which + % Now run a tictac query against both stores to see the extent to which % state between stores is consistent TicTacQ = {tictactree_obj,