From 86b11803c97897774bdafcc6c2f500f9ecfdf908 Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Thu, 15 Jun 2017 15:40:23 +0100
Subject: [PATCH 01/58] Build and compare

Build and compare of tictac trees. These are mergeable Merkle trees that
are not cryptographically secure.
---
 src/leveled_tictac.erl | 213 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 213 insertions(+)
 create mode 100644 src/leveled_tictac.erl

diff --git a/src/leveled_tictac.erl b/src/leveled_tictac.erl
new file mode 100644
index 0000000..b6f2f90
--- /dev/null
+++ b/src/leveled_tictac.erl
@@ -0,0 +1,213 @@
+%% -------- TIC-TAC ACTOR ---------
+%%
+%% The TicTac actor is responsible for tracking the state of the store and
+%% signalling that state to other trusted actors
+%%
+%% https://en.wikipedia.org/wiki/Tic-tac
+%%
+%% This is achieved through the exchange of merkle trees, but *not* trees that
+%% are secure to interference - there is no attempt to protect the tree from
+%% byzantine faults or tampering.  The tree is only suited for use between
+%% trusted actors across secure channels.
+%%
+%% In dropping the cryptographic security requirement, a simpler tree is
+%% possible, and also one that allows for trees of a partitioned database to
+%% be quickly merged to represent a global view of state for the database
+%% across the partition boundaries.
+%%
+%% -------- PERSPECTIVES OF STATE ---------
+%%
+%% The insecure Merkle trees (Tic-Tac Trees) are intended to be used in two
+%% ways:
+%% - To support the building of a merkle tree across a coverage plan to
+%% represent global state across many stores (or vnodes) i.e. scanning over
+%% the real data by bucket, by key range or by index.
+%% - To track changes with "recent" modification dates.
+%%
+%% -------- TIC-TAC TREES ---------
+%%
+%% The Tic-Tac tree is split into 256 * 4096 different segments.  Every key
+%% is hashed to map it to one of those segment leaves using the
+%% erlang:phash2 function.
+%%
+%% External to the leveled_tictac module, the value should also have been
+%% hashed to a 4-byte integer (presumably based on a tag-specific hash
+%% function).  The combination of the Object Key and the Hash is then
+%% hashed together to get a segment-change hash.
+%%
+%% To change a segment-leaf hash, the segment-leaf hash is XORd with the
+%% segment-change hash associated with the changing key.  This assumes that
+%% only one version of the key is ever added to the segment-leaf hash if the
+%% tree is to represent the state of the store (or a partition of the store).
+%% If not, the segment-leaf hash can only represent a history of changes under
+%% that leaf, not the current state (unless the previous segment-change hash
+%% for the key is removed by XORing it once more from the segment-leaf hash
+%% that already contains it).
+%%
+%% A Level 1 hash is then created by XORing the 4096 Level 2 segment-hashes
+%% in the level below it (or XORing both the previous version and the new
+%% version of the segment-leaf hash from the previous level 1 hash).
+%%
+
+
+-module(leveled_tictac).
+
+% -behaviour(gen_server).
+
+-include("include/leveled.hrl").
+
+-export([]).
+
+
+
+-include_lib("eunit/include/eunit.hrl").
+
+-define(LEVEL1_WIDTH, 256).
+-define(LEVEL2_WIDTH, 4096).
+-define(LEVEL2_BITWIDTH, 12).
+-define(SEGMENT_COUNT, ?LEVEL1_WIDTH * ?LEVEL2_WIDTH).
+-define(HASH_SIZE, 4).
+
+-record(tictactree, {treeID ::integer(),
+                        level1 :: binary(),
+                        level2 :: array:array()}).
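+
+%% For illustration, a worked example of the mechanics described above (the
+%% numbers here are made up; the real update logic is in add_kv/4 below).
+%% A Key hashing to segment 69907 places its segment-leaf hash at position
+%% 69907 band 4095 = 275 within Level 2 binary number 69907 bsr 12 = 17.
+%% Given the segment-change hash SCH for a {Key, Hash} pair, the update is:
+%%     SegLeaf2Upd = SegLeaf2 bxor SCH,
+%%     SegLeaf1Upd = SegLeaf1 bxor SegLeaf2 bxor SegLeaf2Upd
+%% Applying the same SCH a second time reverses the change, which is how the
+%% previous segment-change hash for a key can be backed out of a leaf.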
+
+%%%============================================================================
+%%% API
+%%%============================================================================
+
+
+
+%%%============================================================================
+%%% External functions
+%%%============================================================================
+
+
+new_tree(TreeID) ->
+    Lv1Width = ?LEVEL1_WIDTH * ?HASH_SIZE * 8,
+    Lv1Init = <<0:Lv1Width/integer>>,
+    Lv2SegBinSize = ?LEVEL2_WIDTH * ?HASH_SIZE * 8,
+    Lv2SegBinInit = <<0:Lv2SegBinSize/integer>>,
+    Lv2Init = array:new([{size, ?LEVEL1_WIDTH}, {default, Lv2SegBinInit}]),
+    #tictactree{treeID = TreeID, level1 = Lv1Init, level2 = Lv2Init}.
+
+
+
+add_kv(TicTacTree, Key, Value, HashFun) ->
+    HashV = HashFun(Key, Value),
+    SegChangeHash = erlang:phash2(Key, HashV),
+    Segment = get_segment(Key),
+
+    Level2Pos = Segment band (?LEVEL2_WIDTH - 1),
+    Level1Pos = (Segment bsr ?LEVEL2_BITWIDTH) band (?LEVEL1_WIDTH - 1),
+    Level2BytePos = ?HASH_SIZE * Level2Pos,
+    Level1BytePos = ?HASH_SIZE * Level1Pos,
+
+    Level2 = array:get(Level1Pos, TicTacTree#tictactree.level2),
+
+    HashIntLength = ?HASH_SIZE * 8,
+    <<PreL2:Level2BytePos/binary,
+        SegLeaf2:HashIntLength/integer,
+        PostL2/binary>> = Level2,
+    <<PreL1:Level1BytePos/binary,
+        SegLeaf1:HashIntLength/integer,
+        PostL1/binary>> = TicTacTree#tictactree.level1,
+
+    SegLeaf2Upd = SegLeaf2 bxor SegChangeHash,
+    SegLeaf1Upd = SegLeaf1 bxor SegLeaf2 bxor SegLeaf2Upd,
+
+    Level1Upd = <<PreL1:Level1BytePos/binary,
+                    SegLeaf1Upd:HashIntLength/integer,
+                    PostL1/binary>>,
+    Level2Upd = <<PreL2:Level2BytePos/binary,
+                    SegLeaf2Upd:HashIntLength/integer,
+                    PostL2/binary>>,
+    TicTacTree#tictactree{level1 = Level1Upd,
+                            level2 = array:set(Level1Pos,
+                                                Level2Upd,
+                                                TicTacTree#tictactree.level2)}.
+
+
+find_dirtyleaves(SrcTree, SinkTree) ->
+    IdxList = segmentcompare(SrcTree#tictactree.level1,
+                                SinkTree#tictactree.level1),
+
+    FoldFun =
+        fun(Idx, Acc) ->
+            L2IdxList =
+                segmentcompare(array:get(Idx, SrcTree#tictactree.level2),
+                                array:get(Idx, SinkTree#tictactree.level2)),
+
+            Acc ++ lists:map(fun(X) -> X + Idx * ?LEVEL2_WIDTH end, L2IdxList)
+        end,
+
+    lists:sort(lists:foldl(FoldFun, [], IdxList)).
+
+
+%%%============================================================================
+%%% Internal functions
+%%%============================================================================
+
+segmentcompare(SrcBin, SinkBin) when byte_size(SrcBin)==byte_size(SinkBin) ->
+    segmentcompare(SrcBin, SinkBin, [], 0).
+
+segmentcompare(<<>>, <<>>, Acc, _Counter) ->
+    Acc;
+segmentcompare(SrcBin, SnkBin, Acc, Counter) ->
+    <<SrcHash:?HASH_SIZE/binary, SrcTail/binary>> = SrcBin,
+    <<SnkHash:?HASH_SIZE/binary, SnkTail/binary>> = SnkBin,
+    case SrcHash of
+        SnkHash ->
+            segmentcompare(SrcTail, SnkTail, Acc, Counter + 1);
+        _ ->
+            segmentcompare(SrcTail, SnkTail, [Counter|Acc], Counter + 1)
+    end.
+
+get_segment(Key) ->
+    erlang:phash2(Key) band (?SEGMENT_COUNT - 1).
+
+
+%%%============================================================================
+%%% Test
+%%%============================================================================
+
+-ifdef(TEST).
+
+simple_test() ->
+    HashFun = fun(_K, V) -> erlang:phash2(V) end,
+
+    Tree0 = new_tree(0),
+    Tree1 = add_kv(Tree0, "K1", 1, HashFun),
+    Tree2 = add_kv(Tree1, "K2", 2, HashFun),
+    Tree3 = add_kv(Tree2, "K3", 3, HashFun),
+    Tree3A = add_kv(Tree3, "K3", 4, HashFun),
+    ?assertMatch(true, Tree0#tictactree.level1 == Tree0#tictactree.level1),
+    ?assertMatch(false, Tree0#tictactree.level1 == Tree1#tictactree.level1),
+    ?assertMatch(false, Tree1#tictactree.level1 == Tree2#tictactree.level1),
+    ?assertMatch(false, Tree2#tictactree.level1 == Tree3#tictactree.level1),
+    ?assertMatch(false, Tree3#tictactree.level1 == Tree3A#tictactree.level1),
+
+    Tree0X = new_tree(0),
+    Tree1X = add_kv(Tree0X, "K3", 3, HashFun),
+    Tree2X = add_kv(Tree1X, "K1", 1, HashFun),
+    Tree3X = add_kv(Tree2X, "K2", 2, HashFun),
+    Tree3XA = add_kv(Tree3X, "K3", 4, HashFun),
+    ?assertMatch(false, Tree1#tictactree.level1 == Tree1X#tictactree.level1),
+    ?assertMatch(false, Tree2#tictactree.level1 == Tree2X#tictactree.level1),
+    ?assertMatch(true, Tree3#tictactree.level1 == Tree3X#tictactree.level1),
+    ?assertMatch(true, Tree3XA#tictactree.level1 == Tree3XA#tictactree.level1),
+
+    DL0 = find_dirtyleaves(Tree1, Tree0),
+    ?assertMatch(true, lists:member(get_segment("K1"), DL0)),
+    DL1 = find_dirtyleaves(Tree3, Tree1),
+    ?assertMatch(true, lists:member(get_segment("K2"), DL1)),
+    ?assertMatch(true, lists:member(get_segment("K3"), DL1)),
+    ?assertMatch(false, lists:member(get_segment("K1"), DL1)).
+
+
+-endif.
+
+
+
+
From 959e7f932f5c31dabac87f88150a65504c3368a8 Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Thu, 15 Jun 2017 16:16:19 +0100
Subject: [PATCH 02/58] Add simple merge

Allow for tictac trees to be merged
---
 src/leveled_tictac.erl | 53 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 52 insertions(+), 1 deletion(-)

diff --git a/src/leveled_tictac.erl b/src/leveled_tictac.erl
index b6f2f90..f9e7cdf 100644
--- a/src/leveled_tictac.erl
+++ b/src/leveled_tictac.erl
@@ -68,7 +68,7 @@
 -define(SEGMENT_COUNT, ?LEVEL1_WIDTH * ?LEVEL2_WIDTH).
 -define(HASH_SIZE, 4).
 
--record(tictactree, {treeID ::integer(),
+-record(tictactree, {treeID :: any(),
                         level1 :: binary(),
                         level2 :: array:array()}).
 
@@ -143,6 +143,25 @@ find_dirtyleaves(SrcTree, SinkTree) ->
 
     lists:sort(lists:foldl(FoldFun, [], IdxList)).
 
+merge_trees(TreeA, TreeB) ->
+    MergedTree = new_tree(merge),
+
+    L1A = TreeA#tictactree.level1,
+    L1B = TreeB#tictactree.level1,
+    NewLevel1 = merge_binaries(L1A, L1B),
+
+    MergeFun =
+        fun(SQN, MergeL2) ->
+            L2A = array:get(SQN, TreeA#tictactree.level2),
+            L2B = array:get(SQN, TreeB#tictactree.level2),
+            NewLevel2 = merge_binaries(L2A, L2B),
+            array:set(SQN, NewLevel2, MergeL2)
+        end,
+    NewLevel2 = lists:foldl(MergeFun,
+                            MergedTree#tictactree.level2,
+                            lists:seq(0, ?LEVEL1_WIDTH - 1)),
+
+    MergedTree#tictactree{level1 = NewLevel1, level2 = NewLevel2}.
 
 %%%============================================================================
 %%% Internal functions
@@ -166,6 +185,13 @@ segmentcompare(SrcBin, SnkBin, Acc, Counter) ->
 get_segment(Key) ->
     erlang:phash2(Key) band (?SEGMENT_COUNT - 1).
 
+merge_binaries(BinA, BinB) ->
+    BitSize = bit_size(BinA),
+    BitSize = bit_size(BinB),
+    <<AInt:BitSize/integer>> = BinA,
+    <<BInt:BitSize/integer>> = BinB,
+    MergedInt = AInt bxor BInt,
+    <<MergedInt:BitSize/integer>>.
 
 %%%============================================================================
 %%% Test
@@ -205,6 +231,31 @@ simple_test() ->
     ?assertMatch(true, lists:member(get_segment("K3"), DL1)),
     ?assertMatch(false, lists:member(get_segment("K1"), DL1)).
+merge_test() -> + HashFun = fun(_K, V) -> erlang:phash2(V) end, + + TreeX0 = new_tree(0), + TreeX1 = add_kv(TreeX0, "X1", 1, HashFun), + TreeX2 = add_kv(TreeX1, "X2", 2, HashFun), + TreeX3 = add_kv(TreeX2, "X3", 3, HashFun), + TreeX4 = add_kv(TreeX3, "X3", 4, HashFun), + + TreeY0 = new_tree(0), + TreeY1 = add_kv(TreeY0, "Y1", 101, HashFun), + TreeY2 = add_kv(TreeY1, "Y2", 102, HashFun), + TreeY3 = add_kv(TreeY2, "Y3", 103, HashFun), + TreeY4 = add_kv(TreeY3, "Y3", 104, HashFun), + + TreeZ1 = add_kv(TreeX4, "Y1", 101, HashFun), + TreeZ2 = add_kv(TreeZ1, "Y2", 102, HashFun), + TreeZ3 = add_kv(TreeZ2, "Y3", 103, HashFun), + TreeZ4 = add_kv(TreeZ3, "Y3", 104, HashFun), + + TreeM0 = merge_trees(TreeX4, TreeY4), + ?assertMatch(true, TreeM0#tictactree.level1 == TreeZ4#tictactree.level1), + + TreeM1 = merge_trees(TreeX3, TreeY4), + ?assertMatch(false, TreeM1#tictactree.level1 == TreeZ4#tictactree.level1). -endif. From 7642aac2cc90f65b8e6bb327c755caf69788f01f Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Fri, 16 Jun 2017 10:14:24 +0100 Subject: [PATCH 03/58] Change Riak object hash approach Change the riak object hash being kept in the metadata, to being a hash of the vector clock --- src/leveled_bookie.erl | 10 +++++----- src/leveled_codec.erl | 7 +++++-- test/end_to_end/recovery_SUITE.erl | 10 +++++++--- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl index d12687c..db69a7e 100644 --- a/src/leveled_bookie.erl +++ b/src/leveled_bookie.erl @@ -1511,11 +1511,11 @@ hashtree_query_test() -> ?STD_TAG, false}), KeyHashList = HTFolder(), - lists:foreach(fun({B, _K, H}) -> - ?assertMatch("Bucket", B), - ?assertMatch(true, is_integer(H)) - end, - KeyHashList), + lists:foreach(fun({B, _K, H}) -> + ?assertMatch("Bucket", B), + ?assertMatch(true, is_integer(H)) + end, + KeyHashList), ?assertMatch(1200, length(KeyHashList)), ok = book_close(Bookie1), {ok, Bookie2} = book_start([{root_path, RootPath}, diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl index 1d2bc13..9179687 100644 --- a/src/leveled_codec.erl +++ b/src/leveled_codec.erl @@ -445,8 +445,11 @@ build_metadata_object(PrimaryKey, MD) -> riak_extract_metadata(delete, Size) -> {delete, null, null, Size}; riak_extract_metadata(ObjBin, Size) -> - {Vclock, SibBin} = riak_metadata_from_binary(ObjBin), - {SibBin, Vclock, erlang:phash2(ObjBin), Size}. + {VclockBin, SibBin} = riak_metadata_from_binary(ObjBin), + {SibBin, + VclockBin, + erlang:phash2(lists:sort(binary_to_term(VclockBin))), + Size}. %% <>. 
diff --git a/test/end_to_end/recovery_SUITE.erl b/test/end_to_end/recovery_SUITE.erl
index ccca1ea..778e036 100644
--- a/test/end_to_end/recovery_SUITE.erl
+++ b/test/end_to_end/recovery_SUITE.erl
@@ -80,7 +80,7 @@ recovr_strategy(_Config) ->
     Q = fun(RT) ->
             {index_query,
                 "Bucket6",
                 {fun testutil:foldkeysfun/3, []},
-                {"idx1_bin", "#", "~"},
+                {"idx1_bin", "#", "|"},
                 {RT, undefined}}
         end,
     {async, TFolder} = leveled_bookie:book_returnfolder(Book1, Q(true)),
@@ -205,8 +205,12 @@ aae_bustedjournal(_Config) ->
     % Will need to remove the file or corrupt the hashtree to get presence to
     % fail
 
-    FoldObjectsFun = fun(B, K, V, Acc) -> [{B, K, erlang:phash2(V)}|Acc]
-                                            end,
+    FoldObjectsFun =
+        fun(B, K, V, Acc) ->
+            VC = testutil:get_vclock(V),
+            H = erlang:phash2(lists:sort(VC)),
+            [{B, K, H}|Acc]
+        end,
     SW = os:timestamp(),
     {async, HashTreeF3} = leveled_bookie:book_returnfolder(Bookie2,
                                                             {foldobjects_allkeys,

From f5dd154cee721f6ed7570620fa54e817ec7244f1 Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Fri, 16 Jun 2017 12:38:49 +0100
Subject: [PATCH 04/58] Rename hashtree query

Naming is confusing now that we have TicTac Trees. This query builds a
list of keys and hashes, not a tree - so it was misleading anyway. Now
renamed hashlist_query.
---
 src/leveled_bookie.erl             | 22 +++++++++++-----------
 test/end_to_end/recovery_SUITE.erl |  6 +++---
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl
index db69a7e..919c970 100644
--- a/src/leveled_bookie.erl
+++ b/src/leveled_bookie.erl
@@ -308,7 +308,7 @@ book_head(Pid, Bucket, Key, Tag) ->
 %% {keylist, Tag, {FoldKeysFun, Acc}} -> list all keys with tag
 %% {keylist, Tag, Bucket, {FoldKeysFun, Acc}} -> list all keys within given
 %% bucket
-%% {hashtree_query, Tag, JournalCheck} -> return keys and hashes for all
+%% {hashlist_query, Tag, JournalCheck} -> return keys and hashes for all
 %% objects with a given tag
 %% {foldobjects_bybucket, Tag, Bucket, FoldObjectsFun} -> fold over all objects
 %% in a given bucket
@@ -531,9 +531,9 @@ handle_call({return_folder, FolderType}, _From, State) ->
             {reply,
                 bucketkey_query(State, Tag, Bucket, {FoldKeysFun, Acc}),
                 State};
-        {hashtree_query, Tag, JournalCheck} ->
+        {hashlist_query, Tag, JournalCheck} ->
             {reply,
-                hashtree_query(State, Tag, JournalCheck),
+                hashlist_query(State, Tag, JournalCheck),
                 State};
         {foldheads_allkeys, Tag, FoldHeadsFun} ->
             {reply,
                 foldheads_allkeys(State, Tag, FoldHeadsFun),
@@ -818,7 +818,7 @@ index_query(State,
     {async, Folder}.
 
-hashtree_query(State, Tag, JournalCheck) ->
+hashlist_query(State, Tag, JournalCheck) ->
     SnapType = case JournalCheck of
                     false ->
                         ledger;
@@ -1484,7 +1484,7 @@ ttl_test() ->
     ok = book_close(Bookie2),
     reset_filestructure().
 
-hashtree_query_test() ->
+hashlist_query_test() ->
     RootPath = reset_filestructure(),
     {ok, Bookie1} = book_start([{root_path, RootPath},
                                     {max_journalsize, 1000000},
@@ -1507,7 +1507,7 @@ hashtree_query_test() ->
                     ObjL2),
     % Scan the store for the Bucket, Keys and Hashes
     {async, HTFolder} = book_returnfolder(Bookie1,
-                                            {hashtree_query,
+                                            {hashlist_query,
                                                 ?STD_TAG,
                                                 false}),
     KeyHashList = HTFolder(),
@@ -1522,7 +1522,7 @@ hashtree_query_test() ->
                                     {max_journalsize, 200000},
                                     {cache_size, 500}]),
     {async, HTFolder2} = book_returnfolder(Bookie2,
-                                            {hashtree_query,
+                                            {hashlist_query,
                                                 ?STD_TAG,
                                                 false}),
     L0 = length(KeyHashList),
    ok = book_close(Bookie2),
    reset_filestructure().
-hashtree_query_withjournalcheck_test() -> +hashlist_query_withjournalcheck_test() -> RootPath = reset_filestructure(), {ok, Bookie1} = book_start([{root_path, RootPath}, {max_journalsize, 1000000}, @@ -1546,12 +1546,12 @@ hashtree_query_withjournalcheck_test() -> Future) end, ObjL1), {async, HTFolder1} = book_returnfolder(Bookie1, - {hashtree_query, + {hashlist_query, ?STD_TAG, false}), KeyHashList = HTFolder1(), {async, HTFolder2} = book_returnfolder(Bookie1, - {hashtree_query, + {hashlist_query, ?STD_TAG, check_presence}), ?assertMatch(KeyHashList, HTFolder2()), @@ -1572,7 +1572,7 @@ foldobjects_vs_hashtree_test() -> Future) end, ObjL1), {async, HTFolder1} = book_returnfolder(Bookie1, - {hashtree_query, + {hashlist_query, ?STD_TAG, false}), KeyHashList1 = lists:usort(HTFolder1()), diff --git a/test/end_to_end/recovery_SUITE.erl b/test/end_to_end/recovery_SUITE.erl index 778e036..70d0b97 100644 --- a/test/end_to_end/recovery_SUITE.erl +++ b/test/end_to_end/recovery_SUITE.erl @@ -190,13 +190,13 @@ aae_bustedjournal(_Config) -> true = GetCount < HeadCount, {async, HashTreeF1} = leveled_bookie:book_returnfolder(Bookie2, - {hashtree_query, + {hashlist_query, ?RIAK_TAG, false}), KeyHashList1 = HashTreeF1(), 20001 = length(KeyHashList1), {async, HashTreeF2} = leveled_bookie:book_returnfolder(Bookie2, - {hashtree_query, + {hashlist_query, ?RIAK_TAG, check_presence}), KeyHashList2 = HashTreeF2(), @@ -268,7 +268,7 @@ aae_bustedjournal(_Config) -> length(KeyHashList5)]), {async, HashTreeF6} = leveled_bookie:book_returnfolder(Bookie4, - {hashtree_query, + {hashlist_query, ?RIAK_TAG, check_presence}), KeyHashList6 = HashTreeF6(), From 6ad98d77c583b87dde4f019130a47ba154c98dfa Mon Sep 17 00:00:00 2001 From: martinsumner Date: Fri, 16 Jun 2017 13:47:19 +0100 Subject: [PATCH 05/58] Spec module for dialyzer Add specs/docs for the leveled_tictac module. Dialyzer passes. --- src/leveled_tictac.erl | 120 ++++++++++++++++++++++++++++------------- 1 file changed, 83 insertions(+), 37 deletions(-) diff --git a/src/leveled_tictac.erl b/src/leveled_tictac.erl index f9e7cdf..4155807 100644 --- a/src/leveled_tictac.erl +++ b/src/leveled_tictac.erl @@ -56,7 +56,15 @@ -include("include/leveled.hrl"). --export([]). +-export([ + new_tree/1, + add_kv/4, + find_dirtyleaves/2, + find_dirtysegments/2, + fetch_root/1, + fetch_leaves/2, + merge_trees/2 + ]). @@ -72,6 +80,9 @@ level1 :: binary(), level2 :: array:array()}). + +-type tictactree() :: #tictactree{}. + %%%============================================================================ %%% API %%%============================================================================ @@ -82,7 +93,9 @@ %%% External functions %%%============================================================================ - +-spec new_tree(any()) -> tictactree(). +%% @doc +%% Create a new tree, zeroed out. new_tree(TreeID) -> Lv1Width = ?LEVEL1_WIDTH * ?HASH_SIZE * 8, Lv1Init = <<0:Lv1Width/integer>>, @@ -91,8 +104,10 @@ new_tree(TreeID) -> Lv2Init = array:new([{size, ?LEVEL1_WIDTH}, {default, Lv2SegBinInit}]), #tictactree{treeID = TreeID, level1 = Lv1Init, level2 = Lv2Init}. - - +-spec add_kv(tictactree(), tuple(), tuple(), fun()) -> tictactree(). 
+%% @doc
+%% Add a Key and value to a tictactree using the HashFun to calculate the Hash
+%% based on that key and value.
 add_kv(TicTacTree, Key, Value, HashFun) ->
     HashV = HashFun(Key, Value),
     SegChangeHash = erlang:phash2(Key, HashV),
     Segment = get_segment(Key),
@@ -126,23 +141,54 @@ add_kv(TicTacTree, Key, Value, HashFun) ->
             level2 = array:set(Level1Pos,
                                 Level2Upd,
                                 TicTacTree#tictactree.level2)}.
 
-find_dirtyleaves(SrcTree, SinkTree) ->
-    IdxList = segmentcompare(SrcTree#tictactree.level1,
-                                SinkTree#tictactree.level1),
+-spec find_dirtyleaves(tictactree(), tictactree()) -> list(integer()).
+%% @doc
+%% Returns a list of segment IDs which hold differences between the state
+%% represented by the two trees.
+find_dirtyleaves(SrcTree, SnkTree) ->
+    IdxList = find_dirtysegments(fetch_root(SrcTree), fetch_root(SnkTree)),
+    SrcLeaves = fetch_leaves(SrcTree, IdxList),
+    SnkLeaves = fetch_leaves(SnkTree, IdxList),
 
     FoldFun =
         fun(Idx, Acc) ->
-            L2IdxList =
-                segmentcompare(array:get(Idx, SrcTree#tictactree.level2),
-                                array:get(Idx, SinkTree#tictactree.level2)),
-
+            {Idx, SrcLeaf} = lists:keyfind(Idx, 1, SrcLeaves),
+            {Idx, SnkLeaf} = lists:keyfind(Idx, 1, SnkLeaves),
+            L2IdxList = segmentcompare(SrcLeaf, SnkLeaf),
             Acc ++ lists:map(fun(X) -> X + Idx * ?LEVEL2_WIDTH end, L2IdxList)
         end,
-
     lists:sort(lists:foldl(FoldFun, [], IdxList)).
 
+-spec find_dirtysegments(binary(), binary()) -> list(integer()).
+%% @doc
+%% Returns a list of branch IDs that contain differences between the trees.
+%% Pass in level 1 binaries to make the comparison.
+find_dirtysegments(SrcBin, SinkBin) ->
+    segmentcompare(SrcBin, SinkBin).
+
+-spec fetch_root(tictactree()) -> binary().
+%% @doc
+%% Return the level1 binary for a tree.
+fetch_root(TicTacTree) ->
+    TicTacTree#tictactree.level1.
+
+-spec fetch_leaves(tictactree(), list(integer())) -> list().
+%% @doc
+%% Return a keylist for the segment hashes for the leaves of the tree based on
+%% the list of branch IDs provided
+fetch_leaves(TicTacTree, BranchList) ->
+    MapFun =
+        fun(Idx) ->
+            {Idx, array:get(Idx, TicTacTree#tictactree.level2)}
+        end,
+    lists:map(MapFun, BranchList).
+
+-spec merge_trees(tictactree(), tictactree()) -> tictactree().
+%% @doc
+%% Merge two trees providing a result that represents the combined state,
+%% assuming that the two trees were correctly partitioned pre-merge. If a key
+%% and value has been added to both trees, then the merge will not give the
+%% expected outcome.
merge_trees(TreeA, TreeB) -> MergedTree = new_tree(merge), @@ -204,10 +250,10 @@ simple_test() -> HashFun = fun(_K, V) -> erlang:phash2(V) end, Tree0 = new_tree(0), - Tree1 = add_kv(Tree0, "K1", 1, HashFun), - Tree2 = add_kv(Tree1, "K2", 2, HashFun), - Tree3 = add_kv(Tree2, "K3", 3, HashFun), - Tree3A = add_kv(Tree3, "K3", 4, HashFun), + Tree1 = add_kv(Tree0, {o, "B1", "K1", null}, {caine, 1}, HashFun), + Tree2 = add_kv(Tree1, {o, "B1", "K2", null}, {caine, 2}, HashFun), + Tree3 = add_kv(Tree2, {o, "B1", "K3", null}, {caine, 3}, HashFun), + Tree3A = add_kv(Tree3, {o, "B1", "K3", null}, {caine, 4}, HashFun), ?assertMatch(true, Tree0#tictactree.level1 == Tree0#tictactree.level1), ?assertMatch(false, Tree0#tictactree.level1 == Tree1#tictactree.level1), ?assertMatch(false, Tree1#tictactree.level1 == Tree2#tictactree.level1), @@ -215,41 +261,41 @@ simple_test() -> ?assertMatch(false, Tree3#tictactree.level1 == Tree3A#tictactree.level1), Tree0X = new_tree(0), - Tree1X = add_kv(Tree0X, "K3", 3, HashFun), - Tree2X = add_kv(Tree1X, "K1", 1, HashFun), - Tree3X = add_kv(Tree2X, "K2", 2, HashFun), - Tree3XA = add_kv(Tree3X, "K3", 4, HashFun), + Tree1X = add_kv(Tree0X, {o, "B1", "K3", null}, {caine, 3}, HashFun), + Tree2X = add_kv(Tree1X, {o, "B1", "K1", null}, {caine, 1}, HashFun), + Tree3X = add_kv(Tree2X, {o, "B1", "K2", null}, {caine, 2}, HashFun), + Tree3XA = add_kv(Tree3X, {o, "B1", "K3", null}, {caine, 4}, HashFun), ?assertMatch(false, Tree1#tictactree.level1 == Tree1X#tictactree.level1), ?assertMatch(false, Tree2#tictactree.level1 == Tree2X#tictactree.level1), ?assertMatch(true, Tree3#tictactree.level1 == Tree3X#tictactree.level1), ?assertMatch(true, Tree3XA#tictactree.level1 == Tree3XA#tictactree.level1), DL0 = find_dirtyleaves(Tree1, Tree0), - ?assertMatch(true, lists:member(get_segment("K1"), DL0)), + ?assertMatch(true, lists:member(get_segment({o, "B1", "K1", null}), DL0)), DL1 = find_dirtyleaves(Tree3, Tree1), - ?assertMatch(true, lists:member(get_segment("K2"), DL1)), - ?assertMatch(true, lists:member(get_segment("K3"), DL1)), - ?assertMatch(false, lists:member(get_segment("K1"), DL1)). + ?assertMatch(true, lists:member(get_segment({o, "B1", "K2", null}), DL1)), + ?assertMatch(true, lists:member(get_segment({o, "B1", "K3", null}), DL1)), + ?assertMatch(false, lists:member(get_segment({o, "B1", "K1", null}), DL1)). 
merge_test() ->
     HashFun = fun(_K, V) -> erlang:phash2(V) end,
 
     TreeX0 = new_tree(0),
-    TreeX1 = add_kv(TreeX0, "X1", 1, HashFun),
-    TreeX2 = add_kv(TreeX1, "X2", 2, HashFun),
-    TreeX3 = add_kv(TreeX2, "X3", 3, HashFun),
-    TreeX4 = add_kv(TreeX3, "X3", 4, HashFun),
+    TreeX1 = add_kv(TreeX0, {o, "B1", "X1", null}, {caine, 1}, HashFun),
+    TreeX2 = add_kv(TreeX1, {o, "B1", "X2", null}, {caine, 2}, HashFun),
+    TreeX3 = add_kv(TreeX2, {o, "B1", "X3", null}, {caine, 3}, HashFun),
+    TreeX4 = add_kv(TreeX3, {o, "B1", "X3", null}, {caine, 4}, HashFun),
 
     TreeY0 = new_tree(0),
-    TreeY1 = add_kv(TreeY0, "Y1", 101, HashFun),
-    TreeY2 = add_kv(TreeY1, "Y2", 102, HashFun),
-    TreeY3 = add_kv(TreeY2, "Y3", 103, HashFun),
-    TreeY4 = add_kv(TreeY3, "Y3", 104, HashFun),
+    TreeY1 = add_kv(TreeY0, {o, "B1", "Y1", null}, {caine, 101}, HashFun),
+    TreeY2 = add_kv(TreeY1, {o, "B1", "Y2", null}, {caine, 102}, HashFun),
+    TreeY3 = add_kv(TreeY2, {o, "B1", "Y3", null}, {caine, 103}, HashFun),
+    TreeY4 = add_kv(TreeY3, {o, "B1", "Y3", null}, {caine, 104}, HashFun),
 
-    TreeZ1 = add_kv(TreeX4, "Y1", 101, HashFun),
-    TreeZ2 = add_kv(TreeZ1, "Y2", 102, HashFun),
-    TreeZ3 = add_kv(TreeZ2, "Y3", 103, HashFun),
-    TreeZ4 = add_kv(TreeZ3, "Y3", 104, HashFun),
+    TreeZ1 = add_kv(TreeX4, {o, "B1", "Y1", null}, {caine, 101}, HashFun),
+    TreeZ2 = add_kv(TreeZ1, {o, "B1", "Y2", null}, {caine, 102}, HashFun),
+    TreeZ3 = add_kv(TreeZ2, {o, "B1", "Y3", null}, {caine, 103}, HashFun),
+    TreeZ4 = add_kv(TreeZ3, {o, "B1", "Y3", null}, {caine, 104}, HashFun),
 
     TreeM0 = merge_trees(TreeX4, TreeY4),
     ?assertMatch(true, TreeM0#tictactree.level1 == TreeZ4#tictactree.level1),
 

From c586b78f45254a30cfbf2f164df88e351e4b659c Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Mon, 19 Jun 2017 11:36:57 +0100
Subject: [PATCH 06/58] Initial code with busted ct test

Initial comparison made between trees externally - but the ct test is
bust.
---
 src/leveled_bookie.erl           | 166 +++++++++++++++++++++++++++----
 src/leveled_codec.erl            |   4 +-
 src/leveled_tictac.erl           |  17 +---
 test/end_to_end/testutil.erl     |  21 ++--
 test/end_to_end/tictac_SUITE.erl |  69 +++++++++++++
 5 files changed, 238 insertions(+), 39 deletions(-)
 create mode 100644 test/end_to_end/tictac_SUITE.erl

diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl
index 919c970..f1bca92 100644
--- a/src/leveled_bookie.erl
+++ b/src/leveled_bookie.erl
@@ -310,6 +310,18 @@ book_head(Pid, Bucket, Key, Tag) ->
 %% bucket
 %% {hashlist_query, Tag, JournalCheck} -> return keys and hashes for all
 %% objects with a given tag
+%% {tictactree_idx, {Bucket, IdxField, StartValue, EndValue}, PartitionFilter}
+%% -> compile a hashtree for the items on the index. A partition filter is
+%% required to avoid adding an index entry in this vnode as a fallback.
+%% There is no de-duplication of results; duplicate results corrupt the tree.
+%% {tictactree_obj, {Bucket, StartKey, EndKey, CheckPresence}, PartitionFilter}
+%% -> compile a hashtree for all the objects in the range. A partition filter
+%% may be passed to restrict the query to a given partition on this vnode. The
+%% filter should be a function that takes (Bucket, Key) as inputs and outputs
+%% one of the atoms accumulate or pass. There is no de-duplication of results;
+%% duplicate results corrupt the tree.
+%% CheckPresence can be used if there is a need to do a deeper check to ensure
+%% that the object is in the Journal (or at least indexed within the Journal).
%% {foldobjects_bybucket, Tag, Bucket, FoldObjectsFun} -> fold over all objects %% in a given bucket %% {foldobjects_byindex, @@ -535,6 +547,28 @@ handle_call({return_folder, FolderType}, _From, State) -> {reply, hashlist_query(State, Tag, JournalCheck), State}; + {tictactree_obj, + {Tag, Bucket, StartKey, EndKey, CheckPresence}, + PartitionFilter} -> + {reply, + tictactree(State, + Tag, + Bucket, + {StartKey, EndKey}, + CheckPresence, + PartitionFilter), + State}; + {tictactree_idx, + {Bucket, IdxField, StartValue, EndValue}, + PartitionFilter} -> + {reply, + tictactree(State, + ?IDX_TAG, + Bucket, + {IdxField, StartValue, EndValue}, + false, + PartitionFilter), + State}; {foldheads_allkeys, Tag, FoldHeadsFun} -> {reply, foldheads_allkeys(State, Tag, FoldHeadsFun), @@ -848,6 +882,76 @@ hashlist_query(State, Tag, JournalCheck) -> end, {async, Folder}. +tictactree(State, Tag, Bucket, Query, JournalCheck, Filter) -> + % Journal check can be used for object key folds to confirm that the + % object is still indexed within the journal + SnapType = case JournalCheck of + false -> + ledger; + check_presence -> + store + end, + {ok, LedgerSnapshot, JournalSnapshot} = snapshot_store(State, + SnapType, + no_lookup), + Tree = leveled_tictac:new_tree(temp), + Folder = + fun() -> + % The start key and end key will vary depending on whether the + % fold is to fold over an index or a key range + {StartKey, EndKey, HashFun} = + case Tag of + ?IDX_TAG -> + {IdxField, StartIdx, EndIdx} = Query, + HashIdxValFun = + fun(_Key, IdxValue) -> + erlang:phash2(IdxValue) + end, + {leveled_codec:to_ledgerkey(Bucket, + null, + ?IDX_TAG, + IdxField, + StartIdx), + leveled_codec:to_ledgerkey(Bucket, + null, + ?IDX_TAG, + IdxField, + EndIdx), + HashIdxValFun}; + _ -> + {StartObjKey, EndObjKey} = Query, + PassHashFun = fun(_Key, Hash) -> Hash end, + {leveled_codec:to_ledgerkey(Bucket, + StartObjKey, + Tag), + leveled_codec:to_ledgerkey(Bucket, + EndObjKey, + Tag), + PassHashFun} + end, + + AccFun = accumulate_tree(Filter, + JournalCheck, + JournalSnapshot, + HashFun), + Acc = leveled_penciller:pcl_fetchkeys(LedgerSnapshot, + StartKey, + EndKey, + AccFun, + Tree), + + % Close down snapshot when complete so as not to hold removed + % files open + ok = leveled_penciller:pcl_close(LedgerSnapshot), + case JournalCheck of + false -> + ok; + check_presence -> + leveled_inker:ink_close(JournalSnapshot) + end, + Acc + end, + {async, Folder}. foldobjects_allkeys(State, Tag, FoldObjectsFun) -> StartKey = leveled_codec:to_ledgerkey(null, null, Tag), @@ -1088,27 +1192,51 @@ accumulate_size() -> AccFun. accumulate_hashes(JournalCheck, InkerClone) -> + AddKeyFun = + fun(B, K, H, Acc) -> + [{B, K, H}|Acc] + end, + get_hashaccumulator(JournalCheck, + InkerClone, + AddKeyFun). + +accumulate_tree(FilterFun, JournalCheck, InkerClone, HashFun) -> + AddKeyFun = + fun(B, K, H, Tree) -> + case FilterFun(B, K) of + accumulate -> + leveled_tictac:add_kv(Tree, K, H, HashFun); + pass -> + Tree + end + end, + get_hashaccumulator(JournalCheck, + InkerClone, + AddKeyFun). 
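+
+%% A sketch of how a client drives the new tictactree fold (this mirrors the
+%% usage in tictac_SUITE below; Bookie is assumed to be a started bookie
+%% instance, and the filter here accumulates everything):
+%%     Q = {tictactree_obj,
+%%             {o_rkv, "Bucket", null, null, false},
+%%             fun(_B, _K) -> accumulate end},
+%%     {async, Folder} = leveled_bookie:book_returnfolder(Bookie, Q),
+%%     TicTacTree = Folder().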
+ +get_hashaccumulator(JournalCheck, InkerClone, AddKeyFun) -> Now = leveled_codec:integer_now(), - AccFun = fun(LK, V, KHList) -> - case leveled_codec:is_active(LK, V, Now) of - true -> - {B, K, H} = leveled_codec:get_keyandhash(LK, V), - Check = random:uniform() < ?CHECKJOURNAL_PROB, - case {JournalCheck, Check} of - {check_presence, true} -> - case check_presence(LK, V, InkerClone) of - true -> - [{B, K, H}|KHList]; - false -> - KHList - end; - _ -> - [{B, K, H}|KHList] + AccFun = + fun(LK, V, Acc) -> + case leveled_codec:is_active(LK, V, Now) of + true -> + {B, K, H} = leveled_codec:get_keyandhash(LK, V), + Check = random:uniform() < ?CHECKJOURNAL_PROB, + case {JournalCheck, Check} of + {check_presence, true} -> + case check_presence(LK, V, InkerClone) of + true -> + AddKeyFun(B, K, H, Acc); + false -> + Acc end; - false -> - KHList - end - end, + _ -> + AddKeyFun(B, K, H, Acc) + end; + false -> + Acc + end + end, AccFun. diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl index 9179687..42dcb28 100644 --- a/src/leveled_codec.erl +++ b/src/leveled_codec.erl @@ -427,7 +427,9 @@ get_keyandhash(LK, Value) -> {Bucket, Key, Hash}; ?STD_TAG -> {Hash, _Size} = MD, - {Bucket, Key, Hash} + {Bucket, Key, Hash}; + ?IDX_TAG -> + from_ledgerkey(LK) % returns {Bucket, Key, IdxValue} end. diff --git a/src/leveled_tictac.erl b/src/leveled_tictac.erl index 4155807..3463a81 100644 --- a/src/leveled_tictac.erl +++ b/src/leveled_tictac.erl @@ -52,8 +52,6 @@ -module(leveled_tictac). -% -behaviour(gen_server). - -include("include/leveled.hrl"). -export([ @@ -67,7 +65,6 @@ ]). - -include_lib("eunit/include/eunit.hrl"). -define(LEVEL1_WIDTH, 256). @@ -78,17 +75,11 @@ -record(tictactree, {treeID :: any(), level1 :: binary(), - level2 :: array:array()}). - + level2 :: any() % an array - but OTP compatibility + }). -type tictactree() :: #tictactree{}. -%%%============================================================================ -%%% API -%%%============================================================================ - - - %%%============================================================================ %%% External functions %%%============================================================================ @@ -192,8 +183,8 @@ fetch_leaves(TicTacTree, BranchList) -> merge_trees(TreeA, TreeB) -> MergedTree = new_tree(merge), - L1A = TreeA#tictactree.level1, - L1B = TreeB#tictactree.level1, + L1A = fetch_root(TreeA), + L1B = fetch_root(TreeB), NewLevel1 = merge_binaries(L1A, L1B), MergeFun = diff --git a/test/end_to_end/testutil.erl b/test/end_to_end/testutil.erl index 923b81e..01bd26d 100644 --- a/test/end_to_end/testutil.erl +++ b/test/end_to_end/testutil.erl @@ -31,6 +31,7 @@ get_randomindexes_generator/1, name_list/0, load_objects/5, + load_objects/6, put_indexed_objects/3, put_altered_indexed_objects/3, put_altered_indexed_objects/4, @@ -52,6 +53,7 @@ -define(MD_LASTMOD, <<"X-Riak-Last-Modified">>). -define(MD_DELETED, <<"X-Riak-Deleted">>). -define(EMPTY_VTAG_BIN, <<"e">>). +-define(ROOT_PATH, "test"). %% ================================================= %% From riak_object @@ -169,13 +171,17 @@ riakload(Bookie, ObjectList) -> reset_filestructure() -> - reset_filestructure(0). + reset_filestructure(0, ?ROOT_PATH). -reset_filestructure(Wait) -> - io:format("Waiting ~w ms to give a chance for all file closes " ++ +reset_filestructure(Wait) when is_integer(Wait) -> + reset_filestructure(Wait, ?ROOT_PATH); +reset_filestructure(RootPath) when is_list(RootPath) -> + reset_filestructure(0, RootPath). 
+ +reset_filestructure(Wait, RootPath) -> + io:format("Waiting ~w ms to give a chance for all file closes " ++ "to complete~n", [Wait]), - timer:sleep(Wait), - RootPath = "test", + timer:sleep(Wait), filelib:ensure_dir(RootPath ++ "/journal/"), filelib:ensure_dir(RootPath ++ "/ledger/"), leveled_inker:clean_testdir(RootPath ++ "/journal"), @@ -420,6 +426,9 @@ get_vclock(ObjectBin) -> binary_to_term(VclockBin). load_objects(ChunkSize, GenList, Bookie, TestObject, Generator) -> + load_objects(ChunkSize, GenList, Bookie, TestObject, Generator, 1000). + +load_objects(ChunkSize, GenList, Bookie, TestObject, Generator, SubListL) -> lists:map(fun(KN) -> ObjListA = Generator(ChunkSize, KN), StartWatchA = os:timestamp(), @@ -433,7 +442,7 @@ load_objects(ChunkSize, GenList, Bookie, TestObject, Generator) -> true -> check_forobject(Bookie, TestObject) end, - lists:sublist(ObjListA, 1000) end, + lists:sublist(ObjListA, SubListL) end, GenList). diff --git a/test/end_to_end/tictac_SUITE.erl b/test/end_to_end/tictac_SUITE.erl new file mode 100644 index 0000000..88ff214 --- /dev/null +++ b/test/end_to_end/tictac_SUITE.erl @@ -0,0 +1,69 @@ +-module(tictac_SUITE). +-include_lib("common_test/include/ct.hrl"). +-include("include/leveled.hrl"). +-export([all/0]). +-export([ + many_put_compare/1 + ]). + +all() -> [ + many_put_compare + ]. + + +many_put_compare(_Config) -> + RootPathA = testutil:reset_filestructure("testA"), + StartOpts1 = [{root_path, RootPathA}, + {max_pencillercachesize, 16000}, + {sync_strategy, riak_sync}], + {ok, Bookie1} = leveled_bookie:book_start(StartOpts1), + {TestObject, TestSpec} = testutil:generate_testobject(), + ok = testutil:book_riakput(Bookie1, TestObject, TestSpec), + testutil:check_forobject(Bookie1, TestObject), + ok = leveled_bookie:book_close(Bookie1), + StartOpts2 = [{root_path, RootPathA}, + {max_journalsize, 500000000}, + {max_pencillercachesize, 32000}, + {sync_strategy, testutil:sync_strategy()}], + {ok, Bookie2} = leveled_bookie:book_start(StartOpts2), + testutil:check_forobject(Bookie2, TestObject), + GenList = [2, 20002, 40002, 60002, 80002, + 100002, 120002, 140002, 160002, 180002], + CLs = testutil:load_objects(20000, + GenList, + Bookie2, + TestObject, + fun testutil:generate_smallobjects/2, + 20000), + + RootPathB = testutil:reset_filestructure("testB"), + StartOpts3 = [{root_path, RootPathB}, + {max_journalsize, 200000000}, + {max_pencillercachesize, 16000}, + {sync_strategy, testutil:sync_strategy()}], + {ok, Bookie3} = leveled_bookie:book_start(StartOpts3), + lists:foreach(fun(ObjL) -> testutil:riakload(Bookie3, ObjL) end, CLs), + + TicTacQ = {tictactree_obj, + {o_rkv, "Bucket", null, null, false}, + fun(_B, _K) -> accumulate end}, + {async, TreeAFolder} = leveled_bookie:book_returnfolder(Bookie2, TicTacQ), + {async, TreeBFolder} = leveled_bookie:book_returnfolder(Bookie3, TicTacQ), + SWA0 = os:timestamp(), + TreeA = TreeAFolder(), + io:format("Build tictac tree with 200K objects in ~w~n", + [timer:now_diff(os:timestamp(), SWA0)]), + SWB0 = os:timestamp(), + TreeB = TreeBFolder(), + io:format("Build tictac tree with 200K objects in ~w~n", + [timer:now_diff(os:timestamp(), SWB0)]), + SWC0 = os:timestamp(), + SegList = leveled_tictac:find_dirtyleaves(TreeA, TreeB), + io:format("Compare tictac trees with 200K objects in ~w~n", + [timer:now_diff(os:timestamp(), SWC0)]), + io:format("Tree comparison shows ~w different leaves~n", + [length(SegList)]), + true = length(SegList) == 1, + + ok = leveled_bookie:book_destroy(Bookie2), + ok = 
leveled_bookie:book_destroy(Bookie3).

From 833c7a80cb1e9f4e03f6d49c13d7e9af6c0d3fc4 Mon Sep 17 00:00:00 2001
From: Martin Sumner
Date: Mon, 19 Jun 2017 13:11:43 +0100
Subject: [PATCH 07/58] corrected test

The differing object was in the wrong bucket
---
 test/end_to_end/tictac_SUITE.erl | 31 +++++++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/test/end_to_end/tictac_SUITE.erl b/test/end_to_end/tictac_SUITE.erl
index 88ff214..c502275 100644
--- a/test/end_to_end/tictac_SUITE.erl
+++ b/test/end_to_end/tictac_SUITE.erl
@@ -17,7 +17,12 @@ many_put_compare(_Config) ->
         {max_pencillercachesize, 16000},
         {sync_strategy, riak_sync}],
     {ok, Bookie1} = leveled_bookie:book_start(StartOpts1),
-    {TestObject, TestSpec} = testutil:generate_testobject(),
+    {B1, K1, V1, S1, MD} = {"Bucket",
+                            "Key1.1.4567.4321",
+                            "Value1",
+                            [],
+                            [{"MDK1", "MDV1"}]},
+    {TestObject, TestSpec} = testutil:generate_testobject(B1, K1, V1, S1, MD),
     ok = testutil:book_riakput(Bookie1, TestObject, TestSpec),
     testutil:check_forobject(Bookie1, TestObject),
     ok = leveled_bookie:book_close(Bookie1),
@@ -63,7 +68,25 @@ many_put_compare(_Config) ->
         [timer:now_diff(os:timestamp(), SWC0)]),
     io:format("Tree comparison shows ~w different leaves~n",
         [length(SegList)]),
+    AltList = leveled_tictac:find_dirtyleaves(TreeA,
+                                              leveled_tictac:new_tree(0)),
+    io:format("Tree comparison shows ~w altered leaves~n",
+        [length(AltList)]),
     true = length(SegList) == 1,
+    % only the test object should be different
+    true = length(AltList) > 100000,
+    % check there are a significant number of differences from empty
 
-    ok = leveled_bookie:book_destroy(Bookie2),
-    ok = leveled_bookie:book_destroy(Bookie3).
+    testutil:book_riakdelete(Bookie2, B1, K1, []),
+    {async, TreeAFolder0} = leveled_bookie:book_returnfolder(Bookie2, TicTacQ),
+    SWA1 = os:timestamp(),
+    TreeA0 = TreeAFolder0(),
+    io:format("Build tictac tree with 200K objects in ~w~n",
+        [timer:now_diff(os:timestamp(), SWA1)]),
+
+    SegList0 = leveled_tictac:find_dirtyleaves(TreeA0, TreeB),
+    true = length(SegList0) == 0,
+    % Removed test object so tictac trees should match
+
+    ok = leveled_bookie:book_close(Bookie2),
+    ok = leveled_bookie:book_close(Bookie3).

From 8203487a11feb07535d9bf2ff0fb56de681f4f9e Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Mon, 19 Jun 2017 15:43:19 +0100
Subject: [PATCH 08/58] Expanded test

ct testing of tictac trees now compares between differently partitioned
stores.
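The check leans on the XOR construction: trees built over disjoint
partitions of a keyspace, once merged, should equal a single tree built
over the whole keyspace. A minimal sketch of that property (KVL, SplitFun
and HashFun here are illustrative placeholders, not code from this patch):

    AddFun = fun({K, V}, T) -> leveled_tictac:add_kv(T, K, V, HashFun) end,
    TreeAll = lists:foldl(AddFun, leveled_tictac:new_tree(all), KVL),
    {KVL1, KVL2} = lists:partition(SplitFun, KVL),
    TreeP1 = lists:foldl(AddFun, leveled_tictac:new_tree(p1), KVL1),
    TreeP2 = lists:foldl(AddFun, leveled_tictac:new_tree(p2), KVL2),
    Merged = leveled_tictac:merge_trees(TreeP1, TreeP2),
    [] = leveled_tictac:find_dirtyleaves(Merged, TreeAll).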
---
 test/end_to_end/tictac_SUITE.erl | 103 ++++++++++++++++++++++++++++---
 1 file changed, 96 insertions(+), 7 deletions(-)

diff --git a/test/end_to_end/tictac_SUITE.erl b/test/end_to_end/tictac_SUITE.erl
index c502275..819c8b1 100644
--- a/test/end_to_end/tictac_SUITE.erl
+++ b/test/end_to_end/tictac_SUITE.erl
@@ -12,7 +12,14 @@ all() -> [
 
 
 many_put_compare(_Config) ->
+    % Test requires multiple different databases, so want to mount them all
+    % on individual file paths
     RootPathA = testutil:reset_filestructure("testA"),
+    RootPathB = testutil:reset_filestructure("testB"),
+    RootPathC = testutil:reset_filestructure("testC"),
+    RootPathD = testutil:reset_filestructure("testD"),
+
+    % Start the first database, load a test object, close it, start it again
     StartOpts1 = [{root_path, RootPathA},
                     {max_pencillercachesize, 16000},
                     {sync_strategy, riak_sync}],
@@ -32,6 +39,11 @@ many_put_compare(_Config) ->
         {sync_strategy, testutil:sync_strategy()}],
     {ok, Bookie2} = leveled_bookie:book_start(StartOpts2),
     testutil:check_forobject(Bookie2, TestObject),
+
+    % Generate 200K objects to be used within the test, and load them into
+    % the first store (outputting the generated objects as a list of lists)
+    % to be used elsewhere
+
     GenList = [2, 20002, 40002, 60002, 80002,
                 100002, 120002, 140002, 160002, 180002],
     CLs = testutil:load_objects(20000,
                                 GenList,
                                 Bookie2,
                                 TestObject,
                                 fun testutil:generate_smallobjects/2,
                                 20000),
 
-    RootPathB = testutil:reset_filestructure("testB"),
+    % Start a new store, and load the same objects (except for the original
+    % test object) into this store
+
     StartOpts3 = [{root_path, RootPathB},
                     {max_journalsize, 200000000},
                     {max_pencillercachesize, 16000},
                     {sync_strategy, testutil:sync_strategy()}],
     {ok, Bookie3} = leveled_bookie:book_start(StartOpts3),
     lists:foreach(fun(ObjL) -> testutil:riakload(Bookie3, ObjL) end, CLs),
 
+    % Now run a tictac query against both stores to see the extent to which
+    % state between stores is consistent
+
     TicTacQ = {tictactree_obj,
                 {o_rkv, "Bucket", null, null, false},
                 fun(_B, _K) -> accumulate end},
@@ -63,20 +80,23 @@ many_put_compare(_Config) ->
     io:format("Build tictac tree with 200K objects in ~w~n",
        [timer:now_diff(os:timestamp(), SWB0)]),
     SWC0 = os:timestamp(),
-    SegList = leveled_tictac:find_dirtyleaves(TreeA, TreeB),
+    SegList0 = leveled_tictac:find_dirtyleaves(TreeA, TreeB),
     io:format("Compare tictac trees with 200K objects in ~w~n",
        [timer:now_diff(os:timestamp(), SWC0)]),
     io:format("Tree comparison shows ~w different leaves~n",
-       [length(SegList)]),
+       [length(SegList0)]),
     AltList = leveled_tictac:find_dirtyleaves(TreeA,
                                               leveled_tictac:new_tree(0)),
     io:format("Tree comparison shows ~w altered leaves~n",
        [length(AltList)]),
-    true = length(SegList) == 1,
+    true = length(SegList0) == 1,
     % only the test object should be different
     true = length(AltList) > 100000,
     % check there are a significant number of differences from empty
 
+    % Now remove the object which represents the difference between these
+    % stores and confirm that the tictac trees will now match
+
     testutil:book_riakdelete(Bookie2, B1, K1, []),
     {async, TreeAFolder0} = leveled_bookie:book_returnfolder(Bookie2, TicTacQ),
     SWA1 = os:timestamp(),
     TreeA0 = TreeAFolder0(),
     io:format("Build tictac tree with 200K objects in ~w~n",
        [timer:now_diff(os:timestamp(), SWA1)]),
 
-    SegList0 = leveled_tictac:find_dirtyleaves(TreeA0, TreeB),
-    true = length(SegList0) == 0,
+    SegList1 = leveled_tictac:find_dirtyleaves(TreeA0, TreeB),
+    io:format("Tree comparison 
following delete shows ~w different leaves~n",
+        [length(SegList1)]),
+    true = length(SegList1) == 0,
     % Removed test object so tictac trees should match
+
+    ok = testutil:book_riakput(Bookie3, TestObject, TestSpec),
+    {async, TreeBFolder0} = leveled_bookie:book_returnfolder(Bookie3, TicTacQ),
+    SWB1 = os:timestamp(),
+    TreeB0 = TreeBFolder0(),
+    io:format("Build tictac tree with 200K objects in ~w~n",
+        [timer:now_diff(os:timestamp(), SWB1)]),
+    SegList2 = leveled_tictac:find_dirtyleaves(TreeA0, TreeB0),
+    true = SegList2 == SegList0,
+    % There is an identical difference now that the difference is on Bookie3
+    % not Bookie2 (compared to it being in Bookie2 not Bookie3)
+    ok = leveled_bookie:book_close(Bookie3),
+
+    % Replace Bookie 3 with two stores Bookie 4 and Bookie 5 where the objects
+    % have been randomly split between the stores
+
+    StartOpts4 = [{root_path, RootPathC},
+                    {max_journalsize, 200000000},
+                    {max_pencillercachesize, 24000},
+                    {sync_strategy, testutil:sync_strategy()}],
+    {ok, Bookie4} = leveled_bookie:book_start(StartOpts4),
+    StartOpts5 = [{root_path, RootPathD},
+                    {max_journalsize, 200000000},
+                    {max_pencillercachesize, 24000},
+                    {sync_strategy, testutil:sync_strategy()}],
+    {ok, Bookie5} = leveled_bookie:book_start(StartOpts5),
+
+    SplitFun =
+        fun(Obj) ->
+            case erlang:phash2(Obj) rem 2 of
+                0 ->
+                    true;
+                1 ->
+                    false
+            end
+        end,
+    lists:foreach(fun(ObjL) ->
+                        {ObjLA, ObjLB} = lists:partition(SplitFun, ObjL),
+                        testutil:riakload(Bookie4, ObjLA),
+                        testutil:riakload(Bookie5, ObjLB)
+                    end,
+                    CLs),
+
+    % query both the stores, then merge the trees - the result should be the
+    % same as the result from the tree created against the store with both
+    % partitions
+
+    {async, TreeC0Folder} = leveled_bookie:book_returnfolder(Bookie4, TicTacQ),
+    {async, TreeC1Folder} = leveled_bookie:book_returnfolder(Bookie5, TicTacQ),
+    SWD0 = os:timestamp(),
+    TreeC0 = TreeC0Folder(),
+    io:format("Build tictac tree with 100K objects in ~w~n",
+        [timer:now_diff(os:timestamp(), SWD0)]),
+    SWD1 = os:timestamp(),
+    TreeC1 = TreeC1Folder(),
+    io:format("Build tictac tree with 100K objects in ~w~n",
+        [timer:now_diff(os:timestamp(), SWD1)]),
+
+    TreeC2 = leveled_tictac:merge_trees(TreeC0, TreeC1),
+    SegList3 = leveled_tictac:find_dirtyleaves(TreeC2, TreeB),
+    io:format("Tree comparison following merge shows ~w different leaves~n",
+        [length(SegList3)]),
+    true = length(SegList3) == 0,
+
 
     ok = leveled_bookie:book_close(Bookie2),
-    ok = leveled_bookie:book_close(Bookie3).
+    ok = leveled_bookie:book_close(Bookie4),
+    ok = leveled_bookie:book_close(Bookie5).

From d5b4cb844fb6d6bfe4727aa5792e0ee47c9d932c Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Mon, 19 Jun 2017 18:38:55 +0100
Subject: [PATCH 09/58] Finding keys

Progresses from a segment list to scanning for the keys in that segment
---
 src/leveled_tictac.erl           | 10 ++++++----
 test/end_to_end/tictac_SUITE.erl | 28 +++++++++++++++++++++++++++-
 2 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/src/leveled_tictac.erl b/src/leveled_tictac.erl
index 3463a81..b1e5846 100644
--- a/src/leveled_tictac.erl
+++ b/src/leveled_tictac.erl
@@ -61,7 +61,8 @@
     find_dirtysegments/2,
     fetch_root/1,
     fetch_leaves/2,
-    merge_trees/2
+    merge_trees/2,
+    get_segment/1
     ]).
 
@@ -200,6 +201,10 @@ merge_trees(TreeA, TreeB) ->
 
     MergedTree#tictactree{level1 = NewLevel1, level2 = NewLevel2}.
 
+get_segment(Key) ->
+    erlang:phash2(Key) band (?SEGMENT_COUNT - 1).
+
+
 %%%============================================================================
 %%% Internal functions
 %%%============================================================================
 
@@ -219,9 +224,6 @@ segmentcompare(SrcBin, SnkBin, Acc, Counter) ->
             segmentcompare(SrcTail, SnkTail, [Counter|Acc], Counter + 1)
     end.
 
-get_segment(Key) ->
-    erlang:phash2(Key) band (?SEGMENT_COUNT - 1).
-
 merge_binaries(BinA, BinB) ->
     BitSize = bit_size(BinA),
     BitSize = bit_size(BinB),
diff --git a/test/end_to_end/tictac_SUITE.erl b/test/end_to_end/tictac_SUITE.erl
index 819c8b1..6230dc9 100644
--- a/test/end_to_end/tictac_SUITE.erl
+++ b/test/end_to_end/tictac_SUITE.erl
@@ -92,7 +92,33 @@ many_put_compare(_Config) ->
     true = length(SegList0) == 1,
     % only the test object should be different
     true = length(AltList) > 100000,
     % check there are a significant number of differences from empty
+
+    FoldKeysFun =
+        fun(SegListToFind) ->
+            fun(_B, K, Acc) ->
+                Seg = leveled_tictac:get_segment(K),
+                case lists:member(Seg, SegListToFind) of
+                    true ->
+                        [K|Acc];
+                    false ->
+                        Acc
+                end
+            end
+        end,
+    SegQuery = {keylist, o_rkv, "Bucket", {FoldKeysFun(SegList0), []}},
+    {async, SegKeyFinder} =
+        leveled_bookie:book_returnfolder(Bookie2, SegQuery),
+    SWSKL0 = os:timestamp(),
+    SegKeyList = SegKeyFinder(),
+    io:format("Finding ~w keys in ~w dirty segments in ~w~n",
+                [length(SegKeyList),
+                    length(SegList0),
+                    timer:now_diff(os:timestamp(), SWSKL0)]),
+
+    true = length(SegKeyList) >= 1,
+    true = length(SegKeyList) < 10,
+    true = lists:member("Key1.1.4567.4321", SegKeyList),
 
     % Now remove the object which represents the difference between these
     % stores and confirm that the tictac trees will now match

From 7642aac2cc90f65b8e6bb327c755caf69788f01f Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Tue, 20 Jun 2017 10:58:13 +0100
Subject: [PATCH 10/58] Flexible TicTacTree sizes

Allow tictac tree sizes to be flexible.  Tested lots of different sizes.
Having both level 1 and level 2 the same size seemed to be consistently
quicker than trying to make either of the levels relatively wider.

There's an 8% performance improvement if the SegmentCount is reduced by
a quarter.
---
 src/leveled_bookie.erl           |  18 ++++--
 src/leveled_tictac.erl           | 103 +++++++++++++++++++++++--------
 test/end_to_end/tictac_SUITE.erl |   7 ++-
 3 files changed, 95 insertions(+), 33 deletions(-)

diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl
index f1bca92..cb15133 100644
--- a/src/leveled_bookie.erl
+++ b/src/leveled_bookie.erl
@@ -310,11 +310,17 @@ book_head(Pid, Bucket, Key, Tag) ->
 %% bucket
 %% {hashlist_query, Tag, JournalCheck} -> return keys and hashes for all
 %% objects with a given tag
-%% {tictactree_idx, {Bucket, IdxField, StartValue, EndValue}, PartitionFilter}
+%% {tictactree_idx,
+%%        {Bucket, IdxField, StartValue, EndValue},
+%%        TreeSize,
+%%        PartitionFilter}
 %% -> compile a hashtree for the items on the index. A partition filter is
 %% required to avoid adding an index entry in this vnode as a fallback.
 %% There is no de-duplication of results; duplicate results corrupt the tree.
-%% {tictactree_obj, {Bucket, StartKey, EndKey, CheckPresence}, PartitionFilter}
+%% {tictactree_obj,
+%%        {Bucket, StartKey, EndKey, CheckPresence},
+%%        TreeSize,
+%%        PartitionFilter}
 %% -> compile a hashtree for all the objects in the range. A partition filter
 %% may be passed to restrict the query to a given partition on this vnode.
The
 %% filter should be a function that takes (Bucket, Key) as inputs and outputs
 %% one of the atoms accumulate or pass. There is no de-duplication of results;
 %% duplicate results corrupt the tree.
 %% CheckPresence can be used if there is a need to do a deeper check to ensure
 %% that the object is in the Journal (or at least indexed within the Journal).
@@ -549,6 +555,7 @@ handle_call({return_folder, FolderType}, _From, State) ->
                 State};
         {tictactree_obj,
             {Tag, Bucket, StartKey, EndKey, CheckPresence},
+            TreeSize,
             PartitionFilter} ->
             {reply,
                 tictactree(State,
                             Tag,
                             Bucket,
                             {StartKey, EndKey},
                             CheckPresence,
+                            TreeSize,
                             PartitionFilter),
                 State};
         {tictactree_idx,
             {Bucket, IdxField, StartValue, EndValue},
+            TreeSize,
             PartitionFilter} ->
             {reply,
                 tictactree(State,
                             ?IDX_TAG,
                             Bucket,
                             {IdxField, StartValue, EndValue},
                             false,
+                            TreeSize,
                             PartitionFilter),
                 State};
         {foldheads_allkeys, Tag, FoldHeadsFun} ->
             {reply,
                 foldheads_allkeys(State, Tag, FoldHeadsFun),
@@ -882,7 +892,7 @@ hashlist_query(State, Tag, JournalCheck) ->
     end,
     {async, Folder}.
 
-tictactree(State, Tag, Bucket, Query, JournalCheck, Filter) ->
+tictactree(State, Tag, Bucket, Query, JournalCheck, TreeSize, Filter) ->
     % Journal check can be used for object key folds to confirm that the
     % object is still indexed within the journal
     SnapType = case JournalCheck of
                     false ->
                         ledger;
@@ -894,7 +904,7 @@ tictactree(State, Tag, Bucket, Query, JournalCheck, TreeSize, Filter) ->
     {ok, LedgerSnapshot, JournalSnapshot} = snapshot_store(State,
                                                             SnapType,
                                                             no_lookup),
-    Tree = leveled_tictac:new_tree(temp),
+    Tree = leveled_tictac:new_tree(temp, TreeSize),
     Folder =
         fun() ->
             % The start key and end key will vary depending on whether the
diff --git a/src/leveled_tictac.erl b/src/leveled_tictac.erl
index b1e5846..b14642a 100644
--- a/src/leveled_tictac.erl
+++ b/src/leveled_tictac.erl
@@ -56,25 +56,30 @@
 
 -export([
     new_tree/1,
+    new_tree/2,
     add_kv/4,
     find_dirtyleaves/2,
     find_dirtysegments/2,
     fetch_root/1,
     fetch_leaves/2,
     merge_trees/2,
-    get_segment/1
+    get_segment/2
     ]).
 
 -include_lib("eunit/include/eunit.hrl").
 
--define(LEVEL1_WIDTH, 256).
--define(LEVEL2_WIDTH, 4096).
--define(LEVEL2_BITWIDTH, 12).
--define(SEGMENT_COUNT, ?LEVEL1_WIDTH * ?LEVEL2_WIDTH).
 -define(HASH_SIZE, 4).
+-define(SMALL, {8, 256, 256 * 256}).
+-define(MEDIUM, {9, 512, 512 * 512}).
+-define(LARGE, {10, 1024, 1024 * 1024}).
+-define(XLARGE, {11, 2048, 2048 * 2048}).
 
 -record(tictactree, {treeID :: any(),
+                        size :: small|medium|large|xlarge,
+                        width :: integer(),
+                        bitwidth :: integer(),
+                        segment_count :: integer(),
                         level1 :: binary(),
                         level2 :: any() % an array - but OTP compatibility
                         }).
@@ -89,12 +94,32 @@
 %% @doc
 %% Create a new tree, zeroed out.
 new_tree(TreeID) ->
-    Lv1Width = ?LEVEL1_WIDTH * ?HASH_SIZE * 8,
+    new_tree(TreeID, small).
+
+new_tree(TreeID, Size) ->
+    {BitWidth, Width, SegmentCount} =
+        case Size of
+            small ->
+                ?SMALL;
+            medium ->
+                ?MEDIUM;
+            large ->
+                ?LARGE;
+            xlarge ->
+                ?XLARGE
+        end,
+    Lv1Width = Width * ?HASH_SIZE * 8,
     Lv1Init = <<0:Lv1Width/integer>>,
-    Lv2SegBinSize = ?LEVEL2_WIDTH * ?HASH_SIZE * 8,
+    Lv2SegBinSize = Width * ?HASH_SIZE * 8,
     Lv2SegBinInit = <<0:Lv2SegBinSize/integer>>,
-    Lv2Init = array:new([{size, ?LEVEL1_WIDTH}, {default, Lv2SegBinInit}]),
-    #tictactree{treeID = TreeID, level1 = Lv1Init, level2 = Lv2Init}.
+    Lv2Init = array:new([{size, Width}, {default, Lv2SegBinInit}]),
+    #tictactree{treeID = TreeID,
+                size = Size,
+                width = Width,
+                bitwidth = BitWidth,
+                segment_count = SegmentCount,
+                level1 = Lv1Init,
+                level2 = Lv2Init}.
 
 -spec add_kv(tictactree(), tuple(), tuple(), fun()) -> tictactree().
%% @doc
 %% Add a Key and value to a tictactree using the HashFun to calculate the Hash
 %% based on that key and value.
 add_kv(TicTacTree, Key, Value, HashFun) ->
     HashV = HashFun(Key, Value),
     SegChangeHash = erlang:phash2(Key, HashV),
-    Segment = get_segment(Key),
+    Segment = get_segment(Key, TicTacTree#tictactree.segment_count),
 
-    Level2Pos = Segment band (?LEVEL2_WIDTH - 1),
-    Level1Pos = (Segment bsr ?LEVEL2_BITWIDTH) band (?LEVEL1_WIDTH - 1),
+    Level2Pos =
+        Segment band (TicTacTree#tictactree.width - 1),
+    Level1Pos =
+        (Segment bsr TicTacTree#tictactree.bitwidth)
+            band (TicTacTree#tictactree.width - 1),
     Level2BytePos = ?HASH_SIZE * Level2Pos,
     Level1BytePos = ?HASH_SIZE * Level1Pos,
 
@@ -139,6 +167,10 @@ add_kv(TicTacTree, Key, Value, HashFun) ->
 %% Returns a list of segment IDs which hold differences between the state
 %% represented by the two trees.
 find_dirtyleaves(SrcTree, SnkTree) ->
+    _Size = SrcTree#tictactree.size,
+    _Size = SnkTree#tictactree.size,
+    Width = SrcTree#tictactree.width,
+
     IdxList = find_dirtysegments(fetch_root(SrcTree), fetch_root(SnkTree)),
     SrcLeaves = fetch_leaves(SrcTree, IdxList),
     SnkLeaves = fetch_leaves(SnkTree, IdxList),
@@ -148,7 +180,7 @@ find_dirtyleaves(SrcTree, SnkTree) ->
             {Idx, SrcLeaf} = lists:keyfind(Idx, 1, SrcLeaves),
             {Idx, SnkLeaf} = lists:keyfind(Idx, 1, SnkLeaves),
             L2IdxList = segmentcompare(SrcLeaf, SnkLeaf),
-            Acc ++ lists:map(fun(X) -> X + Idx * ?LEVEL2_WIDTH end, L2IdxList)
+            Acc ++ lists:map(fun(X) -> X + Idx * Width end, L2IdxList)
         end,
     lists:sort(lists:foldl(FoldFun, [], IdxList)).
@@ -182,7 +214,10 @@ fetch_leaves(TicTacTree, BranchList) ->
 %% and value has been added to both trees, then the merge will not give the
 %% expected outcome.
 merge_trees(TreeA, TreeB) ->
-    MergedTree = new_tree(merge),
+    Size = TreeA#tictactree.size,
+    Size = TreeB#tictactree.size,
+
+    MergedTree = new_tree(merge, Size),
 
     L1A = fetch_root(TreeA),
     L1B = fetch_root(TreeB),
@@ -197,12 +232,12 @@ merge_trees(TreeA, TreeB) ->
         end,
     NewLevel2 = lists:foldl(MergeFun,
                             MergedTree#tictactree.level2,
-                            lists:seq(0, ?LEVEL1_WIDTH - 1)),
+                            lists:seq(0, MergedTree#tictactree.width - 1)),
 
     MergedTree#tictactree{level1 = NewLevel1, level2 = NewLevel2}.
 
-get_segment(Key) ->
-    erlang:phash2(Key) band (?SEGMENT_COUNT - 1).
+get_segment(Key, SegmentCount) ->
+    erlang:phash2(Key) band (SegmentCount - 1).
 
 
 %%%============================================================================
@@ -239,10 +274,16 @@
 
 -ifdef(TEST).
 
-simple_test() ->
+simple_bysize_test() ->
+    simple_test_withsize(small),
+    simple_test_withsize(medium),
+    simple_test_withsize(large),
+    simple_test_withsize(xlarge).
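+
+%% The size names map to {BitWidth, Width, SegmentCount} tuples in which
+%% SegmentCount = Width * Width.  As an illustrative sketch of the resulting
+%% arithmetic for small (so Width = 256, BitWidth = 8):
+%%     Segment = erlang:phash2(Key) band (256 * 256 - 1),
+%%     Level2Pos = Segment band (256 - 1),
+%%     Level1Pos = (Segment bsr 8) band (256 - 1).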
+ +simple_test_withsize(Size) -> HashFun = fun(_K, V) -> erlang:phash2(V) end, - Tree0 = new_tree(0), + Tree0 = new_tree(0, Size), Tree1 = add_kv(Tree0, {o, "B1", "K1", null}, {caine, 1}, HashFun), Tree2 = add_kv(Tree1, {o, "B1", "K2", null}, {caine, 2}, HashFun), Tree3 = add_kv(Tree2, {o, "B1", "K3", null}, {caine, 3}, HashFun), @@ -253,7 +294,7 @@ simple_test() -> ?assertMatch(false, Tree2#tictactree.level1 == Tree3#tictactree.level1), ?assertMatch(false, Tree3#tictactree.level1 == Tree3A#tictactree.level1), - Tree0X = new_tree(0), + Tree0X = new_tree(0, Size), Tree1X = add_kv(Tree0X, {o, "B1", "K3", null}, {caine, 3}, HashFun), Tree2X = add_kv(Tree1X, {o, "B1", "K1", null}, {caine, 1}, HashFun), Tree3X = add_kv(Tree2X, {o, "B1", "K2", null}, {caine, 2}, HashFun), @@ -263,23 +304,31 @@ simple_test() -> ?assertMatch(true, Tree3#tictactree.level1 == Tree3X#tictactree.level1), ?assertMatch(true, Tree3XA#tictactree.level1 == Tree3XA#tictactree.level1), + SC = Tree0#tictactree.segment_count, + DL0 = find_dirtyleaves(Tree1, Tree0), - ?assertMatch(true, lists:member(get_segment({o, "B1", "K1", null}), DL0)), + ?assertMatch(true, lists:member(get_segment({o, "B1", "K1", null}, SC), DL0)), DL1 = find_dirtyleaves(Tree3, Tree1), - ?assertMatch(true, lists:member(get_segment({o, "B1", "K2", null}), DL1)), - ?assertMatch(true, lists:member(get_segment({o, "B1", "K3", null}), DL1)), - ?assertMatch(false, lists:member(get_segment({o, "B1", "K1", null}), DL1)). + ?assertMatch(true, lists:member(get_segment({o, "B1", "K2", null}, SC), DL1)), + ?assertMatch(true, lists:member(get_segment({o, "B1", "K3", null}, SC), DL1)), + ?assertMatch(false, lists:member(get_segment({o, "B1", "K1", null}, SC), DL1)). -merge_test() -> +merge_bysize_test() -> + merge_test_withsize(small), + merge_test_withsize(medium), + merge_test_withsize(large), + merge_test_withsize(xlarge). 
+ +merge_test_withsize(Size) -> HashFun = fun(_K, V) -> erlang:phash2(V) end, - TreeX0 = new_tree(0), + TreeX0 = new_tree(0, Size), TreeX1 = add_kv(TreeX0, {o, "B1", "X1", null}, {caine, 1}, HashFun), TreeX2 = add_kv(TreeX1, {o, "B1", "X2", null}, {caine, 2}, HashFun), TreeX3 = add_kv(TreeX2, {o, "B1", "X3", null}, {caine, 3}, HashFun), TreeX4 = add_kv(TreeX3, {o, "B1", "X3", null}, {caine, 4}, HashFun), - TreeY0 = new_tree(0), + TreeY0 = new_tree(0, Size), TreeY1 = add_kv(TreeY0, {o, "B1", "Y1", null}, {caine, 101}, HashFun), TreeY2 = add_kv(TreeY1, {o, "B1", "Y2", null}, {caine, 102}, HashFun), TreeY3 = add_kv(TreeY2, {o, "B1", "Y3", null}, {caine, 103}, HashFun), diff --git a/test/end_to_end/tictac_SUITE.erl b/test/end_to_end/tictac_SUITE.erl index 6230dc9..bad08eb 100644 --- a/test/end_to_end/tictac_SUITE.erl +++ b/test/end_to_end/tictac_SUITE.erl @@ -12,6 +12,8 @@ all() -> [ many_put_compare(_Config) -> + TreeSize = small, + SegmentCount = 256 * 256, % Test requires multiple different databases, so want to mount them all % on individual file paths RootPathA = testutil:reset_filestructure("testA"), @@ -68,6 +70,7 @@ many_put_compare(_Config) -> TicTacQ = {tictactree_obj, {o_rkv, "Bucket", null, null, false}, + TreeSize, fun(_B, _K) -> accumulate end}, {async, TreeAFolder} = leveled_bookie:book_returnfolder(Bookie2, TicTacQ), {async, TreeBFolder} = leveled_bookie:book_returnfolder(Bookie3, TicTacQ), @@ -91,13 +94,13 @@ many_put_compare(_Config) -> [length(AltList)]), true = length(SegList0) == 1, % only the test object should be different - true = length(AltList) > 100000, + true = length(AltList) > 10000, % check there are a significant number of differences from empty FoldKeysFun = fun(SegListToFind) -> fun(_B, K, Acc) -> - Seg = leveled_tictac:get_segment(K), + Seg = leveled_tictac:get_segment(K, SegmentCount), case lists:member(Seg, SegListToFind) of true -> [K|Acc]; From 5a012ff8a632cf15e62783f2ba2badad6a79ceb6 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Thu, 22 Jun 2017 13:54:51 +0100 Subject: [PATCH 11/58] Add test of index comparison Compare two indexes for consistency --- test/end_to_end/tictac_SUITE.erl | 116 ++++++++++++++++++++++++++++++- 1 file changed, 114 insertions(+), 2 deletions(-) diff --git a/test/end_to_end/tictac_SUITE.erl b/test/end_to_end/tictac_SUITE.erl index bad08eb..da2e641 100644 --- a/test/end_to_end/tictac_SUITE.erl +++ b/test/end_to_end/tictac_SUITE.erl @@ -3,11 +3,13 @@ -include("include/leveled.hrl"). -export([all/0]). -export([ - many_put_compare/1 + many_put_compare/1, + index_compare/1 ]). all() -> [ - many_put_compare + % many_put_compare, + index_compare ]. @@ -208,3 +210,113 @@ many_put_compare(_Config) -> ok = leveled_bookie:book_close(Bookie4), ok = leveled_bookie:book_close(Bookie5). 
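%% The index_compare test added below repeats the recipe used in
%% many_put_compare: build a tree per store, merge the partitioned trees,
%% then diff against the full view. A minimal sketch of that recipe (not
%% part of the commit; compare_partitions/2 is a hypothetical helper):
%%
%% compare_partitions(FullTree, [Partition1|RestPartitions]) ->
%%     Joined = lists:foldl(fun leveled_tictac:merge_trees/2,
%%                          Partition1,
%%                          RestPartitions),
%%     % expect [] when the partitions exactly cover the full store
%%     leveled_tictac:find_dirtyleaves(FullTree, Joined).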
+ +index_compare(_Config) -> + TreeSize = small, + LS = 2000, + JS = 50000000, + SS = testutil:sync_strategy(), + % SegmentCount = 256 * 256, + + % Test requires multiple different databases, so want to mount them all + % on individual file paths + RootPathA = testutil:reset_filestructure("testA"), + RootPathB = testutil:reset_filestructure("testB"), + RootPathC = testutil:reset_filestructure("testC"), + RootPathD = testutil:reset_filestructure("testD"), + % Book1A to get all objects + {ok, Book1A} = leveled_bookie:book_start(RootPathA, LS, JS, SS), + % Book1B/C/D will have objects partitioned across it + {ok, Book1B} = leveled_bookie:book_start(RootPathB, LS, JS, SS), + {ok, Book1C} = leveled_bookie:book_start(RootPathC, LS, JS, SS), + {ok, Book1D} = leveled_bookie:book_start(RootPathD, LS, JS, SS), + + % Generate nine lists of objects + BucketBin = list_to_binary("Bucket"), + GenMapFun = + fun(_X) -> + V = testutil:get_compressiblevalue(), + Indexes = testutil:get_randomindexes_generator(8), + testutil:generate_objects(10000, binary_uuid, [], V, Indexes) + end, + + ObjLists = lists:map(GenMapFun, lists:seq(1, 9)), + + % Load all nine lists into Book1A + lists:foreach(fun(ObjL) -> testutil:riakload(Book1A, ObjL) end, + ObjLists), + + % Split nine lists across Book1B to Book1D, three object lists in each + lists:foreach(fun(ObjL) -> testutil:riakload(Book1B, ObjL) end, + lists:sublist(ObjLists, 1, 3)), + lists:foreach(fun(ObjL) -> testutil:riakload(Book1C, ObjL) end, + lists:sublist(ObjLists, 4, 3)), + lists:foreach(fun(ObjL) -> testutil:riakload(Book1D, ObjL) end, + lists:sublist(ObjLists, 7, 3)), + + GetTicTacTreeFun = + fun(X, Bookie) -> + SW = os:timestamp(), + ST = "!", + ET = "|", + Q = {tictactree_idx, + {BucketBin, "idx" ++ integer_to_list(X) ++ "_bin", ST, ET}, + TreeSize, + fun(_B, _K) -> accumulate end}, + {async, Folder} = leveled_bookie:book_returnfolder(Bookie, Q), + R = Folder(), + io:format("TicTac Tree for index ~w took " ++ + "~w microseconds~n", + [X, timer:now_diff(os:timestamp(), SW)]), + R + end, + + % Get a TicTac tree representing one of the indexes in Bucket A + TicTacTree1_Full = GetTicTacTreeFun(1, Book1A), + TicTacTree1_P1 = GetTicTacTreeFun(1, Book1B), + TicTacTree1_P2 = GetTicTacTreeFun(1, Book1C), + TicTacTree1_P3 = GetTicTacTreeFun(1, Book1D), + + % Merge the tree across the partitions + TicTacTree1_Joined = lists:foldl(fun leveled_tictac:merge_trees/2, + TicTacTree1_P1, + [TicTacTree1_P2, TicTacTree1_P3]), + + % Go compare! 
Also heck we're not comparing empty trees
+ DL1_0 = leveled_tictac:find_dirtyleaves(TicTacTree1_Full,
+ TicTacTree1_Joined),
+ EmptyTree = leveled_tictac:new_tree(empty, TreeSize),
+ DL1_1 = leveled_tictac:find_dirtyleaves(TicTacTree1_Full, EmptyTree),
+ true = DL1_0 == [],
+ true = length(DL1_1) > 100,
+
+ ok = leveled_bookie:book_close(Book1A),
+ ok = leveled_bookie:book_close(Book1B),
+ ok = leveled_bookie:book_close(Book1C),
+ ok = leveled_bookie:book_close(Book1D),
+
+ % Double check all is well still after a restart
+ % Book1A to get all objects
+ {ok, Book2A} = leveled_bookie:book_start(RootPathA, LS, JS, SS),
+ % Book1B/C/D will have objects partitioned across it
+ {ok, Book2B} = leveled_bookie:book_start(RootPathB, LS, JS, SS),
+ {ok, Book2C} = leveled_bookie:book_start(RootPathC, LS, JS, SS),
+ {ok, Book2D} = leveled_bookie:book_start(RootPathD, LS, JS, SS),
+ % Get a TicTac tree representing one of the indexes in Bucket A
+ TicTacTree2_Full = GetTicTacTreeFun(2, Book2A),
+ TicTacTree2_P1 = GetTicTacTreeFun(2, Book2B),
+ TicTacTree2_P2 = GetTicTacTreeFun(2, Book2C),
+ TicTacTree2_P3 = GetTicTacTreeFun(2, Book2D),
+
+ % Merge the tree across the partitions
+ TicTacTree2_Joined = lists:foldl(fun leveled_tictac:merge_trees/2,
+ TicTacTree2_P1,
+ [TicTacTree2_P2, TicTacTree2_P3]),
+
+ % Go compare! Also heck we're not comparing empty trees
+ DL2_0 = leveled_tictac:find_dirtyleaves(TicTacTree2_Full,
+ TicTacTree2_Joined),
+ EmptyTree = leveled_tictac:new_tree(empty, TreeSize),
+ DL2_1 = leveled_tictac:find_dirtyleaves(TicTacTree2_Full, EmptyTree),
+ true = DL2_0 == [],
+ true = length(DL2_1) > 100.
From 47655dc9c7ef68efcbc341e0040669dfe62ad986 Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Thu, 22 Jun 2017 14:30:14 +0100
Subject: [PATCH 12/58] Uncomment previous test

---
 test/end_to_end/tictac_SUITE.erl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/end_to_end/tictac_SUITE.erl b/test/end_to_end/tictac_SUITE.erl
index da2e641..5544876 100644
--- a/test/end_to_end/tictac_SUITE.erl
+++ b/test/end_to_end/tictac_SUITE.erl
@@ -8,7 +8,7 @@
 ]).

all() -> [
- % many_put_compare,
+ many_put_compare,
index_compare
].

From 4e5c3e2f647285337c42587fc210ca85bad7a9c0 Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Fri, 23 Jun 2017 12:32:37 +0100
Subject: [PATCH 13/58] Fix merge

Fix typo in merge, and add an extra validation step to the unit tests to
prevent it returning.
---
 src/leveled_tictac.erl | 27 ++++++++++++++--
 test/end_to_end/tictac_SUITE.erl | 55 +++++++++++++++++++++++++++++---
 2 files changed, 75 insertions(+), 7 deletions(-)

diff --git a/src/leveled_tictac.erl b/src/leveled_tictac.erl
index b14642a..70fce1d 100644
--- a/src/leveled_tictac.erl
+++ b/src/leveled_tictac.erl
@@ -164,7 +164,7 @@ add_kv(TicTacTree, Key, Value, HashFun) ->
 -spec find_dirtyleaves(tictactree(), tictactree()) -> list(integer()).
 %% @doc
-%% Returns a list of segment IDs that which hold differences between the state
+%% Returns a list of segment IDs which hold differences between the state
 %% represented by the two trees.
find_dirtyleaves(SrcTree, SnkTree) ->
 _Size = SrcTree#tictactree.size,
@@ -226,14 +226,14 @@ merge_trees(TreeA, TreeB) ->
 MergeFun =
 fun(SQN, MergeL2) ->
 L2A = array:get(SQN, TreeA#tictactree.level2),
- L2B = array:get(SQN, TreeA#tictactree.level2),
+ L2B = array:get(SQN, TreeB#tictactree.level2),
 NewLevel2 = merge_binaries(L2A, L2B),
 array:set(SQN, NewLevel2, MergeL2)
 end,
 NewLevel2 = lists:foldl(MergeFun,
 MergedTree#tictactree.level2,
 lists:seq(0, MergedTree#tictactree.width - 1)),
-
+
 MergedTree#tictactree{level1 = NewLevel1, level2 = NewLevel2}.

 get_segment(Key, SegmentCount) ->
@@ -259,6 +259,25 @@ segmentcompare(SrcBin, SnkBin, Acc, Counter) ->
 segmentcompare(SrcTail, SnkTail, [Counter|Acc], Counter + 1)
 end.

+checktree(TicTacTree) ->
+ checktree(TicTacTree#tictactree.level1, TicTacTree, 0).
+
+checktree(<<>>, TicTacTree, Counter) ->
+ true = TicTacTree#tictactree.width == Counter;
+checktree(Level1Bin, TicTacTree, Counter) ->
+ BitSize = ?HASH_SIZE * 8,
+ <<TopHash:BitSize/integer, Tail/binary>> = Level1Bin,
+ L2Bin = array:get(Counter, TicTacTree#tictactree.level2),
+ true = TopHash == segmentsummarise(L2Bin, 0),
+ checktree(Tail, TicTacTree, Counter + 1).
+
+segmentsummarise(<<>>, L1Acc) ->
+ L1Acc;
+segmentsummarise(L2Bin, L1Acc) ->
+ BitSize = ?HASH_SIZE * 8,
+ <<TopHash:BitSize/integer, Tail/binary>> = L2Bin,
+ segmentsummarise(Tail, L1Acc bxor TopHash).
+
 merge_binaries(BinA, BinB) ->
 BitSize = bit_size(BinA),
 BitSize = bit_size(BinB),
@@ -340,9 +359,11 @@ merge_test_withsize(Size) ->
 TreeZ4 = add_kv(TreeZ3, {o, "B1", "Y3", null}, {caine, 104}, HashFun),

 TreeM0 = merge_trees(TreeX4, TreeY4),
+ checktree(TreeM0),
 ?assertMatch(true, TreeM0#tictactree.level1 == TreeZ4#tictactree.level1),

 TreeM1 = merge_trees(TreeX3, TreeY4),
+ checktree(TreeM1),
 ?assertMatch(false, TreeM1#tictactree.level1 == TreeZ4#tictactree.level1).

 -endif.
diff --git a/test/end_to_end/tictac_SUITE.erl b/test/end_to_end/tictac_SUITE.erl
index 5544876..c0d66eb 100644
--- a/test/end_to_end/tictac_SUITE.erl
+++ b/test/end_to_end/tictac_SUITE.erl
@@ -8,7 +8,7 @@
 ]).

all() -> [
- many_put_compare,
+ % many_put_compare,
index_compare
].

@@ -282,7 +282,7 @@ index_compare(_Config) ->
 TicTacTree1_P1,
 [TicTacTree1_P2, TicTacTree1_P3]),

- % Go compare! Also heck we're not comparing empty trees
+ % Go compare! Also check we're not comparing empty trees
 DL1_0 = leveled_tictac:find_dirtyleaves(TicTacTree1_Full,
 TicTacTree1_Joined),
 EmptyTree = leveled_tictac:new_tree(empty, TreeSize),
@@ -313,10 +313,57 @@ index_compare(_Config) ->
 TicTacTree2_P1,
 [TicTacTree2_P2, TicTacTree2_P3]),

- % Go compare! Also heck we're not comparing empty trees
+ % Go compare! Also check we're not comparing empty trees
 DL2_0 = leveled_tictac:find_dirtyleaves(TicTacTree2_Full,
 TicTacTree2_Joined),
 EmptyTree = leveled_tictac:new_tree(empty, TreeSize),
 DL2_1 = leveled_tictac:find_dirtyleaves(TicTacTree2_Full, EmptyTree),
 true = DL2_0 == [],
- true = length(DL2_1) > 100.
+ true = length(DL2_1) > 100,
+
+ IdxSpc = {add, "idx2_bin", "zz999"},
+ {TestObj, TestSpc} = testutil:generate_testobject(BucketBin,
+ term_to_binary("K9.Z"),
+ "Value1",
+ [IdxSpc],
+ [{"MDK1", "MDV1"}]),
+ ok = testutil:book_riakput(Book2C, TestObj, TestSpc),
+ testutil:check_forobject(Book2C, TestObj),
+
+ TicTacTree3_Full = GetTicTacTreeFun(2, Book2A),
+ TicTacTree3_P1 = GetTicTacTreeFun(2, Book2B),
+ TicTacTree3_P2 = GetTicTacTreeFun(2, Book2C),
+ TicTacTree3_P3 = GetTicTacTreeFun(2, Book2D),
+
+ % Merge the tree across the partitions
+ TicTacTree3_Joined = lists:foldl(fun leveled_tictac:merge_trees/2,
+ TicTacTree3_P1,
+ [TicTacTree3_P2, TicTacTree3_P3]),
+
+ % Find all keys in the index, and then just the last key
+ IdxQ1 = {index_query,
+ BucketBin,
+ {fun testutil:foldkeysfun/3, []},
+ {"idx2_bin", "zz", "zz|"},
+ {true, undefined}},
+ {async, IdxFolder1} = leveled_bookie:book_returnfolder(Book2C, IdxQ1),
+ true = IdxFolder1() >= 1,
+
+ DL_3to2B = leveled_tictac:find_dirtyleaves(TicTacTree2_P1,
+ TicTacTree3_P1),
+ DL_3to2C = leveled_tictac:find_dirtyleaves(TicTacTree2_P2,
+ TicTacTree3_P2),
+ DL_3to2D = leveled_tictac:find_dirtyleaves(TicTacTree2_P3,
+ TicTacTree3_P3),
+ io:format("Individual tree comparison found dirty leaves of ~w ~w ~w~n",
+ [DL_3to2B, DL_3to2C, DL_3to2D]),
+
+ true = length(DL_3to2B) == 0,
+ true = length(DL_3to2C) == 1,
+ true = length(DL_3to2D) == 0,
+
+ % Go compare! Should find a difference in one leaf
+ DL3_0 = leveled_tictac:find_dirtyleaves(TicTacTree3_Full,
+ TicTacTree3_Joined),
+ io:format("Different leaves count ~w~n", [length(DL3_0)]),
+ true = length(DL3_0) == 1.
From 2be4422e47f140c66d14b7d4cccdb09562ab0c0f Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Fri, 23 Jun 2017 12:44:52 +0100
Subject: [PATCH 14/58] Re-add test

---
 test/end_to_end/tictac_SUITE.erl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/end_to_end/tictac_SUITE.erl b/test/end_to_end/tictac_SUITE.erl
index c0d66eb..de3e2a7 100644
--- a/test/end_to_end/tictac_SUITE.erl
+++ b/test/end_to_end/tictac_SUITE.erl
@@ -8,7 +8,7 @@
 ]).

all() -> [
- % many_put_compare,
+ many_put_compare,
index_compare
].

From 5e9e1347c7780a3ccd140163938aa45f105299cb Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Fri, 23 Jun 2017 14:55:49 +0100
Subject: [PATCH 15/58] Add test to find {term, key} that represents
 difference

Not just detect the existence of a difference, but clarify what that
difference is.
---
 test/end_to_end/tictac_SUITE.erl | 52 ++++++++++++++++++++++++++++++--
 1 file changed, 49 insertions(+), 3 deletions(-)

diff --git a/test/end_to_end/tictac_SUITE.erl b/test/end_to_end/tictac_SUITE.erl
index de3e2a7..7dc4ba1 100644
--- a/test/end_to_end/tictac_SUITE.erl
+++ b/test/end_to_end/tictac_SUITE.erl
@@ -8,7 +8,7 @@
 ]).

all() -> [
- many_put_compare,
+ % many_put_compare,
index_compare
].

@@ -216,7 +216,7 @@ index_compare(_Config) ->
 TreeSize = small,
 LS = 2000,
 JS = 50000000,
 SS = testutil:sync_strategy(),
- % SegmentCount = 256 * 256,
+ SegmentCount = 256 * 256,

 % Test requires multiple different databases, so want to mount them all
 % on individual file paths
@@ -366,4 +366,50 @@ index_compare(_Config) ->
 DL3_0 = leveled_tictac:find_dirtyleaves(TicTacTree3_Full,
 TicTacTree3_Joined),
 io:format("Different leaves count ~w~n", [length(DL3_0)]),
- true = length(DL3_0) == 1.
+ true = length(DL3_0) == 1,
+
+ % Now we want to find the {Term, Key} pairs that make up the segment
+ % difference (there should only be one)
+ %
+ % We want the database to filter on segment - so this doesn't have the
+ % overheads of key listing
+
+ FoldKeysIndexQFun =
+ fun(_Bucket, {Term, Key}, Acc) ->
+ Seg = leveled_tictac:get_segment(Key, SegmentCount),
+ case lists:member(Seg, DL3_0) of
+ true ->
+ [{Term, Key}|Acc];
+ false ->
+ Acc
+ end
+ end,
+
+ MismatchQ = {index_query,
+ BucketBin,
+ {FoldKeysIndexQFun, []},
+ {"idx2_bin", "!", "|"},
+ {true, undefined}},
+ {async, MMFldr_2A} = leveled_bookie:book_returnfolder(Book2A, MismatchQ),
+ {async, MMFldr_2B} = leveled_bookie:book_returnfolder(Book2B, MismatchQ),
+ {async, MMFldr_2C} = leveled_bookie:book_returnfolder(Book2C, MismatchQ),
+ {async, MMFldr_2D} = leveled_bookie:book_returnfolder(Book2D, MismatchQ),
+
+ SWSS = os:timestamp(),
+ SL_Joined = MMFldr_2B() ++ MMFldr_2C() ++ MMFldr_2D(),
+ SL_Full = MMFldr_2A(),
+ io:format("Segment search across both clusters took ~w~n")
+ [timer:now_diff(os:timestamp(), SWSS)]),
+
+ io:format("Joined SegList ~w~n", [SL_Joined]),
+ io:format("Full SegList ~w~n", [SL_Full]),
+
+ Diffs = lists:subtract(SL_Full, SL_Joined)
+ ++ lists:subtract(SL_Joined, SL_Full),
+
+ io:format("Differences between lists ~w~n", [Diffs]),
+
+ % The actual difference is discovered
+ true = lists:member({"zz999", term_to_binary("K9.Z")}, Diffs),
+ % Without discovering too many others
+ true = length(Diffs) < 20.
From 25a5065edde592aa672bc92d2a958b51a3b342ae Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Fri, 23 Jun 2017 14:56:32 +0100
Subject: [PATCH 16/58] Re-introduce test (again)

---
 test/end_to_end/tictac_SUITE.erl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/end_to_end/tictac_SUITE.erl b/test/end_to_end/tictac_SUITE.erl
index 7dc4ba1..fc5e99b 100644
--- a/test/end_to_end/tictac_SUITE.erl
+++ b/test/end_to_end/tictac_SUITE.erl
@@ -8,7 +8,7 @@
 ]).

all() -> [
- % many_put_compare,
+ many_put_compare,
index_compare
].

From 99131320c5d678fc0ebd3b0e1c1f2ba61a1693a2 Mon Sep 17 00:00:00 2001
From: Martin Sumner
Date: Fri, 23 Jun 2017 15:20:24 +0100
Subject: [PATCH 17/58] Broken test log

---
 test/end_to_end/tictac_SUITE.erl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/end_to_end/tictac_SUITE.erl b/test/end_to_end/tictac_SUITE.erl
index fc5e99b..774b92a 100644
--- a/test/end_to_end/tictac_SUITE.erl
+++ b/test/end_to_end/tictac_SUITE.erl
@@ -398,7 +398,7 @@ index_compare(_Config) ->
 SWSS = os:timestamp(),
 SL_Joined = MMFldr_2B() ++ MMFldr_2C() ++ MMFldr_2D(),
 SL_Full = MMFldr_2A(),
- io:format("Segment search across both clusters took ~w~n")
+ io:format("Segment search across both clusters took ~w~n",
 [timer:now_diff(os:timestamp(), SWSS)]),

 io:format("Joined SegList ~w~n", [SL_Joined]),
 io:format("Full SegList ~w~n", [SL_Full]),

From e938eaa153be9cd3d8f61be666a6babc1bc16c89 Mon Sep 17 00:00:00 2001
From: Martin Sumner
Date: Fri, 23 Jun 2017 16:51:28 +0100
Subject: [PATCH 18/58] Add close to test

---
 test/end_to_end/tictac_SUITE.erl | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/test/end_to_end/tictac_SUITE.erl b/test/end_to_end/tictac_SUITE.erl
index 774b92a..2c2b3f8 100644
--- a/test/end_to_end/tictac_SUITE.erl
+++ b/test/end_to_end/tictac_SUITE.erl
@@ -412,4 +412,10 @@ index_compare(_Config) ->
 % The actual difference is discovered
 true = lists:member({"zz999", term_to_binary("K9.Z")}, Diffs),
 % Without discovering too many others
- true = length(Diffs) < 20.
+ true = length(Diffs) < 20,
+
+
+ ok = leveled_bookie:book_close(Book2A),
+ ok = leveled_bookie:book_close(Book2B),
+ ok = leveled_bookie:book_close(Book2C),
+ ok = leveled_bookie:book_close(Book2D).
From fde9af28ddb3f282397cd26740cb2da578d9bb91 Mon Sep 17 00:00:00 2001
From: Martin Sumner
Date: Mon, 26 Jun 2017 17:08:31 +0100
Subject: [PATCH 19/58] comment test to avoid timeout

---
 src/leveled_tictac.erl | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/src/leveled_tictac.erl b/src/leveled_tictac.erl
index 70fce1d..963521a 100644
--- a/src/leveled_tictac.erl
+++ b/src/leveled_tictac.erl
@@ -332,11 +332,18 @@ simple_test_withsize(Size) ->
 ?assertMatch(true, lists:member(get_segment({o, "B1", "K3", null}, SC), DL1)),
 ?assertMatch(false, lists:member(get_segment({o, "B1", "K1", null}, SC), DL1)).

-merge_bysize_test() ->
- merge_test_withsize(small),
- merge_test_withsize(medium),
- merge_test_withsize(large),
- merge_test_withsize(xlarge).
+merge_bysize_small_test() ->
+ merge_test_withsize(small).
+
+merge_bysize_medium_test() ->
+ merge_test_withsize(medium).
+
+merge_bysize_large_test() ->
+ merge_test_withsize(large).
+
+% merge_bysize_xlarge_test() ->
+% merge_test_withsize(xlarge).
+% timeout on cover test - so commented

 merge_test_withsize(Size) ->
 HashFun = fun(_K, V) -> erlang:phash2(V) end,

From 9fca17d56a90151bdc4c0da4d5b4a930bb50d486 Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Mon, 26 Jun 2017 13:26:08 +0100
Subject: [PATCH 20/58] WIP - Recent Modifications

Just some initial WIP code for this. Will revisit this again after
exploring some ideas as to how to reduce the cost of the
get_keys_by_segment.

The overall idea is that there are trees of recent modifications, with
recent being some rolling time window made up of hourly blocks, and
recency being determined by the last-modified date on the object metadata
- which should be consistent across a cluster.

So if we were at 15:30 we would get the tree for 14:00 - 15:00 and the
tree for 15:00-16:00 from two different queries which cover the same
partitions and then compare.

Comparison may find differences, and we know what segment the difference
is in - but how to then find all keys in that segment which have been
modified in the period? Three ways:

Do it inefficiently and infrequently using a fold_keys and a filter
(perhaps with SST files having a highest LMD in the metadata so that
they can be skipped).

Add a special index, where every entry has a TTL, and the Key is
{$segment, Segment, Bucket, Key} so that a normal 2i query can be used.

Align hashing for segments with hashing for penciller lookup so that a
query over the actual keys can be optimised by skipping chunks of the
in-memory part, and chunks of the SST file
---
 src/leveled_tictac.erl | 79 ++++++++++++++++++++++++++++++--
 test/end_to_end/tictac_SUITE.erl | 2 +-
 2 files changed, 77 insertions(+), 4 deletions(-)

diff --git a/src/leveled_tictac.erl b/src/leveled_tictac.erl
index 963521a..36ece72 100644
--- a/src/leveled_tictac.erl
+++ b/src/leveled_tictac.erl
@@ -84,7 +84,12 @@
 level2 :: any() % an array - but OTP compatibility
 }).

+-record(recenttrees, {trees :: list(),
+ size:: small|medium|large|xlarge
+ }).
+
 -type tictactree() :: #tictactree{}.
+-type recenttrees() :: #recenttrees{}.

%%%============================================================================
%%% External functions
%%%============================================================================
@@ -95,7 +100,10 @@
%% Create a new tree, zeroed out.
new_tree(TreeID) ->
 new_tree(TreeID, small).
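%% A note on the Size argument used above (values assumed from the
%% ?SMALL/?MEDIUM/?LARGE/?XLARGE definitions earlier in this series, where
%% each is a {BitWidth, Width, SegmentCount} tuple):
%%
%%   small  -> {8, 256, 256 * 256}      % 65,536 segments
%%   medium -> {9, 512, 512 * 512}
%%   large  -> {10, 1024, 1024 * 1024}
%%   xlarge -> {11, 2048, 2048 * 2048}  % circa 4.2m segments
%%
%% Larger sizes cost more memory per tree but reduce the chance of two keys
%% sharing a segment, so a segment count comfortably above the expected key
%% count keeps most dirty segments to a single key.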
- + +-spec new_tree(any(), small|medium|large|xlarge) -> tictactree(). +%% @doc +%% Create a new tree, zeroed out. Specify the t-shirt siz eof the tree new_tree(TreeID, Size) -> {BitWidth, Width, SegmentCount} = case Size of @@ -121,10 +129,12 @@ new_tree(TreeID, Size) -> level1 = Lv1Init, level2 = Lv2Init}. --spec add_kv(tictactree(), tuple(), tuple(), fun()) -> tictactree(). +-spec add_kv(tictactree(), tuple(), tuple(), fun((_,_) -> integer())) -> + tictactree(). %% @doc %% Add a Key and value to a tictactree using the HashFun to calculate the Hash -%% based on that key and value +%% based on that key and value (or extract the Hash if it is present within +%% the value). add_kv(TicTacTree, Key, Value, HashFun) -> HashV = HashFun(Key, Value), SegChangeHash = erlang:phash2(Key, HashV), @@ -236,10 +246,73 @@ merge_trees(TreeA, TreeB) -> MergedTree#tictactree{level1 = NewLevel1, level2 = NewLevel2}. +-spec get_segment(tuple(), integer()) -> integer(). +%% @doc +%% Map the key to a segmen. get_segment(Key, SegmentCount) -> erlang:phash2(Key) band (SegmentCount - 1). +-spec match_hour(tuple(), tuple(), fun((_,_) -> tuple()), tuple(), integer()) + -> {integer(), integer()}|no_match. +%% @doc +%% Match the modified date of the object to an hour of day, where the hour of +%% the day is within a threshold. Used for identifying recently added keys and +%% mapping those keys to the right tictac tree of recent additions +%% +%% The ModDateFun must return a datetime tuple e.g. {{Y, M, D}, {H, M, S}} +match_hour(Key, Value, ModDateFun, Now, HoursToKeep) -> + {ModDate, {ModHr, _ModMin, _ModSec}} = ModDateFun(Key, Value), + {NowDate, {NowHr, _NowMin, _NowSec}} = calendar:now_to_datetime(Now), + {DayDiff, {HourDiff, _MinDiff, _SecDiff}} + = calendar:time_difference({ModDate, {ModHr, 0, 0}}, + {NowDate, {NowHr, 0, 0}}), + case HoursToKeep >= DayDiff * 24 + HourDiff of + true -> + {ModDate, ModHr}; + false -> + no_match + end. + +-spec add_recent_kv(tuple(), tuple(), + fun((_,_) -> integer()), + integer(), recenttrees()) -> recenttrees(). +%% @doc +%% Add a recently modified key and value to the appropriate tree of recent +%% keys and values. +add_recent_kv({ModDate, ModHour}, {Key, Value}, + HashFun, HoursToKeep, RecentTrees) -> + case lists:keyfind({ModDate, ModHour}, 1, RecentTrees#recenttrees.trees) of + {{ModDate, ModHour}, Tree0} -> + Tree1 = add_kv(Tree0, Key, Value, HashFun), + RT1 = lists:keyreplace({ModDate, ModHour}, + 1, + RecentTrees#recenttrees.trees, + {{ModDate, ModHour}, Tree1}), + RecentTrees#recenttrees{trees = RT1}; + not_found -> + NT0 = new_tree(recent, RecentTrees#recenttrees.size), + NT1 = add_kv(NT0, Key, Value, HashFun), + RT0 = [{{ModDate, ModHour}, NT1}|RecentTrees#recenttrees.trees], + case length(RT0) > HoursToKeep of + true -> + FoldFun = + fun({K, _V} , Acc) -> + case K < Acc of + true -> + K; + false -> + Acc + end + end, + OldestK = lists:foldl(FoldFun, NT1, {ModDate, ModHour}), + RT1 = lists:keydelete(OldestK, 1, RT0), + RecentTrees#recenttrees{trees = RT1}; + false -> + RecentTrees#recenttrees{trees = RT0} + end + end. 
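%% A usage sketch for the WIP functions above (not part of the commit; the
%% ModDateFun shown is invented for illustration): match the object's last
%% modified time to an hour bucket, then route the change into the tree
%% kept for that hour, retaining 24 hourly trees.
%%
%% maybe_add_recent(Key, Value, HashFun, RecentTrees) ->
%%     ModDateFun = fun(_K, V) -> element(2, V) end,  % assumes V holds a datetime
%%     case match_hour(Key, Value, ModDateFun, os:timestamp(), 24) of
%%         no_match ->
%%             RecentTrees;
%%         {ModDate, ModHour} ->
%%             add_recent_kv({ModDate, ModHour}, {Key, Value},
%%                           HashFun, 24, RecentTrees)
%%     end.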
+ %%%============================================================================ %%% Internal functions %%%============================================================================ diff --git a/test/end_to_end/tictac_SUITE.erl b/test/end_to_end/tictac_SUITE.erl index 2c2b3f8..d034463 100644 --- a/test/end_to_end/tictac_SUITE.erl +++ b/test/end_to_end/tictac_SUITE.erl @@ -67,7 +67,7 @@ many_put_compare(_Config) -> {ok, Bookie3} = leveled_bookie:book_start(StartOpts3), lists:foreach(fun(ObjL) -> testutil:riakload(Bookie3, ObjL) end, CLs), - % Now run a tictac query against both stores to see th extent to which + % Now run a tictac query against both stores to see the extent to which % state between stores is consistent TicTacQ = {tictactree_obj, From f81a4bca0d954aeedf931470f493674a3d6fda90 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Tue, 27 Jun 2017 14:46:43 +0100 Subject: [PATCH 21/58] Revert "WIP - Recent Modifications" This reverts commit bc19a05d83a02d7ec03771657df85b33acc6cfee. --- src/leveled_tictac.erl | 79 ++------------------------------ test/end_to_end/tictac_SUITE.erl | 2 +- 2 files changed, 4 insertions(+), 77 deletions(-) diff --git a/src/leveled_tictac.erl b/src/leveled_tictac.erl index 36ece72..963521a 100644 --- a/src/leveled_tictac.erl +++ b/src/leveled_tictac.erl @@ -84,12 +84,7 @@ level2 :: any() % an array - but OTP compatibility }). --record(recenttrees, {trees :: list(), - size:: small|medium|large|xlarge - }). - -type tictactree() :: #tictactree{}. --type recenttrees() :: #recenttrees{}. %%%============================================================================ %%% External functions @@ -100,10 +95,7 @@ %% Create a new tree, zeroed out. new_tree(TreeID) -> new_tree(TreeID, small). - --spec new_tree(any(), small|medium|large|xlarge) -> tictactree(). -%% @doc -%% Create a new tree, zeroed out. Specify the t-shirt siz eof the tree + new_tree(TreeID, Size) -> {BitWidth, Width, SegmentCount} = case Size of @@ -129,12 +121,10 @@ new_tree(TreeID, Size) -> level1 = Lv1Init, level2 = Lv2Init}. --spec add_kv(tictactree(), tuple(), tuple(), fun((_,_) -> integer())) -> - tictactree(). +-spec add_kv(tictactree(), tuple(), tuple(), fun()) -> tictactree(). %% @doc %% Add a Key and value to a tictactree using the HashFun to calculate the Hash -%% based on that key and value (or extract the Hash if it is present within -%% the value). +%% based on that key and value add_kv(TicTacTree, Key, Value, HashFun) -> HashV = HashFun(Key, Value), SegChangeHash = erlang:phash2(Key, HashV), @@ -246,73 +236,10 @@ merge_trees(TreeA, TreeB) -> MergedTree#tictactree{level1 = NewLevel1, level2 = NewLevel2}. --spec get_segment(tuple(), integer()) -> integer(). -%% @doc -%% Map the key to a segmen. get_segment(Key, SegmentCount) -> erlang:phash2(Key) band (SegmentCount - 1). --spec match_hour(tuple(), tuple(), fun((_,_) -> tuple()), tuple(), integer()) - -> {integer(), integer()}|no_match. -%% @doc -%% Match the modified date of the object to an hour of day, where the hour of -%% the day is within a threshold. Used for identifying recently added keys and -%% mapping those keys to the right tictac tree of recent additions -%% -%% The ModDateFun must return a datetime tuple e.g. 
{{Y, M, D}, {H, M, S}} -match_hour(Key, Value, ModDateFun, Now, HoursToKeep) -> - {ModDate, {ModHr, _ModMin, _ModSec}} = ModDateFun(Key, Value), - {NowDate, {NowHr, _NowMin, _NowSec}} = calendar:now_to_datetime(Now), - {DayDiff, {HourDiff, _MinDiff, _SecDiff}} - = calendar:time_difference({ModDate, {ModHr, 0, 0}}, - {NowDate, {NowHr, 0, 0}}), - case HoursToKeep >= DayDiff * 24 + HourDiff of - true -> - {ModDate, ModHr}; - false -> - no_match - end. - --spec add_recent_kv(tuple(), tuple(), - fun((_,_) -> integer()), - integer(), recenttrees()) -> recenttrees(). -%% @doc -%% Add a recently modified key and value to the appropriate tree of recent -%% keys and values. -add_recent_kv({ModDate, ModHour}, {Key, Value}, - HashFun, HoursToKeep, RecentTrees) -> - case lists:keyfind({ModDate, ModHour}, 1, RecentTrees#recenttrees.trees) of - {{ModDate, ModHour}, Tree0} -> - Tree1 = add_kv(Tree0, Key, Value, HashFun), - RT1 = lists:keyreplace({ModDate, ModHour}, - 1, - RecentTrees#recenttrees.trees, - {{ModDate, ModHour}, Tree1}), - RecentTrees#recenttrees{trees = RT1}; - not_found -> - NT0 = new_tree(recent, RecentTrees#recenttrees.size), - NT1 = add_kv(NT0, Key, Value, HashFun), - RT0 = [{{ModDate, ModHour}, NT1}|RecentTrees#recenttrees.trees], - case length(RT0) > HoursToKeep of - true -> - FoldFun = - fun({K, _V} , Acc) -> - case K < Acc of - true -> - K; - false -> - Acc - end - end, - OldestK = lists:foldl(FoldFun, NT1, {ModDate, ModHour}), - RT1 = lists:keydelete(OldestK, 1, RT0), - RecentTrees#recenttrees{trees = RT1}; - false -> - RecentTrees#recenttrees{trees = RT0} - end - end. - %%%============================================================================ %%% Internal functions %%%============================================================================ diff --git a/test/end_to_end/tictac_SUITE.erl b/test/end_to_end/tictac_SUITE.erl index d034463..2c2b3f8 100644 --- a/test/end_to_end/tictac_SUITE.erl +++ b/test/end_to_end/tictac_SUITE.erl @@ -67,7 +67,7 @@ many_put_compare(_Config) -> {ok, Bookie3} = leveled_bookie:book_start(StartOpts3), lists:foreach(fun(ObjL) -> testutil:riakload(Bookie3, ObjL) end, CLs), - % Now run a tictac query against both stores to see the extent to which + % Now run a tictac query against both stores to see th extent to which % state between stores is consistent TicTacQ = {tictactree_obj, From ebef27f021b34314c193a8b19452734b35deaa8d Mon Sep 17 00:00:00 2001 From: martinsumner Date: Tue, 27 Jun 2017 16:25:09 +0100 Subject: [PATCH 22/58] Extract Last Modified Date from Riak Object As part of process to supporting a recent changes index for near-real-time anti-entropy --- src/leveled_bookie.erl | 26 ++++++----------- src/leveled_codec.erl | 61 +++++++++++++++++++++++++++++---------- src/leveled_sst.erl | 14 ++++----- src/leveled_tinybloom.erl | 14 ++++----- 4 files changed, 64 insertions(+), 51 deletions(-) diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl index cb15133..f5c6342 100644 --- a/src/leveled_bookie.erl +++ b/src/leveled_bookie.erl @@ -1383,25 +1383,17 @@ accumulate_index(TermRe, AddFun, FoldKeysFun) -> preparefor_ledgercache(?INKT_KEYD, - LedgerKey, SQN, _Obj, _Size, {IndexSpecs, TTL}) -> + LedgerKey, SQN, _Obj, _Size, {IdxSpecs, TTL}) -> {Bucket, Key} = leveled_codec:from_ledgerkey(LedgerKey), - KeyChanges = leveled_codec:convert_indexspecs(IndexSpecs, - Bucket, - Key, - SQN, - TTL), + KeyChanges = + leveled_codec:convert_indexspecs(IdxSpecs, Bucket, Key, SQN, TTL), {no_lookup, SQN, KeyChanges}; -preparefor_ledgercache(_Type, LedgerKey, 
SQN, Obj, Size, {IndexSpecs, TTL}) ->
- {Bucket, Key, ObjKeyChange, H} = leveled_codec:generate_ledgerkv(LedgerKey,
- SQN,
- Obj,
- Size,
- TTL),
- KeyChanges = [ObjKeyChange] ++ leveled_codec:convert_indexspecs(IndexSpecs,
- Bucket,
- Key,
- SQN,
- TTL),
+preparefor_ledgercache(_Type, LedgerKey, SQN, Obj, Size, {IdxSpecs, TTL}) ->
+ {Bucket, Key, MetaValue, H, _LastMods} =
+ leveled_codec:generate_ledgerkv(LedgerKey, SQN, Obj, Size, TTL),
+ KeyChanges =
+ [{LedgerKey, MetaValue}] ++
+ leveled_codec:convert_indexspecs(IdxSpecs, Bucket, Key, SQN, TTL),
 {H, SQN, KeyChanges}.

diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl
index 42dcb28..a8f57fc 100644
--- a/src/leveled_codec.erl
+++ b/src/leveled_codec.erl
@@ -378,6 +378,20 @@ convert_indexspecs(IndexSpecs, Bucket, Key, SQN, TTL) ->
 end,
 IndexSpecs).

+-spec generate_ledgerkv(tuple(), integer(), any(),
+ integer(), tuple()|infinity) ->
+ {any(), any(), any(), integer()|no_lookup, list()}.
+%% @doc
+%% Function to extract from an object the information necessary to populate
+%% the Penciller's ledger.
+%% Outputs -
+%% Bucket - original Bucket extracted from the PrimaryKey
+%% Key - original Key extracted from the PrimaryKey
+%% Value - the value to be used in the Ledger (essentially the extracted
+%% metadata)
+%% Hash - A magic hash of the key to be used in lookups and filters
+%% LastMods - the last modified dates for the object (may be multiple due to
+%% siblings)
 generate_ledgerkv(PrimaryKey, SQN, Obj, Size, TS) ->
 {Tag, Bucket, Key, _} = PrimaryKey,
 Status = case Obj of
@@ -387,11 +401,12 @@ generate_ledgerkv(PrimaryKey, SQN, Obj, Size, TS) ->
 {active, TS}
 end,
 Hash = magic_hash(PrimaryKey),
+ {MD, LastMods} = extract_metadata(Obj, Size, Tag),
 Value = {SQN,
 Status,
 Hash,
- extract_metadata(Obj, Size, Tag)},
- {Bucket, Key, {PrimaryKey, Value}, Hash}.
+ MD},
+ {Bucket, Key, Value, Hash, LastMods}.

 integer_now() ->
@@ -404,7 +419,7 @@ integer_time(TS) ->
 extract_metadata(Obj, Size, ?RIAK_TAG) ->
 riak_extract_metadata(Obj, Size);
 extract_metadata(Obj, Size, ?STD_TAG) ->
- {hash(Obj), Size}.
+ {{hash(Obj), Size}, []}.

 get_size(PK, Value) ->
 {Tag, _Bucket, _Key, _} = PK,
@@ -445,13 +460,14 @@ build_metadata_object(PrimaryKey, MD) ->

 riak_extract_metadata(delete, Size) ->
- {delete, null, null, Size};
+ {{delete, null, null, Size}, []};
 riak_extract_metadata(ObjBin, Size) ->
- {VclockBin, SibBin} = riak_metadata_from_binary(ObjBin),
- {SibBin,
- VclockBin,
- erlang:phash2(lists:sort(binary_to_term(VclockBin))),
- Size}.
+ {VclockBin, SibBin, LastMods} = riak_metadata_from_binary(ObjBin),
+ {{SibBin,
+ VclockBin,
+ erlang:phash2(lists:sort(binary_to_term(VclockBin))),
+ Size},
+ LastMods}.

%% <<?MAGIC:8/integer, ?V1_VERS:8/integer, VclockLen:32/integer,
%% VclockBin:VclockLen/binary, SibCount:32/integer, SibsBin/binary>>.
@@ -466,28 +482,41 @@ riak_metadata_from_binary(V1Binary) ->
 <<?MAGIC:8/integer, ?V1_VERS:8/integer, VclockLen:32/integer,
 Rest/binary>> = V1Binary,
 <<VclockBin:VclockLen/binary, SibCount:32/integer, SibsBin/binary>> = Rest,
- SibMetaBin =
+ {SibMetaBin, LastMods} =
 case SibCount of
 SC when is_integer(SC) ->
 get_metadata_from_siblings(SibsBin,
 SibCount,
- <<SibCount:32/integer>>)
+ <<SibCount:32/integer>>,
+ [])
 end,
- {VclockBin, SibMetaBin}.
+ {VclockBin, SibMetaBin, LastMods}.

-get_metadata_from_siblings(<<>>, 0, SibMetaBin) ->
- SibMetaBin;
+get_metadata_from_siblings(<<>>, 0, SibMetaBin, LastMods) ->
+ {SibMetaBin, LastMods};
 get_metadata_from_siblings(<<ValLen:32/integer, Rest0/binary>>,
 SibCount,
- SibMetaBin) ->
+ SibMetaBin,
+ LastMods) ->
 <<_ValBin:ValLen/binary, MetaLen:32/integer, Rest1/binary>> = Rest0,
 <<MetaBin:MetaLen/binary, Rest2/binary>> = Rest1,
+ LastMod =
+ case MetaBin of
+ <<MegaSec:32/integer, Sec:32/integer, MicroSec:32/integer,
+ _Rest/binary>> ->
+ {MegaSec, Sec, MicroSec};
+ _ ->
+ {0, 0, 0}
+ end,
 get_metadata_from_siblings(Rest2,
 SibCount - 1,
 <<SibMetaBin/binary,
 MetaLen:32/integer,
- MetaBin:MetaLen/binary>>).
+ MetaBin:MetaLen/binary>>,
+ [LastMod|LastMods]).
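A rough sketch of how a caller might reduce the LastMods list now returned above to a single recency value (illustration only, not part of the commit; most_recent_mod/1 is a hypothetical helper, and the dates are standard erlang timestamp() triples):

most_recent_mod([]) ->
    undefined;  % e.g. a delete, where no sibling dates are returned
most_recent_mod(LastMods) ->
    % the {0, 0, 0} sentinel used for unparseable metadata sorts lowest,
    % so it can never mask a genuine modification date
    lists:max(LastMods).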
diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl
index 74f84db..eb5af40 100644
--- a/src/leveled_sst.erl
+++ b/src/leveled_sst.erl
@@ -1481,17 +1481,13 @@ generate_randomkeys(Seqn, Count, Acc, BucketLow, BRange) ->
 BRand = random:uniform(BRange),
 BNumber = string:right(integer_to_list(BucketLow + BRand), 4, $0),
 KNumber = string:right(integer_to_list(random:uniform(1000)), 6, $0),
- LedgerKey = leveled_codec:to_ledgerkey("Bucket" ++ BNumber,
- "Key" ++ KNumber,
- o),
- {_B, _K, KV, _H} = leveled_codec:generate_ledgerkv(LedgerKey,
- Seqn,
- crypto:rand_bytes(64),
- 64,
- infinity),
+ LK = leveled_codec:to_ledgerkey("Bucket" ++ BNumber, "Key" ++ KNumber, o),
+ Chunk = crypto:rand_bytes(64),
+ {_B, _K, MV, _H, _LMs} =
+ leveled_codec:generate_ledgerkv(LK, Seqn, Chunk, 64, infinity),
 generate_randomkeys(Seqn + 1,
 Count - 1,
- [KV|Acc],
+ [{LK, MV}|Acc],
 BucketLow,
 BRange).

diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl
index 9d0ae32..15bd732 100644
--- a/src/leveled_tinybloom.erl
+++ b/src/leveled_tinybloom.erl
@@ -238,17 +238,13 @@ generate_randomkeys(Seqn, Count, Acc, BucketLow, BRange) ->
 BRand = random:uniform(BRange),
 BNumber = string:right(integer_to_list(BucketLow + BRand), 4, $0),
 KNumber = string:right(integer_to_list(random:uniform(10000)), 6, $0),
- LedgerKey = leveled_codec:to_ledgerkey("Bucket" ++ BNumber,
- "Key" ++ KNumber,
- o),
- {_B, _K, KV, _H} = leveled_codec:generate_ledgerkv(LedgerKey,
- Seqn,
- crypto:rand_bytes(64),
- 64,
- infinity),
+ LK = leveled_codec:to_ledgerkey("Bucket" ++ BNumber, "Key" ++ KNumber, o),
+ Chunk = crypto:rand_bytes(64),
+ {_B, _K, MV, _H, _LMs} =
+ leveled_codec:generate_ledgerkv(LK, Seqn, Chunk, 64, infinity),
 generate_randomkeys(Seqn + 1,
 Count - 1,
- [KV|Acc],
+ [{LK, MV}|Acc],
 BucketLow,
 BRange).

From 8e7aaf0ee760bf4ea0e66e94827279c934bff9b1 Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Tue, 27 Jun 2017 17:11:13 +0100
Subject: [PATCH 23/58] Correct testutil to understand riak_extract_metadata

Change, but change not reflected in test code
---
 test/end_to_end/testutil.erl | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/test/end_to_end/testutil.erl b/test/end_to_end/testutil.erl
index 01bd26d..afc9aec 100644
--- a/test/end_to_end/testutil.erl
+++ b/test/end_to_end/testutil.erl
@@ -264,10 +264,8 @@ check_forobject(Bookie, TestObject) ->
 {ok, HeadBinary} = book_riakhead(Bookie,
 TestObject#r_object.bucket,
 TestObject#r_object.key),
- {_SibMetaBin,
- Vclock,
- _Hash,
- size} = leveled_codec:riak_extract_metadata(HeadBinary, size),
+ {{_SibMetaBin, Vclock, _Hash, size}, _LMS}
+ = leveled_codec:riak_extract_metadata(HeadBinary, size),
 true = binary_to_term(Vclock) == TestObject#r_object.vclock.

 check_formissingobject(Bookie, Bucket, Key) ->

From 2dd303237bfb893ed5d650206fd591ba724e3e4c Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Wed, 28 Jun 2017 10:55:54 +0100
Subject: [PATCH 24/58] Change XOR

---
 src/leveled_tictac.erl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/leveled_tictac.erl b/src/leveled_tictac.erl
index 963521a..1ba4b08 100644
--- a/src/leveled_tictac.erl
+++ b/src/leveled_tictac.erl
@@ -149,7 +149,7 @@ add_kv(TicTacTree, Key, Value, HashFun) ->
 PostL1/binary>> = TicTacTree#tictactree.level1,

 SegLeaf2Upd = SegLeaf2 bxor SegChangeHash,
- SegLeaf1Upd = SegLeaf1 bxor SegLeaf2 bxor SegLeaf2Upd,
+ SegLeaf1Upd = SegLeaf1 bxor SegChangeHash,

 Level1Upd = <<PreL1:Level1BytePos/binary,
 SegLeaf1Upd:HashIntLength/integer,
 PostL1/binary>>,

From: martinsumner
Date: Fri, 30 Jun 2017 10:03:36 +0100
Subject: [PATCH 25/58] Add temporary aae index

Pending ct tests.
The aae index should expire after limit_minutes and be on an index which is rounded to unit_minutes. --- include/leveled.hrl | 21 ++++ src/leveled_bookie.erl | 103 ++++++++++++-------- src/leveled_codec.erl | 179 ++++++++++++++++++++++++++++++++--- src/leveled_penciller.erl | 2 +- src/leveled_sst.erl | 10 +- src/leveled_tictac.erl | 35 ++++--- test/end_to_end/testutil.erl | 3 +- 7 files changed, 281 insertions(+), 72 deletions(-) diff --git a/include/leveled.hrl b/include/leveled.hrl index f6b0294..6e1b603 100644 --- a/include/leveled.hrl +++ b/include/leveled.hrl @@ -5,6 +5,8 @@ -define(STD_TAG, o). %% Tag used for secondary index keys -define(IDX_TAG, i). +%% Tag used for near real-time anti-entropy index keys +-define(AAE_TAG, i_aae). %% Inker key type used for 'normal' objects -define(INKT_STND, stnd). @@ -67,6 +69,25 @@ waste_retention_period :: integer(), reload_strategy = [] :: list()}). +-record(recent_aae, {buckets :: list()|all, + % whitelist of buckets to support recent recent AAE + % or all to support all buckets + + limit_minutes :: integer(), + % how long to retain entries the temporary index for + % It will actually be retained for limit + unit minutes + % 60 minutes seems sensible + + unit_minutes :: integer(), + % What the minimum unit size will be for a query + % e.g. the minimum time duration to be used in range + % queries of the aae index + % 5 minutes seems sensible + + tree_size = small :: atom() + % Just defaulted to small for now + }). + -record(r_content, { metadata, value :: term() diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl index f5c6342..8b46d81 100644 --- a/src/leveled_bookie.erl +++ b/src/leveled_bookie.erl @@ -82,6 +82,7 @@ -define(CACHE_SIZE_JITTER, 25). -define(JOURNAL_SIZE_JITTER, 20). -define(LONG_RUNNING, 80000). +-define(RECENT_AAE, false). -record(ledger_cache, {mem :: ets:tab(), loader = leveled_tree:empty(?CACHE_TYPE) @@ -94,6 +95,7 @@ -record(state, {inker :: pid(), penciller :: pid(), cache_size :: integer(), + recent_aae :: false|#recent_aae{}, ledger_cache = #ledger_cache{}, is_snapshot :: boolean(), slow_offer = false :: boolean(), @@ -157,7 +159,7 @@ book_start(RootPath, LedgerCacheSize, JournalSize, SyncStrategy) -> %% %% TODO: %% The reload_strategy is exposed as currently no firm decision has been made -%% about how recovery should work. For instance if we were to trust evrything +%% about how recovery should work. For instance if we were to trust everything %% as permanent in the Ledger once it is persisted, then there would be no %% need to retain a skinny history of key changes in the Journal after %% compaction. 
If, as an alternative we assume the Ledger is never permanent, @@ -383,15 +385,28 @@ init([Opts]) -> undefined -> % Start from file not snapshot {InkerOpts, PencillerOpts} = set_options(Opts), - {Inker, Penciller} = startup(InkerOpts, PencillerOpts), + CacheJitter = ?CACHE_SIZE div (100 div ?CACHE_SIZE_JITTER), CacheSize = get_opt(cache_size, Opts, ?CACHE_SIZE) + erlang:phash2(self()) rem CacheJitter, + RecentAAE = + case get_opt(recent_aae, Opts, ?RECENT_AAE) of + false -> + false; + {BucketList, LimitMinutes, UnitMinutes} -> + #recent_aae{buckets = BucketList, + limit_minutes = LimitMinutes, + unit_minutes = UnitMinutes} + end, + + {Inker, Penciller} = startup(InkerOpts, PencillerOpts, RecentAAE), + NewETS = ets:new(mem, [ordered_set]), leveled_log:log("B0001", [Inker, Penciller]), {ok, #state{inker=Inker, penciller=Penciller, cache_size=CacheSize, + recent_aae=RecentAAE, ledger_cache=#ledger_cache{mem = NewETS}, is_snapshot=false}}; Bookie -> @@ -418,7 +433,8 @@ handle_call({put, Bucket, Key, Object, IndexSpecs, Tag, TTL}, From, State) -> SQN, Object, ObjSize, - {IndexSpecs, TTL}), + {IndexSpecs, TTL}, + State#state.recent_aae), Cache0 = addto_ledgercache(Changes, State#state.ledger_cache), T1 = timer:now_diff(os:timestamp(), SW) - T0, PutTimes = leveled_log:put_timing(bookie, State#state.put_timing, T0, T1), @@ -1151,14 +1167,14 @@ set_options(Opts) -> max_inmemory_tablesize = PCLL0CacheSize, levelzero_cointoss = true}}. -startup(InkerOpts, PencillerOpts) -> +startup(InkerOpts, PencillerOpts, RecentAAE) -> {ok, Inker} = leveled_inker:ink_start(InkerOpts), {ok, Penciller} = leveled_penciller:pcl_start(PencillerOpts), LedgerSQN = leveled_penciller:pcl_getstartupsequencenumber(Penciller), leveled_log:log("B0005", [LedgerSQN]), ok = leveled_inker:ink_loadpcl(Inker, LedgerSQN + 1, - fun load_fun/5, + get_loadfun(RecentAAE), Penciller), {Inker, Penciller}. @@ -1383,17 +1399,21 @@ accumulate_index(TermRe, AddFun, FoldKeysFun) -> preparefor_ledgercache(?INKT_KEYD, - LedgerKey, SQN, _Obj, _Size, {IdxSpecs, TTL}) -> + LedgerKey, SQN, _Obj, _Size, {IdxSpecs, TTL}, + _AAE) -> {Bucket, Key} = leveled_codec:from_ledgerkey(LedgerKey), KeyChanges = - leveled_codec:convert_indexspecs(IdxSpecs, Bucket, Key, SQN, TTL), + leveled_codec:idx_indexspecs(IdxSpecs, Bucket, Key, SQN, TTL), {no_lookup, SQN, KeyChanges}; -preparefor_ledgercache(_Type, LedgerKey, SQN, Obj, Size, {IdxSpecs, TTL}) -> - {Bucket, Key, MetaValue, H, _LastMods} = +preparefor_ledgercache(_InkTag, + LedgerKey, SQN, Obj, Size, {IdxSpecs, TTL}, + AAE) -> + {Bucket, Key, MetaValue, H, LastMods} = leveled_codec:generate_ledgerkv(LedgerKey, SQN, Obj, Size, TTL), KeyChanges = [{LedgerKey, MetaValue}] ++ - leveled_codec:convert_indexspecs(IdxSpecs, Bucket, Key, SQN, TTL), + leveled_codec:idx_indexspecs(IdxSpecs, Bucket, Key, SQN, TTL) ++ + leveled_codec:aae_indexspecs(AAE, Bucket, Key, SQN, H, LastMods), {H, SQN, KeyChanges}. @@ -1452,35 +1472,40 @@ maybe_withjitter(CacheSize, MaxCacheSize) -> end. 
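%% A sketch of supplying the new recent_aae option at startup (not part of
%% the commit; the path and numbers are invented, but the tuple shape
%% matches the get_opt(recent_aae, ...) clause in init/1 above):
%%
%% {ok, Bookie} =
%%     leveled_bookie:book_start([{root_path, "/tmp/ledger"},
%%                                {recent_aae, {all, 60, 5}}]),
%% %% i.e. track all buckets, retain index entries for 60 minutes, and
%% %% round the index to 5 minute units.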
- -load_fun(KeyInJournal, ValueInJournal, _Position, Acc0, ExtractFun) -> - {MinSQN, MaxSQN, OutputTree} = Acc0, - {SQN, Type, PK} = KeyInJournal, - % VBin may already be a term - {VBin, VSize} = ExtractFun(ValueInJournal), - {Obj, IndexSpecs} = leveled_codec:split_inkvalue(VBin), - case SQN of - SQN when SQN < MinSQN -> - {loop, Acc0}; - SQN when SQN < MaxSQN -> - Changes = preparefor_ledgercache(Type, PK, SQN, - Obj, VSize, IndexSpecs), - {loop, - {MinSQN, - MaxSQN, - addto_ledgercache(Changes, OutputTree, loader)}}; - MaxSQN -> - leveled_log:log("B0006", [SQN]), - Changes = preparefor_ledgercache(Type, PK, SQN, - Obj, VSize, IndexSpecs), - {stop, - {MinSQN, - MaxSQN, - addto_ledgercache(Changes, OutputTree, loader)}}; - SQN when SQN > MaxSQN -> - leveled_log:log("B0007", [MaxSQN, SQN]), - {stop, Acc0} - end. +get_loadfun(RecentAAE) -> + PrepareFun = + fun(Tag, PK, SQN, Obj, VS, IdxSpecs) -> + preparefor_ledgercache(Tag, PK, SQN, Obj, VS, IdxSpecs, RecentAAE) + end, + LoadFun = + fun(KeyInJournal, ValueInJournal, _Pos, Acc0, ExtractFun) -> + {MinSQN, MaxSQN, OutputTree} = Acc0, + {SQN, InkTag, PK} = KeyInJournal, + % VBin may already be a term + {VBin, VSize} = ExtractFun(ValueInJournal), + {Obj, IdxSpecs} = leveled_codec:split_inkvalue(VBin), + case SQN of + SQN when SQN < MinSQN -> + {loop, Acc0}; + SQN when SQN < MaxSQN -> + Chngs = PrepareFun(InkTag, PK, SQN, Obj, VSize, IdxSpecs), + {loop, + {MinSQN, + MaxSQN, + addto_ledgercache(Chngs, OutputTree, loader)}}; + MaxSQN -> + leveled_log:log("B0006", [SQN]), + Chngs = PrepareFun(InkTag, PK, SQN, Obj, VSize, IdxSpecs), + {stop, + {MinSQN, + MaxSQN, + addto_ledgercache(Chngs, OutputTree, loader)}}; + SQN when SQN > MaxSQN -> + leveled_log:log("B0007", [MaxSQN, SQN]), + {stop, Acc0} + end + end, + LoadFun. get_opt(Key, Opts) -> diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl index a8f57fc..621f997 100644 --- a/src/leveled_codec.erl +++ b/src/leveled_codec.erl @@ -59,7 +59,8 @@ generate_ledgerkv/5, get_size/2, get_keyandhash/2, - convert_indexspecs/5, + idx_indexspecs/5, + aae_indexspecs/6, generate_uuid/0, integer_now/0, riak_extract_metadata/2, @@ -68,22 +69,19 @@ -define(V1_VERS, 1). -define(MAGIC, 53). % riak_kv -> riak_object +-define(LMD_FORMAT, "~4..0w~2..0w~2..0w~2..0w~2..0w"). + +-type recent_aae() :: #recent_aae{}. + +-spec magic_hash(any()) -> integer(). +%% @doc %% Use DJ Bernstein magic hash function. Note, this is more expensive than %% phash2 but provides a much more balanced result. %% %% Hash function contains mysterious constants, some explanation here as to %% what they are - %% http://stackoverflow.com/questions/10696223/reason-for-5381-number-in-djb-hash-function - -to_lookup(Key) -> - case element(1, Key) of - ?IDX_TAG -> - no_lookup; - _ -> - lookup - end. - magic_hash({?RIAK_TAG, Bucket, Key, _SubKey}) -> magic_hash({Bucket, Key}); magic_hash({?STD_TAG, Bucket, Key, _SubKey}) -> @@ -100,7 +98,23 @@ hash1(H, <>) -> H2 = H1 bxor B, hash1(H2, Rest). +%% @doc +%% Should it be possible to lookup a key in the merge tree. This is not true +%% For keys that should only be read through range queries. Direct lookup +%% keys will have presence in bloom filters and other lookup accelerators. +to_lookup(Key) -> + case element(1, Key) of + ?IDX_TAG -> + no_lookup; + ?AAE_TAG -> + no_lookup; + _ -> + lookup + end. +-spec generate_uuid() -> list(). +%% @doc +%% Generate a new globally unique ID as a string. 
%% Credit to %% https://github.com/afiskon/erlang-uuid-v4/blob/master/src/uuid.erl generate_uuid() -> @@ -363,7 +377,7 @@ endkey_passed({EK1, EK2, EK3, null}, {CK1, CK2, CK3, _}) -> endkey_passed(EndKey, CheckingKey) -> EndKey < CheckingKey. -convert_indexspecs(IndexSpecs, Bucket, Key, SQN, TTL) -> +idx_indexspecs(IndexSpecs, Bucket, Key, SQN, TTL) -> lists:map(fun({IndexOp, IdxField, IdxValue}) -> Status = case IndexOp of add -> @@ -378,6 +392,85 @@ convert_indexspecs(IndexSpecs, Bucket, Key, SQN, TTL) -> end, IndexSpecs). +-spec aae_indexspecs(false|recent_aae(), + any(), any(), + integer(), integer(), + list()) + -> list(). +%% @doc +%% Generate an additional index term representing the change, if the last +%% modified date for the change is within the definition of recency. +%% +%% The objetc may have multiple last modified dates (siblings), and in this +%% case index entries for all dates within the range are added. +%% +%% The index should entry auto-expire in the future (when it is no longer +%% relevant to assessing recent changes) +aae_indexspecs(false, _Bucket, _Key, _SQN, _H, _LastMods) -> + []; +aae_indexspecs(_AAE, _Bucket, _Key, _SQN, _H, []) -> + []; +aae_indexspecs(AAE, Bucket, Key, SQN, H, LastMods) -> + InBucket = + case AAE#recent_aae.buckets of + all -> + true; + ListB -> + lists:member(Bucket, ListB) + end, + case InBucket of + true -> + GenTagFun = + fun(LMD0, Acc) -> + Dates = parse_date(LMD0, + AAE#recent_aae.unit_minutes, + AAE#recent_aae.limit_minutes, + integer_now()), + case Dates of + no_index -> + Acc; + {LMD1, TTL} -> + TreeSize = AAE#recent_aae.tree_size, + SegmentID = + leveled_tictac:get_segment(Key, TreeSize), + IdxK = {?AAE_TAG, + LMD1, + {{SegmentID, H}, Bucket}, + Key}, + IdxV = {SQN, {active, TTL}, no_lookup, null}, + [{IdxK, IdxV}|Acc] + end + end, + lists:foldl(GenTagFun, [], LastMods); + false -> + [] + end. + +-spec parse_date(tuple(), integer(), integer(), integer()) -> + no_index|{binary(), integer()}. +%% @doc +%% Parse the lat modified date and the AAE date configuration to return a +%% binary to be used as the last modified date part of the index, and an +%% integer to be used as the TTL of the index entry. +%% Return no_index if the change is not recent. +parse_date(LMD, UnitMins, LimitMins, Now) -> + LMDsecs = integer_time(LMD), + Recent = (LMDsecs + LimitMins * 60) > Now, + case Recent of + false -> + no_index; + true -> + {{Y, M, D}, {Hour, Minute, _Second}} = + calendar:now_to_datetime(LMD), + RoundMins = + UnitMins * (Minute div UnitMins), + StrTime = + lists:flatten(io_lib:format(?LMD_FORMAT, + [Y, M, D, Hour, RoundMins])), + TTL = min(Now, LMDsecs) + (LimitMins + UnitMins) * 60, + {list_to_binary(StrTime), TTL} + end. + -spec generate_ledgerkv(tuple(), integer(), any(), integer(), tuple()|infinity) -> {any(), any(), any(), integer()|no_lookup, list()}. @@ -532,7 +625,7 @@ indexspecs_test() -> IndexSpecs = [{add, "t1_int", 456}, {add, "t1_bin", "adbc123"}, {remove, "t1_bin", "abdc456"}], - Changes = convert_indexspecs(IndexSpecs, "Bucket", "Key2", 1, infinity), + Changes = idx_indexspecs(IndexSpecs, "Bucket", "Key2", 1, infinity), ?assertMatch({{i, "Bucket", {"t1_int", 456}, "Key2"}, {1, {active, infinity}, no_lookup, null}}, lists:nth(1, Changes)), @@ -642,5 +735,67 @@ magichashperf_test() -> {TimeMH2, _HL1} = timer:tc(lists, map, [fun(K) -> magic_hash(K) end, KL]), io:format(user, "1000 keys magic hashed in ~w microseconds~n", [TimeMH2]). 
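%% A worked example for parse_date/4 above (illustration only; times are
%% invented and UTC is assumed): for an object last modified at
%% 2017-06-30 15:37:00, UnitMins = 5 rounds the minute down to 35, so with
%% LimitMins = 60 and a Now inside the limit the result is
%% {<<"201706301535">>, TTL}, where
%% TTL = min(Now, LMDsecs) + (60 + 5) * 60
%% - the entry is kept one unit beyond the query window it can appear in.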
+parsedate_test() -> + {MeS, S, MiS} = os:timestamp(), + timer:sleep(100), + Now = integer_now(), + UnitMins = 5, + LimitMins = 60, + PD = parse_date({MeS, S, MiS}, UnitMins, LimitMins, Now), + io:format("Parsed Date ~w~n", [PD]), + ?assertMatch(true, is_tuple(PD)), + check_pd(PD, UnitMins), + CheckFun = + fun(Offset) -> + ModDate = {MeS, S + Offset * 60, MiS}, + check_pd(parse_date(ModDate, UnitMins, LimitMins, Now), UnitMins) + end, + lists:foreach(CheckFun, lists:seq(1, 60)). + +check_pd(PD, UnitMins) -> + {LMDbin, _TTL} = PD, + LMDstr = binary_to_list(LMDbin), + Minutes = list_to_integer(lists:nthtail(10, LMDstr)), + ?assertMatch(0, Minutes rem UnitMins). + +parseolddate_test() -> + LMD = os:timestamp(), + timer:sleep(100), + Now = integer_now() + 60 * 60, + UnitMins = 5, + LimitMins = 60, + PD = parse_date(LMD, UnitMins, LimitMins, Now), + io:format("Parsed Date ~w~n", [PD]), + ?assertMatch(no_index, PD). + +genaaeidx_test() -> + AAE = #recent_aae{buckets=all, limit_minutes=60, unit_minutes=5}, + Bucket = <<"Bucket1">>, + Key = <<"Key1">>, + SQN = 1, + H = erlang:phash2(null), + LastMods = [os:timestamp(), os:timestamp()], + + AAESpecs = aae_indexspecs(AAE, Bucket, Key, SQN, H, LastMods), + ?assertMatch(2, length(AAESpecs)), + + LastMods1 = [os:timestamp()], + AAESpecs1 = aae_indexspecs(AAE, Bucket, Key, SQN, H, LastMods1), + ?assertMatch(1, length(AAESpecs1)), + + LastMods0 = [], + AAESpecs0 = aae_indexspecs(AAE, Bucket, Key, SQN, H, LastMods0), + ?assertMatch(0, length(AAESpecs0)), + + AAE0 = AAE#recent_aae{buckets=[<<"Bucket0">>]}, + AAESpecsB0 = aae_indexspecs(AAE0, Bucket, Key, SQN, H, LastMods1), + ?assertMatch(0, length(AAESpecsB0)), + AAESpecsB1 = aae_indexspecs(AAE0, <<"Bucket0">>, Key, SQN, H, LastMods1), + + ?assertMatch(1, length(AAESpecsB1)), + [{{?AAE_TAG, _LMD, {{SegID, H}, <<"Bucket0">>}, <<"Key1">>}, + {SQN, {active, TS}, no_lookup, null}}] = AAESpecsB1, + ?assertMatch(true, is_integer(SegID)), + ?assertMatch(true, is_integer(TS)). -endif. \ No newline at end of file diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl index bfb2bff..41e3732 100644 --- a/src/leveled_penciller.erl +++ b/src/leveled_penciller.erl @@ -1002,7 +1002,7 @@ plain_fetch_mem(Key, Hash, Manifest, L0Cache, L0Index) -> element(1, R). fetch_mem(Key, Hash, Manifest, L0Cache, L0Index) -> - PosList = leveled_pmem:check_index(Hash, L0Index), + PosList = leveled_pmem:check_index(Hash, L0Index), L0Check = leveled_pmem:check_levelzero(Key, Hash, PosList, L0Cache), case L0Check of {false, not_found} -> diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index eb5af40..6e2073a 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -1503,11 +1503,11 @@ generate_indexkeys(Count, IndexList) -> generate_indexkey(Term, Count) -> IndexSpecs = [{add, "t1_int", Term}], - leveled_codec:convert_indexspecs(IndexSpecs, - "Bucket", - "Key" ++ integer_to_list(Count), - Count, - infinity). + leveled_codec:idx_indexspecs(IndexSpecs, + "Bucket", + "Key" ++ integer_to_list(Count), + Count, + infinity). form_slot_test() -> % If a skip key happens, mustn't switch to loookup by accident as could be diff --git a/src/leveled_tictac.erl b/src/leveled_tictac.erl index 1ba4b08..88d1744 100644 --- a/src/leveled_tictac.erl +++ b/src/leveled_tictac.erl @@ -97,17 +97,7 @@ new_tree(TreeID) -> new_tree(TreeID, small). 
new_tree(TreeID, Size) ->
- {BitWidth, Width, SegmentCount} =
- case Size of
- small ->
- ?SMALL;
- medium ->
- ?MEDIUM;
- large ->
- ?LARGE;
- xlarge ->
- ?XLARGE
- end,
+ {BitWidth, Width, SegmentCount} = get_size(Size),
 Lv1Width = Width * ?HASH_SIZE * 8,
 Lv1Init = <<0:Lv1Width/integer>>,
 Lv2SegBinSize = Width * ?HASH_SIZE * 8,
 Lv2SegBinInit = <<0:Lv2SegBinSize/integer>>,
@@ -236,14 +226,31 @@ merge_trees(TreeA, TreeB) ->
 MergedTree#tictactree{level1 = NewLevel1, level2 = NewLevel2}.

-get_segment(Key, SegmentCount) ->
- erlang:phash2(Key) band (SegmentCount - 1).
-
+-spec get_segment(any(), integer()|small|medium|large|xlarge) -> integer().
+%% @doc
+%% Return the segment ID for a Key. Can pass the tree size or the actual
+%% segment count derived from the size
+get_segment(Key, SegmentCount) when is_integer(SegmentCount) ->
+ erlang:phash2(Key) band (SegmentCount - 1);
+get_segment(Key, TreeSize) ->
+ get_segment(Key, element(3, get_size(TreeSize))).

%%%============================================================================
%%% Internal functions
%%%============================================================================

+get_size(Size) ->
+ case Size of
+ small ->
+ ?SMALL;
+ medium ->
+ ?MEDIUM;
+ large ->
+ ?LARGE;
+ xlarge ->
+ ?XLARGE
+ end.
+
 segmentcompare(SrcBin, SinkBin) when byte_size(SrcBin)==byte_size(SinkBin) ->
 segmentcompare(SrcBin, SinkBin, [], 0).

diff --git a/test/end_to_end/testutil.erl b/test/end_to_end/testutil.erl
index afc9aec..cd143cf 100644
--- a/test/end_to_end/testutil.erl
+++ b/test/end_to_end/testutil.erl
@@ -374,7 +374,8 @@ set_object(Bucket, Key, Value, IndexGen, Indexes2Remove) ->
 {remove, IdxF, IdxV}
 end,
 Indexes2Remove),
 [{"MDK", "MDV" ++ Key},
- {"MDK2", "MDV" ++ Key}]},
+ {"MDK2", "MDV" ++ Key},
+ {?MD_LASTMOD, os:timestamp()}]},
 {B1, K1, V1, Spec1, MD} = Obj,
 Content = #r_content{metadata=dict:from_list(MD), value=V1},
 {#r_object{bucket=B1,

From 954995e23fb43f9412063d7768cdb6e1c182e6b0 Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Fri, 30 Jun 2017 16:31:22 +0100
Subject: [PATCH 26/58] Support for recent AAE index

With basic ct test. Doesn't currently prove expiry of index.
Doesn't prove ability to find segments.

Assumes that either "all" buckets or a special list of buckets require
indexing this way. Will lead to unexpected results if the same bucket
name is used across different Tags.

The format of the index has been chosen so that hopefully standard index
features can be used (e.g. return_terms).
---
 include/leveled.hrl | 2 -
 src/leveled_bookie.erl | 8 +-
 src/leveled_codec.erl | 137 ++++++++++++-------
 src/leveled_tictac.erl | 12 +-
 test/end_to_end/tictac_SUITE.erl | 251 ++++++++++++++++++++++++++++++-
 5 files changed, 346 insertions(+), 64 deletions(-)

diff --git a/include/leveled.hrl b/include/leveled.hrl
index 6e1b603..bfb0593 100644
--- a/include/leveled.hrl
+++ b/include/leveled.hrl
@@ -5,8 +5,6 @@
 -define(STD_TAG, o).
 %% Tag used for secondary index keys
 -define(IDX_TAG, i).
-%% Tag used for near real-time anti-entropy index keys
--define(AAE_TAG, i_aae).

 %% Inker key type used for 'normal' objects
 -define(INKT_STND, stnd).
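To make the chosen index format concrete before the code below: a sketch of the field and term that one recent change would produce (values invented; the "$aae." prefix and 24-character zero-padding follow the ?NRT_IDX define and gen_indexspec call in the codec changes that follow):

%% Field : "$aae." ++ LMD ++ "_bin", e.g. <<"$aae.201706301535_bin">>
%% Term  : padded segment ID, ".", padded object hash, e.g.
%%         <<"000000000000000000004567.000000000000000098765432">>
%%
%% A standard 2i range query over one field therefore scans one time unit,
%% and return_terms exposes the segment/hash pairs for comparison.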
diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl index 8b46d81..f2b76db 100644 --- a/src/leveled_bookie.erl +++ b/src/leveled_bookie.erl @@ -1246,7 +1246,7 @@ get_hashaccumulator(JournalCheck, InkerClone, AddKeyFun) -> fun(LK, V, Acc) -> case leveled_codec:is_active(LK, V, Now) of true -> - {B, K, H} = leveled_codec:get_keyandhash(LK, V), + {B, K, H} = leveled_codec:get_keyandobjhash(LK, V), Check = random:uniform() < ?CHECKJOURNAL_PROB, case {JournalCheck, Check} of {check_presence, true} -> @@ -1408,13 +1408,13 @@ preparefor_ledgercache(?INKT_KEYD, preparefor_ledgercache(_InkTag, LedgerKey, SQN, Obj, Size, {IdxSpecs, TTL}, AAE) -> - {Bucket, Key, MetaValue, H, LastMods} = + {Bucket, Key, MetaValue, {KeyH, ObjH}, LastMods} = leveled_codec:generate_ledgerkv(LedgerKey, SQN, Obj, Size, TTL), KeyChanges = [{LedgerKey, MetaValue}] ++ leveled_codec:idx_indexspecs(IdxSpecs, Bucket, Key, SQN, TTL) ++ - leveled_codec:aae_indexspecs(AAE, Bucket, Key, SQN, H, LastMods), - {H, SQN, KeyChanges}. + leveled_codec:aae_indexspecs(AAE, Bucket, Key, SQN, ObjH, LastMods), + {KeyH, SQN, KeyChanges}. addto_ledgercache({H, SQN, KeyChanges}, Cache) -> diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl index 621f997..485a497 100644 --- a/src/leveled_codec.erl +++ b/src/leveled_codec.erl @@ -58,7 +58,7 @@ build_metadata_object/2, generate_ledgerkv/5, get_size/2, - get_keyandhash/2, + get_keyandobjhash/2, idx_indexspecs/5, aae_indexspecs/6, generate_uuid/0, @@ -70,7 +70,8 @@ -define(V1_VERS, 1). -define(MAGIC, 53). % riak_kv -> riak_object -define(LMD_FORMAT, "~4..0w~2..0w~2..0w~2..0w~2..0w"). - +-define(NRT_IDX, "$aae."). +-define(ALL_BUCKETS, list_to_binary("$all")). -type recent_aae() :: #recent_aae{}. @@ -106,8 +107,6 @@ to_lookup(Key) -> case element(1, Key) of ?IDX_TAG -> no_lookup; - ?AAE_TAG -> - no_lookup; _ -> lookup end. @@ -378,19 +377,24 @@ endkey_passed(EndKey, CheckingKey) -> EndKey < CheckingKey. idx_indexspecs(IndexSpecs, Bucket, Key, SQN, TTL) -> - lists:map(fun({IndexOp, IdxField, IdxValue}) -> - Status = case IndexOp of - add -> - {active, TTL}; - remove -> - %% TODO: timestamps for delayed reaping - tomb - end, - {to_ledgerkey(Bucket, Key, ?IDX_TAG, - IdxField, IdxValue), - {SQN, Status, no_lookup, null}} - end, - IndexSpecs). + lists:map( + fun({IdxOp, IdxFld, IdxTrm}) -> + gen_indexspec(Bucket, Key, IdxOp, IdxFld, IdxTrm, SQN, TTL) + end, + IndexSpecs + ). + +gen_indexspec(Bucket, Key, IdxOp, IdxField, IdxTerm, SQN, TTL) -> + Status = + case IdxOp of + add -> + {active, TTL}; + remove -> + %% TODO: timestamps for delayed reaping + tomb + end, + {to_ledgerkey(Bucket, Key, ?IDX_TAG, IdxField, IdxTerm), + {SQN, Status, no_lookup, null}}. 
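%% Editorial illustration (not part of the patch): with ?IDX_TAG defined as
%% the atom i in leveled.hrl, a specification {add, <<"idx1_bin">>, <<"21AUG">>}
%% for key <<"K1">> in bucket <<"B1">> at SQN 99 with TTL infinity yields:
%%     {{i, <<"B1">>, {<<"idx1_bin">>, <<"21AUG">>}, <<"K1">>},
%%         {99, {active, infinity}, no_lookup, null}}
%% i.e. a Ledger key/value pair flagged no_lookup, with no value hash.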
-spec aae_indexspecs(false|recent_aae(), any(), any(), @@ -411,16 +415,23 @@ aae_indexspecs(false, _Bucket, _Key, _SQN, _H, _LastMods) -> aae_indexspecs(_AAE, _Bucket, _Key, _SQN, _H, []) -> []; aae_indexspecs(AAE, Bucket, Key, SQN, H, LastMods) -> - InBucket = + Bucket0 = case AAE#recent_aae.buckets of all -> - true; + ?ALL_BUCKETS; ListB -> - lists:member(Bucket, ListB) + case lists:member(Bucket, ListB) of + true -> + Bucket; + false -> + false + end end, - case InBucket of - true -> - GenTagFun = + case Bucket0 of + false -> + []; + Bucket0 -> + GenIdxFun = fun(LMD0, Acc) -> Dates = parse_date(LMD0, AAE#recent_aae.unit_minutes, @@ -431,19 +442,23 @@ aae_indexspecs(AAE, Bucket, Key, SQN, H, LastMods) -> Acc; {LMD1, TTL} -> TreeSize = AAE#recent_aae.tree_size, - SegmentID = + SegID = leveled_tictac:get_segment(Key, TreeSize), - IdxK = {?AAE_TAG, - LMD1, - {{SegmentID, H}, Bucket}, - Key}, - IdxV = {SQN, {active, TTL}, no_lookup, null}, + IdxFldStr = ?NRT_IDX ++ LMD1 ++ "_bin", + IdxTrmStr = + string:right(integer_to_list(SegID), 24, $0) ++ + "." ++ + string:right(integer_to_list(H), 24, $0), + {IdxK, IdxV} = + gen_indexspec(Bucket0, Key, + add, + list_to_binary(IdxFldStr), + list_to_binary(IdxTrmStr), + SQN, TTL), [{IdxK, IdxV}|Acc] end end, - lists:foldl(GenTagFun, [], LastMods); - false -> - [] + lists:foldl(GenIdxFun, [], LastMods) end. -spec parse_date(tuple(), integer(), integer(), integer()) -> @@ -468,12 +483,12 @@ parse_date(LMD, UnitMins, LimitMins, Now) -> lists:flatten(io_lib:format(?LMD_FORMAT, [Y, M, D, Hour, RoundMins])), TTL = min(Now, LMDsecs) + (LimitMins + UnitMins) * 60, - {list_to_binary(StrTime), TTL} + {StrTime, TTL} end. --spec generate_ledgerkv(tuple(), integer(), any(), - integer(), tuple()|infinity) -> - {any(), any(), any(), integer()|no_lookup, list()}. +-spec generate_ledgerkv( + tuple(), integer(), any(), integer(), tuple()|infinity) -> + {any(), any(), any(), {integer()|no_lookup, integer()}, list()}. %% @doc %% Function to extract from an object the information necessary to populate %% the Penciller's ledger. @@ -482,7 +497,8 @@ parse_date(LMD, UnitMins, LimitMins, Now) -> %% Key - original Key extracted from the PrimaryKey %% Value - the value to be used in the Ledger (essentially the extracted %% metadata) -%% Hash - A magic hash of the key to be used in lookups and filters +%% {Hash, ObjHash} - A magic hash of the key to accelerate lookups, and a hash +%% of the value to be used for equality checking between objects %% LastMods - the last modified dates for the object (may be multiple due to %% siblings) generate_ledgerkv(PrimaryKey, SQN, Obj, Size, TS) -> @@ -495,11 +511,12 @@ generate_ledgerkv(PrimaryKey, SQN, Obj, Size, TS) -> end, Hash = magic_hash(PrimaryKey), {MD, LastMods} = extract_metadata(Obj, Size, Tag), + ObjHash = get_objhash(Tag, MD), Value = {SQN, Status, Hash, MD}, - {Bucket, Key, Value, Hash, LastMods}. + {Bucket, Key, Value, {Hash, ObjHash}, LastMods}. integer_now() -> @@ -525,22 +542,33 @@ get_size(PK, Value) -> {_Hash, Size} = MD, Size end. - -get_keyandhash(LK, Value) -> + +-spec get_keyandobjhash(tuple(), tuple()) -> tuple(). 
+%% @doc +%% Return a tucple of {Bucket, Key, Hash} where hash is a has of the object +%% not the key (for example with Riak tagged objects this will be a hash of +%% the sorted vclock) +get_keyandobjhash(LK, Value) -> {Tag, Bucket, Key, _} = LK, {_, _, _, MD} = Value, case Tag of - ?RIAK_TAG -> - {_RMD, _VC, Hash, _Size} = MD, - {Bucket, Key, Hash}; - ?STD_TAG -> - {Hash, _Size} = MD, - {Bucket, Key, Hash}; ?IDX_TAG -> - from_ledgerkey(LK) % returns {Bucket, Key, IdxValue} + from_ledgerkey(LK); % returns {Bucket, Key, IdxValue} + _ -> + {Bucket, Key, get_objhash(Tag, MD)} end. - +get_objhash(Tag, ObjMetaData) -> + case Tag of + ?RIAK_TAG -> + {_RMD, _VC, Hash, _Size} = ObjMetaData, + Hash; + ?STD_TAG -> + {Hash, _Size} = ObjMetaData, + Hash + end. + + build_metadata_object(PrimaryKey, MD) -> {Tag, _Bucket, _Key, null} = PrimaryKey, case Tag of @@ -753,8 +781,7 @@ parsedate_test() -> lists:foreach(CheckFun, lists:seq(1, 60)). check_pd(PD, UnitMins) -> - {LMDbin, _TTL} = PD, - LMDstr = binary_to_list(LMDbin), + {LMDstr, _TTL} = PD, Minutes = list_to_integer(lists:nthtail(10, LMDstr)), ?assertMatch(0, Minutes rem UnitMins). @@ -782,6 +809,9 @@ genaaeidx_test() -> LastMods1 = [os:timestamp()], AAESpecs1 = aae_indexspecs(AAE, Bucket, Key, SQN, H, LastMods1), ?assertMatch(1, length(AAESpecs1)), + IdxB = element(2, element(1, lists:nth(1, AAESpecs1))), + io:format(user, "AAE IDXSpecs1 ~w~n", [AAESpecs1]), + ?assertMatch(<<"$all">>, IdxB), LastMods0 = [], AAESpecs0 = aae_indexspecs(AAE, Bucket, Key, SQN, H, LastMods0), @@ -793,9 +823,10 @@ genaaeidx_test() -> AAESpecsB1 = aae_indexspecs(AAE0, <<"Bucket0">>, Key, SQN, H, LastMods1), ?assertMatch(1, length(AAESpecsB1)), - [{{?AAE_TAG, _LMD, {{SegID, H}, <<"Bucket0">>}, <<"Key1">>}, + [{{?IDX_TAG, <<"Bucket0">>, {Fld, Term}, <<"Key1">>}, {SQN, {active, TS}, no_lookup, null}}] = AAESpecsB1, - ?assertMatch(true, is_integer(SegID)), - ?assertMatch(true, is_integer(TS)). + ?assertMatch(true, is_integer(TS)), + ?assertMatch(49, length(binary_to_list(Term))), + ?assertMatch("$aae.", lists:sublist(binary_to_list(Fld), 5)). -endif. \ No newline at end of file diff --git a/src/leveled_tictac.erl b/src/leveled_tictac.erl index 88d1744..bd7f591 100644 --- a/src/leveled_tictac.erl +++ b/src/leveled_tictac.erl @@ -63,7 +63,8 @@ fetch_root/1, fetch_leaves/2, merge_trees/2, - get_segment/2 + get_segment/2, + tictac_hash/2 ]). @@ -117,7 +118,7 @@ new_tree(TreeID, Size) -> %% based on that key and value add_kv(TicTacTree, Key, Value, HashFun) -> HashV = HashFun(Key, Value), - SegChangeHash = erlang:phash2(Key, HashV), + SegChangeHash = tictac_hash(Key, HashV), Segment = get_segment(Key, TicTacTree#tictactree.segment_count), Level2Pos = @@ -235,6 +236,13 @@ get_segment(Key, SegmentCount) when is_integer(SegmentCount) -> get_segment(Key, TreeSize) -> get_segment(Key, element(3, get_size(TreeSize))). + +-spec tictac_hash(tuple(), any()) -> integer(). +%% @doc +%% Hash the key and term +tictac_hash(Key, Term) -> + erlang:phash2({Key, Term}). + %%%============================================================================ %%% Internal functions %%%============================================================================ diff --git a/test/end_to_end/tictac_SUITE.erl b/test/end_to_end/tictac_SUITE.erl index 2c2b3f8..33ace07 100644 --- a/test/end_to_end/tictac_SUITE.erl +++ b/test/end_to_end/tictac_SUITE.erl @@ -4,14 +4,19 @@ -export([all/0]). -export([ many_put_compare/1, - index_compare/1 + index_compare/1, + recent_aae_noaae/1, + recent_aae_allaae/1 ]). 
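%% Editorial note on the near-real-time AAE index format exercised by this
%% suite (built by leveled_codec:aae_indexspecs/6 in this patch): the index
%% field is "$aae." ++ LMD ++ "_bin", where LMD is the last modified datetime
%% rounded down to the configured unit minutes (e.g. "201706301630"), and the
%% index term is the zero-padded segment ID, a ".", then the zero-padded
%% object hash - each padded to 24 characters at this point in the series
%% (later reduced to 8).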
all() -> [ - many_put_compare, - index_compare + % many_put_compare, + % index_compare, + % recent_aae_noaae, + recent_aae_allaae ]. +-define(LMD_FORMAT, "~4..0w~2..0w~2..0w~2..0w~2..0w"). many_put_compare(_Config) -> TreeSize = small, @@ -419,3 +424,243 @@ index_compare(_Config) -> ok = leveled_bookie:book_close(Book2B), ok = leveled_bookie:book_close(Book2C), ok = leveled_bookie:book_close(Book2D). + + +recent_aae_noaae(_Config) -> + TreeSize = small, + % SegmentCount = 256 * 256, + UnitMins = 2, + + % Test requires multiple different databases, so want to mount them all + % on individual file paths + RootPathA = testutil:reset_filestructure("testA"), + RootPathB = testutil:reset_filestructure("testB"), + RootPathC = testutil:reset_filestructure("testC"), + RootPathD = testutil:reset_filestructure("testD"), + StartOptsA = aae_startopts(RootPathA, false), + StartOptsB = aae_startopts(RootPathB, false), + StartOptsC = aae_startopts(RootPathC, false), + StartOptsD = aae_startopts(RootPathD, false), + + % Book1A to get all objects + {ok, Book1A} = leveled_bookie:book_start(StartOptsA), + % Book1B/C/D will have objects partitioned across it + {ok, Book1B} = leveled_bookie:book_start(StartOptsB), + {ok, Book1C} = leveled_bookie:book_start(StartOptsC), + {ok, Book1D} = leveled_bookie:book_start(StartOptsD), + + {B1, K1, V1, S1, MD} = {"Bucket", + "Key1.1.4567.4321", + "Value1", + [], + [{"MDK1", "MDV1"}]}, + {TestObject, TestSpec} = testutil:generate_testobject(B1, K1, V1, S1, MD), + + SW_StartLoad = os:timestamp(), + + ok = testutil:book_riakput(Book1A, TestObject, TestSpec), + ok = testutil:book_riakput(Book1B, TestObject, TestSpec), + testutil:check_forobject(Book1A, TestObject), + testutil:check_forobject(Book1B, TestObject), + + {TicTacTreeJoined, TicTacTreeFull, EmptyTree, _LMDIndexes} = + load_and_check_recentaae(Book1A, Book1B, Book1C, Book1D, + SW_StartLoad, TreeSize, UnitMins, + false), + % Go compare! Also confirm we're not comparing empty trees + DL1_0 = leveled_tictac:find_dirtyleaves(TicTacTreeFull, + TicTacTreeJoined), + + DL1_1 = leveled_tictac:find_dirtyleaves(TicTacTreeFull, EmptyTree), + true = DL1_0 == [], + true = length(DL1_1) == 0, + + ok = leveled_bookie:book_close(Book1A), + ok = leveled_bookie:book_close(Book1B), + ok = leveled_bookie:book_close(Book1C), + ok = leveled_bookie:book_close(Book1D). 
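%% Editorial note: the recent_aae option passed in the start options (see
%% aae_startopts/2 below) is either false - no AAE indexing, as tested
%% above - or a {BucketFilter, LimitMins, UnitMins} tuple such as
%% {all, 60, 2}, indexing changes to all buckets for 60 minutes at 2 minute
%% granularity.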
+ + +recent_aae_allaae(_Config) -> + TreeSize = small, + % SegmentCount = 256 * 256, + UnitMins = 2, + AAE = {all, 60, UnitMins}, + + % Test requires multiple different databases, so want to mount them all + % on individual file paths + RootPathA = testutil:reset_filestructure("testA"), + RootPathB = testutil:reset_filestructure("testB"), + RootPathC = testutil:reset_filestructure("testC"), + RootPathD = testutil:reset_filestructure("testD"), + StartOptsA = aae_startopts(RootPathA, AAE), + StartOptsB = aae_startopts(RootPathB, AAE), + StartOptsC = aae_startopts(RootPathC, AAE), + StartOptsD = aae_startopts(RootPathD, AAE), + + % Book1A to get all objects + {ok, Book1A} = leveled_bookie:book_start(StartOptsA), + % Book1B/C/D will have objects partitioned across it + {ok, Book1B} = leveled_bookie:book_start(StartOptsB), + {ok, Book1C} = leveled_bookie:book_start(StartOptsC), + {ok, Book1D} = leveled_bookie:book_start(StartOptsD), + + {B1, K1, V1, S1, MD} = {"Bucket", + "Key1.1.4567.4321", + "Value1", + [], + [{"MDK1", "MDV1"}]}, + {TestObject, TestSpec} = testutil:generate_testobject(B1, K1, V1, S1, MD), + + SW_StartLoad = os:timestamp(), + + ok = testutil:book_riakput(Book1A, TestObject, TestSpec), + ok = testutil:book_riakput(Book1B, TestObject, TestSpec), + testutil:check_forobject(Book1A, TestObject), + testutil:check_forobject(Book1B, TestObject), + + {TicTacTreeJoined, TicTacTreeFull, EmptyTree, _LMDIndexes} = + load_and_check_recentaae(Book1A, Book1B, Book1C, Book1D, + SW_StartLoad, TreeSize, UnitMins, + false), + % Go compare! Also confirm we're not comparing empty trees + DL1_0 = leveled_tictac:find_dirtyleaves(TicTacTreeFull, + TicTacTreeJoined), + + DL1_1 = leveled_tictac:find_dirtyleaves(TicTacTreeFull, EmptyTree), + true = DL1_0 == [], + true = length(DL1_1) > 100, + + ok = leveled_bookie:book_close(Book1A), + ok = leveled_bookie:book_close(Book1B), + ok = leveled_bookie:book_close(Book1C), + ok = leveled_bookie:book_close(Book1D). + + +load_and_check_recentaae(Book1A, Book1B, Book1C, Book1D, + SW_StartLoad, TreeSize, UnitMins, + LMDIndexes_Loaded) -> + + LMDIndexes = + case LMDIndexes_Loaded of + false -> + % Generate nine lists of objects + % BucketBin = list_to_binary("Bucket"), + GenMapFun = + fun(_X) -> + V = testutil:get_compressiblevalue(), + Indexes = testutil:get_randomindexes_generator(8), + testutil:generate_objects(5000, + binary_uuid, + [], + V, + Indexes) + end, + + ObjLists = lists:map(GenMapFun, lists:seq(1, 9)), + + % Load all nine lists into Book1A + lists:foreach(fun(ObjL) -> testutil:riakload(Book1A, ObjL) end, + ObjLists), + + % Split nine lists across Book1B to Book1D, three object lists + % in each + lists:foreach(fun(ObjL) -> testutil:riakload(Book1B, ObjL) end, + lists:sublist(ObjLists, 1, 3)), + lists:foreach(fun(ObjL) -> testutil:riakload(Book1C, ObjL) end, + lists:sublist(ObjLists, 4, 3)), + lists:foreach(fun(ObjL) -> testutil:riakload(Book1D, ObjL) end, + lists:sublist(ObjLists, 7, 3)), + + SW_EndLoad = os:timestamp(), + determine_lmd_indexes(SW_StartLoad, SW_EndLoad, UnitMins); + _ -> + LMDIndexes_Loaded + end, + + EmptyTree = leveled_tictac:new_tree(empty, TreeSize), + + GetTicTacTreeFun = + fun(Bookie) -> + fun(LMD, Acc) -> + SW = os:timestamp(), + ST = <<"0">>, + ET = <<"A">>, + Q = {tictactree_idx, + {<<"$all">>, + list_to_binary("$aae." 
++ LMD ++ "_bin"), + ST, + ET}, + TreeSize, + fun(_B, _K) -> accumulate end}, + {async, Folder} = leveled_bookie:book_returnfolder(Bookie, Q), + R = Folder(), + io:format("TicTac Tree for index ~w took " ++ + "~w microseconds~n", + [LMD, timer:now_diff(os:timestamp(), SW)]), + leveled_tictac:merge_trees(R, Acc) + end + end, + + % Get a TicTac tree representing one of the indexes in Bucket A + TicTacTree1_Full = + lists:foldl(GetTicTacTreeFun(Book1A), EmptyTree, LMDIndexes), + + TicTacTree1_P1 = + lists:foldl(GetTicTacTreeFun(Book1B), EmptyTree, LMDIndexes), + TicTacTree1_P2 = + lists:foldl(GetTicTacTreeFun(Book1C), EmptyTree, LMDIndexes), + TicTacTree1_P3 = + lists:foldl(GetTicTacTreeFun(Book1D), EmptyTree, LMDIndexes), + + % Merge the tree across the partitions + TicTacTree1_Joined = lists:foldl(fun leveled_tictac:merge_trees/2, + TicTacTree1_P1, + [TicTacTree1_P2, TicTacTree1_P3]), + + {TicTacTree1_Full, TicTacTree1_Joined, EmptyTree, LMDIndexes}. + + +aae_startopts(RootPath, AAE) -> + LS = 2000, + JS = 50000000, + SS = testutil:sync_strategy(), + [{root_path, RootPath}, + {sync_strategy, SS}, + {cache_size, LS}, + {max_journalsize, JS}, + {recent_aae, AAE}]. + + +determine_lmd_indexes(StartTS, EndTS, UnitMins) -> + StartDT = calendar:now_to_datetime(StartTS), + EndDT = calendar:now_to_datetime(EndTS), + StartTimeStr = get_strtime(StartDT, UnitMins), + EndTimeStr = get_strtime(EndDT, UnitMins), + + AddTimeFun = + fun(X, Acc) -> + case lists:member(EndTimeStr, Acc) of + true -> + Acc; + false -> + NextTime = + 300 * X + + calendar:datetime_to_gregorian_seconds(StartDT), + NextDT = + calendar:gregorian_seconds_to_datetime(NextTime), + Acc ++ [get_strtime(NextDT, UnitMins)] + end + end, + + lists:foldl(AddTimeFun, [StartTimeStr], lists:seq(1, 5)). + + +get_strtime(DateTime, UnitMins) -> + {{Y, M, D}, {Hour, Minute, _Second}} = DateTime, + RoundMins = + UnitMins * (Minute div UnitMins), + StrTime = + lists:flatten(io_lib:format(?LMD_FORMAT, + [Y, M, D, Hour, RoundMins])), + StrTime. \ No newline at end of file From a15c046887fdf0ab3be904ddbb9c4f1deabf74bb Mon Sep 17 00:00:00 2001 From: martinsumner Date: Fri, 30 Jun 2017 16:31:48 +0100 Subject: [PATCH 27/58] Re-introduce commented tests --- test/end_to_end/tictac_SUITE.erl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/end_to_end/tictac_SUITE.erl b/test/end_to_end/tictac_SUITE.erl index 33ace07..18969a3 100644 --- a/test/end_to_end/tictac_SUITE.erl +++ b/test/end_to_end/tictac_SUITE.erl @@ -10,9 +10,9 @@ ]). all() -> [ - % many_put_compare, - % index_compare, - % recent_aae_noaae, + many_put_compare, + index_compare, + recent_aae_noaae, recent_aae_allaae ]. 
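A worked example of the last-modified-date rounding performed by ``get_strtime/2`` above may be useful (a standalone sketch, not part of the patch):

```erlang
%% Replicates the rounding in get_strtime/2: the minute value is rounded
%% down to the nearest UnitMins, then formatted as "YYYYMMDDHHMM".
round_lmd({{Y, M, D}, {Hour, Minute, _Sec}}, UnitMins) ->
    RoundMins = UnitMins * (Minute div UnitMins),
    lists:flatten(
        io_lib:format("~4..0w~2..0w~2..0w~2..0w~2..0w",
                        [Y, M, D, Hour, RoundMins])).

%% round_lmd({{2017,7,3},{15,41,30}}, 2) -> "201707031540"
%% round_lmd({{2017,7,3},{15,41,30}}, 5) -> "201707031540"
```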
From da53808e2ee66f19647909d022da3c27cb4e1e6a Mon Sep 17 00:00:00 2001 From: martinsumner Date: Sat, 1 Jul 2017 08:24:58 +0100 Subject: [PATCH 28/58] Extend test beyond restart Prove that recency check still works after a restart --- test/end_to_end/tictac_SUITE.erl | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/test/end_to_end/tictac_SUITE.erl b/test/end_to_end/tictac_SUITE.erl index 18969a3..fc9f36b 100644 --- a/test/end_to_end/tictac_SUITE.erl +++ b/test/end_to_end/tictac_SUITE.erl @@ -519,7 +519,7 @@ recent_aae_allaae(_Config) -> testutil:check_forobject(Book1A, TestObject), testutil:check_forobject(Book1B, TestObject), - {TicTacTreeJoined, TicTacTreeFull, EmptyTree, _LMDIndexes} = + {TicTacTreeJoined, TicTacTreeFull, EmptyTree, LMDIndexes} = load_and_check_recentaae(Book1A, Book1B, Book1C, Book1D, SW_StartLoad, TreeSize, UnitMins, false), @@ -534,7 +534,31 @@ recent_aae_allaae(_Config) -> ok = leveled_bookie:book_close(Book1A), ok = leveled_bookie:book_close(Book1B), ok = leveled_bookie:book_close(Book1C), - ok = leveled_bookie:book_close(Book1D). + ok = leveled_bookie:book_close(Book1D), + + % Book2A to get all objects + {ok, Book2A} = leveled_bookie:book_start(StartOptsA), + % Book2B/C/D will have objects partitioned across it + {ok, Book2B} = leveled_bookie:book_start(StartOptsB), + {ok, Book2C} = leveled_bookie:book_start(StartOptsC), + {ok, Book2D} = leveled_bookie:book_start(StartOptsD), + + {TicTacTreeJoined, TicTacTreeFull, EmptyTree, _LMDIndexes} = + load_and_check_recentaae(Book2A, Book2B, Book2C, Book2D, + SW_StartLoad, TreeSize, UnitMins, + LMDIndexes), + % Go compare! Also confirm we're not comparing empty trees + DL1_0 = leveled_tictac:find_dirtyleaves(TicTacTreeFull, + TicTacTreeJoined), + + DL1_1 = leveled_tictac:find_dirtyleaves(TicTacTreeFull, EmptyTree), + true = DL1_0 == [], + true = length(DL1_1) > 100, + + ok = leveled_bookie:book_close(Book2A), + ok = leveled_bookie:book_close(Book2B), + ok = leveled_bookie:book_close(Book2C), + ok = leveled_bookie:book_close(Book2D). load_and_check_recentaae(Book1A, Book1B, Book1C, Book1D, From 52ca0e4b6c4ee05ea7718d80cd909fcbe67c8c06 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Sun, 2 Jul 2017 19:33:18 +0100 Subject: [PATCH 29/58] Test expansion Detect a recent difference --- src/leveled_codec.erl | 6 +++--- test/end_to_end/testutil.erl | 8 ++++++-- test/end_to_end/tictac_SUITE.erl | 26 ++++++++++++++++++++++++-- 3 files changed, 33 insertions(+), 7 deletions(-) diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl index 485a497..2d413e4 100644 --- a/src/leveled_codec.erl +++ b/src/leveled_codec.erl @@ -446,9 +446,9 @@ aae_indexspecs(AAE, Bucket, Key, SQN, H, LastMods) -> leveled_tictac:get_segment(Key, TreeSize), IdxFldStr = ?NRT_IDX ++ LMD1 ++ "_bin", IdxTrmStr = - string:right(integer_to_list(SegID), 24, $0) ++ + string:right(integer_to_list(SegID), 8, $0) ++ "." ++ - string:right(integer_to_list(H), 24, $0), + string:right(integer_to_list(H), 8, $0), {IdxK, IdxV} = gen_indexspec(Bucket0, Key, add, @@ -826,7 +826,7 @@ genaaeidx_test() -> [{{?IDX_TAG, <<"Bucket0">>, {Fld, Term}, <<"Key1">>}, {SQN, {active, TS}, no_lookup, null}}] = AAESpecsB1, ?assertMatch(true, is_integer(TS)), - ?assertMatch(49, length(binary_to_list(Term))), + ?assertMatch(17, length(binary_to_list(Term))), ?assertMatch("$aae.", lists:sublist(binary_to_list(Fld), 5)). -endif. 
\ No newline at end of file diff --git a/test/end_to_end/testutil.erl b/test/end_to_end/testutil.erl index cd143cf..305a371 100644 --- a/test/end_to_end/testutil.erl +++ b/test/end_to_end/testutil.erl @@ -282,8 +282,12 @@ generate_testobject() -> generate_testobject(B1, K1, V1, Spec1, MD). generate_testobject(B, K, V, Spec, MD) -> - Content = #r_content{metadata=dict:from_list(MD), value=V}, - {#r_object{bucket=B, key=K, contents=[Content], vclock=[{'a',1}]}, + MD0 = [{?MD_LASTMOD, os:timestamp()}|MD], + Content = #r_content{metadata=dict:from_list(MD0), value=V}, + {#r_object{bucket=B, + key=K, + contents=[Content], + vclock=generate_vclock()}, Spec}. diff --git a/test/end_to_end/tictac_SUITE.erl b/test/end_to_end/tictac_SUITE.erl index fc9f36b..c16858d 100644 --- a/test/end_to_end/tictac_SUITE.erl +++ b/test/end_to_end/tictac_SUITE.erl @@ -543,7 +543,7 @@ recent_aae_allaae(_Config) -> {ok, Book2C} = leveled_bookie:book_start(StartOptsC), {ok, Book2D} = leveled_bookie:book_start(StartOptsD), - {TicTacTreeJoined, TicTacTreeFull, EmptyTree, _LMDIndexes} = + {TicTacTreeJoined, TicTacTreeFull, EmptyTree, LMDIndexes} = load_and_check_recentaae(Book2A, Book2B, Book2C, Book2D, SW_StartLoad, TreeSize, UnitMins, LMDIndexes), @@ -555,6 +555,28 @@ recent_aae_allaae(_Config) -> true = DL1_0 == [], true = length(DL1_1) > 100, + V2 = "Value2", + {TestObject2, TestSpec2} = + testutil:generate_testobject(B1, K1, V2, S1, MD), + + New_startTS = os:timestamp(), + + ok = testutil:book_riakput(Book2B, TestObject2, TestSpec2), + testutil:check_forobject(Book2B, TestObject2), + testutil:check_forobject(Book2A, TestObject), + + New_endTS = os:timestamp(), + NewLMDIndexes = determine_lmd_indexes(New_startTS, New_endTS, UnitMins), + {TicTacTreeJoined2, TicTacTreeFull2, _EmptyTree, NewLMDIndexes} = + load_and_check_recentaae(Book2A, Book2B, Book2C, Book2D, + New_startTS, TreeSize, UnitMins, + NewLMDIndexes), + DL2_0 = leveled_tictac:find_dirtyleaves(TicTacTreeFull2, + TicTacTreeJoined2), + + % DL2_1 = leveled_tictac:find_dirtyleaves(TicTacTreeFull, EmptyTree), + true = length(DL2_0) == 1, + ok = leveled_bookie:book_close(Book2A), ok = leveled_bookie:book_close(Book2B), ok = leveled_bookie:book_close(Book2C), @@ -619,7 +641,7 @@ load_and_check_recentaae(Book1A, Book1B, Book1C, Book1D, fun(_B, _K) -> accumulate end}, {async, Folder} = leveled_bookie:book_returnfolder(Bookie, Q), R = Folder(), - io:format("TicTac Tree for index ~w took " ++ + io:format("TicTac Tree for index ~s took " ++ "~w microseconds~n", [LMD, timer:now_diff(os:timestamp(), SW)]), leveled_tictac:merge_trees(R, Acc) From fd84e4f60812cb8217a30f48769adf8c201b41f3 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Sun, 2 Jul 2017 22:23:02 +0100 Subject: [PATCH 30/58] Test timeouts So that coverage testing will run. --- src/leveled_pmem.erl | 6 +++++- src/leveled_tictac.erl | 8 +++++--- test/end_to_end/basic_SUITE.erl | 8 ++++---- test/end_to_end/testutil.erl | 2 +- 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/src/leveled_pmem.erl b/src/leveled_pmem.erl index a36d4ac..b5ea64d 100644 --- a/src/leveled_pmem.erl +++ b/src/leveled_pmem.erl @@ -351,7 +351,11 @@ compare_method_test() -> [timer:now_diff(os:timestamp(), SWb), Sz1]), ?assertMatch(Sz0, Sz1). -with_index_test() -> +with_index_test_() -> + % Otherwise this test may timeout when run with coverage enabled + {timeout, 60, fun with_index_test2/0}. 
+
+with_index_test2() ->
     IndexPrepareFun =
         fun({K, _V}, Acc) ->
                 H = leveled_codec:magic_hash(K),
diff --git a/src/leveled_tictac.erl b/src/leveled_tictac.erl
index bd7f591..e7ced1e 100644
--- a/src/leveled_tictac.erl
+++ b/src/leveled_tictac.erl
@@ -356,9 +356,11 @@ merge_bysize_medium_test() ->
 merge_bysize_large_test() ->
     merge_test_withsize(large).
 
-% merge_bysize_xlarge_test() ->
-%     merge_test_withsize(xlarge).
-% timmeout on cover test - so commented
+merge_bysize_xlarge_test_() ->
+    {timeout, 60, fun merge_bysize_xlarge_test2/0}.
+
+merge_bysize_xlarge_test2() ->
+    merge_test_withsize(xlarge).
 
 merge_test_withsize(Size) ->
     HashFun = fun(_K, V) -> erlang:phash2(V) end,
diff --git a/test/end_to_end/basic_SUITE.erl b/test/end_to_end/basic_SUITE.erl
index 2f3337d..ce8f4b0 100644
--- a/test/end_to_end/basic_SUITE.erl
+++ b/test/end_to_end/basic_SUITE.erl
@@ -123,11 +123,11 @@ journal_compaction(_Config) ->
     ChkList1 = lists:sublist(lists:sort(ObjList1), 10000),
     testutil:check_forlist(Bookie1, ChkList1),
     testutil:check_forobject(Bookie1, TestObject),
-    {B2, K2, V2, Spec2, MD} = {"Bucket1",
-                                "Key1",
-                                "Value1",
+    {B2, K2, V2, Spec2, MD} = {"Bucket2",
+                                "Key2",
+                                "Value2",
                                 [],
-                                [{"MDK1", "MDV1"}]},
+                                [{"MDK2", "MDV2"}]},
     {TestObject2, TestSpec2} = testutil:generate_testobject(B2, K2, V2,
                                                             Spec2, MD),
     ok = testutil:book_riakput(Bookie1, TestObject2, TestSpec2),
diff --git a/test/end_to_end/testutil.erl b/test/end_to_end/testutil.erl
index 305a371..78c92c4 100644
--- a/test/end_to_end/testutil.erl
+++ b/test/end_to_end/testutil.erl
@@ -502,7 +502,7 @@ check_indexed_objects(Book, B, KSpecL, V) ->
                                         {fun foldkeysfun/3, []},
                                         {"idx1_bin",
                                             "0",
-                                            "~"},
+                                            "|"},
                                         ?RETURN_TERMS}),
     SW = os:timestamp(),
     {async, Fldr} = R,

From d0a825a14525efbd1bb993d61a18fe6c6ee266cd Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Mon, 3 Jul 2017 10:33:24 +0100
Subject: [PATCH 31/58] Extend test to detect keys

When comparing recent changes, demonstrate the detection of the keys
which have changed with a follow-up query
---
 test/end_to_end/tictac_SUITE.erl | 44 ++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/test/end_to_end/tictac_SUITE.erl b/test/end_to_end/tictac_SUITE.erl
index c16858d..a9c09a9 100644
--- a/test/end_to_end/tictac_SUITE.erl
+++ b/test/end_to_end/tictac_SUITE.erl
@@ -577,6 +577,50 @@ recent_aae_allaae(_Config) ->
     % DL2_1 = leveled_tictac:find_dirtyleaves(TicTacTreeFull, EmptyTree),
     true = length(DL2_0) == 1,
 
+    [DirtySeg] = DL2_0,
+    TermPrefix = string:right(integer_to_list(DirtySeg), 8, $0),
+
+    LMDSegFolder =
+        fun(LMD, {Acc, Bookie}) ->
+            IdxLMD = list_to_binary("$aae.
++ LMD ++ "_bin"),
+            IdxQ1 =
+                {index_query,
+                    <<"$all">>,
+                    {fun testutil:foldkeysfun/3, []},
+                    {IdxLMD,
+                        list_to_binary(TermPrefix ++ "."),
+                        list_to_binary(TermPrefix ++ "|")},
+                    {true, undefined}},
+            {async, IdxFolder} =
+                leveled_bookie:book_returnfolder(Bookie, IdxQ1),
+            {Acc ++ IdxFolder(), Bookie}
+        end,
+    {KeysTerms2A, _} = lists:foldl(LMDSegFolder,
+                                    {[], Book2A},
+                                    lists:usort(LMDIndexes ++ NewLMDIndexes)),
+    true = length(KeysTerms2A) >= 1,
+
+    {KeysTerms2B, _} = lists:foldl(LMDSegFolder,
+                                    {[], Book2B},
+                                    lists:usort(LMDIndexes ++ NewLMDIndexes)),
+    {KeysTerms2C, _} = lists:foldl(LMDSegFolder,
+                                    {[], Book2C},
+                                    lists:usort(LMDIndexes ++ NewLMDIndexes)),
+    {KeysTerms2D, _} = lists:foldl(LMDSegFolder,
+                                    {[], Book2D},
+                                    lists:usort(LMDIndexes ++ NewLMDIndexes)),
+
+    KeysTerms2Joined = KeysTerms2B ++ KeysTerms2C ++ KeysTerms2D,
+    DeltaX = lists:subtract(KeysTerms2A, KeysTerms2Joined),
+    DeltaY = lists:subtract(KeysTerms2Joined, KeysTerms2A),
+
+    io:format("DeltaX ~w~n", [DeltaX]),
+    io:format("DeltaY ~w~n", [DeltaY]),
+
+    true = length(DeltaX) == 0, % This hasn't seen any extra changes
+    true = length(DeltaY) == 1, % This has seen an extra change
+    [{_, K1}] = DeltaY,
+
     ok = leveled_bookie:book_close(Book2A),
     ok = leveled_bookie:book_close(Book2B),
     ok = leveled_bookie:book_close(Book2C),
     ok = leveled_bookie:book_close(Book2D).

From b143ea1c08be452a392b3adb890eecb3e4c113dd Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Mon, 3 Jul 2017 11:12:03 +0100
Subject: [PATCH 32/58] Anti-entropy write-up

Some initial jottings
---
 docs/ANTI_ENTROPY.md | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 docs/ANTI_ENTROPY.md

diff --git a/docs/ANTI_ENTROPY.md b/docs/ANTI_ENTROPY.md
new file mode 100644
index 0000000..2ea6dc4
--- /dev/null
+++ b/docs/ANTI_ENTROPY.md
@@ -0,0 +1,42 @@
+# Anti-Entropy
+
+Leveled is primarily designed to be a backend for Riak. Riak has a number of anti-entropy mechanisms for comparing database state within and across clusters. As part of exploring the potential for improvements in these anti-entropy mechanisms, some features have been added directly to Leveled. The purpose of these features is to:
+
+- Allow for the database state within a Leveled store or stores to be compared with another store or stores which should have the same data;
+- Allow for quicker checking that recent changes to one store or stores have also been received by another store or stores that should be receiving the same changes.
+
+The aim is to use these backend capabilities to allow for a Riak anti-entropy mechanism with the following features:
+
+- Comparison can be made between clusters with different ring-sizes - comparison is not coupled to partitioning.
+- Comparison can use a consistent approach to compare state within and between clusters.
+- Comparison does not rely on duplication of database state to a separate store, with further anti-entropy required to manage state variance between the actual and anti-entropy stores.
+- Comparison of state can be abstracted from Riak specific implementation so that mechanisms to compare between riak clusters can be re-used to compare between a Riak cluster and another database store. Coordination with another data store (e.g. Solr) can be controlled by the Riak user not the Riak developer.
+
+## Merkle Trees
+
+Riak has historically used [Merkle trees](https://en.wikipedia.org/wiki/Merkle_tree) as a way to communicate state efficiently between actors.
Merkle trees have been designed to be cryptographically secure so that they don't leak details of the individual transactions themselves. This strength is useful in many Merkle Tree use cases, and is partly derived from the use of concatenation when calculating branch hashes from leaf hashes:
+
+> A hash tree is a tree of hashes in which the leaves are hashes of data blocks in, for instance, a file or set of files. Nodes further up in the tree are the hashes of their respective children. For example, in the picture hash 0 is the result of hashing the concatenation of hash 0-0 and hash 0-1. That is, hash 0 = hash( hash 0-0 + hash 0-1 ) where + denotes concatenation.
+
+A side effect of the concatenation decision is that trees cannot be calculated incrementally, when elements are not ordered by segment. To calculate the hash of an individual leaf (or segment), the hashes of all the elements under that leaf must be accumulated first.
+
+## Tic-Tac Trees
+
+Anti-entropy in leveled is supported using the leveled_tictac module. This module uses a less secure form of merkle trees that don't prevent information from leaking out, but allow for the trees to be built incrementally, and trees built incrementally to be merged. These trees we're calling Tic-Tac Trees after the [Tic-Tac language](https://en.wikipedia.org/wiki/Tic-tac) which has been historically used on racecourses to communicate the state of the market between participants; although the more widespread use of mobile communications means that the use of Tic-Tac is petering out, and rather like Basho employees, there are now only three Tic-Tac practitioners left.
+
+The change from Merkle trees to Tic-Tac trees is simply to no longer use a cryptographically strong hashing algorithm, and now combine hashes through XORing rather than concatenation - to enable merging and incremental builds.
+
+## Divide and Conquer
+
+.... to be completed
+
+
+Splitting the problem into two parts
+
+full database state
+recent changes
+
+as opposed to
+
+full database state
+rebuilt full database state

From 97fdd36d53ba67c975ca7ca61fc8302c7ad3bca5 Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Mon, 3 Jul 2017 18:03:13 +0100
Subject: [PATCH 33/58] Returning bucket when bucket is all

Need to know {Bucket, Key} not just Key if all buckets are being covered
by nrt aae. So shoehorning this in - will also allow for proper use of
FilterFun when filtering by partition.
---
 docs/ANTI_ENTROPY.md             | 42 ++++++++++++++++++++++++++++----
 src/leveled_bookie.erl           |  2 --
 src/leveled_codec.erl            | 31 +++++++++++++++++------
 test/end_to_end/testutil.erl     |  6 +++++
 test/end_to_end/tictac_SUITE.erl |  8 +++---
 5 files changed, 70 insertions(+), 19 deletions(-)

diff --git a/docs/ANTI_ENTROPY.md b/docs/ANTI_ENTROPY.md
index 2ea6dc4..0919aba 100644
--- a/docs/ANTI_ENTROPY.md
+++ b/docs/ANTI_ENTROPY.md
@@ -3,14 +3,18 @@ Leveled is primarily designed to be a backend for Riak. Riak has a number of anti-entropy mechanisms for comparing database state within and across clusters. As part of exploring the potential for improvements in these anti-entropy mechanisms, some features have been added directly to Leveled. The purpose of these features is to:
 
 - Allow for the database state within a Leveled store or stores to be compared with another store or stores which should have the same data;
+
 - Allow for quicker checking that recent changes to one store or stores have also been received by another store or stores that should be receiving the same changes.
The aim is to use these backend capabilities to allow for a Riak anti-entropy mechanism with the following features:
 
 - Comparison can be made between clusters with different ring-sizes - comparison is not coupled to partitioning.
+
 - Comparison can use a consistent approach to compare state within and between clusters.
+
 - Comparison does not rely on duplication of database state to a separate store, with further anti-entropy required to manage state variance between the actual and anti-entropy stores.
-- Comparison of state can be abstracted from Riak specific implementation so that mechanisms to compare between riak clusters can be re-used to compare between a Riak cluster and another database store. Coordination with another data store (e.g. Solr) can be controlled by the Riak user not the Riak developer.
+
+- Comparison of state can be abstracted from Riak specific implementation so that mechanisms to compare between Riak clusters can be re-used to compare between a Riak cluster and another database store. Coordination with another data store (e.g. Solr) can be controlled by the Riak user not the Riak developer.
 
 ## Merkle Trees
 
@@ -18,15 +22,43 @@ Riak has historically used [Merkle trees](https://en.wikipedia.org/wiki/Merkle_t
 
 > A hash tree is a tree of hashes in which the leaves are hashes of data blocks in, for instance, a file or set of files. Nodes further up in the tree are the hashes of their respective children. For example, in the picture hash 0 is the result of hashing the concatenation of hash 0-0 and hash 0-1. That is, hash 0 = hash( hash 0-0 + hash 0-1 ) where + denotes concatenation.
 
-A side effect of the concatenation decision is that trees cannot be calculated incrementally, when elements are not ordered by segment. To calculate the hash of an individual leaf (or segment), the hashes of all the elements under that leaf must be accumulated first.
+A side effect of the concatenation decision is that trees cannot be calculated incrementally, when elements are not ordered by segment. To calculate the hash of an individual leaf (or segment), the hashes of all the elements under that leaf must be accumulated first. In the case of the leaf segments in Riak, the leaf segments are made up of a hash of the concatenation of {Key, Hash} pairs under that leaf:
+
+``hash([{K1, H1}, {K2, H2} .. {Kn, Hn}])``
+
+This requires all of the keys and hashes to be pulled into memory to build the hashtree - unless the tree is being built segment by segment. The Riak hashtree data store is therefore ordered by segment so that it can be incrementally built. The segments which have had key changes are tracked, and at exchange time all "dirty segments" are re-scanned in the store segment by segment, so that the hashtree can be rebuilt.
 
 ## Tic-Tac Trees
 
-Anti-entropy in leveled is supported using the leveled_tictac module.
This module uses a less secure form of merkle trees that don't prevent information from leaking out, or make the tree tamper-proof, but allow for the trees to be built incrementally, and trees built incrementally to be merged. These trees we're calling Tic-Tac Trees after the [Tic-Tac language](https://en.wikipedia.org/wiki/Tic-tac) which has been historically used on racecourses to communicate the state of the market between participants; although the more widespread use of mobile communications means that the use of Tic-Tac is petering out, and rather like Basho employees, there are now only three Tic-Tac practitioners left.
+
+The change from Merkle trees to Tic-Tac trees is simply to no longer use a cryptographically strong hashing algorithm, and now combine hashes through XORing rather than concatenation. So a segment leaf is calculated from:
+
+``hash(K1, H1) XOR hash(K2, H2) XOR ... hash(Kn, Hn)``
+
+The Keys and hashes can now be combined in any order with any grouping.
+
+This enables two things:
+
+- The tree can be built incrementally when scanning across a store not in segment order (i.e. scanning across a store in key order) without needing to hold any state in memory beyond the size of the tree.
+
+- Two trees from stores with non-overlapping key ranges can be merged to reflect the combined state of that store i.e. the trees for each store can be built independently and in parallel and then subsequently merged without needing to build an interim view of the combined state.
+
+It is assumed that the trees will only be transferred securely between trusted actors already permitted to view, store and transfer the real data: so the loss of cryptographic strength of the tree is irrelevant to the overall security of the system.
+
+## Recent and Whole
+
+Anti-entropy in Riak is a dual-track process:
+
+- there is a need to efficiently and rapidly provide an update on store state that represents recent additions;
+
+- there is a need to ensure that the anti-entropy view of state represents the state of the whole database.
+
+Within the current Riak AAE implementation, recent changes are supported by having a separate anti-entropy store organised by segments so that the Merkle tree can be updated incrementally to reflect recent changes. The Merkle tree produced following these changes should then represent the whole state of the database.
+
+However as the view of the whole state is maintained in a different store to that holding the actual data: there is an entropy problem between the actual store and the AAE store e.g. data could be lost from the real store, and go undetected as it is not lost from the AAE store. So periodically the AAE store is rebuilt by scanning the whole of the real store. This rebuild can be an expensive process, and the cost is commonly controlled through performing this task infrequently, with changes pending in develop to try and manage the scheduling and throttling of this process.
-The change from Merkle trees to Tic-Tac trees is simply to no longer use a cryptographically strong hashing algorithm, and now combine hashes through XORing rather than concatenation - to enable merging and incremental builds.
-
-## Divide and Conquer
 
 .... to be completed
 
diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl
index f2b76db..ba346b9 100644
--- a/src/leveled_bookie.erl
+++ b/src/leveled_bookie.erl
@@ -1333,8 +1333,6 @@ accumulate_objects(FoldObjectsFun, InkerClone, Tag, DeferredFetch) ->
     AccFun.
- - check_presence(Key, Value, InkerClone) -> {LedgerKey, SQN} = leveled_codec:strip_to_keyseqonly({Key, Value}), case leveled_inker:ink_keycheck(InkerClone, LedgerKey, SQN) of diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl index 2d413e4..1dbccf3 100644 --- a/src/leveled_codec.erl +++ b/src/leveled_codec.erl @@ -71,7 +71,7 @@ -define(MAGIC, 53). % riak_kv -> riak_object -define(LMD_FORMAT, "~4..0w~2..0w~2..0w~2..0w~2..0w"). -define(NRT_IDX, "$aae."). --define(ALL_BUCKETS, list_to_binary("$all")). +-define(ALL_BUCKETS, <<"$all">>). -type recent_aae() :: #recent_aae{}. @@ -181,9 +181,10 @@ is_active(Key, Value, Now) -> false end. -from_ledgerkey({Tag, Bucket, {_IdxField, IdxValue}, Key}) - when Tag == ?IDX_TAG -> - {Bucket, Key, IdxValue}; +from_ledgerkey({?IDX_TAG, ?ALL_BUCKETS, {_IdxFld, IdxVal}, {Bucket, Key}}) -> + {Bucket, Key, IdxVal}; +from_ledgerkey({?IDX_TAG, Bucket, {_IdxFld, IdxVal}, Key}) -> + {Bucket, Key, IdxVal}; from_ledgerkey({_Tag, Bucket, Key, null}) -> {Bucket, Key}. @@ -393,8 +394,22 @@ gen_indexspec(Bucket, Key, IdxOp, IdxField, IdxTerm, SQN, TTL) -> %% TODO: timestamps for delayed reaping tomb end, - {to_ledgerkey(Bucket, Key, ?IDX_TAG, IdxField, IdxTerm), - {SQN, Status, no_lookup, null}}. + case Bucket of + {all, RealBucket} -> + {to_ledgerkey(?ALL_BUCKETS, + {RealBucket, Key}, + ?IDX_TAG, + IdxField, + IdxTerm), + {SQN, Status, no_lookup, null}}; + _ -> + {to_ledgerkey(Bucket, + Key, + ?IDX_TAG, + IdxField, + IdxTerm), + {SQN, Status, no_lookup, null}} + end. -spec aae_indexspecs(false|recent_aae(), any(), any(), @@ -418,7 +433,7 @@ aae_indexspecs(AAE, Bucket, Key, SQN, H, LastMods) -> Bucket0 = case AAE#recent_aae.buckets of all -> - ?ALL_BUCKETS; + {all, Bucket}; ListB -> case lists:member(Bucket, ListB) of true -> @@ -545,7 +560,7 @@ get_size(PK, Value) -> -spec get_keyandobjhash(tuple(), tuple()) -> tuple(). %% @doc -%% Return a tucple of {Bucket, Key, Hash} where hash is a has of the object +%% Return a tucple of {Bucket, Key, Hash} where hash is a hash of the object %% not the key (for example with Riak tagged objects this will be a hash of %% the sorted vclock) get_keyandobjhash(LK, Value) -> diff --git a/test/end_to_end/testutil.erl b/test/end_to_end/testutil.erl index 78c92c4..2952d48 100644 --- a/test/end_to_end/testutil.erl +++ b/test/end_to_end/testutil.erl @@ -43,6 +43,7 @@ find_journals/1, wait_for_compaction/1, foldkeysfun/3, + foldkeysfun_returnbucket/3, sync_strategy/0]). -define(RETURN_TERMS, {true, undefined}). @@ -484,6 +485,11 @@ get_randomdate() -> foldkeysfun(_Bucket, Item, Acc) -> Acc ++ [Item]. +foldkeysfun_returnbucket(Bucket, {Term, Key}, Acc) -> + Acc ++ [{Term, {Bucket, Key}}]; +foldkeysfun_returnbucket(Bucket, Key, Acc) -> + Acc ++ [{Bucket, Key}]. 
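%% Editorial note: when the <<"$all">> pseudo-bucket is queried, the ledger
%% key for the AAE index carries a composite {Bucket, Key} term (see the new
%% from_ledgerkey/1 clause in this patch), so this fold fun can retain the
%% real bucket alongside the key - returning {Term, {Bucket, Key}} entries
%% for return_terms queries - whereas foldkeysfun/3 above discards the
%% bucket.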
+ check_indexed_objects(Book, B, KSpecL, V) -> % Check all objects match, return what should be the results of an all % index query diff --git a/test/end_to_end/tictac_SUITE.erl b/test/end_to_end/tictac_SUITE.erl index a9c09a9..f11388c 100644 --- a/test/end_to_end/tictac_SUITE.erl +++ b/test/end_to_end/tictac_SUITE.erl @@ -106,11 +106,11 @@ many_put_compare(_Config) -> FoldKeysFun = fun(SegListToFind) -> - fun(_B, K, Acc) -> + fun(B, K, Acc) -> Seg = leveled_tictac:get_segment(K, SegmentCount), case lists:member(Seg, SegListToFind) of true -> - [K|Acc]; + [{B, K}|Acc]; false -> Acc end @@ -586,7 +586,7 @@ recent_aae_allaae(_Config) -> IdxQ1 = {index_query, <<"$all">>, - {fun testutil:foldkeysfun/3, []}, + {fun testutil:foldkeysfun_returnbucket/3, []}, {IdxLMD, list_to_binary(TermPrefix ++ "."), list_to_binary(TermPrefix ++ "|")}, @@ -619,7 +619,7 @@ recent_aae_allaae(_Config) -> true = length(DeltaX) == 0, % This hasn't seen any extra changes true = length(DeltaY) == 1, % This has seen an extra change - [{_, K1}] = DeltaY, + [{_, {B1, K1}}] = DeltaY, ok = leveled_bookie:book_close(Book2A), ok = leveled_bookie:book_close(Book2B), From 1af9ac56dc664c90aa883a66144734c24b0175e1 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Mon, 3 Jul 2017 19:06:41 +0100 Subject: [PATCH 34/58] Revert passing Bucket Bad edit. Reverted --- test/end_to_end/tictac_SUITE.erl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/end_to_end/tictac_SUITE.erl b/test/end_to_end/tictac_SUITE.erl index f11388c..4ba62d3 100644 --- a/test/end_to_end/tictac_SUITE.erl +++ b/test/end_to_end/tictac_SUITE.erl @@ -106,11 +106,11 @@ many_put_compare(_Config) -> FoldKeysFun = fun(SegListToFind) -> - fun(B, K, Acc) -> + fun(_B, K, Acc) -> Seg = leveled_tictac:get_segment(K, SegmentCount), case lists:member(Seg, SegListToFind) of true -> - [{B, K}|Acc]; + [K|Acc]; false -> Acc end From 439bf8c3b8c65d71ff3b9f86f5c2d647cbb55be2 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Tue, 4 Jul 2017 10:55:46 +0100 Subject: [PATCH 35/58] Add bucket whitelist test --- test/end_to_end/tictac_SUITE.erl | 185 ++++++++++++++++++++++++++++++- 1 file changed, 181 insertions(+), 4 deletions(-) diff --git a/test/end_to_end/tictac_SUITE.erl b/test/end_to_end/tictac_SUITE.erl index 4ba62d3..8feb703 100644 --- a/test/end_to_end/tictac_SUITE.erl +++ b/test/end_to_end/tictac_SUITE.erl @@ -6,14 +6,16 @@ many_put_compare/1, index_compare/1, recent_aae_noaae/1, - recent_aae_allaae/1 + recent_aae_allaae/1, + recent_aae_bucketaae/1 ]). all() -> [ many_put_compare, index_compare, recent_aae_noaae, - recent_aae_allaae + recent_aae_allaae, + recent_aae_bucketaae ]. -define(LMD_FORMAT, "~4..0w~2..0w~2..0w~2..0w~2..0w"). @@ -627,10 +629,185 @@ recent_aae_allaae(_Config) -> ok = leveled_bookie:book_close(Book2D). 
+ +recent_aae_bucketaae(_Config) -> + % Configure AAE to work only on a single whitelisted bucket + % Confirm that we can spot a delta in this bucket, but not + % in another bucket + + TreeSize = small, + % SegmentCount = 256 * 256, + UnitMins = 2, + AAE = {[<<"Bucket">>], 60, UnitMins}, + + % Test requires multiple different databases, so want to mount them all + % on individual file paths + RootPathA = testutil:reset_filestructure("testA"), + RootPathB = testutil:reset_filestructure("testB"), + RootPathC = testutil:reset_filestructure("testC"), + RootPathD = testutil:reset_filestructure("testD"), + StartOptsA = aae_startopts(RootPathA, AAE), + StartOptsB = aae_startopts(RootPathB, AAE), + StartOptsC = aae_startopts(RootPathC, AAE), + StartOptsD = aae_startopts(RootPathD, AAE), + + % Book1A to get all objects + {ok, Book1A} = leveled_bookie:book_start(StartOptsA), + % Book1B/C/D will have objects partitioned across it + {ok, Book1B} = leveled_bookie:book_start(StartOptsB), + {ok, Book1C} = leveled_bookie:book_start(StartOptsC), + {ok, Book1D} = leveled_bookie:book_start(StartOptsD), + + {B1, K1, V1, S1, MD} = {<<"Bucket">>, + "Key1.1.4567.4321", + "Value1", + [], + [{"MDK1", "MDV1"}]}, + {TestObject, TestSpec} = testutil:generate_testobject(B1, K1, V1, S1, MD), + + SW_StartLoad = os:timestamp(), + + ok = testutil:book_riakput(Book1A, TestObject, TestSpec), + ok = testutil:book_riakput(Book1B, TestObject, TestSpec), + testutil:check_forobject(Book1A, TestObject), + testutil:check_forobject(Book1B, TestObject), + + {TicTacTreeJoined, TicTacTreeFull, EmptyTree, LMDIndexes} = + load_and_check_recentaae(Book1A, Book1B, Book1C, Book1D, + SW_StartLoad, TreeSize, UnitMins, + false, <<"Bucket">>), + % Go compare! Also confirm we're not comparing empty trees + DL1_0 = leveled_tictac:find_dirtyleaves(TicTacTreeFull, + TicTacTreeJoined), + + DL1_1 = leveled_tictac:find_dirtyleaves(TicTacTreeFull, EmptyTree), + true = DL1_0 == [], + true = length(DL1_1) > 100, + + ok = leveled_bookie:book_close(Book1A), + ok = leveled_bookie:book_close(Book1B), + ok = leveled_bookie:book_close(Book1C), + ok = leveled_bookie:book_close(Book1D), + + % Book2A to get all objects + {ok, Book2A} = leveled_bookie:book_start(StartOptsA), + % Book2B/C/D will have objects partitioned across it + {ok, Book2B} = leveled_bookie:book_start(StartOptsB), + {ok, Book2C} = leveled_bookie:book_start(StartOptsC), + {ok, Book2D} = leveled_bookie:book_start(StartOptsD), + + % Change the value for a key in another bucket + % If we get trees for this period, no difference should be found + + V2 = "Value2", + {TestObject2, TestSpec2} = + testutil:generate_testobject(<<"NotBucket">>, K1, V2, S1, MD), + + New_startTS2 = os:timestamp(), + + ok = testutil:book_riakput(Book2B, TestObject2, TestSpec2), + testutil:check_forobject(Book2B, TestObject2), + testutil:check_forobject(Book2A, TestObject), + + New_endTS2 = os:timestamp(), + NewLMDIndexes2 = determine_lmd_indexes(New_startTS2, New_endTS2, UnitMins), + {TicTacTreeJoined2, TicTacTreeFull2, _EmptyTree, NewLMDIndexes2} = + load_and_check_recentaae(Book2A, Book2B, Book2C, Book2D, + New_startTS2, TreeSize, UnitMins, + NewLMDIndexes2, <<"Bucket">>), + DL2_0 = leveled_tictac:find_dirtyleaves(TicTacTreeFull2, + TicTacTreeJoined2), + true = length(DL2_0) == 0, + + % Now create an object that is a change to an existing key in the + % monitored bucket. 
A difference should be found
+
+    {TestObject3, TestSpec3} =
+        testutil:generate_testobject(B1, K1, V2, S1, MD),
+
+    New_startTS3 = os:timestamp(),
+
+    ok = testutil:book_riakput(Book2B, TestObject3, TestSpec3),
+    testutil:check_forobject(Book2B, TestObject3),
+    testutil:check_forobject(Book2A, TestObject),
+
+    New_endTS3 = os:timestamp(),
+    NewLMDIndexes3 = determine_lmd_indexes(New_startTS3, New_endTS3, UnitMins),
+    {TicTacTreeJoined3, TicTacTreeFull3, _EmptyTree, NewLMDIndexes3} =
+        load_and_check_recentaae(Book2A, Book2B, Book2C, Book2D,
+                                    New_startTS3, TreeSize, UnitMins,
+                                    NewLMDIndexes3, <<"Bucket">>),
+    DL3_0 = leveled_tictac:find_dirtyleaves(TicTacTreeFull3,
+                                            TicTacTreeJoined3),
+
+    % DL2_1 = leveled_tictac:find_dirtyleaves(TicTacTreeFull, EmptyTree),
+    true = length(DL3_0) == 1,
+
+    % Find the dirty segment, and use that to find the dirty key
+    %
+    % Note that unlike when monitoring $all, fold_keys can be used as there
+    % is no need to return the Bucket (as the bucket is known)
+
+    [DirtySeg] = DL3_0,
+    TermPrefix = string:right(integer_to_list(DirtySeg), 8, $0),
+
+    LMDSegFolder =
+        fun(LMD, {Acc, Bookie}) ->
+            IdxLMD = list_to_binary("$aae." ++ LMD ++ "_bin"),
+            IdxQ1 =
+                {index_query,
+                    <<"Bucket">>,
+                    {fun testutil:foldkeysfun/3, []},
+                    {IdxLMD,
+                        list_to_binary(TermPrefix ++ "."),
+                        list_to_binary(TermPrefix ++ "|")},
+                    {true, undefined}},
+            {async, IdxFolder} =
+                leveled_bookie:book_returnfolder(Bookie, IdxQ1),
+            {Acc ++ IdxFolder(), Bookie}
+        end,
+    {KeysTerms2A, _} = lists:foldl(LMDSegFolder,
+                                    {[], Book2A},
+                                    lists:usort(LMDIndexes ++ NewLMDIndexes3)),
+    true = length(KeysTerms2A) >= 1,
+
+    {KeysTerms2B, _} = lists:foldl(LMDSegFolder,
+                                    {[], Book2B},
+                                    lists:usort(LMDIndexes ++ NewLMDIndexes3)),
+    {KeysTerms2C, _} = lists:foldl(LMDSegFolder,
+                                    {[], Book2C},
+                                    lists:usort(LMDIndexes ++ NewLMDIndexes3)),
+    {KeysTerms2D, _} = lists:foldl(LMDSegFolder,
+                                    {[], Book2D},
+                                    lists:usort(LMDIndexes ++ NewLMDIndexes3)),
+
+    KeysTerms2Joined = KeysTerms2B ++ KeysTerms2C ++ KeysTerms2D,
+    DeltaX = lists:subtract(KeysTerms2A, KeysTerms2Joined),
+    DeltaY = lists:subtract(KeysTerms2Joined, KeysTerms2A),
+
+    io:format("DeltaX ~w~n", [DeltaX]),
+    io:format("DeltaY ~w~n", [DeltaY]),
+
+    true = length(DeltaX) == 0, % This hasn't seen any extra changes
+    true = length(DeltaY) == 1, % This has seen an extra change
+    [{_, K1}] = DeltaY,
+
+    ok = leveled_bookie:book_close(Book2A),
+    ok = leveled_bookie:book_close(Book2B),
+    ok = leveled_bookie:book_close(Book2C),
+    ok = leveled_bookie:book_close(Book2D).
+
+
 load_and_check_recentaae(Book1A, Book1B, Book1C, Book1D,
                             SW_StartLoad, TreeSize, UnitMins,
                             LMDIndexes_Loaded) ->
-    
+    load_and_check_recentaae(Book1A, Book1B, Book1C, Book1D,
+                                SW_StartLoad, TreeSize, UnitMins,
+                                LMDIndexes_Loaded, <<"$all">>).
+
+load_and_check_recentaae(Book1A, Book1B, Book1C, Book1D,
+                            SW_StartLoad, TreeSize, UnitMins,
+                            LMDIndexes_Loaded, Bucket) ->
     LMDIndexes =
         case LMDIndexes_Loaded of
             false ->
@@ -677,7 +854,7 @@ load_and_check_recentaae(Book1A, Book1B, Book1C, Book1D,
             ST = <<"0">>,
             ET = <<"A">>,
             Q = {tictactree_idx,
-                    {<<"$all">>,
+                    {Bucket,
                         list_to_binary("$aae.
++ LMD ++ "_bin"),
                         ST,
                         ET},
                     TreeSize,
                     fun(_B, _K) -> accumulate end},
             {async, Folder} = leveled_bookie:book_returnfolder(Bookie, Q),
             R = Folder(),
             io:format("TicTac Tree for index ~s took " ++
                         "~w microseconds~n",
                     [LMD, timer:now_diff(os:timestamp(), SW)]),
             leveled_tictac:merge_trees(R, Acc)
         end
     end,

From 0d72b353fe9b11b48a34af8ead19e75c23ac732b Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Tue, 4 Jul 2017 13:29:40 +0100
Subject: [PATCH 36/58] Add test of expiry of nrt aae terms

---
 test/end_to_end/tictac_SUITE.erl | 120 +++++++++++++++++++++++++------
 1 file changed, 97 insertions(+), 23 deletions(-)

diff --git a/test/end_to_end/tictac_SUITE.erl b/test/end_to_end/tictac_SUITE.erl
index 8feb703..20a1b47 100644
--- a/test/end_to_end/tictac_SUITE.erl
+++ b/test/end_to_end/tictac_SUITE.erl
@@ -7,7 +7,8 @@
     index_compare/1,
     recent_aae_noaae/1,
     recent_aae_allaae/1,
-    recent_aae_bucketaae/1
+    recent_aae_bucketaae/1,
+    recent_aae_expiry/1
     ]).
 
 all() -> [
@@ -15,7 +16,8 @@ all() -> [
     index_compare,
     recent_aae_noaae,
     recent_aae_allaae,
-    recent_aae_bucketaae
+    recent_aae_bucketaae,
+    recent_aae_expiry
     ].
 
 -define(LMD_FORMAT, "~4..0w~2..0w~2..0w~2..0w~2..0w").
@@ -798,6 +800,74 @@ recent_aae_bucketaae(_Config) ->
     ok = leveled_bookie:book_close(Book2D).
 
 
+recent_aae_expiry(_Config) ->
+    % Proof that the index entries are indeed expired
+
+    TreeSize = small,
+    % SegmentCount = 256 * 256,
+    UnitMins = 1,
+    TotalMins = 2,
+    AAE = {all, TotalMins, UnitMins},
+
+    % Test requires multiple different databases, so want to mount them all
+    % on individual file paths
+    RootPathA = testutil:reset_filestructure("testA"),
+    StartOptsA = aae_startopts(RootPathA, AAE),
+
+    % Book1A to get all objects
+    {ok, Book1A} = leveled_bookie:book_start(StartOptsA),
+
+    GenMapFun =
+        fun(_X) ->
+            V = testutil:get_compressiblevalue(),
+            Indexes = testutil:get_randomindexes_generator(8),
+            testutil:generate_objects(5000,
+                                        binary_uuid,
+                                        [],
+                                        V,
+                                        Indexes)
+        end,
+
+    ObjLists = lists:map(GenMapFun, lists:seq(1, 3)),
+
+    SW0 = os:timestamp(),
+    % Load all three lists into Book1A
+    lists:foreach(fun(ObjL) -> testutil:riakload(Book1A, ObjL) end,
+                    ObjLists),
+    SW1 = os:timestamp(),
+    % sleep for two minutes, so all index entries will have expired
+    GetTicTacTreeFun =
+        fun(Bookie) ->
+            get_tictactree_fun(Bookie, <<"$all">>, TreeSize)
+        end,
+    EmptyTree = leveled_tictac:new_tree(empty, TreeSize),
+    LMDIndexes = determine_lmd_indexes(SW0, SW1, UnitMins),
+
+    % Should get a non-empty answer to the query
+    TicTacTree1_Full =
+        lists:foldl(GetTicTacTreeFun(Book1A), EmptyTree, LMDIndexes),
+    DL3_0 = leveled_tictac:find_dirtyleaves(TicTacTree1_Full, EmptyTree),
+    true = length(DL3_0) > 0,
+
+    SecondsSinceLMD = timer:now_diff(os:timestamp(), SW0) div 1000000,
+    SecondsToExpiry = (TotalMins + UnitMins) * 60,
+
+    case SecondsToExpiry > SecondsSinceLMD of
+        true ->
+            timer:sleep((SecondsToExpiry - SecondsSinceLMD) * 1000);
+        false ->
+            timer:sleep(0)
+    end,
+
+    % Should now get an empty answer - all entries have expired
+    TicTacTree2_Full =
+        lists:foldl(GetTicTacTreeFun(Book1A), EmptyTree, LMDIndexes),
+
+    DL4_0 = leveled_tictac:find_dirtyleaves(TicTacTree2_Full, EmptyTree),
+    true = length(DL4_0) == 0.
+
+
+
 load_and_check_recentaae(Book1A, Book1B, Book1C, Book1D,
                             SW_StartLoad, TreeSize, UnitMins,
                             LMDIndexes_Loaded) ->
@@ -849,24 +919,7 @@ load_and_check_recentaae(Book1A, Book1B, Book1C, Book1D,
 
     GetTicTacTreeFun =
         fun(Bookie) ->
-            fun(LMD, Acc) ->
-                SW = os:timestamp(),
-                ST = <<"0">>,
-                ET = <<"A">>,
-                Q = {tictactree_idx,
-                        {Bucket,
-                            list_to_binary("$aae.
++ LMD ++ "_bin"), - ST, - ET}, - TreeSize, - fun(_B, _K) -> accumulate end}, - {async, Folder} = leveled_bookie:book_returnfolder(Bookie, Q), - R = Folder(), - io:format("TicTac Tree for index ~s took " ++ - "~w microseconds~n", - [LMD, timer:now_diff(os:timestamp(), SW)]), - leveled_tictac:merge_trees(R, Acc) - end + get_tictactree_fun(Bookie, Bucket, TreeSize) end, % Get a TicTac tree representing one of the indexes in Bucket A @@ -912,7 +965,7 @@ determine_lmd_indexes(StartTS, EndTS, UnitMins) -> Acc; false -> NextTime = - 300 * X + + UnitMins * 60 * X + calendar:datetime_to_gregorian_seconds(StartDT), NextDT = calendar:gregorian_seconds_to_datetime(NextTime), @@ -920,7 +973,7 @@ determine_lmd_indexes(StartTS, EndTS, UnitMins) -> end end, - lists:foldl(AddTimeFun, [StartTimeStr], lists:seq(1, 5)). + lists:foldl(AddTimeFun, [StartTimeStr], lists:seq(1, 10)). get_strtime(DateTime, UnitMins) -> @@ -930,4 +983,25 @@ get_strtime(DateTime, UnitMins) -> StrTime = lists:flatten(io_lib:format(?LMD_FORMAT, [Y, M, D, Hour, RoundMins])), - StrTime. \ No newline at end of file + StrTime. + + +get_tictactree_fun(Bookie, Bucket, TreeSize) -> + fun(LMD, Acc) -> + SW = os:timestamp(), + ST = <<"0">>, + ET = <<"A">>, + Q = {tictactree_idx, + {Bucket, + list_to_binary("$aae." ++ LMD ++ "_bin"), + ST, + ET}, + TreeSize, + fun(_B, _K) -> accumulate end}, + {async, Folder} = leveled_bookie:book_returnfolder(Bookie, Q), + R = Folder(), + io:format("TicTac Tree for index ~s took " ++ + "~w microseconds~n", + [LMD, timer:now_diff(os:timestamp(), SW)]), + leveled_tictac:merge_trees(R, Acc) + end. \ No newline at end of file From 3105656d2e8df8852f65f891446e410001aad78f Mon Sep 17 00:00:00 2001 From: martinsumner Date: Thu, 6 Jul 2017 15:40:30 +0100 Subject: [PATCH 37/58] Add test descriptions and further documentation --- docs/ANTI_ENTROPY.md | 132 +++++++++++++++++++++++++++++-- test/end_to_end/tictac_SUITE.erl | 18 +++++ 2 files changed, 142 insertions(+), 8 deletions(-) diff --git a/docs/ANTI_ENTROPY.md b/docs/ANTI_ENTROPY.md index 0919aba..c536dd2 100644 --- a/docs/ANTI_ENTROPY.md +++ b/docs/ANTI_ENTROPY.md @@ -48,6 +48,8 @@ It is assumed that the trees will only be transferred securely between trusted a ## Recent and Whole +### Current Riak AAE + Anti-entropy in Riak is a dual-track process: - there is a need to efficiently and rapidly provide an update on store state that represents recent additions; @@ -56,19 +58,133 @@ Anti-entropy in Riak is a dual-track process: Within the current Riak AAE implementation, recent changes are supported by having a separate anti-entropy store organised by segments so that the Merkle tree can be updated incrementally to reflect recent changes. The Merkle tree produced following these changes should then represent the whole state of the database. -However as the view of the whole state is maintained in a different store to that holding the actual data: there is an entropy problem between the actual store and the AAE store e.g. data could be lost from the real store, and go undetected as it is not lost from the AAE store. So periodically the AAE store is rebuilt by scanning the whole of the real store. This rebuild can be an expensive process, and the cost is commonly controlled through performing this task infrequently, with changes pending in develop to try and manage the scheduling and throttling of this process. 
+However as the view of the whole state is maintained in a different store to that holding the actual data: there is an entropy problem between the actual store and the AAE store e.g. data could be lost from the real store, and go undetected as it is not lost from the AAE store. So periodically the AAE store is rebuilt by scanning the whole of the real store. This rebuild can be an expensive process, and the cost is commonly controlled through performing this task infrequently. Prior to the end of Basho there were changes pending in develop to better throttle and schedule these updates - through the riak_kv_sweeper. + +The AAE store also needs to be partially scanned on a regular basis to update the current view of the Merkle tree. If a vnode has 100M keys, and there has been 1000 updates since the last merkle tree was updated - then there will need to be o(1000) seeks across subsets of the store returning o(100K) keys in total. As the store grows, the AAE store can grow to a non-trivial size, and have an impact on the page-cache and disk busyness. + +The AAE store is re-usable for checking consistency between databases, but with the following limitations: + +- the two stores need to be partitioned equally, constraining replication to other database technologies, and preventing replication from being used as an approach to re-partitioning (ring re-sizing). + +- the aae store is not split by bucket, and so supporting replication configured per bucket is challenging. + +### Proposed Leveled AAE + +There are three primary costs with scanning over the whole database: + +- the impact on the page cache as all keys and values have to be read from disk, including not-recently used values; + +- the overall I/O load (primarily disk-related) of scanning the full database from disk; + +- the overall I/O load (primarily network-related) of streaming results from the fold. + +The third cost can be addressed by the fold output being an incrementally updatable tree of a fixed size; i.e. if the fold builds a Tic-Tac tree and doesn't stream results (like list keys), and guarantees a fixed size output both from a single partition and following merging across multiple partitions. Within Leveled the first two costs are reduced by design due to the separation of Keys and Metadata from the object value, reducing significantly the workload associated with such a scan - especially where values are large. + +The [testing of traditional Riak AAE](https://github.com/martinsumner/leveled/blob/master/docs/VOLUME.md#leveled-aae-rebuild-with-journal-check) already undertaken has shown that scanning the database is not necessarily such a big issue in Leveled. So it does seem potentially feasible to scan the store on a regular basis. The testing of Leveldb with the riak_kv_sweeper feature shows that with the improved throttling more regular scanning is also possible here: testing with riak_kv_sweeper managed to achieve 10 x the number of sweeps, with only a 9% drop in throughput. + +A hypothesis is proposed that regular scanning of the full store to produce a Tic-Tac tree is certainly feasible in Leveled, but also potentially tolerable in other back-ends. However, frequent scanning may still be impractical. It is therefore suggested that there should be an alternative form of anti-entropy that can be run in addition to scanning, that is lower cost and can be run be frequently in support of scanning to produce Tic-Tac trees. 
This supporting anti-entropy should focus on the job of verifying that recent changes have been received. So there would be two anti-entropy mechanisms, one which can be run frequently (minutes) to check for the receipt of recent changes, and one that can be run regularly but infrequently (hours/days) to check that full database state is consistent. + +To provide a check on recent changes it is proposed to add a temporary index within the store, with an entry for each change that is built from a rounded last modified date and the hash of the value, so that the index can be scanned to form a Tic-Tac tree of recent changes. This assumes that each object has a Last Modified Date that is consistent (for that version) across all points where that particular version is stored, to use as the field name for the index. The term of the index is based on the segment ID (for the tree) and the hash of the value. This allows for a scan to build a tree of changes for a given range of modified dates, as well as a scan for keys and hashes to be returned for a given segment ID and date range. + +Within the Leveled the index can be made temporary by giving the entry a time-to-live, independent of any object time to live. So once the change is beyond the timescale in which the operator wishes to check for recent changes, it will naturally be removed from the database (through deletion on the next compaction event that hits the entry in the Ledger). + +Hence overall this should give: + +- A low cost mechanism for checking for the distribution of recent changes. + +- A mechanism for infrequently comparing overall state that is naturally consistent with the actual store state, without compromising operational stability of the store. + +- No additional long-term overhead (i.e. duplicate key store for anti-entropy). +## Leveled Implementation -.... to be completed +### Full Database Anti-Entropy +There are two parts to the full database anti-entropy mechanism: the Tic-Tac trees implemented in the leveled_tictac modules; and the queries required to build the trees available through the book_returnfolder function. There are two types of queries supported - -Splitting the problem into two parts +``` +{tictactree_obj, + {Tag, Bucket, StartKey, EndKey, CheckPresence}, + TreeSize, + PartitionFilter} +``` -full database state -recent changes +``` +{tictactree_idx, + {Bucket, IdxField, StartValue, EndValue}, + TreeSize, + PartitionFilter} +``` -as opposed to +The tictactree_obj folder produces a Tic-Tac tree form a fold across the objects (or more precisely the heads of the objects in the Ledger)m using the constraints Tag, Bucket, StartKey and EndKey. CheckPresence can be used to require the folder to confirm if the value is present in the Journal before including it in the tree - this will slow down the fold significantly, but protect from corruption in the Journal not represented in the Ledger. The partition filter cna be used where the store holds data from multiple partitions, and only data form a subset of partitions should be included, with the partition filter being a function on the Bucket and Key to make that decision. -full database state -rebuilt full database state +The tictactree_idx folder produces a tic-Tac tree from a range of an index, and so can be used as with tictactree_obj but for checking that an index is consistent between coverage offsets or between databases. + +These two folds are tested in the tictac_SUITE test suite in the ``many_put_compare`` and ``index_compare`` tests. 
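To make the fold usage concrete, the following is a minimal sketch (not part of the patch series) of how two stores holding the same data might be compared with the tictactree_obj query; the o_rkv tag, the null range bounds, the small tree size and the pass-through partition filter are illustrative assumptions here:

```
%% Sketch only: build a Tic-Tac tree over one bucket in each of two
%% bookie instances and compare the trees. CheckPresence is false, so
%% the Journal is not checked for each object head.
compare_stores(BookA, BookB, Bucket) ->
    Q = {tictactree_obj,
            {o_rkv, Bucket, null, null, false},
            small,
            fun(_B, _K) -> accumulate end},
    {async, FolderA} = leveled_bookie:book_returnfolder(BookA, Q),
    {async, FolderB} = leveled_bookie:book_returnfolder(BookB, Q),
    %% An empty list of dirty leaves implies the stores hold the same data
    leveled_tictac:find_dirtyleaves(FolderA(), FolderB()).
```

A non-empty result is a sorted list of dirty segment IDs, which can then be used to drive a narrower query for the keys within just those segments.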
+ +### Near Real-Time Anti-Entropy + +The near real-time anti-entropy process can be run in two modes: blacklisting and whitelisting. In blacklisting mode, specific buckets can be excluded from anti-entropy management, and all buckets not excluded are managed in a single "$all" bucket. Anti-entropy queries will need to always be requested against the "$all" bucket. In whitelisting mode, only specific buckets are included in anti-entropy management. Anti-entropy queries will need to be requested separately for each whitelisted bucket, and may be scheduled differently for each bucket. + +The index entry is then of the form: + +- Tag: ?IDX_TAG + +- Bucket: Bucket + +- Field: Last Modified Date (rounded down to a configured unit in minutes) + +- Term: Segment ++ "." ++ Hash + +- Key: Key + +In blacklist mode the Bucket will be $all, and the Key will actually be a {Bucket, Key} pair. + +The index entry is given a TTL of a configurable amount (e.g. 1 hour) - and no index entry may be added if the change is already considered to be too far in the past. The index entry is added to the Ledger in the same transaction as the other changes, and will be re-calculated and re-added out of the Journal under restart conditions where the change has not reached a persisted state in the Ledger prior to the close. + +The near real-time entropy index currently has four ct tests: + +- recent_aae_noaae (confirming loading a store with real-time aae disabled has no impact); + +- recent_aae_allaae (confirming that a single store loaded with data can be compared with a store where the same data is spread across three leveled instances - with all buckets covered by anti-entropy); + +- recent_aae_bucketaae (confirming that a single store loaded with data can be compared with a store where the same data is spread across three leveled instances - with a single bucket covered by anti-entropy); + +- recent_aae_expiry (confirming that the aae index will expire). + +### Clock Drift + +The proposed near-real-time anti-entropy mechanism depends on a timestamp, so ultimately some false positives and false negatives are unavoidable - especially if clock drift is large. The assumptions are though: + +- that this method is never a single dependency for ensuring consistency, it is supported by other mechanisms to further validate, which would detect false negatives. + +- that recovery from false positives will be safely implemented, so that a falsely identified discrepancy is validated before a change is made (e.g. repair through read-repair). + +Even with this mitigation, the volume of false positives and negatives needs to be controlled, in particular where clock drift is small (i.e. measured in seconds), and hence likely. If the object has a last modified date set in one place, as with Riak, there is no issue with different actors seeing a different last modified date for the same change. However, as the index object should expire, the risk exists that the store will set an inappropriate expiry time, or even not index the object as it considers the object to be a modification too far in the past. The Near Real-Time AAE process has the concept of unit minutes, which represents the level of granularity all times will be rounded to. All expiry times are set with a tolerance equal to the unit minutes, to avoid false positives or negatives when clock drift is small.
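Putting the entry description and the unit-minute rounding together, a sketch of deriving the index Field and Term might look as follows. This is illustrative only: the function name and the inlined segment calculation are invented for the example, while the "$aae." field naming, the "_bin" suffix and the ?LMD_FORMAT rounding mirror the accompanying ct tests.

```
%% Sketch only: derive the Field and Term of a near real-time AAE index
%% entry from an object's last modified datetime and value hash.
-define(LMD_FORMAT, "~4..0w~2..0w~2..0w~2..0w~2..0w").

aae_indexparts(Key, ObjHash, {{Y, Mo, D}, {H, Min, _Sec}}, UnitMins) ->
    RoundMins = UnitMins * (Min div UnitMins),  % round down to the unit
    LMD = lists:flatten(io_lib:format(?LMD_FORMAT,
                                        [Y, Mo, D, H, RoundMins])),
    Field = list_to_binary("$aae." ++ LMD ++ "_bin"),
    %% Map the key to a tree segment (a 256 * 256 segment space is
    %% assumed here, matching the small tree size used in the tests)
    Segment = erlang:phash2(Key) band (256 * 256 - 1),
    Term = list_to_binary(integer_to_list(Segment) ++ "."
                            ++ integer_to_list(ObjHash)),
    {Field, Term}.
```

In whitelisting mode the entry would be indexed under the object's own bucket; in blacklisting mode under "$all", with the key element carrying the {Bucket, Key} pair, as described above.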
+ +## Alternative Approaches + +The approach considered for Leveled has been to modify the Merkle trees used to ease their implementation, as well as specifically separating out anti-entropy for recent changes as a different problem to long-term anti-entropy of global state. + +There is [emergent research ongoing](http://haslab.uminho.pt/tome/files/global_logical_clocks.pdf) to try and leverage the use of dotted version vectors at a node level to improve the efficiency of managing key-level consistency, reduce the risks associated with deletes (without permanent tombstones), but also provide an inherently integrated approach to active anti-entropy. + +The Global Logical Clock approach does assume that durability is not mutable: + +> Nodes have access to durable storage; nodes can crash but +eventually will recover with the content of the durable storage as at the time of +the crash. + +It is strongly preferred that our anti-entropy approach can deal with the loss of data that had been persisted to disk (e.g. perhaps through administrative error or disk failure), not just the loss of updates not received. This doesn't mean that such an approach is invalid as: + +- the near real-time approach element of anti-entropy *is* only focused on the loss of updates not received; + +- it may be possible to periodically rebuild the state of bitmapped version vectors based on the data found on disk (similarly to the current hashtree rebuild process in Riak AAE). + +Some further consideration has been given to using a version of this Global Logical Clock approach to managing near-real-time anti-entropy only. More understanding of the approach is required to progress though, in particular: + +- How to manage comparisons between clusters with different partitioning algorithms (e.g. different ring-sizes); + +- How to discover key IDs from missing dots where the controlling node for the update has recently failed. + +These likely represent gaps in current understanding, rather than flaws in the approach. The evolution of this research will be tracked with interest. diff --git a/test/end_to_end/tictac_SUITE.erl b/test/end_to_end/tictac_SUITE.erl index 20a1b47..e2319da 100644 --- a/test/end_to_end/tictac_SUITE.erl +++ b/test/end_to_end/tictac_SUITE.erl @@ -431,6 +431,9 @@ index_compare(_Config) -> recent_aae_noaae(_Config) -> + % Starts databases with recent_aae tables, and attempts to query to fetch + % recent aae trees - empty trees are returned as no index entries are found. + TreeSize = small, % SegmentCount = 256 * 256, UnitMins = 2, @@ -486,6 +489,21 @@ recent_aae_noaae(_Config) -> recent_aae_allaae(_Config) -> + % Leveled is started in blacklist mode with no buckets blacklisted. + % + % A number of changes are then loaded into a store, and also partitioned + % across a separate set of three stores. A merged tree is returned from + % both the single store and the partitioned store, and proven to compare + % the same. + % + % A single change is then made, but into one half of the system only. The + % aae index is then re-queried and it is verified that a single segment + % difference is found.
+ % + % The segment Id found is then used in a query to find the Keys that make + % up that segment, and the delta discovered should be just that one key + % which was known to have been changed. + TreeSize = small, + % SegmentCount = 256 * 256, UnitMins = 2, From 57f7614b40129cc0b2beb5b234d64e411df08edd Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Thu, 6 Jul 2017 16:51:07 +0100
Subject: [PATCH 38/58] Initial edits

Part way through read-through. Discovered when referencing Cassandra Merkle Trees that they also use XOR of hashes rather than concatenation before hashing --- docs/ANTI_ENTROPY.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/docs/ANTI_ENTROPY.md b/docs/ANTI_ENTROPY.md index c536dd2..5b8fec0 100644 --- a/docs/ANTI_ENTROPY.md +++ b/docs/ANTI_ENTROPY.md @@ -1,10 +1,10 @@ # Anti-Entropy -Leveled is primarily designed to be a backend for Riak. Riak has a number of anti-entropy mechanisms for comparing database state within and across clusters, as part of exploring the potential for improvements in these anti-entropy mechanisms - some features have been added directly to Leveled. The purpose of these features is to: +Leveled is primarily designed to be a backend for Riak, and Riak has a number of anti-entropy mechanisms for comparing database state within and across clusters. As part of the ongoing community work to build improvements into Riak going forward, some features have been added directly to Leveled to directly test some potential enhancements to anti-entropy. These features are concerned with: -- Allow for the database state within in a Leveled store or stores to be compared with an other store or stores which should have the same data; +- Allowing for the database state within in a Leveled store or stores to be compared with an other store or stores which should have the same data; -- Allow for quicker checking that recent changes to one store or stores have also been received by another store or stores that should be receiving the same changes. +- Allowing for quicker checking that recent changes to one store or stores have also been received by another store or stores that should be receiving the same changes. The aim is to use these backend capabilities to allow for a Riak anti-entropy mechanism with the following features: @@ -12,9 +12,9 @@ The aim is to use these backend capabilities to allow for a Riak anti-entropy me - Comparison can use a consistent approach to compare state within and between clusters. -- Comparison does not rely on duplication of database state to a separate store, with further anti-entropy required to manage state variance between the actual and anti-entropy stores. +- Comparison does not rely on duplication of database state to a separate store, with further anti-entropy required to manage state variance between the actual stores and anti-entropy stores. -- Comparison of state can be abstracted from Riak specific implementation so that mechanisms to compare between Riak clusters can be re-used to compare between a Riak cluster and another database store. Coordination with another data store (e.g. Solr) can be controlled by the Riak user not the Riak developer. +- Comparison of state can be abstracted from Riak specific implementation so that mechanisms to compare between Riak clusters can be re-used to compare between a Riak cluster and another database store. Coordination with another data store (e.g. Solr) can be controlled by the Riak user not just the Riak developer.
## Merkle Trees @@ -30,13 +30,13 @@ This requires all of the keys and hashes need to be pulled into memory to build ## Tic-Tac Trees -Anti-entropy in leveled is supported using the leveled_tictac module. This module uses a less secure form of merkle trees that don't prevent information from leaking out, or make the tree tamper-proof, but allow for the trees to be built incrementally, and trees built incrementally to be merged. These trees we're calling Tic-Tac Trees after the [Tic-Tac language](https://en.wikipedia.org/wiki/Tic-tac) which has been historically used on racecourses to communicate the state of the market between participants; although the more widespread use of mobile communications means that the use of Tic-Tac is petering out, and rather like Basho employees, there are now only three Tic-Tac practitioners left. +Anti-entropy in leveled is supported using the leveled_tictac module. This module uses a less secure form of merkle trees that don't prevent information from leaking out, or make the tree tamper-proof, but allow for the trees to be built incrementally, and trees built incrementally to be merged. These Merkle trees we're calling Tic-Tac Trees after the [Tic-Tac language](https://en.wikipedia.org/wiki/Tic-tac) to fit in with Bookmaker-based naming conventions of leveled. The Tic-Tac language has been historically used on racecourses to communicate the state of the market between participants; although the more widespread use of mobile communications means that the use of Tic-Tac is petering out, and rather like Basho employees, there are now only three Tic-Tac practitioners left. The change from Merkle trees to Tic-Tac trees is simply to no longer use a cryptographically strong hashing algorithm, and now combine hashes through XORing rather than concatenation. So a segment leaf is calculated from: ``hash(K1, H1) XOR hash(K2, H2) XOR ... hash(Kn, Hn)`` -The Keys and hashes can now be combined in any order with any grouping. +The Keys and hashes can now be combined in any order with any grouping. The use of XOR instead of concatentation is [discouraged in secure Merkle Trees](https://security.stackexchange.com/questions/89847/can-xor-be-used-in-a-merkle-tree-instead-of-concatenation) but is not novel in its use within [trees focused on anti-entropy](http://distributeddatastore.blogspot.co.uk/2013/07/cassandra-using-merkle-trees-to-detect.html). This enables two things: From 780bf8aeb8e23850dfdddc343d4eef50c41d6666 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Thu, 6 Jul 2017 17:01:19 +0100 Subject: [PATCH 39/58] Wording changes Attempt to clarify --- docs/ANTI_ENTROPY.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/ANTI_ENTROPY.md b/docs/ANTI_ENTROPY.md index 5b8fec0..4b646a1 100644 --- a/docs/ANTI_ENTROPY.md +++ b/docs/ANTI_ENTROPY.md @@ -1,12 +1,12 @@ # Anti-Entropy -Leveled is primarily designed to be a backend for Riak, and Riak has a number of anti-entropy mechanisms for comparing database state within and across clusters. As part of the ongoing community work to build improvements into Riak going forward, some features have been added directly to Leveled to directly test some potential enhancements to anti-entropy. These features are concerned with: +Leveled is primarily designed to be a backend for Riak, and Riak has a number of anti-entropy mechanisms for comparing database state within and across clusters. 
As part of the ongoing community work to build improvements into a new pure open-source release of Riak, some features have been added directly to Leveled to directly test some potential enhancements to anti-entropy. These features are concerned with: - Allowing for the database state within in a Leveled store or stores to be compared with an other store or stores which should have the same data; - Allowing for quicker checking that recent changes to one store or stores have also been received by another store or stores that should be receiving the same changes. -The aim is to use these backend capabilities to allow for a Riak anti-entropy mechanism with the following features: +The aim is to use these as new backend capabilities, combined with new coverage FSM query behaviour, to allow for new Riak anti-entropy mechanisms with the following features: - Comparison can be made between clusters with different ring-sizes - comparison is not coupled to partitioning. @@ -16,6 +16,10 @@ The aim is to use these backend capabilities to allow for a Riak anti-entropy me - Comparison of state can be abstracted from Riak specific implementation so that mechanisms to compare between Riak clusters can be re-used to compare between a Riak cluster and another database store. Coordination with another data store (e.g. Solr) can be controlled by the Riak user not just the Riak developer. +- Comparison can be controlled at a bucket level, so that buckets can be configured to be either specifically whitelisted into the anti-entropy scope, or blacklisted from it - with the option to support different schedules for anti-entropy operations for different buckets when whitelisting is used. + +- Through the use of key types, allowing for flexibility to calculate anti-entropy in a way specific to the type of object being stored (e.g. supporting alternative mechanisms for some CRDT types). + ## Merkle Trees Riak has historically used [Merkle trees](https://en.wikipedia.org/wiki/Merkle_tree) as a way to communicate state efficiently between actors. Merkle trees have been designed to be cryptographically secure so that they don't leak details of the individual transactions themselves. This strength is useful in many Merkle Tree use cases, and is part derived from the use of concatenation when calculating branch hashes from leaf hashes: From 74f0e2bb6dfcb23793b593636c29d8b813f927d0 Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Thu, 6 Jul 2017 17:22:09 +0100
Subject: [PATCH 40/58] Further updates

--- docs/ANTI_ENTROPY.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/ANTI_ENTROPY.md b/docs/ANTI_ENTROPY.md index 4b646a1..1eb6785 100644 --- a/docs/ANTI_ENTROPY.md +++ b/docs/ANTI_ENTROPY.md @@ -1,8 +1,8 @@ # Anti-Entropy -Leveled is primarily designed to be a backend for Riak, and Riak has a number of anti-entropy mechanisms for comparing database state within and across clusters.
As part of the ongoing community work to build improvements into a new pure open-source release of Riak, some features have been added directly to Leveled to explore some potential enhancements to anti-entropy. These features are concerned with: -- Allowing for the database state within in a Leveled store or stores to be compared with an other store or stores which should have the same data; +- Allowing for the database state within a Leveled store or stores to be compared with another store or stores which should share a portion of that state; - Allowing for quicker checking that recent changes to one store or stores have also been received by another store or stores that should be receiving the same changes. @@ -12,7 +12,7 @@ The aim is to use these as new backend capabilities, combined with new coverage - Comparison can use a consistent approach to compare state within and between clusters. -- Comparison does not rely on duplication of database state to a separate store, with further anti-entropy required to manage state variance between the actual stores and anti-entropy stores. +- Comparison does not rely on duplication of database state to a separate anti-entropy database, with further anti-entropy required to manage state variance between the actual stores and anti-entropy stores. @@ -36,15 +36,15 @@ Anti-entropy in leveled is supported using the leveled_tictac module. This module uses a less secure form of merkle trees that don't prevent information from leaking out, or make the tree tamper-proof, but allow for the trees to be built incrementally, and trees built incrementally to be merged. These Merkle trees we're calling Tic-Tac Trees after the [Tic-Tac language](https://en.wikipedia.org/wiki/Tic-tac) to fit in with Bookmaker-based naming conventions of leveled. The Tic-Tac language has been historically used on racecourses to communicate the state of the market between participants; although the more widespread use of mobile communications means that the use of Tic-Tac is petering out, and rather like Basho employees, there are now only three Tic-Tac practitioners left. -The change from Merkle trees to Tic-Tac trees is simply to no longer use a cryptographically strong hashing algorithm, and now combine hashes through XORing rather than concatenation. So a segment leaf is calculated from: +The change from secure Merkle trees is simply to XOR and not hashing/concatenation for combining hashes, combined with using trees of fixed sizes, so that tree merging can also be managed through XOR operations. So a segment leaf is calculated from: ``hash(K1, H1) XOR hash(K2, H2) XOR ... hash(Kn, Hn)`` -The Keys and hashes can now be combined in any order with any grouping.
The use of XOR instead of concatenation is [discouraged in secure Merkle Trees](https://security.stackexchange.com/questions/89847/can-xor-be-used-in-a-merkle-tree-instead-of-concatenation) but is not novel in its use within [trees focused on anti-entropy](http://distributeddatastore.blogspot.co.uk/2013/07/cassandra-using-merkle-trees-to-detect.html). This enables two things: -- The tree can be built incrementally when scanning across a store not in segment order (i.e. scanning across a store in key order) without needing to hold an state in memory beyond the size of the tree. +- The tree can be built incrementally when scanning across a store not in segment order (i.e. scanning across a store in key order) without needing to hold any state in memory beyond the fixed size of the tree. - Two trees from stores with non-overlapping key ranges can be merged to reflect the combined state of those stores i.e. the trees for each store can be built independently and in parallel and then subsequently merged without needing to build an interim view of the combined state. From 4cbe838b529db8b4b1a5ed19e685c4f51f6fa536 Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Thu, 6 Jul 2017 17:37:59 +0100
Subject: [PATCH 41/58] Ongoing updates

--- docs/ANTI_ENTROPY.md | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/docs/ANTI_ENTROPY.md b/docs/ANTI_ENTROPY.md index 1eb6785..83a7ebc 100644 --- a/docs/ANTI_ENTROPY.md +++ b/docs/ANTI_ENTROPY.md @@ -56,15 +56,15 @@ Anti-entropy in Riak is a dual-track process: -- there is a need to efficiently and rapidly provide an update on store state that represents recent additions; +- there is a need to efficiently and rapidly provide an update on vnode-state that represents recent additions; -- there is a need to ensure that the anti-entropy view of state represents the state of the whole database. +- there is a need to ensure that the anti-entropy view of state represents the state of the whole database, as actually exists on disk. -Within the current Riak AAE implementation, recent changes are supported by having a separate anti-entropy store organised by segments so that the Merkle tree can be updated incrementally to reflect recent changes. The Merkle tree produced following these changes should then represent the whole state of the database.
+However as the view of the whole state is maintained in a different store to that holding the actual data: there is an entropy problem between the actual store and the AAE store e.g. data could be lost from the real store, and go undetected as it is not lost from the AAE store. So periodically the AAE store is rebuilt by scanning the whole of the real store. This rebuild can be an expensive process, and the cost is commonly controlled through performing this task infrequently. Prior to the end of Basho there were changes pending in develop to better throttle and schedule these updates - through the riak_kv_sweeper, so that the store could be built more frequently with safety (and so that the scans necessary to build the store could be multi-purpose). -The AAE store also needs to be partially scanned on a regular basis to update the current view of the Merkle tree. If a vnode has 100M keys, and there has been 1000 updates since the last merkle tree was updated - then there will need to be o(1000) seeks across subsets of the store returning o(100K) keys in total. As the store grows, the AAE store can grow to a non-trivial size, and have an impact on the page-cache and disk busyness. +The AAE store also needs to be partially scanned on a regular basis to update the current view of the Merkle tree. If a vnode has 100M keys, and there has been 1000 updates since the last merkle tree was updated - then there will need to be o(1000) seeks across subsets of the store returning o(100K) keys in total. As the store grows, the AAE store can grow to a non-trivial size, and have an impact on the page-cache and disk busyness within the node. The AAE store is re-usable for checking consistency between databases, but with the following limitations: @@ -74,7 +74,7 @@ The AAE store is re-usable for checking consistency between databases, but with ### Proposed Leveled AAE -There are three primary costs with scanning over the whole database: +The first stage in considering an alternative approach to anti-entropy, was to question the necessity of having a dedicated AAE database that needs to reflect all key changes in the actual vnode store. A separate store can have features such as being sorted by segment ID that make that store easier to scan for rebuilds of the tree. By contrast, there are three primary costs with scanning over the primary database: - the impact on the page cache as all keys and values have to be read from disk, including not-recently used values; @@ -82,15 +82,17 @@ There are three primary costs with scanning over the whole database: - the overall I/O load (primarily network-related) of streaming results from the fold. -The third cost can be addressed by the fold output being an incrementally updatable tree of a fixed size; i.e. if the fold builds a Tic-Tac tree and doesn't stream results (like list keys), and guarantees a fixed size output both from a single partition and following merging across multiple partitions. Within Leveled the first two costs are reduced by design due to the separation of Keys and Metadata from the object value, reducing significantly the workload associated with such a scan - especially where values are large. +The third cost can be addressed by the fold output being an incrementally updatable tree of a fixed size; i.e. if the fold builds a Tic-Tac tree and doesn't stream results (like list keys), and guarantees a fixed size output both from a single partition and following merging across multiple partitions. 
Within Leveled the first two costs are reduced by design due to the separation of Keys and Metadata from the object value, reducing significantly the workload associated with such a scan; especially where values are large. The [testing of traditional Riak AAE](https://github.com/martinsumner/leveled/blob/master/docs/VOLUME.md#leveled-aae-rebuild-with-journal-check) already undertaken has shown that scanning the database is not necessarily such a big issue in Leveled. So it does seem potentially feasible to scan the store on a regular basis. The testing of Leveldb with the riak_kv_sweeper feature shows that with the improved throttling more regular scanning is also possible here: testing with riak_kv_sweeper managed to achieve 10 x the number of sweeps, with only a 9% drop in throughput. -A hypothesis is proposed that regular scanning of the full store to produce a Tic-Tac tree is certainly feasible in Leveled, but also potentially tolerable in other back-ends. However, frequent scanning may still be impractical. It is therefore suggested that there should be an alternative form of anti-entropy that can be run in addition to scanning, that is lower cost and can be run be frequently in support of scanning to produce Tic-Tac trees. This supporting anti-entropy should focus on the job of verifying that recent changes have been received. So there would be two anti-entropy mechanisms, one which can be run frequently (minutes) to check for the receipt of recent changes, and one that can be run regularly but infrequently (hours/days) to check that full database state is consistent. +A hypothesis is proposed that regular scanning of the full store to produce a Tic-Tac tree is certainly feasible in Leveled, but also potentially tolerable in other back-ends. However, frequent scanning is likely to still be impractical. It is therefore suggested that there should be an alternative form of anti-entropy that can be run in addition to scanning, that is lower cost and can be run be frequently in support of whole database scanning. This additional anti-entropy mechanism would focus on the job of verifying that recent changes have been received. So there would be two anti-entropy mechanisms, one which can be run frequently (minutes) to check for the receipt of recent changes, and one that can be run regularly but infrequently (hours/days) to check that full database state is consistent. + +It is proposed to compare full database state by scanning the actual store, but producing a Tic-Tac tree as the outcome, one that can be merged across partitions through a coverage query to provide an overall view of the database state. This view could be compared with different coverage query offsets within the same cluster, and with different replicated clusters. To provide a check on recent changes it is proposed to add a temporary index within the store, with an entry for each change that is built from a rounded last modified date and the hash of the value, so that the index can be scanned to form a Tic-Tac tree of recent changes. This assumes that each object has a Last Modified Date that is consistent (for that version) across all points where that particular version is stored, to use as the field name for the index. The term of the index is based on the segment ID (for the tree) and the hash of the value. This allows for a scan to build a tree of changes for a given range of modified dates, as well as a scan for keys and hashes to be returned for a given segment ID and date range. 
-Within the Leveled the index can be made temporary by giving the entry a time-to-live, independent of any object time to live. So once the change is beyond the timescale in which the operator wishes to check for recent changes, it will naturally be removed from the database (through deletion on the next compaction event that hits the entry in the Ledger). +Within the Leveled the index can be made temporary by giving the entry a time-to-live, independent of any object time to live. So once the change is beyond the timescale in which the operator wishes to check for recent changes, it will naturally be removed from the database (through deletion on the next compaction event that hits the entry in the Ledger). Therefore in the long-term, there is no need to maintain additional state outside of the primary database stores, in order to manage anti-entropy. Hence overall this should give: From 417687c0044ac4ea7ff7c5cc5f606922b14920c9 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Fri, 7 Jul 2017 10:13:37 +0100 Subject: [PATCH 42/58] Word wrangling --- docs/ANTI_ENTROPY.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/ANTI_ENTROPY.md b/docs/ANTI_ENTROPY.md index 83a7ebc..781372d 100644 --- a/docs/ANTI_ENTROPY.md +++ b/docs/ANTI_ENTROPY.md @@ -32,7 +32,7 @@ A side effect of the concatenation decision is that trees cannot be calculated i This requires all of the keys and hashes need to be pulled into memory to build the hashtree - unless the tree is being built segment by segment. The Riak hashtree data store is therefore ordered by segment so that it can be incrementally built. The segments which have had key changes are tracked, and at exchange time all "dirty segments" are re-scanned in the store segment by segment, so that the hashtree can be rebuilt. -## Tic-Tac Trees +## Tic-Tac Merkle Trees Anti-entropy in leveled is supported using the leveled_tictac module. This module uses a less secure form of merkle trees that don't prevent information from leaking out, or make the tree tamper-proof, but allow for the trees to be built incrementally, and trees built incrementally to be merged. These Merkle trees we're calling Tic-Tac Trees after the [Tic-Tac language](https://en.wikipedia.org/wiki/Tic-tac) to fit in with Bookmaker-based naming conventions of leveled. The Tic-Tac language has been historically used on racecourses to communicate the state of the market between participants; although the more widespread use of mobile communications means that the use of Tic-Tac is petering out, and rather like Basho employees, there are now only three Tic-Tac practitioners left. @@ -56,15 +56,15 @@ It is assumed that the trees will only be transferred securely between trusted a Anti-entropy in Riak is a dual-track process: -- there is a need to efficiently and rapidly provide an update on vnode-state that represents recent additions; +- there is a capacity to efficiently and rapidly provide an update on overall vnode state that reflects recent additions, by maintaining a separate anti-entropy store in parallel to each primary store; -- there is a need to ensure that the anti-entropy view of state represents the state of the whole database, as actually exists on disk. +- there is a capacity to ensure that the anti-entropy view of state represents the state of the whole database, as actually exists on disk, by periodically rebuilding that view from the primary store. 
-Within the current Riak AAE implementation, tracking recent changes is supported by having a dedicated anti-entropy store organised by segments so that the Merkle tree can be updated incrementally to reflect recent changes. The Merkle tree produced following these changes should then represent the whole state of the database. +Within the current Riak AAE implementation, tracking recent changes is supported by having a dedicated anti-entropy store organised by segments (an identifier of a leaf of the Merkle tree) so that the Merkle tree can be updated incrementally to reflect recent changes. When a new update is received an AAE tree update is made following the vnode update, and the segment is marked as requiring an update before completing the next Merkle tree exchange. However as the view of the whole state is maintained in a different store to that holding the actual data: there is an entropy problem between the actual store and the AAE store e.g. data could be lost from the real store, and go undetected as it is not lost from the AAE store. So periodically the AAE store is rebuilt by scanning the whole of the real store. This rebuild can be an expensive process, and the cost is commonly controlled through performing this task infrequently. Prior to the end of Basho there were changes pending in develop to better throttle and schedule these updates - through the riak_kv_sweeper, so that the store could be built more frequently with safety (and so that the scans necessary to build the store could be multi-purpose). -The AAE store also needs to be partially scanned on a regular basis to update the current view of the Merkle tree. If a vnode has 100M keys, and there has been 1000 updates since the last merkle tree was updated - then there will need to be o(1000) seeks across subsets of the store returning o(100K) keys in total. As the store grows, the AAE store can grow to a non-trivial size, and have an impact on the page-cache and disk busyness within the node. +The AAE store also needs to be partially scanned on a regular basis to update the current view of the Merkle tree, to reflect the segments which have been altered by recent changes. If a vnode has 100M keys, and there has been 1000 updates since the last merkle tree was updated - then there will need to be o(1000) seeks across subsets of the store returning o(100K) keys in total. As the store grows, the AAE store can grow to a non-trivial size, and these operations may have an impact on the page-cache and disk busyness within the node. The AAE store is re-usable for checking consistency between databases, but with the following limitations: - the two stores need to be partitioned equally, constraining replication to other database technologies, and preventing replication from being used as an approach to re-partitioning (ring re-sizing). - the aae store is not split by bucket, and so supporting replication configured per bucket is challenging. +The AAE process in production systems commonly raises false positives (prompts repairs that are unnecessary), sometimes for [known reasons](https://github.com/basho/riak_kv/issues/1189), sometimes for unknown reasons, especially following rebuilds which follow ring changes. The repair process has [a throttle](http://docs.basho.com/riak/kv/2.2.3/using/cluster-operations/active-anti-entropy/#throttling) to prevent this from impacting a production system, but this commonly needs to be re-tuned based on experience.
+ ### Proposed Leveled AAE -The first stage in considering an alternative approach to anti-entropy, was to question the necessity of having a dedicated AAE database that needs to reflect all key changes in the actual vnode store. A separate store can have features such as being sorted by segment ID that make that store easier to scan for rebuilds of the tree. By contrast, there are three primary costs with scanning over the primary database: +The first stage in considering an alternative approach to anti-entropy was to question the necessity of having a dedicated AAE database that needs to reflect all key changes in the actual vnode store. A separate store can have features such as being sorted by segment ID that make that store easier to scan for rebuilds of the tree: hence avoiding the three main costs with scanning over the primary database: - the impact on the page cache as all keys and values have to be read from disk, including not-recently used values; - the overall I/O load (primarily disk-related) of scanning the full database from disk; - the overall I/O load (primarily network-related) of streaming results from the fold. The third cost can be addressed by the fold output being an incrementally updatable tree of a fixed size; i.e. if the fold builds a Tic-Tac tree and doesn't stream results (like list keys), and guarantees a fixed size output both from a single partition and following merging across multiple partitions. Within Leveled the first two costs are reduced by design due to the separation of Keys and Metadata from the object value, reducing significantly the workload associated with such a scan; especially where values are large. The [testing of traditional Riak AAE](https://github.com/martinsumner/leveled/blob/master/docs/VOLUME.md#leveled-aae-rebuild-with-journal-check) already undertaken has shown that scanning the database is not necessarily such a big issue in Leveled. So it does seem potentially feasible to scan the store on a regular basis. The testing of Leveldb with the riak_kv_sweeper feature shows that with the improved throttling more regular scanning is also possible here: testing with riak_kv_sweeper managed to achieve 10 x the number of sweeps, with only a 9% drop in throughput. -A hypothesis is proposed that regular scanning of the full store to produce a Tic-Tac tree is certainly feasible in Leveled, but also potentially tolerable in other back-ends. However, frequent scanning is likely to still be impractical. It is therefore suggested that there should be an alternative form of anti-entropy that can be run in addition to scanning, that is lower cost and can be run be frequently in support of whole database scanning. This additional anti-entropy mechanism would focus on the job of verifying that recent changes have been received.
+A hypothesis is proposed that regular scanning of the full store to produce a Tic-Tac tree is certainly feasible in Leveled, but also potentially tolerable in other back-ends. However, frequent scanning is likely to still be impractical. If it is not possible to scan the database frequently, then a discrepancy between stores arising from a recent failure event will not be detected in a timely manner. It is therefore suggested that there should be an alternative form of anti-entropy that can be run in addition to scanning, that is lower cost and can be run frequently in support of whole database scanning. This additional anti-entropy mechanism would focus on the job of verifying that recent changes have been received. + +So there would be two anti-entropy mechanisms, one which can be run frequently (minutes) to check for the receipt of recent changes, and one that can be run regularly but infrequently (hours/days) to check that full database state is consistent. + +It is proposed to compare full database state by scanning the actual store, but producing a Tic-Tac Merkle tree as the outcome, one that can be merged across partitions through a coverage query to provide an overall view of the database state. This view could be compared with different coverage query offsets within the same cluster, and with different replicated clusters. To provide a check on recent changes it is proposed to add a temporary index within the store, with an entry for each change that is built from a rounded last modified date and the hash of the value, so that the index can be scanned to form a Tic-Tac tree of recent changes. This assumes that each object has a Last Modified Date that is consistent (for that version) across all points where that particular version is stored, to use as the field name for the index. The term of the index is based on the segment ID (for the tree) and the hash of the value. This allows for a scan to build a tree of changes for a given range of modified dates, as well as a scan for keys and hashes to be returned for a given segment ID and date range. +As this index only covers recent changes, it will be limited in size, and mainly in-memory, and so it can be scanned frequently in a cost-effective manner to both gather trees for comparison, and discover Keys in segments with variations. + +Within Leveled the index can be made temporary by giving the entry a time-to-live, independent of any object time to live.
So once the change is beyond the timescale in which the operator wishes to check for recent changes, it will naturally be removed from the database (through deletion on the next compaction event that hits the entry in the Ledger). Therefore in the long-term, there is no need to maintain additional state outside of the primary database stores, in order to manage anti-entropy. This may also be possible using TTL features in leveldb. Hence overall this should give: From 414785802e00b15b1aaaa1193e08a7967d866a0e Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Fri, 7 Jul 2017 16:38:02 +0100
Subject: [PATCH 44/58] Minor edits

--- docs/ANTI_ENTROPY.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/docs/ANTI_ENTROPY.md b/docs/ANTI_ENTROPY.md index e8064c7..db57ef1 100644 --- a/docs/ANTI_ENTROPY.md +++ b/docs/ANTI_ENTROPY.md @@ -129,7 +129,7 @@ There are two parts to the full database anti-entropy mechanism: the Tic-Tac tr PartitionFilter} ``` -The tictactree_obj folder produces a Tic-Tac tree form a fold across the objects (or more precisely the heads of the objects in the Ledger)m using the constraints Tag, Bucket, StartKey and EndKey. CheckPresence can be used to require the folder to confirm if the value is present in the Journal before including it in the tree - this will slow down the fold significantly, but protect from corruption in the Journal not represented in the Ledger. The partition filter cna be used where the store holds data from multiple partitions, and only data form a subset of partitions should be included, with the partition filter being a function on the Bucket and Key to make that decision. +The tictactree_obj folder produces a Tic-Tac tree from a fold across the objects (or more precisely the heads of the objects in the Ledger) using the constraints Tag, Bucket, StartKey and EndKey. CheckPresence can be used to require the folder to confirm if the value is present in the Journal before including it in the tree - this will slow down the fold significantly, but protect from corruption in the Journal not represented in the Ledger. The partition filter can be used where the store holds data from multiple partitions, and only data from a subset of partitions should be included, with the partition filter being a function on the Bucket and Key to make that decision. The tictactree_idx folder produces a Tic-Tac tree from a range of an index, and so can be used as with tictactree_obj but for checking that an index is consistent between coverage offsets or between databases. @@ -179,7 +179,6 @@ Even with this mitigation, the volume of false positives and negatives needs to The approach considered for Leveled has been to modify the Merkle trees used to ease their implementation, as well as specifically separating out anti-entropy for recent changes as a different problem to long-term anti-entropy of global state. -There is [emergent research ongoing](http://haslab.uminho.pt/tome/files/global_logical_clocks.pdf) to try and leverage the use of dotted version vectors at a node level to improve the efficiency of managing key-level consistency, reduce the risks associated with deletes (without permanent tombstones), but also provide an inherently integrated approach to active anti-entropy.
The Global Logical Clock approach does assume that durability is not mutable: From 6439da626a820c0ece621c129898f06147c7a32b Mon Sep 17 00:00:00 2001 From: Russell Brown Date: Sun, 9 Jul 2017 17:06:50 +0100 Subject: [PATCH 45/58] Some proposed edits --- docs/ANTI_ENTROPY.md | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/docs/ANTI_ENTROPY.md b/docs/ANTI_ENTROPY.md index db57ef1..400faae 100644 --- a/docs/ANTI_ENTROPY.md +++ b/docs/ANTI_ENTROPY.md @@ -30,13 +30,13 @@ A side effect of the concatenation decision is that trees cannot be calculated i ``hash([{K1, H1}, {K2, H2} .. {Kn, Hn}])`` -This requires all of the keys and hashes need to be pulled into memory to build the hashtree - unless the tree is being built segment by segment. The Riak hashtree data store is therefore ordered by segment so that it can be incrementally built. The segments which have had key changes are tracked, and at exchange time all "dirty segments" are re-scanned in the store segment by segment, so that the hashtree can be rebuilt. +This requires all of the keys and hashes to be pulled into memory to build the hashtree - unless the tree is being built segment by segment. The Riak hashtree data store is therefore ordered by segment so that it can be incrementally built. The segments which have had key changes are tracked, and at exchange time all "dirty segments" are re-scanned in the store segment by segment, so that the hashtree can be rebuilt. ## Tic-Tac Merkle Trees -Anti-entropy in leveled is supported using the leveled_tictac module. This module uses a less secure form of merkle trees that don't prevent information from leaking out, or make the tree tamper-proof, but allow for the trees to be built incrementally, and trees built incrementally to be merged. These Merkle trees we're calling Tic-Tac Trees after the [Tic-Tac language](https://en.wikipedia.org/wiki/Tic-tac) to fit in with Bookmaker-based naming conventions of leveled. The Tic-Tac language has been historically used on racecourses to communicate the state of the market between participants; although the more widespread use of mobile communications means that the use of Tic-Tac is petering out, and rather like Basho employees, there are now only three Tic-Tac practitioners left. +Anti-entropy in leveled is supported using the [leveled_tictac](https://github.com/martinsumner/leveled/blob/mas-tictac/src/leveled_tictac.erl) module. This module uses a less secure form of merkle trees that don't prevent information from leaking out, or make the tree tamper-proof, but allow for the trees to be built incrementally, and trees built incrementally to be merged. These Merkle trees we're calling Tic-Tac Trees after the [Tic-Tac language](https://en.wikipedia.org/wiki/Tic-tac) to fit in with Bookmaker-based naming conventions of leveled. The Tic-Tac language has been historically used on racecourses to communicate the state of the market between participants; although the more widespread use of mobile communications means that the use of Tic-Tac is petering out, and rather like Basho employees, there are now only three Tic-Tac practitioners left. -The change from secure Merkle trees is simply to XOR and not hashing/concatenation for combining hashes, combined with using trees of fixed sizes, so that tree merging can also be managed through XOR operations. So a segment leaf is calculated from: +The change from secure Merkle trees is simply to (use XOR? 
or XOR hashes), and not hashing/concatenation, for combining hashes, combined with using trees of fixed sizes, so that tree merging can also be managed through XOR operations. So a segment leaf is calculated from: ``hash(K1, H1) XOR hash(K2, H2) XOR ... hash(Kn, Hn)`` @@ -62,7 +62,7 @@ Anti-entropy in Riak is a dual-track process: Within the current Riak AAE implementation, tracking recent changes is supported by having a dedicated anti-entropy store organised by segments (an identifier of a leaf of the Merkle tree) so that the Merkle tree can be updated incrementally to reflect recent changes. When a new update is received an AAE tree update is made following the vnode update, and the segment is marked as requiring an update before completing the next Merkle tree exchange. -However as the view of the whole state is maintained in a different store to that holding the actual data: there is an entropy problem between the actual store and the AAE store e.g. data could be lost from the real store, and go undetected as it is not lost from the AAE store. So periodically the AAE store is rebuilt by scanning the whole of the real store. This rebuild can be an expensive process, and the cost is commonly controlled through performing this task infrequently. Prior to the end of Basho there were changes pending in develop to better throttle and schedule these updates - through the riak_kv_sweeper, so that the store could be built more frequently with safety (and so that the scans necessary to build the store could be multi-purpose). +However as the view of the whole state is maintained in a different store to that holding the actual data: there is an entropy problem between the actual store and the AAE store e.g. data could be lost from the real store, and go undetected as it is not lost from the AAE store. So periodically the AAE store is rebuilt by scanning the whole of the real store. This rebuild can be an expensive process, and the cost is commonly controlled through performing this task infrequently. Prior to the end of Basho there were changes pending in riak\_kv's develop branch to better throttle and schedule these updates - through the `riak_kv_sweeper`, so that the store could be built more frequently with safety (and so that the scans necessary to build the store could be multi-purpose). The AAE store also needs to be partially scanned on a regular basis to update the current view of the Merkle tree, to reflect the segments which have been altered by recent changes. If a vnode has 100M keys, and there has been 1000 updates since the last merkle tree was updated - then there will need to be o(1000) seeks across subsets of the store returning o(100K) keys in total. As the store grows, the AAE store can grow to a non-trivial size, and these operations may have an impact on the page-cache and disk busyness within the node. @@ -70,7 +70,7 @@ The AAE store is re-usable for checking consistency between databases, but with - the two stores need to be partitioned equally, constraining replication to other database technologies, and preventing replication from being used as an approach to re-partitioning (ring re-sizing). -- the aae store is not split by bucket, and so supporting replication configured per bucket is challenging. +- the AAE store is not split by bucket, and so supporting replication configured per bucket is challenging. 
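To make the XOR mechanics of the segment-leaf formula shown earlier concrete, the following is an editor's sketch only - `tictac_sketch`, `hash/2`, `leaf/1`, `merge_leaves/2` and `alter/4` are invented names for illustration, not the leveled_tictac API:

```erlang
%% Editor's sketch of XOR-combined segment leaves - illustrative names only.
-module(tictac_sketch).
-export([leaf/1, merge_leaves/2, alter/4]).

%% Stand-in for hash(K, H): mix the key with the version hash.
hash(Key, VersionHash) ->
    erlang:phash2({Key, VersionHash}, 16#100000000).

%% A segment leaf is the XOR of hash(K, H) over every key in the segment.
leaf(KVHashList) ->
    lists:foldl(fun({K, H}, Acc) -> Acc bxor hash(K, H) end, 0, KVHashList).

%% XOR is commutative and associative, so the leaves of two disjoint
%% partitions merge by XOR - no re-scan of the underlying keys is needed.
merge_leaves(LeafA, LeafB) ->
    LeafA bxor LeafB.

%% An update replaces a key's contribution by XORing out the old
%% segment-change hash and XORing in the new one.
alter(Leaf, Key, OldVersionHash, NewVersionHash) ->
    Leaf bxor hash(Key, OldVersionHash) bxor hash(Key, NewVersionHash).
```

Because the leaf of the union of two disjoint key sets equals the XOR of the two leaves, trees built per-partition can be merged into a single global view without revisiting the data - the property the dedicated AAE store cannot offer.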
The AAE process in production systems commonly raises false positives (prompts repairs that are unnecessary), sometimes for [known reasons](https://github.com/basho/riak_kv/issues/1189), sometimes for unknown reasons, especially following rebuilds which follow ring changes. The repair process has [a throttle](http://docs.basho.com/riak/kv/2.2.3/using/cluster-operations/active-anti-entropy/#throttling) to prevent this from impacting a production system, but this commonly needs to be re-tuned based on experience. @@ -86,7 +86,7 @@ The first stage in considering an alternative approach to anti-entropy, was to q The third cost can be addressed by the fold output being an incrementally updatable tree of a fixed size; i.e. if the fold builds a Tic-Tac tree and doesn't stream results (like list keys), and guarantees a fixed size output both from a single partition and following merging across multiple partitions. Within Leveled the first two costs are reduced by design due to the separation of Keys and Metadata from the object value, reducing significantly the workload associated with such a scan; especially where values are large. -The [testing of traditional Riak AAE](https://github.com/martinsumner/leveled/blob/master/docs/VOLUME.md#leveled-aae-rebuild-with-journal-check) already undertaken has shown that scanning the database is not necessarily such a big issue in Leveled. So it does seem potentially feasible to scan the store on a regular basis. The testing of Leveldb with the riak_kv_sweeper feature shows that with the improved throttling more regular scanning is also possible here: testing with riak_kv_sweeper managed to achieve 10 x the number of sweeps, with only a 9% drop in throughput. +The [testing of traditional Riak AAE](https://github.com/martinsumner/leveled/blob/master/docs/VOLUME.md#leveled-aae-rebuild-with-journal-check) already undertaken has shown that scanning the database is not necessarily such a big issue in Leveled. So it does seem potentially feasible to scan the store on a regular basis. The testing of Leveldb with the `riak_kv_sweeper` feature shows that with the improved throttling more regular scanning is also possible here: testing with `riak_kv_sweeper` managed to achieve 10 x the number of sweeps, with only a 9% drop in throughput. A hypothesis is proposed that regular scanning of the full store to produce a Tic-Tac tree is certainly feasible in Leveled, but also potentially tolerable in other back-ends. However, frequent scanning is likely to still be impractical. If it is not possible to scan the database frequently, then a recent failure event that has led to a discrepancy between stores will not be detected in a timely manner. It is therefore suggested that there should be an alternative form of anti-entropy that can be run in addition to scanning, that is lower cost and can be run frequently in support of whole database scanning. This additional anti-entropy mechanism would focus on the job of verifying that recent changes have been received. @@ -131,7 +131,7 @@ There are two parts to the full database anti-entropy mechanism: the Tic-Tac tr The tictactree_obj folder produces a Tic-Tac tree from a fold across the objects (or more precisely the heads of the objects in the Ledger) using the constraints Tag, Bucket, StartKey and EndKey.
CheckPresence can be used to require the folder to confirm if the value is present in the Journal before including it in the tree - this will slow down the fold significantly, but protect from corruption in the Journal not represented in the Ledger. The partition filter can be used where the store holds data from multiple partitions, and only data from a subset of partitions should be included, with the partition filter being a function on the Bucket and Key to make that decision. -The tictactree_idx folder produces a tic-Tac tree from a range of an index, and so can be used as with tictactree_obj but for checking that an index is consistent between coverage offsets or between databases. +The tictactree_idx folder produces a Tic-Tac tree from a range of an index, and so can be used like tictactree_obj but for checking that an index is consistent between coverage offsets or between databases. These two folds are tested in the tictac_SUITE test suite in the ``many_put_compare`` and ``index_compare`` tests. @@ -153,25 +153,25 @@ The index entry is then of the form: In blacklist mode the Bucket will be $all, and the Key will actually be a {Bucket, Key} pair. -The index entry is given a TTL of a configurable amount (e.g. 1 hour) - and no index entry may be added if the change is already considered to be too far in the past. The index entry is added to the Ledger in the same transaction as the other changes, and will be re-calculated and re-added out of the Journal under restart conditions where the change has not reached a persisted state in the Ledger prior to the close. +The index entry is given a TTL of a configurable amount (e.g. 1 hour) - and no index entry may be added if the change is already considered to be too far in the past. The index entry is added to the Ledger in the same transaction as an object value update, and will be re-calculated and re-added out of the Journal under restart conditions where the change has not reached a persisted state in the Ledger prior to the close, for example after a crash. The near real-time entropy index currently has four ct tests: -- recent_aae_noaae (confirming loading a store with real-time aae disabled has no impact); +- `recent_aae_noaae` (confirming loading a store with real-time aae disabled has no impact); -- recent_aae_allaae (confirming that a single store loaded with data can be compared with the a store where the same data is spread across three leveled instances - with all buckets covered by anti-entropy); +- `recent_aae_allaae` (confirming that a single store loaded with data can be compared with a store where the same data is spread across three leveled instances - with all buckets covered by anti-entropy); -- recent_aae_bucketaae (confirming that a single store loaded with data can be compared with the a store where the same data is spread across three leveled instances - with a single buckets covered by anti-entropy); +- `recent_aae_bucketaae` (confirming that a single store loaded with data can be compared with a store where the same data is spread across three leveled instances - with a single bucket covered by anti-entropy); -- recent_aae_expiry (confirming that aae index will expire). +- `recent_aae_expiry` (confirming that aae index will expire). ### Clock Drift -The proposed near-real-time anti-entropy mechanism depends on a timestamp, so ultimately some false positives and false negatives are unavoidable - especially if clock drift is large.
The assumption is though: +The proposed near-real-time anti-entropy mechanism depends on a timestamp, so ultimately some false positives and false negatives are unavoidable - especially if clock drift is large. The assumption is: - that this method is never a single dependency for ensuring consistency; it is supported by other mechanisms of further validation that would detect false negatives. -- that recovery from false positives will be safely implemented, so that a falsely identified discrepancy is validated before a change is made (e.g. repair through read-repair). +- that recovery from false positives will be safely implemented, so that a falsely identified discrepancy is validated before a change is made (e.g. read-repair). Even with this mitigation, the volume of false positives and negatives needs to be controlled, in particular where clock drift is small (i.e. measured in seconds), and hence likely. If the object has a last modified date set in one place, as with Riak, there is no issue with different actors seeing a different last modified date for the same change. However, as the index object should expire, the risk exists that the store will set an inappropriate expiry time, or even not index the object as it considers the object to be a modification too far in the past. The Near Real-Time AAE process has the concept of unit minutes, which represents the level of granularity all times will be rounded to. All expiry times are set with a tolerance equal to the unit minutes, to avoid false positives or negatives when clock drift is small. @@ -180,7 +180,7 @@ Even with this mitigation, the volume of false positives and negatives needs to The approach considered for Leveled has been to modify the Merkle trees used to ease their implementation, as well as specifically separating out anti-entropy for recent changes as a different problem to long-term anti-entropy of global state. -The Global Logical Clock approach does assume that durability is not mutable: +The [Global Logical Clock](https://github.com/ricardobcl/DottedDB) approach does assume that durability is not mutable: > Nodes have access to durable storage; nodes can crash but eventually will recover with the content of the durable storage as at the time of From e7c322d956e14560c8eee9f28f2a2b81273703a3 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Mon, 10 Jul 2017 09:49:20 +0100 Subject: [PATCH 46/58] Clarify pointer to research --- docs/ANTI_ENTROPY.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/ANTI_ENTROPY.md b/docs/ANTI_ENTROPY.md index db57ef1..2a65951 100644 --- a/docs/ANTI_ENTROPY.md +++ b/docs/ANTI_ENTROPY.md @@ -179,6 +179,7 @@ Even with this mitigation, the volume of false positives and negatives needs to The approach considered for Leveled has been to modify the Merkle trees used to ease their implementation, as well as specifically separating out anti-entropy for recent changes as a different problem to long-term anti-entropy of global state. +[Recent research](http://haslab.uminho.pt/tome/files/global_logical_clocks.pdf) has been released which examines using dotted version vectors at a node level to improve the efficiency of managing key-level consistency, reducing the risks associated with deletes (without permanent tombstones), while also providing an inherently integrated approach to active anti-entropy.
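As an aside, the unit-minutes rounding and tolerance described under Clock Drift above can be made concrete with a small sketch. This is an editor's illustration only - `round_to_unit/2` and `expiry_time/3` are invented names, and the real leveled code may calculate this differently:

```erlang
%% Editor's sketch of unit-minutes rounding - invented names, not leveled code.
-module(unit_minutes_sketch).
-export([round_to_unit/2, expiry_time/3]).

%% Round a timestamp (in seconds) down to a unit-minutes boundary, so that
%% actors whose clocks drift by less than a unit agree on the time slot.
round_to_unit(TimeSecs, UnitMins) ->
    Unit = UnitMins * 60,
    (TimeSecs div Unit) * Unit.

%% Set the expiry with a tolerance of one extra unit, so that a store whose
%% clock runs slightly ahead does not expire (or refuse to index) a change
%% that its peers still consider to be recent.
expiry_time(TimeSecs, LimitMins, UnitMins) ->
    round_to_unit(TimeSecs, UnitMins) + (LimitMins + UnitMins) * 60.
```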
The Global Logical Clock approach does assume that durability is not mutable: From 03d0b913d5ed90fa4977980fccce04356ca76dbe Mon Sep 17 00:00:00 2001 From: martinsumner Date: Mon, 10 Jul 2017 11:45:35 +0100 Subject: [PATCH 47/58] Add new background section --- docs/ANTI_ENTROPY.md | 50 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/docs/ANTI_ENTROPY.md b/docs/ANTI_ENTROPY.md index 3875bf8..6bbf866 100644 --- a/docs/ANTI_ENTROPY.md +++ b/docs/ANTI_ENTROPY.md @@ -1,5 +1,53 @@ # Anti-Entropy +## Background + +In the initial releases of Riak, there were three levels of protection against loss of data, where loss is caused by either a backend store not receiving data (because it was unavailable), or losing writes (due to a crash, or corruption of previously written data): + +- [Read repair](http://docs.basho.com/riak/kv/2.2.3/learn/concepts/replication/#read-repair), whenever an object was read, if as part of that read it was discovered that a primary node that should have an update but instead has an older version of an object; then post the completion of the read the finite-state-machine managing the get would update the out-of-date vnode with the latest version. + +- [Hinted handoff](http://docs.basho.com/riak/kv/2.2.3/using/reference/handoff/#types-of-handoff), if a fallback node has taken responsibility for writes to a given vnode due to a temporary ring change in the cluster (e.g. due to a node failure), then when the expected primary returns to service the fallback node should be triggered to handoff any data it has (from this or any previous fallback period) to the expected primary vnode. Once handoff was complete the vnode would then self-destruct and remove any durable state. Fallback nodes start vnodes for the same ring partition as the primary vnode. A fallback node is selected because it owns the next vnode in the ring, but it starts a new vnode to replace the primary vnode, it doesn't store data in the vnode backend which caused it to be considered a fallback (fallback is to a node not to a vnode) - so handoff is not required to be selective about the data that is handed off. + +- [Key-listing for multi-data-centre replication](http://docs.basho.com/riak/kv/2.2.3/using/reference/v2-multi-datacenter/architecture/#fullsync-replication), for customers with the proprietary Riak Enterprise software there was a mechanism whereby vnode by vnode there would be a fold over all the objects in the vnode, for a replicated bucket, calculating a hash for the object and sending the keys and hashes to a replicated cluster for comparison with the result of its equivalent object fold. Any variances would then be repaired by streaming those missing updates between the clusters to be re-added across all required vnodes. + +There were three primary issues with these mechanisms: + +- Some objects may be read very infrequently, and such objects may be lost due to a series of failure or disk-corruption events that occurred between reads. + +- For large stores per-vnode object folding required for MDC was an expensive operation, and when run in parallel with standard database load could lead to unpredictability in response times. + +- Some read events do not validate across multiple vnodes, primarily secondary index queries, so an inconsistent index due to a failed write would never be detected by the database. Secondary index queries were not necessarily eventually consistent, but were potentially never consistent.
+ +To address these weaknesses Active Anti-Entropy (AAE) was introduced to Riak, as a configurable option. Configuring Active Anti-Entropy would start a new AAE hashtree store for every primary vnode in the ring. The vnode process would, following a successful put, [update this hashtree store process](https://github.com/basho/riak_kv/blob/2.1.7/src/riak_kv_vnode.erl#L2139-L2169) by sending it the updated object after converting it from its binary format. This would generally happen in an async way, but periodically the change would block the vnode to confirm that the AAE process was keeping up. The hashtree store process would hash the riak object to create a hash for the update, and hash the Key to map it to one of 1024 * 1024 segments - and then in batches update the store with a key of {$t, Partition, Segment, Key}, and a value of the object hash. + +From this persisted store a Merkle tree is maintained for each Partition. These Merkle trees can then be exchanged with another vnode's AAE hashtree store if that vnode is also a primary vnode for that same partition. Exchanging the Merkle tree would highlight any segments which had a variance - and then it would be possible to iterate over the store segment by segment to discover which keys actually differed. Those keys could then be re-read, so that read repair would correct any entropy that actually existed. + +The process of maintaining the hashtree is partially deferred to the point of the exchange, but is relatively low cost, and so these exchanges can occur frequently without creating significant background load. Infrequently, but regularly, the hashtree store would be cleared and rebuilt from an object fold to ensure that it reflected the actual persisted state in the store. + +The impact on the tree primary issues of anti-entropy of this mechanism was: + +- Repair was de-coupled from reads, and so unread objects would no longer be at a vastly increased risk of disappearing following node failure and disk corruption events. + +- Exchanges have a predictable and limited impact on cluster load, relative to object folds, if the variance highlighted by the exchange is small. + +- Secondary index queries would be made consistent following PUT failure after the next AAE exchange, and those exchanges are regular. Consistency is maintained by comparison of the actual object, not the index entries within the backend, and so loss of index data due to backend corruption would still not be detected by AAE. + +Although this represented an improvement in terms of entropy management, there were still some imperfections with the mechanism: + +- The hash of the object was *not* based on a canonicalised version of the object, so could be inconsistent between trees (https://github.com/basho/riak_kv/issues/1189). + +- Converting the object from_binary and sending it to another process has a potentially non-trivial cost for larger objects with significant amounts of metadata (e.g. 2i terms). + +- Hashtrees may become mysteriously inconsistent following rebuilds, if the rebuild followed a cluster change operation (e.g. adding/removing a node) - and there would be storms of read actions prompted that would not lead to repairs. + +- The anti-entropy mechanism is tightly coupled with the partitioning of the cluster, and so cannot be used between clusters of different ring-sizes, meaning that replication cannot support safe ring-size changes (i.e. we cannot change ring size by starting another cluster with a different size and replicating to that cluster).
+ +- The hashtrees are not externally exposed, and so cannot be used for externally managed replication (e.g. to another database). + +- The rebuilds of the hshtree still require the relatively expensive fold_objects operation, and so parallelisation of rebuilds may need to be controlled to prevent an impact on cluster performance. Measuring the impact is difficult in pre-production load tests due to the scheduled and infrequent nature of AAE rebuilds. + +## Leveled and AAE + Leveled is primarily designed to be a backend for Riak, and Riak has a number of anti-entropy mechanisms for comparing database state within and across clusters. As part of the ongoing community work to build improvements into a new pure open-source release of Riak, some features have been added directly to Leveled to explore some potential enhancements to anti-entropy. These features are concerned with: - Allowing for the database state within a Leveled store or stores to be compared with another store or stores which should share a portion of that state; @@ -36,7 +84,7 @@ This requires all of the keys and hashes to be pulled into memory to build the h Anti-entropy in leveled is supported using the [leveled_tictac](https://github.com/martinsumner/leveled/blob/mas-tictac/src/leveled_tictac.erl) module. This module uses a less secure form of merkle trees that don't prevent information from leaking out, or make the tree tamper-proof, but allow for the trees to be built incrementally, and trees built incrementally to be merged. These Merkle trees we're calling Tic-Tac Trees after the [Tic-Tac language](https://en.wikipedia.org/wiki/Tic-tac) to fit in with Bookmaker-based naming conventions of leveled. The Tic-Tac language has been historically used on racecourses to communicate the state of the market between participants; although the more widespread use of mobile communications means that the use of Tic-Tac is petering out, and rather like Basho employees, there are now only three Tic-Tac practitioners left. -The change from secure Merkle trees is simply to (use XOR? or XOR hashes), and not hashing/concatenation, for combining hashes, combined with using trees of fixed sizes, so that tree merging can also be managed through XOR operations. So a segment leaf is calculated from: +The first change from secure Merkle trees is simply to XOR together hashes to combine them, rather than re-hash a concatenation of keys and hashes. Combined with the use of trees of fixed sizes, this allows for tree merging to be managed through XOR operations. So a segment leaf is calculated from: ``hash(K1, H1) XOR hash(K2, H2) XOR ...
hash(Kn, Hn)`` From 185d4e9747371aeb2e7238729b29570db4a356cc Mon Sep 17 00:00:00 2001 From: martinsumner Date: Mon, 10 Jul 2017 16:36:01 +0100 Subject: [PATCH 48/58] Notes on Riak Implementation options --- docs/ANTI_ENTROPY.md | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/docs/ANTI_ENTROPY.md b/docs/ANTI_ENTROPY.md index 6bbf866..c208301 100644 --- a/docs/ANTI_ENTROPY.md +++ b/docs/ANTI_ENTROPY.md @@ -4,7 +4,7 @@ In the initial releases of Riak, there were three levels of protection against loss of data, where loss is caused by either a backend store not receiving data (because it was unavailable), or losing writes (due to a crash, or corruption of previously written data): -- [Read repair](http://docs.basho.com/riak/kv/2.2.3/learn/concepts/replication/#read-repair), whenever an object was read, if as part of that read it was discovered that a primary node that should have an update but instead has an older version of an object; then post the completion of the read the finite-state-machine managing the get would update the out-of-date vnode with the latest version. +- [Read repair](http://docs.basho.com/riak/kv/2.2.3/learn/concepts/replication/#read-repair), whenever an object was read, if as part of that read it was discovered that a vnode that should have an update but instead has an older version of an object; then post the completion of the read the finite-state-machine managing the get would update the out-of-date vnode with the latest version. - [Hinted handoff](http://docs.basho.com/riak/kv/2.2.3/using/reference/handoff/#types-of-handoff), if a fallback node has taken responsibility for writes to a given vnode due to a temporary ring change in the cluster (e.g. due to a node failure), then when the expected primary returns to service the fallback node should be triggered to handoff any data it has (from this or any previous fallback period) to the expected primary vnode. Once handoff was complete the vnode would then self-destruct and remove any durable state. Fallback nodes start vnodes for the same ring partition as the primary vnode. A fallback node is selected because it owns the next vnode in the ring, but it starts a new vnode to replace the primary vnode, it doesn't store data in the vnode backend which caused it to be considered a fallback (fallback is to a node not to a vnode) - so handoff is not required to be selective about the data that is handed off. @@ -24,7 +24,7 @@ From this persisted store a Merkle tree is maintained for each Partition. These The process of maintaining the hashtree is partially deferred to the point of the exchange, but is relatively low cost, and so these exchanges can occur frequently without creating significant background load. Infrequently, but regularly, the hashtree store would be cleared and rebuilt from an object fold to ensure that it reflected the actual persisted state in the store. -The impact on the tree primary issues of anti-entropy of this mechanism was: +The impact on the three primary issues of anti-entropy of this mechanism was: @@ -44,11 +44,13 @@ Although this represented an improvement in terms of entropy management, there w - The hashtrees are not externally exposed, and so cannot be used for externally managed replication (e.g. to another database).
-- The rebuilds of the hshtree still require the relatively expensive fold_objects operation, and so parallelisation of rebuilds may need to be controlled to prevent an impact on cluster performance. Measuring the impact is difficult in pre-production load tests due to the scheduled and infrequent nature of AAE rebuilds. +- The rebuilds of the hashtree still require the relatively expensive fold_objects operation, and so parallelisation of rebuilds may need to be controlled to prevent an impact on cluster performance. Measuring the impact is difficult in pre-production load tests due to the scheduled and infrequent nature of AAE rebuilds. + +- Improvements to hashtrees require significant development and test for transition, due to the potential for hashtree changes to break many things (e.g. Solr integration, MDC), and also the difficulty in coordinating changes between different dependent systems that independently build state over long periods of time. ## Leveled and AAE -Leveled is primarily designed to be a backend for Riak, and Riak has a number of anti-entropy mechanisms for comparing database state within and across clusters. As part of the ongoing community work to build improvements into a new pure open-source release of Riak, some features have been added directly to Leveled to explore some potential enhancements to anti-entropy. These features are concerned with: +Leveled is primarily designed to be a backend for Riak. As part of the ongoing community work to build improvements into a new pure open-source release of Riak, some features have been added directly to Leveled to explore some potential enhancements to anti-entropy. These features are concerned with: - Allowing for the database state within a Leveled store or stores to be compared with another store or stores which should share a portion of that state; @@ -78,7 +80,7 @@ A side effect of the concatenation decision is that trees cannot be calculated i ``hash([{K1, H1}, {K2, H2} .. {Kn, Hn}])`` -This requires all of the keys and hashes to be pulled into memory to build the hashtree - unless the tree is being built segment by segment. The Riak hashtree data store is therefore ordered by segment so that it can be incrementally built. The segments which have had key changes are tracked, and at exchange time all "dirty segments" are re-scanned in the store segment by segment, so that the hashtree can be rebuilt. +This requires all of the keys and hashes to be pulled into memory to build the hashtree - unless the tree is being built segment by segment. The Riak hashtree data store is therefore ordered by segment so that it can be incrementally built. The segments which have had key changes are tracked, and at exchange time all "dirty segments" are re-scanned in the store segment by segment, so that the hashtree can be rebuilt. Note though, that this is necessary in the current hashtree implementation even if there were an incrementally buildable Merkle Tree, as there is no read before write into the hashtree to inform the process of what update (if any) to reverse out of the Tree as well as which update to add in. ## Tic-Tac Merkle Trees @@ -124,7 +126,7 @@ ### Proposed Leveled AAE -The first stage in considering an alternative approach to anti-entropy, was to question the necessity of having a dedicated AAE database that needs to reflect all key changes in the actual vnode store.
A separate store can have features such as being sorted by segment ID that make that store easier to scan for rebuilds of the tree: hence avoiding the three main costs with scanning over the primary database: +The first stage in considering an alternative approach to anti-entropy, was to question the necessity of having a dedicated AAE database that needs to reflect all key changes in the actual vnode store. This separate store is currently necessary as the hashtree needs a store sorted by segment ID that makes the store easier to scan for rebuilds of the tree, hence avoiding the three main costs with scanning over the primary database: - the impact on the page cache as all keys and values have to be read from disk, including not-recently used values; - the duration of the fold impacting the response times of other database users; - the overall I/O load (primarily network-related) of streaming results from the fold. -The third cost can be addressed by the fold output being an incrementally updatable tree of a fixed size; i.e. if the fold builds a Tic-Tac tree and doesn't stream results (like list keys), and guarantees a fixed size output both from a single partition and following merging across multiple partitions. Within Leveled the first two costs are reduced by design due to the separation of Keys and Metadata from the object value, reducing significantly the workload associated with such a scan; especially where values are large. +The third cost can be addressed by the fold output being an incrementally updatable tree of a fixed size; i.e. if the fold builds a Tic-Tac tree and doesn't stream results (like list keys), and guarantees a fixed size output both from a single partition and following merging across multiple partitions. + +Within Leveled the first two costs are reduced by design due to the separation of Keys and Metadata from the object value, reducing significantly the workload associated with such a scan; especially where values are large. The [testing of traditional Riak AAE](https://github.com/martinsumner/leveled/blob/master/docs/VOLUME.md#leveled-aae-rebuild-with-journal-check) already undertaken has shown that scanning the database is not necessarily such a big issue in Leveled. So it does seem potentially feasible to scan the store on a regular basis. The testing of Leveldb with the `riak_kv_sweeper` feature shows that with the improved throttling more regular scanning is also possible here: testing with `riak_kv_sweeper` managed to achieve 10 x the number of sweeps, with only a 9% drop in throughput. @@ -248,3 +252,13 @@ Some further consideration has been given to using a version of this Global Logi - How to discover key IDs from missing dots where the controlling node for the update has recently failed. This likely represents gaps in current understanding, rather than flaws in the approach. The evolution of this research will be tracked with interest. + +## Some Notes on Riak implementation + +Some notes on re-using this alternative anti-entropy mechanism within Riak: + +- There is divergence between Leveled and LevelDB with regards to how async folds are implemented. Within LevelDB requesting an async fold returns a folder function that will take a snapshot when it is called. Within Leveled the option exists to take the snapshot before returning the folder function, so that calling the folder function will work on a snapshot of the store taken when the folder was requested.
This difference caused issues with testing with riak_kv_sweeeper, as the scheduling in sweeper meant that folds would be requested, and left on a queue for a long enough to be timed out by the time it was called. The quick fix for riak_kv_sweeper testing was to make the folder snapshot behaviour in Leveled consistent with LevelDB. However, the original behaviour opens up some interesting possibilities for AAE implementation in that a coverage set of vnodes could be snapshotted at a point in time, but not all folds need to be run concurrently to make the result consistent to the point in time - so folds could be directly throttled by the coverage process so that only one fold was being run on each node at once. + +- In Leveled a special fold currently supports the Tic-Tac tree generation for indexes, and one for objects. It may be better to support this through a offering a more open capability to pass different fold functions and accumulators into index folds. This could be re-used for "reporting indexes", where we want to count terms of different types rather than return all those terms via an accumulating list e.g. an index may have a bitmap style part, and the function will apply a wildcard mask to the bitmap and count the number of hits against each possible output. + +- The initial intention is to implement the hashtree query functions based around the coverage_fsm behaviour, but with the option to stipulate externally the offset. So to test for differences between clusters, the user could concurrently query the two clusters for the same offset (or a random offset), whereas to find entropy within a cluster two concurrently run queries could be compared for different offsets. From b5ca2a8541141b3287973a015e2510578a70261d Mon Sep 17 00:00:00 2001 From: martinsumner Date: Mon, 10 Jul 2017 16:47:39 +0100 Subject: [PATCH 49/58] Word wrangling --- docs/ANTI_ENTROPY.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/ANTI_ENTROPY.md b/docs/ANTI_ENTROPY.md index c208301..075c3ef 100644 --- a/docs/ANTI_ENTROPY.md +++ b/docs/ANTI_ENTROPY.md @@ -257,8 +257,8 @@ This likely represent gaps in current understanding, rather than flaws in the ap Some notes on re-using this alternative anti-entropy mechanism within Riak: -- There is divergence between Leveled and LevelDB with regards to how async folds are implemented. Within LevelDB requesting an async fold returns a folder function that will take a snapshot when it is called. Within Leveled the option exists to take the snapshot before returning the folder function, so that calling the folder function will work on a snapshot of the store taken when the folder was requested. This difference caused issues with testing with riak_kv_sweeeper, as the scheduling in sweeper meant that folds would be requested, and left on a queue for a long enough to be timed out by the time it was called. The quick fix for riak_kv_sweeper testing was to make the folder snapshot behaviour in Leveled consistent with LevelDB. However, the original behaviour opens up some interesting possibilities for AAE implementation in that a coverage set of vnodes could be snapshotted at a point in time, but not all folds need to be run concurrently to make the result consistent to the point in time - so folds could be directly throttled by the coverage process so that only one fold was being run on each node at once. +- There is divergence between Leveled and LevelDB with regards to how async folds are implemented. 
Within LevelDB requesting an async fold returns a folder function that will take a snapshot when it is called. Within Leveled the option exists to take the snapshot before returning the folder function, so that calling the folder function will work on a snapshot of the store taken when the folder was requested. This difference caused issues with testing with riak_kv_sweeeper, as the scheduling in sweeper meant that folds would be requested, and left on a queue for a long enough to be timed out by the time it was called. The quick fix for riak_kv_sweeper testing was to make the folder snapshot behaviour in Leveled consistent with LevelDB. However, the original behaviour opens up some interesting possibilities for AAE implementation in that a coverage set of vnodes could be snapshotted at a point in time, but not all folds need to be run concurrently to make the result consistent to the point in time. This would allow folds could be directly throttled by the coverage process so that only one fold was being run on each node at once, without opening up a time-gap between snapshots that would increase the number of false repairs. - In Leveled a special fold currently supports the Tic-Tac tree generation for indexes, and one for objects. It may be better to support this through a offering a more open capability to pass different fold functions and accumulators into index folds. This could be re-used for "reporting indexes", where we want to count terms of different types rather than return all those terms via an accumulating list e.g. an index may have a bitmap style part, and the function will apply a wildcard mask to the bitmap and count the number of hits against each possible output. -- The initial intention is to implement the hashtree query functions based around the coverage_fsm behaviour, but with the option to stipulate externally the offset. So to test for differences between clusters, the user could concurrently query the two clusters for the same offset (or a random offset), whereas to find entropy within a cluster two concurrently run queries could be compared for different offsets. +- The initial intention is to implement the hashtree query functions based around the coverage_fsm behaviour, but with the option to stipulate externally the offset. So to test for differences between clusters, the user could concurrently query the two clusters for the same offset (or a random offset), whereas to find entropy within a cluster two concurrently run queries could be compared for different offsets. From e4a2a8ecea6ccb8d093759aec5a086224638ac12 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Tue, 11 Jul 2017 10:28:34 +0100 Subject: [PATCH 50/58] Stuff on read repair --- docs/ANTI_ENTROPY.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/ANTI_ENTROPY.md b/docs/ANTI_ENTROPY.md index 075c3ef..0607368 100644 --- a/docs/ANTI_ENTROPY.md +++ b/docs/ANTI_ENTROPY.md @@ -262,3 +262,5 @@ Some notes on re-using this alternative anti-entropy mechanism within Riak: - In Leveled a special fold currently supports the Tic-Tac tree generation for indexes, and one for objects. It may be better to support this through a offering a more open capability to pass different fold functions and accumulators into index folds. This could be re-used for "reporting indexes", where we want to count terms of different types rather than return all those terms via an accumulating list e.g. 
an index may have a bitmap style part, and the function will apply a wildcard mask to the bitmap and count the number of hits against each possible output. - The initial intention is to implement the hashtree query functions based around the coverage_fsm behaviour, but with the option to stipulate externally the offset. So to test for differences between clusters, the user could concurrently query the two clusters for the same offset (or a random offset), whereas to find entropy within a cluster two concurrently run queries could be compared for different offsets. + +- A surprising feature of read repair is that it will read repair to fallback nodes, not just primary nodes. This means that in read-intensive workloads, write activity may dramatically increase during node failure (as a large proportion of reads will become write events) - increasing the chance of servers falling domino style. However, in some circumstances the extra duplication can also [increase the chance of data loss](https://github.com/russelldb/russelldb.github.io/blob/master/3.2.kv679-solution.md)! This also increases greatly the volume of unnecessary data to be handed-off when the primary returns. Without active anti-entropy, and in the absence of other safety checks like `notfound_ok` being set to false, or `pr` being set to at least 1 - there will be scenarios where this feature may be helpful. As part of improving active anti-entropy, it may be wise to re-visit the tuning of anti-entropy features that existed prior to AAE, in particular should it be possible to configure read-repair to act on primary nodes only. From 7c86de24911e455a40942733d18a4fa331807b58 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Tue, 11 Jul 2017 10:29:50 +0100 Subject: [PATCH 51/58] Clarification on NRT query --- docs/ANTI_ENTROPY.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/ANTI_ENTROPY.md b/docs/ANTI_ENTROPY.md index 0607368..7262110 100644 --- a/docs/ANTI_ENTROPY.md +++ b/docs/ANTI_ENTROPY.md @@ -207,6 +207,8 @@ In blacklist mode the Bucket will be $all, and the Key will actually be a {Bucke The index entry is given a TTL of a configurable amount (e.g. 1 hour) - and no index entry may be added if the change is already considered to be too far in the past. The index entry is added to the Ledger in the same transaction as an object value update, and will be re-calculated and re-added out of the Journal under restart conditions where the change has not reached a persisted state in the Ledger prior to the close, for example after a crash. +Querying this anti-entropy index can re-use the ``tictactree_idx`` query feature used for Full Database Anti-Entropy. 
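To sketch how such a query might be combined into an exchange between two stores, the following is an editor's illustration only: the fetch funs are placeholders for whatever plumbing issues the ``tictactree_idx`` fold against each store, with only the `find_dirtyleaves/2` comparison in leveled_tictac being an actual function:

```erlang
%% Editor's sketch of an exchange over the near-real-time $aae index.
-module(nrt_exchange_sketch).
-export([compare_recent/2]).

%% FetchTreeFunA/B are zero-arity funs wrapping however the tictactree_idx
%% query is issued against each store (placeholder plumbing, not shown here).
compare_recent(FetchTreeFunA, FetchTreeFunB) ->
    TreeA = FetchTreeFunA(),
    TreeB = FetchTreeFunB(),
    %% find_dirtyleaves/2 returns the IDs of the segment leaves that differ;
    %% an empty list means the recent changes agree between the two stores.
    case leveled_tictac:find_dirtyleaves(TreeA, TreeB) of
        [] -> in_sync;
        DirtySegments -> {dirty_segments, DirtySegments}
    end.
```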
+ The near real-time entropy index currently has four ct tests: From 80fd2615f652284fce3f490470b7e398bc7a6ac2 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Tue, 11 Jul 2017 11:44:01 +0100 Subject: [PATCH 52/58] Implement blacklist/whitelist Change from the all/whitelist behaviour to the blacklist/whitelist behaviour documented in the write-up --- docs/ANTI_ENTROPY.md | 4 ++-- include/leveled.hrl | 14 +++++++++++--- src/leveled_bookie.erl | 5 +++-- src/leveled_codec.erl | 33 +++++++++++++++++++++++--------- test/end_to_end/tictac_SUITE.erl | 6 +++--- 5 files changed, 43 insertions(+), 19 deletions(-) diff --git a/docs/ANTI_ENTROPY.md b/docs/ANTI_ENTROPY.md index 7262110..843cf52 100644 --- a/docs/ANTI_ENTROPY.md +++ b/docs/ANTI_ENTROPY.md @@ -2,7 +2,7 @@ # Anti-Entropy ## Background -In the initial releases of Riak, there were three levels of protection against loss of data, where loss is caused by either a backend store not receiving data (because it was unavailable), or losing writes (due to a crash, or corruption of previously written data): +In the early history of Riak, there were three levels of protection against loss of data, where loss is caused by either a backend store not receiving data (because it was unavailable), or losing writes (due to a crash, or corruption of previously written data): @@ -36,7 +36,7 @@ Although this represented an improvement in terms of entropy management, there w -- Converting the object from_binary and sending it to another process has a potentially non-trivial cost for larger objects with significant amounts of metadata (e.g. 2i terms). +- Converting the object from_binary and sending it to another process (to pass from the `riak_kv_vnode` to the `riak_kv_index_hashtree`) has a potentially non-trivial cost for larger objects with significant amounts of metadata (e.g. 2i terms). diff --git a/include/leveled.hrl b/include/leveled.hrl index bfb0593..12b2d07 100644 --- a/include/leveled.hrl +++ b/include/leveled.hrl @@ -67,9 +67,17 @@ waste_retention_period :: integer(), reload_strategy = [] :: list()}).
--record(recent_aae, {buckets :: list()|all, - % whitelist of buckets to support recent recent AAE - % or all to support all buckets +-record(recent_aae, {filter :: whitelist|blacklist, + % the buckets list should either be a + % - whitelist - specific buckets are included, and + % entries are indexed by bucket name + % - blacklist - specific buckets are excluded, and + % all other entries are indexed using the special + % $all bucket + + buckets :: list(), + % whitelist or blacklist of buckets to support recent + % AAE limit_minutes :: integer(), % how long to retain entries in the temporary index for diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl index ba346b9..2881a87 100644 --- a/src/leveled_bookie.erl +++ b/src/leveled_bookie.erl @@ -393,8 +393,9 @@ init([Opts]) -> case get_opt(recent_aae, Opts, ?RECENT_AAE) of false -> false; - {BucketList, LimitMinutes, UnitMinutes} -> - #recent_aae{buckets = BucketList, + {FilterType, BucketList, LimitMinutes, UnitMinutes} -> + #recent_aae{filter = FilterType, + buckets = BucketList, limit_minutes = LimitMinutes, unit_minutes = UnitMinutes} end, diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl index 1dbccf3..0d49997 100644 --- a/src/leveled_codec.erl +++ b/src/leveled_codec.erl @@ -430,12 +430,18 @@ aae_indexspecs(false, _Bucket, _Key, _SQN, _H, _LastMods) -> aae_indexspecs(_AAE, _Bucket, _Key, _SQN, _H, []) -> []; aae_indexspecs(AAE, Bucket, Key, SQN, H, LastMods) -> + InList = lists:member(Bucket, AAE#recent_aae.buckets), Bucket0 = - case AAE#recent_aae.buckets of - all -> - {all, Bucket}; - ListB -> - case lists:member(Bucket, ListB) of + case AAE#recent_aae.filter of + blacklist -> + case InList of + true -> + false; + false -> + {all, Bucket} + end; + whitelist -> + case InList of true -> Bucket; false -> @@ -811,7 +817,10 @@ parseolddate_test() -> ?assertMatch(no_index, PD). genaaeidx_test() -> - AAE = #recent_aae{buckets=all, limit_minutes=60, unit_minutes=5}, + AAE = #recent_aae{filter=blacklist, + buckets=[], + limit_minutes=60, + unit_minutes=5}, Bucket = <<"Bucket1">>, Key = <<"Key1">>, SQN = 1, @@ -832,16 +841,22 @@ genaaeidx_test() -> AAESpecs0 = aae_indexspecs(AAE, Bucket, Key, SQN, H, LastMods0), ?assertMatch(0, length(AAESpecs0)), - AAE0 = AAE#recent_aae{buckets=[<<"Bucket0">>]}, + AAE0 = AAE#recent_aae{filter=whitelist, + buckets=[<<"Bucket0">>]}, AAESpecsB0 = aae_indexspecs(AAE0, Bucket, Key, SQN, H, LastMods1), ?assertMatch(0, length(AAESpecsB0)), - AAESpecsB1 = aae_indexspecs(AAE0, <<"Bucket0">>, Key, SQN, H, LastMods1), + AAESpecsB1 = aae_indexspecs(AAE0, <<"Bucket0">>, Key, SQN, H, LastMods1), ?assertMatch(1, length(AAESpecsB1)), [{{?IDX_TAG, <<"Bucket0">>, {Fld, Term}, <<"Key1">>}, {SQN, {active, TS}, no_lookup, null}}] = AAESpecsB1, ?assertMatch(true, is_integer(TS)), ?assertMatch(17, length(binary_to_list(Term))), ?assertMatch("$aae.", lists:sublist(binary_to_list(Fld), 5)), + + AAE1 = AAE#recent_aae{filter=blacklist, + buckets=[<<"Bucket0">>]}, + AAESpecsB2 = aae_indexspecs(AAE1, <<"Bucket0">>, Key, SQN, H, LastMods1), + ?assertMatch(0, length(AAESpecsB2)). -endif.
\ No newline at end of file diff --git a/test/end_to_end/tictac_SUITE.erl b/test/end_to_end/tictac_SUITE.erl index e2319da..f1cf457 100644 --- a/test/end_to_end/tictac_SUITE.erl +++ b/test/end_to_end/tictac_SUITE.erl @@ -507,7 +507,7 @@ recent_aae_allaae(_Config) -> TreeSize = small, % SegmentCount = 256 * 256, UnitMins = 2, - AAE = {all, 60, UnitMins}, + AAE = {blacklist, [], 60, UnitMins}, % Test requires multiple different databases, so want to mount them all % on individual file paths @@ -658,7 +658,7 @@ recent_aae_bucketaae(_Config) -> TreeSize = small, % SegmentCount = 256 * 256, UnitMins = 2, - AAE = {[<<"Bucket">>], 60, UnitMins}, + AAE = {whitelist, [<<"Bucket">>], 60, UnitMins}, % Test requires multiple different databases, so want to mount them all % on individual file paths @@ -825,7 +825,7 @@ recent_aae_expiry(_Config) -> % SegmentCount = 256 * 256, UnitMins = 1, TotalMins = 2, - AAE = {all, TotalMins, UnitMins}, + AAE = {backlist, [], TotalMins, UnitMins}, % Test requires multiple different databases, so want to mount them all % on individual file paths From 65fd029ca634b369c16d4ff5eb6285a09623319b Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Tue, 11 Jul 2017 12:25:06 +0100 Subject: [PATCH 53/58] typo - backlist/blacklist --- test/end_to_end/tictac_SUITE.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/end_to_end/tictac_SUITE.erl b/test/end_to_end/tictac_SUITE.erl index f1cf457..3c65327 100644 --- a/test/end_to_end/tictac_SUITE.erl +++ b/test/end_to_end/tictac_SUITE.erl @@ -825,7 +825,7 @@ recent_aae_expiry(_Config) -> % SegmentCount = 256 * 256, UnitMins = 1, TotalMins = 2, - AAE = {backlist, [], TotalMins, UnitMins}, + AAE = {blacklist, [], TotalMins, UnitMins}, % Test requires multiple different databases, so want to mount them all % on individual file paths From 2adf60e9748741b1feac0429593c4a3e67472893 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Tue, 11 Jul 2017 19:48:04 +0100 Subject: [PATCH 54/58] Look at super-async folds and rollingcoverage_fsm --- docs/ANTI_ENTROPY.md | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/docs/ANTI_ENTROPY.md b/docs/ANTI_ENTROPY.md index 843cf52..2c1c87f 100644 --- a/docs/ANTI_ENTROPY.md +++ b/docs/ANTI_ENTROPY.md @@ -259,10 +259,19 @@ This likely represent gaps in current understanding, rather than flaws in the ap Some notes on re-using this alternative anti-entropy mechanism within Riak: -- There is divergence between Leveled and LevelDB with regards to how async folds are implemented. Within LevelDB requesting an async fold returns a folder function that will take a snapshot when it is called. Within Leveled the option exists to take the snapshot before returning the folder function, so that calling the folder function will work on a snapshot of the store taken when the folder was requested. This difference caused issues with testing with riak_kv_sweeeper, as the scheduling in sweeper meant that folds would be requested, and left on a queue for a long enough to be timed out by the time it was called. The quick fix for riak_kv_sweeper testing was to make the folder snapshot behaviour in Leveled consistent with LevelDB. However, the original behaviour opens up some interesting possibilities for AAE implementation in that a coverage set of vnodes could be snapshotted at a point in time, but not all folds need to be run concurrently to make the result consistent to the point in time. 
This would allow folds could be directly throttled by the coverage process so that only one fold was being run on each node at once, without opening up a time-gap between snapshots that would increase the number of false repairs. +* There is divergence between Leveled and LevelDB with regards to how async folds are implemented. Within LevelDB requesting an async fold returns a folder function that will take a snapshot when it is called. Within Leveled the option exists to take the snapshot before returning the folder function, so that calling the folder function will work on a snapshot of the store taken when the folder was requested. This difference caused issues with testing with riak_kv_sweeeper, as the scheduling in sweeper meant that folds would be requested, and left on a queue for a long enough to be timed out by the time it was called. The quick fix for riak_kv_sweeper testing was to make the folder snapshot behaviour in Leveled consistent with LevelDB. However, the original behaviour opens up some interesting possibilities for AAE implementation in that a coverage set of vnodes could be snapshotted at a point in time, but not all folds need to be run concurrently to make the result consistent to the point in time. This would allow folds could be directly throttled by the coverage process so that only one fold was being run on each node at once, without opening up a time-gap between snapshots that would increase the number of false repairs. -- In Leveled a special fold currently supports the Tic-Tac tree generation for indexes, and one for objects. It may be better to support this through a offering a more open capability to pass different fold functions and accumulators into index folds. This could be re-used for "reporting indexes", where we want to count terms of different types rather than return all those terms via an accumulating list e.g. an index may have a bitmap style part, and the function will apply a wildcard mask to the bitmap and count the number of hits against each possible output. + - It may be possible to make the leveldb behaviour async like leveled. The fold function contains the setup of the iterator and doing the fold, and perhaps these could be separated such that the iterator would be setup prior to the fold function being returned: -- The initial intention is to implement the hashtree query functions based around the coverage_fsm behaviour, but with the option to stipulate externally the offset. So to test for differences between clusters, the user could concurrently query the two clusters for the same offset (or a random offset), whereas to find entropy within a cluster two concurrently run queries could be compared for different offsets. + ``` + fold_keys(Ref, Fun, Acc0, Opts) -> + {ok, Itr} = iterator(Ref, Opts, keys_only), + do_fold(Itr, Fun, Acc0, Opts). + ``` + - The potential would then exist for a `riak_core_rollingcoverage_fsm` as a variation on `riak_core_coverage_fsm`. Whereas `riak_core_coverage_fsm` makes a coverage request and the sits in a `waiting_results` state until all vnodes are done, the rollingcoverage version may have async folders returned to it, and the roll over each folder in turn. So all the folds will be run at a snapshot that is close to the same point in time, but only one fold is running at a time hence minimising the impact on the cluster. -- A surprising feature of read repair is that it will read repair to fallback nodes, not just primary nodes. 
This means that in read-intensive workloads, write activity may dramatically increase during node failure (as a large proportion of reads will become write events) - increasing the chance of servers falling domino style. However, in some circumstances the extra duplication can also [increase the chance of data loss](https://github.com/russelldb/russelldb.github.io/blob/master/3.2.kv679-solution.md)! This also increases greatly the volume of unnecessary data to be handed-off when the primary returns. Without active anti-entropy, and in the absence of other safety checks like `notfound_ok` being set to false, or `pr` being set to at least 1 - there will be scenarios where this feature may be helpful. As part of improving active anti-entropy, it may be wise to re-visit the tuning of anti-entropy features that existed prior to AAE, in particular should it be possible to configure read-repair to act on primary nodes only. +* In Leveled a special fold currently supports the Tic-Tac tree generation for indexes, and one for objects. It may be better to support this through a offering a more open capability to pass different fold functions and accumulators into index folds. This could be re-used for "reporting indexes", where we want to count terms of different types rather than return all those terms via an accumulating list e.g. an index may have a bitmap style part, and the function will apply a wildcard mask to the bitmap and count the number of hits against each possible output. + +* The initial intention is to implement the hashtree query functions based around the coverage_fsm behaviour, but with the option to stipulate externally the offset. So to test for differences between clusters, the user could concurrently query the two clusters for the same offset (or a random offset), whereas to find entropy within a cluster two concurrently run queries could be compared for different offsets. + +* A surprising feature of read repair is that it will read repair to fallback nodes, not just primary nodes. This means that in read-intensive workloads, write activity may dramatically increase during node failure (as a large proportion of reads will become write events) - increasing the chance of servers falling domino style. However, in some circumstances the extra duplication can also [increase the chance of data loss](https://github.com/russelldb/russelldb.github.io/blob/master/3.2.kv679-solution.md)! This also increases greatly the volume of unnecessary data to be handed-off when the primary returns. Without active anti-entropy, and in the absence of other safety checks like `notfound_ok` being set to false, or `pr` being set to at least 1 - there will be scenarios where this feature may be helpful. As part of improving active anti-entropy, it may be wise to re-visit the tuning of anti-entropy features that existed prior to AAE, in particular should it be possible to configure read-repair to act on primary nodes only. From cb5f09496f324d8849ece91bc4b68024d959ea52 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Wed, 12 Jul 2017 14:50:38 +0100 Subject: [PATCH 55/58] Update thoughts on concurrency management After reviewing the code it looks easier to have separate pool of vnode workers to manage concurrency, rather than trying to alter the coverage FSM itself. This will make it easier to adjust for different backend capabilities (i.e. worst case it would fallback to unthrottled). 
---
 docs/ANTI_ENTROPY.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/ANTI_ENTROPY.md b/docs/ANTI_ENTROPY.md
index 2c1c87f..c93e4ff 100644
--- a/docs/ANTI_ENTROPY.md
+++ b/docs/ANTI_ENTROPY.md
@@ -259,7 +259,7 @@ This likely represents gaps in current understanding, rather than flaws in the ap
 Some notes on re-using this alternative anti-entropy mechanism within Riak:
-* There is divergence between Leveled and LevelDB with regards to how async folds are implemented. Within LevelDB requesting an async fold returns a folder function that will take a snapshot when it is called. Within Leveled the option exists to take the snapshot before returning the folder function, so that calling the folder function will work on a snapshot of the store taken when the folder was requested. This difference caused issues with testing with riak_kv_sweeper, as the scheduling in sweeper meant that folds would be requested, and left on a queue for long enough to be timed out by the time it was called. The quick fix for riak_kv_sweeper testing was to make the folder snapshot behaviour in Leveled consistent with LevelDB. However, the original behaviour opens up some interesting possibilities for AAE implementation in that a coverage set of vnodes could be snapshotted at a point in time, but not all folds need to be run concurrently to make the result consistent to the point in time. This would allow folds to be directly throttled by the coverage process so that only one fold was being run on each node at once, without opening up a time-gap between snapshots that would increase the number of false repairs.
+* There is divergence between Leveled and LevelDB with regards to how async folds are implemented. Within LevelDB requesting an async fold returns a folder function that will take a snapshot when it is called. Within Leveled the option exists to take the snapshot before returning the folder function, so that calling the folder function will work on a snapshot of the store taken when the folder was requested. This difference caused issues with testing with riak_kv_sweeper, as the scheduling in sweeper meant that folds would be requested, and left on a queue for long enough to be timed out by the time it was called. The quick fix for riak_kv_sweeper testing was to make the folder snapshot behaviour in Leveled consistent with LevelDB. However, the original behaviour opens up some interesting possibilities for AAE implementation in that a coverage set of vnodes could be snapshotted at a point in time, but not all folds need to be run concurrently to make the result consistent to the point in time. This would allow folds to be directly throttled during the coverage process to manage the number of folds running on each node at once, without opening up a time-gap between snapshots that would increase the number of false repairs.
 - It may be possible to make the leveldb behaviour async like leveled. The fold function contains the setup of the iterator and doing the fold, and perhaps these could be separated such that the iterator would be set up prior to the fold function being returned:
@@ -268,7 +268,9 @@ Some notes on re-using this alternative anti-entropy mechanism within Riak:
 ```
 {ok, Itr} = iterator(Ref, Opts, keys_only),
 do_fold(Itr, Fun, Acc0, Opts).
 ```
- - The potential would then exist for a `riak_core_rollingcoverage_fsm` as a variation on `riak_core_coverage_fsm`.
Whereas `riak_core_coverage_fsm` makes a coverage request and then sits in a `waiting_results` state until all vnodes are done, the rollingcoverage version may have async folders returned to it, and then roll over each folder in turn. So all the folds will be run at a snapshot that is close to the same point in time, but only one fold is running at a time, hence minimising the impact on the cluster.
+ - Likewise with bitcask: it is currently async, but with the snapshot taken effectively inside the async folder function that is returned (for bitcask the snapshot opens a new bitcask store in read-only mode). This snapshot could be moved outside of the async part but, unlike with leveldb and leveled snapshots, it is a relatively expensive operation - so taking it up-front would block the main bitcask process in an unhealthy way. Finding a simple way of snapshotting prior to the fold and outside of the async process would therefore require more work in Bitcask.
+
+ - riak_core supports vnode_worker_pools (currently only one) and riak_kv sets up a pool for folds. If riak_core were to be changed to support more than one pool, a second pool could be set up for snapped folds (i.e. where the response is {snap_async, Work, From, NewModState} as opposed to [async](https://github.com/basho/riak_core/blob/2.1.8/src/riak_core_vnode.erl#L358-#L362), the second vnode_worker_pool would be asked to fulfill this work). The second pool could have a more constrained number of concurrent workers - so these large folds could have concurrency throttled, without a timing impact on the consistency of the results across vnodes.
 * In Leveled a special fold currently supports the Tic-Tac tree generation for indexes, and one for objects. It may be better to support this through offering a more open capability to pass different fold functions and accumulators into index folds. This could be re-used for "reporting indexes", where we want to count terms of different types rather than return all those terms via an accumulating list, e.g. an index may have a bitmap style part, and the function will apply a wildcard mask to the bitmap and count the number of hits against each possible output.

From 2ad1ac0baff2702b65d7088379a43d17ed05ac18 Mon Sep 17 00:00:00 2001
From: Martin Sumner
Date: Thu, 13 Jul 2017 17:40:06 +0100
Subject: [PATCH 56/58] node_worker_pool

Having a separate vnode_worker_pool wouldn't resolve the parallelism issue, obviously! Need a node_worker_pool instead.

---
 docs/ANTI_ENTROPY.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/ANTI_ENTROPY.md b/docs/ANTI_ENTROPY.md
index c93e4ff..3865645 100644
--- a/docs/ANTI_ENTROPY.md
+++ b/docs/ANTI_ENTROPY.md
@@ -270,7 +270,7 @@ Some notes on re-using this alternative anti-entropy mechanism within Riak:
 ```
 - Likewise with bitcask: it is currently async, but with the snapshot taken effectively inside the async folder function that is returned (for bitcask the snapshot opens a new bitcask store in read-only mode). This snapshot could be moved outside of the async part but, unlike with leveldb and leveled snapshots, it is a relatively expensive operation - so taking it up-front would block the main bitcask process in an unhealthy way. Finding a simple way of snapshotting prior to the fold and outside of the async process would therefore require more work in Bitcask.
- - riak_core supports vnode_worker_pools (currently only one) and riak_kv sets up a pool for folds. If riak_core were to be changed to support more than one pool, a second pool could be set up for snapped folds (i.e.
where the response is {snap_async, Work, From, NewModState} as opposed to [async](https://github.com/basho/riak_core/blob/2.1.8/src/riak_core_vnode.erl#L358-#L362), the second vnode_worker_pool would be asked to fulfill this work). The second pool could have a more constrained number of concurrent workers - so these large folds could have concurrency throttled, without a timing impact on the consistency of the results across vnodes.
+ - riak_core supports vnode_worker_pools (currently only one) and riak_kv sets up a pool for folds. The potential may also exist to have a node_worker_pool on each node. It may then be possible to divert snapped async work to this pool (i.e. where the response is {snap_async, Work, From, NewModState} as opposed to [async](https://github.com/basho/riak_core/blob/2.1.8/src/riak_core_vnode.erl#L358-#L362), the node_worker_pool would be asked to fulfill this work). The second pool could have a more constrained number of concurrent workers, perhaps just one. Therefore no more than one vnode on the node would be active doing this sort of work at any one time, and when that work is finished the next vnode in the queue would pick up and commence its fold.
 * In Leveled a special fold currently supports the Tic-Tac tree generation for indexes, and one for objects. It may be better to support this through offering a more open capability to pass different fold functions and accumulators into index folds. This could be re-used for "reporting indexes", where we want to count terms of different types rather than return all those terms via an accumulating list, e.g. an index may have a bitmap style part, and the function will apply a wildcard mask to the bitmap and count the number of hits against each possible output.

From 9c4910fe2640c62017e9ec697c745ab59f0f176a Mon Sep 17 00:00:00 2001
From: Martin Sumner
Date: Fri, 21 Jul 2017 10:21:54 +0100
Subject: [PATCH 57/58] Edits to AAE write-up

Further re-read before push

---
 docs/ANTI_ENTROPY.md | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/docs/ANTI_ENTROPY.md b/docs/ANTI_ENTROPY.md
index 3865645..d40566b 100644
--- a/docs/ANTI_ENTROPY.md
+++ b/docs/ANTI_ENTROPY.md
@@ -4,27 +4,29 @@
 In the early history of Riak, there were three levels of protection against loss of data, where loss is caused by either a backend store not receiving data (because it was unavailable), or losing writes (due to a crash, or corruption of previously written data):
-- [Read repair](http://docs.basho.com/riak/kv/2.2.3/learn/concepts/replication/#read-repair), whenever an object was read, if as part of that read it was discovered that a vnode that should have the an update but instead has an older version of an object; then post the completion of the read the finite-state-machine managing the get would update the out-of-date vnode with the latest version.
+- [Read repair](http://docs.basho.com/riak/kv/2.2.3/learn/concepts/replication/#read-repair), whenever an object was read, the finite-state-machine managing the GET would wait for a response from all vnodes; after replying to the client the FSM would update any vnode which had revealed an out-of-date version of the object.
-- [Hinted handoff](http://docs.basho.com/riak/kv/2.2.3/using/reference/handoff/#types-of-handoff), if a fallback node has taken responsibility for writes to a given vnode due to a temporary ring change in the cluster (e.g.
due to a node failure), then when the expected primary returns to service the fallback node should be triggered to handoff any data it has (from this or any previous fallback period) to the expected primary vnode. Once handoff was complete the vnode would then self-destruct and remove any durable state. Fallback nodes start vnodes for the same ring partition as the primary vnode. A fallback node is selected because it owns the next vnode in the ring, but it starts a new vnode to replace the primary vnode, it doesn't store data in the vnode backend which caused it to be considered a fallback (fallback is to a node not to a vnode) - so handoff is not required to be selective about the data that is handed off.
+- [Hinted handoff](http://docs.basho.com/riak/kv/2.2.3/using/reference/handoff/#types-of-handoff), if a fallback node has taken responsibility for writes to a given vnode due to a temporary ring change in the cluster (e.g. due to a node failure), then when the expected primary returns to service the fallback node should be triggered to handoff any data it has (from this or any previous fallback period) to the expected primary vnode. Once handoff was complete the vnode would then self-destruct and remove any durable state. Fallback nodes start vnodes for the same ring partition as the primary vnode. A fallback node is selected because it owns the next vnode in the ring, but it starts a new vnode to replace the primary vnode; it doesn't store data in the vnode backend which caused it to be considered a fallback (fallback is to a node, not to a vnode) - so handoff is not normally required to be selective about the data that is handed off.
 - [Key-listing for multi-data-centre replication](http://docs.basho.com/riak/kv/2.2.3/using/reference/v2-multi-datacenter/architecture/#fullsync-replication), for customers with the proprietary Riak Enterprise software there was a mechanism whereby vnode by vnode there would be a fold over all the objects in the vnode, for a replicated bucket, calculating a hash for the object and sending the keys and hashes to a replicated cluster for comparison with the result of its equivalent object fold. Any variances would then be repaired by streaming those missing updates between the clusters to be re-added across all required vnodes.
 There were three primary issues with these mechanisms:
-- Some objects may be read very infrequently, and such objects may be lost due to a series of failure or disk-corruption events that occurred between reads.
+- Some objects may be read very infrequently, and such objects may be lost due to a series of failure or disk-corruption events that occurred between reads and hence without the protection of read repair.
-- For large stores per-vnode object folding required for MDC was an expensive operation, and when run in parallel with standard database load could lead to unpredictability in response times.
+- For large stores, per-vnode object folding required for MDC was an expensive operation, and when run in parallel with standard database load could lead to unpredictable response times.
 - Some read events do not validate across multiple vnodes, primarily secondary index queries, so an inconsistent index due to a failed write would never be detected by the database. Secondary index queries were not necessarily eventually consistent, but were potentially never consistent.
-To address these weaknesses Active Anti-Entropy (AAE) was introduced to Riak, as a configurable option.
Configuring Active Anti-Entropy would start a new AAE hashtree store for every primary vnode in the ring. The vnode process would following a successful put [update this hashtree store process](https://github.com/basho/riak_kv/blob/2.1.7/src/riak_kv_vnode.erl#L2139-L2169) by sending it the updated object after converting it from its binary format. This would generally happen in an async way, but periodically the change would block the vnode to confirm that the AAE process was keeping up. The hashtree store process would hash the riak object to create a hash for the update, and hash the Key to map it to one of 1024 * 1024 segments - and then in batches update the store with a key of {$t, Partition, Segment, Key}, and a value of the object hash.
+To address these weaknesses Active Anti-Entropy (AAE) was introduced to Riak, as a configurable option. Configuring Active Anti-Entropy would start a new AAE "hashtree" store for every primary vnode in the ring. The vnode process would, following a successful put, [update this hashtree store process](https://github.com/basho/riak_kv/blob/2.1.7/src/riak_kv_vnode.erl#L2139-L2169) by sending it the updated object after converting it from its binary format. This would generally happen via async message passing, but periodically the change would block the vnode to confirm that the AAE process was keeping up. The hashtree store process would hash the riak object to create a hash for the update, and hash the Key to map it to one of 1024 * 1024 segments - and then in batches update the store with a key of {$t, Partition, Segment, Key}, and a value of the object hash.
-From this persisted store a Merkle tree is maintained for each Partition. These Merkle trees can then then be exchanged with another vnode's AAE hashtree store if that vnode is also a primary vnode for that same partition. Exchanging the Merkle tree would highlight any segments which had a variance - and then it would be possible to iterate over the store segment by segment to discover which keys actually differed. Those keys could then be re-read, so that read repair would correct any entropy that actually existed.
+From this persisted store a [Merkle tree](https://en.wikipedia.org/wiki/Merkle_tree) is maintained for each Partition. These Merkle trees can then be exchanged with another vnode's AAE hashtree store if that vnode is also a primary vnode for that same partition. Exchanging the Merkle tree would highlight any segments which had a variance - and then it would be possible to iterate over the store segment by segment to discover which keys actually differed. The objects associated with these keys could then be re-read within the actual vnode stores, so that read repair would correct any entropy that had been indicated by the discrepancy between the hashtree stores.
-The process of maintaining the hashtree is partially deferred to the point of the exchange, but is relatively low cost, and so these exchanges can occur frequently without creating significant background load. Infrequently, but regularly, the hashtree store would be cleared and rebuilt from an object fold to ensure that it reflected the actual persisted state in the store.
+The process of maintaining the hashtree is partially deferred to the point of the exchange, and this update process is of a low, but not necessarily trivial, cost. The cost is low enough that these exchanges can occur with reasonable frequency (i.e. many minutes between exchanges) without creating significant background load.
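To make the keying concrete, a sketch of the mapping described above - illustrative only, with erlang:phash2 standing in for riak_kv's actual tag-specific hash functions, and the module name hypothetical:

```erlang
-module(aae_keying_sketch).
-export([aae_store_entry/3]).

-define(AAE_SEGMENT_COUNT, 1024 * 1024).

%% Hash the Key to map it to one of 1024 * 1024 segments, then build the
%% {$t, Partition, Segment, Key} store key with the object hash as value.
aae_store_entry(Partition, Key, ObjectBin) ->
    ObjectHash = erlang:phash2(ObjectBin),
    Segment = erlang:phash2(Key, ?AAE_SEGMENT_COUNT),
    {{$t, Partition, Segment, Key}, ObjectHash}.
```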
-The impact on the three primary issues of anti-entropy of this mechanism was:
+Infrequently, but regularly, the hashtree store would be cleared and rebuilt from an object fold over the vnode store to ensure that it reflected the actual persisted state in the store. This rebuild process depends on some cluster-wide lock acquisition and other throttling techniques, as it has to avoid generating false negative results from exchanges scheduled to occur during the rebuild, avoid unexpected conditions on shutdown during the rebuild, avoid excessive concurrency of rebuild operations within the cluster, and avoid flooding the cluster with read-repair events following a rebuild. Despite precautionary measures within the design, the rebuild process is, when compared to other Riak features, a relatively common trigger for production issues.
+
+Prior to AAE being available there were three primary issues with anti-entropy in Riak, as listed above. The impact on the three primary issues from introducing AAE was:
 - Repair was de-coupled from reads, and so unread objects would now have a vastly reduced risk of disappearing following node failure and disk corruption events.
@@ -32,7 +34,7 @@ The impact on the three primary issues of anti-entropy of this mechanism was:
 - Secondary index queries would be made consistent following PUT failure after the next AAE exchange, and those exchanges are regular. Consistency is maintained by comparison of the actual object, not the index entries within the backend, and so loss of index data due to backend corruption would still not be detected by AAE.
-Although this represented an improvement in terms of entropy management, there were still some imperfections with the mechanism:
+Although this represented an improvement in terms of entropy management, there were still some imperfections with the approach:
 - The hash of the object was *not* based on a canonicalised version of the object, so could be inconsistent between trees (https://github.com/basho/riak_kv/issues/1189).

From 2297c0fab1c5c4369ca11391856d1a1b73861fc7 Mon Sep 17 00:00:00 2001
From: Martin Sumner
Date: Fri, 21 Jul 2017 10:27:43 +0100
Subject: [PATCH 58/58] Change snap_async to snap

As currently implemented in riak_core branch

---
 docs/ANTI_ENTROPY.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/ANTI_ENTROPY.md b/docs/ANTI_ENTROPY.md
index d40566b..f92eb19 100644
--- a/docs/ANTI_ENTROPY.md
+++ b/docs/ANTI_ENTROPY.md
@@ -272,7 +272,7 @@ Some notes on re-using this alternative anti-entropy mechanism within Riak:
 ```
 - Likewise with bitcask: it is currently async, but with the snapshot taken effectively inside the async folder function that is returned (for bitcask the snapshot opens a new bitcask store in read-only mode). This snapshot could be moved outside of the async part but, unlike with leveldb and leveled snapshots, it is a relatively expensive operation - so taking it up-front would block the main bitcask process in an unhealthy way. Finding a simple way of snapshotting prior to the fold and outside of the async process would therefore require more work in Bitcask.
- - riak_core supports vnode_worker_pools (currently only one) and riak_kv sets up a pool for folds. The potential may also exist to have a node_worker_pool on each node. It may then be possible to divert snapped async work to this pool (i.e.
where the response is {snap_async, Work, From, NewModState} as opposed to [async](https://github.com/basho/riak_core/blob/2.1.8/src/riak_core_vnode.erl#L358-#L362), the node_worker_pool would be asked to fulfill this work). The second pool could have a more constrained number of concurrent workers, perhaps just one. Therefore no more than one vnode on the node would be active doing this sort of work at any one time, and when that work is finished the next vnode in the queue would pick up and commence its fold.
+ - riak_core supports vnode_worker_pools (currently only one) and riak_kv sets up a pool for folds. The potential may also exist to have a node_worker_pool on each node. It may then be possible to divert snapped async work to this pool (i.e. where the response is {snap, Work, From, NewModState} as opposed to [async](https://github.com/basho/riak_core/blob/2.1.8/src/riak_core_vnode.erl#L358-#L362), the node_worker_pool would be asked to fulfill this work). The second pool could have a more constrained number of concurrent workers, perhaps just one. Therefore no more than one vnode on the node would be active doing this sort of work at any one time, and when that work is finished the next vnode in the queue would pick up and commence its fold.
 * In Leveled a special fold currently supports the Tic-Tac tree generation for indexes, and one for objects. It may be better to support this through offering a more open capability to pass different fold functions and accumulators into index folds. This could be re-used for "reporting indexes", where we want to count terms of different types rather than return all those terms via an accumulating list, e.g. an index may have a bitmap style part, and the function will apply a wildcard mask to the bitmap and count the number of hits against each possible output.
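To make the "reporting index" idea concrete, a minimal sketch of such a fold fun. The entry format assumed here ({Term, Key} with an integer bitmap term) is hypothetical, for illustration only, and is not Leveled's actual index fold contract:

```erlang
-module(reporting_index).
-export([bitmap_count_fold/1]).

%% Returns a fold fun that applies a wildcard Mask to a bitmap-style index
%% term and counts the hits against each possible masked output, rather
%% than accumulating every matching term in a list.
bitmap_count_fold(Mask) ->
    fun({Term, _Key}, Acc) when is_integer(Term) ->
        Masked = Term band Mask,
        maps:update_with(Masked, fun(C) -> C + 1 end, 1, Acc)
    end.
```

Folding with an initial accumulator of #{} would then yield a map from masked value to hit count, which stays small regardless of the number of index entries scanned.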