From 389694b11bf6aa3269d21fc9e2afc27b9bc6a2a8 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Tue, 26 Sep 2017 22:49:40 +0100 Subject: [PATCH] Add exportable option to tictac Idea being that sometimes you may wish to compare a tictac tree between leveled and something that doesn't understand erlang:phash or term_to_binary. So allow the magic_hash to be used instead - and perhaps an extract function that does base64 encoding or something similar. --- src/leveled_bookie.erl | 16 ++-- src/leveled_codec.erl | 3 +- src/leveled_tictac.erl | 122 ++++++++++++++++++++----------- test/end_to_end/tictac_SUITE.erl | 57 ++++++++------- 4 files changed, 117 insertions(+), 81 deletions(-) diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl index d7b73d5..eb422e1 100644 --- a/src/leveled_bookie.erl +++ b/src/leveled_bookie.erl @@ -934,14 +934,10 @@ tictactree(State, Tag, Bucket, Query, JournalCheck, TreeSize, Filter) -> fun() -> % The start key and end key will vary depending on whether the % fold is to fold over an index or a key range - {StartKey, EndKey, HashFun} = + {StartKey, EndKey, ExtractFun} = case Tag of ?IDX_TAG -> {IdxField, StartIdx, EndIdx} = Query, - HashIdxValFun = - fun(_Key, IdxValue) -> - erlang:phash2(IdxValue) - end, {leveled_codec:to_ledgerkey(Bucket, null, ?IDX_TAG, @@ -952,23 +948,21 @@ tictactree(State, Tag, Bucket, Query, JournalCheck, TreeSize, Filter) -> ?IDX_TAG, IdxField, EndIdx), - HashIdxValFun}; + fun(K, T) -> {K, T} end}; _ -> {StartObjKey, EndObjKey} = Query, - PassHashFun = fun(_Key, Hash) -> Hash end, {leveled_codec:to_ledgerkey(Bucket, StartObjKey, Tag), leveled_codec:to_ledgerkey(Bucket, EndObjKey, Tag), - PassHashFun} + fun(K, H) -> {K, {is_hash, H}} end} end, - AccFun = accumulate_tree(Filter, JournalCheck, JournalSnapshot, - HashFun), + ExtractFun), Acc = leveled_penciller:pcl_fetchkeys(LedgerSnapshot, StartKey, EndKey, @@ -1263,7 +1257,7 @@ accumulate_tree(FilterFun, JournalCheck, InkerClone, HashFun) -> fun(B, K, H, Tree) -> 
case FilterFun(B, K) of accumulate -> - leveled_tictac:add_kv(Tree, K, H, HashFun); + leveled_tictac:add_kv(Tree, K, H, HashFun, false); pass -> Tree end diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl index 9ed2a2c..6be90d7 100644 --- a/src/leveled_codec.erl +++ b/src/leveled_codec.erl @@ -464,7 +464,8 @@ aae_indexspecs(AAE, Bucket, Key, SQN, H, LastMods) -> {LMD1, TTL} -> TreeSize = AAE#recent_aae.tree_size, SegID = - leveled_tictac:get_segment(Key, TreeSize), + leveled_tictac:get_segment(erlang:phash2(Key), + TreeSize), IdxFldStr = ?NRT_IDX ++ LMD1 ++ "_bin", IdxTrmStr = string:right(integer_to_list(SegID), 8, $0) ++ diff --git a/src/leveled_tictac.erl b/src/leveled_tictac.erl index 39d7db8..d25132c 100644 --- a/src/leveled_tictac.erl +++ b/src/leveled_tictac.erl @@ -57,14 +57,14 @@ -export([ new_tree/1, new_tree/2, - add_kv/4, + add_kv/5, find_dirtyleaves/2, find_dirtysegments/2, fetch_root/1, fetch_leaves/2, merge_trees/2, get_segment/2, - tictac_hash/2, + tictac_hash/3, export_tree/1, import_tree/1 ]). @@ -161,13 +161,24 @@ import_tree(ExportedTree) -> level2 = Lv2}. -spec add_kv(tictactree(), tuple(), tuple(), fun()) -> tictactree(). +add_kv(TicTacTree, Key, Value, BinExtractFun) -> + add_kv(TicTacTree, Key, Value, BinExtractFun, false). + +-spec add_kv(tictactree(), tuple(), tuple(), fun(), boolean()) -> tictactree(). %% @doc -%% Add a Key and value to a tictactree using the HashFun to calculate the Hash -%% based on that key and value -add_kv(TicTacTree, Key, Value, HashFun) -> - HashV = HashFun(Key, Value), - SegChangeHash = tictac_hash(Key, HashV), - Segment = get_segment(Key, TicTacTree#tictactree.segment_count), +%% Add a Key and value to a tictactree using the BinExtractFun to extract a +%% binary from the Key and value from which to generate the hash. 
The +%% BinExtractFun will also need to do any canonicalisation necessary to make +%% the hash consistent (such as whitespace removal, or sorting) +%% +%% For exportable trees the hash function will be based on the DJ Bernstein +%% magic hash. For non-exportable trees erlang:phash2 will be used, and so +%% non-binary Keys and Values can be returned from the BinExtractFun in this +%% case. +add_kv(TicTacTree, Key, Value, BinExtractFun, Exportable) -> + {BinK, BinV} = BinExtractFun(Key, Value), + {SegHash, SegChangeHash} = tictac_hash(BinK, BinV, Exportable), + Segment = get_segment(SegHash, TicTacTree#tictactree.segment_count), Level2Pos = Segment band (TicTacTree#tictactree.width - 1), @@ -275,21 +286,33 @@ merge_trees(TreeA, TreeB) -> MergedTree#tictactree{level1 = NewLevel1, level2 = NewLevel2}. --spec get_segment(any(), integer()|small|medium|large|xlarge) -> integer(). +-spec get_segment(integer(), integer()|small|medium|large|xlarge) -> integer(). %% @doc %% Return the segment ID for a Key. Can pass the tree size or the actual %% segment count derived from the size -get_segment(Key, SegmentCount) when is_integer(SegmentCount) -> - erlang:phash2(Key) band (SegmentCount - 1); -get_segment(Key, TreeSize) -> - get_segment(Key, element(3, get_size(TreeSize))). +get_segment(Hash, SegmentCount) when is_integer(SegmentCount) -> + Hash band (SegmentCount - 1); +get_segment(Hash, TreeSize) -> + get_segment(Hash, element(3, get_size(TreeSize))). --spec tictac_hash(tuple(), any()) -> integer(). +-spec tictac_hash(any(), any(), boolean()) -> integer(). %% @doc -%% Hash the key and term -tictac_hash(Key, Term) -> - erlang:phash2({Key, Term}). +%% Hash the key and term, to either something repeatable in Erlang, or using +%% the DJ Bernstein hash if the tree needs to be compared with one +%% calculated with a non-Erlang store +%% +%% Boolean is Exportable.
Does the hash need to be repeatable by a non-Erlang +%% machine? +tictac_hash(BinKey, BinVal, true) + when is_binary(BinKey) and is_binary(BinVal) -> + HashKey = leveled_codec:magic_hash({binary, BinKey}), + HashVal = leveled_codec:magic_hash({binary, BinVal}), + {HashKey, HashKey bxor HashVal}; +tictac_hash(BinKey, {is_hash, HashedVal}, false) -> + {erlang:phash2(BinKey), erlang:phash2(BinKey) bxor HashedVal}; +tictac_hash(BinKey, BinVal, false) -> + {erlang:phash2(BinKey), erlang:phash2(BinKey) bxor erlang:phash2(BinVal)}. %%%============================================================================ %%% Internal functions @@ -363,13 +386,17 @@ simple_bysize_test() -> simple_test_withsize(xlarge). simple_test_withsize(Size) -> - HashFun = fun(_K, V) -> erlang:phash2(V) end, + BinFun = fun(K, V) -> {term_to_binary(K), term_to_binary(V)} end, + K1 = {o, "B1", "K1", null}, + K2 = {o, "B1", "K2", null}, + K3 = {o, "B1", "K3", null}, + Tree0 = new_tree(0, Size), - Tree1 = add_kv(Tree0, {o, "B1", "K1", null}, {caine, 1}, HashFun), - Tree2 = add_kv(Tree1, {o, "B1", "K2", null}, {caine, 2}, HashFun), - Tree3 = add_kv(Tree2, {o, "B1", "K3", null}, {caine, 3}, HashFun), - Tree3A = add_kv(Tree3, {o, "B1", "K3", null}, {caine, 4}, HashFun), + Tree1 = add_kv(Tree0, K1, {caine, 1}, BinFun), + Tree2 = add_kv(Tree1, K2, {caine, 2}, BinFun), + Tree3 = add_kv(Tree2, K3, {caine, 3}, BinFun), + Tree3A = add_kv(Tree3, K3, {caine, 4}, BinFun), ?assertMatch(true, Tree0#tictactree.level1 == Tree0#tictactree.level1), ?assertMatch(false, Tree0#tictactree.level1 == Tree1#tictactree.level1), ?assertMatch(false, Tree1#tictactree.level1 == Tree2#tictactree.level1), @@ -377,23 +404,28 @@ simple_test_withsize(Size) -> ?assertMatch(false, Tree3#tictactree.level1 == Tree3A#tictactree.level1), Tree0X = new_tree(0, Size), - Tree1X = add_kv(Tree0X, {o, "B1", "K3", null}, {caine, 3}, HashFun), - Tree2X = add_kv(Tree1X, {o, "B1", "K1", null}, {caine, 1}, HashFun), - Tree3X = add_kv(Tree2X, {o, "B1",
"K2", null}, {caine, 2}, HashFun), - Tree3XA = add_kv(Tree3X, {o, "B1", "K3", null}, {caine, 4}, HashFun), + Tree1X = add_kv(Tree0X, K3, {caine, 3}, BinFun), + Tree2X = add_kv(Tree1X, K1, {caine, 1}, BinFun), + Tree3X = add_kv(Tree2X, K2, {caine, 2}, BinFun), + Tree3XA = add_kv(Tree3X, K3, {caine, 4}, BinFun), ?assertMatch(false, Tree1#tictactree.level1 == Tree1X#tictactree.level1), ?assertMatch(false, Tree2#tictactree.level1 == Tree2X#tictactree.level1), ?assertMatch(true, Tree3#tictactree.level1 == Tree3X#tictactree.level1), ?assertMatch(true, Tree3XA#tictactree.level1 == Tree3XA#tictactree.level1), SC = Tree0#tictactree.segment_count, + + GetSegFun = + fun(TK) -> + get_segment(erlang:phash2(term_to_binary(TK)), SC) + end, DL0 = find_dirtyleaves(Tree1, Tree0), - ?assertMatch(true, lists:member(get_segment({o, "B1", "K1", null}, SC), DL0)), + ?assertMatch(true, lists:member(GetSegFun(K1), DL0)), DL1 = find_dirtyleaves(Tree3, Tree1), - ?assertMatch(true, lists:member(get_segment({o, "B1", "K2", null}, SC), DL1)), - ?assertMatch(true, lists:member(get_segment({o, "B1", "K3", null}, SC), DL1)), - ?assertMatch(false, lists:member(get_segment({o, "B1", "K1", null}, SC), DL1)), + ?assertMatch(true, lists:member(GetSegFun(K2), DL1)), + ?assertMatch(true, lists:member(GetSegFun(K3), DL1)), + ?assertMatch(false, lists:member(GetSegFun(K1), DL1)), % Export and import tree to confirm no difference ExpTree3 = export_tree(Tree3), @@ -416,24 +448,24 @@ merge_bysize_xlarge_test2() -> merge_test_withsize(xlarge). 
merge_test_withsize(Size) -> - HashFun = fun(_K, V) -> erlang:phash2(V) end, + BinFun = fun(K, V) -> {term_to_binary(K), term_to_binary(V)} end, TreeX0 = new_tree(0, Size), - TreeX1 = add_kv(TreeX0, {o, "B1", "X1", null}, {caine, 1}, HashFun), - TreeX2 = add_kv(TreeX1, {o, "B1", "X2", null}, {caine, 2}, HashFun), - TreeX3 = add_kv(TreeX2, {o, "B1", "X3", null}, {caine, 3}, HashFun), - TreeX4 = add_kv(TreeX3, {o, "B1", "X3", null}, {caine, 4}, HashFun), + TreeX1 = add_kv(TreeX0, {o, "B1", "X1", null}, {caine, 1}, BinFun), + TreeX2 = add_kv(TreeX1, {o, "B1", "X2", null}, {caine, 2}, BinFun), + TreeX3 = add_kv(TreeX2, {o, "B1", "X3", null}, {caine, 3}, BinFun), + TreeX4 = add_kv(TreeX3, {o, "B1", "X3", null}, {caine, 4}, BinFun), TreeY0 = new_tree(0, Size), - TreeY1 = add_kv(TreeY0, {o, "B1", "Y1", null}, {caine, 101}, HashFun), - TreeY2 = add_kv(TreeY1, {o, "B1", "Y2", null}, {caine, 102}, HashFun), - TreeY3 = add_kv(TreeY2, {o, "B1", "Y3", null}, {caine, 103}, HashFun), - TreeY4 = add_kv(TreeY3, {o, "B1", "Y3", null}, {caine, 104}, HashFun), + TreeY1 = add_kv(TreeY0, {o, "B1", "Y1", null}, {caine, 101}, BinFun), + TreeY2 = add_kv(TreeY1, {o, "B1", "Y2", null}, {caine, 102}, BinFun), + TreeY3 = add_kv(TreeY2, {o, "B1", "Y3", null}, {caine, 103}, BinFun), + TreeY4 = add_kv(TreeY3, {o, "B1", "Y3", null}, {caine, 104}, BinFun), - TreeZ1 = add_kv(TreeX4, {o, "B1", "Y1", null}, {caine, 101}, HashFun), - TreeZ2 = add_kv(TreeZ1, {o, "B1", "Y2", null}, {caine, 102}, HashFun), - TreeZ3 = add_kv(TreeZ2, {o, "B1", "Y3", null}, {caine, 103}, HashFun), - TreeZ4 = add_kv(TreeZ3, {o, "B1", "Y3", null}, {caine, 104}, HashFun), + TreeZ1 = add_kv(TreeX4, {o, "B1", "Y1", null}, {caine, 101}, BinFun), + TreeZ2 = add_kv(TreeZ1, {o, "B1", "Y2", null}, {caine, 102}, BinFun), + TreeZ3 = add_kv(TreeZ2, {o, "B1", "Y3", null}, {caine, 103}, BinFun), + TreeZ4 = add_kv(TreeZ3, {o, "B1", "Y3", null}, {caine, 104}, BinFun), TreeM0 = merge_trees(TreeX4, TreeY4), checktree(TreeM0), @@ -443,6 +475,10 
@@ merge_test_withsize(Size) -> checktree(TreeM1), ?assertMatch(false, TreeM1#tictactree.level1 == TreeZ4#tictactree.level1). +exportable_test() -> + {Int1, Int2} = tictac_hash(<<"key">>, <<"value">>, true), + ?assertMatch({true, true}, {is_integer(Int1), is_integer(Int2)}). + -endif. diff --git a/test/end_to_end/tictac_SUITE.erl b/test/end_to_end/tictac_SUITE.erl index a837738..6e7ba2d 100644 --- a/test/end_to_end/tictac_SUITE.erl +++ b/test/end_to_end/tictac_SUITE.erl @@ -114,19 +114,16 @@ many_put_compare(_Config) -> % Now run the same query by putting the tree-building responsibility onto % the fold_objects_fun - ApplyHash = - fun(HashFun) -> - fun(_Key, Value) -> - {proxy_object, HeadBin, _Size, _FetchFun} = binary_to_term(Value), - <> = HeadBin, - <> = Rest, - HashFun(lists:sort(binary_to_term(VclockBin))) - end - end, + ExtractClockFun = + fun(Key, Value) -> + {proxy_object, HeadBin, _Size, _FetchFun} = binary_to_term(Value), + <> = HeadBin, + {Key, lists:sort(binary_to_term(VclockBin))} + end, FoldObjectsFun = fun(_Bucket, Key, Value, Acc) -> - leveled_tictac:add_kv(Acc, Key, Value, ApplyHash(fun erlang:phash2/1)) + leveled_tictac:add_kv(Acc, Key, Value, ExtractClockFun, false) end, FoldQ0 = {foldheads_bybucket, @@ -157,22 +154,25 @@ many_put_compare(_Config) -> [timer:now_diff(os:timestamp(), SWB1Obj)]), true = length(leveled_tictac:find_dirtyleaves(TreeA, TreeAObj1)) == 0, - % AAE trees within riak are based on a sha of the vector clock. 
So to - % compare with an AAE tree we need to compare outputs when we're hashing - % a hash - AltHashFun = - fun(Term) -> - erlang:phash2(crypto:hash(sha, term_to_binary(Term))) + % For an exportable comparison, want hash to be based on something not + % coupled to erlang language - so use exportable query + AltExtractFun = + fun(K, V) -> + {proxy_object, HeadBin, _Size, _FetchFun} = binary_to_term(V), + <> = HeadBin, + {term_to_binary(K), VclockBin} end, AltFoldObjectsFun = fun(_Bucket, Key, Value, Acc) -> - leveled_tictac:add_kv(Acc, Key, Value, ApplyHash(AltHashFun)) + leveled_tictac:add_kv(Acc, Key, Value, AltExtractFun, true) end, AltFoldQ0 = {foldheads_bybucket, - o_rkv, - "Bucket", - {AltFoldObjectsFun, leveled_tictac:new_tree(0, TreeSize)}, - false, true}, + o_rkv, + "Bucket", + {AltFoldObjectsFun, leveled_tictac:new_tree(0, TreeSize)}, + false, + true}, {async, TreeAAltObjFolder0} = leveled_bookie:book_returnfolder(Bookie2, AltFoldQ0), SWB2Obj = os:timestamp(), @@ -187,15 +187,19 @@ many_put_compare(_Config) -> io:format("Build tictac tree via object fold with no "++ "presence check and 200K objects and alt hash in ~w~n", [timer:now_diff(os:timestamp(), SWB3Obj)]), - true = - length(leveled_tictac:find_dirtyleaves(TreeBAltObj, TreeAAltObj)) == 1, + DL_ExportFold = + length(leveled_tictac:find_dirtyleaves(TreeBAltObj, TreeAAltObj)), + io:format("Found dirty leaves with exportable comparison of ~w~n", + [DL_ExportFold]), + true = DL_ExportFold == 1, %% Finding differing keys FoldKeysFun = fun(SegListToFind) -> fun(_B, K, Acc) -> - Seg = leveled_tictac:get_segment(K, SegmentCount), + Seg = + leveled_tictac:get_segment(erlang:phash2(K), SegmentCount), case lists:member(Seg, SegListToFind) of true -> [K|Acc]; @@ -469,7 +473,8 @@ index_compare(_Config) -> FoldKeysIndexQFun = fun(_Bucket, {Term, Key}, Acc) -> - Seg = leveled_tictac:get_segment(Key, SegmentCount), + Seg = + leveled_tictac:get_segment(erlang:phash2(Key), SegmentCount), case lists:member(Seg, DL3_0) 
of true -> [{Term, Key}|Acc];