From 389694b11bf6aa3269d21fc9e2afc27b9bc6a2a8 Mon Sep 17 00:00:00 2001
From: Martin Sumner <martin.sumner@adaptip.co.uk>
Date: Tue, 26 Sep 2017 22:49:40 +0100
Subject: [PATCH] Add exportable option to tictac

The idea is that sometimes you may wish to compare a tictac tree between leveled and a store that doesn't understand erlang:phash2 or term_to_binary.  So allow the magic_hash to be used instead - perhaps together with an extract function that does base64 encoding or something similar.
---
 src/leveled_bookie.erl           |  16 ++--
 src/leveled_codec.erl            |   3 +-
 src/leveled_tictac.erl           | 122 ++++++++++++++++++++-----------
 test/end_to_end/tictac_SUITE.erl |  57 ++++++++-------
 4 files changed, 117 insertions(+), 81 deletions(-)

diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl
index d7b73d5..eb422e1 100644
--- a/src/leveled_bookie.erl
+++ b/src/leveled_bookie.erl
@@ -934,14 +934,10 @@ tictactree(State, Tag, Bucket, Query, JournalCheck, TreeSize, Filter) ->
         fun() ->
             % The start key and end key will vary depending on whether the
             % fold is to fold over an index or a key range
-            {StartKey, EndKey, HashFun} =
+            {StartKey, EndKey, ExtractFun} =
                 case Tag of
                     ?IDX_TAG ->
                         {IdxField, StartIdx, EndIdx} = Query,
-                        HashIdxValFun =
-                            fun(_Key, IdxValue) ->
-                                erlang:phash2(IdxValue)
-                            end,
                         {leveled_codec:to_ledgerkey(Bucket,
                                                     null,
                                                     ?IDX_TAG,
@@ -952,23 +948,21 @@ tictactree(State, Tag, Bucket, Query, JournalCheck, TreeSize, Filter) ->
                                                         ?IDX_TAG,
                                                         IdxField,
                                                         EndIdx),
-                            HashIdxValFun};
+                            fun(K, T) -> {K, T} end};
                     _ ->
                         {StartObjKey, EndObjKey} = Query,
-                        PassHashFun = fun(_Key, Hash) -> Hash end,
                         {leveled_codec:to_ledgerkey(Bucket,
                                                     StartObjKey,
                                                     Tag),
                             leveled_codec:to_ledgerkey(Bucket,
                                                         EndObjKey,
                                                         Tag),
-                            PassHashFun}
+                            fun(K, H) -> {K, {is_hash, H}} end}
                 end,
-
             AccFun = accumulate_tree(Filter,
                                         JournalCheck,
                                         JournalSnapshot,
-                                        HashFun),
+                                        ExtractFun),
             Acc = leveled_penciller:pcl_fetchkeys(LedgerSnapshot,
                                                     StartKey,
                                                     EndKey,
@@ -1263,7 +1257,7 @@ accumulate_tree(FilterFun, JournalCheck, InkerClone, HashFun) ->
         fun(B, K, H, Tree) ->
             case FilterFun(B, K) of
                 accumulate ->
-                    leveled_tictac:add_kv(Tree, K, H, HashFun);
+                    leveled_tictac:add_kv(Tree, K, H, HashFun, false);
                 pass ->
                     Tree
             end
diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl
index 9ed2a2c..6be90d7 100644
--- a/src/leveled_codec.erl
+++ b/src/leveled_codec.erl
@@ -464,7 +464,8 @@ aae_indexspecs(AAE, Bucket, Key, SQN, H, LastMods) ->
                         {LMD1, TTL} ->
                             TreeSize = AAE#recent_aae.tree_size,
                             SegID =
-                                leveled_tictac:get_segment(Key, TreeSize),
+                                leveled_tictac:get_segment(erlang:phash2(Key), 
+                                                            TreeSize),
                             IdxFldStr = ?NRT_IDX ++ LMD1 ++ "_bin",
                             IdxTrmStr =
                                 string:right(integer_to_list(SegID), 8, $0) ++
diff --git a/src/leveled_tictac.erl b/src/leveled_tictac.erl
index 39d7db8..d25132c 100644
--- a/src/leveled_tictac.erl
+++ b/src/leveled_tictac.erl
@@ -57,14 +57,14 @@
 -export([
             new_tree/1,
             new_tree/2,
-            add_kv/4,
+            add_kv/5,
             find_dirtyleaves/2,
             find_dirtysegments/2,
             fetch_root/1,
             fetch_leaves/2,
             merge_trees/2,
             get_segment/2,
-            tictac_hash/2,
+            tictac_hash/3,
             export_tree/1,
             import_tree/1
         ]).
@@ -161,13 +161,24 @@ import_tree(ExportedTree) ->
                     level2 = Lv2}.
 
 -spec add_kv(tictactree(), tuple(), tuple(), fun()) -> tictactree().
+add_kv(TicTacTree, Key, Value, BinExtractFun) ->
+    add_kv(TicTacTree, Key, Value, BinExtractFun, false).
+
+-spec add_kv(tictactree(), tuple(), tuple(), fun(), boolean()) -> tictactree().
 %% @doc
-%% Add a Key and value to a tictactree using the HashFun to calculate the Hash
-%% based on that key and value
-add_kv(TicTacTree, Key, Value, HashFun) ->
-    HashV = HashFun(Key, Value),
-    SegChangeHash = tictac_hash(Key, HashV),
-    Segment = get_segment(Key, TicTacTree#tictactree.segment_count),
+%% Add a Key and value to a tictactree using the BinExtractFun to extract a 
+%% binary from the Key and value from which to generate the hash.  The 
+%% BinExtractFun will also need to do any canonicalisation necessary to make
+%% the hash consistent (such as whitespace removal, or sorting)
+%%
+%% For exportable trees the hash function will be based on the DJ Bernstein
+%% magic hash.  For non-exportable trees erlang:phash2 will be used, and so 
+%% non-binary Keys and Values can be returned from the BinExtractFun in this
+%% case.
+add_kv(TicTacTree, Key, Value, BinExtractFun, Exportable) ->
+    {BinK, BinV} = BinExtractFun(Key, Value),
+    {SegHash, SegChangeHash} = tictac_hash(BinK, BinV, Exportable),
+    Segment = get_segment(SegHash, TicTacTree#tictactree.segment_count),
     
     Level2Pos =
         Segment band (TicTacTree#tictactree.width - 1),
@@ -275,21 +286,33 @@ merge_trees(TreeA, TreeB) ->
     
     MergedTree#tictactree{level1 = NewLevel1, level2 = NewLevel2}.
 
--spec get_segment(any(), integer()|small|medium|large|xlarge) -> integer().
+-spec get_segment(integer(), integer()|small|medium|large|xlarge) -> integer().
 %% @doc
 %% Return the segment ID for a Key.  Can pass the tree size or the actual
 %% segment count derived from the size
-get_segment(Key, SegmentCount) when is_integer(SegmentCount) ->
-    erlang:phash2(Key) band (SegmentCount - 1);
-get_segment(Key, TreeSize) ->
-    get_segment(Key, element(3, get_size(TreeSize))).
+get_segment(Hash, SegmentCount) when is_integer(SegmentCount) ->
+    Hash band (SegmentCount - 1);
+get_segment(Hash, TreeSize) ->
+    get_segment(Hash, element(3, get_size(TreeSize))).
 
 
--spec tictac_hash(tuple(), any()) -> integer().
+-spec tictac_hash(any(), any(), boolean()) -> {integer(), integer()}.
 %% @doc
-%% Hash the key and term
-tictac_hash(Key, Term) ->
-    erlang:phash2({Key, Term}).
+%% Hash the key and term, to either something repeatable in Erlang, or using
+%% the DJ Bernstein hash if the tree needs to be compared with one
+%% calculated with a non-Erlang store.  Returns {SegmentHash, ChangeHash}.
+%%
+%% Boolean is Exportable: does the hash need to be repeatable by a non-Erlang
+%% machine?
+tictac_hash(BinKey, BinVal, true)
+                            when is_binary(BinKey), is_binary(BinVal) ->
+    HashKey = leveled_codec:magic_hash({binary, BinKey}),
+    HashVal = leveled_codec:magic_hash({binary, BinVal}),
+    {HashKey, HashKey bxor HashVal};
+tictac_hash(BinKey, {is_hash, HashedVal}, false) ->
+    {erlang:phash2(BinKey), erlang:phash2(BinKey) bxor HashedVal};
+tictac_hash(BinKey, BinVal, false) ->
+    {erlang:phash2(BinKey), erlang:phash2(BinKey) bxor erlang:phash2(BinVal)}.
 
 %%%============================================================================
 %%% Internal functions
@@ -363,13 +386,17 @@ simple_bysize_test() ->
     simple_test_withsize(xlarge).
 
 simple_test_withsize(Size) ->
-    HashFun = fun(_K, V) -> erlang:phash2(V) end,
+    BinFun = fun(K, V) -> {term_to_binary(K), term_to_binary(V)} end,
     
+    K1 = {o, "B1", "K1", null},
+    K2 = {o, "B1", "K2", null},
+    K3 = {o, "B1", "K3", null},
+
     Tree0 = new_tree(0, Size),
-    Tree1 = add_kv(Tree0, {o, "B1", "K1", null}, {caine, 1}, HashFun),
-    Tree2 = add_kv(Tree1, {o, "B1", "K2", null}, {caine, 2}, HashFun),
-    Tree3 = add_kv(Tree2, {o, "B1", "K3", null}, {caine, 3}, HashFun),
-    Tree3A = add_kv(Tree3, {o, "B1", "K3", null}, {caine, 4}, HashFun),
+    Tree1 = add_kv(Tree0, K1, {caine, 1}, BinFun),
+    Tree2 = add_kv(Tree1, K2, {caine, 2}, BinFun),
+    Tree3 = add_kv(Tree2, K3, {caine, 3}, BinFun),
+    Tree3A = add_kv(Tree3, K3, {caine, 4}, BinFun),
     ?assertMatch(true, Tree0#tictactree.level1 == Tree0#tictactree.level1),
     ?assertMatch(false, Tree0#tictactree.level1 == Tree1#tictactree.level1),
     ?assertMatch(false, Tree1#tictactree.level1 == Tree2#tictactree.level1),
@@ -377,23 +404,28 @@ simple_test_withsize(Size) ->
     ?assertMatch(false, Tree3#tictactree.level1 == Tree3A#tictactree.level1),
     
     Tree0X = new_tree(0, Size),
-    Tree1X = add_kv(Tree0X, {o, "B1", "K3", null}, {caine, 3}, HashFun),
-    Tree2X = add_kv(Tree1X, {o, "B1", "K1", null}, {caine, 1}, HashFun),
-    Tree3X = add_kv(Tree2X, {o, "B1", "K2", null}, {caine, 2}, HashFun),
-    Tree3XA = add_kv(Tree3X, {o, "B1", "K3", null}, {caine, 4}, HashFun),
+    Tree1X = add_kv(Tree0X, K3, {caine, 3}, BinFun),
+    Tree2X = add_kv(Tree1X, K1, {caine, 1}, BinFun),
+    Tree3X = add_kv(Tree2X, K2, {caine, 2}, BinFun),
+    Tree3XA = add_kv(Tree3X, K3, {caine, 4}, BinFun),
     ?assertMatch(false, Tree1#tictactree.level1 == Tree1X#tictactree.level1),
     ?assertMatch(false, Tree2#tictactree.level1 == Tree2X#tictactree.level1),
     ?assertMatch(true, Tree3#tictactree.level1 == Tree3X#tictactree.level1),
     ?assertMatch(true, Tree3XA#tictactree.level1 == Tree3XA#tictactree.level1),
     
     SC = Tree0#tictactree.segment_count,
+
+    GetSegFun = 
+        fun(TK) ->
+            get_segment(erlang:phash2(term_to_binary(TK)), SC)
+        end,
     
     DL0 = find_dirtyleaves(Tree1, Tree0),
-    ?assertMatch(true, lists:member(get_segment({o, "B1", "K1", null}, SC), DL0)),
+    ?assertMatch(true, lists:member(GetSegFun(K1), DL0)),
     DL1 = find_dirtyleaves(Tree3, Tree1),
-    ?assertMatch(true, lists:member(get_segment({o, "B1", "K2", null}, SC), DL1)),
-    ?assertMatch(true, lists:member(get_segment({o, "B1", "K3", null}, SC), DL1)),
-    ?assertMatch(false, lists:member(get_segment({o, "B1", "K1", null}, SC), DL1)),
+    ?assertMatch(true, lists:member(GetSegFun(K2), DL1)),
+    ?assertMatch(true, lists:member(GetSegFun(K3), DL1)),
+    ?assertMatch(false, lists:member(GetSegFun(K1), DL1)),
     
     % Export and import tree to confirm no difference
     ExpTree3 = export_tree(Tree3),
@@ -416,24 +448,24 @@ merge_bysize_xlarge_test2() ->
     merge_test_withsize(xlarge).
 
 merge_test_withsize(Size) ->
-    HashFun = fun(_K, V) -> erlang:phash2(V) end,
+    BinFun = fun(K, V) -> {term_to_binary(K), term_to_binary(V)} end,
     
     TreeX0 = new_tree(0, Size),
-    TreeX1 = add_kv(TreeX0, {o, "B1", "X1", null}, {caine, 1}, HashFun),
-    TreeX2 = add_kv(TreeX1, {o, "B1", "X2", null}, {caine, 2}, HashFun),
-    TreeX3 = add_kv(TreeX2, {o, "B1", "X3", null}, {caine, 3}, HashFun),
-    TreeX4 = add_kv(TreeX3, {o, "B1", "X3", null}, {caine, 4}, HashFun),
+    TreeX1 = add_kv(TreeX0, {o, "B1", "X1", null}, {caine, 1}, BinFun),
+    TreeX2 = add_kv(TreeX1, {o, "B1", "X2", null}, {caine, 2}, BinFun),
+    TreeX3 = add_kv(TreeX2, {o, "B1", "X3", null}, {caine, 3}, BinFun),
+    TreeX4 = add_kv(TreeX3, {o, "B1", "X3", null}, {caine, 4}, BinFun),
     
     TreeY0 = new_tree(0, Size),
-    TreeY1 = add_kv(TreeY0, {o, "B1", "Y1", null}, {caine, 101}, HashFun),
-    TreeY2 = add_kv(TreeY1, {o, "B1", "Y2", null}, {caine, 102}, HashFun),
-    TreeY3 = add_kv(TreeY2, {o, "B1", "Y3", null}, {caine, 103}, HashFun),
-    TreeY4 = add_kv(TreeY3, {o, "B1", "Y3", null}, {caine, 104}, HashFun),
+    TreeY1 = add_kv(TreeY0, {o, "B1", "Y1", null}, {caine, 101}, BinFun),
+    TreeY2 = add_kv(TreeY1, {o, "B1", "Y2", null}, {caine, 102}, BinFun),
+    TreeY3 = add_kv(TreeY2, {o, "B1", "Y3", null}, {caine, 103}, BinFun),
+    TreeY4 = add_kv(TreeY3, {o, "B1", "Y3", null}, {caine, 104}, BinFun),
     
-    TreeZ1 = add_kv(TreeX4, {o, "B1", "Y1", null}, {caine, 101}, HashFun),
-    TreeZ2 = add_kv(TreeZ1, {o, "B1", "Y2", null}, {caine, 102}, HashFun),
-    TreeZ3 = add_kv(TreeZ2, {o, "B1", "Y3", null}, {caine, 103}, HashFun),
-    TreeZ4 = add_kv(TreeZ3, {o, "B1", "Y3", null}, {caine, 104}, HashFun),
+    TreeZ1 = add_kv(TreeX4, {o, "B1", "Y1", null}, {caine, 101}, BinFun),
+    TreeZ2 = add_kv(TreeZ1, {o, "B1", "Y2", null}, {caine, 102}, BinFun),
+    TreeZ3 = add_kv(TreeZ2, {o, "B1", "Y3", null}, {caine, 103}, BinFun),
+    TreeZ4 = add_kv(TreeZ3, {o, "B1", "Y3", null}, {caine, 104}, BinFun),
     
     TreeM0 = merge_trees(TreeX4, TreeY4),
     checktree(TreeM0),
@@ -443,6 +475,10 @@ merge_test_withsize(Size) ->
     checktree(TreeM1),
     ?assertMatch(false, TreeM1#tictactree.level1 == TreeZ4#tictactree.level1).
 
+exportable_test() ->
+    {Int1, Int2} = tictac_hash(<<"key">>, <<"value">>, true),
+    ?assertMatch({true, true}, {is_integer(Int1), is_integer(Int2)}).
+
 -endif.
 
     
diff --git a/test/end_to_end/tictac_SUITE.erl b/test/end_to_end/tictac_SUITE.erl
index a837738..6e7ba2d 100644
--- a/test/end_to_end/tictac_SUITE.erl
+++ b/test/end_to_end/tictac_SUITE.erl
@@ -114,19 +114,16 @@ many_put_compare(_Config) ->
     % Now run the same query by putting the tree-building responsibility onto
     % the fold_objects_fun
 
-    ApplyHash =
-      fun(HashFun) ->
-          fun(_Key, Value) ->
-              {proxy_object, HeadBin, _Size, _FetchFun} = binary_to_term(Value),
-              <<?MAGIC:8/integer, ?V1_VERS:8/integer, VclockLen:32/integer,
-                    Rest/binary>> = HeadBin,
-              <<VclockBin:VclockLen/binary, _NotNeeded/binary>> = Rest,
-              HashFun(lists:sort(binary_to_term(VclockBin)))
-          end
-      end,
+    ExtractClockFun =
+        fun(Key, Value) ->
+            {proxy_object, HeadBin, _Size, _FetchFun} = binary_to_term(Value),
+            <<?MAGIC:8/integer, ?V1_VERS:8/integer, VclockLen:32/integer,
+                VclockBin:VclockLen/binary, _Rest/binary>> = HeadBin,
+            {Key, lists:sort(binary_to_term(VclockBin))}
+        end,
     FoldObjectsFun =
       fun(_Bucket, Key, Value, Acc) ->
-          leveled_tictac:add_kv(Acc, Key, Value, ApplyHash(fun erlang:phash2/1))
+          leveled_tictac:add_kv(Acc, Key, Value, ExtractClockFun, false)
       end,
 
     FoldQ0 = {foldheads_bybucket,
@@ -157,22 +154,25 @@ many_put_compare(_Config) ->
                 [timer:now_diff(os:timestamp(), SWB1Obj)]),
     true = length(leveled_tictac:find_dirtyleaves(TreeA, TreeAObj1)) == 0,
 
-    % AAE trees within riak are based on a sha of the vector clock.  So to
-    % compare with an AAE tree we need to compare outputs when we're hashing
-    % a hash
-    AltHashFun =
-        fun(Term) ->
-            erlang:phash2(crypto:hash(sha, term_to_binary(Term)))
+    % For an exportable comparison, want hash to be based on something not 
+    % coupled to erlang language - so use exportable query
+    AltExtractFun =
+        fun(K, V) ->
+            {proxy_object, HeadBin, _Size, _FetchFun} = binary_to_term(V),
+            <<?MAGIC:8/integer, ?V1_VERS:8/integer, VclockLen:32/integer,
+                VclockBin:VclockLen/binary, _Rest/binary>> = HeadBin,
+            {term_to_binary(K), VclockBin}
         end,
     AltFoldObjectsFun =
         fun(_Bucket, Key, Value, Acc) ->
-            leveled_tictac:add_kv(Acc, Key, Value, ApplyHash(AltHashFun))
+            leveled_tictac:add_kv(Acc, Key, Value, AltExtractFun, true)
         end,
     AltFoldQ0 = {foldheads_bybucket,
-              o_rkv,
-              "Bucket",
-              {AltFoldObjectsFun, leveled_tictac:new_tree(0, TreeSize)},
-              false, true},
+                    o_rkv,
+                    "Bucket",
+                    {AltFoldObjectsFun, leveled_tictac:new_tree(0, TreeSize)},
+                    false, 
+                    true},
     {async, TreeAAltObjFolder0} =
         leveled_bookie:book_returnfolder(Bookie2, AltFoldQ0),
     SWB2Obj = os:timestamp(),
@@ -187,15 +187,19 @@ many_put_compare(_Config) ->
     io:format("Build tictac tree via object fold with no "++
                     "presence check and 200K objects  and alt hash in ~w~n",
                 [timer:now_diff(os:timestamp(), SWB3Obj)]),
-    true =
-        length(leveled_tictac:find_dirtyleaves(TreeBAltObj, TreeAAltObj)) == 1,
+    DL_ExportFold = 
+        length(leveled_tictac:find_dirtyleaves(TreeBAltObj, TreeAAltObj)),
+    io:format("Found dirty leaves with exportable comparison of ~w~n",
+                [DL_ExportFold]),
+    true = DL_ExportFold == 1,
 
 
     %% Finding differing keys
     FoldKeysFun =
         fun(SegListToFind) ->
             fun(_B, K, Acc) ->
-                Seg = leveled_tictac:get_segment(K, SegmentCount),
+                Seg = 
+                    leveled_tictac:get_segment(erlang:phash2(K), SegmentCount),
                 case lists:member(Seg, SegListToFind) of
                     true ->
                         [K|Acc];
@@ -469,7 +473,8 @@ index_compare(_Config) ->
 
     FoldKeysIndexQFun =
         fun(_Bucket, {Term, Key}, Acc) ->
-            Seg = leveled_tictac:get_segment(Key, SegmentCount),
+            Seg = 
+                leveled_tictac:get_segment(erlang:phash2(Key), SegmentCount),
             case lists:member(Seg, DL3_0) of
                 true ->
                     [{Term, Key}|Acc];