Attempt to standardise on segment hashes

To allow the segment hash that accelerates queries to be re-used in tictac-tree-related queries.
This commit is contained in:
Martin Sumner 2017-10-30 13:57:41 +00:00
parent 7763df3cef
commit 6bb7ceef0c
4 changed files with 77 additions and 44 deletions

View file

@ -480,9 +480,9 @@ aae_indexspecs(AAE, Bucket, Key, SQN, H, LastMods) ->
Acc; Acc;
{LMD1, TTL} -> {LMD1, TTL} ->
TreeSize = AAE#recent_aae.tree_size, TreeSize = AAE#recent_aae.tree_size,
SegID32 = leveled_tictac:keyto_segment32(Key),
SegID = SegID =
leveled_tictac:get_segment(erlang:phash2(Key), leveled_tictac:get_segment(SegID32, TreeSize),
TreeSize),
IdxFldStr = ?NRT_IDX ++ LMD1 ++ "_bin", IdxFldStr = ?NRT_IDX ++ LMD1 ++ "_bin",
IdxTrmStr = IdxTrmStr =
string:right(integer_to_list(SegID), 8, $0) ++ string:right(integer_to_list(SegID), 8, $0) ++

View file

@ -167,6 +167,15 @@ tictactree(SnapFun, {Tag, Bucket, Query}, JournalCheck, TreeSize, Filter) ->
{ok, LedgerSnap, JournalSnap} = SnapFun(), {ok, LedgerSnap, JournalSnap} = SnapFun(),
% The start key and end key will vary depending on whether the % The start key and end key will vary depending on whether the
% fold is to fold over an index or a key range % fold is to fold over an index or a key range
EnsureKeyBinaryFun =
fun(K, T) ->
case is_binary(K) of
true ->
{K, T};
false ->
{term_to_binary(K), T}
end
end,
{StartKey, EndKey, ExtractFun} = {StartKey, EndKey, ExtractFun} =
case Tag of case Tag of
?IDX_TAG -> ?IDX_TAG ->
@ -174,12 +183,15 @@ tictactree(SnapFun, {Tag, Bucket, Query}, JournalCheck, TreeSize, Filter) ->
KeyDefFun = fun leveled_codec:to_ledgerkey/5, KeyDefFun = fun leveled_codec:to_ledgerkey/5,
{KeyDefFun(Bucket, null, ?IDX_TAG, IdxFld, StartIdx), {KeyDefFun(Bucket, null, ?IDX_TAG, IdxFld, StartIdx),
KeyDefFun(Bucket, null, ?IDX_TAG, IdxFld, EndIdx), KeyDefFun(Bucket, null, ?IDX_TAG, IdxFld, EndIdx),
fun(K, T) -> {K, T} end}; EnsureKeyBinaryFun};
_ -> _ ->
{StartOKey, EndOKey} = Query, {StartOKey, EndOKey} = Query,
{leveled_codec:to_ledgerkey(Bucket, StartOKey, Tag), {leveled_codec:to_ledgerkey(Bucket, StartOKey, Tag),
leveled_codec:to_ledgerkey(Bucket, EndOKey, Tag), leveled_codec:to_ledgerkey(Bucket, EndOKey, Tag),
fun(K, H) -> {K, {is_hash, H}} end} fun(K, H) ->
V = {is_hash, H},
EnsureKeyBinaryFun(K, V)
end}
end, end,
AccFun = AccFun =
accumulate_tree(Filter, JournalCheck, JournalSnap, ExtractFun), accumulate_tree(Filter, JournalCheck, JournalSnap, ExtractFun),
@ -363,7 +375,7 @@ accumulate_tree(FilterFun, JournalCheck, InkerClone, HashFun) ->
fun(B, K, H, Tree) -> fun(B, K, H, Tree) ->
case FilterFun(B, K) of case FilterFun(B, K) of
accumulate -> accumulate ->
leveled_tictac:add_kv(Tree, K, H, HashFun, false); leveled_tictac:add_kv(Tree, K, H, HashFun);
pass -> pass ->
Tree Tree
end end

View file

@ -57,17 +57,18 @@
-export([ -export([
new_tree/1, new_tree/1,
new_tree/2, new_tree/2,
add_kv/5, add_kv/4,
find_dirtyleaves/2, find_dirtyleaves/2,
find_dirtysegments/2, find_dirtysegments/2,
fetch_root/1, fetch_root/1,
fetch_leaves/2, fetch_leaves/2,
merge_trees/2, merge_trees/2,
get_segment/2, get_segment/2,
tictac_hash/3, tictac_hash/2,
export_tree/1, export_tree/1,
import_tree/1, import_tree/1,
valid_size/1 valid_size/1,
keyto_segment32/1
]). ]).
@ -169,24 +170,16 @@ import_tree(ExportedTree) ->
level1 = L1Bin, level1 = L1Bin,
level2 = Lv2}. level2 = Lv2}.
-spec add_kv(tictactree(), tuple(), tuple(), fun()) -> tictactree().
add_kv(TicTacTree, Key, Value, BinExtractFun) ->
add_kv(TicTacTree, Key, Value, BinExtractFun, false).
-spec add_kv(tictactree(), tuple(), tuple(), fun(), boolean()) -> tictactree(). -spec add_kv(tictactree(), tuple(), tuple(), fun()) -> tictactree().
%% @doc %% @doc
%% Add a Key and value to a tictactree using the BinExtractFun to extract a %% Add a Key and value to a tictactree using the BinExtractFun to extract a
%% binary from the Key and value from which to generate the hash. The %% binary from the Key and value from which to generate the hash. The
%% BinExtractFun will also need to do any canonicalisation necessary to make %% BinExtractFun will also need to do any canonicalisation necessary to make
%% the hash consistent (such as whitespace removal, or sorting) %% the hash consistent (such as whitespace removal, or sorting)
%% add_kv(TicTacTree, Key, Value, BinExtractFun) ->
%% For exportable trees the hash function will be based on the CJ Bernstein
%% magic hash. For non-exportable trees erlang:phash2 will be used, and so
%% non-binary Keys and Values can be returned from the BinExtractFun in this
%% case.
add_kv(TicTacTree, Key, Value, BinExtractFun, Exportable) ->
{BinK, BinV} = BinExtractFun(Key, Value), {BinK, BinV} = BinExtractFun(Key, Value),
{SegHash, SegChangeHash} = tictac_hash(BinK, BinV, Exportable), {SegHash, SegChangeHash} = tictac_hash(BinK, BinV),
Segment = get_segment(SegHash, TicTacTree#tictactree.segment_count), Segment = get_segment(SegHash, TicTacTree#tictactree.segment_count),
Level2Pos = Level2Pos =
@ -314,28 +307,39 @@ get_segment(Hash, TreeSize) ->
get_segment(Hash, element(3, get_size(TreeSize))). get_segment(Hash, element(3, get_size(TreeSize))).
-spec tictac_hash(any(), any(), boolean()) -> {integer(), integer()}. -spec tictac_hash(binary(), any()) -> {integer(), integer()}.
%% @doc %% @doc
%% Hash the key and term, to either something repetable in Erlang, or using %% Hash the key and term.
%% the DJ Bernstein hash if it is the tree needs to be compared with one %% The term can be of the form {is_hash, 32-bit integer)} to indicate the hash
%% calculated with a non-Erlang store %% has already been taken. If the value is not a pre-extracted hash just use
%% %% erlang:phash2. If an exportable hash of the value is required this should
%% Boolean is Exportable. does the hash need to be repetable by a non-Erlang %% be managed through the add_kv ExtractFun providing a pre-prepared Hash.
%% machine tictac_hash(BinKey, Val) when is_binary(BinKey) ->
tictac_hash(BinKey, BinVal, true) HashKey = keyto_segment32(BinKey),
when is_binary(BinKey) and is_binary(BinVal) -> HashVal =
HashKey = leveled_codec:magic_hash({binary, BinKey}), case Val of
HashVal = leveled_codec:magic_hash({binary, BinVal}), {is_hash, HashedVal} ->
{HashKey, HashKey bxor HashVal}; HashedVal;
tictac_hash(BinKey, {is_hash, HashedVal}, false) -> _ ->
{erlang:phash2(BinKey), erlang:phash2(BinKey) bxor HashedVal}; erlang:phash2(Val)
tictac_hash(BinKey, BinVal, false) -> end,
{erlang:phash2(BinKey), erlang:phash2(BinKey) bxor erlang:phash2(BinVal)}. {HashKey, HashKey bxor HashVal}.
-spec keyto_segment32(any()) -> integer().
%% @doc
%% Convert a key into the integer segment hash used by the tictac tree.
%% The key is passed to leveled_codec:segment_hash/1, which returns a
%% {SegmentID, ExtraHash} pair. The low 16 bits of ExtraHash are shifted into
%% the upper half of the result and SegmentID (the part used to accelerate
%% queries) is added in below them.
%% NOTE(review): this presumably relies on SegmentID fitting in 16 bits so the
%% two halves do not overlap - confirm against leveled_codec:segment_hash/1.
%% A key that is not already a binary is serialised with term_to_binary/1
%% before hashing.
keyto_segment32(Key) when not is_binary(Key) ->
keyto_segment32(term_to_binary(Key));
keyto_segment32(BinKey) ->
{SegmentID, ExtraHash} = leveled_codec:segment_hash(BinKey),
UpperBits = (ExtraHash band 16#FFFF) bsl 16,
UpperBits + SegmentID.
%%%============================================================================ %%%============================================================================
%%% Internal functions %%% Internal functions
%%%============================================================================ %%%============================================================================
get_level2(TicTacTree, L1Pos) -> get_level2(TicTacTree, L1Pos) ->
case array:get(L1Pos, TicTacTree#tictactree.level2) of case array:get(L1Pos, TicTacTree#tictactree.level2) of
?EMPTY -> ?EMPTY ->
@ -454,7 +458,7 @@ simple_test_withsize(Size) ->
GetSegFun = GetSegFun =
fun(TK) -> fun(TK) ->
get_segment(erlang:phash2(term_to_binary(TK)), SC) get_segment(keyto_segment32(term_to_binary(TK)), SC)
end, end,
DL0 = find_dirtyleaves(Tree1, Tree0), DL0 = find_dirtyleaves(Tree1, Tree0),
@ -513,7 +517,7 @@ merge_test_withsize(Size) ->
?assertMatch(false, TreeM1#tictactree.level1 == TreeZ4#tictactree.level1). ?assertMatch(false, TreeM1#tictactree.level1 == TreeZ4#tictactree.level1).
exportable_test() -> exportable_test() ->
{Int1, Int2} = tictac_hash(<<"key">>, <<"value">>, true), {Int1, Int2} = tictac_hash(<<"key">>, <<"value">>),
?assertMatch({true, true}, {Int1 >= 0, Int2 >=0}). ?assertMatch({true, true}, {Int1 >= 0, Int2 >=0}).
-endif. -endif.

View file

@ -131,11 +131,18 @@ many_put_compare(_Config) ->
{proxy_object, HeadBin, _Size, _FetchFun} = binary_to_term(Value), {proxy_object, HeadBin, _Size, _FetchFun} = binary_to_term(Value),
<<?MAGIC:8/integer, ?V1_VERS:8/integer, VclockLen:32/integer, <<?MAGIC:8/integer, ?V1_VERS:8/integer, VclockLen:32/integer,
VclockBin:VclockLen/binary, _Rest/binary>> = HeadBin, VclockBin:VclockLen/binary, _Rest/binary>> = HeadBin,
{Key, lists:sort(binary_to_term(VclockBin))} case is_binary(Key) of
true ->
{Key,
lists:sort(binary_to_term(VclockBin))};
false ->
{term_to_binary(Key),
lists:sort(binary_to_term(VclockBin))}
end
end, end,
FoldObjectsFun = FoldObjectsFun =
fun(_Bucket, Key, Value, Acc) -> fun(_Bucket, Key, Value, Acc) ->
leveled_tictac:add_kv(Acc, Key, Value, ExtractClockFun, false) leveled_tictac:add_kv(Acc, Key, Value, ExtractClockFun)
end, end,
FoldQ0 = {foldheads_bybucket, FoldQ0 = {foldheads_bybucket,
@ -179,7 +186,7 @@ many_put_compare(_Config) ->
end, end,
AltFoldObjectsFun = AltFoldObjectsFun =
fun(_Bucket, Key, Value, Acc) -> fun(_Bucket, Key, Value, Acc) ->
leveled_tictac:add_kv(Acc, Key, Value, AltExtractFun, true) leveled_tictac:add_kv(Acc, Key, Value, AltExtractFun)
end, end,
AltFoldQ0 = {foldheads_bybucket, AltFoldQ0 = {foldheads_bybucket,
o_rkv, o_rkv,
@ -213,8 +220,7 @@ many_put_compare(_Config) ->
FoldKeysFun = FoldKeysFun =
fun(SegListToFind) -> fun(SegListToFind) ->
fun(_B, K, Acc) -> fun(_B, K, Acc) ->
Seg = Seg = get_segment(K, SegmentCount),
leveled_tictac:get_segment(erlang:phash2(K), SegmentCount),
case lists:member(Seg, SegListToFind) of case lists:member(Seg, SegListToFind) of
true -> true ->
[K|Acc]; [K|Acc];
@ -488,8 +494,7 @@ index_compare(_Config) ->
FoldKeysIndexQFun = FoldKeysIndexQFun =
fun(_Bucket, {Term, Key}, Acc) -> fun(_Bucket, {Term, Key}, Acc) ->
Seg = Seg = get_segment(Key, SegmentCount),
leveled_tictac:get_segment(erlang:phash2(Key), SegmentCount),
case lists:member(Seg, DL3_0) of case lists:member(Seg, DL3_0) of
true -> true ->
[{Term, Key}|Acc]; [{Term, Key}|Acc];
@ -1144,3 +1149,15 @@ get_tictactree_fun(Bookie, Bucket, TreeSize) ->
[LMD, timer:now_diff(os:timestamp(), SW)]), [LMD, timer:now_diff(os:timestamp(), SW)]),
leveled_tictac:merge_trees(R, Acc) leveled_tictac:merge_trees(R, Acc)
end. end.
%% @doc
%% Map a key to its tictac tree segment for a tree of the given SegmentCount.
%% The segment hash is obtained from leveled_tictac:keyto_segment32/1 - the
%% function the tree itself uses when hashing keys - instead of repeating the
%% bit-packing here, so this helper cannot drift from the tree's own hashing.
%% keyto_segment32/1 converts non-binary keys with term_to_binary/1 itself, so
%% no conversion is needed before the call.
get_segment(K, SegmentCount) ->
SegHash = leveled_tictac:keyto_segment32(K),
leveled_tictac:get_segment(SegHash, SegmentCount).