From a128dcdadf57a7cd8a1e65ef5760de56bf0b5657 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Fri, 20 Oct 2017 23:04:29 +0100 Subject: [PATCH] Change hash algorithm for penciller MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch from magic hash to md5 - to hopefully remove the need for some of the artificial jumps required to get expected false positive ratios. Also split the hash into two 16-bit integers. We assume that SegmentID (from the perspective of AAE merkle/tictac trees) will always be at least 16 bits. The idea is that hashes should be used in blooms and indexes such that some advantage can be gained from just knowing the SegmentID - in particular when folding over all the keys in a bucket. Performance testing has been difficult so far - I think due to “cloud” mysteries. --- src/leveled_bookie.erl | 2 +- src/leveled_codec.erl | 25 ++++++++++++++++----- src/leveled_pclerk.erl | 2 +- src/leveled_penciller.erl | 17 +++++++------- src/leveled_pmem.erl | 21 ++++++++--------- src/leveled_sst.erl | 47 ++++++++++++++++++++------------------- src/leveled_tinybloom.erl | 23 +++++++++---------- 7 files changed, 75 insertions(+), 62 deletions(-) diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl index 7eb4e7a..42d5e0f 100644 --- a/src/leveled_bookie.erl +++ b/src/leveled_bookie.erl @@ -947,7 +947,7 @@ fetch_head(Key, Penciller, LedgerCache) -> [{Key, Head}] -> Head; [] -> - Hash = leveled_codec:magic_hash(Key), + Hash = leveled_codec:segment_hash(Key), case leveled_penciller:pcl_fetch(Penciller, Key, Hash) of {Key, Head} -> maybe_longrunning(SW, pcl_head), diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl index dc981e8..bd0c60d 100644 --- a/src/leveled_codec.erl +++ b/src/leveled_codec.erl @@ -65,6 +65,7 @@ integer_now/0, riak_extract_metadata/2, magic_hash/1, + segment_hash/1, to_lookup/1]). -define(V1_VERS, 1). @@ -79,6 +80,20 @@ integer()|null, % Hash of vclock - non-exportable integer()}. 
% Size in bytes of real object + +-spec segment_hash(any()) -> {integer(), integer()}. +%% @doc +%% Return two 16 bit integers - the segment ID and a second integer for spare +%% entropy. The hash should be used in blooms or indexes such that some +%% speed can be gained if just the segment ID is known - but more can be +%% gained should the extended hash (with the second element) be known +segment_hash(Key) when is_binary(Key) -> + <> = + crypto:hash(md5, Key), + {SegmentID, ExtraHash}; +segment_hash(Key) -> + segment_hash(term_to_binary(Key)). + -spec magic_hash(any()) -> integer(). %% @doc %% Use DJ Bernstein magic hash function. Note, this is more expensive than @@ -87,10 +102,6 @@ %% Hash function contains mysterious constants, some explanation here as to %% what they are - %% http://stackoverflow.com/questions/10696223/reason-for-5381-number-in-djb-hash-function -magic_hash({?RIAK_TAG, Bucket, Key, _SubKey}) -> - magic_hash({Bucket, Key}); -magic_hash({?STD_TAG, Bucket, Key, _SubKey}) -> - magic_hash({Bucket, Key}); magic_hash({binary, BinaryKey}) -> H = 5381, hash1(H, BinaryKey) band 16#FFFFFFFF; @@ -516,7 +527,9 @@ parse_date(LMD, UnitMins, LimitMins, Now) -> -spec generate_ledgerkv( tuple(), integer(), any(), integer(), tuple()|infinity) -> - {any(), any(), any(), {integer()|no_lookup, integer()}, list()}. + {any(), any(), any(), + {{integer(), integer()}|no_lookup, integer()}, + list()}. %% @doc %% Function to extract from an object the information necessary to populate %% the Penciller's ledger. 
@@ -537,7 +550,7 @@ generate_ledgerkv(PrimaryKey, SQN, Obj, Size, TS) -> _ -> {active, TS} end, - Hash = magic_hash(PrimaryKey), + Hash = segment_hash(PrimaryKey), {MD, LastMods} = extract_metadata(Obj, Size, Tag), ObjHash = get_objhash(Tag, MD), Value = {SQN, diff --git a/src/leveled_pclerk.erl b/src/leveled_pclerk.erl index 392b13d..c412cf4 100644 --- a/src/leveled_pclerk.erl +++ b/src/leveled_pclerk.erl @@ -254,7 +254,7 @@ generate_randomkeys(Count, Acc, BucketLow, BRange) -> K = {o, "Bucket" ++ BNumber, "Key" ++ KNumber}, RandKey = {K, {Count + 1, {active, infinity}, - leveled_codec:magic_hash(K), + leveled_codec:segment_hash(K), null}}, generate_randomkeys(Count - 1, [RandKey|Acc], BucketLow, BRange). diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl index 4321ee6..c726d08 100644 --- a/src/leveled_penciller.erl +++ b/src/leveled_penciller.erl @@ -315,21 +315,22 @@ pcl_fetchlevelzero(Pid, Slot) -> %% The Key needs to be hashable (i.e. have a tag which indicates that the key %% can be looked up) - index entries are not hashable for example. %% -%% If the hash is already knonw, call pcl_fetch/3 as magic_hash is a +%% If the hash is already knonw, call pcl_fetch/3 as segment_hash is a %% relatively expensive hash function pcl_fetch(Pid, Key) -> - Hash = leveled_codec:magic_hash(Key), + Hash = leveled_codec:segment_hash(Key), if Hash /= no_lookup -> gen_server:call(Pid, {fetch, Key, Hash}, infinity) end. --spec pcl_fetch(pid(), tuple(), integer()) -> {tuple(), tuple()}|not_present. +-spec pcl_fetch(pid(), tuple(), {integer(), integer()}) -> + {tuple(), tuple()}|not_present. %% @doc %% Fetch a key, return the first (highest SQN) occurrence of that Key along %% with the value. %% -%% Hash should be result of leveled_codec:magic_hash(Key) +%% Hash should be result of leveled_codec:segment_hash(Key) pcl_fetch(Pid, Key, Hash) -> gen_server:call(Pid, {fetch, Key, Hash}, infinity). 
@@ -367,7 +368,7 @@ pcl_fetchnextkey(Pid, StartKey, EndKey, AccFun, InitAcc) -> %% If the key is not present, it will be assumed that a higher sequence number %% tombstone once existed, and false will be returned. pcl_checksequencenumber(Pid, Key, SQN) -> - Hash = leveled_codec:magic_hash(Key), + Hash = leveled_codec:segment_hash(Key), if Hash /= no_lookup -> gen_server:call(Pid, {check_sqn, Key, Hash, SQN}, infinity) @@ -1317,7 +1318,7 @@ generate_randomkeys(Count, SQN, Acc) -> RandKey = {K, {SQN, {active, infinity}, - leveled_codec:magic_hash(K), + leveled_codec:segment_hash(K), null}}, generate_randomkeys(Count - 1, SQN + 1, [RandKey|Acc]). @@ -1347,7 +1348,7 @@ maybe_pause_push(PCL, KL) -> T1 = lists:foldl(fun({K, V}, {AccSL, AccIdx, MinSQN, MaxSQN}) -> UpdSL = [{K, V}|AccSL], SQN = leveled_codec:strip_to_seqonly({K, V}), - H = leveled_codec:magic_hash(K), + H = leveled_codec:segment_hash(K), UpdIdx = leveled_pmem:prepare_for_index(AccIdx, H), {UpdSL, UpdIdx, min(SQN, MinSQN), max(SQN, MaxSQN)} end, @@ -1366,7 +1367,7 @@ maybe_pause_push(PCL, KL) -> %% old test data doesn't have the magic hash add_missing_hash({K, {SQN, ST, MD}}) -> - {K, {SQN, ST, leveled_codec:magic_hash(K), MD}}. + {K, {SQN, ST, leveled_codec:segment_hash(K), MD}}. clean_dir_test() -> diff --git a/src/leveled_pmem.erl b/src/leveled_pmem.erl index 0846e3b..40aabfe 100644 --- a/src/leveled_pmem.erl +++ b/src/leveled_pmem.erl @@ -50,7 +50,8 @@ %%% API %%%============================================================================ --spec prepare_for_index(index_array(), integer()|no_lookup) -> index_array(). +-spec prepare_for_index(index_array(), {integer(), integer()}|no_lookup) + -> index_array(). %% @doc %% Add the hash of a key to the index. This is 'prepared' in the sense that %% this index is not use until it is loaded into the main index. @@ -95,7 +96,7 @@ new_index() -> clear_index(_L0Index) -> new_index(). --spec check_index(integer(), index_array()) -> list(integer()). 
+-spec check_index({integer(), integer()}, index_array()) -> list(integer()). %% @doc %% return a list of positions in the list of cache arrays that may contain the %% key associated with the hash being checked @@ -158,9 +159,9 @@ to_list(Slots, FetchFun) -> %% checked (with the most recently received cache being checked first) until a %% match is found. check_levelzero(Key, PosList, TreeList) -> - check_levelzero(Key, leveled_codec:magic_hash(Key), PosList, TreeList). + check_levelzero(Key, leveled_codec:segment_hash(Key), PosList, TreeList). --spec check_levelzero(tuple(), integer(), list(integer()), list()) +-spec check_levelzero(tuple(), {integer(), integer()}, list(integer()), list()) -> {boolean(), tuple|not_found}. %% @doc %% Check for the presence of a given Key in the Level Zero cache, with the @@ -204,10 +205,10 @@ find_pos(<<0:1/integer, NxtSlot:7/integer, T/binary>>, Hash, PosList, _SlotID) - find_pos(T, Hash, PosList, NxtSlot). -split_hash(Hash) -> - Slot = Hash band 255, - H0 = (Hash bsr 8) band 8388607, - {Slot, H0}. +split_hash({SegmentID, ExtraHash}) -> + Slot = SegmentID band 255, + H0 = (SegmentID bsr 8) bor (ExtraHash bsl 8), + {Slot, H0 band 8388607}. 
check_slotlist(Key, _Hash, CheckList, TreeList) -> SlotCheckFun = @@ -358,7 +359,7 @@ with_index_test_() -> with_index_test2() -> IndexPrepareFun = fun({K, _V}, Acc) -> - H = leveled_codec:magic_hash(K), + H = leveled_codec:segment_hash(K), prepare_for_index(Acc, H) end, LoadFun = @@ -382,7 +383,7 @@ with_index_test2() -> CheckFun = fun({K, V}, {L0Idx, L0Cache}) -> - H = leveled_codec:magic_hash(K), + H = leveled_codec:segment_hash(K), PosList = check_index(H, L0Idx), ?assertMatch({true, {K, V}}, check_slotlist(K, H, PosList, L0Cache)), diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 506a6b4..89bf729 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -237,12 +237,12 @@ sst_newlevelzero(RootPath, Filename, Slots, FetchFun, Penciller, MaxSQN) -> -spec sst_get(pid(), tuple()) -> tuple()|not_present. %% @doc %% Return a Key, Value pair matching a Key or not_present if the Key is not in -%% the store. The magic_hash function is used to accelerate the seeking of +%% the store. The segment_hash function is used to accelerate the seeking of %% keys, sst_get/3 should be used directly if this has already been calculated sst_get(Pid, LedgerKey) -> - sst_get(Pid, LedgerKey, leveled_codec:magic_hash(LedgerKey)). + sst_get(Pid, LedgerKey, leveled_codec:segment_hash(LedgerKey)). --spec sst_get(pid(), tuple(), integer()) -> tuple()|not_present. +-spec sst_get(pid(), tuple(), {integer(), integer()}) -> tuple()|not_present. %% @doc %% Return a Key, Value pair matching a Key or not_present if the Key is not in %% the store (with the magic hash precalculated). 
@@ -554,7 +554,7 @@ fetch(LedgerKey, Hash, State) -> State#state{blockindex_cache = BlockIndexCache}}; <> -> PosList = find_pos(BlockIdx, - double_hash(Hash, LedgerKey), + extra_hash(Hash), [], 0), case PosList of @@ -808,9 +808,9 @@ generate_binary_slot(Lookup, KVL) -> fun({K, V}, {PosBinAcc, NoHashCount, HashAcc}) -> {_SQN, H1} = leveled_codec:strip_to_seqnhashonly({K, V}), - case is_integer(H1) of + PosH1 = extra_hash(H1), + case is_integer(PosH1) of true -> - PosH1 = double_hash(H1, K), case NoHashCount of 0 -> {<<1:1/integer, @@ -1003,7 +1003,7 @@ binaryslot_get(FullBin, Key, Hash) -> <> = BlockLengths, <> = Rest, PosList = find_pos(PosBinIndex, - double_hash(Hash, Key), + extra_hash(Hash), [], 0), {fetch_value(PosList, BlockLengths, Blocks, Key), @@ -1186,9 +1186,10 @@ block_offsetandlength(BlockLengths, BlockID) -> {BlocksPos, B1L + B2L + B3L + B4L, B5L} end. -double_hash(Hash, Key) -> - H2 = erlang:phash2(Key), - (Hash bxor H2) band 32767. +extra_hash({_SegHash, ExtraHash}) when is_integer(ExtraHash) -> + ExtraHash band 32767; +extra_hash(NotHash) -> + NotHash. 
fetch_value([], _BlockLengths, _Blocks, _Key) -> not_present; @@ -1548,15 +1549,15 @@ indexed_list_test() -> [timer:now_diff(os:timestamp(), SW0), byte_size(FullBin)]), {TestK1, TestV1} = lists:nth(20, KVL1), - MH1 = leveled_codec:magic_hash(TestK1), + MH1 = leveled_codec:segment_hash(TestK1), {TestK2, TestV2} = lists:nth(40, KVL1), - MH2 = leveled_codec:magic_hash(TestK2), + MH2 = leveled_codec:segment_hash(TestK2), {TestK3, TestV3} = lists:nth(60, KVL1), - MH3 = leveled_codec:magic_hash(TestK3), + MH3 = leveled_codec:segment_hash(TestK3), {TestK4, TestV4} = lists:nth(80, KVL1), - MH4 = leveled_codec:magic_hash(TestK4), + MH4 = leveled_codec:segment_hash(TestK4), {TestK5, TestV5} = lists:nth(100, KVL1), - MH5 = leveled_codec:magic_hash(TestK5), + MH5 = leveled_codec:segment_hash(TestK5), test_binary_slot(FullBin, TestK1, MH1, {TestK1, TestV1}), test_binary_slot(FullBin, TestK2, MH2, {TestK2, TestV2}), @@ -1573,15 +1574,15 @@ indexed_list_mixedkeys_test() -> {_PosBinIndex1, FullBin, _HL, _LK} = generate_binary_slot(lookup, Keys), {TestK1, TestV1} = lists:nth(4, KVL1), - MH1 = leveled_codec:magic_hash(TestK1), + MH1 = leveled_codec:segment_hash(TestK1), {TestK2, TestV2} = lists:nth(8, KVL1), - MH2 = leveled_codec:magic_hash(TestK2), + MH2 = leveled_codec:segment_hash(TestK2), {TestK3, TestV3} = lists:nth(12, KVL1), - MH3 = leveled_codec:magic_hash(TestK3), + MH3 = leveled_codec:segment_hash(TestK3), {TestK4, TestV4} = lists:nth(16, KVL1), - MH4 = leveled_codec:magic_hash(TestK4), + MH4 = leveled_codec:segment_hash(TestK4), {TestK5, TestV5} = lists:nth(20, KVL1), - MH5 = leveled_codec:magic_hash(TestK5), + MH5 = leveled_codec:segment_hash(TestK5), test_binary_slot(FullBin, TestK1, MH1, {TestK1, TestV1}), test_binary_slot(FullBin, TestK2, MH2, {TestK2, TestV2}), @@ -1598,7 +1599,7 @@ indexed_list_mixedkeys2_test() -> Keys = IdxKeys1 ++ KVL1 ++ IdxKeys2, {_PosBinIndex1, FullBin, _HL, _LK} = generate_binary_slot(lookup, Keys), lists:foreach(fun({K, V}) -> - MH = 
leveled_codec:magic_hash(K), + MH = leveled_codec:segment_hash(K), test_binary_slot(FullBin, K, MH, {K, V}) end, KVL1). @@ -1682,7 +1683,7 @@ indexed_list_mixedkeys_bitflip_test() -> end, {TestK1, _TestV1} = lists:nth(20, KVL1), - MH1 = leveled_codec:magic_hash(TestK1), + MH1 = leveled_codec:segment_hash(TestK1), test_binary_slot(FullBin0, TestK1, MH1, not_present), ToList = binaryslot_tolist(FullBin0), @@ -1920,7 +1921,7 @@ simple_persisted_test() -> In = lists:keymember(K, 1, KVList1), case {K > FirstKey, LastKey > K, In} of {true, true, false} -> - [{K, leveled_codec:magic_hash(K), V}|Acc]; + [{K, leveled_codec:segment_hash(K), V}|Acc]; _ -> Acc end diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl index 1c7cad8..fc70469 100644 --- a/src/leveled_tinybloom.erl +++ b/src/leveled_tinybloom.erl @@ -48,7 +48,7 @@ create_bloom(HashList) -> %% Check for the presence of a given hash within a bloom check_hash(_Hash, <<>>) -> false; -check_hash(Hash, BloomBin) -> +check_hash({Hash, _ExtraHash}, BloomBin) -> SlotSplit = (byte_size(BloomBin) div ?BITS_PER_KEY) - 1, {Slot, H0, H1} = split_hash(Hash, SlotSplit), Mask = get_mask(H0, H1), @@ -66,14 +66,11 @@ check_hash(Hash, BloomBin) -> %%% Internal Functions %%%============================================================================ -split_hash(Hash, SlotSplit) -> - Slot = Hash band SlotSplit, - H0 = (Hash bsr 4) band (?BAND_MASK), - H1 = (Hash bsr 10) band (?BAND_MASK), - H3 = (Hash bsr 16) band (?BAND_MASK), - H4 = (Hash bsr 22) band (?BAND_MASK), - Slot0 = (Hash bsr 28) band SlotSplit, - {Slot bxor Slot0, H0 bxor H3, H1 bxor H4}. +split_hash(SegHash, SlotSplit) -> + Slot = SegHash band SlotSplit, + H0 = (SegHash bsr 4) band (?BAND_MASK), + H1 = (SegHash bsr 10) band (?BAND_MASK), + {Slot, H0, H1}. 
get_mask(H0, H1) -> case H0 == H1 of @@ -90,7 +87,7 @@ get_mask(H0, H1) -> add_hashlist([], _S, S0, S1) -> IntSize = ?INTEGER_SIZE, <>; -add_hashlist([TopHash|T], SlotSplit, S0, S1) -> +add_hashlist([{TopHash, _ExtraHash}|T], SlotSplit, S0, S1) -> {Slot, H0, H1} = split_hash(TopHash, SlotSplit), Mask = get_mask(H0, H1), case Slot of @@ -104,7 +101,7 @@ add_hashlist([], _S, S0, S1, S2, S3) -> IntSize = ?INTEGER_SIZE, <>; -add_hashlist([TopHash|T], SlotSplit, S0, S1, S2, S3) -> +add_hashlist([{TopHash, _ExtraHash}|T], SlotSplit, S0, S1, S2, S3) -> {Slot, H0, H1} = split_hash(TopHash, SlotSplit), Mask = get_mask(H0, H1), case Slot of @@ -129,7 +126,7 @@ add_hashlist([], _S, S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, SA:IntSize/integer, SB:IntSize/integer, SC:IntSize/integer, SD:IntSize/integer, SE:IntSize/integer, SF:IntSize/integer>>; -add_hashlist([TopHash|T], +add_hashlist([{TopHash, _ExtraHash}|T], SlotSplit, S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, SA, SB, SC, SD, SE, SF) -> @@ -254,7 +251,7 @@ get_hashlist(N) -> KVL = lists:sublist(KVL0, N), HashFun = fun({K, _V}) -> - leveled_codec:magic_hash(K) + leveled_codec:segment_hash(K) end, lists:map(HashFun, KVL).