diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl index 7eb4e7a..42d5e0f 100644 --- a/src/leveled_bookie.erl +++ b/src/leveled_bookie.erl @@ -947,7 +947,7 @@ fetch_head(Key, Penciller, LedgerCache) -> [{Key, Head}] -> Head; [] -> - Hash = leveled_codec:magic_hash(Key), + Hash = leveled_codec:segment_hash(Key), case leveled_penciller:pcl_fetch(Penciller, Key, Hash) of {Key, Head} -> maybe_longrunning(SW, pcl_head), diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl index dc981e8..bd0c60d 100644 --- a/src/leveled_codec.erl +++ b/src/leveled_codec.erl @@ -65,6 +65,7 @@ integer_now/0, riak_extract_metadata/2, magic_hash/1, + segment_hash/1, to_lookup/1]). -define(V1_VERS, 1). @@ -79,6 +80,20 @@ integer()|null, % Hash of vclock - non-exportable integer()}. % Size in bytes of real object + +-spec segment_hash(any()) -> {integer(), integer()}. +%% @doc +%% Return two 16 bit integers - the segment ID and a second integer for spare +%% entropy. The hash should be used in blooms or indexes such that some +%% speed can be gained if just the segment ID is known - but more can be +%% gained should the extended hash (with the second element) be known +segment_hash(Key) when is_binary(Key) -> + <> = + crypto:hash(md5, Key), + {SegmentID, ExtraHash}; +segment_hash(Key) -> + segment_hash(term_to_binary(Key)). + -spec magic_hash(any()) -> integer(). %% @doc %% Use DJ Bernstein magic hash function. 
Note, this is more expensive than @@ -87,10 +102,6 @@ %% Hash function contains mysterious constants, some explanation here as to %% what they are - %% http://stackoverflow.com/questions/10696223/reason-for-5381-number-in-djb-hash-function -magic_hash({?RIAK_TAG, Bucket, Key, _SubKey}) -> - magic_hash({Bucket, Key}); -magic_hash({?STD_TAG, Bucket, Key, _SubKey}) -> - magic_hash({Bucket, Key}); magic_hash({binary, BinaryKey}) -> H = 5381, hash1(H, BinaryKey) band 16#FFFFFFFF; @@ -516,7 +527,9 @@ parse_date(LMD, UnitMins, LimitMins, Now) -> -spec generate_ledgerkv( tuple(), integer(), any(), integer(), tuple()|infinity) -> - {any(), any(), any(), {integer()|no_lookup, integer()}, list()}. + {any(), any(), any(), + {{integer(), integer()}|no_lookup, integer()}, + list()}. %% @doc %% Function to extract from an object the information necessary to populate %% the Penciller's ledger. @@ -537,7 +550,7 @@ generate_ledgerkv(PrimaryKey, SQN, Obj, Size, TS) -> _ -> {active, TS} end, - Hash = magic_hash(PrimaryKey), + Hash = segment_hash(PrimaryKey), {MD, LastMods} = extract_metadata(Obj, Size, Tag), ObjHash = get_objhash(Tag, MD), Value = {SQN, diff --git a/src/leveled_pclerk.erl b/src/leveled_pclerk.erl index 392b13d..c412cf4 100644 --- a/src/leveled_pclerk.erl +++ b/src/leveled_pclerk.erl @@ -254,7 +254,7 @@ generate_randomkeys(Count, Acc, BucketLow, BRange) -> K = {o, "Bucket" ++ BNumber, "Key" ++ KNumber}, RandKey = {K, {Count + 1, {active, infinity}, - leveled_codec:magic_hash(K), + leveled_codec:segment_hash(K), null}}, generate_randomkeys(Count - 1, [RandKey|Acc], BucketLow, BRange). diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl index 4321ee6..c726d08 100644 --- a/src/leveled_penciller.erl +++ b/src/leveled_penciller.erl @@ -315,21 +315,22 @@ pcl_fetchlevelzero(Pid, Slot) -> %% The Key needs to be hashable (i.e. have a tag which indicates that the key %% can be looked up) - index entries are not hashable for example. 
%% -%% If the hash is already knonw, call pcl_fetch/3 as magic_hash is a +%% If the hash is already known, call pcl_fetch/3 as segment_hash is a %% relatively expensive hash function pcl_fetch(Pid, Key) -> - Hash = leveled_codec:magic_hash(Key), + Hash = leveled_codec:segment_hash(Key), if Hash /= no_lookup -> gen_server:call(Pid, {fetch, Key, Hash}, infinity) end. --spec pcl_fetch(pid(), tuple(), integer()) -> {tuple(), tuple()}|not_present. +-spec pcl_fetch(pid(), tuple(), {integer(), integer()}) -> + {tuple(), tuple()}|not_present. %% @doc %% Fetch a key, return the first (highest SQN) occurrence of that Key along %% with the value. %% -%% Hash should be result of leveled_codec:magic_hash(Key) +%% Hash should be result of leveled_codec:segment_hash(Key) pcl_fetch(Pid, Key, Hash) -> gen_server:call(Pid, {fetch, Key, Hash}, infinity). @@ -367,7 +368,7 @@ pcl_fetchnextkey(Pid, StartKey, EndKey, AccFun, InitAcc) -> %% If the key is not present, it will be assumed that a higher sequence number %% tombstone once existed, and false will be returned. pcl_checksequencenumber(Pid, Key, SQN) -> - Hash = leveled_codec:magic_hash(Key), + Hash = leveled_codec:segment_hash(Key), if Hash /= no_lookup -> gen_server:call(Pid, {check_sqn, Key, Hash, SQN}, infinity) @@ -1317,7 +1318,7 @@ generate_randomkeys(Count, SQN, Acc) -> RandKey = {K, {SQN, {active, infinity}, - leveled_codec:magic_hash(K), + leveled_codec:segment_hash(K), null}}, generate_randomkeys(Count - 1, SQN + 1, [RandKey|Acc]). 
@@ -1347,7 +1348,7 @@ maybe_pause_push(PCL, KL) -> T1 = lists:foldl(fun({K, V}, {AccSL, AccIdx, MinSQN, MaxSQN}) -> UpdSL = [{K, V}|AccSL], SQN = leveled_codec:strip_to_seqonly({K, V}), - H = leveled_codec:magic_hash(K), + H = leveled_codec:segment_hash(K), UpdIdx = leveled_pmem:prepare_for_index(AccIdx, H), {UpdSL, UpdIdx, min(SQN, MinSQN), max(SQN, MaxSQN)} end, @@ -1366,7 +1367,7 @@ maybe_pause_push(PCL, KL) -> %% old test data doesn't have the magic hash add_missing_hash({K, {SQN, ST, MD}}) -> - {K, {SQN, ST, leveled_codec:magic_hash(K), MD}}. + {K, {SQN, ST, leveled_codec:segment_hash(K), MD}}. clean_dir_test() -> diff --git a/src/leveled_pmem.erl b/src/leveled_pmem.erl index 0846e3b..40aabfe 100644 --- a/src/leveled_pmem.erl +++ b/src/leveled_pmem.erl @@ -50,7 +50,8 @@ %%% API %%%============================================================================ --spec prepare_for_index(index_array(), integer()|no_lookup) -> index_array(). +-spec prepare_for_index(index_array(), {integer(), integer()}|no_lookup) + -> index_array(). %% @doc %% Add the hash of a key to the index. This is 'prepared' in the sense that %% this index is not use until it is loaded into the main index. @@ -95,7 +96,7 @@ new_index() -> clear_index(_L0Index) -> new_index(). --spec check_index(integer(), index_array()) -> list(integer()). +-spec check_index({integer(), integer()}, index_array()) -> list(integer()). %% @doc %% return a list of positions in the list of cache arrays that may contain the %% key associated with the hash being checked @@ -158,9 +159,9 @@ to_list(Slots, FetchFun) -> %% checked (with the most recently received cache being checked first) until a %% match is found. check_levelzero(Key, PosList, TreeList) -> - check_levelzero(Key, leveled_codec:magic_hash(Key), PosList, TreeList). + check_levelzero(Key, leveled_codec:segment_hash(Key), PosList, TreeList). 
--spec check_levelzero(tuple(), integer(), list(integer()), list()) +-spec check_levelzero(tuple(), {integer(), integer()}, list(integer()), list()) -> {boolean(), tuple|not_found}. %% @doc %% Check for the presence of a given Key in the Level Zero cache, with the @@ -204,10 +205,10 @@ find_pos(<<0:1/integer, NxtSlot:7/integer, T/binary>>, Hash, PosList, _SlotID) - find_pos(T, Hash, PosList, NxtSlot). -split_hash(Hash) -> - Slot = Hash band 255, - H0 = (Hash bsr 8) band 8388607, - {Slot, H0}. +split_hash({SegmentID, ExtraHash}) -> + Slot = SegmentID band 255, + H0 = (SegmentID bsr 8) bor (ExtraHash bsl 8), + {Slot, H0 band 8388607}. check_slotlist(Key, _Hash, CheckList, TreeList) -> SlotCheckFun = @@ -358,7 +359,7 @@ with_index_test_() -> with_index_test2() -> IndexPrepareFun = fun({K, _V}, Acc) -> - H = leveled_codec:magic_hash(K), + H = leveled_codec:segment_hash(K), prepare_for_index(Acc, H) end, LoadFun = @@ -382,7 +383,7 @@ with_index_test2() -> CheckFun = fun({K, V}, {L0Idx, L0Cache}) -> - H = leveled_codec:magic_hash(K), + H = leveled_codec:segment_hash(K), PosList = check_index(H, L0Idx), ?assertMatch({true, {K, V}}, check_slotlist(K, H, PosList, L0Cache)), diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 506a6b4..89bf729 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -237,12 +237,12 @@ sst_newlevelzero(RootPath, Filename, Slots, FetchFun, Penciller, MaxSQN) -> -spec sst_get(pid(), tuple()) -> tuple()|not_present. %% @doc %% Return a Key, Value pair matching a Key or not_present if the Key is not in -%% the store. The magic_hash function is used to accelerate the seeking of +%% the store. The segment_hash function is used to accelerate the seeking of %% keys, sst_get/3 should be used directly if this has already been calculated sst_get(Pid, LedgerKey) -> - sst_get(Pid, LedgerKey, leveled_codec:magic_hash(LedgerKey)). + sst_get(Pid, LedgerKey, leveled_codec:segment_hash(LedgerKey)). 
--spec sst_get(pid(), tuple(), integer()) -> tuple()|not_present. +-spec sst_get(pid(), tuple(), {integer(), integer()}) -> tuple()|not_present. %% @doc %% Return a Key, Value pair matching a Key or not_present if the Key is not in %% the store (with the magic hash precalculated). @@ -554,7 +554,7 @@ fetch(LedgerKey, Hash, State) -> State#state{blockindex_cache = BlockIndexCache}}; <> -> PosList = find_pos(BlockIdx, - double_hash(Hash, LedgerKey), + extra_hash(Hash), [], 0), case PosList of @@ -808,9 +808,9 @@ generate_binary_slot(Lookup, KVL) -> fun({K, V}, {PosBinAcc, NoHashCount, HashAcc}) -> {_SQN, H1} = leveled_codec:strip_to_seqnhashonly({K, V}), - case is_integer(H1) of + PosH1 = extra_hash(H1), + case is_integer(PosH1) of true -> - PosH1 = double_hash(H1, K), case NoHashCount of 0 -> {<<1:1/integer, @@ -1003,7 +1003,7 @@ binaryslot_get(FullBin, Key, Hash) -> <> = BlockLengths, <> = Rest, PosList = find_pos(PosBinIndex, - double_hash(Hash, Key), + extra_hash(Hash), [], 0), {fetch_value(PosList, BlockLengths, Blocks, Key), @@ -1186,9 +1186,10 @@ block_offsetandlength(BlockLengths, BlockID) -> {BlocksPos, B1L + B2L + B3L + B4L, B5L} end. -double_hash(Hash, Key) -> - H2 = erlang:phash2(Key), - (Hash bxor H2) band 32767. +extra_hash({_SegHash, ExtraHash}) when is_integer(ExtraHash) -> + ExtraHash band 32767; +extra_hash(NotHash) -> + NotHash. 
fetch_value([], _BlockLengths, _Blocks, _Key) -> not_present; @@ -1548,15 +1549,15 @@ indexed_list_test() -> [timer:now_diff(os:timestamp(), SW0), byte_size(FullBin)]), {TestK1, TestV1} = lists:nth(20, KVL1), - MH1 = leveled_codec:magic_hash(TestK1), + MH1 = leveled_codec:segment_hash(TestK1), {TestK2, TestV2} = lists:nth(40, KVL1), - MH2 = leveled_codec:magic_hash(TestK2), + MH2 = leveled_codec:segment_hash(TestK2), {TestK3, TestV3} = lists:nth(60, KVL1), - MH3 = leveled_codec:magic_hash(TestK3), + MH3 = leveled_codec:segment_hash(TestK3), {TestK4, TestV4} = lists:nth(80, KVL1), - MH4 = leveled_codec:magic_hash(TestK4), + MH4 = leveled_codec:segment_hash(TestK4), {TestK5, TestV5} = lists:nth(100, KVL1), - MH5 = leveled_codec:magic_hash(TestK5), + MH5 = leveled_codec:segment_hash(TestK5), test_binary_slot(FullBin, TestK1, MH1, {TestK1, TestV1}), test_binary_slot(FullBin, TestK2, MH2, {TestK2, TestV2}), @@ -1573,15 +1574,15 @@ indexed_list_mixedkeys_test() -> {_PosBinIndex1, FullBin, _HL, _LK} = generate_binary_slot(lookup, Keys), {TestK1, TestV1} = lists:nth(4, KVL1), - MH1 = leveled_codec:magic_hash(TestK1), + MH1 = leveled_codec:segment_hash(TestK1), {TestK2, TestV2} = lists:nth(8, KVL1), - MH2 = leveled_codec:magic_hash(TestK2), + MH2 = leveled_codec:segment_hash(TestK2), {TestK3, TestV3} = lists:nth(12, KVL1), - MH3 = leveled_codec:magic_hash(TestK3), + MH3 = leveled_codec:segment_hash(TestK3), {TestK4, TestV4} = lists:nth(16, KVL1), - MH4 = leveled_codec:magic_hash(TestK4), + MH4 = leveled_codec:segment_hash(TestK4), {TestK5, TestV5} = lists:nth(20, KVL1), - MH5 = leveled_codec:magic_hash(TestK5), + MH5 = leveled_codec:segment_hash(TestK5), test_binary_slot(FullBin, TestK1, MH1, {TestK1, TestV1}), test_binary_slot(FullBin, TestK2, MH2, {TestK2, TestV2}), @@ -1598,7 +1599,7 @@ indexed_list_mixedkeys2_test() -> Keys = IdxKeys1 ++ KVL1 ++ IdxKeys2, {_PosBinIndex1, FullBin, _HL, _LK} = generate_binary_slot(lookup, Keys), lists:foreach(fun({K, V}) -> - MH = 
leveled_codec:magic_hash(K), + MH = leveled_codec:segment_hash(K), test_binary_slot(FullBin, K, MH, {K, V}) end, KVL1). @@ -1682,7 +1683,7 @@ indexed_list_mixedkeys_bitflip_test() -> end, {TestK1, _TestV1} = lists:nth(20, KVL1), - MH1 = leveled_codec:magic_hash(TestK1), + MH1 = leveled_codec:segment_hash(TestK1), test_binary_slot(FullBin0, TestK1, MH1, not_present), ToList = binaryslot_tolist(FullBin0), @@ -1920,7 +1921,7 @@ simple_persisted_test() -> In = lists:keymember(K, 1, KVList1), case {K > FirstKey, LastKey > K, In} of {true, true, false} -> - [{K, leveled_codec:magic_hash(K), V}|Acc]; + [{K, leveled_codec:segment_hash(K), V}|Acc]; _ -> Acc end diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl index 1c7cad8..fc70469 100644 --- a/src/leveled_tinybloom.erl +++ b/src/leveled_tinybloom.erl @@ -48,7 +48,7 @@ create_bloom(HashList) -> %% Check for the presence of a given hash within a bloom check_hash(_Hash, <<>>) -> false; -check_hash(Hash, BloomBin) -> +check_hash({Hash, _ExtraHash}, BloomBin) -> SlotSplit = (byte_size(BloomBin) div ?BITS_PER_KEY) - 1, {Slot, H0, H1} = split_hash(Hash, SlotSplit), Mask = get_mask(H0, H1), @@ -66,14 +66,11 @@ check_hash(Hash, BloomBin) -> %%% Internal Functions %%%============================================================================ -split_hash(Hash, SlotSplit) -> - Slot = Hash band SlotSplit, - H0 = (Hash bsr 4) band (?BAND_MASK), - H1 = (Hash bsr 10) band (?BAND_MASK), - H3 = (Hash bsr 16) band (?BAND_MASK), - H4 = (Hash bsr 22) band (?BAND_MASK), - Slot0 = (Hash bsr 28) band SlotSplit, - {Slot bxor Slot0, H0 bxor H3, H1 bxor H4}. +split_hash(SegHash, SlotSplit) -> + Slot = SegHash band SlotSplit, + H0 = (SegHash bsr 4) band (?BAND_MASK), + H1 = (SegHash bsr 10) band (?BAND_MASK), + {Slot, H0, H1}. 
get_mask(H0, H1) -> case H0 == H1 of @@ -90,7 +87,7 @@ get_mask(H0, H1) -> add_hashlist([], _S, S0, S1) -> IntSize = ?INTEGER_SIZE, <>; -add_hashlist([TopHash|T], SlotSplit, S0, S1) -> +add_hashlist([{TopHash, _ExtraHash}|T], SlotSplit, S0, S1) -> {Slot, H0, H1} = split_hash(TopHash, SlotSplit), Mask = get_mask(H0, H1), case Slot of @@ -104,7 +101,7 @@ add_hashlist([], _S, S0, S1, S2, S3) -> IntSize = ?INTEGER_SIZE, <>; -add_hashlist([TopHash|T], SlotSplit, S0, S1, S2, S3) -> +add_hashlist([{TopHash, _ExtraHash}|T], SlotSplit, S0, S1, S2, S3) -> {Slot, H0, H1} = split_hash(TopHash, SlotSplit), Mask = get_mask(H0, H1), case Slot of @@ -129,7 +126,7 @@ add_hashlist([], _S, S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, SA:IntSize/integer, SB:IntSize/integer, SC:IntSize/integer, SD:IntSize/integer, SE:IntSize/integer, SF:IntSize/integer>>; -add_hashlist([TopHash|T], +add_hashlist([{TopHash, _ExtraHash}|T], SlotSplit, S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, SA, SB, SC, SD, SE, SF) -> @@ -254,7 +251,7 @@ get_hashlist(N) -> KVL = lists:sublist(KVL0, N), HashFun = fun({K, _V}) -> - leveled_codec:magic_hash(K) + leveled_codec:segment_hash(K) end, lists:map(HashFun, KVL).