From 87731a85f5f3b19198f39ace43028b1b78a09aeb Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Thu, 19 Oct 2017 13:51:32 +0100 Subject: [PATCH 01/14] Loop test --- src/leveled_tinybloom.erl | 60 +++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl index f3d99ce..bf100b8 100644 --- a/src/leveled_tinybloom.erl +++ b/src/leveled_tinybloom.erl @@ -284,45 +284,46 @@ empty_bloom_test() -> check_neg_hashes(BloomBin0, [0, 10, 100, 100000], {0, 0})). bloom_test() -> - test_bloom(128), - test_bloom(64), - test_bloom(32), - test_bloom(16), - test_bloom(8). + test_bloom(128, 400), + test_bloom(64, 400), + test_bloom(32, 400), + test_bloom(16, 400), + test_bloom(8, 400). -test_bloom(N) -> - HashList1 = get_hashlist(N), - HashList2 = get_hashlist(N), - HashList3 = get_hashlist(N), - HashList4 = get_hashlist(N), +test_bloom(N, Runs) -> + ListOfHashLists = + lists:map(fun(_X) -> get_hashlist(N) end, lists:seq(1, Runs)), SWa = os:timestamp(), - BloomBin1 = create_bloom(HashList1), - BloomBin2 = create_bloom(HashList2), - BloomBin3 = create_bloom(HashList3), - BloomBin4 = create_bloom(HashList4), + ListOfBlooms = + lists:map(fun(HL) -> create_bloom(HL) end, ListOfHashLists), TSa = timer:now_diff(os:timestamp(), SWa), SWb = os:timestamp(), - check_all_hashes(BloomBin1, HashList1), - check_all_hashes(BloomBin2, HashList2), - check_all_hashes(BloomBin3, HashList3), - check_all_hashes(BloomBin4, HashList4), + lists:foreach(fun(Nth) -> + HL = lists:nth(Nth, ListOfHashLists), + BB = lists:nth(Nth, ListOfBlooms), + check_all_hashes(BB, HL) + end, + lists:seq(1, Runs)), TSb = timer:now_diff(os:timestamp(), SWb), HashPool = get_hashlist(N * 2), - HashListOut1 = lists:sublist(lists:subtract(HashPool, HashList1), N), - HashListOut2 = lists:sublist(lists:subtract(HashPool, HashList2), N), - HashListOut3 = lists:sublist(lists:subtract(HashPool, HashList3), N), - HashListOut4 = 
lists:sublist(lists:subtract(HashPool, HashList4), N), - + ListOfMisses = + lists:map(fun(HL) -> + lists:sublist(lists:subtract(HashPool, HL), N) + end, + ListOfHashLists), + SWc = os:timestamp(), - C0 = {0, 0}, - C1 = check_neg_hashes(BloomBin1, HashListOut1, C0), - C2 = check_neg_hashes(BloomBin2, HashListOut2, C1), - C3 = check_neg_hashes(BloomBin3, HashListOut3, C2), - C4 = check_neg_hashes(BloomBin4, HashListOut4, C3), - {Pos, Neg} = C4, + {Pos, Neg} = + lists:foldl(fun(Nth, Acc) -> + HL = lists:nth(Nth, ListOfMisses), + BB = lists:nth(Nth, ListOfBlooms), + check_neg_hashes(BB, HL, Acc) + end, + {0, 0}, + lists:seq(1, Runs)), FPR = Pos / (Pos + Neg), TSc = timer:now_diff(os:timestamp(), SWc), @@ -332,5 +333,4 @@ test_bloom(N) -> [N, TSa, TSb, TSc, FPR]). - -endif. From f38d3fde4bfb80a5f8af2dff9f20a1cef4906056 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Thu, 19 Oct 2017 13:56:07 +0100 Subject: [PATCH 02/14] Test frequency change --- src/leveled_tinybloom.erl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl index bf100b8..67a2d47 100644 --- a/src/leveled_tinybloom.erl +++ b/src/leveled_tinybloom.erl @@ -239,7 +239,7 @@ generate_randomkeys(Seqn, Count, Acc, BucketLow, BRange) -> BNumber = string:right(integer_to_list(BucketLow + BRand), 4, $0), KNumber = string:right(integer_to_list(leveled_rand:uniform(10000)), 6, $0), LK = leveled_codec:to_ledgerkey("Bucket" ++ BNumber, "Key" ++ KNumber, o), - Chunk = leveled_rand:rand_bytes(64), + Chunk = leveled_rand:rand_bytes(16), {_B, _K, MV, _H, _LMs} = leveled_codec:generate_ledgerkv(LK, Seqn, Chunk, 64, infinity), generate_randomkeys(Seqn + 1, @@ -284,11 +284,11 @@ empty_bloom_test() -> check_neg_hashes(BloomBin0, [0, 10, 100, 100000], {0, 0})). bloom_test() -> - test_bloom(128, 400), - test_bloom(64, 400), - test_bloom(32, 400), - test_bloom(16, 400), - test_bloom(8, 400). 
+ test_bloom(128, 2000), + test_bloom(64, 10), + test_bloom(32, 10), + test_bloom(16, 10), + test_bloom(8, 10). test_bloom(N, Runs) -> ListOfHashLists = From 1964f1055b39edaa35aa17d13532580b0d958870 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Thu, 19 Oct 2017 21:44:07 +0100 Subject: [PATCH 03/14] Add test timeout --- src/leveled_tinybloom.erl | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl index 67a2d47..1c7cad8 100644 --- a/src/leveled_tinybloom.erl +++ b/src/leveled_tinybloom.erl @@ -283,12 +283,15 @@ empty_bloom_test() -> ?assertMatch({0, 4}, check_neg_hashes(BloomBin0, [0, 10, 100, 100000], {0, 0})). -bloom_test() -> +bloom_test_() -> + {timeout, 20, fun bloom_test_ranges/0}. + +bloom_test_ranges() -> test_bloom(128, 2000), - test_bloom(64, 10), - test_bloom(32, 10), - test_bloom(16, 10), - test_bloom(8, 10). + test_bloom(64, 100), + test_bloom(32, 100), + test_bloom(16, 100), + test_bloom(8, 100). test_bloom(N, Runs) -> ListOfHashLists = From a128dcdadf57a7cd8a1e65ef5760de56bf0b5657 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Fri, 20 Oct 2017 23:04:29 +0100 Subject: [PATCH 04/14] Change hash algorithm for penciller MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch from magic hash to md5 - to hopefully remove the need for some of the artificial jumps required to get expected false positive ratios. Also split the hash into two 16-bit integers. We assume that SegmentID (from the perspective of AAE merkle/tictac trees) will always be at least 16 bits. The idea is that hashes should be used in blooms and indexes such that some advantage can be gained from just knowing the segmentID - in particular when folding over all the keys in a bucket. Performance testing has been difficult so far - I think due to “cloud” mysteries. 
--- src/leveled_bookie.erl | 2 +- src/leveled_codec.erl | 25 ++++++++++++++++----- src/leveled_pclerk.erl | 2 +- src/leveled_penciller.erl | 17 +++++++------- src/leveled_pmem.erl | 21 ++++++++--------- src/leveled_sst.erl | 47 ++++++++++++++++++++------------------- src/leveled_tinybloom.erl | 23 +++++++++---------- 7 files changed, 75 insertions(+), 62 deletions(-) diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl index 7eb4e7a..42d5e0f 100644 --- a/src/leveled_bookie.erl +++ b/src/leveled_bookie.erl @@ -947,7 +947,7 @@ fetch_head(Key, Penciller, LedgerCache) -> [{Key, Head}] -> Head; [] -> - Hash = leveled_codec:magic_hash(Key), + Hash = leveled_codec:segment_hash(Key), case leveled_penciller:pcl_fetch(Penciller, Key, Hash) of {Key, Head} -> maybe_longrunning(SW, pcl_head), diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl index dc981e8..bd0c60d 100644 --- a/src/leveled_codec.erl +++ b/src/leveled_codec.erl @@ -65,6 +65,7 @@ integer_now/0, riak_extract_metadata/2, magic_hash/1, + segment_hash/1, to_lookup/1]). -define(V1_VERS, 1). @@ -79,6 +80,20 @@ integer()|null, % Hash of vclock - non-exportable integer()}. % Size in bytes of real object + +-spec segment_hash(any()) -> {integer(), integer()}. +%% @doc +%% Return two 16 bit integers - the segment ID and a second integer for spare +%% entropy. The hashed should be used in blooms or indexes such that some +%% speed can be gained if just the segment ID is known - but more can be +%% gained should the extended hash (with the second element) is known +segment_hash(Key) when is_binary(Key) -> + <> = + crypto:hash(md5, Key), + {SegmentID, ExtraHash}; +segment_hash(Key) -> + segment_hash(term_to_binary(Key)). + -spec magic_hash(any()) -> integer(). %% @doc %% Use DJ Bernstein magic hash function. 
Note, this is more expensive than @@ -87,10 +102,6 @@ %% Hash function contains mysterious constants, some explanation here as to %% what they are - %% http://stackoverflow.com/questions/10696223/reason-for-5381-number-in-djb-hash-function -magic_hash({?RIAK_TAG, Bucket, Key, _SubKey}) -> - magic_hash({Bucket, Key}); -magic_hash({?STD_TAG, Bucket, Key, _SubKey}) -> - magic_hash({Bucket, Key}); magic_hash({binary, BinaryKey}) -> H = 5381, hash1(H, BinaryKey) band 16#FFFFFFFF; @@ -516,7 +527,9 @@ parse_date(LMD, UnitMins, LimitMins, Now) -> -spec generate_ledgerkv( tuple(), integer(), any(), integer(), tuple()|infinity) -> - {any(), any(), any(), {integer()|no_lookup, integer()}, list()}. + {any(), any(), any(), + {{integer(), integer()}|no_lookup, integer()}, + list()}. %% @doc %% Function to extract from an object the information necessary to populate %% the Penciller's ledger. @@ -537,7 +550,7 @@ generate_ledgerkv(PrimaryKey, SQN, Obj, Size, TS) -> _ -> {active, TS} end, - Hash = magic_hash(PrimaryKey), + Hash = segment_hash(PrimaryKey), {MD, LastMods} = extract_metadata(Obj, Size, Tag), ObjHash = get_objhash(Tag, MD), Value = {SQN, diff --git a/src/leveled_pclerk.erl b/src/leveled_pclerk.erl index 392b13d..c412cf4 100644 --- a/src/leveled_pclerk.erl +++ b/src/leveled_pclerk.erl @@ -254,7 +254,7 @@ generate_randomkeys(Count, Acc, BucketLow, BRange) -> K = {o, "Bucket" ++ BNumber, "Key" ++ KNumber}, RandKey = {K, {Count + 1, {active, infinity}, - leveled_codec:magic_hash(K), + leveled_codec:segment_hash(K), null}}, generate_randomkeys(Count - 1, [RandKey|Acc], BucketLow, BRange). diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl index 4321ee6..c726d08 100644 --- a/src/leveled_penciller.erl +++ b/src/leveled_penciller.erl @@ -315,21 +315,22 @@ pcl_fetchlevelzero(Pid, Slot) -> %% The Key needs to be hashable (i.e. have a tag which indicates that the key %% can be looked up) - index entries are not hashable for example. 
%% -%% If the hash is already knonw, call pcl_fetch/3 as magic_hash is a +%% If the hash is already knonw, call pcl_fetch/3 as segment_hash is a %% relatively expensive hash function pcl_fetch(Pid, Key) -> - Hash = leveled_codec:magic_hash(Key), + Hash = leveled_codec:segment_hash(Key), if Hash /= no_lookup -> gen_server:call(Pid, {fetch, Key, Hash}, infinity) end. --spec pcl_fetch(pid(), tuple(), integer()) -> {tuple(), tuple()}|not_present. +-spec pcl_fetch(pid(), tuple(), {integer(), integer()}) -> + {tuple(), tuple()}|not_present. %% @doc %% Fetch a key, return the first (highest SQN) occurrence of that Key along %% with the value. %% -%% Hash should be result of leveled_codec:magic_hash(Key) +%% Hash should be result of leveled_codec:segment_hash(Key) pcl_fetch(Pid, Key, Hash) -> gen_server:call(Pid, {fetch, Key, Hash}, infinity). @@ -367,7 +368,7 @@ pcl_fetchnextkey(Pid, StartKey, EndKey, AccFun, InitAcc) -> %% If the key is not present, it will be assumed that a higher sequence number %% tombstone once existed, and false will be returned. pcl_checksequencenumber(Pid, Key, SQN) -> - Hash = leveled_codec:magic_hash(Key), + Hash = leveled_codec:segment_hash(Key), if Hash /= no_lookup -> gen_server:call(Pid, {check_sqn, Key, Hash, SQN}, infinity) @@ -1317,7 +1318,7 @@ generate_randomkeys(Count, SQN, Acc) -> RandKey = {K, {SQN, {active, infinity}, - leveled_codec:magic_hash(K), + leveled_codec:segment_hash(K), null}}, generate_randomkeys(Count - 1, SQN + 1, [RandKey|Acc]). 
@@ -1347,7 +1348,7 @@ maybe_pause_push(PCL, KL) -> T1 = lists:foldl(fun({K, V}, {AccSL, AccIdx, MinSQN, MaxSQN}) -> UpdSL = [{K, V}|AccSL], SQN = leveled_codec:strip_to_seqonly({K, V}), - H = leveled_codec:magic_hash(K), + H = leveled_codec:segment_hash(K), UpdIdx = leveled_pmem:prepare_for_index(AccIdx, H), {UpdSL, UpdIdx, min(SQN, MinSQN), max(SQN, MaxSQN)} end, @@ -1366,7 +1367,7 @@ maybe_pause_push(PCL, KL) -> %% old test data doesn't have the magic hash add_missing_hash({K, {SQN, ST, MD}}) -> - {K, {SQN, ST, leveled_codec:magic_hash(K), MD}}. + {K, {SQN, ST, leveled_codec:segment_hash(K), MD}}. clean_dir_test() -> diff --git a/src/leveled_pmem.erl b/src/leveled_pmem.erl index 0846e3b..40aabfe 100644 --- a/src/leveled_pmem.erl +++ b/src/leveled_pmem.erl @@ -50,7 +50,8 @@ %%% API %%%============================================================================ --spec prepare_for_index(index_array(), integer()|no_lookup) -> index_array(). +-spec prepare_for_index(index_array(), {integer(), integer()}|no_lookup) + -> index_array(). %% @doc %% Add the hash of a key to the index. This is 'prepared' in the sense that %% this index is not use until it is loaded into the main index. @@ -95,7 +96,7 @@ new_index() -> clear_index(_L0Index) -> new_index(). --spec check_index(integer(), index_array()) -> list(integer()). +-spec check_index({integer(), integer()}, index_array()) -> list(integer()). %% @doc %% return a list of positions in the list of cache arrays that may contain the %% key associated with the hash being checked @@ -158,9 +159,9 @@ to_list(Slots, FetchFun) -> %% checked (with the most recently received cache being checked first) until a %% match is found. check_levelzero(Key, PosList, TreeList) -> - check_levelzero(Key, leveled_codec:magic_hash(Key), PosList, TreeList). + check_levelzero(Key, leveled_codec:segment_hash(Key), PosList, TreeList). 
--spec check_levelzero(tuple(), integer(), list(integer()), list()) +-spec check_levelzero(tuple(), {integer(), integer()}, list(integer()), list()) -> {boolean(), tuple|not_found}. %% @doc %% Check for the presence of a given Key in the Level Zero cache, with the @@ -204,10 +205,10 @@ find_pos(<<0:1/integer, NxtSlot:7/integer, T/binary>>, Hash, PosList, _SlotID) - find_pos(T, Hash, PosList, NxtSlot). -split_hash(Hash) -> - Slot = Hash band 255, - H0 = (Hash bsr 8) band 8388607, - {Slot, H0}. +split_hash({SegmentID, ExtraHash}) -> + Slot = SegmentID band 255, + H0 = (SegmentID bsr 8) bor (ExtraHash bsl 8), + {Slot, H0 band 8388607}. check_slotlist(Key, _Hash, CheckList, TreeList) -> SlotCheckFun = @@ -358,7 +359,7 @@ with_index_test_() -> with_index_test2() -> IndexPrepareFun = fun({K, _V}, Acc) -> - H = leveled_codec:magic_hash(K), + H = leveled_codec:segment_hash(K), prepare_for_index(Acc, H) end, LoadFun = @@ -382,7 +383,7 @@ with_index_test2() -> CheckFun = fun({K, V}, {L0Idx, L0Cache}) -> - H = leveled_codec:magic_hash(K), + H = leveled_codec:segment_hash(K), PosList = check_index(H, L0Idx), ?assertMatch({true, {K, V}}, check_slotlist(K, H, PosList, L0Cache)), diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 506a6b4..89bf729 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -237,12 +237,12 @@ sst_newlevelzero(RootPath, Filename, Slots, FetchFun, Penciller, MaxSQN) -> -spec sst_get(pid(), tuple()) -> tuple()|not_present. %% @doc %% Return a Key, Value pair matching a Key or not_present if the Key is not in -%% the store. The magic_hash function is used to accelerate the seeking of +%% the store. The segment_hash function is used to accelerate the seeking of %% keys, sst_get/3 should be used directly if this has already been calculated sst_get(Pid, LedgerKey) -> - sst_get(Pid, LedgerKey, leveled_codec:magic_hash(LedgerKey)). + sst_get(Pid, LedgerKey, leveled_codec:segment_hash(LedgerKey)). 
--spec sst_get(pid(), tuple(), integer()) -> tuple()|not_present. +-spec sst_get(pid(), tuple(), {integer(), integer()}) -> tuple()|not_present. %% @doc %% Return a Key, Value pair matching a Key or not_present if the Key is not in %% the store (with the magic hash precalculated). @@ -554,7 +554,7 @@ fetch(LedgerKey, Hash, State) -> State#state{blockindex_cache = BlockIndexCache}}; <> -> PosList = find_pos(BlockIdx, - double_hash(Hash, LedgerKey), + extra_hash(Hash), [], 0), case PosList of @@ -808,9 +808,9 @@ generate_binary_slot(Lookup, KVL) -> fun({K, V}, {PosBinAcc, NoHashCount, HashAcc}) -> {_SQN, H1} = leveled_codec:strip_to_seqnhashonly({K, V}), - case is_integer(H1) of + PosH1 = extra_hash(H1), + case is_integer(PosH1) of true -> - PosH1 = double_hash(H1, K), case NoHashCount of 0 -> {<<1:1/integer, @@ -1003,7 +1003,7 @@ binaryslot_get(FullBin, Key, Hash) -> <> = BlockLengths, <> = Rest, PosList = find_pos(PosBinIndex, - double_hash(Hash, Key), + extra_hash(Hash), [], 0), {fetch_value(PosList, BlockLengths, Blocks, Key), @@ -1186,9 +1186,10 @@ block_offsetandlength(BlockLengths, BlockID) -> {BlocksPos, B1L + B2L + B3L + B4L, B5L} end. -double_hash(Hash, Key) -> - H2 = erlang:phash2(Key), - (Hash bxor H2) band 32767. +extra_hash({_SegHash, ExtraHash}) when is_integer(ExtraHash) -> + ExtraHash band 32767; +extra_hash(NotHash) -> + NotHash. 
fetch_value([], _BlockLengths, _Blocks, _Key) -> not_present; @@ -1548,15 +1549,15 @@ indexed_list_test() -> [timer:now_diff(os:timestamp(), SW0), byte_size(FullBin)]), {TestK1, TestV1} = lists:nth(20, KVL1), - MH1 = leveled_codec:magic_hash(TestK1), + MH1 = leveled_codec:segment_hash(TestK1), {TestK2, TestV2} = lists:nth(40, KVL1), - MH2 = leveled_codec:magic_hash(TestK2), + MH2 = leveled_codec:segment_hash(TestK2), {TestK3, TestV3} = lists:nth(60, KVL1), - MH3 = leveled_codec:magic_hash(TestK3), + MH3 = leveled_codec:segment_hash(TestK3), {TestK4, TestV4} = lists:nth(80, KVL1), - MH4 = leveled_codec:magic_hash(TestK4), + MH4 = leveled_codec:segment_hash(TestK4), {TestK5, TestV5} = lists:nth(100, KVL1), - MH5 = leveled_codec:magic_hash(TestK5), + MH5 = leveled_codec:segment_hash(TestK5), test_binary_slot(FullBin, TestK1, MH1, {TestK1, TestV1}), test_binary_slot(FullBin, TestK2, MH2, {TestK2, TestV2}), @@ -1573,15 +1574,15 @@ indexed_list_mixedkeys_test() -> {_PosBinIndex1, FullBin, _HL, _LK} = generate_binary_slot(lookup, Keys), {TestK1, TestV1} = lists:nth(4, KVL1), - MH1 = leveled_codec:magic_hash(TestK1), + MH1 = leveled_codec:segment_hash(TestK1), {TestK2, TestV2} = lists:nth(8, KVL1), - MH2 = leveled_codec:magic_hash(TestK2), + MH2 = leveled_codec:segment_hash(TestK2), {TestK3, TestV3} = lists:nth(12, KVL1), - MH3 = leveled_codec:magic_hash(TestK3), + MH3 = leveled_codec:segment_hash(TestK3), {TestK4, TestV4} = lists:nth(16, KVL1), - MH4 = leveled_codec:magic_hash(TestK4), + MH4 = leveled_codec:segment_hash(TestK4), {TestK5, TestV5} = lists:nth(20, KVL1), - MH5 = leveled_codec:magic_hash(TestK5), + MH5 = leveled_codec:segment_hash(TestK5), test_binary_slot(FullBin, TestK1, MH1, {TestK1, TestV1}), test_binary_slot(FullBin, TestK2, MH2, {TestK2, TestV2}), @@ -1598,7 +1599,7 @@ indexed_list_mixedkeys2_test() -> Keys = IdxKeys1 ++ KVL1 ++ IdxKeys2, {_PosBinIndex1, FullBin, _HL, _LK} = generate_binary_slot(lookup, Keys), lists:foreach(fun({K, V}) -> - MH = 
leveled_codec:magic_hash(K), + MH = leveled_codec:segment_hash(K), test_binary_slot(FullBin, K, MH, {K, V}) end, KVL1). @@ -1682,7 +1683,7 @@ indexed_list_mixedkeys_bitflip_test() -> end, {TestK1, _TestV1} = lists:nth(20, KVL1), - MH1 = leveled_codec:magic_hash(TestK1), + MH1 = leveled_codec:segment_hash(TestK1), test_binary_slot(FullBin0, TestK1, MH1, not_present), ToList = binaryslot_tolist(FullBin0), @@ -1920,7 +1921,7 @@ simple_persisted_test() -> In = lists:keymember(K, 1, KVList1), case {K > FirstKey, LastKey > K, In} of {true, true, false} -> - [{K, leveled_codec:magic_hash(K), V}|Acc]; + [{K, leveled_codec:segment_hash(K), V}|Acc]; _ -> Acc end diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl index 1c7cad8..fc70469 100644 --- a/src/leveled_tinybloom.erl +++ b/src/leveled_tinybloom.erl @@ -48,7 +48,7 @@ create_bloom(HashList) -> %% Check for the presence of a given hash within a bloom check_hash(_Hash, <<>>) -> false; -check_hash(Hash, BloomBin) -> +check_hash({Hash, _ExtraHash}, BloomBin) -> SlotSplit = (byte_size(BloomBin) div ?BITS_PER_KEY) - 1, {Slot, H0, H1} = split_hash(Hash, SlotSplit), Mask = get_mask(H0, H1), @@ -66,14 +66,11 @@ check_hash(Hash, BloomBin) -> %%% Internal Functions %%%============================================================================ -split_hash(Hash, SlotSplit) -> - Slot = Hash band SlotSplit, - H0 = (Hash bsr 4) band (?BAND_MASK), - H1 = (Hash bsr 10) band (?BAND_MASK), - H3 = (Hash bsr 16) band (?BAND_MASK), - H4 = (Hash bsr 22) band (?BAND_MASK), - Slot0 = (Hash bsr 28) band SlotSplit, - {Slot bxor Slot0, H0 bxor H3, H1 bxor H4}. +split_hash(SegHash, SlotSplit) -> + Slot = SegHash band SlotSplit, + H0 = (SegHash bsr 4) band (?BAND_MASK), + H1 = (SegHash bsr 10) band (?BAND_MASK), + {Slot, H0, H1}. 
get_mask(H0, H1) -> case H0 == H1 of @@ -90,7 +87,7 @@ get_mask(H0, H1) -> add_hashlist([], _S, S0, S1) -> IntSize = ?INTEGER_SIZE, <>; -add_hashlist([TopHash|T], SlotSplit, S0, S1) -> +add_hashlist([{TopHash, _ExtraHash}|T], SlotSplit, S0, S1) -> {Slot, H0, H1} = split_hash(TopHash, SlotSplit), Mask = get_mask(H0, H1), case Slot of @@ -104,7 +101,7 @@ add_hashlist([], _S, S0, S1, S2, S3) -> IntSize = ?INTEGER_SIZE, <>; -add_hashlist([TopHash|T], SlotSplit, S0, S1, S2, S3) -> +add_hashlist([{TopHash, _ExtraHash}|T], SlotSplit, S0, S1, S2, S3) -> {Slot, H0, H1} = split_hash(TopHash, SlotSplit), Mask = get_mask(H0, H1), case Slot of @@ -129,7 +126,7 @@ add_hashlist([], _S, S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, SA:IntSize/integer, SB:IntSize/integer, SC:IntSize/integer, SD:IntSize/integer, SE:IntSize/integer, SF:IntSize/integer>>; -add_hashlist([TopHash|T], +add_hashlist([{TopHash, _ExtraHash}|T], SlotSplit, S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, SA, SB, SC, SD, SE, SF) -> @@ -254,7 +251,7 @@ get_hashlist(N) -> KVL = lists:sublist(KVL0, N), HashFun = fun({K, _V}) -> - leveled_codec:magic_hash(K) + leveled_codec:segment_hash(K) end, lists:map(HashFun, KVL). From 36264eb41683dd66215ddcff6eb38c01615b48eb Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Tue, 24 Oct 2017 13:19:30 +0100 Subject: [PATCH 05/14] Search range failure Discovered a bug with search ranges in leveled_tree - this was uncovered by an intermittently failing 19.3 test. Test case added and bug fixed. It was due to a failure to use end_key passed causing issues with particular manifests and full bucket ranges. 
--- src/leveled_log.erl | 2 ++ src/leveled_penciller.erl | 2 ++ src/leveled_pmanifest.erl | 43 +++++++++++++++++++++++++++++++++ src/leveled_tree.erl | 24 +++++++++++++++++- test/end_to_end/basic_SUITE.erl | 16 ++++++------ 5 files changed, 78 insertions(+), 9 deletions(-) diff --git a/src/leveled_log.erl b/src/leveled_log.erl index 59892dd..c3508b0 100644 --- a/src/leveled_log.erl +++ b/src/leveled_log.erl @@ -144,6 +144,8 @@ ++ "leaving SnapshotCount=~w and MinSQN=~w"}}, {"P0040", {info, "Archiving filename ~s as unused at startup"}}, + {"P0041", + {info, "Penciller manifest switched from SQN ~w to ~w"}}, {"PC001", {info, "Penciller's clerk ~w started with owner ~w"}}, diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl index c726d08..8abe56b 100644 --- a/src/leveled_penciller.erl +++ b/src/leveled_penciller.erl @@ -673,6 +673,8 @@ handle_call(doom, _From, State) -> handle_cast({manifest_change, NewManifest}, State) -> NewManSQN = leveled_pmanifest:get_manifest_sqn(NewManifest), + OldManSQN = leveled_pmanifest:get_manifest_sqn(State#state.manifest), + leveled_log:log("P0041", [OldManSQN, NewManSQN]), ok = leveled_pclerk:clerk_promptdeletions(State#state.clerk, NewManSQN), UpdManifest = leveled_pmanifest:merge_snapshot(State#state.manifest, NewManifest), diff --git a/src/leveled_pmanifest.erl b/src/leveled_pmanifest.erl index 3970c7e..ba6e9b7 100644 --- a/src/leveled_pmanifest.erl +++ b/src/leveled_pmanifest.erl @@ -1128,6 +1128,49 @@ snapshot_timeout_test() -> Man10 = release_snapshot(Man9, ?PHANTOM_PID), ?assertMatch(0, length(Man10#manifest.snapshots)). 
+potential_issue_test() -> + Manifest = + {manifest,{array,9,0,[], + {[], + [{manifest_entry,{o_rkv,"Bucket","Key10",null}, + {o_rkv,"Bucket","Key12949",null}, + "<0.313.0>","./16_1_0.sst"}, + {manifest_entry,{o_rkv,"Bucket","Key129490",null}, + {o_rkv,"Bucket","Key158981",null}, + "<0.315.0>","./16_1_1.sst"}, + {manifest_entry,{o_rkv,"Bucket","Key158982",null}, + {o_rkv,"Bucket","Key188472",null}, + "<0.316.0>","./16_1_2.sst"}], + {idxt,1, + {{[{{o_rkv,"Bucket1","Key1",null}, + {manifest_entry,{o_rkv,"Bucket","Key9083",null}, + {o_rkv,"Bucket1","Key1",null}, + "<0.320.0>","./16_1_6.sst"}}]}, + {1,{{o_rkv,"Bucket1","Key1",null},1,nil,nil}}}}, + {idxt,0,{{},{0,nil}}}, + {idxt,0,{{},{0,nil}}}, + {idxt,0,{{},{0,nil}}}, + {idxt,0,{{},{0,nil}}}, + {idxt,0,{{},{0,nil}}}, + {idxt,0,{{},{0,nil}}}, + []}}, + 19,[],0, + {dict,0,16,16,8,80,48, + {[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]}, + {{[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]}}}, + 2}, + Range1 = range_lookup(Manifest, + 1, + {o_rkv, "Bucket", null, null}, + {o_rkv, "Bucket", null, null}), + Range2 = range_lookup(Manifest, + 2, + {o_rkv, "Bucket", null, null}, + {o_rkv, "Bucket", null, null}), + io:format("Range in Level 1 ~w~n", [Range1]), + io:format("Range in Level 2 ~w~n", [Range2]), + ?assertMatch(3, length(Range1)), + ?assertMatch(1, length(Range2)). -endif. 
diff --git a/src/leveled_tree.erl b/src/leveled_tree.erl index da171c9..8079f20 100644 --- a/src/leveled_tree.erl +++ b/src/leveled_tree.erl @@ -214,7 +214,7 @@ search_range(StartRange, EndRange, Tree, StartKeyFun) -> EndRangeFun = fun(ER, _FirstRHSKey, FirstRHSValue) -> StartRHSKey = StartKeyFun(FirstRHSValue), - ER >= StartRHSKey + not leveled_codec:endkey_passed(ER, StartRHSKey) end, case Tree of {tree, _L, T} -> @@ -405,8 +405,12 @@ idxtlookup_range_end(EndRange, {TLI, NK0, SL0}, Iter0, Output, EndRangeFun) -> [{FirstRHSKey, FirstRHSValue}|_Rest] -> case EndRangeFun(EndRange, FirstRHSKey, FirstRHSValue) of true -> + % The start key is not after the end of the range + % and so this should be included in the range Output ++ LHS ++ [{FirstRHSKey, FirstRHSValue}]; false -> + % the start key of the next key is after the end + % of the range and so should not be included Output ++ LHS end end; @@ -804,4 +808,22 @@ empty_test() -> T2 = empty(idxt), ?assertMatch(0, tsize(T2)). +search_range_idx_test() -> + Tree = + {idxt,1, + {{[{{o_rkv,"Bucket1","Key1",null}, + {manifest_entry,{o_rkv,"Bucket","Key9083",null}, + {o_rkv,"Bucket1","Key1",null}, + "<0.320.0>","./16_1_6.sst"}}]}, + {1,{{o_rkv,"Bucket1","Key1",null},1,nil,nil}}}}, + StartKeyFun = + fun(ME) -> + ME#manifest_entry.start_key + end, + R = search_range({o_rkv, "Bucket", null, null}, + {o_rkv, "Bucket", null, null}, + Tree, + StartKeyFun), + ?assertMatch(1, length(R)). + -endif. 
diff --git a/test/end_to_end/basic_SUITE.erl b/test/end_to_end/basic_SUITE.erl index 92ca46e..3c2e550 100644 --- a/test/end_to_end/basic_SUITE.erl +++ b/test/end_to_end/basic_SUITE.erl @@ -333,8 +333,8 @@ load_and_count(_Config) -> Bookie1, TestObject, G1), - {_S, Count} = testutil:check_bucket_stats(Bookie1, - "Bucket"), + {_S, Count} = + testutil:check_bucket_stats(Bookie1, "Bucket"), if Acc + 5000 == Count -> ok @@ -351,8 +351,8 @@ load_and_count(_Config) -> Bookie1, TestObject, G2), - {_S, Count} = testutil:check_bucket_stats(Bookie1, - "Bucket"), + {_S, Count} = + testutil:check_bucket_stats(Bookie1, "Bucket"), if Acc + 5000 == Count -> ok @@ -368,8 +368,8 @@ load_and_count(_Config) -> Bookie1, TestObject, G1), - {_S, Count} = testutil:check_bucket_stats(Bookie1, - "Bucket"), + {_S, Count} = + testutil:check_bucket_stats(Bookie1, "Bucket"), if Count == 200000 -> ok @@ -385,8 +385,8 @@ load_and_count(_Config) -> Bookie1, TestObject, G2), - {_S, Count} = testutil:check_bucket_stats(Bookie1, - "Bucket"), + {_S, Count} = + testutil:check_bucket_stats(Bookie1, "Bucket"), if Acc + 5000 == Count -> ok From 26aa573ce149359c8bd4cc9bba4f01b33d75c3c5 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Tue, 24 Oct 2017 14:32:04 +0100 Subject: [PATCH 06/14] Switch segment and extra hash More entropy by using the position index with the segment hash - so this would be a better filter to apply. Also could increase the key count now, as extra hash can be larger. As an aside - a leveled_iclerk unit test failure appeared - the range was just wrong. 
Don't know why this started happening --- src/leveled_iclerk.erl | 4 ++-- src/leveled_sst.erl | 4 ++-- src/leveled_tinybloom.erl | 16 ++++++++-------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/leveled_iclerk.erl b/src/leveled_iclerk.erl index f19746a..5c6b126 100644 --- a/src/leveled_iclerk.erl +++ b/src/leveled_iclerk.erl @@ -648,8 +648,8 @@ schedule_test_bycount(N) -> ?assertMatch(true, SecondsToCompaction0 < 5700), SecondsToCompaction1 = schedule_compaction([14], N, CurrentTS), % tomorrow! io:format("Seconds to compaction ~w~n", [SecondsToCompaction1]), - ?assertMatch(true, SecondsToCompaction1 > 81000), - ?assertMatch(true, SecondsToCompaction1 < 84300). + ?assertMatch(true, SecondsToCompaction1 >= 81180), + ?assertMatch(true, SecondsToCompaction1 =< 84780). simple_score_test() -> diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 89bf729..3ec82a1 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -1186,8 +1186,8 @@ block_offsetandlength(BlockLengths, BlockID) -> {BlocksPos, B1L + B2L + B3L + B4L, B5L} end. -extra_hash({_SegHash, ExtraHash}) when is_integer(ExtraHash) -> - ExtraHash band 32767; +extra_hash({SegHash, _ExtraHash}) when is_integer(SegHash) -> + SegHash band 32767; extra_hash(NotHash) -> NotHash. 
diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl index fc70469..3c21f3f 100644 --- a/src/leveled_tinybloom.erl +++ b/src/leveled_tinybloom.erl @@ -48,7 +48,7 @@ create_bloom(HashList) -> %% Check for the presence of a given hash within a bloom check_hash(_Hash, <<>>) -> false; -check_hash({Hash, _ExtraHash}, BloomBin) -> +check_hash({_SegHash, Hash}, BloomBin) -> SlotSplit = (byte_size(BloomBin) div ?BITS_PER_KEY) - 1, {Slot, H0, H1} = split_hash(Hash, SlotSplit), Mask = get_mask(H0, H1), @@ -66,10 +66,10 @@ check_hash({Hash, _ExtraHash}, BloomBin) -> %%% Internal Functions %%%============================================================================ -split_hash(SegHash, SlotSplit) -> - Slot = SegHash band SlotSplit, - H0 = (SegHash bsr 4) band (?BAND_MASK), - H1 = (SegHash bsr 10) band (?BAND_MASK), +split_hash(Hash, SlotSplit) -> + Slot = Hash band SlotSplit, + H0 = (Hash bsr 4) band (?BAND_MASK), + H1 = (Hash bsr 10) band (?BAND_MASK), {Slot, H0, H1}. get_mask(H0, H1) -> @@ -87,7 +87,7 @@ get_mask(H0, H1) -> add_hashlist([], _S, S0, S1) -> IntSize = ?INTEGER_SIZE, <>; -add_hashlist([{TopHash, _ExtraHash}|T], SlotSplit, S0, S1) -> +add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1) -> {Slot, H0, H1} = split_hash(TopHash, SlotSplit), Mask = get_mask(H0, H1), case Slot of @@ -101,7 +101,7 @@ add_hashlist([], _S, S0, S1, S2, S3) -> IntSize = ?INTEGER_SIZE, <>; -add_hashlist([{TopHash, _ExtraHash}|T], SlotSplit, S0, S1, S2, S3) -> +add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1, S2, S3) -> {Slot, H0, H1} = split_hash(TopHash, SlotSplit), Mask = get_mask(H0, H1), case Slot of @@ -126,7 +126,7 @@ add_hashlist([], _S, S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, SA:IntSize/integer, SB:IntSize/integer, SC:IntSize/integer, SD:IntSize/integer, SE:IntSize/integer, SF:IntSize/integer>>; -add_hashlist([{TopHash, _ExtraHash}|T], +add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, SA, SB, SC, SD, SE, SF) -> 
From 3fd5260cd94dd076332aa885d17f8f5fe91d2d56 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Tue, 24 Oct 2017 15:15:15 +0100 Subject: [PATCH 07/14] Use lower fpr tinyblooms ... but maybe they're slower? --- src/leveled_codec.erl | 4 +- src/leveled_tinybloom.erl | 136 ++++++++++++-------------------------- 2 files changed, 44 insertions(+), 96 deletions(-) diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl index bd0c60d..0a190ae 100644 --- a/src/leveled_codec.erl +++ b/src/leveled_codec.erl @@ -83,12 +83,12 @@ -spec segment_hash(any()) -> {integer(), integer()}. %% @doc -%% Return two 16 bit integers - the segment ID and a second integer for spare +%% Return two integers - the segment ID and a second integer for spare %% entropy. The hashed should be used in blooms or indexes such that some %% speed can be gained if just the segment ID is known - but more can be %% gained should the extended hash (with the second element) is known segment_hash(Key) when is_binary(Key) -> - <> = + <> = crypto:hash(md5, Key), {SegmentID, ExtraHash}; segment_hash(Key) -> diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl index 3c21f3f..880e260 100644 --- a/src/leveled_tinybloom.erl +++ b/src/leveled_tinybloom.erl @@ -16,9 +16,12 @@ check_hash/2 ]). --define(BITS_PER_KEY, 8). % Must be 8 or 4 --define(INTEGER_SIZE, ?BITS_PER_KEY * 8). --define(BAND_MASK, ?INTEGER_SIZE - 1). +-define(BLOOM_SIZE_BITS, 128). + % Size of each bloom in bits + % If hash space is now split into 8 different blooms of this size there + % will be 8 bits per key. +-define(BLOOM_SIZE_BYTES, 16). % Bits divided by 8 +-define(BAND_MASK, ?BLOOM_SIZE_BITS - 1). 
%%%============================================================================ @@ -34,9 +37,8 @@ create_bloom(HashList) -> <<>>; L when L > 32 -> add_hashlist(HashList, - 15, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0); + 7, + 0, 0, 0, 0, 0, 0, 0, 0); L when L > 16 -> add_hashlist(HashList, 3, 0, 0, 0, 0); _ -> @@ -49,11 +51,12 @@ create_bloom(HashList) -> check_hash(_Hash, <<>>) -> false; check_hash({_SegHash, Hash}, BloomBin) -> - SlotSplit = (byte_size(BloomBin) div ?BITS_PER_KEY) - 1, - {Slot, H0, H1} = split_hash(Hash, SlotSplit), - Mask = get_mask(H0, H1), - Pos = Slot * ?BITS_PER_KEY, - IntSize = ?INTEGER_SIZE, + SlotSplit = (byte_size(BloomBin) div ?BLOOM_SIZE_BYTES) - 1, + {Slot, H0, H1, H2, H3} = split_hash(Hash, SlotSplit), + Mask = get_mask(H0, H1, H2, H3), + IntSize = ?BLOOM_SIZE_BITS, + Pos = Slot * ?BLOOM_SIZE_BYTES, + io:format("Pos ~w SlotSplit ~w BloomSize ~w~n", [Pos, SlotSplit, byte_size(BloomBin)]), <<_H:Pos/binary, CheckInt:IntSize/integer, _T/binary>> = BloomBin, case CheckInt band Mask of Mask -> @@ -69,27 +72,26 @@ check_hash({_SegHash, Hash}, BloomBin) -> split_hash(Hash, SlotSplit) -> Slot = Hash band SlotSplit, H0 = (Hash bsr 4) band (?BAND_MASK), - H1 = (Hash bsr 10) band (?BAND_MASK), - {Slot, H0, H1}. + H1 = (Hash bsr 11) band (?BAND_MASK), + H2 = (Hash bsr 18) band (?BAND_MASK), + H3 = (Hash bsr 25) band (?BAND_MASK), + {Slot, H0, H1, H2, H3}. -get_mask(H0, H1) -> - case H0 == H1 of - true -> - 1 bsl H0; - false -> - (1 bsl H0) + (1 bsl H1) - end. +get_mask(H0, H1, H2, H3) -> + lists:foldl(fun(H, Acc) -> Acc + (1 bsl H) end, + 0, + lists:usort([H0, H1, H2, H3])). 
%% This looks ugly and clunky, but in tests it was quicker than modifying an %% Erlang term like an array as it is passed around the loop add_hashlist([], _S, S0, S1) -> - IntSize = ?INTEGER_SIZE, + IntSize = ?BLOOM_SIZE_BITS, <>; add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1) -> - {Slot, H0, H1} = split_hash(TopHash, SlotSplit), - Mask = get_mask(H0, H1), + {Slot, H0, H1, H2, H3} = split_hash(TopHash, SlotSplit), + Mask = get_mask(H0, H1, H2, H3), case Slot of 0 -> add_hashlist(T, SlotSplit, S0 bor Mask, S1); @@ -98,12 +100,12 @@ add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1) -> end. add_hashlist([], _S, S0, S1, S2, S3) -> - IntSize = ?INTEGER_SIZE, + IntSize = ?BLOOM_SIZE_BITS, <>; add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1, S2, S3) -> - {Slot, H0, H1} = split_hash(TopHash, SlotSplit), - Mask = get_mask(H0, H1), + {Slot, H0, H1, H2, H3} = split_hash(TopHash, SlotSplit), + Mask = get_mask(H0, H1, H2, H3), case Slot of 0 -> add_hashlist(T, SlotSplit, S0 bor Mask, S1, S2, S3); @@ -115,104 +117,50 @@ add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1, S2, S3) -> add_hashlist(T, SlotSplit, S0, S1, S2, S3 bor Mask) end. 
-add_hashlist([], _S, S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, - SA, SB, SC, SD, SE, SF) -> - IntSize = ?INTEGER_SIZE, +add_hashlist([], _S, S0, S1, S2, S3, S4, S5, S6, S7) -> + IntSize = ?BLOOM_SIZE_BITS, <>; + S6:IntSize/integer, S7:IntSize/integer>>; add_hashlist([{_SegHash, TopHash}|T], SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, - SA, SB, SC, SD, SE, SF) -> - {Slot, H0, H1} = split_hash(TopHash, SlotSplit), - Mask = get_mask(H0, H1), + S0, S1, S2, S3, S4, S5, S6, S7) -> + {Slot, H0, H1, H2, H3} = split_hash(TopHash, SlotSplit), + Mask = get_mask(H0, H1, H2, H3), case Slot of 0 -> add_hashlist(T, SlotSplit, - S0 bor Mask, S1, S2, S3, S4, S5, S6, S7, S8, S9, - SA, SB, SC, SD, SE, SF); + S0 bor Mask, S1, S2, S3, S4, S5, S6, S7); 1 -> add_hashlist(T, SlotSplit, - S0, S1 bor Mask, S2, S3, S4, S5, S6, S7, S8, S9, - SA, SB, SC, SD, SE, SF); + S0, S1 bor Mask, S2, S3, S4, S5, S6, S7); 2 -> add_hashlist(T, SlotSplit, - S0, S1, S2 bor Mask, S3, S4, S5, S6, S7, S8, S9, - SA, SB, SC, SD, SE, SF); + S0, S1, S2 bor Mask, S3, S4, S5, S6, S7); 3 -> add_hashlist(T, SlotSplit, - S0, S1, S2, S3 bor Mask, S4, S5, S6, S7, S8, S9, - SA, SB, SC, SD, SE, SF); + S0, S1, S2, S3 bor Mask, S4, S5, S6, S7); 4 -> add_hashlist(T, SlotSplit, - S0, S1, S2, S3, S4 bor Mask, S5, S6, S7, S8, S9, - SA, SB, SC, SD, SE, SF); + S0, S1, S2, S3, S4 bor Mask, S5, S6, S7); 5 -> add_hashlist(T, SlotSplit, - S0, S1, S2, S3, S4, S5 bor Mask, S6, S7, S8, S9, - SA, SB, SC, SD, SE, SF); + S0, S1, S2, S3, S4, S5 bor Mask, S6, S7); 6 -> add_hashlist(T, SlotSplit, - S0, S1, S2, S3, S4, S5, S6 bor Mask, S7, S8, S9, - SA, SB, SC, SD, SE, SF); + S0, S1, S2, S3, S4, S5, S6 bor Mask, S7); 7 -> add_hashlist(T, SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7 bor Mask, S8, S9, - SA, SB, SC, SD, SE, SF); - 8 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, S8 bor Mask, S9, - SA, SB, SC, SD, SE, SF); - 9 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, S8, S9 bor Mask, - SA, SB, 
SC, SD, SE, SF); - 10 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, - SA bor Mask, SB, SC, SD, SE, SF); - 11 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, - SA, SB bor Mask, SC, SD, SE, SF); - 12 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, - SA, SB, SC bor Mask, SD, SE, SF); - 13 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, - SA, SB, SC, SD bor Mask, SE, SF); - 14 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, - SA, SB, SC, SD, SE bor Mask, SF); - 15 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, - SA, SB, SC, SD, SE, SF bor Mask) + S0, S1, S2, S3, S4, S5, S6, S7 bor Mask) end. From 29a2d9fc35bcb5567f81608771f660fd4deae83b Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Tue, 24 Oct 2017 15:16:25 +0100 Subject: [PATCH 08/14] Revert "Use lower fpr tinyblooms" This reverts commit 3fd5260cd94dd076332aa885d17f8f5fe91d2d56. --- src/leveled_codec.erl | 4 +- src/leveled_tinybloom.erl | 136 ++++++++++++++++++++++++++------------ 2 files changed, 96 insertions(+), 44 deletions(-) diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl index 0a190ae..bd0c60d 100644 --- a/src/leveled_codec.erl +++ b/src/leveled_codec.erl @@ -83,12 +83,12 @@ -spec segment_hash(any()) -> {integer(), integer()}. %% @doc -%% Return two integers - the segment ID and a second integer for spare +%% Return two 16 bit integers - the segment ID and a second integer for spare %% entropy. 
The hashed should be used in blooms or indexes such that some %% speed can be gained if just the segment ID is known - but more can be %% gained should the extended hash (with the second element) is known segment_hash(Key) when is_binary(Key) -> - <> = + <> = crypto:hash(md5, Key), {SegmentID, ExtraHash}; segment_hash(Key) -> diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl index 880e260..3c21f3f 100644 --- a/src/leveled_tinybloom.erl +++ b/src/leveled_tinybloom.erl @@ -16,12 +16,9 @@ check_hash/2 ]). --define(BLOOM_SIZE_BITS, 128). - % Size of each bloom in bits - % If hash space is now split into 8 different blooms of this size there - % will be 8 bits per key. --define(BLOOM_SIZE_BYTES, 16). % Bits divided by 8 --define(BAND_MASK, ?BLOOM_SIZE_BITS - 1). +-define(BITS_PER_KEY, 8). % Must be 8 or 4 +-define(INTEGER_SIZE, ?BITS_PER_KEY * 8). +-define(BAND_MASK, ?INTEGER_SIZE - 1). %%%============================================================================ @@ -37,8 +34,9 @@ create_bloom(HashList) -> <<>>; L when L > 32 -> add_hashlist(HashList, - 7, - 0, 0, 0, 0, 0, 0, 0, 0); + 15, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0); L when L > 16 -> add_hashlist(HashList, 3, 0, 0, 0, 0); _ -> @@ -51,12 +49,11 @@ create_bloom(HashList) -> check_hash(_Hash, <<>>) -> false; check_hash({_SegHash, Hash}, BloomBin) -> - SlotSplit = (byte_size(BloomBin) div ?BLOOM_SIZE_BYTES) - 1, - {Slot, H0, H1, H2, H3} = split_hash(Hash, SlotSplit), - Mask = get_mask(H0, H1, H2, H3), - IntSize = ?BLOOM_SIZE_BITS, - Pos = Slot * ?BLOOM_SIZE_BYTES, - io:format("Pos ~w SlotSplit ~w BloomSize ~w~n", [Pos, SlotSplit, byte_size(BloomBin)]), + SlotSplit = (byte_size(BloomBin) div ?BITS_PER_KEY) - 1, + {Slot, H0, H1} = split_hash(Hash, SlotSplit), + Mask = get_mask(H0, H1), + Pos = Slot * ?BITS_PER_KEY, + IntSize = ?INTEGER_SIZE, <<_H:Pos/binary, CheckInt:IntSize/integer, _T/binary>> = BloomBin, case CheckInt band Mask of Mask -> @@ -72,26 +69,27 @@ check_hash({_SegHash, 
Hash}, BloomBin) -> split_hash(Hash, SlotSplit) -> Slot = Hash band SlotSplit, H0 = (Hash bsr 4) band (?BAND_MASK), - H1 = (Hash bsr 11) band (?BAND_MASK), - H2 = (Hash bsr 18) band (?BAND_MASK), - H3 = (Hash bsr 25) band (?BAND_MASK), - {Slot, H0, H1, H2, H3}. + H1 = (Hash bsr 10) band (?BAND_MASK), + {Slot, H0, H1}. -get_mask(H0, H1, H2, H3) -> - lists:foldl(fun(H, Acc) -> Acc + (1 bsl H) end, - 0, - lists:usort([H0, H1, H2, H3])). +get_mask(H0, H1) -> + case H0 == H1 of + true -> + 1 bsl H0; + false -> + (1 bsl H0) + (1 bsl H1) + end. %% This looks ugly and clunky, but in tests it was quicker than modifying an %% Erlang term like an array as it is passed around the loop add_hashlist([], _S, S0, S1) -> - IntSize = ?BLOOM_SIZE_BITS, + IntSize = ?INTEGER_SIZE, <>; add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1) -> - {Slot, H0, H1, H2, H3} = split_hash(TopHash, SlotSplit), - Mask = get_mask(H0, H1, H2, H3), + {Slot, H0, H1} = split_hash(TopHash, SlotSplit), + Mask = get_mask(H0, H1), case Slot of 0 -> add_hashlist(T, SlotSplit, S0 bor Mask, S1); @@ -100,12 +98,12 @@ add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1) -> end. add_hashlist([], _S, S0, S1, S2, S3) -> - IntSize = ?BLOOM_SIZE_BITS, + IntSize = ?INTEGER_SIZE, <>; add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1, S2, S3) -> - {Slot, H0, H1, H2, H3} = split_hash(TopHash, SlotSplit), - Mask = get_mask(H0, H1, H2, H3), + {Slot, H0, H1} = split_hash(TopHash, SlotSplit), + Mask = get_mask(H0, H1), case Slot of 0 -> add_hashlist(T, SlotSplit, S0 bor Mask, S1, S2, S3); @@ -117,50 +115,104 @@ add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1, S2, S3) -> add_hashlist(T, SlotSplit, S0, S1, S2, S3 bor Mask) end. 
-add_hashlist([], _S, S0, S1, S2, S3, S4, S5, S6, S7) -> - IntSize = ?BLOOM_SIZE_BITS, +add_hashlist([], _S, S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, + SA, SB, SC, SD, SE, SF) -> + IntSize = ?INTEGER_SIZE, <>; + S6:IntSize/integer, S7:IntSize/integer, + S8:IntSize/integer, S9:IntSize/integer, + SA:IntSize/integer, SB:IntSize/integer, + SC:IntSize/integer, SD:IntSize/integer, + SE:IntSize/integer, SF:IntSize/integer>>; add_hashlist([{_SegHash, TopHash}|T], SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7) -> - {Slot, H0, H1, H2, H3} = split_hash(TopHash, SlotSplit), - Mask = get_mask(H0, H1, H2, H3), + S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, + SA, SB, SC, SD, SE, SF) -> + {Slot, H0, H1} = split_hash(TopHash, SlotSplit), + Mask = get_mask(H0, H1), case Slot of 0 -> add_hashlist(T, SlotSplit, - S0 bor Mask, S1, S2, S3, S4, S5, S6, S7); + S0 bor Mask, S1, S2, S3, S4, S5, S6, S7, S8, S9, + SA, SB, SC, SD, SE, SF); 1 -> add_hashlist(T, SlotSplit, - S0, S1 bor Mask, S2, S3, S4, S5, S6, S7); + S0, S1 bor Mask, S2, S3, S4, S5, S6, S7, S8, S9, + SA, SB, SC, SD, SE, SF); 2 -> add_hashlist(T, SlotSplit, - S0, S1, S2 bor Mask, S3, S4, S5, S6, S7); + S0, S1, S2 bor Mask, S3, S4, S5, S6, S7, S8, S9, + SA, SB, SC, SD, SE, SF); 3 -> add_hashlist(T, SlotSplit, - S0, S1, S2, S3 bor Mask, S4, S5, S6, S7); + S0, S1, S2, S3 bor Mask, S4, S5, S6, S7, S8, S9, + SA, SB, SC, SD, SE, SF); 4 -> add_hashlist(T, SlotSplit, - S0, S1, S2, S3, S4 bor Mask, S5, S6, S7); + S0, S1, S2, S3, S4 bor Mask, S5, S6, S7, S8, S9, + SA, SB, SC, SD, SE, SF); 5 -> add_hashlist(T, SlotSplit, - S0, S1, S2, S3, S4, S5 bor Mask, S6, S7); + S0, S1, S2, S3, S4, S5 bor Mask, S6, S7, S8, S9, + SA, SB, SC, SD, SE, SF); 6 -> add_hashlist(T, SlotSplit, - S0, S1, S2, S3, S4, S5, S6 bor Mask, S7); + S0, S1, S2, S3, S4, S5, S6 bor Mask, S7, S8, S9, + SA, SB, SC, SD, SE, SF); 7 -> add_hashlist(T, SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7 bor Mask) + S0, S1, S2, S3, S4, S5, S6, S7 bor Mask, S8, S9, + SA, SB, SC, SD, SE, SF); + 8 
-> + add_hashlist(T, + SlotSplit, + S0, S1, S2, S3, S4, S5, S6, S7, S8 bor Mask, S9, + SA, SB, SC, SD, SE, SF); + 9 -> + add_hashlist(T, + SlotSplit, + S0, S1, S2, S3, S4, S5, S6, S7, S8, S9 bor Mask, + SA, SB, SC, SD, SE, SF); + 10 -> + add_hashlist(T, + SlotSplit, + S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, + SA bor Mask, SB, SC, SD, SE, SF); + 11 -> + add_hashlist(T, + SlotSplit, + S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, + SA, SB bor Mask, SC, SD, SE, SF); + 12 -> + add_hashlist(T, + SlotSplit, + S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, + SA, SB, SC bor Mask, SD, SE, SF); + 13 -> + add_hashlist(T, + SlotSplit, + S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, + SA, SB, SC, SD bor Mask, SE, SF); + 14 -> + add_hashlist(T, + SlotSplit, + S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, + SA, SB, SC, SD, SE bor Mask, SF); + 15 -> + add_hashlist(T, + SlotSplit, + S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, + SA, SB, SC, SD, SE, SF bor Mask) end. From d5bcccf0ecaff4492657020a68a7b587bd3bbfa8 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Tue, 24 Oct 2017 15:20:59 +0100 Subject: [PATCH 09/14] Check fpr with 4 keys Up key count in bloom --- src/leveled_tinybloom.erl | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl index 3c21f3f..7ea905d 100644 --- a/src/leveled_tinybloom.erl +++ b/src/leveled_tinybloom.erl @@ -50,8 +50,8 @@ check_hash(_Hash, <<>>) -> false; check_hash({_SegHash, Hash}, BloomBin) -> SlotSplit = (byte_size(BloomBin) div ?BITS_PER_KEY) - 1, - {Slot, H0, H1} = split_hash(Hash, SlotSplit), - Mask = get_mask(H0, H1), + {Slot, H0, H1, H2, H3} = split_hash(Hash, SlotSplit), + Mask = get_mask(H0, H1, H2, H3), Pos = Slot * ?BITS_PER_KEY, IntSize = ?INTEGER_SIZE, <<_H:Pos/binary, CheckInt:IntSize/integer, _T/binary>> = BloomBin, @@ -70,15 +70,12 @@ split_hash(Hash, SlotSplit) -> Slot = Hash band SlotSplit, H0 = (Hash bsr 4) band (?BAND_MASK), H1 = (Hash bsr 10) band (?BAND_MASK), - 
{Slot, H0, H1}. + H2 = (Hash bsr 16) band (?BAND_MASK), + H3 = (Hash bsr 24) band (?BAND_MASK), + {Slot, H0, H1, H2, H3}. -get_mask(H0, H1) -> - case H0 == H1 of - true -> - 1 bsl H0; - false -> - (1 bsl H0) + (1 bsl H1) - end. +get_mask(H0, H1, H2, H3) -> + (1 bsl H0) bor (1 bsl H1) bor (1 bsl H2) bor (1 bsl H3). %% This looks ugly and clunky, but in tests it was quicker than modifying an @@ -88,8 +85,8 @@ add_hashlist([], _S, S0, S1) -> IntSize = ?INTEGER_SIZE, <>; add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1) -> - {Slot, H0, H1} = split_hash(TopHash, SlotSplit), - Mask = get_mask(H0, H1), + {Slot, H0, H1, H2, H3} = split_hash(TopHash, SlotSplit), + Mask = get_mask(H0, H1, H2, H3), case Slot of 0 -> add_hashlist(T, SlotSplit, S0 bor Mask, S1); @@ -102,8 +99,8 @@ add_hashlist([], _S, S0, S1, S2, S3) -> <>; add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1, S2, S3) -> - {Slot, H0, H1} = split_hash(TopHash, SlotSplit), - Mask = get_mask(H0, H1), + {Slot, H0, H1, H2, H3} = split_hash(TopHash, SlotSplit), + Mask = get_mask(H0, H1, H2, H3), case Slot of 0 -> add_hashlist(T, SlotSplit, S0 bor Mask, S1, S2, S3); @@ -130,8 +127,8 @@ add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, SA, SB, SC, SD, SE, SF) -> - {Slot, H0, H1} = split_hash(TopHash, SlotSplit), - Mask = get_mask(H0, H1), + {Slot, H0, H1, H2, H3} = split_hash(TopHash, SlotSplit), + Mask = get_mask(H0, H1, H2, H3), case Slot of 0 -> add_hashlist(T, From 74c28b52c931c39dfe78aa5b8ca8b0f5a9c49ade Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Tue, 24 Oct 2017 15:21:07 +0100 Subject: [PATCH 10/14] Revert "Check fpr with 4 keys" This reverts commit d5bcccf0ecaff4492657020a68a7b587bd3bbfa8. 
--- src/leveled_tinybloom.erl | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl index 7ea905d..3c21f3f 100644 --- a/src/leveled_tinybloom.erl +++ b/src/leveled_tinybloom.erl @@ -50,8 +50,8 @@ check_hash(_Hash, <<>>) -> false; check_hash({_SegHash, Hash}, BloomBin) -> SlotSplit = (byte_size(BloomBin) div ?BITS_PER_KEY) - 1, - {Slot, H0, H1, H2, H3} = split_hash(Hash, SlotSplit), - Mask = get_mask(H0, H1, H2, H3), + {Slot, H0, H1} = split_hash(Hash, SlotSplit), + Mask = get_mask(H0, H1), Pos = Slot * ?BITS_PER_KEY, IntSize = ?INTEGER_SIZE, <<_H:Pos/binary, CheckInt:IntSize/integer, _T/binary>> = BloomBin, @@ -70,12 +70,15 @@ split_hash(Hash, SlotSplit) -> Slot = Hash band SlotSplit, H0 = (Hash bsr 4) band (?BAND_MASK), H1 = (Hash bsr 10) band (?BAND_MASK), - H2 = (Hash bsr 16) band (?BAND_MASK), - H3 = (Hash bsr 24) band (?BAND_MASK), - {Slot, H0, H1, H2, H3}. + {Slot, H0, H1}. -get_mask(H0, H1, H2, H3) -> - (1 bsl H0) bor (1 bsl H1) bor (1 bsl H2) bor (1 bsl H3). +get_mask(H0, H1) -> + case H0 == H1 of + true -> + 1 bsl H0; + false -> + (1 bsl H0) + (1 bsl H1) + end. 
%% This looks ugly and clunky, but in tests it was quicker than modifying an @@ -85,8 +88,8 @@ add_hashlist([], _S, S0, S1) -> IntSize = ?INTEGER_SIZE, <>; add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1) -> - {Slot, H0, H1, H2, H3} = split_hash(TopHash, SlotSplit), - Mask = get_mask(H0, H1, H2, H3), + {Slot, H0, H1} = split_hash(TopHash, SlotSplit), + Mask = get_mask(H0, H1), case Slot of 0 -> add_hashlist(T, SlotSplit, S0 bor Mask, S1); @@ -99,8 +102,8 @@ add_hashlist([], _S, S0, S1, S2, S3) -> <>; add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1, S2, S3) -> - {Slot, H0, H1, H2, H3} = split_hash(TopHash, SlotSplit), - Mask = get_mask(H0, H1, H2, H3), + {Slot, H0, H1} = split_hash(TopHash, SlotSplit), + Mask = get_mask(H0, H1), case Slot of 0 -> add_hashlist(T, SlotSplit, S0 bor Mask, S1, S2, S3); @@ -127,8 +130,8 @@ add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, SA, SB, SC, SD, SE, SF) -> - {Slot, H0, H1, H2, H3} = split_hash(TopHash, SlotSplit), - Mask = get_mask(H0, H1, H2, H3), + {Slot, H0, H1} = split_hash(TopHash, SlotSplit), + Mask = get_mask(H0, H1), case Slot of 0 -> add_hashlist(T, From f08faf6432d52f094795700a175e8f7b85c82e01 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Tue, 24 Oct 2017 15:22:12 +0100 Subject: [PATCH 11/14] Revert "Revert "Check fpr with 4 keys"" This reverts commit 74c28b52c931c39dfe78aa5b8ca8b0f5a9c49ade. 
--- src/leveled_tinybloom.erl | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl index 3c21f3f..7ea905d 100644 --- a/src/leveled_tinybloom.erl +++ b/src/leveled_tinybloom.erl @@ -50,8 +50,8 @@ check_hash(_Hash, <<>>) -> false; check_hash({_SegHash, Hash}, BloomBin) -> SlotSplit = (byte_size(BloomBin) div ?BITS_PER_KEY) - 1, - {Slot, H0, H1} = split_hash(Hash, SlotSplit), - Mask = get_mask(H0, H1), + {Slot, H0, H1, H2, H3} = split_hash(Hash, SlotSplit), + Mask = get_mask(H0, H1, H2, H3), Pos = Slot * ?BITS_PER_KEY, IntSize = ?INTEGER_SIZE, <<_H:Pos/binary, CheckInt:IntSize/integer, _T/binary>> = BloomBin, @@ -70,15 +70,12 @@ split_hash(Hash, SlotSplit) -> Slot = Hash band SlotSplit, H0 = (Hash bsr 4) band (?BAND_MASK), H1 = (Hash bsr 10) band (?BAND_MASK), - {Slot, H0, H1}. + H2 = (Hash bsr 16) band (?BAND_MASK), + H3 = (Hash bsr 24) band (?BAND_MASK), + {Slot, H0, H1, H2, H3}. -get_mask(H0, H1) -> - case H0 == H1 of - true -> - 1 bsl H0; - false -> - (1 bsl H0) + (1 bsl H1) - end. +get_mask(H0, H1, H2, H3) -> + (1 bsl H0) bor (1 bsl H1) bor (1 bsl H2) bor (1 bsl H3). 
%% This looks ugly and clunky, but in tests it was quicker than modifying an @@ -88,8 +85,8 @@ add_hashlist([], _S, S0, S1) -> IntSize = ?INTEGER_SIZE, <>; add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1) -> - {Slot, H0, H1} = split_hash(TopHash, SlotSplit), - Mask = get_mask(H0, H1), + {Slot, H0, H1, H2, H3} = split_hash(TopHash, SlotSplit), + Mask = get_mask(H0, H1, H2, H3), case Slot of 0 -> add_hashlist(T, SlotSplit, S0 bor Mask, S1); @@ -102,8 +99,8 @@ add_hashlist([], _S, S0, S1, S2, S3) -> <>; add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1, S2, S3) -> - {Slot, H0, H1} = split_hash(TopHash, SlotSplit), - Mask = get_mask(H0, H1), + {Slot, H0, H1, H2, H3} = split_hash(TopHash, SlotSplit), + Mask = get_mask(H0, H1, H2, H3), case Slot of 0 -> add_hashlist(T, SlotSplit, S0 bor Mask, S1, S2, S3); @@ -130,8 +127,8 @@ add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, SA, SB, SC, SD, SE, SF) -> - {Slot, H0, H1} = split_hash(TopHash, SlotSplit), - Mask = get_mask(H0, H1), + {Slot, H0, H1, H2, H3} = split_hash(TopHash, SlotSplit), + Mask = get_mask(H0, H1, H2, H3), case Slot of 0 -> add_hashlist(T, From 6af1d3b0031c1ed1bf633b2001077cd88674ebb2 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Tue, 24 Oct 2017 15:42:53 +0100 Subject: [PATCH 12/14] Use more keys in bloom Use 4 keys in the bloom (which is closer to optimal size). This should halve the fpr - as we cna now use the large ExtraHash rather than being constrained by the SegmentHash here. 
--- src/leveled_codec.erl | 2 +- src/leveled_tinybloom.erl | 115 ++++++++++---------------------------- 2 files changed, 31 insertions(+), 86 deletions(-) diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl index bd0c60d..53bc81a 100644 --- a/src/leveled_codec.erl +++ b/src/leveled_codec.erl @@ -88,7 +88,7 @@ %% speed can be gained if just the segment ID is known - but more can be %% gained should the extended hash (with the second element) is known segment_hash(Key) when is_binary(Key) -> - <> = + <> = crypto:hash(md5, Key), {SegmentID, ExtraHash}; segment_hash(Key) -> diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl index 7ea905d..5513d31 100644 --- a/src/leveled_tinybloom.erl +++ b/src/leveled_tinybloom.erl @@ -16,8 +16,8 @@ check_hash/2 ]). --define(BITS_PER_KEY, 8). % Must be 8 or 4 --define(INTEGER_SIZE, ?BITS_PER_KEY * 8). +-define(BLOOM_SIZE_BYTES, 16). +-define(INTEGER_SIZE, 128). -define(BAND_MASK, ?INTEGER_SIZE - 1). @@ -34,9 +34,8 @@ create_bloom(HashList) -> <<>>; L when L > 32 -> add_hashlist(HashList, - 15, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0); + 7, + 0, 0, 0, 0, 0, 0, 0, 0); L when L > 16 -> add_hashlist(HashList, 3, 0, 0, 0, 0); _ -> @@ -49,10 +48,10 @@ create_bloom(HashList) -> check_hash(_Hash, <<>>) -> false; check_hash({_SegHash, Hash}, BloomBin) -> - SlotSplit = (byte_size(BloomBin) div ?BITS_PER_KEY) - 1, - {Slot, H0, H1, H2, H3} = split_hash(Hash, SlotSplit), - Mask = get_mask(H0, H1, H2, H3), - Pos = Slot * ?BITS_PER_KEY, + SlotSplit = (byte_size(BloomBin) div ?BLOOM_SIZE_BYTES) - 1, + {Slot, Hashes} = split_hash(Hash, SlotSplit), + Mask = get_mask(Hashes), + Pos = Slot * ?BLOOM_SIZE_BYTES, IntSize = ?INTEGER_SIZE, <<_H:Pos/binary, CheckInt:IntSize/integer, _T/binary>> = BloomBin, case CheckInt band Mask of @@ -69,12 +68,12 @@ check_hash({_SegHash, Hash}, BloomBin) -> split_hash(Hash, SlotSplit) -> Slot = Hash band SlotSplit, H0 = (Hash bsr 4) band (?BAND_MASK), - H1 = (Hash bsr 10) band (?BAND_MASK), 
- H2 = (Hash bsr 16) band (?BAND_MASK), - H3 = (Hash bsr 24) band (?BAND_MASK), - {Slot, H0, H1, H2, H3}. + H1 = (Hash bsr 11) band (?BAND_MASK), + H2 = (Hash bsr 18) band (?BAND_MASK), + H3 = (Hash bsr 25) band (?BAND_MASK), + {Slot, [H0, H1, H2, H3]}. -get_mask(H0, H1, H2, H3) -> +get_mask([H0, H1, H2, H3]) -> (1 bsl H0) bor (1 bsl H1) bor (1 bsl H2) bor (1 bsl H3). @@ -85,8 +84,8 @@ add_hashlist([], _S, S0, S1) -> IntSize = ?INTEGER_SIZE, <>; add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1) -> - {Slot, H0, H1, H2, H3} = split_hash(TopHash, SlotSplit), - Mask = get_mask(H0, H1, H2, H3), + {Slot, Hashes} = split_hash(TopHash, SlotSplit), + Mask = get_mask(Hashes), case Slot of 0 -> add_hashlist(T, SlotSplit, S0 bor Mask, S1); @@ -99,8 +98,8 @@ add_hashlist([], _S, S0, S1, S2, S3) -> <>; add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1, S2, S3) -> - {Slot, H0, H1, H2, H3} = split_hash(TopHash, SlotSplit), - Mask = get_mask(H0, H1, H2, H3), + {Slot, Hashes} = split_hash(TopHash, SlotSplit), + Mask = get_mask(Hashes), case Slot of 0 -> add_hashlist(T, SlotSplit, S0 bor Mask, S1, S2, S3); @@ -112,104 +111,50 @@ add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1, S2, S3) -> add_hashlist(T, SlotSplit, S0, S1, S2, S3 bor Mask) end. 
-add_hashlist([], _S, S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, - SA, SB, SC, SD, SE, SF) -> +add_hashlist([], _S, S0, S1, S2, S3, S4, S5, S6, S7) -> IntSize = ?INTEGER_SIZE, <>; + S6:IntSize/integer, S7:IntSize/integer>>; add_hashlist([{_SegHash, TopHash}|T], SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, - SA, SB, SC, SD, SE, SF) -> - {Slot, H0, H1, H2, H3} = split_hash(TopHash, SlotSplit), - Mask = get_mask(H0, H1, H2, H3), + S0, S1, S2, S3, S4, S5, S6, S7) -> + {Slot, Hashes} = split_hash(TopHash, SlotSplit), + Mask = get_mask(Hashes), case Slot of 0 -> add_hashlist(T, SlotSplit, - S0 bor Mask, S1, S2, S3, S4, S5, S6, S7, S8, S9, - SA, SB, SC, SD, SE, SF); + S0 bor Mask, S1, S2, S3, S4, S5, S6, S7); 1 -> add_hashlist(T, SlotSplit, - S0, S1 bor Mask, S2, S3, S4, S5, S6, S7, S8, S9, - SA, SB, SC, SD, SE, SF); + S0, S1 bor Mask, S2, S3, S4, S5, S6, S7); 2 -> add_hashlist(T, SlotSplit, - S0, S1, S2 bor Mask, S3, S4, S5, S6, S7, S8, S9, - SA, SB, SC, SD, SE, SF); + S0, S1, S2 bor Mask, S3, S4, S5, S6, S7); 3 -> add_hashlist(T, SlotSplit, - S0, S1, S2, S3 bor Mask, S4, S5, S6, S7, S8, S9, - SA, SB, SC, SD, SE, SF); + S0, S1, S2, S3 bor Mask, S4, S5, S6, S7); 4 -> add_hashlist(T, SlotSplit, - S0, S1, S2, S3, S4 bor Mask, S5, S6, S7, S8, S9, - SA, SB, SC, SD, SE, SF); + S0, S1, S2, S3, S4 bor Mask, S5, S6, S7); 5 -> add_hashlist(T, SlotSplit, - S0, S1, S2, S3, S4, S5 bor Mask, S6, S7, S8, S9, - SA, SB, SC, SD, SE, SF); + S0, S1, S2, S3, S4, S5 bor Mask, S6, S7); 6 -> add_hashlist(T, SlotSplit, - S0, S1, S2, S3, S4, S5, S6 bor Mask, S7, S8, S9, - SA, SB, SC, SD, SE, SF); + S0, S1, S2, S3, S4, S5, S6 bor Mask, S7); 7 -> add_hashlist(T, SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7 bor Mask, S8, S9, - SA, SB, SC, SD, SE, SF); - 8 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, S8 bor Mask, S9, - SA, SB, SC, SD, SE, SF); - 9 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, S8, S9 bor Mask, - SA, SB, SC, SD, SE, SF); - 10 -> - 
add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, - SA bor Mask, SB, SC, SD, SE, SF); - 11 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, - SA, SB bor Mask, SC, SD, SE, SF); - 12 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, - SA, SB, SC bor Mask, SD, SE, SF); - 13 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, - SA, SB, SC, SD bor Mask, SE, SF); - 14 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, - SA, SB, SC, SD, SE bor Mask, SF); - 15 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, - SA, SB, SC, SD, SE, SF bor Mask) + S0, S1, S2, S3, S4, S5, S6, S7 bor Mask) end. From a22610cee7defe17d71bb971214580f0537a42d1 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Tue, 24 Oct 2017 17:58:33 +0100 Subject: [PATCH 13/14] Experiment with alternate slot size Improves fpr. Does this change anything in volume tests? --- src/leveled_sst.erl | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 3ec82a1..90bb9c6 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -64,14 +64,13 @@ -include("include/leveled.hrl"). --define(MAX_SLOTS, 256). --define(LOOK_SLOTSIZE, 128). % This is not configurable --define(LOOK_BLOCKSIZE, {24, 32}). +-define(MAX_SLOTS, 300). +-define(LOOK_SLOTSIZE, 104). % This is not configurable +-define(LOOK_BLOCKSIZE, {20, 24}). -define(NOLOOK_SLOTSIZE, 256). -define(NOLOOK_BLOCKSIZE, {56, 32}). -define(COMPRESSION_LEVEL, 1). -define(BINARY_SETTINGS, [{compressed, ?COMPRESSION_LEVEL}]). -% -define(LEVEL_BLOOM_BITS, [{0, 8}, {1, 10}, {2, 8}, {default, 6}]). -define(MERGE_SCANWIDTH, 16). -define(DISCARD_EXT, ".discarded"). -define(DELETE_TIMEOUT, 10000). 
@@ -1539,7 +1538,7 @@ indexed_list_test() -> io:format(user, "~nIndexed list timing test:~n", []), N = 150, KVL0 = lists:ukeysort(1, generate_randomkeys(1, N, 1, 4)), - KVL1 = lists:sublist(KVL0, 128), + KVL1 = lists:sublist(KVL0, ?LOOK_SLOTSIZE), SW0 = os:timestamp(), @@ -1605,9 +1604,11 @@ indexed_list_mixedkeys2_test() -> KVL1). indexed_list_allindexkeys_test() -> - Keys = lists:sublist(lists:ukeysort(1, generate_indexkeys(150)), 128), + Keys = lists:sublist(lists:ukeysort(1, generate_indexkeys(150)), + ?LOOK_SLOTSIZE), {PosBinIndex1, FullBin, _HL, _LK} = generate_binary_slot(lookup, Keys), - ?assertMatch(<<_BL:24/binary, 127:8/integer>>, PosBinIndex1), + EmptySlotSize = ?LOOK_SLOTSIZE - 1, + ?assertMatch(<<_BL:24/binary, EmptySlotSize:8/integer>>, PosBinIndex1), % SW = os:timestamp(), BinToList = binaryslot_tolist(FullBin), % io:format(user, @@ -1630,9 +1631,11 @@ indexed_list_allindexkeys_nolookup_test() -> ?assertMatch(Keys, binaryslot_trimmedlist(FullBin, all, all)). indexed_list_allindexkeys_trimmed_test() -> - Keys = lists:sublist(lists:ukeysort(1, generate_indexkeys(150)), 128), + Keys = lists:sublist(lists:ukeysort(1, generate_indexkeys(150)), + ?LOOK_SLOTSIZE), {PosBinIndex1, FullBin, _HL, _LK} = generate_binary_slot(lookup, Keys), - ?assertMatch(<<_BL:24/binary, 127:8/integer>>, PosBinIndex1), + EmptySlotSize = ?LOOK_SLOTSIZE - 1, + ?assertMatch(<<_BL:24/binary, EmptySlotSize:8/integer>>, PosBinIndex1), ?assertMatch(Keys, binaryslot_trimmedlist(FullBin, {i, "Bucket", @@ -1657,9 +1660,9 @@ indexed_list_allindexkeys_trimmed_test() -> ?assertMatch(11, length(O2)), ?assertMatch(R2, O2), - {SK3, _} = lists:nth(127, Keys), - {EK3, _} = lists:nth(128, Keys), - R3 = lists:sublist(Keys, 127, 2), + {SK3, _} = lists:nth(?LOOK_SLOTSIZE - 1, Keys), + {EK3, _} = lists:nth(?LOOK_SLOTSIZE, Keys), + R3 = lists:sublist(Keys, ?LOOK_SLOTSIZE - 1, 2), O3 = binaryslot_trimmedlist(FullBin, SK3, EK3), ?assertMatch(2, length(O3)), ?assertMatch(R3, O3). 
From e24eaf655bfa6a4ad85b6981b590b9606adb88e3 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Wed, 25 Oct 2017 08:59:34 +0100 Subject: [PATCH 14/14] Revert to previous standard slot size But maintain configurability of slot size to maximum --- src/leveled_sst.erl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 90bb9c6..7b8633b 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -64,11 +64,11 @@ -include("include/leveled.hrl"). --define(MAX_SLOTS, 300). --define(LOOK_SLOTSIZE, 104). % This is not configurable --define(LOOK_BLOCKSIZE, {20, 24}). +-define(MAX_SLOTS, 256). +-define(LOOK_SLOTSIZE, 128). % Maximum of 128 +-define(LOOK_BLOCKSIZE, {24, 32}). % 4x + y = ?LOOK_SLOTSIZE -define(NOLOOK_SLOTSIZE, 256). --define(NOLOOK_BLOCKSIZE, {56, 32}). +-define(NOLOOK_BLOCKSIZE, {56, 32}). % 4x + y = ?NOLOOK_SLOTSIZE -define(COMPRESSION_LEVEL, 1). -define(BINARY_SETTINGS, [{compressed, ?COMPRESSION_LEVEL}]). -define(MERGE_SCANWIDTH, 16).