Merge pull request #98 from martinsumner/mas-segid-cryptohash

Mas segid cryptohash
This commit is contained in:
Martin Sumner 2017-10-25 10:02:04 +01:00 committed by GitHub
commit 7763df3cef
12 changed files with 231 additions and 201 deletions

View file

@ -947,7 +947,7 @@ fetch_head(Key, Penciller, LedgerCache) ->
[{Key, Head}] ->
Head;
[] ->
Hash = leveled_codec:magic_hash(Key),
Hash = leveled_codec:segment_hash(Key),
case leveled_penciller:pcl_fetch(Penciller, Key, Hash) of
{Key, Head} ->
maybe_longrunning(SW, pcl_head),

View file

@ -65,6 +65,7 @@
integer_now/0,
riak_extract_metadata/2,
magic_hash/1,
segment_hash/1,
to_lookup/1]).
-define(V1_VERS, 1).
@ -79,6 +80,20 @@
integer()|null, % Hash of vclock - non-exportable
integer()}. % Size in bytes of real object
-spec segment_hash(any()) -> {integer(), integer()}.
%% @doc
%% Hash the key with md5 and split the digest into a 16-bit segment ID and a
%% 32-bit extra hash for spare entropy.  The pair should be used in blooms or
%% indexes such that some speed can be gained if just the segment ID is
%% known - but more can be gained when the extended hash (the second element)
%% is also known.  Non-binary keys are first converted via term_to_binary/1.
segment_hash(Key) ->
    BinKey =
        case is_binary(Key) of
            true ->
                Key;
            false ->
                term_to_binary(Key)
        end,
    <<SegmentID:16/integer, ExtraHash:32/integer, _Tail/binary>> =
        crypto:hash(md5, BinKey),
    {SegmentID, ExtraHash}.
-spec magic_hash(any()) -> integer().
%% @doc
%% Use DJ Bernstein magic hash function. Note, this is more expensive than
@ -87,10 +102,6 @@
%% Hash function contains mysterious constants, some explanation here as to
%% what they are -
%% http://stackoverflow.com/questions/10696223/reason-for-5381-number-in-djb-hash-function
magic_hash({?RIAK_TAG, Bucket, Key, _SubKey}) ->
magic_hash({Bucket, Key});
magic_hash({?STD_TAG, Bucket, Key, _SubKey}) ->
magic_hash({Bucket, Key});
magic_hash({binary, BinaryKey}) ->
H = 5381,
hash1(H, BinaryKey) band 16#FFFFFFFF;
@ -516,7 +527,9 @@ parse_date(LMD, UnitMins, LimitMins, Now) ->
-spec generate_ledgerkv(
tuple(), integer(), any(), integer(), tuple()|infinity) ->
{any(), any(), any(), {integer()|no_lookup, integer()}, list()}.
{any(), any(), any(),
{{integer(), integer()}|no_lookup, integer()},
list()}.
%% @doc
%% Function to extract from an object the information necessary to populate
%% the Penciller's ledger.
@ -537,7 +550,7 @@ generate_ledgerkv(PrimaryKey, SQN, Obj, Size, TS) ->
_ ->
{active, TS}
end,
Hash = magic_hash(PrimaryKey),
Hash = segment_hash(PrimaryKey),
{MD, LastMods} = extract_metadata(Obj, Size, Tag),
ObjHash = get_objhash(Tag, MD),
Value = {SQN,

View file

@ -648,8 +648,8 @@ schedule_test_bycount(N) ->
?assertMatch(true, SecondsToCompaction0 < 5700),
SecondsToCompaction1 = schedule_compaction([14], N, CurrentTS), % tomorrow!
io:format("Seconds to compaction ~w~n", [SecondsToCompaction1]),
?assertMatch(true, SecondsToCompaction1 > 81000),
?assertMatch(true, SecondsToCompaction1 < 84300).
?assertMatch(true, SecondsToCompaction1 >= 81180),
?assertMatch(true, SecondsToCompaction1 =< 84780).
simple_score_test() ->

View file

@ -144,6 +144,8 @@
++ "leaving SnapshotCount=~w and MinSQN=~w"}},
{"P0040",
{info, "Archiving filename ~s as unused at startup"}},
{"P0041",
{info, "Penciller manifest switched from SQN ~w to ~w"}},
{"PC001",
{info, "Penciller's clerk ~w started with owner ~w"}},

View file

@ -254,7 +254,7 @@ generate_randomkeys(Count, Acc, BucketLow, BRange) ->
K = {o, "Bucket" ++ BNumber, "Key" ++ KNumber},
RandKey = {K, {Count + 1,
{active, infinity},
leveled_codec:magic_hash(K),
leveled_codec:segment_hash(K),
null}},
generate_randomkeys(Count - 1, [RandKey|Acc], BucketLow, BRange).

View file

@ -315,21 +315,22 @@ pcl_fetchlevelzero(Pid, Slot) ->
%% The Key needs to be hashable (i.e. have a tag which indicates that the key
%% can be looked up) - index entries are not hashable for example.
%%
%% If the hash is already knonw, call pcl_fetch/3 as magic_hash is a
%% If the hash is already known, call pcl_fetch/3 as segment_hash is a
%% relatively expensive hash function
pcl_fetch(Pid, Key) ->
Hash = leveled_codec:magic_hash(Key),
Hash = leveled_codec:segment_hash(Key),
if
Hash /= no_lookup ->
gen_server:call(Pid, {fetch, Key, Hash}, infinity)
end.
-spec pcl_fetch(pid(), tuple(), integer()) -> {tuple(), tuple()}|not_present.
-spec pcl_fetch(pid(), tuple(), {integer(), integer()}) ->
{tuple(), tuple()}|not_present.
%% @doc
%% Fetch a key, return the first (highest SQN) occurrence of that Key along
%% with the value.
%%
%% Hash should be result of leveled_codec:magic_hash(Key)
%% Hash should be result of leveled_codec:segment_hash(Key)
pcl_fetch(Pid, Key, Hash) ->
gen_server:call(Pid, {fetch, Key, Hash}, infinity).
@ -367,7 +368,7 @@ pcl_fetchnextkey(Pid, StartKey, EndKey, AccFun, InitAcc) ->
%% If the key is not present, it will be assumed that a higher sequence number
%% tombstone once existed, and false will be returned.
pcl_checksequencenumber(Pid, Key, SQN) ->
Hash = leveled_codec:magic_hash(Key),
Hash = leveled_codec:segment_hash(Key),
if
Hash /= no_lookup ->
gen_server:call(Pid, {check_sqn, Key, Hash, SQN}, infinity)
@ -672,6 +673,8 @@ handle_call(doom, _From, State) ->
handle_cast({manifest_change, NewManifest}, State) ->
NewManSQN = leveled_pmanifest:get_manifest_sqn(NewManifest),
OldManSQN = leveled_pmanifest:get_manifest_sqn(State#state.manifest),
leveled_log:log("P0041", [OldManSQN, NewManSQN]),
ok = leveled_pclerk:clerk_promptdeletions(State#state.clerk, NewManSQN),
UpdManifest = leveled_pmanifest:merge_snapshot(State#state.manifest,
NewManifest),
@ -1317,7 +1320,7 @@ generate_randomkeys(Count, SQN, Acc) ->
RandKey = {K,
{SQN,
{active, infinity},
leveled_codec:magic_hash(K),
leveled_codec:segment_hash(K),
null}},
generate_randomkeys(Count - 1, SQN + 1, [RandKey|Acc]).
@ -1347,7 +1350,7 @@ maybe_pause_push(PCL, KL) ->
T1 = lists:foldl(fun({K, V}, {AccSL, AccIdx, MinSQN, MaxSQN}) ->
UpdSL = [{K, V}|AccSL],
SQN = leveled_codec:strip_to_seqonly({K, V}),
H = leveled_codec:magic_hash(K),
H = leveled_codec:segment_hash(K),
UpdIdx = leveled_pmem:prepare_for_index(AccIdx, H),
{UpdSL, UpdIdx, min(SQN, MinSQN), max(SQN, MaxSQN)}
end,
@ -1366,7 +1369,7 @@ maybe_pause_push(PCL, KL) ->
%% old test data doesn't have the magic hash
add_missing_hash({K, {SQN, ST, MD}}) ->
{K, {SQN, ST, leveled_codec:magic_hash(K), MD}}.
{K, {SQN, ST, leveled_codec:segment_hash(K), MD}}.
clean_dir_test() ->

View file

@ -1128,6 +1128,49 @@ snapshot_timeout_test() ->
Man10 = release_snapshot(Man9, ?PHANTOM_PID),
?assertMatch(0, length(Man10#manifest.snapshots)).
%% Regression test built around a manifest term captured while investigating
%% a potential range-query issue.  In this manifest, Level 1 holds three
%% manifest entries for "Bucket" in the level's plain-list part, while the
%% next level's idxt tree holds a single entry indexed under a "Bucket1" key
%% although its start_key is still within "Bucket" - i.e. the entry's key
%% range straddles the bucket boundary.
%% NOTE(review): the dict at the tail of the term is assumed to be the
%% manifest's pid-map in its literal dict representation - confirm against
%% the #manifest{} record definition.
potential_issue_test() ->
Manifest =
{manifest,{array,9,0,[],
{[],
[{manifest_entry,{o_rkv,"Bucket","Key10",null},
{o_rkv,"Bucket","Key12949",null},
"<0.313.0>","./16_1_0.sst"},
{manifest_entry,{o_rkv,"Bucket","Key129490",null},
{o_rkv,"Bucket","Key158981",null},
"<0.315.0>","./16_1_1.sst"},
{manifest_entry,{o_rkv,"Bucket","Key158982",null},
{o_rkv,"Bucket","Key188472",null},
"<0.316.0>","./16_1_2.sst"}],
{idxt,1,
{{[{{o_rkv,"Bucket1","Key1",null},
{manifest_entry,{o_rkv,"Bucket","Key9083",null},
{o_rkv,"Bucket1","Key1",null},
"<0.320.0>","./16_1_6.sst"}}]},
{1,{{o_rkv,"Bucket1","Key1",null},1,nil,nil}}}},
{idxt,0,{{},{0,nil}}},
{idxt,0,{{},{0,nil}}},
{idxt,0,{{},{0,nil}}},
{idxt,0,{{},{0,nil}}},
{idxt,0,{{},{0,nil}}},
{idxt,0,{{},{0,nil}}},
[]}},
19,[],0,
{dict,0,16,16,8,80,48,
{[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]},
{{[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]}}},
2},
%% A lookup across the whole of "Bucket" at Level 1 should find the three
%% list-held entries.
Range1 = range_lookup(Manifest,
1,
{o_rkv, "Bucket", null, null},
{o_rkv, "Bucket", null, null}),
%% The same range at Level 2 must not miss the idxt-held entry whose
%% index key is in "Bucket1" but whose start_key is inside "Bucket".
Range2 = range_lookup(Manifest,
2,
{o_rkv, "Bucket", null, null},
{o_rkv, "Bucket", null, null}),
io:format("Range in Level 1 ~w~n", [Range1]),
io:format("Range in Level 2 ~w~n", [Range2]),
?assertMatch(3, length(Range1)),
?assertMatch(1, length(Range2)).
-endif.

View file

@ -50,7 +50,8 @@
%%% API
%%%============================================================================
-spec prepare_for_index(index_array(), integer()|no_lookup) -> index_array().
-spec prepare_for_index(index_array(), {integer(), integer()}|no_lookup)
-> index_array().
%% @doc
%% Add the hash of a key to the index. This is 'prepared' in the sense that
%% this index is not used until it is loaded into the main index.
@ -95,7 +96,7 @@ new_index() ->
clear_index(_L0Index) ->
new_index().
-spec check_index(integer(), index_array()) -> list(integer()).
-spec check_index({integer(), integer()}, index_array()) -> list(integer()).
%% @doc
%% return a list of positions in the list of cache arrays that may contain the
%% key associated with the hash being checked
@ -158,9 +159,9 @@ to_list(Slots, FetchFun) ->
%% checked (with the most recently received cache being checked first) until a
%% match is found.
check_levelzero(Key, PosList, TreeList) ->
check_levelzero(Key, leveled_codec:magic_hash(Key), PosList, TreeList).
check_levelzero(Key, leveled_codec:segment_hash(Key), PosList, TreeList).
-spec check_levelzero(tuple(), integer(), list(integer()), list())
-spec check_levelzero(tuple(), {integer(), integer()}, list(integer()), list())
-> {boolean(), tuple|not_found}.
%% @doc
%% Check for the presence of a given Key in the Level Zero cache, with the
@ -204,10 +205,10 @@ find_pos(<<0:1/integer, NxtSlot:7/integer, T/binary>>, Hash, PosList, _SlotID) -
find_pos(T, Hash, PosList, NxtSlot).
split_hash(Hash) ->
Slot = Hash band 255,
H0 = (Hash bsr 8) band 8388607,
{Slot, H0}.
%% Split a {SegmentID, ExtraHash} pair into a slot number (low 8 bits of the
%% segment ID) and a 23-bit hash built from the remaining segment ID bits
%% combined with the extra hash.
split_hash({SegmentID, ExtraHash}) ->
    SlotNumber = SegmentID band 255,
    CombinedHash = (SegmentID bsr 8) bor (ExtraHash bsl 8),
    {SlotNumber, CombinedHash band 8388607}.
check_slotlist(Key, _Hash, CheckList, TreeList) ->
SlotCheckFun =
@ -358,7 +359,7 @@ with_index_test_() ->
with_index_test2() ->
IndexPrepareFun =
fun({K, _V}, Acc) ->
H = leveled_codec:magic_hash(K),
H = leveled_codec:segment_hash(K),
prepare_for_index(Acc, H)
end,
LoadFun =
@ -382,7 +383,7 @@ with_index_test2() ->
CheckFun =
fun({K, V}, {L0Idx, L0Cache}) ->
H = leveled_codec:magic_hash(K),
H = leveled_codec:segment_hash(K),
PosList = check_index(H, L0Idx),
?assertMatch({true, {K, V}},
check_slotlist(K, H, PosList, L0Cache)),

View file

@ -65,13 +65,12 @@
-include("include/leveled.hrl").
-define(MAX_SLOTS, 256).
-define(LOOK_SLOTSIZE, 128). % This is not configurable
-define(LOOK_BLOCKSIZE, {24, 32}).
-define(LOOK_SLOTSIZE, 128). % Maximum of 128
-define(LOOK_BLOCKSIZE, {24, 32}). % 4x + y = ?LOOK_SLOTSIZE
-define(NOLOOK_SLOTSIZE, 256).
-define(NOLOOK_BLOCKSIZE, {56, 32}).
-define(NOLOOK_BLOCKSIZE, {56, 32}). % 4x + y = ?NOLOOK_SLOTSIZE
-define(COMPRESSION_LEVEL, 1).
-define(BINARY_SETTINGS, [{compressed, ?COMPRESSION_LEVEL}]).
% -define(LEVEL_BLOOM_BITS, [{0, 8}, {1, 10}, {2, 8}, {default, 6}]).
-define(MERGE_SCANWIDTH, 16).
-define(DISCARD_EXT, ".discarded").
-define(DELETE_TIMEOUT, 10000).
@ -237,12 +236,12 @@ sst_newlevelzero(RootPath, Filename, Slots, FetchFun, Penciller, MaxSQN) ->
-spec sst_get(pid(), tuple()) -> tuple()|not_present.
%% @doc
%% Return a Key, Value pair matching a Key or not_present if the Key is not in
%% the store. The magic_hash function is used to accelerate the seeking of
%% the store. The segment_hash function is used to accelerate the seeking of
%% keys, sst_get/3 should be used directly if this has already been calculated
sst_get(Pid, LedgerKey) ->
sst_get(Pid, LedgerKey, leveled_codec:magic_hash(LedgerKey)).
sst_get(Pid, LedgerKey, leveled_codec:segment_hash(LedgerKey)).
-spec sst_get(pid(), tuple(), integer()) -> tuple()|not_present.
-spec sst_get(pid(), tuple(), {integer(), integer()}) -> tuple()|not_present.
%% @doc
%% Return a Key, Value pair matching a Key or not_present if the Key is not in
%% the store (with the magic hash precalculated).
@ -554,7 +553,7 @@ fetch(LedgerKey, Hash, State) ->
State#state{blockindex_cache = BlockIndexCache}};
<<BlockLengths:24/binary, BlockIdx/binary>> ->
PosList = find_pos(BlockIdx,
double_hash(Hash, LedgerKey),
extra_hash(Hash),
[],
0),
case PosList of
@ -808,9 +807,9 @@ generate_binary_slot(Lookup, KVL) ->
fun({K, V}, {PosBinAcc, NoHashCount, HashAcc}) ->
{_SQN, H1} = leveled_codec:strip_to_seqnhashonly({K, V}),
case is_integer(H1) of
PosH1 = extra_hash(H1),
case is_integer(PosH1) of
true ->
PosH1 = double_hash(H1, K),
case NoHashCount of
0 ->
{<<1:1/integer,
@ -1003,7 +1002,7 @@ binaryslot_get(FullBin, Key, Hash) ->
<<B1P:32/integer, _R/binary>> = BlockLengths,
<<PosBinIndex:B1P/binary, Blocks/binary>> = Rest,
PosList = find_pos(PosBinIndex,
double_hash(Hash, Key),
extra_hash(Hash),
[],
0),
{fetch_value(PosList, BlockLengths, Blocks, Key),
@ -1186,9 +1185,10 @@ block_offsetandlength(BlockLengths, BlockID) ->
{BlocksPos, B1L + B2L + B3L + B4L, B5L}
end.
double_hash(Hash, Key) ->
H2 = erlang:phash2(Key),
(Hash bxor H2) band 32767.
%% Reduce a {SegmentHash, ExtraHash} pair to a 15-bit hash taken from the
%% segment hash; any non-pair value (e.g. no_lookup) is passed through
%% unchanged.
extra_hash({SegHash, _ExtraHash}) when is_integer(SegHash) ->
    SegHash band 16#7FFF;
extra_hash(NotHash) ->
    NotHash.
fetch_value([], _BlockLengths, _Blocks, _Key) ->
not_present;
@ -1538,7 +1538,7 @@ indexed_list_test() ->
io:format(user, "~nIndexed list timing test:~n", []),
N = 150,
KVL0 = lists:ukeysort(1, generate_randomkeys(1, N, 1, 4)),
KVL1 = lists:sublist(KVL0, 128),
KVL1 = lists:sublist(KVL0, ?LOOK_SLOTSIZE),
SW0 = os:timestamp(),
@ -1548,15 +1548,15 @@ indexed_list_test() ->
[timer:now_diff(os:timestamp(), SW0), byte_size(FullBin)]),
{TestK1, TestV1} = lists:nth(20, KVL1),
MH1 = leveled_codec:magic_hash(TestK1),
MH1 = leveled_codec:segment_hash(TestK1),
{TestK2, TestV2} = lists:nth(40, KVL1),
MH2 = leveled_codec:magic_hash(TestK2),
MH2 = leveled_codec:segment_hash(TestK2),
{TestK3, TestV3} = lists:nth(60, KVL1),
MH3 = leveled_codec:magic_hash(TestK3),
MH3 = leveled_codec:segment_hash(TestK3),
{TestK4, TestV4} = lists:nth(80, KVL1),
MH4 = leveled_codec:magic_hash(TestK4),
MH4 = leveled_codec:segment_hash(TestK4),
{TestK5, TestV5} = lists:nth(100, KVL1),
MH5 = leveled_codec:magic_hash(TestK5),
MH5 = leveled_codec:segment_hash(TestK5),
test_binary_slot(FullBin, TestK1, MH1, {TestK1, TestV1}),
test_binary_slot(FullBin, TestK2, MH2, {TestK2, TestV2}),
@ -1573,15 +1573,15 @@ indexed_list_mixedkeys_test() ->
{_PosBinIndex1, FullBin, _HL, _LK} = generate_binary_slot(lookup, Keys),
{TestK1, TestV1} = lists:nth(4, KVL1),
MH1 = leveled_codec:magic_hash(TestK1),
MH1 = leveled_codec:segment_hash(TestK1),
{TestK2, TestV2} = lists:nth(8, KVL1),
MH2 = leveled_codec:magic_hash(TestK2),
MH2 = leveled_codec:segment_hash(TestK2),
{TestK3, TestV3} = lists:nth(12, KVL1),
MH3 = leveled_codec:magic_hash(TestK3),
MH3 = leveled_codec:segment_hash(TestK3),
{TestK4, TestV4} = lists:nth(16, KVL1),
MH4 = leveled_codec:magic_hash(TestK4),
MH4 = leveled_codec:segment_hash(TestK4),
{TestK5, TestV5} = lists:nth(20, KVL1),
MH5 = leveled_codec:magic_hash(TestK5),
MH5 = leveled_codec:segment_hash(TestK5),
test_binary_slot(FullBin, TestK1, MH1, {TestK1, TestV1}),
test_binary_slot(FullBin, TestK2, MH2, {TestK2, TestV2}),
@ -1598,15 +1598,17 @@ indexed_list_mixedkeys2_test() ->
Keys = IdxKeys1 ++ KVL1 ++ IdxKeys2,
{_PosBinIndex1, FullBin, _HL, _LK} = generate_binary_slot(lookup, Keys),
lists:foreach(fun({K, V}) ->
MH = leveled_codec:magic_hash(K),
MH = leveled_codec:segment_hash(K),
test_binary_slot(FullBin, K, MH, {K, V})
end,
KVL1).
indexed_list_allindexkeys_test() ->
Keys = lists:sublist(lists:ukeysort(1, generate_indexkeys(150)), 128),
Keys = lists:sublist(lists:ukeysort(1, generate_indexkeys(150)),
?LOOK_SLOTSIZE),
{PosBinIndex1, FullBin, _HL, _LK} = generate_binary_slot(lookup, Keys),
?assertMatch(<<_BL:24/binary, 127:8/integer>>, PosBinIndex1),
EmptySlotSize = ?LOOK_SLOTSIZE - 1,
?assertMatch(<<_BL:24/binary, EmptySlotSize:8/integer>>, PosBinIndex1),
% SW = os:timestamp(),
BinToList = binaryslot_tolist(FullBin),
% io:format(user,
@ -1629,9 +1631,11 @@ indexed_list_allindexkeys_nolookup_test() ->
?assertMatch(Keys, binaryslot_trimmedlist(FullBin, all, all)).
indexed_list_allindexkeys_trimmed_test() ->
Keys = lists:sublist(lists:ukeysort(1, generate_indexkeys(150)), 128),
Keys = lists:sublist(lists:ukeysort(1, generate_indexkeys(150)),
?LOOK_SLOTSIZE),
{PosBinIndex1, FullBin, _HL, _LK} = generate_binary_slot(lookup, Keys),
?assertMatch(<<_BL:24/binary, 127:8/integer>>, PosBinIndex1),
EmptySlotSize = ?LOOK_SLOTSIZE - 1,
?assertMatch(<<_BL:24/binary, EmptySlotSize:8/integer>>, PosBinIndex1),
?assertMatch(Keys, binaryslot_trimmedlist(FullBin,
{i,
"Bucket",
@ -1656,9 +1660,9 @@ indexed_list_allindexkeys_trimmed_test() ->
?assertMatch(11, length(O2)),
?assertMatch(R2, O2),
{SK3, _} = lists:nth(127, Keys),
{EK3, _} = lists:nth(128, Keys),
R3 = lists:sublist(Keys, 127, 2),
{SK3, _} = lists:nth(?LOOK_SLOTSIZE - 1, Keys),
{EK3, _} = lists:nth(?LOOK_SLOTSIZE, Keys),
R3 = lists:sublist(Keys, ?LOOK_SLOTSIZE - 1, 2),
O3 = binaryslot_trimmedlist(FullBin, SK3, EK3),
?assertMatch(2, length(O3)),
?assertMatch(R3, O3).
@ -1682,7 +1686,7 @@ indexed_list_mixedkeys_bitflip_test() ->
end,
{TestK1, _TestV1} = lists:nth(20, KVL1),
MH1 = leveled_codec:magic_hash(TestK1),
MH1 = leveled_codec:segment_hash(TestK1),
test_binary_slot(FullBin0, TestK1, MH1, not_present),
ToList = binaryslot_tolist(FullBin0),
@ -1920,7 +1924,7 @@ simple_persisted_test() ->
In = lists:keymember(K, 1, KVList1),
case {K > FirstKey, LastKey > K, In} of
{true, true, false} ->
[{K, leveled_codec:magic_hash(K), V}|Acc];
[{K, leveled_codec:segment_hash(K), V}|Acc];
_ ->
Acc
end

View file

@ -16,8 +16,8 @@
check_hash/2
]).
-define(BITS_PER_KEY, 8). % Must be 8 or 4
-define(INTEGER_SIZE, ?BITS_PER_KEY * 8).
-define(BLOOM_SIZE_BYTES, 16).
-define(INTEGER_SIZE, 128).
-define(BAND_MASK, ?INTEGER_SIZE - 1).
@ -34,9 +34,8 @@ create_bloom(HashList) ->
<<>>;
L when L > 32 ->
add_hashlist(HashList,
15,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0);
7,
0, 0, 0, 0, 0, 0, 0, 0);
L when L > 16 ->
add_hashlist(HashList, 3, 0, 0, 0, 0);
_ ->
@ -48,11 +47,11 @@ create_bloom(HashList) ->
%% Check for the presence of a given hash within a bloom
check_hash(_Hash, <<>>) ->
false;
check_hash(Hash, BloomBin) ->
SlotSplit = (byte_size(BloomBin) div ?BITS_PER_KEY) - 1,
{Slot, H0, H1} = split_hash(Hash, SlotSplit),
Mask = get_mask(H0, H1),
Pos = Slot * ?BITS_PER_KEY,
check_hash({_SegHash, Hash}, BloomBin) ->
SlotSplit = (byte_size(BloomBin) div ?BLOOM_SIZE_BYTES) - 1,
{Slot, Hashes} = split_hash(Hash, SlotSplit),
Mask = get_mask(Hashes),
Pos = Slot * ?BLOOM_SIZE_BYTES,
IntSize = ?INTEGER_SIZE,
<<_H:Pos/binary, CheckInt:IntSize/integer, _T/binary>> = BloomBin,
case CheckInt band Mask of
@ -69,19 +68,13 @@ check_hash(Hash, BloomBin) ->
split_hash(Hash, SlotSplit) ->
Slot = Hash band SlotSplit,
H0 = (Hash bsr 4) band (?BAND_MASK),
H1 = (Hash bsr 10) band (?BAND_MASK),
H3 = (Hash bsr 16) band (?BAND_MASK),
H4 = (Hash bsr 22) band (?BAND_MASK),
Slot0 = (Hash bsr 28) band SlotSplit,
{Slot bxor Slot0, H0 bxor H3, H1 bxor H4}.
H1 = (Hash bsr 11) band (?BAND_MASK),
H2 = (Hash bsr 18) band (?BAND_MASK),
H3 = (Hash bsr 25) band (?BAND_MASK),
{Slot, [H0, H1, H2, H3]}.
get_mask(H0, H1) ->
case H0 == H1 of
true ->
1 bsl H0;
false ->
(1 bsl H0) + (1 bsl H1)
end.
%% Combine four bit positions into a single bloom mask integer, with one bit
%% set per (not necessarily distinct) position.
get_mask([H0, H1, H2, H3]) ->
    lists:foldl(fun(H, Mask) -> Mask bor (1 bsl H) end, 0, [H0, H1, H2, H3]).
%% This looks ugly and clunky, but in tests it was quicker than modifying an
@ -90,9 +83,9 @@ get_mask(H0, H1) ->
add_hashlist([], _S, S0, S1) ->
IntSize = ?INTEGER_SIZE,
<<S0:IntSize/integer, S1:IntSize/integer>>;
add_hashlist([TopHash|T], SlotSplit, S0, S1) ->
{Slot, H0, H1} = split_hash(TopHash, SlotSplit),
Mask = get_mask(H0, H1),
add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1) ->
{Slot, Hashes} = split_hash(TopHash, SlotSplit),
Mask = get_mask(Hashes),
case Slot of
0 ->
add_hashlist(T, SlotSplit, S0 bor Mask, S1);
@ -104,9 +97,9 @@ add_hashlist([], _S, S0, S1, S2, S3) ->
IntSize = ?INTEGER_SIZE,
<<S0:IntSize/integer, S1:IntSize/integer,
S2:IntSize/integer, S3:IntSize/integer>>;
add_hashlist([TopHash|T], SlotSplit, S0, S1, S2, S3) ->
{Slot, H0, H1} = split_hash(TopHash, SlotSplit),
Mask = get_mask(H0, H1),
add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1, S2, S3) ->
{Slot, Hashes} = split_hash(TopHash, SlotSplit),
Mask = get_mask(Hashes),
case Slot of
0 ->
add_hashlist(T, SlotSplit, S0 bor Mask, S1, S2, S3);
@ -118,104 +111,50 @@ add_hashlist([TopHash|T], SlotSplit, S0, S1, S2, S3) ->
add_hashlist(T, SlotSplit, S0, S1, S2, S3 bor Mask)
end.
add_hashlist([], _S, S0, S1, S2, S3, S4, S5, S6, S7, S8, S9,
SA, SB, SC, SD, SE, SF) ->
add_hashlist([], _S, S0, S1, S2, S3, S4, S5, S6, S7) ->
IntSize = ?INTEGER_SIZE,
<<S0:IntSize/integer, S1:IntSize/integer,
S2:IntSize/integer, S3:IntSize/integer,
S4:IntSize/integer, S5:IntSize/integer,
S6:IntSize/integer, S7:IntSize/integer,
S8:IntSize/integer, S9:IntSize/integer,
SA:IntSize/integer, SB:IntSize/integer,
SC:IntSize/integer, SD:IntSize/integer,
SE:IntSize/integer, SF:IntSize/integer>>;
add_hashlist([TopHash|T],
S6:IntSize/integer, S7:IntSize/integer>>;
add_hashlist([{_SegHash, TopHash}|T],
SlotSplit,
S0, S1, S2, S3, S4, S5, S6, S7, S8, S9,
SA, SB, SC, SD, SE, SF) ->
{Slot, H0, H1} = split_hash(TopHash, SlotSplit),
Mask = get_mask(H0, H1),
S0, S1, S2, S3, S4, S5, S6, S7) ->
{Slot, Hashes} = split_hash(TopHash, SlotSplit),
Mask = get_mask(Hashes),
case Slot of
0 ->
add_hashlist(T,
SlotSplit,
S0 bor Mask, S1, S2, S3, S4, S5, S6, S7, S8, S9,
SA, SB, SC, SD, SE, SF);
S0 bor Mask, S1, S2, S3, S4, S5, S6, S7);
1 ->
add_hashlist(T,
SlotSplit,
S0, S1 bor Mask, S2, S3, S4, S5, S6, S7, S8, S9,
SA, SB, SC, SD, SE, SF);
S0, S1 bor Mask, S2, S3, S4, S5, S6, S7);
2 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2 bor Mask, S3, S4, S5, S6, S7, S8, S9,
SA, SB, SC, SD, SE, SF);
S0, S1, S2 bor Mask, S3, S4, S5, S6, S7);
3 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2, S3 bor Mask, S4, S5, S6, S7, S8, S9,
SA, SB, SC, SD, SE, SF);
S0, S1, S2, S3 bor Mask, S4, S5, S6, S7);
4 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2, S3, S4 bor Mask, S5, S6, S7, S8, S9,
SA, SB, SC, SD, SE, SF);
S0, S1, S2, S3, S4 bor Mask, S5, S6, S7);
5 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2, S3, S4, S5 bor Mask, S6, S7, S8, S9,
SA, SB, SC, SD, SE, SF);
S0, S1, S2, S3, S4, S5 bor Mask, S6, S7);
6 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2, S3, S4, S5, S6 bor Mask, S7, S8, S9,
SA, SB, SC, SD, SE, SF);
S0, S1, S2, S3, S4, S5, S6 bor Mask, S7);
7 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2, S3, S4, S5, S6, S7 bor Mask, S8, S9,
SA, SB, SC, SD, SE, SF);
8 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2, S3, S4, S5, S6, S7, S8 bor Mask, S9,
SA, SB, SC, SD, SE, SF);
9 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2, S3, S4, S5, S6, S7, S8, S9 bor Mask,
SA, SB, SC, SD, SE, SF);
10 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2, S3, S4, S5, S6, S7, S8, S9,
SA bor Mask, SB, SC, SD, SE, SF);
11 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2, S3, S4, S5, S6, S7, S8, S9,
SA, SB bor Mask, SC, SD, SE, SF);
12 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2, S3, S4, S5, S6, S7, S8, S9,
SA, SB, SC bor Mask, SD, SE, SF);
13 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2, S3, S4, S5, S6, S7, S8, S9,
SA, SB, SC, SD bor Mask, SE, SF);
14 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2, S3, S4, S5, S6, S7, S8, S9,
SA, SB, SC, SD, SE bor Mask, SF);
15 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2, S3, S4, S5, S6, S7, S8, S9,
SA, SB, SC, SD, SE, SF bor Mask)
S0, S1, S2, S3, S4, S5, S6, S7 bor Mask)
end.
@ -239,7 +178,7 @@ generate_randomkeys(Seqn, Count, Acc, BucketLow, BRange) ->
BNumber = string:right(integer_to_list(BucketLow + BRand), 4, $0),
KNumber = string:right(integer_to_list(leveled_rand:uniform(10000)), 6, $0),
LK = leveled_codec:to_ledgerkey("Bucket" ++ BNumber, "Key" ++ KNumber, o),
Chunk = leveled_rand:rand_bytes(64),
Chunk = leveled_rand:rand_bytes(16),
{_B, _K, MV, _H, _LMs} =
leveled_codec:generate_ledgerkv(LK, Seqn, Chunk, 64, infinity),
generate_randomkeys(Seqn + 1,
@ -254,7 +193,7 @@ get_hashlist(N) ->
KVL = lists:sublist(KVL0, N),
HashFun =
fun({K, _V}) ->
leveled_codec:magic_hash(K)
leveled_codec:segment_hash(K)
end,
lists:map(HashFun, KVL).
@ -283,46 +222,50 @@ empty_bloom_test() ->
?assertMatch({0, 4},
check_neg_hashes(BloomBin0, [0, 10, 100, 100000], {0, 0})).
bloom_test() ->
test_bloom(128),
test_bloom(64),
test_bloom(32),
test_bloom(16),
test_bloom(8).
bloom_test_() ->
{timeout, 20, fun bloom_test_ranges/0}.
test_bloom(N) ->
HashList1 = get_hashlist(N),
HashList2 = get_hashlist(N),
HashList3 = get_hashlist(N),
HashList4 = get_hashlist(N),
bloom_test_ranges() ->
test_bloom(128, 2000),
test_bloom(64, 100),
test_bloom(32, 100),
test_bloom(16, 100),
test_bloom(8, 100).
test_bloom(N, Runs) ->
ListOfHashLists =
lists:map(fun(_X) -> get_hashlist(N) end, lists:seq(1, Runs)),
SWa = os:timestamp(),
BloomBin1 = create_bloom(HashList1),
BloomBin2 = create_bloom(HashList2),
BloomBin3 = create_bloom(HashList3),
BloomBin4 = create_bloom(HashList4),
ListOfBlooms =
lists:map(fun(HL) -> create_bloom(HL) end, ListOfHashLists),
TSa = timer:now_diff(os:timestamp(), SWa),
SWb = os:timestamp(),
check_all_hashes(BloomBin1, HashList1),
check_all_hashes(BloomBin2, HashList2),
check_all_hashes(BloomBin3, HashList3),
check_all_hashes(BloomBin4, HashList4),
lists:foreach(fun(Nth) ->
HL = lists:nth(Nth, ListOfHashLists),
BB = lists:nth(Nth, ListOfBlooms),
check_all_hashes(BB, HL)
end,
lists:seq(1, Runs)),
TSb = timer:now_diff(os:timestamp(), SWb),
HashPool = get_hashlist(N * 2),
HashListOut1 = lists:sublist(lists:subtract(HashPool, HashList1), N),
HashListOut2 = lists:sublist(lists:subtract(HashPool, HashList2), N),
HashListOut3 = lists:sublist(lists:subtract(HashPool, HashList3), N),
HashListOut4 = lists:sublist(lists:subtract(HashPool, HashList4), N),
ListOfMisses =
lists:map(fun(HL) ->
lists:sublist(lists:subtract(HashPool, HL), N)
end,
ListOfHashLists),
SWc = os:timestamp(),
C0 = {0, 0},
C1 = check_neg_hashes(BloomBin1, HashListOut1, C0),
C2 = check_neg_hashes(BloomBin2, HashListOut2, C1),
C3 = check_neg_hashes(BloomBin3, HashListOut3, C2),
C4 = check_neg_hashes(BloomBin4, HashListOut4, C3),
{Pos, Neg} = C4,
{Pos, Neg} =
lists:foldl(fun(Nth, Acc) ->
HL = lists:nth(Nth, ListOfMisses),
BB = lists:nth(Nth, ListOfBlooms),
check_neg_hashes(BB, HL, Acc)
end,
{0, 0},
lists:seq(1, Runs)),
FPR = Pos / (Pos + Neg),
TSc = timer:now_diff(os:timestamp(), SWc),
@ -332,5 +275,4 @@ test_bloom(N) ->
[N, TSa, TSb, TSc, FPR]).
-endif.

View file

@ -214,7 +214,7 @@ search_range(StartRange, EndRange, Tree, StartKeyFun) ->
EndRangeFun =
fun(ER, _FirstRHSKey, FirstRHSValue) ->
StartRHSKey = StartKeyFun(FirstRHSValue),
ER >= StartRHSKey
not leveled_codec:endkey_passed(ER, StartRHSKey)
end,
case Tree of
{tree, _L, T} ->
@ -405,8 +405,12 @@ idxtlookup_range_end(EndRange, {TLI, NK0, SL0}, Iter0, Output, EndRangeFun) ->
[{FirstRHSKey, FirstRHSValue}|_Rest] ->
case EndRangeFun(EndRange, FirstRHSKey, FirstRHSValue) of
true ->
% The start key is not after the end of the range
% and so this should be included in the range
Output ++ LHS ++ [{FirstRHSKey, FirstRHSValue}];
false ->
% the start key of the next entry is after the end
% of the range and so it should not be included
Output ++ LHS
end
end;
@ -804,4 +808,22 @@ empty_test() ->
T2 = empty(idxt),
?assertMatch(0, tsize(T2)).
%% Test that search_range/4 returns an entry held in an idxt tree when the
%% entry is indexed under a key ("Bucket1") beyond the end of the query
%% range ("Bucket"), but its start_key (as extracted by StartKeyFun) still
%% falls inside the range.  The EndRangeFun within search_range uses
%% leveled_codec:endkey_passed/2 rather than a plain >= comparison, so the
%% straddling entry must still be found.
search_range_idx_test() ->
Tree =
{idxt,1,
{{[{{o_rkv,"Bucket1","Key1",null},
{manifest_entry,{o_rkv,"Bucket","Key9083",null},
{o_rkv,"Bucket1","Key1",null},
"<0.320.0>","./16_1_6.sst"}}]},
{1,{{o_rkv,"Bucket1","Key1",null},1,nil,nil}}}},
%% Extract the manifest entry's start key for range comparison.
StartKeyFun =
fun(ME) ->
ME#manifest_entry.start_key
end,
R = search_range({o_rkv, "Bucket", null, null},
{o_rkv, "Bucket", null, null},
Tree,
StartKeyFun),
?assertMatch(1, length(R)).
-endif.

View file

@ -333,8 +333,8 @@ load_and_count(_Config) ->
Bookie1,
TestObject,
G1),
{_S, Count} = testutil:check_bucket_stats(Bookie1,
"Bucket"),
{_S, Count} =
testutil:check_bucket_stats(Bookie1, "Bucket"),
if
Acc + 5000 == Count ->
ok
@ -351,8 +351,8 @@ load_and_count(_Config) ->
Bookie1,
TestObject,
G2),
{_S, Count} = testutil:check_bucket_stats(Bookie1,
"Bucket"),
{_S, Count} =
testutil:check_bucket_stats(Bookie1, "Bucket"),
if
Acc + 5000 == Count ->
ok
@ -368,8 +368,8 @@ load_and_count(_Config) ->
Bookie1,
TestObject,
G1),
{_S, Count} = testutil:check_bucket_stats(Bookie1,
"Bucket"),
{_S, Count} =
testutil:check_bucket_stats(Bookie1, "Bucket"),
if
Count == 200000 ->
ok
@ -385,8 +385,8 @@ load_and_count(_Config) ->
Bookie1,
TestObject,
G2),
{_S, Count} = testutil:check_bucket_stats(Bookie1,
"Bucket"),
{_S, Count} =
testutil:check_bucket_stats(Bookie1, "Bucket"),
if
Acc + 5000 == Count ->
ok