From b571be9e438ba70ab2349f055acffcb0813eca83 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Thu, 15 Nov 2018 00:00:18 +0000 Subject: [PATCH 1/3] Check that seglist-filtered keys are actually in range --- src/leveled_sst.erl | 100 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 97 insertions(+), 3 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 1a64c94..fe7fe68 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -124,6 +124,8 @@ -export([tune_seglist/1, extract_hash/1, member_check/2]). +-export([in_range/3]). + -record(slot_index_value, {slot_id :: integer(), start_position :: integer(), length :: integer()}). @@ -1674,7 +1676,7 @@ read_slots(Handle, SlotList, {SegList, LowLastMod, BlockIndexCache}, % the hash value this will fial unexpectedly. BinMapFun = fun(Pointer, Acc) -> - {SP, _L, ID, _SK, _EK} = pointer_mapfun(Pointer), + {SP, _L, ID, SK, EK} = pointer_mapfun(Pointer), CachedHeader = array:get(ID - 1, BlockIndexCache), case extract_header(CachedHeader, IdxModDate) of none -> @@ -1708,7 +1710,7 @@ read_slots(Handle, SlotList, {SegList, LowLastMod, BlockIndexCache}, % Need to find just the right keys PositionList = find_pos(BlockIdx, SegList, [], 0), - Acc ++ + KVL = check_blocks(PositionList, {Handle, SP}, BlockLengths, @@ -1716,7 +1718,10 @@ read_slots(Handle, SlotList, {SegList, LowLastMod, BlockIndexCache}, false, PressMethod, IdxModDate, - []) + []), + FilterFun = + fun(KV) -> in_range(KV, SK, EK) end, + Acc ++ lists:filter(FilterFun, KVL) % Note check_blocks shouldreturn [] if % PositionList is empty end @@ -1726,6 +1731,16 @@ read_slots(Handle, SlotList, {SegList, LowLastMod, BlockIndexCache}, lists:foldl(BinMapFun, [], SlotList). +-spec in_range(leveled_codec:ledger_kv(), + range_endpoint(), range_endpoint()) -> boolean(). +in_range({_LK, _LV}, all, all) -> + true; +in_range({LK, _LV}, all, EK) -> + not leveled_codec:endkey_passed(EK, LK); +in_range({LK, LV}, SK, EK) -> + (LK >= SK) and in_range({LK, LV}, all, EK). + + read_slotlist(SlotList, Handle) -> LengthList = lists:map(fun pointer_mapfun/1, SlotList), {MultiSlotBin, StartPos} = read_length_list(Handle, LengthList), @@ -2862,6 +2877,85 @@ simple_persisted_range_tester(SSTNewFun) -> TL5 = lists:map(fun(EK) -> {SK5, EK} end, [EK2, EK3, EK4, EK5]), lists:foreach(TestFun, TL2 ++ TL3 ++ TL4 ++ TL5). + +simple_persisted_rangesegfilter_test() -> + simple_persisted_rangesegfilter_tester(fun testsst_new/6), + simple_persisted_rangesegfilter_tester(fun sst_new/6). + +simple_persisted_rangesegfilter_tester(SSTNewFun) -> + {RP, Filename} = {"../test/", "range_segfilter_test"}, + KVList0 = generate_randomkeys(1, ?LOOK_SLOTSIZE * 16, 1, 20), + KVList1 = lists:ukeysort(1, KVList0), + [{FirstKey, _FV}|_Rest] = KVList1, + {LastKey, _LV} = lists:last(KVList1), + {ok, Pid, {FirstKey, LastKey}, _Bloom} = + SSTNewFun(RP, Filename, 1, KVList1, length(KVList1), native), + + SK1 = element(1, lists:nth(124, KVList1)), + SK2 = element(1, lists:nth(126, KVList1)), + SK3 = element(1, lists:nth(128, KVList1)), + SK4 = element(1, lists:nth(130, KVList1)), + SK5 = element(1, lists:nth(132, KVList1)), + + EK1 = element(1, lists:nth(252, KVList1)), + EK2 = element(1, lists:nth(254, KVList1)), + EK3 = element(1, lists:nth(256, KVList1)), + EK4 = element(1, lists:nth(258, KVList1)), + EK5 = element(1, lists:nth(260, KVList1)), + + GetSegFun = + fun(LK) -> + extract_hash( + leveled_codec:strip_to_segmentonly( + lists:keyfind(LK, 1, KVList1))) + end, + SegList = + lists:map(GetSegFun, + [SK1, SK2, SK3, SK4, SK5, EK1, EK2, EK3, EK4, EK5]), + + TestFun = + fun(StartKey, EndKey, OutList) -> + RangeKVL = + sst_getfilteredrange(Pid, StartKey, EndKey, 4, SegList, 0), + RangeKL = lists:map(fun({LK0, _LV0}) -> LK0 end, RangeKVL), + ?assertMatch(true, lists:member(StartKey, RangeKL)), + ?assertMatch(true, lists:member(EndKey, RangeKL)), + CheckOutFun = + fun(OutKey) -> + ?assertMatch(false, lists:member(OutKey, RangeKL)) + end, + lists:foreach(CheckOutFun, OutList) + end, + + lists:foldl(fun(SK0, Acc) -> + TestFun(SK0, EK1, [EK2, EK3, EK4, EK5] ++ Acc), + [SK0|Acc] + end, + [], + [SK1, SK2, SK3, SK4, SK5]), + lists:foldl(fun(SK0, Acc) -> + TestFun(SK0, EK2, [EK3, EK4, EK5] ++ Acc), + [SK0|Acc] + end, + [], + [SK1, SK2, SK3, SK4, SK5]), + lists:foldl(fun(SK0, Acc) -> + TestFun(SK0, EK3, [EK4, EK5] ++ Acc), + [SK0|Acc] + end, + [], + [SK1, SK2, SK3, SK4, SK5]), + lists:foldl(fun(SK0, Acc) -> + TestFun(SK0, EK4, [EK5] ++ Acc), + [SK0|Acc] + end, + [], + [SK1, SK2, SK3, SK4, SK5]), + + ok = sst_clear(Pid). + + + additional_range_test() -> % Test fetching ranges that fall into odd situations with regards to the % summayr index From a12931b430c110d3077685478f3b85a124dfb908 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Thu, 15 Nov 2018 01:06:37 +0000 Subject: [PATCH 2/3] Add comments --- src/leveled_sst.erl | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index fe7fe68..7836c51 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -1719,11 +1719,14 @@ read_slots(Handle, SlotList, {SegList, LowLastMod, BlockIndexCache}, PressMethod, IdxModDate, []), + % There is no range passed through to the + % binaryslot_reader, so this needs to + % filtered FilterFun = fun(KV) -> in_range(KV, SK, EK) end, Acc ++ lists:filter(FilterFun, KVL) - % Note check_blocks shouldreturn [] if - % PositionList is empty + % Note check_blocks shouldreturn [] if + % PositionList is empty end end end @@ -1733,6 +1736,8 @@ read_slots(Handle, SlotList, {SegList, LowLastMod, BlockIndexCache}, -spec in_range(leveled_codec:ledger_kv(), range_endpoint(), range_endpoint()) -> boolean(). +%% @doc +%% Is the ledger key in the range. in_range({_LK, _LV}, all, all) -> true; in_range({LK, _LV}, all, EK) -> From fab11bc2d20571d05c02a88b293d9ffec816adb3 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Thu, 15 Nov 2018 08:54:06 +0000 Subject: [PATCH 3/3] Update comments to assist with clarity --- src/leveled_sst.erl | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 7836c51..e2f226b 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -1710,6 +1710,8 @@ read_slots(Handle, SlotList, {SegList, LowLastMod, BlockIndexCache}, % Need to find just the right keys PositionList = find_pos(BlockIdx, SegList, [], 0), + % Note check_blocks should return [] if + % PositionList is empty (which it may be) KVL = check_blocks(PositionList, {Handle, SP}, @@ -1720,13 +1722,11 @@ read_slots(Handle, SlotList, {SegList, LowLastMod, BlockIndexCache}, IdxModDate, []), % There is no range passed through to the - % binaryslot_reader, so this needs to - % filtered + % binaryslot_reader, so these results need + % to be filtered FilterFun = fun(KV) -> in_range(KV, SK, EK) end, Acc ++ lists:filter(FilterFun, KVL) - % Note check_blocks shouldreturn [] if - % PositionList is empty end end end @@ -1760,8 +1760,25 @@ read_slotlist(SlotList, Handle) -> list({integer(), binary()})}. %% @doc %% Read the binary slots converting them to {K, V} pairs if they were not -%% already {K, V} pairs +%% already {K, V} pairs. If they are already {K, V} pairs it is assumed +%% that they have already been range checked before extraction. +%% +%% Keys which are still to be extracted from the slot, are accompanied at +%% this function by the range against which the keys need to be checked. +%% This range is passed with the slot to binaryslot_trimmedlist which should +%% open the slot block by block, filtering individual keys where the endpoints +%% of the block are outside of the range, and leaving blocks already proven to +%% be outside of the range unopened. binaryslot_reader(SlotBinsToFetch, PressMethod, IdxModDate, SegList) -> + % Two accumulators are added. + % One to collect the list of keys and values found in the binary slots + % (subject to range filtering if the slot is still deserialised at this + % stage. + % The second accumulator extracts the header information from the slot, so + % that the cache can be built for that slot. This is used by the handling + % of get_kvreader calls. This means that slots which are only used in + % range queries can still populate their block_index caches (on the FSM + % loop state), and those caches can be used for future queries. binaryslot_reader(SlotBinsToFetch, PressMethod, IdxModDate, SegList, [], []). @@ -1769,6 +1786,11 @@ binaryslot_reader([], _PressMethod, _IdxModDate, _SegList, Acc, BIAcc) -> {Acc, BIAcc}; binaryslot_reader([{SlotBin, ID, SK, EK}|Tail], PressMethod, IdxModDate, SegList, Acc, BIAcc) -> + % The start key and end key here, may not the start key and end key the + % application passed into the query. If the slot is known to lie entirely + % inside the range, on either of both sides, the SK and EK may be + % substituted for the 'all' key work to indicate there is no need for + % entries in this slot to be trimmed from either or both sides. {TrimmedL, BICache} = binaryslot_trimmedlist(SlotBin, SK, EK, @@ -1783,6 +1805,8 @@ binaryslot_reader([{SlotBin, ID, SK, EK}|Tail], [{ID, BICache}|BIAcc]); binaryslot_reader([{K, V}|Tail], PressMethod, IdxModDate, SegList, Acc, BIAcc) -> + % These entries must already have been filtered for membership inside any + % range used in the query. binaryslot_reader(Tail, PressMethod, IdxModDate, SegList, Acc ++ [{K, V}], BIAcc).