diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl index 39384d9..09debba 100644 --- a/src/leveled_bookie.erl +++ b/src/leveled_bookie.erl @@ -82,7 +82,8 @@ book_objectfold/5, book_objectfold/6, book_headfold/6, - book_headfold/7 + book_headfold/7, + book_headfold/9 ]). -export([empty_ledgercache/0, @@ -104,7 +105,6 @@ -define(JOURNAL_SIZE_JITTER, 20). -define(ABSOLUTEMAX_JOURNALSIZE, 4000000000). -define(LONG_RUNNING, 80000). --define(RECENT_AAE, false). -define(COMPRESSION_METHOD, lz4). -define(COMPRESSION_POINT, on_receipt). -define(TIMING_SAMPLESIZE, 100). @@ -112,13 +112,13 @@ -define(DUMMY, dummy). % Dummy key used for mput operations -define(MAX_KEYCHECK_FREQUENCY, 100). -define(MIN_KEYCHECK_FREQUENCY, 1). +-define(OPEN_LASTMOD_RANGE, {0, infinity}). -define(OPTION_DEFAULTS, [{root_path, undefined}, {snapshot_bookie, undefined}, {cache_size, ?CACHE_SIZE}, {max_journalsize, 1000000000}, {sync_strategy, none}, - {recent_aae, ?RECENT_AAE}, {head_only, false}, {waste_retention_period, undefined}, {max_run_length, undefined}, @@ -140,7 +140,6 @@ -record(state, {inker :: pid() | undefined, penciller :: pid() | undefined, cache_size :: integer() | undefined, - recent_aae :: recent_aae(), ledger_cache = #ledger_cache{}, is_snapshot :: boolean() | undefined, slow_offer = false :: boolean(), @@ -186,10 +185,7 @@ -type fold_timings() :: no_timing|#fold_timings{}. -type head_timings() :: no_timing|#head_timings{}. -type timing_types() :: head|get|put|fold. --type recent_aae() :: false|#recent_aae{}|undefined. --type key() :: binary()|string()|{binary(), binary()}. - % Keys SHOULD be binary() - % string() support is a legacy of old tests + -type open_options() :: %% For full description of options see ../docs/STARTUP_OPTIONS.md [{root_path, string()|undefined} | @@ -220,12 +216,6 @@ % riak_sync is used for backwards compatability with OTP16 - and % will manually call sync() after each write (rather than use the % O_SYNC option on startup - {recent_aae, false|{atom(), list(), integer(), integer()}} | - % DEPRECATED - % Before working on kv_index_tictactree looked at the possibility - % of maintaining AAE just for recent changes. Given the efficiency - % of the kv_index_tictactree approach this is unecessary. - % Should be set to false {head_only, false|with_lookup|no_lookup} | % When set to true, there are three fundamental changes as to how % leveled will work: @@ -310,7 +300,6 @@ % Defaults to ?COMPRESSION_POINT ]. --export_type([key/0]). %%%============================================================================ @@ -371,7 +360,7 @@ book_plainstart(Opts) -> gen_server:start(?MODULE, [set_defaults(Opts)], []). --spec book_tempput(pid(), key(), key(), any(), +-spec book_tempput(pid(), leveled_codec:key(), leveled_codec:key(), any(), leveled_codec:index_specs(), leveled_codec:tag(), integer()) -> ok|pause. @@ -438,7 +427,7 @@ book_put(Pid, Bucket, Key, Object, IndexSpecs) -> book_put(Pid, Bucket, Key, Object, IndexSpecs, Tag) -> book_put(Pid, Bucket, Key, Object, IndexSpecs, Tag, infinity). --spec book_put(pid(), key(), key(), any(), +-spec book_put(pid(), leveled_codec:key(), leveled_codec:key(), any(), leveled_codec:index_specs(), leveled_codec:tag(), infinity|integer()) -> ok|pause. @@ -448,7 +437,7 @@ book_put(Pid, Bucket, Key, Object, IndexSpecs, Tag, TTL) -> infinity). --spec book_mput(pid(), list(tuple())) -> ok|pause. +-spec book_mput(pid(), list(leveled_codec:object_spec())) -> ok|pause. 
%% @doc %% %% When the store is being run in head_only mode, batches of object specs may @@ -461,7 +450,8 @@ book_put(Pid, Bucket, Key, Object, IndexSpecs, Tag, TTL) -> book_mput(Pid, ObjectSpecs) -> book_mput(Pid, ObjectSpecs, infinity). --spec book_mput(pid(), list(tuple()), infinity|integer()) -> ok|pause. +-spec book_mput(pid(), list(leveled_codec:object_spec()), infinity|integer()) + -> ok|pause. %% @doc %% %% When the store is being run in head_only mode, batches of object specs may @@ -474,8 +464,9 @@ book_mput(Pid, ObjectSpecs) -> book_mput(Pid, ObjectSpecs, TTL) -> gen_server:call(Pid, {mput, ObjectSpecs, TTL}, infinity). --spec book_delete(pid(), key(), key(), leveled_codec:index_specs()) - -> ok|pause. +-spec book_delete(pid(), + leveled_codec:key(), leveled_codec:key(), + leveled_codec:index_specs()) -> ok|pause. %% @doc %% @@ -486,11 +477,15 @@ book_delete(Pid, Bucket, Key, IndexSpecs) -> book_put(Pid, Bucket, Key, delete, IndexSpecs, ?STD_TAG). --spec book_get(pid(), key(), key(), leveled_codec:tag()) +-spec book_get(pid(), + leveled_codec:key(), leveled_codec:key(), leveled_codec:tag()) -> {ok, any()}|not_found. --spec book_head(pid(), key(), key(), leveled_codec:tag()) +-spec book_head(pid(), + leveled_codec:key(), leveled_codec:key(), leveled_codec:tag()) + -> {ok, any()}|not_found. +-spec book_headonly(pid(), + leveled_codec:key(), leveled_codec:key(), leveled_codec:key()) -> {ok, any()}|not_found. --spec book_headonly(pid(), key(), key(), key()) -> {ok, any()}|not_found. %% @doc - GET and HEAD requests %% @@ -869,8 +864,9 @@ book_objectfold(Pid, Tag, Bucket, Limiter, FoldAccT, SnapPreFold) -> SegmentList :: false | list(integer()), Runner :: fun(() -> Acc). book_headfold(Pid, Tag, FoldAccT, JournalCheck, SnapPreFold, SegmentList) -> - RunnerType = {foldheads_allkeys, Tag, FoldAccT, JournalCheck, SnapPreFold, SegmentList}, - book_returnfolder(Pid, RunnerType). + book_headfold(Pid, Tag, all, + FoldAccT, JournalCheck, SnapPreFold, + SegmentList, false, false). %% @doc as book_headfold/6, but with the addition of a `Limiter' that %% restricts the set of objects folded over. `Limiter' can either be a @@ -902,11 +898,61 @@ book_headfold(Pid, Tag, FoldAccT, JournalCheck, SnapPreFold, SegmentList) -> SnapPreFold :: boolean(), SegmentList :: false | list(integer()), Runner :: fun(() -> Acc). -book_headfold(Pid, Tag, {bucket_list, BucketList}, FoldAccT, JournalCheck, SnapPreFold, SegmentList) -> - RunnerType = {foldheads_bybucket, Tag, BucketList, bucket_list, FoldAccT, JournalCheck, SnapPreFold, SegmentList}, +book_headfold(Pid, Tag, Limiter, FoldAccT, JournalCheck, SnapPreFold, SegmentList) -> + book_headfold(Pid, Tag, Limiter, + FoldAccT, JournalCheck, SnapPreFold, + SegmentList, false, false). + +%% @doc as book_headfold/7, but with the addition of a Last Modified Date +%% Range and Max Object Count. For version 2 objects this will filter out +%% all objects with a highest Last Modified Date that is outside of the range. +%% All version 1 objects will be included in the result set regardless of Last +%% Modified Date. +%% The Max Object Count will stop the fold once the count has been reached on +%% this store only. 
If provided, the Max Object Count will mean that the runner
+%% will return {RemainingCount, Acc} rather than just Acc
+-spec book_headfold(pid(), Tag, Limiter, FoldAccT, JournalCheck, SnapPreFold,
+                    SegmentList, LastModRange, MaxObjectCount) ->
+                        {async, Runner} when
+      Tag :: leveled_codec:tag(),
+      Limiter :: BucketList | BucketKeyRange | all,
+      BucketList :: {bucket_list, list(Bucket)},
+      BucketKeyRange :: {range, Bucket, KeyRange},
+      KeyRange :: {StartKey, EndKey} | all,
+      StartKey :: Key,
+      EndKey :: Key,
+      FoldAccT :: {FoldFun, Acc},
+      FoldFun :: fun((Bucket, Key, Value, Acc) -> Acc),
+      Acc :: term(),
+      Bucket :: term(),
+      Key :: term(),
+      Value :: term(),
+      JournalCheck :: boolean(),
+      SnapPreFold :: boolean(),
+      SegmentList :: false | list(integer()),
+      LastModRange :: false | leveled_codec:lastmod_range(),
+      MaxObjectCount :: false | pos_integer(),
+      Runner :: fun(() -> ResultingAcc),
+      ResultingAcc :: Acc | {non_neg_integer(), Acc}.
+book_headfold(Pid, Tag, {bucket_list, BucketList}, FoldAccT, JournalCheck, SnapPreFold,
+                SegmentList, LastModRange, MaxObjectCount) ->
+    RunnerType =
+        {foldheads_bybucket, Tag, BucketList, bucket_list, FoldAccT,
+            JournalCheck, SnapPreFold,
+            SegmentList, LastModRange, MaxObjectCount},
     book_returnfolder(Pid, RunnerType);
-book_headfold(Pid, Tag, {range, Bucket, KeyRange}, FoldAccT, JournalCheck, SnapPreFold, SegmentList) ->
-    RunnerType = {foldheads_bybucket, Tag, Bucket, KeyRange, FoldAccT, JournalCheck, SnapPreFold, SegmentList},
+book_headfold(Pid, Tag, {range, Bucket, KeyRange}, FoldAccT, JournalCheck, SnapPreFold,
+                SegmentList, LastModRange, MaxObjectCount) ->
+    RunnerType =
+        {foldheads_bybucket, Tag, Bucket, KeyRange, FoldAccT,
+            JournalCheck, SnapPreFold,
+            SegmentList, LastModRange, MaxObjectCount},
+    book_returnfolder(Pid, RunnerType);
+book_headfold(Pid, Tag, all, FoldAccT, JournalCheck, SnapPreFold,
+                SegmentList, LastModRange, MaxObjectCount) ->
+    RunnerType = {foldheads_allkeys, Tag, FoldAccT,
+                    JournalCheck, SnapPreFold,
+                    SegmentList, LastModRange, MaxObjectCount},
     book_returnfolder(Pid, RunnerType).
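A minimal usage sketch of the new nine-arity fold (illustrative only, not part of the change set; the Bookie pid, the fold fun, the epoch-seconds value and the count are assumed values, and the atom o stands in for ?STD_TAG):

    FoldFun = fun(B, K, _ProxyObj, Acc) -> [{B, K}|Acc] end,
    {async, Runner} =
        leveled_bookie:book_headfold(Bookie,
                                        o,              % ?STD_TAG
                                        all,            % Limiter
                                        {FoldFun, []},  % FoldAccT
                                        false,          % JournalCheck
                                        true,           % SnapPreFold
                                        false,          % SegmentList
                                        {1548000000, infinity}, % LastModRange
                                        100),           % MaxObjectCount
    %% As MaxObjectCount is set, the runner returns {RemainingCount, Acc}
    {RemainingCount, BKList} = Runner().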
-spec book_snapshot(pid(), @@ -1008,16 +1054,6 @@ init([Opts]) -> ConfiguredCacheSize div (100 div ?CACHE_SIZE_JITTER), CacheSize = ConfiguredCacheSize + erlang:phash2(self()) rem CacheJitter, - RecentAAE = - case proplists:get_value(recent_aae, Opts) of - false -> - false; - {FilterType, BucketList, LimitMinutes, UnitMinutes} -> - #recent_aae{filter = FilterType, - buckets = BucketList, - limit_minutes = LimitMinutes, - unit_minutes = UnitMinutes} - end, {HeadOnly, HeadLookup} = case proplists:get_value(head_only, Opts) of @@ -1030,7 +1066,6 @@ init([Opts]) -> end, State0 = #state{cache_size=CacheSize, - recent_aae=RecentAAE, is_snapshot=false, head_only=HeadOnly, head_lookup = HeadLookup}, @@ -1137,7 +1172,7 @@ handle_call({get, Bucket, Key, Tag}, _From, State) not_found; Head -> {Seqn, Status, _MH, _MD} = - leveled_codec:striphead_to_details(Head), + leveled_codec:striphead_to_v1details(Head), case Status of tomb -> not_found; @@ -1186,7 +1221,7 @@ handle_call({head, Bucket, Key, Tag}, _From, State) not_present -> {not_found, State#state.ink_checking}; Head -> - case leveled_codec:striphead_to_details(Head) of + case leveled_codec:striphead_to_v1details(Head) of {_SeqN, tomb, _MH, _MD} -> {not_found, State#state.ink_checking}; {SeqN, {active, TS}, _MH, MD} -> @@ -1574,12 +1609,14 @@ get_runner(State, {keylist, Tag, Bucket, KeyRange, FoldAccT, TermRegex}) -> get_runner(State, {foldheads_allkeys, Tag, FoldFun, - JournalCheck, SnapPreFold, SegmentList}) -> + JournalCheck, SnapPreFold, SegmentList, + LastModRange, MaxObjectCount}) -> SnapType = snaptype_by_presence(JournalCheck), SnapFun = return_snapfun(State, SnapType, no_lookup, true, SnapPreFold), leveled_runner:foldheads_allkeys(SnapFun, Tag, FoldFun, - JournalCheck, SegmentList); + JournalCheck, SegmentList, + LastModRange, MaxObjectCount); get_runner(State, {foldobjects_allkeys, Tag, FoldFun, SnapPreFold}) -> get_runner(State, @@ -1597,7 +1634,8 @@ get_runner(State, Tag, BucketList, bucket_list, FoldFun, - JournalCheck, SnapPreFold, SegmentList}) -> + JournalCheck, SnapPreFold, + SegmentList, LastModRange, MaxObjectCount}) -> KeyRangeFun = fun(Bucket) -> {StartKey, EndKey, _} = return_ledger_keyrange(Tag, Bucket, all), @@ -1609,13 +1647,16 @@ get_runner(State, Tag, lists:map(KeyRangeFun, BucketList), FoldFun, - JournalCheck, SegmentList); + JournalCheck, + SegmentList, + LastModRange, MaxObjectCount); get_runner(State, {foldheads_bybucket, Tag, Bucket, KeyRange, FoldFun, - JournalCheck, SnapPreFold, SegmentList}) -> + JournalCheck, SnapPreFold, + SegmentList, LastModRange, MaxObjectCount}) -> {StartKey, EndKey, SnapQ} = return_ledger_keyrange(Tag, Bucket, KeyRange), SnapType = snaptype_by_presence(JournalCheck), SnapFun = return_snapfun(State, SnapType, SnapQ, true, SnapPreFold), @@ -1623,7 +1664,9 @@ get_runner(State, Tag, [{StartKey, EndKey}], FoldFun, - JournalCheck, SegmentList); + JournalCheck, + SegmentList, + LastModRange, MaxObjectCount); get_runner(State, {foldobjects_bybucket, Tag, Bucket, KeyRange, @@ -1926,14 +1969,12 @@ preparefor_ledgercache(?INKT_KEYD, {no_lookup, SQN, KeyChanges}; preparefor_ledgercache(_InkTag, LedgerKey, SQN, Obj, Size, {IdxSpecs, TTL}, - State) -> - {Bucket, Key, MetaValue, {KeyH, ObjH}, LastMods} = + _State) -> + {Bucket, Key, MetaValue, {KeyH, _ObjH}, _LastMods} = leveled_codec:generate_ledgerkv(LedgerKey, SQN, Obj, Size, TTL), KeyChanges = [{LedgerKey, MetaValue}] ++ - leveled_codec:idx_indexspecs(IdxSpecs, Bucket, Key, SQN, TTL) ++ - leveled_codec:aae_indexspecs(State#state.recent_aae, - 
Bucket, Key, SQN, ObjH, LastMods), + leveled_codec:idx_indexspecs(IdxSpecs, Bucket, Key, SQN, TTL), {KeyH, SQN, KeyChanges}. @@ -2449,7 +2490,7 @@ foldobjects_vs_hashtree_testto() -> {foldheads_allkeys, ?STD_TAG, FoldHeadsFun, - true, true, false}), + true, true, false, false, false}), KeyHashList3 = HTFolder3(), ?assertMatch(KeyHashList1, lists:usort(KeyHashList3)), @@ -2468,7 +2509,7 @@ foldobjects_vs_hashtree_testto() -> {foldheads_allkeys, ?STD_TAG, FoldHeadsFun2, - false, false, false}), + false, false, false, false, false}), KeyHashList4 = HTFolder4(), ?assertMatch(KeyHashList1, lists:usort(KeyHashList4)), @@ -2544,7 +2585,8 @@ folder_cache_test(CacheSize) -> "BucketA", all, FoldHeadsFun, - true, true, false}), + true, true, + false, false, false}), KeyHashList2A = HTFolder2A(), {async, HTFolder2B} = book_returnfolder(Bookie1, @@ -2553,7 +2595,8 @@ folder_cache_test(CacheSize) -> "BucketB", all, FoldHeadsFun, - true, false, false}), + true, false, + false, false, false}), KeyHashList2B = HTFolder2B(), ?assertMatch(true, @@ -2568,7 +2611,8 @@ folder_cache_test(CacheSize) -> "BucketB", {"Key", <<"$all">>}, FoldHeadsFun, - true, false, false}), + true, false, + false, false, false}), KeyHashList2C = HTFolder2C(), {async, HTFolder2D} = book_returnfolder(Bookie1, @@ -2577,7 +2621,8 @@ folder_cache_test(CacheSize) -> "BucketB", {"Key", "Keyzzzzz"}, FoldHeadsFun, - true, true, false}), + true, true, + false, false, false}), KeyHashList2D = HTFolder2D(), ?assertMatch(true, lists:usort(KeyHashList2B) == lists:usort(KeyHashList2C)), @@ -2597,7 +2642,8 @@ folder_cache_test(CacheSize) -> "BucketB", {"Key", SplitIntEnd}, FoldHeadsFun, - true, false, false}), + true, false, + false, false, false}), KeyHashList2E = HTFolder2E(), {async, HTFolder2F} = book_returnfolder(Bookie1, @@ -2606,7 +2652,8 @@ folder_cache_test(CacheSize) -> "BucketB", {SplitIntStart, "Key|"}, FoldHeadsFun, - true, false, false}), + true, false, + false, false, false}), KeyHashList2F = HTFolder2F(), ?assertMatch(true, length(KeyHashList2E) > 0), diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl index 92c677b..71bc83d 100644 --- a/src/leveled_codec.erl +++ b/src/leveled_codec.erl @@ -37,8 +37,8 @@ strip_to_seqonly/1, strip_to_statusonly/1, strip_to_keyseqonly/1, - strip_to_seqnhashonly/1, - striphead_to_details/1, + strip_to_indexdetails/1, + striphead_to_v1details/1, is_active/3, endkey_passed/2, key_dominates/2, @@ -63,7 +63,6 @@ get_keyandobjhash/2, idx_indexspecs/5, obj_objectspecs/3, - aae_indexspecs/6, riak_extract_metadata/2, segment_hash/1, to_lookup/1, @@ -74,9 +73,7 @@ -define(MAGIC, 53). % riak_kv -> riak_object -define(LMD_FORMAT, "~4..0w~2..0w~2..0w~2..0w~2..0w"). -define(NRT_IDX, "$aae."). --define(ALL_BUCKETS, <<"$all">>). --type recent_aae() :: #recent_aae{}. -type riak_metadata() :: {binary()|delete, % Sibling Metadata binary()|null, % Vclock Metadata integer()|null, % Hash of vclock - non-exportable @@ -84,14 +81,35 @@ -type tag() :: ?STD_TAG|?RIAK_TAG|?IDX_TAG|?HEAD_TAG. +-type key() :: + binary()|string()|{binary(), binary()}. + % Keys SHOULD be binary() + % string() support is a legacy of old tests +-type sqn() :: + % SQN of the object in the Journal + pos_integer(). -type segment_hash() :: + % hash of the key to an aae segment - to be used in ledger filters {integer(), integer()}|no_lookup. +-type metadata() :: + tuple()|null. 
% null for empty metadata
+-type last_moddate() ::
+    % modified date as determined by the object (not by this store); if the
+    % object has siblings in the store, this will be the maximum of those
+    % dates
+    integer()|undefined.
+-type lastmod_range() :: {integer(), pos_integer()|infinity}.
+
 -type ledger_status()
        :: tomb|{active, non_neg_integer()|infinity}.
 -type ledger_key()
        :: {tag(), any(), any(), any()}|all.
--type ledger_value() ::
-    {integer(), ledger_status(), segment_hash(), tuple()|null}.
+-type ledger_value() ::
+    ledger_value_v1()|ledger_value_v2().
+-type ledger_value_v1() ::
+    {sqn(), ledger_status(), segment_hash(), metadata()}.
+-type ledger_value_v2() ::
+    {sqn(), ledger_status(), segment_hash(), metadata(), last_moddate()}.
 -type ledger_kv() ::
     {ledger_key(), ledger_value()}.
 -type compaction_strategy() ::
@@ -100,18 +118,29 @@
     ?INKT_STND|?INKT_TOMB|?INKT_MPUT|?INKT_KEYD.
 -type journal_key() ::
     {integer(), journal_key_tag(), ledger_key()}.
+-type object_spec_v0() ::
+    {add|remove, key(), key(), key()|null, any()}.
+-type object_spec_v1() ::
+    {add|remove, v1, key(), key(), key()|null,
+        list(erlang:timestamp())|undefined, any()}.
+-type object_spec() ::
+    object_spec_v0()|object_spec_v1().
 -type compression_method() ::
     lz4|native.
 -type index_specs() ::
     list({add|remove, any(), any()}).
 -type journal_keychanges() ::
     {index_specs(), infinity|integer()}. % {KeyChanges, TTL}
+-type maybe_lookup() ::
+    lookup|no_lookup.
 -type segment_list()
     :: list(integer())|false.
 -export_type([tag/0,
+              key/0,
+              object_spec/0,
               segment_hash/0,
               ledger_status/0,
               ledger_key/0,
@@ -123,7 +152,10 @@
               compression_method/0,
               journal_keychanges/0,
               index_specs/0,
-              segment_list/0]).
+              segment_list/0,
+              maybe_lookup/0,
+              last_moddate/0,
+              lastmod_range/0]).
 %%%============================================================================
@@ -152,7 +184,7 @@
 segment_hash(Key) ->
     segment_hash(term_to_binary(Key)).
--spec to_lookup(ledger_key()) -> lookup|no_lookup.
+-spec to_lookup(ledger_key()) -> maybe_lookup().
 %% @doc
 %% Should it be possible to lookup a key in the merge tree. This is not true
 %% For keys that should only be read through range queries. Direct lookup
@@ -170,36 +202,41 @@ to_lookup(Key) ->
 %% Some helper functions to get sub-components of the key/value
 -spec strip_to_statusonly(ledger_kv()) -> ledger_status().
-strip_to_statusonly({_, {_, St, _, _}}) -> St.
+strip_to_statusonly({_, V}) -> element(2, V).
 -spec strip_to_seqonly(ledger_kv()) -> non_neg_integer().
-strip_to_seqonly({_, {SeqN, _, _, _}}) -> SeqN.
+strip_to_seqonly({_, V}) -> element(1, V).
 -spec strip_to_keyseqonly(ledger_kv()) -> {ledger_key(), integer()}.
-strip_to_keyseqonly({LK, {SeqN, _, _, _}}) -> {LK, SeqN}.
+strip_to_keyseqonly({LK, V}) -> {LK, element(1, V)}.
--spec strip_to_seqnhashonly(ledger_kv()) -> {integer(), segment_hash()}.
-strip_to_seqnhashonly({_, {SeqN, _, MH, _}}) -> {SeqN, MH}.
+-spec strip_to_indexdetails(ledger_kv()) ->
+                                {integer(), segment_hash(), last_moddate()}.
+strip_to_indexdetails({_, V}) when tuple_size(V) == 4 ->
+    % A v1 value
+    {element(1, V), element(3, V), undefined};
+strip_to_indexdetails({_, V}) when tuple_size(V) > 4 ->
+    % A v2 value should have a fifth element - the Last Modified Date
+    {element(1, V), element(3, V), element(5, V)}.
--spec striphead_to_details(ledger_value()) -> ledger_value().
-striphead_to_details({SeqN, St, MH, MD}) -> {SeqN, St, MH, MD}.
+-spec striphead_to_v1details(ledger_value()) -> ledger_value().
+striphead_to_v1details(V) ->
+    {element(1, V), element(2, V), element(3, V), element(4, V)}.
 -spec key_dominates(ledger_kv(), ledger_kv()) ->
     left_hand_first|right_hand_first|left_hand_dominant|right_hand_dominant.
 %% @doc
 %% When comparing two keys in the ledger need to find if one key comes before
 %% the other, or if they match, which key is "better" and should be the winner
-key_dominates(LeftKey, RightKey) ->
-    case {LeftKey, RightKey} of
-        {{LK, _LVAL}, {RK, _RVAL}} when LK < RK ->
-            left_hand_first;
-        {{LK, _LVAL}, {RK, _RVAL}} when RK < LK ->
-            right_hand_first;
-        {{LK, {LSN, _LST, _LMH, _LMD}}, {RK, {RSN, _RST, _RMH, _RMD}}}
-                                when LK == RK, LSN >= RSN ->
+key_dominates({LK, _LVAL}, {RK, _RVAL}) when LK < RK ->
+    left_hand_first;
+key_dominates({LK, _LVAL}, {RK, _RVAL}) when RK < LK ->
+    right_hand_first;
+key_dominates(LObj, RObj) ->
+    case strip_to_seqonly(LObj) >= strip_to_seqonly(RObj) of
+        true ->
             left_hand_dominant;
-        {{LK, {LSN, _LST, _LMH, _LMD}}, {RK, {RSN, _RST, _RMH, _RMD}}}
-                                when LK == RK, LSN < RSN ->
+        false ->
             right_hand_dominant
     end.
@@ -249,8 +286,6 @@ from_ledgerkey(_ExpectedTag, _OtherKey) ->
 -spec from_ledgerkey(tuple()) -> tuple().
 %% @doc
 %% Return identifying information from the LedgerKey
-from_ledgerkey({?IDX_TAG, ?ALL_BUCKETS, {_IdxFld, IdxVal}, {Bucket, Key}}) ->
-    {Bucket, Key, IdxVal};
 from_ledgerkey({?IDX_TAG, Bucket, {_IdxFld, IdxVal}, Key}) ->
     {Bucket, Key, IdxVal};
 from_ledgerkey({?HEAD_TAG, Bucket, Key, SubKey}) ->
@@ -528,9 +563,7 @@ hash(Obj) ->
 %% @doc
 %% Convert object specs to KV entries ready for the ledger
 obj_objectspecs(ObjectSpecs, SQN, TTL) ->
-    lists:map(fun({IdxOp, Bucket, Key, SubKey, Value}) ->
-                    gen_headspec(Bucket, Key, IdxOp, SubKey, Value, SQN, TTL)
-                end,
+    lists:map(fun(ObjectSpec) -> gen_headspec(ObjectSpec, SQN, TTL) end,
               ObjectSpecs).
 -spec idx_indexspecs(index_specs(),
@@ -548,27 +581,25 @@ idx_indexspecs(IndexSpecs, Bucket, Key, SQN, TTL) ->
 gen_indexspec(Bucket, Key, IdxOp, IdxField, IdxTerm, SQN, TTL) ->
     Status = set_status(IdxOp, TTL),
-    case Bucket of
-        {all, RealBucket} ->
-            {to_ledgerkey(?ALL_BUCKETS,
-                            {RealBucket, Key},
-                            ?IDX_TAG,
-                            IdxField,
-                            IdxTerm),
-                {SQN, Status, no_lookup, null}};
-        _ ->
-            {to_ledgerkey(Bucket,
-                            Key,
-                            ?IDX_TAG,
-                            IdxField,
-                            IdxTerm),
-                {SQN, Status, no_lookup, null}}
-    end.
+    {to_ledgerkey(Bucket, Key, ?IDX_TAG, IdxField, IdxTerm),
+        {SQN, Status, no_lookup, null}}.
-gen_headspec(Bucket, Key, IdxOp, SubKey, Value, SQN, TTL) ->
+-spec gen_headspec(object_spec(), integer(), integer()|infinity) -> ledger_kv().
+%% @doc
+%% Take an object_spec as passed in a book_mput, and convert it into a
+%% valid ledger key and value. Supports different shaped tuples for different
+%% versions of the object_spec
+gen_headspec({IdxOp, v1, Bucket, Key, SubKey, LMD, Value}, SQN, TTL) ->
+    % v1 object spec
     Status = set_status(IdxOp, TTL),
     K = to_ledgerkey(Bucket, {Key, SubKey}, ?HEAD_TAG),
-    {K, {SQN, Status, segment_hash(K), Value}}.
+    {K, {SQN, Status, segment_hash(K), Value, get_last_lastmodification(LMD)}};
+gen_headspec({IdxOp, Bucket, Key, SubKey, Value}, SQN, TTL) ->
+    % v0 object spec
+    Status = set_status(IdxOp, TTL),
+    K = to_ledgerkey(Bucket, {Key, SubKey}, ?HEAD_TAG),
+    {K, {SQN, Status, segment_hash(K), Value, undefined}}.
 set_status(add, TTL) ->
@@ -577,103 +608,6 @@ set_status(remove, _TTL) ->
     %% TODO: timestamps for delayed reaping
     tomb.
--spec aae_indexspecs(false|recent_aae(),
-                        any(), any(),
-                        integer(), integer(),
-                        list())
-                        -> list().
-%% @doc -%% Generate an additional index term representing the change, if the last -%% modified date for the change is within the definition of recency. -%% -%% The object may have multiple last modified dates (siblings), and in this -%% case index entries for all dates within the range are added. -%% -%% The index should entry auto-expire in the future (when it is no longer -%% relevant to assessing recent changes) -aae_indexspecs(false, _Bucket, _Key, _SQN, _H, _LastMods) -> - []; -aae_indexspecs(_AAE, _Bucket, _Key, _SQN, _H, []) -> - []; -aae_indexspecs(AAE, Bucket, Key, SQN, H, LastMods) -> - InList = lists:member(Bucket, AAE#recent_aae.buckets), - Bucket0 = - case AAE#recent_aae.filter of - blacklist -> - case InList of - true -> - false; - false -> - {all, Bucket} - end; - whitelist -> - case InList of - true -> - Bucket; - false -> - false - end - end, - case Bucket0 of - false -> - []; - Bucket0 -> - GenIdxFun = - fun(LMD0, Acc) -> - Dates = parse_date(LMD0, - AAE#recent_aae.unit_minutes, - AAE#recent_aae.limit_minutes, - leveled_util:integer_now()), - case Dates of - no_index -> - Acc; - {LMD1, TTL} -> - TreeSize = AAE#recent_aae.tree_size, - SegID32 = leveled_tictac:keyto_segment32(Key), - SegID = - leveled_tictac:get_segment(SegID32, TreeSize), - IdxFldStr = ?NRT_IDX ++ LMD1 ++ "_bin", - IdxTrmStr = - string:right(integer_to_list(SegID), 8, $0) ++ - "." ++ - string:right(integer_to_list(H), 8, $0), - {IdxK, IdxV} = - gen_indexspec(Bucket0, Key, - add, - list_to_binary(IdxFldStr), - list_to_binary(IdxTrmStr), - SQN, TTL), - [{IdxK, IdxV}|Acc] - end - end, - lists:foldl(GenIdxFun, [], LastMods) - end. - --spec parse_date(tuple(), integer(), integer(), integer()) -> - no_index|{list(), integer()}. -%% @doc -%% Parse the last modified date and the AAE date configuration to return a -%% binary to be used as the last modified date part of the index, and an -%% integer to be used as the TTL of the index entry. -%% Return no_index if the change is not recent. -parse_date(LMD, UnitMins, LimitMins, Now) -> - LMDsecs = leveled_util:integer_time(LMD), - Recent = (LMDsecs + LimitMins * 60) > Now, - case Recent of - false -> - no_index; - true -> - {{Y, M, D}, {Hour, Minute, _Second}} = - calendar:now_to_datetime(LMD), - RoundMins = - UnitMins * (Minute div UnitMins), - StrTime = - lists:flatten(io_lib:format(?LMD_FORMAT, - [Y, M, D, Hour, RoundMins])), - TTL = min(Now, LMDsecs) + (LimitMins + UnitMins) * 60, - {StrTime, TTL} - end. - -spec generate_ledgerkv( tuple(), integer(), any(), integer(), tuple()|infinity) -> {any(), any(), any(), @@ -705,9 +639,23 @@ generate_ledgerkv(PrimaryKey, SQN, Obj, Size, TS) -> Value = {SQN, Status, Hash, - MD}, + MD, + get_last_lastmodification(LastMods)}, {Bucket, Key, Value, {Hash, ObjHash}, LastMods}. +-spec get_last_lastmodification(list(erlang:timestamp())|undefined) + -> pos_integer()|undefined. +%% @doc +%% Get the highest of the last modifications measured in seconds. This will be +%% stored as 4 bytes (unsigned) so will last for another 80 + years +get_last_lastmodification(undefined) -> + undefined; +get_last_lastmodification([]) -> + undefined; +get_last_lastmodification(LastMods) -> + {Mega, Sec, _Micro} = lists:max(LastMods), + Mega * 1000000 + Sec. 
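An illustrative worked example of the conversion above (not part of the change set): erlang:timestamp() triples are {MegaSecs, Secs, MicroSecs}, and the highest last-modified date collapses to whole seconds, so

    1548316330 = get_last_lastmodification([{1548, 316211, 201}, {1548, 316330, 76}]),
    undefined = get_last_lastmodification([]),
    undefined = get_last_lastmodification(undefined).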
+ extract_metadata(Obj, Size, ?RIAK_TAG) -> riak_extract_metadata(Obj, Size); @@ -716,7 +664,7 @@ extract_metadata(Obj, Size, ?STD_TAG) -> get_size(PK, Value) -> {Tag, _Bucket, _Key, _} = PK, - {_, _, _, MD} = Value, + MD = element(4, Value), case Tag of ?RIAK_TAG -> {_RMD, _VC, _Hash, Size} = MD, @@ -733,7 +681,7 @@ get_size(PK, Value) -> %% the sorted vclock) get_keyandobjhash(LK, Value) -> {Tag, Bucket, Key, _} = LK, - {_, _, _, MD} = Value, + MD = element(4, Value), case Tag of ?IDX_TAG -> from_ledgerkey(LK); % returns {Bucket, Key, IdxValue} @@ -927,95 +875,6 @@ hashperf_test() -> io:format(user, "1000 object hashes in ~w microseconds~n", [timer:now_diff(os:timestamp(), SW)]). -parsedate_test() -> - {MeS, S, MiS} = os:timestamp(), - timer:sleep(100), - Now = leveled_util:integer_now(), - UnitMins = 5, - LimitMins = 60, - PD = parse_date({MeS, S, MiS}, UnitMins, LimitMins, Now), - io:format("Parsed Date ~w~n", [PD]), - ?assertMatch(true, is_tuple(PD)), - check_pd(PD, UnitMins), - CheckFun = - fun(Offset) -> - ModDate = {MeS, S + Offset * 60, MiS}, - check_pd(parse_date(ModDate, UnitMins, LimitMins, Now), UnitMins) - end, - lists:foreach(CheckFun, lists:seq(1, 60)). - -check_pd(PD, UnitMins) -> - {LMDstr, _TTL} = PD, - Minutes = list_to_integer(lists:nthtail(10, LMDstr)), - ?assertMatch(0, Minutes rem UnitMins). - -parseolddate_test() -> - LMD = os:timestamp(), - timer:sleep(100), - Now = leveled_util:integer_now() + 60 * 60, - UnitMins = 5, - LimitMins = 60, - PD = parse_date(LMD, UnitMins, LimitMins, Now), - io:format("Parsed Date ~w~n", [PD]), - ?assertMatch(no_index, PD). - -genaaeidx_test() -> - AAE = #recent_aae{filter=blacklist, - buckets=[], - limit_minutes=60, - unit_minutes=5}, - Bucket = <<"Bucket1">>, - Key = <<"Key1">>, - SQN = 1, - H = erlang:phash2(null), - LastMods = [os:timestamp(), os:timestamp()], - - AAESpecs = aae_indexspecs(AAE, Bucket, Key, SQN, H, LastMods), - ?assertMatch(2, length(AAESpecs)), - - LastMods1 = [os:timestamp()], - AAESpecs1 = aae_indexspecs(AAE, Bucket, Key, SQN, H, LastMods1), - ?assertMatch(1, length(AAESpecs1)), - IdxB = element(2, element(1, lists:nth(1, AAESpecs1))), - io:format(user, "AAE IDXSpecs1 ~w~n", [AAESpecs1]), - ?assertMatch(<<"$all">>, IdxB), - - LastMods0 = [], - AAESpecs0 = aae_indexspecs(AAE, Bucket, Key, SQN, H, LastMods0), - ?assertMatch(0, length(AAESpecs0)), - - AAE0 = AAE#recent_aae{filter=whitelist, - buckets=[<<"Bucket0">>]}, - AAESpecsB0 = aae_indexspecs(AAE0, Bucket, Key, SQN, H, LastMods1), - ?assertMatch(0, length(AAESpecsB0)), - - AAESpecsB1 = aae_indexspecs(AAE0, <<"Bucket0">>, Key, SQN, H, LastMods1), - ?assertMatch(1, length(AAESpecsB1)), - [{{?IDX_TAG, <<"Bucket0">>, {Fld, Term}, <<"Key1">>}, - {SQN, {active, TS}, no_lookup, null}}] = AAESpecsB1, - ?assertMatch(true, is_integer(TS)), - ?assertMatch(17, length(binary_to_list(Term))), - ?assertMatch("$aae.", lists:sublist(binary_to_list(Fld), 5)), - - AAE1 = AAE#recent_aae{filter=blacklist, - buckets=[<<"Bucket0">>]}, - AAESpecsB2 = aae_indexspecs(AAE1, <<"Bucket0">>, Key, SQN, H, LastMods1), - ?assertMatch(0, length(AAESpecsB2)). - -delayedupdate_aaeidx_test() -> - AAE = #recent_aae{filter=blacklist, - buckets=[], - limit_minutes=60, - unit_minutes=5}, - Bucket = <<"Bucket1">>, - Key = <<"Key1">>, - SQN = 1, - H = erlang:phash2(null), - {Mega, Sec, MSec} = os:timestamp(), - LastMods = [{Mega -1, Sec, MSec}], - AAESpecs = aae_indexspecs(AAE, Bucket, Key, SQN, H, LastMods), - ?assertMatch(0, length(AAESpecs)). 
- head_segment_compare_test() -> % Reminder to align native and parallel(leveled_ko) key stores for % kv_index_tictactree @@ -1025,4 +884,13 @@ head_segment_compare_test() -> ?assertMatch(H1, H2), ?assertMatch(H1, H3). +headspec_v0v1_test() -> + % A v0 object spec generates the same outcome as a v1 object spec with the + % last modified date undefined + V1 = {add, v1, <<"B">>, <<"K">>, <<"SK">>, undefined, <<"V">>}, + V0 = {add, <<"B">>, <<"K">>, <<"SK">>, <<"V">>}, + TTL = infinity, + ?assertMatch(true, gen_headspec(V0, 1, TTL) == gen_headspec(V1, 1, TTL)). + + -endif. diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl index 4692bfc..63c9fdc 100644 --- a/src/leveled_penciller.erl +++ b/src/leveled_penciller.erl @@ -177,7 +177,7 @@ pcl_fetch/4, pcl_fetchkeys/5, pcl_fetchkeys/6, - pcl_fetchkeysbysegment/6, + pcl_fetchkeysbysegment/8, pcl_fetchnextkey/5, pcl_checksequencenumber/3, pcl_workforclerk/1, @@ -237,6 +237,7 @@ -define(SNAPSHOT_TIMEOUT_SHORT, 600). -define(TIMING_SAMPLECOUNTDOWN, 10000). -define(TIMING_SAMPLESIZE, 100). +-define(OPEN_LASTMOD_RANGE, {0, infinity}). -record(state, {manifest, % a manifest record from the leveled_manifest module persisted_sqn = 0 :: integer(), % The highest SQN persisted @@ -248,7 +249,7 @@ levelzero_pending = false :: boolean(), levelzero_constructor :: pid() | undefined, - levelzero_cache = [] :: list(), % a list of trees + levelzero_cache = [] :: levelzero_cache(), levelzero_size = 0 :: integer(), levelzero_maxcachesize :: integer() | undefined, levelzero_cointoss = false :: boolean(), @@ -293,6 +294,12 @@ integer()}. -type pcl_state() :: #state{}. -type pcl_timings() :: no_timing|#pcl_timings{}. +-type levelzero_cacheentry() :: {pos_integer(), levled_tree:leveled_tree()}. +-type levelzero_cache() :: list(levelzero_cacheentry()). +-type iterator_entry() + :: {pos_integer(), + list(leveled_codec:ledger_kv()|leveled_sst:expandable_pointer())}. +-type iterator() :: list(iterator_entry()). %%%============================================================================ %%% API @@ -405,7 +412,7 @@ pcl_fetchkeys(Pid, StartKey, EndKey, AccFun, InitAcc, By) -> {fetch_keys, StartKey, EndKey, AccFun, InitAcc, - false, -1, + false, false, -1, By}, infinity). @@ -414,7 +421,9 @@ pcl_fetchkeys(Pid, StartKey, EndKey, AccFun, InitAcc, By) -> leveled_codec:ledger_key(), leveled_codec:ledger_key(), fun(), any(), - leveled_codec:segment_list()) -> any(). + leveled_codec:segment_list(), + false | leveled_codec:lastmod_range(), + boolean()) -> any(). %% @doc %% Run a range query between StartKey and EndKey (inclusive). This will cover %% all keys in the range - so must only be run against snapshots of the @@ -427,12 +436,21 @@ pcl_fetchkeys(Pid, StartKey, EndKey, AccFun, InitAcc, By) -> %% Note that segment must be false unless the object Tag supports additional %% indexing by segment. This cannot be used on ?IDX_TAG and other tags that %% use the no_lookup hash -pcl_fetchkeysbysegment(Pid, StartKey, EndKey, AccFun, InitAcc, SegmentList) -> +pcl_fetchkeysbysegment(Pid, StartKey, EndKey, AccFun, InitAcc, + SegmentList, LastModRange, LimitByCount) -> + {MaxKeys, InitAcc0} = + case LimitByCount of + true -> + % The passed in accumulator should have the Max Key Count + % as the first element of a tuple with the actual accumulator + InitAcc; + false -> + {-1, InitAcc} + end, gen_server:call(Pid, {fetch_keys, - StartKey, EndKey, - AccFun, InitAcc, - SegmentList, -1, + StartKey, EndKey, AccFun, InitAcc0, + SegmentList, LastModRange, MaxKeys, as_pcl}, infinity). 
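A count-limited range query sketched against the new arity (illustrative only, not part of the change set; PclSnap, StartKey and EndKey are assumed to exist). With LimitByCount set to true, the accumulator passed in must be the {MaxKeyCount, Acc0} pair that is unwrapped above, and the reply carries the remaining count:

    AccFun = fun(K, _V, Acc) -> [K|Acc] end,
    {RemainingCount, Keys} =
        leveled_penciller:pcl_fetchkeysbysegment(PclSnap,
                                                    StartKey, EndKey,
                                                    AccFun,
                                                    {128, []},     % {MaxKeyCount, Acc0}
                                                    false,         % SegmentList
                                                    {0, infinity}, % LastModRange
                                                    true).         % LimitByCount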
@@ -449,7 +467,7 @@ pcl_fetchnextkey(Pid, StartKey, EndKey, AccFun, InitAcc) -> {fetch_keys, StartKey, EndKey, AccFun, InitAcc, - false, 1, + false, false, 1, as_pcl}, infinity). @@ -684,10 +702,17 @@ handle_call({check_sqn, Key, Hash, SQN}, _From, State) -> handle_call({fetch_keys, StartKey, EndKey, AccFun, InitAcc, - SegmentList, MaxKeys, By}, + SegmentList, LastModRange, MaxKeys, By}, _From, State=#state{snapshot_fully_loaded=Ready}) when Ready == true -> + LastModRange0 = + case LastModRange of + false -> + ?OPEN_LASTMOD_RANGE; + R -> + R + end, SW = os:timestamp(), L0AsList = case State#state.levelzero_astree of @@ -720,7 +745,7 @@ handle_call({fetch_keys, keyfolder({L0AsList, SSTiter}, {StartKey, EndKey}, {AccFun, InitAcc}, - {SegmentList, MaxKeys}) + {SegmentList, LastModRange0, MaxKeys}) end, case By of as_pcl -> @@ -1375,27 +1400,40 @@ keyfolder(IMMiter, SSTiter, StartKey, EndKey, {AccFun, Acc}) -> keyfolder({IMMiter, SSTiter}, {StartKey, EndKey}, {AccFun, Acc}, - {false, -1}). + {false, {0, infinity}, -1}). -keyfolder(_Iterators, _KeyRange, {_AccFun, Acc}, {_SegmentList, MaxKeys}) - when MaxKeys == 0 -> - Acc; -keyfolder({[], SSTiter}, KeyRange, {AccFun, Acc}, {SegmentList, MaxKeys}) -> +keyfolder(_Iterators, _KeyRange, {_AccFun, Acc}, + {_SegmentList, _LastModRange, MaxKeys}) when MaxKeys == 0 -> + {0, Acc}; +keyfolder({[], SSTiter}, KeyRange, {AccFun, Acc}, + {SegmentList, LastModRange, MaxKeys}) -> {StartKey, EndKey} = KeyRange, - case find_nextkey(SSTiter, StartKey, EndKey, SegmentList) of + case find_nextkey(SSTiter, StartKey, EndKey, + SegmentList, element(1, LastModRange)) of no_more_keys -> - Acc; + case MaxKeys > 0 of + true -> + % This query had a max count, so we must respond with the + % remainder on the count + {MaxKeys, Acc}; + false -> + % This query started with a MaxKeys set to -1. 
The query is
+                    % not interested in having MaxKeys in the response
+                    Acc
+            end;
         {NxSSTiter, {SSTKey, SSTVal}} ->
-            Acc1 = AccFun(SSTKey, SSTVal, Acc),
+            {Acc1, MK1} =
+                maybe_accumulate(SSTKey, SSTVal, Acc, AccFun,
+                                    MaxKeys, LastModRange),
             keyfolder({[], NxSSTiter},
                         KeyRange,
                         {AccFun, Acc1},
-                        {SegmentList, MaxKeys - 1})
+                        {SegmentList, LastModRange, MK1})
     end;
 keyfolder({[{IMMKey, IMMVal}|NxIMMiterator], SSTiterator},
             KeyRange,
             {AccFun, Acc},
-            {SegmentList, MaxKeys}) ->
+            {SegmentList, LastModRange, MaxKeys}) ->
     {StartKey, EndKey} = KeyRange,
     case {IMMKey < StartKey, leveled_codec:endkey_passed(EndKey, IMMKey)} of
         {false, true} ->
@@ -1405,18 +1443,21 @@ keyfolder({[{IMMKey, IMMVal}|NxIMMiterator], SSTiterator},
             keyfolder({[], SSTiterator},
                         KeyRange,
                         {AccFun, Acc},
-                        {SegmentList, MaxKeys});
+                        {SegmentList, LastModRange, MaxKeys});
         {false, false} ->
-            case find_nextkey(SSTiterator, StartKey, EndKey, SegmentList) of
+            case find_nextkey(SSTiterator, StartKey, EndKey,
+                                SegmentList, element(1, LastModRange)) of
                 no_more_keys ->
                     % No more keys in range in the persisted store, so use the
                     % in-memory KV as the next
-                    Acc1 = AccFun(IMMKey, IMMVal, Acc),
+                    {Acc1, MK1} =
+                        maybe_accumulate(IMMKey, IMMVal, Acc, AccFun,
+                                            MaxKeys, LastModRange),
                     keyfolder({NxIMMiterator, []},
                                 KeyRange,
                                 {AccFun, Acc1},
-                                {SegmentList, MaxKeys - 1});
+                                {SegmentList, LastModRange, MK1});
                 {NxSSTiterator, {SSTKey, SSTVal}} ->
                     % There is a next key, so need to know which is the
                     % next key between the two (and handle two keys
@@ -1426,7 +1467,9 @@ keyfolder({[{IMMKey, IMMVal}|NxIMMiterator], SSTiterator},
                                             {SSTKey,
                                                 SSTVal}) of
                         left_hand_first ->
-                            Acc1 = AccFun(IMMKey, IMMVal, Acc),
+                            {Acc1, MK1} =
+                                maybe_accumulate(IMMKey, IMMVal, Acc, AccFun,
+                                                    MaxKeys, LastModRange),
                             % Stow the previous best result away at Level -1
                             % so that there is no need to iterate to it again
                             NewEntry = {-1, [{SSTKey, SSTVal}]},
@@ -1437,16 +1480,20 @@
                                             NewEntry)},
                                         KeyRange,
                                         {AccFun, Acc1},
-                                        {SegmentList, MaxKeys - 1});
+                                        {SegmentList, LastModRange, MK1});
                         right_hand_first ->
-                            Acc1 = AccFun(SSTKey, SSTVal, Acc),
+                            {Acc1, MK1} =
+                                maybe_accumulate(SSTKey, SSTVal, Acc, AccFun,
+                                                    MaxKeys, LastModRange),
                             keyfolder({[{IMMKey, IMMVal}|NxIMMiterator],
                                             NxSSTiterator},
                                         KeyRange,
                                         {AccFun, Acc1},
-                                        {SegmentList, MaxKeys - 1});
+                                        {SegmentList, LastModRange, MK1});
                         left_hand_dominant ->
-                            Acc1 = AccFun(IMMKey, IMMVal, Acc),
+                            {Acc1, MK1} =
+                                maybe_accumulate(IMMKey, IMMVal, Acc, AccFun,
+                                                    MaxKeys, LastModRange),
                             % We can add to the accumulator here.  As the SST
                             % key was the most dominant across all SST levels,
                             % so there is no need to hold off until the IMMKey
@@ -1455,30 +1502,55 @@
                                             NxSSTiterator},
                                         KeyRange,
                                         {AccFun, Acc1},
-                                        {SegmentList, MaxKeys - 1})
+                                        {SegmentList, LastModRange, MK1})
                     end
             end
     end.
+-spec maybe_accumulate(leveled_codec:ledger_key(),
+                        leveled_codec:ledger_value(),
+                        any(), fun(), integer(),
+                        {non_neg_integer(), non_neg_integer()|infinity}) ->
+                            any().
+%% @doc
+%% Make an accumulation decision based on the date range
+maybe_accumulate(LK, LV, Acc, AccFun, MaxKeys, {LowLastMod, HighLastMod}) ->
+    {_SQN, _SH, LMD} = leveled_codec:strip_to_indexdetails({LK, LV}),
+    RunAcc =
+        (LMD == undefined) or ((LMD >= LowLastMod) and (LMD =< HighLastMod)),
+    case RunAcc of
+        true ->
+            {AccFun(LK, LV, Acc), MaxKeys - 1};
+        false ->
+            {Acc, MaxKeys}
+    end.
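To make the decision rule concrete (illustrative values only, not part of the change set; LK, SH, MD, Acc and AccFun are assumed bindings): v1 ledger values carry no last-modified date and so always accumulate, while v2 values accumulate only when the date sits inside the closed range. With MaxKeys = 10 and LastModRange = {100, 200}:

    %% v1 value (4-tuple, LMD is undefined) - accumulated, count decremented
    {Acc1, 9} = maybe_accumulate(LK, {9, {active, infinity}, SH, MD},
                                    Acc, AccFun, 10, {100, 200}),
    %% v2 value with LMD = 250, outside the range - skipped, count unchanged
    {Acc, 10} = maybe_accumulate(LK, {9, {active, infinity}, SH, MD, 250},
                                    Acc, AccFun, 10, {100, 200}).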
+ + +-spec find_nextkey(iterator(), + leveled_codec:ledger_key(), leveled_codec:ledger_key()) -> + no_more_keys|{iterator(), leveled_codec:ledger_kv()}. +%% @doc %% Looks to find the best choice for the next key across the levels (other %% than in-memory table) %% In finding the best choice, the next key in a given level may be a next %% block or next file pointer which will need to be expanded find_nextkey(QueryArray, StartKey, EndKey) -> - find_nextkey(QueryArray, StartKey, EndKey, false). + find_nextkey(QueryArray, StartKey, EndKey, false, 0). -find_nextkey(QueryArray, StartKey, EndKey, SegmentList) -> +find_nextkey(QueryArray, StartKey, EndKey, SegmentList, LowLastMod) -> find_nextkey(QueryArray, -1, {null, null}, StartKey, EndKey, - SegmentList, ?ITERATOR_SCANWIDTH). + SegmentList, + LowLastMod, + ?ITERATOR_SCANWIDTH). find_nextkey(_QueryArray, LCnt, {null, null}, _StartKey, _EndKey, - _SegList, _Width) when LCnt > ?MAX_LEVELS -> + _SegList, _LowLastMod, _Width) when LCnt > ?MAX_LEVELS -> % The array has been scanned wihtout finding a best key - must be % exhausted - respond to indicate no more keys to be found by the % iterator @@ -1486,7 +1558,7 @@ find_nextkey(_QueryArray, LCnt, find_nextkey(QueryArray, LCnt, {BKL, BestKV}, _StartKey, _EndKey, - _SegList, _Width) when LCnt > ?MAX_LEVELS -> + _SegList, _LowLastMod, _Width) when LCnt > ?MAX_LEVELS -> % All levels have been scanned, so need to remove the best result from % the array, and return that array along with the best key/sqn/status % combination @@ -1495,7 +1567,7 @@ find_nextkey(QueryArray, LCnt, find_nextkey(QueryArray, LCnt, {BestKeyLevel, BestKV}, StartKey, EndKey, - SegList, Width) -> + SegList, LowLastMod, Width) -> % Get the next key at this level {NextKey, RestOfKeys} = case lists:keyfind(LCnt, 1, QueryArray) of @@ -1514,15 +1586,16 @@ find_nextkey(QueryArray, LCnt, LCnt + 1, {BKL, BKV}, StartKey, EndKey, - SegList, Width); + SegList, LowLastMod, Width); {{next, Owner, _SK}, BKL, BKV} -> % The first key at this level is pointer to a file - need to query % the file to expand this level out before proceeding Pointer = {next, Owner, StartKey, EndKey}, - UpdList = leveled_sst:expand_list_by_pointer(Pointer, - RestOfKeys, - Width, - SegList), + UpdList = leveled_sst:sst_expandpointer(Pointer, + RestOfKeys, + Width, + SegList, + LowLastMod), NewEntry = {LCnt, UpdList}, % Need to loop around at this level (LCnt) as we have not yet % examined a real key at this level @@ -1530,15 +1603,16 @@ find_nextkey(QueryArray, LCnt, LCnt, {BKL, BKV}, StartKey, EndKey, - SegList, Width); + SegList, LowLastMod, Width); {{pointer, SSTPid, Slot, PSK, PEK}, BKL, BKV} -> % The first key at this level is pointer within a file - need to % query the file to expand this level out before proceeding Pointer = {pointer, SSTPid, Slot, PSK, PEK}, - UpdList = leveled_sst:expand_list_by_pointer(Pointer, - RestOfKeys, - Width, - SegList), + UpdList = leveled_sst:sst_expandpointer(Pointer, + RestOfKeys, + Width, + SegList, + LowLastMod), NewEntry = {LCnt, UpdList}, % Need to loop around at this level (LCnt) as we have not yet % examined a real key at this level @@ -1546,7 +1620,7 @@ find_nextkey(QueryArray, LCnt, LCnt, {BKL, BKV}, StartKey, EndKey, - SegList, Width); + SegList, LowLastMod, Width); {{Key, Val}, null, null} -> % No best key set - so can assume that this key is the best key, % and check the lower levels @@ -1554,7 +1628,7 @@ find_nextkey(QueryArray, LCnt, LCnt + 1, {LCnt, {Key, Val}}, StartKey, EndKey, - SegList, Width); + SegList, LowLastMod, 
Width); {{Key, Val}, _BKL, {BestKey, _BestVal}} when Key < BestKey -> % There is a real key and a best key to compare, and the real key % at this level is before the best key, and so is now the new best @@ -1564,7 +1638,7 @@ find_nextkey(QueryArray, LCnt, LCnt + 1, {LCnt, {Key, Val}}, StartKey, EndKey, - SegList, Width); + SegList, LowLastMod, Width); {{Key, Val}, BKL, {BestKey, BestVal}} when Key == BestKey -> SQN = leveled_codec:strip_to_seqonly({Key, Val}), BestSQN = leveled_codec:strip_to_seqonly({BestKey, BestVal}), @@ -1579,7 +1653,7 @@ find_nextkey(QueryArray, LCnt, LCnt + 1, {BKL, {BestKey, BestVal}}, StartKey, EndKey, - SegList, Width); + SegList, LowLastMod, Width); SQN > BestSQN -> % There is a real key at the front of this level and it has % a higher SQN than the best key, so we should use this as @@ -1595,7 +1669,7 @@ find_nextkey(QueryArray, LCnt, LCnt + 1, {LCnt, {Key, Val}}, StartKey, EndKey, - SegList, Width) + SegList, LowLastMod, Width) end; {_, BKL, BKV} -> % This is not the best key @@ -1603,7 +1677,7 @@ find_nextkey(QueryArray, LCnt, LCnt + 1, {BKL, BKV}, StartKey, EndKey, - SegList, Width) + SegList, LowLastMod, Width) end. @@ -1964,84 +2038,98 @@ simple_server_test() -> simple_findnextkey_test() -> QueryArray = [ - {2, [{{o, "Bucket1", "Key1"}, {5, {active, infinity}, null}}, - {{o, "Bucket1", "Key5"}, {4, {active, infinity}, null}}]}, - {3, [{{o, "Bucket1", "Key3"}, {3, {active, infinity}, null}}]}, - {5, [{{o, "Bucket1", "Key2"}, {2, {active, infinity}, null}}]} + {2, [{{o, "Bucket1", "Key1", null}, {5, {active, infinity}, {0, 0}, null}}, + {{o, "Bucket1", "Key5", null}, {4, {active, infinity}, {0, 0}, null}}]}, + {3, [{{o, "Bucket1", "Key3", null}, {3, {active, infinity}, {0, 0}, null}}]}, + {5, [{{o, "Bucket1", "Key2", null}, {2, {active, infinity}, {0, 0}, null}}]} ], {Array2, KV1} = find_nextkey(QueryArray, - {o, "Bucket1", "Key0"}, - {o, "Bucket1", "Key5"}), - ?assertMatch({{o, "Bucket1", "Key1"}, {5, {active, infinity}, null}}, KV1), + {o, "Bucket1", "Key0", null}, + {o, "Bucket1", "Key5", null}), + ?assertMatch({{o, "Bucket1", "Key1", null}, + {5, {active, infinity}, {0, 0}, null}}, + KV1), {Array3, KV2} = find_nextkey(Array2, - {o, "Bucket1", "Key0"}, - {o, "Bucket1", "Key5"}), - ?assertMatch({{o, "Bucket1", "Key2"}, {2, {active, infinity}, null}}, KV2), + {o, "Bucket1", "Key0", null}, + {o, "Bucket1", "Key5", null}), + ?assertMatch({{o, "Bucket1", "Key2", null}, + {2, {active, infinity}, {0, 0}, null}}, + KV2), {Array4, KV3} = find_nextkey(Array3, - {o, "Bucket1", "Key0"}, - {o, "Bucket1", "Key5"}), - ?assertMatch({{o, "Bucket1", "Key3"}, {3, {active, infinity}, null}}, KV3), + {o, "Bucket1", "Key0", null}, + {o, "Bucket1", "Key5", null}), + ?assertMatch({{o, "Bucket1", "Key3", null}, + {3, {active, infinity}, {0, 0}, null}}, + KV3), {Array5, KV4} = find_nextkey(Array4, - {o, "Bucket1", "Key0"}, - {o, "Bucket1", "Key5"}), - ?assertMatch({{o, "Bucket1", "Key5"}, {4, {active, infinity}, null}}, KV4), + {o, "Bucket1", "Key0", null}, + {o, "Bucket1", "Key5", null}), + ?assertMatch({{o, "Bucket1", "Key5", null}, + {4, {active, infinity}, {0, 0}, null}}, + KV4), ER = find_nextkey(Array5, - {o, "Bucket1", "Key0"}, - {o, "Bucket1", "Key5"}), + {o, "Bucket1", "Key0", null}, + {o, "Bucket1", "Key5", null}), ?assertMatch(no_more_keys, ER). 
sqnoverlap_findnextkey_test() -> QueryArray = [ - {2, [{{o, "Bucket1", "Key1"}, {5, {active, infinity}, 0, null}}, - {{o, "Bucket1", "Key5"}, {4, {active, infinity}, 0, null}}]}, - {3, [{{o, "Bucket1", "Key3"}, {3, {active, infinity}, 0, null}}]}, - {5, [{{o, "Bucket1", "Key5"}, {2, {active, infinity}, 0, null}}]} + {2, [{{o, "Bucket1", "Key1", null}, {5, {active, infinity}, {0, 0}, null}}, + {{o, "Bucket1", "Key5", null}, {4, {active, infinity}, {0, 0}, null}}]}, + {3, [{{o, "Bucket1", "Key3", null}, {3, {active, infinity}, {0, 0}, null}}]}, + {5, [{{o, "Bucket1", "Key5", null}, {2, {active, infinity}, {0, 0}, null}}]} ], {Array2, KV1} = find_nextkey(QueryArray, - {o, "Bucket1", "Key0"}, - {o, "Bucket1", "Key5"}), - ?assertMatch({{o, "Bucket1", "Key1"}, {5, {active, infinity}, 0, null}}, + {o, "Bucket1", "Key0", null}, + {o, "Bucket1", "Key5", null}), + ?assertMatch({{o, "Bucket1", "Key1", null}, + {5, {active, infinity}, {0, 0}, null}}, KV1), {Array3, KV2} = find_nextkey(Array2, - {o, "Bucket1", "Key0"}, - {o, "Bucket1", "Key5"}), - ?assertMatch({{o, "Bucket1", "Key3"}, {3, {active, infinity}, 0, null}}, + {o, "Bucket1", "Key0", null}, + {o, "Bucket1", "Key5", null}), + ?assertMatch({{o, "Bucket1", "Key3", null}, + {3, {active, infinity}, {0, 0}, null}}, KV2), {Array4, KV3} = find_nextkey(Array3, - {o, "Bucket1", "Key0"}, - {o, "Bucket1", "Key5"}), - ?assertMatch({{o, "Bucket1", "Key5"}, {4, {active, infinity}, 0, null}}, + {o, "Bucket1", "Key0", null}, + {o, "Bucket1", "Key5", null}), + ?assertMatch({{o, "Bucket1", "Key5", null}, + {4, {active, infinity}, {0, 0}, null}}, KV3), ER = find_nextkey(Array4, - {o, "Bucket1", "Key0"}, - {o, "Bucket1", "Key5"}), + {o, "Bucket1", "Key0", null}, + {o, "Bucket1", "Key5", null}), ?assertMatch(no_more_keys, ER). 
sqnoverlap_otherway_findnextkey_test() -> QueryArray = [ - {2, [{{o, "Bucket1", "Key1"}, {5, {active, infinity}, 0, null}}, - {{o, "Bucket1", "Key5"}, {1, {active, infinity}, 0, null}}]}, - {3, [{{o, "Bucket1", "Key3"}, {3, {active, infinity}, 0, null}}]}, - {5, [{{o, "Bucket1", "Key5"}, {2, {active, infinity}, 0, null}}]} + {2, [{{o, "Bucket1", "Key1", null}, {5, {active, infinity}, {0, 0}, null}}, + {{o, "Bucket1", "Key5", null}, {1, {active, infinity}, {0, 0}, null}}]}, + {3, [{{o, "Bucket1", "Key3", null}, {3, {active, infinity}, {0, 0}, null}}]}, + {5, [{{o, "Bucket1", "Key5", null}, {2, {active, infinity}, {0, 0}, null}}]} ], {Array2, KV1} = find_nextkey(QueryArray, - {o, "Bucket1", "Key0"}, - {o, "Bucket1", "Key5"}), - ?assertMatch({{o, "Bucket1", "Key1"}, {5, {active, infinity}, 0, null}}, + {o, "Bucket1", "Key0", null}, + {o, "Bucket1", "Key5", null}), + ?assertMatch({{o, "Bucket1", "Key1", null}, + {5, {active, infinity}, {0, 0}, null}}, KV1), {Array3, KV2} = find_nextkey(Array2, - {o, "Bucket1", "Key0"}, - {o, "Bucket1", "Key5"}), - ?assertMatch({{o, "Bucket1", "Key3"}, {3, {active, infinity}, 0, null}}, + {o, "Bucket1", "Key0", null}, + {o, "Bucket1", "Key5", null}), + ?assertMatch({{o, "Bucket1", "Key3", null}, + {3, {active, infinity}, {0, 0}, null}}, KV2), {Array4, KV3} = find_nextkey(Array3, - {o, "Bucket1", "Key0"}, - {o, "Bucket1", "Key5"}), - ?assertMatch({{o, "Bucket1", "Key5"}, {2, {active, infinity}, 0, null}}, + {o, "Bucket1", "Key0", null}, + {o, "Bucket1", "Key5", null}), + ?assertMatch({{o, "Bucket1", "Key5", null}, + {2, {active, infinity}, {0, 0}, null}}, KV3), ER = find_nextkey(Array4, - {o, "Bucket1", "Key0"}, - {o, "Bucket1", "Key5"}), + {o, "Bucket1", "Key0", null}, + {o, "Bucket1", "Key5", null}), ?assertMatch(no_more_keys, ER). foldwithimm_simple_test() -> diff --git a/src/leveled_runner.erl b/src/leveled_runner.erl index 7b88c21..f034890 100644 --- a/src/leveled_runner.erl +++ b/src/leveled_runner.erl @@ -30,9 +30,9 @@ bucketkey_query/6, hashlist_query/3, tictactree/5, - foldheads_allkeys/5, + foldheads_allkeys/7, foldobjects_allkeys/4, - foldheads_bybucket/6, + foldheads_bybucket/8, foldobjects_bybucket/4, foldobjects_byindex/3 ]). @@ -49,6 +49,7 @@ :: {fun(), any()}. -type term_regex() :: re:mp()|undefined. + %%%============================================================================ %%% External functions %%%============================================================================ @@ -269,12 +270,14 @@ tictactree(SnapFun, {Tag, Bucket, Query}, JournalCheck, TreeSize, Filter) -> {async, Runner}. -spec foldheads_allkeys(fun(), leveled_codec:tag(), - fun(), boolean(), false|list(integer())) - -> {async, fun()}. + fun(), boolean(), false|list(integer()), + false|leveled_codec:lastmod_range(), + false|pos_integer()) -> {async, fun()}. %% @doc %% Fold over all heads in the store for a given tag - applying the passed %% function to each proxy object -foldheads_allkeys(SnapFun, Tag, FoldFun, JournalCheck, SegmentList) -> +foldheads_allkeys(SnapFun, Tag, FoldFun, JournalCheck, + SegmentList, LastModRange, MaxObjectCount) -> StartKey = leveled_codec:to_ledgerkey(null, null, Tag), EndKey = leveled_codec:to_ledgerkey(null, null, Tag), foldobjects(SnapFun, @@ -282,7 +285,9 @@ foldheads_allkeys(SnapFun, Tag, FoldFun, JournalCheck, SegmentList) -> [{StartKey, EndKey}], FoldFun, {true, JournalCheck}, - SegmentList). + SegmentList, + LastModRange, + MaxObjectCount). 
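A note on the new defaults (illustrative, not part of the change set): a LastModRange of false is converted to the open range inside the penciller (?OPEN_LASTMOD_RANGE, i.e. {0, infinity}), so under that assumption these two calls should produce the same fold:

    foldheads_allkeys(SnapFun, Tag, FoldFun, JournalCheck, SegList,
                        false, false),
    foldheads_allkeys(SnapFun, Tag, FoldFun, JournalCheck, SegList,
                        {0, infinity}, false).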
-spec foldobjects_allkeys(fun(), leveled_codec:tag(), fun(),
                            key_order|sqn_order)
                                -> {async, fun()}.
%% @doc
@@ -399,7 +404,10 @@ foldobjects_bybucket(SnapFun, Tag, KeyRanges, FoldFun) ->
                            atom(),
                            list({any(), any()}),
                            fun(),
-                            boolean(), false|list(integer()))
+                            boolean(),
+                            false|list(integer()),
+                            false|leveled_codec:lastmod_range(),
+                            false|pos_integer())
                                -> {async, fun()}.
%% @doc
%% Fold over all object metadata within a given key range in a bucket
foldheads_bybucket(SnapFun,
                    Tag,
                    KeyRanges,
                    FoldFun,
-                    JournalCheck, SegmentList) ->
+                    JournalCheck,
+                    SegmentList, LastModRange, MaxObjectCount) ->
    foldobjects(SnapFun,
                Tag,
                KeyRanges,
                FoldFun,
                {true, JournalCheck},
-                SegmentList).
+                SegmentList,
+                LastModRange,
+                MaxObjectCount).
-spec foldobjects_byindex(fun(), tuple(), fun()) -> {async, fun()}.
%% @doc
@@ -454,10 +465,10 @@ get_nextbucket(NextBucket, NextKey, Tag, LedgerSnapshot, BKList, {C, L}) ->
                                ExtractFun,
                                null),
    case R of
-        null ->
+        {1, null} ->
            leveled_log:log("B0008",[]),
            BKList;
-        {{B, K}, V} ->
+        {0, {{B, K}, V}} ->
            case leveled_codec:is_active({Tag, B, K, null}, V, Now) of
                true ->
                    leveled_log:log("B0009",[B]),
@@ -484,6 +495,16 @@ get_nextbucket(NextBucket, NextKey, Tag, LedgerSnapshot, BKList, {C, L}) ->
-spec foldobjects(fun(), atom(), list(), fun(),
                    false|{true, boolean()},
                    false|list(integer())) ->
                        {async, fun()}.
+foldobjects(SnapFun, Tag, KeyRanges, FoldObjFun, DeferredFetch, SegmentList) ->
+    foldobjects(SnapFun, Tag, KeyRanges,
+                FoldObjFun, DeferredFetch, SegmentList, false, false).
+
+-spec foldobjects(fun(), atom(), list(), fun(),
+                    false|{true, boolean()},
+                    false|list(integer()),
+                    false|leveled_codec:lastmod_range(),
+                    false|pos_integer()) ->
+                        {async, fun()}.
%% @doc
%% The object folder should be passed DeferredFetch.
%% DeferredFetch can either be false (which will return to the fold function
%% the full object) or {true, JournalCheck} in which case a proxy object
%% will be created that if understood by the fold function will allow the fold
%% function to work on the head of the object, and defer fetching the body in
%% case such a fetch is unnecessary.
-foldobjects(SnapFun, Tag, KeyRanges, FoldObjFun, DeferredFetch, SegmentList) -> +foldobjects(SnapFun, Tag, KeyRanges, FoldObjFun, DeferredFetch, + SegmentList, LastModRange, MaxObjectCount) -> {FoldFun, InitAcc} = case is_tuple(FoldObjFun) of true -> @@ -502,16 +524,24 @@ foldobjects(SnapFun, Tag, KeyRanges, FoldObjFun, DeferredFetch, SegmentList) -> % no initial accumulator passed, and so should be just a list {FoldObjFun, []} end, + {LimitByCount, InitAcc0} = + case MaxObjectCount of + false -> + {false, InitAcc}; + MOC when is_integer(MOC) -> + {true, {MOC, InitAcc}} + end, Folder = fun() -> {ok, LedgerSnapshot, JournalSnapshot} = SnapFun(), - AccFun = accumulate_objects(FoldFun, - JournalSnapshot, - Tag, - DeferredFetch), - + AccFun = + accumulate_objects(FoldFun, + JournalSnapshot, + Tag, + DeferredFetch), + ListFoldFun = fun({StartKey, EndKey}, FoldAcc) -> leveled_penciller:pcl_fetchkeysbysegment(LedgerSnapshot, @@ -519,9 +549,11 @@ foldobjects(SnapFun, Tag, KeyRanges, FoldObjFun, DeferredFetch, SegmentList) -> EndKey, AccFun, FoldAcc, - SegmentList) + SegmentList, + LastModRange, + LimitByCount) end, - Acc = lists:foldl(ListFoldFun, InitAcc, KeyRanges), + Acc = lists:foldl(ListFoldFun, InitAcc0, KeyRanges), ok = leveled_penciller:pcl_close(LedgerSnapshot), case DeferredFetch of {true, false} -> @@ -612,7 +644,7 @@ accumulate_objects(FoldObjectsFun, InkerClone, Tag, DeferredFetch) -> case leveled_codec:is_active(LK, V, Now) of true -> {SQN, _St, _MH, MD} = - leveled_codec:striphead_to_details(V), + leveled_codec:striphead_to_v1details(V), {B, K} = case leveled_codec:from_ledgerkey(LK) of {B0, K0} -> diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 64941d1..b68b130 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -88,8 +88,10 @@ -define(TIMING_SAMPLESIZE, 100). -define(CACHE_SIZE, 32). -define(BLOCK_LENGTHS_LENGTH, 20). +-define(LMD_LENGTH, 4). -define(FLIPPER32, 4294967295). -define(COMPRESS_AT_LEVEL, 1). +-define(INDEX_MODDATE, true). -include_lib("eunit/include/eunit.hrl"). @@ -111,10 +113,7 @@ sst_open/2, sst_get/2, sst_get/3, - sst_getkvrange/4, - sst_getfilteredrange/5, - sst_getslots/2, - sst_getfilteredslots/3, + sst_expandpointer/5, sst_getmaxsequencenumber/1, sst_setfordelete/2, sst_clear/1, @@ -122,7 +121,6 @@ sst_deleteconfirmed/1, sst_close/1]). --export([expand_list_by_pointer/4]). -record(slot_index_value, {slot_id :: integer(), @@ -135,16 +133,28 @@ size :: integer(), max_sqn :: integer()}). --type press_methods() +-type press_method() :: lz4|native|none. -type range_endpoint() :: all|leveled_codec:ledger_key(). -type slot_pointer() :: {pointer, pid(), integer(), range_endpoint(), range_endpoint()}. --type sst_pointer() +-type sst_pointer() + % Used in sst_new :: {next, leveled_pmanifest:manifest_entry(), - leveled_codec:ledger_key()|all}. + range_endpoint()}. +-type sst_closed_pointer() + % used in expand_list_by_pointer + % (close point is added by maybe_expand_pointer + :: {next, + leveled_pmanifest:manifest_entry(), + range_endpoint(), + range_endpoint()}. +-type expandable_pointer() + :: slot_pointer()|sst_closed_pointer(). +-type expanded_pointer() + :: leveled_codec:ledger_kv()|expandable_pointer(). -type binaryslot_element() :: {tuple(), tuple()}|{binary(), integer(), tuple(), tuple()}. 
@@ -163,7 +173,8 @@ filename, yield_blockquery = false :: boolean(), blockindex_cache, - compression_method = native :: press_methods(), + compression_method = native :: press_method(), + index_moddate = ?INDEX_MODDATE :: boolean(), timings = no_timing :: sst_timings(), timings_countdown = 0 :: integer(), fetch_cache = array:new([{size, ?CACHE_SIZE}])}). @@ -192,6 +203,8 @@ -type sst_timings() :: no_timing|#sst_timings{}. -type build_timings() :: no_timing|#build_timings{}. +-export_type([expandable_pointer/0]). + %%%============================================================================ %%% API %%%============================================================================ @@ -219,7 +232,7 @@ sst_open(RootPath, Filename) -> -spec sst_new(string(), string(), integer(), list(leveled_codec:ledger_kv()), - integer(), press_methods()) + integer(), press_method()) -> {ok, pid(), {leveled_codec:ledger_key(), leveled_codec:ledger_key()}, binary()}. @@ -228,9 +241,15 @@ sst_open(RootPath, Filename) -> %% pairs. This should not be used for basement levels or unexpanded Key/Value %% lists as merge_lists will not be called. sst_new(RootPath, Filename, Level, KVList, MaxSQN, PressMethod) -> + sst_new(RootPath, Filename, Level, KVList, MaxSQN, PressMethod, + ?INDEX_MODDATE). + +sst_new(RootPath, Filename, Level, KVList, MaxSQN, PressMethod, + IndexModDate) -> {ok, Pid} = gen_fsm:start_link(?MODULE, [], []), PressMethod0 = compress_level(Level, PressMethod), - {[], [], SlotList, FK} = merge_lists(KVList, PressMethod0), + {[], [], SlotList, FK} = + merge_lists(KVList, PressMethod0, IndexModDate), case gen_fsm:sync_send_event(Pid, {sst_new, RootPath, @@ -238,7 +257,8 @@ sst_new(RootPath, Filename, Level, KVList, MaxSQN, PressMethod) -> Level, {SlotList, FK}, MaxSQN, - PressMethod0}, + PressMethod0, + IndexModDate}, infinity) of {ok, {SK, EK}, Bloom} -> {ok, Pid, {SK, EK}, Bloom} @@ -248,7 +268,7 @@ sst_new(RootPath, Filename, Level, KVList, MaxSQN, PressMethod) -> list(leveled_codec:ledger_kv()|sst_pointer()), list(leveled_codec:ledger_kv()|sst_pointer()), boolean(), integer(), - integer(), press_methods()) + integer(), press_method()) -> empty|{ok, pid(), {{list(leveled_codec:ledger_kv()), list(leveled_codec:ledger_kv())}, @@ -269,9 +289,17 @@ sst_new(RootPath, Filename, Level, KVList, MaxSQN, PressMethod) -> sst_new(RootPath, Filename, KVL1, KVL2, IsBasement, Level, MaxSQN, PressMethod) -> + sst_new(RootPath, Filename, + KVL1, KVL2, IsBasement, Level, + MaxSQN, PressMethod, ?INDEX_MODDATE). + +sst_new(RootPath, Filename, + KVL1, KVL2, IsBasement, Level, + MaxSQN, PressMethod, IndexModDate) -> PressMethod0 = compress_level(Level, PressMethod), {Rem1, Rem2, SlotList, FK} = - merge_lists(KVL1, KVL2, {IsBasement, Level}, PressMethod0), + merge_lists(KVL1, KVL2, {IsBasement, Level}, + PressMethod0, IndexModDate), case SlotList of [] -> empty; @@ -284,7 +312,8 @@ sst_new(RootPath, Filename, Level, {SlotList, FK}, MaxSQN, - PressMethod0}, + PressMethod0, + IndexModDate}, infinity) of {ok, {SK, EK}, Bloom} -> {ok, Pid, {{Rem1, Rem2}, SK, EK}, Bloom} @@ -293,7 +322,7 @@ sst_new(RootPath, Filename, -spec sst_newlevelzero(string(), string(), integer(), fun(), pid()|undefined, integer(), - press_methods()) -> + press_method()) -> {ok, pid(), noreply}. %% @doc %% Start a new file at level zero. At this level the file size is not fixed - @@ -312,7 +341,8 @@ sst_newlevelzero(RootPath, Filename, FetchFun, Penciller, MaxSQN, - PressMethod0}), + PressMethod0, + ?INDEX_MODDATE}), {ok, Pid, noreply}. 
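A note on the new flag (illustrative, not part of the change set): level-zero files are always built with the modified-date index on (?INDEX_MODDATE = true, with dates stored in ?LMD_LENGTH = 4 byte fields), while merged files inherit the choice through the extended sst_new. A hypothetical old-format file, assuming the new arity is exported:

    %% IndexModDate = false builds a file without the date index
    {ok, Pid, {SK, EK}, Bloom} =
        leveled_sst:sst_new(RootPath, Filename, Level, KVList, MaxSQN,
                            native, false).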
 -spec sst_get(pid(), leveled_codec:ledger_key())
@@ -332,90 +362,29 @@ sst_get(Pid, LedgerKey) ->
 sst_get(Pid, LedgerKey, Hash) ->
     gen_fsm:sync_send_event(Pid, {get_kv, LedgerKey, Hash}, infinity).
 
--spec sst_getkvrange(pid(),
-                     range_endpoint(),
-                     range_endpoint(),
-                     integer())
-                        -> list(leveled_codec:ledger_kv()|slot_pointer()).
-%% @doc
-%% Get a range of {Key, Value} pairs as a list between StartKey and EndKey
-%% (inclusive).  The ScanWidth is the maximum size of the range, a pointer
-%% will be placed on the tail of the resulting list if results expand beyond
-%% the Scan Width
-sst_getkvrange(Pid, StartKey, EndKey, ScanWidth) ->
-    sst_getfilteredrange(Pid, StartKey, EndKey, ScanWidth, false).
-
-
--spec sst_getfilteredrange(pid(),
-                           range_endpoint(),
-                           range_endpoint(),
-                           integer(),
-                           leveled_codec:segment_list())
-                                -> list(leveled_codec:ledger_kv()|slot_pointer()).
-%% @doc
-%% Get a range of {Key, Value} pairs as a list between StartKey and EndKey
-%% (inclusive).  The ScanWidth is the maximum size of the range, a pointer
-%% will be placed on the tail of the resulting list if results expand beyond
-%% the Scan Width
-%%
-%% To make the range open-ended (either to start, end or both) the all atom
-%% can be used in place of the Key tuple.
-%%
-%% A segment list can also be passed, which inidcates a subset of segment
-%% hashes of interest in the query.
-%%
-%% TODO: Optimise this so that passing a list of segments that tune to the
-%% same hash is faster - perhaps provide an exportable function in
-%% leveled_tictac
-sst_getfilteredrange(Pid, StartKey, EndKey, ScanWidth, SegList) ->
-    SegList0 = tune_seglist(SegList),
-    case gen_fsm:sync_send_event(Pid,
-                                 {get_kvrange,
-                                  StartKey, EndKey,
-                                  ScanWidth, SegList0},
-                                 infinity) of
-        {yield, SlotsToFetchBinList, SlotsToPoint, PressMethod} ->
-            {L, _BIC} =
-                binaryslot_reader(SlotsToFetchBinList, PressMethod, SegList0),
-            L ++ SlotsToPoint;
-        Reply ->
-            Reply
-    end.
-
--spec sst_getslots(pid(), list(slot_pointer()))
-                        -> list(leveled_codec:ledger_kv()).
-%% @doc
-%% Get a list of slots by their ID. The slot will be converted from the binary
-%% to term form outside of the FSM loop, this is to stop the copying of the
-%% converted term to the calling process.
-sst_getslots(Pid, SlotList) ->
-    sst_getfilteredslots(Pid, SlotList, false).
-
--spec sst_getfilteredslots(pid(),
-                           list(slot_pointer()),
-                           leveled_codec:segment_list())
-                                -> list(leveled_codec:ledger_kv()).
-%% @doc
-%% Get a list of slots by their ID. The slot will be converted from the binary
-%% to term form outside of the FSM loop
-%%
-%% A list of 16-bit integer Segment IDs can be passed to filter the keys
-%% returned (not precisely - with false results returned in addition).  Use
-%% false as a SegList to not filter
-sst_getfilteredslots(Pid, SlotList, SegList) ->
-    SegL0 = tune_seglist(SegList),
-    {SlotBins, PressMethod} =
-        gen_fsm:sync_send_event(Pid, {get_slots, SlotList, SegL0}, infinity),
-    {L, _BIC} = binaryslot_reader(SlotBins, PressMethod, SegL0),
-    L.
-
 -spec sst_getmaxsequencenumber(pid()) -> integer().
 %% @doc
 %% Get the maximum sequence number for this SST file
 sst_getmaxsequencenumber(Pid) ->
     gen_fsm:sync_send_event(Pid, get_maxsequencenumber, infinity).
 
+-spec sst_expandpointer(expandable_pointer(),
+                        list(expandable_pointer()),
+                        pos_integer(),
+                        leveled_codec:segment_list(),
+                        non_neg_integer())
+                            -> list(expanded_pointer()).
+%% @doc
+%% Expand out a list of pointers to return a list of Keys and Values, with a
+%% tail of pointers (once the ScanWidth has been satisfied).
+%% Folding over keys in a store uses this function, although this function
+%% does not call the gen_fsm directly - it does so via sst_getfilteredslots
+%% or sst_getfilteredrange, depending on the nature of the pointer.
+sst_expandpointer(Pointer, MorePointers, ScanWidth, SegmentList, LowLastMod) ->
+    expand_list_by_pointer(Pointer, MorePointers, ScanWidth,
+                           SegmentList, LowLastMod).
+
+
 -spec sst_setfordelete(pid(), pid()|false) -> ok.
 %% @doc
 %% If the SST is no longer in use in the active ledger it can be set for
@@ -483,17 +452,15 @@ starting({sst_open, RootPath, Filename}, _From, State) ->
 starting({sst_new,
             RootPath, Filename, Level,
             {SlotList, FirstKey}, MaxSQN,
-            PressMethod}, _From, State) ->
+            PressMethod, IdxModDate}, _From, State) ->
     SW = os:timestamp(),
-    {Length,
-     SlotIndex,
-     BlockIndex,
-     SlotsBin,
-     Bloom} = build_all_slots(SlotList, PressMethod),
+    {Length, SlotIndex, BlockIndex, SlotsBin, Bloom} =
+        build_all_slots(SlotList),
     SummaryBin =
         build_table_summary(SlotIndex, Level, FirstKey, Length, MaxSQN, Bloom),
     ActualFilename =
-        write_file(RootPath, Filename, SummaryBin, SlotsBin, PressMethod),
+        write_file(RootPath, Filename, SummaryBin, SlotsBin,
+                   PressMethod, IdxModDate),
     YBQ = Level =< 2,
     {UpdState, Bloom} =
         read_file(ActualFilename,
@@ -509,21 +476,19 @@
 starting({sst_newlevelzero, RootPath, Filename,
             Slots, FetchFun, Penciller, MaxSQN,
-            PressMethod}, State) ->
+            PressMethod, IdxModDate}, State) ->
     SW0 = os:timestamp(),
     KVList = leveled_pmem:to_list(Slots, FetchFun),
     Time0 = timer:now_diff(os:timestamp(), SW0),
 
     SW1 = os:timestamp(),
-    {[], [], SlotList, FirstKey} = merge_lists(KVList, PressMethod),
+    {[], [], SlotList, FirstKey} =
+        merge_lists(KVList, PressMethod, IdxModDate),
     Time1 = timer:now_diff(os:timestamp(), SW1),
 
     SW2 = os:timestamp(),
-    {SlotCount,
-     SlotIndex,
-     BlockIndex,
-     SlotsBin,
-     Bloom} = build_all_slots(SlotList, PressMethod),
+    {SlotCount, SlotIndex, BlockIndex, SlotsBin, Bloom} =
+        build_all_slots(SlotList),
     Time2 = timer:now_diff(os:timestamp(), SW2),
 
     SW3 = os:timestamp(),
@@ -533,7 +498,8 @@ starting({sst_newlevelzero, RootPath, Filename,
     SW4 = os:timestamp(),
     ActualFilename =
-        write_file(RootPath, Filename, SummaryBin, SlotsBin, PressMethod),
+        write_file(RootPath, Filename, SummaryBin, SlotsBin,
+                   PressMethod, IdxModDate),
     {UpdState, Bloom} =
         read_file(ActualFilename,
                   State#state{root_path=RootPath, yield_blockquery=true}),
@@ -572,14 +538,17 @@ reader({get_kv, LedgerKey, Hash}, _From, State) ->
     {reply, Result, reader, UpdState#state{timings = UpdTimings0,
                                            timings_countdown = CountDown}};
-reader({get_kvrange, StartKey, EndKey, ScanWidth, SegList}, _From, State) ->
+reader({get_kvrange, StartKey, EndKey, ScanWidth, SegList, LowLastMod},
+                                                            _From, State) ->
     {SlotsToFetchBinList, SlotsToPoint} = fetch_range(StartKey,
                                                       EndKey,
                                                       ScanWidth,
                                                       SegList,
+                                                      LowLastMod,
                                                       State),
 
     PressMethod = State#state.compression_method,
+    IdxModDate = State#state.index_moddate,
 
     case State#state.yield_blockquery of
         true ->
@@ -587,12 +556,14 @@
              {yield,
                 SlotsToFetchBinList,
                 SlotsToPoint,
-                PressMethod},
+                PressMethod,
+                IdxModDate},
              reader,
              State};
         false ->
            {L, BIC} =
-                binaryslot_reader(SlotsToFetchBinList, PressMethod, SegList),
+                binaryslot_reader(SlotsToFetchBinList,
+                                  PressMethod, IdxModDate, SegList),
            FoldFun =
                fun(CacheEntry, Cache) ->
                    case CacheEntry of
@@ -608,13 +579,16 @@
             reader,
             State#state{blockindex_cache = BlockIdxC0}}
     end;
-reader({get_slots, SlotList, SegList}, _From, State) ->
+reader({get_slots, SlotList, SegList, LowLastMod}, _From, State) ->
+    PressMethod = State#state.compression_method,
+    IdxModDate = State#state.index_moddate,
     SlotBins =
         read_slots(State#state.handle,
                    SlotList,
-                   {SegList, State#state.blockindex_cache},
-                   State#state.compression_method),
-    {reply, {SlotBins, State#state.compression_method}, reader, State};
+                   {SegList, LowLastMod, State#state.blockindex_cache},
+                   State#state.compression_method,
+                   State#state.index_moddate),
+    {reply, {SlotBins, PressMethod, IdxModDate}, reader, State};
 reader(get_maxsequencenumber, _From, State) ->
     Summary = State#state.summary,
     {reply, Summary#summary.max_sqn, reader, State};
@@ -645,28 +619,33 @@ reader(close, _From, State) ->
 delete_pending({get_kv, LedgerKey, Hash}, _From, State) ->
     {Result, UpdState, _Ts} = fetch(LedgerKey, Hash, State, no_timing),
     {reply, Result, delete_pending, UpdState, ?DELETE_TIMEOUT};
-delete_pending({get_kvrange, StartKey, EndKey, ScanWidth, SegList},
-                _From, State) ->
+delete_pending({get_kvrange, StartKey, EndKey, ScanWidth, SegList, LowLastMod},
+                _From, State) ->
     {SlotsToFetchBinList, SlotsToPoint} = fetch_range(StartKey,
                                                       EndKey,
                                                       ScanWidth,
                                                       SegList,
+                                                      LowLastMod,
                                                       State),
     % Always yield as about to clear and de-reference
     PressMethod = State#state.compression_method,
+    IdxModDate = State#state.index_moddate,
     {reply,
-        {yield, SlotsToFetchBinList, SlotsToPoint, PressMethod},
+        {yield, SlotsToFetchBinList, SlotsToPoint, PressMethod, IdxModDate},
        delete_pending,
        State,
        ?DELETE_TIMEOUT};
-delete_pending({get_slots, SlotList, SegList}, _From, State) ->
+delete_pending({get_slots, SlotList, SegList, LowLastMod}, _From, State) ->
+    PressMethod = State#state.compression_method,
+    IdxModDate = State#state.index_moddate,
     SlotBins =
         read_slots(State#state.handle,
                    SlotList,
-                   {SegList, State#state.blockindex_cache},
-                   State#state.compression_method),
+                   {SegList, LowLastMod, State#state.blockindex_cache},
+                   PressMethod,
+                   IdxModDate),
     {reply,
-        {SlotBins, State#state.compression_method},
+        {SlotBins, PressMethod, IdxModDate},
        delete_pending,
        State,
        ?DELETE_TIMEOUT};
@@ -709,6 +688,157 @@ code_change(_OldVsn, StateName, State, _Extra) ->
     {ok, StateName, State}.
 
 
+%%%============================================================================
+%%% External Functions
+%%%============================================================================
+
+-spec expand_list_by_pointer(expandable_pointer(),
+                             list(expandable_pointer()),
+                             pos_integer())
+                                -> list(expanded_pointer()).
+%% @doc
+%% Expand a list of pointers, maybe ending up with a list of keys and values
+%% with a tail of pointers.
+%% By default there is no segment filter and no low last_modified_date, but
+%% both can be used.  Range checking against a last modified date must still
+%% be done on the output - at this stage the low last_modified_date has only
+%% been used to bulk-skip those slots containing no information over the low
+%% last modified date
+expand_list_by_pointer(Pointer, Tail, Width) ->
+    expand_list_by_pointer(Pointer, Tail, Width, false).
+
+%% TODO until leveled_penciller updated
+expand_list_by_pointer(Pointer, Tail, Width, SegList) ->
+    expand_list_by_pointer(Pointer, Tail, Width, SegList, 0).
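Because the low last_modified_date only bulk-skips whole slots, a consumer must still filter individual results against the full range. A small sketch of that output-side check (the {Key, LMD} result shape and names here are illustrative, not leveled's types):

%% Keep only results whose last modified date (in seconds) is in range.
in_range({_K, LMD}, {LowDate, HighDate}) ->
    (LMD >= LowDate) andalso (LMD =< HighDate).

filter_by_lmd(Results, Range) ->
    [R || R <- Results, in_range(R, Range)].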
+
+-spec expand_list_by_pointer(expandable_pointer(),
+                             list(expandable_pointer()),
+                             pos_integer(),
+                             leveled_codec:segment_list(),
+                             non_neg_integer())
+                                -> list(expanded_pointer()).
+%% @doc
+%% With filters (as described in expand_list_by_pointer/3)
+expand_list_by_pointer({pointer, SSTPid, Slot, StartKey, EndKey},
+                       Tail, Width, SegList, LowLastMod) ->
+    FoldFun =
+        fun(X, {Pointers, Remainder}) ->
+            case length(Pointers) of
+                L when L < Width ->
+                    case X of
+                        {pointer, SSTPid, S, SK, EK} ->
+                            {Pointers ++ [{pointer, S, SK, EK}], Remainder};
+                        _ ->
+                            {Pointers, Remainder ++ [X]}
+                    end;
+                _ ->
+                    {Pointers, Remainder ++ [X]}
+            end
+        end,
+    InitAcc = {[{pointer, Slot, StartKey, EndKey}], []},
+    {AccPointers, AccTail} = lists:foldl(FoldFun, InitAcc, Tail),
+    ExpPointers = sst_getfilteredslots(SSTPid,
+                                       AccPointers,
+                                       SegList,
+                                       LowLastMod),
+    lists:append(ExpPointers, AccTail);
+expand_list_by_pointer({next, ManEntry, StartKey, EndKey},
+                       Tail, Width, SegList, LowLastMod) ->
+    SSTPid = ManEntry#manifest_entry.owner,
+    leveled_log:log("SST10", [SSTPid, is_process_alive(SSTPid)]),
+    ExpPointer = sst_getfilteredrange(SSTPid,
+                                      StartKey,
+                                      EndKey,
+                                      Width,
+                                      SegList,
+                                      LowLastMod),
+    ExpPointer ++ Tail.
+
+
+-spec sst_getkvrange(pid(),
+                     range_endpoint(),
+                     range_endpoint(),
+                     integer())
+                        -> list(leveled_codec:ledger_kv()|slot_pointer()).
+%% @doc
+%% Get a range of {Key, Value} pairs as a list between StartKey and EndKey
+%% (inclusive).  The ScanWidth is the maximum size of the range; a pointer
+%% will be placed on the tail of the resulting list if results expand beyond
+%% the Scan Width
+sst_getkvrange(Pid, StartKey, EndKey, ScanWidth) ->
+    sst_getfilteredrange(Pid, StartKey, EndKey, ScanWidth, false, 0).
+
+
+-spec sst_getfilteredrange(pid(),
+                           range_endpoint(),
+                           range_endpoint(),
+                           integer(),
+                           leveled_codec:segment_list(),
+                           non_neg_integer())
+                                -> list(leveled_codec:ledger_kv()|slot_pointer()).
+%% @doc
+%% Get a range of {Key, Value} pairs as a list between StartKey and EndKey
+%% (inclusive).  The ScanWidth is the maximum size of the range; a pointer
+%% will be placed on the tail of the resulting list if results expand beyond
+%% the Scan Width
+%%
+%% To make the range open-ended (either to start, end or both) the all atom
+%% can be used in place of the Key tuple.
+%%
+%% A segment list can also be passed, which indicates a subset of segment
+%% hashes of interest in the query.
+%%
+%% TODO: Optimise this so that passing a list of segments that tune to the
+%% same hash is faster - perhaps provide an exportable function in
+%% leveled_tictac
+sst_getfilteredrange(Pid, StartKey, EndKey, ScanWidth, SegList, LowLastMod) ->
+    SegList0 = tune_seglist(SegList),
+    case gen_fsm:sync_send_event(Pid,
+                                 {get_kvrange,
+                                  StartKey, EndKey,
+                                  ScanWidth, SegList0, LowLastMod},
+                                 infinity) of
+        {yield, SlotsToFetchBinList, SlotsToPoint, PressMethod, IdxModDate} ->
+            {L, _BIC} =
+                binaryslot_reader(SlotsToFetchBinList,
+                                  PressMethod, IdxModDate, SegList0),
+            L ++ SlotsToPoint;
+        Reply ->
+            Reply
+    end.
+
+-spec sst_getslots(pid(), list(slot_pointer()))
+                        -> list(leveled_codec:ledger_kv()).
+%% @doc
+%% Get a list of slots by their ID. The slot will be converted from the binary
+%% to term form outside of the FSM loop; this is to stop the copying of the
+%% converted term to the calling process.
+sst_getslots(Pid, SlotList) ->
+    sst_getfilteredslots(Pid, SlotList, false, 0).
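Putting these together, a caller drains a ranged read by expanding any tail pointer before continuing. A sketch using the exported sst_expandpointer/5 with the default filters (the scan width of 16 is an arbitrary choice for illustration):

%% Expand trailing pointers until only {K, V} pairs remain.
drain([], Acc) ->
    lists:reverse(Acc);
drain([{pointer, _Pid, _Slot, _SK, _EK} = P|Rest], Acc) ->
    %% no segment filter (false) and no LMD floor (0)
    drain(leveled_sst:sst_expandpointer(P, Rest, 16, false, 0), Acc);
drain([{K, V}|Rest], Acc) ->
    drain(Rest, [{K, V}|Acc]).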
+
+-spec sst_getfilteredslots(pid(),
+                           list(slot_pointer()),
+                           leveled_codec:segment_list(),
+                           non_neg_integer())
+                                -> list(leveled_codec:ledger_kv()).
+%% @doc
+%% Get a list of slots by their ID. The slot will be converted from the binary
+%% to term form outside of the FSM loop
+%%
+%% A list of 16-bit integer Segment IDs can be passed to filter the keys
+%% returned (not precisely - with false results returned in addition).  Use
+%% false as a SegList to not filter.
+%% An integer can also be provided as a floor for the Last Modified Date: an
+%% object is only covered by the query if its LMD is no earlier than the floor
+sst_getfilteredslots(Pid, SlotList, SegList, LowLastMod) ->
+    SegL0 = tune_seglist(SegList),
+    {SlotBins, PressMethod, IdxModDate} =
+        gen_fsm:sync_send_event(Pid,
+                                {get_slots, SlotList, SegL0, LowLastMod},
+                                infinity),
+    {L, _BIC} = binaryslot_reader(SlotBins, PressMethod, IdxModDate, SegL0),
+    L.
+
 %%%============================================================================
 %%% Internal Functions
 %%%============================================================================
@@ -726,6 +856,7 @@ fetch(LedgerKey, Hash, State, Timings0) ->
 
     Summary = State#state.summary,
     PressMethod = State#state.compression_method,
+    IdxModDate = State#state.index_moddate,
     Slot = lookup_slot(LedgerKey, Summary#summary.index),
 
     {SW1, Timings1} = update_timings(SW0, Timings0, index_query, true),
@@ -734,13 +865,12 @@
     CachedBlockIdx = array:get(SlotID - 1,
                                State#state.blockindex_cache),
     {SW2, Timings2} = update_timings(SW1, Timings1, lookup_cache, true),
-    BL = ?BLOCK_LENGTHS_LENGTH,
 
-    case CachedBlockIdx of
+    case extract_header(CachedBlockIdx, IdxModDate) of
         none ->
             SlotBin = read_slot(State#state.handle, Slot),
             {Result, Header} =
-                binaryslot_get(SlotBin, LedgerKey, Hash, PressMethod),
+                binaryslot_get(SlotBin, LedgerKey, Hash, PressMethod, IdxModDate),
             BlockIndexCache = array:set(SlotID - 1,
                                         Header,
                                         State#state.blockindex_cache),
             {_SW3, Timings3} =
@@ -748,7 +878,7 @@
             {Result,
                 State#state{blockindex_cache = BlockIndexCache},
                 Timings3};
-        <<BlockLengths:BL/binary, PosBin/binary>> ->
+        {BlockLengths, _LMD, PosBin} ->
             PosList = find_pos(PosBin, extra_hash(Hash), [], 0),
             case PosList of
                 [] ->
@@ -777,6 +907,7 @@
                                          byte_size(PosBin),
                                          LedgerKey,
                                          PressMethod,
+                                         IdxModDate,
                                          not_present),
                     FetchCache0 =
                         array:set(CacheHash, Result, FetchCache),
@@ -793,8 +924,9 @@
     end.
 
 
--spec fetch_range(tuple(), tuple(), integer(), leveled_codec:segment_list(),
-                  sst_state()) -> {list(), list()}.
+-spec fetch_range(tuple(), tuple(), integer(),
+                  leveled_codec:segment_list(), non_neg_integer(),
+                  sst_state()) -> {list(), list()}.
 %% @doc
 %% Fetch the contents of the SST file for a given key range.  This will
 %% pre-fetch some results, and append pointers for additional results.
@@ -802,7 +934,7 @@ fetch(LedgerKey, Hash, State, Timings0) ->
 %% A filter can be provided based on the Segment ID (usable for hashable
 %% objects not no_lookup entries) to accelerate the query if the 5-arity
 %% version is used
-fetch_range(StartKey, EndKey, ScanWidth, SegList, State) ->
+fetch_range(StartKey, EndKey, ScanWidth, SegList, LowLastMod, State) ->
     Summary = State#state.summary,
     Handle = State#state.handle,
     {Slots, RTrim} = lookup_slots(StartKey, EndKey, Summary#summary.index),
@@ -856,11 +988,12 @@ fetch_range(StartKey, EndKey, ScanWidth, SegList, State) ->
     SlotsToFetchBinList =
         read_slots(Handle,
                    SlotsToFetch,
-                   {SegList, State#state.blockindex_cache},
-                   State#state.compression_method),
+                   {SegList, LowLastMod, State#state.blockindex_cache},
+                   State#state.compression_method,
+                   State#state.index_moddate),
     {SlotsToFetchBinList, SlotsToPoint}.
 
--spec compress_level(integer(), press_methods()) -> press_methods().
+-spec compress_level(integer(), press_method()) -> press_method().
 %% @doc
 %% disable compression at higher levels for improved performance
 compress_level(Level, _PressMethod) when Level < ?COMPRESS_AT_LEVEL ->
@@ -868,11 +1001,12 @@
 compress_level(_Level, PressMethod) ->
     PressMethod.
 
-write_file(RootPath, Filename, SummaryBin, SlotsBin, PressMethod) ->
+write_file(RootPath, Filename, SummaryBin, SlotsBin,
+           PressMethod, IdxModDate) ->
     SummaryLength = byte_size(SummaryBin),
     SlotsLength = byte_size(SlotsBin),
     {PendingName, FinalName} = generate_filenames(Filename),
-    FileVersion = gen_fileversion(PressMethod),
+    FileVersion = gen_fileversion(PressMethod, IdxModDate),
    ok = file:write_file(filename:join(RootPath, PendingName), < filename = Filename}, Bloom}.
 
-gen_fileversion(PressMethod) ->
+gen_fileversion(PressMethod, IdxModDate) ->
     % Native or none can be treated the same once written, as reader
     % does not need to know as compression info will be in header of the
     % block
@@ -921,17 +1055,31 @@
             native -> 0;
             none -> 0
         end,
-    Bit1.
+    Bit2 =
+        case IdxModDate of
+            true ->
+                2;
+            false ->
+                0
+        end,
+    Bit1 + Bit2.
 
 imp_fileversion(VersionInt, State) ->
-    UpdState =
+    UpdState0 =
        case VersionInt band 1 of
            0 ->
                State#state{compression_method = native};
            1 ->
                State#state{compression_method = lz4}
        end,
-    UpdState.
+    UpdState1 =
+        case VersionInt band 2 of
+            0 ->
+                UpdState0#state{index_moddate = false};
+            2 ->
+                UpdState0#state{index_moddate = true}
+        end,
+    UpdState1.
 
 open_reader(Filename) ->
     {ok, Handle} = file:open(Filename, [binary, raw, read]),
@@ -964,28 +1112,25 @@ read_table_summary(BinWithCheck) ->
     end.
 
 
-build_all_slots(SlotList, PressMethod) ->
+build_all_slots(SlotList) ->
     SlotCount = length(SlotList),
-    BuildResponse = build_all_slots(SlotList,
-                                    9,
-                                    1,
-                                    [],
-                                    array:new([{size, SlotCount},
-                                               {default, none}]),
-                                    <<>>,
-                                    [],
-                                    PressMethod),
-    {SlotIndex, BlockIndex, SlotsBin, HashLists} = BuildResponse,
+    {SlotIndex, BlockIndex, SlotsBin, HashLists} =
+        build_all_slots(SlotList,
+                        9,
+                        1,
+                        [],
+                        array:new([{size, SlotCount},
+                                   {default, none}]),
+                        <<>>,
+                        []),
     Bloom = leveled_ebloom:create_bloom(HashLists),
    {SlotCount, SlotIndex, BlockIndex, SlotsBin, Bloom}.
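The file version byte now carries two flags: bit 0 for the compression method and bit 1 (value 2) for moddate indexing, as built by gen_fileversion/2 and read back by imp_fileversion/2 above. A round-trip sketch of that encoding (encode/1,2 and decode/1 are illustrative names only):

%% Bit 0: 1 = lz4, 0 = native/none (none is treated as native on read,
%% since block headers carry the per-block compression information).
%% Bit 1: 2 = last modified dates indexed, 0 = legacy v1 layout.
encode(PressMethod, IdxModDate) ->
    Bit1 = case PressMethod of lz4 -> 1; _ -> 0 end,
    Bit2 = case IdxModDate of true -> 2; false -> 0 end,
    Bit1 + Bit2.

decode(VersionInt) ->
    Press = case VersionInt band 1 of 1 -> lz4; 0 -> native end,
    {Press, (VersionInt band 2) == 2}.
%% e.g. decode(encode(lz4, true)) =:= {lz4, true}, with VersionInt 3.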
 build_all_slots([], _Pos, _SlotID,
-                SlotIdxAcc, BlockIdxAcc, SlotBinAcc, HashLists,
-                _PressMethod) ->
+                SlotIdxAcc, BlockIdxAcc, SlotBinAcc, HashLists) ->
     {SlotIdxAcc, BlockIdxAcc, SlotBinAcc, HashLists};
 build_all_slots([SlotD|Rest], Pos, SlotID,
-                SlotIdxAcc, BlockIdxAcc, SlotBinAcc, HashLists,
-                PressMethod) ->
+                SlotIdxAcc, BlockIdxAcc, SlotBinAcc, HashLists) ->
     {BlockIdx, SlotBin, HashList, LastKey} = SlotD,
     Length = byte_size(SlotBin),
     SlotIndexV = #slot_index_value{slot_id = SlotID,
@@ -997,8 +1142,7 @@ build_all_slots([SlotD|Rest], Pos, SlotID,
                     [{LastKey, SlotIndexV}|SlotIdxAcc],
                     array:set(SlotID - 1, BlockIdx, BlockIdxAcc),
                     <<SlotBinAcc/binary, SlotBin/binary>>,
-                    lists:append(HashLists, HashList),
-                    PressMethod).
+                    lists:append(HashLists, HashList)).
 
 
 generate_filenames(RootFilename) ->
@@ -1016,7 +1160,7 @@ generate_filenames(RootFilename) ->
     end.
 
 
--spec serialise_block(any(), press_methods()) -> binary().
+-spec serialise_block(any(), press_method()) -> binary().
 %% @doc
 %% Convert term to binary
 %% Function split out to make it easier to experiment with different
@@ -1036,7 +1180,7 @@ serialise_block(Term, none) ->
     <<Bin/binary, CRC32:32/integer>>.
 
--spec deserialise_block(binary(), press_methods()) -> any().
+-spec deserialise_block(binary(), press_method()) -> any().
 %% @doc
 %% Convert binary to term
 %% Function split out to make it easier to experiment with different
@@ -1131,48 +1275,82 @@ lookup_slots(StartKey, EndKey, Tree) ->
 %% based on a 17-bit hash (so 0.0039 fpr).
 
-generate_binary_slot(Lookup, KVL, PressMethod, BuildTimings0) ->
+-spec accumulate_positions(leveled_codec:ledger_kv(),
+                           {binary(),
+                            non_neg_integer(),
+                            list(non_neg_integer()),
+                            leveled_codec:last_moddate()}) ->
+                                {binary(),
+                                 non_neg_integer(),
+                                 list(non_neg_integer()),
+                                 leveled_codec:last_moddate()}.
+%% @doc
+%% Fold function used to accumulate the position information needed to
+%% populate the summary of the slot
+accumulate_positions({K, V}, {PosBinAcc, NoHashCount, HashAcc, LMDAcc}) ->
+    {_SQN, H1, LMD} = leveled_codec:strip_to_indexdetails({K, V}),
+    LMDAcc0 = take_max_lastmoddate(LMD, LMDAcc),
+    PosH1 = extra_hash(H1),
+    case is_integer(PosH1) of
+        true ->
+            case NoHashCount of
+                0 ->
+                    {<<1:1/integer, PosH1:15/integer, PosBinAcc/binary>>,
+                     0,
+                     [H1|HashAcc],
+                     LMDAcc0};
+                N ->
+                    % The No Hash Count is an integer between 0 and 127
+                    % and so at read time should count NHC + 1
+                    NHC = N - 1,
+                    {<<1:1/integer,
+                       PosH1:15/integer,
+                       0:1/integer,
+                       NHC:7/integer,
+                       PosBinAcc/binary>>,
+                     0,
+                     HashAcc,
+                     LMDAcc0}
+            end;
+        false ->
+            {PosBinAcc, NoHashCount + 1, HashAcc, LMDAcc0}
+    end.
+
+
+-spec take_max_lastmoddate(leveled_codec:last_moddate(),
+                           leveled_codec:last_moddate()) ->
+                                leveled_codec:last_moddate().
+%% @doc
+%% Take the maximum of the last modified dates.  If any object has no Last
+%% Modified Date the accelerator cannot be added, and each object must be
+%% checked in turn
+take_max_lastmoddate(undefined, _LMDAcc) ->
+    ?FLIPPER32;
+take_max_lastmoddate(LMD, LMDAcc) ->
+    max(LMD, LMDAcc).
+
+-spec generate_binary_slot(leveled_codec:lookup(),
+                           list(leveled_codec:ledger_kv()),
+                           press_method(),
+                           boolean(),
+                           build_timings()) ->
+                                {{binary(),
+                                  binary(),
+                                  list(integer()),
+                                  leveled_codec:ledger_key()},
+                                 build_timings()}.
+%% @doc
+%% Generate the serialised slot to be used when storing this sublist of keys
+%% and values
+generate_binary_slot(Lookup, KVL, PressMethod, IndexModDate, BuildTimings0) ->
     SW0 = os:timestamp(),
-
-    HashFoldFun =
-        fun({K, V}, {PosBinAcc, NoHashCount, HashAcc}) ->
-
-            {_SQN, H1} = leveled_codec:strip_to_seqnhashonly({K, V}),
-            PosH1 = extra_hash(H1),
-            case is_integer(PosH1) of
-                true ->
-                    case NoHashCount of
-                        0 ->
-                            {<<1:1/integer,
-                                PosH1:15/integer,
-                                PosBinAcc/binary>>,
-                             0,
-                             [H1|HashAcc]};
-                        N ->
-                            % The No Hash Count is an integer between 0 and 127
-                            % and so at read time should count NHC + 1
-                            NHC = N - 1,
-                            {<<1:1/integer,
-                                PosH1:15/integer,
-                                0:1/integer,
-                                NHC:7/integer,
-                                PosBinAcc/binary>>,
-                             0,
-                             HashAcc}
-                    end;
-                false ->
-                    {PosBinAcc, NoHashCount + 1, HashAcc}
-            end
-
-        end,
 
-    {HashL, PosBinIndex} =
+    {HashL, PosBinIndex, LMD} =
         case Lookup of
             lookup ->
-                {PosBinIndex0,
-                 NHC,
-                 HashL0} = lists:foldr(HashFoldFun, {<<>>, 0, []}, KVL),
+                InitAcc = {<<>>, 0, [], 0},
+                {PosBinIndex0, NHC, HashL0, LMD0} =
+                    lists:foldr(fun accumulate_positions/2, InitAcc, KVL),
                 PosBinIndex1 =
                     case NHC of
                         0 ->
@@ -1181,9 +1359,9 @@
                             N = NHC - 1,
                             <<0:1/integer, N:7/integer, PosBinIndex0/binary>>
                     end,
-                {HashL0, PosBinIndex1};
+                {HashL0, PosBinIndex1, LMD0};
             no_lookup ->
-                {[], <<0:1/integer, 127:7/integer>>}
+                {[], <<0:1/integer, 127:7/integer>>, 0}
         end,
 
     BuildTimings1 = update_buildtimings(SW0, BuildTimings0, slot_hashlist),
@@ -1244,19 +1422,37 @@
 
     BuildTimings2 = update_buildtimings(SW1, BuildTimings1, slot_serialise),
     SW2 = os:timestamp(),
 
-    B1P = byte_size(PosBinIndex) + ?BLOCK_LENGTHS_LENGTH,
+    B1P =
+        case IndexModDate of
+            true ->
+                byte_size(PosBinIndex) + ?BLOCK_LENGTHS_LENGTH + ?LMD_LENGTH;
+            false ->
+                byte_size(PosBinIndex) + ?BLOCK_LENGTHS_LENGTH
+        end,
     CheckB1P = hmac(B1P),
     B1L = byte_size(B1),
     B2L = byte_size(B2),
     B3L = byte_size(B3),
     B4L = byte_size(B4),
     B5L = byte_size(B5),
-    Header = <<B1L:32/integer, B2L:32/integer, B3L:32/integer,
-                B4L:32/integer, B5L:32/integer, PosBinIndex/binary>>,
+    Header =
+        case IndexModDate of
+            true ->
+                <<B1L:32/integer, B2L:32/integer, B3L:32/integer,
+                    B4L:32/integer, B5L:32/integer,
+                    LMD:32/integer, PosBinIndex/binary>>;
+            false ->
+                <<B1L:32/integer, B2L:32/integer, B3L:32/integer,
+                    B4L:32/integer, B5L:32/integer, PosBinIndex/binary>>
+        end,
     CheckH = hmac(Header),
     SlotBin = <<CheckB1P:32/integer, B1P:32/integer,
                 CheckH:32/integer, Header/binary,
                 B1/binary, B2/binary, B3/binary, B4/binary, B5/binary>>,
 
     BuildTimings3 = update_buildtimings(SW2, BuildTimings2, slot_finish),
 
     {{Header, SlotBin, HashL, LastKey}, BuildTimings3}.
-
-
 -spec check_blocks(list(integer()),
                    binary()|{file:io_device(), integer()},
                    binary(), integer(),
                    leveled_codec:ledger_key()|false,
-                   press_methods(),
+                   press_method(),
+                   boolean(),
                    list()|not_present) -> list()|not_present.
 %% @doc
 %% Acc should start as not_present if LedgerKey is a key, and a list if
 %% LedgerKey is false
 
 check_blocks([], _BlockPointer, _BlockLengths, _PosBinLength,
-             _LedgerKeyToCheck, _PressMethod, not_present) ->
+             _LedgerKeyToCheck, _PressMethod, _IdxModDate, not_present) ->
     not_present;
 check_blocks([], _BlockPointer, _BlockLengths, _PosBinLength,
-             _LedgerKeyToCheck, _PressMethod, Acc) ->
+             _LedgerKeyToCheck, _PressMethod, _IdxModDate, Acc) ->
     lists:reverse(Acc);
 check_blocks([Pos|Rest], BlockPointer, BlockLengths, PosBinLength,
-             LedgerKeyToCheck, PressMethod, Acc) ->
+             LedgerKeyToCheck, PressMethod, IdxModDate, Acc) ->
     {BlockNumber, BlockPos} = revert_position(Pos),
     BlockBin =
         read_block(BlockPointer,
                    BlockLengths,
                    PosBinLength,
-                   BlockNumber),
+                   BlockNumber,
+                   additional_offset(IdxModDate)),
     BlockL = deserialise_block(BlockBin, PressMethod),
     {K, V} = lists:nth(BlockPos, BlockL),
     case K of
@@ -1306,30 +1502,38 @@ check_blocks([Pos|Rest], BlockPointer, BlockLengths, PosBinLength,
                 false ->
                     check_blocks(Rest, BlockPointer,
                                  BlockLengths, PosBinLength,
-                                 LedgerKeyToCheck, PressMethod,
+                                 LedgerKeyToCheck, PressMethod, IdxModDate,
                                  [{K, V}|Acc]);
                 _ ->
                     check_blocks(Rest, BlockPointer,
                                  BlockLengths, PosBinLength,
-                                 LedgerKeyToCheck, PressMethod, Acc)
+                                 LedgerKeyToCheck, PressMethod, IdxModDate,
+                                 Acc)
             end
     end.
 
+-spec additional_offset(boolean()) -> pos_integer().
+%% @doc
+%% 4-byte CRC, 4-byte pos, 4-byte CRC, 5x4 byte lengths, 4 byte LMD
+%% LMD may not be present
+additional_offset(true) ->
+    ?BLOCK_LENGTHS_LENGTH + 4 + 4 + 4 + ?LMD_LENGTH;
+additional_offset(false) ->
+    ?BLOCK_LENGTHS_LENGTH + 4 + 4 + 4.
 
-read_block({Handle, StartPos}, BlockLengths, PosBinLength, BlockID) ->
+
+read_block({Handle, StartPos}, BlockLengths, PosBinLength, BlockID, AO) ->
     {Offset, Length} = block_offsetandlength(BlockLengths, BlockID),
     {ok, BlockBin} = file:pread(Handle,
                                 StartPos
                                     + Offset
                                     + PosBinLength
-                                    + 32,
-                                    % 4-byte CRC, 4-byte pos,
-                                    % 4-byte CRC, 5x4 byte lengths
+                                    + AO,
                                 Length),
     BlockBin;
-read_block(SlotBin, BlockLengths, PosBinLength, BlockID) ->
+read_block(SlotBin, BlockLengths, PosBinLength, BlockID, AO) ->
     {Offset, Length} = block_offsetandlength(BlockLengths, BlockID),
-    StartPos = Offset + PosBinLength + 32,
+    StartPos = Offset + PosBinLength + AO,
     <<_Pre:StartPos/binary, BlockBin:Length/binary, _Rest/binary>> = SlotBin,
     BlockBin.
@@ -1368,12 +1572,12 @@ binarysplit_mapfun(MultiSlotBin, StartPos) ->
 
 -spec read_slots(file:io_device(), list(),
-                 {false|list(), any()}, press_methods())
-                    -> list(binaryslot_element()).
+                 {false|list(), non_neg_integer(), binary()},
+                 press_method(), boolean()) -> list(binaryslot_element()).
 %% @doc
 %% The reading of slots will return a list of either 2-tuples containing
 %% {K, V} pairs - or 3-tuples containing {Binary, SK, EK}.  The 3 tuples
-%% can be exploded into lists of {K, V} pairs using the binaryslot_reader/3
+%% can be exploded into lists of {K, V} pairs using the binaryslot_reader/4
 %% function
 %%
 %% Reading slots is generally unfiltered, but in the special case when
@@ -1384,12 +1588,13 @@ binarysplit_mapfun(MultiSlotBin, StartPos) ->
 %% any key comparison between levels should allow for a non-matching key to
 %% be considered as superior to a matching key - as otherwise a matching key
 %% may be intermittently removed from the result set
-read_slots(Handle, SlotList, {false, _BlockIndexCache}, _PressMethod) ->
-    % No list of segments passed
-    LengthList = lists:map(fun pointer_mapfun/1, SlotList),
-    {MultiSlotBin, StartPos} = read_length_list(Handle, LengthList),
-    lists:map(binarysplit_mapfun(MultiSlotBin, StartPos), LengthList);
-read_slots(Handle, SlotList, {SegList, BlockIndexCache}, PressMethod) ->
+read_slots(Handle, SlotList, {false, 0, _BlockIndexCache},
+           _PressMethod, _IdxModDate) ->
+    % No list of segments passed, and no useful Low LastModified Date -
+    % just read the slots in SlotList
+    read_slotlist(SlotList, Handle);
+read_slots(Handle, SlotList, {SegList, LowLastMod, BlockIndexCache},
+           PressMethod, IdxModDate) ->
     % List of segments passed so only {K, V} pairs matching those segments
     % should be returned.  This requires the {K, V} pair to have been added
     % with the appropriate hash - pairs added with no_lookup cannot be
     % matched by a segment filter
     BinMapFun =
         fun(Pointer, Acc) ->
             {SP, _L, ID, _SK, _EK} = pointer_mapfun(Pointer),
-            BL = ?BLOCK_LENGTHS_LENGTH,
-            case array:get(ID - 1, BlockIndexCache) of
+            CachedHeader = array:get(ID - 1, BlockIndexCache),
+            case extract_header(CachedHeader, IdxModDate) of
                 none ->
                     % If there is an attempt to use the seg list query and the
                     % index block cache isn't cached for any part this may be
                     % slower as each slot will be read in turn
-                    LengthDetails = pointer_mapfun(Pointer),
-                    {MultiSlotBin, StartPos} =
-                        read_length_list(Handle, [LengthDetails]),
-                    MapFun = binarysplit_mapfun(MultiSlotBin, StartPos),
-                    Acc ++ [MapFun(LengthDetails)];
-                <<BlockLengths:BL/binary, BlockIdx/binary>> ->
+                    Acc ++ read_slotlist([Pointer], Handle);
+                {BlockLengths, LMD, BlockIdx} ->
                     % If there is a BlockIndex cached then we can use it to
                     % check to see if any of the expected segments are
                     % present without lifting the slot off disk. Also the
                     % fact that we know position can be used to filter out
                     % other keys
-                    case find_pos(BlockIdx, SegList, [], 0) of
-                        [] ->
+                    %
+                    % Note that LMD will be 0 if the indexing of last mod
+                    % date was not enabled at creation time.  So in this
+                    % case the filter should always pass
+                    case LowLastMod > LMD of
+                        true ->
+                            % The highest LMD on the slot was before the
+                            % LowLastMod date passed in the query - therefore
+                            % there are no interesting modifications in this
+                            % slot - it is all too old
                             Acc;
-                        PositionList ->
-                            Acc ++
-                                check_blocks(PositionList,
-                                             {Handle, SP},
-                                             BlockLengths,
-                                             byte_size(BlockIdx),
-                                             false, PressMethod,
-                                             [])
+                        false ->
+                            case SegList of
+                                false ->
+                                    % Need all the slot now
+                                    Acc ++ read_slotlist([Pointer], Handle);
+                                _SL ->
+                                    % Need to find just the right keys
+                                    PositionList =
+                                        find_pos(BlockIdx, SegList, [], 0),
+                                    Acc ++
+                                        check_blocks(PositionList,
+                                                     {Handle, SP},
+                                                     BlockLengths,
+                                                     byte_size(BlockIdx),
+                                                     false,
+                                                     PressMethod,
+                                                     IdxModDate,
+                                                     [])
+                                    % Note check_blocks should return [] if
+                                    % PositionList is empty
+                            end
                    end
            end
        end,
    lists:foldl(BinMapFun, [], SlotList).
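The BinMapFun above makes a per-slot, three-way choice. Restated as a small decision function, using the {BlockLengths, LMD, PosBin} shape returned by extract_header/2 (a sketch only; the atoms name the branches, they are not leveled return values):

%% none -> header not cached, read the whole slot;
%% stale LMD -> skip the slot entirely;
%% no segment filter -> read the whole slot;
%% otherwise -> probe individual blocks via the position index.
slot_action(none, _LowLastMod, _SegList) ->
    read_whole_slot;
slot_action({_BL, LMD, _PosBin}, LowLastMod, _SegList)
                                        when LowLastMod > LMD ->
    skip_slot;
slot_action({_BL, _LMD, _PosBin}, _LowLastMod, false) ->
    read_whole_slot;
slot_action({_BL, _LMD, _PosBin}, _LowLastMod, _SegList) ->
    check_blocks.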
+read_slotlist(SlotList, Handle) ->
+    LengthList = lists:map(fun pointer_mapfun/1, SlotList),
+    {MultiSlotBin, StartPos} = read_length_list(Handle, LengthList),
+    lists:map(binarysplit_mapfun(MultiSlotBin, StartPos), LengthList).
+
+
 -spec binaryslot_reader(list(binaryslot_element()),
-                        native|lz4,
+                        press_method(),
+                        boolean(),
                         leveled_codec:segment_list())
                             -> {list({tuple(), tuple()}),
                                 list({integer(), binary()})}.
 %% @doc
 %% Read the binary slots converting them to {K, V} pairs if they were not
 %% already {K, V} pairs
-binaryslot_reader(SlotBinsToFetch, PressMethod, SegList) ->
-    binaryslot_reader(SlotBinsToFetch, PressMethod, SegList, [], []).
+binaryslot_reader(SlotBinsToFetch, PressMethod, IdxModDate, SegList) ->
+    binaryslot_reader(SlotBinsToFetch,
+                      PressMethod, IdxModDate, SegList, [], []).
 
-binaryslot_reader([], _PressMethod, _SegList, Acc, BIAcc) ->
+binaryslot_reader([], _PressMethod, _IdxModDate, _SegList, Acc, BIAcc) ->
     {Acc, BIAcc};
 binaryslot_reader([{SlotBin, ID, SK, EK}|Tail],
-                  PressMethod, SegList, Acc, BIAcc) ->
+                  PressMethod, IdxModDate, SegList, Acc, BIAcc) ->
     {TrimmedL, BICache} =
         binaryslot_trimmedlist(SlotBin,
                                SK, EK,
                                PressMethod,
+                               IdxModDate,
                                SegList),
     binaryslot_reader(Tail,
                       PressMethod,
+                      IdxModDate,
                       SegList,
                       Acc ++ TrimmedL,
                       [{ID, BICache}|BIAcc]);
-binaryslot_reader([{K, V}|Tail], PressMethod, SegList, Acc, BIAcc) ->
-    binaryslot_reader(Tail, PressMethod, SegList, Acc ++ [{K, V}], BIAcc).
+binaryslot_reader([{K, V}|Tail],
+                  PressMethod, IdxModDate, SegList, Acc, BIAcc) ->
+    binaryslot_reader(Tail,
+                      PressMethod, IdxModDate, SegList,
+                      Acc ++ [{K, V}], BIAcc).
 
 
 read_length_list(Handle, LengthList) ->
@@ -1468,12 +1703,27 @@ read_length_list(Handle, LengthList) ->
     {MultiSlotBin, StartPos}.
 
 
+-spec extract_header(binary()|none, boolean()) ->
+                        {binary(), integer(), binary()}|none.
+%% @doc
+%% Helper for extracting the binaries from the header, ignoring the missing
+%% LMD when the LMD is not indexed
+extract_header(none, _IdxModDate) ->
+    none; % used when the block cache has returned none
+extract_header(Header, true) ->
+    BL = ?BLOCK_LENGTHS_LENGTH,
+    <<BlockLengths:BL/binary, LMD:32/integer, PosBinIndex/binary>> = Header,
+    {BlockLengths, LMD, PosBinIndex};
+extract_header(Header, false) ->
+    BL = ?BLOCK_LENGTHS_LENGTH,
+    <<BlockLengths:BL/binary, PosBinIndex/binary>> = Header,
+    {BlockLengths, 0, PosBinIndex}.
 
-binaryslot_get(FullBin, Key, Hash, PressMethod) ->
+binaryslot_get(FullBin, Key, Hash, PressMethod, IdxModDate) ->
     case crc_check_slot(FullBin) of
         {Header, Blocks} ->
-            BL = ?BLOCK_LENGTHS_LENGTH,
-            <<BlockLengths:BL/binary, PosBinIndex/binary>> = Header,
+            {BlockLengths, _LMD, PosBinIndex} =
+                extract_header(Header, IdxModDate),
             PosList = find_pos(PosBinIndex,
                                extra_hash(Hash),
                                [],
@@ -1485,7 +1735,7 @@
              none}
     end.
 
-binaryslot_tolist(FullBin, PressMethod) ->
+binaryslot_tolist(FullBin, PressMethod, IdxModDate) ->
     BlockFetchFun =
         fun(Length, {Acc, Bin}) ->
             case Length of
@@ -1500,12 +1750,13 @@
     {Out, _Rem} =
         case crc_check_slot(FullBin) of
             {Header, Blocks} ->
+                {BlockLengths, _LMD, _PosBinIndex} =
+                    extract_header(Header, IdxModDate),
                 <<B1L:32/integer,
                     B2L:32/integer,
                     B3L:32/integer,
                     B4L:32/integer,
-                    B5L:32/integer,
-                    _PosBinIndex/binary>> = Header,
+                    B5L:32/integer>> = BlockLengths,
                 lists:foldl(BlockFetchFun,
                             {[], Blocks},
                             [B1L, B2L, B3L, B4L, B5L]);
@@ -1515,9 +1766,11 @@
    Out.
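To make the two header layouts concrete, here is a small construction sketch matching what extract_header/2 parses; the moddate-indexed variant is exactly ?LMD_LENGTH (4) bytes longer. header/4 is an illustrative name, not part of the module:

-define(BLOCK_LENGTHS_LENGTH, 20).

%% 20 bytes of block lengths, an optional 4-byte LMD, then the position
%% index binary.
header(BlockLengths, LMD, PosBin, true)
        when byte_size(BlockLengths) == ?BLOCK_LENGTHS_LENGTH ->
    <<BlockLengths/binary, LMD:32/integer, PosBin/binary>>;
header(BlockLengths, _LMD, PosBin, false)
        when byte_size(BlockLengths) == ?BLOCK_LENGTHS_LENGTH ->
    <<BlockLengths/binary, PosBin/binary>>.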
-binaryslot_trimmedlist(FullBin, all, all, PressMethod, false) ->
-    {binaryslot_tolist(FullBin, PressMethod), none};
-binaryslot_trimmedlist(FullBin, StartKey, EndKey, PressMethod, SegList) ->
+binaryslot_trimmedlist(FullBin, all, all,
+                       PressMethod, IdxModDate, false) ->
+    {binaryslot_tolist(FullBin, PressMethod, IdxModDate), none};
+binaryslot_trimmedlist(FullBin, StartKey, EndKey,
+                       PressMethod, IdxModDate, SegList) ->
     LTrimFun = fun({K, _V}) -> K < StartKey end,
     RTrimFun = fun({K, _V}) -> not leveled_codec:endkey_passed(EndKey, K) end,
     BlockCheckFun =
@@ -1566,12 +1819,13 @@
         % and Block3 (as Block 1 will also be checked), but finessing this last
         % scenario is hard to do in concise code
         {{Header, Blocks}, false} ->
+            {BlockLengths, _LMD, _PosBinIndex} =
+                extract_header(Header, IdxModDate),
             <<B1L:32/integer,
                 B2L:32/integer,
                 B3L:32/integer,
                 B4L:32/integer,
-                B5L:32/integer,
-                _PosBinIndex/binary>> = Header,
+                B5L:32/integer>> = BlockLengths,
             <<Block1:B1L/binary, Block2:B2L/binary,
                 Block3:B3L/binary, Block4:B4L/binary,
                 Block5:B5L/binary, _Rest/binary>> = Blocks,
@@ -1608,15 +1862,15 @@
             {Acc, _Continue} = lists:foldl(BlockCheckFun,
                                            {[], true},
                                            BlocksToCheck),
             {Acc, none};
         {{Header, _Blocks}, SegList} ->
-            BL = ?BLOCK_LENGTHS_LENGTH,
-            <<BlockLengths:BL/binary, BlockIdx/binary>> = Header,
+            {BlockLengths, _LMD, BlockIdx} = extract_header(Header, IdxModDate),
             PosList = find_pos(BlockIdx, SegList, [], 0),
             KVL = check_blocks(PosList,
                                FullBin,
                                BlockLengths,
                                byte_size(BlockIdx),
                                false,
-                               PressMethod,
+                               PressMethod,
+                               IdxModDate,
                                []),
             {KVL, Header};
         {crc_wonky, _} ->
@@ -1768,7 +2022,7 @@ find_pos(<<0:1/integer, NHC:7/integer, T/binary>>, Hash, PosList, Count) ->
 %% large numbers of index keys are present - as well as improving compression
 %% ratios in the Ledger.
 %%
-%% The outcome of merge_lists/1 and merge_lists/3 should be an list of slots.
+%% The outcome of merge_lists/3 and merge_lists/5 should be a list of slots.
 %% Each slot should be ordered by Key and be of the form {Flag, KVList}, where
 %% Flag can either be lookup or no-lookup.  The list of slots should also be
 %% ordered by Key (i.e. the first key in the slot)
@@ -1784,63 +2038,65 @@ find_pos(<<0:1/integer, NHC:7/integer, T/binary>>, Hash, PosList, Count) ->
 %% there are matching keys then the highest sequence number must be chosen and
 %% any lower sequence numbers should be compacted out of existence
 
--spec merge_lists(list(), atom())
+-spec merge_lists(list(), press_method(), boolean())
                     -> {list(), list(), list(tuple()), tuple()|null}.
 %% @doc
 %%
 %% Merge from a single list (i.e. at Level 0)
-merge_lists(KVList1, PressMethod) ->
+merge_lists(KVList1, PressMethod, IdxModDate) ->
     SlotCount = length(KVList1) div ?LOOK_SLOTSIZE,
     {[],
      [],
-     split_lists(KVList1, [], SlotCount, PressMethod),
+     split_lists(KVList1, [], SlotCount, PressMethod, IdxModDate),
     element(1, lists:nth(1, KVList1))}.
 
-split_lists([], SlotLists, 0, _PressMethod) ->
+split_lists([], SlotLists, 0, _PressMethod, _IdxModDate) ->
     lists:reverse(SlotLists);
-split_lists(LastPuff, SlotLists, 0, PressMethod) ->
+split_lists(LastPuff, SlotLists, 0, PressMethod, IdxModDate) ->
     {SlotD, _} =
-        generate_binary_slot(lookup, LastPuff, PressMethod, no_timing),
+        generate_binary_slot(lookup, LastPuff, PressMethod,
+                             IdxModDate, no_timing),
     lists:reverse([SlotD|SlotLists]);
-split_lists(KVList1, SlotLists, N, PressMethod) ->
+split_lists(KVList1, SlotLists, N, PressMethod, IdxModDate) ->
     {Slot, KVListRem} = lists:split(?LOOK_SLOTSIZE, KVList1),
     {SlotD, _} =
-        generate_binary_slot(lookup, Slot, PressMethod, no_timing),
-    split_lists(KVListRem, [SlotD|SlotLists], N - 1, PressMethod).
+        generate_binary_slot(lookup, Slot, PressMethod, IdxModDate, no_timing),
+    split_lists(KVListRem, [SlotD|SlotLists], N - 1, PressMethod, IdxModDate).
 
--spec merge_lists(list(), list(), tuple(), atom()) ->
+-spec merge_lists(list(), list(), tuple(), press_method(), boolean()) ->
                     {list(), list(), list(tuple()), tuple()|null}.
 %% @doc
 %% Merge lists when merging across more than one file.  KVLists that are
 %% provided may include pointers to fetch more Keys/Values from the source
 %% file
-merge_lists(KVList1, KVList2, LevelInfo, PressMethod) ->
+merge_lists(KVList1, KVList2, LevelInfo, PressMethod, IndexModDate) ->
     merge_lists(KVList1, KVList2,
                 LevelInfo,
                 [], null, 0,
-                PressMethod,
+                PressMethod,
+                IndexModDate,
                 #build_timings{}).
 
-merge_lists(KVList1, KVList2, LI, SlotList, FirstKey, ?MAX_SLOTS,
-            _PressMethod, T0) ->
+merge_lists(KVL1, KVL2, LI, SlotList, FirstKey, ?MAX_SLOTS,
+            _PressMethod, _IdxModDate, T0) ->
     % This SST file is full, move to complete file, and return the
     % remainder
     log_buildtimings(T0, LI),
-    {KVList1, KVList2, lists:reverse(SlotList), FirstKey};
-merge_lists([], [], LI, SlotList, FirstKey, _SlotCount, _PressMethod, T0) ->
+    {KVL1, KVL2, lists:reverse(SlotList), FirstKey};
+merge_lists([], [], LI, SlotList, FirstKey, _SlotCount,
+            _PressMethod, _IdxModDate, T0) ->
     % the source files are empty, complete the file
     log_buildtimings(T0, LI),
     {[], [], lists:reverse(SlotList), FirstKey};
-merge_lists(KVList1, KVList2, LI, SlotList, FirstKey, SlotCount,
-            PressMethod, T0) ->
+merge_lists(KVL1, KVL2, LI, SlotList, FirstKey, SlotCount,
+            PressMethod, IdxModDate, T0) ->
     % Form a slot by merging the two lists until the next 128 K/V pairs have
     % been determined
     SW = os:timestamp(),
     {KVRem1, KVRem2, Slot, FK0} =
-        form_slot(KVList1, KVList2, LI, no_lookup, 0, [], FirstKey),
+        form_slot(KVL1, KVL2, LI, no_lookup, 0, [], FirstKey),
     T1 = update_buildtimings(SW, T0, fold_toslot),
     case Slot of
         {_, []} ->
@@ -1852,12 +2108,13 @@ merge_lists(KVList1, KVList2, LI, SlotList, FirstKey, SlotCount,
                         FK0,
                         SlotCount,
                         PressMethod,
+                        IdxModDate,
                        T1);
        {Lookup, KVL} ->
            % Convert the list of KVs for the slot into a binary, and related
            % metadata
            {SlotD, T2} =
-                generate_binary_slot(Lookup, KVL, PressMethod, T1),
+                generate_binary_slot(Lookup, KVL, PressMethod, IdxModDate, T1),
            merge_lists(KVRem1,
                        KVRem2,
                        LI,
@@ -1865,6 +2122,7 @@ merge_lists(KVList1, KVList2, LI, SlotList, FirstKey, SlotCount,
                        FK0,
                        SlotCount + 1,
                        PressMethod,
+                        IdxModDate,
                        T2)
    end.
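A note on why take_max_lastmoddate/2 pins the slot maximum to ?FLIPPER32 (the largest unsigned 32-bit value) whenever any object lacks an LMD: the bulk-skip test in read_slots is `LowLastMod > LMD`, and no 32-bit floor can exceed the sentinel, so such slots are never skipped and every object is still checked individually. A one-line illustration:

-define(FLIPPER32, 4294967295).

%% A slot whose recorded max LMD is the sentinel can never be bulk-skipped.
should_skip(LowLastMod, SlotMaxLMD) ->
    LowLastMod > SlotMaxLMD.
%% should_skip(Floor, ?FLIPPER32) =:= false for any 32-bit Floor.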
- -expand_list_by_pointer({pointer, SSTPid, Slot, StartKey, EndKey}, - Tail, Width, SegList) -> - FoldFun = - fun(X, {Pointers, Remainder}) -> - case length(Pointers) of - L when L < Width -> - case X of - {pointer, SSTPid, S, SK, EK} -> - {Pointers ++ [{pointer, S, SK, EK}], Remainder}; - _ -> - {Pointers, Remainder ++ [X]} - end; - _ -> - {Pointers, Remainder ++ [X]} - end - end, - InitAcc = {[{pointer, Slot, StartKey, EndKey}], []}, - {AccPointers, AccTail} = lists:foldl(FoldFun, InitAcc, Tail), - ExpPointers = - leveled_sst:sst_getfilteredslots(SSTPid, AccPointers, SegList), - lists:append(ExpPointers, AccTail); -expand_list_by_pointer({next, ManEntry, StartKey, EndKey}, - Tail, Width, SegList) -> - SSTPid = ManEntry#manifest_entry.owner, - leveled_log:log("SST10", [SSTPid, is_process_alive(SSTPid)]), - ExpPointer = - leveled_sst:sst_getfilteredrange(SSTPid, - StartKey, EndKey, - Width, SegList), - ExpPointer ++ Tail. - - %%%============================================================================ %%% Timing Functions @@ -2164,6 +2387,14 @@ update_timings(SW, Timings, Stage, Continue) -> -ifdef(TEST). +testsst_new(RootPath, Filename, Level, KVList, MaxSQN, PressMethod) -> + sst_new(RootPath, Filename, Level, KVList, MaxSQN, PressMethod, false). + +testsst_new(RootPath, Filename, + KVL1, KVL2, IsBasement, Level, MaxSQN, PressMethod) -> + sst_new(RootPath, Filename, KVL1, KVL2, IsBasement, Level, MaxSQN, + PressMethod, false). + generate_randomkeys(Seqn, Count, BucketRangeLow, BucketRangeHigh) -> generate_randomkeys(Seqn, Count, @@ -2228,7 +2459,8 @@ merge_tombstonelist_test() -> R = merge_lists([SkippingKV1, SkippingKV3, SkippingKV5], [SkippingKV2, SkippingKV4], {true, 9999999}, - native), + native, + ?INDEX_MODDATE), ?assertMatch({[], [], [], null}, R). 
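The test-only testsst_new/6 and testsst_new/8 wrappers above pin IndexModDate to false so the legacy v1 slot layout keeps coverage, while sst_new defaults it on. The suite's convention is to run each scenario against both constructors; a sketch of that pattern for any further tester (myfeature_* names are hypothetical):

myfeature_test() ->
    myfeature_tester(fun testsst_new/6),   % legacy layout, no LMD index
    myfeature_tester(fun sst_new/6).       % current layout, LMD indexed

myfeature_tester(SSTNewFun) ->
    %% build a file with the given constructor, then exercise it
    ok.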
indexed_list_test() -> @@ -2240,7 +2472,7 @@ indexed_list_test() -> SW0 = os:timestamp(), {{_PosBinIndex1, FullBin, _HL, _LK}, no_timing} = - generate_binary_slot(lookup, KVL1, native, no_timing), + generate_binary_slot(lookup, KVL1, native, ?INDEX_MODDATE, no_timing), io:format(user, "Indexed list created slot in ~w microseconds of size ~w~n", [timer:now_diff(os:timestamp(), SW0), byte_size(FullBin)]), @@ -2269,7 +2501,7 @@ indexed_list_mixedkeys_test() -> Keys = lists:ukeysort(1, generate_indexkeys(60) ++ KVL1), {{_PosBinIndex1, FullBin, _HL, _LK}, no_timing} = - generate_binary_slot(lookup, Keys, native, no_timing), + generate_binary_slot(lookup, Keys, native, ?INDEX_MODDATE, no_timing), {TestK1, TestV1} = lists:nth(4, KVL1), MH1 = leveled_codec:segment_hash(TestK1), @@ -2296,7 +2528,7 @@ indexed_list_mixedkeys2_test() -> % this isn't actually ordered correctly Keys = IdxKeys1 ++ KVL1 ++ IdxKeys2, {{_Header, FullBin, _HL, _LK}, no_timing} = - generate_binary_slot(lookup, Keys, native, no_timing), + generate_binary_slot(lookup, Keys, native, ?INDEX_MODDATE, no_timing), lists:foreach(fun({K, V}) -> MH = leveled_codec:segment_hash(K), test_binary_slot(FullBin, K, MH, {K, V}) @@ -2306,43 +2538,61 @@ indexed_list_mixedkeys2_test() -> indexed_list_allindexkeys_test() -> Keys = lists:sublist(lists:ukeysort(1, generate_indexkeys(150)), ?LOOK_SLOTSIZE), - {{Header, FullBin, _HL, _LK}, no_timing} = - generate_binary_slot(lookup, Keys, native, no_timing), + {{HeaderT, FullBinT, _HL, _LK}, no_timing} = + generate_binary_slot(lookup, Keys, native, true, no_timing), + {{HeaderF, FullBinF, _HL, _LK}, no_timing} = + generate_binary_slot(lookup, Keys, native, false, no_timing), EmptySlotSize = ?LOOK_SLOTSIZE - 1, - ?assertMatch(<<_BL:20/binary, EmptySlotSize:8/integer>>, Header), + LMD = ?FLIPPER32, + ?assertMatch(<<_BL:20/binary, LMD:32/integer, EmptySlotSize:8/integer>>, + HeaderT), + ?assertMatch(<<_BL:20/binary, EmptySlotSize:8/integer>>, + HeaderF), % SW = os:timestamp(), - BinToList = binaryslot_tolist(FullBin, native), + BinToListT = binaryslot_tolist(FullBinT, native, true), + BinToListF = binaryslot_tolist(FullBinF, native, false), % io:format(user, % "Indexed list flattened in ~w microseconds ~n", % [timer:now_diff(os:timestamp(), SW)]), - ?assertMatch(Keys, BinToList), - ?assertMatch({Keys, none}, binaryslot_trimmedlist(FullBin, + ?assertMatch(Keys, BinToListT), + ?assertMatch({Keys, none}, binaryslot_trimmedlist(FullBinT, all, all, - native, false)). + native, + true, + false)), + ?assertMatch(Keys, BinToListF), + ?assertMatch({Keys, none}, binaryslot_trimmedlist(FullBinF, + all, all, + native, + false, + false)). indexed_list_allindexkeys_nolookup_test() -> Keys = lists:sublist(lists:ukeysort(1, generate_indexkeys(1000)), ?NOLOOK_SLOTSIZE), {{Header, FullBin, _HL, _LK}, no_timing} = - generate_binary_slot(no_lookup, Keys, native, no_timing), - ?assertMatch(<<_BL:20/binary, 127:8/integer>>, Header), + generate_binary_slot(no_lookup, Keys, native, ?INDEX_MODDATE,no_timing), + ?assertMatch(<<_BL:20/binary, _LMD:32/integer, 127:8/integer>>, Header), % SW = os:timestamp(), - BinToList = binaryslot_tolist(FullBin, native), + BinToList = binaryslot_tolist(FullBin, native, ?INDEX_MODDATE), % io:format(user, % "Indexed list flattened in ~w microseconds ~n", % [timer:now_diff(os:timestamp(), SW)]), ?assertMatch(Keys, BinToList), ?assertMatch({Keys, none}, binaryslot_trimmedlist(FullBin, all, all, - native, false)). + native, + ?INDEX_MODDATE, + false)). 
indexed_list_allindexkeys_trimmed_test() -> Keys = lists:sublist(lists:ukeysort(1, generate_indexkeys(150)), ?LOOK_SLOTSIZE), {{Header, FullBin, _HL, _LK}, no_timing} = - generate_binary_slot(lookup, Keys, native, no_timing), + generate_binary_slot(lookup, Keys, native, ?INDEX_MODDATE,no_timing), EmptySlotSize = ?LOOK_SLOTSIZE - 1, - ?assertMatch(<<_BL:20/binary, EmptySlotSize:8/integer>>, Header), + ?assertMatch(<<_BL:20/binary, _LMD:32/integer, EmptySlotSize:8/integer>>, + Header), ?assertMatch({Keys, none}, binaryslot_trimmedlist(FullBin, {i, "Bucket", @@ -2353,26 +2603,30 @@ indexed_list_allindexkeys_trimmed_test() -> {"t1_int", 99999}, null}, native, + ?INDEX_MODDATE, false)), {SK1, _} = lists:nth(10, Keys), {EK1, _} = lists:nth(100, Keys), R1 = lists:sublist(Keys, 10, 91), - {O1, none} = binaryslot_trimmedlist(FullBin, SK1, EK1, native, false), + {O1, none} = binaryslot_trimmedlist(FullBin, SK1, EK1, + native, ?INDEX_MODDATE, false), ?assertMatch(91, length(O1)), ?assertMatch(R1, O1), {SK2, _} = lists:nth(10, Keys), {EK2, _} = lists:nth(20, Keys), R2 = lists:sublist(Keys, 10, 11), - {O2, none} = binaryslot_trimmedlist(FullBin, SK2, EK2, native, false), + {O2, none} = binaryslot_trimmedlist(FullBin, SK2, EK2, + native, ?INDEX_MODDATE, false), ?assertMatch(11, length(O2)), ?assertMatch(R2, O2), {SK3, _} = lists:nth(?LOOK_SLOTSIZE - 1, Keys), {EK3, _} = lists:nth(?LOOK_SLOTSIZE, Keys), R3 = lists:sublist(Keys, ?LOOK_SLOTSIZE - 1, 2), - {O3, none} = binaryslot_trimmedlist(FullBin, SK3, EK3, native, false), + {O3, none} = binaryslot_trimmedlist(FullBin, SK3, EK3, + native, ?INDEX_MODDATE, false), ?assertMatch(2, length(O3)), ?assertMatch(R3, O3). @@ -2382,7 +2636,7 @@ indexed_list_mixedkeys_bitflip_test() -> KVL1 = lists:sublist(KVL0, 33), Keys = lists:ukeysort(1, generate_indexkeys(60) ++ KVL1), {{Header, SlotBin, _HL, LK}, no_timing} = - generate_binary_slot(lookup, Keys, native, no_timing), + generate_binary_slot(lookup, Keys, native, ?INDEX_MODDATE, no_timing), ?assertMatch(LK, element(1, lists:last(Keys))), @@ -2390,7 +2644,8 @@ indexed_list_mixedkeys_bitflip_test() -> _B2L:32/integer, _B3L:32/integer, _B4L:32/integer, - _B5L:32/integer, + _B5L:32/integer, + _LMD:32/integer, PosBin/binary>> = Header, TestKey1 = element(1, lists:nth(1, KVL1)), @@ -2400,7 +2655,7 @@ indexed_list_mixedkeys_bitflip_test() -> test_binary_slot(SlotBin, TestKey1, MH1, lists:nth(1, KVL1)), test_binary_slot(SlotBin, TestKey2, MH2, lists:nth(33, KVL1)), - ToList = binaryslot_tolist(SlotBin, native), + ToList = binaryslot_tolist(SlotBin, native, ?INDEX_MODDATE), ?assertMatch(Keys, ToList), [Pos1] = find_pos(PosBin, extra_hash(MH1), [], 0), @@ -2419,8 +2674,8 @@ indexed_list_mixedkeys_bitflip_test() -> test_binary_slot(SlotBin1, TestKey1, MH1, not_present), test_binary_slot(SlotBin2, TestKey2, MH2, not_present), - ToList1 = binaryslot_tolist(SlotBin1, native), - ToList2 = binaryslot_tolist(SlotBin2, native), + ToList1 = binaryslot_tolist(SlotBin1, native, ?INDEX_MODDATE), + ToList2 = binaryslot_tolist(SlotBin2, native, ?INDEX_MODDATE), ?assertMatch(true, is_list(ToList1)), ?assertMatch(true, is_list(ToList2)), @@ -2433,7 +2688,8 @@ indexed_list_mixedkeys_bitflip_test() -> {SK1, _} = lists:nth(10, Keys), {EK1, _} = lists:nth(20, Keys), - {O1, none} = binaryslot_trimmedlist(SlotBin3, SK1, EK1, native, false), + {O1, none} = binaryslot_trimmedlist(SlotBin3, SK1, EK1, + native, ?INDEX_MODDATE, false), ?assertMatch([], O1), SlotBin4 = flip_byte(SlotBin, 0, 20), @@ -2441,12 +2697,14 @@ 
indexed_list_mixedkeys_bitflip_test() -> test_binary_slot(SlotBin4, TestKey1, MH1, not_present), test_binary_slot(SlotBin5, TestKey1, MH1, not_present), - ToList4 = binaryslot_tolist(SlotBin4, native), - ToList5 = binaryslot_tolist(SlotBin5, native), + ToList4 = binaryslot_tolist(SlotBin4, native, ?INDEX_MODDATE), + ToList5 = binaryslot_tolist(SlotBin5, native, ?INDEX_MODDATE), ?assertMatch([], ToList4), ?assertMatch([], ToList5), - {O4, none} = binaryslot_trimmedlist(SlotBin4, SK1, EK1, native, false), - {O5, none} = binaryslot_trimmedlist(SlotBin4, SK1, EK1, native, false), + {O4, none} = binaryslot_trimmedlist(SlotBin4, SK1, EK1, + native, ?INDEX_MODDATE, false), + {O5, none} = binaryslot_trimmedlist(SlotBin4, SK1, EK1, + native, ?INDEX_MODDATE, false), ?assertMatch([], O4), ?assertMatch([], O5). @@ -2464,23 +2722,28 @@ flip_byte(Binary, Offset, Length) -> test_binary_slot(FullBin, Key, Hash, ExpectedValue) -> % SW = os:timestamp(), - {ReturnedValue, _Header} = binaryslot_get(FullBin, Key, Hash, native), + {ReturnedValue, _Header} = + binaryslot_get(FullBin, Key, Hash, native, ?INDEX_MODDATE), ?assertMatch(ExpectedValue, ReturnedValue). % io:format(user, "Fetch success in ~w microseconds ~n", % [timer:now_diff(os:timestamp(), SW)]). - merge_test() -> + merge_tester(fun testsst_new/6, fun testsst_new/8), + merge_tester(fun sst_new/6, fun sst_new/8). + + +merge_tester(NewFunS, NewFunM) -> N = 3000, KVL1 = lists:ukeysort(1, generate_randomkeys(N + 1, N, 1, 20)), KVL2 = lists:ukeysort(1, generate_randomkeys(1, N, 1, 20)), KVL3 = lists:ukeymerge(1, KVL1, KVL2), SW0 = os:timestamp(), {ok, P1, {FK1, LK1}, _Bloom1} = - sst_new("../test/", "level1_src", 1, KVL1, 6000, native), + NewFunS("../test/", "level1_src", 1, KVL1, 6000, native), {ok, P2, {FK2, LK2}, _Bloom2} = - sst_new("../test/", "level2_src", 2, KVL2, 3000, native), + NewFunS("../test/", "level2_src", 2, KVL2, 3000, native), ExpFK1 = element(1, lists:nth(1, KVL1)), ExpLK1 = element(1, lists:last(KVL1)), ExpFK2 = element(1, lists:nth(1, KVL2)), @@ -2492,7 +2755,7 @@ merge_test() -> ML1 = [{next, #manifest_entry{owner = P1}, FK1}], ML2 = [{next, #manifest_entry{owner = P2}, FK2}], NewR = - sst_new("../test/", "level2_merge", ML1, ML2, false, 2, N * 2, native), + NewFunM("../test/", "level2_merge", ML1, ML2, false, 2, N * 2, native), {ok, P3, {{Rem1, Rem2}, FK3, LK3}, _Bloom3} = NewR, ?assertMatch([], Rem1), ?assertMatch([], Rem2), @@ -2521,13 +2784,17 @@ merge_test() -> simple_persisted_range_test() -> + simple_persisted_range_tester(fun testsst_new/6), + simple_persisted_range_tester(fun sst_new/6). + +simple_persisted_range_tester(SSTNewFun) -> {RP, Filename} = {"../test/", "simple_test"}, KVList0 = generate_randomkeys(1, ?LOOK_SLOTSIZE * 16, 1, 20), KVList1 = lists:ukeysort(1, KVList0), [{FirstKey, _FV}|_Rest] = KVList1, {LastKey, _LV} = lists:last(KVList1), {ok, Pid, {FirstKey, LastKey}, _Bloom} = - sst_new(RP, Filename, 1, KVList1, length(KVList1), native), + SSTNewFun(RP, Filename, 1, KVList1, length(KVList1), native), {o, B, K, null} = LastKey, SK1 = {o, B, K, 0}, @@ -2628,6 +2895,11 @@ additional_range_test() -> simple_persisted_slotsize_test() -> + simple_persisted_slotsize_tester(fun testsst_new/6), + simple_persisted_slotsize_tester(fun sst_new/6). 
+ + +simple_persisted_slotsize_tester(SSTNewFun) -> {RP, Filename} = {"../test/", "simple_slotsize_test"}, KVList0 = generate_randomkeys(1, ?LOOK_SLOTSIZE * 2, 1, 20), KVList1 = lists:sublist(lists:ukeysort(1, KVList0), @@ -2635,7 +2907,7 @@ simple_persisted_slotsize_test() -> [{FirstKey, _FV}|_Rest] = KVList1, {LastKey, _LV} = lists:last(KVList1), {ok, Pid, {FirstKey, LastKey}, _Bloom} = - sst_new(RP, Filename, 1, KVList1, length(KVList1), native), + SSTNewFun(RP, Filename, 1, KVList1, length(KVList1), native), lists:foreach(fun({K, V}) -> ?assertMatch({K, V}, sst_get(Pid, K)) end, @@ -2643,14 +2915,21 @@ simple_persisted_slotsize_test() -> ok = sst_close(Pid), ok = file:delete(filename:join(RP, Filename ++ ".sst")). -simple_persisted_test() -> +simple_persisted_test_() -> + {timeout, 60, fun simple_persisted_test_bothformats/0}. + +simple_persisted_test_bothformats() -> + simple_persisted_tester(fun testsst_new/6), + simple_persisted_tester(fun sst_new/6). + +simple_persisted_tester(SSTNewFun) -> {RP, Filename} = {"../test/", "simple_test"}, KVList0 = generate_randomkeys(1, ?LOOK_SLOTSIZE * 32, 1, 20), KVList1 = lists:ukeysort(1, KVList0), [{FirstKey, _FV}|_Rest] = KVList1, {LastKey, _LV} = lists:last(KVList1), {ok, Pid, {FirstKey, LastKey}, _Bloom} = - sst_new(RP, Filename, 1, KVList1, length(KVList1), native), + SSTNewFun(RP, Filename, 1, KVList1, length(KVList1), native), SW0 = os:timestamp(), lists:foreach(fun({K, V}) -> ?assertMatch({K, V}, sst_get(Pid, K)) @@ -2841,7 +3120,7 @@ hashmatching_bytreesize_test() -> end, KVL = lists:map(GenKeyFun, lists:seq(1, 128)), {{PosBinIndex1, _FullBin, _HL, _LK}, no_timing} = - generate_binary_slot(lookup, KVL, native, no_timing), + generate_binary_slot(lookup, KVL, native, ?INDEX_MODDATE, no_timing), check_segment_match(PosBinIndex1, KVL, small), check_segment_match(PosBinIndex1, KVL, medium). @@ -2872,5 +3151,12 @@ timings_test() -> ?assertMatch(true, T4#sst_timings.slot_fetch_time > T3#sst_timings.slot_fetch_time). +take_max_lastmoddate_test() -> + % TODO: Remove this test + % Temporarily added to make dialyzer happy (until we've made use of last + % modified dates + ?assertMatch(1, take_max_lastmoddate(0, 1)). + + -endif. diff --git a/src/leveled_tictac.erl b/src/leveled_tictac.erl index 76d77da..495ef7b 100644 --- a/src/leveled_tictac.erl +++ b/src/leveled_tictac.erl @@ -189,7 +189,7 @@ import_tree(ExportedTree) -> level2 = Lv2}. --spec add_kv(tictactree(), tuple(), tuple(), fun()) -> tictactree(). +-spec add_kv(tictactree(), term(), term(), fun()) -> tictactree(). %% @doc %% Add a Key and value to a tictactree using the BinExtractFun to extract a %% binary from the Key and value from which to generate the hash. The @@ -198,7 +198,7 @@ import_tree(ExportedTree) -> add_kv(TicTacTree, Key, Value, BinExtractFun) -> add_kv(TicTacTree, Key, Value, BinExtractFun, false). --spec add_kv(tictactree(), tuple(), tuple(), fun(), boolean()) +-spec add_kv(tictactree(), term(), term(), fun(), boolean()) -> tictactree()|{tictactree(), integer()}. %% @doc %% add_kv with ability to return segment ID of Key added @@ -523,8 +523,15 @@ get_size(Size) -> ?XLARGE end. -segmentcompare(SrcBin, SinkBin) when byte_size(SrcBin)==byte_size(SinkBin) -> - segmentcompare(SrcBin, SinkBin, [], 0). 
+ +segmentcompare(SrcBin, SinkBin) when byte_size(SrcBin) == byte_size(SinkBin) -> + segmentcompare(SrcBin, SinkBin, [], 0); +segmentcompare(<<>>, SinkBin) -> + Size = bit_size(SinkBin), + segmentcompare(<<0:Size/integer>>, SinkBin); +segmentcompare(SrcBin, <<>>) -> + Size = bit_size(SrcBin), + segmentcompare(SrcBin, <<0:Size/integer>>). segmentcompare(<<>>, <<>>, Acc, _Counter) -> Acc; @@ -836,6 +843,19 @@ matchbysegment_check(SegList, MatchList, SmallSize, LargeSize) -> OL = lists:filter(PredFun, MatchList), {timer:now_diff(os:timestamp(), SW)/1000, OL}. +find_dirtysegments_withanemptytree_test() -> + T1 = new_tree(t1), + T2 = new_tree(t2), + ?assertMatch([], find_dirtysegments(fetch_root(T1), fetch_root(T2))), + + {T3, DS1} = + add_kv(T2, <<"TestKey">>, <<"V1">>, fun(B, K) -> {B, K} end, true), + ExpectedAnswer = [DS1 div 256], + ?assertMatch(ExpectedAnswer, find_dirtysegments(<<>>, fetch_root(T3))), + ?assertMatch(ExpectedAnswer, find_dirtysegments(fetch_root(T3), <<>>)). + + + -endif. diff --git a/src/leveled_tree.erl b/src/leveled_tree.erl index 38df85d..f033bb4 100644 --- a/src/leveled_tree.erl +++ b/src/leveled_tree.erl @@ -35,6 +35,9 @@ integer(), % length any()}. +-export_type([leveled_tree/0]). + + %%%============================================================================ %%% API %%%============================================================================ diff --git a/test/end_to_end/iterator_SUITE.erl b/test/end_to_end/iterator_SUITE.erl index 8ec9312..5d9aa28 100644 --- a/test/end_to_end/iterator_SUITE.erl +++ b/test/end_to_end/iterator_SUITE.erl @@ -529,30 +529,31 @@ multibucket_fold(_Config) -> end, FoldAccT = {FF, []}, - {async, R1} = leveled_bookie:book_headfold(Bookie1, - ?RIAK_TAG, - {bucket_list, - [{<<"Type1">>, <<"Bucket1">>}, - {<<"Type2">>, <<"Bucket4">>}]}, - FoldAccT, - false, - true, - false), + {async, R1} = + leveled_bookie:book_headfold(Bookie1, + ?RIAK_TAG, + {bucket_list, + [{<<"Type1">>, <<"Bucket1">>}, + {<<"Type2">>, <<"Bucket4">>}]}, + FoldAccT, + false, + true, + false), O1 = length(R1()), io:format("Result R1 of length ~w~n", [O1]), - Q2 = {foldheads_bybucket, - ?RIAK_TAG, - [<<"Bucket2">>, <<"Bucket3">>], bucket_list, - {fun(_B, _K, _PO, Acc) -> - Acc +1 - end, - 0}, - false, - true, - false}, - {async, R2} = leveled_bookie:book_returnfolder(Bookie1, Q2), + {async, R2} = + leveled_bookie:book_headfold(Bookie1, + ?RIAK_TAG, + {bucket_list, + [<<"Bucket2">>, + <<"Bucket3">>]}, + {fun(_B, _K, _PO, Acc) -> + Acc +1 + end, + 0}, + false, true, false), O2 = R2(), io:format("Result R2 of ~w~n", [O2]), diff --git a/test/end_to_end/recovery_SUITE.erl b/test/end_to_end/recovery_SUITE.erl index 0f2f81a..eb748cc 100644 --- a/test/end_to_end/recovery_SUITE.erl +++ b/test/end_to_end/recovery_SUITE.erl @@ -227,7 +227,8 @@ aae_missingjournal(_Config) -> {foldheads_allkeys, ?RIAK_TAG, FoldHeadsFun, - true, true, false}), + true, true, false, + false, false}), HeadL2 = length(AllHeadF2()), io:format("Fold head returned ~w objects~n", [HeadL2]), true = HeadL2 < HeadL1, diff --git a/test/end_to_end/riak_SUITE.erl b/test/end_to_end/riak_SUITE.erl index 0482bd5..cf3aa01 100644 --- a/test/end_to_end/riak_SUITE.erl +++ b/test/end_to_end/riak_SUITE.erl @@ -3,6 +3,7 @@ -include("include/leveled.hrl"). -export([all/0]). -export([ + fetchclocks_modifiedbetween/1, crossbucket_aae/1, handoff/1, dollar_bucket_index/1, @@ -10,6 +11,7 @@ ]). 
 all() -> [
+            fetchclocks_modifiedbetween,
             crossbucket_aae,
             handoff,
             dollar_bucket_index,
@@ -18,6 +20,315 @@ all() -> [

 -define(MAGIC, 53). % riak_kv -> riak_object

+
+fetchclocks_modifiedbetween(_Config) ->
+    RootPathA = testutil:reset_filestructure("fetchClockA"),
+    RootPathB = testutil:reset_filestructure("fetchClockB"),
+    StartOpts1A = [{root_path, RootPathA},
+                    {max_journalsize, 500000000},
+                    {max_pencillercachesize, 8000},
+                    {sync_strategy, testutil:sync_strategy()}],
+    StartOpts1B = [{root_path, RootPathB},
+                    {max_journalsize, 500000000},
+                    {max_pencillercachesize, 12000},
+                    {sync_strategy, testutil:sync_strategy()}],
+    {ok, Bookie1A} = leveled_bookie:book_start(StartOpts1A),
+    {ok, Bookie1B} = leveled_bookie:book_start(StartOpts1B),
+
+    ObjL1StartTS = testutil:convert_to_seconds(os:timestamp()),
+    ObjList1 =
+        testutil:generate_objects(20000,
+                                    {fixed_binary, 1}, [],
+                                    leveled_rand:rand_bytes(512),
+                                    fun() -> [] end,
+                                    <<"B0">>),
+    timer:sleep(1000),
+    ObjL1EndTS = testutil:convert_to_seconds(os:timestamp()),
+    timer:sleep(1000),
+
+    _ObjL2StartTS = testutil:convert_to_seconds(os:timestamp()),
+    ObjList2 =
+        testutil:generate_objects(15000,
+                                    {fixed_binary, 20001}, [],
+                                    leveled_rand:rand_bytes(512),
+                                    fun() -> [] end,
+                                    <<"B0">>),
+    timer:sleep(1000),
+    _ObjList2EndTS = testutil:convert_to_seconds(os:timestamp()),
+    timer:sleep(1000),
+
+    ObjL3StartTS = testutil:convert_to_seconds(os:timestamp()),
+    ObjList3 =
+        testutil:generate_objects(35000,
+                                    {fixed_binary, 35001}, [],
+                                    leveled_rand:rand_bytes(512),
+                                    fun() -> [] end,
+                                    <<"B0">>),
+    timer:sleep(1000),
+    ObjL3EndTS = testutil:convert_to_seconds(os:timestamp()),
+    timer:sleep(1000),
+
+    ObjL4StartTS = testutil:convert_to_seconds(os:timestamp()),
+    ObjList4 =
+        testutil:generate_objects(30000,
+                                    {fixed_binary, 70001}, [],
+                                    leveled_rand:rand_bytes(512),
+                                    fun() -> [] end,
+                                    <<"B0">>),
+    timer:sleep(1000),
+    _ObjL4EndTS = testutil:convert_to_seconds(os:timestamp()),
+    timer:sleep(1000),
+
+    ObjL5StartTS = testutil:convert_to_seconds(os:timestamp()),
+    ObjList5 =
+        testutil:generate_objects(8000,
+                                    {fixed_binary, 1}, [],
+                                    leveled_rand:rand_bytes(512),
+                                    fun() -> [] end,
+                                    <<"B1">>),
+    timer:sleep(1000),
+    _ObjL5EndTS = testutil:convert_to_seconds(os:timestamp()),
+    timer:sleep(1000),
+
+    _ObjL6StartTS = testutil:convert_to_seconds(os:timestamp()),
+    ObjList6 =
+        testutil:generate_objects(7000,
+                                    {fixed_binary, 1}, [],
+                                    leveled_rand:rand_bytes(512),
+                                    fun() -> [] end,
+                                    <<"B2">>),
+    timer:sleep(1000),
+    ObjL6EndTS = testutil:convert_to_seconds(os:timestamp()),
+    timer:sleep(1000),
+
+    testutil:riakload(Bookie1A, ObjList5),
+    testutil:riakload(Bookie1A, ObjList1),
+    testutil:riakload(Bookie1A, ObjList2),
+    testutil:riakload(Bookie1A, ObjList3),
+    testutil:riakload(Bookie1A, ObjList4),
+    testutil:riakload(Bookie1A, ObjList6),
+
+    testutil:riakload(Bookie1B, ObjList4),
+    testutil:riakload(Bookie1B, ObjList5),
+    testutil:riakload(Bookie1B, ObjList1),
+    testutil:riakload(Bookie1B, ObjList6),
+    testutil:riakload(Bookie1B, ObjList3),
+
+    RevertFixedBinKey =
+        fun(FBK) ->
+            <<$K, $e, $y, KeyNumber:64/integer>> = FBK,
+            KeyNumber
+        end,
+    StoreFoldFun =
+        fun(_B, K, _V, {_LK, AccC}) ->
+            {RevertFixedBinKey(K), AccC + 1}
+        end,
+
+    KeyRangeFun =
+        fun(StartNumber, EndNumber) ->
+            {range,
+                <<"B0">>,
+                {testutil:fixed_bin_key(StartNumber),
+                    testutil:fixed_bin_key(EndNumber)}}
+        end,
+
+    % Count with max object count
+    FoldRangesFun =
+        fun(FoldTarget, ModRange, EndNumber, MaxCount) ->
+            fun(_I, {LKN, KC}) ->
+                {async, Runner} =
+                    leveled_bookie:book_headfold(FoldTarget,
+                                                    ?RIAK_TAG,
+                                                    KeyRangeFun(LKN + 1,
+                                                                EndNumber),
+                                                    {StoreFoldFun, {LKN, KC}},
+                                                    false,
+                                                    true,
+                                                    false,
+                                                    ModRange,
+                                                    MaxCount),
+                {_, {LKN0, KC0}} = Runner(),
+                {LKN0, KC0}
+            end
+        end,
+
+    R1A = lists:foldl(FoldRangesFun(Bookie1A, false, 50000, 13000),
+                        {0, 0}, lists:seq(1, 4)),
+    io:format("R1A ~w~n", [R1A]),
+    true = {50000, 50000} == R1A,
+
+    R1B = lists:foldl(FoldRangesFun(Bookie1B, false, 50000, 13000),
+                        {0, 0}, lists:seq(1, 3)),
+    io:format("R1B ~w~n", [R1B]),
+    true = {50000, 35000} == R1B,
+
+    R2A = lists:foldl(FoldRangesFun(Bookie1A,
+                                    {ObjL3StartTS, ObjL3EndTS},
+                                    60000,
+                                    13000),
+                        {10000, 0}, lists:seq(1, 2)),
+    io:format("R2A ~w~n", [R2A]),
+    true = {60000, 25000} == R2A,
+    R2A_SR = lists:foldl(FoldRangesFun(Bookie1A,
+                                        {ObjL3StartTS, ObjL3EndTS},
+                                        60000,
+                                        13000),
+                            {10000, 0}, lists:seq(1, 1)), % Only single rotation
+    io:format("R2A_SingleRotation ~w~n", [R2A_SR]),
+    true = {48000, 13000} == R2A_SR, % Hit at max results
+    R2B = lists:foldl(FoldRangesFun(Bookie1B,
+                                    {ObjL3StartTS, ObjL3EndTS},
+                                    60000,
+                                    13000),
+                        {10000, 0}, lists:seq(1, 2)),
+    io:format("R2B ~w~n", [R2B]),
+    true = {60000, 25000} == R2B,
+
+    CrudeStoreFoldFun =
+        fun(LowLMD, HighLMD) ->
+            fun(_B, K, V, {LK, AccC}) ->
+                % The value is a proxy_object, so the metadata (and hence
+                % the last modified date) can be read from it.  Use this in
+                % a non-accelerated fold to check that it is slower
+                {proxy_object, MDBin, _Size, _Fetcher} = binary_to_term(V),
+                LMDTS = testutil:get_lastmodified(MDBin),
+                LMD = testutil:convert_to_seconds(LMDTS),
+                case (LMD >= LowLMD) and (LMD =< HighLMD) of
+                    true ->
+                        {RevertFixedBinKey(K), AccC + 1};
+                    false ->
+                        {LK, AccC}
+                end
+            end
+        end,
+
+    io:format("Comparing queries for Obj1 TS range ~w ~w~n",
+                [ObjL1StartTS, ObjL1EndTS]),
+
+    PlusFilterStart = os:timestamp(),
+    R3A_PlusFilter = lists:foldl(FoldRangesFun(Bookie1A,
+                                                {ObjL1StartTS, ObjL1EndTS},
+                                                100000,
+                                                100000),
+                                    {0, 0}, lists:seq(1, 1)),
+    PlusFilterTime = timer:now_diff(os:timestamp(), PlusFilterStart)/1000,
+    io:format("R3A_PlusFilter ~w~n", [R3A_PlusFilter]),
+    true = {20000, 20000} == R3A_PlusFilter,
+
+    NoFilterStart = os:timestamp(),
+    {async, R3A_NoFilterRunner} =
+        leveled_bookie:book_headfold(Bookie1A,
+                                        ?RIAK_TAG,
+                                        KeyRangeFun(1, 100000),
+                                        {CrudeStoreFoldFun(ObjL1StartTS,
+                                                            ObjL1EndTS),
+                                            {0, 0}},
+                                        false,
+                                        true,
+                                        false),
+    R3A_NoFilter = R3A_NoFilterRunner(),
+    NoFilterTime = timer:now_diff(os:timestamp(), NoFilterStart)/1000,
+    io:format("R3A_NoFilter ~w~n", [R3A_NoFilter]),
+    true = {20000, 20000} == R3A_NoFilter,
+    io:format("Filtered query ~w ms and unfiltered query ~w ms~n",
+                [PlusFilterTime, NoFilterTime]),
+    true = NoFilterTime > PlusFilterTime,
+
+    SimpleCountFun =
+        fun(_B, _K, _V, AccC) -> AccC + 1 end,
+
+    {async, R4A_MultiBucketRunner} =
+        leveled_bookie:book_headfold(Bookie1A,
+                                        ?RIAK_TAG,
+                                        {bucket_list, [<<"B0">>, <<"B2">>]},
+                                        {SimpleCountFun, 0},
+                                        false,
+                                        true,
+                                        false,
+                                        {ObjL4StartTS, ObjL6EndTS},
+                                            % Range includes ObjL5 LMDs,
+                                            % but these are not in bucket list
+                                        false),
+    R4A_MultiBucket = R4A_MultiBucketRunner(),
+    io:format("R4A_MultiBucket ~w ~n", [R4A_MultiBucket]),
+    true = R4A_MultiBucket == 37000,
+
+    {async, R5A_MultiBucketRunner} =
+        leveled_bookie:book_headfold(Bookie1A,
+                                        ?RIAK_TAG,
+                                        {bucket_list, [<<"B2">>, <<"B0">>]},
+                                            % Reverse the buckets in the
+                                            % bucket list
+                                        {SimpleCountFun, 0},
+                                        false,
+                                        true,
+                                        false,
+                                        {ObjL4StartTS, ObjL6EndTS},
+                                        false),
+    R5A_MultiBucket = R5A_MultiBucketRunner(),
+    io:format("R5A_MultiBucket ~w ~n", [R5A_MultiBucket]),
+    true = R5A_MultiBucket == 37000,
+
+
+    {async, R5B_MultiBucketRunner} =
+        leveled_bookie:book_headfold(Bookie1B,
+                                            % Same query - other bookie
+                                        ?RIAK_TAG,
+                                        {bucket_list, [<<"B2">>, <<"B0">>]},
+                                        {SimpleCountFun, 0},
+                                        false,
+                                        true,
+                                        false,
+                                        {ObjL4StartTS, ObjL6EndTS},
+                                        false),
+    R5B_MultiBucket = R5B_MultiBucketRunner(),
+    io:format("R5B_MultiBucket ~w ~n", [R5B_MultiBucket]),
+    true = R5B_MultiBucket == 37000,
+
+    testutil:update_some_objects(Bookie1A, ObjList1, 1000),
+    R6A_PlusFilter = lists:foldl(FoldRangesFun(Bookie1A,
+                                                {ObjL1StartTS, ObjL1EndTS},
+                                                100000,
+                                                100000),
+                                    {0, 0}, lists:seq(1, 1)),
+    io:format("R6A_PlusFilter ~w~n", [R6A_PlusFilter]),
+    true = 19000 == element(2, R6A_PlusFilter),
+
+    % Hit limit of max count before trying next bucket, with and without a
+    % timestamp filter
+    {async, R7A_MultiBucketRunner} =
+        leveled_bookie:book_headfold(Bookie1A,
+                                        ?RIAK_TAG,
+                                        {bucket_list, [<<"B1">>, <<"B2">>]},
+                                        {SimpleCountFun, 0},
+                                        false,
+                                        true,
+                                        false,
+                                        {ObjL5StartTS, ObjL6EndTS},
+                                        5000),
+    R7A_MultiBucket = R7A_MultiBucketRunner(),
+    io:format("R7A_MultiBucket ~w ~n", [R7A_MultiBucket]),
+    true = R7A_MultiBucket == {0, 5000},
+
+    {async, R8A_MultiBucketRunner} =
+        leveled_bookie:book_headfold(Bookie1A,
+                                        ?RIAK_TAG,
+                                        {bucket_list, [<<"B1">>, <<"B2">>]},
+                                        {SimpleCountFun, 0},
+                                        false,
+                                        true,
+                                        false,
+                                        false,
+                                        5000),
+    R8A_MultiBucket = R8A_MultiBucketRunner(),
+    io:format("R8A_MultiBucket ~w ~n", [R8A_MultiBucket]),
+    true = R8A_MultiBucket == {0, 5000},
+
+    ok = leveled_bookie:book_destroy(Bookie1A),
+    ok = leveled_bookie:book_destroy(Bookie1B).
+
+
+
 crossbucket_aae(_Config) ->
     % Test requires multiple different databases, so want to mount them all
     % on individual file paths
@@ -141,7 +452,7 @@ test_segfilter_query(Bookie, CLs) ->
                     Acc
                 end
             end, 0},
-            false, true, SegL}
+            false, true, SegL, false, false}
     end,

     {async, SL1Folder} =
@@ -174,7 +485,7 @@ test_singledelta_stores(BookA, BookB, TreeSize, DeltaKey) ->
             ?RIAK_TAG,
             {fun head_tictac_foldfun/4,
                 {0, leveled_tictac:new_tree(test, TreeSize)}},
-            false, true, false},
+            false, true, false, false, false},
     % tictac query by bucket (should be same result as all stores)
     TicTacByBucketFolder =
         {foldheads_bybucket,
@@ -182,7 +493,7 @@ test_singledelta_stores(BookA, BookB, TreeSize, DeltaKey) ->
             all,
             {fun head_tictac_foldfun/4,
                 {0, leveled_tictac:new_tree(test, TreeSize)}},
-            false, false, false},
+            false, false, false, false, false},
     DLs = check_tictacfold(BookA, BookB,
                             TicTacFolder,
@@ -197,7 +508,7 @@ test_singledelta_stores(BookA, BookB, TreeSize, DeltaKey) ->
         {foldheads_allkeys,
            ?RIAK_TAG,
            {get_segment_folder(DLs, TreeSize), []},
-           false, true, false},
+           false, true, false, false, false},
     SW_SL0 = os:timestamp(),
     {async, BookASegFolder} =
@@ -221,7 +532,7 @@ test_singledelta_stores(BookA, BookB, TreeSize, DeltaKey) ->
         {foldheads_allkeys,
            ?RIAK_TAG,
            {get_segment_folder(DLs, TreeSize), []},
-           false, true, SegFilterList},
+           false, true, SegFilterList, false, false},
     SW_SL1 = os:timestamp(),
     {async, BookASegFolder1} =
@@ -240,7 +551,7 @@ test_singledelta_stores(BookA, BookB, TreeSize, DeltaKey) ->
         {foldheads_allkeys,
            ?RIAK_TAG,
            {get_segment_folder(DLs, TreeSize), []},
-           true, true, SegFilterList},
+           true, true, SegFilterList, false, false},
     SW_SL1CP = os:timestamp(),
     {async, BookASegFolder1CP} =
@@ -264,7 +575,7 @@ test_singledelta_stores(BookA, BookB, TreeSize, DeltaKey) ->
         {foldheads_allkeys,
            ?RIAK_TAG,
            {get_segment_folder(DLs, TreeSize), []},
-           false, true, SegFilterListF},
+           false, true, SegFilterListF, false, false},
     SW_SL1F = os:timestamp(),
     {async, BookASegFolder1F} =
@@ -468,7 +779,7 @@ handoff(_Config) ->
             ?RIAK_TAG,
             {fun head_tictac_foldfun/4,
                 {0, leveled_tictac:new_tree(test, TreeSize)}},
-            false, true, false},
+            false, true, false, false, false},
     check_tictacfold(Bookie1, Bookie2, TicTacFolder, none, TreeSize),
     check_tictacfold(Bookie2, Bookie3, TicTacFolder, none, TreeSize),
     check_tictacfold(Bookie3, Bookie4, TicTacFolder, none, TreeSize),
diff --git a/test/end_to_end/testutil.erl b/test/end_to_end/testutil.erl
index 176c065..37ffea9 100644
--- a/test/end_to_end/testutil.erl
+++ b/test/end_to_end/testutil.erl
@@ -28,6 +28,7 @@
             get_key/1,
             get_value/1,
             get_vclock/1,
+            get_lastmodified/1,
             get_compressiblevalue/0,
             get_compressiblevalue_andinteger/0,
             get_randomindexes_generator/1,
@@ -51,8 +52,9 @@
             sync_strategy/0,
             riak_object/4,
             get_value_from_objectlistitem/1,
-            numbered_key/1,
-            fixed_bin_key/1]).
+            numbered_key/1,
+            fixed_bin_key/1,
+            convert_to_seconds/1]).

 -define(RETURN_TERMS, {true, undefined}).
 -define(SLOWOFFER_DELAY, 5).
@@ -491,7 +493,10 @@ update_some_objects(Bookie, ObjList, SampleSize) ->
                 VC = Obj#r_object.vclock,
                 VC0 = update_vclock(VC),
                 [C] = Obj#r_object.contents,
-                C0 = C#r_content{value = leveled_rand:rand_bytes(512)},
+                MD = C#r_content.metadata,
+                MD0 = dict:store(?MD_LASTMOD, os:timestamp(), MD),
+                C0 = C#r_content{value = leveled_rand:rand_bytes(512),
+                                    metadata = MD0},
                 UpdObj = Obj#r_object{vclock = VC0, contents = [C0]},
                 {R, UpdObj, Spec}
             end,
@@ -551,6 +556,24 @@ get_value(ObjectBin) ->
             error
     end.

+get_lastmodified(ObjectBin) ->
+    <<_Magic:8/integer, _Vers:8/integer, VclockLen:32/integer,
+            Rest1/binary>> = ObjectBin,
+    <<_VclockBin:VclockLen/binary, SibCount:32/integer, SibsBin/binary>> = Rest1,
+    case SibCount of
+        1 ->
+            <<SibLength:32/integer, Rest2/binary>> = SibsBin,
+            <<_ContentBin:SibLength/binary,
+                MetaLength:32/integer,
+                MetaBin:MetaLength/binary,
+                _Rest3/binary>> = Rest2,
+            <<MegaSec:32/integer,
+                Sec:32/integer,
+                MicroSec:32/integer,
+                _RestOfMetaBin/binary>> = MetaBin,
+            {MegaSec, Sec, MicroSec}
+    end.
+
 get_vclock(ObjectBin) ->
     <<_Magic:8/integer, _Vers:8/integer, VclockLen:32/integer,
             Rest1/binary>> = ObjectBin,
@@ -771,3 +794,5 @@ find_journals(RootPath) ->
                             FNsA_J),
     CDBFiles.

+convert_to_seconds({MegaSec, Seconds, _MicroSec}) ->
+    MegaSec * 1000000 + Seconds.
\ No newline at end of file
diff --git a/test/end_to_end/tictac_SUITE.erl b/test/end_to_end/tictac_SUITE.erl
index 2c0435d..20c748c 100644
--- a/test/end_to_end/tictac_SUITE.erl
+++ b/test/end_to_end/tictac_SUITE.erl
@@ -5,20 +5,12 @@
 -export([
             many_put_compare/1,
             index_compare/1,
-            recent_aae_noaae/1,
-            recent_aae_allaae/1,
-            recent_aae_bucketaae/1,
-            recent_aae_expiry/1,
             basic_headonly/1
             ]).

 all() -> [
             many_put_compare,
             index_compare,
-            recent_aae_noaae,
-            recent_aae_allaae,
-            recent_aae_bucketaae,
-            recent_aae_expiry,
             basic_headonly
             ].
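+%% The recent_aae_* cases and their helpers are deleted below; they relied
+%% on the recent_aae startup option, which is no longer supported.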
@@ -164,14 +156,15 @@ many_put_compare(_Config) -> [timer:now_diff(os:timestamp(), SWB0Obj)]), true = length(leveled_tictac:find_dirtyleaves(TreeA, TreeAObj0)) == 0, - FoldQ1 = {foldheads_bybucket, - o_rkv, - "Bucket", - all, - {FoldObjectsFun, leveled_tictac:new_tree(0, TreeSize)}, - true, true, false}, + InitAccTree = leveled_tictac:new_tree(0, TreeSize), + {async, TreeAObjFolder1} = - leveled_bookie:book_returnfolder(Bookie2, FoldQ1), + leveled_bookie:book_headfold(Bookie2, + ?RIAK_TAG, + {range, "Bucket", all}, + {FoldObjectsFun, + InitAccTree}, + true, true, false), SWB1Obj = os:timestamp(), TreeAObj1 = TreeAObjFolder1(), io:format("Build tictac tree via object fold with "++ @@ -192,21 +185,26 @@ many_put_compare(_Config) -> fun(_Bucket, Key, Value, Acc) -> leveled_tictac:add_kv(Acc, Key, Value, AltExtractFun) end, - AltFoldQ0 = {foldheads_bybucket, - o_rkv, - "Bucket", - all, - {AltFoldObjectsFun, leveled_tictac:new_tree(0, TreeSize)}, - false, true, false}, + {async, TreeAAltObjFolder0} = - leveled_bookie:book_returnfolder(Bookie2, AltFoldQ0), + leveled_bookie:book_headfold(Bookie2, + ?RIAK_TAG, + {range, "Bucket", all}, + {AltFoldObjectsFun, + InitAccTree}, + false, true, false), SWB2Obj = os:timestamp(), TreeAAltObj = TreeAAltObjFolder0(), io:format("Build tictac tree via object fold with no "++ "presence check and 200K objects and alt hash in ~w~n", [timer:now_diff(os:timestamp(), SWB2Obj)]), {async, TreeBAltObjFolder0} = - leveled_bookie:book_returnfolder(Bookie3, AltFoldQ0), + leveled_bookie:book_headfold(Bookie3, + ?RIAK_TAG, + {range, "Bucket", all}, + {AltFoldObjectsFun, + InitAccTree}, + false, true, false), SWB3Obj = os:timestamp(), TreeBAltObj = TreeBAltObjFolder0(), io:format("Build tictac tree via object fold with no "++ @@ -542,478 +540,6 @@ index_compare(_Config) -> ok = leveled_bookie:book_close(Book2D). -recent_aae_noaae(_Config) -> - % Starts databases with recent_aae tables, and attempt to query to fetch - % recent aae trees returns empty trees as no index entries are found. - - TreeSize = small, - % SegmentCount = 256 * 256, - UnitMins = 2, - - % Test requires multiple different databases, so want to mount them all - % on individual file paths - RootPathA = testutil:reset_filestructure("testA"), - RootPathB = testutil:reset_filestructure("testB"), - RootPathC = testutil:reset_filestructure("testC"), - RootPathD = testutil:reset_filestructure("testD"), - StartOptsA = aae_startopts(RootPathA, false), - StartOptsB = aae_startopts(RootPathB, false), - StartOptsC = aae_startopts(RootPathC, false), - StartOptsD = aae_startopts(RootPathD, false), - - % Book1A to get all objects - {ok, Book1A} = leveled_bookie:book_start(StartOptsA), - % Book1B/C/D will have objects partitioned across it - {ok, Book1B} = leveled_bookie:book_start(StartOptsB), - {ok, Book1C} = leveled_bookie:book_start(StartOptsC), - {ok, Book1D} = leveled_bookie:book_start(StartOptsD), - - {B1, K1, V1, S1, MD} = {"Bucket", - "Key1.1.4567.4321", - "Value1", - [], - [{"MDK1", "MDV1"}]}, - {TestObject, TestSpec} = testutil:generate_testobject(B1, K1, V1, S1, MD), - - SW_StartLoad = os:timestamp(), - - ok = testutil:book_riakput(Book1A, TestObject, TestSpec), - ok = testutil:book_riakput(Book1B, TestObject, TestSpec), - testutil:check_forobject(Book1A, TestObject), - testutil:check_forobject(Book1B, TestObject), - - {TicTacTreeJoined, TicTacTreeFull, EmptyTree, _LMDIndexes} = - load_and_check_recentaae(Book1A, Book1B, Book1C, Book1D, - SW_StartLoad, TreeSize, UnitMins, - false), - % Go compare! 
Also confirm we're not comparing empty trees - DL1_0 = leveled_tictac:find_dirtyleaves(TicTacTreeFull, - TicTacTreeJoined), - - DL1_1 = leveled_tictac:find_dirtyleaves(TicTacTreeFull, EmptyTree), - true = DL1_0 == [], - true = length(DL1_1) == 0, - - ok = leveled_bookie:book_close(Book1A), - ok = leveled_bookie:book_close(Book1B), - ok = leveled_bookie:book_close(Book1C), - ok = leveled_bookie:book_close(Book1D). - - -recent_aae_allaae(_Config) -> - % Leveled is started in blacklisted mode with no buckets blacklisted. - % - % A number of changes are then loaded into a store, and also partitioned - % across a separate set of three stores. A merge tree is returned from - % both the single store and the partitioned store, and proven to compare - % the same. - % - % A single change is then made, but into one half of the system only. The - % aae index is then re-queried and it is verified that a signle segment - % difference is found. - % - % The segment Id found is then used in a query to find the Keys that make - % up that segment, and the delta discovered should be just that one key - % which was known to have been changed - - TreeSize = small, - % SegmentCount = 256 * 256, - UnitMins = 2, - AAE = {blacklist, [], 60, UnitMins}, - - % Test requires multiple different databases, so want to mount them all - % on individual file paths - RootPathA = testutil:reset_filestructure("testA"), - RootPathB = testutil:reset_filestructure("testB"), - RootPathC = testutil:reset_filestructure("testC"), - RootPathD = testutil:reset_filestructure("testD"), - StartOptsA = aae_startopts(RootPathA, AAE), - StartOptsB = aae_startopts(RootPathB, AAE), - StartOptsC = aae_startopts(RootPathC, AAE), - StartOptsD = aae_startopts(RootPathD, AAE), - - % Book1A to get all objects - {ok, Book1A} = leveled_bookie:book_start(StartOptsA), - % Book1B/C/D will have objects partitioned across it - {ok, Book1B} = leveled_bookie:book_start(StartOptsB), - {ok, Book1C} = leveled_bookie:book_start(StartOptsC), - {ok, Book1D} = leveled_bookie:book_start(StartOptsD), - - {B1, K1, V1, S1, MD} = {"Bucket", - "Key1.1.4567.4321", - "Value1", - [], - [{"MDK1", "MDV1"}]}, - {TestObject, TestSpec} = testutil:generate_testobject(B1, K1, V1, S1, MD), - - SW_StartLoad = os:timestamp(), - - ok = testutil:book_riakput(Book1A, TestObject, TestSpec), - ok = testutil:book_riakput(Book1B, TestObject, TestSpec), - testutil:check_forobject(Book1A, TestObject), - testutil:check_forobject(Book1B, TestObject), - - {TicTacTreeJoined, TicTacTreeFull, EmptyTree, LMDIndexes} = - load_and_check_recentaae(Book1A, Book1B, Book1C, Book1D, - SW_StartLoad, TreeSize, UnitMins, - false), - % Go compare! 
Also confirm we're not comparing empty trees - DL1_0 = leveled_tictac:find_dirtyleaves(TicTacTreeFull, - TicTacTreeJoined), - - DL1_1 = leveled_tictac:find_dirtyleaves(TicTacTreeFull, EmptyTree), - true = DL1_0 == [], - true = length(DL1_1) > 100, - - ok = leveled_bookie:book_close(Book1A), - ok = leveled_bookie:book_close(Book1B), - ok = leveled_bookie:book_close(Book1C), - ok = leveled_bookie:book_close(Book1D), - - % Book2A to get all objects - {ok, Book2A} = leveled_bookie:book_start(StartOptsA), - % Book2B/C/D will have objects partitioned across it - {ok, Book2B} = leveled_bookie:book_start(StartOptsB), - {ok, Book2C} = leveled_bookie:book_start(StartOptsC), - {ok, Book2D} = leveled_bookie:book_start(StartOptsD), - - {TicTacTreeJoined, TicTacTreeFull, EmptyTree, LMDIndexes} = - load_and_check_recentaae(Book2A, Book2B, Book2C, Book2D, - SW_StartLoad, TreeSize, UnitMins, - LMDIndexes), - % Go compare! Also confirm we're not comparing empty trees - DL1_0 = leveled_tictac:find_dirtyleaves(TicTacTreeFull, - TicTacTreeJoined), - - DL1_1 = leveled_tictac:find_dirtyleaves(TicTacTreeFull, EmptyTree), - true = DL1_0 == [], - true = length(DL1_1) > 100, - - V2 = "Value2", - {TestObject2, TestSpec2} = - testutil:generate_testobject(B1, K1, V2, S1, MD), - - New_startTS = os:timestamp(), - - ok = testutil:book_riakput(Book2B, TestObject2, TestSpec2), - testutil:check_forobject(Book2B, TestObject2), - testutil:check_forobject(Book2A, TestObject), - - New_endTS = os:timestamp(), - NewLMDIndexes = determine_lmd_indexes(New_startTS, New_endTS, UnitMins), - {TicTacTreeJoined2, TicTacTreeFull2, _EmptyTree, NewLMDIndexes} = - load_and_check_recentaae(Book2A, Book2B, Book2C, Book2D, - New_startTS, TreeSize, UnitMins, - NewLMDIndexes), - DL2_0 = leveled_tictac:find_dirtyleaves(TicTacTreeFull2, - TicTacTreeJoined2), - - % DL2_1 = leveled_tictac:find_dirtyleaves(TicTacTreeFull, EmptyTree), - true = length(DL2_0) == 1, - - [DirtySeg] = DL2_0, - TermPrefix = string:right(integer_to_list(DirtySeg), 8, $0), - - LMDSegFolder = - fun(LMD, {Acc, Bookie}) -> - IdxLMD = list_to_binary("$aae." ++ LMD ++ "_bin"), - IdxQ1 = - {index_query, - <<"$all">>, - {fun testutil:foldkeysfun_returnbucket/3, []}, - {IdxLMD, - list_to_binary(TermPrefix ++ "."), - list_to_binary(TermPrefix ++ "|")}, - {true, undefined}}, - {async, IdxFolder} = - leveled_bookie:book_returnfolder(Bookie, IdxQ1), - {Acc ++ IdxFolder(), Bookie} - end, - {KeysTerms2A, _} = lists:foldl(LMDSegFolder, - {[], Book2A}, - lists:usort(LMDIndexes ++ NewLMDIndexes)), - true = length(KeysTerms2A) >= 1, - - {KeysTerms2B, _} = lists:foldl(LMDSegFolder, - {[], Book2B}, - lists:usort(LMDIndexes ++ NewLMDIndexes)), - {KeysTerms2C, _} = lists:foldl(LMDSegFolder, - {[], Book2C}, - lists:usort(LMDIndexes ++ NewLMDIndexes)), - {KeysTerms2D, _} = lists:foldl(LMDSegFolder, - {[], Book2D}, - lists:usort(LMDIndexes ++ NewLMDIndexes)), - - KeysTerms2Joined = KeysTerms2B ++ KeysTerms2C ++ KeysTerms2D, - DeltaX = lists:subtract(KeysTerms2A, KeysTerms2Joined), - DeltaY = lists:subtract(KeysTerms2Joined, KeysTerms2A), - - io:format("DeltaX ~w~n", [DeltaX]), - io:format("DeltaY ~w~n", [DeltaY]), - - true = length(DeltaX) == 0, % This hasn't seen any extra changes - true = length(DeltaY) == 1, % This has seen an extra change - [{_, {B1, K1}}] = DeltaY, - - ok = leveled_bookie:book_close(Book2A), - ok = leveled_bookie:book_close(Book2B), - ok = leveled_bookie:book_close(Book2C), - ok = leveled_bookie:book_close(Book2D). 
- - - -recent_aae_bucketaae(_Config) -> - % Configure AAE to work only on a single whitelisted bucket - % Confirm that we can spot a delta in this bucket, but not - % in another bucket - - TreeSize = small, - % SegmentCount = 256 * 256, - UnitMins = 2, - AAE = {whitelist, [<<"Bucket">>], 60, UnitMins}, - - % Test requires multiple different databases, so want to mount them all - % on individual file paths - RootPathA = testutil:reset_filestructure("testA"), - RootPathB = testutil:reset_filestructure("testB"), - RootPathC = testutil:reset_filestructure("testC"), - RootPathD = testutil:reset_filestructure("testD"), - StartOptsA = aae_startopts(RootPathA, AAE), - StartOptsB = aae_startopts(RootPathB, AAE), - StartOptsC = aae_startopts(RootPathC, AAE), - StartOptsD = aae_startopts(RootPathD, AAE), - - % Book1A to get all objects - {ok, Book1A} = leveled_bookie:book_start(StartOptsA), - % Book1B/C/D will have objects partitioned across it - {ok, Book1B} = leveled_bookie:book_start(StartOptsB), - {ok, Book1C} = leveled_bookie:book_start(StartOptsC), - {ok, Book1D} = leveled_bookie:book_start(StartOptsD), - - {B1, K1, V1, S1, MD} = {<<"Bucket">>, - "Key1.1.4567.4321", - "Value1", - [], - [{"MDK1", "MDV1"}]}, - {TestObject, TestSpec} = testutil:generate_testobject(B1, K1, V1, S1, MD), - - SW_StartLoad = os:timestamp(), - - ok = testutil:book_riakput(Book1A, TestObject, TestSpec), - ok = testutil:book_riakput(Book1B, TestObject, TestSpec), - testutil:check_forobject(Book1A, TestObject), - testutil:check_forobject(Book1B, TestObject), - - {TicTacTreeJoined, TicTacTreeFull, EmptyTree, LMDIndexes} = - load_and_check_recentaae(Book1A, Book1B, Book1C, Book1D, - SW_StartLoad, TreeSize, UnitMins, - false, <<"Bucket">>), - % Go compare! Also confirm we're not comparing empty trees - DL1_0 = leveled_tictac:find_dirtyleaves(TicTacTreeFull, - TicTacTreeJoined), - - DL1_1 = leveled_tictac:find_dirtyleaves(TicTacTreeFull, EmptyTree), - true = DL1_0 == [], - true = length(DL1_1) > 100, - - ok = leveled_bookie:book_close(Book1A), - ok = leveled_bookie:book_close(Book1B), - ok = leveled_bookie:book_close(Book1C), - ok = leveled_bookie:book_close(Book1D), - - % Book2A to get all objects - {ok, Book2A} = leveled_bookie:book_start(StartOptsA), - % Book2B/C/D will have objects partitioned across it - {ok, Book2B} = leveled_bookie:book_start(StartOptsB), - {ok, Book2C} = leveled_bookie:book_start(StartOptsC), - {ok, Book2D} = leveled_bookie:book_start(StartOptsD), - - % Change the value for a key in another bucket - % If we get trees for this period, no difference should be found - - V2 = "Value2", - {TestObject2, TestSpec2} = - testutil:generate_testobject(<<"NotBucket">>, K1, V2, S1, MD), - - New_startTS2 = os:timestamp(), - - ok = testutil:book_riakput(Book2B, TestObject2, TestSpec2), - testutil:check_forobject(Book2B, TestObject2), - testutil:check_forobject(Book2A, TestObject), - - New_endTS2 = os:timestamp(), - NewLMDIndexes2 = determine_lmd_indexes(New_startTS2, New_endTS2, UnitMins), - {TicTacTreeJoined2, TicTacTreeFull2, _EmptyTree, NewLMDIndexes2} = - load_and_check_recentaae(Book2A, Book2B, Book2C, Book2D, - New_startTS2, TreeSize, UnitMins, - NewLMDIndexes2, <<"Bucket">>), - DL2_0 = leveled_tictac:find_dirtyleaves(TicTacTreeFull2, - TicTacTreeJoined2), - true = length(DL2_0) == 0, - - % Now create an object that is a change to an existing key in the - % monitored bucket. 
A differrence should be found - - {TestObject3, TestSpec3} = - testutil:generate_testobject(B1, K1, V2, S1, MD), - - New_startTS3 = os:timestamp(), - - ok = testutil:book_riakput(Book2B, TestObject3, TestSpec3), - testutil:check_forobject(Book2B, TestObject3), - testutil:check_forobject(Book2A, TestObject), - - New_endTS3 = os:timestamp(), - NewLMDIndexes3 = determine_lmd_indexes(New_startTS3, New_endTS3, UnitMins), - {TicTacTreeJoined3, TicTacTreeFull3, _EmptyTree, NewLMDIndexes3} = - load_and_check_recentaae(Book2A, Book2B, Book2C, Book2D, - New_startTS3, TreeSize, UnitMins, - NewLMDIndexes3, <<"Bucket">>), - DL3_0 = leveled_tictac:find_dirtyleaves(TicTacTreeFull3, - TicTacTreeJoined3), - - % DL2_1 = leveled_tictac:find_dirtyleaves(TicTacTreeFull, EmptyTree), - true = length(DL3_0) == 1, - - % Find the dirty segment, and use that to find the dirty key - % - % Note that unlike when monitoring $all, fold_keys can be used as there - % is no need to return the Bucket (as hte bucket is known) - - [DirtySeg] = DL3_0, - TermPrefix = string:right(integer_to_list(DirtySeg), 8, $0), - - LMDSegFolder = - fun(LMD, {Acc, Bookie}) -> - IdxLMD = list_to_binary("$aae." ++ LMD ++ "_bin"), - IdxQ1 = - {index_query, - <<"Bucket">>, - {fun testutil:foldkeysfun/3, []}, - {IdxLMD, - list_to_binary(TermPrefix ++ "."), - list_to_binary(TermPrefix ++ "|")}, - {true, undefined}}, - {async, IdxFolder} = - leveled_bookie:book_returnfolder(Bookie, IdxQ1), - {Acc ++ IdxFolder(), Bookie} - end, - {KeysTerms2A, _} = lists:foldl(LMDSegFolder, - {[], Book2A}, - lists:usort(LMDIndexes ++ NewLMDIndexes3)), - true = length(KeysTerms2A) >= 1, - - {KeysTerms2B, _} = lists:foldl(LMDSegFolder, - {[], Book2B}, - lists:usort(LMDIndexes ++ NewLMDIndexes3)), - {KeysTerms2C, _} = lists:foldl(LMDSegFolder, - {[], Book2C}, - lists:usort(LMDIndexes ++ NewLMDIndexes3)), - {KeysTerms2D, _} = lists:foldl(LMDSegFolder, - {[], Book2D}, - lists:usort(LMDIndexes ++ NewLMDIndexes3)), - - KeysTerms2Joined = KeysTerms2B ++ KeysTerms2C ++ KeysTerms2D, - DeltaX = lists:subtract(KeysTerms2A, KeysTerms2Joined), - DeltaY = lists:subtract(KeysTerms2Joined, KeysTerms2A), - - io:format("DeltaX ~w~n", [DeltaX]), - io:format("DeltaY ~w~n", [DeltaY]), - - true = length(DeltaX) == 0, % This hasn't seen any extra changes - true = length(DeltaY) == 1, % This has seen an extra change - [{_, K1}] = DeltaY, - - ok = leveled_bookie:book_close(Book2A), - ok = leveled_bookie:book_close(Book2B), - ok = leveled_bookie:book_close(Book2C), - ok = leveled_bookie:book_close(Book2D). 
- - -recent_aae_expiry(_Config) -> - % Proof that the index entries are indeed expired - - TreeSize = small, - % SegmentCount = 256 * 256, - UnitMins = 1, - TotalMins = 2, - AAE = {blacklist, [], TotalMins, UnitMins}, - - % Test requires multiple different databases, so want to mount them all - % on individual file paths - RootPathA = testutil:reset_filestructure("testA"), - StartOptsA = aae_startopts(RootPathA, AAE), - - % Book1A to get all objects - {ok, Book1A} = leveled_bookie:book_start(StartOptsA), - - GenMapFun = - fun(_X) -> - V = testutil:get_compressiblevalue(), - Indexes = testutil:get_randomindexes_generator(8), - testutil:generate_objects(5000, - binary_uuid, - [], - V, - Indexes) - end, - - ObjLists = lists:map(GenMapFun, lists:seq(1, 3)), - - SW0 = os:timestamp(), - % Load all nine lists into Book1A - lists:foreach(fun(ObjL) -> testutil:riakload(Book1A, ObjL) end, - ObjLists), - SW1 = os:timestamp(), - % sleep for two minutes, so all index entries will have expired - GetTicTacTreeFun = - fun(Bookie) -> - get_tictactree_fun(Bookie, <<"$all">>, TreeSize) - end, - EmptyTree = leveled_tictac:new_tree(empty, TreeSize), - LMDIndexes = determine_lmd_indexes(SW0, SW1, UnitMins), - - % Should get a non-empty answer to the query - TicTacTree1_Full = - lists:foldl(GetTicTacTreeFun(Book1A), EmptyTree, LMDIndexes), - DL3_0 = leveled_tictac:find_dirtyleaves(TicTacTree1_Full, EmptyTree), - io:format("Dirty leaves found before expiry ~w~n", [length(DL3_0)]), - - true = length(DL3_0) > 0, - - SecondsSinceLMD = timer:now_diff(os:timestamp(), SW0) div 1000000, - SecondsToExpiry = (TotalMins + UnitMins) * 60, - - io:format("SecondsToExpiry ~w SecondsSinceLMD ~w~n", - [SecondsToExpiry, SecondsSinceLMD]), - io:format("LMDIndexes ~w~n", [LMDIndexes]), - - case SecondsToExpiry > SecondsSinceLMD of - true -> - timer:sleep((1 + SecondsToExpiry - SecondsSinceLMD) * 1000); - false -> - timer:sleep(1000) - end, - - % Should now get an empty answer - all entries have expired - TicTacTree2_Full = - lists:foldl(GetTicTacTreeFun(Book1A), EmptyTree, LMDIndexes), - DL4_0 = leveled_tictac:find_dirtyleaves(TicTacTree2_Full, EmptyTree), - io:format("Dirty leaves found after expiry ~w~n", [length(DL4_0)]), - - timer:sleep(10000), - - TicTacTree3_Full = - lists:foldl(GetTicTacTreeFun(Book1A), EmptyTree, LMDIndexes), - DL5_0 = leveled_tictac:find_dirtyleaves(TicTacTree3_Full, EmptyTree), - io:format("Dirty leaves found after expiry plus 10s ~w~n", [length(DL5_0)]), - - - ok = leveled_bookie:book_close(Book1A), - - true = length(DL4_0) == 0. - - basic_headonly(_Config) -> ObjectCount = 200000, RemoveCount = 100, @@ -1069,7 +595,8 @@ basic_headonly_test(ObjectCount, RemoveCount, HeadOnly) -> InitAcc = {0, 0}, RunnerDefinition = - {foldheads_allkeys, h, {FoldFun, InitAcc}, false, false, false}, + {foldheads_allkeys, h, {FoldFun, InitAcc}, + false, false, false, false, false}, {async, Runner1} = leveled_bookie:book_returnfolder(Bookie1, RunnerDefinition), @@ -1196,145 +723,6 @@ load_objectspecs(ObjectSpecL, SliceSize, Bookie) -> end. - -load_and_check_recentaae(Book1A, Book1B, Book1C, Book1D, - SW_StartLoad, TreeSize, UnitMins, - LMDIndexes_Loaded) -> - load_and_check_recentaae(Book1A, Book1B, Book1C, Book1D, - SW_StartLoad, TreeSize, UnitMins, - LMDIndexes_Loaded, <<"$all">>). 
- -load_and_check_recentaae(Book1A, Book1B, Book1C, Book1D, - SW_StartLoad, TreeSize, UnitMins, - LMDIndexes_Loaded, Bucket) -> - LMDIndexes = - case LMDIndexes_Loaded of - false -> - % Generate nine lists of objects - % BucketBin = list_to_binary("Bucket"), - GenMapFun = - fun(_X) -> - V = testutil:get_compressiblevalue(), - Indexes = testutil:get_randomindexes_generator(8), - testutil:generate_objects(5000, - binary_uuid, - [], - V, - Indexes) - end, - - ObjLists = lists:map(GenMapFun, lists:seq(1, 9)), - - % Load all nine lists into Book1A - lists:foreach(fun(ObjL) -> testutil:riakload(Book1A, ObjL) end, - ObjLists), - - % Split nine lists across Book1B to Book1D, three object lists - % in each - lists:foreach(fun(ObjL) -> testutil:riakload(Book1B, ObjL) end, - lists:sublist(ObjLists, 1, 3)), - lists:foreach(fun(ObjL) -> testutil:riakload(Book1C, ObjL) end, - lists:sublist(ObjLists, 4, 3)), - lists:foreach(fun(ObjL) -> testutil:riakload(Book1D, ObjL) end, - lists:sublist(ObjLists, 7, 3)), - - SW_EndLoad = os:timestamp(), - determine_lmd_indexes(SW_StartLoad, SW_EndLoad, UnitMins); - _ -> - LMDIndexes_Loaded - end, - - EmptyTree = leveled_tictac:new_tree(empty, TreeSize), - - GetTicTacTreeFun = - fun(Bookie) -> - get_tictactree_fun(Bookie, Bucket, TreeSize) - end, - - % Get a TicTac tree representing one of the indexes in Bucket A - TicTacTree1_Full = - lists:foldl(GetTicTacTreeFun(Book1A), EmptyTree, LMDIndexes), - - TicTacTree1_P1 = - lists:foldl(GetTicTacTreeFun(Book1B), EmptyTree, LMDIndexes), - TicTacTree1_P2 = - lists:foldl(GetTicTacTreeFun(Book1C), EmptyTree, LMDIndexes), - TicTacTree1_P3 = - lists:foldl(GetTicTacTreeFun(Book1D), EmptyTree, LMDIndexes), - - % Merge the tree across the partitions - TicTacTree1_Joined = lists:foldl(fun leveled_tictac:merge_trees/2, - TicTacTree1_P1, - [TicTacTree1_P2, TicTacTree1_P3]), - - {TicTacTree1_Full, TicTacTree1_Joined, EmptyTree, LMDIndexes}. - - -aae_startopts(RootPath, AAE) -> - LS = 2000, - JS = 50000000, - SS = testutil:sync_strategy(), - [{root_path, RootPath}, - {sync_strategy, SS}, - {cache_size, LS}, - {max_journalsize, JS}, - {recent_aae, AAE}]. - - -determine_lmd_indexes(StartTS, EndTS, UnitMins) -> - StartDT = calendar:now_to_datetime(StartTS), - EndDT = calendar:now_to_datetime(EndTS), - StartTimeStr = get_strtime(StartDT, UnitMins), - EndTimeStr = get_strtime(EndDT, UnitMins), - - AddTimeFun = - fun(X, Acc) -> - case lists:member(EndTimeStr, Acc) of - true -> - Acc; - false -> - NextTime = - UnitMins * 60 * X + - calendar:datetime_to_gregorian_seconds(StartDT), - NextDT = - calendar:gregorian_seconds_to_datetime(NextTime), - Acc ++ [get_strtime(NextDT, UnitMins)] - end - end, - - lists:foldl(AddTimeFun, [StartTimeStr], lists:seq(1, 10)). - - -get_strtime(DateTime, UnitMins) -> - {{Y, M, D}, {Hour, Minute, _Second}} = DateTime, - RoundMins = - UnitMins * (Minute div UnitMins), - StrTime = - lists:flatten(io_lib:format(?LMD_FORMAT, - [Y, M, D, Hour, RoundMins])), - StrTime. - - -get_tictactree_fun(Bookie, Bucket, TreeSize) -> - fun(LMD, Acc) -> - SW = os:timestamp(), - ST = <<"0">>, - ET = <<"A">>, - Q = {tictactree_idx, - {Bucket, - list_to_binary("$aae." ++ LMD ++ "_bin"), - ST, - ET}, - TreeSize, - fun(_B, _K) -> accumulate end}, - {async, Folder} = leveled_bookie:book_returnfolder(Bookie, Q), - R = Folder(), - io:format("TicTac Tree for index ~s took " ++ - "~w microseconds~n", - [LMD, timer:now_diff(os:timestamp(), SW)]), - leveled_tictac:merge_trees(R, Acc) - end. 
- get_segment(K, SegmentCount) -> BinKey = case is_binary(K) of