leveled/src/leveled_pmem.erl
Martin Sumner aaeac7ba36
Mas d34 i453 eqwalizer (#454)
* Add eqwalizer and clear for codec & sst

The eqwalizer errors highlighted the need in several places for type clarification.

Within tests there are some issues where a type is assumed, and so ignore has been used to handle this rather than write more complex code to be explicit about the assumption.

The handling of arrays isn't great by eqwalizer - to be specific about the content of array causes issues when initialising an array.  Perhaps a type (map maybe) where one can be more explicit about types might be a better option (even if there is a minimal performance impact).

The use of a ?TOMB_COUNT defined option complicated the code much more with eqwalizer.  So for now, there is no developer option to disable ?TOMB_COUNT.

Test fixes required where strings have been used for buckets/keys not binaries.

The leveled_sst statem needs a different state record for starting when compared to other modes.  The state record has been divided up to reflect this, to make type management easier.  The impact on performance needs to be tested.

* Update ct tests to support binary keys/buckets only

* Eqwalizer for leveled_cdb and leveled_tictac

As array is used in leveled_tictac - there is the same issue as with leveled_sst

* Remove redundant indirection of leveled_rand

A legacy of pre-20 OTP

* More modules eqwalized

ebloom/log/util/monitor

* Eqwalize further modules

elp eqwalize leveled_codec; elp eqwalize leveled_sst; elp eqwalize leveled_cdb; elp eqwalize leveled_tictac; elp eqwalize leveled_log; elp eqwalize leveled_monitor; elp eqwalize leveled_head; elp eqwalize leveled_ebloom; elp eqwalize leveled_iclerk

All concurrently OK

* Refactor unit tests to use binary() not string() in key

Previously string() was allowed just to avoid having to change all these tests.  Go through the pain now, as part of eqwalizing.

* Add fixes for penciller, inker

Add a new ?IS_DEF macro to replace =/= undefined.

Now more explicit about primary, object and query keys

* Further fixes

Need to clarify functions used by runner - where keys, query keys and object keys are used

* Further eqwalisation

* Eqwalize leveled_pmanifest

Also make implementation independent of choice of dict - i.e. one can save a manifest using dict for blooms/pending_deletions and then open a manifest with code that uses a different type.  Allow for slow dict to be replaced with map.

Would not be backwards compatible though, without further thought - i.e. if you upgrade then downgrade.

Redundant code created by leveled_sst refactoring removed.

* Fix backwards compatibility issues

* Manifest Entry to belong to leveled_pmanifest

There are two manifests - leveled_pmanifest and leveled_imanifest.  Both have manifest_entry() type objects, but these types are different.  To avoid confusion don't include the pmanifest manifest_entry() within the global include file - be specific that it belongs to the leveled_pmanifest module

* Ignore elp file - large binary

* Update src/leveled_pmem.erl

Remove unnecessary empty list from type definition

Co-authored-by: Thomas Arts <thomas.arts@quviq.com>

---------

Co-authored-by: Thomas Arts <thomas.arts@quviq.com>
2024-11-13 13:37:13 +00:00

467 lines
16 KiB
Erlang

%% -------- PENCILLER MEMORY ---------
%%
%% Module that provides functions for maintaining the L0 memory of the
%% Penciller.
%%
%% It is desirable that the L0Mem can efficiently handle the push of new trees
%% whilst maintaining the capability to quickly snapshot the memory for clones
%% of the Penciller.
%%
%% ETS tables are not used due to complications with managing their mutability,
%% as the database is snapshotted.
%%
%% An attempt was made to merge all trees into a single tree on push (in a
%% spawned process), but this proved to have an expensive impact as the tree
%% got larger.
%%
%% This approach is to keep a list of trees which have been received in the
%% order which they were received. There is then a fixed-size array of hashes
%% used to either point lookups at the right tree in the list, or inform the
%% requestor it is not present avoiding any lookups.
%%
%% The trade-off taken with the approach is that the size of the L0Cache is
%% uncertain. The Size count is incremented based on the inbound size and so
%% does not necessarily reflect the size once the lists are merged (reflecting
%% rotating objects)
-module(leveled_pmem).
-include("leveled.hrl").
-export([
prepare_for_index/2,
add_to_cache/5,
to_list/2,
check_levelzero/3,
check_levelzero/4,
merge_trees/4,
add_to_index/3,
new_index/0,
check_index/2,
cache_full/1
]).
% Test functions to ignore for eqwalizer - due to array issues
-eqwalizer({nowarn_function, index_performance_test/0}).
% Number of trees in the L0 cache at which cache_full/1 reports the cache as
% full.  add_to_index/3 only accepts cache slots below 128, hence the limit
% noted on the definition.
-define(MAX_CACHE_LINES, 31). % Must be less than 128
% The L0 index: a list holding one array of binaries per tree in the cache
% (add_to_index/3 prepends the array for each new tree), or the atom none.
% Neither add_to_index/3 nor check_index/2 will accept none.
-type index_array() :: list(array:array(binary()))|none.
-export_type([index_array/0]).
%%%============================================================================
%%% API
%%%============================================================================
-spec cache_full(list()) -> boolean().
%% @doc
%% Report whether the L0 cache holds exactly ?MAX_CACHE_LINES trees, which is
%% the point at which the cache is considered full.
cache_full(L0Cache) ->
    ?MAX_CACHE_LINES =:= length(L0Cache).
-spec prepare_for_index(
    array:array(binary()), leveled_codec:segment_hash()) -> array:array().
%% @doc
%% Add the hash of a key to the index.  This is 'prepared' in the sense that
%% this index is not used until it is loaded into the main index.
%%
%% The hash is split into a slot number and a 24-bit fragment, and the
%% fragment is appended to the binary already held in that slot.  A hash of
%% no_lookup leaves the array unchanged.
%%
%% prepare_for_index is called from the Bookie when a key is being added to
%% the ledger cache, but the index is not used until that ledger cache is in
%% the penciller L0 memory.
prepare_for_index(IndexArray, no_lookup) ->
    IndexArray;
prepare_for_index(IndexArray, Hash) ->
    {Slot, HashFragment} = split_hash(Hash),
    SlotBin = array:get(Slot, IndexArray),
    ExtendedBin = <<SlotBin/binary, HashFragment:24/integer>>,
    array:set(Slot, ExtendedBin, IndexArray).
-spec add_to_index(
    array:array(binary()), index_array(), integer()) -> index_array().
%% @doc
%% Extend the penciller's current index with the prepared index array of a
%% new ledger cache tree sent from the Bookie.  The new array is placed at the
%% head of the index.  CacheSlot is the position of this ledger cache in the
%% list of ledger caches: it must be below 128, and the current index must not
%% be none, otherwise no clause matches.
add_to_index(NewCacheIndex, CurrentIndex, CacheSlot)
        when CurrentIndex =/= none, CacheSlot < 128 ->
    [NewCacheIndex|CurrentIndex].
-spec new_index() -> array:array(binary()).
%% @doc
%% Create an empty index array for a single ledger cache: 256 slots, each
%% defaulting to the empty binary.
new_index() ->
    % eqwalizer:ignore - array does contain binary()
    array:new(256, [{default, <<>>}]).
-spec check_index(leveled_codec:segment_hash(), index_array())
        -> list(non_neg_integer()).
%% @doc
%% Return, in ascending order, the 1-based positions in the list of cache
%% index arrays whose slot for this hash contains the hash fragment - i.e. the
%% positions of the ledger caches that may contain the key associated with the
%% hash being checked.
check_index(Hash, L0Index) when L0Index =/= none ->
    {Slot, HashFragment} = split_hash(Hash),
    ProbeFun =
        fun(CacheIndex, {Position, Hits}) ->
            SlotBin = array:get(Slot, CacheIndex),
            UpdHits =
                case find_pos(SlotBin, HashFragment) of
                    true -> [Position|Hits];
                    false -> Hits
                end,
            {Position + 1, UpdHits}
        end,
    % Hits are accumulated newest-position-first, so reverse before returning
    {_NextPosition, ReversedHits} = lists:foldl(ProbeFun, {1, []}, L0Index),
    lists:reverse(ReversedHits).
-spec add_to_cache(
    integer(),
    {tuple(), integer(), integer()},
    integer(),
    list(),
    boolean()) -> {integer(), integer(), list()}|empty_push.
%% @doc
%% The penciller's cache is a list of leveled_trees.  This puts a new tree at
%% the head of that list, returning the new Ledger SQN (the MaxSQN of the
%% tree), the updated approximate size of the cache and the updated list.
%%
%% Updates to cache must set Writeable to true if the update could generate a
%% Level 0 file - in that case an empty tree is rejected with empty_push, as
%% otherwise there may be an attempt to write an empty L0 file.
%%
%% The MinSQN of the tree must not be below the current LedgerSQN: when it is
%% (and the push is not rejected as empty) no branch matches and the call
%% fails with if_clause.
add_to_cache(L0Size, {LM1, MinSQN, MaxSQN}, LedgerSQN, TreeList, Writeable) ->
    LM1Size = leveled_tree:tsize(LM1),
    if
        Writeable =:= true, LM1Size =:= 0 ->
            empty_push;
        MinSQN >= LedgerSQN ->
            {MaxSQN, L0Size + LM1Size, [LM1|TreeList]}
    end.
-spec to_list(
    integer(), fun((pos_integer()) -> leveled_tree:leveled_tree())) -> list().
%% @doc
%% The cache is a list of leveled_trees of length Slots.  Each tree is fetched
%% by slot ID (from 1 up to Slots) using FetchFun, converted to a list, and
%% key-merged into a single sorted output of Keys and Values (to load into a
%% SST file).  Where a key is present in more than one slot, lists:ukeymerge/3
%% keeps the entry from the lower-numbered slot.
%%
%% Each slot is requested in turn to avoid halting the penciller whilst it
%% does a large object copy of the whole cache.
to_list(Slots, FetchFun) ->
    StartTime = os:timestamp(),
    MergeSlotFun =
        fun(SlotID, MergedKVL) ->
            SlotKVL = leveled_tree:to_list(FetchFun(SlotID)),
            lists:ukeymerge(1, MergedKVL, SlotKVL)
        end,
    MergedList = lists:foldl(MergeSlotFun, [], lists:seq(1, Slots)),
    leveled_log:log_timer(pm002, [length(MergedList)], StartTime),
    MergedList.
-spec check_levelzero(tuple(), list(integer()), list())
        -> {boolean(), tuple()|not_found}.
%% @doc
%% Check for the presence of a given Key in the Level Zero cache, for callers
%% that have not already calculated the segment hash of the Key.  The hash is
%% derived here and the check is delegated to check_levelzero/4.
%%
%% PosList is the list of potential positions in the list of ledger caches
%% (as returned from checking the index array), and TreeList is the list of
%% ledger caches.  Returns {true, {Key, Value}} for the first match found, or
%% {false, not_found}.
check_levelzero(Key, PosList, TreeList) ->
    check_levelzero(Key, leveled_codec:segment_hash(Key), PosList, TreeList).
-spec check_levelzero(tuple(), {integer(), integer()}, list(integer()), list())
        -> {boolean(), tuple()|not_found}.
%% @doc
%% Check for the presence of a given Key in the Level Zero cache, with the
%% index array having been checked first for a list of potential positions
%% in the list of ledger caches - and then each potential ledger_cache being
%% checked in the order given by PosList until a match is found.
%%
%% Returns {true, {Key, Value}} on a match, otherwise {false, not_found}.  An
%% empty cache or an empty list of positions is answered immediately without
%% consulting any tree.
check_levelzero(_Key, _Hash, _PosList, []) ->
    {false, not_found};
check_levelzero(_Key, _Hash, [], _TreeList) ->
    {false, not_found};
check_levelzero(Key, Hash, PosList, TreeList) ->
    check_slotlist(Key, Hash, PosList, TreeList).
-spec merge_trees(tuple(), tuple(), list(tuple()), tuple()) -> list().
%% @doc
%% Return a key-sorted list of the keys and values found between StartKey and
%% EndKey across the currently unmerged bookie's ledger cache (LevelMinus1)
%% and the level zero cache (TreeList).  LevelMinus1 is merged first, followed
%% by each tree in TreeList order; for a key present in more than one tree,
%% lists:ukeymerge/3 keeps the entry that was merged first.
merge_trees(StartKey, EndKey, TreeList, LevelMinus1) ->
    RangeMergeFun =
        fun(Tree, MergedKVL) ->
            InRangeKVL = leveled_tree:match_range(StartKey, EndKey, Tree),
            lists:ukeymerge(1, MergedKVL, InRangeKVL)
        end,
    lists:foldl(RangeMergeFun, [], [LevelMinus1|TreeList]).
%%%============================================================================
%%% Internal Functions
%%%============================================================================
%% Scan a binary made up of consecutive 24-bit integers, returning true as
%% soon as one of them is exactly equal to Hash, and false if the end of the
%% binary is reached without a match.
find_pos(<<>>, _Hash) ->
    false;
find_pos(<<Candidate:24/integer, Rest/binary>>, Hash) ->
    Candidate =:= Hash orelse find_pos(Rest, Hash).
%% Split a {SegmentID, ExtraHash} pair into the slot in the index array (the
%% low 8 bits of the SegmentID) and a 24-bit fragment to store in that slot.
%% The fragment combines the remaining bits of the SegmentID with the
%% ExtraHash shifted left by 8 bits, truncated to 24 bits.
split_hash({SegmentID, ExtraHash}) ->
    Combined = (SegmentID bsr 8) bor (ExtraHash bsl 8),
    {SegmentID band 16#FF, Combined band 16#FFFFFF}.
%% Look for Key in the trees at the positions listed in CheckList, taking the
%% positions in the order given.  Once a tree returns a value no further trees
%% are consulted.  Returns {true, {Key, Value}} for the first match, or
%% {false, not_found} if no listed tree holds the Key.  The hash argument is
%% not used.
check_slotlist(Key, _Hash, CheckList, TreeList) ->
    ProbeFun =
        fun
            (_Position, {true, _KV} = Hit) ->
                Hit;
            (Position, {false, _KV} = Miss) ->
                TreeToCheck = lists:nth(Position, TreeList),
                case leveled_tree:match(Key, TreeToCheck) of
                    {value, Value} ->
                        {true, {Key, Value}};
                    none ->
                        Miss
                end
        end,
    lists:foldl(ProbeFun, {false, not_found}, CheckList).
%%%============================================================================
%%% Test
%%%============================================================================
-ifdef(TEST).
-include_lib("eunit/include/eunit.hrl").
%% Generate Count random key/value pairs with sequence numbers starting at
%% Seqn, returned as a list sorted by key with duplicate keys removed.
generate_randomkeys_aslist(Seqn, Count, BucketRangeLow, BucketRangeHigh) ->
    UnsortedKVL =
        generate_randomkeys(Seqn, Count, [], BucketRangeLow, BucketRangeHigh),
    lists:ukeysort(1, UnsortedKVL).
%% Generate Count random key/value pairs with sequence numbers starting at
%% Seqn, returned as a leveled_tree of type ?CACHE_TYPE built from the
%% key-sorted, de-duplicated pairs.
generate_randomkeys(Seqn, Count, BucketRangeLow, BucketRangeHigh) ->
    SortedKVL =
        lists:ukeysort(
            1,
            generate_randomkeys(
                Seqn, Count, [], BucketRangeLow, BucketRangeHigh)),
    leveled_tree:from_orderedlist(SortedKVL, ?CACHE_TYPE).
%% Prepend Count random key/value pairs to Acc.  Each key is
%% {o, Bucket, Key, null}: the bucket number is BucketLow plus a random value
%% between 1 and BRange, the key number is a random value between 1 and 1000,
%% and both are formatted as zero-padded 4 digit numbers.  Each value carries
%% the next sequence number, starting from Seqn.
generate_randomkeys(_Seqn, 0, Acc, _BucketLow, _BucketHigh) ->
    Acc;
generate_randomkeys(Seqn, Count, Acc, BucketLow, BRange) ->
    % The bucket number is drawn before the key number
    BucketN = BucketLow + rand:uniform(BRange),
    KeyN = rand:uniform(1000),
    Bucket = list_to_binary(["Bucket", io_lib:format("~4..0B", [BucketN])]),
    Key = list_to_binary(["Key", io_lib:format("~4..0B", [KeyN])]),
    LedgerKey = {o, Bucket, Key, null},
    LedgerValue = {Seqn, {active, infinity}, null},
    generate_randomkeys(
        Seqn + 1, Count - 1, [{LedgerKey, LedgerValue}|Acc], BucketLow, BRange).
%% Load sixteen random trees into a cache, then confirm that (a) looking up
%% keys via check_levelzero/3 gives the same answers as scanning every tree in
%% turn, and (b) a range query via merge_trees/4 finds the same number of
%% entries as filtering a full dump of the cache produced by to_list/2.
compare_method_test() ->
    PushFun =
        fun(_Push, {LedgerSQN, L0Size, L0TreeList}) ->
            MinSQN = LedgerSQN + 1,
            LM1 = generate_randomkeys(MinSQN, 2000, 1, 500),
            add_to_cache(
                L0Size,
                {LM1, MinSQN, LedgerSQN + 2000},
                LedgerSQN,
                L0TreeList,
                true)
        end,
    {SQN, Size, TreeList} =
        lists:foldl(PushFun, {0, 0, []}, lists:seq(1, 16)),
    ?assertMatch(32000, SQN),
    ?assertMatch(true, Size =< 32000),

    TestList = leveled_tree:to_list(generate_randomkeys(1, 2000, 1, 800)),

    % Reference lookup - try each tree in cache order until the key is found
    ScanFun =
        fun(Key) ->
            fun
                (_Tree, {true, KV}) ->
                    {true, KV};
                (Tree, {false, _KV}) ->
                    case leveled_tree:match(Key, Tree) of
                        {value, Value} ->
                            {true, {Key, Value}};
                        none ->
                            {false, not_found}
                    end
            end
        end,
    S0 =
        lists:map(
            fun({Key, _V}) ->
                lists:foldl(ScanFun(Key), {false, not_found}, TreeList)
            end,
            TestList),

    % Lookup under test - every position in the cache is a candidate
    PosList = lists:seq(1, length(TreeList)),
    S1 =
        lists:map(
            fun({Key, _V}) -> check_levelzero(Key, PosList, TreeList) end,
            TestList),
    ?assertMatch(S0, S1),

    StartKey = {o, <<"Bucket0100">>, null, null},
    EndKey = {o, <<"Bucket0200">>, null, null},

    % Crude range query - dump the whole cache and filter it
    SWa = os:timestamp(),
    FetchFun = fun(Slot) -> lists:nth(Slot, TreeList) end,
    DumpList = to_list(length(TreeList), FetchFun),
    InRangeFun =
        fun({K, _V}) ->
            leveled_codec:endkey_passed(EndKey, K) =:= false
                andalso K >= StartKey
        end,
    Q0 = lists:filter(InRangeFun, DumpList),
    Tree = leveled_tree:from_orderedlist(lists:ukeysort(1, Q0), ?CACHE_TYPE),
    Sz0 = leveled_tree:tsize(Tree),
    io:format(
        "Crude method took ~w microseconds resulting in tree of size ~w~n",
        [timer:now_diff(os:timestamp(), SWa), Sz0]
    ),

    % Range query under test
    SWb = os:timestamp(),
    Q1 =
        merge_trees(
            StartKey, EndKey, TreeList, leveled_tree:empty(?CACHE_TYPE)),
    Sz1 = length(Q1),
    io:format(
        "Merge method took ~w microseconds resulting in tree of size ~w~n",
        [timer:now_diff(os:timestamp(), SWb), Sz1]),
    ?assertMatch(Sz0, Sz1).
%% EUnit test generator wrapping with_index_test2/0 with a 60 second timeout,
%% as otherwise this test may timeout when run with coverage enabled.
with_index_test_() ->
    {timeout, 60, fun() -> with_index_test2() end}.
%% Load sixteen random key lists into both the cache and the index, then
%% confirm that every key loaded can be found - with its most recently loaded
%% value - by checking only those cache positions returned from the index.
with_index_test2() ->
    HashIntoIndexFun =
        fun({K, _V}, IndexAcc) ->
            prepare_for_index(IndexAcc, leveled_codec:segment_hash(K))
        end,
    PushFun =
        fun(_Push, {{LedgerSQN, L0Size, L0TreeList}, L0Idx, SrcList}) ->
            MinSQN = LedgerSQN + 1,
            LM1 = generate_randomkeys_aslist(MinSQN, 2000, 1, 500),
            LM1Array = lists:foldl(HashIntoIndexFun, new_index(), LM1),
            LM1SL =
                leveled_tree:from_orderedlist(
                    lists:ukeysort(1, LM1), ?CACHE_TYPE),
            UpdL0Index =
                add_to_index(LM1Array, L0Idx, length(L0TreeList) + 1),
            UpdCache =
                add_to_cache(
                    L0Size,
                    {LM1SL, MinSQN, LedgerSQN + 2000},
                    LedgerSQN,
                    L0TreeList,
                    true),
            % The newest list is merged first, so its values take precedence
            {UpdCache, UpdL0Index, lists:ukeymerge(1, LM1, SrcList)}
        end,
    {{SQN, Size, TreeList}, L0Index, SrcKVL} =
        lists:foldl(PushFun, {{0, 0, []}, [], []}, lists:seq(1, 16)),
    ?assertMatch(32000, SQN),
    ?assertMatch(true, Size =< 32000),

    lists:foreach(
        fun({K, V}) ->
            H = leveled_codec:segment_hash(K),
            PosList = check_index(H, L0Index),
            ?assertMatch({true, {K, V}},
                            check_slotlist(K, H, PosList, TreeList))
        end,
        SrcKVL),
    {L0Index, TreeList}.
%% Report timings for preparing an index array, adding it to an index at
%% eight positions, and checking the index for 2000 hashes which were loaded
%% and 2000 hashes taken from a second random key list.  For the loaded
%% hashes, a match is asserted at every one of the eight positions.
index_performance_test() ->
    HashFun = fun({K, _V}) -> leveled_codec:segment_hash(K) end,
    LM1 = generate_randomkeys_aslist(1, 2000, 1, 500),
    LM2 = generate_randomkeys_aslist(2001, 2000, 1, 500),
    HL1 = lists:map(HashFun, LM1),
    HL2 = lists:map(HashFun, LM2),

    SWP = os:timestamp(),
    A1 =
        lists:foldl(
            fun(H, IndexAcc) -> prepare_for_index(IndexAcc, H) end,
            new_index(),
            HL1),
    io:format(
        user,
        "~nPrepare single index takes ~w microsec~n",
        [timer:now_diff(os:timestamp(), SWP)]),

    SWL = os:timestamp(),
    PMI1 =
        lists:foldl(
            fun(CacheSlot, Idx) -> add_to_index(A1, Idx, CacheSlot) end,
            [],
            lists:seq(1, 8)),
    io:format(
        user,
        "Appending to array takes ~w microsec~n",
        [timer:now_diff(os:timestamp(), SWL)]),

    SWC1 = os:timestamp(),
    AllPositions = lists:seq(1, 8),
    lists:foreach(
        fun(H) -> ?assertMatch(AllPositions, check_index(H, PMI1)) end,
        HL1),
    io:format(
        user,
        "Checking 2000 matches in array at each level takes ~w microsec~n",
        [timer:now_diff(os:timestamp(), SWC1)]),

    SWC2 = os:timestamp(),
    FPT = lists:sum([length(check_index(H, PMI1)) || H <- HL2]),
    io:format(
        user,
        "Checking 2000 misses in array at each level takes ~w microsec " ++
            "with ~w false positives~n",
        [timer:now_diff(os:timestamp(), SWC2), FPT]).
-endif.