SFT file continued

Writing of a slot
martinsumner 2016-06-10 19:09:55 +01:00
parent a95d77607e
commit cc16f90c9c


@@ -14,16 +14,18 @@
 %%
 %% All keys are not equal in sft files, keys are only expected in a specific
 %% series of formats
-%% - {o, Bucket, Key, State} - Object Keys
-%% - {i, Bucket, IndexName, IndexTerm, Key, State} - Postings
+%% - {o, Bucket, Key} - Object Keys
+%% - {i, Bucket, IndexName, IndexTerm, Key} - Postings
 %% The {Bucket, Key} part of all types of keys is hashed for segment filters.
 %% For Postings the {Bucket, IndexName, IndexTerm} is also hashed. This
 %% causes a false positive on lookup of a segment, but allows for the presence
 %% of specific index terms to be checked
 %%
-%% The objects stored are a tuple of {Key, State, Value}, where
+%% The objects stored are a tuple of {Key, SequenceNumber, State, Value}, where
 %% Key - as above
-%% State - {SequenceNumber, active|tomb, ExpiryTimestamp | infinity}
+%% SequenceNumber - monotonically increasing counter of additions to the
+%% nursery log
+%% State - {active|tomb, ExpiryTimestamp | infinity}
 %% Value - null (all postings) | [Object Metadata] (all object keys)
 %% Keys should be unique in files. If more than one key is a candidate for
 %% the same file, the highest sequence number should be chosen. If the file
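
To make the revised key and value formats concrete, here is a hedged
illustration of one stored tuple of each type (example values only, not taken
from the commit):

    {{o, "Bucket1", "Key1"}, 5, {active, infinity}, [ObjectMetadata]}
    {{i, "Bucket1", "Idx1", "Term1", "Key1"}, 5, {active, infinity}, null}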
@@ -149,7 +151,8 @@
          serialise_segment_filter/1,
          check_for_segments/3,
          speedtest_check_forsegment/4,
-         generate_randomsegfilter/1]).
+         generate_randomsegfilter/1,
+         create_slot/3]).

 -include_lib("eunit/include/eunit.hrl").
@@ -163,6 +166,7 @@
 -define(MAX_SEG_HASH, 1048576).
 -define(DIVISOR_BITS, 13).
 -define(DIVISOR, 8092).
+-define(COMPRESSION_LEVEL, 1).

 -record(state, {version = ?CURRENT_VERSION :: tuple(),
@@ -216,9 +220,12 @@ create_header(initial) ->
 %% Do we need to check here that KeyList1 and KeyList2 are not just a [pointer]
 %% Otherwise the pointer will never be expanded
+%%
+%% Also this should return a partial block if the KeyLists have been exhausted
+%% before the block is full

 create_block(KeyList1, KeyList2, Level) ->
-    create_block(KeyList1, KeyList2, [], {0, 0}, [], Level).
+    create_block(KeyList1, KeyList2, [], {infinity, 0}, [], Level).

 create_block(KeyList1, KeyList2,
                 BlockKeyList, {LSN, HSN}, SegmentList, _)
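
A note on the revised initial accumulator {infinity, 0} for {LSN, HSN}: in
Erlang's standard term order any number sorts before any atom, so infinity is
a safe identity for the min/max sequence-number tracking visible in
create_slot below. A minimal sketch of the property being relied upon:

    %% number < atom in Erlang term order, for any positive sequence number
    true = (min(infinity, 7) =:= 7),
    true = (max(0, 7) =:= 7).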
@@ -231,7 +238,6 @@ create_block(KeyList1, KeyList2,
             BlockKeyList, {LSN, HSN}, SegmentList, Level) ->
     case key_dominates(KeyList1, KeyList2, Level) of
         {{next_key, TopKey}, Rem1, Rem2} ->
-            io:format("TopKey is ~w~n", [TopKey]),
             {UpdLSN, UpdHSN} = update_sequencenumbers(TopKey, LSN, HSN),
             NewBlockKeyList = lists:append(BlockKeyList,
                                             [TopKey]),
@@ -241,7 +247,6 @@
                         NewBlockKeyList, {UpdLSN, UpdHSN},
                         NewSegmentList, Level);
         {skipped_key, Rem1, Rem2} ->
-            io:format("Key is skipped~n"),
             create_block(Rem1, Rem2,
                         BlockKeyList, {LSN, HSN},
                         SegmentList, Level)
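
For reference, the shape of the tuple create_block/3 returns can be read off
the destructuring in create_slot and in the tests below; a hedged summary:

    %% {BlockKeyList, full | partial, {LowSeqNumber, HighSeqNumber},
    %%     SegmentList, KeyListRemainder1, KeyListRemainder2}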
@@ -261,24 +266,61 @@
 %% - Remainder of any KeyLists used to make the slot
-%% create_slot(KeyList1, KeyList2, Level)
-%% create_slot(KeyList1, KeyList2, Level, ?BLOCK_COUNT, null, <<>>, <<>>, []).
-%% create_slot(KL1, KL2, Level, 0, LowKey, SegFilter, SerialisedSlot,
-%% LengthList, {LSN, HSN}, LastKey) ->
-%% {{LowKey, SegFilter, SerialisedSlot, LengthList},
-%% {{LSN, HSN}, LastKey, full, KL1, KL2}};
-%% create_slot(KL1, KL2, Level, BlockCount, LowKey, SegFilter, SerialisedSlot,
-%% LengthList, {LSN, HSN}, LastKey) ->
-%% BlockDetails = create_block(KeyList1, KeyList2, Level),
-%% {BlockKeyList, Status, {LSN, HSN}, SegmentList, KL1, KL2} = BlockDetails,
-%% SerialisedBlock = serialise_block(BlockKeyList),
-%% <<SerialisedSlot/binary, SerilaisedBlock/binary>>,
-%% case Status of
-%% full ->
+create_slot(KeyList1, KeyList2, Level) ->
+    create_slot(KeyList1, KeyList2, Level, ?BLOCK_COUNT, [], <<>>, [],
+                {null, infinity, 0, null, full}).
+
+%% Keep adding blocks to the slot until either the block count is reached or
+%% there is a partial block
+create_slot(KL1, KL2, _, 0, SegLists, SerialisedSlot, LengthList,
+            {LowKey, LSN, HSN, LastKey, Status}) ->
+    {{LowKey, generate_segment_filter(SegLists), SerialisedSlot, LengthList},
+        {{LSN, HSN}, LastKey, Status},
+        KL1, KL2};
+create_slot(KL1, KL2, _, _, SegLists, SerialisedSlot, LengthList,
+            {LowKey, LSN, HSN, LastKey, partial}) ->
+    {{LowKey, generate_segment_filter(SegLists), SerialisedSlot, LengthList},
+        {{LSN, HSN}, LastKey, partial},
+        KL1, KL2};
+create_slot(KL1, KL2, Level, BlockCount, SegLists, SerialisedSlot, LengthList,
+            {LowKey, LSN, HSN, LastKey, _Status}) ->
+    {BlockKeyList, Status,
+        {LSNb, HSNb},
+        SegmentList, KL1b, KL2b} = create_block(KL1, KL2, Level),
+    case LowKey of
+        null ->
+            [NewLowKeyV|_] = BlockKeyList,
+            TrackingMetadata = {strip_to_keyonly(NewLowKeyV),
+                                min(LSN, LSNb), max(HSN, HSNb),
+                                strip_to_keyonly(last(BlockKeyList,
+                                                        {last, LastKey})),
+                                Status};
+        _ ->
+            TrackingMetadata = {LowKey,
+                                min(LSN, LSNb), max(HSN, HSNb),
+                                strip_to_keyonly(last(BlockKeyList,
+                                                        {last, LastKey})),
+                                Status}
+    end,
+    SerialisedBlock = serialise_block(BlockKeyList),
+    BlockLength = bit_size(SerialisedBlock),
+    SerialisedSlot2 = <<SerialisedSlot/binary, SerialisedBlock/binary>>,
+    create_slot(KL1b, KL2b, Level, BlockCount - 1, SegLists ++ [SegmentList],
+                SerialisedSlot2, LengthList ++ [BlockLength], TrackingMetadata).
+
+last([], {last, LastKey}) -> {keyonly, LastKey};
+last([E|Es], PrevLast) -> last(E, Es, PrevLast).
+
+last(_, [E|Es], PrevLast) -> last(E, Es, PrevLast);
+last(E, [], _) -> E.
+
+strip_to_keyonly({keyonly, K}) -> K;
+strip_to_keyonly({K, _, _, _}) -> K.
+
+serialise_block(BlockKeyList) ->
+    term_to_binary(BlockKeyList, [{compressed, ?COMPRESSION_LEVEL}]).

 %% Compare the keys at the head of the list, and either skip that "best" key or
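
The new helpers are compact: last/2 and last/3 walk to the final KV in a
block, falling back to the previous LastKey when the block is empty;
strip_to_keyonly reduces either a {keyonly, K} wrapper or a full KV tuple to
the bare key; and serialise_block compresses with term_to_binary. A hedged
sketch of the round-trip property the slot format relies on:

    Block = [{{o, "Bucket1", "Key1"}, 1, {active, infinity}, null}],
    Bin = term_to_binary(Block, [{compressed, 1}]),  % ?COMPRESSION_LEVEL is 1
    true = (Block =:= binary_to_term(Bin)),
    true = (bit_size(Bin) rem 8 =:= 0).  % whole bytes, recorded in bits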
@@ -390,6 +432,12 @@ update_sequencenumbers({_, _, _, _}, LSN, HSN) ->
 %% This is more space efficient than the equivalent bloom filter and avoids
 %% the calculation of many hash functions.

+generate_segment_filter([SegL1, []]) ->
+    generate_segment_filter({SegL1, [], [], []});
+generate_segment_filter([SegL1, SegL2, []]) ->
+    generate_segment_filter({SegL1, SegL2, [], []});
+generate_segment_filter([SegL1, SegL2, SegL3, SegL4]) ->
+    generate_segment_filter({SegL1, SegL2, SegL3, SegL4});
 generate_segment_filter(SegLists) ->
     generate_segment_filter(merge_seglists(SegLists),
                             [],
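
The three new list-headed clauses adapt what create_slot accumulates (one
segment list per block, plus a trailing empty list when the key lists were
exhausted at a block boundary) to the four-tuple shape merge_seglists expects.
A hedged example with made-up segment hashes:

    %% two blocks, then exhaustion: padded out to four positions
    generate_segment_filter([[100, 200], [300], []])
    %% is handled as
    generate_segment_filter({[100, 200], [300], [], []})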
@@ -441,8 +489,8 @@ merge_seglists({SegList1, SegList2, SegList3, SegList4}) ->
     Stage4 = lists:foldl(fun(X, Acc) -> [{X, 3}|Acc] end, Stage3, SegList4),
     lists:sort(Stage4).

-hash_for_segmentid(Key) ->
-    erlang:phash2(Key).
+hash_for_segmentid(KV) ->
+    erlang:phash2(strip_to_keyonly(KV), ?MAX_SEG_HASH).

 %% Check for a given list of segments in the filter, returning in normal
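
hash_for_segmentid now normalises its argument through strip_to_keyonly and
bounds the result, so a full KV tuple and a {keyonly, Key} wrapper map to the
same segment id. A hedged example (erlang:phash2(Term, Range) returns a value
in 0..Range-1):

    true = (hash_for_segmentid({keyonly, {o, "Bucket1", "Key1"}})
                =:= hash_for_segmentid({{o, "Bucket1", "Key1"}, 5,
                                        {active, infinity}, null})).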
@@ -616,6 +664,35 @@ generate_randomsegfilter(BlockSize) ->
                           Block4})).

+generate_randomkeys(Count) ->
+    generate_randomkeys(Count, []).
+
+generate_randomkeys(0, Acc) ->
+    Acc;
+generate_randomkeys(Count, Acc) ->
+    RandKey = {{o,
+                lists:concat(["Bucket", random:uniform(1024)]),
+                lists:concat(["Key", random:uniform(1024)])},
+                random:uniform(1024*1024),
+                {active, infinity}, null},
+    generate_randomkeys(Count - 1, [RandKey|Acc]).
+
+generate_sequentialkeys(Count, Start) ->
+    generate_sequentialkeys(Count + Start, Start, []).
+
+generate_sequentialkeys(Target, Incr, Acc) when Incr =:= Target ->
+    Acc;
+generate_sequentialkeys(Target, Incr, Acc) ->
+    KeyStr = string:right(integer_to_list(Incr), 8, $0),
+    NextKey = {{o,
+                "BucketSeq",
+                lists:concat(["Key", KeyStr])},
+                5,
+                {active, infinity}, null},
+    generate_sequentialkeys(Target, Incr + 1, [NextKey|Acc]).
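
Both generators build their lists by prepending, so keys come back in reverse
order of generation, and the createslot tests below sort before merging, e.g.
lists:sort(generate_randomkeys(100)). Note also that the zero-padding via
string:right(integer_to_list(Incr), 8, $0) makes the lexicographic sort of
sequential keys agree with their numeric order.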

 simple_create_block_test() ->
     KeyList1 = [{{o, "Bucket1", "Key1"}, 1, {active, infinity}, null},
                 {{o, "Bucket1", "Key3"}, 2, {active, infinity}, null}],
@@ -629,7 +706,6 @@ simple_create_block_test() ->
     [H2|T2] = T1,
     ?assertMatch(H2, {{o, "Bucket1", "Key2"}, 3, {active, infinity}, null}),
     ?assertMatch(T2, [{{o, "Bucket1", "Key3"}, 2, {active, infinity}, null}]),
-    io:format("SN is ~w~n", [SN]),
     ?assertMatch(SN, {1,3}).

 dominate_create_block_test() ->
@@ -645,7 +721,7 @@ dominate_create_block_test() ->
     ?assertMatch(K2, {{o, "Bucket1", "Key2"}, 3, {tomb, infinity}, null}),
     ?assertMatch(SN, {1,3}).

-alternating_create_block_test() ->
+sample_keylist() ->
     KeyList1 = [{{o, "Bucket1", "Key1"}, 1, {active, infinity}, null},
                 {{o, "Bucket1", "Key3"}, 1, {active, infinity}, null},
                 {{o, "Bucket1", "Key5"}, 1, {active, infinity}, null},
@@ -675,14 +751,17 @@ alternating_create_block_test() ->
                 {{o, "Bucket2", "Key6"}, 1, {active, infinity}, null},
                 {{o, "Bucket2", "Key8"}, 1, {active, infinity}, null},
                 {{o, "Bucket3", "Key2"}, 1, {active, infinity}, null},
-                {{o, "Bucket3", "Key4"}, 1, {active, infinity}, null},
-                {{o, "Bucket3", "Key6"}, 1, {active, infinity}, null},
+                {{o, "Bucket3", "Key4"}, 3, {active, infinity}, null},
+                {{o, "Bucket3", "Key6"}, 2, {active, infinity}, null},
                 {{o, "Bucket3", "Key8"}, 1, {active, infinity}, null}],
+    {KeyList1, KeyList2}.
+
+alternating_create_block_test() ->
+    {KeyList1, KeyList2} = sample_keylist(),
     {MergedKeyList, ListStatus, _, _, _, _} = create_block(KeyList1,
                                                             KeyList2,
                                                             1),
     BlockSize = length(MergedKeyList),
-    io:format("Block size is ~w~n", [BlockSize]),
     ?assertMatch(BlockSize, 32),
     ?assertMatch(ListStatus, full),
     K1 = lists:nth(1, MergedKeyList),
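
With a merged block asserted at 32 keys and four blocks per slot (the
?BLOCK_COUNT used by create_slot/3 above, matching the four segment lists in
merge_seglists), a full slot holds 4 * 32 = 128 keys. That inferred arithmetic
is what createslot_stage3_test below relies on: a LastKey of Key00000128 and
72 of the 200 generated keys left over.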
@@ -738,4 +817,72 @@ merge_seglists_test() ->
     ?assertMatch(R8, {maybe_present, [0]}),
     R9 = check_for_segments(SegBin, [1024*1024 - 1], false),
     ?assertMatch(R9, not_present).
+
+createslot_stage1_test() ->
+    {KeyList1, KeyList2} = sample_keylist(),
+    Out = create_slot(KeyList1, KeyList2, 1),
+    {{LowKey, SegFilter, _SerialisedSlot, _LengthList},
+        {{LSN, HSN}, LastKey, Status},
+        KL1, KL2} = Out,
+    ?assertMatch(LowKey, {o, "Bucket1", "Key1"}),
+    ?assertMatch(LastKey, {o, "Bucket4", "Key1"}),
+    ?assertMatch(Status, partial),
+    ?assertMatch(KL1, []),
+    ?assertMatch(KL2, []),
+    R0 = check_for_segments(serialise_segment_filter(SegFilter),
+            [hash_for_segmentid({keyonly, {o, "Bucket1", "Key1"}})],
+            true),
+    ?assertMatch(R0, {maybe_present, [0]}),
+    R1 = check_for_segments(serialise_segment_filter(SegFilter),
+            [hash_for_segmentid({keyonly, {o, "Bucket1", "Key99"}})],
+            true),
+    ?assertMatch(R1, not_present),
+    ?assertMatch(LSN, 1),
+    ?assertMatch(HSN, 3).
+
+createslot_stage2_test() ->
+    Out = create_slot(lists:sort(generate_randomkeys(100)),
+                        lists:sort(generate_randomkeys(100)),
+                        1),
+    {{_LowKey, _SegFilter, SerialisedSlot, LengthList},
+        {{_LSN, _HSN}, _LastKey, Status},
+        _KL1, _KL2} = Out,
+    ?assertMatch(Status, full),
+    Sum1 = lists:foldl(fun(X, Sum) -> Sum + X end, 0, LengthList),
+    Sum2 = bit_size(SerialisedSlot),
+    ?assertMatch(Sum1, Sum2).
+
+createslot_stage3_test() ->
+    Out = create_slot(lists:sort(generate_sequentialkeys(100, 1)),
+                        lists:sort(generate_sequentialkeys(100, 101)),
+                        1),
+    {{LowKey, SegFilter, SerialisedSlot, LengthList},
+        {{_LSN, _HSN}, LastKey, Status},
+        KL1, KL2} = Out,
+    ?assertMatch(Status, full),
+    Sum1 = lists:foldl(fun(X, Sum) -> Sum + X end, 0, LengthList),
+    Sum2 = bit_size(SerialisedSlot),
+    ?assertMatch(Sum1, Sum2),
+    ?assertMatch(LowKey, {o, "BucketSeq", "Key00000001"}),
+    ?assertMatch(LastKey, {o, "BucketSeq", "Key00000128"}),
+    ?assertMatch(KL1, []),
+    Rem = length(KL2),
+    ?assertMatch(Rem, 72),
+    R0 = check_for_segments(serialise_segment_filter(SegFilter),
+            [hash_for_segmentid({keyonly, {o, "BucketSeq", "Key00000100"}})],
+            true),
+    ?assertMatch(R0, {maybe_present, [3]}),
+    R1 = check_for_segments(serialise_segment_filter(SegFilter),
+            [hash_for_segmentid({keyonly, {o, "Bucket1", "Key99"}})],
+            true),
+    ?assertMatch(R1, not_present),
+    R2 = check_for_segments(serialise_segment_filter(SegFilter),
+            [hash_for_segmentid({keyonly, {o, "BucketSeq", "Key00000040"}})],
+            true),
+    ?assertMatch(R2, {maybe_present, [1]}),
+    R3 = check_for_segments(serialise_segment_filter(SegFilter),
+            [hash_for_segmentid({keyonly, {o, "BucketSeq", "Key00000004"}})],
+            true),
+    ?assertMatch(R3, {maybe_present, [0]}).
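
The stage2 and stage3 tests confirm that LengthList records the bit size of
each serialised block. A hedged sketch of how a reader could use those lengths
to slice one block back out of a slot; extract_block is a hypothetical helper
for illustration, not part of this commit:

    %% Pull the Nth block (1-indexed) out of a serialised slot using the bit
    %% lengths recorded by create_slot, then decode it back to a key list.
    extract_block(SerialisedSlot, LengthList, N) ->
        Offset = lists:sum(lists:sublist(LengthList, N - 1)),
        Length = lists:nth(N, LengthList),
        <<_Pre:Offset/bitstring, Block:Length/bitstring, _Rest/bitstring>> =
            SerialisedSlot,
        binary_to_term(Block).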