Mas i361 d31 (#365)

* Resolve hash 0 of confusion (#362)

* Resolve hash 0 of confusion

Ensure that a hash of 0 is not confused with an empty index entry (where both hash and position are 0).  For a genuine entry position must always be non-zero.

* Use little endian format directly

... instead of extracting in big_endian format then flipping (or flipping before writing)

* OTP 24 dialyzer fix in test
This commit is contained in:
Martin Sumner 2021-10-27 14:04:08 +01:00 committed by GitHub
parent 43ec9a4eab
commit 27cf6bef55
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -1004,8 +1004,8 @@ set_writeops(SyncStrategy) ->
open_active_file(FileName) when is_list(FileName) -> open_active_file(FileName) when is_list(FileName) ->
{ok, Handle} = file:open(FileName, ?WRITE_OPS), {ok, Handle} = file:open(FileName, ?WRITE_OPS),
{ok, Position} = file:position(Handle, {bof, 256 * ?DWORD_SIZE}), {ok, Position} = file:position(Handle, {bof, 256 * ?DWORD_SIZE}),
{LastPosition, {HashTree, LastKey}} = startup_scan_over_file(Handle, {LastPosition, {HashTree, LastKey}} =
Position), startup_scan_over_file(Handle, Position),
case file:position(Handle, eof) of case file:position(Handle, eof) of
{ok, LastPosition} -> {ok, LastPosition} ->
ok = file:close(Handle); ok = file:close(Handle);
@ -1239,8 +1239,7 @@ load_index(Handle) ->
LoadIndexFun = LoadIndexFun =
fun(X) -> fun(X) ->
file:position(Handle, {bof, ?DWORD_SIZE * X}), file:position(Handle, {bof, ?DWORD_SIZE * X}),
{HashTablePos, Count} = read_next_2_integers(Handle), read_next_2_integers(Handle)
{HashTablePos, Count}
end, end,
list_to_tuple(lists:map(LoadIndexFun, Index)). list_to_tuple(lists:map(LoadIndexFun, Index)).
@ -1279,8 +1278,8 @@ scan_index_returnpositions(Handle, Position, Count, PosList0) ->
{ok, _} = file:position(Handle, Position), {ok, _} = file:position(Handle, Position),
AddPosFun = AddPosFun =
fun({Hash, HPosition}, PosList) -> fun({Hash, HPosition}, PosList) ->
case Hash of case {Hash, HPosition} of
0 -> {0, 0} ->
PosList; PosList;
_ -> _ ->
[HPosition|PosList] [HPosition|PosList]
@ -1572,13 +1571,11 @@ extract_valueandsize(ValueAsBin) ->
{ValueAsBin, byte_size(ValueAsBin)}. {ValueAsBin, byte_size(ValueAsBin)}.
%% Used for reading lengths %% Used for reading lengths with CDB
%% Note that the endian_flip is required to make the file format compatible
%% with CDB
read_next_2_integers(Handle) -> read_next_2_integers(Handle) ->
case file:read(Handle, ?DWORD_SIZE) of case file:read(Handle, ?DWORD_SIZE) of
{ok, <<Int1:32,Int2:32>>} -> {ok, <<Int1:32/little-integer, Int2:32/little-integer>>} ->
{endian_flip(Int1), endian_flip(Int2)}; {Int1, Int2};
ReadError -> ReadError ->
ReadError ReadError
end. end.
@ -1589,10 +1586,9 @@ read_next_n_integerpairs(Handle, NumberOfPairs) ->
read_integerpairs(<<>>, Pairs) -> read_integerpairs(<<>>, Pairs) ->
Pairs; Pairs;
read_integerpairs(<<Int1:32, Int2:32, Rest/binary>>, Pairs) -> read_integerpairs(<<Int1:32/little-integer, Int2:32/little-integer,
read_integerpairs(<<Rest/binary>>, Rest/binary>>, Pairs) ->
Pairs ++ [{endian_flip(Int1), read_integerpairs(<<Rest/binary>>, Pairs ++ [{Int1, Int2}]).
endian_flip(Int2)}]).
@ -1626,10 +1622,11 @@ search_hash_table(Handle,
((Slot + CycleCount - 1) rem TotalSlots) * ?DWORD_SIZE ((Slot + CycleCount - 1) rem TotalSlots) * ?DWORD_SIZE
+ FirstHashPosition, + FirstHashPosition,
{ok, _} = file:position(Handle, Offset), {ok, _} = file:position(Handle, Offset),
{StoredHash, DataLoc} = read_next_2_integers(Handle),
case StoredHash of case read_next_2_integers(Handle) of
Hash -> {0, 0} ->
{Timings, missing};
{Hash, DataLoc} ->
KV = KV =
case QuickCheck of case QuickCheck of
loose_presence -> loose_presence ->
@ -1652,8 +1649,6 @@ search_hash_table(Handle,
UpdTimings = update_fetchtimings(Timings, CycleCount), UpdTimings = update_fetchtimings(Timings, CycleCount),
{UpdTimings, KV} {UpdTimings, KV}
end; end;
0 ->
{Timings, missing};
_ -> _ ->
search_hash_table(Handle, search_hash_table(Handle,
{FirstHashPosition, {FirstHashPosition,
@ -1773,17 +1768,19 @@ perform_write_hash_tables(Handle, HashTreeBin, StartPos) ->
%% in the hash table %% in the hash table
%% The List passed in should be made up of {Index, Position, Count} tuples %% The List passed in should be made up of {Index, Position, Count} tuples
write_top_index_table(Handle, BasePos, IndexList) -> write_top_index_table(Handle, BasePos, IndexList) ->
FnWriteIndex = fun({_Index, Pos, Count}, {AccBin, CurrPos}) -> FnWriteIndex =
fun({_Index, Pos, Count}, {AccBin, CurrPos}) ->
{Position, NextPos} =
case Count == 0 of case Count == 0 of
true -> true ->
PosLE = endian_flip(CurrPos), {CurrPos, CurrPos};
NextPos = CurrPos;
false -> false ->
PosLE = endian_flip(Pos), {Pos, Pos + (Count * ?DWORD_SIZE)}
NextPos = Pos + (Count * ?DWORD_SIZE)
end, end,
CountLE = endian_flip(Count), {<<AccBin/binary,
{<<AccBin/binary, PosLE:32, CountLE:32>>, NextPos} Position:32/little-integer,
Count:32/little-integer>>,
NextPos}
end, end,
{IndexBin, _Pos} = lists:foldl(FnWriteIndex, {IndexBin, _Pos} = lists:foldl(FnWriteIndex,
@ -1794,12 +1791,6 @@ write_top_index_table(Handle, BasePos, IndexList) ->
ok = file:advise(Handle, 0, ?DWORD_SIZE * 256, will_need), ok = file:advise(Handle, 0, ?DWORD_SIZE * 256, will_need),
ok. ok.
%% To make this compatible with original Bernstein format this endian flip
%% and also the use of the standard hash function required.
endian_flip(Int) ->
<<X:32/unsigned-little-integer>> = <<Int:32>>,
X.
hash(Key) -> hash(Key) ->
leveled_util:magic_hash(Key). leveled_util:magic_hash(Key).
@ -1823,10 +1814,9 @@ key_value_to_record({Key, Value}, BinaryMode) ->
end, end,
KS = byte_size(BK), KS = byte_size(BK),
VS = byte_size(BV), VS = byte_size(BV),
KS_FL = endian_flip(KS),
VS_FL = endian_flip(VS + 4),
CRC = calc_crc(BK, BV), CRC = calc_crc(BK, BV),
<<KS_FL:32, VS_FL:32, BK:KS/binary, CRC:32/integer, BV:VS/binary>>. <<KS:32/little-integer, (VS + 4):32/little-integer,
BK:KS/binary, CRC:32/integer, BV:VS/binary>>.
multi_key_value_to_record(KVList, BinaryMode, LastPosition) -> multi_key_value_to_record(KVList, BinaryMode, LastPosition) ->
@ -1879,9 +1869,7 @@ to_slotmap(HashTree, Index) ->
IndexLength = length(HPList) * 2, IndexLength = length(HPList) * 2,
ConvertObjFun = ConvertObjFun =
fun({Hash, Position}) -> fun({Hash, Position}) ->
HashLE = endian_flip(Hash), NewBin = <<Hash:32/little-integer, Position:32/little-integer>>,
PosLE = endian_flip(Position),
NewBin = <<HashLE:32, PosLE:32>>,
{hash_to_slot(Hash, IndexLength), NewBin} {hash_to_slot(Hash, IndexLength), NewBin}
end, end,
lists:map(ConvertObjFun, HPList). lists:map(ConvertObjFun, HPList).
@ -1982,12 +1970,11 @@ write_hash_tables([Index|Rest], HashTree, CurrPos, BasePos,
%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%
-ifdef(TEST). -ifdef(TEST).
%% %% To make this compatible with original Bernstein format this endian flip
%% dump(FileName) -> List %% and also the use of the standard hash function required.
%% Given a file name, this function returns a list endian_flip(Int) ->
%% of {key,value} tuples from the CDB. <<X:32/unsigned-little-integer>> = <<Int:32>>,
%% X.
%% from_dict(FileName,ListOfKeyValueTuples) %% from_dict(FileName,ListOfKeyValueTuples)
%% Given a filename and a dictionary, create a cdb %% Given a filename and a dictionary, create a cdb
@ -2117,6 +2104,71 @@ find_firstzero_test() ->
?assertMatch([<<71:64/integer>>, <<72:64/integer>>], RHS). ?assertMatch([<<71:64/integer>>, <<72:64/integer>>], RHS).
magickey_test() ->
{C, L1, L2} = {247, 10, 100},
% Magic constants - will lead to first hash slot being empty
% prompts potential issue when first hash slot is empty but
% hash is 0
MagicKey =
{315781,
stnd,
{o_rkv,
<<100,111,109,97,105,110,68,111,99,117,109,101,110,116>>,
<<48,48,48,49,52,54,56,54,51,48,48,48,51,50,49,54,51,51>>,
null}},
?assertEqual(0, hash(MagicKey)),
NotMagicKVGen =
fun(I) ->
{{I + C, stnd, {o_rkv, <<"B">>, integer_to_binary(I + C), null}},
<<"V">>}
end,
Set1 = lists:map(NotMagicKVGen, lists:seq(1, L1)),
Set2 = lists:map(NotMagicKVGen, lists:seq(L1 + 1, L2)),
{ok, P1} =
cdb_open_writer("test/test_area/magic_hash.pnd",
#cdb_options{binary_mode=true}),
ok = cdb_put(P1, MagicKey, <<"MagicV0">>),
lists:foreach(fun({K, V}) -> cdb_put(P1, K, V) end, Set1),
ok = cdb_put(P1, MagicKey, <<"MagicV1">>),
lists:foreach(fun({K, V}) -> cdb_put(P1, K, V) end, Set2),
{ok, F2} = cdb_complete(P1),
{ok, P2} = cdb_open_reader(F2, #cdb_options{binary_mode=true}),
{GetK, GetV} = cdb_get(P2, MagicKey),
?assertEqual(<<"MagicV1">>, GetV),
AllKeys = cdb_directfetch(P2, cdb_getpositions(P2, all), key_only),
?assertMatch(true, lists:member(MagicKey, AllKeys)),
ok = cdb_close(P2),
ok = file:delete("test/test_area/magic_hash.cdb"),
{ok, P3} =
cdb_open_writer("test/test_area/magic_hash.pnd",
#cdb_options{binary_mode=true}),
KVL = Set1 ++ [{MagicKey, <<"MagicV1">>}] ++ Set2,
ok = cdb_mput(P3, KVL),
{ok, F2} = cdb_complete(P3),
{ok, P4} = cdb_open_reader(F2, #cdb_options{binary_mode=true}),
{GetK, GetV} = cdb_get(P4, MagicKey),
?assertEqual(<<"MagicV1">>, GetV),
ok = cdb_close(P4),
ok = file:delete("test/test_area/magic_hash.cdb"),
{ok, P5} =
cdb_open_writer("test/test_area/magic_hash.pnd",
#cdb_options{binary_mode=true}),
KVL5 = Set1 ++ Set2,
ok = cdb_mput(P5, KVL5),
{ok, F2} = cdb_complete(P5),
{ok, P6} = cdb_open_reader(F2, #cdb_options{binary_mode=true}),
missing = cdb_get(P6, MagicKey),
ok = cdb_close(P6),
ok = file:delete("test/test_area/magic_hash.cdb").
cyclecount_test() -> cyclecount_test() ->
io:format("~n~nStarting cycle count test~n"), io:format("~n~nStarting cycle count test~n"),
KVL1 = generate_sequentialkeys(5000, []), KVL1 = generate_sequentialkeys(5000, []),