Mas i361 d31 (#365)

* Resolve hash 0 of confusion (#362) * Resolve hash 0 of confusion Ensure that a hash of 0 is not confused with an empty index entry (where both hash and position are 0). For a genuine entry position must always be non-zero. * Use little endian format directly ... instead of extracting in big_endian format then flipping (or flipping before writing) * OTP 24 dialyzer fix in test
2021-10-27 14:04:08 +01:00 · 2021-10-27 14:04:08 +01:00 · 27cf6bef55
commit 27cf6bef55
parent 43ec9a4eab
1 changed files with 103 additions and 51 deletions
--- a/src/leveled_cdb.erl
+++ b/src/leveled_cdb.erl
@ -1003,9 +1003,9 @@ set_writeops(SyncStrategy) ->
 %% the file handle
 open_active_file(FileName) when is_list(FileName) ->
    {ok, Handle} = file:open(FileName, ?WRITE_OPS),
-    {ok, Position} = file:position(Handle, {bof, 256*?DWORD_SIZE}),
-    {LastPosition, {HashTree, LastKey}} = startup_scan_over_file(Handle,
-                                                                    Position),
+    {ok, Position} = file:position(Handle, {bof, 256 * ?DWORD_SIZE}),
+    {LastPosition, {HashTree, LastKey}} =
+        startup_scan_over_file(Handle, Position),
    case file:position(Handle, eof) of 
        {ok, LastPosition} ->
            ok = file:close(Handle);
@ -1239,8 +1239,7 @@ load_index(Handle) ->
    LoadIndexFun =
        fun(X) ->
            file:position(Handle, {bof, ?DWORD_SIZE * X}),
-            {HashTablePos, Count} = read_next_2_integers(Handle),
-            {HashTablePos, Count}
+            read_next_2_integers(Handle)
        end,
    list_to_tuple(lists:map(LoadIndexFun, Index)).

@ -1279,8 +1278,8 @@ scan_index_returnpositions(Handle, Position, Count, PosList0) ->
    {ok, _} = file:position(Handle, Position),
    AddPosFun =
        fun({Hash, HPosition}, PosList) ->
-            case Hash of
-                0 ->
+            case {Hash, HPosition} of
+                {0, 0} ->
                    PosList;
                _ ->
                    [HPosition|PosList]
@ -1572,13 +1571,11 @@ extract_valueandsize(ValueAsBin) ->
    {ValueAsBin, byte_size(ValueAsBin)}.


-%% Used for reading lengths
-%% Note that the endian_flip is required to make the file format compatible 
-%% with CDB 
+%% Used for reading lengths with CDB 
 read_next_2_integers(Handle) ->
    case file:read(Handle, ?DWORD_SIZE) of 
-        {ok, <<Int1:32,Int2:32>>} -> 
-            {endian_flip(Int1), endian_flip(Int2)};
+        {ok, <<Int1:32/little-integer, Int2:32/little-integer>>} ->
+            {Int1, Int2};
        ReadError ->
            ReadError
    end.
@ -1589,10 +1586,9 @@ read_next_n_integerpairs(Handle, NumberOfPairs) ->

 read_integerpairs(<<>>, Pairs) ->
    Pairs;
-read_integerpairs(<<Int1:32, Int2:32, Rest/binary>>, Pairs) ->
-    read_integerpairs(<<Rest/binary>>,
-                        Pairs ++ [{endian_flip(Int1),
-                                    endian_flip(Int2)}]).
+read_integerpairs(<<Int1:32/little-integer, Int2:32/little-integer,
+                        Rest/binary>>, Pairs) ->
+    read_integerpairs(<<Rest/binary>>, Pairs ++ [{Int1, Int2}]).



@ -1626,10 +1622,11 @@ search_hash_table(Handle,
        ((Slot + CycleCount - 1) rem TotalSlots) * ?DWORD_SIZE 
        + FirstHashPosition,
    {ok, _} = file:position(Handle, Offset),
-    {StoredHash, DataLoc} = read_next_2_integers(Handle),
    
-    case StoredHash of
-        Hash ->
+    case read_next_2_integers(Handle) of
+        {0, 0} ->
+            {Timings, missing};
+        {Hash, DataLoc} ->
            KV = 
                case QuickCheck of
                    loose_presence ->
@ -1652,8 +1649,6 @@ search_hash_table(Handle,
                    UpdTimings = update_fetchtimings(Timings, CycleCount),
                    {UpdTimings, KV} 
            end;
-        0 ->
-            {Timings, missing};
        _ ->
            search_hash_table(Handle, 
                                {FirstHashPosition,
@ -1773,18 +1768,20 @@ perform_write_hash_tables(Handle, HashTreeBin, StartPos) ->
 %% in the hash table
 %% The List passed in should be made up of {Index, Position, Count} tuples
 write_top_index_table(Handle, BasePos, IndexList) ->
-    FnWriteIndex = fun({_Index, Pos, Count}, {AccBin, CurrPos}) ->
-        case Count == 0 of
-            true ->
-                PosLE = endian_flip(CurrPos),
-                NextPos = CurrPos;
-            false ->
-                PosLE = endian_flip(Pos),
-                NextPos = Pos + (Count * ?DWORD_SIZE)
-        end, 
-        CountLE = endian_flip(Count),
-        {<<AccBin/binary, PosLE:32, CountLE:32>>, NextPos}
-    end,
+    FnWriteIndex = 
+        fun({_Index, Pos, Count}, {AccBin, CurrPos}) ->
+            {Position, NextPos} =
+                case Count == 0 of
+                    true ->
+                        {CurrPos, CurrPos};
+                    false ->
+                        {Pos, Pos + (Count * ?DWORD_SIZE)}
+                end, 
+                {<<AccBin/binary,
+                        Position:32/little-integer,
+                        Count:32/little-integer>>,
+                    NextPos}
+            end,
    
    {IndexBin, _Pos} = lists:foldl(FnWriteIndex,
                                    {<<>>, BasePos},
@ -1794,12 +1791,6 @@ write_top_index_table(Handle, BasePos, IndexList) ->
    ok = file:advise(Handle, 0, ?DWORD_SIZE * 256, will_need),
    ok.

-%% To make this compatible with original Bernstein format this endian flip
-%% and also the use of the standard hash function required.
-  
-endian_flip(Int) ->
-    <<X:32/unsigned-little-integer>> = <<Int:32>>,
-    X.

 hash(Key) ->
    leveled_util:magic_hash(Key).
@ -1823,10 +1814,9 @@ key_value_to_record({Key, Value}, BinaryMode) ->
            end,
    KS = byte_size(BK),
    VS = byte_size(BV),
-    KS_FL = endian_flip(KS),
-    VS_FL = endian_flip(VS + 4),
    CRC = calc_crc(BK, BV),
-    <<KS_FL:32, VS_FL:32, BK:KS/binary, CRC:32/integer, BV:VS/binary>>.
+    <<KS:32/little-integer, (VS + 4):32/little-integer, 
+        BK:KS/binary, CRC:32/integer, BV:VS/binary>>.


 multi_key_value_to_record(KVList, BinaryMode, LastPosition) ->
@ -1879,9 +1869,7 @@ to_slotmap(HashTree, Index) ->
    IndexLength = length(HPList) * 2,
    ConvertObjFun =
        fun({Hash, Position}) ->
-            HashLE = endian_flip(Hash),
-            PosLE = endian_flip(Position),
-            NewBin = <<HashLE:32, PosLE:32>>,
+            NewBin = <<Hash:32/little-integer, Position:32/little-integer>>,
            {hash_to_slot(Hash, IndexLength), NewBin}
        end,
    lists:map(ConvertObjFun, HPList).
@ -1982,12 +1970,11 @@ write_hash_tables([Index|Rest], HashTree, CurrPos, BasePos,
 %%%%%%%%%%%%%%%  
 -ifdef(TEST).

-%%
-%% dump(FileName) -> List
-%% Given a file name, this function returns a list
-%% of {key,value} tuples from the CDB.
-%%
-
+%% To make this compatible with original Bernstein format this endian flip
+%% and also the use of the standard hash function required.
+endian_flip(Int) ->
+    <<X:32/unsigned-little-integer>> = <<Int:32>>,
+    X.

 %% from_dict(FileName,ListOfKeyValueTuples)
 %% Given a filename and a dictionary, create a cdb
@ -2117,6 +2104,71 @@ find_firstzero_test() ->
    ?assertMatch([<<71:64/integer>>, <<72:64/integer>>], RHS).


+magickey_test() ->
+    {C, L1, L2} = {247, 10, 100},
+        % Magic constants - will lead to first hash slot being empty
+        % prompts potential issue when first hash slot is empty but
+        % hash is 0
+
+    MagicKey = 
+        {315781,
+            stnd,
+            {o_rkv,
+                <<100,111,109,97,105,110,68,111,99,117,109,101,110,116>>,
+                <<48,48,48,49,52,54,56,54,51,48,48,48,51,50,49,54,51,51>>,
+                null}},
+    ?assertEqual(0, hash(MagicKey)),
+    NotMagicKVGen =
+        fun(I) ->
+            {{I + C, stnd, {o_rkv, <<"B">>, integer_to_binary(I + C), null}},
+                <<"V">>}
+        end,
+    Set1 = lists:map(NotMagicKVGen, lists:seq(1, L1)),
+    Set2 = lists:map(NotMagicKVGen, lists:seq(L1 + 1, L2)),
+    {ok, P1} =
+        cdb_open_writer("test/test_area/magic_hash.pnd",
+                        #cdb_options{binary_mode=true}),
+    
+    ok = cdb_put(P1, MagicKey, <<"MagicV0">>),
+    lists:foreach(fun({K, V}) -> cdb_put(P1, K, V) end, Set1),
+    ok = cdb_put(P1, MagicKey, <<"MagicV1">>),
+    lists:foreach(fun({K, V}) -> cdb_put(P1, K, V) end, Set2),
+    {ok, F2} = cdb_complete(P1),
+    {ok, P2} = cdb_open_reader(F2, #cdb_options{binary_mode=true}),
+    {GetK, GetV} = cdb_get(P2, MagicKey),
+    ?assertEqual(<<"MagicV1">>, GetV),
+    
+    AllKeys = cdb_directfetch(P2, cdb_getpositions(P2, all), key_only),
+    ?assertMatch(true, lists:member(MagicKey, AllKeys)),
+    ok = cdb_close(P2),
+
+    ok = file:delete("test/test_area/magic_hash.cdb"),
+    
+    {ok, P3} =
+        cdb_open_writer("test/test_area/magic_hash.pnd",
+                        #cdb_options{binary_mode=true}),
+    KVL = Set1 ++ [{MagicKey, <<"MagicV1">>}] ++ Set2,
+    ok = cdb_mput(P3, KVL),
+    {ok, F2} = cdb_complete(P3),
+    {ok, P4} = cdb_open_reader(F2, #cdb_options{binary_mode=true}),
+
+    {GetK, GetV} = cdb_get(P4, MagicKey),
+    ?assertEqual(<<"MagicV1">>, GetV),
+    ok = cdb_close(P4),
+    ok = file:delete("test/test_area/magic_hash.cdb"),
+    
+    {ok, P5} =
+        cdb_open_writer("test/test_area/magic_hash.pnd",
+                        #cdb_options{binary_mode=true}),
+    KVL5 = Set1 ++ Set2,
+    ok = cdb_mput(P5, KVL5),
+    {ok, F2} = cdb_complete(P5),
+    {ok, P6} = cdb_open_reader(F2, #cdb_options{binary_mode=true}),
+    missing = cdb_get(P6, MagicKey),
+    ok = cdb_close(P6),
+    ok = file:delete("test/test_area/magic_hash.cdb").
+
+
 cyclecount_test() ->
    io:format("~n~nStarting cycle count test~n"),
    KVL1 = generate_sequentialkeys(5000, []),