Ongoing improvements - in particular CDB now supports general erlang terms not just lists

This commit is contained in:
Martin Sumner 2015-06-04 21:15:31 +01:00
parent 647a7f44dc
commit c5f50c613d
4 changed files with 412 additions and 159 deletions

View file

@ -25,14 +25,14 @@
%% The slots is a series of references %% The slots is a series of references
%% - 4 byte bloom-filter length %% - 4 byte bloom-filter length
%% - 4 byte key-helper length %% - 4 byte key-helper length
%% - a variable-length compressed bloom filter for all keys in slot (approx 1KB) %% - a variable-length compressed bloom filter for all keys in slot (approx 3KB)
%% - 32 ordered variable-length key helpers pointing to first key in each %% - 64 ordered variable-length key helpers pointing to first key in each
%% block (in slot) of the form Key Length, Key, Block Position %% block (in slot) of the form Key Length, Key, Block Position
%% - 4 byte CRC for the slot %% - 4 byte CRC for the slot
%% %%
%% The slot index in the footer is made up of 128 keys and pointers at the %% The slot index in the footer is made up of 128 keys and pointers at the
%% the start of each slot %% the start of each slot
%% - 128 Key Length (4 byte), Key, Position (4 byte) indexes %% - 64 x Key Length (4 byte), Key, Position (4 byte) indexes
%% - 4 bytes CRC for the index %% - 4 bytes CRC for the index
%% %%
%% The format of the file is intended to support quick lookups, whilst %% The format of the file is intended to support quick lookups, whilst
@ -54,8 +54,9 @@
-record(metadata, {version = ?CURRENT_VERSION :: tuple(), -record(metadata, {version = ?CURRENT_VERSION :: tuple(),
mutable = false :: true | false, mutable = false :: true | false,
compressed = true :: tre | false, compressed = true :: true | false,
slot_list :: list(), slot_array,
open_slot :: integer(),
cache :: tuple(), cache :: tuple(),
smallest_key :: tuple(), smallest_key :: tuple(),
largest_key :: tuple(), largest_key :: tuple(),
@ -73,9 +74,9 @@ start_file(Handle) ->
{ok, _} = file:position(Handle, bof), {ok, _} = file:position(Handle, bof),
file:write(Handle, Header), file:write(Handle, Header),
{Version, {M, C}, _, _} = convert_header(Header), {Version, {M, C}, _, _} = convert_header(Header),
FileMD = #metadata{version=Version, mutable=M, compressed=C}, FileMD = #metadata{version = Version, mutable = M, compressed = C,
SlotArray = array:new(?SLOT_COUNT), slot_array = array:new(?SLOT_COUNT), open_slot = 0},
{Handle, FileMD, SlotArray}. {Handle, FileMD}.
create_header(initial) -> create_header(initial) ->
@ -119,6 +120,8 @@ convert_header_v01(Header) ->
{{0, 1}, {M, C}, {FooterP, SlotLng, HlpLng}, none}. {{0, 1}, {M, C}, {FooterP, SlotLng, HlpLng}, none}.
% add_slot(Handle, FileMD, SlotArray)
%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%
@ -148,7 +151,7 @@ bad_header_test() ->
?assertMatch(unknown_version, convert_header(NewHdr2)). ?assertMatch(unknown_version, convert_header(NewHdr2)).
record_onstartfile_test() -> record_onstartfile_test() ->
{_, FileMD, _} = start_file("onstartfile.bst"), {_, FileMD} = start_file("onstartfile.bst"),
?assertMatch({0, 1}, FileMD#metadata.version). ?assertMatch({0, 1}, FileMD#metadata.version).

View file

@ -54,24 +54,22 @@
put/4, put/4,
open_active_file/1, open_active_file/1,
get_nextkey/1, get_nextkey/1,
get_nextkey/2]). get_nextkey/2,
fold/3,
fold_keys/3]).
-include_lib("eunit/include/eunit.hrl"). -include_lib("eunit/include/eunit.hrl").
-define(DWORD_SIZE, 8). -define(DWORD_SIZE, 8).
-define(WORD_SIZE, 4). -define(WORD_SIZE, 4).
-define(CRC_CHECK, true). -define(CRC_CHECK, true).
-define(MAX_FILE_SIZE, 3221225472).
-define(BASE_POSITION, 2048).
%% %%
%% from_dict(FileName,ListOfKeyValueTuples) %% from_dict(FileName,ListOfKeyValueTuples)
%% Given a filename and a dictionary, create a cdb %% Given a filename and a dictionary, create a cdb
%% using the key value pairs from the dict. %% using the key value pairs from the dict.
%%
%% @spec from_dict(filename(),dictionary()) -> ok
%% where
%% filename() = string(),
%% dictionary() = dict()
%%
from_dict(FileName,Dict) -> from_dict(FileName,Dict) ->
KeyValueList = dict:to_list(Dict), KeyValueList = dict:to_list(Dict),
create(FileName, KeyValueList). create(FileName, KeyValueList).
@ -82,30 +80,21 @@ from_dict(FileName,Dict) ->
%% this function creates a CDB %% this function creates a CDB
%% %%
create(FileName,KeyValueList) -> create(FileName,KeyValueList) ->
{ok, Handle} = file:open(FileName, [write]), {ok, Handle} = file:open(FileName, [binary, raw, read, write]),
{ok, _} = file:position(Handle, {bof, 2048}), {ok, _} = file:position(Handle, {bof, ?BASE_POSITION}),
{BasePos, HashTree} = write_key_value_pairs(Handle, KeyValueList), {BasePos, HashTree} = write_key_value_pairs(Handle, KeyValueList),
io:format("KVs has been written to base position ~w~n", [BasePos]), close_file(Handle, HashTree, BasePos).
L2 = write_hash_tables(Handle, HashTree),
io:format("Index list output of ~w~n", [L2]),
write_top_index_table(Handle, BasePos, L2),
file:close(Handle).
%% %%
%% dump(FileName) -> List %% dump(FileName) -> List
%% Given a file name, this function returns a list %% Given a file name, this function returns a list
%% of {key,value} tuples from the CDB. %% of {key,value} tuples from the CDB.
%% %%
%%
%% @spec dump(filename()) -> key_value_list()
%% where
%% filename() = string(),
%% key_value_list() = [{key,value}]
dump(FileName) -> dump(FileName) ->
dump(FileName, ?CRC_CHECK). dump(FileName, ?CRC_CHECK).
dump(FileName, CRCCheck) -> dump(FileName, CRCCheck) ->
{ok, Handle} = file:open(FileName, [binary,raw]), {ok, Handle} = file:open(FileName, [binary, raw, read]),
Fn = fun(Index, Acc) -> Fn = fun(Index, Acc) ->
{ok, _} = file:position(Handle, ?DWORD_SIZE * Index), {ok, _} = file:position(Handle, ?DWORD_SIZE * Index),
{_, Count} = read_next_2_integers(Handle), {_, Count} = read_next_2_integers(Handle),
@ -117,8 +106,9 @@ dump(FileName, CRCCheck) ->
{ok, _} = file:position(Handle, {bof, 2048}), {ok, _} = file:position(Handle, {bof, 2048}),
Fn1 = fun(_I,Acc) -> Fn1 = fun(_I,Acc) ->
{KL,VL} = read_next_2_integers(Handle), {KL,VL} = read_next_2_integers(Handle),
Key = read_next_string(Handle, KL), Key = read_next_term(Handle, KL),
case read_next_string(Handle, VL, crc, CRCCheck) of io:format("Key read of ~w~n", [Key]),
case read_next_term(Handle, VL, crc, CRCCheck) of
{false, _} -> {false, _} ->
{ok, CurrLoc} = file:position(Handle, cur), {ok, CurrLoc} = file:position(Handle, cur),
Return = {crc_wonky, get(Handle, Key)}; Return = {crc_wonky, get(Handle, Key)};
@ -169,9 +159,15 @@ put(FileName, Key, Value, {LastPosition, HashTree}) when is_list(FileName) ->
[binary, raw, read, write, delayed_write]), [binary, raw, read, write, delayed_write]),
put(Handle, Key, Value, {LastPosition, HashTree}); put(Handle, Key, Value, {LastPosition, HashTree});
put(Handle, Key, Value, {LastPosition, HashTree}) -> put(Handle, Key, Value, {LastPosition, HashTree}) ->
Bin = key_value_to_record({Key, Value}), % create binary for Key and Value Bin = key_value_to_record({Key, Value}),
PotentialNewSize = LastPosition + byte_size(Bin),
if PotentialNewSize > ?MAX_FILE_SIZE ->
close_file(Handle, HashTree, LastPosition),
roll;
true ->
ok = file:pwrite(Handle, LastPosition, Bin), ok = file:pwrite(Handle, LastPosition, Bin),
{LastPosition + byte_size(Bin), put_hashtree(Key, LastPosition, HashTree)}. {Handle, PotentialNewSize, put_hashtree(Key, LastPosition, HashTree)}
end.
%% %%
@ -182,7 +178,7 @@ get(FileNameOrHandle, Key) ->
get(FileNameOrHandle, Key, ?CRC_CHECK). get(FileNameOrHandle, Key, ?CRC_CHECK).
get(FileName, Key, CRCCheck) when is_list(FileName), is_list(Key) -> get(FileName, Key, CRCCheck) when is_list(FileName), is_list(Key) ->
{ok,Handle} = file:open(FileName,[binary,raw]), {ok,Handle} = file:open(FileName,[binary, raw, read]),
get(Handle,Key, CRCCheck); get(Handle,Key, CRCCheck);
get(Handle, Key, CRCCheck) when is_tuple(Handle), is_list(Key) -> get(Handle, Key, CRCCheck) when is_tuple(Handle), is_list(Key) ->
@ -230,7 +226,7 @@ get_nextkey(Handle, {Position, FirstHashPosition}) ->
{ok, Position} = file:position(Handle, Position), {ok, Position} = file:position(Handle, Position),
case read_next_2_integers(Handle) of case read_next_2_integers(Handle) of
{KeyLength, ValueLength} -> {KeyLength, ValueLength} ->
NextKey = read_next_string(Handle, KeyLength), NextKey = read_next_term(Handle, KeyLength),
NextPosition = Position + KeyLength + ValueLength + ?DWORD_SIZE, NextPosition = Position + KeyLength + ValueLength + ?DWORD_SIZE,
case NextPosition of case NextPosition of
FirstHashPosition -> FirstHashPosition ->
@ -243,10 +239,76 @@ get_nextkey(Handle, {Position, FirstHashPosition}) ->
end. end.
%% Fold over all of the objects in the file, applying FoldFun to each object
%% where FoldFun(K, V, Acc0) -> Acc.
%%
%% The first argument is either a file name (a string) or an already-open
%% file handle. When a file name is given the file is opened read-only for
%% the duration of the fold and is closed again before returning (also when
%% the fold crashes), so the raw handle is not leaked. A handle passed in by
%% the caller is left open.
%%
%% The position at which the fold stops (FirstHashPosition) is the first
%% integer stored at the beginning of the file; the scan of key/value
%% records starts at offset 256 * ?DWORD_SIZE.
fold(FileName, FoldFun, Acc0) when is_list(FileName) ->
    {ok, Handle} = file:open(FileName, [binary, raw, read]),
    try
        fold(Handle, FoldFun, Acc0)
    after
        %% The handle was opened here and is never exposed to the caller,
        %% so it is this clause's job to release it.
        file:close(Handle)
    end;
fold(Handle, FoldFun, Acc0) ->
    {ok, _} = file:position(Handle, bof),
    {FirstHashPosition, _} = read_next_2_integers(Handle),
    fold(Handle, FoldFun, Acc0, {256 * ?DWORD_SIZE, FirstHashPosition}, false).
%% Worker shared by fold/3 and fold_keys/3.
%%
%% Seeks to Position and, unless that is FirstHashPosition or the file has
%% ended, reads one record: the two length integers, then the key term.
%% With KeyOnly = true the accumulator is advanced with FoldFun(Key, Acc);
%% with KeyOnly = false the value is read with a CRC check and the
%% accumulator is advanced with FoldFun(Key, Value, Acc). A value failing
%% the CRC check is logged and skipped, leaving the accumulator unchanged.
%% The next record is expected at
%% Position + KeyLength + ValueLength + ?DWORD_SIZE.
fold(Handle, FoldFun, Acc0, {Position, FirstHashPosition}, KeyOnly) ->
    {ok, Position} = file:position(Handle, Position),
    if
        Position =:= FirstHashPosition ->
            %% Reached the start of the hash tables - no more records
            Acc0;
        true ->
            case read_next_2_integers(Handle) of
                eof ->
                    Acc0;
                {KeyLength, ValueLength} ->
                    Key = read_next_term(Handle, KeyLength),
                    Next = {Position + KeyLength + ValueLength + ?DWORD_SIZE,
                            FirstHashPosition},
                    Acc1 =
                        case KeyOnly of
                            true ->
                                FoldFun(Key, Acc0);
                            false ->
                                case read_next_term(Handle, ValueLength,
                                                    crc, ?CRC_CHECK) of
                                    {false, _} ->
                                        io:format("Skipping value for Key ~w as CRC check failed~n",
                                                  [Key]),
                                        Acc0;
                                    {_, Value} ->
                                        FoldFun(Key, Value, Acc0)
                                end
                        end,
                    fold(Handle, FoldFun, Acc1, Next, KeyOnly)
            end
    end.
%% Fold over all of the keys in the file, applying FoldFun(K, Acc0) -> Acc
%% to each key. This runs the same scan as fold/3 but with the key-only
%% flag set to true.
%%
%% The first argument is either a file name (a string) or an already-open
%% file handle. When a file name is given the file is opened read-only for
%% the duration of the fold and is closed again before returning (also when
%% the fold crashes), so the raw handle is not leaked. A handle passed in by
%% the caller is left open.
fold_keys(FileName, FoldFun, Acc0) when is_list(FileName) ->
    {ok, Handle} = file:open(FileName, [binary, raw, read]),
    try
        fold_keys(Handle, FoldFun, Acc0)
    after
        %% The handle was opened here and is never exposed to the caller,
        %% so it is this clause's job to release it.
        file:close(Handle)
    end;
fold_keys(Handle, FoldFun, Acc0) ->
    {ok, _} = file:position(Handle, bof),
    {FirstHashPosition, _} = read_next_2_integers(Handle),
    fold(Handle, FoldFun, Acc0, {256 * ?DWORD_SIZE, FirstHashPosition}, true).
%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%
%% Internal functions %% Internal functions
%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%
%% Finalise a file whose key/value pairs have all been written.
%%
%% BasePos should be the position at the end of the written KV pairs, which
%% is where the hash tables are to be placed. The handle is moved to
%% BasePos, the hash tables are written from HashTree, the top index table
%% is written from the resulting index list, and the handle is closed.
%% The result of file:close/1 is returned.
close_file(Handle, HashTree, BasePos) ->
    {ok, BasePos} = file:position(Handle, BasePos),
    IndexList = write_hash_tables(Handle, HashTree),
    write_top_index_table(Handle, BasePos, IndexList),
    file:close(Handle).
%% Fetch a list of positions by passing a key to the HashTree %% Fetch a list of positions by passing a key to the HashTree
get_hashtree(Key, HashTree) -> get_hashtree(Key, HashTree) ->
Hash = hash(Key), Hash = hash(Key),
@ -282,9 +344,9 @@ extract_kvpair(_, [], _, _) ->
extract_kvpair(Handle, [Position|Rest], Key, Check) -> extract_kvpair(Handle, [Position|Rest], Key, Check) ->
{ok, _} = file:position(Handle, Position), {ok, _} = file:position(Handle, Position),
{KeyLength, ValueLength} = read_next_2_integers(Handle), {KeyLength, ValueLength} = read_next_2_integers(Handle),
case read_next_string(Handle, KeyLength) of case read_next_term(Handle, KeyLength) of
Key -> % If same key as passed in, then found! Key -> % If same key as passed in, then found!
case read_next_string(Handle, ValueLength, crc, Check) of case read_next_term(Handle, ValueLength, crc, Check) of
{false, _} -> {false, _} ->
crc_wonky; crc_wonky;
{_, Value} -> {_, Value} ->
@ -301,10 +363,10 @@ scan_over_file(Handle, Position) ->
scan_over_file(Handle, Position, HashTree). scan_over_file(Handle, Position, HashTree).
scan_over_file(Handle, Position, HashTree) -> scan_over_file(Handle, Position, HashTree) ->
case read_next_2_integers(Handle) of case saferead_keyvalue(Handle) of
{KeyLength, ValueLength} -> false ->
Key = read_next_string(Handle, KeyLength), {Position, HashTree};
{ok, ValueAsBin} = file:read(Handle, ValueLength), {Key, ValueAsBin, KeyLength, ValueLength} ->
case crccheck_value(ValueAsBin) of case crccheck_value(ValueAsBin) of
true -> true ->
NewPosition = Position + KeyLength + ValueLength + ?DWORD_SIZE, NewPosition = Position + KeyLength + ValueLength + ?DWORD_SIZE,
@ -318,6 +380,34 @@ scan_over_file(Handle, Position, HashTree) ->
{Position, HashTree} {Position, HashTree}
end. end.
%% Read the key/value record at the current position of Handle.
%%
%% Returns {Key, ValueAsBin, KeyLength, ValueLength} on success, where Key
%% is the key term as returned by read_next_term/2 and ValueAsBin is the
%% value binary exactly as read from the file (it is not decoded and no CRC
%% check is made here).
%%
%% Returns false when the record cannot be read. For the length header this
%% covers every result of read_next_2_integers/1 that is not a pair of
%% integers: eof, an error tuple, or a short read at the end of a truncated
%% file (which comes back as {ok, ShortBin} and must not be mistaken for a
%% {KeyLength, ValueLength} pair). For the key and value reads, eof and
%% {error, einval} are mapped to false.
%%
%% NOTE(review): read_next_term/2 returns the decoded key itself, so a
%% stored key equal to the atom eof or the tuple {error, einval} cannot be
%% told apart from a read failure here - confirm such keys are never used.
saferead_keyvalue(Handle) ->
    case read_next_2_integers(Handle) of
        {KeyL, ValueL} when is_integer(KeyL), is_integer(ValueL) ->
            case read_next_term(Handle, KeyL) of
                {error, einval} ->
                    false;
                eof ->
                    false;
                Key ->
                    case file:read(Handle, ValueL) of
                        {error, einval} ->
                            false;
                        eof ->
                            false;
                        {ok, Value} ->
                            {Key, Value, KeyL, ValueL}
                    end
            end;
        _HeaderReadFailure ->
            false
    end.
%% The first four bytes of the value are the crc check %% The first four bytes of the value are the crc check
crccheck_value(Value) when byte_size(Value) >4 -> crccheck_value(Value) when byte_size(Value) >4 ->
<< Hash:32/integer, Tail/bitstring>> = Value, << Hash:32/integer, Tail/bitstring>> = Value,
@ -356,26 +446,30 @@ to_dict(FileName) ->
KeyValueList = dump(FileName), KeyValueList = dump(FileName),
dict:from_list(KeyValueList). dict:from_list(KeyValueList).
read_next_string(Handle, Length) -> read_next_term(Handle, Length) ->
{ok, Bin} = file:read(Handle, Length), case file:read(Handle, Length) of
binary_to_list(Bin). {ok, Bin} ->
binary_to_term(Bin);
ReadError ->
ReadError
end.
%% Read next string where the string has a CRC prepended - stripping the crc %% Read next string where the string has a CRC prepended - stripping the crc
%% and checking if requested %% and checking if requested
read_next_string(Handle, Length, crc, Check) -> read_next_term(Handle, Length, crc, Check) ->
case Check of case Check of
true -> true ->
{ok, <<CRC:32/integer, Bin/binary>>} = file:read(Handle, Length), {ok, <<CRC:32/integer, Bin/binary>>} = file:read(Handle, Length),
case calc_crc(Bin) of case calc_crc(Bin) of
CRC -> CRC ->
{true, binary_to_list(Bin)}; {true, binary_to_term(Bin)};
_ -> _ ->
{false, binary_to_list(Bin)} {false, binary_to_term(Bin)}
end; end;
_ -> _ ->
{ok, _} = file:position(Handle, {cur, 4}), {ok, _} = file:position(Handle, {cur, 4}),
{ok, Bin} = file:read(Handle, Length - 4), {ok, Bin} = file:read(Handle, Length - 4),
{unchecked, binary_to_list(Bin)} {unchecked, binary_to_term(Bin)}
end. end.
@ -386,9 +480,9 @@ read_next_2_integers(Handle) ->
case file:read(Handle,?DWORD_SIZE) of case file:read(Handle,?DWORD_SIZE) of
{ok, <<Int1:32,Int2:32>>} -> {ok, <<Int1:32,Int2:32>>} ->
{endian_flip(Int1), endian_flip(Int2)}; {endian_flip(Int1), endian_flip(Int2)};
MatchError ReadError
-> ->
MatchError ReadError
end. end.
%% Seach the hash table for the matching hash and key. Be prepared for %% Seach the hash table for the matching hash and key. Be prepared for
@ -398,7 +492,6 @@ search_hash_table(_Handle, [], _Hash, _Key, _CRCCHeck) ->
search_hash_table(Handle, [Entry|RestOfEntries], Hash, Key, CRCCheck) -> search_hash_table(Handle, [Entry|RestOfEntries], Hash, Key, CRCCheck) ->
{ok, _} = file:position(Handle, Entry), {ok, _} = file:position(Handle, Entry),
{StoredHash, DataLoc} = read_next_2_integers(Handle), {StoredHash, DataLoc} = read_next_2_integers(Handle),
io:format("looking in data location ~w~n", [DataLoc]),
case StoredHash of case StoredHash of
Hash -> Hash ->
KV = extract_kvpair(Handle, [DataLoc], Key, CRCCheck), KV = extract_kvpair(Handle, [DataLoc], Key, CRCCheck),
@ -432,7 +525,7 @@ write_key_value_pairs(_, [], Acc) ->
Acc; Acc;
write_key_value_pairs(Handle, [HeadPair|TailList], Acc) -> write_key_value_pairs(Handle, [HeadPair|TailList], Acc) ->
{Key, Value} = HeadPair, {Key, Value} = HeadPair,
{NewPosition, HashTree} = put(Handle, Key, Value, Acc), {Handle, NewPosition, HashTree} = put(Handle, Key, Value, Acc),
write_key_value_pairs(Handle, TailList, {NewPosition, HashTree}). write_key_value_pairs(Handle, TailList, {NewPosition, HashTree}).
%% Write the actual hashtables at the bottom of the file. Each hash table %% Write the actual hashtables at the bottom of the file. Each hash table
@ -549,11 +642,13 @@ endian_flip(Int) ->
X. X.
hash(Key) -> hash(Key) ->
BK = term_to_binary(Key),
H = 5381, H = 5381,
hash1(H,Key) band 16#FFFFFFFF. hash1(H, BK) band 16#FFFFFFFF.
hash1(H,[]) ->H; hash1(H, <<>>) ->
hash1(H,[B|Rest]) -> H;
hash1(H, <<B:8/integer, Rest/bytes>>) ->
H1 = H * 33, H1 = H * 33,
H2 = H1 bxor B, H2 = H1 bxor B,
hash1(H2, Rest). hash1(H2, Rest).
@ -568,40 +663,20 @@ hash_to_slot(Hash,L) ->
%% Create a binary of the LengthKeyLengthValue, adding a CRC check %% Create a binary of the LengthKeyLengthValue, adding a CRC check
%% at the front of the value %% at the front of the value
key_value_to_record({Key, Value}) -> key_value_to_record({Key, Value}) ->
L1 = endian_flip(length(Key)), BK = term_to_binary(Key),
L2 = endian_flip(length(Value) + 4), BV = term_to_binary(Value),
LB1 = list_to_binary(Key), LK = byte_size(BK),
LB2 = list_to_binary(Value), LV = byte_size(BV),
CRC = calc_crc(LB2), LK_FL = endian_flip(LK),
<<L1:32,L2:32,LB1/binary,CRC:32/integer,LB2/binary>>. LV_FL = endian_flip(LV + 4),
CRC = calc_crc(BV),
<<LK_FL:32, LV_FL:32, BK:LK/binary, CRC:32/integer, BV:LV/binary>>.
%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%
% T E S T % T E S T
%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%
-ifdef(TEST).
hash_1_test() ->
Hash = hash("key1"),
?assertMatch(Hash,2088047427).
hash_to_index_1_test() ->
Hash = hash("key1"),
Index = hash_to_index(Hash),
?assertMatch(Index,67).
hash_to_index_2_test() ->
Hash = 256,
I = hash_to_index(Hash),
?assertMatch(I,0).
hash_to_index_3_test() ->
Hash = 268,
I = hash_to_index(Hash),
?assertMatch(I,12).
hash_to_index_4_test() ->
Hash = hash("key2"),
Index = hash_to_index(Hash),
?assertMatch(Index,64).
write_key_value_pairs_1_test() -> write_key_value_pairs_1_test() ->
{ok,Handle} = file:open("test.cdb",write), {ok,Handle} = file:open("test.cdb",write),
@ -612,8 +687,11 @@ write_key_value_pairs_1_test() ->
Index2 = hash_to_index(Hash2), Index2 = hash_to_index(Hash2),
R0 = array:new(256, {default, gb_trees:empty()}), R0 = array:new(256, {default, gb_trees:empty()}),
R1 = array:set(Index1, gb_trees:insert(Hash1, [0], array:get(Index1, R0)), R0), R1 = array:set(Index1, gb_trees:insert(Hash1, [0], array:get(Index1, R0)), R0),
R2 = array:set(Index2, gb_trees:insert(Hash2, [22], array:get(Index2, R1)), R1), R2 = array:set(Index2, gb_trees:insert(Hash2, [30], array:get(Index2, R1)), R1),
?assertMatch(R2, HashTree). io:format("HashTree is ~w~n", [HashTree]),
io:format("Expected HashTree is ~w~n", [R2]),
?assertMatch(R2, HashTree),
ok = file:delete("test.cdb").
write_hash_tables_1_test() -> write_hash_tables_1_test() ->
@ -623,7 +701,8 @@ write_hash_tables_1_test() ->
R2 = array:set(67, gb_trees:insert(6383014723, [0], array:get(67, R1)), R1), R2 = array:set(67, gb_trees:insert(6383014723, [0], array:get(67, R1)), R1),
Result = write_hash_tables(Handle, R2), Result = write_hash_tables(Handle, R2),
io:format("write hash tables result of ~w ~n", [Result]), io:format("write hash tables result of ~w ~n", [Result]),
?assertMatch(Result,[{67,16,2},{64,0,2}]). ?assertMatch(Result,[{67,16,2},{64,0,2}]),
ok = file:delete("test.cdb").
find_open_slot_1_test() -> find_open_slot_1_test() ->
List = [<<1:32,1:32>>,<<0:32,0:32>>,<<1:32,1:32>>,<<1:32,1:32>>], List = [<<1:32,1:32>>,<<0:32,0:32>>,<<1:32,1:32>>,<<1:32,1:32>>],
@ -654,7 +733,8 @@ full_1_test() ->
List1 = lists:sort([{"key1","value1"},{"key2","value2"}]), List1 = lists:sort([{"key1","value1"},{"key2","value2"}]),
create("simple.cdb",lists:sort([{"key1","value1"},{"key2","value2"}])), create("simple.cdb",lists:sort([{"key1","value1"},{"key2","value2"}])),
List2 = lists:sort(dump("simple.cdb")), List2 = lists:sort(dump("simple.cdb")),
?assertMatch(List1,List2). ?assertMatch(List1,List2),
ok = file:delete("simple.cdb").
full_2_test() -> full_2_test() ->
List1 = lists:sort([{lists:flatten(io_lib:format("~s~p",[Prefix,Plug])), List1 = lists:sort([{lists:flatten(io_lib:format("~s~p",[Prefix,Plug])),
@ -664,7 +744,8 @@ full_2_test() ->
"tiep4||","qweq"]]), "tiep4||","qweq"]]),
create("full.cdb",List1), create("full.cdb",List1),
List2 = lists:sort(dump("full.cdb")), List2 = lists:sort(dump("full.cdb")),
?assertMatch(List1,List2). ?assertMatch(List1,List2),
ok = file:delete("full.cdb").
from_dict_test() -> from_dict_test() ->
D = dict:new(), D = dict:new(),
@ -676,7 +757,8 @@ from_dict_test() ->
D3 = lists:sort(dict:to_list(D2)), D3 = lists:sort(dict:to_list(D2)),
io:format("KVP is ~w~n", [KVP]), io:format("KVP is ~w~n", [KVP]),
io:format("D3 is ~w~n", [D3]), io:format("D3 is ~w~n", [D3]),
?assertMatch(KVP,D3). ?assertMatch(KVP, D3),
ok = file:delete("from_dict_test.cdb").
to_dict_test() -> to_dict_test() ->
D = dict:new(), D = dict:new(),
@ -686,7 +768,8 @@ to_dict_test() ->
Dict = to_dict("from_dict_test.cdb"), Dict = to_dict("from_dict_test.cdb"),
D3 = lists:sort(dict:to_list(D2)), D3 = lists:sort(dict:to_list(D2)),
D4 = lists:sort(dict:to_list(Dict)), D4 = lists:sort(dict:to_list(Dict)),
?assertMatch(D4,D3). ?assertMatch(D4,D3),
ok = file:delete("from_dict_test.cdb").
crccheck_emptyvalue_test() -> crccheck_emptyvalue_test() ->
?assertMatch(false, crccheck_value(<<>>)). ?assertMatch(false, crccheck_value(<<>>)).
@ -729,9 +812,10 @@ activewrite_singlewrite_test() ->
{LastPosition, KeyDict} = open_active_file("test_mem.cdb"), {LastPosition, KeyDict} = open_active_file("test_mem.cdb"),
io:format("File opened as new active file " io:format("File opened as new active file "
"with LastPosition=~w ~n", [LastPosition]), "with LastPosition=~w ~n", [LastPosition]),
{_, UpdKeyDict} = put("test_mem.cdb", Key, Value, {LastPosition, KeyDict}), {_, _, UpdKeyDict} = put("test_mem.cdb", Key, Value, {LastPosition, KeyDict}),
io:format("New key and value added to active file ~n", []), io:format("New key and value added to active file ~n", []),
?assertMatch({Key, Value}, get_mem(Key, "test_mem.cdb", UpdKeyDict)). ?assertMatch({Key, Value}, get_mem(Key, "test_mem.cdb", UpdKeyDict)),
ok = file:delete("test_mem.cdb").
search_hash_table_findinslot_test() -> search_hash_table_findinslot_test() ->
Key1 = "key1", % this is in slot 3 if count is 8 Key1 = "key1", % this is in slot 3 if count is 8
@ -766,10 +850,11 @@ search_hash_table_findinslot_test() ->
ok = file:pwrite(Handle, FirstHashPosition + (Slot -1) * ?DWORD_SIZE, RBin), ok = file:pwrite(Handle, FirstHashPosition + (Slot -1) * ?DWORD_SIZE, RBin),
ok = file:close(Handle), ok = file:close(Handle),
io:format("Find key following change to hash table~n"), io:format("Find key following change to hash table~n"),
?assertMatch(missing, get("hashtable1_test.cdb", Key1)). ?assertMatch(missing, get("hashtable1_test.cdb", Key1)),
ok = file:delete("hashtable1_test.cdb").
getnextkey_test() -> getnextkey_inclemptyvalue_test() ->
L = [{"K9", "V9"}, {"K2", "V2"}, {"K3", "V3"}, L = [{"K9", "V9"}, {"K2", "V2"}, {"K3", ""},
{"K4", "V4"}, {"K5", "V5"}, {"K6", "V6"}, {"K7", "V7"}, {"K4", "V4"}, {"K5", "V5"}, {"K6", "V6"}, {"K7", "V7"},
{"K8", "V8"}, {"K1", "V1"}], {"K8", "V8"}, {"K1", "V1"}],
ok = create("hashtable1_test.cdb", L), ok = create("hashtable1_test.cdb", L),
@ -778,7 +863,8 @@ getnextkey_test() ->
?assertMatch("K9", FirstKey), ?assertMatch("K9", FirstKey),
{SecondKey, Handle, P2} = get_nextkey(Handle, P1), {SecondKey, Handle, P2} = get_nextkey(Handle, P1),
?assertMatch("K2", SecondKey), ?assertMatch("K2", SecondKey),
{_, Handle, P3} = get_nextkey(Handle, P2), {ThirdKeyNoValue, Handle, P3} = get_nextkey(Handle, P2),
?assertMatch("K3", ThirdKeyNoValue),
{_, Handle, P4} = get_nextkey(Handle, P3), {_, Handle, P4} = get_nextkey(Handle, P3),
{_, Handle, P5} = get_nextkey(Handle, P4), {_, Handle, P5} = get_nextkey(Handle, P4),
{_, Handle, P6} = get_nextkey(Handle, P5), {_, Handle, P6} = get_nextkey(Handle, P5),
@ -786,19 +872,114 @@ getnextkey_test() ->
{_, Handle, P8} = get_nextkey(Handle, P7), {_, Handle, P8} = get_nextkey(Handle, P7),
{LastKey, Info} = get_nextkey(Handle, P8), {LastKey, Info} = get_nextkey(Handle, P8),
?assertMatch(nomorekeys, Info), ?assertMatch(nomorekeys, Info),
?assertMatch("K1", LastKey). ?assertMatch("K1", LastKey),
ok = file:delete("hashtable1_test.cdb").
newactivefile_test() -> newactivefile_test() ->
{LastPosition, _} = open_active_file("activefile_test.cdb"), {LastPosition, _} = open_active_file("activefile_test.cdb"),
?assertMatch(256 * ?DWORD_SIZE, LastPosition), ?assertMatch(256 * ?DWORD_SIZE, LastPosition),
Response = get_nextkey("activefile_test.cdb"), Response = get_nextkey("activefile_test.cdb"),
?assertMatch(nomorekeys, Response). ?assertMatch(nomorekeys, Response),
ok = file:delete("activefile_test.cdb").
%% Round-trip a dict containing empty-string values through from_dict/2 and
%% dump/1, and check that the dumped pairs equal the dict contents.
emptyvalue_fromdict_test() ->
    FileName = "from_dict_test_ev.cdb",
    Dict = dict:from_list([{"K1", "V1"}, {"K2", ""},
                           {"K3", "V3"}, {"K4", ""}]),
    ok = from_dict(FileName, Dict),
    io:format("Store created ~n", []),
    KVP = lists:sort(dump(FileName)),
    D_Result = lists:sort(dict:to_list(Dict)),
    io:format("KVP is ~w~n", [KVP]),
    io:format("D_Result is ~w~n", [D_Result]),
    ?assertMatch(KVP, D_Result),
    ok = file:delete(FileName).
%% Store five objects keyed {"Key1", Seq} with values 2, 4, 8, 16, 32 and
%% fold over the file summing the values whose sequence number is greater
%% than FromSN. With FromSN = 2 the expected sum is 8 + 16 + 32 = 56.
fold_test() ->
    FileName = "fold_test.cdb",
    KVList = [{{"Key1", Seq}, 1 bsl Seq} || Seq <- lists:seq(1, 5)],
    ok = from_dict(FileName, dict:from_list(KVList)),
    FromSN = 2,
    SumAfterSN =
        fun({_Key, Seq}, V, Acc) when Seq > FromSN ->
                Acc + V;
           ({_Key, _Seq}, _V, Acc) ->
                Acc
        end,
    ?assertMatch(56, fold(FileName, SumAfterSN, 0)),
    ok = file:delete(FileName).
%% Store five objects keyed {"KeyN", N} (N = 1..5) and fold over the keys
%% only, collecting the name part of every key whose sequence number is
%% greater than FromSN. With FromSN = 2 that is Key3, Key4 and Key5.
fold_keys_test() ->
    FileName = "fold_keys_test.cdb",
    KVList = [{{"Key" ++ integer_to_list(N), N}, 1 bsl N}
              || N <- lists:seq(1, 5)],
    ok = from_dict(FileName, dict:from_list(KVList)),
    FromSN = 2,
    CollectAfterSN =
        fun({Key, Seq}, Acc) when Seq > FromSN ->
                Acc ++ [Key];
           ({_Key, _Seq}, Acc) ->
                Acc
        end,
    Result = fold_keys(FileName, CollectAfterSN, []),
    ?assertMatch(["Key3", "Key4", "Key5"], lists:sort(Result)),
    ok = file:delete(FileName).
%% Fold over a file holding several sequence numbers per key name and keep,
%% for each key name, the entry with the highest sequence number.
%%
%% The folded dict is compared with the expectation as a sorted key/value
%% list rather than dict-to-dict: the internal representation of a dict is
%% not defined, and the order in which entries were inserted can change it,
%% so two dicts with equal contents are not guaranteed to be equal terms.
fold2_test() ->
    K1 = {"Key1", 1},
    V1 = 2,
    K2 = {"Key1", 2},
    V2 = 4,
    K3 = {"Key1", 3},
    V3 = 8,
    K4 = {"Key1", 4},
    V4 = 16,
    K5 = {"Key1", 5},
    V5 = 32,
    K6 = {"Key2", 1},
    V6 = 64,
    D = dict:from_list([{K1, V1}, {K2, V2}, {K3, V3},
                        {K4, V4}, {K5, V5}, {K6, V6}]),
    ok = from_dict("fold2_test.cdb", D),
    FoldFun = fun(K, V, Acc) ->
                      {Key, Seq} = K,
                      case dict:find(Key, Acc) of
                          error ->
                              dict:store(Key, {Seq, V}, Acc);
                          {ok, {LSN, _V}} when Seq > LSN ->
                              dict:store(Key, {Seq, V}, Acc);
                          _ ->
                              Acc
                      end
              end,
    Result = fold("fold2_test.cdb", FoldFun, dict:new()),
    ?assertMatch([{"Key1", {5, 32}}, {"Key2", {1, 64}}],
                 lists:sort(dict:to_list(Result))),
    ok = file:delete("fold2_test.cdb").
-endif.

View file

@ -1,58 +1,69 @@
-module(leveled_internal). -module(leveled_iterator).
-export([termiterator/6]). -export([termiterator/3]).
-include_lib("eunit/include/eunit.hrl"). -include_lib("eunit/include/eunit.hrl").
%% We will have a sorted list of terms %% Takes a list of terms to iterate - the terms being sorted in Erlang term
%% Some terms will be dummy terms which are pointers to more terms which can be %% order
%% found. If a pointer is hit need to replenish the term list before
%% proceeding.
%% %%
%% Helper Functions should have free functions - %% Helper Functions should have free functions -
%% {FolderFun, CompareFun, PointerCheck} %% {FolderFun, CompareFun, PointerCheck, PointerFetch}
%% FolderFun - function which takes the next item and the accumulator and %% FolderFun - function which takes the next item and the accumulator and
%% returns an updated accumulator %% returns an updated accumulator. Note FolderFun can only increase the
%% accumulator by one entry each time
%% CompareFun - function which should be able to compare two keys (which are %% CompareFun - function which should be able to compare two keys (which are
%% not pointers), and return a winning item (or combination of items) %% not pointers), and return a winning item (or combination of items)
%% PointerCheck - function for differentiating between keys and pointer %% PointerCheck - function for differentiating between keys and pointer
%% PointerFetch - function that takes a pointer and an EndKey (which may be
%% infinite) and returns a new slice of ordered results from that pointer
%%
%% Range can be of the form
%% {StartKey, EndKey, MaxKeys} where EndKey or MaxKeys can be infinite (but
%% not both)
%% Entry point for iteration over ListToIterate using HelperFuns, limited by
%% Range = {StartKey, EndKey, MaxKeys}.
%%
%% Returns bad_iterator when both EndKey and MaxKeys are the atom infinite,
%% as such an iteration would have no bound. Otherwise the iteration is
%% started with no head item (null) and an empty accumulator, and the result
%% of that iteration is returned.
termiterator(ListToIterate, HelperFuns, Range) ->
    case Range of
        {_, infinite, infinite} ->
            bad_iterator;
        _ ->
            termiterator(null, ListToIterate, [], HelperFuns, Range)
    end.
termiterator(HeadItem, [], Acc, HelperFuns, _) ->
case HeadItem of case HeadItem of
null -> null ->
Acc; Acc;
_ -> _ ->
{FolderFun, _, _} = HelperFuns, {FolderFun, _, _, _} = HelperFuns,
FolderFun(Acc, HeadItem) FolderFun(Acc, HeadItem)
end; end;
termiterator(null, [NextItem|TailList], Acc, HelperFuns, termiterator(null, [NextItem|TailList], Acc, HelperFuns, Range) ->
StartKey, EndKey) ->
%% Check that the NextItem is not a pointer before promoting to HeadItem %% Check that the NextItem is not a pointer before promoting to HeadItem
%% Cannot now promote a HeadItem which is a pointer %% Cannot now promote a HeadItem which is a pointer
{_, _, PointerCheck} = HelperFuns, {_, _, PointerCheck, PointerFetch} = HelperFuns,
case PointerCheck(NextItem) of case PointerCheck(NextItem) of
{true, Pointer} -> {true, Pointer} ->
NewSlice = getnextslice(Pointer, EndKey), {_, EndKey, _} = Range,
NewSlice = PointerFetch(Pointer, EndKey),
ExtendedList = lists:merge(NewSlice, TailList), ExtendedList = lists:merge(NewSlice, TailList),
termiterator(null, ExtendedList, Acc, HelperFuns, termiterator(null, ExtendedList, Acc, HelperFuns, Range);
StartKey, EndKey);
false -> false ->
termiterator(NextItem, TailList, Acc, HelperFuns, termiterator(NextItem, TailList, Acc, HelperFuns, Range)
StartKey, EndKey)
end; end;
termiterator(HeadItem, [NextItem|TailList], Acc, HelperFuns, termiterator(HeadItem, [NextItem|TailList], Acc, HelperFuns, Range) ->
StartKey, EndKey) -> {FolderFun, CompareFun, PointerCheck, PointerFetch} = HelperFuns,
{FolderFun, CompareFun, PointerCheck} = HelperFuns, {_, EndKey, MaxItems} = Range,
%% HeadItem cannot be pointer, but NextItem might be, so check before %% HeadItem cannot be pointer, but NextItem might be, so check before
%% comparison %% comparison
case PointerCheck(NextItem) of case PointerCheck(NextItem) of
{true, Pointer} -> {true, Pointer} ->
NewSlice = getnextslice(Pointer, EndKey), NewSlice = PointerFetch(Pointer, EndKey),
ExtendedList = lists:merge(NewSlice, [NextItem|TailList]), ExtendedList = lists:merge(NewSlice, [HeadItem|TailList]),
termiterator(null, ExtendedList, Acc, HelperFuns, termiterator(null, ExtendedList, Acc, HelperFuns, Range);
StartKey, EndKey);
false -> false ->
%% Compare to see if Head and Next match, or if Head is a winner %% Compare to see if Head and Next match, or if Head is a winner
%% to be added to accumulator %% to be added to accumulator
@ -60,39 +71,65 @@ termiterator(HeadItem, [NextItem|TailList], Acc, HelperFuns,
{match, StrongItem, _WeakItem} -> {match, StrongItem, _WeakItem} ->
%% Discard WeakItem, Strong Item might be an aggregation of %% Discard WeakItem, Strong Item might be an aggregation of
%% the items %% the items
termiterator(StrongItem, TailList, Acc, HelperFuns, termiterator(StrongItem, TailList, Acc, HelperFuns, Range);
StartKey, EndKey);
{winner, HeadItem} -> {winner, HeadItem} ->
%% Add next item to accumulator, and proceed with next item %% Add next item to accumulator, and proceed with next item
AccPlus = FolderFun(Acc, HeadItem), AccPlus = FolderFun(Acc, HeadItem),
termiterator(NextItem, TailList, AccPlus, HelperFuns, case length(AccPlus) of
HeadItem, EndKey) MaxItems ->
AccPlus;
_ ->
termiterator(NextItem, TailList, AccPlus,
HelperFuns,
{HeadItem, EndKey, MaxItems})
end
end end
end. end.
%% Initial forms of keys supported are Index Keys and Object Keys
%%
%% All keys are of the form {Key, Value, SequenceNumber, State}
%%
%% The Key will be of the form:
%% {o, Bucket, Key} - for an Object Key
%% {i, Bucket, IndexName, IndexTerm, Key} - for an Index Key
%%
%% The value will be of the form:
%% {o, ObjectHash, [vector-clocks]} - for an Object Key
%% null - for an Index Key
%%
%% Sequence number is the sequence number at which the key was added, and the
%% highest sequence number in the list of keys for an index key.
%%
%% State can be one of the following:
%% live - an active key
%% tomb - a tombstone key
%% {timestamp, TS} - an active key to a certain timestamp
%% {pointer, Pointer} - to be added by iterators to indicate further data
%% available in the range from a particular source
%% Report whether an iterator item is a pointer to further data rather than a
%% real key.
%%
%% Items have the form {Key, Value, SequenceNumber, State}; a State of
%% {pointer, Pointer} marks a pointer item.
%%
%% Returns {true, Pointer} for a pointer item, and false for any other term.
pointercheck_indexkey({_Key, _Values, _Sequence, {pointer, Pointer}}) ->
    {true, Pointer};
pointercheck_indexkey(_NotAPointer) ->
    false.
%% Fold function for index keys: apply one iterator item to the accumulator.
%%
%% Acc      - list of object keys collected so far
%% IndexKey - item of the form {Key, Value, SequenceNumber, State}
%%
%% A tomb item contributes nothing and Acc is returned unchanged.  A live item
%% must carry a key of the form {i, Bucket, IndexName, IndexTerm, ObjectKey};
%% its ObjectKey is appended to the end of Acc so results keep iteration order.
%% Any other State fails with case_clause, and a live item whose key is not an
%% index key fails with badmatch.
folder_indexkey(Acc, IndexKey) ->
    case IndexKey of
        {_Key, _Value, _Sequence, tomb} ->
            Acc;
        {Key, _Value, _Sequence, live} ->
            {i, _, _, _, ObjectKey} = Key,
            %% Append (rather than prepend) because callers compare the
            %% accumulator directly against results in ascending key order.
            lists:append(Acc, [ObjectKey])
    end.
compare_indexkey(IndexKey1, IndexKey2) -> compare_indexkey(IndexKey1, IndexKey2) ->
{i, Bucket1, Index1, Term1, Key1, Sequence1, _Value1} = IndexKey1, {{i, Bucket1, Index1, Term1, Key1}, _Val1, Sequence1, _St1} = IndexKey1,
{i, Bucket2, Index2, Term2, Key2, Sequence2, _Value2} = IndexKey2, {{i, Bucket2, Index2, Term2, Key2}, _Val2, Sequence2, _St2} = IndexKey2,
case {Bucket1, Index1, Term1, Key1} of case {Bucket1, Index1, Term1, Key1} of
{Bucket2, Index2, Term2, Key2} when Sequence1 >= Sequence2 -> {Bucket2, Index2, Term2, Key2} when Sequence1 >= Sequence2 ->
{match, IndexKey1, IndexKey2}; {match, IndexKey1, IndexKey2};
@ -105,6 +142,9 @@ compare_indexkey(IndexKey1, IndexKey2) ->
end. end.
%% Unit tests
getnextslice(Pointer, _EndKey) -> getnextslice(Pointer, _EndKey) ->
case Pointer of case Pointer of
{test, NewList} -> {test, NewList} ->
@ -114,18 +154,43 @@ getnextslice(Pointer, _EndKey) ->
end. end.
%% Iterate over a sorted list of index keys that contains no pointer items.
%%
%% Key1 and Key2 share the same key; Key2 has the higher sequence number and
%% is a tombstone, so "10001" is expected to be absent from the result.  The
%% remaining live keys are expected back as their object keys.
iterateoverindexkeyswithnopointer_test() ->
    Key1 = {{i, "pdsRecord", "familyName_bin", "1972SMITH", "10001"},
            null, 1, live},
    Key2 = {{i, "pdsRecord", "familyName_bin", "1972SMITH", "10001"},
            null, 2, tomb},
    Key3 = {{i, "pdsRecord", "familyName_bin", "1971SMITH", "10002"},
            null, 2, live},
    Key4 = {{i, "pdsRecord", "familyName_bin", "1972JONES", "10003"},
            null, 2, live},
    KeyList = lists:sort([Key1, Key2, Key3, Key4]),
    HelperFuns = {fun folder_indexkey/2, fun compare_indexkey/2,
                  fun pointercheck_indexkey/1, fun getnextslice/2},
    ?assertMatch(["10002", "10003"],
                 termiterator(KeyList, HelperFuns, {"1971", "1973", infinite})).
%% Iterate over a sorted list of index keys that includes a pointer item.
%%
%% Key6 is a pointer item whose pointer is {test, [Key5]}; it is resolved via
%% the getnextslice/2 helper passed in HelperFuns.  The first assertion expects
%% the object key of Key5 ("10004") to appear in the result alongside the live
%% keys already in the list.  The second assertion passes a maximum of 2 items
%% in the range tuple and expects only the first two results back.
iterateoverindexkeyswithpointer_test() ->
    Key1 = {{i, "pdsRecord", "familyName_bin", "1972SMITH", "10001"},
            null, 1, live},
    Key2 = {{i, "pdsRecord", "familyName_bin", "1972SMITH", "10001"},
            null, 2, tomb},
    Key3 = {{i, "pdsRecord", "familyName_bin", "1971SMITH", "10002"},
            null, 2, live},
    Key4 = {{i, "pdsRecord", "familyName_bin", "1972JONES", "10003"},
            null, 2, live},
    Key5 = {{i, "pdsRecord", "familyName_bin", "1972ZAFRIDI", "10004"},
            null, 2, live},
    Key6 = {{i, "pdsRecord", "familyName_bin", "1972JONES", "10004"},
            null, 0, {pointer, {test, [Key5]}}},
    KeyList = lists:sort([Key1, Key2, Key3, Key4, Key6]),
    HelperFuns = {fun folder_indexkey/2, fun compare_indexkey/2,
                  fun pointercheck_indexkey/1, fun getnextslice/2},
    ?assertMatch(["10002", "10003", "10004"],
                 termiterator(KeyList, HelperFuns, {"1971", "1973", infinite})),
    ?assertMatch(["10002", "10003"],
                 termiterator(KeyList, HelperFuns, {"1971", "1973", 2})).

View file

@ -1,7 +1,11 @@
-module(lookup_test). -module(lookup_test).
-export([go_dict/1, go_ets/1, go_gbtree/1, -export([go_dict/1,
go_arrayofdict/1, go_arrayofgbtree/1, go_arrayofdict_withcache/1]). go_ets/1,
go_gbtree/1,
go_arrayofdict/1,
go_arrayofgbtree/1,
go_arrayofdict_withcache/1]).
-define(CACHE_SIZE, 512). -define(CACHE_SIZE, 512).