diff --git a/src/activefile_test.cdb b/src/activefile_test.cdb new file mode 100644 index 0000000..e9784eb Binary files /dev/null and b/src/activefile_test.cdb differ diff --git a/src/eleveleddb.app.src b/src/eleveleddb.app.src new file mode 100644 index 0000000..1be8c00 --- /dev/null +++ b/src/eleveleddb.app.src @@ -0,0 +1,17 @@ +{application, eleveleddb, + [ + {description, ""}, + {vsn, "0.0.1"}, + {modules, []}, + {registered, []}, + {applications, [ + kernel, + stdlib + ]}, + {mod, {eleveleddb_app, []}}, + {env, [ + %% Default max file size (in bytes) + {max_file_size, 32#80000000}, % 4GB default + + ]} + ]}. \ No newline at end of file diff --git a/src/from_dict_test.cdb b/src/from_dict_test.cdb new file mode 100644 index 0000000..5cbf317 Binary files /dev/null and b/src/from_dict_test.cdb differ diff --git a/src/full.cdb b/src/full.cdb new file mode 100644 index 0000000..ffa584b Binary files /dev/null and b/src/full.cdb differ diff --git a/src/hashtable1_test.cdb b/src/hashtable1_test.cdb new file mode 100644 index 0000000..3a9c88b Binary files /dev/null and b/src/hashtable1_test.cdb differ diff --git a/src/leveled_bst.beam b/src/leveled_bst.beam new file mode 100644 index 0000000..09c8b2b Binary files /dev/null and b/src/leveled_bst.beam differ diff --git a/src/leveled_bst.erl b/src/leveled_bst.erl new file mode 100644 index 0000000..886705a --- /dev/null +++ b/src/leveled_bst.erl @@ -0,0 +1,156 @@ +%% +%% This module provides functions for managing bst files - a modified version +%% of sst files, to be used in leveleddb. 
+%% bst files are borken into the following sections: +%% - Header (fixed width 32 bytes - containing pointers and metadata) +%% - Blocks (variable length) +%% - Slots (variable length) +%% - Footer (variable length - contains slot index and helper metadata) +%% +%% The 32-byte header is made up of +%% - 1 byte version (major 5 bits, minor 3 bits) - default 0.1 +%% - 1 byte state bits (1 bit to indicate mutability, 1 for use of compression) +%% - 4 bytes footer position +%% - 4 bytes slot list length +%% - 4 bytes helper length +%% - 14 bytes spare for future options +%% - 4 bytes CRC (header) +%% +%% The Blocks is a series of blocks of: +%% - 4 byte block length +%% - variable-length compressed list of 32 keys & values +%% - 4 byte CRC for block +%% There will be up to 4000 blocks in a single bst file +%% +%% The slots is a series of references +%% - 4 byte bloom-filter length +%% - 4 byte key-helper length +%% - a variable-length compressed bloom filter for all keys in slot (approx 1KB) +%% - 32 ordered variable-length key helpers pointing to first key in each +%% block (in slot) of the form Key Length, Key, Block Position +%% - 4 byte CRC for the slot +%% +%% The slot index in the footer is made up of 128 keys and pointers at the +%% the start of each slot +%% - 128 Key Length (4 byte), Key, Position (4 byte) indexes +%% - 4 bytes CRC for the index +%% +%% The format of the file is intended to support quick lookups, whilst +%% allowing for a new file to be written incrementally (so that all keys and +%% values need not be retained in memory) - perhaps n blocks at a time + + +-module(leveled_bst). + +-export([start_file/1, convert_header/1]). + +-include_lib("eunit/include/eunit.hrl"). + +-define(WORD_SIZE, 4). +-define(CURRENT_VERSION, {0,1}). +-define(SLOT_COUNT, 128). +-define(BLOCK_SIZE, 32). +-define(SLOT_SIZE, 32). 
+ +-record(metadata, {version = ?CURRENT_VERSION :: tuple(), + mutable = false :: true | false, + compressed = true :: tre | false, + slot_list :: list(), + cache :: tuple(), + smallest_key :: tuple(), + largest_key :: tuple(), + smallest_sqn :: integer(), + largest_sqn :: integer() + }). + +%% Start a bare file with an initial header and no further details +%% Return the {Handle, metadata record} +start_file(FileName) when is_list(FileName) -> + {ok, Handle} = file:open(FileName, [binary, raw, read, write]), + start_file(Handle); +start_file(Handle) -> + Header = create_header(initial), + {ok, _} = file:position(Handle, bof), + file:write(Handle, Header), + {Version, {M, C}, _, _} = convert_header(Header), + FileMD = #metadata{version=Version, mutable=M, compressed=C}, + SlotArray = array:new(?SLOT_COUNT), + {Handle, FileMD, SlotArray}. + + +create_header(initial) -> + {Major, Minor} = ?CURRENT_VERSION, + Version = <>, + State = <<0:6, 1:1, 1:1>>, % Mutable and compressed + Lengths = <<0:32, 0:32, 0:32>>, + Options = <<0:112>>, + H1 = <>, + CRC32 = erlang:crc32(H1), + <

>. + + +convert_header(Header) -> + <> = Header, + case erlang:crc32(H1) of + CRC32 -> + <> = H1, + case {Major, Minor} of + {0, 1} -> + convert_header_v01(H1); + _ -> + unknown_version + end; + _ -> + crc_mismatch + end. + +convert_header_v01(Header) -> + <<_:8, 0:6, Mutable:1, Comp:1, + FooterP:32/integer, SlotLng:32/integer, HlpLng:32/integer, + _/binary>> = Header, + case Mutable of + 1 -> M = true; + 0 -> M = false + end, + case Comp of + 1 -> C = true; + 0 -> C = false + end, + {{0, 1}, {M, C}, {FooterP, SlotLng, HlpLng}, none}. + + + + +%%%%%%%%%%%%%%%% +% T E S T +%%%%%%%%%%%%%%% + +empty_header_test() -> + Header = create_header(initial), + ?assertMatch(32, byte_size(Header)), + <> = Header, + ?assertMatch({0, 1}, {Major, Minor}), + {Version, State, Lengths, Options} = convert_header(Header), + ?assertMatch({0, 1}, Version), + ?assertMatch({true, true}, State), + ?assertMatch({0, 0, 0}, Lengths), + ?assertMatch(none, Options). + +bad_header_test() -> + Header = create_header(initial), + <<_:1/binary, Rest/binary >> = Header, + HdrDetails1 = convert_header(<<0:5/integer, 2:3/integer, Rest/binary>>), + ?assertMatch(crc_mismatch, HdrDetails1), + <<_:1/binary, RestToCRC:27/binary, _:32/integer>> = Header, + NewHdr1 = <<0:5/integer, 2:3/integer, RestToCRC/binary>>, + CRC32 = erlang:crc32(NewHdr1), + NewHdr2 = <>, + ?assertMatch(unknown_version, convert_header(NewHdr2)). + +record_onstartfile_test() -> + {_, FileMD, _} = start_file("onstartfile.bst"), + ?assertMatch({0, 1}, FileMD#metadata.version). + + + + diff --git a/src/leveled_cdb.beam b/src/leveled_cdb.beam new file mode 100644 index 0000000..301deaa Binary files /dev/null and b/src/leveled_cdb.beam differ diff --git a/src/leveled_cdb.erl b/src/leveled_cdb.erl new file mode 100644 index 0000000..13c3062 --- /dev/null +++ b/src/leveled_cdb.erl @@ -0,0 +1,804 @@ +%% +%% This is a modified version of the cdb module provided by Tom Whitcomb. 
+%% +%% - https://github.com/thomaswhitcomb/erlang-cdb +%% +%% The primary differences are: +%% - Support for incrementally writing a CDB file while keeping the hash table +%% in memory +%% - Support for merging of multiple CDB files with a key-checking function to +%% allow for compaction +%% - Automatic adding of a helper object that will keep a small proportion of +%% keys to be used when checking to see if the cdb file is a candidate for +%% compaction +%% - The ability to scan a database and accumulate all the Key, Values to +%% rebuild in-memory tables on startup +%% +%% This is to be used in eleveledb, and in this context: +%% - Keys will be a Sequence Number +%% - Values will be a Checksum; Pointers (length * 3); Key; [Metadata]; [Value] +%% where the pointers can be used to extract just part of the value +%% (i.e. metadata only) +%% +%% This module provides functions to create and query a CDB (constant database). +%% A CDB implements a two-level hashtable which provides fast {key,value} +%% lookups that remain fairly constant in speed regardless of the CDBs size. +%% +%% The first level in the CDB occupies the first 255 doublewords in the file. +%% Each doubleword slot contains two values. The first is a file pointer to +%% the primary hashtable (at the end of the file) and the second value is the +%% number of entries in the hashtable. The first level table of 255 entries +%% is indexed with the lower eight bits of the hash of the input key. +%% +%% Following the 255 doublewords are the {key,value} tuples. The tuples are +%% packed in the file without regard to word boundaries. Each {key,value} +%% tuple is represented with a four byte key length, a four byte value length, +%% the actual key value followed by the actual value. +%% +%% Following the {key,value} tuples are the primary hash tables. There are +%% at most 255 hash tables. Each hash table is referenced by one of the 255 +%% doubleword entries at the top of the file. 
For efficiency reasons, each +%% hash table is allocated twice the number of entries that it will need. +%% Each entry in the hash table is a doubleword. +%% The first word is the corresponding hash value and the second word is a +%% file pointer to the actual {key,value} tuple higher in the file. +%% + +-module(leveled_cdb). + +-export([from_dict/2, + create/2, + dump/1, + get/2, + get_mem/3, + put/4, + open_active_file/1, + get_nextkey/1, + get_nextkey/2]). + +-include_lib("eunit/include/eunit.hrl"). + +-define(DWORD_SIZE, 8). +-define(WORD_SIZE, 4). +-define(CRC_CHECK, true). + +%% +%% from_dict(FileName,ListOfKeyValueTuples) +%% Given a filename and a dictionary, create a cdb +%% using the key value pairs from the dict. +%% +%% @spec from_dict(filename(),dictionary()) -> ok +%% where +%% filename() = string(), +%% dictionary() = dict() +%% +from_dict(FileName,Dict) -> + KeyValueList = dict:to_list(Dict), + create(FileName, KeyValueList). + +%% +%% create(FileName,ListOfKeyValueTuples) -> ok +%% Given a filename and a list of {key,value} tuples, +%% this function creates a CDB +%% +create(FileName,KeyValueList) -> + {ok, Handle} = file:open(FileName, [write]), + {ok, _} = file:position(Handle, {bof, 2048}), + {BasePos, HashTree} = write_key_value_pairs(Handle, KeyValueList), + io:format("KVs has been written to base position ~w~n", [BasePos]), + L2 = write_hash_tables(Handle, HashTree), + io:format("Index list output of ~w~n", [L2]), + write_top_index_table(Handle, BasePos, L2), + file:close(Handle). + +%% +%% dump(FileName) -> List +%% Given a file name, this function returns a list +%% of {key,value} tuples from the CDB. +%% +%% +%% @spec dump(filename()) -> key_value_list() +%% where +%% filename() = string(), +%% key_value_list() = [{key,value}] +dump(FileName) -> + dump(FileName, ?CRC_CHECK). 
+ +dump(FileName, CRCCheck) -> + {ok, Handle} = file:open(FileName, [binary,raw]), + Fn = fun(Index, Acc) -> + {ok, _} = file:position(Handle, ?DWORD_SIZE * Index), + {_, Count} = read_next_2_integers(Handle), + Acc + Count + end, + NumberOfPairs = lists:foldl(Fn, 0, lists:seq(0,255)) bsr 1, + io:format("Count of keys in db is ~w~n", [NumberOfPairs]), + + {ok, _} = file:position(Handle, {bof, 2048}), + Fn1 = fun(_I,Acc) -> + {KL,VL} = read_next_2_integers(Handle), + Key = read_next_string(Handle, KL), + case read_next_string(Handle, VL, crc, CRCCheck) of + {false, _} -> + {ok, CurrLoc} = file:position(Handle, cur), + Return = {crc_wonky, get(Handle, Key)}; + {_, Value} -> + {ok, CurrLoc} = file:position(Handle, cur), + Return = case get(Handle, Key) of + {Key,Value} -> {Key ,Value}; + X -> {wonky, X} + end + end, + {ok, _} = file:position(Handle, CurrLoc), + [Return | Acc] + end, + lists:foldr(Fn1,[],lists:seq(0,NumberOfPairs-1)). + +%% Open an active file - one for which it is assumed the hash tables have not +%% yet been written +%% +%% Needs to scan over file to incrementally produce the hash list, starting at +%% the end of the top index table. 
+%% +%% Should return a dictionary keyed by index containing a list of {Hash, Pos} +%% tuples as the write_key_value_pairs function, and the current position, and +%% the file handle +open_active_file(FileName) when is_list(FileName) -> + {ok, Handle} = file:open(FileName, [binary, raw, read, write]), + {ok, Position} = file:position(Handle, {bof, 256*?DWORD_SIZE}), + {LastPosition, HashTree} = scan_over_file(Handle, Position), + case file:position(Handle, eof) of + {ok, LastPosition} -> + ok = file:close(Handle); + {ok, _} -> + LogDetails = [LastPosition, file:position(Handle, eof)], + io:format("File to be truncated at last position of" + "~w with end of file at ~w~n", LogDetails), + {ok, LastPosition} = file:position(Handle, LastPosition), + ok = file:truncate(Handle), + ok = file:close(Handle) + end, + {LastPosition, HashTree}. + +%% put(Handle, Key, Value, {LastPosition, HashDict}) -> {NewPosition, KeyDict} +%% Append to an active file a new key/value pair returning an updated +%% dictionary of Keys and positions. Returns an updated Position +%% +put(FileName, Key, Value, {LastPosition, HashTree}) when is_list(FileName) -> + {ok, Handle} = file:open(FileName, + [binary, raw, read, write, delayed_write]), + put(Handle, Key, Value, {LastPosition, HashTree}); +put(Handle, Key, Value, {LastPosition, HashTree}) -> + Bin = key_value_to_record({Key, Value}), % create binary for Key and Value + ok = file:pwrite(Handle, LastPosition, Bin), + {LastPosition + byte_size(Bin), put_hashtree(Key, LastPosition, HashTree)}. + + +%% +%% get(FileName,Key) -> {key,value} +%% Given a filename and a key, returns a key and value tuple. +%% +get(FileNameOrHandle, Key) -> + get(FileNameOrHandle, Key, ?CRC_CHECK). 
+ +get(FileName, Key, CRCCheck) when is_list(FileName), is_list(Key) -> + {ok,Handle} = file:open(FileName,[binary,raw]), + get(Handle,Key, CRCCheck); + +get(Handle, Key, CRCCheck) when is_tuple(Handle), is_list(Key) -> + Hash = hash(Key), + Index = hash_to_index(Hash), + {ok,_} = file:position(Handle, {bof, ?DWORD_SIZE * Index}), + % Get location of hashtable and number of entries in the hash + {HashTable, Count} = read_next_2_integers(Handle), + % If the count is 0 for that index - key must be missing + case Count of + 0 -> + missing; + _ -> + % Get starting slot in hashtable + {ok, FirstHashPosition} = file:position(Handle, {bof, HashTable}), + Slot = hash_to_slot(Hash, Count), + {ok, _} = file:position(Handle, {cur, Slot * ?DWORD_SIZE}), + LastHashPosition = HashTable + ((Count-1) * ?DWORD_SIZE), + LocList = lists:seq(FirstHashPosition, LastHashPosition, ?DWORD_SIZE), + % Split list around starting slot. + {L1, L2} = lists:split(Slot, LocList), + search_hash_table(Handle, lists:append(L2, L1), Hash, Key, CRCCheck) + end. + +%% Get a Key/Value pair from an active CDB file (with no hash table written) +%% This requires a key dictionary to be passed in (mapping keys to positions) +%% Will return {Key, Value} or missing +get_mem(Key, Filename, HashTree) when is_list(Filename) -> + {ok, Handle} = file:open(Filename, [binary, raw, read]), + get_mem(Key, Handle, HashTree); +get_mem(Key, Handle, HashTree) -> + extract_kvpair(Handle, get_hashtree(Key, HashTree), Key). + +%% Get the next key at a position in the file (or the first key if no position +%% is passed). Will return both a key and the next position +get_nextkey(Filename) when is_list(Filename) -> + {ok, Handle} = file:open(Filename, [binary, raw, read]), + get_nextkey(Handle); +get_nextkey(Handle) -> + {ok, _} = file:position(Handle, bof), + {FirstHashPosition, _} = read_next_2_integers(Handle), + get_nextkey(Handle, {256 * ?DWORD_SIZE, FirstHashPosition}). 
+ +get_nextkey(Handle, {Position, FirstHashPosition}) -> + {ok, Position} = file:position(Handle, Position), + case read_next_2_integers(Handle) of + {KeyLength, ValueLength} -> + NextKey = read_next_string(Handle, KeyLength), + NextPosition = Position + KeyLength + ValueLength + ?DWORD_SIZE, + case NextPosition of + FirstHashPosition -> + {NextKey, nomorekeys}; + _ -> + {NextKey, Handle, {NextPosition, FirstHashPosition}} + end; + eof -> + nomorekeys + end. + + +%%%%%%%%%%%%%%%%%%%% +%% Internal functions +%%%%%%%%%%%%%%%%%%%% + +%% Fetch a list of positions by passing a key to the HashTree +get_hashtree(Key, HashTree) -> + Hash = hash(Key), + Index = hash_to_index(Hash), + Tree = array:get(Index, HashTree), + case gb_trees:lookup(Hash, Tree) of + {value, List} -> + List; + _ -> + [] + end. + +%% Add to hash tree - this is an array of 256 gb_trees that contains the Hash +%% and position of objects which have been added to an open CDB file +put_hashtree(Key, Position, HashTree) -> + Hash = hash(Key), + Index = hash_to_index(Hash), + Tree = array:get(Index, HashTree), + case gb_trees:lookup(Hash, Tree) of + none -> + array:set(Index, gb_trees:insert(Hash, [Position], Tree), HashTree); + {value, L} -> + array:set(Index, gb_trees:update(Hash, [Position|L], Tree), HashTree) + end. + +%% Function to extract a Key-Value pair given a file handle and a position +%% Will confirm that the key matches and do a CRC check when requested +extract_kvpair(Handle, Positions, Key) -> + extract_kvpair(Handle, Positions, Key, ?CRC_CHECK). + +extract_kvpair(_, [], _, _) -> + missing; +extract_kvpair(Handle, [Position|Rest], Key, Check) -> + {ok, _} = file:position(Handle, Position), + {KeyLength, ValueLength} = read_next_2_integers(Handle), + case read_next_string(Handle, KeyLength) of + Key -> % If same key as passed in, then found! 
+ case read_next_string(Handle, ValueLength, crc, Check) of + {false, _} -> + crc_wonky; + {_, Value} -> + {Key,Value} + end; + _ -> + extract_kvpair(Handle, Rest, Key, Check) + end. + +%% Scan through the file until there is a failure to crc check an input, and +%% at that point return the position and the key dictionary scanned so far +scan_over_file(Handle, Position) -> + HashTree = array:new(256, {default, gb_trees:empty()}), + scan_over_file(Handle, Position, HashTree). + +scan_over_file(Handle, Position, HashTree) -> + case read_next_2_integers(Handle) of + {KeyLength, ValueLength} -> + Key = read_next_string(Handle, KeyLength), + {ok, ValueAsBin} = file:read(Handle, ValueLength), + case crccheck_value(ValueAsBin) of + true -> + NewPosition = Position + KeyLength + ValueLength + ?DWORD_SIZE, + scan_over_file(Handle, NewPosition, + put_hashtree(Key, Position, HashTree)); + false -> + io:format("CRC check returned false on key of ~w ~n", [Key]), + {Position, HashTree} + end; + eof -> + {Position, HashTree} + end. + +%% The first four bytes of the value are the crc check +crccheck_value(Value) when byte_size(Value) >4 -> + << Hash:32/integer, Tail/bitstring>> = Value, + case calc_crc(Tail) of + Hash -> + true; + _ -> + io:format("CRC check failed due to mismatch ~n"), + false + end; +crccheck_value(_) -> + io:format("CRC check failed due to size ~n"), + false. + +%% Run a crc check filling out any values which don't fit on byte boundary +calc_crc(Value) -> + case bit_size(Value) rem 8 of + 0 -> + erlang:crc32(Value); + N -> + M = 8 - N, + erlang:crc32(<>) + end. + +%% +%% to_dict(FileName) +%% Given a filename returns a dict containing +%% the key value pairs from the dict. +%% +%% @spec to_dict(filename()) -> dictionary() +%% where +%% filename() = string(), +%% dictionary() = dict() +%% +to_dict(FileName) -> + KeyValueList = dump(FileName), + dict:from_list(KeyValueList). 
+ +read_next_string(Handle, Length) -> + {ok, Bin} = file:read(Handle, Length), + binary_to_list(Bin). + +%% Read next string where the string has a CRC prepended - stripping the crc +%% and checking if requested +read_next_string(Handle, Length, crc, Check) -> + case Check of + true -> + {ok, <>} = file:read(Handle, Length), + case calc_crc(Bin) of + CRC -> + {true, binary_to_list(Bin)}; + _ -> + {false, binary_to_list(Bin)} + end; + _ -> + {ok, _} = file:position(Handle, {cur, 4}), + {ok, Bin} = file:read(Handle, Length - 4), + {unchecked, binary_to_list(Bin)} + end. + + +%% Used for reading lengths +%% Note that the endian_flip is required to make the file format compatible +%% with CDB +read_next_2_integers(Handle) -> + case file:read(Handle,?DWORD_SIZE) of + {ok, <>} -> + {endian_flip(Int1), endian_flip(Int2)}; + MatchError + -> + MatchError + end. + +%% Seach the hash table for the matching hash and key. Be prepared for +%% multiple keys to have the same hash value. +search_hash_table(_Handle, [], _Hash, _Key, _CRCCHeck) -> + missing; +search_hash_table(Handle, [Entry|RestOfEntries], Hash, Key, CRCCheck) -> + {ok, _} = file:position(Handle, Entry), + {StoredHash, DataLoc} = read_next_2_integers(Handle), + io:format("looking in data location ~w~n", [DataLoc]), + case StoredHash of + Hash -> + KV = extract_kvpair(Handle, [DataLoc], Key, CRCCheck), + case KV of + missing -> + search_hash_table(Handle, RestOfEntries, Hash, Key, CRCCheck); + _ -> + KV + end; + 0 -> + % Hash is 0 so key must be missing as 0 found before Hash matched + missing; + _ -> + search_hash_table(Handle, RestOfEntries, Hash, Key, CRCCheck) + end. + +% Write Key and Value tuples into the CDB. Each tuple consists of a +% 4 byte key length, a 4 byte value length, the actual key followed +% by the value. +% +% Returns a dictionary that is keyed by +% the least significant 8 bits of each hash with the +% values being a list of the hash and the position of the +% key/value binary in the file. 
+write_key_value_pairs(Handle, KeyValueList) -> + {ok, Position} = file:position(Handle, cur), + HashTree = array:new(256, {default, gb_trees:empty()}), + write_key_value_pairs(Handle, KeyValueList, {Position, HashTree}). + +write_key_value_pairs(_, [], Acc) -> + Acc; +write_key_value_pairs(Handle, [HeadPair|TailList], Acc) -> + {Key, Value} = HeadPair, + {NewPosition, HashTree} = put(Handle, Key, Value, Acc), + write_key_value_pairs(Handle, TailList, {NewPosition, HashTree}). + +%% Write the actual hashtables at the bottom of the file. Each hash table +%% entry is a doubleword in length. The first word is the hash value +%% corresponding to a key and the second word is a file pointer to the +%% corresponding {key,value} tuple. +write_hash_tables(Handle, HashTree) -> + Seq = lists:seq(0, 255), + {ok, StartPos} = file:position(Handle, cur), + write_hash_tables(Seq, Handle, HashTree, StartPos, []). + +write_hash_tables([], Handle, _, StartPos, IndexList) -> + {ok, EndPos} = file:position(Handle, cur), + ok = file:advise(Handle, StartPos, EndPos - StartPos, will_need), + IndexList; +write_hash_tables([Index|Rest], Handle, HashTree, StartPos, IndexList) -> + Tree = array:get(Index, HashTree), + case gb_trees:keys(Tree) of + [] -> + write_hash_tables(Rest, Handle, HashTree, StartPos, IndexList); + _ -> + HashList = gb_trees:to_list(Tree), + BinList = build_binaryhashlist(HashList, []), + IndexLength = length(BinList) * 2, + SlotList = lists:duplicate(IndexLength, <<0:32, 0:32>>), + + Fn = fun({Hash, Binary}, AccSlotList) -> + Slot1 = find_open_slot(AccSlotList, Hash), + {L1, [<<0:32, 0:32>>|L2]} = lists:split(Slot1, AccSlotList), + lists:append(L1, [Binary|L2]) + end, + NewSlotList = lists:foldl(Fn, SlotList, BinList), + + {ok, CurrPos} = file:position(Handle, cur), + file:write(Handle, NewSlotList), + write_hash_tables(Rest, Handle, HashTree, StartPos, + [{Index, CurrPos, IndexLength}|IndexList]) + end. 
+ +%% The list created from the original HashTree may have duplicate positions +%% e.g. {Key, [Value1, Value2]}. Before any writing is done it is necessary +%% to know the actual number of hashes - or the Slot may not be sized correctly +%% +%% This function creates {Hash, Binary} pairs on a list where there is a unique +%% entry for eveyr Key/Value +build_binaryhashlist([], BinList) -> + BinList; +build_binaryhashlist([{Hash, [Position|TailP]}|TailKV], BinList) -> + HashLE = endian_flip(Hash), + PosLE = endian_flip(Position), + NewBin = <>, + case TailP of + [] -> + build_binaryhashlist(TailKV, [{Hash, NewBin}|BinList]); + _ -> + build_binaryhashlist([{Hash, TailP}|TailKV], [{Hash, NewBin}|BinList]) + end. + +%% Slot is zero based because it comes from a REM +find_open_slot(List, Hash) -> + Len = length(List), + Slot = hash_to_slot(Hash, Len), + Seq = lists:seq(1, Len), + {CL1, CL2} = lists:split(Slot, Seq), + {L1, L2} = lists:split(Slot, List), + find_open_slot1(lists:append(CL2, CL1), lists:append(L2, L1)). + +find_open_slot1([Slot|_RestOfSlots], [<<0:32,0:32>>|_RestOfEntries]) -> + Slot - 1; +find_open_slot1([_|RestOfSlots], [_|RestOfEntries]) -> + find_open_slot1(RestOfSlots, RestOfEntries). + + +%% Write the top most 255 doubleword entries. First word is the +%% file pointer to a hashtable and the second word is the number of entries +%% in the hash table +%% The List passed in should be made up of {Index, Position, Count} tuples +write_top_index_table(Handle, BasePos, List) -> + % fold function to find any missing index tuples, and add one a replacement + % in this case with a count of 0. 
Also orders the list by index + FnMakeIndex = fun(I, Acc) -> + case lists:keysearch(I, 1, List) of + {value, Tuple} -> + [Tuple|Acc]; + false -> + [{I, BasePos, 0}|Acc] + end + end, + % Fold function to write the index entries + FnWriteIndex = fun({Index, Pos, Count}, CurrPos) -> + {ok, _} = file:position(Handle, ?DWORD_SIZE * Index), + case Count == 0 of + true -> + PosLE = endian_flip(CurrPos), + NextPos = CurrPos; + false -> + PosLE = endian_flip(Pos), + NextPos = Pos + (Count * ?DWORD_SIZE) + end, + CountLE = endian_flip(Count), + Bin = <>, + file:write(Handle, Bin), + NextPos + end, + + Seq = lists:seq(0, 255), + CompleteList = lists:keysort(1, lists:foldl(FnMakeIndex, [], Seq)), + lists:foldl(FnWriteIndex, BasePos, CompleteList), + ok = file:advise(Handle, 0, ?DWORD_SIZE * 256, will_need). + + +endian_flip(Int) -> + <> = <>, + X. + +hash(Key) -> + H = 5381, + hash1(H,Key) band 16#FFFFFFFF. + +hash1(H,[]) ->H; +hash1(H,[B|Rest]) -> + H1 = H * 33, + H2 = H1 bxor B, + hash1(H2,Rest). + +% Get the least significant 8 bits from the hash. +hash_to_index(Hash) -> + Hash band 255. + +hash_to_slot(Hash,L) -> + (Hash bsr 8) rem L. + +%% Create a binary of the LengthKeyLengthValue, adding a CRC check +%% at the front of the value +key_value_to_record({Key,Value}) -> + L1 = endian_flip(length(Key)), + L2 = endian_flip(length(Value) + 4), + LB1 = list_to_binary(Key), + LB2 = list_to_binary(Value), + CRC = calc_crc(LB2), + <>. + +%%%%%%%%%%%%%%%% +% T E S T +%%%%%%%%%%%%%%% + +hash_1_test() -> + Hash = hash("key1"), + ?assertMatch(Hash,2088047427). + +hash_to_index_1_test() -> + Hash = hash("key1"), + Index = hash_to_index(Hash), + ?assertMatch(Index,67). + +hash_to_index_2_test() -> + Hash = 256, + I = hash_to_index(Hash), + ?assertMatch(I,0). + +hash_to_index_3_test() -> + Hash = 268, + I = hash_to_index(Hash), + ?assertMatch(I,12). + +hash_to_index_4_test() -> + Hash = hash("key2"), + Index = hash_to_index(Hash), + ?assertMatch(Index,64). 
+ +write_key_value_pairs_1_test() -> + {ok,Handle} = file:open("test.cdb",write), + {_, HashTree} = write_key_value_pairs(Handle,[{"key1","value1"},{"key2","value2"}]), + Hash1 = hash("key1"), + Index1 = hash_to_index(Hash1), + Hash2 = hash("key2"), + Index2 = hash_to_index(Hash2), + R0 = array:new(256, {default, gb_trees:empty()}), + R1 = array:set(Index1, gb_trees:insert(Hash1, [0], array:get(Index1, R0)), R0), + R2 = array:set(Index2, gb_trees:insert(Hash2, [22], array:get(Index2, R1)), R1), + ?assertMatch(R2, HashTree). + + +write_hash_tables_1_test() -> + {ok, Handle} = file:open("test.cdb",write), + R0 = array:new(256, {default, gb_trees:empty()}), + R1 = array:set(64, gb_trees:insert(6383014720, [18], array:get(64, R0)), R0), + R2 = array:set(67, gb_trees:insert(6383014723, [0], array:get(67, R1)), R1), + Result = write_hash_tables(Handle, R2), + io:format("write hash tables result of ~w ~n", [Result]), + ?assertMatch(Result,[{67,16,2},{64,0,2}]). + +find_open_slot_1_test() -> + List = [<<1:32,1:32>>,<<0:32,0:32>>,<<1:32,1:32>>,<<1:32,1:32>>], + Slot = find_open_slot(List,0), + ?assertMatch(Slot,1). + +find_open_slot_2_test() -> + List = [<<0:32,0:32>>,<<0:32,0:32>>,<<1:32,1:32>>,<<1:32,1:32>>], + Slot = find_open_slot(List,0), + ?assertMatch(Slot,0). + +find_open_slot_3_test() -> + List = [<<1:32,1:32>>,<<1:32,1:32>>,<<1:32,1:32>>,<<0:32,0:32>>], + Slot = find_open_slot(List,2), + ?assertMatch(Slot,3). + +find_open_slot_4_test() -> + List = [<<0:32,0:32>>,<<1:32,1:32>>,<<1:32,1:32>>,<<1:32,1:32>>], + Slot = find_open_slot(List,1), + ?assertMatch(Slot,0). + +find_open_slot_5_test() -> + List = [<<1:32,1:32>>,<<1:32,1:32>>,<<0:32,0:32>>,<<1:32,1:32>>], + Slot = find_open_slot(List,3), + ?assertMatch(Slot,2). + +full_1_test() -> + List1 = lists:sort([{"key1","value1"},{"key2","value2"}]), + create("simple.cdb",lists:sort([{"key1","value1"},{"key2","value2"}])), + List2 = lists:sort(dump("simple.cdb")), + ?assertMatch(List1,List2). 
+ +full_2_test() -> + List1 = lists:sort([{lists:flatten(io_lib:format("~s~p",[Prefix,Plug])), + lists:flatten(io_lib:format("value~p",[Plug]))} + || Plug <- lists:seq(1,2000), + Prefix <- ["dsd","so39ds","oe9%#*(","020dkslsldclsldowlslf%$#", + "tiep4||","qweq"]]), + create("full.cdb",List1), + List2 = lists:sort(dump("full.cdb")), + ?assertMatch(List1,List2). + +from_dict_test() -> + D = dict:new(), + D1 = dict:store("a","b",D), + D2 = dict:store("c","d",D1), + ok = from_dict("from_dict_test.cdb",D2), + io:format("Store created ~n", []), + KVP = lists:sort(dump("from_dict_test.cdb")), + D3 = lists:sort(dict:to_list(D2)), + io:format("KVP is ~w~n", [KVP]), + io:format("D3 is ~w~n", [D3]), + ?assertMatch(KVP,D3). + +to_dict_test() -> + D = dict:new(), + D1 = dict:store("a","b",D), + D2 = dict:store("c","d",D1), + ok = from_dict("from_dict_test.cdb",D2), + Dict = to_dict("from_dict_test.cdb"), + D3 = lists:sort(dict:to_list(D2)), + D4 = lists:sort(dict:to_list(Dict)), + ?assertMatch(D4,D3). + +crccheck_emptyvalue_test() -> + ?assertMatch(false, crccheck_value(<<>>)). + +crccheck_shortvalue_test() -> + Value = <<128,128,32>>, + ?assertMatch(false, crccheck_value(Value)). + +crccheck_justshortvalue_test() -> + Value = <<128,128,32,64>>, + ?assertMatch(false, crccheck_value(Value)). + +crccheck_correctvalue_test() -> + Value = term_to_binary("some text as value"), + Hash = erlang:crc32(Value), + ValueOnDisk = <>, + ?assertMatch(true, crccheck_value(ValueOnDisk)). + +crccheck_wronghash_test() -> + Value = term_to_binary("some text as value"), + Hash = erlang:crc32(Value) + 1, + ValueOnDisk = <>, + ?assertMatch(false, crccheck_value(ValueOnDisk)). + +crccheck_truncatedvalue_test() -> + Value = term_to_binary("some text as value"), + Hash = erlang:crc32(Value), + ValueOnDisk = <>, + Size = bit_size(ValueOnDisk) - 1, + <> = ValueOnDisk, + ?assertMatch(false, crccheck_value(TruncatedValue)). 
+ +activewrite_singlewrite_test() -> + Key = "0002", + Value = "some text as new value", + InitialD = dict:new(), + InitialD1 = dict:store("0001", "Initial value", InitialD), + ok = from_dict("test_mem.cdb", InitialD1), + io:format("New db file created ~n", []), + {LastPosition, KeyDict} = open_active_file("test_mem.cdb"), + io:format("File opened as new active file " + "with LastPosition=~w ~n", [LastPosition]), + {_, UpdKeyDict} = put("test_mem.cdb", Key, Value, {LastPosition, KeyDict}), + io:format("New key and value added to active file ~n", []), + ?assertMatch({Key, Value}, get_mem(Key, "test_mem.cdb", UpdKeyDict)). + +search_hash_table_findinslot_test() -> + Key1 = "key1", % this is in slot 3 if count is 8 + D = dict:from_list([{Key1, "value1"}, {"K2", "V2"}, {"K3", "V3"}, + {"K4", "V4"}, {"K5", "V5"}, {"K6", "V6"}, {"K7", "V7"}, + {"K8", "V8"}]), + ok = from_dict("hashtable1_test.cdb",D), + {ok, Handle} = file:open("hashtable1_test.cdb", [binary, raw, read, write]), + Hash = hash(Key1), + Index = hash_to_index(Hash), + {ok, _} = file:position(Handle, {bof, ?DWORD_SIZE*Index}), + {HashTable, Count} = read_next_2_integers(Handle), + io:format("Count of ~w~n", [Count]), + {ok, FirstHashPosition} = file:position(Handle, {bof, HashTable}), + Slot = hash_to_slot(Hash, Count), + io:format("Slot of ~w~n", [Slot]), + {ok, _} = file:position(Handle, {cur, Slot * ?DWORD_SIZE}), + {ReadH3, ReadP3} = read_next_2_integers(Handle), + {ReadH4, ReadP4} = read_next_2_integers(Handle), + io:format("Slot 1 has Hash ~w Position ~w~n", [ReadH3, ReadP3]), + io:format("Slot 2 has Hash ~w Position ~w~n", [ReadH4, ReadP4]), + ?assertMatch(0, ReadH4), + ?assertMatch({"key1", "value1"}, get(Handle, Key1)), + {ok, _} = file:position(Handle, FirstHashPosition), + FlipH3 = endian_flip(ReadH3), + FlipP3 = endian_flip(ReadP3), + RBin = <>, + io:format("Replacement binary of ~w~n", [RBin]), + {ok, OldBin} = file:pread(Handle, + FirstHashPosition + (Slot -1) * ?DWORD_SIZE, 16), + 
io:format("Bin to be replaced is ~w ~n", [OldBin]), + ok = file:pwrite(Handle, FirstHashPosition + (Slot -1) * ?DWORD_SIZE, RBin), + ok = file:close(Handle), + io:format("Find key following change to hash table~n"), + ?assertMatch(missing, get("hashtable1_test.cdb", Key1)). + +getnextkey_test() -> + L = [{"K9", "V9"}, {"K2", "V2"}, {"K3", "V3"}, + {"K4", "V4"}, {"K5", "V5"}, {"K6", "V6"}, {"K7", "V7"}, + {"K8", "V8"}, {"K1", "V1"}], + ok = create("hashtable1_test.cdb", L), + {FirstKey, Handle, P1} = get_nextkey("hashtable1_test.cdb"), + io:format("Next position details of ~w~n", [P1]), + ?assertMatch("K9", FirstKey), + {SecondKey, Handle, P2} = get_nextkey(Handle, P1), + ?assertMatch("K2", SecondKey), + {_, Handle, P3} = get_nextkey(Handle, P2), + {_, Handle, P4} = get_nextkey(Handle, P3), + {_, Handle, P5} = get_nextkey(Handle, P4), + {_, Handle, P6} = get_nextkey(Handle, P5), + {_, Handle, P7} = get_nextkey(Handle, P6), + {_, Handle, P8} = get_nextkey(Handle, P7), + {LastKey, Info} = get_nextkey(Handle, P8), + ?assertMatch(nomorekeys, Info), + ?assertMatch("K1", LastKey). + +newactivefile_test() -> + {LastPosition, _} = open_active_file("activefile_test.cdb"), + ?assertMatch(256 * ?DWORD_SIZE, LastPosition), + Response = get_nextkey("activefile_test.cdb"), + ?assertMatch(nomorekeys, Response). + + + + + + + + + diff --git a/src/leveled_internal.beam b/src/leveled_internal.beam new file mode 100644 index 0000000..793f013 Binary files /dev/null and b/src/leveled_internal.beam differ diff --git a/src/leveled_internal.erl b/src/leveled_internal.erl new file mode 100644 index 0000000..874fe61 --- /dev/null +++ b/src/leveled_internal.erl @@ -0,0 +1,118 @@ +-module(leveled_internal). +-export([termiterator/6]). +-include_lib("eunit/include/eunit.hrl"). 
+ + +%% We will have a sorted list of terms +%% Some terms will be dummy terms which are pointers to more terms which can be found +%% If a pointer is hit need to replenish the term list before proceeding +%% +%% Helper Functions should have free functions - FolderFun, CompareFun, PointerCheck} +%% FolderFun - function which takes the next item and the accumulator and returns an updated accunulator +%% CompareFun - function which should be able to compare two keys (which are not pointers) +%% PointerCheck - function for differentiating between keys and pointer + +termiterator(HeadItem, [], Acc, HelperFuns, _StartKey, _EndKey) -> + io:format("Reached empty list with head item of ~w~n", [HeadItem]), + case HeadItem of + null -> + Acc; + _ -> + {FolderFun, _, _} = HelperFuns, + FolderFun(Acc, HeadItem) + end; +termiterator(null, [NextItem|TailList], Acc, HelperFuns, StartKey, EndKey) -> + %% Check that the NextItem is not a pointer before promoting to HeadItem + %% Cannot now promote a HeadItem which is a pointer + {_, _, PointerCheck} = HelperFuns, + case PointerCheck(NextItem) of + {true, Pointer} -> + NewSlice = getnextslice(Pointer, EndKey), + ExtendedList = lists:merge(NewSlice, TailList), + termiterator(null, ExtendedList, Acc, HelperFuns, StartKey, EndKey); + false -> + termiterator(NextItem, TailList, Acc, HelperFuns, StartKey, EndKey) + end; +termiterator(HeadItem, [NextItem|TailList], Acc, HelperFuns, StartKey, EndKey) -> + io:format("Checking head item of ~w~n", [HeadItem]), + {FolderFun, CompareFun, PointerCheck} = HelperFuns, + %% HeadItem cannot be pointer, but NextItem might be, so check before comparison + case PointerCheck(NextItem) of + {true, Pointer} -> + NewSlice = getnextslice(Pointer, EndKey), + ExtendedList = lists:merge(NewSlice, [NextItem|TailList]), + termiterator(null, ExtendedList, Acc, HelperFuns, StartKey, EndKey); + false -> + %% Compare to see if Head and Next match, or if Head is a winner to be added + %% to accumulator + case 
CompareFun(HeadItem, NextItem) of + {match, StrongItem, _WeakItem} -> + %% Discard WeakItem + termiterator(StrongItem, TailList, Acc, HelperFuns, StartKey, EndKey); + {winner, HeadItem} -> + %% Add next item to accumulator, and proceed with next item + AccPlus = FolderFun(Acc, HeadItem), + termiterator(NextItem, TailList, AccPlus, HelperFuns, HeadItem, EndKey) + end + end. + + + +pointercheck_indexkey(IndexKey) -> + case IndexKey of + {i, _Bucket, _Index, _Term, _Key, _Sequence, {zpointer, Pointer}} -> + {true, Pointer}; + _ -> + false + end. + +folder_indexkey(Acc, IndexKey) -> + io:format("Folding index key of - ~w~n", [IndexKey]), + case IndexKey of + {i, _Bucket, _Index, _Term, _Key, _Sequence, tombstone} -> + Acc; + {i, _Bucket, _Index, _Term, Key, _Sequence, null} -> + io:format("Adding key ~s~n", [Key]), + lists:append(Acc, [Key]) + end. + +compare_indexkey(IndexKey1, IndexKey2) -> + {i, Bucket1, Index1, Term1, Key1, Sequence1, _Value1} = IndexKey1, + {i, Bucket2, Index2, Term2, Key2, Sequence2, _Value2} = IndexKey2, + case {Bucket1, Index1, Term1, Key1} of + {Bucket2, Index2, Term2, Key2} when Sequence1 >= Sequence2 -> + {match, IndexKey1, IndexKey2}; + {Bucket2, Index2, Term2, Key2} -> + {match, IndexKey2, IndexKey1}; + _ when IndexKey2 >= IndexKey1 -> + {winner, IndexKey1}; + _ -> + {winner, IndexKey2} + end. + + +getnextslice(Pointer, _EndKey) -> + case Pointer of + {test, NewList} -> + NewList; + _ -> + [] + end. 
+ + +%% Unit tests + + +iterateoverindexkeyswithnopointer_test_() -> + Key1 = {i, "pdsRecord", "familyName_bin", "1972SMITH", "10001", 1, null}, + Key2 = {i, "pdsRecord", "familyName_bin", "1972SMITH", "10001", 2, tombstone}, + Key3 = {i, "pdsRecord", "familyName_bin", "1971SMITH", "10002", 2, null}, + Key4 = {i, "pdsRecord", "familyName_bin", "1972JONES", "10003", 2, null}, + KeyList = lists:sort([Key1, Key2, Key3, Key4]), + HelperFuns = {fun folder_indexkey/2, fun compare_indexkey/2, fun pointercheck_indexkey/1}, + ResultList = ["10002", "10003"], + ?_assertEqual(ResultList, termiterator(null, KeyList, [], HelperFuns, "1971", "1973")). + + + + diff --git a/src/onstartfile.bst b/src/onstartfile.bst new file mode 100644 index 0000000..72153f2 Binary files /dev/null and b/src/onstartfile.bst differ diff --git a/src/rice.erl b/src/rice.erl new file mode 100644 index 0000000..68dd78a --- /dev/null +++ b/src/rice.erl @@ -0,0 +1,155 @@ +-module(rice). +-export([encode/1, + encode/2, + checkforhash/2, + converttohash/1]). +-include_lib("eunit/include/eunit.hrl"). + +%% Factor is the power of 2 representing the expected normal gap size between +%% members of the hash, and therefore the size of the bitstring to represent the +%% remainder for the gap +%% +%% The encoded output should contain a single byte which is the Factor, followed +%% by a series of exponents and remainders. +%% +%% The exponent is n 1's followed by a 0, where n * (2 ^ Factor) + remainder +%% represents the gap to the next hash +%% +%% The size passed in should be the maximum possible value of the hash. +%% If this isn't provided - assumes 2^32 - the default for phash2 + +encode(HashList) -> + encode(HashList, 4 * 1024 * 1024 * 1024). + +encode(HashList, Size) -> + SortedHashList = lists:usort(HashList), + ExpectedGapSize = Size div length(SortedHashList), + Factor = findpowerundergap(ExpectedGapSize), + riceencode(SortedHashList, Factor). 
+
+%% Outcome may be suboptimal if lists have not been de-duplicated
+%% Will fail on an unsorted list
+
+riceencode(HashList, Factor) when Factor<256 ->
+    Divisor = powtwo(Factor),
+    riceencode(HashList, Factor, Divisor, <<>>, 0).
+
+riceencode([], Factor, _, BitStrAcc, _) ->
+    Prefix = binary:encode_unsigned(Factor),
+    <<Prefix/bitstring, BitStrAcc/bitstring>>;
+riceencode([HeadHash|TailList], Factor, Divisor, BitStrAcc, LastHash) ->
+    HashGap = HeadHash - LastHash,
+    case HashGap of
+        0 ->
+            riceencode(TailList, Factor, Divisor, BitStrAcc, HeadHash);
+        N when N > 0 ->
+            Exponent = buildexponent(HashGap div Divisor),
+            Remainder = HashGap rem Divisor,
+            ExpandedBitStrAcc = <<BitStrAcc/bitstring, Exponent/bitstring, Remainder:Factor>>,
+            riceencode(TailList, Factor, Divisor, ExpandedBitStrAcc, HeadHash)
+    end.
+
+
+%% Checking for a hash needs to roll through the compressed bloom, decoding until
+%% the member is found (match!), passed (not matched) or the end of the encoded
+%% bitstring has been reached (not matched)
+
+checkforhash(HashToCheck, BitStr) ->
+    <<Factor:8/integer, RiceEncodedBitStr/bitstring>> = BitStr,
+    Divisor = powtwo(Factor),
+    checkforhash(HashToCheck, RiceEncodedBitStr, Factor, Divisor, 0).
+
+checkforhash(_, <<>>, _, _, _) ->
+    false;
+checkforhash(HashToCheck, BitStr, Factor, Divisor, Acc) ->
+    [Exponent, BitStrTail] = findexponent(BitStr),
+    [Remainder, BitStrTail2] = findremainder(BitStrTail, Factor),
+    NextHash = Acc + Divisor * Exponent + Remainder,
+    case NextHash of
+        HashToCheck -> true;
+        N when N>HashToCheck -> false;
+        _ -> checkforhash(HashToCheck, BitStrTail2, Factor, Divisor, NextHash)
+    end.
+
+
+%% Exported functions - currently used only in testing
+
+converttohash(ItemList) ->
+    converttohash(ItemList, []).
+
+converttohash([], HashList) ->
+    HashList;
+converttohash([H|T], HashList) ->
+    converttohash(T, [erlang:phash2(H)|HashList]).
+
+
+
+%% Helper functions
+
+buildexponent(Exponent) ->
+    buildexponent(Exponent, <<0:1>>).
+
+buildexponent(0, OutputBits) ->
+    OutputBits;
+buildexponent(Exponent, OutputBits) ->
+    buildexponent(Exponent - 1, <<1:1, OutputBits/bitstring>>).
+
+
+findexponent(BitStr) ->
+    findexponent(BitStr, 0).
+
+findexponent(BitStr, Acc) ->
+    <<H:1/bitstring, T/bitstring>> = BitStr,
+    case H of
+        <<1:1>> -> findexponent(T, Acc + 1);
+        <<0:1>> -> [Acc, T]
+    end.
+
+
+findremainder(BitStr, Factor) ->
+    <<Remainder:Factor/integer, BitStrTail/bitstring>> = BitStr,
+    [Remainder, BitStrTail].
+
+
+powtwo(N) -> powtwo(N, 1).
+
+powtwo(0, Acc) ->
+    Acc;
+powtwo(N, Acc) ->
+    powtwo(N-1, Acc * 2).
+
+%% Helper method for finding the factor of two which provides the most
+%% efficient compression given an average gap size
+
+findpowerundergap(GapSize) -> findpowerundergap(GapSize, 1, 0).
+
+findpowerundergap(GapSize, Acc, Counter) ->
+    case Acc of
+        N when N > GapSize -> Counter - 1;
+        _ -> findpowerundergap(GapSize, Acc * 2, Counter + 1)
+    end.
+
+
+%% Unit tests
+
+findpowerundergap_test_() ->
+    [
+    ?_assertEqual(9, findpowerundergap(700)),
+    ?_assertEqual(9, findpowerundergap(512)),
+    ?_assertEqual(8, findpowerundergap(511))].
+
+encode_test_() ->
+    [
+    ?_assertEqual(<<9, 6, 44, 4:5>>, encode([24,924], 1024)),
+    ?_assertEqual(<<9, 6, 44, 4:5>>, encode([24,24,924], 1024)),
+    ?_assertEqual(<<9, 6, 44, 4:5>>, encode([24,924,924], 1024))
+    ].
+
+check_test_() ->
+    [
+    ?_assertEqual(true, checkforhash(924, <<9, 6, 44, 4:5>>)),
+    ?_assertEqual(true, checkforhash(24, <<9, 6, 44, 4:5>>)),
+    ?_assertEqual(false, checkforhash(23, <<9, 6, 44, 4:5>>)),
+    ?_assertEqual(false, checkforhash(923, <<9, 6, 44, 4:5>>)),
+    ?_assertEqual(false, checkforhash(925, <<9, 6, 44, 4:5>>))
+    ].
diff --git a/src/simple.cdb b/src/simple.cdb new file mode 100644 index 0000000..14a53c0 Binary files /dev/null and b/src/simple.cdb differ diff --git a/src/test.cdb b/src/test.cdb new file mode 100644 index 0000000..2540a5b Binary files /dev/null and b/src/test.cdb differ diff --git a/src/test_inconsole.cdb b/src/test_inconsole.cdb new file mode 100644 index 0000000..e69de29 diff --git a/src/test_mem.cdb b/src/test_mem.cdb new file mode 100644 index 0000000..f6a008c Binary files /dev/null and b/src/test_mem.cdb differ diff --git a/test/lookup_test.beam b/test/lookup_test.beam new file mode 100644 index 0000000..3c8d764 Binary files /dev/null and b/test/lookup_test.beam differ diff --git a/test/lookup_test.erl b/test/lookup_test.erl new file mode 100644 index 0000000..f8632f2 --- /dev/null +++ b/test/lookup_test.erl @@ -0,0 +1,241 @@ +-module(lookup_test). + +-export([go_dict/1, go_ets/1, go_gbtree/1, + go_arrayofdict/1, go_arrayofgbtree/1, go_arrayofdict_withcache/1]). + +-define(CACHE_SIZE, 512). + +hash(Key) -> + H = 5381, + hash1(H,Key) band 16#FFFFFFFF. + +hash1(H,[]) ->H; +hash1(H,[B|Rest]) -> + H1 = H * 33, + H2 = H1 bxor B, + hash1(H2,Rest). + +% Get the least significant 8 bits from the hash. +hash_to_index(Hash) -> + Hash band 255. + + +%% +%% Timings (microseconds): +%% +%% go_dict(200000) : 1569894 +%% go_dict(1000000) : 17191365 +%% go_dict(5000000) : forever + +go_dict(N) -> + go_dict(dict:new(), N, N). + +go_dict(_, 0, _) -> + {erlang:memory(), statistics(garbage_collection)}; +go_dict(D, N, M) -> + % Lookup a random key - which may not be present + LookupKey = lists:concat(["key-", random:uniform(M)]), + LookupHash = hash(LookupKey), + dict:find(LookupHash, D), + + % Add a new key - which may be present so value to be appended + Key = lists:concat(["key-", N]), + Hash = hash(Key), + case dict:find(Hash, D) of + error -> + go_dict(dict:store(Hash, [N], D), N-1, M); + {ok, List} -> + go_dict(dict:store(Hash, [N|List], D), N-1, M) + end. 
+ + + +%% +%% Timings (microseconds): +%% +%% go_ets(200000) : 609119 +%% go_ets(1000000) : 3520757 +%% go_ets(5000000) : 19974562 + +go_ets(N) -> + go_ets(ets:new(ets_test, [private, bag]), N, N). + +go_ets(_, 0, _) -> + {erlang:memory(), statistics(garbage_collection)}; +go_ets(Ets, N, M) -> + % Lookup a random key - which may not be present + LookupKey = lists:concat(["key-", random:uniform(M)]), + LookupHash = hash(LookupKey), + ets:lookup(Ets, LookupHash), + + % Add a new key - which may be present so value to be appended + Key = lists:concat(["key-", N]), + Hash = hash(Key), + ets:insert(Ets, {Hash, N}), + go_ets(Ets, N - 1, M). + +%% +%% Timings (microseconds): +%% +%% go_gbtree(200000) : 1393936 +%% go_gbtree(1000000) : 8430997 +%% go_gbtree(5000000) : 45630810 + +go_gbtree(N) -> + go_gbtree(gb_trees:empty(), N, N). + +go_gbtree(_, 0, _) -> + {erlang:memory(), statistics(garbage_collection)}; +go_gbtree(Tree, N, M) -> + % Lookup a random key - which may not be present + LookupKey = lists:concat(["key-", random:uniform(M)]), + LookupHash = hash(LookupKey), + gb_trees:lookup(LookupHash, Tree), + + % Add a new key - which may be present so value to be appended + Key = lists:concat(["key-", N]), + Hash = hash(Key), + case gb_trees:lookup(Hash, Tree) of + none -> + go_gbtree(gb_trees:insert(Hash, [N], Tree), N - 1, M); + {value, List} -> + go_gbtree(gb_trees:update(Hash, [N|List], Tree), N - 1, M) + end. + + +%% +%% Timings (microseconds): +%% +%% go_arrayofidict(200000) : 1266931 +%% go_arrayofidict(1000000) : 7387219 +%% go_arrayofidict(5000000) : 49511484 + +go_arrayofdict(N) -> + go_arrayofdict(array:new(256, {default, dict:new()}), N, N). 
+ +go_arrayofdict(_, 0, _) -> + % dict:to_list(array:get(0, Array)), + % dict:to_list(array:get(1, Array)), + % dict:to_list(array:get(2, Array)), + % dict:to_list(array:get(3, Array)), + % dict:to_list(array:get(4, Array)), + % dict:to_list(array:get(5, Array)), + % dict:to_list(array:get(6, Array)), + % dict:to_list(array:get(7, Array)), + % dict:to_list(array:get(8, Array)), + % dict:to_list(array:get(9, Array)), + {erlang:memory(), statistics(garbage_collection)}; +go_arrayofdict(Array, N, M) -> + % Lookup a random key - which may not be present + LookupKey = lists:concat(["key-", random:uniform(M)]), + LookupHash = hash(LookupKey), + LookupIndex = hash_to_index(LookupHash), + dict:find(LookupHash, array:get(LookupIndex, Array)), + + % Add a new key - which may be present so value to be appended + Key = lists:concat(["key-", N]), + Hash = hash(Key), + Index = hash_to_index(Hash), + D = array:get(Index, Array), + case dict:find(Hash, D) of + error -> + go_arrayofdict(array:set(Index, + dict:store(Hash, [N], D), Array), N-1, M); + {ok, List} -> + go_arrayofdict(array:set(Index, + dict:store(Hash, [N|List], D), Array), N-1, M) + end. + +%% +%% Timings (microseconds): +%% +%% go_arrayofgbtree(200000) : 1176224 +%% go_arrayofgbtree(1000000) : 7480653 +%% go_arrayofgbtree(5000000) : 41266701 + +go_arrayofgbtree(N) -> + go_arrayofgbtree(array:new(256, {default, gb_trees:empty()}), N, N). 
+ +go_arrayofgbtree(_, 0, _) -> + % gb_trees:to_list(array:get(0, Array)), + % gb_trees:to_list(array:get(1, Array)), + % gb_trees:to_list(array:get(2, Array)), + % gb_trees:to_list(array:get(3, Array)), + % gb_trees:to_list(array:get(4, Array)), + % gb_trees:to_list(array:get(5, Array)), + % gb_trees:to_list(array:get(6, Array)), + % gb_trees:to_list(array:get(7, Array)), + % gb_trees:to_list(array:get(8, Array)), + % gb_trees:to_list(array:get(9, Array)), + {erlang:memory(), statistics(garbage_collection)}; +go_arrayofgbtree(Array, N, M) -> + % Lookup a random key - which may not be present + LookupKey = lists:concat(["key-", random:uniform(M)]), + LookupHash = hash(LookupKey), + LookupIndex = hash_to_index(LookupHash), + gb_trees:lookup(LookupHash, array:get(LookupIndex, Array)), + + % Add a new key - which may be present so value to be appended + Key = lists:concat(["key-", N]), + Hash = hash(Key), + Index = hash_to_index(Hash), + Tree = array:get(Index, Array), + case gb_trees:lookup(Hash, Tree) of + none -> + go_arrayofgbtree(array:set(Index, + gb_trees:insert(Hash, [N], Tree), Array), N - 1, M); + {value, List} -> + go_arrayofgbtree(array:set(Index, + gb_trees:update(Hash, [N|List], Tree), Array), N - 1, M) + end. + + +%% +%% Timings (microseconds): +%% +%% go_arrayofdict_withcache(200000) : 1432951 +%% go_arrayofdict_withcache(1000000) : 9140169 +%% go_arrayofdict_withcache(5000000) : 59435511 + +go_arrayofdict_withcache(N) -> + go_arrayofdict_withcache({array:new(256, {default, dict:new()}), + array:new(256, {default, dict:new()})}, N, N). 
+ +go_arrayofdict_withcache(_, 0, _) -> + {erlang:memory(), statistics(garbage_collection)}; +go_arrayofdict_withcache({MArray, CArray}, N, M) -> + % Lookup a random key - which may not be present + LookupKey = lists:concat(["key-", random:uniform(M)]), + LookupHash = hash(LookupKey), + LookupIndex = hash_to_index(LookupHash), + dict:find(LookupHash, array:get(LookupIndex, CArray)), + dict:find(LookupHash, array:get(LookupIndex, MArray)), + + % Add a new key - which may be present so value to be appended + Key = lists:concat(["key-", N]), + Hash = hash(Key), + Index = hash_to_index(Hash), + Cache = array:get(Index, CArray), + case dict:find(Hash, Cache) of + error -> + UpdCache = dict:store(Hash, [N], Cache); + {ok, _} -> + UpdCache = dict:append(Hash, N, Cache) + end, + case dict:size(UpdCache) of + ?CACHE_SIZE -> + UpdCArray = array:set(Index, dict:new(), CArray), + UpdMArray = array:set(Index, dict:merge(fun merge_values/3, UpdCache, array:get(Index, MArray)), MArray), + go_arrayofdict_withcache({UpdMArray, UpdCArray}, N - 1, M); + _ -> + UpdCArray = array:set(Index, UpdCache, CArray), + go_arrayofdict_withcache({MArray, UpdCArray}, N - 1, M) + end. + + + +merge_values(_, Value1, Value2) -> + lists:append(Value1, Value2). + + +