Initial files proving concepts

WIP - nothing currently workable.

parent 85a6998ca0
commit e2099d0c14
19 changed files with 1491 additions and 0 deletions
BIN src/activefile_test.cdb: new binary file (content not shown)
src/eleveleddb.app.src: new file, 17 lines (@@ -0,0 +1,17 @@)

{application, eleveleddb,
 [
  {description, ""},
  {vsn, "0.0.1"},
  {modules, []},
  {registered, []},
  {applications, [
                  kernel,
                  stdlib
                  ]},
  {mod, {eleveleddb_app, []}},
  {env, [
         %% Default max file size (in bytes)
         {max_file_size, 32#80000000} % 4GB default
        ]}
 ]}.
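Not part of this commit, but for orientation: at runtime the setting above would normally be read through the standard OTP application environment API, for example:

    {ok, MaxFileSize} = application:get_env(eleveleddb, max_file_size).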
BIN src/from_dict_test.cdb: new binary file (content not shown)
BIN src/full.cdb: new binary file (content not shown)
BIN src/hashtable1_test.cdb: new binary file (content not shown)
BIN src/leveled_bst.beam: new binary file (content not shown)
src/leveled_bst.erl: new file, 156 lines (@@ -0,0 +1,156 @@)
%%
|
||||
%% This module provides functions for managing bst files - a modified version
|
||||
%% of sst files, to be used in leveleddb.
|
||||
%% bst files are broken into the following sections:
|
||||
%% - Header (fixed width 32 bytes - containing pointers and metadata)
|
||||
%% - Blocks (variable length)
|
||||
%% - Slots (variable length)
|
||||
%% - Footer (variable length - contains slot index and helper metadata)
|
||||
%%
|
||||
%% The 32-byte header is made up of
|
||||
%% - 1 byte version (major 5 bits, minor 3 bits) - default 0.1
|
||||
%% - 1 byte state bits (1 bit to indicate mutability, 1 for use of compression)
|
||||
%% - 4 bytes footer position
|
||||
%% - 4 bytes slot list length
|
||||
%% - 4 bytes helper length
|
||||
%% - 14 bytes spare for future options
|
||||
%% - 4 bytes CRC (header)
|
||||
%%
|
||||
%% The Blocks section is a series of blocks, each comprising:
|
||||
%% - 4 byte block length
|
||||
%% - variable-length compressed list of 32 keys & values
|
||||
%% - 4 byte CRC for block
|
||||
%% There will be up to 4000 blocks in a single bst file
|
||||
%%
|
||||
%% The Slots section is a series of references, each comprising:
|
||||
%% - 4 byte bloom-filter length
|
||||
%% - 4 byte key-helper length
|
||||
%% - a variable-length compressed bloom filter for all keys in slot (approx 1KB)
|
||||
%% - 32 ordered variable-length key helpers pointing to first key in each
|
||||
%% block (in slot) of the form Key Length, Key, Block Position
|
||||
%% - 4 byte CRC for the slot
|
||||
%%
|
||||
%% The slot index in the footer is made up of 128 keys and pointers at the
|
||||
%% start of each slot
|
||||
%% - 128 Key Length (4 byte), Key, Position (4 byte) indexes
|
||||
%% - 4 bytes CRC for the index
|
||||
%%
|
||||
%% The format of the file is intended to support quick lookups, whilst
|
||||
%% allowing for a new file to be written incrementally (so that all keys and
|
||||
%% values need not be retained in memory) - perhaps n blocks at a time
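%% As a quick cross-check of the header layout described above, the field
%% widths sum to the stated 32 bytes: 1 (version) + 1 (state bits) +
%% 4 (footer position) + 4 (slot list length) + 4 (helper length) +
%% 14 (spare) + 4 (CRC) = 32.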
|
||||
|
||||
|
||||
-module(leveled_bst).
|
||||
|
||||
-export([start_file/1, convert_header/1]).
|
||||
|
||||
-include_lib("eunit/include/eunit.hrl").
|
||||
|
||||
-define(WORD_SIZE, 4).
|
||||
-define(CURRENT_VERSION, {0,1}).
|
||||
-define(SLOT_COUNT, 128).
|
||||
-define(BLOCK_SIZE, 32).
|
||||
-define(SLOT_SIZE, 32).
|
||||
|
||||
-record(metadata, {version = ?CURRENT_VERSION :: tuple(),
|
||||
mutable = false :: true | false,
|
||||
compressed = true :: true | false,
|
||||
slot_list :: list(),
|
||||
cache :: tuple(),
|
||||
smallest_key :: tuple(),
|
||||
largest_key :: tuple(),
|
||||
smallest_sqn :: integer(),
|
||||
largest_sqn :: integer()
|
||||
}).
|
||||
|
||||
%% Start a bare file with an initial header and no further details
|
||||
%% Return the {Handle, metadata record}
|
||||
start_file(FileName) when is_list(FileName) ->
|
||||
{ok, Handle} = file:open(FileName, [binary, raw, read, write]),
|
||||
start_file(Handle);
|
||||
start_file(Handle) ->
|
||||
Header = create_header(initial),
|
||||
{ok, _} = file:position(Handle, bof),
|
||||
file:write(Handle, Header),
|
||||
{Version, {M, C}, _, _} = convert_header(Header),
|
||||
FileMD = #metadata{version=Version, mutable=M, compressed=C},
|
||||
SlotArray = array:new(?SLOT_COUNT),
|
||||
{Handle, FileMD, SlotArray}.
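Illustrative usage of start_file/1 as defined above (the file name is hypothetical, and per the commit message nothing here is expected to be fully workable yet):

    {Handle, FileMD, SlotArray} = leveled_bst:start_file("example.bst"),
    %% FileMD is a #metadata{} record populated from the freshly written header,
    %% and SlotArray is an empty array of ?SLOT_COUNT (128) slots.
    {0, 1} = FileMD#metadata.version.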
|
||||
|
||||
|
||||
create_header(initial) ->
|
||||
{Major, Minor} = ?CURRENT_VERSION,
|
||||
Version = <<Major:5, Minor:3>>,
|
||||
State = <<0:6, 1:1, 1:1>>, % Mutable and compressed
|
||||
Lengths = <<0:32, 0:32, 0:32>>,
|
||||
Options = <<0:112>>,
|
||||
H1 = <<Version/binary, State/binary, Lengths/binary, Options/binary>>,
|
||||
CRC32 = erlang:crc32(H1),
|
||||
<<H1/binary, CRC32:32/integer>>.
|
||||
|
||||
|
||||
convert_header(Header) ->
|
||||
<<H1:28/binary, CRC32:32/integer>> = Header,
|
||||
case erlang:crc32(H1) of
|
||||
CRC32 ->
|
||||
<<Major:5/integer, Minor:3/integer, _/binary>> = H1,
|
||||
case {Major, Minor} of
|
||||
{0, 1} ->
|
||||
convert_header_v01(H1);
|
||||
_ ->
|
||||
unknown_version
|
||||
end;
|
||||
_ ->
|
||||
crc_mismatch
|
||||
end.
|
||||
|
||||
convert_header_v01(Header) ->
|
||||
<<_:8, 0:6, Mutable:1, Comp:1,
|
||||
FooterP:32/integer, SlotLng:32/integer, HlpLng:32/integer,
|
||||
_/binary>> = Header,
|
||||
case Mutable of
|
||||
1 -> M = true;
|
||||
0 -> M = false
|
||||
end,
|
||||
case Comp of
|
||||
1 -> C = true;
|
||||
0 -> C = false
|
||||
end,
|
||||
{{0, 1}, {M, C}, {FooterP, SlotLng, HlpLng}, none}.
|
||||
|
||||
|
||||
|
||||
|
||||
%%%%%%%%%%%%%%%%
|
||||
% T E S T
|
||||
%%%%%%%%%%%%%%%
|
||||
|
||||
empty_header_test() ->
|
||||
Header = create_header(initial),
|
||||
?assertMatch(32, byte_size(Header)),
|
||||
<<Major:5, Minor:3, _/binary>> = Header,
|
||||
?assertMatch({0, 1}, {Major, Minor}),
|
||||
{Version, State, Lengths, Options} = convert_header(Header),
|
||||
?assertMatch({0, 1}, Version),
|
||||
?assertMatch({true, true}, State),
|
||||
?assertMatch({0, 0, 0}, Lengths),
|
||||
?assertMatch(none, Options).
|
||||
|
||||
bad_header_test() ->
|
||||
Header = create_header(initial),
|
||||
<<_:1/binary, Rest/binary >> = Header,
|
||||
HdrDetails1 = convert_header(<<0:5/integer, 2:3/integer, Rest/binary>>),
|
||||
?assertMatch(crc_mismatch, HdrDetails1),
|
||||
<<_:1/binary, RestToCRC:27/binary, _:32/integer>> = Header,
|
||||
NewHdr1 = <<0:5/integer, 2:3/integer, RestToCRC/binary>>,
|
||||
CRC32 = erlang:crc32(NewHdr1),
|
||||
NewHdr2 = <<NewHdr1/binary, CRC32:32/integer>>,
|
||||
?assertMatch(unknown_version, convert_header(NewHdr2)).
|
||||
|
||||
record_onstartfile_test() ->
|
||||
{_, FileMD, _} = start_file("onstartfile.bst"),
|
||||
?assertMatch({0, 1}, FileMD#metadata.version).
|
||||
|
||||
|
||||
|
||||
|
BIN src/leveled_cdb.beam: new binary file (content not shown)
src/leveled_cdb.erl: new file, 804 lines (@@ -0,0 +1,804 @@)
%%
|
||||
%% This is a modified version of the cdb module provided by Tom Whitcomb.
|
||||
%%
|
||||
%% - https://github.com/thomaswhitcomb/erlang-cdb
|
||||
%%
|
||||
%% The primary differences are:
|
||||
%% - Support for incrementally writing a CDB file while keeping the hash table
|
||||
%% in memory
|
||||
%% - Support for merging of multiple CDB files with a key-checking function to
|
||||
%% allow for compaction
|
||||
%% - Automatic adding of a helper object that will keep a small proportion of
|
||||
%% keys to be used when checking to see if the cdb file is a candidate for
|
||||
%% compaction
|
||||
%% - The ability to scan a database and accumulate all the Key, Values to
|
||||
%% rebuild in-memory tables on startup
|
||||
%%
|
||||
%% This is to be used in eleveleddb, and in this context:
|
||||
%% - Keys will be a Sequence Number
|
||||
%% - Values will be a Checksum; Pointers (length * 3); Key; [Metadata]; [Value]
|
||||
%% where the pointers can be used to extract just part of the value
|
||||
%% (i.e. metadata only)
|
||||
%%
|
||||
%% This module provides functions to create and query a CDB (constant database).
|
||||
%% A CDB implements a two-level hashtable which provides fast {key,value}
|
||||
%% lookups that remain fairly constant in speed regardless of the CDB's size.
|
||||
%%
|
||||
%% The first level in the CDB occupies the first 256 doublewords in the file.
|
||||
%% Each doubleword slot contains two values. The first is a file pointer to
|
||||
%% the primary hashtable (at the end of the file) and the second value is the
|
||||
%% number of entries in the hashtable. The first level table of 256 entries
|
||||
%% is indexed with the lower eight bits of the hash of the input key.
|
||||
%%
|
||||
%% Following the 256 doublewords are the {key,value} tuples. The tuples are
|
||||
%% packed in the file without regard to word boundaries. Each {key,value}
|
||||
%% tuple is represented with a four byte key length, a four byte value length,
|
||||
%% the actual key value followed by the actual value.
|
||||
%%
|
||||
%% Following the {key,value} tuples are the primary hash tables. There are
|
||||
%% at most 256 hash tables. Each hash table is referenced by one of the 256
|
||||
%% doubleword entries at the top of the file. For efficiency reasons, each
|
||||
%% hash table is allocated twice the number of entries that it will need.
|
||||
%% Each entry in the hash table is a doubleword.
|
||||
%% The first word is the corresponding hash value and the second word is a
|
||||
%% file pointer to the actual {key,value} tuple higher in the file.
|
||||
%%
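For orientation, a minimal usage sketch of the exported API below (file name hypothetical): create/2 writes a complete CDB, and get/2 performs a CRC-checked lookup, returning missing for absent keys.

    ok = leveled_cdb:create("example.cdb", [{"key1", "value1"}, {"key2", "value2"}]),
    {"key1", "value1"} = leveled_cdb:get("example.cdb", "key1"),
    missing = leveled_cdb:get("example.cdb", "unknownkey").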
|
||||
|
||||
-module(leveled_cdb).
|
||||
|
||||
-export([from_dict/2,
|
||||
create/2,
|
||||
dump/1,
|
||||
get/2,
|
||||
get_mem/3,
|
||||
put/4,
|
||||
open_active_file/1,
|
||||
get_nextkey/1,
|
||||
get_nextkey/2]).
|
||||
|
||||
-include_lib("eunit/include/eunit.hrl").
|
||||
|
||||
-define(DWORD_SIZE, 8).
|
||||
-define(WORD_SIZE, 4).
|
||||
-define(CRC_CHECK, true).
|
||||
|
||||
%%
|
||||
%% from_dict(FileName,ListOfKeyValueTuples)
|
||||
%% Given a filename and a dictionary, create a cdb
|
||||
%% using the key value pairs from the dict.
|
||||
%%
|
||||
%% @spec from_dict(filename(),dictionary()) -> ok
|
||||
%% where
|
||||
%% filename() = string(),
|
||||
%% dictionary() = dict()
|
||||
%%
|
||||
from_dict(FileName,Dict) ->
|
||||
KeyValueList = dict:to_list(Dict),
|
||||
create(FileName, KeyValueList).
|
||||
|
||||
%%
|
||||
%% create(FileName,ListOfKeyValueTuples) -> ok
|
||||
%% Given a filename and a list of {key,value} tuples,
|
||||
%% this function creates a CDB
|
||||
%%
|
||||
create(FileName,KeyValueList) ->
|
||||
{ok, Handle} = file:open(FileName, [write]),
|
||||
{ok, _} = file:position(Handle, {bof, 2048}),
|
||||
{BasePos, HashTree} = write_key_value_pairs(Handle, KeyValueList),
|
||||
io:format("KVs has been written to base position ~w~n", [BasePos]),
|
||||
L2 = write_hash_tables(Handle, HashTree),
|
||||
io:format("Index list output of ~w~n", [L2]),
|
||||
write_top_index_table(Handle, BasePos, L2),
|
||||
file:close(Handle).
|
||||
|
||||
%%
|
||||
%% dump(FileName) -> List
|
||||
%% Given a file name, this function returns a list
|
||||
%% of {key,value} tuples from the CDB.
|
||||
%%
|
||||
%%
|
||||
%% @spec dump(filename()) -> key_value_list()
|
||||
%% where
|
||||
%% filename() = string(),
|
||||
%% key_value_list() = [{key,value}]
|
||||
dump(FileName) ->
|
||||
dump(FileName, ?CRC_CHECK).
|
||||
|
||||
dump(FileName, CRCCheck) ->
|
||||
{ok, Handle} = file:open(FileName, [binary,raw]),
|
||||
Fn = fun(Index, Acc) ->
|
||||
{ok, _} = file:position(Handle, ?DWORD_SIZE * Index),
|
||||
{_, Count} = read_next_2_integers(Handle),
|
||||
Acc + Count
|
||||
end,
|
||||
NumberOfPairs = lists:foldl(Fn, 0, lists:seq(0,255)) bsr 1,
|
||||
io:format("Count of keys in db is ~w~n", [NumberOfPairs]),
|
||||
|
||||
{ok, _} = file:position(Handle, {bof, 2048}),
|
||||
Fn1 = fun(_I,Acc) ->
|
||||
{KL,VL} = read_next_2_integers(Handle),
|
||||
Key = read_next_string(Handle, KL),
|
||||
case read_next_string(Handle, VL, crc, CRCCheck) of
|
||||
{false, _} ->
|
||||
{ok, CurrLoc} = file:position(Handle, cur),
|
||||
Return = {crc_wonky, get(Handle, Key)};
|
||||
{_, Value} ->
|
||||
{ok, CurrLoc} = file:position(Handle, cur),
|
||||
Return = case get(Handle, Key) of
|
||||
{Key,Value} -> {Key ,Value};
|
||||
X -> {wonky, X}
|
||||
end
|
||||
end,
|
||||
{ok, _} = file:position(Handle, CurrLoc),
|
||||
[Return | Acc]
|
||||
end,
|
||||
lists:foldr(Fn1,[],lists:seq(0,NumberOfPairs-1)).
|
||||
|
||||
%% Open an active file - one for which it is assumed the hash tables have not
|
||||
%% yet been written
|
||||
%%
|
||||
%% Needs to scan over file to incrementally produce the hash list, starting at
|
||||
%% the end of the top index table.
|
||||
%%
|
||||
%% Should return a dictionary keyed by index containing a list of {Hash, Pos}
|
||||
%% tuples as the write_key_value_pairs function, and the current position, and
|
||||
%% the file handle
|
||||
open_active_file(FileName) when is_list(FileName) ->
|
||||
{ok, Handle} = file:open(FileName, [binary, raw, read, write]),
|
||||
{ok, Position} = file:position(Handle, {bof, 256*?DWORD_SIZE}),
|
||||
{LastPosition, HashTree} = scan_over_file(Handle, Position),
|
||||
case file:position(Handle, eof) of
|
||||
{ok, LastPosition} ->
|
||||
ok = file:close(Handle);
|
||||
{ok, _} ->
|
||||
LogDetails = [LastPosition, file:position(Handle, eof)],
|
||||
io:format("File to be truncated at last position of"
|
||||
"~w with end of file at ~w~n", LogDetails),
|
||||
{ok, LastPosition} = file:position(Handle, LastPosition),
|
||||
ok = file:truncate(Handle),
|
||||
ok = file:close(Handle)
|
||||
end,
|
||||
{LastPosition, HashTree}.
|
||||
|
||||
%% put(Handle, Key, Value, {LastPosition, HashDict}) -> {NewPosition, KeyDict}
|
||||
%% Append to an active file a new key/value pair returning an updated
|
||||
%% dictionary of Keys and positions. Returns an updated Position
|
||||
%%
|
||||
put(FileName, Key, Value, {LastPosition, HashTree}) when is_list(FileName) ->
|
||||
{ok, Handle} = file:open(FileName,
|
||||
[binary, raw, read, write, delayed_write]),
|
||||
put(Handle, Key, Value, {LastPosition, HashTree});
|
||||
put(Handle, Key, Value, {LastPosition, HashTree}) ->
|
||||
Bin = key_value_to_record({Key, Value}), % create binary for Key and Value
|
||||
ok = file:pwrite(Handle, LastPosition, Bin),
|
||||
{LastPosition + byte_size(Bin), put_hashtree(Key, LastPosition, HashTree)}.
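Illustrative only - how the active-file functions chain together for an append, mirroring activewrite_singlewrite_test near the end of this module (file name hypothetical):

    ok = leveled_cdb:from_dict("active.cdb", dict:store("K1", "V1", dict:new())),
    {LastPos, HashTree} = leveled_cdb:open_active_file("active.cdb"),
    {_NewPos, HashTree1} = leveled_cdb:put("active.cdb", "K2", "V2", {LastPos, HashTree}),
    {"K2", "V2"} = leveled_cdb:get_mem("K2", "active.cdb", HashTree1).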
|
||||
|
||||
|
||||
%%
|
||||
%% get(FileName,Key) -> {key,value}
|
||||
%% Given a filename and a key, returns a key and value tuple.
|
||||
%%
|
||||
get(FileNameOrHandle, Key) ->
|
||||
get(FileNameOrHandle, Key, ?CRC_CHECK).
|
||||
|
||||
get(FileName, Key, CRCCheck) when is_list(FileName), is_list(Key) ->
|
||||
{ok,Handle} = file:open(FileName,[binary,raw]),
|
||||
get(Handle,Key, CRCCheck);
|
||||
|
||||
get(Handle, Key, CRCCheck) when is_tuple(Handle), is_list(Key) ->
|
||||
Hash = hash(Key),
|
||||
Index = hash_to_index(Hash),
|
||||
{ok,_} = file:position(Handle, {bof, ?DWORD_SIZE * Index}),
|
||||
% Get location of hashtable and number of entries in the hash
|
||||
{HashTable, Count} = read_next_2_integers(Handle),
|
||||
% If the count is 0 for that index - key must be missing
|
||||
case Count of
|
||||
0 ->
|
||||
missing;
|
||||
_ ->
|
||||
% Get starting slot in hashtable
|
||||
{ok, FirstHashPosition} = file:position(Handle, {bof, HashTable}),
|
||||
Slot = hash_to_slot(Hash, Count),
|
||||
{ok, _} = file:position(Handle, {cur, Slot * ?DWORD_SIZE}),
|
||||
LastHashPosition = HashTable + ((Count-1) * ?DWORD_SIZE),
|
||||
LocList = lists:seq(FirstHashPosition, LastHashPosition, ?DWORD_SIZE),
|
||||
% Split list around starting slot.
|
||||
{L1, L2} = lists:split(Slot, LocList),
|
||||
search_hash_table(Handle, lists:append(L2, L1), Hash, Key, CRCCheck)
|
||||
end.
|
||||
|
||||
%% Get a Key/Value pair from an active CDB file (with no hash table written)
|
||||
%% This requires a key dictionary to be passed in (mapping keys to positions)
|
||||
%% Will return {Key, Value} or missing
|
||||
get_mem(Key, Filename, HashTree) when is_list(Filename) ->
|
||||
{ok, Handle} = file:open(Filename, [binary, raw, read]),
|
||||
get_mem(Key, Handle, HashTree);
|
||||
get_mem(Key, Handle, HashTree) ->
|
||||
extract_kvpair(Handle, get_hashtree(Key, HashTree), Key).
|
||||
|
||||
%% Get the next key at a position in the file (or the first key if no position
|
||||
%% is passed). Will return both a key and the next position
|
||||
get_nextkey(Filename) when is_list(Filename) ->
|
||||
{ok, Handle} = file:open(Filename, [binary, raw, read]),
|
||||
get_nextkey(Handle);
|
||||
get_nextkey(Handle) ->
|
||||
{ok, _} = file:position(Handle, bof),
|
||||
{FirstHashPosition, _} = read_next_2_integers(Handle),
|
||||
get_nextkey(Handle, {256 * ?DWORD_SIZE, FirstHashPosition}).
|
||||
|
||||
get_nextkey(Handle, {Position, FirstHashPosition}) ->
|
||||
{ok, Position} = file:position(Handle, Position),
|
||||
case read_next_2_integers(Handle) of
|
||||
{KeyLength, ValueLength} ->
|
||||
NextKey = read_next_string(Handle, KeyLength),
|
||||
NextPosition = Position + KeyLength + ValueLength + ?DWORD_SIZE,
|
||||
case NextPosition of
|
||||
FirstHashPosition ->
|
||||
{NextKey, nomorekeys};
|
||||
_ ->
|
||||
{NextKey, Handle, {NextPosition, FirstHashPosition}}
|
||||
end;
|
||||
eof ->
|
||||
nomorekeys
|
||||
end.
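Illustrative only - walking every key in a completed CDB file with get_nextkey, as exercised by getnextkey_test below (file name hypothetical):

    {FirstKey, Handle, Pos1} = leveled_cdb:get_nextkey("example.cdb"),
    {SecondKey, Handle, Pos2} = leveled_cdb:get_nextkey(Handle, Pos1),
    %% ... keep passing the returned position back in until the result is
    %% {LastKey, nomorekeys} (or just nomorekeys for an empty file).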
|
||||
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%
|
||||
%% Internal functions
|
||||
%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
%% Fetch a list of positions by passing a key to the HashTree
|
||||
get_hashtree(Key, HashTree) ->
|
||||
Hash = hash(Key),
|
||||
Index = hash_to_index(Hash),
|
||||
Tree = array:get(Index, HashTree),
|
||||
case gb_trees:lookup(Hash, Tree) of
|
||||
{value, List} ->
|
||||
List;
|
||||
_ ->
|
||||
[]
|
||||
end.
|
||||
|
||||
%% Add to hash tree - this is an array of 256 gb_trees that contains the Hash
|
||||
%% and position of objects which have been added to an open CDB file
|
||||
put_hashtree(Key, Position, HashTree) ->
|
||||
Hash = hash(Key),
|
||||
Index = hash_to_index(Hash),
|
||||
Tree = array:get(Index, HashTree),
|
||||
case gb_trees:lookup(Hash, Tree) of
|
||||
none ->
|
||||
array:set(Index, gb_trees:insert(Hash, [Position], Tree), HashTree);
|
||||
{value, L} ->
|
||||
array:set(Index, gb_trees:update(Hash, [Position|L], Tree), HashTree)
|
||||
end.
|
||||
|
||||
%% Function to extract a Key-Value pair given a file handle and a position
|
||||
%% Will confirm that the key matches and do a CRC check when requested
|
||||
extract_kvpair(Handle, Positions, Key) ->
|
||||
extract_kvpair(Handle, Positions, Key, ?CRC_CHECK).
|
||||
|
||||
extract_kvpair(_, [], _, _) ->
|
||||
missing;
|
||||
extract_kvpair(Handle, [Position|Rest], Key, Check) ->
|
||||
{ok, _} = file:position(Handle, Position),
|
||||
{KeyLength, ValueLength} = read_next_2_integers(Handle),
|
||||
case read_next_string(Handle, KeyLength) of
|
||||
Key -> % If same key as passed in, then found!
|
||||
case read_next_string(Handle, ValueLength, crc, Check) of
|
||||
{false, _} ->
|
||||
crc_wonky;
|
||||
{_, Value} ->
|
||||
{Key,Value}
|
||||
end;
|
||||
_ ->
|
||||
extract_kvpair(Handle, Rest, Key, Check)
|
||||
end.
|
||||
|
||||
%% Scan through the file until there is a failure to crc check an input, and
|
||||
%% at that point return the position and the key dictionary scanned so far
|
||||
scan_over_file(Handle, Position) ->
|
||||
HashTree = array:new(256, {default, gb_trees:empty()}),
|
||||
scan_over_file(Handle, Position, HashTree).
|
||||
|
||||
scan_over_file(Handle, Position, HashTree) ->
|
||||
case read_next_2_integers(Handle) of
|
||||
{KeyLength, ValueLength} ->
|
||||
Key = read_next_string(Handle, KeyLength),
|
||||
{ok, ValueAsBin} = file:read(Handle, ValueLength),
|
||||
case crccheck_value(ValueAsBin) of
|
||||
true ->
|
||||
NewPosition = Position + KeyLength + ValueLength + ?DWORD_SIZE,
|
||||
scan_over_file(Handle, NewPosition,
|
||||
put_hashtree(Key, Position, HashTree));
|
||||
false ->
|
||||
io:format("CRC check returned false on key of ~w ~n", [Key]),
|
||||
{Position, HashTree}
|
||||
end;
|
||||
eof ->
|
||||
{Position, HashTree}
|
||||
end.
|
||||
|
||||
%% The first four bytes of the value are the crc check
|
||||
crccheck_value(Value) when byte_size(Value) >4 ->
|
||||
<< Hash:32/integer, Tail/bitstring>> = Value,
|
||||
case calc_crc(Tail) of
|
||||
Hash ->
|
||||
true;
|
||||
_ ->
|
||||
io:format("CRC check failed due to mismatch ~n"),
|
||||
false
|
||||
end;
|
||||
crccheck_value(_) ->
|
||||
io:format("CRC check failed due to size ~n"),
|
||||
false.
|
||||
|
||||
%% Run a crc check, padding out any values which don't fit on a byte boundary
|
||||
calc_crc(Value) ->
|
||||
case bit_size(Value) rem 8 of
|
||||
0 ->
|
||||
erlang:crc32(Value);
|
||||
N ->
|
||||
M = 8 - N,
|
||||
erlang:crc32(<<Value/bitstring,0:M>>)
|
||||
end.
|
||||
|
||||
%%
|
||||
%% to_dict(FileName)
|
||||
%% Given a filename returns a dict containing
|
||||
%% the key value pairs from the dict.
|
||||
%%
|
||||
%% @spec to_dict(filename()) -> dictionary()
|
||||
%% where
|
||||
%% filename() = string(),
|
||||
%% dictionary() = dict()
|
||||
%%
|
||||
to_dict(FileName) ->
|
||||
KeyValueList = dump(FileName),
|
||||
dict:from_list(KeyValueList).
|
||||
|
||||
read_next_string(Handle, Length) ->
|
||||
{ok, Bin} = file:read(Handle, Length),
|
||||
binary_to_list(Bin).
|
||||
|
||||
%% Read next string where the string has a CRC prepended - stripping the crc
|
||||
%% and checking if requested
|
||||
read_next_string(Handle, Length, crc, Check) ->
|
||||
case Check of
|
||||
true ->
|
||||
{ok, <<CRC:32/integer, Bin/binary>>} = file:read(Handle, Length),
|
||||
case calc_crc(Bin) of
|
||||
CRC ->
|
||||
{true, binary_to_list(Bin)};
|
||||
_ ->
|
||||
{false, binary_to_list(Bin)}
|
||||
end;
|
||||
_ ->
|
||||
{ok, _} = file:position(Handle, {cur, 4}),
|
||||
{ok, Bin} = file:read(Handle, Length - 4),
|
||||
{unchecked, binary_to_list(Bin)}
|
||||
end.
|
||||
|
||||
|
||||
%% Used for reading lengths
|
||||
%% Note that the endian_flip is required to make the file format compatible
|
||||
%% with CDB
|
||||
read_next_2_integers(Handle) ->
|
||||
case file:read(Handle,?DWORD_SIZE) of
|
||||
{ok, <<Int1:32,Int2:32>>} ->
|
||||
{endian_flip(Int1), endian_flip(Int2)};
|
||||
MatchError
|
||||
->
|
||||
MatchError
|
||||
end.
|
||||
|
||||
%% Search the hash table for the matching hash and key. Be prepared for
|
||||
%% multiple keys to have the same hash value.
|
||||
search_hash_table(_Handle, [], _Hash, _Key, _CRCCHeck) ->
|
||||
missing;
|
||||
search_hash_table(Handle, [Entry|RestOfEntries], Hash, Key, CRCCheck) ->
|
||||
{ok, _} = file:position(Handle, Entry),
|
||||
{StoredHash, DataLoc} = read_next_2_integers(Handle),
|
||||
io:format("looking in data location ~w~n", [DataLoc]),
|
||||
case StoredHash of
|
||||
Hash ->
|
||||
KV = extract_kvpair(Handle, [DataLoc], Key, CRCCheck),
|
||||
case KV of
|
||||
missing ->
|
||||
search_hash_table(Handle, RestOfEntries, Hash, Key, CRCCheck);
|
||||
_ ->
|
||||
KV
|
||||
end;
|
||||
0 ->
|
||||
% Hash is 0 so key must be missing as 0 found before Hash matched
|
||||
missing;
|
||||
_ ->
|
||||
search_hash_table(Handle, RestOfEntries, Hash, Key, CRCCheck)
|
||||
end.
|
||||
|
||||
% Write Key and Value tuples into the CDB. Each tuple consists of a
|
||||
% 4 byte key length, a 4 byte value length, the actual key followed
|
||||
% by the value.
|
||||
%
|
||||
% Returns a dictionary that is keyed by
|
||||
% the least significant 8 bits of each hash with the
|
||||
% values being a list of the hash and the position of the
|
||||
% key/value binary in the file.
|
||||
write_key_value_pairs(Handle, KeyValueList) ->
|
||||
{ok, Position} = file:position(Handle, cur),
|
||||
HashTree = array:new(256, {default, gb_trees:empty()}),
|
||||
write_key_value_pairs(Handle, KeyValueList, {Position, HashTree}).
|
||||
|
||||
write_key_value_pairs(_, [], Acc) ->
|
||||
Acc;
|
||||
write_key_value_pairs(Handle, [HeadPair|TailList], Acc) ->
|
||||
{Key, Value} = HeadPair,
|
||||
{NewPosition, HashTree} = put(Handle, Key, Value, Acc),
|
||||
write_key_value_pairs(Handle, TailList, {NewPosition, HashTree}).
|
||||
|
||||
%% Write the actual hashtables at the bottom of the file. Each hash table
|
||||
%% entry is a doubleword in length. The first word is the hash value
|
||||
%% corresponding to a key and the second word is a file pointer to the
|
||||
%% corresponding {key,value} tuple.
|
||||
write_hash_tables(Handle, HashTree) ->
|
||||
Seq = lists:seq(0, 255),
|
||||
{ok, StartPos} = file:position(Handle, cur),
|
||||
write_hash_tables(Seq, Handle, HashTree, StartPos, []).
|
||||
|
||||
write_hash_tables([], Handle, _, StartPos, IndexList) ->
|
||||
{ok, EndPos} = file:position(Handle, cur),
|
||||
ok = file:advise(Handle, StartPos, EndPos - StartPos, will_need),
|
||||
IndexList;
|
||||
write_hash_tables([Index|Rest], Handle, HashTree, StartPos, IndexList) ->
|
||||
Tree = array:get(Index, HashTree),
|
||||
case gb_trees:keys(Tree) of
|
||||
[] ->
|
||||
write_hash_tables(Rest, Handle, HashTree, StartPos, IndexList);
|
||||
_ ->
|
||||
HashList = gb_trees:to_list(Tree),
|
||||
BinList = build_binaryhashlist(HashList, []),
|
||||
IndexLength = length(BinList) * 2,
|
||||
SlotList = lists:duplicate(IndexLength, <<0:32, 0:32>>),
|
||||
|
||||
Fn = fun({Hash, Binary}, AccSlotList) ->
|
||||
Slot1 = find_open_slot(AccSlotList, Hash),
|
||||
{L1, [<<0:32, 0:32>>|L2]} = lists:split(Slot1, AccSlotList),
|
||||
lists:append(L1, [Binary|L2])
|
||||
end,
|
||||
NewSlotList = lists:foldl(Fn, SlotList, BinList),
|
||||
|
||||
{ok, CurrPos} = file:position(Handle, cur),
|
||||
file:write(Handle, NewSlotList),
|
||||
write_hash_tables(Rest, Handle, HashTree, StartPos,
|
||||
[{Index, CurrPos, IndexLength}|IndexList])
|
||||
end.
|
||||
|
||||
%% The list created from the original HashTree may have duplicate positions
|
||||
%% e.g. {Key, [Value1, Value2]}. Before any writing is done it is necessary
|
||||
%% to know the actual number of hashes - or the Slot may not be sized correctly
|
||||
%%
|
||||
%% This function creates {Hash, Binary} pairs on a list where there is a unique
|
||||
%% entry for every Key/Value
|
||||
build_binaryhashlist([], BinList) ->
|
||||
BinList;
|
||||
build_binaryhashlist([{Hash, [Position|TailP]}|TailKV], BinList) ->
|
||||
HashLE = endian_flip(Hash),
|
||||
PosLE = endian_flip(Position),
|
||||
NewBin = <<HashLE:32, PosLE:32>>,
|
||||
case TailP of
|
||||
[] ->
|
||||
build_binaryhashlist(TailKV, [{Hash, NewBin}|BinList]);
|
||||
_ ->
|
||||
build_binaryhashlist([{Hash, TailP}|TailKV], [{Hash, NewBin}|BinList])
|
||||
end.
|
||||
|
||||
%% Slot is zero based because it comes from a REM
|
||||
find_open_slot(List, Hash) ->
|
||||
Len = length(List),
|
||||
Slot = hash_to_slot(Hash, Len),
|
||||
Seq = lists:seq(1, Len),
|
||||
{CL1, CL2} = lists:split(Slot, Seq),
|
||||
{L1, L2} = lists:split(Slot, List),
|
||||
find_open_slot1(lists:append(CL2, CL1), lists:append(L2, L1)).
|
||||
|
||||
find_open_slot1([Slot|_RestOfSlots], [<<0:32,0:32>>|_RestOfEntries]) ->
|
||||
Slot - 1;
|
||||
find_open_slot1([_|RestOfSlots], [_|RestOfEntries]) ->
|
||||
find_open_slot1(RestOfSlots, RestOfEntries).
|
||||
|
||||
|
||||
%% Write the top-most 256 doubleword entries. The first word is the
|
||||
%% file pointer to a hashtable and the second word is the number of entries
|
||||
%% in the hash table
|
||||
%% The List passed in should be made up of {Index, Position, Count} tuples
|
||||
write_top_index_table(Handle, BasePos, List) ->
|
||||
% fold function to find any missing index tuples, and add a replacement
|
||||
% in this case with a count of 0. Also orders the list by index
|
||||
FnMakeIndex = fun(I, Acc) ->
|
||||
case lists:keysearch(I, 1, List) of
|
||||
{value, Tuple} ->
|
||||
[Tuple|Acc];
|
||||
false ->
|
||||
[{I, BasePos, 0}|Acc]
|
||||
end
|
||||
end,
|
||||
% Fold function to write the index entries
|
||||
FnWriteIndex = fun({Index, Pos, Count}, CurrPos) ->
|
||||
{ok, _} = file:position(Handle, ?DWORD_SIZE * Index),
|
||||
case Count == 0 of
|
||||
true ->
|
||||
PosLE = endian_flip(CurrPos),
|
||||
NextPos = CurrPos;
|
||||
false ->
|
||||
PosLE = endian_flip(Pos),
|
||||
NextPos = Pos + (Count * ?DWORD_SIZE)
|
||||
end,
|
||||
CountLE = endian_flip(Count),
|
||||
Bin = <<PosLE:32, CountLE:32>>,
|
||||
file:write(Handle, Bin),
|
||||
NextPos
|
||||
end,
|
||||
|
||||
Seq = lists:seq(0, 255),
|
||||
CompleteList = lists:keysort(1, lists:foldl(FnMakeIndex, [], Seq)),
|
||||
lists:foldl(FnWriteIndex, BasePos, CompleteList),
|
||||
ok = file:advise(Handle, 0, ?DWORD_SIZE * 256, will_need).
|
||||
|
||||
|
||||
endian_flip(Int) ->
|
||||
<<X:32/unsigned-little-integer>> = <<Int:32>>,
|
||||
X.
|
||||
|
||||
hash(Key) ->
|
||||
H = 5381,
|
||||
hash1(H,Key) band 16#FFFFFFFF.
|
||||
|
||||
hash1(H,[]) ->H;
|
||||
hash1(H,[B|Rest]) ->
|
||||
H1 = H * 33,
|
||||
H2 = H1 bxor B,
|
||||
hash1(H2,Rest).
|
||||
|
||||
% Get the least significant 8 bits from the hash.
|
||||
hash_to_index(Hash) ->
|
||||
Hash band 255.
|
||||
|
||||
hash_to_slot(Hash,L) ->
|
||||
(Hash bsr 8) rem L.
|
||||
|
||||
%% Create a binary of the key length, value length, key and value, adding a CRC check
|
||||
%% at the front of the value
|
||||
key_value_to_record({Key,Value}) ->
|
||||
L1 = endian_flip(length(Key)),
|
||||
L2 = endian_flip(length(Value) + 4),
|
||||
LB1 = list_to_binary(Key),
|
||||
LB2 = list_to_binary(Value),
|
||||
CRC = calc_crc(LB2),
|
||||
<<L1:32,L2:32,LB1/binary,CRC:32/integer,LB2/binary>>.
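As a worked example of the on-disk record produced above, key_value_to_record({"key1", "value1"}) yields: a 4-byte little-endian key length (4), a 4-byte little-endian value length (10, i.e. 6 bytes of value plus the 4-byte CRC), the key bytes, the big-endian CRC32 of the value bytes, then the value bytes - 22 bytes in total (which is why the second record in write_key_value_pairs_1_test below lands at position 22).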
|
||||
|
||||
%%%%%%%%%%%%%%%%
|
||||
% T E S T
|
||||
%%%%%%%%%%%%%%%
|
||||
|
||||
hash_1_test() ->
|
||||
Hash = hash("key1"),
|
||||
?assertMatch(Hash,2088047427).
|
||||
|
||||
hash_to_index_1_test() ->
|
||||
Hash = hash("key1"),
|
||||
Index = hash_to_index(Hash),
|
||||
?assertMatch(Index,67).
|
||||
|
||||
hash_to_index_2_test() ->
|
||||
Hash = 256,
|
||||
I = hash_to_index(Hash),
|
||||
?assertMatch(I,0).
|
||||
|
||||
hash_to_index_3_test() ->
|
||||
Hash = 268,
|
||||
I = hash_to_index(Hash),
|
||||
?assertMatch(I,12).
|
||||
|
||||
hash_to_index_4_test() ->
|
||||
Hash = hash("key2"),
|
||||
Index = hash_to_index(Hash),
|
||||
?assertMatch(Index,64).
|
||||
|
||||
write_key_value_pairs_1_test() ->
|
||||
{ok,Handle} = file:open("test.cdb",write),
|
||||
{_, HashTree} = write_key_value_pairs(Handle,[{"key1","value1"},{"key2","value2"}]),
|
||||
Hash1 = hash("key1"),
|
||||
Index1 = hash_to_index(Hash1),
|
||||
Hash2 = hash("key2"),
|
||||
Index2 = hash_to_index(Hash2),
|
||||
R0 = array:new(256, {default, gb_trees:empty()}),
|
||||
R1 = array:set(Index1, gb_trees:insert(Hash1, [0], array:get(Index1, R0)), R0),
|
||||
R2 = array:set(Index2, gb_trees:insert(Hash2, [22], array:get(Index2, R1)), R1),
|
||||
?assertMatch(R2, HashTree).
|
||||
|
||||
|
||||
write_hash_tables_1_test() ->
|
||||
{ok, Handle} = file:open("test.cdb",write),
|
||||
R0 = array:new(256, {default, gb_trees:empty()}),
|
||||
R1 = array:set(64, gb_trees:insert(6383014720, [18], array:get(64, R0)), R0),
|
||||
R2 = array:set(67, gb_trees:insert(6383014723, [0], array:get(67, R1)), R1),
|
||||
Result = write_hash_tables(Handle, R2),
|
||||
io:format("write hash tables result of ~w ~n", [Result]),
|
||||
?assertMatch(Result,[{67,16,2},{64,0,2}]).
|
||||
|
||||
find_open_slot_1_test() ->
|
||||
List = [<<1:32,1:32>>,<<0:32,0:32>>,<<1:32,1:32>>,<<1:32,1:32>>],
|
||||
Slot = find_open_slot(List,0),
|
||||
?assertMatch(Slot,1).
|
||||
|
||||
find_open_slot_2_test() ->
|
||||
List = [<<0:32,0:32>>,<<0:32,0:32>>,<<1:32,1:32>>,<<1:32,1:32>>],
|
||||
Slot = find_open_slot(List,0),
|
||||
?assertMatch(Slot,0).
|
||||
|
||||
find_open_slot_3_test() ->
|
||||
List = [<<1:32,1:32>>,<<1:32,1:32>>,<<1:32,1:32>>,<<0:32,0:32>>],
|
||||
Slot = find_open_slot(List,2),
|
||||
?assertMatch(Slot,3).
|
||||
|
||||
find_open_slot_4_test() ->
|
||||
List = [<<0:32,0:32>>,<<1:32,1:32>>,<<1:32,1:32>>,<<1:32,1:32>>],
|
||||
Slot = find_open_slot(List,1),
|
||||
?assertMatch(Slot,0).
|
||||
|
||||
find_open_slot_5_test() ->
|
||||
List = [<<1:32,1:32>>,<<1:32,1:32>>,<<0:32,0:32>>,<<1:32,1:32>>],
|
||||
Slot = find_open_slot(List,3),
|
||||
?assertMatch(Slot,2).
|
||||
|
||||
full_1_test() ->
|
||||
List1 = lists:sort([{"key1","value1"},{"key2","value2"}]),
|
||||
create("simple.cdb",lists:sort([{"key1","value1"},{"key2","value2"}])),
|
||||
List2 = lists:sort(dump("simple.cdb")),
|
||||
?assertMatch(List1,List2).
|
||||
|
||||
full_2_test() ->
|
||||
List1 = lists:sort([{lists:flatten(io_lib:format("~s~p",[Prefix,Plug])),
|
||||
lists:flatten(io_lib:format("value~p",[Plug]))}
|
||||
|| Plug <- lists:seq(1,2000),
|
||||
Prefix <- ["dsd","so39ds","oe9%#*(","020dkslsldclsldowlslf%$#",
|
||||
"tiep4||","qweq"]]),
|
||||
create("full.cdb",List1),
|
||||
List2 = lists:sort(dump("full.cdb")),
|
||||
?assertMatch(List1,List2).
|
||||
|
||||
from_dict_test() ->
|
||||
D = dict:new(),
|
||||
D1 = dict:store("a","b",D),
|
||||
D2 = dict:store("c","d",D1),
|
||||
ok = from_dict("from_dict_test.cdb",D2),
|
||||
io:format("Store created ~n", []),
|
||||
KVP = lists:sort(dump("from_dict_test.cdb")),
|
||||
D3 = lists:sort(dict:to_list(D2)),
|
||||
io:format("KVP is ~w~n", [KVP]),
|
||||
io:format("D3 is ~w~n", [D3]),
|
||||
?assertMatch(KVP,D3).
|
||||
|
||||
to_dict_test() ->
|
||||
D = dict:new(),
|
||||
D1 = dict:store("a","b",D),
|
||||
D2 = dict:store("c","d",D1),
|
||||
ok = from_dict("from_dict_test.cdb",D2),
|
||||
Dict = to_dict("from_dict_test.cdb"),
|
||||
D3 = lists:sort(dict:to_list(D2)),
|
||||
D4 = lists:sort(dict:to_list(Dict)),
|
||||
?assertMatch(D4,D3).
|
||||
|
||||
crccheck_emptyvalue_test() ->
|
||||
?assertMatch(false, crccheck_value(<<>>)).
|
||||
|
||||
crccheck_shortvalue_test() ->
|
||||
Value = <<128,128,32>>,
|
||||
?assertMatch(false, crccheck_value(Value)).
|
||||
|
||||
crccheck_justshortvalue_test() ->
|
||||
Value = <<128,128,32,64>>,
|
||||
?assertMatch(false, crccheck_value(Value)).
|
||||
|
||||
crccheck_correctvalue_test() ->
|
||||
Value = term_to_binary("some text as value"),
|
||||
Hash = erlang:crc32(Value),
|
||||
ValueOnDisk = <<Hash:32/integer, Value/binary>>,
|
||||
?assertMatch(true, crccheck_value(ValueOnDisk)).
|
||||
|
||||
crccheck_wronghash_test() ->
|
||||
Value = term_to_binary("some text as value"),
|
||||
Hash = erlang:crc32(Value) + 1,
|
||||
ValueOnDisk = <<Hash:32/integer, Value/binary>>,
|
||||
?assertMatch(false, crccheck_value(ValueOnDisk)).
|
||||
|
||||
crccheck_truncatedvalue_test() ->
|
||||
Value = term_to_binary("some text as value"),
|
||||
Hash = erlang:crc32(Value),
|
||||
ValueOnDisk = <<Hash:32/integer, Value/binary>>,
|
||||
Size = bit_size(ValueOnDisk) - 1,
|
||||
<<TruncatedValue:Size/bitstring, _/bitstring>> = ValueOnDisk,
|
||||
?assertMatch(false, crccheck_value(TruncatedValue)).
|
||||
|
||||
activewrite_singlewrite_test() ->
|
||||
Key = "0002",
|
||||
Value = "some text as new value",
|
||||
InitialD = dict:new(),
|
||||
InitialD1 = dict:store("0001", "Initial value", InitialD),
|
||||
ok = from_dict("test_mem.cdb", InitialD1),
|
||||
io:format("New db file created ~n", []),
|
||||
{LastPosition, KeyDict} = open_active_file("test_mem.cdb"),
|
||||
io:format("File opened as new active file "
|
||||
"with LastPosition=~w ~n", [LastPosition]),
|
||||
{_, UpdKeyDict} = put("test_mem.cdb", Key, Value, {LastPosition, KeyDict}),
|
||||
io:format("New key and value added to active file ~n", []),
|
||||
?assertMatch({Key, Value}, get_mem(Key, "test_mem.cdb", UpdKeyDict)).
|
||||
|
||||
search_hash_table_findinslot_test() ->
|
||||
Key1 = "key1", % this is in slot 3 if count is 8
|
||||
D = dict:from_list([{Key1, "value1"}, {"K2", "V2"}, {"K3", "V3"},
|
||||
{"K4", "V4"}, {"K5", "V5"}, {"K6", "V6"}, {"K7", "V7"},
|
||||
{"K8", "V8"}]),
|
||||
ok = from_dict("hashtable1_test.cdb",D),
|
||||
{ok, Handle} = file:open("hashtable1_test.cdb", [binary, raw, read, write]),
|
||||
Hash = hash(Key1),
|
||||
Index = hash_to_index(Hash),
|
||||
{ok, _} = file:position(Handle, {bof, ?DWORD_SIZE*Index}),
|
||||
{HashTable, Count} = read_next_2_integers(Handle),
|
||||
io:format("Count of ~w~n", [Count]),
|
||||
{ok, FirstHashPosition} = file:position(Handle, {bof, HashTable}),
|
||||
Slot = hash_to_slot(Hash, Count),
|
||||
io:format("Slot of ~w~n", [Slot]),
|
||||
{ok, _} = file:position(Handle, {cur, Slot * ?DWORD_SIZE}),
|
||||
{ReadH3, ReadP3} = read_next_2_integers(Handle),
|
||||
{ReadH4, ReadP4} = read_next_2_integers(Handle),
|
||||
io:format("Slot 1 has Hash ~w Position ~w~n", [ReadH3, ReadP3]),
|
||||
io:format("Slot 2 has Hash ~w Position ~w~n", [ReadH4, ReadP4]),
|
||||
?assertMatch(0, ReadH4),
|
||||
?assertMatch({"key1", "value1"}, get(Handle, Key1)),
|
||||
{ok, _} = file:position(Handle, FirstHashPosition),
|
||||
FlipH3 = endian_flip(ReadH3),
|
||||
FlipP3 = endian_flip(ReadP3),
|
||||
RBin = <<FlipH3:32/integer, FlipP3:32/integer, 0:32/integer, 0:32/integer>>,
|
||||
io:format("Replacement binary of ~w~n", [RBin]),
|
||||
{ok, OldBin} = file:pread(Handle,
|
||||
FirstHashPosition + (Slot -1) * ?DWORD_SIZE, 16),
|
||||
io:format("Bin to be replaced is ~w ~n", [OldBin]),
|
||||
ok = file:pwrite(Handle, FirstHashPosition + (Slot -1) * ?DWORD_SIZE, RBin),
|
||||
ok = file:close(Handle),
|
||||
io:format("Find key following change to hash table~n"),
|
||||
?assertMatch(missing, get("hashtable1_test.cdb", Key1)).
|
||||
|
||||
getnextkey_test() ->
|
||||
L = [{"K9", "V9"}, {"K2", "V2"}, {"K3", "V3"},
|
||||
{"K4", "V4"}, {"K5", "V5"}, {"K6", "V6"}, {"K7", "V7"},
|
||||
{"K8", "V8"}, {"K1", "V1"}],
|
||||
ok = create("hashtable1_test.cdb", L),
|
||||
{FirstKey, Handle, P1} = get_nextkey("hashtable1_test.cdb"),
|
||||
io:format("Next position details of ~w~n", [P1]),
|
||||
?assertMatch("K9", FirstKey),
|
||||
{SecondKey, Handle, P2} = get_nextkey(Handle, P1),
|
||||
?assertMatch("K2", SecondKey),
|
||||
{_, Handle, P3} = get_nextkey(Handle, P2),
|
||||
{_, Handle, P4} = get_nextkey(Handle, P3),
|
||||
{_, Handle, P5} = get_nextkey(Handle, P4),
|
||||
{_, Handle, P6} = get_nextkey(Handle, P5),
|
||||
{_, Handle, P7} = get_nextkey(Handle, P6),
|
||||
{_, Handle, P8} = get_nextkey(Handle, P7),
|
||||
{LastKey, Info} = get_nextkey(Handle, P8),
|
||||
?assertMatch(nomorekeys, Info),
|
||||
?assertMatch("K1", LastKey).
|
||||
|
||||
newactivefile_test() ->
|
||||
{LastPosition, _} = open_active_file("activefile_test.cdb"),
|
||||
?assertMatch(256 * ?DWORD_SIZE, LastPosition),
|
||||
Response = get_nextkey("activefile_test.cdb"),
|
||||
?assertMatch(nomorekeys, Response).
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
BIN src/leveled_internal.beam: new binary file (content not shown)
src/leveled_internal.erl: new file, 118 lines (@@ -0,0 +1,118 @@)
-module(leveled_internal).
|
||||
-export([termiterator/6]).
|
||||
-include_lib("eunit/include/eunit.hrl").
|
||||
|
||||
|
||||
%% We will have a sorted list of terms
|
||||
%% Some terms will be dummy terms which are pointers to more terms which can be found
|
||||
%% If a pointer is hit need to replenish the term list before proceeding
|
||||
%%
|
||||
%% HelperFuns should be a tuple of three functions - {FolderFun, CompareFun, PointerCheck}
|
||||
%% FolderFun - function which takes the next item and the accumulator and returns an updated accumulator
|
||||
%% CompareFun - function which should be able to compare two keys (which are not pointers)
|
||||
%% PointerCheck - function for differentiating between keys and pointers
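For orientation, the shape of a call using the index-key helpers defined at the bottom of this module (mirroring the unit test there; StartKey/EndKey/SortedKeyList are placeholders):

    HelperFuns = {fun folder_indexkey/2, fun compare_indexkey/2, fun pointercheck_indexkey/1},
    Keys = termiterator(null, SortedKeyList, [], HelperFuns, StartKey, EndKey).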
|
||||
|
||||
termiterator(HeadItem, [], Acc, HelperFuns, _StartKey, _EndKey) ->
|
||||
io:format("Reached empty list with head item of ~w~n", [HeadItem]),
|
||||
case HeadItem of
|
||||
null ->
|
||||
Acc;
|
||||
_ ->
|
||||
{FolderFun, _, _} = HelperFuns,
|
||||
FolderFun(Acc, HeadItem)
|
||||
end;
|
||||
termiterator(null, [NextItem|TailList], Acc, HelperFuns, StartKey, EndKey) ->
|
||||
%% Check that the NextItem is not a pointer before promoting to HeadItem
|
||||
%% Cannot now promote a HeadItem which is a pointer
|
||||
{_, _, PointerCheck} = HelperFuns,
|
||||
case PointerCheck(NextItem) of
|
||||
{true, Pointer} ->
|
||||
NewSlice = getnextslice(Pointer, EndKey),
|
||||
ExtendedList = lists:merge(NewSlice, TailList),
|
||||
termiterator(null, ExtendedList, Acc, HelperFuns, StartKey, EndKey);
|
||||
false ->
|
||||
termiterator(NextItem, TailList, Acc, HelperFuns, StartKey, EndKey)
|
||||
end;
|
||||
termiterator(HeadItem, [NextItem|TailList], Acc, HelperFuns, StartKey, EndKey) ->
|
||||
io:format("Checking head item of ~w~n", [HeadItem]),
|
||||
{FolderFun, CompareFun, PointerCheck} = HelperFuns,
|
||||
%% HeadItem cannot be pointer, but NextItem might be, so check before comparison
|
||||
case PointerCheck(NextItem) of
|
||||
{true, Pointer} ->
|
||||
NewSlice = getnextslice(Pointer, EndKey),
|
||||
ExtendedList = lists:merge(NewSlice, [NextItem|TailList]),
|
||||
termiterator(null, ExtendedList, Acc, HelperFuns, StartKey, EndKey);
|
||||
false ->
|
||||
%% Compare to see if Head and Next match, or if Head is a winner to be added
|
||||
%% to accumulator
|
||||
case CompareFun(HeadItem, NextItem) of
|
||||
{match, StrongItem, _WeakItem} ->
|
||||
%% Discard WeakItem
|
||||
termiterator(StrongItem, TailList, Acc, HelperFuns, StartKey, EndKey);
|
||||
{winner, HeadItem} ->
|
||||
%% Add the head item to the accumulator, and proceed with the next item as head
|
||||
AccPlus = FolderFun(Acc, HeadItem),
|
||||
termiterator(NextItem, TailList, AccPlus, HelperFuns, HeadItem, EndKey)
|
||||
end
|
||||
end.
|
||||
|
||||
|
||||
|
||||
pointercheck_indexkey(IndexKey) ->
|
||||
case IndexKey of
|
||||
{i, _Bucket, _Index, _Term, _Key, _Sequence, {zpointer, Pointer}} ->
|
||||
{true, Pointer};
|
||||
_ ->
|
||||
false
|
||||
end.
|
||||
|
||||
folder_indexkey(Acc, IndexKey) ->
|
||||
io:format("Folding index key of - ~w~n", [IndexKey]),
|
||||
case IndexKey of
|
||||
{i, _Bucket, _Index, _Term, _Key, _Sequence, tombstone} ->
|
||||
Acc;
|
||||
{i, _Bucket, _Index, _Term, Key, _Sequence, null} ->
|
||||
io:format("Adding key ~s~n", [Key]),
|
||||
lists:append(Acc, [Key])
|
||||
end.
|
||||
|
||||
compare_indexkey(IndexKey1, IndexKey2) ->
|
||||
{i, Bucket1, Index1, Term1, Key1, Sequence1, _Value1} = IndexKey1,
|
||||
{i, Bucket2, Index2, Term2, Key2, Sequence2, _Value2} = IndexKey2,
|
||||
case {Bucket1, Index1, Term1, Key1} of
|
||||
{Bucket2, Index2, Term2, Key2} when Sequence1 >= Sequence2 ->
|
||||
{match, IndexKey1, IndexKey2};
|
||||
{Bucket2, Index2, Term2, Key2} ->
|
||||
{match, IndexKey2, IndexKey1};
|
||||
_ when IndexKey2 >= IndexKey1 ->
|
||||
{winner, IndexKey1};
|
||||
_ ->
|
||||
{winner, IndexKey2}
|
||||
end.
|
||||
|
||||
|
||||
getnextslice(Pointer, _EndKey) ->
|
||||
case Pointer of
|
||||
{test, NewList} ->
|
||||
NewList;
|
||||
_ ->
|
||||
[]
|
||||
end.
|
||||
|
||||
|
||||
%% Unit tests
|
||||
|
||||
|
||||
iterateoverindexkeyswithnopointer_test_() ->
|
||||
Key1 = {i, "pdsRecord", "familyName_bin", "1972SMITH", "10001", 1, null},
|
||||
Key2 = {i, "pdsRecord", "familyName_bin", "1972SMITH", "10001", 2, tombstone},
|
||||
Key3 = {i, "pdsRecord", "familyName_bin", "1971SMITH", "10002", 2, null},
|
||||
Key4 = {i, "pdsRecord", "familyName_bin", "1972JONES", "10003", 2, null},
|
||||
KeyList = lists:sort([Key1, Key2, Key3, Key4]),
|
||||
HelperFuns = {fun folder_indexkey/2, fun compare_indexkey/2, fun pointercheck_indexkey/1},
|
||||
ResultList = ["10002", "10003"],
|
||||
?_assertEqual(ResultList, termiterator(null, KeyList, [], HelperFuns, "1971", "1973")).
|
||||
|
||||
|
||||
|
||||
|
BIN src/onstartfile.bst: new binary file (content not shown)
src/rice.erl: new file, 155 lines (@@ -0,0 +1,155 @@)
-module(rice).
|
||||
-export([encode/1,
|
||||
encode/2,
|
||||
checkforhash/2,
|
||||
converttohash/1]).
|
||||
-include_lib("eunit/include/eunit.hrl").
|
||||
|
||||
%% Factor is the power of 2 representing the expected normal gap size between
|
||||
%% members of the hash, and therefore the size of the bitstring to represent the
|
||||
%% remainder for the gap
|
||||
%%
|
||||
%% The encoded output should contain a single byte which is the Factor, followed
|
||||
%% by a series of exponents and remainders.
|
||||
%%
|
||||
%% The exponent is n 1's followed by a 0, where n * (2 ^ Factor) + remainder
|
||||
%% represents the gap to the next hash
|
||||
%%
|
||||
%% The size passed in should be the maximum possible value of the hash.
|
||||
%% If this isn't provided - assumes 2^32 - the default for phash2
|
||||
|
||||
encode(HashList) ->
|
||||
encode(HashList, 4 * 1024 * 1024 * 1024).
|
||||
|
||||
encode(HashList, Size) ->
|
||||
SortedHashList = lists:usort(HashList),
|
||||
ExpectedGapSize = Size div length(SortedHashList),
|
||||
Factor = findpowerundergap(ExpectedGapSize),
|
||||
riceencode(SortedHashList, Factor).
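A worked example of the encoding, matching encode_test_ and check_test_ at the end of this module: encode([24,924], 1024) gives an expected gap of 1024 div 2 = 512, so Factor = 9 and Divisor = 512. The first gap (24) encodes as exponent 0 (a single 0 bit) plus 24 as a 9-bit remainder; the second gap (900) encodes as exponent 1 (bits 10) plus remainder 388 in 9 bits. Prefixed with the factor byte, the output is <<9, 6, 44, 4:5>>, and checkforhash(924, <<9, 6, 44, 4:5>>) walks the same bits back to true.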
|
||||
|
||||
%% Outcome may be suboptimal if lists have not been de-duplicated
|
||||
%% Will fail on an unsorted list
|
||||
|
||||
riceencode(HashList, Factor) when Factor<256 ->
|
||||
Divisor = powtwo(Factor),
|
||||
riceencode(HashList, Factor, Divisor, <<>>, 0).
|
||||
|
||||
riceencode([], Factor, _, BitStrAcc, _) ->
|
||||
Prefix = binary:encode_unsigned(Factor),
|
||||
<<Prefix/bytes, BitStrAcc/bitstring>>;
|
||||
riceencode([HeadHash|TailList], Factor, Divisor, BitStrAcc, LastHash) ->
|
||||
HashGap = HeadHash - LastHash,
|
||||
case HashGap of
|
||||
0 ->
|
||||
riceencode(TailList, Factor, Divisor, BitStrAcc, HeadHash);
|
||||
N when N > 0 ->
|
||||
Exponent = buildexponent(HashGap div Divisor),
|
||||
Remainder = HashGap rem Divisor,
|
||||
ExpandedBitStrAcc = <<BitStrAcc/bitstring, Exponent/bitstring, Remainder:Factor>>,
|
||||
riceencode(TailList, Factor, Divisor, ExpandedBitStrAcc, HeadHash)
|
||||
end.
|
||||
|
||||
|
||||
%% Checking for a hash needs to roll through the compressed bloom, decoding until
|
||||
%% the member is found (match!), passed (not matched) or the end of the encoded
|
||||
%% bitstring has been reached (not matched)
|
||||
|
||||
checkforhash(HashToCheck, BitStr) ->
|
||||
<<Factor:8/integer, RiceEncodedBitStr/bitstring>> = BitStr,
|
||||
Divisor = powtwo(Factor),
|
||||
checkforhash(HashToCheck, RiceEncodedBitStr, Factor, Divisor, 0).
|
||||
|
||||
checkforhash(_, <<>>, _, _, _) ->
|
||||
false;
|
||||
checkforhash(HashToCheck, BitStr, Factor, Divisor, Acc) ->
|
||||
[Exponent, BitStrTail] = findexponent(BitStr),
|
||||
[Remainder, BitStrTail2] = findremainder(BitStrTail, Factor),
|
||||
NextHash = Acc + Divisor * Exponent + Remainder,
|
||||
case NextHash of
|
||||
HashToCheck -> true;
|
||||
N when N>HashToCheck -> false;
|
||||
_ -> checkforhash(HashToCheck, BitStrTail2, Factor, Divisor, NextHash)
|
||||
end.
|
||||
|
||||
|
||||
%% Exported functions - currently used only in testing
|
||||
|
||||
converttohash(ItemList) ->
|
||||
converttohash(ItemList, []).
|
||||
|
||||
converttohash([], HashList) ->
|
||||
HashList;
|
||||
converttohash([H|T], HashList) ->
|
||||
converttohash(T, [erlang:phash2(H)|HashList]).
|
||||
|
||||
|
||||
|
||||
%% Helper functions
|
||||
|
||||
buildexponent(Exponent) ->
|
||||
buildexponent(Exponent, <<0:1>>).
|
||||
|
||||
buildexponent(0, OutputBits) ->
|
||||
OutputBits;
|
||||
buildexponent(Exponent, OutputBits) ->
|
||||
buildexponent(Exponent - 1, <<1:1, OutputBits/bitstring>>).
|
||||
|
||||
|
||||
findexponent(BitStr) ->
|
||||
findexponent(BitStr, 0).
|
||||
|
||||
findexponent(BitStr, Acc) ->
|
||||
<<H:1/bitstring, T/bitstring>> = BitStr,
|
||||
case H of
|
||||
<<1:1>> -> findexponent(T, Acc + 1);
|
||||
<<0:1>> -> [Acc, T]
|
||||
end.
|
||||
|
||||
|
||||
findremainder(BitStr, Factor) ->
|
||||
<<Remainder:Factor/integer, BitStrTail/bitstring>> = BitStr,
|
||||
[Remainder, BitStrTail].
|
||||
|
||||
|
||||
powtwo(N) -> powtwo(N, 1).
|
||||
|
||||
powtwo(0, Acc) ->
|
||||
Acc;
|
||||
powtwo(N, Acc) ->
|
||||
powtwo(N-1, Acc * 2).
|
||||
|
||||
%% Helper function for finding the power of two which provides the most
|
||||
%% efficient compression given an average gap size
|
||||
|
||||
findpowerundergap(GapSize) -> findpowerundergap(GapSize, 1, 0).
|
||||
|
||||
findpowerundergap(GapSize, Acc, Counter) ->
|
||||
case Acc of
|
||||
N when N > GapSize -> Counter - 1;
|
||||
_ -> findpowerundergap(GapSize, Acc * 2, Counter + 1)
|
||||
end.
|
||||
|
||||
|
||||
%% Unit tests
|
||||
|
||||
findpowerundergap_test_() ->
|
||||
[
|
||||
?_assertEqual(9, findpowerundergap(700)),
|
||||
?_assertEqual(9, findpowerundergap(512)),
|
||||
?_assertEqual(8, findpowerundergap(511))].
|
||||
|
||||
encode_test_() ->
|
||||
[
|
||||
?_assertEqual(<<9, 6, 44, 4:5>>, encode([24,924], 1024)),
|
||||
?_assertEqual(<<9, 6, 44, 4:5>>, encode([24,24,924], 1024)),
|
||||
?_assertEqual(<<9, 6, 44, 4:5>>, encode([24,924,924], 1024))
|
||||
].
|
||||
|
||||
check_test_() ->
|
||||
[
|
||||
?_assertEqual(true, checkforhash(924, <<9, 6, 44, 4:5>>)),
|
||||
?_assertEqual(true, checkforhash(24, <<9, 6, 44, 4:5>>)),
|
||||
?_assertEqual(false, checkforhash(23, <<9, 6, 44, 4:5>>)),
|
||||
?_assertEqual(false, checkforhash(923, <<9, 6, 44, 4:5>>)),
|
||||
?_assertEqual(false, checkforhash(925, <<9, 6, 44, 4:5>>))
|
||||
].
|
BIN src/simple.cdb: new binary file (content not shown)
BIN src/test.cdb: new binary file (content not shown)
src/test_inconsole.cdb: new empty file (0 lines)
BIN src/test_mem.cdb: new binary file (content not shown)
BIN test/lookup_test.beam: new binary file (content not shown)
test/lookup_test.erl: new file, 241 lines (@@ -0,0 +1,241 @@)
-module(lookup_test).
|
||||
|
||||
-export([go_dict/1, go_ets/1, go_gbtree/1,
|
||||
go_arrayofdict/1, go_arrayofgbtree/1, go_arrayofdict_withcache/1]).
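These micro-benchmarks are intended to be run from the shell and timed externally (e.g. with timer:tc/3); each go_* function returns the VM memory and garbage-collection statistics on completion. Illustrative call:

    {Memory, GCStats} = lookup_test:go_gbtree(1000000).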
|
||||
|
||||
-define(CACHE_SIZE, 512).
|
||||
|
||||
hash(Key) ->
|
||||
H = 5381,
|
||||
hash1(H,Key) band 16#FFFFFFFF.
|
||||
|
||||
hash1(H,[]) ->H;
|
||||
hash1(H,[B|Rest]) ->
|
||||
H1 = H * 33,
|
||||
H2 = H1 bxor B,
|
||||
hash1(H2,Rest).
|
||||
|
||||
% Get the least significant 8 bits from the hash.
|
||||
hash_to_index(Hash) ->
|
||||
Hash band 255.
|
||||
|
||||
|
||||
%%
|
||||
%% Timings (microseconds):
|
||||
%%
|
||||
%% go_dict(200000) : 1569894
|
||||
%% go_dict(1000000) : 17191365
|
||||
%% go_dict(5000000) : forever
|
||||
|
||||
go_dict(N) ->
|
||||
go_dict(dict:new(), N, N).
|
||||
|
||||
go_dict(_, 0, _) ->
|
||||
{erlang:memory(), statistics(garbage_collection)};
|
||||
go_dict(D, N, M) ->
|
||||
% Lookup a random key - which may not be present
|
||||
LookupKey = lists:concat(["key-", random:uniform(M)]),
|
||||
LookupHash = hash(LookupKey),
|
||||
dict:find(LookupHash, D),
|
||||
|
||||
% Add a new key - which may be present so value to be appended
|
||||
Key = lists:concat(["key-", N]),
|
||||
Hash = hash(Key),
|
||||
case dict:find(Hash, D) of
|
||||
error ->
|
||||
go_dict(dict:store(Hash, [N], D), N-1, M);
|
||||
{ok, List} ->
|
||||
go_dict(dict:store(Hash, [N|List], D), N-1, M)
|
||||
end.
|
||||
|
||||
|
||||
|
||||
%%
|
||||
%% Timings (microseconds):
|
||||
%%
|
||||
%% go_ets(200000) : 609119
|
||||
%% go_ets(1000000) : 3520757
|
||||
%% go_ets(5000000) : 19974562
|
||||
|
||||
go_ets(N) ->
|
||||
go_ets(ets:new(ets_test, [private, bag]), N, N).
|
||||
|
||||
go_ets(_, 0, _) ->
|
||||
{erlang:memory(), statistics(garbage_collection)};
|
||||
go_ets(Ets, N, M) ->
|
||||
% Lookup a random key - which may not be present
|
||||
LookupKey = lists:concat(["key-", random:uniform(M)]),
|
||||
LookupHash = hash(LookupKey),
|
||||
ets:lookup(Ets, LookupHash),
|
||||
|
||||
% Add a new key - which may be present so value to be appended
|
||||
Key = lists:concat(["key-", N]),
|
||||
Hash = hash(Key),
|
||||
ets:insert(Ets, {Hash, N}),
|
||||
go_ets(Ets, N - 1, M).
|
||||
|
||||
%%
|
||||
%% Timings (microseconds):
|
||||
%%
|
||||
%% go_gbtree(200000) : 1393936
|
||||
%% go_gbtree(1000000) : 8430997
|
||||
%% go_gbtree(5000000) : 45630810
|
||||
|
||||
go_gbtree(N) ->
|
||||
go_gbtree(gb_trees:empty(), N, N).
|
||||
|
||||
go_gbtree(_, 0, _) ->
|
||||
{erlang:memory(), statistics(garbage_collection)};
|
||||
go_gbtree(Tree, N, M) ->
|
||||
% Lookup a random key - which may not be present
|
||||
LookupKey = lists:concat(["key-", random:uniform(M)]),
|
||||
LookupHash = hash(LookupKey),
|
||||
gb_trees:lookup(LookupHash, Tree),
|
||||
|
||||
% Add a new key - which may be present so value to be appended
|
||||
Key = lists:concat(["key-", N]),
|
||||
Hash = hash(Key),
|
||||
case gb_trees:lookup(Hash, Tree) of
|
||||
none ->
|
||||
go_gbtree(gb_trees:insert(Hash, [N], Tree), N - 1, M);
|
||||
{value, List} ->
|
||||
go_gbtree(gb_trees:update(Hash, [N|List], Tree), N - 1, M)
|
||||
end.
|
||||
|
||||
|
||||
%%
|
||||
%% Timings (microseconds):
|
||||
%%
|
||||
%% go_arrayofdict(200000) : 1266931
|
||||
%% go_arrayofdict(1000000) : 7387219
|
||||
%% go_arrayofdict(5000000) : 49511484
|
||||
|
||||
go_arrayofdict(N) ->
|
||||
go_arrayofdict(array:new(256, {default, dict:new()}), N, N).
|
||||
|
||||
go_arrayofdict(_, 0, _) ->
|
||||
% dict:to_list(array:get(0, Array)),
|
||||
% dict:to_list(array:get(1, Array)),
|
||||
% dict:to_list(array:get(2, Array)),
|
||||
% dict:to_list(array:get(3, Array)),
|
||||
% dict:to_list(array:get(4, Array)),
|
||||
% dict:to_list(array:get(5, Array)),
|
||||
% dict:to_list(array:get(6, Array)),
|
||||
% dict:to_list(array:get(7, Array)),
|
||||
% dict:to_list(array:get(8, Array)),
|
||||
% dict:to_list(array:get(9, Array)),
|
||||
{erlang:memory(), statistics(garbage_collection)};
|
||||
go_arrayofdict(Array, N, M) ->
|
||||
% Lookup a random key - which may not be present
|
||||
LookupKey = lists:concat(["key-", random:uniform(M)]),
|
||||
LookupHash = hash(LookupKey),
|
||||
LookupIndex = hash_to_index(LookupHash),
|
||||
dict:find(LookupHash, array:get(LookupIndex, Array)),
|
||||
|
||||
% Add a new key - which may be present so value to be appended
|
||||
Key = lists:concat(["key-", N]),
|
||||
Hash = hash(Key),
|
||||
Index = hash_to_index(Hash),
|
||||
D = array:get(Index, Array),
|
||||
case dict:find(Hash, D) of
|
||||
error ->
|
||||
go_arrayofdict(array:set(Index,
|
||||
dict:store(Hash, [N], D), Array), N-1, M);
|
||||
{ok, List} ->
|
||||
go_arrayofdict(array:set(Index,
|
||||
dict:store(Hash, [N|List], D), Array), N-1, M)
|
||||
end.
|
||||
|
||||
%%
|
||||
%% Timings (microseconds):
|
||||
%%
|
||||
%% go_arrayofgbtree(200000) : 1176224
|
||||
%% go_arrayofgbtree(1000000) : 7480653
|
||||
%% go_arrayofgbtree(5000000) : 41266701
|
||||
|
||||
go_arrayofgbtree(N) ->
|
||||
go_arrayofgbtree(array:new(256, {default, gb_trees:empty()}), N, N).
|
||||
|
||||
go_arrayofgbtree(_, 0, _) ->
|
||||
% gb_trees:to_list(array:get(0, Array)),
|
||||
% gb_trees:to_list(array:get(1, Array)),
|
||||
% gb_trees:to_list(array:get(2, Array)),
|
||||
% gb_trees:to_list(array:get(3, Array)),
|
||||
% gb_trees:to_list(array:get(4, Array)),
|
||||
% gb_trees:to_list(array:get(5, Array)),
|
||||
% gb_trees:to_list(array:get(6, Array)),
|
||||
% gb_trees:to_list(array:get(7, Array)),
|
||||
% gb_trees:to_list(array:get(8, Array)),
|
||||
% gb_trees:to_list(array:get(9, Array)),
|
||||
{erlang:memory(), statistics(garbage_collection)};
|
||||
go_arrayofgbtree(Array, N, M) ->
|
||||
% Lookup a random key - which may not be present
|
||||
LookupKey = lists:concat(["key-", random:uniform(M)]),
|
||||
LookupHash = hash(LookupKey),
|
||||
LookupIndex = hash_to_index(LookupHash),
|
||||
gb_trees:lookup(LookupHash, array:get(LookupIndex, Array)),
|
||||
|
||||
% Add a new key - which may be present so value to be appended
|
||||
Key = lists:concat(["key-", N]),
|
||||
Hash = hash(Key),
|
||||
Index = hash_to_index(Hash),
|
||||
Tree = array:get(Index, Array),
|
||||
case gb_trees:lookup(Hash, Tree) of
|
||||
none ->
|
||||
go_arrayofgbtree(array:set(Index,
|
||||
gb_trees:insert(Hash, [N], Tree), Array), N - 1, M);
|
||||
{value, List} ->
|
||||
go_arrayofgbtree(array:set(Index,
|
||||
gb_trees:update(Hash, [N|List], Tree), Array), N - 1, M)
|
||||
end.
|
||||
|
||||
|
||||
%%
|
||||
%% Timings (microseconds):
|
||||
%%
|
||||
%% go_arrayofdict_withcache(200000) : 1432951
|
||||
%% go_arrayofdict_withcache(1000000) : 9140169
|
||||
%% go_arrayofdict_withcache(5000000) : 59435511
|
||||
|
||||
go_arrayofdict_withcache(N) ->
|
||||
go_arrayofdict_withcache({array:new(256, {default, dict:new()}),
|
||||
array:new(256, {default, dict:new()})}, N, N).
|
||||
|
||||
go_arrayofdict_withcache(_, 0, _) ->
|
||||
{erlang:memory(), statistics(garbage_collection)};
|
||||
go_arrayofdict_withcache({MArray, CArray}, N, M) ->
|
||||
% Lookup a random key - which may not be present
|
||||
LookupKey = lists:concat(["key-", random:uniform(M)]),
|
||||
LookupHash = hash(LookupKey),
|
||||
LookupIndex = hash_to_index(LookupHash),
|
||||
dict:find(LookupHash, array:get(LookupIndex, CArray)),
|
||||
dict:find(LookupHash, array:get(LookupIndex, MArray)),
|
||||
|
||||
% Add a new key - which may be present so value to be appended
|
||||
Key = lists:concat(["key-", N]),
|
||||
Hash = hash(Key),
|
||||
Index = hash_to_index(Hash),
|
||||
Cache = array:get(Index, CArray),
|
||||
case dict:find(Hash, Cache) of
|
||||
error ->
|
||||
UpdCache = dict:store(Hash, [N], Cache);
|
||||
{ok, _} ->
|
||||
UpdCache = dict:append(Hash, N, Cache)
|
||||
end,
|
||||
case dict:size(UpdCache) of
|
||||
?CACHE_SIZE ->
|
||||
UpdCArray = array:set(Index, dict:new(), CArray),
|
||||
UpdMArray = array:set(Index, dict:merge(fun merge_values/3, UpdCache, array:get(Index, MArray)), MArray),
|
||||
go_arrayofdict_withcache({UpdMArray, UpdCArray}, N - 1, M);
|
||||
_ ->
|
||||
UpdCArray = array:set(Index, UpdCache, CArray),
|
||||
go_arrayofdict_withcache({MArray, UpdCArray}, N - 1, M)
|
||||
end.
|
||||
|
||||
|
||||
|
||||
merge_values(_, Value1, Value2) ->
|
||||
lists:append(Value1, Value2).
|
||||
|
||||
|
||||
|