From 52e21de298b97a18fe554628a6ca54140cd1ab6f Mon Sep 17 00:00:00 2001 From: martinsumner Date: Mon, 12 Dec 2016 21:47:09 +0000 Subject: [PATCH 1/7] Initial switch to using ETS No real refactor of building hashtables at this stage - just using ETS not an arrary of skiplists --- src/leveled_cdb.erl | 110 ++++++++------------------------------------ 1 file changed, 18 insertions(+), 92 deletions(-) diff --git a/src/leveled_cdb.erl b/src/leveled_cdb.erl index 63777b2..eda2692 100644 --- a/src/leveled_cdb.erl +++ b/src/leveled_cdb.erl @@ -1172,8 +1172,8 @@ write_hash_tables([Index|Rest], HashTree, CurrPos, IndexList, HashTreeBin) -> true -> write_hash_tables(Rest, HashTree, CurrPos, IndexList, HashTreeBin); false -> - HashList = to_list(HashTree, Index), - BinList = build_binaryhashlist(HashList, []), + BinList = to_binarylist(HashTree, Index), + % BinList = build_binaryhashlist(HashList, []), IndexLength = length(BinList) * 2, SlotList = lists:duplicate(IndexLength, <<0:32, 0:32>>), @@ -1195,26 +1195,6 @@ write_hash_tables([Index|Rest], HashTree, CurrPos, IndexList, HashTreeBin) -> NewSlotBin) end. -%% The list created from the original HashTree may have duplicate positions -%% e.g. {Key, [Value1, Value2]}. Before any writing is done it is necessary -%% to know the actual number of hashes - or the Slot may not be sized correctly -%% -%% This function creates {Hash, Binary} pairs on a list where there is a unique -%% entry for eveyr Key/Value -build_binaryhashlist([], BinList) -> - BinList; -build_binaryhashlist([{Hash, [Position|TailP]}|TailKV], BinList) -> - HashLE = endian_flip(Hash), - PosLE = endian_flip(Position), - NewBin = <>, - case TailP of - [] -> - build_binaryhashlist(TailKV, - [{Hash, NewBin}|BinList]); - _ -> - build_binaryhashlist([{Hash, TailP}|TailKV], - [{Hash, NewBin}|BinList]) - end. %% Slot is zero based because it comes from a REM find_open_slot(List, Hash) -> @@ -1319,42 +1299,33 @@ multi_key_value_to_record(KVList, BinaryMode, LastPosition) -> %%%============================================================================ lookup_positions(HashTree, Index, Hash) -> - Tree = array:get(Index, HashTree), - case leveled_skiplist:lookup(Hash, Tree) of - {value, List} -> - List; - _ -> - [] - end. + ConvertObjFun = fun({{_Idx, _H}, P}) -> P end, + lists:map(ConvertObjFun, ets:lookup(HashTree, {Index, Hash})). add_position_tohashtree(HashTree, Index, Hash, Position) -> - Tree = array:get(Index, HashTree), - case leveled_skiplist:lookup(Hash, Tree) of - none -> - array:set(Index, - leveled_skiplist:enter(Hash, [Position], Tree), - HashTree); - {value, L} -> - array:set(Index, - leveled_skiplist:enter(Hash, [Position|L], Tree), - HashTree) - end. + ets:insert(HashTree, {{Index, Hash}, Position}), + HashTree. new_hashtree() -> - array:new(256, {default, leveled_skiplist:empty()}). + ets:new(hashtree, [bag]). is_empty(HashTree, Index) -> - Tree = array:get(Index, HashTree), - case leveled_skiplist:size(Tree) of - 0 -> + case ets:match(HashTree, {{Index, '_'}, '_'}) of + '$end_of_table' -> true; _ -> false end. -to_list(HashTree, Index) -> - Tree = array:get(Index, HashTree), - leveled_skiplist:to_list(Tree). +to_binarylist(HashTree, Index) -> + ConvertObjFun = + fun({{_Idx, Hash}, Position}) -> + HashLE = endian_flip(Hash), + PosLE = endian_flip(Position), + NewBin = <>, + {Hash, NewBin} + end, + lists:map(ConvertObjFun, ets:match_object(HashTree, {{Index, '_'}, '_'})). %%%%%%%%%%%%%%%% % T E S T @@ -1406,51 +1377,6 @@ to_dict(FileName) -> - -write_key_value_pairs_1_test() -> - {ok,Handle} = file:open("../test/test.cdb",[write]), - {_, HashTree} = write_key_value_pairs(Handle, - [{"key1","value1"}, - {"key2","value2"}]), - Hash1 = hash("key1"), - Index1 = hash_to_index(Hash1), - Hash2 = hash("key2"), - Index2 = hash_to_index(Hash2), - R0 = array:new(256, {default, leveled_skiplist:empty()}), - R1 = array:set(Index1, - leveled_skiplist:enter(Hash1, - [0], - array:get(Index1, R0)), - R0), - R2 = array:set(Index2, - leveled_skiplist:enter(Hash2, - [30], - array:get(Index2, R1)), - R1), - io:format("HashTree is ~w~n", [HashTree]), - io:format("Expected HashTree is ~w~n", [R2]), - ?assertMatch(R2, HashTree), - ok = file:delete("../test/test.cdb"). - - -write_hash_tables_1_test() -> - {ok, Handle} = file:open("../test/testx.cdb", [write]), - R0 = array:new(256, {default, leveled_skiplist:empty()}), - R1 = array:set(64, - leveled_skiplist:enter(6383014720, - [18], - array:get(64, R0)), - R0), - R2 = array:set(67, - leveled_skiplist:enter(6383014723, - [0], - array:get(67, R1)), - R1), - Result = write_hash_tables(Handle, R2), - io:format("write hash tables result of ~w ~n", [Result]), - ?assertMatch(Result,[{67,16,2},{64,0,2}]), - ok = file:delete("../test/testx.cdb"). - find_open_slot_1_test() -> List = [<<1:32,1:32>>,<<0:32,0:32>>,<<1:32,1:32>>,<<1:32,1:32>>], Slot = find_open_slot(List,0), From 972a0ee0b9459607524d5461e4679d4f118394a1 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Tue, 13 Dec 2016 02:15:13 +0000 Subject: [PATCH 2/7] Refactor hash table write Less looping and re-looping over list. Uses ordering to build more naturally. --- src/leveled_cdb.erl | 233 ++++++++++++++++++++++++++------------------ 1 file changed, 139 insertions(+), 94 deletions(-) diff --git a/src/leveled_cdb.erl b/src/leveled_cdb.erl index eda2692..2bf6acd 100644 --- a/src/leveled_cdb.erl +++ b/src/leveled_cdb.erl @@ -753,11 +753,7 @@ end. hashtable_calc(HashTree, StartPos) -> Seq = lists:seq(0, 255), SWC = os:timestamp(), - {IndexList, HashTreeBin} = write_hash_tables(Seq, - HashTree, - StartPos, - [], - <<>>), + {IndexList, HashTreeBin} = write_hash_tables(Seq, HashTree, StartPos), leveled_log:log_timer("CDB07", [], SWC), {IndexList, HashTreeBin}. @@ -805,8 +801,8 @@ find_lastkey(Handle, IndexCache) -> scan_index(Handle, IndexCache, {ScanFun, InitAcc}) -> lists:foldl(fun({_X, {Pos, Count}}, Acc) -> - ScanFun(Handle, Pos, Count, Acc) - end, + ScanFun(Handle, Pos, Count, Acc) + end, InitAcc, IndexCache). @@ -1165,68 +1161,11 @@ perform_write_hash_tables(Handle, HashTreeBin, StartPos) -> ok. -write_hash_tables([], _HashTree, _CurrPos, IndexList, HashTreeBin) -> - {IndexList, HashTreeBin}; -write_hash_tables([Index|Rest], HashTree, CurrPos, IndexList, HashTreeBin) -> - case is_empty(HashTree, Index) of - true -> - write_hash_tables(Rest, HashTree, CurrPos, IndexList, HashTreeBin); - false -> - BinList = to_binarylist(HashTree, Index), - % BinList = build_binaryhashlist(HashList, []), - IndexLength = length(BinList) * 2, - SlotList = lists:duplicate(IndexLength, <<0:32, 0:32>>), - - Fn = fun({Hash, Binary}, AccSlotList) -> - Slot1 = find_open_slot(AccSlotList, Hash), - {L1, [<<0:32, 0:32>>|L2]} = lists:split(Slot1, AccSlotList), - lists:append(L1, [Binary|L2]) - end, - - NewSlotList = lists:foldl(Fn, SlotList, BinList), - NewSlotBin = lists:foldl(fun(X, Acc) -> - <> end, - HashTreeBin, - NewSlotList), - write_hash_tables(Rest, - HashTree, - CurrPos + length(NewSlotList) * ?DWORD_SIZE, - [{Index, CurrPos, IndexLength}|IndexList], - NewSlotBin) - end. - - -%% Slot is zero based because it comes from a REM -find_open_slot(List, Hash) -> - Len = length(List), - Slot = hash_to_slot(Hash, Len), - Seq = lists:seq(1, Len), - {CL1, CL2} = lists:split(Slot, Seq), - {L1, L2} = lists:split(Slot, List), - find_open_slot1(lists:append(CL2, CL1), lists:append(L2, L1)). - -find_open_slot1([Slot|_RestOfSlots], [<<0:32,0:32>>|_RestOfEntries]) -> - Slot - 1; -find_open_slot1([_|RestOfSlots], [_|RestOfEntries]) -> - find_open_slot1(RestOfSlots, RestOfEntries). - - %% Write the top most 255 doubleword entries. First word is the %% file pointer to a hashtable and the second word is the number of entries %% in the hash table %% The List passed in should be made up of {Index, Position, Count} tuples -write_top_index_table(Handle, BasePos, List) -> - % fold function to find any missing index tuples, and add one a replacement - % in this case with a count of 0. Also orders the list by index - FnMakeIndex = fun(I) -> - case lists:keysearch(I, 1, List) of - {value, Tuple} -> - Tuple; - false -> - {I, BasePos, 0} - end - end, - % Fold function to write the index entries +write_top_index_table(Handle, BasePos, IndexList) -> FnWriteIndex = fun({_Index, Pos, Count}, {AccBin, CurrPos}) -> case Count == 0 of true -> @@ -1240,11 +1179,9 @@ write_top_index_table(Handle, BasePos, List) -> {<>, NextPos} end, - Seq = lists:seq(0, 255), - CompleteList = lists:keysort(1, lists:map(FnMakeIndex, Seq)), {IndexBin, _Pos} = lists:foldl(FnWriteIndex, {<<>>, BasePos}, - CompleteList), + IndexList), {ok, _} = file:position(Handle, 0), ok = file:write(Handle, IndexBin), ok = file:advise(Handle, 0, ?DWORD_SIZE * 256, will_need), @@ -1317,15 +1254,110 @@ is_empty(HashTree, Index) -> false end. -to_binarylist(HashTree, Index) -> +to_slotmap(HashTree, Index) -> + ObjList = ets:match_object(HashTree, {{Index, '_'}, '_'}), + IndexLength = length(ObjList) * 2, ConvertObjFun = fun({{_Idx, Hash}, Position}) -> HashLE = endian_flip(Hash), PosLE = endian_flip(Position), NewBin = <>, - {Hash, NewBin} + {hash_to_slot(Hash, IndexLength), NewBin} end, - lists:map(ConvertObjFun, ets:match_object(HashTree, {{Index, '_'}, '_'})). + lists:keysort(1, lists:map(ConvertObjFun, ObjList)). + + +build_hashtree_binary(SlotMap, IndexLength) -> + build_hashtree_binary(SlotMap, IndexLength, 0, <<>>). + +build_hashtree_binary([], IdxLen, _SlotPos, Bin) -> + case byte_size(Bin) div ?DWORD_SIZE of + IdxLen -> + Bin; + N when N < IdxLen -> + ZeroLen = (IdxLen - N) * 64, + <> + end; +build_hashtree_binary([{TopSlot, TopBin}|SlotMapTail], IdxLen, SlotPos, Bin) -> + case TopSlot of + SlotPos -> + UpdBin = <>, + build_hashtree_binary(SlotMapTail, + IdxLen, + SlotPos + 1, + UpdBin); + N when N > SlotPos -> + Delta = N - SlotPos, + DeltaLen = Delta * 64, + UpdBin = <>, + build_hashtree_binary(SlotMapTail, + IdxLen, + SlotPos + Delta + 1, + UpdBin); + N when N < SlotPos, SlotPos < IdxLen -> + UpdBin = <>, + build_hashtree_binary(SlotMapTail, + IdxLen, + SlotPos + 1, + UpdBin); + N when N < SlotPos, SlotPos >= IdxLen -> + % Need to wrap round and put in the first empty slot from the + % beginning + Pos = find_firstzero(Bin, 0) * 64, + UpdBin = + case Pos of + 0 -> + <<0:64, Tail/binary>> = Bin, + <>; + _P -> + <> = Bin, + <> + end, + build_hashtree_binary(SlotMapTail, + IdxLen, + SlotPos + 1, + UpdBin) + end. + + +find_firstzero(<>, Pos) -> + case N of + 0 -> + Pos; + _ -> + find_firstzero(TailBin, Pos + 1) + end. + + +write_hash_tables(Indexes, HashTree, CurrPos) -> + write_hash_tables(Indexes, HashTree, CurrPos, CurrPos, [], <<>>). + +write_hash_tables([], _HashTree, _CurrPos, _BasePos, IndexList, HashTreeBin) -> + IL = lists:reverse(IndexList), + {IL, HashTreeBin}; +write_hash_tables([Index|Rest], HashTree, CurrPos, BasePos, + IndexList, HashTreeBin) -> + case is_empty(HashTree, Index) of + true -> + write_hash_tables(Rest, + HashTree, + CurrPos, + BasePos, + [{Index, BasePos, 0}|IndexList], + HashTreeBin); + false -> + SlotMap = to_slotmap(HashTree, Index), + IndexLength = length(SlotMap) * 2, + NewSlotBin = build_hashtree_binary(SlotMap, IndexLength), + write_hash_tables(Rest, + HashTree, + CurrPos + IndexLength * ?DWORD_SIZE, + BasePos, + [{Index, CurrPos, IndexLength}|IndexList], + <>) + end. + + %%%%%%%%%%%%%%%% % T E S T @@ -1374,33 +1406,46 @@ dump(FileName) -> to_dict(FileName) -> KeyValueList = dump(FileName), dict:from_list(KeyValueList). - -find_open_slot_1_test() -> - List = [<<1:32,1:32>>,<<0:32,0:32>>,<<1:32,1:32>>,<<1:32,1:32>>], - Slot = find_open_slot(List,0), - ?assertMatch(Slot,1). +build_hashtree_bunchedatend_binary_test() -> + SlotMap = [{1, <<10:32, 0:32>>}, + {4, <<11:32, 100:32>>}, + {8, <<12:32, 200:32>>}, + {8, <<13:32, 300:32>>}, + {14, <<14:32, 400:32>>}, + {14, <<15:32, 500:32>>}, + {15, <<16:32, 600:32>>}, + {15, <<17:32, 700:32>>}], + Bin = build_hashtree_binary(SlotMap, 16), + ExpBinP1 = <<16:32, 600:32, 10:32, 0:32, 17:32, 700:32, 0:64>>, + ExpBinP2 = <<11:32, 100:32, 0:192, 12:32, 200:32, 13:32, 300:32, 0:256>>, + ExpBinP3 = <<14:32, 400:32, 15:32, 500:32>>, + ExpBin = <>, + ?assertMatch(ExpBin, Bin). -find_open_slot_2_test() -> - List = [<<0:32,0:32>>,<<0:32,0:32>>,<<1:32,1:32>>,<<1:32,1:32>>], - Slot = find_open_slot(List,0), - ?assertMatch(Slot,0). +build_hashtree_bunchedatstart_binary_test() -> + SlotMap = [{1, <<10:32, 0:32>>}, + {2, <<11:32, 100:32>>}, + {3, <<12:32, 200:32>>}, + {4, <<13:32, 300:32>>}, + {5, <<14:32, 400:32>>}, + {6, <<15:32, 500:32>>}, + {7, <<16:32, 600:32>>}, + {8, <<17:32, 700:32>>}], + Bin = build_hashtree_binary(SlotMap, 16), + ExpBinP1 = <<0:64, 10:32, 0:32, 11:32, 100:32, 12:32, 200:32>>, + ExpBinP2 = <<13:32, 300:32, 14:32, 400:32, 15:32, 500:32, 16:32, 600:32>>, + ExpBinP3 = <<17:32, 700:32, 0:448>>, + ExpBin = <>, + ExpSize = byte_size(ExpBin), + ?assertMatch(ExpSize, byte_size(Bin)), + ?assertMatch(ExpBin, Bin). -find_open_slot_3_test() -> - List = [<<1:32,1:32>>,<<1:32,1:32>>,<<1:32,1:32>>,<<0:32,0:32>>], - Slot = find_open_slot(List,2), - ?assertMatch(Slot,3). +find_firstzero_test() -> + Bin = <<1:64/integer, 0:64/integer, 89:64/integer, 72:64/integer>>, + ?assertMatch(1, find_firstzero(Bin, 0)). -find_open_slot_4_test() -> - List = [<<0:32,0:32>>,<<1:32,1:32>>,<<1:32,1:32>>,<<1:32,1:32>>], - Slot = find_open_slot(List,1), - ?assertMatch(Slot,0). - -find_open_slot_5_test() -> - List = [<<1:32,1:32>>,<<1:32,1:32>>,<<0:32,0:32>>,<<1:32,1:32>>], - Slot = find_open_slot(List,3), - ?assertMatch(Slot,2). full_1_test() -> List1 = lists:sort([{"key1","value1"},{"key2","value2"}]), @@ -1683,7 +1728,7 @@ get_keys_byposition_manykeys_test() -> {ok, P2} = cdb_open_reader(F2, #cdb_options{binary_mode=false}), PositionList = cdb_getpositions(P2, all), L1 = length(PositionList), - ?assertMatch(L1, KeyCount), + ?assertMatch(KeyCount, L1), SampleList1 = cdb_getpositions(P2, 10), ?assertMatch(10, length(SampleList1)), From aa2d19df1d4eb3b4134497e18413acf9d7e1341c Mon Sep 17 00:00:00 2001 From: martinsumner Date: Tue, 13 Dec 2016 03:22:40 +0000 Subject: [PATCH 3/7] Revert back to handling list of binaries (but differently) Performance from last commit got worse not better :-( Perhaps better handling all as lists, and then building a binary at the end. --- src/leveled_cdb.erl | 83 +++++++++++++++++++++------------------------ 1 file changed, 39 insertions(+), 44 deletions(-) diff --git a/src/leveled_cdb.erl b/src/leveled_cdb.erl index 2bf6acd..99fce40 100644 --- a/src/leveled_cdb.erl +++ b/src/leveled_cdb.erl @@ -1268,75 +1268,63 @@ to_slotmap(HashTree, Index) -> build_hashtree_binary(SlotMap, IndexLength) -> - build_hashtree_binary(SlotMap, IndexLength, 0, <<>>). + build_hashtree_binary(SlotMap, IndexLength, 0, []). -build_hashtree_binary([], IdxLen, _SlotPos, Bin) -> - case byte_size(Bin) div ?DWORD_SIZE of +build_hashtree_binary([], IdxLen, SlotPos, Bin) -> + case SlotPos of IdxLen -> - Bin; + lists:reverse(Bin); N when N < IdxLen -> ZeroLen = (IdxLen - N) * 64, - <> - end; + lists:reverse([<<0:ZeroLen>>|Bin]) + end; build_hashtree_binary([{TopSlot, TopBin}|SlotMapTail], IdxLen, SlotPos, Bin) -> case TopSlot of - SlotPos -> - UpdBin = <>, - build_hashtree_binary(SlotMapTail, - IdxLen, - SlotPos + 1, - UpdBin); N when N > SlotPos -> - Delta = N - SlotPos, - DeltaLen = Delta * 64, - UpdBin = <>, + D = N - SlotPos, + Bridge = lists:duplicate(D, <<0:64>>) ++ Bin, + UpdBin = [<>|Bridge], build_hashtree_binary(SlotMapTail, IdxLen, - SlotPos + Delta + 1, + SlotPos + D + 1, UpdBin); - N when N < SlotPos, SlotPos < IdxLen -> - UpdBin = <>, + N when N =< SlotPos, SlotPos < IdxLen -> + UpdBin = [<>|Bin], build_hashtree_binary(SlotMapTail, IdxLen, SlotPos + 1, UpdBin); - N when N < SlotPos, SlotPos >= IdxLen -> + N when N < SlotPos, SlotPos == IdxLen -> % Need to wrap round and put in the first empty slot from the % beginning - Pos = find_firstzero(Bin, 0) * 64, - UpdBin = - case Pos of - 0 -> - <<0:64, Tail/binary>> = Bin, - <>; - _P -> - <> = Bin, - <> - end, + Pos = find_firstzero(Bin, length(Bin)), + {LHS, [<<0:64>>|RHS]} = lists:split(Pos - 1, Bin), + UpdBin = lists:append(LHS, [TopBin|RHS]), build_hashtree_binary(SlotMapTail, IdxLen, - SlotPos + 1, + SlotPos, UpdBin) end. -find_firstzero(<>, Pos) -> - case N of - 0 -> +% Search from the tail of the list to find the first zero +find_firstzero(Bin, Pos) -> + case lists:nth(Pos, Bin) of + <<0:64>> -> Pos; _ -> - find_firstzero(TailBin, Pos + 1) + find_firstzero(Bin, Pos - 1) end. write_hash_tables(Indexes, HashTree, CurrPos) -> - write_hash_tables(Indexes, HashTree, CurrPos, CurrPos, [], <<>>). + write_hash_tables(Indexes, HashTree, CurrPos, CurrPos, [], []). -write_hash_tables([], _HashTree, _CurrPos, _BasePos, IndexList, HashTreeBin) -> +write_hash_tables([], _HashTree, _CurrPos, _BasePos, IndexList, HT_BinList) -> IL = lists:reverse(IndexList), - {IL, HashTreeBin}; + {IL, list_to_binary(HT_BinList)}; write_hash_tables([Index|Rest], HashTree, CurrPos, BasePos, - IndexList, HashTreeBin) -> + IndexList, HT_BinList) -> case is_empty(HashTree, Index) of true -> write_hash_tables(Rest, @@ -1344,7 +1332,7 @@ write_hash_tables([Index|Rest], HashTree, CurrPos, BasePos, CurrPos, BasePos, [{Index, BasePos, 0}|IndexList], - HashTreeBin); + HT_BinList); false -> SlotMap = to_slotmap(HashTree, Index), IndexLength = length(SlotMap) * 2, @@ -1354,7 +1342,7 @@ write_hash_tables([Index|Rest], HashTree, CurrPos, BasePos, CurrPos + IndexLength * ?DWORD_SIZE, BasePos, [{Index, CurrPos, IndexLength}|IndexList], - <>) + HT_BinList ++ NewSlotBin) end. @@ -1417,7 +1405,7 @@ build_hashtree_bunchedatend_binary_test() -> {14, <<15:32, 500:32>>}, {15, <<16:32, 600:32>>}, {15, <<17:32, 700:32>>}], - Bin = build_hashtree_binary(SlotMap, 16), + Bin = list_to_binary(build_hashtree_binary(SlotMap, 16)), ExpBinP1 = <<16:32, 600:32, 10:32, 0:32, 17:32, 700:32, 0:64>>, ExpBinP2 = <<11:32, 100:32, 0:192, 12:32, 200:32, 13:32, 300:32, 0:256>>, ExpBinP3 = <<14:32, 400:32, 15:32, 500:32>>, @@ -1433,7 +1421,7 @@ build_hashtree_bunchedatstart_binary_test() -> {6, <<15:32, 500:32>>}, {7, <<16:32, 600:32>>}, {8, <<17:32, 700:32>>}], - Bin = build_hashtree_binary(SlotMap, 16), + Bin = list_to_binary(build_hashtree_binary(SlotMap, 16)), ExpBinP1 = <<0:64, 10:32, 0:32, 11:32, 100:32, 12:32, 200:32>>, ExpBinP2 = <<13:32, 300:32, 14:32, 400:32, 15:32, 500:32, 16:32, 600:32>>, ExpBinP3 = <<17:32, 700:32, 0:448>>, @@ -1443,8 +1431,15 @@ build_hashtree_bunchedatstart_binary_test() -> ?assertMatch(ExpBin, Bin). find_firstzero_test() -> - Bin = <<1:64/integer, 0:64/integer, 89:64/integer, 72:64/integer>>, - ?assertMatch(1, find_firstzero(Bin, 0)). + Bin = [<<1:64/integer>>, <<0:64/integer>>, + <<89:64/integer>>, <<89:64/integer>>, + <<0:64/integer>>, + <<71:64/integer>>, <<72:64/integer>>], + ?assertMatch(5, find_firstzero(Bin, length(Bin))), + {LHS, [<<0:64>>|RHS]} = lists:split(4, Bin), + ?assertMatch([<<1:64/integer>>, <<0:64/integer>>, + <<89:64/integer>>, <<89:64/integer>>], LHS), + ?assertMatch([<<71:64/integer>>, <<72:64/integer>>], RHS). full_1_test() -> From cfc6a6763815cfba1b0d9c3cf3e6dff6c69a4335 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Tue, 13 Dec 2016 12:35:30 +0000 Subject: [PATCH 4/7] Switch to ordered_set Improved performance by a combination of switching to an ordered_set (so a list can be extracted in a sane way), and building the binary from an ordered list. --- src/leveled_cdb.erl | 68 +++++++++++++++++++++++++++++---------------- 1 file changed, 44 insertions(+), 24 deletions(-) diff --git a/src/leveled_cdb.erl b/src/leveled_cdb.erl index 99fce40..958c6e5 100644 --- a/src/leveled_cdb.erl +++ b/src/leveled_cdb.erl @@ -1236,36 +1236,45 @@ multi_key_value_to_record(KVList, BinaryMode, LastPosition) -> %%%============================================================================ lookup_positions(HashTree, Index, Hash) -> - ConvertObjFun = fun({{_Idx, _H}, P}) -> P end, - lists:map(ConvertObjFun, ets:lookup(HashTree, {Index, Hash})). + lookup_positions(HashTree, Index, Hash, -1, []). + +lookup_positions(HashTree, Index, Hash, Pos, PosList) -> + case ets:next(HashTree, {Index, Hash, Pos}) of + {Index, Hash, NewPos} -> + lookup_positions(HashTree, Index, Hash, NewPos, [NewPos|PosList]); + _ -> + PosList + end. add_position_tohashtree(HashTree, Index, Hash, Position) -> - ets:insert(HashTree, {{Index, Hash}, Position}), + ets:insert(HashTree, {{Index, Hash, Position}}), HashTree. new_hashtree() -> - ets:new(hashtree, [bag]). + ets:new(hashtree, [ordered_set]). -is_empty(HashTree, Index) -> - case ets:match(HashTree, {{Index, '_'}, '_'}) of - '$end_of_table' -> - true; +to_list(HashTree, Index) -> + to_list(HashTree, Index, {0, -1}, []). + +to_list(HashTree, Index, {LastHash, LastPos}, Acc) -> + case ets:next(HashTree, {Index, LastHash, LastPos}) of + {Index, Hash, Pos} -> + to_list(HashTree, Index, {Hash, Pos}, [{Hash, Pos}|Acc]); _ -> - false + Acc end. to_slotmap(HashTree, Index) -> - ObjList = ets:match_object(HashTree, {{Index, '_'}, '_'}), - IndexLength = length(ObjList) * 2, + HPList = to_list(HashTree, Index), + IndexLength = length(HPList), ConvertObjFun = - fun({{_Idx, Hash}, Position}) -> + fun({Hash, Position}) -> HashLE = endian_flip(Hash), PosLE = endian_flip(Position), NewBin = <>, {hash_to_slot(Hash, IndexLength), NewBin} end, - lists:keysort(1, lists:map(ConvertObjFun, ObjList)). - + lists:map(ConvertObjFun, HPList). build_hashtree_binary(SlotMap, IndexLength) -> build_hashtree_binary(SlotMap, IndexLength, 0, []). @@ -1318,31 +1327,42 @@ find_firstzero(Bin, Pos) -> write_hash_tables(Indexes, HashTree, CurrPos) -> - write_hash_tables(Indexes, HashTree, CurrPos, CurrPos, [], []). + write_hash_tables(Indexes, HashTree, CurrPos, CurrPos, [], [], {0, 0, 0}). -write_hash_tables([], _HashTree, _CurrPos, _BasePos, IndexList, HT_BinList) -> +write_hash_tables([], _HashTree, _CurrPos, _BasePos, + IndexList, HT_BinList, {T1, T2, T3}) -> + io:format("CDB99 ~w T1 ~w T2 ~w T3 ~w~n", [self(), T1, T2, T3]), IL = lists:reverse(IndexList), {IL, list_to_binary(HT_BinList)}; write_hash_tables([Index|Rest], HashTree, CurrPos, BasePos, - IndexList, HT_BinList) -> - case is_empty(HashTree, Index) of - true -> + IndexList, HT_BinList, Timers) -> + SW1 = os:timestamp(), + SlotMap = to_slotmap(HashTree, Index), + T1 = timer:now_diff(os:timestamp(), SW1) + element(1, Timers), + case SlotMap of + [] -> write_hash_tables(Rest, HashTree, CurrPos, BasePos, [{Index, BasePos, 0}|IndexList], - HT_BinList); - false -> - SlotMap = to_slotmap(HashTree, Index), + HT_BinList, + Timers); + _ -> + SW2 = os:timestamp(), IndexLength = length(SlotMap) * 2, - NewSlotBin = build_hashtree_binary(SlotMap, IndexLength), + SortedMap = lists:keysort(1, SlotMap), + T2 = timer:now_diff(os:timestamp(), SW2) + element(2, Timers), + SW3 = os:timestamp(), + NewSlotBin = build_hashtree_binary(SortedMap, IndexLength), + T3 = timer:now_diff(os:timestamp(), SW3) + element(3, Timers), write_hash_tables(Rest, HashTree, CurrPos + IndexLength * ?DWORD_SIZE, BasePos, [{Index, CurrPos, IndexLength}|IndexList], - HT_BinList ++ NewSlotBin) + HT_BinList ++ NewSlotBin, + {T1, T2, T3}) end. From 52499170c04da837269a87e74cc6999a205d954a Mon Sep 17 00:00:00 2001 From: martinsumner Date: Tue, 13 Dec 2016 12:41:44 +0000 Subject: [PATCH 5/7] Tidy logging following changes Include detailed timings in a permanent log --- src/leveled_cdb.erl | 6 +++--- src/leveled_log.erl | 5 ++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/leveled_cdb.erl b/src/leveled_cdb.erl index 958c6e5..80dbf87 100644 --- a/src/leveled_cdb.erl +++ b/src/leveled_cdb.erl @@ -1330,12 +1330,12 @@ write_hash_tables(Indexes, HashTree, CurrPos) -> write_hash_tables(Indexes, HashTree, CurrPos, CurrPos, [], [], {0, 0, 0}). write_hash_tables([], _HashTree, _CurrPos, _BasePos, - IndexList, HT_BinList, {T1, T2, T3}) -> - io:format("CDB99 ~w T1 ~w T2 ~w T3 ~w~n", [self(), T1, T2, T3]), + IndexList, HT_BinList, {T1, T2, T3}) -> + leveled_log:log("CDB14", [T1, T2, T3]), IL = lists:reverse(IndexList), {IL, list_to_binary(HT_BinList)}; write_hash_tables([Index|Rest], HashTree, CurrPos, BasePos, - IndexList, HT_BinList, Timers) -> + IndexList, HT_BinList, Timers) -> SW1 = os:timestamp(), SlotMap = to_slotmap(HashTree, Index), T1 = timer:now_diff(os:timestamp(), SW1) + element(1, Timers), diff --git a/src/leveled_log.erl b/src/leveled_log.erl index f2306ce..f02d139 100644 --- a/src/leveled_log.erl +++ b/src/leveled_log.erl @@ -258,7 +258,10 @@ {"CDB12", {info, "HashTree written"}}, {"CDB13", - {info, "Write options of ~w"}} + {info, "Write options of ~w"}}, + {"CDB14", + {info, "Microsecond imings for hashtree build of " + ++ "to_list ~w sort ~w build ~w"}} ])). From 8f775a88fdba172bdabe297f934ff69123820cb3 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Tue, 13 Dec 2016 14:06:19 +0000 Subject: [PATCH 6/7] Investigate performance regression Performance has regressed following the hashtable change. Speculation that the hashtable format might not be right, and so there is more cycling around the hashtree. Logging added. --- src/leveled_cdb.erl | 24 ++++++++++++++++++++---- src/leveled_log.erl | 7 ++++--- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/src/leveled_cdb.erl b/src/leveled_cdb.erl index 80dbf87..b90dde5 100644 --- a/src/leveled_cdb.erl +++ b/src/leveled_cdb.erl @@ -1092,9 +1092,14 @@ read_integerpairs(<>, Pairs) -> %% false - don't check the CRC before returning key & value %% loose_presence - confirm that the hash of the key is present -search_hash_table(_Handle, [], _Hash, _Key, _QuickCheck) -> +search_hash_table(Handle, Entries, Hash, Key, QuickCheck) -> + search_hash_table(Handle, Entries, Hash, Key, QuickCheck, 0). + +search_hash_table(_Handle, [], _Hash, _Key, _QuickCheck, CycleCount) -> + log_cyclecount(CycleCount), missing; -search_hash_table(Handle, [Entry|RestOfEntries], Hash, Key, QuickCheck) -> +search_hash_table(Handle, [Entry|RestOfEntries], Hash, Key, + QuickCheck, CycleCount) -> {ok, _} = file:position(Handle, Entry), {StoredHash, DataLoc} = read_next_2_integers(Handle), case StoredHash of @@ -1111,15 +1116,26 @@ search_hash_table(Handle, [Entry|RestOfEntries], Hash, Key, QuickCheck) -> RestOfEntries, Hash, Key, - QuickCheck); + QuickCheck, + CycleCount + 1); _ -> + log_cyclecount(CycleCount), KV end; %0 -> % % Hash is 0 so key must be missing as 0 found before Hash matched % missing; _ -> - search_hash_table(Handle, RestOfEntries, Hash, Key, QuickCheck) + search_hash_table(Handle, RestOfEntries, Hash, Key, + QuickCheck, CycleCount + 1) + end. + +log_cyclecount(CycleCount) -> + if + CycleCount > 8 -> + leveled_log:log("CDB15", [CycleCount]); + true -> + ok end. % Write Key and Value tuples into the CDB. Each tuple consists of a diff --git a/src/leveled_log.erl b/src/leveled_log.erl index f02d139..a5d0559 100644 --- a/src/leveled_log.erl +++ b/src/leveled_log.erl @@ -260,9 +260,10 @@ {"CDB13", {info, "Write options of ~w"}}, {"CDB14", - {info, "Microsecond imings for hashtree build of " - ++ "to_list ~w sort ~w build ~w"}} - + {info, "Microsecond timings for hashtree build of " + ++ "to_list ~w sort ~w build ~w"}}, + {"CDB15", + {info, "Cycle count of ~w in hashtable search higher than expected~n"}} ])). From c8be3bfa469b09d2243b74f4cdb395d0c2929e10 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Tue, 13 Dec 2016 17:02:45 +0000 Subject: [PATCH 7/7] Slot hash corrected When building the hashtree the incorrect IndexLength was being used to calculate the slot - causing many queries to loop all the way round the Index --- src/leveled_cdb.erl | 58 +++++++++++++++++++++++++++++++++++++++------ src/leveled_log.erl | 3 ++- 2 files changed, 53 insertions(+), 8 deletions(-) diff --git a/src/leveled_cdb.erl b/src/leveled_cdb.erl index b90dde5..c8110af 100644 --- a/src/leveled_cdb.erl +++ b/src/leveled_cdb.erl @@ -681,7 +681,7 @@ get(Handle, Key, Cache, QuickCheck) when is_tuple(Handle) -> _ -> % Get starting slot in hashtable {ok, FirstHashPosition} = file:position(Handle, {bof, HashTable}), - Slot = hash_to_slot(Hash, Count), + Slot = hash_to_slot(Hash, Count), {ok, _} = file:position(Handle, {cur, Slot * ?DWORD_SIZE}), LastHashPosition = HashTable + ((Count-1) * ?DWORD_SIZE), LocList = lists:seq(FirstHashPosition, @@ -1095,8 +1095,8 @@ read_integerpairs(<>, Pairs) -> search_hash_table(Handle, Entries, Hash, Key, QuickCheck) -> search_hash_table(Handle, Entries, Hash, Key, QuickCheck, 0). -search_hash_table(_Handle, [], _Hash, _Key, _QuickCheck, CycleCount) -> - log_cyclecount(CycleCount), +search_hash_table(_Handle, [], Hash, _Key, _QuickCheck, CycleCount) -> + log_cyclecount(CycleCount, Hash, missing), missing; search_hash_table(Handle, [Entry|RestOfEntries], Hash, Key, QuickCheck, CycleCount) -> @@ -1119,7 +1119,7 @@ search_hash_table(Handle, [Entry|RestOfEntries], Hash, Key, QuickCheck, CycleCount + 1); _ -> - log_cyclecount(CycleCount), + log_cyclecount(CycleCount, Hash, found), KV end; %0 -> @@ -1130,10 +1130,10 @@ search_hash_table(Handle, [Entry|RestOfEntries], Hash, Key, QuickCheck, CycleCount + 1) end. -log_cyclecount(CycleCount) -> +log_cyclecount(CycleCount, Hash, Result) -> if CycleCount > 8 -> - leveled_log:log("CDB15", [CycleCount]); + leveled_log:log("CDB15", [CycleCount, Hash, Result]); true -> ok end. @@ -1282,7 +1282,7 @@ to_list(HashTree, Index, {LastHash, LastPos}, Acc) -> to_slotmap(HashTree, Index) -> HPList = to_list(HashTree, Index), - IndexLength = length(HPList), + IndexLength = length(HPList) * 2, ConvertObjFun = fun({Hash, Position}) -> HashLE = endian_flip(Hash), @@ -1466,6 +1466,23 @@ build_hashtree_bunchedatstart_binary_test() -> ?assertMatch(ExpSize, byte_size(Bin)), ?assertMatch(ExpBin, Bin). + +build_hashtree_test() -> + SlotMap = [{3, <<2424914688:32, 100:32>>}, + {3, <<2424917760:32, 200:32>>}, + {7, <<2424915712:32, 300:32>>}, + {9, <<2424903936:32, 400:32>>}, + {9, <<2424907008:32, 500:32>>}, + {10, <<2424913408:32, 600:32>>}], + BinList = build_hashtree_binary(SlotMap, 12), + ExpOut = [<<0:64>>, <<0:64>>, <<0:64>>, <<2424914688:32, 100:32>>] ++ + [<<2424917760:32, 200:32>>, <<0:64>>, <<0:64>>] ++ + [<<2424915712:32, 300:32>>, <<0:64>>] ++ + [<<2424903936:32, 400:32>>, <<2424907008:32, 500:32>>] ++ + [<<2424913408:32, 600:32>>], + ?assertMatch(ExpOut, BinList). + + find_firstzero_test() -> Bin = [<<1:64/integer>>, <<0:64/integer>>, <<89:64/integer>>, <<89:64/integer>>, @@ -1478,6 +1495,32 @@ find_firstzero_test() -> ?assertMatch([<<71:64/integer>>, <<72:64/integer>>], RHS). +cyclecount_test() -> + io:format("~n~nStarting cycle count test~n"), + KVL1 = generate_sequentialkeys(5000, []), + KVL2 = lists:foldl(fun({K, V}, Acc) -> + H = hash(K), + I = hash_to_index(H), + case I of + 0 -> + [{K, V}|Acc]; + _ -> + Acc + end end, + [], + KVL1), + {ok, P1} = cdb_open_writer("../test/cycle_count.pnd", + #cdb_options{binary_mode=false}), + ok = cdb_mput(P1, KVL2), + {ok, F2} = cdb_complete(P1), + {ok, P2} = cdb_open_reader(F2, #cdb_options{binary_mode=false}), + lists:foreach(fun({K, V}) -> + ?assertMatch({K, V}, cdb_get(P2, K)) end, + KVL2), + ok = cdb_close(P2), + ok = file:delete("../test/cycle_count.cdb"). + + full_1_test() -> List1 = lists:sort([{"key1","value1"},{"key2","value2"}]), create("../test/simple.cdb", @@ -1823,6 +1866,7 @@ state_test() -> ?assertMatch({"Key1", "Value1"}, cdb_get(P1, "Key1")), ok = cdb_close(P1). + hashclash_test() -> {ok, P1} = cdb_open_writer("../test/hashclash_test.pnd", #cdb_options{binary_mode=false}), diff --git a/src/leveled_log.erl b/src/leveled_log.erl index a5d0559..931c975 100644 --- a/src/leveled_log.erl +++ b/src/leveled_log.erl @@ -263,7 +263,8 @@ {info, "Microsecond timings for hashtree build of " ++ "to_list ~w sort ~w build ~w"}}, {"CDB15", - {info, "Cycle count of ~w in hashtable search higher than expected~n"}} + {info, "Cycle count of ~w in hashtable search higher than expected" + ++ " in search for hash ~w with result ~w"}} ])).