From 972a0ee0b9459607524d5461e4679d4f118394a1 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Tue, 13 Dec 2016 02:15:13 +0000 Subject: [PATCH] Refactor hash table write Less looping and re-looping over list. Uses ordering to build more naturally. --- src/leveled_cdb.erl | 233 ++++++++++++++++++++++++++------------------ 1 file changed, 139 insertions(+), 94 deletions(-) diff --git a/src/leveled_cdb.erl b/src/leveled_cdb.erl index eda2692..2bf6acd 100644 --- a/src/leveled_cdb.erl +++ b/src/leveled_cdb.erl @@ -753,11 +753,7 @@ end. hashtable_calc(HashTree, StartPos) -> Seq = lists:seq(0, 255), SWC = os:timestamp(), - {IndexList, HashTreeBin} = write_hash_tables(Seq, - HashTree, - StartPos, - [], - <<>>), + {IndexList, HashTreeBin} = write_hash_tables(Seq, HashTree, StartPos), leveled_log:log_timer("CDB07", [], SWC), {IndexList, HashTreeBin}. @@ -805,8 +801,8 @@ find_lastkey(Handle, IndexCache) -> scan_index(Handle, IndexCache, {ScanFun, InitAcc}) -> lists:foldl(fun({_X, {Pos, Count}}, Acc) -> - ScanFun(Handle, Pos, Count, Acc) - end, + ScanFun(Handle, Pos, Count, Acc) + end, InitAcc, IndexCache). @@ -1165,68 +1161,11 @@ perform_write_hash_tables(Handle, HashTreeBin, StartPos) -> ok. -write_hash_tables([], _HashTree, _CurrPos, IndexList, HashTreeBin) -> - {IndexList, HashTreeBin}; -write_hash_tables([Index|Rest], HashTree, CurrPos, IndexList, HashTreeBin) -> - case is_empty(HashTree, Index) of - true -> - write_hash_tables(Rest, HashTree, CurrPos, IndexList, HashTreeBin); - false -> - BinList = to_binarylist(HashTree, Index), - % BinList = build_binaryhashlist(HashList, []), - IndexLength = length(BinList) * 2, - SlotList = lists:duplicate(IndexLength, <<0:32, 0:32>>), - - Fn = fun({Hash, Binary}, AccSlotList) -> - Slot1 = find_open_slot(AccSlotList, Hash), - {L1, [<<0:32, 0:32>>|L2]} = lists:split(Slot1, AccSlotList), - lists:append(L1, [Binary|L2]) - end, - - NewSlotList = lists:foldl(Fn, SlotList, BinList), - NewSlotBin = lists:foldl(fun(X, Acc) -> - <> end, - HashTreeBin, - NewSlotList), - write_hash_tables(Rest, - HashTree, - CurrPos + length(NewSlotList) * ?DWORD_SIZE, - [{Index, CurrPos, IndexLength}|IndexList], - NewSlotBin) - end. - - -%% Slot is zero based because it comes from a REM -find_open_slot(List, Hash) -> - Len = length(List), - Slot = hash_to_slot(Hash, Len), - Seq = lists:seq(1, Len), - {CL1, CL2} = lists:split(Slot, Seq), - {L1, L2} = lists:split(Slot, List), - find_open_slot1(lists:append(CL2, CL1), lists:append(L2, L1)). - -find_open_slot1([Slot|_RestOfSlots], [<<0:32,0:32>>|_RestOfEntries]) -> - Slot - 1; -find_open_slot1([_|RestOfSlots], [_|RestOfEntries]) -> - find_open_slot1(RestOfSlots, RestOfEntries). - - %% Write the top most 255 doubleword entries. First word is the %% file pointer to a hashtable and the second word is the number of entries %% in the hash table %% The List passed in should be made up of {Index, Position, Count} tuples -write_top_index_table(Handle, BasePos, List) -> - % fold function to find any missing index tuples, and add one a replacement - % in this case with a count of 0. Also orders the list by index - FnMakeIndex = fun(I) -> - case lists:keysearch(I, 1, List) of - {value, Tuple} -> - Tuple; - false -> - {I, BasePos, 0} - end - end, - % Fold function to write the index entries +write_top_index_table(Handle, BasePos, IndexList) -> FnWriteIndex = fun({_Index, Pos, Count}, {AccBin, CurrPos}) -> case Count == 0 of true -> @@ -1240,11 +1179,9 @@ write_top_index_table(Handle, BasePos, List) -> {<>, NextPos} end, - Seq = lists:seq(0, 255), - CompleteList = lists:keysort(1, lists:map(FnMakeIndex, Seq)), {IndexBin, _Pos} = lists:foldl(FnWriteIndex, {<<>>, BasePos}, - CompleteList), + IndexList), {ok, _} = file:position(Handle, 0), ok = file:write(Handle, IndexBin), ok = file:advise(Handle, 0, ?DWORD_SIZE * 256, will_need), @@ -1317,15 +1254,110 @@ is_empty(HashTree, Index) -> false end. -to_binarylist(HashTree, Index) -> +to_slotmap(HashTree, Index) -> + ObjList = ets:match_object(HashTree, {{Index, '_'}, '_'}), + IndexLength = length(ObjList) * 2, ConvertObjFun = fun({{_Idx, Hash}, Position}) -> HashLE = endian_flip(Hash), PosLE = endian_flip(Position), NewBin = <>, - {Hash, NewBin} + {hash_to_slot(Hash, IndexLength), NewBin} end, - lists:map(ConvertObjFun, ets:match_object(HashTree, {{Index, '_'}, '_'})). + lists:keysort(1, lists:map(ConvertObjFun, ObjList)). + + +build_hashtree_binary(SlotMap, IndexLength) -> + build_hashtree_binary(SlotMap, IndexLength, 0, <<>>). + +build_hashtree_binary([], IdxLen, _SlotPos, Bin) -> + case byte_size(Bin) div ?DWORD_SIZE of + IdxLen -> + Bin; + N when N < IdxLen -> + ZeroLen = (IdxLen - N) * 64, + <> + end; +build_hashtree_binary([{TopSlot, TopBin}|SlotMapTail], IdxLen, SlotPos, Bin) -> + case TopSlot of + SlotPos -> + UpdBin = <>, + build_hashtree_binary(SlotMapTail, + IdxLen, + SlotPos + 1, + UpdBin); + N when N > SlotPos -> + Delta = N - SlotPos, + DeltaLen = Delta * 64, + UpdBin = <>, + build_hashtree_binary(SlotMapTail, + IdxLen, + SlotPos + Delta + 1, + UpdBin); + N when N < SlotPos, SlotPos < IdxLen -> + UpdBin = <>, + build_hashtree_binary(SlotMapTail, + IdxLen, + SlotPos + 1, + UpdBin); + N when N < SlotPos, SlotPos >= IdxLen -> + % Need to wrap round and put in the first empty slot from the + % beginning + Pos = find_firstzero(Bin, 0) * 64, + UpdBin = + case Pos of + 0 -> + <<0:64, Tail/binary>> = Bin, + <>; + _P -> + <> = Bin, + <> + end, + build_hashtree_binary(SlotMapTail, + IdxLen, + SlotPos + 1, + UpdBin) + end. + + +find_firstzero(<>, Pos) -> + case N of + 0 -> + Pos; + _ -> + find_firstzero(TailBin, Pos + 1) + end. + + +write_hash_tables(Indexes, HashTree, CurrPos) -> + write_hash_tables(Indexes, HashTree, CurrPos, CurrPos, [], <<>>). + +write_hash_tables([], _HashTree, _CurrPos, _BasePos, IndexList, HashTreeBin) -> + IL = lists:reverse(IndexList), + {IL, HashTreeBin}; +write_hash_tables([Index|Rest], HashTree, CurrPos, BasePos, + IndexList, HashTreeBin) -> + case is_empty(HashTree, Index) of + true -> + write_hash_tables(Rest, + HashTree, + CurrPos, + BasePos, + [{Index, BasePos, 0}|IndexList], + HashTreeBin); + false -> + SlotMap = to_slotmap(HashTree, Index), + IndexLength = length(SlotMap) * 2, + NewSlotBin = build_hashtree_binary(SlotMap, IndexLength), + write_hash_tables(Rest, + HashTree, + CurrPos + IndexLength * ?DWORD_SIZE, + BasePos, + [{Index, CurrPos, IndexLength}|IndexList], + <>) + end. + + %%%%%%%%%%%%%%%% % T E S T @@ -1374,33 +1406,46 @@ dump(FileName) -> to_dict(FileName) -> KeyValueList = dump(FileName), dict:from_list(KeyValueList). - -find_open_slot_1_test() -> - List = [<<1:32,1:32>>,<<0:32,0:32>>,<<1:32,1:32>>,<<1:32,1:32>>], - Slot = find_open_slot(List,0), - ?assertMatch(Slot,1). +build_hashtree_bunchedatend_binary_test() -> + SlotMap = [{1, <<10:32, 0:32>>}, + {4, <<11:32, 100:32>>}, + {8, <<12:32, 200:32>>}, + {8, <<13:32, 300:32>>}, + {14, <<14:32, 400:32>>}, + {14, <<15:32, 500:32>>}, + {15, <<16:32, 600:32>>}, + {15, <<17:32, 700:32>>}], + Bin = build_hashtree_binary(SlotMap, 16), + ExpBinP1 = <<16:32, 600:32, 10:32, 0:32, 17:32, 700:32, 0:64>>, + ExpBinP2 = <<11:32, 100:32, 0:192, 12:32, 200:32, 13:32, 300:32, 0:256>>, + ExpBinP3 = <<14:32, 400:32, 15:32, 500:32>>, + ExpBin = <>, + ?assertMatch(ExpBin, Bin). -find_open_slot_2_test() -> - List = [<<0:32,0:32>>,<<0:32,0:32>>,<<1:32,1:32>>,<<1:32,1:32>>], - Slot = find_open_slot(List,0), - ?assertMatch(Slot,0). +build_hashtree_bunchedatstart_binary_test() -> + SlotMap = [{1, <<10:32, 0:32>>}, + {2, <<11:32, 100:32>>}, + {3, <<12:32, 200:32>>}, + {4, <<13:32, 300:32>>}, + {5, <<14:32, 400:32>>}, + {6, <<15:32, 500:32>>}, + {7, <<16:32, 600:32>>}, + {8, <<17:32, 700:32>>}], + Bin = build_hashtree_binary(SlotMap, 16), + ExpBinP1 = <<0:64, 10:32, 0:32, 11:32, 100:32, 12:32, 200:32>>, + ExpBinP2 = <<13:32, 300:32, 14:32, 400:32, 15:32, 500:32, 16:32, 600:32>>, + ExpBinP3 = <<17:32, 700:32, 0:448>>, + ExpBin = <>, + ExpSize = byte_size(ExpBin), + ?assertMatch(ExpSize, byte_size(Bin)), + ?assertMatch(ExpBin, Bin). -find_open_slot_3_test() -> - List = [<<1:32,1:32>>,<<1:32,1:32>>,<<1:32,1:32>>,<<0:32,0:32>>], - Slot = find_open_slot(List,2), - ?assertMatch(Slot,3). +find_firstzero_test() -> + Bin = <<1:64/integer, 0:64/integer, 89:64/integer, 72:64/integer>>, + ?assertMatch(1, find_firstzero(Bin, 0)). -find_open_slot_4_test() -> - List = [<<0:32,0:32>>,<<1:32,1:32>>,<<1:32,1:32>>,<<1:32,1:32>>], - Slot = find_open_slot(List,1), - ?assertMatch(Slot,0). - -find_open_slot_5_test() -> - List = [<<1:32,1:32>>,<<1:32,1:32>>,<<0:32,0:32>>,<<1:32,1:32>>], - Slot = find_open_slot(List,3), - ?assertMatch(Slot,2). full_1_test() -> List1 = lists:sort([{"key1","value1"},{"key2","value2"}]), @@ -1683,7 +1728,7 @@ get_keys_byposition_manykeys_test() -> {ok, P2} = cdb_open_reader(F2, #cdb_options{binary_mode=false}), PositionList = cdb_getpositions(P2, all), L1 = length(PositionList), - ?assertMatch(L1, KeyCount), + ?assertMatch(KeyCount, L1), SampleList1 = cdb_getpositions(P2, 10), ?assertMatch(10, length(SampleList1)),