From 7dc4913d5ab2a0962a30c28cd87f30e878e0ea3f Mon Sep 17 00:00:00 2001 From: martinsumner Date: Mon, 20 Mar 2017 22:43:22 +0000 Subject: [PATCH 01/15] ETS - delete table not objects Try and delete the table not just the objects in the table - will this improve memory leak? --- src/leveled_bookie.erl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl index b450bcc..055cfa9 100644 --- a/src/leveled_bookie.erl +++ b/src/leveled_bookie.erl @@ -1188,8 +1188,9 @@ maybepush_ledgercache(MaxCacheSize, Cache, Penciller) -> case leveled_penciller:pcl_pushmem(Penciller, CacheToLoad) of ok -> Cache0 = #ledger_cache{}, - true = ets:delete_all_objects(Tab), - {ok, Cache0#ledger_cache{mem=Tab}}; + true = ets:delete(Tab), + NewTab = ets:new(mem, [ordered_set]), + {ok, Cache0#ledger_cache{mem=NewTab}}; returned -> {returned, Cache} end; From 415ac6017b696ee3116293a3446d0b65fc55a9e9 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Mon, 20 Mar 2017 23:22:46 +0000 Subject: [PATCH 02/15] Move sst get_kv range back inside process Moved outside to stop blocking, but also avoids copy. Move back out to see if it may be related to the binary memory leak --- src/leveled_sst.erl | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index cb5d71c..2bbfc16 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -196,15 +196,9 @@ sst_get(Pid, LedgerKey, Hash) -> gen_fsm:sync_send_event(Pid, {get_kv, LedgerKey, Hash}, infinity). sst_getkvrange(Pid, StartKey, EndKey, ScanWidth) -> - Reply = gen_fsm:sync_send_event(Pid, - {get_kvrange, StartKey, EndKey, ScanWidth}, - infinity), - FetchFun = - fun({SlotBin, SK, EK}, Acc) -> - Acc ++ binaryslot_trimmedlist(SlotBin, SK, EK) - end, - {SlotsToFetchBinList, SlotsToPoint} = Reply, - lists:foldl(FetchFun, [], SlotsToFetchBinList) ++ SlotsToPoint. 
+ gen_fsm:sync_send_event(Pid, + {get_kvrange, StartKey, EndKey, ScanWidth}, + infinity). sst_getslots(Pid, SlotList) -> SlotBins = gen_fsm:sync_send_event(Pid, {get_slots, SlotList}, infinity), @@ -315,8 +309,16 @@ reader({get_kv, LedgerKey, Hash}, _From, State) -> UpdTimings = leveled_log:sst_timing(State#state.sst_timings, SW, Stage), {reply, Result, reader, UpdState#state{sst_timings = UpdTimings}}; reader({get_kvrange, StartKey, EndKey, ScanWidth}, _From, State) -> + FetchFun = + fun({SlotBin, SK, EK}, Acc) -> + Acc ++ binaryslot_trimmedlist(SlotBin, SK, EK) + end, + {SlotsToFetchBinList, SlotsToPoint} = fetch_range(StartKey, + EndKey, + ScanWidth, + State), {reply, - fetch_range(StartKey, EndKey, ScanWidth, State), + lists:foldl(FetchFun, [], SlotsToFetchBinList) ++ SlotsToPoint, reader, State}; reader({get_slots, SlotList}, _From, State) -> From 419541f5dd1ff61184b1afafab2bd661c7db9a57 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Mon, 20 Mar 2017 23:43:31 +0000 Subject: [PATCH 03/15] Fix to delete_pending state --- src/leveled_sst.erl | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 2bbfc16..32e6ef5 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -355,8 +355,16 @@ delete_pending({get_kv, LedgerKey, Hash}, _From, State) -> {Result, _Stage, _SlotID, UpdState} = fetch(LedgerKey, Hash, State), {reply, Result, delete_pending, UpdState, ?DELETE_TIMEOUT}; delete_pending({get_kvrange, StartKey, EndKey, ScanWidth}, _From, State) -> + FetchFun = + fun({SlotBin, SK, EK}, Acc) -> + Acc ++ binaryslot_trimmedlist(SlotBin, SK, EK) + end, + {SlotsToFetchBinList, SlotsToPoint} = fetch_range(StartKey, + EndKey, + ScanWidth, + State), {reply, - fetch_range(StartKey, EndKey, ScanWidth, State), + lists:foldl(FetchFun, [], SlotsToFetchBinList) ++ SlotsToPoint, delete_pending, State, ?DELETE_TIMEOUT}; From e18d2f2f00c888315218f11bf9aa41634af0a454 Mon Sep 17 00:00:00 2001 From: 
martinsumner Date: Tue, 21 Mar 2017 01:31:42 +0000 Subject: [PATCH 04/15] Delete the ETS table from CDB files Rather than simply dereference it - delete it --- src/leveled_cdb.erl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/leveled_cdb.erl b/src/leveled_cdb.erl index 2485395..19d660e 100644 --- a/src/leveled_cdb.erl +++ b/src/leveled_cdb.erl @@ -364,6 +364,7 @@ rolling({return_hashtable, IndexList, HashTreeBin}, _From, State) -> file:close(Handle), ok = rename_for_read(State#state.filename, NewName), leveled_log:log("CDB03", [NewName]), + ets:delete(State#state.hashtree), {NewHandle, Index, LastKey} = open_for_readonly(NewName, State#state.last_key), case State#state.deferred_delete of From c46377584f5d6c2aa1f43162d7939844fc27f2f5 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Tue, 21 Mar 2017 01:32:41 +0000 Subject: [PATCH 05/15] Revert "ETS - delete table not objects" This reverts commit 7dc4913d5ab2a0962a30c28cd87f30e878e0ea3f. --- src/leveled_bookie.erl | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl index 055cfa9..b450bcc 100644 --- a/src/leveled_bookie.erl +++ b/src/leveled_bookie.erl @@ -1188,9 +1188,8 @@ maybepush_ledgercache(MaxCacheSize, Cache, Penciller) -> case leveled_penciller:pcl_pushmem(Penciller, CacheToLoad) of ok -> Cache0 = #ledger_cache{}, - true = ets:delete(Tab), - NewTab = ets:new(mem, [ordered_set]), - {ok, Cache0#ledger_cache{mem=NewTab}}; + true = ets:delete_all_objects(Tab), + {ok, Cache0#ledger_cache{mem=Tab}}; returned -> {returned, Cache} end; From dd0316eedf95cec831e459b2d97ab4ed5f3f0323 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Tue, 21 Mar 2017 11:03:29 +0000 Subject: [PATCH 06/15] Yield on query selectively Still not clear if yielding is the cause of memory problems, but taking it away universally has impacted throughput. 
At the very least we should continue to yield on high-contention files (those at higher levels), where the processes are more likely to be quickly terminated anyway allowing GC to be invoked. --- src/leveled_sst.erl | 63 ++++++++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 32e6ef5..287ff3c 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -120,12 +120,20 @@ size :: integer(), max_sqn :: integer()}). +%% yield_blockquery is used to determine if the work necessary to process a +%% range query beyond fetching the slot should be managed from within +%% this process, or should be handled by the calling process. +%% Handling within the calling process may lead to extra binary heap garbage +%% see Issue 52. Handling within the SST process may lead to contention and +%% extra copying. Files at the top of the tree yield, those lower down don't. + -record(state, {summary, handle :: file:fd(), sst_timings :: tuple(), penciller :: pid(), root_path, filename, + yield_blockquery = false :: boolean(), blockindex_cache}). @@ -196,9 +204,18 @@ sst_get(Pid, LedgerKey, Hash) -> gen_fsm:sync_send_event(Pid, {get_kv, LedgerKey, Hash}, infinity). sst_getkvrange(Pid, StartKey, EndKey, ScanWidth) -> - gen_fsm:sync_send_event(Pid, - {get_kvrange, StartKey, EndKey, ScanWidth}, - infinity). + case gen_fsm:sync_send_event(Pid, + {get_kvrange, StartKey, EndKey, ScanWidth}, + infinity) of + {yield, SlotsToFetchBinList, SlotsToPoint} -> + FetchFun = + fun({SlotBin, SK, EK}, Acc) -> + Acc ++ binaryslot_trimmedlist(SlotBin, SK, EK) + end, + lists:foldl(FetchFun, [], SlotsToFetchBinList) ++ SlotsToPoint; + Reply -> + Reply + end. 
sst_getslots(Pid, SlotList) -> SlotBins = gen_fsm:sync_send_event(Pid, {get_slots, SlotList}, infinity), @@ -261,7 +278,10 @@ starting({sst_new, RootPath, Filename, Level, {SlotList, FirstKey}, MaxSQN}, Length, MaxSQN), ActualFilename = write_file(RootPath, Filename, SummaryBin, SlotsBin), - UpdState = read_file(ActualFilename, State#state{root_path=RootPath}), + YBQ = Level =< 1, + UpdState = read_file(ActualFilename, + State#state{root_path=RootPath, + yield_blockquery=YBQ}), Summary = UpdState#state.summary, leveled_log:log_timer("SST08", [ActualFilename, Level, Summary#summary.max_sqn], @@ -286,7 +306,9 @@ starting({sst_newlevelzero, RootPath, Filename, SlotCount, MaxSQN), ActualFilename = write_file(RootPath, Filename, SummaryBin, SlotsBin), - UpdState = read_file(ActualFilename, State#state{root_path=RootPath}), + UpdState = read_file(ActualFilename, + State#state{root_path = RootPath, + yield_blockquery = true}), Summary = UpdState#state.summary, leveled_log:log_timer("SST08", [ActualFilename, 0, Summary#summary.max_sqn], @@ -309,18 +331,26 @@ reader({get_kv, LedgerKey, Hash}, _From, State) -> UpdTimings = leveled_log:sst_timing(State#state.sst_timings, SW, Stage), {reply, Result, reader, UpdState#state{sst_timings = UpdTimings}}; reader({get_kvrange, StartKey, EndKey, ScanWidth}, _From, State) -> - FetchFun = - fun({SlotBin, SK, EK}, Acc) -> - Acc ++ binaryslot_trimmedlist(SlotBin, SK, EK) - end, {SlotsToFetchBinList, SlotsToPoint} = fetch_range(StartKey, EndKey, ScanWidth, State), - {reply, - lists:foldl(FetchFun, [], SlotsToFetchBinList) ++ SlotsToPoint, - reader, - State}; + case State#state.yield_blockquery of + true -> + {reply, + {yield, SlotsToFetchBinList, SlotsToPoint}, + reader, + State}; + false -> + FetchFun = + fun({SlotBin, SK, EK}, Acc) -> + Acc ++ binaryslot_trimmedlist(SlotBin, SK, EK) + end, + {reply, + lists:foldl(FetchFun, [], SlotsToFetchBinList) ++ SlotsToPoint, + reader, + State} + end; reader({get_slots, SlotList}, _From, State) -> 
SlotBins = read_slots(State#state.handle, SlotList), {reply, SlotBins, reader, State}; @@ -355,16 +385,13 @@ delete_pending({get_kv, LedgerKey, Hash}, _From, State) -> {Result, _Stage, _SlotID, UpdState} = fetch(LedgerKey, Hash, State), {reply, Result, delete_pending, UpdState, ?DELETE_TIMEOUT}; delete_pending({get_kvrange, StartKey, EndKey, ScanWidth}, _From, State) -> - FetchFun = - fun({SlotBin, SK, EK}, Acc) -> - Acc ++ binaryslot_trimmedlist(SlotBin, SK, EK) - end, {SlotsToFetchBinList, SlotsToPoint} = fetch_range(StartKey, EndKey, ScanWidth, State), + % Always yield as about to clear and de-reference {reply, - lists:foldl(FetchFun, [], SlotsToFetchBinList) ++ SlotsToPoint, + {yield, SlotsToFetchBinList, SlotsToPoint}, delete_pending, State, ?DELETE_TIMEOUT}; From 682dfc4d5983363e5b0a34b8d840b7d820adf44b Mon Sep 17 00:00:00 2001 From: martinsumner Date: Tue, 21 Mar 2017 12:02:22 +0000 Subject: [PATCH 07/15] Revert "Revert "ETS - delete table not objects"" This reverts commit c46377584f5d6c2aa1f43162d7939844fc27f2f5. 
--- src/leveled_bookie.erl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl index b450bcc..055cfa9 100644 --- a/src/leveled_bookie.erl +++ b/src/leveled_bookie.erl @@ -1188,8 +1188,9 @@ maybepush_ledgercache(MaxCacheSize, Cache, Penciller) -> case leveled_penciller:pcl_pushmem(Penciller, CacheToLoad) of ok -> Cache0 = #ledger_cache{}, - true = ets:delete_all_objects(Tab), - {ok, Cache0#ledger_cache{mem=Tab}}; + true = ets:delete(Tab), + NewTab = ets:new(mem, [ordered_set]), + {ok, Cache0#ledger_cache{mem=NewTab}}; returned -> {returned, Cache} end; From 64e944d9baa8549e1bac182e7541cdcab156e6a4 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Tue, 21 Mar 2017 16:54:23 +0000 Subject: [PATCH 08/15] Change to 5 blocks in SST Slot Change to 5 blocks is intended to make the blocks in lookup slots fractionally smaller, but more importantly to introduce a middle block that can be opened in a binary-split style fashion to reduce the number of blocks that need to be opened for range queries. Worst case for full slots is 3 blocks now not 4. --- src/leveled_sst.erl | 254 +++++++++++++++++++++++++++++--------------- 1 file changed, 170 insertions(+), 84 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index cb5d71c..dc30ebe 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -65,9 +65,10 @@ -include("include/leveled.hrl"). -define(MAX_SLOTS, 256). --define(SLOT_SIZE, 128). % This is not configurable --define(NOLOOK_MULT, 2). % How much bigger is a slot/block with no lookups --define(NOLOOK_SLOTSIZE, ?SLOT_SIZE * ?NOLOOK_MULT). +-define(LOOK_SLOTSIZE, 128). +-define(LOOK_BLOCKSIZE, {28, 16}). % This is not configurable +-define(NOLOOK_SLOTSIZE, 256). +-define(NOLOOK_BLOCKSIZE, {56, 32}). % This is not configurable -define(COMPRESSION_LEVEL, 1). -define(BINARY_SETTINGS, [{compressed, ?COMPRESSION_LEVEL}]). % -define(LEVEL_BLOOM_BITS, [{0, 8}, {1, 10}, {2, 8}, {default, 6}]). 
@@ -427,7 +428,7 @@ fetch(LedgerKey, Hash, State) -> slot_fetch, Slot#slot_index_value.slot_id, State#state{blockindex_cache = BlockIndexCache}}; - <> -> + <> -> PosList = find_pos(BlockIdx, double_hash(Hash, LedgerKey), [], @@ -730,43 +731,56 @@ generate_binary_slot(Lookup, KVL) -> {[], <<0:1/integer, 127:7/integer>>} end, - BlockSize = + {SideBlockSize, MidBlockSize} = case Lookup of lookup -> - ?SLOT_SIZE div 4; + ?LOOK_BLOCKSIZE; no_lookup -> - ?NOLOOK_SLOTSIZE div 4 + ?NOLOOK_BLOCKSIZE end, - - {B1, B2, B3, B4} = + {B1, B2, B3, B4, B5} = case length(KVL) of - L when L =< BlockSize -> + L when L =< SideBlockSize -> {term_to_binary(KVL, ?BINARY_SETTINGS), <<0:0>>, <<0:0>>, + <<0:0>>, <<0:0>>}; - L when L =< 2 * BlockSize -> - {KVLA, KVLB} = lists:split(BlockSize, KVL), + L when L =< 2 * SideBlockSize -> + {KVLA, KVLB} = lists:split(SideBlockSize, KVL), {term_to_binary(KVLA, ?BINARY_SETTINGS), term_to_binary(KVLB, ?BINARY_SETTINGS), <<0:0>>, + <<0:0>>, <<0:0>>}; - L when L =< 3 * BlockSize -> - {KVLA, KVLB_Rest} = lists:split(BlockSize, KVL), - {KVLB, KVLC} = lists:split(BlockSize, KVLB_Rest), + L when L =< (2 * SideBlockSize + MidBlockSize) -> + {KVLA, KVLB_Rest} = lists:split(SideBlockSize, KVL), + {KVLB, KVLC} = lists:split(SideBlockSize, KVLB_Rest), {term_to_binary(KVLA, ?BINARY_SETTINGS), term_to_binary(KVLB, ?BINARY_SETTINGS), term_to_binary(KVLC, ?BINARY_SETTINGS), + <<0:0>>, <<0:0>>}; - L when L =< 4 * BlockSize -> - {KVLA, KVLB_Rest} = lists:split(BlockSize, KVL), - {KVLB, KVLC_Rest} = lists:split(BlockSize, KVLB_Rest), - {KVLC, KVLD} = lists:split(BlockSize, KVLC_Rest), + L when L =< (3 * SideBlockSize + MidBlockSize) -> + {KVLA, KVLB_Rest} = lists:split(SideBlockSize, KVL), + {KVLB, KVLC_Rest} = lists:split(SideBlockSize, KVLB_Rest), + {KVLC, KVLD} = lists:split(MidBlockSize, KVLC_Rest), {term_to_binary(KVLA, ?BINARY_SETTINGS), term_to_binary(KVLB, ?BINARY_SETTINGS), term_to_binary(KVLC, ?BINARY_SETTINGS), - term_to_binary(KVLD, ?BINARY_SETTINGS)} + 
term_to_binary(KVLD, ?BINARY_SETTINGS), + <<0:0>>}; + L when L =< (4 * SideBlockSize + MidBlockSize) -> + {KVLA, KVLB_Rest} = lists:split(SideBlockSize, KVL), + {KVLB, KVLC_Rest} = lists:split(SideBlockSize, KVLB_Rest), + {KVLC, KVLD_Rest} = lists:split(MidBlockSize, KVLC_Rest), + {KVLD, KVLE} = lists:split(SideBlockSize, KVLD_Rest), + {term_to_binary(KVLA, ?BINARY_SETTINGS), + term_to_binary(KVLB, ?BINARY_SETTINGS), + term_to_binary(KVLC, ?BINARY_SETTINGS), + term_to_binary(KVLD, ?BINARY_SETTINGS), + term_to_binary(KVLE, ?BINARY_SETTINGS)} end, B1P = byte_size(PosBinIndex), @@ -774,14 +788,16 @@ generate_binary_slot(Lookup, KVL) -> B2L = byte_size(B2), B3L = byte_size(B3), B4L = byte_size(B4), + B5L = byte_size(B5), Lengths = <>, + B4L:32/integer, + B5L:32/integer>>, SlotBin = <>, + B1/binary, B2/binary, B3/binary, B4/binary, B5/binary>>, CRC32 = erlang:crc32(SlotBin), FullBin = <>, @@ -811,8 +827,8 @@ read_block(Handle, Slot, BlockLengths, BlockID) -> Slot#slot_index_value.start_position + BlockPos + Offset - + 24, - % 4-byte CRC, 4 byte pos, 4x4 byte lengths + + 28, + % 4-byte CRC, 4 byte pos, 5x4 byte lengths Length), BlockBin. 
@@ -894,9 +910,12 @@ binaryslot_tolist(FullBin) -> B1L:32/integer, B2L:32/integer, B3L:32/integer, - B4L:32/integer>> = BlockLengths, + B4L:32/integer, + B5L:32/integer>> = BlockLengths, <<_PosBinIndex:B1P/binary, Blocks/binary>> = RestBin, - lists:foldl(BlockFetchFun, {[], Blocks}, [B1L, B2L, B3L, B4L]); + lists:foldl(BlockFetchFun, + {[], Blocks}, + [B1L, B2L, B3L, B4L, B5L]); crc_wonky -> {[], <<>>} end, @@ -908,56 +927,103 @@ binaryslot_trimmedlist(FullBin, all, all) -> binaryslot_trimmedlist(FullBin, StartKey, EndKey) -> LTrimFun = fun({K, _V}) -> K < StartKey end, RTrimFun = fun({K, _V}) -> not leveled_codec:endkey_passed(EndKey, K) end, - BlockFetchFun = - fun(Length, {Acc, Bin, Continue}) -> - case {Length, Continue} of - {0, _} -> - {Acc, Bin, false}; - {_, true} -> - <> = Bin, - BlockList = binary_to_term(Block), - {LastKey, _LV} = lists:last(BlockList), - case StartKey > LastKey of - true -> - {Acc, Rest, true}; - false -> - {_LDrop, RKeep} = lists:splitwith(LTrimFun, - BlockList), - case leveled_codec:endkey_passed(EndKey, LastKey) of - true -> - {LKeep, _RDrop} = lists:splitwith(RTrimFun, RKeep), - {Acc ++ LKeep, Rest, false}; - false -> - {Acc ++ RKeep, Rest, true} - end - end; - {_ , false} -> - {Acc, Bin, false} - end - end, - - {Out, _Rem, _Continue} = + + % It will be more efficient to check a subset of blocks. 
To work out + % the best subset we always look in the middle block of 5, and based on + % the first and last keys of that middle block when compared to the Start + % and EndKey of the query determines a subset of blocks + % + % This isn't perfectly efficient, especially if the query overlaps Block2 + % and Block3 (as Block 1 will also be checked), but finessing this last + % scenario is hard to do in concise code + BlocksToCheck = case crc_check_slot(FullBin) of {BlockLengths, RestBin} -> <> = BlockLengths, - <<_PosBinIndex:B1P/binary, Blocks/binary>> = RestBin, - lists:foldl(BlockFetchFun, {[], Blocks, true}, [B1L, B2L, B3L, B4L]); + B4L:32/integer, + B5L:32/integer>> = BlockLengths, + <<_PosBinIndex:B1P/binary, + Block1:B1L/binary, Block2:B2L/binary, + MidBlock:B3L/binary, + Block4:B4L/binary, Block5:B5L/binary>> = RestBin, + case B3L of + 0 -> + [Block1, Block2]; + _ -> + MidBlockList = binary_to_term(MidBlock), + {MidFirst, _} = lists:nth(1, MidBlockList), + {MidLast, _} = lists:last(MidBlockList), + Split = {StartKey > MidLast, + StartKey >= MidFirst, + leveled_codec:endkey_passed(EndKey, + MidFirst), + leveled_codec:endkey_passed(EndKey, + MidLast)}, + case Split of + {true, _, _, _} -> + [Block4, Block5]; + {false, true, false, true} -> + [MidBlockList]; + {false, true, false, false} -> + [MidBlockList, Block4, Block5]; + {false, false, true, true} -> + [Block1, Block2]; + {false, false, false, true} -> + [Block1, Block2, MidBlockList]; + _ -> + [Block1, Block2, MidBlockList, Block4, Block5] + end + end; crc_wonky -> - {[], <<>>, true} + [] end, - Out. 
+ + BlockCheckFun = + fun(Block, {Acc, Continue}) -> + case {Block, Continue} of + {<<>>, _} -> + {Acc, false}; + {_, true} -> + BlockList = + case is_binary(Block) of + true -> + binary_to_term(Block); + false -> + Block + end, + {LastKey, _LV} = lists:last(BlockList), + case StartKey > LastKey of + true -> + {Acc, true}; + false -> + {_LDrop, RKeep} = lists:splitwith(LTrimFun, + BlockList), + case leveled_codec:endkey_passed(EndKey, LastKey) of + true -> + {LKeep, _RDrop} = lists:splitwith(RTrimFun, RKeep), + {Acc ++ LKeep, false}; + false -> + {Acc ++ RKeep, true} + end + end; + {_ , false} -> + {Acc, false} + end + end, + + {Acc, _Continue} = lists:foldl(BlockCheckFun, {[], true}, BlocksToCheck), + Acc. crc_check_slot(FullBin) -> <> = FullBin, case erlang:crc32(SlotBin) of CRC32 -> - <> = SlotBin, + <> = SlotBin, {BlockLengths, Rest}; _ -> leveled_log:log("SST09", []), @@ -965,7 +1031,7 @@ crc_check_slot(FullBin) -> end. block_offsetandlength(BlockLengths, BlockID) -> - <> = BlockLengths, + <> = BlockLengths, case BlockID of 1 -> <> = BlockLengths0, @@ -983,8 +1049,17 @@ block_offsetandlength(BlockLengths, BlockID) -> <> = BlockLengths0, - {BlocksPos, B1L + B2L + B3L, B4L} + B4L:32/integer, + _BR/binary>> = BlockLengths0, + {BlocksPos, B1L + B2L + B3L, B4L}; + 5 -> + <> = BlockLengths0, + {BlocksPos, B1L + B2L + B3L + B4L, B5L} end. double_hash(Hash, Key) -> @@ -1004,15 +1079,26 @@ fetch_value([Pos|Rest], BlockLengths, Blocks, Key) -> case K of Key -> {K, V}; - _ -> + _ -> fetch_value(Rest, BlockLengths, Blocks, Key) end. revert_position(Pos) -> - BlockNumber = (Pos div 32) + 1, - BlockPos = (Pos rem 32) + 1, - {BlockNumber, BlockPos}. 
+ {SideBlockSize, MidBlockSize} = ?LOOK_BLOCKSIZE, + case Pos < 2 * SideBlockSize of + true -> + {(Pos div SideBlockSize) + 1, (Pos rem SideBlockSize) + 1}; + false -> + case Pos < (2 * SideBlockSize + MidBlockSize) of + true -> + {3, ((Pos - 2 * SideBlockSize) rem MidBlockSize) + 1}; + false -> + TailPos = Pos - 2 * SideBlockSize - MidBlockSize, + {(TailPos div SideBlockSize) + 4, + (TailPos rem SideBlockSize) + 1} + end + end. find_pos(<<>>, _Hash, PosList, _Count) -> PosList; @@ -1059,7 +1145,7 @@ find_pos(<<0:1/integer, NHC:7/integer, T/binary>>, Hash, PosList, Count) -> %% any lower sequence numbers should be compacted out of existence merge_lists(KVList1) -> - SlotCount = length(KVList1) div ?SLOT_SIZE, + SlotCount = length(KVList1) div ?LOOK_SLOTSIZE, {[], [], split_lists(KVList1, [], SlotCount), @@ -1071,7 +1157,7 @@ split_lists(LastPuff, SlotLists, 0) -> SlotD = generate_binary_slot(lookup, LastPuff), lists:reverse([SlotD|SlotLists]); split_lists(KVList1, SlotLists, N) -> - {Slot, KVListRem} = lists:split(?SLOT_SIZE, KVList1), + {Slot, KVListRem} = lists:split(?LOOK_SLOTSIZE, KVList1), SlotD = generate_binary_slot(lookup, Slot), split_lists(KVListRem, [SlotD|SlotLists], N - 1). 
@@ -1105,7 +1191,8 @@ merge_lists(KVList1, KVList2, LI, SlotList, FirstKey, SlotCount) -> form_slot([], [], _LI, Type, _Size, Slot, FK) -> {[], [], {Type, lists:reverse(Slot)}, FK}; -form_slot(KVList1, KVList2, _LI, lookup, ?SLOT_SIZE, Slot, FK) -> + +form_slot(KVList1, KVList2, _LI, lookup, ?LOOK_SLOTSIZE, Slot, FK) -> {KVList1, KVList2, {lookup, lists:reverse(Slot)}, FK}; form_slot(KVList1, KVList2, _LI, no_lookup, ?NOLOOK_SLOTSIZE, Slot, FK) -> {KVList1, KVList2, {no_lookup, lists:reverse(Slot)}, FK}; @@ -1142,7 +1229,7 @@ form_slot(KVList1, KVList2, {IsBasement, TS}, no_lookup, Size, Slot, FK) -> [{TopK, TopV}|Slot], FK0); lookup -> - case Size >= ?SLOT_SIZE of + case Size >= ?LOOK_SLOTSIZE of true -> {KVList1, KVList2, @@ -1310,7 +1397,7 @@ form_slot_test() -> R1 = form_slot([SkippingKV], [], {true, 99999999}, no_lookup, - ?SLOT_SIZE + 1, + ?LOOK_SLOTSIZE + 1, Slot, {o, "B1", "K5", null}), ?assertMatch({[], [], {no_lookup, Slot}, {o, "B1", "K5", null}}, R1). @@ -1399,7 +1486,7 @@ indexed_list_mixedkeys2_test() -> indexed_list_allindexkeys_test() -> Keys = lists:sublist(lists:ukeysort(1, generate_indexkeys(150)), 128), {PosBinIndex1, FullBin, _HL, _LK} = generate_binary_slot(lookup, Keys), - ?assertMatch(<<_BL:20/binary, 127:8/integer>>, PosBinIndex1), + ?assertMatch(<<_BL:24/binary, 127:8/integer>>, PosBinIndex1), % SW = os:timestamp(), BinToList = binaryslot_tolist(FullBin), % io:format(user, @@ -1410,9 +1497,9 @@ indexed_list_allindexkeys_test() -> indexed_list_allindexkeys_nolookup_test() -> Keys = lists:sublist(lists:ukeysort(1, generate_indexkeys(1000)), - 128 * ?NOLOOK_MULT), + ?NOLOOK_SLOTSIZE), {PosBinIndex1, FullBin, _HL, _LK} = generate_binary_slot(no_lookup, Keys), - ?assertMatch(<<_BL:20/binary, 127:8/integer>>, PosBinIndex1), + ?assertMatch(<<_BL:24/binary, 127:8/integer>>, PosBinIndex1), % SW = os:timestamp(), BinToList = binaryslot_tolist(FullBin), % io:format(user, @@ -1424,7 +1511,7 @@ indexed_list_allindexkeys_nolookup_test() -> 
indexed_list_allindexkeys_trimmed_test() -> Keys = lists:sublist(lists:ukeysort(1, generate_indexkeys(150)), 128), {PosBinIndex1, FullBin, _HL, _LK} = generate_binary_slot(lookup, Keys), - ?assertMatch(<<_BL:20/binary, 127:8/integer>>, PosBinIndex1), + ?assertMatch(<<_BL:24/binary, 127:8/integer>>, PosBinIndex1), ?assertMatch(Keys, binaryslot_trimmedlist(FullBin, {i, "Bucket", @@ -1546,7 +1633,7 @@ merge_test() -> simple_persisted_range_test() -> {RP, Filename} = {"../test/", "simple_test"}, - KVList0 = generate_randomkeys(1, ?SLOT_SIZE * 16, 1, 20), + KVList0 = generate_randomkeys(1, ?LOOK_SLOTSIZE * 16, 1, 20), KVList1 = lists:ukeysort(1, KVList0), [{FirstKey, _FV}|_Rest] = KVList1, {LastKey, _LV} = lists:last(KVList1), @@ -1659,8 +1746,9 @@ additional_range_test() -> simple_persisted_slotsize_test() -> {RP, Filename} = {"../test/", "simple_slotsize_test"}, - KVList0 = generate_randomkeys(1, ?SLOT_SIZE * 2, 1, 20), - KVList1 = lists:sublist(lists:ukeysort(1, KVList0), ?SLOT_SIZE), + KVList0 = generate_randomkeys(1, ?LOOK_SLOTSIZE * 2, 1, 20), + KVList1 = lists:sublist(lists:ukeysort(1, KVList0), + ?LOOK_SLOTSIZE), [{FirstKey, _FV}|_Rest] = KVList1, {LastKey, _LV} = lists:last(KVList1), {ok, Pid, {FirstKey, LastKey}} = sst_new(RP, @@ -1677,7 +1765,7 @@ simple_persisted_slotsize_test() -> simple_persisted_test() -> {RP, Filename} = {"../test/", "simple_test"}, - KVList0 = generate_randomkeys(1, ?SLOT_SIZE * 32, 1, 20), + KVList0 = generate_randomkeys(1, ?LOOK_SLOTSIZE * 32, 1, 20), KVList1 = lists:ukeysort(1, KVList0), [{FirstKey, _FV}|_Rest] = KVList1, {LastKey, _LV} = lists:last(KVList1), @@ -1706,7 +1794,7 @@ simple_persisted_test() -> ++ "microseconds~n", [length(KVList1), timer:now_diff(os:timestamp(), SW1)]), ok = sst_printtimings(Pid), - KVList2 = generate_randomkeys(1, ?SLOT_SIZE * 32, 1, 20), + KVList2 = generate_randomkeys(1, ?LOOK_SLOTSIZE * 32, 1, 20), MapFun = fun({K, V}, Acc) -> In = lists:keymember(K, 1, KVList1), @@ -1731,8 +1819,6 @@ 
simple_persisted_test() -> FoldFun = fun(X, Acc) -> case X of {pointer, P, S, SK, EK} -> - io:format("Get slot ~w with Acc at ~w~n", - [S, length(Acc)]), Acc ++ sst_getslots(P, [{pointer, P, S, SK, EK}]); _ -> Acc ++ [X] From 1fdcdf3b376ca3cf4a64d9ab9a7211fd945ab602 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Tue, 21 Mar 2017 17:47:08 +0000 Subject: [PATCH 09/15] Midblock size - lookup No real reason for the midblock to be smaller in lookup slots - so give the blocks a more consistent size --- src/leveled_sst.erl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index dc30ebe..5b49052 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -65,10 +65,10 @@ -include("include/leveled.hrl"). -define(MAX_SLOTS, 256). --define(LOOK_SLOTSIZE, 128). --define(LOOK_BLOCKSIZE, {28, 16}). % This is not configurable +-define(LOOK_SLOTSIZE, 128). % This is not configurable +-define(LOOK_BLOCKSIZE, {24, 32}). -define(NOLOOK_SLOTSIZE, 256). --define(NOLOOK_BLOCKSIZE, {56, 32}). % This is not configurable +-define(NOLOOK_BLOCKSIZE, {56, 32}). -define(COMPRESSION_LEVEL, 1). -define(BINARY_SETTINGS, [{compressed, ?COMPRESSION_LEVEL}]). % -define(LEVEL_BLOOM_BITS, [{0, 8}, {1, 10}, {2, 8}, {default, 6}]). From 756b46bb4d32efd7073bf24eefe1d8d4c937bee2 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Tue, 21 Mar 2017 17:53:34 +0000 Subject: [PATCH 10/15] Return to merge scan width of 16 This was reduced before the use of binary blocks was committed --- src/leveled_sst.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 5b49052..8483d85 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -72,7 +72,7 @@ -define(COMPRESSION_LEVEL, 1). -define(BINARY_SETTINGS, [{compressed, ?COMPRESSION_LEVEL}]). % -define(LEVEL_BLOOM_BITS, [{0, 8}, {1, 10}, {2, 8}, {default, 6}]). --define(MERGE_SCANWIDTH, 8). +-define(MERGE_SCANWIDTH, 16). 
-define(DISCARD_EXT, ".discarded"). -define(DELETE_TIMEOUT, 10000). -define(TREE_TYPE, idxt). From f1088716918011f88c7d07b30c103564159c85cb Mon Sep 17 00:00:00 2001 From: martinsumner Date: Tue, 21 Mar 2017 18:15:56 +0000 Subject: [PATCH 11/15] Vclock metadata change Test performance continues to be worse since the vclock metadata change. Reversing out just in case. --- src/leveled_codec.erl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl index 665c6dc..484d927 100644 --- a/src/leveled_codec.erl +++ b/src/leveled_codec.erl @@ -433,7 +433,8 @@ riak_extract_metadata(ObjBin, Size) -> %% <>. -riak_metadata_to_binary(VclockBin, SibData) -> +riak_metadata_to_binary(Vclock, SibData) -> + VclockBin = term_to_binary(Vclock), VclockLen = byte_size(VclockBin), % <>. @@ -454,7 +455,7 @@ riak_metadata_from_binary(V1Binary) -> SC when is_integer(SC) -> get_metadata_from_siblings(SibsBin, SibCount, []) end, - {VclockBin, SibMetaBinList}. + {binary_to_term(VclockBin), SibMetaBinList}. 
% Fixes the value length for each sibling to be zero, and so includes no value slimbin_content(MetaBin) -> From eef2199335adb8a54e3bb78c3f07f867d676c062 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Tue, 21 Mar 2017 18:24:11 +0000 Subject: [PATCH 12/15] Up level for yield to 2 --- src/leveled_sst.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 287ff3c..9ec97da 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -278,7 +278,7 @@ starting({sst_new, RootPath, Filename, Level, {SlotList, FirstKey}, MaxSQN}, Length, MaxSQN), ActualFilename = write_file(RootPath, Filename, SummaryBin, SlotsBin), - YBQ = Level =< 1, + YBQ = Level =< 2, UpdState = read_file(ActualFilename, State#state{root_path=RootPath, yield_blockquery=YBQ}), From 97312e13560511e6588d366d370469f3ce3a813c Mon Sep 17 00:00:00 2001 From: martinsumner Date: Tue, 21 Mar 2017 18:28:28 +0000 Subject: [PATCH 13/15] Test to reflect vclock not binary --- test/end_to_end/testutil.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/end_to_end/testutil.erl b/test/end_to_end/testutil.erl index 9e7f758..e53eb4f 100644 --- a/test/end_to_end/testutil.erl +++ b/test/end_to_end/testutil.erl @@ -262,7 +262,7 @@ check_forobject(Bookie, TestObject) -> Vclock, _Hash, size} = leveled_codec:riak_extract_metadata(HeadBinary, size), - true = binary_to_term(Vclock) == TestObject#r_object.vclock. + true = Vclock == TestObject#r_object.vclock. 
check_formissingobject(Bookie, Bucket, Key) -> not_found = book_riakget(Bookie, Bucket, Key), From 15af4942ae790dd19a92bb3b6e8e13de7f30178e Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Wed, 22 Mar 2017 00:11:17 +0000 Subject: [PATCH 14/15] Remove busy log Accounts for 60% of logs --- src/leveled_bookie.erl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl index 055cfa9..7e5c4cd 100644 --- a/src/leveled_bookie.erl +++ b/src/leveled_bookie.erl @@ -637,7 +637,6 @@ snapshot_store(LedgerCache0, Penciller, Inker, SnapType, Query) -> snapshot_query = Query, bookies_mem = BookiesMem}, {ok, LedgerSnapshot} = leveled_penciller:pcl_start(PCLopts), - leveled_log:log_randomtimer("B0004", [cache_size(LedgerCache)], SW, 0.02), case SnapType of store -> InkerOpts = #inker_options{start_snapshot=true, From 8db73917fbb045eb01b95d101c1ce574b5fb62b5 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Wed, 22 Mar 2017 00:14:37 +0000 Subject: [PATCH 15/15] Need also to remove unused bits --- src/leveled_bookie.erl | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl index 7e5c4cd..81a5a36 100644 --- a/src/leveled_bookie.erl +++ b/src/leveled_bookie.erl @@ -626,7 +626,6 @@ loadqueue_ledgercache(Cache) -> %% snapshot is to be used for one specific query only (this is much quicker to %% setup, assuming the range is a small subset of the overall key space). snapshot_store(LedgerCache0, Penciller, Inker, SnapType, Query) -> - SW = os:timestamp(), LedgerCache = readycache_forsnapshot(LedgerCache0, Query), BookiesMem = {LedgerCache#ledger_cache.loader, LedgerCache#ledger_cache.index, @@ -670,9 +669,6 @@ maybe_longrunning(SW, Aspect) -> ok end. -cache_size(LedgerCache) -> - ets:info(LedgerCache#ledger_cache.mem, size). - bucket_stats(State, Bucket, Tag) -> {ok, LedgerSnapshot, _JournalSnapshot} = snapshot_store(State, ledger,