From 5bdb7fd7facc33b447f8914c1e65366c93d5dfcc Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Thu, 8 Dec 2016 23:38:50 +0000
Subject: [PATCH 01/34] Alter Riak HEAD

Change the extract of Riak metadata. In Riak-based volume tests the writing
of SFT files is tanking. Could this be the "extra" metadata? i.e. there are
only current plans to look at the vclock. Sibling count is free to fetch, so
what if we just get these two items: will it be less CPU to extract the
metadata, and will the reduced weight also reduce the downstream impact?
---
 src/leveled_codec.erl | 49 ++++++++-----------------------------------
 1 file changed, 9 insertions(+), 40 deletions(-)

diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl
index 8903198..19e9c9f 100644
--- a/src/leveled_codec.erl
+++ b/src/leveled_codec.erl
@@ -331,8 +331,8 @@ build_metadata_object(PrimaryKey, MD) ->
     {Tag, _Bucket, _Key, null} = PrimaryKey,
     case Tag of
         ?RIAK_TAG ->
-            {SibMetaBinList, Vclock, _Hash, _Size} = MD,
-            riak_metadata_to_binary(Vclock, SibMetaBinList);
+            {SibCount, Vclock, _Hash, _Size} = MD,
+            riak_metadata_to_binary(Vclock, SibCount);
         ?STD_TAG ->
             MD
     end.
@@ -341,55 +341,24 @@
 riak_extract_metadata(delete, Size) ->
     {delete, null, null, Size};
 riak_extract_metadata(ObjBin, Size) ->
-    {Vclock, SibMetaBinList} = riak_metadata_from_binary(ObjBin),
-    {SibMetaBinList, Vclock, erlang:phash2(ObjBin), Size}.
+    {Vclock, SibCount} = riak_metadata_from_binary(ObjBin),
+    {SibCount, Vclock, erlang:phash2(ObjBin), Size}.
 
 %% <<?MAGIC:8/integer, ?V1_VERS:8/integer, VclockLen:32/integer,
 %%     VclockBin/binary, SibCount:32/integer, SibsBin/binary>>.
 
-riak_metadata_to_binary(Vclock, SibMetaBinList) ->
+riak_metadata_to_binary(Vclock, SibCount) ->
     VclockBin = term_to_binary(Vclock),
     VclockLen = byte_size(VclockBin),
-    SibCount = length(SibMetaBinList),
-    SibsBin = slimbin_contents(SibMetaBinList),
     <<?MAGIC:8/integer, ?V1_VERS:8/integer, VclockLen:32/integer,
-        VclockBin:VclockLen/binary, SibCount:32/integer, SibsBin/binary>>.
+        VclockBin:VclockLen/binary, SibCount:32/integer>>.
 
-% Fixes the value length for each sibling to be zero, and so includes no value
-slimbin_content(MetaBin) ->
-    MetaLen = byte_size(MetaBin),
-    <<0:32/integer, MetaLen:32/integer, MetaBin:MetaLen/binary>>.
-
-slimbin_contents(SibMetaBinList) ->
-    F = fun(MetaBin, Acc) ->
-            <<Acc/binary, (slimbin_content(MetaBin))/binary>>
-        end,
-    lists:foldl(F, <<>>, SibMetaBinList).
-
 riak_metadata_from_binary(V1Binary) ->
     <<?MAGIC:8/integer, ?V1_VERS:8/integer, VclockLen:32/integer,
         Rest/binary>> = V1Binary,
-    <<VclockBin:VclockLen/binary, SibCount:32/integer, SibsBin/binary>> = Rest,
-    SibMetaBinList =
-        case SibCount of
-            0 ->
-                [];
-            SC when is_integer(SC) ->
-                get_metadata_from_siblings(SibsBin, SibCount, [])
-        end,
-    {binary_to_term(VclockBin), SibMetaBinList}.
-
-get_metadata_from_siblings(<<>>, 0, SibMetaBinList) ->
-    SibMetaBinList;
-get_metadata_from_siblings(<<ValLen:32/integer, Rest0/binary>>,
-                            SibCount,
-                            SibMetaBinList) ->
-    <<_ValBin:ValLen/binary, MetaLen:32/integer, Rest1/binary>> = Rest0,
-    <<MetaBin:MetaLen/binary, Rest2/binary>> = Rest1,
-    get_metadata_from_siblings(Rest2,
-                                SibCount - 1,
-                                [MetaBin|SibMetaBinList]).
-
+    <<VclockBin:VclockLen/binary, SibCount:32/integer, _Rest/binary>> = Rest,
+    {binary_to_term(VclockBin), SibCount}.
+

From 349d194a7cf465217345b4ba6c87f4ba99ddfde6 Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Fri, 9 Dec 2016 09:52:31 +0000
Subject: [PATCH 02/34] Increase jitter slightly

---
 src/leveled_bookie.erl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl
index 30e56e4..891d6a4 100644
--- a/src/leveled_bookie.erl
+++ b/src/leveled_bookie.erl
@@ -145,7 +145,7 @@
 -define(LEDGER_FP, "ledger").
 -define(SNAPSHOT_TIMEOUT, 300000).
 -define(CHECKJOURNAL_PROB, 0.2).
--define(CACHE_SIZE_JITTER, 20).
+-define(CACHE_SIZE_JITTER, 25).
 -define(JOURNAL_SIZE_JITTER, 10).
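
%% A minimal sketch (not part of the patch) of the slimmed metadata round
%% trip from PATCH 01: only the vclock and the 32-bit sibling count survive
%% the encode. The literals 53 and 1 mirror ?MAGIC and ?V1_VERS from
%% leveled_codec.

slim_roundtrip(Vclock, SibCount) ->
    VclockBin = term_to_binary(Vclock),
    VclockLen = byte_size(VclockBin),
    Bin = <<53:8/integer, 1:8/integer, VclockLen:32/integer,
            VclockBin:VclockLen/binary, SibCount:32/integer>>,
    <<53:8/integer, 1:8/integer, VL:32/integer,
        VB:VL/binary, SC:32/integer>> = Bin,
    {binary_to_term(VB), SC} =:= {Vclock, SibCount}.
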
 -record(state, {inker :: pid(),

From 82cb49638a1919ae1c3e6a0cd8f4cf9c1144f79e Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Fri, 9 Dec 2016 14:36:03 +0000
Subject: [PATCH 03/34] Attempt at performance improvement

Try to add some extra jitter into the process of L0 writes, and also make
L0 writes delayed to help with buffering
---
 include/leveled.hrl | 3 ++-
 src/leveled_bookie.erl | 5 +++--
 src/leveled_penciller.erl | 28 ++++++++++++++++++++++++----
 src/leveled_sft.erl | 4 +++-
 4 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/include/leveled.hrl b/include/leveled.hrl
index 0e62cf3..25216f6 100644
--- a/include/leveled.hrl
+++ b/include/leveled.hrl
@@ -64,7 +64,8 @@
 {root_path :: string(),
 max_inmemory_tablesize :: integer(),
 start_snapshot = false :: boolean(),
- source_penciller :: pid()}).
+ source_penciller :: pid(),
+ levelzero_cointoss = false :: boolean}).
 
 -record(iclerk_options,
 {inker :: pid(),
diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl
index 891d6a4..ce444fb 100644
--- a/src/leveled_bookie.erl
+++ b/src/leveled_bookie.erl
@@ -146,7 +146,7 @@
 -define(SNAPSHOT_TIMEOUT, 300000).
 -define(CHECKJOURNAL_PROB, 0.2).
 -define(CACHE_SIZE_JITTER, 25).
--define(JOURNAL_SIZE_JITTER, 10).
+-define(JOURNAL_SIZE_JITTER, 20).
 
 -record(state, {inker :: pid(),
 penciller :: pid(),
@@ -692,7 +692,8 @@ set_options(Opts) ->
 binary_mode=true,
 sync_strategy=SyncStrat}},
 #penciller_options{root_path = LedgerFP,
- max_inmemory_tablesize = PCLL0CacheSize}}.
+ max_inmemory_tablesize = PCLL0CacheSize,
+ levelzero_cointoss = true}}.
 
 startup(InkerOpts, PencillerOpts) ->
 {ok, Inker} = leveled_inker:ink_start(InkerOpts),
diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl
index 94bac54..be62cf9 100644
--- a/src/leveled_penciller.erl
+++ b/src/leveled_penciller.erl
@@ -198,7 +198,7 @@
 -define(MAX_TABLESIZE, 32000).
 -define(PROMPT_WAIT_ONL0, 5).
 -define(WORKQUEUE_BACKLOG_TOLERANCE, 4).
-
+-define(COIN_SIDECOUNT, 4).
 
 -record(state, {manifest = [] :: list(),
 manifest_sqn = 0 :: integer(),
@@ -217,6 +217,7 @@
 % is an array - but cannot specify due to OTP compatibility
 levelzero_size = 0 :: integer(),
 levelzero_maxcachesize :: integer(),
+ levelzero_cointoss = false :: boolean(),
 
 is_snapshot = false :: boolean(),
 snapshot_fully_loaded = false :: boolean(),
@@ -537,10 +538,17 @@ start_from_file(PCLopts) ->
 end,
 
 {ok, MergeClerk} = leveled_pclerk:clerk_new(self()),
+
+ CoinToss = PCLopts#penciller_options.levelzero_cointoss,
+ % Used to randomly defer the writing of the L0 file. Intended to help with
+ % vnode synchronisation issues (e.g. 
stop them all by default merging to + % level zero concurrently) + InitState = #state{clerk=MergeClerk, root_path=RootPath, levelzero_index = leveled_pmem:new_index(), - levelzero_maxcachesize=MaxTableSize}, + levelzero_maxcachesize=MaxTableSize, + levelzero_cointoss=CoinToss}, %% Open manifest ManifestPath = InitState#state.root_path ++ "/" ++ ?MANIFEST_FP ++ "/", @@ -629,8 +637,20 @@ update_levelzero(L0Index, L0Size, PushedTree, LedgerSQN, L0Cache, State) -> ledger_sqn=MaxSQN}, CacheTooBig = NewL0Size > State#state.levelzero_maxcachesize, Level0Free = length(get_item(0, State#state.manifest, [])) == 0, - case {CacheTooBig, Level0Free} of - {true, true} -> + RandomFactor = + case State#state.levelzero_cointoss of + true -> + case random:uniform(?COIN_SIDECOUNT) of + 1 -> + true; + _ -> + false + end; + false -> + true + end, + case {CacheTooBig, Level0Free, RandomFactor} of + {true, true, true} -> L0Constructor = roll_memory(UpdState, false), UpdState#state{levelzero_pending=true, levelzero_constructor=L0Constructor}; diff --git a/src/leveled_sft.erl b/src/leveled_sft.erl index 70b0b0f..9c67721 100644 --- a/src/leveled_sft.erl +++ b/src/leveled_sft.erl @@ -192,6 +192,8 @@ -define(DELETE_TIMEOUT, 10000). -define(MAX_KEYS, ?SLOT_COUNT * ?BLOCK_COUNT * ?BLOCK_SIZE). -define(DISCARD_EXT, ".discarded"). +-define(WRITE_OPS, [binary, raw, read, write, delayed_write]). +-define(READ_OPS, [binary, raw, read]). -record(state, {version = ?CURRENT_VERSION :: tuple(), slot_index :: list(), @@ -469,7 +471,7 @@ generate_filenames(RootFilename) -> create_file(FileName) when is_list(FileName) -> leveled_log:log("SFT01", [FileName]), ok = filelib:ensure_dir(FileName), - {ok, Handle} = file:open(FileName, [binary, raw, read, write]), + {ok, Handle} = file:open(FileName, ?WRITE_OPS), Header = create_header(initial), {ok, _} = file:position(Handle, bof), ok = file:write(Handle, Header), From f0db730f07dc83f1eebe9324223a3a7f810c1431 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Fri, 9 Dec 2016 16:34:15 +0000 Subject: [PATCH 04/34] Adjust jitter settings --- src/leveled_bookie.erl | 4 ++-- src/leveled_penciller.erl | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl index ce444fb..6781e08 100644 --- a/src/leveled_bookie.erl +++ b/src/leveled_bookie.erl @@ -233,7 +233,7 @@ init([Opts]) -> {Inker, Penciller} = startup(InkerOpts, PencillerOpts), CacheJitter = ?CACHE_SIZE div (100 div ?CACHE_SIZE_JITTER), CacheSize = get_opt(cache_size, Opts, ?CACHE_SIZE) - + erlang:phash2(self()) band CacheJitter, + + erlang:phash2(self()) rem CacheJitter, leveled_log:log("B0001", [Inker, Penciller]), {ok, #state{inker=Inker, penciller=Penciller, @@ -668,7 +668,7 @@ set_options(Opts) -> MaxJournalSize0 = get_opt(max_journalsize, Opts, 10000000000), JournalSizeJitter = MaxJournalSize0 div (100 div ?JOURNAL_SIZE_JITTER), MaxJournalSize = MaxJournalSize0 - - erlang:phash2(self()) band JournalSizeJitter, + erlang:phash2(self()) rem JournalSizeJitter, SyncStrat = get_opt(sync_strategy, Opts, sync), WRP = get_opt(waste_retention_period, Opts), diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl index be62cf9..ab770a5 100644 --- a/src/leveled_penciller.erl +++ b/src/leveled_penciller.erl @@ -198,7 +198,7 @@ -define(MAX_TABLESIZE, 32000). -define(PROMPT_WAIT_ONL0, 5). -define(WORKQUEUE_BACKLOG_TOLERANCE, 4). --define(COIN_SIDECOUNT, 4). +-define(COIN_SIDECOUNT, 2). 
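
%% A minimal sketch (hypothetical helper, not in the patch) of the gate the
%% penciller now applies: with ?COIN_SIDECOUNT at 2, roughly half of the
%% otherwise-eligible pushes defer the L0 write to a later push, spreading
%% concurrent vnode L0 writes out over time.

should_roll_l0(CacheTooBig, Level0Free, CoinToss) ->
    RandomFactor =
        case CoinToss of
            true -> random:uniform(2) == 1;
            false -> true
        end,
    CacheTooBig andalso Level0Free andalso RandomFactor.
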
 -record(state, {manifest = [] :: list(),
 manifest_sqn = 0 :: integer(),

From d2bd01eaf12dc73243ee9a408833debaf95a18a4 Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Fri, 9 Dec 2016 18:30:40 +0000
Subject: [PATCH 05/34] Add fast fail to skiplist

Add a bloom filter to the skiplist, to make it faster at returning not
found. The SkipList is now encapsulated within a dict().
---
 src/leveled_bookie.erl | 2 +-
 src/leveled_penciller.erl | 2 +-
 src/leveled_skiplist.erl | 104 +++++++++++++++++++++++++++++---------
 3 files changed, 81 insertions(+), 27 deletions(-)

diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl
index 6781e08..78dbed8 100644
--- a/src/leveled_bookie.erl
+++ b/src/leveled_bookie.erl
@@ -151,7 +151,7 @@
 -record(state, {inker :: pid(),
 penciller :: pid(),
 cache_size :: integer(),
- ledger_cache :: list(), % a skiplist
+ ledger_cache :: dict:dict(), % a skiplist
 is_snapshot :: boolean(),
 slow_offer = false :: boolean()}).
 
diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl
index ab770a5..fb8ef02 100644
--- a/src/leveled_penciller.erl
+++ b/src/leveled_penciller.erl
@@ -222,7 +222,7 @@
 is_snapshot = false :: boolean(),
 snapshot_fully_loaded = false :: boolean(),
 source_penciller :: pid(),
- levelzero_astree :: list(), % skiplist
+ levelzero_astree :: list(),
 ongoing_work = [] :: list(),
 work_backlog = false :: boolean()}).
 
diff --git a/src/leveled_skiplist.erl b/src/leveled_skiplist.erl
index b9d9af4..5cf8961 100644
--- a/src/leveled_skiplist.erl
+++ b/src/leveled_skiplist.erl
@@ -23,6 +23,7 @@
 to_range/2,
 to_range/3,
 lookup/2,
+ lookup/3,
 empty/0,
 size/1
 ]).
@@ -32,50 +33,98 @@
 -define(SKIP_WIDTH, 16).
 -define(LIST_HEIGHT, 2).
 -define(INFINITY_KEY, {null, null, null, null, null}).
-
+-define(BITARRAY_SIZE, 2048).
 
 %%%============================================================================
 %%% SkipList API
 %%%============================================================================
 
 enter(Key, Value, SkipList) ->
-    enter(Key, Value, SkipList, ?SKIP_WIDTH, ?LIST_HEIGHT).
+    Hash = erlang:phash2(Key),
+    SkipList0 = add_to_array(Hash, SkipList),
+    NewListPart = enter(Key, Value, Hash,
+                        dict:fetch(?SKIP_WIDTH, SkipList0),
+                        ?SKIP_WIDTH, ?LIST_HEIGHT),
+    dict:store(?SKIP_WIDTH, NewListPart, SkipList0).
 
 from_list(UnsortedKVL) ->
     KVL = lists:ukeysort(1, UnsortedKVL),
-    from_list(KVL, ?SKIP_WIDTH, ?LIST_HEIGHT).
+    from_sortedlist(KVL).
 
 from_sortedlist(SortedKVL) ->
-    from_list(SortedKVL, ?SKIP_WIDTH, ?LIST_HEIGHT).
+    SL0 = lists:foldr(fun({K, _V}, SkipL) ->
+                          H = erlang:phash2(K),
+                          add_to_array(H, SkipL) end,
+                      empty(),
+                      SortedKVL),
+    dict:store(?SKIP_WIDTH,
+               from_list(SortedKVL, ?SKIP_WIDTH, ?LIST_HEIGHT),
+               SL0).
 
 lookup(Key, SkipList) ->
-    lookup(Key, SkipList, ?LIST_HEIGHT).
+    lookup(Key, erlang:phash2(Key), SkipList).
+
+lookup(Key, Hash, SkipList) ->
+    {Slot, Bit} = hash_toslotbit(Hash),
+    RestLen = ?BITARRAY_SIZE - Bit - 1,
+    <<_Head:Bit/bitstring,
+        B:1/bitstring,
+        _Rest:RestLen/bitstring>> = dict:fetch(Slot, SkipList),
+    case B of
+        <<0:1>> ->
+            none;
+        <<1:1>> ->
+            list_lookup(Key, dict:fetch(?SKIP_WIDTH, SkipList), ?LIST_HEIGHT)
+    end.
 
 %% Rather than support iterator_from like gb_trees, will just output a
 %% key-sorted list for the desired range, which can then be iterated over
 %% as normal
 to_range(SkipList, Start) ->
-    to_range(SkipList, Start, ?INFINITY_KEY, ?LIST_HEIGHT).
+    to_range(dict:fetch(?SKIP_WIDTH, SkipList), Start, ?INFINITY_KEY, ?LIST_HEIGHT).
 
 to_range(SkipList, Start, End) ->
-    to_range(SkipList, Start, End, ?LIST_HEIGHT).
+    to_range(dict:fetch(?SKIP_WIDTH, SkipList), Start, End, ?LIST_HEIGHT).
 
 to_list(SkipList) ->
-    to_list(SkipList, ?LIST_HEIGHT).
+    to_list(dict:fetch(?SKIP_WIDTH, SkipList), ?LIST_HEIGHT).
 
 empty() ->
-    empty([], ?LIST_HEIGHT).
+    FoldFun =
+        fun(X, Acc) -> dict:store(X, <<0:?BITARRAY_SIZE>>, Acc) end,
+    lists:foldl(FoldFun,
+                dict:store(?SKIP_WIDTH,
+                           empty([], ?LIST_HEIGHT),
+                           dict:new()),
+                lists:seq(0, ?SKIP_WIDTH - 1)).
+
+
 size(SkipList) ->
-    size(SkipList, ?LIST_HEIGHT).
+    size(dict:fetch(?SKIP_WIDTH, SkipList), ?LIST_HEIGHT).
+
 
 %%%============================================================================
 %%% SkipList Base Functions
 %%%============================================================================
 
-enter(Key, Value, SkipList, Width, 1) ->
-    Hash = erlang:phash2(Key),
+hash_toslotbit(Hash) ->
+    Slot = Hash band (?SKIP_WIDTH - 1),
+    Bit = (Hash bsr ?SKIP_WIDTH) band (?BITARRAY_SIZE - 1),
+    {Slot, Bit}.
+
+
+add_to_array(Hash, SkipList) ->
+    {Slot, Bit} = hash_toslotbit(Hash),
+    RestLen = ?BITARRAY_SIZE - Bit - 1,
+    <<Head:Bit/bitstring,
+        _B:1/bitstring,
+        Rest:RestLen/bitstring>> = dict:fetch(Slot, SkipList),
+    BitArray = <<Head/bitstring, 1:1, Rest/bitstring>>,
+    dict:store(Slot, BitArray, SkipList).
+
+enter(Key, Value, Hash, SkipList, Width, 1) ->
     {MarkerKey, SubList} = find_mark(Key, SkipList),
     case Hash rem Width of
         0 ->
@@ -101,11 +150,10 @@ enter(Key, Value, SkipList, Width, 1) ->
         end,
         lists:keyreplace(MarkerKey, 1, SkipList, {MarkerKey, UpdSubList})
     end;
-enter(Key, Value, SkipList, Width, Level) ->
-    Hash = erlang:phash2(Key),
+enter(Key, Value, Hash, SkipList, Width, Level) ->
     HashMatch = width(Level, Width),
     {MarkerKey, SubSkipList} = find_mark(Key, SkipList),
-    UpdSubSkipList = enter(Key, Value, SubSkipList, Width, Level - 1),
+    UpdSubSkipList = enter(Key, Value, Hash, SubSkipList, Width, Level - 1),
     case Hash rem HashMatch of
         0 ->
             %
@@ -171,7 +219,7 @@ from_list(KVL, Width, Level) ->
     end.
 
 
-lookup(Key, SkipList, 1) ->
+list_lookup(Key, SkipList, 1) ->
     SubList = get_sublist(Key, SkipList),
     case lists:keyfind(Key, 1, SubList) of
         false ->
@@ -179,13 +227,13 @@
         {Key, V} ->
             {value, V}
     end;
-lookup(Key, SkipList, Level) ->
+list_lookup(Key, SkipList, Level) ->
     SubList = get_sublist(Key, SkipList),
     case SubList of
         null ->
             none;
         _ ->
-            lookup(Key, SubList, Level - 1)
+            list_lookup(Key, SubList, Level - 1)
     end.
 
@@ -385,16 +433,19 @@ dotest_skiplist_small(N) ->
     lists:ukeysort(1, lists:reverse(KL))).
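
%% For clarity, a self-contained sketch (hypothetical names) of the bloom
%% mechanics added above: each of the 16 slots holds a 2048-bit bitstring,
%% and a key's phash2 value selects one slot and one bit within it.

bloom_member(Hash, BitArray) ->
    Bit = (Hash bsr 16) band (2048 - 1),
    RestLen = 2048 - Bit - 1,
    <<_Head:Bit/bitstring, B:1/bitstring, _Rest:RestLen/bitstring>> = BitArray,
    B == <<1:1>>.
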
skiplist_test() -> - N = 8000, + N = 4000, KL = generate_randomkeys(1, N, 1, N div 5), SWaGSL = os:timestamp(), SkipList = from_list(lists:reverse(KL)), io:format(user, "Generating skip list with ~w keys in ~w microseconds~n" ++ "Top level key count of ~w~n", - [N, timer:now_diff(os:timestamp(), SWaGSL), length(SkipList)]), + [N, + timer:now_diff(os:timestamp(), SWaGSL), + length(dict:fetch(?SKIP_WIDTH, SkipList))]), io:format(user, "Second tier key counts of ~w~n", - [lists:map(fun({_L, SL}) -> length(SL) end, SkipList)]), + [lists:map(fun({_L, SL}) -> length(SL) end, + dict:fetch(?SKIP_WIDTH, SkipList))]), KLSorted = lists:ukeysort(1, lists:reverse(KL)), SWaGSL2 = os:timestamp(), @@ -413,9 +464,12 @@ skiplist_test() -> io:format(user, "Dynamic load of skiplist with ~w keys took ~w " ++ "microseconds~n" ++ "Top level key count of ~w~n", - [N, timer:now_diff(os:timestamp(), SWaDSL), length(SkipList1)]), + [N, + timer:now_diff(os:timestamp(), SWaDSL), + length(dict:fetch(?SKIP_WIDTH, SkipList1))]), io:format(user, "Second tier key counts of ~w~n", - [lists:map(fun({_L, SL}) -> length(SL) end, SkipList1)]), + [lists:map(fun({_L, SL}) -> length(SL) end, + dict:fetch(?SKIP_WIDTH, SkipList1))]), io:format(user, "~nRunning timing tests for generated skiplist:~n", []), skiplist_timingtest(KLSorted, SkipList, N), @@ -482,13 +536,13 @@ skiplist_timingtest(KL, SkipList, N) -> io:format(user, "Finding 10 ranges took ~w microseconds~n", [timer:now_diff(os:timestamp(), SWc)]), - AltKL1 = generate_randomkeys(1, 1000, 1, 200), + AltKL1 = generate_randomkeys(1, 2000, 1, 200), SWd = os:timestamp(), lists:foreach(fun({K, _V}) -> lookup(K, SkipList) end, AltKL1), - io:format(user, "Getting 1000 mainly missing keys took ~w microseconds~n", + io:format(user, "Getting 2000 mainly missing keys took ~w microseconds~n", [timer:now_diff(os:timestamp(), SWd)]), AltKL2 = generate_randomkeys(1, 1000, N div 5 + 1, N div 5 + 300), SWe = os:timestamp(), From a3f60e36099c58958d88b7cbd798d2763ea39bca Mon Sep 17 00:00:00 2001 From: martinsumner Date: Fri, 9 Dec 2016 18:55:13 +0000 Subject: [PATCH 06/34] OTP version shenanigans --- src/leveled_bookie.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl index 78dbed8..3e335a2 100644 --- a/src/leveled_bookie.erl +++ b/src/leveled_bookie.erl @@ -151,7 +151,7 @@ -record(state, {inker :: pid(), penciller :: pid(), cache_size :: integer(), - ledger_cache :: dict:dict(), % a skiplist + ledger_cache, % a skiplist is_snapshot :: boolean(), slow_offer = false :: boolean()}). From 626a8e63f914b416e1db7dba603bf120b485c32f Mon Sep 17 00:00:00 2001 From: martinsumner Date: Sat, 10 Dec 2016 10:55:35 +0000 Subject: [PATCH 07/34] Experiment converting CDB to use skiplist not gb_tree Might insertion time be faster? 
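
One way to answer that question directly is to time dynamic inserts into both
structures; a rough sketch (assuming both modules are on the code path, and
that sequential keys are representative enough for a first look):

    compare_insert_times(N) ->
        KVs = [{{o, "B", integer_to_list(X)}, X} || X <- lists:seq(1, N)],
        {GBTime, _} =
            timer:tc(fun() ->
                        lists:foldl(fun({K, V}, T) -> gb_trees:enter(K, V, T) end,
                                    gb_trees:empty(),
                                    KVs)
                     end),
        {SLTime, _} =
            timer:tc(fun() ->
                        lists:foldl(fun({K, V}, SL) ->
                                        leveled_skiplist:enter(K, V, SL)
                                    end,
                                    leveled_skiplist:empty(),
                                    KVs)
                     end),
        {gb_trees_microsec, GBTime, skiplist_microsec, SLTime}.
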
--- src/leveled_bookie.erl | 6 +- src/leveled_cdb.erl | 52 ++++++------ src/leveled_penciller.erl | 2 +- src/leveled_skiplist.erl | 169 ++++++++++++++++++++++++++++++-------- 4 files changed, 166 insertions(+), 63 deletions(-) diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl index 3e335a2..a50e9fa 100644 --- a/src/leveled_bookie.erl +++ b/src/leveled_bookie.erl @@ -238,14 +238,14 @@ init([Opts]) -> {ok, #state{inker=Inker, penciller=Penciller, cache_size=CacheSize, - ledger_cache=leveled_skiplist:empty(), + ledger_cache=leveled_skiplist:empty(true), is_snapshot=false}}; Bookie -> {ok, {Penciller, LedgerCache}, Inker} = book_snapshotstore(Bookie, self(), ?SNAPSHOT_TIMEOUT), ok = leveled_penciller:pcl_loadsnapshot(Penciller, - leveled_skiplist:empty()), + leveled_skiplist:empty(true)), leveled_log:log("B0002", [Inker, Penciller]), {ok, #state{penciller=Penciller, inker=Inker, @@ -885,7 +885,7 @@ maybepush_ledgercache(MaxCacheSize, Cache, Penciller) -> TimeToPush -> case leveled_penciller:pcl_pushmem(Penciller, Cache) of ok -> - {ok, leveled_skiplist:empty()}; + {ok, leveled_skiplist:empty(true)}; returned -> {returned, Cache} end; diff --git a/src/leveled_cdb.erl b/src/leveled_cdb.erl index 1354571..8e4451c 100644 --- a/src/leveled_cdb.erl +++ b/src/leveled_cdb.erl @@ -861,24 +861,28 @@ get_hashtree(Key, HashTree) -> Hash = hash(Key), Index = hash_to_index(Hash), Tree = array:get(Index, HashTree), - case gb_trees:lookup(Hash, Tree) of + case leveled_skiplist:lookup(Hash, Tree) of {value, List} -> List; _ -> [] end. -%% Add to hash tree - this is an array of 256 gb_trees that contains the Hash +%% Add to hash tree - this is an array of 256 skiplists that contains the Hash %% and position of objects which have been added to an open CDB file put_hashtree(Key, Position, HashTree) -> Hash = hash(Key), Index = hash_to_index(Hash), Tree = array:get(Index, HashTree), - case gb_trees:lookup(Hash, Tree) of + case leveled_skiplist:lookup(Hash, Tree) of none -> - array:set(Index, gb_trees:insert(Hash, [Position], Tree), HashTree); + array:set(Index, + leveled_skiplist:enter(Hash, [Position], Tree), + HashTree); {value, L} -> - array:set(Index, gb_trees:update(Hash, [Position|L], Tree), HashTree) + array:set(Index, + leveled_skiplist:enter(Hash, [Position|L], Tree), + HashTree) end. %% Function to extract a Key-Value pair given a file handle and a position @@ -920,7 +924,7 @@ extract_key_value_check(Handle, Position) -> %% Scan through the file until there is a failure to crc check an input, and %% at that point return the position and the key dictionary scanned so far startup_scan_over_file(Handle, Position) -> - HashTree = array:new(256, {default, gb_trees:empty()}), + HashTree = array:new(256, {default, leveled_skiplist:empty()}), scan_over_file(Handle, Position, fun startup_filter/5, @@ -1148,7 +1152,7 @@ search_hash_table(Handle, [Entry|RestOfEntries], Hash, Key, QuickCheck) -> % key/value binary in the file. write_key_value_pairs(Handle, KeyValueList) -> {ok, Position} = file:position(Handle, cur), - HashTree = array:new(256, {default, gb_trees:empty()}), + HashTree = array:new(256, {default, leveled_skiplist:empty()}), write_key_value_pairs(Handle, KeyValueList, {Position, HashTree}). 
write_key_value_pairs(_, [], Acc) -> @@ -1181,11 +1185,11 @@ write_hash_tables([], _HashTree, _CurrPos, IndexList, HashTreeBin) -> {IndexList, HashTreeBin}; write_hash_tables([Index|Rest], HashTree, CurrPos, IndexList, HashTreeBin) -> Tree = array:get(Index, HashTree), - case gb_trees:keys(Tree) of - [] -> + case leveled_skiplist:size(Tree) of + 0 -> write_hash_tables(Rest, HashTree, CurrPos, IndexList, HashTreeBin); _ -> - HashList = gb_trees:to_list(Tree), + HashList = leveled_skiplist:to_list(Tree), BinList = build_binaryhashlist(HashList, []), IndexLength = length(BinList) * 2, SlotList = lists:duplicate(IndexLength, <<0:32, 0:32>>), @@ -1402,16 +1406,16 @@ write_key_value_pairs_1_test() -> Index1 = hash_to_index(Hash1), Hash2 = hash("key2"), Index2 = hash_to_index(Hash2), - R0 = array:new(256, {default, gb_trees:empty()}), + R0 = array:new(256, {default, leveled_skiplist:empty()}), R1 = array:set(Index1, - gb_trees:insert(Hash1, - [0], - array:get(Index1, R0)), + leveled_skiplist:enter(Hash1, + [0], + array:get(Index1, R0)), R0), R2 = array:set(Index2, - gb_trees:insert(Hash2, - [30], - array:get(Index2, R1)), + leveled_skiplist:enter(Hash2, + [30], + array:get(Index2, R1)), R1), io:format("HashTree is ~w~n", [HashTree]), io:format("Expected HashTree is ~w~n", [R2]), @@ -1421,16 +1425,16 @@ write_key_value_pairs_1_test() -> write_hash_tables_1_test() -> {ok, Handle} = file:open("../test/testx.cdb", [write]), - R0 = array:new(256, {default, gb_trees:empty()}), + R0 = array:new(256, {default, leveled_skiplist:empty()}), R1 = array:set(64, - gb_trees:insert(6383014720, - [18], - array:get(64, R0)), + leveled_skiplist:enter(6383014720, + [18], + array:get(64, R0)), R0), R2 = array:set(67, - gb_trees:insert(6383014723, - [0], - array:get(67, R1)), + leveled_skiplist:enter(6383014723, + [0], + array:get(67, R1)), R1), Result = write_hash_tables(Handle, R2), io:format("write hash tables result of ~w ~n", [Result]), diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl index fb8ef02..dc83474 100644 --- a/src/leveled_penciller.erl +++ b/src/leveled_penciller.erl @@ -1283,7 +1283,7 @@ confirm_delete_test() -> maybe_pause_push(PCL, KL) -> - T0 = leveled_skiplist:empty(), + T0 = leveled_skiplist:empty(true), T1 = lists:foldl(fun({K, V}, Acc) -> leveled_skiplist:enter(K, V, Acc) end, T0, KL), diff --git a/src/leveled_skiplist.erl b/src/leveled_skiplist.erl index 5cf8961..63a3842 100644 --- a/src/leveled_skiplist.erl +++ b/src/leveled_skiplist.erl @@ -17,7 +17,9 @@ -export([ from_list/1, + from_list/2, from_sortedlist/1, + from_sortedlist/2, to_list/1, enter/3, to_range/2, @@ -25,6 +27,7 @@ lookup/2, lookup/3, empty/0, + empty/1, size/1 ]). @@ -41,28 +44,49 @@ enter(Key, Value, SkipList) -> Hash = erlang:phash2(Key), - SkipList0 = add_to_array(Hash, SkipList), - NewListPart = enter(Key, Value, Hash, - dict:fetch(?SKIP_WIDTH, SkipList0), - ?SKIP_WIDTH, ?LIST_HEIGHT), - dict:store(?SKIP_WIDTH, NewListPart, SkipList0). + case is_list(SkipList) of + true -> + enter(Key, Value, Hash, SkipList, ?SKIP_WIDTH, ?LIST_HEIGHT); + false -> + SkipList0 = add_to_array(Hash, SkipList), + NewListPart = enter(Key, Value, Hash, + dict:fetch(?SKIP_WIDTH, SkipList0), + ?SKIP_WIDTH, ?LIST_HEIGHT), + dict:store(?SKIP_WIDTH, NewListPart, SkipList0) + end. from_list(UnsortedKVL) -> + from_list(UnsortedKVL, false). + +from_list(UnsortedKVL, BloomProtect) -> KVL = lists:ukeysort(1, UnsortedKVL), - from_sortedlist(KVL). + from_sortedlist(KVL, BloomProtect). 
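
%% Example usage of the new arity-2 constructors (a sketch): the boolean
%% opts in to the bloom-protected, dict-encapsulated representation, while
%% the arity-1 forms keep the original plain-list skiplist.

build_both(KVL) ->
    Plain = leveled_skiplist:from_list(KVL),
    Bloomed = leveled_skiplist:from_list(KVL, true),
    {Plain, Bloomed}.
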
from_sortedlist(SortedKVL) -> - SL0 = lists:foldr(fun({K, _V}, SkipL) -> - H = erlang:phash2(K), - add_to_array(H, SkipL) end, - empty(), - SortedKVL), - dict:store(?SKIP_WIDTH, - from_list(SortedKVL, ?SKIP_WIDTH, ?LIST_HEIGHT), - SL0). + from_sortedlist(SortedKVL, false). + +from_sortedlist(SortedKVL, BloomProtect) -> + case BloomProtect of + true -> + SL0 = lists:foldr(fun({K, _V}, SkipL) -> + H = erlang:phash2(K), + add_to_array(H, SkipL) end, + empty(true), + SortedKVL), + dict:store(?SKIP_WIDTH, + from_list(SortedKVL, ?SKIP_WIDTH, ?LIST_HEIGHT), + SL0); + false -> + from_list(SortedKVL, ?SKIP_WIDTH, ?LIST_HEIGHT) + end. lookup(Key, SkipList) -> - lookup(Key, erlang:phash2(Key), SkipList). + case is_list(SkipList) of + true -> + list_lookup(Key, SkipList, ?LIST_HEIGHT); + false -> + lookup(Key, erlang:phash2(Key), SkipList) + end. lookup(Key, Hash, SkipList) -> {Slot, Bit} = hash_toslotbit(Hash), @@ -81,27 +105,57 @@ lookup(Key, Hash, SkipList) -> %% Rather than support iterator_from like gb_trees, will just an output a key %% sorted list for the desired range, which can the be iterated over as normal to_range(SkipList, Start) -> - to_range(dict:fetch(?SKIP_WIDTH, SkipList), Start, ?INFINITY_KEY, ?LIST_HEIGHT). + case is_list(SkipList) of + true -> + to_range(SkipList, Start, ?INFINITY_KEY, ?LIST_HEIGHT); + false -> + to_range(dict:fetch(?SKIP_WIDTH, SkipList), + Start, ?INFINITY_KEY, + ?LIST_HEIGHT) + end. to_range(SkipList, Start, End) -> - to_range(dict:fetch(?SKIP_WIDTH, SkipList), Start, End, ?LIST_HEIGHT). + case is_list(SkipList) of + true -> + to_range(SkipList, Start, End, ?LIST_HEIGHT); + false -> + to_range(dict:fetch(?SKIP_WIDTH, SkipList), + Start, End, + ?LIST_HEIGHT) + end. to_list(SkipList) -> - to_list(dict:fetch(?SKIP_WIDTH, SkipList), ?LIST_HEIGHT). + case is_list(SkipList) of + true -> + to_list(SkipList, ?LIST_HEIGHT); + false -> + to_list(dict:fetch(?SKIP_WIDTH, SkipList), ?LIST_HEIGHT) + end. empty() -> - FoldFun = - fun(X, Acc) -> dict:store(X, <<0:?BITARRAY_SIZE>>, Acc) end, - lists:foldl(FoldFun, - dict:store(?SKIP_WIDTH, - empty([], ?LIST_HEIGHT), - dict:new()), - lists:seq(0, ?SKIP_WIDTH - 1)). - + empty(false). +empty(BloomProtect) -> + case BloomProtect of + true -> + FoldFun = + fun(X, Acc) -> dict:store(X, <<0:?BITARRAY_SIZE>>, Acc) end, + lists:foldl(FoldFun, + dict:store(?SKIP_WIDTH, + empty([], ?LIST_HEIGHT), + dict:new()), + lists:seq(0, ?SKIP_WIDTH - 1)); + false -> + empty([], ?LIST_HEIGHT) + end. size(SkipList) -> - size(dict:fetch(?SKIP_WIDTH, SkipList), ?LIST_HEIGHT). + case is_list(SkipList) of + true -> + size(SkipList, ?LIST_HEIGHT); + false -> + size(dict:fetch(?SKIP_WIDTH, SkipList), ?LIST_HEIGHT) + end. @@ -432,7 +486,54 @@ dotest_skiplist_small(N) -> end, lists:ukeysort(1, lists:reverse(KL))). 
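
%% The two shapes are told apart at runtime with is_list/1, as in the
%% dispatch clauses above; a sketch of the idea (hypothetical helper):

representation(SkipList) ->
    case is_list(SkipList) of
        true -> plain_skiplist;    % list of {MarkerKey, SubList} levels
        false -> bloom_protected   % dict of bit arrays plus the list part
    end.
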
-skiplist_test() -> +skiplist_withbloom_test() -> + io:format(user, "~n~nBloom protected skiplist test:~n~n", []), + N = 4000, + KL = generate_randomkeys(1, N, 1, N div 5), + + SWaGSL = os:timestamp(), + SkipList = from_list(lists:reverse(KL), true), + io:format(user, "Generating skip list with ~w keys in ~w microseconds~n" ++ + "Top level key count of ~w~n", + [N, + timer:now_diff(os:timestamp(), SWaGSL), + length(dict:fetch(?SKIP_WIDTH, SkipList))]), + io:format(user, "Second tier key counts of ~w~n", + [lists:map(fun({_L, SL}) -> length(SL) end, + dict:fetch(?SKIP_WIDTH, SkipList))]), + KLSorted = lists:ukeysort(1, lists:reverse(KL)), + + SWaGSL2 = os:timestamp(), + SkipList = from_sortedlist(KLSorted, true), + io:format(user, "Generating skip list with ~w sorted keys in ~w " ++ + "microseconds~n", + [N, timer:now_diff(os:timestamp(), SWaGSL2)]), + + SWaDSL = os:timestamp(), + SkipList1 = + lists:foldl(fun({K, V}, SL) -> + enter(K, V, SL) + end, + empty(true), + KL), + io:format(user, "Dynamic load of skiplist with ~w keys took ~w " ++ + "microseconds~n" ++ + "Top level key count of ~w~n", + [N, + timer:now_diff(os:timestamp(), SWaDSL), + length(dict:fetch(?SKIP_WIDTH, SkipList1))]), + io:format(user, "Second tier key counts of ~w~n", + [lists:map(fun({_L, SL}) -> length(SL) end, + dict:fetch(?SKIP_WIDTH, SkipList1))]), + + io:format(user, "~nRunning timing tests for generated skiplist:~n", []), + skiplist_timingtest(KLSorted, SkipList, N), + + io:format(user, "~nRunning timing tests for dynamic skiplist:~n", []), + skiplist_timingtest(KLSorted, SkipList1, N). + +skiplist_nobloom_test() -> + io:format(user, "~n~nBloom free skiplist test:~n~n", []), N = 4000, KL = generate_randomkeys(1, N, 1, N div 5), @@ -442,10 +543,9 @@ skiplist_test() -> "Top level key count of ~w~n", [N, timer:now_diff(os:timestamp(), SWaGSL), - length(dict:fetch(?SKIP_WIDTH, SkipList))]), + length(SkipList)]), io:format(user, "Second tier key counts of ~w~n", - [lists:map(fun({_L, SL}) -> length(SL) end, - dict:fetch(?SKIP_WIDTH, SkipList))]), + [lists:map(fun({_L, SL}) -> length(SL) end, SkipList)]), KLSorted = lists:ukeysort(1, lists:reverse(KL)), SWaGSL2 = os:timestamp(), @@ -466,17 +566,16 @@ skiplist_test() -> "Top level key count of ~w~n", [N, timer:now_diff(os:timestamp(), SWaDSL), - length(dict:fetch(?SKIP_WIDTH, SkipList1))]), + length(SkipList1)]), io:format(user, "Second tier key counts of ~w~n", - [lists:map(fun({_L, SL}) -> length(SL) end, - dict:fetch(?SKIP_WIDTH, SkipList1))]), + [lists:map(fun({_L, SL}) -> length(SL) end, SkipList1)]), io:format(user, "~nRunning timing tests for generated skiplist:~n", []), skiplist_timingtest(KLSorted, SkipList, N), io:format(user, "~nRunning timing tests for dynamic skiplist:~n", []), skiplist_timingtest(KLSorted, SkipList1, N). 
- + skiplist_timingtest(KL, SkipList, N) -> io:format(user, "Timing tests on skiplist of size ~w~n", From c4e4cf67fea34d5731439a2787d58708eea0adb1 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Sat, 10 Dec 2016 11:39:00 +0000 Subject: [PATCH 08/34] Add bloom to loaded skiplist --- src/leveled_inker.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/leveled_inker.erl b/src/leveled_inker.erl index cb00883..2bfcd9c 100644 --- a/src/leveled_inker.erl +++ b/src/leveled_inker.erl @@ -633,7 +633,7 @@ load_from_sequence(MinSQN, FilterFun, Penciller, [{_LowSQN, FN, Pid}|Rest]) -> load_between_sequence(MinSQN, MaxSQN, FilterFun, Penciller, CDBpid, StartPos, FN, Rest) -> leveled_log:log("I0014", [FN, MinSQN]), - InitAcc = {MinSQN, MaxSQN, leveled_skiplist:empty()}, + InitAcc = {MinSQN, MaxSQN, leveled_skiplist:empty(true)}, Res = case leveled_cdb:cdb_scan(CDBpid, FilterFun, InitAcc, StartPos) of {eof, {AccMinSQN, _AccMaxSQN, AccKL}} -> ok = push_to_penciller(Penciller, AccKL), From 06c58bf84becfbdf1c6781f4224ae9bbabe3dcae Mon Sep 17 00:00:00 2001 From: martinsumner Date: Sat, 10 Dec 2016 13:03:38 +0000 Subject: [PATCH 09/34] Split out hashtree implementation Split out hashtree implementation functions in leveled_cdb to make it easier to swap this out. Currently using an array of skiplists - may be better with an ets ordered_set --- src/leveled_cdb.erl | 74 ++++++++++++++++++++++++++++++--------------- 1 file changed, 49 insertions(+), 25 deletions(-) diff --git a/src/leveled_cdb.erl b/src/leveled_cdb.erl index 8e4451c..f8216d6 100644 --- a/src/leveled_cdb.erl +++ b/src/leveled_cdb.erl @@ -860,30 +860,14 @@ close_file(Handle, HashTree, BasePos) -> get_hashtree(Key, HashTree) -> Hash = hash(Key), Index = hash_to_index(Hash), - Tree = array:get(Index, HashTree), - case leveled_skiplist:lookup(Hash, Tree) of - {value, List} -> - List; - _ -> - [] - end. + lookup_positions(HashTree, Index, Hash). %% Add to hash tree - this is an array of 256 skiplists that contains the Hash %% and position of objects which have been added to an open CDB file put_hashtree(Key, Position, HashTree) -> Hash = hash(Key), Index = hash_to_index(Hash), - Tree = array:get(Index, HashTree), - case leveled_skiplist:lookup(Hash, Tree) of - none -> - array:set(Index, - leveled_skiplist:enter(Hash, [Position], Tree), - HashTree); - {value, L} -> - array:set(Index, - leveled_skiplist:enter(Hash, [Position|L], Tree), - HashTree) - end. + add_position_tohashtree(HashTree, Index, Hash, Position). %% Function to extract a Key-Value pair given a file handle and a position %% Will confirm that the key matches and do a CRC check @@ -924,7 +908,7 @@ extract_key_value_check(Handle, Position) -> %% Scan through the file until there is a failure to crc check an input, and %% at that point return the position and the key dictionary scanned so far startup_scan_over_file(Handle, Position) -> - HashTree = array:new(256, {default, leveled_skiplist:empty()}), + HashTree = new_hashtree(), scan_over_file(Handle, Position, fun startup_filter/5, @@ -1152,7 +1136,7 @@ search_hash_table(Handle, [Entry|RestOfEntries], Hash, Key, QuickCheck) -> % key/value binary in the file. write_key_value_pairs(Handle, KeyValueList) -> {ok, Position} = file:position(Handle, cur), - HashTree = array:new(256, {default, leveled_skiplist:empty()}), + HashTree = new_hashtree(), write_key_value_pairs(Handle, KeyValueList, {Position, HashTree}). 
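
%% The commit message muses that an ets ordered_set may suit better than an
%% array of skiplists; a hypothetical drop-in for the helpers defined below
%% (not part of the patch) might look like this:

new_hashtree_ets() ->
    ets:new(hashtree, [ordered_set, private]).

lookup_positions_ets(HashTree, Index, Hash) ->
    case ets:lookup(HashTree, {Index, Hash}) of
        [{{Index, Hash}, PosList}] -> PosList;
        [] -> []
    end.

add_position_tohashtree_ets(HashTree, Index, Hash, Position) ->
    PosList = lookup_positions_ets(HashTree, Index, Hash),
    true = ets:insert(HashTree, {{Index, Hash}, [Position|PosList]}),
    HashTree.
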
write_key_value_pairs(_, [], Acc) -> @@ -1184,12 +1168,11 @@ perform_write_hash_tables(Handle, HashTreeBin, StartPos) -> write_hash_tables([], _HashTree, _CurrPos, IndexList, HashTreeBin) -> {IndexList, HashTreeBin}; write_hash_tables([Index|Rest], HashTree, CurrPos, IndexList, HashTreeBin) -> - Tree = array:get(Index, HashTree), - case leveled_skiplist:size(Tree) of - 0 -> + case is_empty(HashTree, Index) of + true -> write_hash_tables(Rest, HashTree, CurrPos, IndexList, HashTreeBin); - _ -> - HashList = leveled_skiplist:to_list(Tree), + false -> + HashList = to_list(HashTree, Index), BinList = build_binaryhashlist(HashList, []), IndexLength = length(BinList) * 2, SlotList = lists:duplicate(IndexLength, <<0:32, 0:32>>), @@ -1345,6 +1328,47 @@ multi_key_value_to_record(KVList, BinaryMode, LastPosition) -> {[], <<>>, empty}, KVList). +%%%============================================================================ +%%% HashTree Implementation +%%%============================================================================ + +lookup_positions(HashTree, Index, Hash) -> + Tree = array:get(Index, HashTree), + case leveled_skiplist:lookup(Hash, Tree) of + {value, List} -> + List; + _ -> + [] + end. + +add_position_tohashtree(HashTree, Index, Hash, Position) -> + Tree = array:get(Index, HashTree), + case leveled_skiplist:lookup(Hash, Tree) of + none -> + array:set(Index, + leveled_skiplist:enter(Hash, [Position], Tree), + HashTree); + {value, L} -> + array:set(Index, + leveled_skiplist:enter(Hash, [Position|L], Tree), + HashTree) + end. + +new_hashtree() -> + array:new(256, {default, leveled_skiplist:empty()}). + +is_empty(HashTree, Index) -> + Tree = array:get(Index, HashTree), + case leveled_skiplist:size(Tree) of + 0 -> + true; + _ -> + false + end. + +to_list(HashTree, Index) -> + Tree = array:get(Index, HashTree), + leveled_skiplist:to_list(Tree). %%%%%%%%%%%%%%%% % T E S T From 95d5e12ce73e1fd2ed46a16a845e3bd5eb0830d0 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Sat, 10 Dec 2016 14:15:35 +0000 Subject: [PATCH 10/34] Switch to using ets set as index of L0 cache Hope is that this will cause less garbage collection, and also will be slightly faster. Note that snapshots don't now get an index - they get the special index 'snap'. However, the SkipLists have bloom protection, and most snapshots are iterators not fetchers. --- src/leveled_penciller.erl | 2 +- src/leveled_pmem.erl | 120 +++++++++++++++++++++----------------- 2 files changed, 69 insertions(+), 53 deletions(-) diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl index dc83474..a111054 100644 --- a/src/leveled_penciller.erl +++ b/src/leveled_penciller.erl @@ -395,7 +395,7 @@ handle_call({register_snapshot, Snapshot}, _From, State) -> Rs = [{Snapshot, State#state.manifest_sqn}|State#state.registered_snapshots], {reply, {ok, State}, State#state{registered_snapshots = Rs}}; handle_call({load_snapshot, BookieIncrTree}, _From, State) -> - L0D = leveled_pmem:add_to_index(State#state.levelzero_index, + L0D = leveled_pmem:add_to_index(snap, State#state.levelzero_size, BookieIncrTree, State#state.ledger_sqn, diff --git a/src/leveled_pmem.erl b/src/leveled_pmem.erl index 39dd0c6..61ecd4e 100644 --- a/src/leveled_pmem.erl +++ b/src/leveled_pmem.erl @@ -51,42 +51,55 @@ -include_lib("eunit/include/eunit.hrl"). --define(SLOT_WIDTH, {4096, 12}). 
- %%%============================================================================ %%% API %%%============================================================================ +add_to_index(snap, L0Size, LevelMinus1, LedgerSQN, TreeList) -> + FoldFun = fun({K, V}, {AccMinSQN, AccMaxSQN, AccCount}) -> + SQN = leveled_codec:strip_to_seqonly({K, V}), + {min(SQN, AccMinSQN), + max(SQN, AccMaxSQN), + AccCount + 1} + end, + LM1List = leveled_skiplist:to_list(LevelMinus1), + StartingT = {infinity, 0, L0Size}, + {MinSQN, MaxSQN, NewL0Size} = lists:foldl(FoldFun, StartingT, LM1List), + if + MinSQN > LedgerSQN -> + {MaxSQN, + NewL0Size, + snap, + lists:append(TreeList, [LevelMinus1])} + end; add_to_index(L0Index, L0Size, LevelMinus1, LedgerSQN, TreeList) -> SW = os:timestamp(), SlotInTreeList = length(TreeList) + 1, - FoldFun = fun({K, V}, {AccMinSQN, AccMaxSQN, AccCount, HashIndex}) -> + FoldFun = fun({K, V}, {AccMinSQN, AccMaxSQN, AccCount}) -> SQN = leveled_codec:strip_to_seqonly({K, V}), - {Hash, Slot} = hash_to_slot(K), - L = array:get(Slot, HashIndex), - Count0 = case lists:keymember(Hash, 1, L) of - true -> - AccCount; - false -> - AccCount + 1 + Hash = erlang:phash2(K), + Count0 = case ets:lookup(L0Index, Hash) of + [] -> + ets:insert(L0Index, {Hash, [SlotInTreeList]}), + AccCount + 1; + [{Hash, L}] -> + ets:insert(L0Index, {Hash, [SlotInTreeList|L]}), + AccCount end, {min(SQN, AccMinSQN), max(SQN, AccMaxSQN), - Count0, - array:set(Slot, [{Hash, SlotInTreeList}|L], HashIndex)} + Count0} end, LM1List = leveled_skiplist:to_list(LevelMinus1), - StartingT = {infinity, 0, L0Size, L0Index}, - {MinSQN, MaxSQN, NewL0Size, UpdL0Index} = lists:foldl(FoldFun, - StartingT, - LM1List), + StartingT = {infinity, 0, L0Size}, + {MinSQN, MaxSQN, NewL0Size} = lists:foldl(FoldFun, StartingT, LM1List), leveled_log:log_timer("PM001", [NewL0Size], SW), if MinSQN > LedgerSQN -> {MaxSQN, NewL0Size, - UpdL0Index, + L0Index, lists:append(TreeList, [LevelMinus1])} end. @@ -106,38 +119,20 @@ to_list(Slots, FetchFun) -> new_index() -> - array:new(element(1, ?SLOT_WIDTH), [{default, []}, fixed]). - + ets:new(index, [set, private]). +check_levelzero(_Key, _L0Index, []) -> + {false, not_found}; +check_levelzero(Key, snap, TreeList) -> + check_slotlist(Key, lists:seq(1, length(TreeList)), TreeList); check_levelzero(Key, L0Index, TreeList) -> - {Hash, Slot} = hash_to_slot(Key), - CheckList = array:get(Slot, L0Index), - SlotList = lists:foldl(fun({H0, S0}, SL) -> - case H0 of - Hash -> - [S0|SL]; - _ -> - SL - end - end, - [], - CheckList), - lists:foldl(fun(SlotToCheck, {Found, KV}) -> - case Found of - true -> - {Found, KV}; - false -> - CheckTree = lists:nth(SlotToCheck, TreeList), - case leveled_skiplist:lookup(Key, CheckTree) of - none -> - {Found, KV}; - {value, Value} -> - {true, {Key, Value}} - end - end - end, - {false, not_found}, - lists:reverse(lists:usort(SlotList))). + Hash = erlang:phash2(Key), + case ets:lookup(L0Index, Hash) of + [] -> + {false, not_found}; + [{Hash, SlotList}] -> + check_slotlist(Key, SlotList, TreeList) + end. merge_trees(StartKey, EndKey, SkipListList, LevelMinus1) -> @@ -153,11 +148,25 @@ merge_trees(StartKey, EndKey, SkipListList, LevelMinus1) -> %%% Internal Functions %%%============================================================================ - -hash_to_slot(Key) -> - H = erlang:phash2(Key), - {H bsr element(2, ?SLOT_WIDTH), H band (element(1, ?SLOT_WIDTH) - 1)}. 
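
%% With the L0 index now an ets set, slot positions simply accumulate
%% against each hash; a minimal sketch of that pattern (hypothetical
%% helper, mirroring the FoldFun above):

add_slot_to_index(L0Index, Hash, SlotInTreeList) ->
    case ets:lookup(L0Index, Hash) of
        [] ->
            ets:insert(L0Index, {Hash, [SlotInTreeList]});
        [{Hash, SlotList}] ->
            ets:insert(L0Index, {Hash, [SlotInTreeList|SlotList]})
    end.
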
- +check_slotlist(Key, CheckList, TreeList) -> + SlotCheckFun = + fun(SlotToCheck, {Found, KV}) -> + case Found of + true -> + {Found, KV}; + false -> + CheckTree = lists:nth(SlotToCheck, TreeList), + case leveled_skiplist:lookup(Key, CheckTree) of + none -> + {Found, KV}; + {value, Value} -> + {true, {Key, Value}} + end + end + end, + lists:foldl(SlotCheckFun, + {false, not_found}, + lists:reverse(lists:usort(CheckList))). %%%============================================================================ %%% Test @@ -231,8 +240,15 @@ compare_method_test() -> end, [], TestList), + S2 = lists:foldl(fun({Key, _V}, Acc) -> + R0 = check_levelzero(Key, snap, TreeList), + [R0|Acc] + end, + [], + TestList), ?assertMatch(S0, S1), + ?assertMatch(S0, S2), StartKey = {o, "Bucket0100", null, null}, EndKey = {o, "Bucket0200", null, null}, From 2d3a40e6f11d78ea9321bd7b4227126f81f5c337 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Sun, 11 Dec 2016 01:02:56 +0000 Subject: [PATCH 11/34] Magic Hash - and no L0 Index Move to using the DJ Bernstein Magic Hash consistently, and trying to make sure we only hash once for each operation (as the hash is more expensive than phash2). The improved lookup time for missing keys should allow for the L0 index to be removed, and hence speed up the completion time for push_mem operations. It is expected there will be a second stage of creating a tinybloom as part of the SFT creation process, and then adding that tinybloom to the manifest. This will then reduce the message passing required for a GET not in the cache or higher levels --- src/leveled_bookie.erl | 114 ++++++++++++------ src/leveled_cdb.erl | 16 +-- src/leveled_codec.erl | 72 ++++++++--- src/leveled_inker.erl | 19 ++- src/leveled_log.erl | 2 +- src/leveled_pclerk.erl | 10 +- src/leveled_penciller.erl | 169 +++++++++++++++----------- src/leveled_pmem.erl | 153 +++++++++-------------- src/leveled_sft.erl | 168 ++++++++++++++------------ src/leveled_skiplist.erl | 248 ++++++++++++++++---------------------- src/leveled_tinybloom.erl | 151 +++++++++++++++++++++++ 11 files changed, 646 insertions(+), 476 deletions(-) create mode 100644 src/leveled_tinybloom.erl diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl index a50e9fa..62892ec 100644 --- a/src/leveled_bookie.erl +++ b/src/leveled_bookie.erl @@ -136,7 +136,10 @@ book_destroy/1]). -export([get_opt/2, - get_opt/3]). + get_opt/3, + load_snapshot/2, + empty_ledgercache/0, + push_ledgercache/2]). -include_lib("eunit/include/eunit.hrl"). @@ -148,15 +151,18 @@ -define(CACHE_SIZE_JITTER, 25). -define(JOURNAL_SIZE_JITTER, 20). +-record(ledger_cache, {skiplist = leveled_skiplist:empty(true) :: tuple(), + min_sqn = infinity :: integer()|infinity, + max_sqn = 0 :: integer()}). + -record(state, {inker :: pid(), penciller :: pid(), cache_size :: integer(), - ledger_cache, % a skiplist + ledger_cache = #ledger_cache{}, is_snapshot :: boolean(), slow_offer = false :: boolean()}). 
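
%% Per the commit message, object keys get the full DJ Bernstein hash while
%% index keys short-circuit to no_lookup; a sketch of probing that split
%% (assumes the magic_hash/1 export added later in this patch):

hash_class(LedgerKey) ->
    case leveled_codec:magic_hash(LedgerKey) of
        no_lookup -> not_hashed;    % e.g. ?IDX_TAG keys, never point-fetched
        Hash when is_integer(Hash) -> {hashed, Hash}
    end.
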
- %%%============================================================================ %%% API %%%============================================================================ @@ -238,14 +244,14 @@ init([Opts]) -> {ok, #state{inker=Inker, penciller=Penciller, cache_size=CacheSize, - ledger_cache=leveled_skiplist:empty(true), + ledger_cache=#ledger_cache{}, is_snapshot=false}}; Bookie -> {ok, {Penciller, LedgerCache}, Inker} = book_snapshotstore(Bookie, self(), ?SNAPSHOT_TIMEOUT), - ok = leveled_penciller:pcl_loadsnapshot(Penciller, - leveled_skiplist:empty(true)), + CacheToLoad = {leveled_skiplist:empty(true), 0, 0}, + ok = leveled_penciller:pcl_loadsnapshot(Penciller, CacheToLoad), leveled_log:log("B0002", [Inker, Penciller]), {ok, #state{penciller=Penciller, inker=Inker, @@ -276,9 +282,9 @@ handle_call({put, Bucket, Key, Object, IndexSpecs, Tag, TTL}, From, State) -> false -> gen_server:reply(From, ok) end, - case maybepush_ledgercache(State#state.cache_size, - Cache0, - State#state.penciller) of + case maybepush_ledgercache(State#state.cache_size, + Cache0, + State#state.penciller) of {ok, NewCache} -> {noreply, State#state{ledger_cache=NewCache, slow_offer=false}}; {returned, NewCache} -> @@ -292,7 +298,7 @@ handle_call({get, Bucket, Key, Tag}, _From, State) -> not_present -> {reply, not_found, State}; Head -> - {Seqn, Status, _MD} = leveled_codec:striphead_to_details(Head), + {Seqn, Status, _MH, _MD} = leveled_codec:striphead_to_details(Head), case Status of tomb -> {reply, not_found, State}; @@ -317,11 +323,10 @@ handle_call({head, Bucket, Key, Tag}, _From, State) -> not_present -> {reply, not_found, State}; Head -> - {_Seqn, Status, MD} = leveled_codec:striphead_to_details(Head), - case Status of - tomb -> + case leveled_codec:striphead_to_details(Head) of + {_SeqN, tomb, _MH, _MD} -> {reply, not_found, State}; - {active, TS} -> + {_SeqN, {active, TS}, _MH, MD} -> case TS >= leveled_codec:integer_now() of true -> OMD = leveled_codec:build_metadata_object(LedgerKey, MD), @@ -426,19 +431,39 @@ terminate(Reason, State) -> code_change(_OldVsn, State, _Extra) -> {ok, State}. +%%%============================================================================ +%%% External functions +%%%============================================================================ + +load_snapshot(LedgerSnapshot, LedgerCache) -> + CacheToLoad = {LedgerCache#ledger_cache.skiplist, + LedgerCache#ledger_cache.min_sqn, + LedgerCache#ledger_cache.max_sqn}, + ok = leveled_penciller:pcl_loadsnapshot(LedgerSnapshot, CacheToLoad). + +empty_ledgercache() -> + #ledger_cache{}. + +push_ledgercache(Penciller, Cache) -> + CacheToLoad = {Cache#ledger_cache.skiplist, + Cache#ledger_cache.min_sqn, + Cache#ledger_cache.max_sqn}, + leveled_penciller:pcl_pushmem(Penciller, CacheToLoad). %%%============================================================================ %%% Internal functions %%%============================================================================ +cache_size(LedgerCache) -> + leveled_skiplist:size(LedgerCache#ledger_cache.skiplist). 
+ bucket_stats(State, Bucket, Tag) -> {ok, {LedgerSnapshot, LedgerCache}, _JournalSnapshot} = snapshot_store(State, ledger), Folder = fun() -> - leveled_log:log("B0004", [leveled_skiplist:size(LedgerCache)]), - ok = leveled_penciller:pcl_loadsnapshot(LedgerSnapshot, - LedgerCache), + leveled_log:log("B0004", [cache_size(LedgerCache)]), + load_snapshot(LedgerSnapshot, LedgerCache), StartKey = leveled_codec:to_ledgerkey(Bucket, null, Tag), EndKey = leveled_codec:to_ledgerkey(Bucket, null, Tag), AccFun = accumulate_size(), @@ -459,9 +484,8 @@ binary_bucketlist(State, Tag, {FoldBucketsFun, InitAcc}) -> {LedgerSnapshot, LedgerCache}, _JournalSnapshot} = snapshot_store(State, ledger), Folder = fun() -> - leveled_log:log("B0004", [leveled_skiplist:size(LedgerCache)]), - ok = leveled_penciller:pcl_loadsnapshot(LedgerSnapshot, - LedgerCache), + leveled_log:log("B0004", [cache_size(LedgerCache)]), + load_snapshot(LedgerSnapshot, LedgerCache), BucketAcc = get_nextbucket(null, Tag, LedgerSnapshot, @@ -514,9 +538,8 @@ index_query(State, {B, null} end, Folder = fun() -> - leveled_log:log("B0004", [leveled_skiplist:size(LedgerCache)]), - ok = leveled_penciller:pcl_loadsnapshot(LedgerSnapshot, - LedgerCache), + leveled_log:log("B0004", [cache_size(LedgerCache)]), + load_snapshot(LedgerSnapshot, LedgerCache), StartKey = leveled_codec:to_ledgerkey(Bucket, StartObjKey, ?IDX_TAG, @@ -556,9 +579,8 @@ hashtree_query(State, Tag, JournalCheck) -> {LedgerSnapshot, LedgerCache}, JournalSnapshot} = snapshot_store(State, SnapType), Folder = fun() -> - leveled_log:log("B0004", [leveled_skiplist:size(LedgerCache)]), - ok = leveled_penciller:pcl_loadsnapshot(LedgerSnapshot, - LedgerCache), + leveled_log:log("B0004", [cache_size(LedgerCache)]), + load_snapshot(LedgerSnapshot, LedgerCache), StartKey = leveled_codec:to_ledgerkey(null, null, Tag), EndKey = leveled_codec:to_ledgerkey(null, null, Tag), AccFun = accumulate_hashes(JournalCheck, JournalSnapshot), @@ -607,9 +629,8 @@ foldobjects(State, Tag, StartKey, EndKey, FoldObjectsFun) -> {FoldObjectsFun, []} end, Folder = fun() -> - leveled_log:log("B0004", [leveled_skiplist:size(LedgerCache)]), - ok = leveled_penciller:pcl_loadsnapshot(LedgerSnapshot, - LedgerCache), + leveled_log:log("B0004", [cache_size(LedgerCache)]), + load_snapshot(LedgerSnapshot, LedgerCache), AccFun = accumulate_objects(FoldFun, JournalSnapshot, Tag), Acc = leveled_penciller:pcl_fetchkeys(LedgerSnapshot, StartKey, @@ -628,9 +649,8 @@ bucketkey_query(State, Tag, Bucket, {FoldKeysFun, InitAcc}) -> {LedgerSnapshot, LedgerCache}, _JournalSnapshot} = snapshot_store(State, ledger), Folder = fun() -> - leveled_log:log("B0004", [leveled_skiplist:size(LedgerCache)]), - ok = leveled_penciller:pcl_loadsnapshot(LedgerSnapshot, - LedgerCache), + leveled_log:log("B0004", [cache_size(LedgerCache)]), + load_snapshot(LedgerSnapshot, LedgerCache), SK = leveled_codec:to_ledgerkey(Bucket, null, Tag), EK = leveled_codec:to_ledgerkey(Bucket, null, Tag), AccFun = accumulate_keys(FoldKeysFun), @@ -708,7 +728,7 @@ startup(InkerOpts, PencillerOpts) -> fetch_head(Key, Penciller, LedgerCache) -> - case leveled_skiplist:lookup(Key, LedgerCache) of + case leveled_skiplist:lookup(Key, LedgerCache#ledger_cache.skiplist) of {value, Head} -> Head; none -> @@ -874,18 +894,34 @@ preparefor_ledgercache(_Type, LedgerKey, SQN, Obj, Size, {IndexSpecs, TTL}) -> addto_ledgercache(Changes, Cache) -> - lists:foldl(fun({K, V}, Acc) -> leveled_skiplist:enter(K, V, Acc) end, - Cache, - Changes). 
+    FoldChangesFun =
+        fun({K, V}, Cache0) ->
+            {SQN, Hash} = leveled_codec:strip_to_seqnhashonly({K, V}),
+            SL0 = Cache0#ledger_cache.skiplist,
+            SL1 =
+                case Hash of
+                    no_lookup ->
+                        leveled_skiplist:enter_nolookup(K, V, SL0);
+                    _ ->
+                        leveled_skiplist:enter(K, Hash, V, SL0)
+                end,
+            Cache0#ledger_cache{skiplist=SL1,
+                                min_sqn=min(SQN, Cache0#ledger_cache.min_sqn),
+                                max_sqn=max(SQN, Cache0#ledger_cache.max_sqn)}
+        end,
+    lists:foldl(FoldChangesFun, Cache, Changes).
 
 maybepush_ledgercache(MaxCacheSize, Cache, Penciller) ->
-    CacheSize = leveled_skiplist:size(Cache),
+    CacheSize = leveled_skiplist:size(Cache#ledger_cache.skiplist),
     TimeToPush = maybe_withjitter(CacheSize, MaxCacheSize),
     if
         TimeToPush ->
-            case leveled_penciller:pcl_pushmem(Penciller, Cache) of
+            CacheToLoad = {Cache#ledger_cache.skiplist,
+                            Cache#ledger_cache.min_sqn,
+                            Cache#ledger_cache.max_sqn},
+            case leveled_penciller:pcl_pushmem(Penciller, CacheToLoad) of
                 ok ->
-                    {ok, leveled_skiplist:empty(true)};
+                    {ok, #ledger_cache{}};
                 returned ->
                     {returned, Cache}
             end;
diff --git a/src/leveled_cdb.erl b/src/leveled_cdb.erl
index f8216d6..63777b2 100644
--- a/src/leveled_cdb.erl
+++ b/src/leveled_cdb.erl
@@ -1272,27 +1272,13 @@ write_top_index_table(Handle, BasePos, List) ->
 
 %% To make this compatible with original Bernstein format this endian flip
 %% and also the use of the standard hash function required.
-%%
-%% Hash function contains mysterious constants, some explanation here as to
-%% what they are -
-%% http://stackoverflow.com/ ++
-%% questions/10696223/reason-for-5381-number-in-djb-hash-function
 
 endian_flip(Int) ->
     <<X:32/unsigned-little-integer>> = <<Int:32>>,
     X.
 
 hash(Key) ->
-    BK = term_to_binary(Key),
-    H = 5381,
-    hash1(H, BK) band 16#FFFFFFFF.
-
-hash1(H, <<>>) ->
-    H;
-hash1(H, <<B:8/integer, Rest/bytes>>) ->
-    H1 = H * 33,
-    H2 = H1 bxor B,
-    hash1(H2, Rest).
+    leveled_codec:magic_hash(Key).
 
 % Get the least significant 8 bits from the hash.
 hash_to_index(Hash) ->
diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl
index 19e9c9f..72b90b0 100644
--- a/src/leveled_codec.erl
+++ b/src/leveled_codec.erl
@@ -39,6 +39,7 @@
 strip_to_statusonly/1,
 strip_to_keyseqstatusonly/1,
 strip_to_keyseqonly/1,
+ strip_to_seqnhashonly/1,
 striphead_to_details/1,
 is_active/3,
 endkey_passed/2,
@@ -62,11 +63,38 @@
 convert_indexspecs/5,
 generate_uuid/0,
 integer_now/0,
- riak_extract_metadata/2]).
+ riak_extract_metadata/2,
+ magic_hash/1]).
 
 -define(V1_VERS, 1).
 -define(MAGIC, 53). % riak_kv -> riak_object
 
+%% Use DJ Bernstein magic hash function. Note, this is more expensive than
+%% phash2 but provides a much more balanced result.
+%%
+%% Hash function contains mysterious constants, some explanation here as to
+%% what they are -
+%% http://stackoverflow.com/ ++
+%% questions/10696223/reason-for-5381-number-in-djb-hash-function
+
+magic_hash({?RIAK_TAG, Bucket, Key, _SubKey}) ->
+    magic_hash({Bucket, Key});
+magic_hash({?STD_TAG, Bucket, Key, _SubKey}) ->
+    magic_hash({Bucket, Key});
+magic_hash({?IDX_TAG, _B, _Idx, _Key}) ->
+    no_lookup;
+magic_hash(AnyKey) ->
+    BK = term_to_binary(AnyKey),
+    H = 5381,
+    hash1(H, BK) band 16#FFFFFFFF.
+
+hash1(H, <<>>) ->
+    H;
+hash1(H, <<B:8/integer, Rest/bytes>>) ->
+    H1 = H * 33,
+    H2 = H1 bxor B,
+    hash1(H2, Rest).
+
 %% Credit to
 %% https://github.com/afiskon/erlang-uuid-v4/blob/master/src/uuid.erl
 
@@ -87,15 +115,18 @@ inker_reload_strategy(AltList) ->
 
 strip_to_keyonly({keyonly, K}) -> K;
 strip_to_keyonly({K, _V}) -> K.
 
-strip_to_keyseqstatusonly({K, {SeqN, St, _MD}}) -> {K, SeqN, St}.
+strip_to_keyseqstatusonly({K, {SeqN, St, _, _MD}}) -> {K, SeqN, St}.
-strip_to_statusonly({_, {_, St, _}}) -> St. +strip_to_statusonly({_, {_, St, _, _}}) -> St. -strip_to_seqonly({_, {SeqN, _, _}}) -> SeqN. +strip_to_seqonly({_, {SeqN, _, _, _}}) -> SeqN. -strip_to_keyseqonly({LK, {SeqN, _, _}}) -> {LK, SeqN}. +strip_to_keyseqonly({LK, {SeqN, _, _, _}}) -> {LK, SeqN}. + +strip_to_seqnhashonly({_, {SeqN, _, MH, _}}) -> {SeqN, MH}. + +striphead_to_details({SeqN, St, MH, MD}) -> {SeqN, St, MH, MD}. -striphead_to_details({SeqN, St, MD}) -> {SeqN, St, MD}. key_dominates(LeftKey, RightKey) -> case {LeftKey, RightKey} of @@ -103,10 +134,10 @@ key_dominates(LeftKey, RightKey) -> left_hand_first; {{LK, _LVAL}, {RK, _RVAL}} when RK < LK -> right_hand_first; - {{LK, {LSN, _LST, _LMD}}, {RK, {RSN, _RST, _RMD}}} + {{LK, {LSN, _LST, _LMH, _LMD}}, {RK, {RSN, _RST, _RMH, _RMD}}} when LK == RK, LSN >= RSN -> left_hand_dominant; - {{LK, {LSN, _LST, _LMD}}, {RK, {RSN, _RST, _RMD}}} + {{LK, {LSN, _LST, _LMH, _LMD}}, {RK, {RSN, _RST, _RMH, _RMD}}} when LK == RK, LSN < RSN -> right_hand_dominant end. @@ -218,8 +249,6 @@ create_value_for_journal(Value) -> Value end. - - hash(Obj) -> erlang:phash2(term_to_binary(Obj)). @@ -273,7 +302,7 @@ convert_indexspecs(IndexSpecs, Bucket, Key, SQN, TTL) -> end, {to_ledgerkey(Bucket, Key, ?IDX_TAG, IdxField, IdxValue), - {SQN, Status, null}} + {SQN, Status, no_lookup, null}} end, IndexSpecs). @@ -285,9 +314,11 @@ generate_ledgerkv(PrimaryKey, SQN, Obj, Size, TS) -> _ -> {active, TS} end, - {Bucket, - Key, - {PrimaryKey, {SQN, Status, extract_metadata(Obj, Size, Tag)}}}. + Value = {SQN, + Status, + magic_hash(PrimaryKey), + extract_metadata(Obj, Size, Tag)}, + {Bucket, Key, {PrimaryKey, Value}}. integer_now() -> @@ -304,7 +335,7 @@ extract_metadata(Obj, Size, ?STD_TAG) -> get_size(PK, Value) -> {Tag, _Bucket, _Key, _} = PK, - {_, _, MD} = Value, + {_, _, _, MD} = Value, case Tag of ?RIAK_TAG -> {_RMD, _VC, _Hash, Size} = MD, @@ -316,7 +347,7 @@ get_size(PK, Value) -> get_keyandhash(LK, Value) -> {Tag, Bucket, Key, _} = LK, - {_, _, MD} = Value, + {_, _, _, MD} = Value, case Tag of ?RIAK_TAG -> {_RMD, _VC, Hash, _Size} = MD, @@ -375,11 +406,14 @@ indexspecs_test() -> {remove, "t1_bin", "abdc456"}], Changes = convert_indexspecs(IndexSpecs, "Bucket", "Key2", 1, infinity), ?assertMatch({{i, "Bucket", {"t1_int", 456}, "Key2"}, - {1, {active, infinity}, null}}, lists:nth(1, Changes)), + {1, {active, infinity}, no_lookup, null}}, + lists:nth(1, Changes)), ?assertMatch({{i, "Bucket", {"t1_bin", "adbc123"}, "Key2"}, - {1, {active, infinity}, null}}, lists:nth(2, Changes)), + {1, {active, infinity}, no_lookup, null}}, + lists:nth(2, Changes)), ?assertMatch({{i, "Bucket", {"t1_bin", "abdc456"}, "Key2"}, - {1, tomb, null}}, lists:nth(3, Changes)). + {1, tomb, no_lookup, null}}, + lists:nth(3, Changes)). 
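
%% The ledger value is now a 4-tuple of {SQN, Status, Hash, Metadata}; a
%% sketch of using the new accessor to decide whether a KV is reachable by
%% point lookup (hypothetical helper):

needs_lookup(KV) ->
    {_SQN, Hash} = leveled_codec:strip_to_seqnhashonly(KV),
    Hash /= no_lookup.
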
endkey_passed_test() -> TestKey = {i, null, null, null}, diff --git a/src/leveled_inker.erl b/src/leveled_inker.erl index 2bfcd9c..9a37cae 100644 --- a/src/leveled_inker.erl +++ b/src/leveled_inker.erl @@ -633,13 +633,13 @@ load_from_sequence(MinSQN, FilterFun, Penciller, [{_LowSQN, FN, Pid}|Rest]) -> load_between_sequence(MinSQN, MaxSQN, FilterFun, Penciller, CDBpid, StartPos, FN, Rest) -> leveled_log:log("I0014", [FN, MinSQN]), - InitAcc = {MinSQN, MaxSQN, leveled_skiplist:empty(true)}, + InitAcc = {MinSQN, MaxSQN, leveled_bookie:empty_ledgercache()}, Res = case leveled_cdb:cdb_scan(CDBpid, FilterFun, InitAcc, StartPos) of - {eof, {AccMinSQN, _AccMaxSQN, AccKL}} -> - ok = push_to_penciller(Penciller, AccKL), + {eof, {AccMinSQN, _AccMaxSQN, AccLC}} -> + ok = push_to_penciller(Penciller, AccLC), {ok, AccMinSQN}; - {LastPosition, {_AccMinSQN, _AccMaxSQN, AccKL}} -> - ok = push_to_penciller(Penciller, AccKL), + {LastPosition, {_AccMinSQN, _AccMaxSQN, AccLC}} -> + ok = push_to_penciller(Penciller, AccLC), NextSQN = MaxSQN + 1, load_between_sequence(NextSQN, NextSQN + ?LOADING_BATCH, @@ -657,14 +657,13 @@ load_between_sequence(MinSQN, MaxSQN, FilterFun, Penciller, ok end. -push_to_penciller(Penciller, KeyTree) -> +push_to_penciller(Penciller, LedgerCache) -> % The push to penciller must start as a tree to correctly de-duplicate % the list by order before becoming a de-duplicated list for loading - R = leveled_penciller:pcl_pushmem(Penciller, KeyTree), - case R of + case leveled_bookie:push_ledgercache(Penciller, LedgerCache) of returned -> timer:sleep(?LOADING_PAUSE), - push_to_penciller(Penciller, KeyTree); + push_to_penciller(Penciller, LedgerCache); ok -> ok end. @@ -739,7 +738,7 @@ initiate_penciller_snapshot(Bookie) -> {ok, {LedgerSnap, LedgerCache}, _} = leveled_bookie:book_snapshotledger(Bookie, self(), undefined), - ok = leveled_penciller:pcl_loadsnapshot(LedgerSnap, LedgerCache), + leveled_bookie:load_snapshot(LedgerSnap, LedgerCache), MaxSQN = leveled_penciller:pcl_getstartupsequencenumber(LedgerSnap), {LedgerSnap, MaxSQN}. diff --git a/src/leveled_log.erl b/src/leveled_log.erl index 6c7e4cb..fa26555 100644 --- a/src/leveled_log.erl +++ b/src/leveled_log.erl @@ -165,7 +165,7 @@ {"I0013", {info, "File ~s to be removed from manifest"}}, {"I0014", - {info, "On startup oading from filename ~s from SQN ~w"}}, + {info, "On startup loading from filename ~s from SQN ~w"}}, {"I0015", {info, "Opening manifest file at ~s with SQN ~w"}}, {"I0016", diff --git a/src/leveled_pclerk.erl b/src/leveled_pclerk.erl index bbd2dae..272071d 100644 --- a/src/leveled_pclerk.erl +++ b/src/leveled_pclerk.erl @@ -363,11 +363,11 @@ generate_randomkeys(Count, Acc, BucketLow, BRange) -> BNumber = string:right(integer_to_list(BucketLow + random:uniform(BRange)), 4, $0), KNumber = string:right(integer_to_list(random:uniform(1000)), 4, $0), - RandKey = {{o, - "Bucket" ++ BNumber, - "Key" ++ KNumber}, - {Count + 1, - {active, infinity}, null}}, + K = {o, "Bucket" ++ BNumber, "Key" ++ KNumber}, + RandKey = {K, {Count + 1, + {active, infinity}, + leveled_codec:magic_hash(K), + null}}, generate_randomkeys(Count - 1, [RandKey|Acc], BucketLow, BRange). 
choose_pid_toquery([ManEntry|_T], Key) when diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl index a111054..a1ab9b1 100644 --- a/src/leveled_penciller.erl +++ b/src/leveled_penciller.erl @@ -168,9 +168,11 @@ pcl_pushmem/2, pcl_fetchlevelzero/2, pcl_fetch/2, + pcl_fetch/3, pcl_fetchkeys/5, pcl_fetchnextkey/5, pcl_checksequencenumber/3, + pcl_checksequencenumber/4, pcl_workforclerk/1, pcl_promptmanifestchange/2, pcl_confirml0complete/4, @@ -213,8 +215,6 @@ levelzero_pending = false :: boolean(), levelzero_constructor :: pid(), levelzero_cache = [] :: list(), % a list of skiplists - levelzero_index, - % is an array - but cannot specif due to OTP compatability levelzero_size = 0 :: integer(), levelzero_maxcachesize :: integer(), levelzero_cointoss = false :: boolean(), @@ -236,9 +236,9 @@ pcl_start(PCLopts) -> gen_server:start(?MODULE, [PCLopts], []). -pcl_pushmem(Pid, DumpList) -> +pcl_pushmem(Pid, LedgerCache) -> %% Bookie to dump memory onto penciller - gen_server:call(Pid, {push_mem, DumpList}, infinity). + gen_server:call(Pid, {push_mem, LedgerCache}, infinity). pcl_fetchlevelzero(Pid, Slot) -> %% Timeout to cause crash of L0 file when it can't get the close signal @@ -249,7 +249,14 @@ pcl_fetchlevelzero(Pid, Slot) -> gen_server:call(Pid, {fetch_levelzero, Slot}, 60000). pcl_fetch(Pid, Key) -> - gen_server:call(Pid, {fetch, Key}, infinity). + Hash = leveled_codec:magic_hash(Key), + if + Hash /= no_lookup -> + gen_server:call(Pid, {fetch, Key, Hash}, infinity) + end. + +pcl_fetch(Pid, Key, Hash) -> + gen_server:call(Pid, {fetch, Key, Hash}, infinity). pcl_fetchkeys(Pid, StartKey, EndKey, AccFun, InitAcc) -> gen_server:call(Pid, @@ -262,7 +269,14 @@ pcl_fetchnextkey(Pid, StartKey, EndKey, AccFun, InitAcc) -> infinity). pcl_checksequencenumber(Pid, Key, SQN) -> - gen_server:call(Pid, {check_sqn, Key, SQN}, infinity). + Hash = leveled_codec:magic_hash(Key), + if + Hash /= no_lookup -> + gen_server:call(Pid, {check_sqn, Key, Hash, SQN}, infinity) + end. + +pcl_checksequencenumber(Pid, Key, Hash, SQN) -> + gen_server:call(Pid, {check_sqn, Key, Hash, SQN}, infinity). pcl_workforclerk(Pid) -> gen_server:call(Pid, work_for_clerk, infinity). @@ -313,8 +327,9 @@ init([PCLopts]) -> end. 
-handle_call({push_mem, PushedTree}, From, State=#state{is_snapshot=Snap}) - when Snap == false -> +handle_call({push_mem, {PushedTree, MinSQN, MaxSQN}}, + From, + State=#state{is_snapshot=Snap}) when Snap == false -> % The push_mem process is as follows: % % 1 - Receive a gb_tree containing the latest Key/Value pairs (note that @@ -342,25 +357,24 @@ handle_call({push_mem, PushedTree}, From, State=#state{is_snapshot=Snap}) false -> leveled_log:log("P0018", [ok, false, false]), gen_server:reply(From, ok), - {noreply, update_levelzero(State#state.levelzero_index, - State#state.levelzero_size, - PushedTree, + {noreply, update_levelzero(State#state.levelzero_size, + {PushedTree, MinSQN, MaxSQN}, State#state.ledger_sqn, State#state.levelzero_cache, State)} end; -handle_call({fetch, Key}, _From, State) -> +handle_call({fetch, Key, Hash}, _From, State) -> {reply, fetch_mem(Key, + Hash, State#state.manifest, - State#state.levelzero_index, State#state.levelzero_cache), State}; -handle_call({check_sqn, Key, SQN}, _From, State) -> +handle_call({check_sqn, Key, Hash, SQN}, _From, State) -> {reply, compare_to_sqn(fetch_mem(Key, + Hash, State#state.manifest, - State#state.levelzero_index, State#state.levelzero_cache), SQN), State}; @@ -394,15 +408,13 @@ handle_call(get_startup_sqn, _From, State) -> handle_call({register_snapshot, Snapshot}, _From, State) -> Rs = [{Snapshot, State#state.manifest_sqn}|State#state.registered_snapshots], {reply, {ok, State}, State#state{registered_snapshots = Rs}}; -handle_call({load_snapshot, BookieIncrTree}, _From, State) -> - L0D = leveled_pmem:add_to_index(snap, - State#state.levelzero_size, - BookieIncrTree, +handle_call({load_snapshot, {BookieIncrTree, MinSQN, MaxSQN}}, _From, State) -> + L0D = leveled_pmem:add_to_cache(State#state.levelzero_size, + {BookieIncrTree, MinSQN, MaxSQN}, State#state.ledger_sqn, State#state.levelzero_cache), - {LedgerSQN, L0Size, L0Index, L0Cache} = L0D, + {LedgerSQN, L0Size, L0Cache} = L0D, {reply, ok, State#state{levelzero_cache=L0Cache, - levelzero_index=L0Index, levelzero_size=L0Size, ledger_sqn=LedgerSQN, snapshot_fully_loaded=true}}; @@ -453,7 +465,6 @@ handle_cast({levelzero_complete, FN, StartKey, EndKey}, State) -> {noreply, State#state{levelzero_cache=[], levelzero_pending=false, levelzero_constructor=undefined, - levelzero_index=leveled_pmem:new_index(), levelzero_size=0, manifest=UpdMan, persisted_sqn=State#state.ledger_sqn}}. 
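A side note on the single-clause if guarding pcl_fetch/2 and pcl_checksequencenumber/3 above: keys whose magic hash comes back as no_lookup (index entries take this form, as convert_indexspecs earlier in the series suggests) are never expected on this path, so the unmatched if is left to fail fast with an if_clause error rather than quietly query a key that can never be found. A hypothetical equivalent with an explicit error would be:

pcl_fetch_checked(Pid, Key) ->
    case leveled_codec:magic_hash(Key) of
        no_lookup ->
            erlang:error({no_lookup_key, Key});
        Hash ->
            gen_server:call(Pid, {fetch, Key, Hash}, infinity)
    end.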
@@ -546,7 +557,6 @@ start_from_file(PCLopts) -> InitState = #state{clerk=MergeClerk, root_path=RootPath, - levelzero_index = leveled_pmem:new_index(), levelzero_maxcachesize=MaxTableSize, levelzero_cointoss=CoinToss}, @@ -622,19 +632,18 @@ start_from_file(PCLopts) -> -update_levelzero(L0Index, L0Size, PushedTree, LedgerSQN, L0Cache, State) -> - Update = leveled_pmem:add_to_index(L0Index, - L0Size, - PushedTree, +update_levelzero(L0Size, {PushedTree, MinSQN, MaxSQN}, + LedgerSQN, L0Cache, State) -> + Update = leveled_pmem:add_to_cache(L0Size, + {PushedTree, MinSQN, MaxSQN}, LedgerSQN, L0Cache), - {MaxSQN, NewL0Size, UpdL0Index, UpdL0Cache} = Update, + {UpdMaxSQN, NewL0Size, UpdL0Cache} = Update, if - MaxSQN >= LedgerSQN -> + UpdMaxSQN >= LedgerSQN -> UpdState = State#state{levelzero_cache=UpdL0Cache, - levelzero_index=UpdL0Index, levelzero_size=NewL0Size, - ledger_sqn=MaxSQN}, + ledger_sqn=UpdMaxSQN}, CacheTooBig = NewL0Size > State#state.levelzero_maxcachesize, Level0Free = length(get_item(0, State#state.manifest, [])) == 0, RandomFactor = @@ -659,7 +668,6 @@ update_levelzero(L0Index, L0Size, PushedTree, LedgerSQN, L0Cache, State) -> end; NewL0Size == L0Size -> State#state{levelzero_cache=L0Cache, - levelzero_index=L0Index, levelzero_size=L0Size, ledger_sqn=LedgerSQN} end. @@ -707,8 +715,8 @@ levelzero_filename(State) -> FileName. -fetch_mem(Key, Manifest, L0Index, L0Cache) -> - L0Check = leveled_pmem:check_levelzero(Key, L0Index, L0Cache), +fetch_mem(Key, Hash, Manifest, L0Cache) -> + L0Check = leveled_pmem:check_levelzero(Key, Hash, L0Cache), case L0Check of {false, not_found} -> fetch(Key, Manifest, 0, fun leveled_sft:sft_get/2); @@ -1284,8 +1292,12 @@ confirm_delete_test() -> maybe_pause_push(PCL, KL) -> T0 = leveled_skiplist:empty(true), - T1 = lists:foldl(fun({K, V}, Acc) -> leveled_skiplist:enter(K, V, Acc) end, - T0, + T1 = lists:foldl(fun({K, V}, {AccSL, MinSQN, MaxSQN}) -> + SL = leveled_skiplist:enter(K, V, AccSL), + SQN = leveled_codec:strip_to_seqonly({K, V}), + {SL, min(SQN, MinSQN), max(SQN, MaxSQN)} + end, + {T0, infinity, 0}, KL), case pcl_pushmem(PCL, T1) of returned -> @@ -1295,23 +1307,32 @@ maybe_pause_push(PCL, KL) -> ok end. +%% old test data doesn't have the magic hash +add_missing_hash({K, {SQN, ST, MD}}) -> + {K, {SQN, ST, leveled_codec:magic_hash(K), MD}}. 
+ + simple_server_test() -> RootPath = "../test/ledger", clean_testdir(RootPath), {ok, PCL} = pcl_start(#penciller_options{root_path=RootPath, max_inmemory_tablesize=1000}), - Key1 = {{o,"Bucket0001", "Key0001", null}, - {1, {active, infinity}, null}}, + Key1_Pre = {{o,"Bucket0001", "Key0001", null}, + {1, {active, infinity}, null}}, + Key1 = add_missing_hash(Key1_Pre), KL1 = leveled_sft:generate_randomkeys({1000, 2}), - Key2 = {{o,"Bucket0002", "Key0002", null}, + Key2_Pre = {{o,"Bucket0002", "Key0002", null}, {1002, {active, infinity}, null}}, + Key2 = add_missing_hash(Key2_Pre), KL2 = leveled_sft:generate_randomkeys({900, 1003}), % Keep below the max table size by having 900 not 1000 - Key3 = {{o,"Bucket0003", "Key0003", null}, + Key3_Pre = {{o,"Bucket0003", "Key0003", null}, {2003, {active, infinity}, null}}, + Key3 = add_missing_hash(Key3_Pre), KL3 = leveled_sft:generate_randomkeys({1000, 2004}), - Key4 = {{o,"Bucket0004", "Key0004", null}, + Key4_Pre = {{o,"Bucket0004", "Key0004", null}, {3004, {active, infinity}, null}}, + Key4 = add_missing_hash(Key4_Pre), KL4 = leveled_sft:generate_randomkeys({1000, 3005}), ok = maybe_pause_push(PCL, [Key1]), ?assertMatch(Key1, pcl_fetch(PCL, {o,"Bucket0001", "Key0001", null})), @@ -1351,7 +1372,8 @@ simple_server_test() -> SnapOpts = #penciller_options{start_snapshot = true, source_penciller = PCLr}, {ok, PclSnap} = pcl_start(SnapOpts), - ok = pcl_loadsnapshot(PclSnap, leveled_skiplist:empty()), + leveled_bookie:load_snapshot(PclSnap, + leveled_bookie:empty_ledgercache()), ?assertMatch(Key1, pcl_fetch(PclSnap, {o,"Bucket0001", "Key0001", null})), ?assertMatch(Key2, pcl_fetch(PclSnap, {o,"Bucket0002", "Key0002", null})), ?assertMatch(Key3, pcl_fetch(PclSnap, {o,"Bucket0003", "Key0003", null})), @@ -1383,7 +1405,9 @@ simple_server_test() -> % Add some more keys and confirm that check sequence number still % sees the old version in the previous snapshot, but will see the new version % in a new snapshot - Key1A = {{o,"Bucket0001", "Key0001", null}, {4005, {active, infinity}, null}}, + Key1A_Pre = {{o,"Bucket0001", "Key0001", null}, + {4005, {active, infinity}, null}}, + Key1A = add_missing_hash(Key1A_Pre), KL1A = leveled_sft:generate_randomkeys({2000, 4006}), ok = maybe_pause_push(PCLr, [Key1A]), ok = maybe_pause_push(PCLr, KL1A), @@ -1400,7 +1424,7 @@ simple_server_test() -> term_to_binary("Hello")), {ok, PclSnap2} = pcl_start(SnapOpts), - ok = pcl_loadsnapshot(PclSnap2, leveled_skiplist:empty()), + leveled_bookie:load_snapshot(PclSnap2, leveled_bookie:empty_ledgercache()), ?assertMatch(false, pcl_checksequencenumber(PclSnap2, {o, "Bucket0001", @@ -1506,23 +1530,26 @@ simple_findnextkey_test() -> sqnoverlap_findnextkey_test() -> QueryArray = [ - {2, [{{o, "Bucket1", "Key1"}, {5, {active, infinity}, null}}, - {{o, "Bucket1", "Key5"}, {4, {active, infinity}, null}}]}, - {3, [{{o, "Bucket1", "Key3"}, {3, {active, infinity}, null}}]}, - {5, [{{o, "Bucket1", "Key5"}, {2, {active, infinity}, null}}]} + {2, [{{o, "Bucket1", "Key1"}, {5, {active, infinity}, 0, null}}, + {{o, "Bucket1", "Key5"}, {4, {active, infinity}, 0, null}}]}, + {3, [{{o, "Bucket1", "Key3"}, {3, {active, infinity}, 0, null}}]}, + {5, [{{o, "Bucket1", "Key5"}, {2, {active, infinity}, 0, null}}]} ], {Array2, KV1} = find_nextkey(QueryArray, {o, "Bucket1", "Key0"}, {o, "Bucket1", "Key5"}), - ?assertMatch({{o, "Bucket1", "Key1"}, {5, {active, infinity}, null}}, KV1), + ?assertMatch({{o, "Bucket1", "Key1"}, {5, {active, infinity}, 0, null}}, + KV1), {Array3, KV2} = find_nextkey(Array2, {o, 
"Bucket1", "Key0"}, {o, "Bucket1", "Key5"}), - ?assertMatch({{o, "Bucket1", "Key3"}, {3, {active, infinity}, null}}, KV2), + ?assertMatch({{o, "Bucket1", "Key3"}, {3, {active, infinity}, 0, null}}, + KV2), {Array4, KV3} = find_nextkey(Array3, {o, "Bucket1", "Key0"}, {o, "Bucket1", "Key5"}), - ?assertMatch({{o, "Bucket1", "Key5"}, {4, {active, infinity}, null}}, KV3), + ?assertMatch({{o, "Bucket1", "Key5"}, {4, {active, infinity}, 0, null}}, + KV3), ER = find_nextkey(Array4, {o, "Bucket1", "Key0"}, {o, "Bucket1", "Key5"}), @@ -1530,23 +1557,26 @@ sqnoverlap_findnextkey_test() -> sqnoverlap_otherway_findnextkey_test() -> QueryArray = [ - {2, [{{o, "Bucket1", "Key1"}, {5, {active, infinity}, null}}, - {{o, "Bucket1", "Key5"}, {1, {active, infinity}, null}}]}, - {3, [{{o, "Bucket1", "Key3"}, {3, {active, infinity}, null}}]}, - {5, [{{o, "Bucket1", "Key5"}, {2, {active, infinity}, null}}]} + {2, [{{o, "Bucket1", "Key1"}, {5, {active, infinity}, 0, null}}, + {{o, "Bucket1", "Key5"}, {1, {active, infinity}, 0, null}}]}, + {3, [{{o, "Bucket1", "Key3"}, {3, {active, infinity}, 0, null}}]}, + {5, [{{o, "Bucket1", "Key5"}, {2, {active, infinity}, 0, null}}]} ], {Array2, KV1} = find_nextkey(QueryArray, {o, "Bucket1", "Key0"}, {o, "Bucket1", "Key5"}), - ?assertMatch({{o, "Bucket1", "Key1"}, {5, {active, infinity}, null}}, KV1), + ?assertMatch({{o, "Bucket1", "Key1"}, {5, {active, infinity}, 0, null}}, + KV1), {Array3, KV2} = find_nextkey(Array2, {o, "Bucket1", "Key0"}, {o, "Bucket1", "Key5"}), - ?assertMatch({{o, "Bucket1", "Key3"}, {3, {active, infinity}, null}}, KV2), + ?assertMatch({{o, "Bucket1", "Key3"}, {3, {active, infinity}, 0, null}}, + KV2), {Array4, KV3} = find_nextkey(Array3, {o, "Bucket1", "Key0"}, {o, "Bucket1", "Key5"}), - ?assertMatch({{o, "Bucket1", "Key5"}, {2, {active, infinity}, null}}, KV3), + ?assertMatch({{o, "Bucket1", "Key5"}, {2, {active, infinity}, 0, null}}, + KV3), ER = find_nextkey(Array4, {o, "Bucket1", "Key0"}, {o, "Bucket1", "Key5"}), @@ -1554,19 +1584,19 @@ sqnoverlap_otherway_findnextkey_test() -> foldwithimm_simple_test() -> QueryArray = [ - {2, [{{o, "Bucket1", "Key1"}, {5, {active, infinity}, null}}, - {{o, "Bucket1", "Key5"}, {1, {active, infinity}, null}}]}, - {3, [{{o, "Bucket1", "Key3"}, {3, {active, infinity}, null}}]}, - {5, [{{o, "Bucket1", "Key5"}, {2, {active, infinity}, null}}]} + {2, [{{o, "Bucket1", "Key1"}, {5, {active, infinity}, 0, null}}, + {{o, "Bucket1", "Key5"}, {1, {active, infinity}, 0, null}}]}, + {3, [{{o, "Bucket1", "Key3"}, {3, {active, infinity}, 0, null}}]}, + {5, [{{o, "Bucket1", "Key5"}, {2, {active, infinity}, 0, null}}]} ], IMM0 = leveled_skiplist:enter({o, "Bucket1", "Key6"}, - {7, {active, infinity}, null}, + {7, {active, infinity}, 0, null}, leveled_skiplist:empty()), IMM1 = leveled_skiplist:enter({o, "Bucket1", "Key1"}, - {8, {active, infinity}, null}, + {8, {active, infinity}, 0, null}, IMM0), IMM2 = leveled_skiplist:enter({o, "Bucket1", "Key8"}, - {9, {active, infinity}, null}, + {9, {active, infinity}, 0, null}, IMM1), IMMiter = leveled_skiplist:to_range(IMM2, {o, "Bucket1", "Key1"}), AccFun = fun(K, V, Acc) -> SQN = leveled_codec:strip_to_seqonly({K, V}), @@ -1581,7 +1611,7 @@ foldwithimm_simple_test() -> {{o, "Bucket1", "Key6"}, 7}], Acc), IMM1A = leveled_skiplist:enter({o, "Bucket1", "Key1"}, - {8, {active, infinity}, null}, + {8, {active, infinity}, 0, null}, leveled_skiplist:empty()), IMMiterA = leveled_skiplist:to_range(IMM1A, {o, "Bucket1", "Key1"}), AccA = keyfolder(IMMiterA, @@ -1593,7 +1623,7 @@ 
foldwithimm_simple_test() -> {{o, "Bucket1", "Key5"}, 2}], AccA), IMM3 = leveled_skiplist:enter({o, "Bucket1", "Key4"}, - {10, {active, infinity}, null}, + {10, {active, infinity}, 0, null}, IMM2), IMMiterB = leveled_skiplist:to_range(IMM3, {o, "Bucket1", "Key1"}), AccB = keyfolder(IMMiterB, @@ -1688,14 +1718,15 @@ badmanifest_test() -> clean_testdir(RootPath), {ok, PCL} = pcl_start(#penciller_options{root_path=RootPath, max_inmemory_tablesize=1000}), - Key1 = {{o,"Bucket0001", "Key0001", null}, + Key1_pre = {{o,"Bucket0001", "Key0001", null}, {1001, {active, infinity}, null}}, + Key1 = add_missing_hash(Key1_pre), KL1 = leveled_sft:generate_randomkeys({1000, 1}), ok = maybe_pause_push(PCL, KL1 ++ [Key1]), %% Added together, as split apart there will be a race between the close %% call to the penciller and the second fetch of the cache entry - ?assertMatch(Key1, pcl_fetch(PCL, {o,"Bucket0001", "Key0001", null})), + ?assertMatch(Key1, pcl_fetch(PCL, {o, "Bucket0001", "Key0001", null})), timer:sleep(100), % Avoids confusion if L0 file not written before close ok = pcl_close(PCL), diff --git a/src/leveled_pmem.erl b/src/leveled_pmem.erl index 61ecd4e..5ba62aa 100644 --- a/src/leveled_pmem.erl +++ b/src/leveled_pmem.erl @@ -42,9 +42,8 @@ -include("include/leveled.hrl"). -export([ - add_to_index/5, + add_to_cache/4, to_list/2, - new_index/0, check_levelzero/3, merge_trees/4 ]). @@ -56,53 +55,20 @@ %%% API %%%============================================================================ -add_to_index(snap, L0Size, LevelMinus1, LedgerSQN, TreeList) -> - FoldFun = fun({K, V}, {AccMinSQN, AccMaxSQN, AccCount}) -> - SQN = leveled_codec:strip_to_seqonly({K, V}), - {min(SQN, AccMinSQN), - max(SQN, AccMaxSQN), - AccCount + 1} - end, - LM1List = leveled_skiplist:to_list(LevelMinus1), - StartingT = {infinity, 0, L0Size}, - {MinSQN, MaxSQN, NewL0Size} = lists:foldl(FoldFun, StartingT, LM1List), - if - MinSQN > LedgerSQN -> - {MaxSQN, - NewL0Size, - snap, - lists:append(TreeList, [LevelMinus1])} - end; -add_to_index(L0Index, L0Size, LevelMinus1, LedgerSQN, TreeList) -> - SW = os:timestamp(), - SlotInTreeList = length(TreeList) + 1, - FoldFun = fun({K, V}, {AccMinSQN, AccMaxSQN, AccCount}) -> - SQN = leveled_codec:strip_to_seqonly({K, V}), - Hash = erlang:phash2(K), - Count0 = case ets:lookup(L0Index, Hash) of - [] -> - ets:insert(L0Index, {Hash, [SlotInTreeList]}), - AccCount + 1; - [{Hash, L}] -> - ets:insert(L0Index, {Hash, [SlotInTreeList|L]}), - AccCount - end, - {min(SQN, AccMinSQN), - max(SQN, AccMaxSQN), - Count0} - end, - LM1List = leveled_skiplist:to_list(LevelMinus1), - StartingT = {infinity, 0, L0Size}, - {MinSQN, MaxSQN, NewL0Size} = lists:foldl(FoldFun, StartingT, LM1List), - leveled_log:log_timer("PM001", [NewL0Size], SW), - if - MinSQN > LedgerSQN -> - {MaxSQN, - NewL0Size, - L0Index, - lists:append(TreeList, [LevelMinus1])} +add_to_cache(L0Size, {LevelMinus1, MinSQN, MaxSQN}, LedgerSQN, TreeList) -> + LM1Size = leveled_skiplist:size(LevelMinus1), + case LM1Size of + 0 -> + {LedgerSQN, L0Size, TreeList}; + _ -> + if + MinSQN >= LedgerSQN -> + {MaxSQN, + L0Size + LM1Size, + lists:append(TreeList, [LevelMinus1])} + end end. - + to_list(Slots, FetchFun) -> SW = os:timestamp(), @@ -118,21 +84,13 @@ to_list(Slots, FetchFun) -> FullList. -new_index() -> - ets:new(index, [set, private]). +check_levelzero(Key, TreeList) -> + check_levelzero(Key, leveled_codec:magic_hash(Key), TreeList). 
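%% A note on add_to_cache/4 above: the single-clause if acts as an
%% assertion. A pushed batch whose MinSQN is below the current ledger
%% SQN implies out-of-order delivery, so the penciller is left to crash
%% with if_clause rather than merge stale data. With hypothetical
%% values, given a non-empty skiplist SL and a ledger SQN of 10:
%%
%% {20, NewL0Size, [SL]} = add_to_cache(0, {SL, 11, 20}, 10, []),
%% %% ...whereas add_to_cache(0, {SL, 9, 20}, 10, []) raises if_clause.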
-check_levelzero(_Key, _L0Index, []) -> +check_levelzero(_Key, _Hash, []) -> {false, not_found}; -check_levelzero(Key, snap, TreeList) -> - check_slotlist(Key, lists:seq(1, length(TreeList)), TreeList); -check_levelzero(Key, L0Index, TreeList) -> - Hash = erlang:phash2(Key), - case ets:lookup(L0Index, Hash) of - [] -> - {false, not_found}; - [{Hash, SlotList}] -> - check_slotlist(Key, SlotList, TreeList) - end. +check_levelzero(Key, Hash, TreeList) -> + check_slotlist(Key, Hash, lists:seq(1, length(TreeList)), TreeList). merge_trees(StartKey, EndKey, SkipListList, LevelMinus1) -> @@ -148,7 +106,7 @@ merge_trees(StartKey, EndKey, SkipListList, LevelMinus1) -> %%% Internal Functions %%%============================================================================ -check_slotlist(Key, CheckList, TreeList) -> +check_slotlist(Key, Hash, CheckList, TreeList) -> SlotCheckFun = fun(SlotToCheck, {Found, KV}) -> case Found of @@ -156,7 +114,7 @@ check_slotlist(Key, CheckList, TreeList) -> {Found, KV}; false -> CheckTree = lists:nth(SlotToCheck, TreeList), - case leveled_skiplist:lookup(Key, CheckTree) of + case leveled_skiplist:lookup(Key, Hash, CheckTree) of none -> {Found, KV}; {value, Value} -> @@ -166,7 +124,7 @@ check_slotlist(Key, CheckList, TreeList) -> end, lists:foldl(SlotCheckFun, {false, not_found}, - lists:reverse(lists:usort(CheckList))). + lists:reverse(CheckList)). %%%============================================================================ %%% Test @@ -177,7 +135,7 @@ check_slotlist(Key, CheckList, TreeList) -> generate_randomkeys(Seqn, Count, BucketRangeLow, BucketRangeHigh) -> generate_randomkeys(Seqn, Count, - leveled_skiplist:empty(), + leveled_skiplist:empty(true), BucketRangeLow, BucketRangeHigh). @@ -197,58 +155,59 @@ generate_randomkeys(Seqn, Count, Acc, BucketLow, BRange) -> compare_method_test() -> - R = lists:foldl(fun(_X, {LedgerSQN, L0Size, L0Index, L0TreeList}) -> + R = lists:foldl(fun(_X, {LedgerSQN, L0Size, L0TreeList}) -> LM1 = generate_randomkeys(LedgerSQN + 1, 2000, 1, 500), - add_to_index(L0Index, L0Size, LM1, LedgerSQN, - L0TreeList) + add_to_cache(L0Size, + {LM1, + LedgerSQN + 1, + LedgerSQN + 2000}, + LedgerSQN, + L0TreeList) end, - {0, 0, new_index(), []}, + {0, 0, []}, lists:seq(1, 16)), - {SQN, Size, Index, TreeList} = R, + {SQN, Size, TreeList} = R, ?assertMatch(32000, SQN), ?assertMatch(true, Size =< 32000), TestList = leveled_skiplist:to_list(generate_randomkeys(1, 2000, 1, 800)), - S0 = lists:foldl(fun({Key, _V}, Acc) -> - R0 = lists:foldr(fun(Tree, {Found, KV}) -> - case Found of - true -> - {true, KV}; - false -> - L0 = leveled_skiplist:lookup(Key, Tree), - case L0 of - none -> - {false, not_found}; - {value, Value} -> - {true, {Key, Value}} - end + FindKeyFun = + fun(Key) -> + fun(Tree, {Found, KV}) -> + case Found of + true -> + {true, KV}; + false -> + L0 = leveled_skiplist:lookup(Key, Tree), + case L0 of + none -> + {false, not_found}; + {value, Value} -> + {true, {Key, Value}} end - end, - {false, not_found}, - TreeList), - [R0|Acc] - end, - [], - TestList), + end + end + end, - S1 = lists:foldl(fun({Key, _V}, Acc) -> - R0 = check_levelzero(Key, Index, TreeList), - [R0|Acc] - end, + S0 = lists:foldl(fun({Key, _V}, Acc) -> + R0 = lists:foldr(FindKeyFun(Key), + {false, not_found}, + TreeList), + [R0|Acc] end, [], TestList), - S2 = lists:foldl(fun({Key, _V}, Acc) -> - R0 = check_levelzero(Key, snap, TreeList), + + S1 = lists:foldl(fun({Key, _V}, Acc) -> + R0 = check_levelzero(Key, TreeList), [R0|Acc] end, [], TestList), ?assertMatch(S0, S1), - 
?assertMatch(S0, S2), StartKey = {o, "Bucket0100", null, null}, EndKey = {o, "Bucket0200", null, null}, diff --git a/src/leveled_sft.erl b/src/leveled_sft.erl index 9c67721..4c86dff 100644 --- a/src/leveled_sft.erl +++ b/src/leveled_sft.erl @@ -1400,12 +1400,15 @@ generate_randomkeys(Count) -> generate_randomkeys(0, _SQN, Acc) -> lists:reverse(Acc); generate_randomkeys(Count, SQN, Acc) -> - RandKey = {{o, - lists:concat(["Bucket", random:uniform(1024)]), - lists:concat(["Key", random:uniform(1024)]), - null}, + K = {o, + lists:concat(["Bucket", random:uniform(1024)]), + lists:concat(["Key", random:uniform(1024)]), + null}, + RandKey = {K, {SQN, - {active, infinity}, null}}, + {active, infinity}, + leveled_codec:magic_hash(K), + null}}, generate_randomkeys(Count - 1, SQN + 1, [RandKey|Acc]). generate_sequentialkeys(Count, Start) -> @@ -1415,75 +1418,86 @@ generate_sequentialkeys(Target, Incr, Acc) when Incr =:= Target -> Acc; generate_sequentialkeys(Target, Incr, Acc) -> KeyStr = string:right(integer_to_list(Incr), 8, $0), - NextKey = {{o, - "BucketSeq", - lists:concat(["Key", KeyStr]), - null}, + K = {o, "BucketSeq", lists:concat(["Key", KeyStr]), null}, + NextKey = {K, {5, - {active, infinity}, null}}, + {active, infinity}, + leveled_codec:magic_hash(K), + null}}, generate_sequentialkeys(Target, Incr + 1, [NextKey|Acc]). simple_create_block_test() -> - KeyList1 = [{{o, "Bucket1", "Key1", null}, {1, {active, infinity}, null}}, - {{o, "Bucket1", "Key3", null}, {2, {active, infinity}, null}}], - KeyList2 = [{{o, "Bucket1", "Key2", null}, {3, {active, infinity}, null}}], + KeyList1 = [{{o, "Bucket1", "Key1", null}, + {1, {active, infinity}, no_lookup, null}}, + {{o, "Bucket1", "Key3", null}, + {2, {active, infinity}, no_lookup, null}}], + KeyList2 = [{{o, "Bucket1", "Key2", null}, + {3, {active, infinity}, no_lookup, null}}], {MergedKeyList, ListStatus, SN, _, _, _} = create_block(KeyList1, KeyList2, #level{level=1}), ?assertMatch(partial, ListStatus), [H1|T1] = MergedKeyList, - ?assertMatch(H1, {{o, "Bucket1", "Key1", null}, {1, {active, infinity}, null}}), + ?assertMatch({{o, "Bucket1", "Key1", null}, + {1, {active, infinity}, no_lookup, null}}, H1), [H2|T2] = T1, - ?assertMatch(H2, {{o, "Bucket1", "Key2", null}, {3, {active, infinity}, null}}), - ?assertMatch(T2, [{{o, "Bucket1", "Key3", null}, {2, {active, infinity}, null}}]), + ?assertMatch({{o, "Bucket1", "Key2", null}, + {3, {active, infinity}, no_lookup, null}}, H2), + ?assertMatch([{{o, "Bucket1", "Key3", null}, + {2, {active, infinity}, no_lookup, null}}], T2), ?assertMatch(SN, {1,3}). dominate_create_block_test() -> - KeyList1 = [{{o, "Bucket1", "Key1", null}, {1, {active, infinity}, null}}, - {{o, "Bucket1", "Key2", null}, {2, {active, infinity}, null}}], - KeyList2 = [{{o, "Bucket1", "Key2", null}, {3, {tomb, infinity}, null}}], + KeyList1 = [{{o, "Bucket1", "Key1", null}, + {1, {active, infinity}, no_lookup, null}}, + {{o, "Bucket1", "Key2", null}, + {2, {active, infinity}, no_lookup, null}}], + KeyList2 = [{{o, "Bucket1", "Key2", null}, + {3, {tomb, infinity}, no_lookup, null}}], {MergedKeyList, ListStatus, SN, _, _, _} = create_block(KeyList1, KeyList2, #level{level=1}), ?assertMatch(partial, ListStatus), [K1, K2] = MergedKeyList, - ?assertMatch(K1, {{o, "Bucket1", "Key1", null}, {1, {active, infinity}, null}}), - ?assertMatch(K2, {{o, "Bucket1", "Key2", null}, {3, {tomb, infinity}, null}}), + ?assertMatch(K1, lists:nth(1, KeyList1)), + ?assertMatch(K2, lists:nth(1, KeyList2)), ?assertMatch(SN, {1,3}). 
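%% The test above exercises the dominance rule in
%% leveled_codec:key_dominates/2: for equal keys the higher SQN wins, so
%% the SQN-3 tombstone for "Key2" displaces the SQN-2 active value in
%% the merged output. A minimal check of the rule itself, with
%% hypothetical keys:
%%
%% left_hand_dominant =
%%     leveled_codec:key_dominates(
%%         {{o, "B", "K", null}, {3, {tomb, infinity}, no_lookup, null}},
%%         {{o, "B", "K", null}, {2, {active, infinity}, no_lookup, null}}).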
sample_keylist() -> - KeyList1 = [{{o, "Bucket1", "Key1", null}, {1, {active, infinity}, null}}, - {{o, "Bucket1", "Key3", null}, {1, {active, infinity}, null}}, - {{o, "Bucket1", "Key5", null}, {1, {active, infinity}, null}}, - {{o, "Bucket1", "Key7", null}, {1, {active, infinity}, null}}, - {{o, "Bucket1", "Key9", null}, {1, {active, infinity}, null}}, - {{o, "Bucket2", "Key1", null}, {1, {active, infinity}, null}}, - {{o, "Bucket2", "Key3", null}, {1, {active, infinity}, null}}, - {{o, "Bucket2", "Key5", null}, {1, {active, infinity}, null}}, - {{o, "Bucket2", "Key7", null}, {1, {active, infinity}, null}}, - {{o, "Bucket2", "Key9", null}, {1, {active, infinity}, null}}, - {{o, "Bucket3", "Key1", null}, {1, {active, infinity}, null}}, - {{o, "Bucket3", "Key3", null}, {1, {active, infinity}, null}}, - {{o, "Bucket3", "Key5", null}, {1, {active, infinity}, null}}, - {{o, "Bucket3", "Key7", null}, {1, {active, infinity}, null}}, - {{o, "Bucket3", "Key9", null}, {1, {active, infinity}, null}}, - {{o, "Bucket4", "Key1", null}, {1, {active, infinity}, null}}], - KeyList2 = [{{o, "Bucket1", "Key2", null}, {1, {active, infinity}, null}}, - {{o, "Bucket1", "Key4", null}, {1, {active, infinity}, null}}, - {{o, "Bucket1", "Key6", null}, {1, {active, infinity}, null}}, - {{o, "Bucket1", "Key8", null}, {1, {active, infinity}, null}}, - {{o, "Bucket1", "Key9a", null}, {1, {active, infinity}, null}}, - {{o, "Bucket1", "Key9b", null}, {1, {active, infinity}, null}}, - {{o, "Bucket1", "Key9c", null}, {1, {active, infinity}, null}}, - {{o, "Bucket1", "Key9d", null}, {1, {active, infinity}, null}}, - {{o, "Bucket2", "Key2", null}, {1, {active, infinity}, null}}, - {{o, "Bucket2", "Key4", null}, {1, {active, infinity}, null}}, - {{o, "Bucket2", "Key6", null}, {1, {active, infinity}, null}}, - {{o, "Bucket2", "Key8", null}, {1, {active, infinity}, null}}, - {{o, "Bucket3", "Key2", null}, {1, {active, infinity}, null}}, - {{o, "Bucket3", "Key4", null}, {3, {active, infinity}, null}}, - {{o, "Bucket3", "Key6", null}, {2, {active, infinity}, null}}, - {{o, "Bucket3", "Key8", null}, {1, {active, infinity}, null}}], + KeyList1 = + [{{o, "Bucket1", "Key1", null}, {1, {active, infinity}, 0, null}}, + {{o, "Bucket1", "Key3", null}, {1, {active, infinity}, 0, null}}, + {{o, "Bucket1", "Key5", null}, {1, {active, infinity}, 0, null}}, + {{o, "Bucket1", "Key7", null}, {1, {active, infinity}, 0, null}}, + {{o, "Bucket1", "Key9", null}, {1, {active, infinity}, 0, null}}, + {{o, "Bucket2", "Key1", null}, {1, {active, infinity}, 0, null}}, + {{o, "Bucket2", "Key3", null}, {1, {active, infinity}, 0, null}}, + {{o, "Bucket2", "Key5", null}, {1, {active, infinity}, 0, null}}, + {{o, "Bucket2", "Key7", null}, {1, {active, infinity}, 0, null}}, + {{o, "Bucket2", "Key9", null}, {1, {active, infinity}, 0, null}}, + {{o, "Bucket3", "Key1", null}, {1, {active, infinity}, 0, null}}, + {{o, "Bucket3", "Key3", null}, {1, {active, infinity}, 0, null}}, + {{o, "Bucket3", "Key5", null}, {1, {active, infinity}, 0, null}}, + {{o, "Bucket3", "Key7", null}, {1, {active, infinity}, 0, null}}, + {{o, "Bucket3", "Key9", null}, {1, {active, infinity}, 0, null}}, + {{o, "Bucket4", "Key1", null}, {1, {active, infinity}, 0, null}}], + KeyList2 = + [{{o, "Bucket1", "Key2", null}, {1, {active, infinity}, 0, null}}, + {{o, "Bucket1", "Key4", null}, {1, {active, infinity}, 0, null}}, + {{o, "Bucket1", "Key6", null}, {1, {active, infinity}, 0, null}}, + {{o, "Bucket1", "Key8", null}, {1, {active, infinity}, 0, null}}, + {{o, "Bucket1", "Key9a", null}, 
{1, {active, infinity}, 0, null}}, + {{o, "Bucket1", "Key9b", null}, {1, {active, infinity}, 0, null}}, + {{o, "Bucket1", "Key9c", null}, {1, {active, infinity}, 0, null}}, + {{o, "Bucket1", "Key9d", null}, {1, {active, infinity}, 0, null}}, + {{o, "Bucket2", "Key2", null}, {1, {active, infinity}, 0, null}}, + {{o, "Bucket2", "Key4", null}, {1, {active, infinity}, 0, null}}, + {{o, "Bucket2", "Key6", null}, {1, {active, infinity}, 0, null}}, + {{o, "Bucket2", "Key8", null}, {1, {active, infinity}, 0, null}}, + {{o, "Bucket3", "Key2", null}, {1, {active, infinity}, 0, null}}, + {{o, "Bucket3", "Key4", null}, {3, {active, infinity}, 0, null}}, + {{o, "Bucket3", "Key6", null}, {2, {active, infinity}, 0, null}}, + {{o, "Bucket3", "Key8", null}, {1, {active, infinity}, 0, null}}], {KeyList1, KeyList2}. alternating_create_block_test() -> @@ -1495,12 +1509,12 @@ alternating_create_block_test() -> ?assertMatch(BlockSize, 32), ?assertMatch(ListStatus, complete), K1 = lists:nth(1, MergedKeyList), - ?assertMatch(K1, {{o, "Bucket1", "Key1", null}, {1, {active, infinity}, null}}), + ?assertMatch(K1, {{o, "Bucket1", "Key1", null}, {1, {active, infinity}, 0, null}}), K11 = lists:nth(11, MergedKeyList), - ?assertMatch(K11, {{o, "Bucket1", "Key9b", null}, {1, {active, infinity}, null}}), + ?assertMatch(K11, {{o, "Bucket1", "Key9b", null}, {1, {active, infinity}, 0, null}}), K32 = lists:nth(32, MergedKeyList), - ?assertMatch(K32, {{o, "Bucket4", "Key1", null}, {1, {active, infinity}, null}}), - HKey = {{o, "Bucket1", "Key0", null}, {1, {active, infinity}, null}}, + ?assertMatch(K32, {{o, "Bucket4", "Key1", null}, {1, {active, infinity}, 0, null}}), + HKey = {{o, "Bucket1", "Key0", null}, {1, {active, infinity}, 0, null}}, {_, ListStatus2, _, _, _, _} = create_block([HKey|KeyList1], KeyList2, #level{level=1}), @@ -1752,7 +1766,7 @@ initial_create_file_test() -> Result1 = fetch_keyvalue(UpdHandle, UpdFileMD, {o, "Bucket1", "Key8", null}), io:format("Result is ~w~n", [Result1]), ?assertMatch(Result1, {{o, "Bucket1", "Key8", null}, - {1, {active, infinity}, null}}), + {1, {active, infinity}, 0, null}}), Result2 = fetch_keyvalue(UpdHandle, UpdFileMD, {o, "Bucket1", "Key88", null}), io:format("Result is ~w~n", [Result2]), ?assertMatch(Result2, not_present), @@ -1768,17 +1782,17 @@ big_create_file_test() -> InitFileMD, KL1, KL2, #level{level=1}), - [{K1, {Sq1, St1, V1}}|_] = KL1, - [{K2, {Sq2, St2, V2}}|_] = KL2, + [{K1, {Sq1, St1, MH1, V1}}|_] = KL1, + [{K2, {Sq2, St2, MH2, V2}}|_] = KL2, Result1 = fetch_keyvalue(Handle, FileMD, K1), Result2 = fetch_keyvalue(Handle, FileMD, K2), - ?assertMatch(Result1, {K1, {Sq1, St1, V1}}), - ?assertMatch(Result2, {K2, {Sq2, St2, V2}}), + ?assertMatch(Result1, {K1, {Sq1, St1, MH1, V1}}), + ?assertMatch(Result2, {K2, {Sq2, St2, MH2, V2}}), SubList = lists:sublist(KL2, 1000), - lists:foreach(fun(K) -> - {Kn, {_, _, _}} = K, + lists:foreach(fun(KV) -> + {Kn, _} = KV, Rn = fetch_keyvalue(Handle, FileMD, Kn), - ?assertMatch({Kn, {_, _, _}}, Rn) + ?assertMatch({Kn, _}, Rn) end, SubList), Result3 = fetch_keyvalue(Handle, @@ -1834,13 +1848,13 @@ initial_iterator_test() -> ok = file:delete(Filename). 
key_dominates_test() ->
-    KV1 = {{o, "Bucket", "Key1", null}, {5, {active, infinity}, []}},
-    KV2 = {{o, "Bucket", "Key3", null}, {6, {active, infinity}, []}},
-    KV3 = {{o, "Bucket", "Key2", null}, {3, {active, infinity}, []}},
-    KV4 = {{o, "Bucket", "Key4", null}, {7, {active, infinity}, []}},
-    KV5 = {{o, "Bucket", "Key1", null}, {4, {active, infinity}, []}},
-    KV6 = {{o, "Bucket", "Key1", null}, {99, {tomb, 999}, []}},
-    KV7 = {{o, "Bucket", "Key1", null}, {99, tomb, []}},
+    KV1 = {{o, "Bucket", "Key1", null}, {5, {active, infinity}, 0, []}},
+    KV2 = {{o, "Bucket", "Key3", null}, {6, {active, infinity}, 0, []}},
+    KV3 = {{o, "Bucket", "Key2", null}, {3, {active, infinity}, 0, []}},
+    KV4 = {{o, "Bucket", "Key4", null}, {7, {active, infinity}, 0, []}},
+    KV5 = {{o, "Bucket", "Key1", null}, {4, {active, infinity}, 0, []}},
+    KV6 = {{o, "Bucket", "Key1", null}, {99, {tomb, 999}, 0, []}},
+    KV7 = {{o, "Bucket", "Key1", null}, {99, tomb, 0, []}},
     KL1 = [KV1, KV2],
     KL2 = [KV3, KV4],
     ?assertMatch({{next_key, KV1}, [KV2], KL2},
@@ -1970,21 +1984,21 @@ hashclash_test() ->
                                 "Bucket",
                                 "Key8400" ++ integer_to_list(X),
                                 null},
-                        Value = {X, {active, infinity}, null},
+                        Value = {X, {active, infinity}, 0, null},
                         Acc ++ [{Key, Value}]
                         end,
                     [],
                     lists:seq(10,98)),
-    KeyListToUse = [{Key1, {1, {active, infinity}, null}}|KeyList]
-                    ++ [{Key99, {99, {active, infinity}, null}}],
+    KeyListToUse = [{Key1, {1, {active, infinity}, 0, null}}|KeyList]
+                    ++ [{Key99, {99, {active, infinity}, 0, null}}],
     {InitHandle, InitFileMD} = create_file(Filename),
     {Handle, _FileMD, _Rem} = complete_file(InitHandle, InitFileMD,
                                             KeyListToUse, [],
                                             #level{level=1}),
     ok = file:close(Handle),
     {ok, SFTr, _KeyExtremes} = sft_open(Filename),
-    ?assertMatch({Key1, {1, {active, infinity}, null}},
+    ?assertMatch({Key1, {1, {active, infinity}, 0, null}},
                 sft_get(SFTr, Key1)),
-    ?assertMatch({Key99, {99, {active, infinity}, null}},
+    ?assertMatch({Key99, {99, {active, infinity}, 0, null}},
                 sft_get(SFTr, Key99)),
     ?assertMatch(not_present, sft_get(SFTr, KeyNF)),
 
diff --git a/src/leveled_skiplist.erl b/src/leveled_skiplist.erl
index 63a3842..17da98c 100644
--- a/src/leveled_skiplist.erl
+++ b/src/leveled_skiplist.erl
@@ -22,6 +22,8 @@
     from_sortedlist/2,
     to_list/1,
     enter/3,
+    enter/4,
+    enter_nolookup/3,
     to_range/2,
     to_range/3,
     lookup/2,
@@ -43,17 +45,31 @@
 %%%============================================================================
 
 enter(Key, Value, SkipList) ->
-    Hash = erlang:phash2(Key),
-    case is_list(SkipList) of
-        true ->
-            enter(Key, Value, Hash, SkipList, ?SKIP_WIDTH, ?LIST_HEIGHT);
-        false ->
-            SkipList0 = add_to_array(Hash, SkipList),
-            NewListPart = enter(Key, Value, Hash,
-                                dict:fetch(?SKIP_WIDTH, SkipList0),
-                                ?SKIP_WIDTH, ?LIST_HEIGHT),
-            dict:store(?SKIP_WIDTH, NewListPart, SkipList0)
-    end.
+    Hash = leveled_codec:magic_hash(Key),
+    enter(Key, Hash, Value, SkipList).
+
+enter(Key, Hash, Value, SkipList) ->
+    Bloom0 =
+        case element(1, SkipList) of
+            list_only ->
+                list_only;
+            Bloom ->
+                leveled_tinybloom:enter({hash, Hash}, Bloom)
+        end,
+    {Bloom0,
+        enter(Key, Value, Hash,
+                element(2, SkipList),
+                ?SKIP_WIDTH, ?LIST_HEIGHT)}.
+
+%% Can iterate over a key entered this way, but never look up the key -
+%% used for index terms
+%% The key may still be a marker key - and the much cheaper native hash
+%% is used to determine this, avoiding the more expensive magic hash
+enter_nolookup(Key, Value, SkipList) ->
+    {element(1, SkipList),
+        enter(Key, Value, erlang:phash2(Key),
+                element(2, SkipList),
+                ?SKIP_WIDTH, ?LIST_HEIGHT)}.
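%% A minimal sketch (hypothetical key and value) of the intended split
%% between the two entry points above: enter/3 feeds the tiny bloom so
%% the key can later be found via lookup/2, whereas enter_nolookup/3
%% leaves the bloom untouched, so the entry can be iterated over but
%% never point-queried:
%%
%% SL0 = leveled_skiplist:empty(true),
%% SL1 = leveled_skiplist:enter_nolookup({o, "B", "K1", null}, v1, SL0),
%% none = leveled_skiplist:lookup({o, "B", "K1", null}, SL1),
%% [{_K, v1}] = leveled_skiplist:to_list(SL1).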
from_list(UnsortedKVL) ->
     from_list(UnsortedKVL, false).
@@ -66,71 +82,45 @@ from_sortedlist(SortedKVL) ->
     from_sortedlist(SortedKVL, false).
 
 from_sortedlist(SortedKVL, BloomProtect) ->
-    case BloomProtect of
-        true ->
-            SL0 = lists:foldr(fun({K, _V}, SkipL) ->
-                                    H = erlang:phash2(K),
-                                    add_to_array(H, SkipL) end,
-                                empty(true),
-                                SortedKVL),
-            dict:store(?SKIP_WIDTH,
-                        from_list(SortedKVL, ?SKIP_WIDTH, ?LIST_HEIGHT),
-                        SL0);
-        false ->
-            from_list(SortedKVL, ?SKIP_WIDTH, ?LIST_HEIGHT)
-    end.
+    Bloom0 =
+        case BloomProtect of
+            true ->
+                lists:foldr(fun({K, _V}, Bloom) ->
+                                leveled_tinybloom:enter(K, Bloom) end,
+                            leveled_tinybloom:empty(?SKIP_WIDTH),
+                            SortedKVL);
+            false ->
+                list_only
+        end,
+    {Bloom0, from_list(SortedKVL, ?SKIP_WIDTH, ?LIST_HEIGHT)}.
 
 lookup(Key, SkipList) ->
-    case is_list(SkipList) of
-        true ->
-            list_lookup(Key, SkipList, ?LIST_HEIGHT);
-        false ->
-            lookup(Key, erlang:phash2(Key), SkipList)
+    case element(1, SkipList) of
+        list_only ->
+            list_lookup(Key, element(2, SkipList), ?LIST_HEIGHT);
+        _ ->
+            lookup(Key, leveled_codec:magic_hash(Key), SkipList)
     end.
 
 
 lookup(Key, Hash, SkipList) ->
-    {Slot, Bit} = hash_toslotbit(Hash),
-    RestLen = ?BITARRAY_SIZE - Bit - 1,
-    <<_Head:Bit/bitstring,
-        B:1/bitstring,
-        _Rest:RestLen/bitstring>> = dict:fetch(Slot, SkipList),
-    case B of
-        <<0:1>> ->
+    case leveled_tinybloom:check({hash, Hash}, element(1, SkipList)) of
+        false ->
             none;
-        <<1:1>> ->
-            list_lookup(Key, dict:fetch(?SKIP_WIDTH, SkipList), ?LIST_HEIGHT)
+        true ->
+            list_lookup(Key, element(2, SkipList), ?LIST_HEIGHT)
     end.
 
 
 %% Rather than support iterator_from like gb_trees, will just output a key
 %% sorted list for the desired range, which can then be iterated over as normal
 to_range(SkipList, Start) ->
-    case is_list(SkipList) of
-        true ->
-            to_range(SkipList, Start, ?INFINITY_KEY, ?LIST_HEIGHT);
-        false ->
-            to_range(dict:fetch(?SKIP_WIDTH, SkipList),
-                        Start, ?INFINITY_KEY,
-                        ?LIST_HEIGHT)
-    end.
+    to_range(element(2, SkipList), Start, ?INFINITY_KEY, ?LIST_HEIGHT).
 
 to_range(SkipList, Start, End) ->
-    case is_list(SkipList) of
-        true ->
-            to_range(SkipList, Start, End, ?LIST_HEIGHT);
-        false ->
-            to_range(dict:fetch(?SKIP_WIDTH, SkipList),
-                        Start, End,
-                        ?LIST_HEIGHT)
-    end.
+    to_range(element(2, SkipList), Start, End, ?LIST_HEIGHT).
 
 to_list(SkipList) ->
-    case is_list(SkipList) of
-        true ->
-            to_list(SkipList, ?LIST_HEIGHT);
-        false ->
-            to_list(dict:fetch(?SKIP_WIDTH, SkipList), ?LIST_HEIGHT)
-    end.
+    to_list(element(2, SkipList), ?LIST_HEIGHT).
 
 empty() ->
     empty(false).
@@ -138,46 +128,20 @@ empty() ->
 empty(BloomProtect) ->
     case BloomProtect of
         true ->
-            FoldFun =
-                fun(X, Acc) -> dict:store(X, <<0:?BITARRAY_SIZE>>, Acc) end,
-            lists:foldl(FoldFun,
-                        dict:store(?SKIP_WIDTH,
-                                    empty([], ?LIST_HEIGHT),
-                                    dict:new()),
-                        lists:seq(0, ?SKIP_WIDTH - 1));
+            {leveled_tinybloom:empty(?SKIP_WIDTH),
+                empty([], ?LIST_HEIGHT)};
         false ->
-            empty([], ?LIST_HEIGHT)
+            {list_only, empty([], ?LIST_HEIGHT)}
     end.
 
 size(SkipList) ->
-    case is_list(SkipList) of
-        true ->
-            size(SkipList, ?LIST_HEIGHT);
-        false ->
-            size(dict:fetch(?SKIP_WIDTH, SkipList), ?LIST_HEIGHT)
-    end.
-
+    size(element(2, SkipList), ?LIST_HEIGHT).
 
 %%%============================================================================
 %%% SkipList Base Functions
 %%%============================================================================
 
-hash_toslotbit(Hash) ->
-    Slot = Hash band (?SKIP_WIDTH - 1),
-    Bit = (Hash bsr ?SKIP_WIDTH) band (?BITARRAY_SIZE - 1),
-    {Slot, Bit}.
-
-
-add_to_array(Hash, SkipList) ->
-    {Slot, Bit} = hash_toslotbit(Hash),
-    RestLen = ?BITARRAY_SIZE - Bit - 1,
-    <<Head:Bit/bitstring,
-        _B:1/bitstring,
-        Rest:RestLen/bitstring>> = dict:fetch(Slot, SkipList),
-    BitArray = <<Head/bitstring, 1:1, Rest/bitstring>>,
-    dict:store(Slot, BitArray, SkipList).
-
 enter(Key, Value, Hash, SkipList, Width, 1) ->
     {MarkerKey, SubList} = find_mark(Key, SkipList),
     case Hash rem Width of
@@ -488,68 +452,30 @@ dotest_skiplist_small(N) ->
 
 skiplist_withbloom_test() ->
     io:format(user, "~n~nBloom protected skiplist test:~n~n", []),
-    N = 4000,
-    KL = generate_randomkeys(1, N, 1, N div 5),
-
-    SWaGSL = os:timestamp(),
-    SkipList = from_list(lists:reverse(KL), true),
-    io:format(user, "Generating skip list with ~w keys in ~w microseconds~n" ++
-                        "Top level key count of ~w~n",
-                    [N,
-                        timer:now_diff(os:timestamp(), SWaGSL),
-                        length(dict:fetch(?SKIP_WIDTH, SkipList))]),
-    io:format(user, "Second tier key counts of ~w~n",
-                [lists:map(fun({_L, SL}) -> length(SL) end,
-                    dict:fetch(?SKIP_WIDTH, SkipList))]),
-    KLSorted = lists:ukeysort(1, lists:reverse(KL)),
+    skiplist_tester(true).
 
-    SWaGSL2 = os:timestamp(),
-    SkipList = from_sortedlist(KLSorted, true),
-    io:format(user, "Generating skip list with ~w sorted keys in ~w " ++
-                        "microseconds~n",
-                    [N, timer:now_diff(os:timestamp(), SWaGSL2)]),
-
-    SWaDSL = os:timestamp(),
-    SkipList1 =
-        lists:foldl(fun({K, V}, SL) ->
-                        enter(K, V, SL)
-                    end,
-                    empty(true),
-                    KL),
-    io:format(user, "Dynamic load of skiplist with ~w keys took ~w " ++
-                        "microseconds~n" ++
-                        "Top level key count of ~w~n",
-                    [N,
-                        timer:now_diff(os:timestamp(), SWaDSL),
-                        length(dict:fetch(?SKIP_WIDTH, SkipList1))]),
-    io:format(user, "Second tier key counts of ~w~n",
-                [lists:map(fun({_L, SL}) -> length(SL) end,
-                    dict:fetch(?SKIP_WIDTH, SkipList1))]),
-
-    io:format(user, "~nRunning timing tests for generated skiplist:~n", []),
-    skiplist_timingtest(KLSorted, SkipList, N),
-
-    io:format(user, "~nRunning timing tests for dynamic skiplist:~n", []),
-    skiplist_timingtest(KLSorted, SkipList1, N).
-
 skiplist_nobloom_test() ->
     io:format(user, "~n~nBloom free skiplist test:~n~n", []),
+    skiplist_tester(false).
+ +skiplist_tester(Bloom) -> N = 4000, KL = generate_randomkeys(1, N, 1, N div 5), SWaGSL = os:timestamp(), - SkipList = from_list(lists:reverse(KL)), + SkipList = from_list(lists:reverse(KL), Bloom), io:format(user, "Generating skip list with ~w keys in ~w microseconds~n" ++ "Top level key count of ~w~n", [N, timer:now_diff(os:timestamp(), SWaGSL), - length(SkipList)]), + length(element(2, SkipList))]), io:format(user, "Second tier key counts of ~w~n", - [lists:map(fun({_L, SL}) -> length(SL) end, SkipList)]), + [lists:map(fun({_L, SL}) -> length(SL) end, + element(2, SkipList))]), KLSorted = lists:ukeysort(1, lists:reverse(KL)), SWaGSL2 = os:timestamp(), - SkipList = from_sortedlist(KLSorted), + SkipList = from_sortedlist(KLSorted, Bloom), io:format(user, "Generating skip list with ~w sorted keys in ~w " ++ "microseconds~n", [N, timer:now_diff(os:timestamp(), SWaGSL2)]), @@ -559,25 +485,26 @@ skiplist_nobloom_test() -> lists:foldl(fun({K, V}, SL) -> enter(K, V, SL) end, - empty(), + empty(Bloom), KL), io:format(user, "Dynamic load of skiplist with ~w keys took ~w " ++ "microseconds~n" ++ "Top level key count of ~w~n", [N, timer:now_diff(os:timestamp(), SWaDSL), - length(SkipList1)]), + length(element(2, SkipList1))]), io:format(user, "Second tier key counts of ~w~n", - [lists:map(fun({_L, SL}) -> length(SL) end, SkipList1)]), + [lists:map(fun({_L, SL}) -> length(SL) end, + element(2, SkipList1))]), io:format(user, "~nRunning timing tests for generated skiplist:~n", []), - skiplist_timingtest(KLSorted, SkipList, N), + skiplist_timingtest(KLSorted, SkipList, N, Bloom), io:format(user, "~nRunning timing tests for dynamic skiplist:~n", []), - skiplist_timingtest(KLSorted, SkipList1, N). + skiplist_timingtest(KLSorted, SkipList1, N, Bloom). -skiplist_timingtest(KL, SkipList, N) -> +skiplist_timingtest(KL, SkipList, N, Bloom) -> io:format(user, "Timing tests on skiplist of size ~w~n", [leveled_skiplist:size(SkipList)]), CheckList1 = lists:sublist(KL, N div 4, 200), @@ -666,7 +593,24 @@ skiplist_timingtest(KL, SkipList, N) -> FlatList = to_list(SkipList), io:format(user, "Flattening skiplist took ~w microseconds~n", [timer:now_diff(os:timestamp(), SWg)]), - ?assertMatch(KL, FlatList). + ?assertMatch(KL, FlatList), + + case Bloom of + true -> + HashList = lists:map(fun(_X) -> + random:uniform(4296967295) end, + lists:seq(1, 2000)), + SWh = os:timestamp(), + lists:foreach(fun(X) -> + lookup(X, X, SkipList) end, + HashList), + io:format(user, + "Getting 2000 missing keys when hash was known " ++ + "took ~w microseconds~n", + [timer:now_diff(os:timestamp(), SWh)]); + false -> + ok + end. define_kv(X) -> {{o, "Bucket", "Key" ++ string:right(integer_to_list(X), 6), null}, @@ -688,5 +632,21 @@ skiplist_roundsize_test() -> ?assertMatch(L, R) end, lists:seq(0, 24)). +skiplist_nolookup_test() -> + N = 4000, + KL = generate_randomkeys(1, N, 1, N div 5), + SkipList = lists:foldl(fun({K, V}, Acc) -> + enter_nolookup(K, V, Acc) end, + empty(true), + KL), + KLSorted = lists:ukeysort(1, lists:reverse(KL)), + lists:foreach(fun({K, _V}) -> + ?assertMatch(none, lookup(K, SkipList)) end, + KL), + ?assertMatch(KLSorted, to_list(SkipList)). + +empty_skiplist_size_test() -> + ?assertMatch(0, leveled_skiplist:size(empty(false))), + ?assertMatch(0, leveled_skiplist:size(empty(true))). -endif. 
\ No newline at end of file
diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl
new file mode 100644
index 0000000..166d616
--- /dev/null
+++ b/src/leveled_tinybloom.erl
@@ -0,0 +1,151 @@
+%% -------- TINY BLOOM ---------
+%%
+%% For sheltering relatively expensive lookups with a probabilistic check
+%%
+%% Uses multiple 256 byte blooms.  Can sensibly hold up to 1000 keys per array.
+%% Even at 1000 keys should still offer only a 20% false positive rate
+%%
+%% Restricted to no more than 256 arrays - so can't handle more than 250K keys
+%% in total
+%%
+%% Implemented this way to make it easy to control the false positive rate
+%% (just by setting the width).  Also only requires binary manipulations of
+%% a single hash
+
+-module(leveled_tinybloom).
+
+-include("include/leveled.hrl").
+
+-export([
+        enter/2,
+        check/2,
+        empty/1
+        ]).
+
+-include_lib("eunit/include/eunit.hrl").
+
+%%%============================================================================
+%%% Bloom API
+%%%============================================================================
+
+
+empty(Width) when Width =< 256 ->
+    FoldFun = fun(X, Acc) -> dict:store(X, <<0:4096>>, Acc) end,
+    lists:foldl(FoldFun, dict:new(), lists:seq(0, Width - 1)).
+
+enter({hash, Hash}, Bloom) ->
+    {H0, Bit1, Bit2} = split_hash(Hash),
+    Slot = H0 rem dict:size(Bloom),
+    BitArray0 = dict:fetch(Slot, Bloom),
+    BitArray1 = lists:foldl(fun add_to_array/2,
+                            BitArray0,
+                            lists:usort([Bit1, Bit2])),
+    dict:store(Slot, BitArray1, Bloom);
+enter(Key, Bloom) ->
+    Hash = leveled_codec:magic_hash(Key),
+    enter({hash, Hash}, Bloom).
+
+check({hash, Hash}, Bloom) ->
+    {H0, Bit1, Bit2} = split_hash(Hash),
+    Slot = H0 rem dict:size(Bloom),
+    BitArray = dict:fetch(Slot, Bloom),
+    case getbit(Bit1, BitArray) of
+        <<0:1>> ->
+            false;
+        <<1:1>> ->
+            case getbit(Bit2, BitArray) of
+                <<0:1>> ->
+                    false;
+                <<1:1>> ->
+                    true
+            end
+    end;
+check(Key, Bloom) ->
+    Hash = leveled_codec:magic_hash(Key),
+    check({hash, Hash}, Bloom).
+
+%%%============================================================================
+%%% Internal Functions
+%%%============================================================================
+
+split_hash(Hash) ->
+    H0 = Hash band 255,
+    H1 = (Hash bsr 8) band 4095,
+    H2 = Hash bsr 20,
+    {H0, H1, H2}.
+
+add_to_array(Bit, BitArray) ->
+    RestLen = 4096 - Bit - 1,
+    <<Head:Bit/bitstring,
+        _B:1/bitstring,
+        Rest:RestLen/bitstring>> = BitArray,
+    <<Head/bitstring, 1:1, Rest/bitstring>>.
+
+getbit(Bit, BitArray) ->
+    RestLen = 4096 - Bit - 1,
+    <<_Head:Bit/bitstring,
+        B:1/bitstring,
+        _Rest:RestLen/bitstring>> = BitArray,
+    B.
+
+
+%%%============================================================================
+%%% Test
+%%%============================================================================
+
+-ifdef(TEST).
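%% A worked example (hypothetical hash value) of how split_hash/1
%% divides a single 32-bit magic hash: the low 8 bits select the bloom
%% slot, while the next 12 bits and the top 12 bits give two bit
%% positions within that slot's 4096-bit array:
%%
%% {16#12, 16#DEF, 16#ABC} = split_hash(16#ABCDEF12),
%% %% i.e. slot selector 18 with bit positions 3567 and 2748; in a
%% %% width-4 bloom the key therefore lands in slot 18 rem 4 = 2.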
+ +simple_test() -> + N = 4000, + W = 4, + KLin = lists:map(fun(X) -> "Key_" ++ + integer_to_list(X) ++ + integer_to_list(random:uniform(100)) ++ + binary_to_list(crypto:rand_bytes(2)) + end, + lists:seq(1, N)), + KLout = lists:map(fun(X) -> + "NotKey_" ++ + integer_to_list(X) ++ + integer_to_list(random:uniform(100)) ++ + binary_to_list(crypto:rand_bytes(2)) + end, + lists:seq(1, N)), + SW0_PH = os:timestamp(), + lists:foreach(fun(X) -> erlang:phash2(X) end, KLin), + io:format(user, + "~nNative hash function hashes ~w keys in ~w microseconds~n", + [N, timer:now_diff(os:timestamp(), SW0_PH)]), + SW0_MH = os:timestamp(), + lists:foreach(fun(X) -> leveled_codec:magic_hash(X) end, KLin), + io:format(user, + "~nMagic hash function hashes ~w keys in ~w microseconds~n", + [N, timer:now_diff(os:timestamp(), SW0_MH)]), + + SW1 = os:timestamp(), + Bloom = lists:foldr(fun enter/2, empty(W), KLin), + io:format(user, + "~nAdding ~w keys to bloom took ~w microseconds~n", + [N, timer:now_diff(os:timestamp(), SW1)]), + + SW2 = os:timestamp(), + lists:foreach(fun(X) -> ?assertMatch(true, check(X, Bloom)) end, KLin), + io:format(user, + "~nChecking ~w keys in bloom took ~w microseconds~n", + [N, timer:now_diff(os:timestamp(), SW2)]), + + SW3 = os:timestamp(), + FP = lists:foldr(fun(X, Acc) -> case check(X, Bloom) of + true -> Acc + 1; + false -> Acc + end end, + 0, + KLout), + io:format(user, + "~nChecking ~w keys out of bloom took ~w microseconds " ++ + "with ~w false positive rate~n", + [N, timer:now_diff(os:timestamp(), SW3), FP / N]), + ?assertMatch(true, FP < (N div 4)). + + +-endif. \ No newline at end of file From ccc993383d93078cdf8a39a8042ba25d33b89c39 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Sun, 11 Dec 2016 01:21:53 +0000 Subject: [PATCH 12/34] Stop second hash on fetch_head The bookie should magic_hash for fetch_head, and now passes the hash to the Penciller so second hash not required. --- src/leveled_bookie.erl | 23 +++++++++++++++-------- src/leveled_log.erl | 2 -- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl index 62892ec..412eec0 100644 --- a/src/leveled_bookie.erl +++ b/src/leveled_bookie.erl @@ -728,15 +728,22 @@ startup(InkerOpts, PencillerOpts) -> fetch_head(Key, Penciller, LedgerCache) -> - case leveled_skiplist:lookup(Key, LedgerCache#ledger_cache.skiplist) of - {value, Head} -> - Head; - none -> - case leveled_penciller:pcl_fetch(Penciller, Key) of - {Key, Head} -> + Hash = leveled_codec:magic_hash(Key), + if + Hash /= no_lookup -> + L0R = leveled_skiplist:lookup(Key, + Hash, + LedgerCache#ledger_cache.skiplist), + case L0R of + {value, Head} -> Head; - not_present -> - not_present + none -> + case leveled_penciller:pcl_fetch(Penciller, Key, Hash) of + {Key, Head} -> + Head; + not_present -> + not_present + end end end. 
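The effect is that a HEAD request now hashes its key exactly once: fetch_head computes the magic hash, probes the ledger cache's skiplist (and so its bloom) with it, and passes the same hash on to the penciller, which reuses it against the L0 cache. A condensed sketch of the flow, with illustrative names and the penciller's raw reply returned as-is:

head_path(Key, Penciller, SkipList) ->
    Hash = leveled_codec:magic_hash(Key),
    case leveled_skiplist:lookup(Key, Hash, SkipList) of
        {value, Head} ->
            Head;
        none ->
            leveled_penciller:pcl_fetch(Penciller, Key, Hash)
    end.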
diff --git a/src/leveled_log.erl b/src/leveled_log.erl index fa26555..a10e641 100644 --- a/src/leveled_log.erl +++ b/src/leveled_log.erl @@ -198,8 +198,6 @@ {"IC011", {info, "Not clearing filename ~s as modified delta is only ~w seconds"}}, - {"PM001", - {info, "Indexed new cache entry with total L0 cache size now ~w"}}, {"PM002", {info, "Completed dump of L0 cache to list of size ~w"}}, From 1f38bcb3285288a23107949520c98d13e165dcc6 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Sun, 11 Dec 2016 01:32:32 +0000 Subject: [PATCH 13/34] Magic Hash vs phash2 Magic Hash broke Skip List organisation --- src/leveled_skiplist.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/leveled_skiplist.erl b/src/leveled_skiplist.erl index 17da98c..142afc5 100644 --- a/src/leveled_skiplist.erl +++ b/src/leveled_skiplist.erl @@ -57,7 +57,7 @@ enter(Key, Hash, Value, SkipList) -> leveled_tinybloom:enter({hash, Hash}, Bloom) end, {Bloom0, - enter(Key, Value, Hash, + enter(Key, Value, erlang:phash2(Key), element(2, SkipList), ?SKIP_WIDTH, ?LIST_HEIGHT)}. From 5d11bc051f9d1a630d82ba859dc68097331bc5f9 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Sun, 11 Dec 2016 01:49:03 +0000 Subject: [PATCH 14/34] Allow for more fluctuation in L0 write time Try to alleviate existing co-ordination issue when all vnodes tend to try and write L0 files concurrently --- src/leveled_penciller.erl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl index a1ab9b1..93a9094 100644 --- a/src/leveled_penciller.erl +++ b/src/leveled_penciller.erl @@ -197,10 +197,10 @@ -define(CURRENT_FILEX, "crr"). -define(PENDING_FILEX, "pnd"). -define(MEMTABLE, mem). --define(MAX_TABLESIZE, 32000). +-define(MAX_TABLESIZE, 28000). % This is less than max - but COIN_SIDECOUNT -define(PROMPT_WAIT_ONL0, 5). -define(WORKQUEUE_BACKLOG_TOLERANCE, 4). --define(COIN_SIDECOUNT, 2). +-define(COIN_SIDECOUNT, 4). -record(state, {manifest = [] :: list(), manifest_sqn = 0 :: integer(), From 2c7fdc74d484385aa973e6d658038029048f6565 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Sun, 11 Dec 2016 01:58:25 +0000 Subject: [PATCH 15/34] Setting fiddling Try to find a happy medium --- src/leveled_penciller.erl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl index 93a9094..6cdf265 100644 --- a/src/leveled_penciller.erl +++ b/src/leveled_penciller.erl @@ -197,7 +197,8 @@ -define(CURRENT_FILEX, "crr"). -define(PENDING_FILEX, "pnd"). -define(MEMTABLE, mem). --define(MAX_TABLESIZE, 28000). % This is less than max - but COIN_SIDECOUNT +-define(MAX_TABLESIZE, 25000). % This is less than max - but COIN_SIDECOUNT +-define(SUPER_MAX_TABLE_SIZE, 45000) -define(PROMPT_WAIT_ONL0, 5). -define(WORKQUEUE_BACKLOG_TOLERANCE, 4). -define(COIN_SIDECOUNT, 4). 
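Combined, these defines mean an L0 write is normally attempted only when the cache is past ?MAX_TABLESIZE, level zero is free, and a roughly 1-in-?COIN_SIDECOUNT coin toss succeeds, with ?SUPER_MAX_TABLE_SIZE acting as an override so a persistently unlucky cache cannot grow without bound; the hunk that follows wires this in. As a sketch, using the variable names from update_levelzero:

ShouldRoll =
    CacheTooBig andalso Level0Free
        andalso (RandomFactor orelse CacheMuchTooBig)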
@@ -645,6 +646,7 @@ update_levelzero(L0Size, {PushedTree, MinSQN, MaxSQN}, levelzero_size=NewL0Size, ledger_sqn=UpdMaxSQN}, CacheTooBig = NewL0Size > State#state.levelzero_maxcachesize, + CacheMuchTooBig = NewL0Size > ?SUPER_MAX_TABLE_SIZE, Level0Free = length(get_item(0, State#state.manifest, [])) == 0, RandomFactor = case State#state.levelzero_cointoss of @@ -658,7 +660,7 @@ update_levelzero(L0Size, {PushedTree, MinSQN, MaxSQN}, false -> true end, - case {CacheTooBig, Level0Free, RandomFactor} of + case {CacheTooBig, Level0Free, RandomFactor or CacheMuchTooBig} of {true, true, true} -> L0Constructor = roll_memory(UpdState, false), UpdState#state{levelzero_pending=true, From ea8f3c07a7d52b16858f7055d6a7c90e4f56e57e Mon Sep 17 00:00:00 2001 From: martinsumner Date: Sun, 11 Dec 2016 02:00:19 +0000 Subject: [PATCH 16/34] oops --- src/leveled_penciller.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl index 6cdf265..b937d30 100644 --- a/src/leveled_penciller.erl +++ b/src/leveled_penciller.erl @@ -198,7 +198,7 @@ -define(PENDING_FILEX, "pnd"). -define(MEMTABLE, mem). -define(MAX_TABLESIZE, 25000). % This is less than max - but COIN_SIDECOUNT --define(SUPER_MAX_TABLE_SIZE, 45000) +-define(SUPER_MAX_TABLE_SIZE, 45000). -define(PROMPT_WAIT_ONL0, 5). -define(WORKQUEUE_BACKLOG_TOLERANCE, 4). -define(COIN_SIDECOUNT, 4). From 523716e8f2bcbc4cf0df16268b84d5367d45dd70 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Sun, 11 Dec 2016 04:48:50 +0000 Subject: [PATCH 17/34] Add tiny bloom to Penciller Manifest This is an attempt to save on unnecessary message transfers, and slightly more expensive GCS checks in the SFT file itself. --- include/leveled.hrl | 1 + src/leveled_pclerk.erl | 2 + src/leveled_penciller.erl | 41 ++++---- src/leveled_sft.erl | 195 ++++++++++++++++++++++++-------------- src/leveled_tinybloom.erl | 4 + 5 files changed, 153 insertions(+), 90 deletions(-) diff --git a/include/leveled.hrl b/include/leveled.hrl index 25216f6..f57ffd4 100644 --- a/include/leveled.hrl +++ b/include/leveled.hrl @@ -41,6 +41,7 @@ {start_key :: tuple(), end_key :: tuple(), owner :: pid(), + bloom, filename :: string()}). -record(cdb_options, diff --git a/src/leveled_pclerk.erl b/src/leveled_pclerk.erl index 272071d..649973b 100644 --- a/src/leveled_pclerk.erl +++ b/src/leveled_pclerk.erl @@ -320,6 +320,7 @@ do_merge(KL1, KL2, {SrcLevel, IsB}, {Filepath, MSN}, FileCounter, OutList) -> KL1, KL2, LevelR), + {ok, Bloom} = leveled_sft:sft_getbloom(Pid), case Reply of {{[], []}, null, _} -> leveled_log:log("PC013", [FileName]), @@ -331,6 +332,7 @@ do_merge(KL1, KL2, {SrcLevel, IsB}, {Filepath, MSN}, FileCounter, OutList) -> [#manifest_entry{start_key=SmallestKey, end_key=HighestKey, owner=Pid, + bloom=Bloom, filename=FileName}]), leveled_log:log_timer("PC015", [], TS1), do_merge(KL1Rem, KL2Rem, diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl index b937d30..ee0b921 100644 --- a/src/leveled_penciller.erl +++ b/src/leveled_penciller.erl @@ -175,7 +175,7 @@ pcl_checksequencenumber/4, pcl_workforclerk/1, pcl_promptmanifestchange/2, - pcl_confirml0complete/4, + pcl_confirml0complete/5, pcl_confirmdelete/2, pcl_close/1, pcl_doom/1, @@ -285,8 +285,8 @@ pcl_workforclerk(Pid) -> pcl_promptmanifestchange(Pid, WI) -> gen_server:cast(Pid, {manifest_change, WI}). -pcl_confirml0complete(Pid, FN, StartKey, EndKey) -> - gen_server:cast(Pid, {levelzero_complete, FN, StartKey, EndKey}). 
+pcl_confirml0complete(Pid, FN, StartKey, EndKey, Bloom) -> + gen_server:cast(Pid, {levelzero_complete, FN, StartKey, EndKey, Bloom}). pcl_confirmdelete(Pid, FileName) -> gen_server:cast(Pid, {confirm_delete, FileName}). @@ -454,10 +454,11 @@ handle_cast({confirm_delete, FileName}, State=#state{is_snapshot=Snap}) _ -> {noreply, State} end; -handle_cast({levelzero_complete, FN, StartKey, EndKey}, State) -> +handle_cast({levelzero_complete, FN, StartKey, EndKey, Bloom}, State) -> leveled_log:log("P0029", []), ManEntry = #manifest_entry{start_key=StartKey, end_key=EndKey, + bloom=Bloom, owner=State#state.levelzero_constructor, filename=FN}, UpdMan = lists:keystore(0, 1, State#state.manifest, {0, [ManEntry]}), @@ -721,34 +722,40 @@ fetch_mem(Key, Hash, Manifest, L0Cache) -> L0Check = leveled_pmem:check_levelzero(Key, Hash, L0Cache), case L0Check of {false, not_found} -> - fetch(Key, Manifest, 0, fun leveled_sft:sft_get/2); + fetch(Key, Hash, Manifest, 0, fun leveled_sft:sft_get/2); {true, KV} -> KV end. -fetch(_Key, _Manifest, ?MAX_LEVELS + 1, _FetchFun) -> +fetch(_Key, _Hash, _Manifest, ?MAX_LEVELS + 1, _FetchFun) -> not_present; -fetch(Key, Manifest, Level, FetchFun) -> +fetch(Key, Hash, Manifest, Level, FetchFun) -> LevelManifest = get_item(Level, Manifest, []), case lists:foldl(fun(File, Acc) -> case Acc of not_present when Key >= File#manifest_entry.start_key, File#manifest_entry.end_key >= Key -> - File#manifest_entry.owner; - PidFound -> - PidFound + {File#manifest_entry.owner, + File#manifest_entry.bloom}; + FoundDetails -> + FoundDetails end end, not_present, LevelManifest) of not_present -> - fetch(Key, Manifest, Level + 1, FetchFun); - FileToCheck -> - case FetchFun(FileToCheck, Key) of - not_present -> - fetch(Key, Manifest, Level + 1, FetchFun); - ObjectFound -> - ObjectFound + fetch(Key, Hash, Manifest, Level + 1, FetchFun); + {FileToCheck, Bloom} -> + case leveled_tinybloom:check({hash, Hash}, Bloom) of + true -> + case FetchFun(FileToCheck, Key) of + not_present -> + fetch(Key, Hash, Manifest, Level + 1, FetchFun); + ObjectFound -> + ObjectFound + end; + false -> + fetch(Key, Hash, Manifest, Level + 1, FetchFun) end end. diff --git a/src/leveled_sft.erl b/src/leveled_sft.erl index 4c86dff..4f67adf 100644 --- a/src/leveled_sft.erl +++ b/src/leveled_sft.erl @@ -161,6 +161,7 @@ sft_newfroml0cache/4, sft_open/1, sft_get/2, + sft_getbloom/1, sft_getkvrange/4, sft_close/1, sft_clear/1, @@ -189,6 +190,7 @@ -define(HEADER_LEN, 56). -define(ITERATOR_SCANWIDTH, 1). -define(MERGE_SCANWIDTH, 32). +-define(BLOOM_WIDTH, 48). -define(DELETE_TIMEOUT, 10000). -define(MAX_KEYS, ?SLOT_COUNT * ?BLOCK_COUNT * ?BLOCK_SIZE). -define(DISCARD_EXT, ".discarded"). @@ -211,7 +213,8 @@ handle :: file:fd(), background_complete = false :: boolean(), oversized_file = false :: boolean(), - penciller :: pid()}). + penciller :: pid(), + bloom}). %%%============================================================================ @@ -268,6 +271,9 @@ sft_open(Filename) -> sft_setfordelete(Pid, Penciller) -> gen_fsm:sync_send_event(Pid, {set_for_delete, Penciller}, infinity). +sft_getbloom(Pid) -> + gen_fsm:sync_send_event(Pid, get_bloom, infinity). + sft_get(Pid, Key) -> gen_fsm:sync_send_event(Pid, {get_kv, Key}, infinity). 
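Since a tiny bloom can return false positives but never false negatives, the gate added to fetch/5 above is safe: a true from leveled_tinybloom:check/2 may still end in not_present after the sft_get, but a false can never hide a key the file actually holds, so each skipped file costs only the in-memory check. The per-file probe reduces to this shape (illustrative atom for the miss case):

case leveled_tinybloom:check({hash, Hash}, Bloom) of
    false ->
        try_next_level;                     % guaranteed absent here
    true ->
        leveled_sft:sft_get(Owner, Key)     % may still be not_present
end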
@@ -342,8 +348,9 @@ starting({sft_newfroml0cache, Filename, Slots, FetchFun, PCL}, _State) -> leveled_penciller:pcl_confirml0complete(PCL, State#state.filename, State#state.smallest_key, - State#state.highest_key), - {next_state, reader, State} + State#state.highest_key, + State#state.bloom), + {next_state, reader, State#state{bloom=none}} end. @@ -378,6 +385,12 @@ reader(background_complete, _From, State) -> reader, State} end; +reader(get_bloom, _From, State) -> + Bloom = State#state.bloom, + if + Bloom /= none -> + {reply, {ok, Bloom}, reader, State#state{bloom=none}} + end; reader(close, _From, State) -> ok = file:close(State#state.handle), {stop, normal, ok, State}. @@ -510,7 +523,7 @@ open_file(FileMD) -> Slen:32/integer>> = HeaderLengths, {ok, SummaryBin} = file:pread(Handle, ?HEADER_LEN + Blen + Ilen + Flen, Slen), - {{LowSQN, HighSQN}, {LowKey, HighKey}} = binary_to_term(SummaryBin), + {{LowSQN, HighSQN}, {LowKey, HighKey}, Bloom} = binary_to_term(SummaryBin), {ok, SlotIndexBin} = file:pread(Handle, ?HEADER_LEN + Blen, Ilen), SlotIndex = binary_to_term(SlotIndexBin), {Handle, FileMD#state{slot_index=SlotIndex, @@ -523,7 +536,8 @@ open_file(FileMD) -> filter_pointer=?HEADER_LEN + Blen + Ilen, summ_pointer=?HEADER_LEN + Blen + Ilen + Flen, summ_length=Slen, - handle=Handle}}. + handle=Handle, + bloom=Bloom}}. %% Take a file handle with a previously created header and complete it based on %% the two key lists KL1 and KL2 @@ -531,10 +545,11 @@ complete_file(Handle, FileMD, KL1, KL2, LevelR) -> complete_file(Handle, FileMD, KL1, KL2, LevelR, false). complete_file(Handle, FileMD, KL1, KL2, LevelR, Rename) -> + EmptyBloom = leveled_tinybloom:empty(?BLOOM_WIDTH), {ok, KeyRemainders} = write_keys(Handle, maybe_expand_pointer(KL1), maybe_expand_pointer(KL2), - [], <<>>, + [], <<>>, EmptyBloom, LevelR, fun sftwrite_function/2), {ReadHandle, UpdFileMD} = case Rename of @@ -769,12 +784,12 @@ get_nextkeyaftermatch([_KTuple|T], KeyToFind, PrevV) -> write_keys(Handle, KL1, KL2, - SlotIndex, SerialisedSlots, + SlotIndex, SerialisedSlots, InitialBloom, LevelR, WriteFun) -> write_keys(Handle, KL1, KL2, {0, 0}, - SlotIndex, SerialisedSlots, + SlotIndex, SerialisedSlots, InitialBloom, {infinity, 0}, null, {last, null}, LevelR, WriteFun). 
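
The bloom travels inside the summary term, so the file layout is otherwise untouched. A minimal sketch of the round-trip, assuming the pointer and length values derived from the header as in open_file/1 above and sftwrite_function below (the *_sketch names are illustrative, not module functions):

    summary_to_disk_sketch(SNExtremes, KeyExtremes, Bloom) ->
        %% The summary is a plain term; the length is recorded in the header
        Summary = term_to_binary({SNExtremes, KeyExtremes, Bloom}),
        {Summary, byte_size(Summary)}.

    summary_from_disk_sketch(Handle, SummPointer, SummLength) ->
        {ok, SummaryBin} = file:pread(Handle, SummPointer, SummLength),
        {_SNExtremes, _KeyExtremes, Bloom} = binary_to_term(SummaryBin),
        Bloom.
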
@@ -782,7 +797,7 @@ write_keys(Handle, write_keys(Handle, KL1, KL2, {SlotCount, SlotTotal}, - SlotIndex, SerialisedSlots, + SlotIndex, SerialisedSlots, Bloom, {LSN, HSN}, LowKey, LastKey, LevelR, WriteFun) when SlotCount =:= ?SLOT_GROUPWRITE_COUNT -> @@ -791,26 +806,27 @@ write_keys(Handle, reached -> {complete_keywrite(UpdHandle, SlotIndex, - {LSN, HSN}, {LowKey, LastKey}, + {{LSN, HSN}, {LowKey, LastKey}, Bloom}, WriteFun), {KL1, KL2}}; continue -> write_keys(UpdHandle, KL1, KL2, {0, SlotTotal}, - SlotIndex, <<>>, + SlotIndex, <<>>, Bloom, {LSN, HSN}, LowKey, LastKey, LevelR, WriteFun) end; write_keys(Handle, KL1, KL2, {SlotCount, SlotTotal}, - SlotIndex, SerialisedSlots, + SlotIndex, SerialisedSlots, Bloom, {LSN, HSN}, LowKey, LastKey, LevelR, WriteFun) -> - SlotOutput = create_slot(KL1, KL2, LevelR), + SlotOutput = create_slot(KL1, KL2, LevelR, Bloom), {{LowKey_Slot, SegFilter, SerialisedSlot, LengthList}, {{LSN_Slot, HSN_Slot}, LastKey_Slot, Status}, + UpdBloom, KL1rem, KL2rem} = SlotOutput, UpdSlotIndex = lists:append(SlotIndex, [{LowKey_Slot, SegFilter, LengthList}]), @@ -829,34 +845,34 @@ write_keys(Handle, UpdHandle = WriteFun(slots , {Handle, UpdSlots}), {complete_keywrite(UpdHandle, UpdSlotIndex, - SNExtremes, {FirstKey, FinalKey}, + {SNExtremes, {FirstKey, FinalKey}, UpdBloom}, WriteFun), {KL1rem, KL2rem}}; full -> write_keys(Handle, KL1rem, KL2rem, {SlotCount + 1, SlotTotal + 1}, - UpdSlotIndex, UpdSlots, + UpdSlotIndex, UpdSlots, UpdBloom, SNExtremes, FirstKey, FinalKey, LevelR, WriteFun); complete -> UpdHandle = WriteFun(slots , {Handle, UpdSlots}), {complete_keywrite(UpdHandle, UpdSlotIndex, - SNExtremes, {FirstKey, FinalKey}, + {SNExtremes, {FirstKey, FinalKey}, UpdBloom}, WriteFun), {KL1rem, KL2rem}} end. -complete_keywrite(Handle, SlotIndex, - SNExtremes, {FirstKey, FinalKey}, +complete_keywrite(Handle, + SlotIndex, + {SNExtremes, {FirstKey, FinalKey}, Bloom}, WriteFun) -> ConvSlotIndex = convert_slotindex(SlotIndex), WriteFun(finalise, {Handle, ConvSlotIndex, - SNExtremes, - {FirstKey, FinalKey}}). + {SNExtremes, {FirstKey, FinalKey}, Bloom}}). %% Take a slot index, and remove the SegFilters replacing with pointers @@ -884,16 +900,15 @@ sftwrite_function(slots, {Handle, SerialisedSlots}) -> Handle; sftwrite_function(finalise, {Handle, - {SlotFilters, PointerIndex}, - SNExtremes, - KeyExtremes}) -> + {SlotFilters, PointerIndex}, + {SNExtremes, KeyExtremes, Bloom}}) -> {ok, Position} = file:position(Handle, cur), BlocksLength = Position - ?HEADER_LEN, Index = term_to_binary(PointerIndex), IndexLength = byte_size(Index), FilterLength = byte_size(SlotFilters), - Summary = term_to_binary({SNExtremes, KeyExtremes}), + Summary = term_to_binary({SNExtremes, KeyExtremes, Bloom}), SummaryLength = byte_size(Summary), %% Write Index, Filter and Summary ok = file:write(Handle, < %% Also this should return a partial block if the KeyLists have been exhausted %% but the block is full -create_block(KeyList1, KeyList2, LevelR) -> - create_block(KeyList1, KeyList2, [], {infinity, 0}, [], LevelR). +create_block(KeyList1, KeyList2, LevelR, Bloom) -> + create_block(KeyList1, KeyList2, [], {infinity, 0}, [], LevelR, Bloom). 
create_block(KeyList1, KeyList2, - BlockKeyList, {LSN, HSN}, SegmentList, _LevelR) + BlockKeyList, {LSN, HSN}, SegmentList, _LevelR, Bloom) when length(BlockKeyList)==?BLOCK_SIZE -> case {KeyList1, KeyList2} of {[], []} -> - {BlockKeyList, complete, {LSN, HSN}, SegmentList, [], []}; + {BlockKeyList, complete, {LSN, HSN}, SegmentList, + Bloom, + [], []}; _ -> - {BlockKeyList, full, {LSN, HSN}, SegmentList, KeyList1, KeyList2} + {BlockKeyList, full, {LSN, HSN}, SegmentList, + Bloom, + KeyList1, KeyList2} end; -create_block([], [], - BlockKeyList, {LSN, HSN}, SegmentList, _LevelR) -> - {BlockKeyList, partial, {LSN, HSN}, SegmentList, [], []}; +create_block([], [], BlockKeyList, {LSN, HSN}, SegmentList, _LevelR, Bloom) -> + {BlockKeyList, partial, {LSN, HSN}, SegmentList, + Bloom, + [], []}; create_block(KeyList1, KeyList2, - BlockKeyList, {LSN, HSN}, SegmentList, LevelR) -> + BlockKeyList, {LSN, HSN}, SegmentList, LevelR, Bloom) -> case key_dominates(KeyList1, KeyList2, {LevelR#level.is_basement, LevelR#level.timestamp}) of {{next_key, TopKey}, Rem1, Rem2} -> - {UpdLSN, UpdHSN} = update_sequencenumbers(TopKey, LSN, HSN), + {_K, V} = TopKey, + {SQN, _St, MH, _MD} = leveled_codec:striphead_to_details(V), + {UpdLSN, UpdHSN} = update_sequencenumbers(SQN, LSN, HSN), + UpdBloom = leveled_tinybloom:enter({hash, MH}, Bloom), NewBlockKeyList = lists:append(BlockKeyList, [TopKey]), NewSegmentList = lists:append(SegmentList, - [hash_for_segmentid(TopKey)]), + [hash_for_segmentid(TopKey)]), create_block(Rem1, Rem2, NewBlockKeyList, {UpdLSN, UpdHSN}, - NewSegmentList, LevelR); + NewSegmentList, LevelR, UpdBloom); {skipped_key, Rem1, Rem2} -> create_block(Rem1, Rem2, BlockKeyList, {LSN, HSN}, - SegmentList, LevelR) + SegmentList, LevelR, Bloom) end. @@ -996,33 +1019,43 @@ create_block(KeyList1, KeyList2, %% - Remainder of any KeyLists used to make the slot -create_slot(KeyList1, KeyList2, Level) -> - create_slot(KeyList1, KeyList2, Level, ?BLOCK_COUNT, [], <<>>, [], - {null, infinity, 0, null, full}). +create_slot(KeyList1, KeyList2, Level, Bloom) -> + create_slot(KeyList1, KeyList2, Level, ?BLOCK_COUNT, Bloom, + [], <<>>, [], + {null, infinity, 0, null, full}). 
%% Keep adding blocks to the slot until either the block count is reached or %% there is a partial block -create_slot(KL1, KL2, _, 0, SegLists, SerialisedSlot, LengthList, - {LowKey, LSN, HSN, LastKey, Status}) -> +create_slot(KL1, KL2, _, 0, Bloom, + SegLists, SerialisedSlot, LengthList, + {LowKey, LSN, HSN, LastKey, Status}) -> {{LowKey, generate_segment_filter(SegLists), SerialisedSlot, LengthList}, {{LSN, HSN}, LastKey, Status}, + Bloom, KL1, KL2}; -create_slot(KL1, KL2, _, _, SegLists, SerialisedSlot, LengthList, - {LowKey, LSN, HSN, LastKey, partial}) -> +create_slot(KL1, KL2, _, _, Bloom, + SegLists, SerialisedSlot, LengthList, + {LowKey, LSN, HSN, LastKey, partial}) -> {{LowKey, generate_segment_filter(SegLists), SerialisedSlot, LengthList}, {{LSN, HSN}, LastKey, partial}, + Bloom, KL1, KL2}; -create_slot(KL1, KL2, _, _, SegLists, SerialisedSlot, LengthList, - {LowKey, LSN, HSN, LastKey, complete}) -> +create_slot(KL1, KL2, _, _, Bloom, + SegLists, SerialisedSlot, LengthList, + {LowKey, LSN, HSN, LastKey, complete}) -> {{LowKey, generate_segment_filter(SegLists), SerialisedSlot, LengthList}, {{LSN, HSN}, LastKey, partial}, + Bloom, KL1, KL2}; -create_slot(KL1, KL2, LevelR, BlockCount, SegLists, SerialisedSlot, LengthList, - {LowKey, LSN, HSN, LastKey, _Status}) -> +create_slot(KL1, KL2, LevelR, BlockCount, Bloom, + SegLists, SerialisedSlot, LengthList, + {LowKey, LSN, HSN, LastKey, _Status}) -> {BlockKeyList, Status, {LSNb, HSNb}, - SegmentList, KL1b, KL2b} = create_block(KL1, KL2, LevelR), + SegmentList, + UpdBloom, + KL1b, KL2b} = create_block(KL1, KL2, LevelR, Bloom), TrackingMetadata = case {LowKey, BlockKeyList} of {null, []} -> {null, LSN, HSN, LastKey, Status}; @@ -1043,9 +1076,10 @@ create_slot(KL1, KL2, LevelR, BlockCount, SegLists, SerialisedSlot, LengthList, SerialisedBlock = serialise_block(BlockKeyList), BlockLength = byte_size(SerialisedBlock), SerialisedSlot2 = <>, - create_slot(KL1b, KL2b, LevelR, BlockCount - 1, SegLists ++ [SegmentList], - SerialisedSlot2, LengthList ++ [BlockLength], - TrackingMetadata). + SegList2 = SegLists ++ [SegmentList], + create_slot(KL1b, KL2b, LevelR, BlockCount - 1, UpdBloom, + SegList2, SerialisedSlot2, LengthList ++ [BlockLength], + TrackingMetadata). serialise_block(BlockKeyList) -> term_to_binary(BlockKeyList, [{compressed, ?COMPRESSION_LEVEL}]). 
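
As the reworked create_block and create_slot clauses show, the bloom is just another accumulator threaded through the fold over keys. Stripped of the block bookkeeping, the pattern reduces to this illustrative sketch (build_bloom_sketch/1 is hypothetical, reusing the ?BLOOM_WIDTH sizing above):

    build_bloom_sketch(Keys) ->
        %% enter/2 hashes each key and sets its bits in the accumulator
        lists:foldl(fun leveled_tinybloom:enter/2,
                    leveled_tinybloom:empty(?BLOOM_WIDTH),
                    Keys).

In the real create_block/7 it is the metadata hash that is entered, and the new enter({hash, no_lookup}, Bloom) clause means keys that cannot be looked up leave the bloom untouched.
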
@@ -1133,8 +1167,6 @@ pointer_append_queryresults(Results, QueryPid) -> %% Update the sequence numbers -update_sequencenumbers(Item, LSN, HSN) when is_tuple(Item) -> - update_sequencenumbers(leveled_codec:strip_to_seqonly(Item), LSN, HSN); update_sequencenumbers(SN, infinity, 0) -> {SN, SN}; update_sequencenumbers(SN, LSN, HSN) when SN < LSN -> @@ -1433,9 +1465,11 @@ simple_create_block_test() -> {2, {active, infinity}, no_lookup, null}}], KeyList2 = [{{o, "Bucket1", "Key2", null}, {3, {active, infinity}, no_lookup, null}}], - {MergedKeyList, ListStatus, SN, _, _, _} = create_block(KeyList1, - KeyList2, - #level{level=1}), + BlockOutput = create_block(KeyList1, + KeyList2, + #level{level=1}, + leveled_tinybloom:empty(4)), + {MergedKeyList, ListStatus, SN, _, _, _, _} = BlockOutput, ?assertMatch(partial, ListStatus), [H1|T1] = MergedKeyList, ?assertMatch({{o, "Bucket1", "Key1", null}, @@ -1454,9 +1488,11 @@ dominate_create_block_test() -> {2, {active, infinity}, no_lookup, null}}], KeyList2 = [{{o, "Bucket1", "Key2", null}, {3, {tomb, infinity}, no_lookup, null}}], - {MergedKeyList, ListStatus, SN, _, _, _} = create_block(KeyList1, - KeyList2, - #level{level=1}), + BlockOutput = create_block(KeyList1, + KeyList2, + #level{level=1}, + leveled_tinybloom:empty(4)), + {MergedKeyList, ListStatus, SN, _, _, _, _} = BlockOutput, ?assertMatch(partial, ListStatus), [K1, K2] = MergedKeyList, ?assertMatch(K1, lists:nth(1, KeyList1)), @@ -1502,9 +1538,11 @@ sample_keylist() -> alternating_create_block_test() -> {KeyList1, KeyList2} = sample_keylist(), - {MergedKeyList, ListStatus, _, _, _, _} = create_block(KeyList1, - KeyList2, - #level{level=1}), + BlockOutput = create_block(KeyList1, + KeyList2, + #level{level=1}, + leveled_tinybloom:empty(4)), + {MergedKeyList, ListStatus, _SN, _, _, _, _} = BlockOutput, BlockSize = length(MergedKeyList), ?assertMatch(BlockSize, 32), ?assertMatch(ListStatus, complete), @@ -1515,10 +1553,11 @@ alternating_create_block_test() -> K32 = lists:nth(32, MergedKeyList), ?assertMatch(K32, {{o, "Bucket4", "Key1", null}, {1, {active, infinity}, 0, null}}), HKey = {{o, "Bucket1", "Key0", null}, {1, {active, infinity}, 0, null}}, - {_, ListStatus2, _, _, _, _} = create_block([HKey|KeyList1], - KeyList2, - #level{level=1}), - ?assertMatch(ListStatus2, full). + {_, LStatus2, _, _, _, _, _} = create_block([HKey|KeyList1], + KeyList2, + #level{level=1}, + leveled_tinybloom:empty(4)), + ?assertMatch(full, LStatus2). 
merge_seglists_test() -> @@ -1655,9 +1694,13 @@ merge_seglists_test() -> createslot_stage1_test() -> {KeyList1, KeyList2} = sample_keylist(), - Out = create_slot(KeyList1, KeyList2, #level{level=1}), + Out = create_slot(KeyList1, + KeyList2, + #level{level=1}, + leveled_tinybloom:empty(4)), {{LowKey, SegFilter, _SerialisedSlot, _LengthList}, {{LSN, HSN}, LastKey, Status}, + _UpdBloom, KL1, KL2} = Out, ?assertMatch(LowKey, {o, "Bucket1", "Key1", null}), ?assertMatch(LastKey, {o, "Bucket4", "Key1", null}), @@ -1678,9 +1721,11 @@ createslot_stage1_test() -> createslot_stage2_test() -> Out = create_slot(lists:sort(generate_randomkeys(100)), lists:sort(generate_randomkeys(100)), - #level{level=1}), + #level{level=1}, + leveled_tinybloom:empty(4)), {{_LowKey, _SegFilter, SerialisedSlot, LengthList}, {{_LSN, _HSN}, _LastKey, Status}, + _UpdBloom, _KL1, _KL2} = Out, ?assertMatch(Status, full), Sum1 = lists:foldl(fun(X, Sum) -> Sum + X end, 0, LengthList), @@ -1691,9 +1736,11 @@ createslot_stage2_test() -> createslot_stage3_test() -> Out = create_slot(lists:sort(generate_sequentialkeys(100, 1)), lists:sort(generate_sequentialkeys(100, 101)), - #level{level=1}), + #level{level=1}, + leveled_tinybloom:empty(4)), {{LowKey, SegFilter, SerialisedSlot, LengthList}, {{_LSN, _HSN}, LastKey, Status}, + _UpdBloom, KL1, KL2} = Out, ?assertMatch(Status, full), Sum1 = lists:foldl(fun(X, Sum) -> Sum + X end, 0, LengthList), @@ -1729,17 +1776,19 @@ createslot_stage3_test() -> testwrite_function(slots, {Handle, SerialisedSlots}) -> lists:append(Handle, [SerialisedSlots]); -testwrite_function(finalise, {Handle, C_SlotIndex, SNExtremes, KeyExtremes}) -> - {Handle, C_SlotIndex, SNExtremes, KeyExtremes}. +testwrite_function(finalise, + {Handle, C_SlotIndex, {SNExtremes, KeyExtremes, Bloom}}) -> + {Handle, C_SlotIndex, SNExtremes, KeyExtremes, Bloom}. writekeys_stage1_test() -> {KL1, KL2} = sample_keylist(), {FunOut, {_KL1Rem, _KL2Rem}} = write_keys([], KL1, KL2, [], <<>>, + leveled_tinybloom:empty(4), #level{level=1}, fun testwrite_function/2), - {Handle, {_, PointerIndex}, SNExtremes, KeyExtremes} = FunOut, + {Handle, {_, PointerIndex}, SNExtremes, KeyExtremes, _Bloom} = FunOut, ?assertMatch(SNExtremes, {1,3}), ?assertMatch(KeyExtremes, {{o, "Bucket1", "Key1", null}, {o, "Bucket4", "Key1", null}}), diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl index 166d616..dd72b6e 100644 --- a/src/leveled_tinybloom.erl +++ b/src/leveled_tinybloom.erl @@ -33,6 +33,8 @@ empty(Width) when Width =< 256 -> FoldFun = fun(X, Acc) -> dict:store(X, <<0:4096>>, Acc) end, lists:foldl(FoldFun, dict:new(), lists:seq(0, Width - 1)). +enter({hash, no_lookup}, Bloom) -> + Bloom; enter({hash, Hash}, Bloom) -> {H0, Bit1, Bit2} = split_hash(Hash), Slot = H0 rem dict:size(Bloom), @@ -45,6 +47,8 @@ enter(Key, Bloom) -> Hash = leveled_codec:magic_hash(Key), enter({hash, Hash}, Bloom). +check({hash, _Hash}, undefined) -> + true; check({hash, Hash}, Bloom) -> {H0, Bit1, Bit2} = split_hash(Hash), Slot = H0 rem dict:size(Bloom), From f848500eff6289f38a703072f07ac4c03ef21f2f Mon Sep 17 00:00:00 2001 From: martinsumner Date: Sun, 11 Dec 2016 04:53:36 +0000 Subject: [PATCH 18/34] Tinker, tinker, tinker, tinker --- src/leveled_penciller.erl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl index ee0b921..431c501 100644 --- a/src/leveled_penciller.erl +++ b/src/leveled_penciller.erl @@ -197,11 +197,11 @@ -define(CURRENT_FILEX, "crr"). -define(PENDING_FILEX, "pnd"). 
-define(MEMTABLE, mem). --define(MAX_TABLESIZE, 25000). % This is less than max - but COIN_SIDECOUNT --define(SUPER_MAX_TABLE_SIZE, 45000). +-define(MAX_TABLESIZE, 28000). % This is less than max - but COIN_SIDECOUNT +-define(SUPER_MAX_TABLE_SIZE, 40000). -define(PROMPT_WAIT_ONL0, 5). -define(WORKQUEUE_BACKLOG_TOLERANCE, 4). --define(COIN_SIDECOUNT, 4). +-define(COIN_SIDECOUNT, 3). -record(state, {manifest = [] :: list(), manifest_sqn = 0 :: integer(), From 8bcb49479df71300a75e5709645b600465424557 Mon Sep 17 00:00:00 2001 From: martinsumner Date: Sun, 11 Dec 2016 05:23:24 +0000 Subject: [PATCH 19/34] Re-introduce ETS Index Add ETS Index back in to avoid having to check each skip list in turn. Also this helps keep a lower skip list size. --- src/leveled_log.erl | 2 ++ src/leveled_penciller.erl | 34 ++++++++++++++++++++++++++++------ src/leveled_pmem.erl | 28 +++++++++++++++++++++++++++- 3 files changed, 57 insertions(+), 7 deletions(-) diff --git a/src/leveled_log.erl b/src/leveled_log.erl index a10e641..f2306ce 100644 --- a/src/leveled_log.erl +++ b/src/leveled_log.erl @@ -104,6 +104,8 @@ {info, "L0 completion confirmed and will transition to not pending"}}, {"P0030", {warn, "We're doomed - intention recorded to destroy all files"}}, + {"P0031", + {info, "Completion of update to levelzero"}}, {"PC001", {info, "Penciller's clerk ~w started with owner ~w"}}, diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl index 431c501..3547342 100644 --- a/src/leveled_penciller.erl +++ b/src/leveled_penciller.erl @@ -219,6 +219,7 @@ levelzero_size = 0 :: integer(), levelzero_maxcachesize :: integer(), levelzero_cointoss = false :: boolean(), + levelzero_index, % may be none or an ETS table reference is_snapshot = false :: boolean(), snapshot_fully_loaded = false :: boolean(), @@ -369,14 +370,16 @@ handle_call({fetch, Key, Hash}, _From, State) -> fetch_mem(Key, Hash, State#state.manifest, - State#state.levelzero_cache), + State#state.levelzero_cache, + State#state.levelzero_index), State}; handle_call({check_sqn, Key, Hash, SQN}, _From, State) -> {reply, compare_to_sqn(fetch_mem(Key, Hash, State#state.manifest, - State#state.levelzero_cache), + State#state.levelzero_cache, + State#state.levelzero_index), SQN), State}; handle_call({fetch_keys, StartKey, EndKey, AccFun, InitAcc, MaxKeys}, @@ -417,6 +420,7 @@ handle_call({load_snapshot, {BookieIncrTree, MinSQN, MaxSQN}}, _From, State) -> {LedgerSQN, L0Size, L0Cache} = L0D, {reply, ok, State#state{levelzero_cache=L0Cache, levelzero_size=L0Size, + levelzero_index=none, ledger_sqn=LedgerSQN, snapshot_fully_loaded=true}}; handle_call({fetch_levelzero, Slot}, _From, State) -> @@ -468,6 +472,7 @@ handle_cast({levelzero_complete, FN, StartKey, EndKey, Bloom}, State) -> levelzero_pending=false, levelzero_constructor=undefined, levelzero_size=0, + levelzero_index=leveled_pmem:new_index(), manifest=UpdMan, persisted_sqn=State#state.ledger_sqn}}. 
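
The levelzero_index added to the loop state is nothing more than an ETS set of bare hash tuples, giving a fast negative test in front of the skip lists (the implementation is in leveled_pmem below). A self-contained sketch of the mechanism:

    l0index_sketch() ->
        Index = ets:new(l0index, [private, set]),
        true = ets:insert(Index, {12345}),
        [{12345}] = ets:lookup(Index, 12345),   % hit - check the L0 cache
        [] = ets:lookup(Index, 67890),          % miss - go straight to the files
        ets:delete(Index).

A lookup against the set is a single hash probe, whereas checking each skip list in the L0 cache in turn is not.
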
@@ -560,7 +565,8 @@ start_from_file(PCLopts) -> InitState = #state{clerk=MergeClerk, root_path=RootPath, levelzero_maxcachesize=MaxTableSize, - levelzero_cointoss=CoinToss}, + levelzero_cointoss=CoinToss, + levelzero_index=leveled_pmem:new_index()}, %% Open manifest ManifestPath = InitState#state.root_path ++ "/" ++ ?MANIFEST_FP ++ "/", @@ -636,10 +642,13 @@ start_from_file(PCLopts) -> update_levelzero(L0Size, {PushedTree, MinSQN, MaxSQN}, LedgerSQN, L0Cache, State) -> + SW = os:timestamp(), Update = leveled_pmem:add_to_cache(L0Size, {PushedTree, MinSQN, MaxSQN}, LedgerSQN, L0Cache), + leveled_pmem:add_to_index(PushedTree, State#state.levelzero_index), + {UpdMaxSQN, NewL0Size, UpdL0Cache} = Update, if UpdMaxSQN >= LedgerSQN -> @@ -661,15 +670,20 @@ update_levelzero(L0Size, {PushedTree, MinSQN, MaxSQN}, false -> true end, - case {CacheTooBig, Level0Free, RandomFactor or CacheMuchTooBig} of + JitterCheck = RandomFactor or CacheMuchTooBig, + case {CacheTooBig, Level0Free, JitterCheck} of {true, true, true} -> - L0Constructor = roll_memory(UpdState, false), + L0Constructor = roll_memory(UpdState, false), + leveled_log:log_timer("P0031", [], SW), UpdState#state{levelzero_pending=true, levelzero_constructor=L0Constructor}; _ -> + leveled_log:log_timer("P0031", [], SW), UpdState end; + NewL0Size == L0Size -> + leveled_log:log_timer("P0031", [], SW), State#state{levelzero_cache=L0Cache, levelzero_size=L0Size, ledger_sqn=LedgerSQN} @@ -718,13 +732,21 @@ levelzero_filename(State) -> FileName. -fetch_mem(Key, Hash, Manifest, L0Cache) -> + +fetch_mem(Key, Hash, Manifest, L0Cache, none) -> L0Check = leveled_pmem:check_levelzero(Key, Hash, L0Cache), case L0Check of {false, not_found} -> fetch(Key, Hash, Manifest, 0, fun leveled_sft:sft_get/2); {true, KV} -> KV + end; +fetch_mem(Key, Hash, Manifest, L0Cache, L0Index) -> + case leveled_pmem:check_index(Hash, L0Index) of + true -> + fetch_mem(Key, Hash, Manifest, L0Cache, none); + false -> + fetch(Key, Hash, Manifest, 0, fun leveled_sft:sft_get/2) end. fetch(_Key, _Hash, _Manifest, ?MAX_LEVELS + 1, _FetchFun) -> diff --git a/src/leveled_pmem.erl b/src/leveled_pmem.erl index 5ba62aa..9f81c01 100644 --- a/src/leveled_pmem.erl +++ b/src/leveled_pmem.erl @@ -45,7 +45,10 @@ add_to_cache/4, to_list/2, check_levelzero/3, - merge_trees/4 + merge_trees/4, + add_to_index/2, + new_index/0, + check_index/2 ]). -include_lib("eunit/include/eunit.hrl"). @@ -69,6 +72,29 @@ add_to_cache(L0Size, {LevelMinus1, MinSQN, MaxSQN}, LedgerSQN, TreeList) -> end end. +add_to_index(LevelMinus1, L0Index) -> + IndexAddFun = + fun({_K, V}) -> + {_, _, Hash, _} = leveled_codec:striphead_to_details(V), + case Hash of + no_lookup -> + ok; + _ -> + ets:insert(L0Index, {Hash}) + end + end, + lists:foreach(IndexAddFun, leveled_skiplist:to_list(LevelMinus1)). + +new_index() -> + ets:new(l0index, [private, set]). + +check_index(Hash, L0Index) -> + case ets:lookup(L0Index, Hash) of + [{Hash}] -> + true; + [] -> + false + end. 
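
The new P0031 log brackets every exit path of update_levelzero/5 with the timing idiom used elsewhere in the store. Distilled, and purely as an illustration (timed_sketch/1 is hypothetical):

    timed_sketch(Fun) ->
        SW = os:timestamp(),
        Result = Fun(),
        %% log_timer reports the time elapsed since SW against the log ref
        leveled_log:log_timer("P0031", [], SW),
        Result.
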
 to_list(Slots, FetchFun) ->
     SW = os:timestamp(),
 
From 32ac305c674329d5488cc459badf3e3ea5f430ea Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Sun, 11 Dec 2016 06:53:25 +0000
Subject: [PATCH 20/34] Compaction test error

Compaction tests are now throwing up different corruption points.

---
 src/leveled_codec.erl  |  4 +++-
 src/leveled_iclerk.erl | 33 ++++++++++++++++++++------------
 2 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl
index 72b90b0..5303180 100644
--- a/src/leveled_codec.erl
+++ b/src/leveled_codec.erl
@@ -226,7 +226,9 @@ compact_inkerkvc({{SQN, ?INKT_STND, LK}, V, CrcCheck}, Strategy) ->
             {TagStrat, {{SQN, ?INKT_KEYD, LK}, {null, KeyDeltas}, CrcCheck}};
         TagStrat ->
             {TagStrat, null}
-    end.
+    end;
+compact_inkerkvc(_KVC, _Strategy) ->
+    skip.
 
 split_inkvalue(VBin) ->
     case is_binary(VBin) of
diff --git a/src/leveled_iclerk.erl b/src/leveled_iclerk.erl
index a060774..c612367 100644
--- a/src/leveled_iclerk.erl
+++ b/src/leveled_iclerk.erl
@@ -238,19 +238,26 @@ check_single_file(CDB, FilterFun, FilterServer, MaxSQN, SampleSize, BatchSize) -
     FN = leveled_cdb:cdb_filename(CDB),
     PositionList = leveled_cdb:cdb_getpositions(CDB, SampleSize),
     KeySizeList = fetch_inbatches(PositionList, BatchSize, CDB, []),
-    R0 = lists:foldl(fun(KS, {ActSize, RplSize}) ->
-                            {{SQN, _Type, PK}, Size} = KS,
-                            Check = FilterFun(FilterServer, PK, SQN),
-                            case {Check, SQN > MaxSQN} of
-                                {true, _} ->
-                                    {ActSize + Size - ?CRC_SIZE, RplSize};
-                                {false, true} ->
-                                    {ActSize + Size - ?CRC_SIZE, RplSize};
-                                _ ->
-                                    {ActSize, RplSize + Size - ?CRC_SIZE}
-                            end end,
-                        {0, 0},
-                        KeySizeList),
+
+    FoldFunForSizeCompare =
+        fun(KS, {ActSize, RplSize}) ->
+            case KS of
+                {{SQN, _Type, PK}, Size} ->
+                    Check = FilterFun(FilterServer, PK, SQN),
+                    case {Check, SQN > MaxSQN} of
+                        {true, _} ->
+                            {ActSize + Size - ?CRC_SIZE, RplSize};
+                        {false, true} ->
+                            {ActSize + Size - ?CRC_SIZE, RplSize};
+                        _ ->
+                            {ActSize, RplSize + Size - ?CRC_SIZE}
+                    end;
+                _ ->
+                    {ActSize, RplSize}
+            end
+        end,
+
+    R0 = lists:foldl(FoldFunForSizeCompare, {0, 0}, KeySizeList),
     {ActiveSize, ReplacedSize} = R0,
     Score = case ActiveSize + ReplacedSize of
                 0 ->

From 2758498fad5437fef6c48ec53d10a90718267fc7 Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Sun, 11 Dec 2016 06:54:41 +0000
Subject: [PATCH 21/34] More Jitter!

Having reduced the size of the ledger cache (again), we can now
tolerate more jitter here.

---
 src/leveled_penciller.erl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl
index 3547342..58f575e 100644
--- a/src/leveled_penciller.erl
+++ b/src/leveled_penciller.erl
@@ -201,7 +201,7 @@
 -define(SUPER_MAX_TABLE_SIZE, 40000).
 -define(PROMPT_WAIT_ONL0, 5).
 -define(WORKQUEUE_BACKLOG_TOLERANCE, 4).
--define(COIN_SIDECOUNT, 3).
+-define(COIN_SIDECOUNT, 5).
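
For clarity, the decision this constant feeds can be sketched as below. This is illustrative only: should_roll_l0/3 is hypothetical, it takes the ?MAX_TABLESIZE default as the cache limit, and it assumes the elided coin toss passes with probability 1 in ?COIN_SIDECOUNT.

    should_roll_l0(L0Size, Level0Free, CoinToss) ->
        CacheTooBig = L0Size > ?MAX_TABLESIZE,
        CacheMuchTooBig = L0Size > ?SUPER_MAX_TABLE_SIZE,
        RandomFactor =
            case CoinToss of
                true ->
                    %% Assumed shape of the toss - roll on one side only
                    random:uniform(?COIN_SIDECOUNT) == 1;
                false ->
                    true
            end,
        CacheTooBig and Level0Free and (RandomFactor or CacheMuchTooBig).

Raising ?COIN_SIDECOUNT spreads L0 writes over a wider window, while ?SUPER_MAX_TABLE_SIZE caps how long an unlucky run of tosses can defer the roll.
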
 -record(state, {manifest = [] :: list(),
                 manifest_sqn = 0 :: integer(),

From 6f06c6fdeb3d71c5c7dde4bc24cc838f76683631 Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Sun, 11 Dec 2016 07:07:30 +0000
Subject: [PATCH 22/34] ETS delete

Delete the objects rather than starting a new table each time.

---
 src/leveled_penciller.erl | 2 +-
 src/leveled_pmem.erl      | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl
index 58f575e..ce83252 100644
--- a/src/leveled_penciller.erl
+++ b/src/leveled_penciller.erl
@@ -467,12 +467,12 @@ handle_cast({levelzero_complete, FN, StartKey, EndKey, Bloom}, State) ->
                                filename=FN},
     UpdMan = lists:keystore(0, 1, State#state.manifest, {0, [ManEntry]}),
     % Prompt clerk to ask about work - do this for every L0 roll
+    leveled_pmem:clear_index(State#state.levelzero_index),
     ok = leveled_pclerk:clerk_prompt(State#state.clerk),
     {noreply, State#state{levelzero_cache=[],
                             levelzero_pending=false,
                             levelzero_constructor=undefined,
                             levelzero_size=0,
-                            levelzero_index=leveled_pmem:new_index(),
                             manifest=UpdMan,
                             persisted_sqn=State#state.ledger_sqn}}.
 
diff --git a/src/leveled_pmem.erl b/src/leveled_pmem.erl
index 9f81c01..8629fb3 100644
--- a/src/leveled_pmem.erl
+++ b/src/leveled_pmem.erl
@@ -88,6 +88,9 @@ add_to_index(LevelMinus1, L0Index) ->
 new_index() ->
     ets:new(l0index, [private, set]).
 
+clear_index(L0Index) ->
+    ets:delete_all_objects(L0Index).
+
 check_index(Hash, L0Index) ->
     case ets:lookup(L0Index, Hash) of
         [{Hash}] ->

From 16c704551b6be82b9c987b12d6c2f912735edc98 Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Sun, 11 Dec 2016 07:35:23 +0000
Subject: [PATCH 23/34] Revert to original SFT build settings

Leveled is always CPU bound during tests, and it is the merge in the
ledger that drains the CPU hardest.

---
 src/leveled_sft.erl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/leveled_sft.erl b/src/leveled_sft.erl
index 4f67adf..caf4558 100644
--- a/src/leveled_sft.erl
+++ b/src/leveled_sft.erl
@@ -179,7 +179,7 @@
 -define(DWORD_SIZE, 8).
 -define(CURRENT_VERSION, {0,1}).
 -define(SLOT_COUNT, 256).
--define(SLOT_GROUPWRITE_COUNT, 64).
+-define(SLOT_GROUPWRITE_COUNT, 32).
 -define(BLOCK_SIZE, 32).
 -define(BLOCK_COUNT, 4).
 -define(FOOTERPOS_HEADERPOS, 2).
@@ -189,7 +189,7 @@
 -define(COMPRESSION_LEVEL, 1).
 -define(HEADER_LEN, 56).
 -define(ITERATOR_SCANWIDTH, 1).
--define(MERGE_SCANWIDTH, 32).
+-define(MERGE_SCANWIDTH, 16).
 -define(BLOOM_WIDTH, 48).
 -define(DELETE_TIMEOUT, 10000).
 -define(MAX_KEYS, ?SLOT_COUNT * ?BLOCK_COUNT * ?BLOCK_SIZE).

From fb069666dc6fd6464a477225e08b0a2b8ab0f02a Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Sun, 11 Dec 2016 08:16:00 +0000
Subject: [PATCH 24/34] Export clear_index/1

---
 src/leveled_pmem.erl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/leveled_pmem.erl b/src/leveled_pmem.erl
index 8629fb3..0c61acf 100644
--- a/src/leveled_pmem.erl
+++ b/src/leveled_pmem.erl
@@ -48,6 +48,7 @@
         merge_trees/4,
         add_to_index/2,
         new_index/0,
+        clear_index/1,
         check_index/2
         ]).
 

From 71cf7a3a5161db65753593342912a5799813788d Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Sun, 11 Dec 2016 08:37:03 +0000
Subject: [PATCH 25/34] Setting change led to idle CPU

---
 src/leveled_sft.erl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/leveled_sft.erl b/src/leveled_sft.erl
index caf4558..da3a9fd 100644
--- a/src/leveled_sft.erl
+++ b/src/leveled_sft.erl
@@ -179,7 +179,7 @@
 -define(DWORD_SIZE, 8).
 -define(CURRENT_VERSION, {0,1}).
 -define(SLOT_COUNT, 256).
--define(SLOT_GROUPWRITE_COUNT, 32).
+-define(SLOT_GROUPWRITE_COUNT, 128).
 -define(BLOCK_SIZE, 32).
 -define(BLOCK_COUNT, 4).
 -define(FOOTERPOS_HEADERPOS, 2).
@@ -189,7 +189,7 @@
 -define(COMPRESSION_LEVEL, 1).
 -define(HEADER_LEN, 56).
 -define(ITERATOR_SCANWIDTH, 1).
--define(MERGE_SCANWIDTH, 16).
+-define(MERGE_SCANWIDTH, 32).
 -define(BLOOM_WIDTH, 48).
 -define(DELETE_TIMEOUT, 10000).
 -define(MAX_KEYS, ?SLOT_COUNT * ?BLOCK_COUNT * ?BLOCK_SIZE).

From 44cee5a6e8d84391c815664e7f2b25e6c459b2d6 Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Sun, 11 Dec 2016 12:33:09 +0000
Subject: [PATCH 26/34] Experiment with no compression

Does compression hurt the CPU more than the benefit gained in some
cases?

---
 src/leveled_codec.erl | 2 +-
 src/leveled_sft.erl   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl
index 5303180..35afbdb 100644
--- a/src/leveled_codec.erl
+++ b/src/leveled_codec.erl
@@ -246,7 +246,7 @@ check_forinkertype(_LedgerKey, _Object) ->
 create_value_for_journal(Value) ->
     case Value of
         {Object, KeyChanges} ->
-            term_to_binary({Object, KeyChanges}, [compressed]);
+            term_to_binary({Object, KeyChanges});
         Value when is_binary(Value) ->
             Value
     end.
diff --git a/src/leveled_sft.erl b/src/leveled_sft.erl
index da3a9fd..5bae7da 100644
--- a/src/leveled_sft.erl
+++ b/src/leveled_sft.erl
@@ -1082,7 +1082,7 @@ create_slot(KL1, KL2, LevelR, BlockCount, Bloom,
             TrackingMetadata).
 
 serialise_block(BlockKeyList) ->
-    term_to_binary(BlockKeyList, [{compressed, ?COMPRESSION_LEVEL}]).
+    term_to_binary(BlockKeyList).
 

From 1b638450505ffab94873f5bc6d9ab190ea83273f Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Sun, 11 Dec 2016 15:02:33 +0000
Subject: [PATCH 27/34] Bring compression back to SFT

It is expensive on the CPU - but it leads to a 4x increase in the cache
coverage. Also try to make some small micro-gains in list handling in
create_block.

---
 src/leveled_codec.erl |  2 +-
 src/leveled_sft.erl   | 34 +++++++++++++++++++++++-----------
 2 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl
index 35afbdb..5303180 100644
--- a/src/leveled_codec.erl
+++ b/src/leveled_codec.erl
@@ -246,7 +246,7 @@ check_forinkertype(_LedgerKey, _Object) ->
 create_value_for_journal(Value) ->
     case Value of
         {Object, KeyChanges} ->
-            term_to_binary({Object, KeyChanges});
+            term_to_binary({Object, KeyChanges}, [compressed]);
         Value when is_binary(Value) ->
             Value
     end.
diff --git a/src/leveled_sft.erl b/src/leveled_sft.erl
index 5bae7da..428523a 100644
--- a/src/leveled_sft.erl
+++ b/src/leveled_sft.erl
@@ -179,7 +179,7 @@
 -define(DWORD_SIZE, 8).
 -define(CURRENT_VERSION, {0,1}).
 -define(SLOT_COUNT, 256).
--define(SLOT_GROUPWRITE_COUNT, 128).
+-define(SLOT_GROUPWRITE_COUNT, 64).
 -define(BLOCK_SIZE, 32).
 -define(BLOCK_COUNT, 4).
 -define(FOOTERPOS_HEADERPOS, 2).
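
The last two commits are probing a measurable trade-off, and it can be weighed directly; a self-contained sketch (illustrative only - real numbers depend on the shape of the keys):

    compression_tradeoff_sketch(BlockKeyList) ->
        %% timer:tc/3 returns {MicroSeconds, Result} for the call
        {T0, Plain} = timer:tc(erlang, term_to_binary, [BlockKeyList]),
        {T1, Compressed} =
            timer:tc(erlang, term_to_binary,
                        [BlockKeyList, [{compressed, ?COMPRESSION_LEVEL}]]),
        {{plain, byte_size(Plain), T0},
            {compressed, byte_size(Compressed), T1}}.

Compression costs CPU at write time but, per the message above, roughly quadruples how much of the store the page cache can cover; ?COMPRESSION_LEVEL stays at the cheapest setting of 1, where the bare compressed option would imply the default level of 6.
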
@@ -970,16 +970,25 @@ create_block(KeyList1, KeyList2, when length(BlockKeyList)==?BLOCK_SIZE -> case {KeyList1, KeyList2} of {[], []} -> - {BlockKeyList, complete, {LSN, HSN}, SegmentList, + {lists:reverse(BlockKeyList), + complete, + {LSN, HSN}, + lists:reverse(SegmentList), Bloom, [], []}; _ -> - {BlockKeyList, full, {LSN, HSN}, SegmentList, + {lists:reverse(BlockKeyList), + full, + {LSN, HSN}, + lists:reverse(SegmentList), Bloom, KeyList1, KeyList2} end; create_block([], [], BlockKeyList, {LSN, HSN}, SegmentList, _LevelR, Bloom) -> - {BlockKeyList, partial, {LSN, HSN}, SegmentList, + {lists:reverse(BlockKeyList), + partial, + {LSN, HSN}, + lists:reverse(SegmentList), Bloom, [], []}; create_block(KeyList1, KeyList2, @@ -992,10 +1001,8 @@ create_block(KeyList1, KeyList2, {SQN, _St, MH, _MD} = leveled_codec:striphead_to_details(V), {UpdLSN, UpdHSN} = update_sequencenumbers(SQN, LSN, HSN), UpdBloom = leveled_tinybloom:enter({hash, MH}, Bloom), - NewBlockKeyList = lists:append(BlockKeyList, - [TopKey]), - NewSegmentList = lists:append(SegmentList, - [hash_for_segmentid(TopKey)]), + NewBlockKeyList = [TopKey|BlockKeyList], + NewSegmentList = [hash_for_segmentid(TopKey)|SegmentList], create_block(Rem1, Rem2, NewBlockKeyList, {UpdLSN, UpdHSN}, NewSegmentList, LevelR, UpdBloom); @@ -1061,13 +1068,13 @@ create_slot(KL1, KL2, LevelR, BlockCount, Bloom, {null, LSN, HSN, LastKey, Status}; {null, _} -> [NewLowKeyV|_] = BlockKeyList, - NewLastKey = lists:last([{keyonly, LastKey}|BlockKeyList]), + NewLastKey = last_key(BlockKeyList, {keyonly, LastKey}), {leveled_codec:strip_to_keyonly(NewLowKeyV), min(LSN, LSNb), max(HSN, HSNb), leveled_codec:strip_to_keyonly(NewLastKey), Status}; {_, _} -> - NewLastKey = lists:last([{keyonly, LastKey}|BlockKeyList]), + NewLastKey = last_key(BlockKeyList, {keyonly, LastKey}), {LowKey, min(LSN, LSNb), max(HSN, HSNb), leveled_codec:strip_to_keyonly(NewLastKey), @@ -1081,8 +1088,13 @@ create_slot(KL1, KL2, LevelR, BlockCount, Bloom, SegList2, SerialisedSlot2, LengthList ++ [BlockLength], TrackingMetadata). +last_key([], LastKey) -> + LastKey; +last_key(BlockKeyList, _LastKey) -> + lists:last(BlockKeyList). + serialise_block(BlockKeyList) -> - term_to_binary(BlockKeyList). + term_to_binary(BlockKeyList, [compressed]). 
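
The micro-gain in create_block is the standard accumulate-and-reverse idiom, sketched here for contrast (both functions are illustrative and equivalent in output):

    %% Appending to the tail walks the accumulator on every element.
    slow_collect(Items) ->
        lists:foldl(fun(X, Acc) -> lists:append(Acc, [X]) end, [], Items).

    %% Prepending is constant time, with a single reverse at the end.
    fast_collect(Items) ->
        lists:reverse(lists:foldl(fun(X, Acc) -> [X|Acc] end, [], Items)).

last_key/2 serves the same end: the old code built [{keyonly, LastKey}|BlockKeyList] only to take lists:last of it, where peeking at the existing list avoids the construction altogether.
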
%% Compare the keys at the head of the list, and either skip that "best" key or

From a86686d621fcaa3001a287bdaa549ee8b82d6332 Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Sun, 11 Dec 2016 15:17:58 +0000
Subject: [PATCH 28/34] Remove unnecessary reverse

---
 src/leveled_sft.erl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/leveled_sft.erl b/src/leveled_sft.erl
index 428523a..686511c 100644
--- a/src/leveled_sft.erl
+++ b/src/leveled_sft.erl
@@ -973,14 +973,14 @@ create_block(KeyList1, KeyList2,
             {lists:reverse(BlockKeyList),
                 complete,
                 {LSN, HSN},
-                lists:reverse(SegmentList),
+                SegmentList,
                 Bloom,
                 [], []};
         _ ->
             {lists:reverse(BlockKeyList),
                 full,
                 {LSN, HSN},
-                lists:reverse(SegmentList),
+                SegmentList,
                 Bloom,
                 KeyList1, KeyList2}
     end;
@@ -988,7 +988,7 @@ create_block([], [], BlockKeyList, {LSN, HSN}, SegmentList, _LevelR, Bloom) ->
     {lists:reverse(BlockKeyList),
         partial,
         {LSN, HSN},
-        lists:reverse(SegmentList),
+        SegmentList,
         Bloom,
         [], []};
 create_block(KeyList1, KeyList2,

From 24a5347bec97c3b207d830296d70d30b6bc84dc5 Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Sun, 11 Dec 2016 15:19:34 +0000
Subject: [PATCH 29/34] Revert

---
 src/leveled_sft.erl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/leveled_sft.erl b/src/leveled_sft.erl
index 686511c..206f0bd 100644
--- a/src/leveled_sft.erl
+++ b/src/leveled_sft.erl
@@ -1094,7 +1094,7 @@ last_key(BlockKeyList, _LastKey) ->
     lists:last(BlockKeyList).
 
 serialise_block(BlockKeyList) ->
-    term_to_binary(BlockKeyList, [compressed]).
+    term_to_binary(BlockKeyList, [{compressed, ?COMPRESSION_LEVEL}]).
 

From 5cfe9a71e1f5d8a4c872eb893fa2b99396a4e49a Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Sun, 11 Dec 2016 15:25:14 +0000
Subject: [PATCH 30/34] Wrap test with non-default timeout

---
 src/leveled_pclerk.erl | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/leveled_pclerk.erl b/src/leveled_pclerk.erl
index 649973b..1e46d80 100644
--- a/src/leveled_pclerk.erl
+++ b/src/leveled_pclerk.erl
@@ -394,6 +394,9 @@ find_randomkeys(FList, Count, Source) ->
 
 
 merge_file_test() ->
+    {timeout, 10, merge_file_test_towrap()}.
+
+merge_file_test_towrap() ->
     KL1_L1 = lists:sort(generate_randomkeys(16000, 0, 1000)),
     {ok, PidL1_1, _} = leveled_sft:sft_new("../test/KL1_L1.sft",
                                             KL1_L1, [], 1),

From f96d1480731dca4040e1fcf3e6cd7bf07f1bfb3c Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Sun, 11 Dec 2016 20:17:05 +0000
Subject: [PATCH 31/34] Make the merge_file_test a more sensible size

The test was on the verge of a timeout. Rather than keep battling with
the timeout, make it do less work.

---
 src/leveled_pclerk.erl | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/src/leveled_pclerk.erl b/src/leveled_pclerk.erl
index 1e46d80..63884bd 100644
--- a/src/leveled_pclerk.erl
+++ b/src/leveled_pclerk.erl
@@ -394,22 +394,19 @@ find_randomkeys(FList, Count, Source) ->
 
 
 merge_file_test() ->
-    {timeout, 10, merge_file_test_towrap()}.
-
-merge_file_test_towrap() ->
-    KL1_L1 = lists:sort(generate_randomkeys(16000, 0, 1000)),
+    KL1_L1 = lists:sort(generate_randomkeys(8000, 0, 1000)),
     {ok, PidL1_1, _} = leveled_sft:sft_new("../test/KL1_L1.sft",
                                             KL1_L1, [], 1),
-    KL1_L2 = lists:sort(generate_randomkeys(16000, 0, 250)),
+    KL1_L2 = lists:sort(generate_randomkeys(8000, 0, 250)),
     {ok, PidL2_1, _} = leveled_sft:sft_new("../test/KL1_L2.sft",
                                             KL1_L2, [], 2),
-    KL2_L2 = lists:sort(generate_randomkeys(16000, 250, 250)),
+    KL2_L2 = lists:sort(generate_randomkeys(8000, 250, 250)),
     {ok, PidL2_2, _} = leveled_sft:sft_new("../test/KL2_L2.sft",
                                             KL2_L2, [], 2),
-    KL3_L2 = lists:sort(generate_randomkeys(16000, 500, 250)),
+    KL3_L2 = lists:sort(generate_randomkeys(8000, 500, 250)),
     {ok, PidL2_3, _} = leveled_sft:sft_new("../test/KL3_L2.sft",
                                             KL3_L2, [], 2),
-    KL4_L2 = lists:sort(generate_randomkeys(16000, 750, 250)),
+    KL4_L2 = lists:sort(generate_randomkeys(8000, 750, 250)),
     {ok, PidL2_4, _} = leveled_sft:sft_new("../test/KL4_L2.sft",
                                             KL4_L2, [], 2),
     Result = perform_merge({PidL1_1, "../test/KL1_L1.sft"},

From 4b48ed14c6b429cb4c033e40269f8cac514ff8bb Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Sun, 11 Dec 2016 20:38:20 +0000
Subject: [PATCH 32/34] Correct mistyped 2^32 - 1

---
 src/leveled_codec.erl    | 3 +--
 src/leveled_skiplist.erl | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl
index 5303180..f08e2e9 100644
--- a/src/leveled_codec.erl
+++ b/src/leveled_codec.erl
@@ -74,8 +74,7 @@
 %%
 %% Hash function contains mysterious constants, some explanation here as to
 %% what they are -
-%% http://stackoverflow.com/ ++
-%% questions/10696223/reason-for-5381-number-in-djb-hash-function
+%% http://stackoverflow.com/questions/10696223/reason-for-5381-number-in-djb-hash-function
 
 magic_hash({?RIAK_TAG, Bucket, Key, _SubKey}) ->
     magic_hash({Bucket, Key});
diff --git a/src/leveled_skiplist.erl b/src/leveled_skiplist.erl
index 142afc5..7fcc81a 100644
--- a/src/leveled_skiplist.erl
+++ b/src/leveled_skiplist.erl
@@ -598,7 +598,7 @@ skiplist_timingtest(KL, SkipList, N, Bloom) ->
     case Bloom of
         true ->
             HashList = lists:map(fun(_X) ->
-                                        random:uniform(4296967295) end,
+                                        random:uniform(4294967295) end,
                                     lists:seq(1, 2000)),
             SWh = os:timestamp(),
             lists:foreach(fun(X) ->

From 86bdfdeaf034ca51ccbc2920e5793e55203c8b10 Mon Sep 17 00:00:00 2001
From: martinsumner
Date: Sun, 11 Dec 2016 21:01:10 +0000
Subject: [PATCH 33/34] Revert the additional bloom check

This is desirable to add back in going forward, but wasn't implemented
in a safe or clear way.

The way the bloom was or was not on the LoopState was clumsy, and it
got persisted in multiple places without a CRC check.

The intention is to implement it back in whereby it is requested
on-demand by the Penciller, and then the SFT worker lifts it off disk
and CRC checks it. So it is never on the SFT LoopState.

Also it will be easier to control the logic over which levels have the
bloom in the Penciller.

---
 include/leveled.hrl       |  1 -
 src/leveled_pclerk.erl    |  2 --
 src/leveled_penciller.erl | 39 ++++++++++++++++-----------------------
 src/leveled_sft.erl       | 23 +++++------------------
 4 files changed, 21 insertions(+), 44 deletions(-)

diff --git a/include/leveled.hrl b/include/leveled.hrl
index f57ffd4..25216f6 100644
--- a/include/leveled.hrl
+++ b/include/leveled.hrl
@@ -41,7 +41,6 @@
         {start_key :: tuple(),
             end_key :: tuple(),
             owner :: pid(),
-            bloom,
             filename :: string()}).
-record(cdb_options, diff --git a/src/leveled_pclerk.erl b/src/leveled_pclerk.erl index 63884bd..b5f8e3f 100644 --- a/src/leveled_pclerk.erl +++ b/src/leveled_pclerk.erl @@ -320,7 +320,6 @@ do_merge(KL1, KL2, {SrcLevel, IsB}, {Filepath, MSN}, FileCounter, OutList) -> KL1, KL2, LevelR), - {ok, Bloom} = leveled_sft:sft_getbloom(Pid), case Reply of {{[], []}, null, _} -> leveled_log:log("PC013", [FileName]), @@ -332,7 +331,6 @@ do_merge(KL1, KL2, {SrcLevel, IsB}, {Filepath, MSN}, FileCounter, OutList) -> [#manifest_entry{start_key=SmallestKey, end_key=HighestKey, owner=Pid, - bloom=Bloom, filename=FileName}]), leveled_log:log_timer("PC015", [], TS1), do_merge(KL1Rem, KL2Rem, diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl index ce83252..d5b70d1 100644 --- a/src/leveled_penciller.erl +++ b/src/leveled_penciller.erl @@ -175,7 +175,7 @@ pcl_checksequencenumber/4, pcl_workforclerk/1, pcl_promptmanifestchange/2, - pcl_confirml0complete/5, + pcl_confirml0complete/4, pcl_confirmdelete/2, pcl_close/1, pcl_doom/1, @@ -286,8 +286,8 @@ pcl_workforclerk(Pid) -> pcl_promptmanifestchange(Pid, WI) -> gen_server:cast(Pid, {manifest_change, WI}). -pcl_confirml0complete(Pid, FN, StartKey, EndKey, Bloom) -> - gen_server:cast(Pid, {levelzero_complete, FN, StartKey, EndKey, Bloom}). +pcl_confirml0complete(Pid, FN, StartKey, EndKey) -> + gen_server:cast(Pid, {levelzero_complete, FN, StartKey, EndKey}). pcl_confirmdelete(Pid, FileName) -> gen_server:cast(Pid, {confirm_delete, FileName}). @@ -458,11 +458,10 @@ handle_cast({confirm_delete, FileName}, State=#state{is_snapshot=Snap}) _ -> {noreply, State} end; -handle_cast({levelzero_complete, FN, StartKey, EndKey, Bloom}, State) -> +handle_cast({levelzero_complete, FN, StartKey, EndKey}, State) -> leveled_log:log("P0029", []), ManEntry = #manifest_entry{start_key=StartKey, end_key=EndKey, - bloom=Bloom, owner=State#state.levelzero_constructor, filename=FN}, UpdMan = lists:keystore(0, 1, State#state.manifest, {0, [ManEntry]}), @@ -737,7 +736,7 @@ fetch_mem(Key, Hash, Manifest, L0Cache, none) -> L0Check = leveled_pmem:check_levelzero(Key, Hash, L0Cache), case L0Check of {false, not_found} -> - fetch(Key, Hash, Manifest, 0, fun leveled_sft:sft_get/2); + fetch(Key, Manifest, 0, fun leveled_sft:sft_get/2); {true, KV} -> KV end; @@ -746,38 +745,32 @@ fetch_mem(Key, Hash, Manifest, L0Cache, L0Index) -> true -> fetch_mem(Key, Hash, Manifest, L0Cache, none); false -> - fetch(Key, Hash, Manifest, 0, fun leveled_sft:sft_get/2) + fetch(Key, Manifest, 0, fun leveled_sft:sft_get/2) end. 
-fetch(_Key, _Hash, _Manifest, ?MAX_LEVELS + 1, _FetchFun) -> +fetch(_Key, _Manifest, ?MAX_LEVELS + 1, _FetchFun) -> not_present; -fetch(Key, Hash, Manifest, Level, FetchFun) -> +fetch(Key, Manifest, Level, FetchFun) -> LevelManifest = get_item(Level, Manifest, []), case lists:foldl(fun(File, Acc) -> case Acc of not_present when Key >= File#manifest_entry.start_key, File#manifest_entry.end_key >= Key -> - {File#manifest_entry.owner, - File#manifest_entry.bloom}; + File#manifest_entry.owner; FoundDetails -> FoundDetails end end, not_present, LevelManifest) of not_present -> - fetch(Key, Hash, Manifest, Level + 1, FetchFun); - {FileToCheck, Bloom} -> - case leveled_tinybloom:check({hash, Hash}, Bloom) of - true -> - case FetchFun(FileToCheck, Key) of - not_present -> - fetch(Key, Hash, Manifest, Level + 1, FetchFun); - ObjectFound -> - ObjectFound - end; - false -> - fetch(Key, Hash, Manifest, Level + 1, FetchFun) + fetch(Key, Manifest, Level + 1, FetchFun); + FileToCheck -> + case FetchFun(FileToCheck, Key) of + not_present -> + fetch(Key, Manifest, Level + 1, FetchFun); + ObjectFound -> + ObjectFound end end. diff --git a/src/leveled_sft.erl b/src/leveled_sft.erl index 206f0bd..5b4f24e 100644 --- a/src/leveled_sft.erl +++ b/src/leveled_sft.erl @@ -161,7 +161,6 @@ sft_newfroml0cache/4, sft_open/1, sft_get/2, - sft_getbloom/1, sft_getkvrange/4, sft_close/1, sft_clear/1, @@ -213,8 +212,7 @@ handle :: file:fd(), background_complete = false :: boolean(), oversized_file = false :: boolean(), - penciller :: pid(), - bloom}). + penciller :: pid()}). %%%============================================================================ @@ -271,9 +269,6 @@ sft_open(Filename) -> sft_setfordelete(Pid, Penciller) -> gen_fsm:sync_send_event(Pid, {set_for_delete, Penciller}, infinity). -sft_getbloom(Pid) -> - gen_fsm:sync_send_event(Pid, get_bloom, infinity). - sft_get(Pid, Key) -> gen_fsm:sync_send_event(Pid, {get_kv, Key}, infinity). @@ -348,9 +343,8 @@ starting({sft_newfroml0cache, Filename, Slots, FetchFun, PCL}, _State) -> leveled_penciller:pcl_confirml0complete(PCL, State#state.filename, State#state.smallest_key, - State#state.highest_key, - State#state.bloom), - {next_state, reader, State#state{bloom=none}} + State#state.highest_key), + {next_state, reader, State} end. @@ -385,12 +379,6 @@ reader(background_complete, _From, State) -> reader, State} end; -reader(get_bloom, _From, State) -> - Bloom = State#state.bloom, - if - Bloom /= none -> - {reply, {ok, Bloom}, reader, State#state{bloom=none}} - end; reader(close, _From, State) -> ok = file:close(State#state.handle), {stop, normal, ok, State}. @@ -523,7 +511,7 @@ open_file(FileMD) -> Slen:32/integer>> = HeaderLengths, {ok, SummaryBin} = file:pread(Handle, ?HEADER_LEN + Blen + Ilen + Flen, Slen), - {{LowSQN, HighSQN}, {LowKey, HighKey}, Bloom} = binary_to_term(SummaryBin), + {{LowSQN, HighSQN}, {LowKey, HighKey}, _Bloom} = binary_to_term(SummaryBin), {ok, SlotIndexBin} = file:pread(Handle, ?HEADER_LEN + Blen, Ilen), SlotIndex = binary_to_term(SlotIndexBin), {Handle, FileMD#state{slot_index=SlotIndex, @@ -536,8 +524,7 @@ open_file(FileMD) -> filter_pointer=?HEADER_LEN + Blen + Ilen, summ_pointer=?HEADER_LEN + Blen + Ilen + Flen, summ_length=Slen, - handle=Handle, - bloom=Bloom}}. + handle=Handle}}. 
%% Take a file handle with a previously created header and complete it based on %% the two key lists KL1 and KL2 From f28c7e02bf93224f3f4fd026358ec840dd251b8d Mon Sep 17 00:00:00 2001 From: martinsumner Date: Sun, 11 Dec 2016 21:24:04 +0000 Subject: [PATCH 34/34] Remove unnecessary clause As the intention is to change the way the tiny bloom is called, the unnecessary clause of handling an undefined bloom can be removed. --- src/leveled_tinybloom.erl | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl index dd72b6e..f9212ad 100644 --- a/src/leveled_tinybloom.erl +++ b/src/leveled_tinybloom.erl @@ -47,8 +47,6 @@ enter(Key, Bloom) -> Hash = leveled_codec:magic_hash(Key), enter({hash, Hash}, Bloom). -check({hash, _Hash}, undefined) -> - true; check({hash, Hash}, Bloom) -> {H0, Bit1, Bit2} = split_hash(Hash), Slot = H0 rem dict:size(Bloom),
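
The message on the final revert sets out the intended replacement: the Penciller requests a bloom on demand, the SFT worker lifts it off disk, and nothing is trusted without a CRC check. As a hedged sketch of that shape only - none of these functions exist in the codebase yet:

    %% Hypothetical on-demand bloom retrieval, with the bloom stored
    %% behind a leading CRC so a corrupt read is detected before use.
    bloom_to_disk_sketch(Bloom) ->
        BloomBin = term_to_binary(Bloom),
        <<(erlang:crc32(BloomBin)):32/integer, BloomBin/binary>>.

    bloom_from_disk_sketch(<<CRC:32/integer, BloomBin/binary>>) ->
        case erlang:crc32(BloomBin) of
            CRC ->
                {ok, binary_to_term(BloomBin)};
            _ ->
                {error, crc_wonky}
        end.

Kept off the SFT loop state, and fetched this way only for the levels where the Penciller wants one, this would address both of the concerns raised above.
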