From a0e9ac737c9e0a63d0b66d554e20bac44bcbeba1 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Mon, 23 Aug 2021 17:18:45 +0100 Subject: [PATCH] Mas i340 doublel3 d31 (#347) * Double size of L4 files And double max efficient size of leveled_ebloom * Revert penciller shape But expand file size at L3 * More concise version Following code review * OTP 24 dialyzer fix Bindings intended to match - so don't use underscore * Allow eqc tests to work from `rebar3 as eqc shell` Then `eqc:quickcheck(leveled_statemeqc:prop_db()).` Plus markdown tidy --- README.md | 4 +- rebar.config | 2 +- src/leveled_ebloom.erl | 41 +++++++++---- src/leveled_penciller.erl | 15 +++-- src/leveled_sst.erl | 118 +++++++++++++++++++++++++++++++++----- 5 files changed, 146 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index be0c60f..f71b605 100644 --- a/README.md +++ b/README.md @@ -78,8 +78,8 @@ In order to contribute to leveled, fork the repository, make a branch for your c To have rebar3 execute the full set of tests, run: - `rebar3 as test do xref, dialyzer, cover --reset, eunit --cover, ct --cover, cover --verbose` +```rebar3 as test do xref, dialyzer, cover --reset, eunit --cover, ct --cover, cover --verbose``` For those with a Quickcheck license, property-based tests can also be run using: - `rebar3 as eqc do eunit --module=leveled_simpleeqc, eunit --module=leveled_statemeqc` +```rebar3 as eqc do eunit --module=leveled_simpleeqc, eunit --module=leveled_statemeqc``` diff --git a/rebar.config b/rebar.config index 1c6298b..d8aeaa3 100644 --- a/rebar.config +++ b/rebar.config @@ -19,7 +19,7 @@ {profiles, [{eqc, [{deps, [meck, fqc]}, {erl_opts, [debug_info, {d, 'EQC'}]}, - {extra_src_dirs, ["test/property"]}, + {extra_src_dirs, ["test/property", "test/end_to_end"]}, {shell, [{apps, [lz4]}]}, {plugins, [rebar_eqc]} ]}, diff --git a/src/leveled_ebloom.erl b/src/leveled_ebloom.erl index 9612f72..6484891 100644 --- a/src/leveled_ebloom.erl +++ b/src/leveled_ebloom.erl @@ -29,13 +29,32 @@ %%% API %%%============================================================================ --spec create_bloom(list(integer())) -> bloom(). +-spec create_bloom(list(leveled_codec:segment_hash())) -> bloom(). %% @doc -%% Create a binary bloom filter from alist of hashes +%% Create a binary bloom filter from a list of hashes create_bloom(HashList) -> case length(HashList) of 0 -> <<>>; + L when L > 32768 -> + {HL0, HL1} = + lists:partition(fun({_, Hash}) -> Hash band 32 == 0 end, + HashList), + Bin1 = + add_hashlist(HL0, + 32, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0), + Bin2 = + add_hashlist(HL1, + 32, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0), + <>; L when L > 16384 -> add_hashlist(HashList, 32, @@ -55,7 +74,7 @@ create_bloom(HashList) -> end. --spec check_hash(integer(), bloom()) -> boolean(). +-spec check_hash(leveled_codec:segment_hash(), bloom()) -> boolean(). %% @doc %% Check for the presence of a given hash within a bloom check_hash(_Hash, <<>>) -> @@ -548,15 +567,17 @@ empty_bloom_test() -> check_neg_hashes(BloomBin0, [0, 10, 100, 100000], {0, 0})). bloom_test_() -> - {timeout, 60, fun bloom_test_ranges/0}. + {timeout, 120, fun bloom_test_ranges/0}. bloom_test_ranges() -> - test_bloom(40000, 2), - test_bloom(128 * 256, 10), - test_bloom(20000, 2), - test_bloom(10000, 2), - test_bloom(5000, 2), - test_bloom(2000, 2). 
+ test_bloom(80000, 4), + test_bloom(60000, 4), + test_bloom(40000, 4), + test_bloom(128 * 256, 4), + test_bloom(20000, 4), + test_bloom(10000, 4), + test_bloom(5000, 4), + test_bloom(2000, 4). test_bloom(N, Runs) -> ListOfHashLists = diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl index 4e972df..a756ad3 100644 --- a/src/leveled_penciller.erl +++ b/src/leveled_penciller.erl @@ -307,17 +307,20 @@ -type iterator() :: list(iterator_entry()). -type bad_ledgerkey() :: list(). -type sqn_check() :: current|replaced|missing. --type pclacc_fun() :: - fun((leveled_codec:ledger_key(), - leveled_codec:ledger_value(), - any()) -> any()). -type sst_fetchfun() :: fun((pid(), leveled_codec:ledger_key(), leveled_codec:segment_hash(), - non_neg_integer()) -> leveled_codec:ledger_kv()|not_present). + non_neg_integer()) -> + leveled_codec:ledger_kv()|not_present). +-type levelzero_returnfun() :: fun((levelzero_cacheentry()) -> ok). +-type pclacc_fun() :: + fun((leveled_codec:ledger_key(), + leveled_codec:ledger_value(), + any()) -> any()). --export_type([levelzero_cacheentry/0, sqn_check/0]). + +-export_type([levelzero_cacheentry/0, levelzero_returnfun/0, sqn_check/0]). %%%============================================================================ %%% API diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index cff1799..2545559 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -91,6 +91,7 @@ -define(LMD_LENGTH, 4). -define(FLIPPER32, 4294967295). -define(COMPRESS_AT_LEVEL, 1). +-define(DOUBLESIZE_LEVEL, 3). -define(INDEX_MODDATE, true). -define(TOMB_COUNT, true). -define(USE_SET_FOR_SPEED, 64). @@ -281,7 +282,11 @@ sst_new(RootPath, Filename, Level, KVList, MaxSQN, OptsSST) -> sst_new(RootPath, Filename, Level, KVList, MaxSQN, OptsSST, IndexModDate) -> {ok, Pid} = gen_fsm:start_link(?MODULE, [], []), PressMethod0 = compress_level(Level, OptsSST#sst_options.press_method), - OptsSST0 = OptsSST#sst_options{press_method = PressMethod0}, + MaxSlots0 = maxslots_level(Level, OptsSST#sst_options.max_sstslots), + OptsSST0 = + OptsSST#sst_options{press_method = PressMethod0, + max_sstslots = MaxSlots0}, + {[], [], SlotList, FK, _CountOfTombs} = merge_lists(KVList, OptsSST0, IndexModDate), case gen_fsm:sync_send_event(Pid, @@ -318,7 +323,7 @@ sst_new(RootPath, Filename, Level, KVList, MaxSQN, OptsSST, IndexModDate) -> %% deleted. %% %% The remainder of the lists is returned along with the StartKey and EndKey -%% so that the remainder cna be used in the next file in the merge. It might +%% so that the remainder can be used in the next file in the merge. It might %% be that the merge_lists returns nothing (for example when a basement file is %% all tombstones) - and the atom empty is returned in this case so that the %% file is not added to the manifest. @@ -333,7 +338,10 @@ sst_newmerge(RootPath, Filename, KVL1, KVL2, IsBasement, Level, MaxSQN, OptsSST, IndexModDate, TombCount) -> PressMethod0 = compress_level(Level, OptsSST#sst_options.press_method), - OptsSST0 = OptsSST#sst_options{press_method = PressMethod0}, + MaxSlots0 = maxslots_level(Level, OptsSST#sst_options.max_sstslots), + OptsSST0 = + OptsSST#sst_options{press_method = PressMethod0, + max_sstslots = MaxSlots0}, {Rem1, Rem2, SlotList, FK, CountOfTombs} = merge_lists(KVL1, KVL2, {IsBasement, Level}, OptsSST0, IndexModDate, TombCount), @@ -360,7 +368,13 @@ sst_newmerge(RootPath, Filename, end. 
-spec sst_newlevelzero(string(), string(), - integer(), fun()|list(), pid()|undefined, integer(), + integer(), + fun((pos_integer(), + leveled_penciller:levelzero_returnfun()) + -> ok)| + list(), + pid()|undefined, + integer(), sst_options()) -> {ok, pid(), noreply}. %% @doc @@ -371,7 +385,10 @@ sst_newlevelzero(RootPath, Filename, Slots, Fetcher, Penciller, MaxSQN, OptsSST) -> PressMethod0 = compress_level(0, OptsSST#sst_options.press_method), - OptsSST0 = OptsSST#sst_options{press_method = PressMethod0}, + MaxSlots0 = maxslots_level(0, OptsSST#sst_options.max_sstslots), + OptsSST0 = + OptsSST#sst_options{press_method = PressMethod0, + max_sstslots = MaxSlots0}, {ok, Pid} = gen_fsm:start_link(?MODULE, [], []), % Initiate the file into the "starting" state ok = gen_fsm:sync_send_event(Pid, @@ -1325,6 +1342,12 @@ compress_level(Level, _PressMethod) when Level < ?COMPRESS_AT_LEVEL -> compress_level(_Level, PressMethod) -> PressMethod. +-spec maxslots_level(non_neg_integer(), pos_integer()) -> pos_integer(). +maxslots_level(Level, MaxSlotCount) when Level < ?DOUBLESIZE_LEVEL -> + MaxSlotCount; +maxslots_level(_Level, MaxSlotCount) -> + 2 * MaxSlotCount. + write_file(RootPath, Filename, SummaryBin, SlotsBin, PressMethod, IdxModDate, CountOfTombs) -> SummaryLength = byte_size(SummaryBin), @@ -2921,8 +2944,13 @@ generate_indexkey(Term, Count) -> Count, infinity). - tombcount_test() -> + tombcount_tester(1), + tombcount_tester(2), + tombcount_tester(3), + tombcount_tester(4). + +tombcount_tester(Level) -> N = 1600, KL1 = generate_randomkeys(N div 2 + 1, N, 1, 4), KL2 = generate_indexkeys(N div 2), @@ -2952,23 +2980,22 @@ tombcount_test() -> OptsSST = #sst_options{press_method=native, log_options=leveled_log:get_opts()}, - {ok, SST1, KD, BB} = sst_newmerge(RP, Filename, - KVL1, KVL2, false, 2, - N, OptsSST, false, false), + {ok, SST1, KD, BB} = sst_newmerge(RP, Filename, + KVL1, KVL2, false, Level, + N, OptsSST, false, false), ?assertMatch(not_counted, sst_gettombcount(SST1)), ok = sst_close(SST1), ok = file:delete(filename:join(RP, Filename ++ ".sst")), - {ok, SST2, KD, BB} = sst_newmerge(RP, Filename, - KVL1, KVL2, false, 2, - N, OptsSST, false, true), - + {ok, SST2, KD, BB} = sst_newmerge(RP, Filename, + KVL1, KVL2, false, Level, + N, OptsSST, false, true), + ?assertMatch(ExpectedCount, sst_gettombcount(SST2)), ok = sst_close(SST2), ok = file:delete(filename:join(RP, Filename ++ ".sst")). - form_slot_test() -> % If a skip key happens, mustn't switch to loookup by accident as could be % over the expected size @@ -3275,7 +3302,68 @@ test_binary_slot(FullBin, Key, Hash, ExpectedValue) -> % io:format(user, "Fetch success in ~w microseconds ~n", % [timer:now_diff(os:timestamp(), SW)]). - +doublesize_test_() -> + {timeout, 300, fun doublesize_tester/0}. + +doublesize_tester() -> + io:format(user, "~nPreparing key lists for test~n", []), + Contents = lists:ukeysort(1, generate_randomkeys(1, 65000, 1, 6)), + SplitFun = + fun({K, V}, {L1, L2}) -> + case length(L1) > length(L2) of + true -> + {L1, [{K, V}|L2]}; + _ -> + {[{K, V}|L1], L2} + end + end, + {KVL1, KVL2} = lists:foldr(SplitFun, {[], []}, Contents), + + io:format(user, "Running tests over different sizes:~n", []), + + size_tester(lists:sublist(KVL1, 4000), lists:sublist(KVL2, 4000), 8000), + size_tester(lists:sublist(KVL1, 16000), lists:sublist(KVL2, 16000), 32000), + size_tester(lists:sublist(KVL1, 24000), lists:sublist(KVL2, 24000), 48000), + size_tester(lists:sublist(KVL1, 32000), lists:sublist(KVL2, 32000), 64000). 
+ +size_tester(KVL1, KVL2, N) -> + io:format(user, "~nStarting ... test with ~w keys ~n", [N]), + + {RP, Filename} = {?TEST_AREA, "doublesize_test"}, + OptsSST = + #sst_options{press_method=native, + log_options=leveled_log:get_opts()}, + {ok, SST1, _KD, _BB} = sst_newmerge(RP, Filename, + KVL1, KVL2, false, ?DOUBLESIZE_LEVEL, + N, OptsSST, false, false), + ok = sst_close(SST1), + {ok, SST2, _SKEK, Bloom} = + sst_open(RP, Filename ++ ".sst", OptsSST, ?DOUBLESIZE_LEVEL), + FetchFun = + fun({K, V}) -> + {K0, V0} = sst_get(SST2, K), + ?assertMatch(K, K0), + ?assertMatch(V, V0) + end, + lists:foreach(FetchFun, KVL1 ++ KVL2), + + CheckBloomFun = + fun({K, _V}) -> + leveled_ebloom:check_hash(leveled_codec:segment_hash(K), Bloom) + end, + KBIn = length(lists:filter(CheckBloomFun, KVL1 ++ KVL2)), + KBOut = + length(lists:filter(CheckBloomFun, + generate_randomkeys(1, 1000, 7, 9))), + + ?assertMatch(N, KBIn), + + io:format(user, "~w false positives in 1000~n", [KBOut]), + + ok = sst_close(SST2), + ok = file:delete(filename:join(RP, Filename ++ ".sst")). + + merge_test() -> filelib:ensure_dir(?TEST_AREA), merge_tester(fun testsst_new/6, fun testsst_new/8).
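
The sketch below pulls the two sizing ideas in this patch out of the diff for reference: `maxslots_level/2` keeps the configured slot budget for levels shallower than `?DOUBLESIZE_LEVEL` and doubles it from level 3 downwards, while `create_bloom/1` copes with the larger files by partitioning an oversized hash list on a single hash bit so each half is built into its own fixed-size bloom. It is a minimal, standalone illustration rather than the leveled implementation: the module name `doublesize_sketch`, the function `split_for_bloom/1`, and the `?MAX_EFFICIENT_HASHES` macro are invented for the example, and it assumes each hash entry is a two-element tuple whose second element is the integer hash, as in the patch's partition fun.

```
%% doublesize_sketch.erl - illustrative only; apart from maxslots_level/2
%% the names here are not part of leveled.
-module(doublesize_sketch).

-export([maxslots_level/2, split_for_bloom/1]).

-define(DOUBLESIZE_LEVEL, 3).
%% Assumed threshold, mirroring the `L when L > 32768` clause in create_bloom/1.
-define(MAX_EFFICIENT_HASHES, 32768).

%% Levels shallower than ?DOUBLESIZE_LEVEL keep the configured slot budget;
%% deeper levels get twice as many slots per file, so each file holds
%% roughly twice as many keys.
-spec maxslots_level(non_neg_integer(), pos_integer()) -> pos_integer().
maxslots_level(Level, MaxSlotCount) when Level < ?DOUBLESIZE_LEVEL ->
    MaxSlotCount;
maxslots_level(_Level, MaxSlotCount) ->
    2 * MaxSlotCount.

%% Once a file can hold more hashes than a single bloom handles efficiently,
%% partition the hash list on one bit of the hash.  Each half can then be
%% built into its own fixed-size bloom and the two binaries concatenated;
%% a lookup inspects the same bit to decide which half to check.
-spec split_for_bloom([{term(), non_neg_integer()}]) ->
        single_bloom | {split_bloom, list(), list()}.
split_for_bloom(HashList) when length(HashList) =< ?MAX_EFFICIENT_HASHES ->
    single_bloom;
split_for_bloom(HashList) ->
    {HL0, HL1} =
        lists:partition(fun({_Seg, Hash}) -> Hash band 32 == 0 end, HashList),
    {split_bloom, HL0, HL1}.
```

If the configured budget were, say, 256 slots of 128 keys, `maxslots_level(2, 256)` returns 256 while `maxslots_level(3, 256)` returns 512 - which would line up with `create_bloom/1` gaining a branch for hash lists longer than 32768 and with `doublesize_tester/0` exercising key counts up to 64000.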