From c060c0e41df43475db9c3c69f2ed9b96fefece85 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Mon, 14 Jan 2019 12:27:51 +0000 Subject: [PATCH] Handle L0 cache being full A test that will cause leveled to crash due to a low cache size being set - but protect against this (as well as the general scenario of the cache being full). There could be a potential case where an L0 file is present (post pending) without work backlog being set. In this case we want to roll the level zero memory, but don't accept the cache update if the L0 cache is already full. --- src/leveled_log.erl | 4 +- src/leveled_penciller.erl | 74 ++++++++++++++++++++++++--------- src/leveled_pmem.erl | 9 +++- test/end_to_end/basic_SUITE.erl | 9 +++- 4 files changed, 74 insertions(+), 22 deletions(-) diff --git a/src/leveled_log.erl b/src/leveled_log.erl index 3bf6593..7f06ff9 100644 --- a/src/leveled_log.erl +++ b/src/leveled_log.erl @@ -139,7 +139,7 @@ {warn, "We're doomed - intention recorded to destroy all files"}}, {"P0031", {info, "Completion of update to levelzero" - ++ " with cache size status ~w ~w"}}, + ++ " with cache_size=~w status=~w and update_success=~w"}}, {"P0032", {info, "Fetch head timing with sample_count=~w and level timings of" ++ " foundmem_time=~w found0_time=~w found1_time=~w" @@ -171,6 +171,8 @@ {info, "Archiving filename ~s as unused at startup"}}, {"P0041", {info, "Penciller manifest switched from SQN ~w to ~w"}}, + {"P0042", + {warn, "Cache full so attempting roll memory with l0_size=~w"}}, {"PC001", {info, "Penciller's clerk ~w started with owner ~w"}}, diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl index 74e3756..2163e80 100644 --- a/src/leveled_penciller.erl +++ b/src/leveled_penciller.erl @@ -672,13 +672,27 @@ handle_call({push_mem, {LedgerTable, PushedIdx, MinSQN, MaxSQN}}, % % Check the approximate size of the cache. If it is over the maximum size, % trigger a background L0 file write and update state of levelzero_pending. 
- case State#state.levelzero_pending or State#state.work_backlog of - true -> + CacheUpdateBlockedByPendingWork + = State#state.levelzero_pending or State#state.work_backlog, + CacheFull = leveled_pmem:cache_full(State#state.levelzero_cache), + case {CacheUpdateBlockedByPendingWork, CacheFull} of + {true, _} -> leveled_log:log("P0018", [returned, State#state.levelzero_pending, State#state.work_backlog]), {reply, returned, State}; - false -> + {false, true} -> + leveled_log:log("P0042", [State#state.levelzero_size]), + % The cache is full (there are 127 items already in it), so + % can't accept any more. However, we need to try and roll + % memory otherwise cache may be permanently full. + gen_server:reply(From, returned), + {L0Pend, L0Constructor, none} = + maybe_roll_memory(State, false), + {noreply, + State#state{levelzero_pending=L0Pend, + levelzero_constructor=L0Constructor}}; + {false, false} -> leveled_log:log("P0018", [ok, false, false]), PushedTree = case is_tuple(LedgerTable) of @@ -689,7 +703,9 @@ handle_call({push_mem, {LedgerTable, PushedIdx, MinSQN, MaxSQN}}, ?CACHE_TYPE) end, % Reply must happen after the table has been converted - gen_server:reply(From, ok), + gen_server:reply(From, ok), + % Update LevelZero will add to the cache and maybe roll the + % cache from memory to L0 disk if the cache is too big {noreply, update_levelzero(State#state.levelzero_size, {PushedTree, PushedIdx, MinSQN, MaxSQN}, @@ -893,13 +909,16 @@ handle_call(close, _From, State) -> % on the clerk. 
ok = leveled_pclerk:clerk_close(State#state.clerk), leveled_log:log("P0008", [close]), - - L0_Present = leveled_pmanifest:key_lookup(State#state.manifest, 0, all), L0_Left = State#state.levelzero_size > 0, - case {State#state.levelzero_pending, L0_Present, L0_Left} of - {false, false, true} -> - {L0Pid, _L0Bloom} = roll_memory(State, true), - ok = leveled_sst:sst_close(L0Pid); + case {State#state.levelzero_pending, L0_Left} of + {false, true} -> + {_L0Pend, L0Pid, _L0Bloom} = maybe_roll_memory(State, true), + case is_pid(L0Pid) of + true -> + ok = leveled_sst:sst_close(L0Pid); + false -> + ok + end; StatusTuple -> leveled_log:log("P0010", [StatusTuple]) end, @@ -1249,8 +1268,6 @@ update_levelzero(L0Size, {PushedTree, PushedIdx, MinSQN, MaxSQN}, ledger_sqn=UpdMaxSQN}, CacheTooBig = NewL0Size > State#state.levelzero_maxcachesize, CacheMuchTooBig = NewL0Size > ?SUPER_MAX_TABLE_SIZE, - L0Free = - not leveled_pmanifest:levelzero_present(State#state.manifest), RandomFactor = case State#state.levelzero_cointoss of true -> @@ -1265,20 +1282,36 @@ update_levelzero(L0Size, {PushedTree, PushedIdx, MinSQN, MaxSQN}, end, NoPendingManifestChange = not State#state.work_ongoing, JitterCheck = RandomFactor or CacheMuchTooBig, - case {CacheTooBig, L0Free, JitterCheck, NoPendingManifestChange} of - {true, true, true, true} -> - {L0Constructor, none} = roll_memory(UpdState, false), - leveled_log:log_timer("P0031", [true, true], SW), - UpdState#state{levelzero_pending=true, + case {CacheTooBig, JitterCheck, NoPendingManifestChange} of + {true, true, true} -> + {L0Pend, L0Constructor, none} = + maybe_roll_memory(UpdState, false), + leveled_log:log_timer("P0031", [true, true, L0Pend], SW), + UpdState#state{levelzero_pending=L0Pend, levelzero_constructor=L0Constructor}; _ -> leveled_log:log_timer("P0031", - [CacheTooBig, JitterCheck], + [CacheTooBig, JitterCheck, false], SW), UpdState end end. 
+ +-spec maybe_roll_memory(pcl_state(), boolean()) + -> {boolean(), pid()|undefined, leveled_ebloom:bloom()|none}. +%% @doc +%% Check that no L0 file is present before rolling memory +maybe_roll_memory(State, SyncRoll) -> + BlockedByL0 = leveled_pmanifest:levelzero_present(State#state.manifest), + case BlockedByL0 of + true -> + {false, undefined, none}; + false -> + {L0Constructor, Bloom} = roll_memory(State, SyncRoll), + {true, L0Constructor, Bloom} + end. + -spec roll_memory(pcl_state(), boolean()) -> {pid(), leveled_ebloom:bloom()|none}. %% @doc @@ -1916,7 +1949,10 @@ add_missing_hash({K, {SQN, ST, MD}}) -> clean_dir_test() -> % Pointless gesture to test coverage RootPath = "../test/ledger", - ok = clean_subdir(RootPath ++ "/test.bob"). + ok = filelib:ensure_dir(RootPath), + ?assertMatch(ok, file:write_file(RootPath ++ "/test.bob", "hello")), + ok = clean_subdir(RootPath ++ "/test.bob"), + ok = file:delete(RootPath ++ "/test.bob"). archive_files_test() -> diff --git a/src/leveled_pmem.erl b/src/leveled_pmem.erl index f19103a..e2de638 100644 --- a/src/leveled_pmem.erl +++ b/src/leveled_pmem.erl @@ -38,7 +38,8 @@ add_to_index/3, new_index/0, clear_index/1, - check_index/2 + check_index/2, + cache_full/1 ]). -include_lib("eunit/include/eunit.hrl"). @@ -52,6 +53,12 @@ %%% API %%%============================================================================ +-spec cache_full(list()) -> boolean(). +%% @doc +%% If there are already 127 entries in the cache then the cache is full +cache_full(L0Cache) -> + length(L0Cache) == 127. + -spec prepare_for_index(index_array(), leveled_codec:segment_hash()) -> index_array(). 
%% @doc diff --git a/test/end_to_end/basic_SUITE.erl b/test/end_to_end/basic_SUITE.erl index b610440..5aef2e8 100644 --- a/test/end_to_end/basic_SUITE.erl +++ b/test/end_to_end/basic_SUITE.erl @@ -474,9 +474,16 @@ fetchput_snapshot(_Config) -> load_and_count(_Config) -> % Use artificially small files, and the load keys, counting they're all % present + load_and_count(50000000, 2500, 28000), + load_and_count(200000000, 100, 300000). + + +load_and_count(JournalSize, BookiesMemSize, PencillerMemSize) -> RootPath = testutil:reset_filestructure(), StartOpts1 = [{root_path, RootPath}, - {max_journalsize, 50000000}, + {max_journalsize, JournalSize}, + {cache_size, BookiesMemSize}, + {max_pencillercachesize, PencillerMemSize}, {sync_strategy, testutil:sync_strategy()}], {ok, Bookie1} = leveled_bookie:book_start(StartOpts1), {TestObject, TestSpec} = testutil:generate_testobject(),