Merge pull request #167 from russelldb/rdb/spec-folds-cp

Provide a top level API for folds
2018-09-07 11:18:23 +01:00 · 2018-09-07 11:18:23 +01:00 · faec45ad8e
commit faec45ad8e
parent 3249cc8d3e 3a2d4aa496
6 changed files with 450 additions and 102 deletions
--- a/src/leveled_bookie.erl
+++ b/src/leveled_bookie.erl
@ -58,7 +58,6 @@
        book_get/4,
        book_head/3,
        book_head/4,
-        book_returnfolder/2,
        book_snapshot/4,
        book_compactjournal/2,
        book_islastcompactionpending/1,
@ -67,6 +66,21 @@
        book_destroy/1,
        book_isempty/2]).

+%% folding API
+-export([
+         book_returnfolder/2,
+         book_indexfold/5,
+         book_bucketlist/4,
+         book_keylist/3,
+         book_keylist/4,
+         book_keylist/5,
+         book_objectfold/4,
+         book_objectfold/5,
+         book_objectfold/6,
+         book_headfold/6,
+         book_headfold/7
+        ]).
+
 -export([empty_ledgercache/0,
            loadqueue_ledgercache/1,
            push_ledgercache/2,
@ -530,6 +544,298 @@ book_head(Pid, Bucket, Key) ->
 book_returnfolder(Pid, RunnerType) ->
    gen_server:call(Pid, {return_runner, RunnerType}, infinity).

+%% @doc Builds and returns an `{async, Runner}' pair for secondary
+%% index queries. Calling `Runner' will fold over keys (ledger) tagged
+%% with the index `?IDX_TAG' and constrain the fold to a specific
+%% `Bucket''s index fields, as specified by the `Constraint'
+%% argument. If `Constraint' is a tuple of `{Bucket, Key}' the fold
+%% starts at `Key' (this is useful for implementing pagination, for
+%% example.)  Provide a `FoldAccT' tuple of fold fun (which is a
+%% 3-arity fun that will be called once per matching index entry, with
+%% the Bucket, Primary Key (or `{IndexVal, Primary Key}' if
+%% `ReturnTerms' is true)) and an initial Accumulator, which will be
+%% passed as the 3rd argument in the initial call to
+%% FoldFun. Subsequent calls to FoldFun will use the previous return
+%% of FoldFun as the 3rd argument, and the final return of `Runner' is
+%% the final return of `FoldFun', the final Accumulator value. The
+%% query can filter inputs based on `Range' and `TermHandling'.
+%% `Range' specifies the name of `IndexField' to query, and `Start'
+%% and `End' optionally provide the range to query over.
+%% `TermHandling' is a 2-tuple, the first element is a `boolean()',
+%% `true' meaning return terms, (see fold fun above), `false' meaning
+%% just return primary keys. `TermRegex' is either a regular
+%% expression of type `re:mp()' (that will be run against each index
+%% term value, and only those that match will be accumulated) or
+%% `undefined', which means no regular expression filtering of index
+%% values.
+-spec book_indexfold(pid(), Constraint, FoldAccT, Range, TermHandling) ->
+                            {async, Runner} when
+      Constraint :: {Bucket, Key} | Bucket,
+      FoldAccT :: {FoldFun, Acc},
+      Range :: {IndexField, Start, End},
+      TermHandling :: {ReturnTerms, TermRegex},
+      Bucket :: term(),
+      Key :: term(),
+      FoldFun :: fun((Bucket, Key | {IndexVal, Key}, Acc) -> Acc),
+      Acc :: term(),
+      IndexField :: term(),
+      IndexVal :: term(),
+      Start :: IndexVal,
+      End :: IndexVal,
+      ReturnTerms :: boolean(),
+      TermRegex :: re:mp() | undefined,
+      Runner :: fun().
+
+book_indexfold(Pid, Constraint, FoldAccT, Range, TermHandling) ->
+    %% Wrap the arguments in an `index_query' runner request and hand it
+    %% to book_returnfolder/2, which makes the call to the bookie.
+    book_returnfolder(
+        Pid, {index_query, Constraint, FoldAccT, Range, TermHandling}).
+
+
+%% @doc list buckets. Folds over the ledger only. Given a `Tag' folds
+%% over the keyspace calling `FoldFun' from `FoldAccT' for each
+%% `Bucket'. `FoldFun' is a 2-arity function that is passed `Bucket'
+%% and `Acc'. On first call `Acc' is the initial `Acc' from
+%% `FoldAccT', thereafter the result of the previous call to
+%% `FoldFun'. `Constraint' can be either atom `all' or `first' meaning
+%% return all buckets, or just the first one found. Returns `{async,
+%% Runner}' where `Runner' is a fun that returns the final value of
+%% `FoldFun', the final `Acc' accumulator.
+-spec book_bucketlist(pid(), Tag, FoldAccT, Constraint) ->
+                             {async, Runner} when
+      Tag :: leveled_codec:tag(),
+      FoldAccT :: {FoldFun, Acc},
+      FoldFun :: fun((Bucket, Acc) -> Acc),
+      Acc :: term(),
+      Constraint :: first | all,
+      Bucket :: term(),
+      Runner :: fun(() -> Acc).
+book_bucketlist(Pid, Tag, FoldAccT, Constraint) ->
+    %% `first' asks for a runner that reports only the first bucket,
+    %% `all' for one that lists every bucket. Any other `Constraint' is
+    %% a caller error and crashes here with `case_clause'.
+    RunnerType =
+        case Constraint of
+            first -> {first_bucket, Tag, FoldAccT};
+            all -> {bucket_list, Tag, FoldAccT}
+        end,
+    book_returnfolder(Pid, RunnerType).
+
+
+%% @doc fold over the keys (ledger only) for a given `Tag'. Each key
+%% will result in a call to `FoldFun' from `FoldAccT'. `FoldFun' is a
+%% 3-arity function, called with `Bucket', `Key' and `Acc'. The
+%% initial value of `Acc' is the second element of `FoldAccT'. Returns
+%% `{async, Runner}' where `Runner' is a function that will run the
+%% fold and return the final value of `Acc'
+-spec book_keylist(pid(), Tag, FoldAccT) -> {async, Runner} when
+      Tag :: leveled_codec:tag(),
+      FoldAccT :: {FoldFun, Acc},
+      FoldFun :: fun((Bucket, Key, Acc) -> Acc),
+      Bucket :: term(),
+      Key :: term(),
+      Acc :: term(),
+      Runner :: fun(() -> Acc).
+book_keylist(Pid, Tag, FoldAccT) ->
+    %% Unconstrained key listing: request a `keylist' runner for `Tag'.
+    book_returnfolder(Pid, {keylist, Tag, FoldAccT}).
+
+%% @doc as for book_keylist/3 but constrained to only those keys in
+%% `Bucket'
+-spec book_keylist(pid(), Tag, Bucket, FoldAccT) -> {async, Runner} when
+      Tag :: leveled_codec:tag(),
+      Bucket :: term(),
+      FoldAccT :: {FoldFun, Acc},
+      FoldFun :: fun((Bucket, Key, Acc) -> Acc),
+      Key :: term(),
+      Acc :: term(),
+      Runner :: fun(() -> Acc).
+book_keylist(Pid, Tag, Bucket, FoldAccT) ->
+    %% Key listing limited to one bucket: the 4-element `keylist' request.
+    book_returnfolder(Pid, {keylist, Tag, Bucket, FoldAccT}).
+
+%% @doc as for book_keylist/4 with additional constraint that only
+%% keys in the `KeyRange' will be folded over. `KeyRange' is either a
+%% tuple `{StartKey, EndKey}' - `StartKey' being the first key in the
+%% range and `EndKey' the last (inclusive) - or the atom `all', which
+%% will return all keys in the `Bucket'.
+-spec book_keylist(pid(), Tag, Bucket, KeyRange, FoldAccT) -> {async, Runner} when
+      Tag :: leveled_codec:tag(),
+      Bucket :: term(),
+      KeyRange :: {StartKey, EndKey} | all,
+      StartKey :: Key,
+      EndKey :: Key,
+      Key :: term(),
+      FoldAccT :: {FoldFun, Acc},
+      FoldFun :: fun((Bucket, Key, Acc) -> Acc),
+      Acc :: term(),
+      Runner :: fun(() -> Acc).
+book_keylist(Pid, Tag, Bucket, KeyRange, FoldAccT) ->
+    %% Key listing limited to a bucket and key range: the 5-element
+    %% `keylist' request. `KeyRange' is passed through untouched.
+    book_returnfolder(Pid, {keylist, Tag, Bucket, KeyRange, FoldAccT}).
+
+%% @doc fold over all the objects/values in the store in key
+%% order. `Tag' is the tagged type of object. `FoldAccT' is a 2-tuple,
+%% the first element being a 4-arity fun, that is called once for each
+%% key with the arguments `Bucket', `Key', `Value', `Acc'. The 2nd
+%% element is the initial accumulator `Acc' which is passed to
+%% `FoldFun' on its first call. Thereafter the return value from
+%% `FoldFun' is the 4th argument to the next call of
+%% `FoldFun'. `SnapPreFold' is a boolean where `true' means take the
+%% snapshot at once, and `false' means take the snapshot when the
+%% returned `Runner' is executed. Return `{async, Runner}' where
+%% `Runner' is a 0-arity function that returns the final accumulator
+%% from `FoldFun'
+-spec book_objectfold(pid(), Tag, FoldAccT, SnapPreFold) -> {async, Runner} when
+      Tag :: leveled_codec:tag(),
+      FoldAccT :: {FoldFun, Acc},
+      FoldFun :: fun((Bucket, Key, Value, Acc) -> Acc),
+      Bucket :: term(),
+      Key :: term(),
+      Value :: term(),
+      Acc :: term(),
+      SnapPreFold :: boolean(),
+      Runner :: fun(() -> Acc).
+book_objectfold(Pid, Tag, FoldAccT, SnapPreFold) ->
+    %% All-keys object fold without an explicit order: the 4-element
+    %% `foldobjects_allkeys' request.
+    book_returnfolder(Pid, {foldobjects_allkeys, Tag, FoldAccT, SnapPreFold}).
+
+%% @doc exactly as book_objectfold/4 with the additional parameter
+%% `Order'. `Order' can be `sqn_order' or `key_order'. In
+%% book_objectfold/4 and book_objectfold/6 `key_order' is
+%% implied. This function called with `Order == key_order' is
+%% identical to book_objectfold/4. NOTE: if you must fold over ALL
+%% objects, `sqn_order' is quicker than `key_order' due to accessing
+%% the journal objects in their on-disk order, not via a fold over the
+%% ledger.
+-spec book_objectfold(pid(), Tag, FoldAccT, SnapPreFold, Order) -> {async, Runner} when
+      Tag :: leveled_codec:tag(),
+      FoldAccT :: {FoldFun, Acc},
+      FoldFun :: fun((Bucket, Key, Value, Acc) -> Acc),
+      Bucket :: term(),
+      Key :: term(),
+      Value :: term(),
+      Acc :: term(),
+      SnapPreFold :: boolean(),
+      Order :: key_order | sqn_order,
+      Runner :: fun(() -> Acc).
+book_objectfold(Pid, Tag, FoldAccT, SnapPreFold, Order) ->
+    %% All-keys object fold with an explicit order: the 5-element
+    %% `foldobjects_allkeys' request, `Order' being its last element.
+    book_returnfolder(
+        Pid, {foldobjects_allkeys, Tag, FoldAccT, SnapPreFold, Order}).
+
+%% @doc as book_objectfold/4, with the addition of some constraints on
+%% the range of objects folded over. The 3rd argument `Bucket' limits
+%% this fold to that specific bucket only. The 4th argument `Limiter'
+%% further constrains the fold. `Limiter' can be either a `Range' or
+%% `Index' query. `Range' is either the atom `all', meaning {min,
+%% max}, or a two-tuple of start key and end key, inclusive. Index
+%% Query is a 3-tuple of `{IndexField, StartTerm, EndTerm}', just as
+%% in book_indexfold/5
+-spec book_objectfold(pid(), Tag, Bucket, Limiter, FoldAccT, SnapPreFold) ->
+                             {async, Runner} when
+      Tag :: leveled_codec:tag(),
+      FoldAccT :: {FoldFun, Acc},
+      FoldFun :: fun((Bucket, Key, Value, Acc) -> Acc),
+      Acc :: term(),
+      Bucket :: term(),
+      Key :: term(),
+      Value :: term(),
+      Limiter :: Range | Index,
+      Range :: {StartKey, EndKey} | all,
+      Index :: {IndexField, Start, End},
+      IndexField::term(),
+      IndexVal::term(),
+      Start::IndexVal,
+      End::IndexVal,
+      StartKey :: Key,
+      EndKey :: Key,
+      SnapPreFold :: boolean(),
+      Runner :: fun(() -> Acc).
+book_objectfold(Pid, Tag, Bucket, Limiter, FoldAccT, SnapPreFold) ->
+    %% Dispatch on the shape of `Limiter': the atom `all' or a 2-tuple
+    %% key range selects a by-bucket fold, a 3-tuple selects a by-index
+    %% fold. `Limiter' is passed through unchanged in every case. Any
+    %% other shape is a caller error and crashes with `case_clause'.
+    RunnerType =
+        case Limiter of
+            all ->
+                {foldobjects_bybucket, Tag, Bucket, all, FoldAccT, SnapPreFold};
+            {_StartKey, _EndKey} ->
+                {foldobjects_bybucket, Tag, Bucket, Limiter, FoldAccT, SnapPreFold};
+            {_IndexField, _Start, _End} ->
+                {foldobjects_byindex, Tag, Bucket, Limiter, FoldAccT, SnapPreFold}
+        end,
+    book_returnfolder(Pid, RunnerType).
+
+
+%% @doc LevelEd stores not just Keys in the ledger, but also may store
+%% object metadata, referred to as heads (after Riak head request for
+%% object metadata). Often when folding over objects all that is really
+%% required is the object metadata. These "headfolds" are an efficient
+%% way to fold over the ledger (possibly wholly in memory) and get
+%% object metadata.
+%%
+%% Fold over the object's head. `Tag' is the tagged type of the
+%% objects to fold over. `FoldAccT' is a 2-tuple. The 1st element is a
+%% 4-arity fold fun, that takes a Bucket, Key, ProxyObject, and the
+%% `Acc'. The ProxyObject is an object that only contains the
+%% head/metadata, and no object data from the journal. The `Acc' in
+%% the first call is that provided as the second element of `FoldAccT'
+%% and thereafter the return of the previous call to the fold fun. If
+%% `JournalCheck' is `true' then the journal is checked to see if the
+%% object in the ledger is present, which means a snapshot of the
+%% whole store is required, if `false', then no such check is
+%% performed, and only the ledger need be snapshotted. `SnapPreFold' is a
+%% boolean that determines if the snapshot is taken when the folder is
+%% requested (`true'), or when run (`false'). `SegmentList' can be
+%% `false' meaning, all heads, or a list of integers that designate
+%% segments in a TicTac Tree.
+-spec book_headfold(pid(), Tag, FoldAccT, JournalCheck, SnapPreFold, SegmentList) ->
+                           {async, Runner} when
+      Tag :: leveled_codec:tag(),
+      FoldAccT :: {FoldFun, Acc},
+      FoldFun :: fun((Bucket, Key, Value, Acc) -> Acc),
+      Bucket :: term(),
+      Key :: term(),
+      Value :: term(),
+      Acc :: term(),
+      JournalCheck :: boolean(),
+      SnapPreFold :: boolean(),
+      SegmentList :: false | list(integer()),
+      Runner :: fun(() -> Acc).
+book_headfold(Pid, Tag, FoldAccT, JournalCheck, SnapPreFold, SegmentList) ->
+    %% Head fold over every key for `Tag': the `foldheads_allkeys'
+    %% request, with the remaining arguments passed through in order.
+    book_returnfolder(
+        Pid,
+        {foldheads_allkeys, Tag, FoldAccT,
+         JournalCheck, SnapPreFold, SegmentList}).
+
+%% @doc as book_headfold/6, but with the addition of a `Limiter' that
+%% restricts the set of objects folded over. `Limiter' can either be a
+%% bucket list, or a key range of a single bucket. For bucket list,
+%% the `Limiter' should be a 2-tuple, the first element the tag
+%% `bucket_list' and the second a `list()' of `Bucket'. Only heads
+%% from the listed buckets will be folded over. A single bucket key
+%% range may also be used as a `Limiter', in which case the argument
+%% is a 3-tuple of `{range, Bucket, Range}' where `Bucket' is a
+%% bucket, and `Range' is a 2-tuple of start key and end key,
+%% inclusive, or the atom `all'. The rest of the arguments are as
+%% `book_headfold/6'
+-spec book_headfold(pid(), Tag, Limiter, FoldAccT, JournalCheck, SnapPreFold, SegmentList) ->
+                           {async, Runner} when
+      Tag :: leveled_codec:tag(),
+      Limiter :: BucketList | BucketKeyRange,
+      BucketList :: {bucket_list, list(Bucket)},
+      BucketKeyRange :: {range, Bucket, KeyRange},
+      KeyRange :: {StartKey, EndKey} | all,
+      StartKey :: Key,
+      EndKey :: Key,
+      FoldAccT :: {FoldFun, Acc},
+      FoldFun :: fun((Bucket, Key, Value, Acc) -> Acc),
+      Bucket :: term(),
+      Key :: term(),
+      Value :: term(),
+      Acc :: term(),
+      JournalCheck :: boolean(),
+      SnapPreFold :: boolean(),
+      SegmentList :: false | list(integer()),
+      Runner :: fun(() -> Acc).
+book_headfold(Pid, Tag, {bucket_list, BucketList}, FoldAccT, JournalCheck, SnapPreFold, SegmentList) ->
+    %% Bucket-list limiter: the list of buckets goes where a single bucket
+    %% would, and the atom `bucket_list' goes where the key range would.
+    book_returnfolder(
+        Pid,
+        {foldheads_bybucket, Tag, BucketList, bucket_list,
+         FoldAccT, JournalCheck, SnapPreFold, SegmentList});
+book_headfold(Pid, Tag, {range, Bucket, KeyRange}, FoldAccT, JournalCheck, SnapPreFold, SegmentList) ->
+    %% Single-bucket limiter: bucket and key range are passed as given.
+    book_returnfolder(
+        Pid,
+        {foldheads_bybucket, Tag, Bucket, KeyRange,
+         FoldAccT, JournalCheck, SnapPreFold, SegmentList}).

 -spec book_snapshot(pid(), 
                    store|ledger, 
@ -593,8 +899,7 @@ book_destroy(Pid) ->
 %% given tag
 book_isempty(Pid, Tag) ->
    FoldAccT = {fun(_B, _Acc) -> false end, true},
-    {async, Runner} = 
-        gen_server:call(Pid, {return_runner, {first_bucket, Tag, FoldAccT}}),
+    {async, Runner} = book_bucketlist(Pid, Tag, FoldAccT, first),
    Runner().

 %%%============================================================================