Settled on sizes

Also removed length check due to warning in Erlang guidance about
non-constant time nature of this command.  Intend to remove lengths from
elsewhere (especially when used simply for logging).
This commit is contained in:
martinsumner 2016-12-24 00:41:50 +00:00
parent b1a3b4ad13
commit 4f838f6f88
2 changed files with 42 additions and 11 deletions

View file

@ -18,7 +18,6 @@
-export([
from_list/1,
from_list/2,
to_sstlist/1,
from_sortedlist/1,
from_sortedlist/2,
to_list/1,
@ -38,7 +37,6 @@
%% Passed as the second and third arguments to from_list/3 when building the
%% default skiplist in from_sortedlist/2.
-define(SKIP_WIDTH, 16).
-define(LIST_HEIGHT, 2).
%% Used in place of ?SKIP_WIDTH when to_sstlist/1 calls from_list/3.
-define(SST_WIDTH, 16).
%% NOTE(review): presumably a sentinel key that sorts after every real key;
%% its use is not visible here - confirm.
-define(INFINITY_KEY, {null, null, null, null, null}).
%% NOTE(review): presumably the size in bits of the bloom bit array; its use
%% is not visible here - confirm.
-define(BITARRAY_SIZE, 2048).
@ -96,9 +94,6 @@ from_sortedlist(SortedKVL, BloomProtect) ->
end,
{Bloom0, from_list(SortedKVL, ?SKIP_WIDTH, ?LIST_HEIGHT)}.
%% Build a skiplist from SortedKVL using the SST width and tag the result
%% with list_only in the first tuple position (the position lookup/2 inspects
%% via element(1, SkipList)).
to_sstlist(SortedKVL) ->
    SkipList = from_list(SortedKVL, ?SST_WIDTH, ?LIST_HEIGHT),
    {list_only, SkipList}.
lookup(Key, SkipList) ->
case element(1, SkipList) of
list_only ->

View file

@ -3,8 +3,7 @@
%% A FSM module intended to wrap a persisted, ordered view of Keys and Values
%%
%% The persisted view is built from a list (which may be created by merging
%% multiple lists).
%%
%% multiple lists). The list is built first, then the view is created in bulk.
%%
%% -------- Slots ---------
%%
@ -21,7 +20,7 @@
%% flatten back to list - 164 microseconds
%%
%% GBTree:
%% build and serialise tree 402 microseconds
%% build and serialise tree 1433 microseconds
%% de-serialise and check * 128 - 15263 microseconds
%% flatten back to list - 175 microseconds
%%
@ -30,6 +29,28 @@
%% the size of the slot, whereas the serialisation time is relatively constant
%% with growth. So bigger slots would be quicker to build, but the penalty for
%% that speed is too high at lookup time.
%%
%% -------- Blooms ---------
%%
%% There are two different tiny blooms for each table. One is split by the
%% first byte of the hash, and consists of two hashes (derived from the
%% remainder of the hash). This is the top bloom, and the size varies by
%% level.
%% Level 0 has 8 bits per key - 0.05 fpr
%% Level 1 has 6 bits per key - 0.08 fpr
%% Other Levels have 4 bits per key - 0.15 fpr
%%
%% If this level is passed, then each slot has its own bloom based on the
%% same hash, but now split into three hashes and having a fixed 8 bit per
%% key size at all levels.
%% Slot Bloom has 8 bits per key - 0.03 fpr
%%
%% All blooms are based on the DJ Bernstein magic hash which proved to give
%% the predicted fpr in tests (unlike phash2 which has significantly higher
%% fpr). Due to the cost of producing the magic hash, it is read from the
%% value not reproduced each time. If the value is set to no_lookup no bloom
%% entry is added, and if all hashes are no_lookup in the slot then no bloom
%% is produced.
-module(leveled_sst).
@ -38,6 +59,7 @@
%% Target number of key/value pairs held in a single slot.
-define(SLOT_SIZE, 128).
%% Level given to the {compressed, Level} option of term_to_binary/2 when the
%% table summary is serialised.
-define(COMPRESSION_LEVEL, 1).
%% {Level, BloomSlots} pairs used to size the table-level bloom in
%% build_table_summary/3; the default entry applies to any level not listed.
-define(LEVEL_BLOOM_SLOTS, [{0, 64}, {1, 48}, {default, 32}]).
-include_lib("eunit/include/eunit.hrl").
@ -57,6 +79,20 @@
%%% Internal Functions
%%%============================================================================
%% Build the serialised summary for a table: a skiplist index over the slots
%% together with the table-level bloom.
%%
%% SlotIndex - list of {LastKey, SlotIndexValue} pairs accumulated by
%%             prepending, so it is reversed before the skiplist is built.
%% AllHashes - the hashes to be entered into the table-level bloom.
%% Level     - level of the table; selects the bloom size from
%%             ?LEVEL_BLOOM_SLOTS, using the default entry when the level is
%%             not listed.
%%
%% Returns the compressed external term format binary of {SkipSlot, Bloom}.
build_table_summary(SlotIndex, AllHashes, Level) ->
    BloomSlots =
        %% lists:keyfind/3 needs the tuple position of the key (1 here).
        case lists:keyfind(Level, 1, ?LEVEL_BLOOM_SLOTS) of
            {Level, N} ->
                N;
            false ->
                {default, Default} =
                    lists:keyfind(default, 1, ?LEVEL_BLOOM_SLOTS),
                Default
        end,
    %% The empty bloom must come from the same module as enter/2 so the fold
    %% operates on a structure that enter/2 understands.
    Bloom = lists:foldr(fun leveled_tinybloom:enter/2,
                        leveled_tinybloom:empty(BloomSlots),
                        AllHashes),
    SkipSlot = leveled_skiplist:from_sortedlist(lists:reverse(SlotIndex)),
    term_to_binary({SkipSlot, Bloom}, [{compressed, ?COMPRESSION_LEVEL}]).
%% Entry point for slot building. Starts the five-argument worker with no
%% hashes collected, slot numbering beginning at 1 and an empty slot index.
build_all_slots(KVList, BasePosition) ->
    InitialHashes = [],
    FirstSlotID = 1,
    InitialSlotIndex = [],
    build_all_slots(KVList,
                    BasePosition,
                    InitialHashes,
                    FirstSlotID,
                    InitialSlotIndex).
@ -75,8 +111,8 @@ build_all_slots(KVList, StartPosition, AllHashes, SlotID, SlotIndex) ->
[{hash, H}|Acc]
end
end,
HashList = lists:foldr(ExtractHashFun, [], KVList),
{SlotBin, Bloom} = build_slot(KVList, HashList),
HashList = lists:foldr(ExtractHashFun, [], SlotList),
{SlotBin, Bloom} = build_slot(SlotList, HashList),
Length = byte_size(SlotBin),
SlotIndexV = #slot_index_value{slot_id = SlotID,
bloom = Bloom,
@ -89,7 +125,7 @@ build_all_slots(KVList, StartPosition, AllHashes, SlotID, SlotIndex) ->
[{LastKey, SlotIndexV}|SlotIndex]).
build_slot(KVList, HashList) when length(KVList) =< ?SLOT_SIZE ->
build_slot(KVList, HashList) ->
Tree = gb_trees:from_orddict(KVList),
Bloom = lists:foldr(fun leveled_tinybloom:tiny_enter/2,
leveled_tinybloom:tiny_empty(),