From f8c749497f62d9383e1399dfa40499e53ae0ca46 Mon Sep 17 00:00:00 2001 From: alisdair sullivan Date: Tue, 5 Mar 2013 19:40:57 -0800 Subject: [PATCH] updated comments in decoder --- src/jsx_decoder.erl | 95 ++++++++++++++++++++++++++------------------- 1 file changed, 54 insertions(+), 41 deletions(-) diff --git a/src/jsx_decoder.erl b/src/jsx_decoder.erl index 3e08c2c..688a324 100644 --- a/src/jsx_decoder.erl +++ b/src/jsx_decoder.erl @@ -23,7 +23,7 @@ -module(jsx_decoder). -%% inline sequence accumulation, handle_event and format_number +%% inline sequence accumulation, handle_event, format_number and maybe_replace -compile({inline, [new_seq/0, new_seq/1, acc_seq/2, end_seq/1, end_seq/2]}). -compile({inline, [handle_event/3]}). -compile({inline, [format_number/1]}). @@ -38,6 +38,38 @@ decoder(Handler, State, Config) -> fun(JSON) -> start(JSON, {Handler, Handler:init(State)}, [], jsx_utils:parse_config(Config)) end. +%% resume allows continuation from interrupted decoding without having to explicitly export +%% all states +-spec resume( + Rest::binary(), + State::atom(), + Handler::{atom(), any()}, + Acc::any(), + Stack::list(atom()), + Config::jsx:config() + ) -> jsx:decoder(). 
+ +resume(Rest, State, Handler, Acc, Stack, Config) -> + case State of + start -> start(Rest, Handler, Stack, Config); + value -> value(Rest, Handler, Stack, Config); + object -> object(Rest, Handler, Stack, Config); + array -> array(Rest, Handler, Stack, Config); + colon -> colon(Rest, Handler, Stack, Config); + key -> key(Rest, Handler, Stack, Config); + string -> string(Rest, Handler, Acc, Stack, Config); + integer -> integer(Rest, Handler, Acc, Stack, Config); + decimal -> decimal(Rest, Handler, Acc, Stack, Config); + exp -> exp(Rest, Handler, Acc, Stack, Config); + true -> true(Rest, Handler, Stack, Config); + false -> false(Rest, Handler, Stack, Config); + null -> null(Rest, Handler, Stack, Config); + comment -> comment(Rest, Handler, Acc, Stack, Config); + maybe_done -> maybe_done(Rest, Handler, Stack, Config); + done -> done(Rest, Handler, Stack, Config) + end. + + -include("jsx_config.hrl"). @@ -122,27 +154,7 @@ incomplete(State, Rest, Handler, Acc, Stack, Config=#config{incomplete_handler=F F(Rest, {decoder, State, Handler, Acc, Stack}, jsx_utils:config_to_list(Config)). -resume(Rest, State, Handler, Acc, Stack, Config) -> - case State of - start -> start(Rest, Handler, Stack, Config); - value -> value(Rest, Handler, Stack, Config); - object -> object(Rest, Handler, Stack, Config); - array -> array(Rest, Handler, Stack, Config); - colon -> colon(Rest, Handler, Stack, Config); - key -> key(Rest, Handler, Stack, Config); - string -> string(Rest, Handler, Acc, Stack, Config); - integer -> integer(Rest, Handler, Acc, Stack, Config); - decimal -> decimal(Rest, Handler, Acc, Stack, Config); - exp -> exp(Rest, Handler, Acc, Stack, Config); - true -> true(Rest, Handler, Stack, Config); - false -> false(Rest, Handler, Stack, Config); - null -> null(Rest, Handler, Stack, Config); - comment -> comment(Rest, Handler, Acc, Stack, Config); - maybe_done -> maybe_done(Rest, Handler, Stack, Config); - done -> done(Rest, Handler, Stack, Config) - end. 
- - +%% lists are benchmarked to be faster (tho higher in memory usage) than binaries new_seq() -> []. new_seq(C) -> [C]. @@ -278,8 +290,10 @@ key(Bin, Handler, Stack, Config) -> ?error(key, Bin, Handler, Stack, Config). -%% explicitly whitelist ascii set for better efficiency (seriously, it's worth -%% almost a 20% increase) +%% explicitly whitelist ascii set for faster parsing. really? really. someone should +%% submit a patch that unrolls simple guards +%% note that if you encounter an error from string and you can't find the clause that +%% caused it here, it might be in unescape below string(<<32, Rest/binary>>, Handler, Acc, Stack, Config) -> string(Rest, Handler, acc_seq(Acc, 32), Stack, Config); string(<<33, Rest/binary>>, Handler, Acc, Stack, Config) -> @@ -534,13 +548,14 @@ string(<>, Handler, Acc, Stack, Config) when X >= 16#100000 string(<<237, X, _, Rest/binary>>, Handler, Acc, Stack, Config=#config{replaced_bad_utf8=true}) when X >= 160 -> string(Rest, Handler, acc_seq(Acc, 16#fffd), Stack, Config); -%% u+fffe and u+ffff for R14BXX -string(<<239, 191, X, Rest/binary>>, Handler, Acc, Stack, Config=#config{replaced_bad_utf8=true}) - when X == 190; X == 191 -> - string(Rest, Handler, acc_seq(Acc, 16#fffd), Stack, Config); %% u+xfffe, u+xffff, control codes and other noncharacters string(<<_/utf8, Rest/binary>>, Handler, Acc, Stack, Config=#config{replaced_bad_utf8=true}) -> string(Rest, Handler, acc_seq(Acc, 16#fffd), Stack, Config); +%% u+fffe and u+ffff for R14BXX (subsequent runtimes will happily match the +%% preceding clause) +string(<<239, 191, X, Rest/binary>>, Handler, Acc, Stack, Config=#config{replaced_bad_utf8=true}) + when X == 190; X == 191 -> + string(Rest, Handler, acc_seq(Acc, 16#fffd), Stack, Config); %% overlong encodings and missing continuations of a 2 byte sequence string(<>, Handler, Acc, Stack, Config=#config{replaced_bad_utf8=true}) when X >= 192, X =< 223 -> @@ -562,10 +577,6 @@ string(Bin, Handler, Acc, Stack, Config) -> false
-> ?error(string, Bin, Handler, Acc, Stack, Config) end. - -%% string appends it's output to the term at the top of the stack. for -%% efficiency the strings are build in reverse order and reversed before -%% being added to the output stream %% when parsing strings, the naive detection of partial codepoints is %% insufficient. this incredibly anal function should detect all badly formed %% utf sequences @@ -579,28 +590,29 @@ is_partial_utf(<>) true; is_partial_utf(_) -> false. - %% strips continuation bytes after bad utf bytes, guards against both too short %% and overlong sequences. N is the maximum number of bytes to strip -%% if end of input is reached before stripping the max number of continuations -%% possible magic numbers are reinserted into the stream that get us back to -%% the same state without complicated machinery strip_continuations(Rest, Handler, Acc, Stack, Config, 0) -> string(Rest, Handler, acc_seq(Acc, 16#fffd), Stack, Config); strip_continuations(<>, Handler, Acc, Stack, Config, N) when X >= 128, X =< 191 -> strip_continuations(Rest, Handler, Acc, Stack, Config, N - 1); -%% incomplete +%% if end of input is reached before stripping the max number of continuations +%% possible magic numbers are reinserted into the stream that get us back to +%% the same state without complicated machinery strip_continuations(<<>>, Handler, Acc, Stack, Config, N) -> case N of 1 -> incomplete(string, <<192>>, Handler, Acc, Stack, Config); 2 -> incomplete(string, <<224>>, Handler, Acc, Stack, Config); 3 -> incomplete(string, <<240>>, Handler, Acc, Stack, Config) end; -%% not a continuation byte, dispatch back to string +%% not a continuation byte, insert a replacement character for sequence thus +%% far and dispatch back to string strip_continuations(Rest, Handler, Acc, Stack, Config, _) -> string(Rest, Handler, acc_seq(Acc, 16#fffd), Stack, Config). 
+%% this all gets really gross and should probably eventually be folded into +%% string, but for now it fakes being part of string on incompletes and errors unescape(<>, Handler, Acc, Stack, Config=#config{dirty_strings=true}) -> string(Rest, Handler, acc_seq(Acc, [?rsolidus, C]), Stack, Config); unescape(<<$b, Rest/binary>>, Handler, Acc, Stack, Config) -> @@ -696,8 +708,9 @@ maybe_replace(X, #config{escaped_strings=true}) when X < 32 -> maybe_replace(X, _Config) -> [X]. -%% like strings, numbers are collected in an intermediate accumulator before -%% being emitted to the callback handler +%% like in strings, there are some pseudo states in here that will never +%% show up in errors or incompletes. some show up in value, some show +%% up in integer, decimal or exp negative(<<$0, Rest/binary>>, Handler, Acc, Stack, Config) -> zero(Rest, Handler, acc_seq(Acc, $0), Stack, Config); negative(<>, Handler, Acc, Stack, Config) when ?is_nonzero(S) -> @@ -732,6 +745,7 @@ integer(Bin, Handler, Acc, Stack, Config) -> decimal(<>, Handler, Acc, Stack, Config) when S=:= ?zero; ?is_nonzero(S) -> decimal(Rest, Handler, acc_seq(Acc, S), Stack, Config); +%% guard against the insidious `1.e1` error decimal(<>, Handler, Acc, Stack, Config) when S =:= $e; S =:= $E -> case Acc of [?decimalpoint|_] -> ?error(decimal, <>, Handler, Acc, Stack, Config); @@ -799,7 +813,6 @@ finish_number(Bin, Handler, {NumType, Acc}, Stack, Config) -> ?error(value, <<$0, Bin/binary>>, Handler, OldAcc, Stack, Config) end. - format_number({zero, Acc}) -> {integer, list_to_integer(lists:reverse(Acc))}; format_number({integer, Acc}) -> {integer, list_to_integer(lists:reverse(Acc))}; format_number({decimal, Acc}) -> {float, list_to_float(lists:reverse(Acc))};