From a7ed14b7414845398771e26c97a5392b445f33e7 Mon Sep 17 00:00:00 2001 From: alisdair sullivan Date: Tue, 1 Jun 2010 01:03:28 -0700 Subject: [PATCH] utf8, both varieties of utf16 and both varieties of utf32 are now autodetected properly (either via bom or null order) and decoded properly --- src/jsx.erl | 595 +++++++------------------------------------- src/jsx_common.hrl | 83 ++++++ src/jsx_utf16.erl | 476 +++++++++++++++++++++++++++++++++++ src/jsx_utf16le.erl | 476 +++++++++++++++++++++++++++++++++++ src/jsx_utf32.erl | 476 +++++++++++++++++++++++++++++++++++ src/jsx_utf32le.erl | 476 +++++++++++++++++++++++++++++++++++ src/jsx_utf8.erl | 476 +++++++++++++++++++++++++++++++++++ 7 files changed, 2547 insertions(+), 511 deletions(-) create mode 100644 src/jsx_common.hrl create mode 100644 src/jsx_utf16.erl create mode 100644 src/jsx_utf16le.erl create mode 100644 src/jsx_utf32.erl create mode 100644 src/jsx_utf32le.erl create mode 100644 src/jsx_utf8.erl diff --git a/src/jsx.erl b/src/jsx.erl index 2a76d59..b6a7698 100644 --- a/src/jsx.erl +++ b/src/jsx.erl @@ -24,26 +24,42 @@ -module(jsx). -author("alisdairsullivan@yahoo.ca"). --export([decoder/0, decoder/1, decoder/2]). +-export([decoder/0, decoder/1, decoder/2, detect_encoding/4]). decoder() -> decoder([]). decoder(Opts) -> - F = fun(end_of_stream, State) -> lists:reverse(State) ;(Event, State) -> [Event] ++ State end, + F = fun(end_of_stream, State) -> lists:reverse(State) ;(Event, State) -> [Event] ++ State end, decoder({F, []}, Opts). 
decoder({F, _} = Callbacks, OptsList) when is_list(OptsList), is_function(F) -> - Opts = parse_opts(OptsList), - decoder(Callbacks, Opts); + start(Callbacks, OptsList); decoder({{Mod, Fun}, State}, OptsList) when is_list(OptsList), is_atom(Mod), is_atom(Fun) -> - Opts = parse_opts(OptsList), - decoder({fun(E, S) -> Mod:Fun(E, S) end, State}, Opts); -decoder(Callbacks, Opts) -> - fun(Stream) -> try start(Stream, [], Callbacks, Opts) catch error:function_clause -> {error, badjson} ;error:badjson -> {error, badjson} end end. + start({fun(E, S) -> Mod:Fun(E, S) end, State}, OptsList). +start(Callbacks, OptsList) -> + Opts = parse_opts(OptsList), + F = case proplists:get_value(encoding, OptsList, auto) of + utf8 -> fun jsx_utf8:start/4 + ; utf16 -> fun jsx_utf16:start/4 + ; utf32 -> fun jsx_utf32:start/4 + ; utf16le -> fun jsx_utf16le:start/4 + ; utf32le -> fun jsx_utf32le:start/4 + ; auto -> fun jsx:detect_encoding/4 + end, + start(Callbacks, Opts, F). +start(Callbacks, Opts, F) -> + fun(Stream) -> + try F(Stream, [], Callbacks, Opts) + catch + error:function_clause -> {error, badjson} + ; error:badjson -> {error, badjson} + end + end. + parse_opts(Opts) -> parse_opts(Opts, {false, codepoint}). @@ -57,511 +73,68 @@ parse_opts([{escaped_unicode, Value}|Rest], {Comments, _EscapedUnicode}) -> parse_opts(Rest, {Comments, Value}); parse_opts([_UnknownOpt|Rest], Opts) -> parse_opts(Rest, Opts). - - -%% option flags - --define(comments_enabled(X), {true, _} = X). --define(escaped_unicode_to_ascii(X), {_, ascii} = X). --define(escaped_unicode_to_codepoint(X), {_, codepoint} = X). - -%% whitespace --define(space, 16#20). --define(tab, 16#09). --define(cr, 16#0D). --define(newline, 16#0A). - -%% object delimiters --define(start_object, 16#7B). --define(end_object, 16#7D). - -%% array delimiters --define(start_array, 16#5B). --define(end_array, 16#5D). - -%% kv seperator --define(comma, 16#2C). --define(quote, 16#22). --define(colon, 16#3A). 
- -%% string escape sequences --define(escape, 16#5C). --define(rsolidus, 16#5C). --define(solidus, 16#2F). --define(formfeed, 16#0C). --define(backspace, 16#08). --define(unicode, 16#75). - -%% math --define(zero, 16#30). --define(decimalpoint, 16#2E). --define(negative, 16#2D). --define(positive, 16#2B). - -%% comments --define(star, 16#2a). - --define(is_hex(Symbol), - (Symbol >= $a andalso Symbol =< $z); (Symbol >= $A andalso Symbol =< $Z); - (Symbol >= $0 andalso Symbol =< $9) -). - --define(is_nonzero(Symbol), - Symbol >= $1 andalso Symbol =< $9 -). - --define(is_noncontrol(Symbol), - Symbol >= ?space -). - --define(is_whitespace(Symbol), - Symbol =:= ?space; Symbol =:= ?tab; Symbol =:= ?cr; Symbol =:= ?newline -). -%% this code is mostly autogenerated and mostly ugly. apologies. for more insight on -%% Callbacks or Opts, see the comments accompanying decoder/2 (in jsx.erl). Stack -%% is a stack of flags used to track depth and to keep track of whether we are -%% returning from a value or a key inside objects. all pops, peeks and pushes are -%% inlined. the code that handles naked values and comments is not optimized by the -%% compiler for efficient matching, but you shouldn't be using naked values or comments -%% anyways, they are horrible and contrary to the spec. +%% first check to see if there's a bom, if not, use the rfc4627 method for determining +%% encoding. this function makes some assumptions about the validity of the stream +%% which may delay failure later than if an encoding is explicitly provided. 
+ +%% utf8 bom detection +detect_encoding(<<16#ef, 16#bb, 16#bf, Rest/binary>>, Stack, Callbacks, Opts) -> + jsx_utf8:start(Rest, Stack, Callbacks, Opts); + +%% utf32-little bom detection (this has to come before utf16-little) +detect_encoding(<<16#ff, 16#fe, 0, 0, Rest/binary>>, Stack, Callbacks, Opts) -> + jsx_utf32le:start(Rest, Stack, Callbacks, Opts); + +%% utf16-big bom detection +detect_encoding(<<16#fe, 16#ff, Rest/binary>>, Stack, Callbacks, Opts) -> + jsx_utf16:start(Rest, Stack, Callbacks, Opts); + +%% utf16-little bom detection +detect_encoding(<<16#ff, 16#fe, Rest/binary>>, Stack, Callbacks, Opts) -> + jsx_utf16le:start(Rest, Stack, Callbacks, Opts); + +%% utf32-big bom detection +detect_encoding(<<0, 0, 16#fe, 16#ff, Rest/binary>>, Stack, Callbacks, Opts) -> + jsx_utf32:start(Rest, Stack, Callbacks, Opts); + +%% utf8 null order detection +detect_encoding(<> = JSON, Stack, Callbacks, Opts) when X =/= 0, Y =/= 0 -> + jsx_utf8:start(JSON, Stack, Callbacks, Opts); -start(<>, Stack, Callbacks, Opts) when ?is_whitespace(S) -> - start(Rest, Stack, Callbacks, Opts); -start(<>, Stack, Callbacks, Opts) -> - object(Rest, [key|Stack], fold(start_object, Callbacks), Opts); -start(<>, Stack, Callbacks, Opts) -> - array(Rest, [array|Stack], fold(start_array, Callbacks), Opts); -start(<>, Stack, Callbacks, Opts) -> - string(Rest, Stack, Callbacks, Opts, []); -start(<<$t, Rest/binary>>, Stack, Callbacks, Opts) -> - tr(Rest, Stack, Callbacks, Opts); -start(<<$f, Rest/binary>>, Stack, Callbacks, Opts) -> - fa(Rest, Stack, Callbacks, Opts); -start(<<$n, Rest/binary>>, Stack, Callbacks, Opts) -> - nu(Rest, Stack, Callbacks, Opts); -start(<>, Stack, Callbacks, Opts) -> - negative(Rest, Stack, Callbacks, Opts, "-"); -start(<>, Stack, Callbacks, Opts) -> - zero(Rest, Stack, Callbacks, Opts, "0"); -start(<>, Stack, Callbacks, Opts) when ?is_nonzero(S) -> - integer(Rest, Stack, Callbacks, Opts, [S]); -start(<>, Stack, Callbacks, ?comments_enabled(Opts)) -> - 
maybe_comment(Rest, fun(Resume) -> start(Resume, Stack, Callbacks, Opts) end); -start(<<>>, Stack, Callbacks, Opts) -> - {incomplete, fun(Stream) -> start(Stream, Stack, Callbacks, Opts) end}. +%% utf32-little null order detection +detect_encoding(<> = JSON, Stack, Callbacks, Opts) when X =/= 0 -> + jsx_utf32le:start(JSON, Stack, Callbacks, Opts); + +%% utf16-big null order detection +detect_encoding(<<0, X, 0, Y, _Rest/binary>> = JSON, Stack, Callbacks, Opts) when X =/= 0, Y =/= 0 -> + jsx_utf16:start(JSON, Stack, Callbacks, Opts); + +%% utf16-little null order detection +detect_encoding(<> = JSON, Stack, Callbacks, Opts) when X =/= 0, Y =/= 0 -> + jsx_utf16le:start(JSON, Stack, Callbacks, Opts); - -maybe_done(<>, Stack, Callbacks, Opts) when ?is_whitespace(S) -> - maybe_done(Rest, Stack, Callbacks, Opts); -maybe_done(<>, [object|Stack], Callbacks, Opts) -> - maybe_done(Rest, Stack, fold(end_object, Callbacks), Opts); -maybe_done(<>, [array|Stack], Callbacks, Opts) -> - maybe_done(Rest, Stack, fold(end_array, Callbacks), Opts); -maybe_done(<>, [object|Stack], Callbacks, Opts) -> - key(Rest, [key|Stack], Callbacks, Opts); -maybe_done(<>, [array|_] = Stack, Callbacks, Opts) -> - value(Rest, Stack, Callbacks, Opts); -maybe_done(<>, Stack, Callbacks, ?comments_enabled(Opts)) -> - maybe_comment(Rest, fun(Resume) -> maybe_done(Resume, Stack, Callbacks, Opts) end); -maybe_done(<<>>, [], Callbacks, Opts) -> - {fold(end_of_stream, Callbacks), fun(Stream) -> maybe_done(Stream, [], Callbacks, Opts) end}; -maybe_done(<<>>, Stack, Callbacks, Opts) -> - {incomplete, fun(Stream) -> maybe_done(Stream, Stack, Callbacks, Opts) end}. 
- - -object(<>, Stack, Callbacks, Opts) when ?is_whitespace(S) -> - object(Rest, Stack, Callbacks, Opts); -object(<>, Stack, Callbacks, Opts) -> - string(Rest, Stack, Callbacks, Opts, []); -object(<>, [key|Stack], Callbacks, Opts) -> - maybe_done(Rest, Stack, fold(end_object, Callbacks), Opts); -object(<>, Stack, Callbacks, ?comments_enabled(Opts)) -> - maybe_comment(Rest, fun(Resume) -> object(Resume, Stack, Callbacks, Opts) end); -object(<<>>, Stack, Callbacks, Opts) -> - {incomplete, fun(Stream) -> object(Stream, Stack, Callbacks, Opts) end}. - - -array(<>, Stack, Callbacks, Opts) when ?is_whitespace(S) -> - array(Rest, Stack, Callbacks, Opts); -array(<>, Stack, Callbacks, Opts) -> - string(Rest, Stack, Callbacks, Opts, []); -array(<<$t, Rest/binary>>, Stack, Callbacks, Opts) -> - tr(Rest, Stack, Callbacks, Opts); -array(<<$f, Rest/binary>>, Stack, Callbacks, Opts) -> - fa(Rest, Stack, Callbacks, Opts); -array(<<$n, Rest/binary>>, Stack, Callbacks, Opts) -> - nu(Rest, Stack, Callbacks, Opts); -array(<>, Stack, Callbacks, Opts) -> - negative(Rest, Stack, Callbacks, Opts, "-"); -array(<>, Stack, Callbacks, Opts) -> - zero(Rest, Stack, Callbacks, Opts, "0"); -array(<>, Stack, Callbacks, Opts) when ?is_nonzero(S) -> - integer(Rest, Stack, Callbacks, Opts, [S]); -array(<>, Stack, Callbacks, Opts) -> - object(Rest, [key|Stack], fold(start_object, Callbacks), Opts); -array(<>, Stack, Callbacks, Opts) -> - array(Rest, [array|Stack], fold(start_array, Callbacks), Opts); -array(<>, [array|Stack], Callbacks, Opts) -> - maybe_done(Rest, Stack, fold(end_array, Callbacks), Opts); -array(<>, Stack, Callbacks, ?comments_enabled(Opts)) -> - maybe_comment(Rest, fun(Resume) -> array(Resume, Stack, Callbacks, Opts) end); -array(<<>>, Stack, Callbacks, Opts) -> - {incomplete, fun(Stream) -> array(Stream, Stack, Callbacks, Opts) end}. 
- - -value(<>, Stack, Callbacks, Opts) when ?is_whitespace(S) -> - value(Rest, Stack, Callbacks, Opts); -value(<>, Stack, Callbacks, Opts) -> - string(Rest, Stack, Callbacks, Opts, []); -value(<<$t, Rest/binary>>, Stack, Callbacks, Opts) -> - tr(Rest, Stack, Callbacks, Opts); -value(<<$f, Rest/binary>>, Stack, Callbacks, Opts) -> - fa(Rest, Stack, Callbacks, Opts); -value(<<$n, Rest/binary>>, Stack, Callbacks, Opts) -> - nu(Rest, Stack, Callbacks, Opts); -value(<>, Stack, Callbacks, Opts) -> - negative(Rest, Stack, Callbacks, Opts, "-"); -value(<>, Stack, Callbacks, Opts) -> - zero(Rest, Stack, Callbacks, Opts, "0"); -value(<>, Stack, Callbacks, Opts) when ?is_nonzero(S) -> - integer(Rest, Stack, Callbacks, Opts, [S]); -value(<>, Stack, Callbacks, Opts) -> - object(Rest, [key|Stack], fold(start_object, Callbacks), Opts); -value(<>, Stack, Callbacks, Opts) -> - array(Rest, [array|Stack], fold(start_array, Callbacks), Opts); -value(<>, Stack, Callbacks, ?comments_enabled(Opts)) -> - maybe_comment(Rest, fun(Resume) -> value(Resume, Stack, Callbacks, Opts) end); -value(<<>>, Stack, Callbacks, Opts) -> - {incomplete, fun(Stream) -> value(Stream, Stack, Callbacks, Opts) end}. - - -colon(<>, Stack, Callbacks, Opts) when ?is_whitespace(S) -> - colon(Rest, Stack, Callbacks, Opts); -colon(<>, [key|Stack], Callbacks, Opts) -> - value(Rest, [object|Stack], Callbacks, Opts); -colon(<>, Stack, Callbacks, ?comments_enabled(Opts)) -> - maybe_comment(Rest, fun(Resume) -> colon(Resume, Stack, Callbacks, Opts) end); -colon(<<>>, Stack, Callbacks, Opts) -> - {incomplete, fun(Stream) -> colon(Stream, Stack, Callbacks, Opts) end}. 
- - -key(<>, Stack, Callbacks, Opts) when ?is_whitespace(S) -> - key(Rest, Stack, Callbacks, Opts); -key(<>, Stack, Callbacks, Opts) -> - string(Rest, Stack, Callbacks, Opts, []); -key(<>, Stack, Callbacks, ?comments_enabled(Opts)) -> - maybe_comment(Rest, fun(Resume) -> key(Resume, Stack, Callbacks, Opts) end); -key(<<>>, Stack, Callbacks, Opts) -> - {incomplete, fun(Stream) -> key(Stream, Stack, Callbacks, Opts) end}. - - -%% string has an additional parameter, an accumulator (Acc) used to hold the intermediate -%% representation of the string being parsed. using a list of integers representing -%% unicode codepoints is faster than constructing binaries, many of which will be -%% converted back to lists by the user anyways. - -%% the clause starting with Bin is necessary for cases where a stream is broken at a -%% point where it contains only a partial utf-8 sequence. - -string(<>, [key|_] = Stack, Callbacks, Opts, Acc) -> - colon(Rest, Stack, fold({key, lists:reverse(Acc)}, Callbacks), Opts); -string(<>, Stack, Callbacks, Opts, Acc) -> - maybe_done(Rest, Stack, fold({string, lists:reverse(Acc)}, Callbacks), Opts); -string(<>, Stack, Callbacks, Opts, Acc) -> - escape(Rest, Stack, Callbacks, Opts, Acc); -string(<>, Stack, Callbacks, Opts, Acc) when ?is_noncontrol(S) -> - string(Rest, Stack, Callbacks, Opts, [S] ++ Acc); -string(Bin, Stack, Callbacks, Opts, Acc) -> - {incomplete, fun(Stream) -> string(<>, Stack, Callbacks, Opts, Acc) end}. - - -%% only thing to note here is the additional accumulator passed to escaped_unicode used -%% to hold the codepoint sequence. unescessary, but nicer than using the string -%% accumulator. 
- -escape(<<$b, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> - string(Rest, Stack, Callbacks, Opts, "\b" ++ Acc); -escape(<<$f, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> - string(Rest, Stack, Callbacks, Opts, "\f" ++ Acc); -escape(<<$n, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> - string(Rest, Stack, Callbacks, Opts, "\n" ++ Acc); -escape(<<$r, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> - string(Rest, Stack, Callbacks, Opts, "\r" ++ Acc); -escape(<<$t, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> - string(Rest, Stack, Callbacks, Opts, "\t" ++ Acc); -escape(<<$u, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> - escaped_unicode(Rest, Stack, Callbacks, Opts, Acc, []); -escape(<>, Stack, Callbacks, Opts, Acc) - when S =:= ?quote; S =:= ?solidus; S =:= ?rsolidus -> - string(Rest, Stack, Callbacks, Opts, [S] ++ Acc); -escape(<<>>, Stack, Callbacks, Opts, Acc) -> - {incomplete, fun(Stream) -> escape(Stream, Stack, Callbacks, Opts, Acc) end}. - - -%% this code is ugly and unfortunate, but so is json's handling of escaped unicode -%% codepoint sequences. if the ascii option is present, the sequence is converted -%% to a codepoint and inserted into the string if it represents an ascii value. if -%% the codepoint option is present the sequence is converted and inserted as long -%% as it represents a valid unicode codepoint. this means non-characters -%% representable in 16 bits are not converted (the utf16 surrogates and the two -%% special non-characters). any other option and no conversion is done. 
- -escaped_unicode(<>, - Stack, - Callbacks, - ?escaped_unicode_to_ascii(Opts), - String, - [C, B, A]) - when ?is_hex(D) -> - case erlang:list_to_integer([A, B, C, D], 16) of - X when X < 128 -> - string(Rest, Stack, Callbacks, Opts, [X] ++ String) - ; _ -> - string(Rest, Stack, Callbacks, Opts, [D, C, B, A, $u, ?rsolidus] ++ String) +%% utf32-big null order detection +detect_encoding(<<0, 0, 0, X, _Rest/binary>> = JSON, Stack, Callbacks, Opts) when X =/= 0 -> + jsx_utf32:start(JSON, Stack, Callbacks, Opts); + +%% trying to parse a json string of a single character encoded in utf8 will fail +%% unless special cased +detect_encoding(<> = JSON, Stack, Callbacks, Opts) when X =/= 0 -> + try jsx_utf8:start(JSON, Stack, Callbacks, Opts) + catch error:function_clause -> + {incomplete, + fun(Stream) -> + detect_encoding(<>, Stack, Callbacks, Opts) + end + } end; -escaped_unicode(<>, - Stack, - Callbacks, - ?escaped_unicode_to_codepoint(Opts), - String, - [C, B, A]) - when ?is_hex(D) -> - case erlang:list_to_integer([A, B, C, D], 16) of - X when X < 16#d800; X > 16#dfff, X < 16#fffe -> - string(Rest, Stack, Callbacks, Opts, [X] ++ String) - ; _ -> - string(Rest, Stack, Callbacks, Opts, [D, C, B, A, $u, ?rsolidus] ++ String) - end; -escaped_unicode(<>, Stack, Callbacks, Opts, String, [C, B, A]) when ?is_hex(D) -> - string(Rest, Stack, Callbacks, Opts, [D, C, B, A, $u, ?rsolidus] ++ String); -escaped_unicode(<>, Stack, Callbacks, Opts, String, Acc) when ?is_hex(S) -> - escaped_unicode(Rest, Stack, Callbacks, Opts, String, [S] ++ Acc); -escaped_unicode(<<>>, Stack, Callbacks, Opts, String, Acc) -> - {incomplete, fun(Stream) -> escaped_unicode(Stream, Stack, Callbacks, Opts, String, Acc) end}. - - -%% like strings, numbers are collected in an intermediate accumulator before -%% being emitted to the callback handler. 
no processing of numbers is done in -%% process, it's left for the user, though there are convenience functions to -%% convert them into erlang floats/integers in jsx_utils.erl. - -%% TODO: actually write that jsx_utils.erl module mentioned above... - -negative(<<$0, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> - zero(Rest, Stack, Callbacks, Opts, "0" ++ Acc); -negative(<>, Stack, Callbacks, Opts, Acc) when ?is_nonzero(S) -> - integer(Rest, Stack, Callbacks, Opts, [S] ++ Acc); -negative(<<>>, Stack, Callbacks, Opts, Acc) -> - {incomplete, fun(Stream) -> negative(Stream, Stack, Callbacks, Opts, Acc) end}. - - -zero(<>, [object|Stack], Callbacks, Opts, Acc) -> - maybe_done(Rest, Stack, fold(end_object, fold({number, lists:reverse(Acc)}, Callbacks)), Opts); -zero(<>, [array|Stack], Callbacks, Opts, Acc) -> - maybe_done(Rest, Stack, fold(end_array, fold({number, lists:reverse(Acc)}, Callbacks)), Opts); -zero(<>, [object|Stack], Callbacks, Opts, Acc) -> - key(Rest, [key|Stack], fold({number, lists:reverse(Acc)}, Callbacks), Opts); -zero(<>, [array|_] = Stack, Callbacks, Opts, Acc) -> - value(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts); -zero(<>, Stack, Callbacks, Opts, Acc) -> - fraction(Rest, Stack, Callbacks, Opts, [?decimalpoint] ++ Acc); -zero(<>, Stack, Callbacks, Opts, Acc) when ?is_whitespace(S) -> - maybe_done(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts); -zero(<>, Stack, Callbacks, ?comments_enabled(Opts), Acc) -> - maybe_comment(Rest, fun(Resume) -> zero(Resume, Stack, Callbacks, Opts, Acc) end); -zero(<<>>, [], Callbacks, Opts, Acc) -> - {fold(end_of_stream, fold({number, lists:reverse(Acc)}, Callbacks)), - fun(Stream) -> zero(Stream, [], Callbacks, Opts, Acc) end}; -zero(<<>>, Stack, Callbacks, Opts, Acc) -> - {incomplete, fun(Stream) -> zero(Stream, Stack, Callbacks, Opts, Acc) end}. 
- - -integer(<>, Stack, Callbacks, Opts, Acc) when ?is_nonzero(S) -> - integer(Rest, Stack, Callbacks, Opts, [S] ++ Acc); -integer(<>, [object|Stack], Callbacks, Opts, Acc) -> - maybe_done(Rest, Stack, fold(end_object, fold({number, lists:reverse(Acc)}, Callbacks)), Opts); -integer(<>, [array|Stack], Callbacks, Opts, Acc) -> - maybe_done(Rest, Stack, fold(end_array, fold({number, lists:reverse(Acc)}, Callbacks)), Opts); -integer(<>, [object|Stack], Callbacks, Opts, Acc) -> - key(Rest, [key|Stack], fold({number, lists:reverse(Acc)}, Callbacks), Opts); -integer(<>, [array|_] = Stack, Callbacks, Opts, Acc) -> - value(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts); -integer(<>, Stack, Callbacks, Opts, Acc) -> - fraction(Rest, Stack, Callbacks, Opts, [?decimalpoint] ++ Acc); -integer(<>, Stack, Callbacks, Opts, Acc) -> - integer(Rest, Stack, Callbacks, Opts, [?zero] ++ Acc); -integer(<<$e, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> - e(Rest, Stack, Callbacks, Opts, "e" ++ Acc); -integer(<<$E, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> - e(Rest, Stack, Callbacks, Opts, "e" ++ Acc); -integer(<>, Stack, Callbacks, Opts, Acc) when ?is_whitespace(S) -> - maybe_done(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts); -integer(<>, Stack, Callbacks, ?comments_enabled(Opts), Acc) -> - maybe_comment(Rest, fun(Resume) -> integer(Resume, Stack, Callbacks, Opts, Acc) end); -integer(<<>>, [], Callbacks, Opts, Acc) -> - {fold(end_of_stream, fold({number, lists:reverse(Acc)}, Callbacks)), - fun(Stream) -> integer(Stream, [], Callbacks, Opts, Acc) end}; -integer(<<>>, Stack, Callbacks, Opts, Acc) -> - {incomplete, fun(Stream) -> integer(Stream, Stack, Callbacks, Opts, Acc) end}. 
- -fraction(<>, Stack, Callbacks, Opts, Acc) when ?is_nonzero(S) -> - fraction(Rest, Stack, Callbacks, Opts, [S] ++ Acc); -fraction(<>, [object|Stack], Callbacks, Opts, Acc) -> - maybe_done(Rest, Stack, fold(end_object, fold({number, lists:reverse(Acc)}, Callbacks)), Opts); -fraction(<>, [array|Stack], Callbacks, Opts, Acc) -> - maybe_done(Rest, Stack, fold(end_array, fold({number, lists:reverse(Acc)}, Callbacks)), Opts); -fraction(<>, [object|Stack], Callbacks, Opts, Acc) -> - key(Rest, [key|Stack], fold({number, lists:reverse(Acc)}, Callbacks), Opts); -fraction(<>, [array|_] = Stack, Callbacks, Opts, Acc) -> - value(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts); -fraction(<>, Stack, Callbacks, Opts, Acc) -> - fraction(Rest, Stack, Callbacks, Opts, [?zero] ++ Acc); -fraction(<<$e, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> - e(Rest, Stack, Callbacks, Opts, "e" ++ Acc); -fraction(<<$E, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> - e(Rest, Stack, Callbacks, Opts, "e" ++ Acc); -fraction(<>, Stack, Callbacks, Opts, Acc) when ?is_whitespace(S) -> - maybe_done(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts); -fraction(<>, Stack, Callbacks, ?comments_enabled(Opts), Acc) -> - maybe_comment(Rest, fun(Resume) -> fraction(Resume, Stack, Callbacks, Opts, Acc) end); -fraction(<<>>, [], Callbacks, Opts, Acc) -> - {fold(end_of_stream, fold({number, lists:reverse(Acc)}, Callbacks)), - fun(Stream) -> fraction(Stream, [], Callbacks, Opts, Acc) end}; -fraction(<<>>, Stack, Callbacks, Opts, Acc) -> - {incomplete, fun(Stream) -> fraction(Stream, Stack, Callbacks, Opts, Acc) end}. - - -e(<>, Stack, Callbacks, Opts, Acc) when S =:= ?zero; ?is_nonzero(S) -> - exp(Rest, Stack, Callbacks, Opts, [S] ++ Acc); -e(<>, Stack, Callbacks, Opts, Acc) when S =:= ?positive; S =:= ?negative -> - ex(Rest, Stack, Callbacks, Opts, [S] ++ Acc); -e(<<>>, Stack, Callbacks, Opts, Acc) -> - {incomplete, fun(Stream) -> e(Stream, Stack, Callbacks, Opts, Acc) end}. 
- - -ex(<>, Stack, Callbacks, Opts, Acc) when S =:= ?zero; ?is_nonzero(S) -> - exp(Rest, Stack, Callbacks, Opts, [S] ++ Acc); -ex(<<>>, Stack, Callbacks, Opts, Acc) -> - {incomplete, fun(Stream) -> ex(Stream, Stack, Callbacks, Opts, Acc) end}. - - -exp(<>, Stack, Callbacks, Opts, Acc) when ?is_nonzero(S) -> - exp(Rest, Stack, Callbacks, Opts, [S] ++ Acc); -exp(<>, [object|Stack], Callbacks, Opts, Acc) -> - maybe_done(Rest, Stack, fold(end_object, fold({number, lists:reverse(Acc)}, Callbacks)), Opts); -exp(<>, [array|Stack], Callbacks, Opts, Acc) -> - maybe_done(Rest, Stack, fold(end_array, fold({number, lists:reverse(Acc)}, Callbacks)), Opts); -exp(<>, [object|Stack], Callbacks, Opts, Acc) -> - key(Rest, [key|Stack], fold({number, lists:reverse(Acc)}, Callbacks), Opts); -exp(<>, [array|_] = Stack, Callbacks, Opts, Acc) -> - value(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts); -exp(<>, Stack, Callbacks, Opts, Acc) -> - exp(Rest, Stack, Callbacks, Opts, [?zero] ++ Acc); -exp(<>, Stack, Callbacks, ?comments_enabled(Opts), Acc) -> - maybe_comment(Rest, fun(Resume) -> exp(Resume, Stack, Callbacks, Opts, Acc) end); -exp(<>, Stack, Callbacks, Opts, Acc) when ?is_whitespace(S) -> - maybe_done(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts); -exp(<<>>, [], Callbacks, Opts, Acc) -> - {fold(end_of_stream, fold({number, lists:reverse(Acc)}, Callbacks)), - fun(Stream) -> exp(Stream, [], Callbacks, Opts, Acc) end}; -exp(<<>>, Stack, Callbacks, Opts, Acc) -> - {incomplete, fun(Stream) -> exp(Stream, Stack, Callbacks, Opts, Acc) end}. - - -tr(<<$r, Rest/binary>>, Stack, Callbacks, Opts) -> - tru(Rest, Stack, Callbacks, Opts); -tr(<<>>, Stack, Callbacks, Opts) -> - {incomplete, fun(Stream) -> tr(Stream, Stack, Callbacks, Opts) end}. - - -tru(<<$u, Rest/binary>>, Stack, Callbacks, Opts) -> - true(Rest, Stack, Callbacks, Opts); -tru(<<>>, Stack, Callbacks, Opts) -> - {incomplete, fun(Stream) -> tru(Stream, Stack, Callbacks, Opts) end}. 
- - -true(<<$e, Rest/binary>>, Stack, Callbacks, Opts) -> - maybe_done(Rest, Stack, fold({literal, true}, Callbacks), Opts); -true(<<>>, Stack, Callbacks, Opts) -> - {incomplete, fun(Stream) -> true(Stream, Stack, Callbacks, Opts) end}. - - -fa(<<$a, Rest/binary>>, Stack, Callbacks, Opts) -> - fal(Rest, Stack, Callbacks, Opts); -fa(<<>>, Stack, Callbacks, Opts) -> - {incomplete, fun(Stream) -> fa(Stream, Stack, Callbacks, Opts) end}. - - -fal(<<$l, Rest/binary>>, Stack, Callbacks, Opts) -> - fals(Rest, Stack, Callbacks, Opts); -fal(<<>>, Stack, Callbacks, Opts) -> - {incomplete, fun(Stream) -> fal(Stream, Stack, Callbacks, Opts) end}. - - -fals(<<$s, Rest/binary>>, Stack, Callbacks, Opts) -> - false(Rest, Stack, Callbacks, Opts); -fals(<<>>, Stack, Callbacks, Opts) -> - {incomplete, fun(Stream) -> fals(Stream, Stack, Callbacks, Opts) end}. - - -false(<<$e, Rest/binary>>, Stack, Callbacks, Opts) -> - maybe_done(Rest, Stack, fold({literal, false}, Callbacks), Opts); -false(<<>>, Stack, Callbacks, Opts) -> - {incomplete, fun(Stream) -> false(Stream, Stack, Callbacks, Opts) end}. - - -nu(<<$u, Rest/binary>>, Stack, Callbacks, Opts) -> - nul(Rest, Stack, Callbacks, Opts); -nu(<<>>, Stack, Callbacks, Opts) -> - {incomplete, fun(Stream) -> nu(Stream, Stack, Callbacks, Opts) end}. - - -nul(<<$l, Rest/binary>>, Stack, Callbacks, Opts) -> - null(Rest, Stack, Callbacks, Opts); -nul(<<>>, Stack, Callbacks, Opts) -> - {incomplete, fun(Stream) -> nul(Stream, Stack, Callbacks, Opts) end}. - - -null(<<$l, Rest/binary>>, Stack, Callbacks, Opts) -> - maybe_done(Rest, Stack, fold({literal, null}, Callbacks), Opts); -null(<<>>, Stack, Callbacks, Opts) -> - {incomplete, fun(Stream) -> null(Stream, Stack, Callbacks, Opts) end}. - - -%% comments are c style, /* blah blah */ and are STRONGLY discouraged. any unicode -%% character is valid in a comment, except, obviously the */ sequence which ends -%% the comment. 
they're implemented as a closure called when the comment ends that -%% returns execution to the point where the comment began. comments are not -%% recorded in any way, simply parsed. - -maybe_comment(<>, Resume) -> - comment(Rest, Resume); -maybe_comment(<<>>, Resume) -> - {incomplete, fun(Stream) -> maybe_comment(Stream, Resume) end}. - - -comment(<>, Resume) -> - maybe_comment_done(Rest, Resume); -comment(<<_/utf8, Rest/binary>>, Resume) -> - comment(Rest, Resume); -comment(<<>>, Resume) -> - {incomplete, fun(Stream) -> comment(Stream, Resume) end}. - - -maybe_comment_done(<>, Resume) -> - Resume(Rest); -maybe_comment_done(<<>>, Resume) -> - {incomplete, fun(Stream) -> maybe_comment_done(Stream, Resume) end}. - - -%% callbacks to our handler are roughly equivalent to a fold over the events, incremental -%% rather than all at once. - -fold(end_of_stream, {F, State}) -> - F(end_of_stream, State); -fold(Event, {F, State}) when is_function(F) -> - {F, F(Event, State)}. - - + +%% not enough input, request more +detect_encoding(Bin, Stack, Callbacks, Opts) -> + {incomplete, + fun(Stream) -> + detect_encoding(<>, Stack, Callbacks, Opts) + end + }. 
\ No newline at end of file diff --git a/src/jsx_common.hrl b/src/jsx_common.hrl new file mode 100644 index 0000000..510407b --- /dev/null +++ b/src/jsx_common.hrl @@ -0,0 +1,83 @@ +%% The MIT License + +%% Copyright (c) 2010 Alisdair Sullivan + +%% Permission is hereby granted, free of charge, to any person obtaining a copy +%% of this software and associated documentation files (the "Software"), to deal +%% in the Software without restriction, including without limitation the rights +%% to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +%% copies of the Software, and to permit persons to whom the Software is +%% furnished to do so, subject to the following conditions: + +%% The above copyright notice and this permission notice shall be included in +%% all copies or substantial portions of the Software. + +%% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +%% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +%% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +%% AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +%% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +%% OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +%% THE SOFTWARE. + +%% option flags + +-define(comments_enabled(X), {true, _} = X). +-define(escaped_unicode_to_ascii(X), {_, ascii} = X). +-define(escaped_unicode_to_codepoint(X), {_, codepoint} = X). + +%% whitespace +-define(space, 16#20). +-define(tab, 16#09). +-define(cr, 16#0D). +-define(newline, 16#0A). + +%% object delimiters +-define(start_object, 16#7B). +-define(end_object, 16#7D). + +%% array delimiters +-define(start_array, 16#5B). +-define(end_array, 16#5D). + +%% kv seperator +-define(comma, 16#2C). +-define(quote, 16#22). +-define(colon, 16#3A). + +%% string escape sequences +-define(escape, 16#5C). +-define(rsolidus, 16#5C). +-define(solidus, 16#2F). 
+-define(formfeed, 16#0C). +-define(backspace, 16#08). +-define(unicode, 16#75). + +%% math +-define(zero, 16#30). +-define(decimalpoint, 16#2E). +-define(negative, 16#2D). +-define(positive, 16#2B). + +%% comments +-define(star, 16#2a). + +-define(is_hex(Symbol), + (Symbol >= $a andalso Symbol =< $z); (Symbol >= $A andalso Symbol =< $Z); + (Symbol >= $0 andalso Symbol =< $9) +). + +-define(is_nonzero(Symbol), + Symbol >= $1 andalso Symbol =< $9 +). + +-define(is_noncontrol(Symbol), + Symbol >= ?space +). + +-define(is_whitespace(Symbol), + Symbol =:= ?space; Symbol =:= ?tab; Symbol =:= ?cr; Symbol =:= ?newline +). + + + diff --git a/src/jsx_utf16.erl b/src/jsx_utf16.erl new file mode 100644 index 0000000..75bd0db --- /dev/null +++ b/src/jsx_utf16.erl @@ -0,0 +1,476 @@ +%% The MIT License + +%% Copyright (c) 2010 Alisdair Sullivan + +%% Permission is hereby granted, free of charge, to any person obtaining a copy +%% of this software and associated documentation files (the "Software"), to deal +%% in the Software without restriction, including without limitation the rights +%% to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +%% copies of the Software, and to permit persons to whom the Software is +%% furnished to do so, subject to the following conditions: + +%% The above copyright notice and this permission notice shall be included in +%% all copies or substantial portions of the Software. + +%% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +%% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +%% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +%% AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +%% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +%% OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +%% THE SOFTWARE. + +-module(jsx_utf16). +-author("alisdairsullivan@yahoo.ca"). 
+ +-include("jsx_common.hrl"). + +-export([start/4]). + +-define(encoding, utf16). + + +%% callbacks to our handler are roughly equivalent to a fold over the events, incremental +%% rather than all at once. + +fold(end_of_stream, {F, State}) -> + F(end_of_stream, State); +fold(Event, {F, State}) when is_function(F) -> + {F, F(Event, State)}. + + +%% this code is mostly autogenerated and mostly ugly. apologies. for more insight on +%% Callbacks or Opts, see the comments accompanying decoder/2 (in jsx.erl). Stack +%% is a stack of flags used to track depth and to keep track of whether we are +%% returning from a value or a key inside objects. all pops, peeks and pushes are +%% inlined. the code that handles naked values and comments is not optimized by the +%% compiler for efficient matching, but you shouldn't be using naked values or comments +%% anyways, they are horrible and contrary to the spec. + +start(<>, Stack, Callbacks, Opts) when ?is_whitespace(S) -> + start(Rest, Stack, Callbacks, Opts); +start(<>, Stack, Callbacks, Opts) -> + object(Rest, [key|Stack], fold(start_object, Callbacks), Opts); +start(<>, Stack, Callbacks, Opts) -> + array(Rest, [array|Stack], fold(start_array, Callbacks), Opts); +start(<>, Stack, Callbacks, Opts) -> + string(Rest, Stack, Callbacks, Opts, []); +start(<<$t/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + tr(Rest, Stack, Callbacks, Opts); +start(<<$f/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + fa(Rest, Stack, Callbacks, Opts); +start(<<$n/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + nu(Rest, Stack, Callbacks, Opts); +start(<>, Stack, Callbacks, Opts) -> + negative(Rest, Stack, Callbacks, Opts, "-"); +start(<>, Stack, Callbacks, Opts) -> + zero(Rest, Stack, Callbacks, Opts, "0"); +start(<>, Stack, Callbacks, Opts) when ?is_nonzero(S) -> + integer(Rest, Stack, Callbacks, Opts, [S]); +start(<>, Stack, Callbacks, ?comments_enabled(Opts)) -> + maybe_comment(Rest, fun(Resume) -> start(Resume, Stack, 
Callbacks, Opts) end); +start(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> start(<>, Stack, Callbacks, Opts) end}. + + +maybe_done(<>, Stack, Callbacks, Opts) when ?is_whitespace(S) -> + maybe_done(Rest, Stack, Callbacks, Opts); +maybe_done(<>, [object|Stack], Callbacks, Opts) -> + maybe_done(Rest, Stack, fold(end_object, Callbacks), Opts); +maybe_done(<>, [array|Stack], Callbacks, Opts) -> + maybe_done(Rest, Stack, fold(end_array, Callbacks), Opts); +maybe_done(<>, [object|Stack], Callbacks, Opts) -> + key(Rest, [key|Stack], Callbacks, Opts); +maybe_done(<>, [array|_] = Stack, Callbacks, Opts) -> + value(Rest, Stack, Callbacks, Opts); +maybe_done(<>, Stack, Callbacks, ?comments_enabled(Opts)) -> + maybe_comment(Rest, fun(Resume) -> maybe_done(Resume, Stack, Callbacks, Opts) end); +maybe_done(<<>>, [], Callbacks, Opts) -> + {fold(end_of_stream, Callbacks), fun(Stream) -> maybe_done(Stream, [], Callbacks, Opts) end}; +maybe_done(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> maybe_done(<>, Stack, Callbacks, Opts) end}. + + +object(<>, Stack, Callbacks, Opts) when ?is_whitespace(S) -> + object(Rest, Stack, Callbacks, Opts); +object(<>, Stack, Callbacks, Opts) -> + string(Rest, Stack, Callbacks, Opts, []); +object(<>, [key|Stack], Callbacks, Opts) -> + maybe_done(Rest, Stack, fold(end_object, Callbacks), Opts); +object(<>, Stack, Callbacks, ?comments_enabled(Opts)) -> + maybe_comment(Rest, fun(Resume) -> object(Resume, Stack, Callbacks, Opts) end); +object(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> object(<>, Stack, Callbacks, Opts) end}. 
+ + +array(<>, Stack, Callbacks, Opts) when ?is_whitespace(S) -> + array(Rest, Stack, Callbacks, Opts); +array(<>, Stack, Callbacks, Opts) -> + string(Rest, Stack, Callbacks, Opts, []); +array(<<$t/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + tr(Rest, Stack, Callbacks, Opts); +array(<<$f/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + fa(Rest, Stack, Callbacks, Opts); +array(<<$n/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + nu(Rest, Stack, Callbacks, Opts); +array(<>, Stack, Callbacks, Opts) -> + negative(Rest, Stack, Callbacks, Opts, "-"); +array(<>, Stack, Callbacks, Opts) -> + zero(Rest, Stack, Callbacks, Opts, "0"); +array(<>, Stack, Callbacks, Opts) when ?is_nonzero(S) -> + integer(Rest, Stack, Callbacks, Opts, [S]); +array(<>, Stack, Callbacks, Opts) -> + object(Rest, [key|Stack], fold(start_object, Callbacks), Opts); +array(<>, Stack, Callbacks, Opts) -> + array(Rest, [array|Stack], fold(start_array, Callbacks), Opts); +array(<>, [array|Stack], Callbacks, Opts) -> + maybe_done(Rest, Stack, fold(end_array, Callbacks), Opts); +array(<>, Stack, Callbacks, ?comments_enabled(Opts)) -> + maybe_comment(Rest, fun(Resume) -> array(Resume, Stack, Callbacks, Opts) end); +array(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> array(<>, Stack, Callbacks, Opts) end}. 
+ + +value(<>, Stack, Callbacks, Opts) when ?is_whitespace(S) -> + value(Rest, Stack, Callbacks, Opts); +value(<>, Stack, Callbacks, Opts) -> + string(Rest, Stack, Callbacks, Opts, []); +value(<<$t/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + tr(Rest, Stack, Callbacks, Opts); +value(<<$f/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + fa(Rest, Stack, Callbacks, Opts); +value(<<$n/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + nu(Rest, Stack, Callbacks, Opts); +value(<>, Stack, Callbacks, Opts) -> + negative(Rest, Stack, Callbacks, Opts, "-"); +value(<>, Stack, Callbacks, Opts) -> + zero(Rest, Stack, Callbacks, Opts, "0"); +value(<>, Stack, Callbacks, Opts) when ?is_nonzero(S) -> + integer(Rest, Stack, Callbacks, Opts, [S]); +value(<>, Stack, Callbacks, Opts) -> + object(Rest, [key|Stack], fold(start_object, Callbacks), Opts); +value(<>, Stack, Callbacks, Opts) -> + array(Rest, [array|Stack], fold(start_array, Callbacks), Opts); +value(<>, Stack, Callbacks, ?comments_enabled(Opts)) -> + maybe_comment(Rest, fun(Resume) -> value(Resume, Stack, Callbacks, Opts) end); +value(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> value(<>, Stack, Callbacks, Opts) end}. + + +colon(<>, Stack, Callbacks, Opts) when ?is_whitespace(S) -> + colon(Rest, Stack, Callbacks, Opts); +colon(<>, [key|Stack], Callbacks, Opts) -> + value(Rest, [object|Stack], Callbacks, Opts); +colon(<>, Stack, Callbacks, ?comments_enabled(Opts)) -> + maybe_comment(Rest, fun(Resume) -> colon(Resume, Stack, Callbacks, Opts) end); +colon(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> colon(<>, Stack, Callbacks, Opts) end}. 
+ + +key(<>, Stack, Callbacks, Opts) when ?is_whitespace(S) -> + key(Rest, Stack, Callbacks, Opts); +key(<>, Stack, Callbacks, Opts) -> + string(Rest, Stack, Callbacks, Opts, []); +key(<>, Stack, Callbacks, ?comments_enabled(Opts)) -> + maybe_comment(Rest, fun(Resume) -> key(Resume, Stack, Callbacks, Opts) end); +key(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> key(<>, Stack, Callbacks, Opts) end}. + + +%% string has an additional parameter, an accumulator (Acc) used to hold the intermediate +%% representation of the string being parsed. using a list of integers representing +%% unicode codepoints is faster than constructing binaries, many of which will be +%% converted back to lists by the user anyways. + +%% the clause starting with Bin is necessary for cases where a stream is broken at a +%% point where it contains only a partial utf-16 sequence. + +string(<>, [key|_] = Stack, Callbacks, Opts, Acc) -> + colon(Rest, Stack, fold({key, lists:reverse(Acc)}, Callbacks), Opts); +string(<>, Stack, Callbacks, Opts, Acc) -> + maybe_done(Rest, Stack, fold({string, lists:reverse(Acc)}, Callbacks), Opts); +string(<>, Stack, Callbacks, Opts, Acc) -> + escape(Rest, Stack, Callbacks, Opts, Acc); +string(<>, Stack, Callbacks, Opts, Acc) when ?is_noncontrol(S) -> + string(Rest, Stack, Callbacks, Opts, [S] ++ Acc); +string(Bin, Stack, Callbacks, Opts, Acc) -> + {incomplete, fun(Stream) -> string(<>, Stack, Callbacks, Opts, Acc) end}. + + +%% only thing to note here is the additional accumulator passed to escaped_unicode used +%% to hold the codepoint sequence. unnecessary, but nicer than using the string +%% accumulator.
+ +escape(<<$b/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> + string(Rest, Stack, Callbacks, Opts, "\b" ++ Acc); +escape(<<$f/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> + string(Rest, Stack, Callbacks, Opts, "\f" ++ Acc); +escape(<<$n/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> + string(Rest, Stack, Callbacks, Opts, "\n" ++ Acc); +escape(<<$r/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> + string(Rest, Stack, Callbacks, Opts, "\r" ++ Acc); +escape(<<$t/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> + string(Rest, Stack, Callbacks, Opts, "\t" ++ Acc); +escape(<<$u/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> + escaped_unicode(Rest, Stack, Callbacks, Opts, Acc, []); +escape(<>, Stack, Callbacks, Opts, Acc) + when S =:= ?quote; S =:= ?solidus; S =:= ?rsolidus -> + string(Rest, Stack, Callbacks, Opts, [S] ++ Acc); +escape(Bin, Stack, Callbacks, Opts, Acc) -> + {incomplete, fun(Stream) -> escape(<>, Stack, Callbacks, Opts, Acc) end}. + + +%% this code is ugly and unfortunate, but so is json's handling of escaped unicode +%% codepoint sequences. if the ascii option is present, the sequence is converted +%% to a codepoint and inserted into the string if it represents an ascii value. if +%% the codepoint option is present the sequence is converted and inserted as long +%% as it represents a valid unicode codepoint. this means non-characters +%% representable in 16 bits are not converted (the utf16 surrogates and the two +%% special non-characters). any other option and no conversion is done. 
+ +escaped_unicode(<>, + Stack, + Callbacks, + ?escaped_unicode_to_ascii(Opts), + String, + [C, B, A]) + when ?is_hex(D) -> + case erlang:list_to_integer([A, B, C, D], 16) of + X when X < 128 -> + string(Rest, Stack, Callbacks, Opts, [X] ++ String) + ; _ -> + string(Rest, Stack, Callbacks, Opts, [D, C, B, A, $u, ?rsolidus] ++ String) + end; +escaped_unicode(<>, + Stack, + Callbacks, + ?escaped_unicode_to_codepoint(Opts), + String, + [C, B, A]) + when ?is_hex(D) -> + case erlang:list_to_integer([A, B, C, D], 16) of + X when X < 16#d800; X > 16#dfff, X < 16#fffe -> + string(Rest, Stack, Callbacks, Opts, [X] ++ String) + ; _ -> + string(Rest, Stack, Callbacks, Opts, [D, C, B, A, $u, ?rsolidus] ++ String) + end; +escaped_unicode(<>, Stack, Callbacks, Opts, String, [C, B, A]) when ?is_hex(D) -> + string(Rest, Stack, Callbacks, Opts, [D, C, B, A, $u, ?rsolidus] ++ String); +escaped_unicode(<>, Stack, Callbacks, Opts, String, Acc) when ?is_hex(S) -> + escaped_unicode(Rest, Stack, Callbacks, Opts, String, [S] ++ Acc); +escaped_unicode(Bin, Stack, Callbacks, Opts, String, Acc) -> + {incomplete, fun(Stream) -> escaped_unicode(<>, Stack, Callbacks, Opts, String, Acc) end}. + + +%% like strings, numbers are collected in an intermediate accumulator before +%% being emitted to the callback handler. no processing of numbers is done in +%% process, it's left for the user, though there are convenience functions to +%% convert them into erlang floats/integers in jsx_utils.erl. + +%% TODO: actually write that jsx_utils.erl module mentioned above... + +negative(<<$0/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> + zero(Rest, Stack, Callbacks, Opts, "0" ++ Acc); +negative(<>, Stack, Callbacks, Opts, Acc) when ?is_nonzero(S) -> + integer(Rest, Stack, Callbacks, Opts, [S] ++ Acc); +negative(Bin, Stack, Callbacks, Opts, Acc) -> + {incomplete, fun(Stream) -> negative(<>, Stack, Callbacks, Opts, Acc) end}. 
+ + +zero(<>, [object|Stack], Callbacks, Opts, Acc) -> + maybe_done(Rest, Stack, fold(end_object, fold({number, lists:reverse(Acc)}, Callbacks)), Opts); +zero(<>, [array|Stack], Callbacks, Opts, Acc) -> + maybe_done(Rest, Stack, fold(end_array, fold({number, lists:reverse(Acc)}, Callbacks)), Opts); +zero(<>, [object|Stack], Callbacks, Opts, Acc) -> + key(Rest, [key|Stack], fold({number, lists:reverse(Acc)}, Callbacks), Opts); +zero(<>, [array|_] = Stack, Callbacks, Opts, Acc) -> + value(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts); +zero(<>, Stack, Callbacks, Opts, Acc) -> + fraction(Rest, Stack, Callbacks, Opts, [?decimalpoint] ++ Acc); +zero(<>, Stack, Callbacks, Opts, Acc) when ?is_whitespace(S) -> + maybe_done(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts); +zero(<>, Stack, Callbacks, ?comments_enabled(Opts), Acc) -> + maybe_comment(Rest, fun(Resume) -> zero(Resume, Stack, Callbacks, Opts, Acc) end); +zero(<<>>, [], Callbacks, Opts, Acc) -> + {fold(end_of_stream, fold({number, lists:reverse(Acc)}, Callbacks)), + fun(Stream) -> zero(Stream, [], Callbacks, Opts, Acc) end}; +zero(Bin, Stack, Callbacks, Opts, Acc) -> + {incomplete, fun(Stream) -> zero(<>, Stack, Callbacks, Opts, Acc) end}. 
+ + +integer(<>, Stack, Callbacks, Opts, Acc) when ?is_nonzero(S) -> + integer(Rest, Stack, Callbacks, Opts, [S] ++ Acc); +integer(<>, [object|Stack], Callbacks, Opts, Acc) -> + maybe_done(Rest, Stack, fold(end_object, fold({number, lists:reverse(Acc)}, Callbacks)), Opts); +integer(<>, [array|Stack], Callbacks, Opts, Acc) -> + maybe_done(Rest, Stack, fold(end_array, fold({number, lists:reverse(Acc)}, Callbacks)), Opts); +integer(<>, [object|Stack], Callbacks, Opts, Acc) -> + key(Rest, [key|Stack], fold({number, lists:reverse(Acc)}, Callbacks), Opts); +integer(<>, [array|_] = Stack, Callbacks, Opts, Acc) -> + value(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts); +integer(<>, Stack, Callbacks, Opts, Acc) -> + fraction(Rest, Stack, Callbacks, Opts, [?decimalpoint] ++ Acc); +integer(<>, Stack, Callbacks, Opts, Acc) -> + integer(Rest, Stack, Callbacks, Opts, [?zero] ++ Acc); +integer(<<$e/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> + e(Rest, Stack, Callbacks, Opts, "e" ++ Acc); +integer(<<$E/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> + e(Rest, Stack, Callbacks, Opts, "e" ++ Acc); +integer(<>, Stack, Callbacks, Opts, Acc) when ?is_whitespace(S) -> + maybe_done(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts); +integer(<>, Stack, Callbacks, ?comments_enabled(Opts), Acc) -> + maybe_comment(Rest, fun(Resume) -> integer(Resume, Stack, Callbacks, Opts, Acc) end); +integer(<<>>, [], Callbacks, Opts, Acc) -> + {fold(end_of_stream, fold({number, lists:reverse(Acc)}, Callbacks)), + fun(Stream) -> integer(Stream, [], Callbacks, Opts, Acc) end}; +integer(Bin, Stack, Callbacks, Opts, Acc) -> + {incomplete, fun(Stream) -> integer(<>, Stack, Callbacks, Opts, Acc) end}. 
+ +fraction(<>, Stack, Callbacks, Opts, Acc) when ?is_nonzero(S) -> + fraction(Rest, Stack, Callbacks, Opts, [S] ++ Acc); +fraction(<>, [object|Stack], Callbacks, Opts, Acc) -> + maybe_done(Rest, Stack, fold(end_object, fold({number, lists:reverse(Acc)}, Callbacks)), Opts); +fraction(<>, [array|Stack], Callbacks, Opts, Acc) -> + maybe_done(Rest, Stack, fold(end_array, fold({number, lists:reverse(Acc)}, Callbacks)), Opts); +fraction(<>, [object|Stack], Callbacks, Opts, Acc) -> + key(Rest, [key|Stack], fold({number, lists:reverse(Acc)}, Callbacks), Opts); +fraction(<>, [array|_] = Stack, Callbacks, Opts, Acc) -> + value(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts); +fraction(<>, Stack, Callbacks, Opts, Acc) -> + fraction(Rest, Stack, Callbacks, Opts, [?zero] ++ Acc); +fraction(<<$e/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> + e(Rest, Stack, Callbacks, Opts, "e" ++ Acc); +fraction(<<$E/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> + e(Rest, Stack, Callbacks, Opts, "e" ++ Acc); +fraction(<>, Stack, Callbacks, Opts, Acc) when ?is_whitespace(S) -> + maybe_done(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts); +fraction(<>, Stack, Callbacks, ?comments_enabled(Opts), Acc) -> + maybe_comment(Rest, fun(Resume) -> fraction(Resume, Stack, Callbacks, Opts, Acc) end); +fraction(<<>>, [], Callbacks, Opts, Acc) -> + {fold(end_of_stream, fold({number, lists:reverse(Acc)}, Callbacks)), + fun(Stream) -> fraction(Stream, [], Callbacks, Opts, Acc) end}; +fraction(Bin, Stack, Callbacks, Opts, Acc) -> + {incomplete, fun(Stream) -> fraction(<>, Stack, Callbacks, Opts, Acc) end}. 
+ + +e(<>, Stack, Callbacks, Opts, Acc) when S =:= ?zero; ?is_nonzero(S) -> + exp(Rest, Stack, Callbacks, Opts, [S] ++ Acc); +e(<>, Stack, Callbacks, Opts, Acc) when S =:= ?positive; S =:= ?negative -> + ex(Rest, Stack, Callbacks, Opts, [S] ++ Acc); +e(Bin, Stack, Callbacks, Opts, Acc) -> + {incomplete, fun(Stream) -> e(<>, Stack, Callbacks, Opts, Acc) end}. + + +ex(<>, Stack, Callbacks, Opts, Acc) when S =:= ?zero; ?is_nonzero(S) -> + exp(Rest, Stack, Callbacks, Opts, [S] ++ Acc); +ex(Bin, Stack, Callbacks, Opts, Acc) -> + {incomplete, fun(Stream) -> ex(<>, Stack, Callbacks, Opts, Acc) end}. + + +exp(<>, Stack, Callbacks, Opts, Acc) when ?is_nonzero(S) -> + exp(Rest, Stack, Callbacks, Opts, [S] ++ Acc); +exp(<>, [object|Stack], Callbacks, Opts, Acc) -> + maybe_done(Rest, Stack, fold(end_object, fold({number, lists:reverse(Acc)}, Callbacks)), Opts); +exp(<>, [array|Stack], Callbacks, Opts, Acc) -> + maybe_done(Rest, Stack, fold(end_array, fold({number, lists:reverse(Acc)}, Callbacks)), Opts); +exp(<>, [object|Stack], Callbacks, Opts, Acc) -> + key(Rest, [key|Stack], fold({number, lists:reverse(Acc)}, Callbacks), Opts); +exp(<>, [array|_] = Stack, Callbacks, Opts, Acc) -> + value(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts); +exp(<>, Stack, Callbacks, Opts, Acc) -> + exp(Rest, Stack, Callbacks, Opts, [?zero] ++ Acc); +exp(<>, Stack, Callbacks, ?comments_enabled(Opts), Acc) -> + maybe_comment(Rest, fun(Resume) -> exp(Resume, Stack, Callbacks, Opts, Acc) end); +exp(<>, Stack, Callbacks, Opts, Acc) when ?is_whitespace(S) -> + maybe_done(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts); +exp(<<>>, [], Callbacks, Opts, Acc) -> + {fold(end_of_stream, fold({number, lists:reverse(Acc)}, Callbacks)), + fun(Stream) -> exp(Stream, [], Callbacks, Opts, Acc) end}; +exp(Bin, Stack, Callbacks, Opts, Acc) -> + {incomplete, fun(Stream) -> exp(<>, Stack, Callbacks, Opts, Acc) end}. 
+ + +tr(<<$r/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + tru(Rest, Stack, Callbacks, Opts); +tr(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> tr(<>, Stack, Callbacks, Opts) end}. + + +tru(<<$u/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + true(Rest, Stack, Callbacks, Opts); +tru(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> tru(<>, Stack, Callbacks, Opts) end}. + + +true(<<$e/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + maybe_done(Rest, Stack, fold({literal, true}, Callbacks), Opts); +true(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> true(<>, Stack, Callbacks, Opts) end}. + + +fa(<<$a/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + fal(Rest, Stack, Callbacks, Opts); +fa(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> fa(<>, Stack, Callbacks, Opts) end}. + + +fal(<<$l/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + fals(Rest, Stack, Callbacks, Opts); +fal(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> fal(<>, Stack, Callbacks, Opts) end}. + + +fals(<<$s/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + false(Rest, Stack, Callbacks, Opts); +fals(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> fals(<>, Stack, Callbacks, Opts) end}. + + +false(<<$e/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + maybe_done(Rest, Stack, fold({literal, false}, Callbacks), Opts); +false(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> false(<>, Stack, Callbacks, Opts) end}. + + +nu(<<$u/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + nul(Rest, Stack, Callbacks, Opts); +nu(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> nu(<>, Stack, Callbacks, Opts) end}. + + +nul(<<$l/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + null(Rest, Stack, Callbacks, Opts); +nul(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> nul(<>, Stack, Callbacks, Opts) end}. 
+ + +null(<<$l/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + maybe_done(Rest, Stack, fold({literal, null}, Callbacks), Opts); +null(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> null(<>, Stack, Callbacks, Opts) end}. + + +%% comments are c style, /* blah blah */ and are STRONGLY discouraged. any unicode +%% character is valid in a comment, except, obviously the */ sequence which ends +%% the comment. they're implemented as a closure called when the comment ends that +%% returns execution to the point where the comment began. comments are not +%% recorded in any way, simply parsed. + +maybe_comment(<>, Resume) -> + comment(Rest, Resume); +maybe_comment(Bin, Resume) -> + {incomplete, fun(Stream) -> maybe_comment(<>, Resume) end}. + + +comment(<>, Resume) -> + maybe_comment_done(Rest, Resume); +comment(<<_/?encoding, Rest/binary>>, Resume) -> + comment(Rest, Resume); +comment(Bin, Resume) -> + {incomplete, fun(Stream) -> comment(<>, Resume) end}. + + +maybe_comment_done(<>, Resume) -> + Resume(Rest); +maybe_comment_done(Bin, Resume) -> + {incomplete, fun(Stream) -> maybe_comment_done(<>, Resume) end}. \ No newline at end of file diff --git a/src/jsx_utf16le.erl b/src/jsx_utf16le.erl new file mode 100644 index 0000000..1be5556 --- /dev/null +++ b/src/jsx_utf16le.erl @@ -0,0 +1,476 @@ +%% The MIT License + +%% Copyright (c) 2010 Alisdair Sullivan + +%% Permission is hereby granted, free of charge, to any person obtaining a copy +%% of this software and associated documentation files (the "Software"), to deal +%% in the Software without restriction, including without limitation the rights +%% to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +%% copies of the Software, and to permit persons to whom the Software is +%% furnished to do so, subject to the following conditions: + +%% The above copyright notice and this permission notice shall be included in +%% all copies or substantial portions of the Software. 
+ +%% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +%% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +%% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +%% AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +%% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +%% OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +%% THE SOFTWARE. + +-module(jsx_utf16le). +-author("alisdairsullivan@yahoo.ca"). + +-include("jsx_common.hrl"). + +-export([start/4]). + +-define(encoding, utf16-little). + + +%% callbacks to our handler are roughly equivalent to a fold over the events, incremental +%% rather than all at once. + +fold(end_of_stream, {F, State}) -> + F(end_of_stream, State); +fold(Event, {F, State}) when is_function(F) -> + {F, F(Event, State)}. + + +%% this code is mostly autogenerated and mostly ugly. apologies. for more insight on +%% Callbacks or Opts, see the comments accompanying decoder/2 (in jsx.erl). Stack +%% is a stack of flags used to track depth and to keep track of whether we are +%% returning from a value or a key inside objects. all pops, peeks and pushes are +%% inlined. the code that handles naked values and comments is not optimized by the +%% compiler for efficient matching, but you shouldn't be using naked values or comments +%% anyways, they are horrible and contrary to the spec. 
+ +start(<>, Stack, Callbacks, Opts) when ?is_whitespace(S) -> + start(Rest, Stack, Callbacks, Opts); +start(<>, Stack, Callbacks, Opts) -> + object(Rest, [key|Stack], fold(start_object, Callbacks), Opts); +start(<>, Stack, Callbacks, Opts) -> + array(Rest, [array|Stack], fold(start_array, Callbacks), Opts); +start(<>, Stack, Callbacks, Opts) -> + string(Rest, Stack, Callbacks, Opts, []); +start(<<$t/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + tr(Rest, Stack, Callbacks, Opts); +start(<<$f/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + fa(Rest, Stack, Callbacks, Opts); +start(<<$n/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + nu(Rest, Stack, Callbacks, Opts); +start(<>, Stack, Callbacks, Opts) -> + negative(Rest, Stack, Callbacks, Opts, "-"); +start(<>, Stack, Callbacks, Opts) -> + zero(Rest, Stack, Callbacks, Opts, "0"); +start(<>, Stack, Callbacks, Opts) when ?is_nonzero(S) -> + integer(Rest, Stack, Callbacks, Opts, [S]); +start(<>, Stack, Callbacks, ?comments_enabled(Opts)) -> + maybe_comment(Rest, fun(Resume) -> start(Resume, Stack, Callbacks, Opts) end); +start(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> start(<>, Stack, Callbacks, Opts) end}. 
+ + +maybe_done(<>, Stack, Callbacks, Opts) when ?is_whitespace(S) -> + maybe_done(Rest, Stack, Callbacks, Opts); +maybe_done(<>, [object|Stack], Callbacks, Opts) -> + maybe_done(Rest, Stack, fold(end_object, Callbacks), Opts); +maybe_done(<>, [array|Stack], Callbacks, Opts) -> + maybe_done(Rest, Stack, fold(end_array, Callbacks), Opts); +maybe_done(<>, [object|Stack], Callbacks, Opts) -> + key(Rest, [key|Stack], Callbacks, Opts); +maybe_done(<>, [array|_] = Stack, Callbacks, Opts) -> + value(Rest, Stack, Callbacks, Opts); +maybe_done(<>, Stack, Callbacks, ?comments_enabled(Opts)) -> + maybe_comment(Rest, fun(Resume) -> maybe_done(Resume, Stack, Callbacks, Opts) end); +maybe_done(<<>>, [], Callbacks, Opts) -> + {fold(end_of_stream, Callbacks), fun(Stream) -> maybe_done(Stream, [], Callbacks, Opts) end}; +maybe_done(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> maybe_done(<>, Stack, Callbacks, Opts) end}. + + +object(<>, Stack, Callbacks, Opts) when ?is_whitespace(S) -> + object(Rest, Stack, Callbacks, Opts); +object(<>, Stack, Callbacks, Opts) -> + string(Rest, Stack, Callbacks, Opts, []); +object(<>, [key|Stack], Callbacks, Opts) -> + maybe_done(Rest, Stack, fold(end_object, Callbacks), Opts); +object(<>, Stack, Callbacks, ?comments_enabled(Opts)) -> + maybe_comment(Rest, fun(Resume) -> object(Resume, Stack, Callbacks, Opts) end); +object(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> object(<>, Stack, Callbacks, Opts) end}. 
+ + +array(<>, Stack, Callbacks, Opts) when ?is_whitespace(S) -> + array(Rest, Stack, Callbacks, Opts); +array(<>, Stack, Callbacks, Opts) -> + string(Rest, Stack, Callbacks, Opts, []); +array(<<$t/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + tr(Rest, Stack, Callbacks, Opts); +array(<<$f/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + fa(Rest, Stack, Callbacks, Opts); +array(<<$n/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + nu(Rest, Stack, Callbacks, Opts); +array(<>, Stack, Callbacks, Opts) -> + negative(Rest, Stack, Callbacks, Opts, "-"); +array(<>, Stack, Callbacks, Opts) -> + zero(Rest, Stack, Callbacks, Opts, "0"); +array(<>, Stack, Callbacks, Opts) when ?is_nonzero(S) -> + integer(Rest, Stack, Callbacks, Opts, [S]); +array(<>, Stack, Callbacks, Opts) -> + object(Rest, [key|Stack], fold(start_object, Callbacks), Opts); +array(<>, Stack, Callbacks, Opts) -> + array(Rest, [array|Stack], fold(start_array, Callbacks), Opts); +array(<>, [array|Stack], Callbacks, Opts) -> + maybe_done(Rest, Stack, fold(end_array, Callbacks), Opts); +array(<>, Stack, Callbacks, ?comments_enabled(Opts)) -> + maybe_comment(Rest, fun(Resume) -> array(Resume, Stack, Callbacks, Opts) end); +array(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> array(<>, Stack, Callbacks, Opts) end}. 
+ + +value(<>, Stack, Callbacks, Opts) when ?is_whitespace(S) -> + value(Rest, Stack, Callbacks, Opts); +value(<>, Stack, Callbacks, Opts) -> + string(Rest, Stack, Callbacks, Opts, []); +value(<<$t/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + tr(Rest, Stack, Callbacks, Opts); +value(<<$f/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + fa(Rest, Stack, Callbacks, Opts); +value(<<$n/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + nu(Rest, Stack, Callbacks, Opts); +value(<>, Stack, Callbacks, Opts) -> + negative(Rest, Stack, Callbacks, Opts, "-"); +value(<>, Stack, Callbacks, Opts) -> + zero(Rest, Stack, Callbacks, Opts, "0"); +value(<>, Stack, Callbacks, Opts) when ?is_nonzero(S) -> + integer(Rest, Stack, Callbacks, Opts, [S]); +value(<>, Stack, Callbacks, Opts) -> + object(Rest, [key|Stack], fold(start_object, Callbacks), Opts); +value(<>, Stack, Callbacks, Opts) -> + array(Rest, [array|Stack], fold(start_array, Callbacks), Opts); +value(<>, Stack, Callbacks, ?comments_enabled(Opts)) -> + maybe_comment(Rest, fun(Resume) -> value(Resume, Stack, Callbacks, Opts) end); +value(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> value(<>, Stack, Callbacks, Opts) end}. + + +colon(<>, Stack, Callbacks, Opts) when ?is_whitespace(S) -> + colon(Rest, Stack, Callbacks, Opts); +colon(<>, [key|Stack], Callbacks, Opts) -> + value(Rest, [object|Stack], Callbacks, Opts); +colon(<>, Stack, Callbacks, ?comments_enabled(Opts)) -> + maybe_comment(Rest, fun(Resume) -> colon(Resume, Stack, Callbacks, Opts) end); +colon(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> colon(<>, Stack, Callbacks, Opts) end}. 
+ + +key(<>, Stack, Callbacks, Opts) when ?is_whitespace(S) -> + key(Rest, Stack, Callbacks, Opts); +key(<>, Stack, Callbacks, Opts) -> + string(Rest, Stack, Callbacks, Opts, []); +key(<>, Stack, Callbacks, ?comments_enabled(Opts)) -> + maybe_comment(Rest, fun(Resume) -> key(Resume, Stack, Callbacks, Opts) end); +key(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> key(<>, Stack, Callbacks, Opts) end}. + + +%% string has an additional parameter, an accumulator (Acc) used to hold the intermediate +%% representation of the string being parsed. using a list of integers representing +%% unicode codepoints is faster than constructing binaries, many of which will be +%% converted back to lists by the user anyways. + +%% the clause starting with Bin is necessary for cases where a stream is broken at a +%% point where it contains only a partial utf-16 sequence. + +string(<>, [key|_] = Stack, Callbacks, Opts, Acc) -> + colon(Rest, Stack, fold({key, lists:reverse(Acc)}, Callbacks), Opts); +string(<>, Stack, Callbacks, Opts, Acc) -> + maybe_done(Rest, Stack, fold({string, lists:reverse(Acc)}, Callbacks), Opts); +string(<>, Stack, Callbacks, Opts, Acc) -> + escape(Rest, Stack, Callbacks, Opts, Acc); +string(<>, Stack, Callbacks, Opts, Acc) when ?is_noncontrol(S) -> + string(Rest, Stack, Callbacks, Opts, [S] ++ Acc); +string(Bin, Stack, Callbacks, Opts, Acc) -> + {incomplete, fun(Stream) -> string(<>, Stack, Callbacks, Opts, Acc) end}. + + +%% only thing to note here is the additional accumulator passed to escaped_unicode used +%% to hold the codepoint sequence. unnecessary, but nicer than using the string +%% accumulator.
+ +escape(<<$b/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> + string(Rest, Stack, Callbacks, Opts, "\b" ++ Acc); +escape(<<$f/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> + string(Rest, Stack, Callbacks, Opts, "\f" ++ Acc); +escape(<<$n/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> + string(Rest, Stack, Callbacks, Opts, "\n" ++ Acc); +escape(<<$r/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> + string(Rest, Stack, Callbacks, Opts, "\r" ++ Acc); +escape(<<$t/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> + string(Rest, Stack, Callbacks, Opts, "\t" ++ Acc); +escape(<<$u/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> + escaped_unicode(Rest, Stack, Callbacks, Opts, Acc, []); +escape(<>, Stack, Callbacks, Opts, Acc) + when S =:= ?quote; S =:= ?solidus; S =:= ?rsolidus -> + string(Rest, Stack, Callbacks, Opts, [S] ++ Acc); +escape(Bin, Stack, Callbacks, Opts, Acc) -> + {incomplete, fun(Stream) -> escape(<>, Stack, Callbacks, Opts, Acc) end}. + + +%% this code is ugly and unfortunate, but so is json's handling of escaped unicode +%% codepoint sequences. if the ascii option is present, the sequence is converted +%% to a codepoint and inserted into the string if it represents an ascii value. if +%% the codepoint option is present the sequence is converted and inserted as long +%% as it represents a valid unicode codepoint. this means non-characters +%% representable in 16 bits are not converted (the utf16 surrogates and the two +%% special non-characters). any other option and no conversion is done. 
+ +escaped_unicode(<>, + Stack, + Callbacks, + ?escaped_unicode_to_ascii(Opts), + String, + [C, B, A]) + when ?is_hex(D) -> + case erlang:list_to_integer([A, B, C, D], 16) of + X when X < 128 -> + string(Rest, Stack, Callbacks, Opts, [X] ++ String) + ; _ -> + string(Rest, Stack, Callbacks, Opts, [D, C, B, A, $u, ?rsolidus] ++ String) + end; +escaped_unicode(<>, + Stack, + Callbacks, + ?escaped_unicode_to_codepoint(Opts), + String, + [C, B, A]) + when ?is_hex(D) -> + case erlang:list_to_integer([A, B, C, D], 16) of + X when X < 16#d800; X > 16#dfff, X < 16#fffe -> + string(Rest, Stack, Callbacks, Opts, [X] ++ String) + ; _ -> + string(Rest, Stack, Callbacks, Opts, [D, C, B, A, $u, ?rsolidus] ++ String) + end; +escaped_unicode(<>, Stack, Callbacks, Opts, String, [C, B, A]) when ?is_hex(D) -> + string(Rest, Stack, Callbacks, Opts, [D, C, B, A, $u, ?rsolidus] ++ String); +escaped_unicode(<>, Stack, Callbacks, Opts, String, Acc) when ?is_hex(S) -> + escaped_unicode(Rest, Stack, Callbacks, Opts, String, [S] ++ Acc); +escaped_unicode(Bin, Stack, Callbacks, Opts, String, Acc) -> + {incomplete, fun(Stream) -> escaped_unicode(<>, Stack, Callbacks, Opts, String, Acc) end}. + + +%% like strings, numbers are collected in an intermediate accumulator before +%% being emitted to the callback handler. no processing of numbers is done in +%% process, it's left for the user, though there are convenience functions to +%% convert them into erlang floats/integers in jsx_utils.erl. + +%% TODO: actually write that jsx_utils.erl module mentioned above... + +negative(<<$0/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> + zero(Rest, Stack, Callbacks, Opts, "0" ++ Acc); +negative(<>, Stack, Callbacks, Opts, Acc) when ?is_nonzero(S) -> + integer(Rest, Stack, Callbacks, Opts, [S] ++ Acc); +negative(Bin, Stack, Callbacks, Opts, Acc) -> + {incomplete, fun(Stream) -> negative(<>, Stack, Callbacks, Opts, Acc) end}. 
+ + +zero(<>, [object|Stack], Callbacks, Opts, Acc) -> + maybe_done(Rest, Stack, fold(end_object, fold({number, lists:reverse(Acc)}, Callbacks)), Opts); +zero(<>, [array|Stack], Callbacks, Opts, Acc) -> + maybe_done(Rest, Stack, fold(end_array, fold({number, lists:reverse(Acc)}, Callbacks)), Opts); +zero(<>, [object|Stack], Callbacks, Opts, Acc) -> + key(Rest, [key|Stack], fold({number, lists:reverse(Acc)}, Callbacks), Opts); +zero(<>, [array|_] = Stack, Callbacks, Opts, Acc) -> + value(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts); +zero(<>, Stack, Callbacks, Opts, Acc) -> + fraction(Rest, Stack, Callbacks, Opts, [?decimalpoint] ++ Acc); +zero(<>, Stack, Callbacks, Opts, Acc) when ?is_whitespace(S) -> + maybe_done(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts); +zero(<>, Stack, Callbacks, ?comments_enabled(Opts), Acc) -> + maybe_comment(Rest, fun(Resume) -> zero(Resume, Stack, Callbacks, Opts, Acc) end); +zero(<<>>, [], Callbacks, Opts, Acc) -> + {fold(end_of_stream, fold({number, lists:reverse(Acc)}, Callbacks)), + fun(Stream) -> zero(Stream, [], Callbacks, Opts, Acc) end}; +zero(Bin, Stack, Callbacks, Opts, Acc) -> + {incomplete, fun(Stream) -> zero(<>, Stack, Callbacks, Opts, Acc) end}. 
+ + +integer(<>, Stack, Callbacks, Opts, Acc) when ?is_nonzero(S) -> + integer(Rest, Stack, Callbacks, Opts, [S] ++ Acc); +integer(<>, [object|Stack], Callbacks, Opts, Acc) -> + maybe_done(Rest, Stack, fold(end_object, fold({number, lists:reverse(Acc)}, Callbacks)), Opts); +integer(<>, [array|Stack], Callbacks, Opts, Acc) -> + maybe_done(Rest, Stack, fold(end_array, fold({number, lists:reverse(Acc)}, Callbacks)), Opts); +integer(<>, [object|Stack], Callbacks, Opts, Acc) -> + key(Rest, [key|Stack], fold({number, lists:reverse(Acc)}, Callbacks), Opts); +integer(<>, [array|_] = Stack, Callbacks, Opts, Acc) -> + value(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts); +integer(<>, Stack, Callbacks, Opts, Acc) -> + fraction(Rest, Stack, Callbacks, Opts, [?decimalpoint] ++ Acc); +integer(<>, Stack, Callbacks, Opts, Acc) -> + integer(Rest, Stack, Callbacks, Opts, [?zero] ++ Acc); +integer(<<$e/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> + e(Rest, Stack, Callbacks, Opts, "e" ++ Acc); +integer(<<$E/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> + e(Rest, Stack, Callbacks, Opts, "e" ++ Acc); +integer(<>, Stack, Callbacks, Opts, Acc) when ?is_whitespace(S) -> + maybe_done(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts); +integer(<>, Stack, Callbacks, ?comments_enabled(Opts), Acc) -> + maybe_comment(Rest, fun(Resume) -> integer(Resume, Stack, Callbacks, Opts, Acc) end); +integer(<<>>, [], Callbacks, Opts, Acc) -> + {fold(end_of_stream, fold({number, lists:reverse(Acc)}, Callbacks)), + fun(Stream) -> integer(Stream, [], Callbacks, Opts, Acc) end}; +integer(Bin, Stack, Callbacks, Opts, Acc) -> + {incomplete, fun(Stream) -> integer(<>, Stack, Callbacks, Opts, Acc) end}. 
+ +fraction(<>, Stack, Callbacks, Opts, Acc) when ?is_nonzero(S) -> + fraction(Rest, Stack, Callbacks, Opts, [S] ++ Acc); +fraction(<>, [object|Stack], Callbacks, Opts, Acc) -> + maybe_done(Rest, Stack, fold(end_object, fold({number, lists:reverse(Acc)}, Callbacks)), Opts); +fraction(<>, [array|Stack], Callbacks, Opts, Acc) -> + maybe_done(Rest, Stack, fold(end_array, fold({number, lists:reverse(Acc)}, Callbacks)), Opts); +fraction(<>, [object|Stack], Callbacks, Opts, Acc) -> + key(Rest, [key|Stack], fold({number, lists:reverse(Acc)}, Callbacks), Opts); +fraction(<>, [array|_] = Stack, Callbacks, Opts, Acc) -> + value(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts); +fraction(<>, Stack, Callbacks, Opts, Acc) -> + fraction(Rest, Stack, Callbacks, Opts, [?zero] ++ Acc); +fraction(<<$e/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> + e(Rest, Stack, Callbacks, Opts, "e" ++ Acc); +fraction(<<$E/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> + e(Rest, Stack, Callbacks, Opts, "e" ++ Acc); +fraction(<>, Stack, Callbacks, Opts, Acc) when ?is_whitespace(S) -> + maybe_done(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts); +fraction(<>, Stack, Callbacks, ?comments_enabled(Opts), Acc) -> + maybe_comment(Rest, fun(Resume) -> fraction(Resume, Stack, Callbacks, Opts, Acc) end); +fraction(<<>>, [], Callbacks, Opts, Acc) -> + {fold(end_of_stream, fold({number, lists:reverse(Acc)}, Callbacks)), + fun(Stream) -> fraction(Stream, [], Callbacks, Opts, Acc) end}; +fraction(Bin, Stack, Callbacks, Opts, Acc) -> + {incomplete, fun(Stream) -> fraction(<>, Stack, Callbacks, Opts, Acc) end}. 
+ + +e(<>, Stack, Callbacks, Opts, Acc) when S =:= ?zero; ?is_nonzero(S) -> + exp(Rest, Stack, Callbacks, Opts, [S] ++ Acc); +e(<>, Stack, Callbacks, Opts, Acc) when S =:= ?positive; S =:= ?negative -> + ex(Rest, Stack, Callbacks, Opts, [S] ++ Acc); +e(Bin, Stack, Callbacks, Opts, Acc) -> + {incomplete, fun(Stream) -> e(<>, Stack, Callbacks, Opts, Acc) end}. + + +ex(<>, Stack, Callbacks, Opts, Acc) when S =:= ?zero; ?is_nonzero(S) -> + exp(Rest, Stack, Callbacks, Opts, [S] ++ Acc); +ex(Bin, Stack, Callbacks, Opts, Acc) -> + {incomplete, fun(Stream) -> ex(<>, Stack, Callbacks, Opts, Acc) end}. + + +exp(<>, Stack, Callbacks, Opts, Acc) when ?is_nonzero(S) -> + exp(Rest, Stack, Callbacks, Opts, [S] ++ Acc); +exp(<>, [object|Stack], Callbacks, Opts, Acc) -> + maybe_done(Rest, Stack, fold(end_object, fold({number, lists:reverse(Acc)}, Callbacks)), Opts); +exp(<>, [array|Stack], Callbacks, Opts, Acc) -> + maybe_done(Rest, Stack, fold(end_array, fold({number, lists:reverse(Acc)}, Callbacks)), Opts); +exp(<>, [object|Stack], Callbacks, Opts, Acc) -> + key(Rest, [key|Stack], fold({number, lists:reverse(Acc)}, Callbacks), Opts); +exp(<>, [array|_] = Stack, Callbacks, Opts, Acc) -> + value(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts); +exp(<>, Stack, Callbacks, Opts, Acc) -> + exp(Rest, Stack, Callbacks, Opts, [?zero] ++ Acc); +exp(<>, Stack, Callbacks, ?comments_enabled(Opts), Acc) -> + maybe_comment(Rest, fun(Resume) -> exp(Resume, Stack, Callbacks, Opts, Acc) end); +exp(<>, Stack, Callbacks, Opts, Acc) when ?is_whitespace(S) -> + maybe_done(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts); +exp(<<>>, [], Callbacks, Opts, Acc) -> + {fold(end_of_stream, fold({number, lists:reverse(Acc)}, Callbacks)), + fun(Stream) -> exp(Stream, [], Callbacks, Opts, Acc) end}; +exp(Bin, Stack, Callbacks, Opts, Acc) -> + {incomplete, fun(Stream) -> exp(<>, Stack, Callbacks, Opts, Acc) end}. 
+ + +tr(<<$r/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + tru(Rest, Stack, Callbacks, Opts); +tr(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> tr(<>, Stack, Callbacks, Opts) end}. + + +tru(<<$u/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + true(Rest, Stack, Callbacks, Opts); +tru(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> tru(<>, Stack, Callbacks, Opts) end}. + + +true(<<$e/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + maybe_done(Rest, Stack, fold({literal, true}, Callbacks), Opts); +true(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> true(<>, Stack, Callbacks, Opts) end}. + + +fa(<<$a/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + fal(Rest, Stack, Callbacks, Opts); +fa(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> fa(<>, Stack, Callbacks, Opts) end}. + + +fal(<<$l/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + fals(Rest, Stack, Callbacks, Opts); +fal(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> fal(<>, Stack, Callbacks, Opts) end}. + + +fals(<<$s/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + false(Rest, Stack, Callbacks, Opts); +fals(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> fals(<>, Stack, Callbacks, Opts) end}. + + +false(<<$e/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + maybe_done(Rest, Stack, fold({literal, false}, Callbacks), Opts); +false(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> false(<>, Stack, Callbacks, Opts) end}. + + +nu(<<$u/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + nul(Rest, Stack, Callbacks, Opts); +nu(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> nu(<>, Stack, Callbacks, Opts) end}. + + +nul(<<$l/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + null(Rest, Stack, Callbacks, Opts); +nul(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> nul(<>, Stack, Callbacks, Opts) end}. 
+ + +null(<<$l/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + maybe_done(Rest, Stack, fold({literal, null}, Callbacks), Opts); +null(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> null(<>, Stack, Callbacks, Opts) end}. + + +%% comments are c style, /* blah blah */ and are STRONGLY discouraged. any unicode +%% character is valid in a comment, except, obviously the */ sequence which ends +%% the comment. they're implemented as a closure called when the comment ends that +%% returns execution to the point where the comment began. comments are not +%% recorded in any way, simply parsed. + +maybe_comment(<>, Resume) -> + comment(Rest, Resume); +maybe_comment(Bin, Resume) -> + {incomplete, fun(Stream) -> maybe_comment(<>, Resume) end}. + + +comment(<>, Resume) -> + maybe_comment_done(Rest, Resume); +comment(<<_/?encoding, Rest/binary>>, Resume) -> + comment(Rest, Resume); +comment(Bin, Resume) -> + {incomplete, fun(Stream) -> comment(<>, Resume) end}. + + +maybe_comment_done(<>, Resume) -> + Resume(Rest); +maybe_comment_done(Bin, Resume) -> + {incomplete, fun(Stream) -> maybe_comment_done(<>, Resume) end}. \ No newline at end of file diff --git a/src/jsx_utf32.erl b/src/jsx_utf32.erl new file mode 100644 index 0000000..0a5a505 --- /dev/null +++ b/src/jsx_utf32.erl @@ -0,0 +1,476 @@ +%% The MIT License + +%% Copyright (c) 2010 Alisdair Sullivan + +%% Permission is hereby granted, free of charge, to any person obtaining a copy +%% of this software and associated documentation files (the "Software"), to deal +%% in the Software without restriction, including without limitation the rights +%% to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +%% copies of the Software, and to permit persons to whom the Software is +%% furnished to do so, subject to the following conditions: + +%% The above copyright notice and this permission notice shall be included in +%% all copies or substantial portions of the Software. 
+ +%% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +%% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +%% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +%% AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +%% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +%% OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +%% THE SOFTWARE. + +-module(jsx_utf32). +-author("alisdairsullivan@yahoo.ca"). + +-include("jsx_common.hrl"). + +-export([start/4]). + +-define(encoding, utf32). + + +%% callbacks to our handler are roughly equivalent to a fold over the events, incremental +%% rather than all at once. + +fold(end_of_stream, {F, State}) -> + F(end_of_stream, State); +fold(Event, {F, State}) when is_function(F) -> + {F, F(Event, State)}. + + +%% this code is mostly autogenerated and mostly ugly. apologies. for more insight on +%% Callbacks or Opts, see the comments accompanying decoder/2 (in jsx.erl). Stack +%% is a stack of flags used to track depth and to keep track of whether we are +%% returning from a value or a key inside objects. all pops, peeks and pushes are +%% inlined. the code that handles naked values and comments is not optimized by the +%% compiler for efficient matching, but you shouldn't be using naked values or comments +%% anyways, they are horrible and contrary to the spec. 
+ +start(<>, Stack, Callbacks, Opts) when ?is_whitespace(S) -> + start(Rest, Stack, Callbacks, Opts); +start(<>, Stack, Callbacks, Opts) -> + object(Rest, [key|Stack], fold(start_object, Callbacks), Opts); +start(<>, Stack, Callbacks, Opts) -> + array(Rest, [array|Stack], fold(start_array, Callbacks), Opts); +start(<>, Stack, Callbacks, Opts) -> + string(Rest, Stack, Callbacks, Opts, []); +start(<<$t/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + tr(Rest, Stack, Callbacks, Opts); +start(<<$f/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + fa(Rest, Stack, Callbacks, Opts); +start(<<$n/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + nu(Rest, Stack, Callbacks, Opts); +start(<>, Stack, Callbacks, Opts) -> + negative(Rest, Stack, Callbacks, Opts, "-"); +start(<>, Stack, Callbacks, Opts) -> + zero(Rest, Stack, Callbacks, Opts, "0"); +start(<>, Stack, Callbacks, Opts) when ?is_nonzero(S) -> + integer(Rest, Stack, Callbacks, Opts, [S]); +start(<>, Stack, Callbacks, ?comments_enabled(Opts)) -> + maybe_comment(Rest, fun(Resume) -> start(Resume, Stack, Callbacks, Opts) end); +start(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> start(<>, Stack, Callbacks, Opts) end}. 
+ + +maybe_done(<>, Stack, Callbacks, Opts) when ?is_whitespace(S) -> + maybe_done(Rest, Stack, Callbacks, Opts); +maybe_done(<>, [object|Stack], Callbacks, Opts) -> + maybe_done(Rest, Stack, fold(end_object, Callbacks), Opts); +maybe_done(<>, [array|Stack], Callbacks, Opts) -> + maybe_done(Rest, Stack, fold(end_array, Callbacks), Opts); +maybe_done(<>, [object|Stack], Callbacks, Opts) -> + key(Rest, [key|Stack], Callbacks, Opts); +maybe_done(<>, [array|_] = Stack, Callbacks, Opts) -> + value(Rest, Stack, Callbacks, Opts); +maybe_done(<>, Stack, Callbacks, ?comments_enabled(Opts)) -> + maybe_comment(Rest, fun(Resume) -> maybe_done(Resume, Stack, Callbacks, Opts) end); +maybe_done(<<>>, [], Callbacks, Opts) -> + {fold(end_of_stream, Callbacks), fun(Stream) -> maybe_done(Stream, [], Callbacks, Opts) end}; +maybe_done(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> maybe_done(<>, Stack, Callbacks, Opts) end}. + + +object(<>, Stack, Callbacks, Opts) when ?is_whitespace(S) -> + object(Rest, Stack, Callbacks, Opts); +object(<>, Stack, Callbacks, Opts) -> + string(Rest, Stack, Callbacks, Opts, []); +object(<>, [key|Stack], Callbacks, Opts) -> + maybe_done(Rest, Stack, fold(end_object, Callbacks), Opts); +object(<>, Stack, Callbacks, ?comments_enabled(Opts)) -> + maybe_comment(Rest, fun(Resume) -> object(Resume, Stack, Callbacks, Opts) end); +object(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> object(<>, Stack, Callbacks, Opts) end}. 
+ + +array(<>, Stack, Callbacks, Opts) when ?is_whitespace(S) -> + array(Rest, Stack, Callbacks, Opts); +array(<>, Stack, Callbacks, Opts) -> + string(Rest, Stack, Callbacks, Opts, []); +array(<<$t/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + tr(Rest, Stack, Callbacks, Opts); +array(<<$f/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + fa(Rest, Stack, Callbacks, Opts); +array(<<$n/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + nu(Rest, Stack, Callbacks, Opts); +array(<>, Stack, Callbacks, Opts) -> + negative(Rest, Stack, Callbacks, Opts, "-"); +array(<>, Stack, Callbacks, Opts) -> + zero(Rest, Stack, Callbacks, Opts, "0"); +array(<>, Stack, Callbacks, Opts) when ?is_nonzero(S) -> + integer(Rest, Stack, Callbacks, Opts, [S]); +array(<>, Stack, Callbacks, Opts) -> + object(Rest, [key|Stack], fold(start_object, Callbacks), Opts); +array(<>, Stack, Callbacks, Opts) -> + array(Rest, [array|Stack], fold(start_array, Callbacks), Opts); +array(<>, [array|Stack], Callbacks, Opts) -> + maybe_done(Rest, Stack, fold(end_array, Callbacks), Opts); +array(<>, Stack, Callbacks, ?comments_enabled(Opts)) -> + maybe_comment(Rest, fun(Resume) -> array(Resume, Stack, Callbacks, Opts) end); +array(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> array(<>, Stack, Callbacks, Opts) end}. 
+ + +value(<>, Stack, Callbacks, Opts) when ?is_whitespace(S) -> + value(Rest, Stack, Callbacks, Opts); +value(<>, Stack, Callbacks, Opts) -> + string(Rest, Stack, Callbacks, Opts, []); +value(<<$t/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + tr(Rest, Stack, Callbacks, Opts); +value(<<$f/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + fa(Rest, Stack, Callbacks, Opts); +value(<<$n/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + nu(Rest, Stack, Callbacks, Opts); +value(<>, Stack, Callbacks, Opts) -> + negative(Rest, Stack, Callbacks, Opts, "-"); +value(<>, Stack, Callbacks, Opts) -> + zero(Rest, Stack, Callbacks, Opts, "0"); +value(<>, Stack, Callbacks, Opts) when ?is_nonzero(S) -> + integer(Rest, Stack, Callbacks, Opts, [S]); +value(<>, Stack, Callbacks, Opts) -> + object(Rest, [key|Stack], fold(start_object, Callbacks), Opts); +value(<>, Stack, Callbacks, Opts) -> + array(Rest, [array|Stack], fold(start_array, Callbacks), Opts); +value(<>, Stack, Callbacks, ?comments_enabled(Opts)) -> + maybe_comment(Rest, fun(Resume) -> value(Resume, Stack, Callbacks, Opts) end); +value(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> value(<>, Stack, Callbacks, Opts) end}. + + +colon(<>, Stack, Callbacks, Opts) when ?is_whitespace(S) -> + colon(Rest, Stack, Callbacks, Opts); +colon(<>, [key|Stack], Callbacks, Opts) -> + value(Rest, [object|Stack], Callbacks, Opts); +colon(<>, Stack, Callbacks, ?comments_enabled(Opts)) -> + maybe_comment(Rest, fun(Resume) -> colon(Resume, Stack, Callbacks, Opts) end); +colon(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> colon(<>, Stack, Callbacks, Opts) end}. 
+ + +key(<>, Stack, Callbacks, Opts) when ?is_whitespace(S) -> + key(Rest, Stack, Callbacks, Opts); +key(<>, Stack, Callbacks, Opts) -> + string(Rest, Stack, Callbacks, Opts, []); +key(<>, Stack, Callbacks, ?comments_enabled(Opts)) -> + maybe_comment(Rest, fun(Resume) -> key(Resume, Stack, Callbacks, Opts) end); +key(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> key(<>, Stack, Callbacks, Opts) end}. + + +%% string has an additional parameter, an accumulator (Acc) used to hold the intermediate +%% representation of the string being parsed. using a list of integers representing +%% unicode codepoints is faster than constructing binaries, many of which will be +%% converted back to lists by the user anyways. + +%% the clause starting with Bin is necessary for cases where a stream is broken at a +%% point where it contains only a partial utf-8 sequence. + +string(<>, [key|_] = Stack, Callbacks, Opts, Acc) -> + colon(Rest, Stack, fold({key, lists:reverse(Acc)}, Callbacks), Opts); +string(<>, Stack, Callbacks, Opts, Acc) -> + maybe_done(Rest, Stack, fold({string, lists:reverse(Acc)}, Callbacks), Opts); +string(<>, Stack, Callbacks, Opts, Acc) -> + escape(Rest, Stack, Callbacks, Opts, Acc); +string(<>, Stack, Callbacks, Opts, Acc) when ?is_noncontrol(S) -> + string(Rest, Stack, Callbacks, Opts, [S] ++ Acc); +string(Bin, Stack, Callbacks, Opts, Acc) -> + {incomplete, fun(Stream) -> string(<>, Stack, Callbacks, Opts, Acc) end}. + + +%% only thing to note here is the additional accumulator passed to escaped_unicode used +%% to hold the codepoint sequence. unescessary, but nicer than using the string +%% accumulator. 
+ +escape(<<$b/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> + string(Rest, Stack, Callbacks, Opts, "\b" ++ Acc); +escape(<<$f/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> + string(Rest, Stack, Callbacks, Opts, "\f" ++ Acc); +escape(<<$n/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> + string(Rest, Stack, Callbacks, Opts, "\n" ++ Acc); +escape(<<$r/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> + string(Rest, Stack, Callbacks, Opts, "\r" ++ Acc); +escape(<<$t/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> + string(Rest, Stack, Callbacks, Opts, "\t" ++ Acc); +escape(<<$u/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> + escaped_unicode(Rest, Stack, Callbacks, Opts, Acc, []); +escape(<>, Stack, Callbacks, Opts, Acc) + when S =:= ?quote; S =:= ?solidus; S =:= ?rsolidus -> + string(Rest, Stack, Callbacks, Opts, [S] ++ Acc); +escape(Bin, Stack, Callbacks, Opts, Acc) -> + {incomplete, fun(Stream) -> escape(<>, Stack, Callbacks, Opts, Acc) end}. + + +%% this code is ugly and unfortunate, but so is json's handling of escaped unicode +%% codepoint sequences. if the ascii option is present, the sequence is converted +%% to a codepoint and inserted into the string if it represents an ascii value. if +%% the codepoint option is present the sequence is converted and inserted as long +%% as it represents a valid unicode codepoint. this means non-characters +%% representable in 16 bits are not converted (the utf16 surrogates and the two +%% special non-characters). any other option and no conversion is done. 
+ +escaped_unicode(<>, + Stack, + Callbacks, + ?escaped_unicode_to_ascii(Opts), + String, + [C, B, A]) + when ?is_hex(D) -> + case erlang:list_to_integer([A, B, C, D], 16) of + X when X < 128 -> + string(Rest, Stack, Callbacks, Opts, [X] ++ String) + ; _ -> + string(Rest, Stack, Callbacks, Opts, [D, C, B, A, $u, ?rsolidus] ++ String) + end; +escaped_unicode(<>, + Stack, + Callbacks, + ?escaped_unicode_to_codepoint(Opts), + String, + [C, B, A]) + when ?is_hex(D) -> + case erlang:list_to_integer([A, B, C, D], 16) of + X when X < 16#d800; X > 16#dfff, X < 16#fffe -> + string(Rest, Stack, Callbacks, Opts, [X] ++ String) + ; _ -> + string(Rest, Stack, Callbacks, Opts, [D, C, B, A, $u, ?rsolidus] ++ String) + end; +escaped_unicode(<>, Stack, Callbacks, Opts, String, [C, B, A]) when ?is_hex(D) -> + string(Rest, Stack, Callbacks, Opts, [D, C, B, A, $u, ?rsolidus] ++ String); +escaped_unicode(<>, Stack, Callbacks, Opts, String, Acc) when ?is_hex(S) -> + escaped_unicode(Rest, Stack, Callbacks, Opts, String, [S] ++ Acc); +escaped_unicode(Bin, Stack, Callbacks, Opts, String, Acc) -> + {incomplete, fun(Stream) -> escaped_unicode(<>, Stack, Callbacks, Opts, String, Acc) end}. + + +%% like strings, numbers are collected in an intermediate accumulator before +%% being emitted to the callback handler. no processing of numbers is done in +%% process, it's left for the user, though there are convenience functions to +%% convert them into erlang floats/integers in jsx_utils.erl. + +%% TODO: actually write that jsx_utils.erl module mentioned above... + +negative(<<$0/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> + zero(Rest, Stack, Callbacks, Opts, "0" ++ Acc); +negative(<>, Stack, Callbacks, Opts, Acc) when ?is_nonzero(S) -> + integer(Rest, Stack, Callbacks, Opts, [S] ++ Acc); +negative(Bin, Stack, Callbacks, Opts, Acc) -> + {incomplete, fun(Stream) -> negative(<>, Stack, Callbacks, Opts, Acc) end}. 
+ + +zero(<>, [object|Stack], Callbacks, Opts, Acc) -> + maybe_done(Rest, Stack, fold(end_object, fold({number, lists:reverse(Acc)}, Callbacks)), Opts); +zero(<>, [array|Stack], Callbacks, Opts, Acc) -> + maybe_done(Rest, Stack, fold(end_array, fold({number, lists:reverse(Acc)}, Callbacks)), Opts); +zero(<>, [object|Stack], Callbacks, Opts, Acc) -> + key(Rest, [key|Stack], fold({number, lists:reverse(Acc)}, Callbacks), Opts); +zero(<>, [array|_] = Stack, Callbacks, Opts, Acc) -> + value(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts); +zero(<>, Stack, Callbacks, Opts, Acc) -> + fraction(Rest, Stack, Callbacks, Opts, [?decimalpoint] ++ Acc); +zero(<>, Stack, Callbacks, Opts, Acc) when ?is_whitespace(S) -> + maybe_done(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts); +zero(<>, Stack, Callbacks, ?comments_enabled(Opts), Acc) -> + maybe_comment(Rest, fun(Resume) -> zero(Resume, Stack, Callbacks, Opts, Acc) end); +zero(<<>>, [], Callbacks, Opts, Acc) -> + {fold(end_of_stream, fold({number, lists:reverse(Acc)}, Callbacks)), + fun(Stream) -> zero(Stream, [], Callbacks, Opts, Acc) end}; +zero(Bin, Stack, Callbacks, Opts, Acc) -> + {incomplete, fun(Stream) -> zero(<>, Stack, Callbacks, Opts, Acc) end}. 
+ + +integer(<>, Stack, Callbacks, Opts, Acc) when ?is_nonzero(S) -> + integer(Rest, Stack, Callbacks, Opts, [S] ++ Acc); +integer(<>, [object|Stack], Callbacks, Opts, Acc) -> + maybe_done(Rest, Stack, fold(end_object, fold({number, lists:reverse(Acc)}, Callbacks)), Opts); +integer(<>, [array|Stack], Callbacks, Opts, Acc) -> + maybe_done(Rest, Stack, fold(end_array, fold({number, lists:reverse(Acc)}, Callbacks)), Opts); +integer(<>, [object|Stack], Callbacks, Opts, Acc) -> + key(Rest, [key|Stack], fold({number, lists:reverse(Acc)}, Callbacks), Opts); +integer(<>, [array|_] = Stack, Callbacks, Opts, Acc) -> + value(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts); +integer(<>, Stack, Callbacks, Opts, Acc) -> + fraction(Rest, Stack, Callbacks, Opts, [?decimalpoint] ++ Acc); +integer(<>, Stack, Callbacks, Opts, Acc) -> + integer(Rest, Stack, Callbacks, Opts, [?zero] ++ Acc); +integer(<<$e/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> + e(Rest, Stack, Callbacks, Opts, "e" ++ Acc); +integer(<<$E/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> + e(Rest, Stack, Callbacks, Opts, "e" ++ Acc); +integer(<>, Stack, Callbacks, Opts, Acc) when ?is_whitespace(S) -> + maybe_done(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts); +integer(<>, Stack, Callbacks, ?comments_enabled(Opts), Acc) -> + maybe_comment(Rest, fun(Resume) -> integer(Resume, Stack, Callbacks, Opts, Acc) end); +integer(<<>>, [], Callbacks, Opts, Acc) -> + {fold(end_of_stream, fold({number, lists:reverse(Acc)}, Callbacks)), + fun(Stream) -> integer(Stream, [], Callbacks, Opts, Acc) end}; +integer(Bin, Stack, Callbacks, Opts, Acc) -> + {incomplete, fun(Stream) -> integer(<>, Stack, Callbacks, Opts, Acc) end}. 
+ +fraction(<>, Stack, Callbacks, Opts, Acc) when ?is_nonzero(S) -> + fraction(Rest, Stack, Callbacks, Opts, [S] ++ Acc); +fraction(<>, [object|Stack], Callbacks, Opts, Acc) -> + maybe_done(Rest, Stack, fold(end_object, fold({number, lists:reverse(Acc)}, Callbacks)), Opts); +fraction(<>, [array|Stack], Callbacks, Opts, Acc) -> + maybe_done(Rest, Stack, fold(end_array, fold({number, lists:reverse(Acc)}, Callbacks)), Opts); +fraction(<>, [object|Stack], Callbacks, Opts, Acc) -> + key(Rest, [key|Stack], fold({number, lists:reverse(Acc)}, Callbacks), Opts); +fraction(<>, [array|_] = Stack, Callbacks, Opts, Acc) -> + value(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts); +fraction(<>, Stack, Callbacks, Opts, Acc) -> + fraction(Rest, Stack, Callbacks, Opts, [?zero] ++ Acc); +fraction(<<$e/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> + e(Rest, Stack, Callbacks, Opts, "e" ++ Acc); +fraction(<<$E/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> + e(Rest, Stack, Callbacks, Opts, "e" ++ Acc); +fraction(<>, Stack, Callbacks, Opts, Acc) when ?is_whitespace(S) -> + maybe_done(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts); +fraction(<>, Stack, Callbacks, ?comments_enabled(Opts), Acc) -> + maybe_comment(Rest, fun(Resume) -> fraction(Resume, Stack, Callbacks, Opts, Acc) end); +fraction(<<>>, [], Callbacks, Opts, Acc) -> + {fold(end_of_stream, fold({number, lists:reverse(Acc)}, Callbacks)), + fun(Stream) -> fraction(Stream, [], Callbacks, Opts, Acc) end}; +fraction(Bin, Stack, Callbacks, Opts, Acc) -> + {incomplete, fun(Stream) -> fraction(<>, Stack, Callbacks, Opts, Acc) end}. 
+ + +e(<>, Stack, Callbacks, Opts, Acc) when S =:= ?zero; ?is_nonzero(S) -> + exp(Rest, Stack, Callbacks, Opts, [S] ++ Acc); +e(<>, Stack, Callbacks, Opts, Acc) when S =:= ?positive; S =:= ?negative -> + ex(Rest, Stack, Callbacks, Opts, [S] ++ Acc); +e(Bin, Stack, Callbacks, Opts, Acc) -> + {incomplete, fun(Stream) -> e(<>, Stack, Callbacks, Opts, Acc) end}. + + +ex(<>, Stack, Callbacks, Opts, Acc) when S =:= ?zero; ?is_nonzero(S) -> + exp(Rest, Stack, Callbacks, Opts, [S] ++ Acc); +ex(Bin, Stack, Callbacks, Opts, Acc) -> + {incomplete, fun(Stream) -> ex(<>, Stack, Callbacks, Opts, Acc) end}. + + +exp(<>, Stack, Callbacks, Opts, Acc) when ?is_nonzero(S) -> + exp(Rest, Stack, Callbacks, Opts, [S] ++ Acc); +exp(<>, [object|Stack], Callbacks, Opts, Acc) -> + maybe_done(Rest, Stack, fold(end_object, fold({number, lists:reverse(Acc)}, Callbacks)), Opts); +exp(<>, [array|Stack], Callbacks, Opts, Acc) -> + maybe_done(Rest, Stack, fold(end_array, fold({number, lists:reverse(Acc)}, Callbacks)), Opts); +exp(<>, [object|Stack], Callbacks, Opts, Acc) -> + key(Rest, [key|Stack], fold({number, lists:reverse(Acc)}, Callbacks), Opts); +exp(<>, [array|_] = Stack, Callbacks, Opts, Acc) -> + value(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts); +exp(<>, Stack, Callbacks, Opts, Acc) -> + exp(Rest, Stack, Callbacks, Opts, [?zero] ++ Acc); +exp(<>, Stack, Callbacks, ?comments_enabled(Opts), Acc) -> + maybe_comment(Rest, fun(Resume) -> exp(Resume, Stack, Callbacks, Opts, Acc) end); +exp(<>, Stack, Callbacks, Opts, Acc) when ?is_whitespace(S) -> + maybe_done(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts); +exp(<<>>, [], Callbacks, Opts, Acc) -> + {fold(end_of_stream, fold({number, lists:reverse(Acc)}, Callbacks)), + fun(Stream) -> exp(Stream, [], Callbacks, Opts, Acc) end}; +exp(Bin, Stack, Callbacks, Opts, Acc) -> + {incomplete, fun(Stream) -> exp(<>, Stack, Callbacks, Opts, Acc) end}. 
+ + +tr(<<$r/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + tru(Rest, Stack, Callbacks, Opts); +tr(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> tr(<>, Stack, Callbacks, Opts) end}. + + +tru(<<$u/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + true(Rest, Stack, Callbacks, Opts); +tru(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> tru(<>, Stack, Callbacks, Opts) end}. + + +true(<<$e/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + maybe_done(Rest, Stack, fold({literal, true}, Callbacks), Opts); +true(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> true(<>, Stack, Callbacks, Opts) end}. + + +fa(<<$a/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + fal(Rest, Stack, Callbacks, Opts); +fa(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> fa(<>, Stack, Callbacks, Opts) end}. + + +fal(<<$l/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + fals(Rest, Stack, Callbacks, Opts); +fal(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> fal(<>, Stack, Callbacks, Opts) end}. + + +fals(<<$s/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + false(Rest, Stack, Callbacks, Opts); +fals(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> fals(<>, Stack, Callbacks, Opts) end}. + + +false(<<$e/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + maybe_done(Rest, Stack, fold({literal, false}, Callbacks), Opts); +false(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> false(<>, Stack, Callbacks, Opts) end}. + + +nu(<<$u/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + nul(Rest, Stack, Callbacks, Opts); +nu(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> nu(<>, Stack, Callbacks, Opts) end}. + + +nul(<<$l/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + null(Rest, Stack, Callbacks, Opts); +nul(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> nul(<>, Stack, Callbacks, Opts) end}. 
+ + +null(<<$l/?encoding, Rest/binary>>, Stack, Callbacks, Opts) -> + maybe_done(Rest, Stack, fold({literal, null}, Callbacks), Opts); +null(Bin, Stack, Callbacks, Opts) -> + {incomplete, fun(Stream) -> null(<>, Stack, Callbacks, Opts) end}. + + +%% comments are c style, /* blah blah */ and are STRONGLY discouraged. any unicode +%% character is valid in a comment, except, obviously the */ sequence which ends +%% the comment. they're implemented as a closure called when the comment ends that +%% returns execution to the point where the comment began. comments are not +%% recorded in any way, simply parsed. + +maybe_comment(<>, Resume) -> + comment(Rest, Resume); +maybe_comment(Bin, Resume) -> + {incomplete, fun(Stream) -> maybe_comment(<>, Resume) end}. + + +comment(<>, Resume) -> + maybe_comment_done(Rest, Resume); +comment(<<_/?encoding, Rest/binary>>, Resume) -> + comment(Rest, Resume); +comment(Bin, Resume) -> + {incomplete, fun(Stream) -> comment(<>, Resume) end}. + + +maybe_comment_done(<>, Resume) -> + Resume(Rest); +maybe_comment_done(Bin, Resume) -> + {incomplete, fun(Stream) -> maybe_comment_done(<>, Resume) end}. \ No newline at end of file diff --git a/src/jsx_utf32le.erl b/src/jsx_utf32le.erl new file mode 100644 index 0000000..505b14f --- /dev/null +++ b/src/jsx_utf32le.erl @@ -0,0 +1,476 @@ +%% The MIT License + +%% Copyright (c) 2010 Alisdair Sullivan + +%% Permission is hereby granted, free of charge, to any person obtaining a copy +%% of this software and associated documentation files (the "Software"), to deal +%% in the Software without restriction, including without limitation the rights +%% to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +%% copies of the Software, and to permit persons to whom the Software is +%% furnished to do so, subject to the following conditions: + +%% The above copyright notice and this permission notice shall be included in +%% all copies or substantial portions of the Software. 
+
+%% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+%% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+%% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+%% AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+%% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+%% OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+%% THE SOFTWARE.
+
+-module(jsx_utf32le).
+-author("alisdairsullivan@yahoo.ca").
+
+-include("jsx_common.hrl").
+
+-export([start/4]).
+
+-define(encoding, utf32-little).
+
+
+%% callbacks to our handler are roughly equivalent to a fold over the events, incremental
+%% rather than all at once.
+
+fold(end_of_stream, {F, State}) ->
+    F(end_of_stream, State);
+fold(Event, {F, State}) when is_function(F) ->
+    {F, F(Event, State)}.
+
+
+%% this code is mostly autogenerated and mostly ugly. apologies. for more insight on
+%% Callbacks or Opts, see the comments accompanying decoder/2 (in jsx.erl). Stack
+%% is a stack of flags used to track depth and to keep track of whether we are
+%% returning from a value or a key inside objects. all pops, peeks and pushes are
+%% inlined. the code that handles naked values and comments is not optimized by the
+%% compiler for efficient matching, but you shouldn't be using naked values or comments
+%% anyways, they are horrible and contrary to the spec.
+
+start(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts) when ?is_whitespace(S) ->
+    start(Rest, Stack, Callbacks, Opts);
+start(<<?start_object/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    object(Rest, [key|Stack], fold(start_object, Callbacks), Opts);
+start(<<?start_array/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    array(Rest, [array|Stack], fold(start_array, Callbacks), Opts);
+start(<<?quote/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    string(Rest, Stack, Callbacks, Opts, []);
+start(<<$t/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    tr(Rest, Stack, Callbacks, Opts);
+start(<<$f/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    fa(Rest, Stack, Callbacks, Opts);
+start(<<$n/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    nu(Rest, Stack, Callbacks, Opts);
+start(<<?negative/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    negative(Rest, Stack, Callbacks, Opts, "-");
+start(<<?zero/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    zero(Rest, Stack, Callbacks, Opts, "0");
+start(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts) when ?is_nonzero(S) ->
+    integer(Rest, Stack, Callbacks, Opts, [S]);
+start(<<?solidus/?encoding, Rest/binary>>, Stack, Callbacks, ?comments_enabled(Opts)) ->
+    maybe_comment(Rest, fun(Resume) -> start(Resume, Stack, Callbacks, Opts) end);
+start(Bin, Stack, Callbacks, Opts) ->
+    {incomplete, fun(Stream) -> start(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts) end}.
+
+
+maybe_done(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts) when ?is_whitespace(S) ->
+    maybe_done(Rest, Stack, Callbacks, Opts);
+maybe_done(<<?end_object/?encoding, Rest/binary>>, [object|Stack], Callbacks, Opts) ->
+    maybe_done(Rest, Stack, fold(end_object, Callbacks), Opts);
+maybe_done(<<?end_array/?encoding, Rest/binary>>, [array|Stack], Callbacks, Opts) ->
+    maybe_done(Rest, Stack, fold(end_array, Callbacks), Opts);
+maybe_done(<<?comma/?encoding, Rest/binary>>, [object|Stack], Callbacks, Opts) ->
+    key(Rest, [key|Stack], Callbacks, Opts);
+maybe_done(<<?comma/?encoding, Rest/binary>>, [array|_] = Stack, Callbacks, Opts) ->
+    value(Rest, Stack, Callbacks, Opts);
+maybe_done(<<?solidus/?encoding, Rest/binary>>, Stack, Callbacks, ?comments_enabled(Opts)) ->
+    maybe_comment(Rest, fun(Resume) -> maybe_done(Resume, Stack, Callbacks, Opts) end);
+maybe_done(<<>>, [], Callbacks, Opts) ->
+    {fold(end_of_stream, Callbacks), fun(Stream) -> maybe_done(Stream, [], Callbacks, Opts) end};
+maybe_done(Bin, Stack, Callbacks, Opts) ->
+    {incomplete, fun(Stream) -> maybe_done(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts) end}.
+
+
+object(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts) when ?is_whitespace(S) ->
+    object(Rest, Stack, Callbacks, Opts);
+object(<<?quote/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    string(Rest, Stack, Callbacks, Opts, []);
+object(<<?end_object/?encoding, Rest/binary>>, [key|Stack], Callbacks, Opts) ->
+    maybe_done(Rest, Stack, fold(end_object, Callbacks), Opts);
+object(<<?solidus/?encoding, Rest/binary>>, Stack, Callbacks, ?comments_enabled(Opts)) ->
+    maybe_comment(Rest, fun(Resume) -> object(Resume, Stack, Callbacks, Opts) end);
+object(Bin, Stack, Callbacks, Opts) ->
+    {incomplete, fun(Stream) -> object(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts) end}.
+
+
+array(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts) when ?is_whitespace(S) ->
+    array(Rest, Stack, Callbacks, Opts);
+array(<<?quote/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    string(Rest, Stack, Callbacks, Opts, []);
+array(<<$t/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    tr(Rest, Stack, Callbacks, Opts);
+array(<<$f/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    fa(Rest, Stack, Callbacks, Opts);
+array(<<$n/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    nu(Rest, Stack, Callbacks, Opts);
+array(<<?negative/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    negative(Rest, Stack, Callbacks, Opts, "-");
+array(<<?zero/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    zero(Rest, Stack, Callbacks, Opts, "0");
+array(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts) when ?is_nonzero(S) ->
+    integer(Rest, Stack, Callbacks, Opts, [S]);
+array(<<?start_object/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    object(Rest, [key|Stack], fold(start_object, Callbacks), Opts);
+array(<<?start_array/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    array(Rest, [array|Stack], fold(start_array, Callbacks), Opts);
+array(<<?end_array/?encoding, Rest/binary>>, [array|Stack], Callbacks, Opts) ->
+    maybe_done(Rest, Stack, fold(end_array, Callbacks), Opts);
+array(<<?solidus/?encoding, Rest/binary>>, Stack, Callbacks, ?comments_enabled(Opts)) ->
+    maybe_comment(Rest, fun(Resume) -> array(Resume, Stack, Callbacks, Opts) end);
+array(Bin, Stack, Callbacks, Opts) ->
+    {incomplete, fun(Stream) -> array(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts) end}.
+
+
+value(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts) when ?is_whitespace(S) ->
+    value(Rest, Stack, Callbacks, Opts);
+value(<<?quote/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    string(Rest, Stack, Callbacks, Opts, []);
+value(<<$t/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    tr(Rest, Stack, Callbacks, Opts);
+value(<<$f/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    fa(Rest, Stack, Callbacks, Opts);
+value(<<$n/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    nu(Rest, Stack, Callbacks, Opts);
+value(<<?negative/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    negative(Rest, Stack, Callbacks, Opts, "-");
+value(<<?zero/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    zero(Rest, Stack, Callbacks, Opts, "0");
+value(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts) when ?is_nonzero(S) ->
+    integer(Rest, Stack, Callbacks, Opts, [S]);
+value(<<?start_object/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    object(Rest, [key|Stack], fold(start_object, Callbacks), Opts);
+value(<<?start_array/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    array(Rest, [array|Stack], fold(start_array, Callbacks), Opts);
+value(<<?solidus/?encoding, Rest/binary>>, Stack, Callbacks, ?comments_enabled(Opts)) ->
+    maybe_comment(Rest, fun(Resume) -> value(Resume, Stack, Callbacks, Opts) end);
+value(Bin, Stack, Callbacks, Opts) ->
+    {incomplete, fun(Stream) -> value(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts) end}.
+
+
+colon(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts) when ?is_whitespace(S) ->
+    colon(Rest, Stack, Callbacks, Opts);
+colon(<<?colon/?encoding, Rest/binary>>, [key|Stack], Callbacks, Opts) ->
+    value(Rest, [object|Stack], Callbacks, Opts);
+colon(<<?solidus/?encoding, Rest/binary>>, Stack, Callbacks, ?comments_enabled(Opts)) ->
+    maybe_comment(Rest, fun(Resume) -> colon(Resume, Stack, Callbacks, Opts) end);
+colon(Bin, Stack, Callbacks, Opts) ->
+    {incomplete, fun(Stream) -> colon(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts) end}.
+
+
+key(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts) when ?is_whitespace(S) ->
+    key(Rest, Stack, Callbacks, Opts);
+key(<<?quote/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    string(Rest, Stack, Callbacks, Opts, []);
+key(<<?solidus/?encoding, Rest/binary>>, Stack, Callbacks, ?comments_enabled(Opts)) ->
+    maybe_comment(Rest, fun(Resume) -> key(Resume, Stack, Callbacks, Opts) end);
+key(Bin, Stack, Callbacks, Opts) ->
+    {incomplete, fun(Stream) -> key(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts) end}.
+
+
+%% string has an additional parameter, an accumulator (Acc) used to hold the intermediate
+%% representation of the string being parsed. using a list of integers representing
+%% unicode codepoints is faster than constructing binaries, many of which will be
+%% converted back to lists by the user anyways.
+
+%% the clause starting with Bin is necessary for cases where a stream is broken at a
+%% point where it contains only a partial utf-8 sequence.
+
+string(<<?quote/?encoding, Rest/binary>>, [key|_] = Stack, Callbacks, Opts, Acc) ->
+    colon(Rest, Stack, fold({key, lists:reverse(Acc)}, Callbacks), Opts);
+string(<<?quote/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
+    maybe_done(Rest, Stack, fold({string, lists:reverse(Acc)}, Callbacks), Opts);
+string(<<?rsolidus/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
+    escape(Rest, Stack, Callbacks, Opts, Acc);
+string(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) when ?is_noncontrol(S) ->
+    string(Rest, Stack, Callbacks, Opts, [S] ++ Acc);
+string(Bin, Stack, Callbacks, Opts, Acc) ->
+    {incomplete, fun(Stream) -> string(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts, Acc) end}.
+
+
+%% only thing to note here is the additional accumulator passed to escaped_unicode used
+%% to hold the codepoint sequence. unnecessary, but nicer than using the string
+%% accumulator.
+
+escape(<<$b/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
+    string(Rest, Stack, Callbacks, Opts, "\b" ++ Acc);
+escape(<<$f/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
+    string(Rest, Stack, Callbacks, Opts, "\f" ++ Acc);
+escape(<<$n/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
+    string(Rest, Stack, Callbacks, Opts, "\n" ++ Acc);
+escape(<<$r/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
+    string(Rest, Stack, Callbacks, Opts, "\r" ++ Acc);
+escape(<<$t/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
+    string(Rest, Stack, Callbacks, Opts, "\t" ++ Acc);
+escape(<<$u/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
+    escaped_unicode(Rest, Stack, Callbacks, Opts, Acc, []);
+escape(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc)
+        when S =:= ?quote; S =:= ?solidus; S =:= ?rsolidus ->
+    string(Rest, Stack, Callbacks, Opts, [S] ++ Acc);
+escape(Bin, Stack, Callbacks, Opts, Acc) ->
+    {incomplete, fun(Stream) -> escape(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts, Acc) end}.
+
+
+%% this code is ugly and unfortunate, but so is json's handling of escaped unicode
+%% codepoint sequences. if the ascii option is present, the sequence is converted
+%% to a codepoint and inserted into the string if it represents an ascii value. if
+%% the codepoint option is present the sequence is converted and inserted as long
+%% as it represents a valid unicode codepoint. this means non-characters
+%% representable in 16 bits are not converted (the utf16 surrogates and the two
+%% special non-characters). any other option and no conversion is done.
+
+escaped_unicode(<<D/?encoding, Rest/binary>>,
+        Stack,
+        Callbacks,
+        ?escaped_unicode_to_ascii(Opts),
+        String,
+        [C, B, A])
+            when ?is_hex(D) ->
+    case erlang:list_to_integer([A, B, C, D], 16) of
+        X when X < 128 ->
+            string(Rest, Stack, Callbacks, Opts, [X] ++ String)
+        ; _ ->
+            string(Rest, Stack, Callbacks, Opts, [D, C, B, A, $u, ?rsolidus] ++ String)
+    end;
+escaped_unicode(<<D/?encoding, Rest/binary>>,
+        Stack,
+        Callbacks,
+        ?escaped_unicode_to_codepoint(Opts),
+        String,
+        [C, B, A])
+            when ?is_hex(D) ->
+    case erlang:list_to_integer([A, B, C, D], 16) of
+        X when X < 16#d800; X > 16#dfff, X < 16#fffe ->
+            string(Rest, Stack, Callbacks, Opts, [X] ++ String)
+        ; _ ->
+            string(Rest, Stack, Callbacks, Opts, [D, C, B, A, $u, ?rsolidus] ++ String)
+    end;
+escaped_unicode(<<D/?encoding, Rest/binary>>, Stack, Callbacks, Opts, String, [C, B, A]) when ?is_hex(D) ->
+    string(Rest, Stack, Callbacks, Opts, [D, C, B, A, $u, ?rsolidus] ++ String);
+escaped_unicode(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts, String, Acc) when ?is_hex(S) ->
+    escaped_unicode(Rest, Stack, Callbacks, Opts, String, [S] ++ Acc);
+escaped_unicode(Bin, Stack, Callbacks, Opts, String, Acc) ->
+    {incomplete, fun(Stream) -> escaped_unicode(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts, String, Acc) end}.
+
+
+%% like strings, numbers are collected in an intermediate accumulator before
+%% being emitted to the callback handler. no processing of numbers is done in
+%% process, it's left for the user, though there are convenience functions to
+%% convert them into erlang floats/integers in jsx_utils.erl.
+
+%% TODO: actually write that jsx_utils.erl module mentioned above...
+
+negative(<<$0/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
+    zero(Rest, Stack, Callbacks, Opts, "0" ++ Acc);
+negative(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) when ?is_nonzero(S) ->
+    integer(Rest, Stack, Callbacks, Opts, [S] ++ Acc);
+negative(Bin, Stack, Callbacks, Opts, Acc) ->
+    {incomplete, fun(Stream) -> negative(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts, Acc) end}.
+
+
+zero(<<?end_object/?encoding, Rest/binary>>, [object|Stack], Callbacks, Opts, Acc) ->
+    maybe_done(Rest, Stack, fold(end_object, fold({number, lists:reverse(Acc)}, Callbacks)), Opts);
+zero(<<?end_array/?encoding, Rest/binary>>, [array|Stack], Callbacks, Opts, Acc) ->
+    maybe_done(Rest, Stack, fold(end_array, fold({number, lists:reverse(Acc)}, Callbacks)), Opts);
+zero(<<?comma/?encoding, Rest/binary>>, [object|Stack], Callbacks, Opts, Acc) ->
+    key(Rest, [key|Stack], fold({number, lists:reverse(Acc)}, Callbacks), Opts);
+zero(<<?comma/?encoding, Rest/binary>>, [array|_] = Stack, Callbacks, Opts, Acc) ->
+    value(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts);
+zero(<<?decimalpoint/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
+    fraction(Rest, Stack, Callbacks, Opts, [?decimalpoint] ++ Acc);
+zero(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) when ?is_whitespace(S) ->
+    maybe_done(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts);
+zero(<<?solidus/?encoding, Rest/binary>>, Stack, Callbacks, ?comments_enabled(Opts), Acc) ->
+    maybe_comment(Rest, fun(Resume) -> zero(Resume, Stack, Callbacks, Opts, Acc) end);
+zero(<<>>, [], Callbacks, Opts, Acc) ->
+    {fold(end_of_stream, fold({number, lists:reverse(Acc)}, Callbacks)),
+        fun(Stream) -> zero(Stream, [], Callbacks, Opts, Acc) end};
+zero(Bin, Stack, Callbacks, Opts, Acc) ->
+    {incomplete, fun(Stream) -> zero(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts, Acc) end}.
+
+
+integer(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) when ?is_nonzero(S) ->
+    integer(Rest, Stack, Callbacks, Opts, [S] ++ Acc);
+integer(<<?end_object/?encoding, Rest/binary>>, [object|Stack], Callbacks, Opts, Acc) ->
+    maybe_done(Rest, Stack, fold(end_object, fold({number, lists:reverse(Acc)}, Callbacks)), Opts);
+integer(<<?end_array/?encoding, Rest/binary>>, [array|Stack], Callbacks, Opts, Acc) ->
+    maybe_done(Rest, Stack, fold(end_array, fold({number, lists:reverse(Acc)}, Callbacks)), Opts);
+integer(<<?comma/?encoding, Rest/binary>>, [object|Stack], Callbacks, Opts, Acc) ->
+    key(Rest, [key|Stack], fold({number, lists:reverse(Acc)}, Callbacks), Opts);
+integer(<<?comma/?encoding, Rest/binary>>, [array|_] = Stack, Callbacks, Opts, Acc) ->
+    value(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts);
+integer(<<?decimalpoint/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
+    fraction(Rest, Stack, Callbacks, Opts, [?decimalpoint] ++ Acc);
+integer(<<?zero/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
+    integer(Rest, Stack, Callbacks, Opts, [?zero] ++ Acc);
+integer(<<$e/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
+    e(Rest, Stack, Callbacks, Opts, "e" ++ Acc);
+integer(<<$E/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
+    e(Rest, Stack, Callbacks, Opts, "e" ++ Acc);
+integer(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) when ?is_whitespace(S) ->
+    maybe_done(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts);
+integer(<<?solidus/?encoding, Rest/binary>>, Stack, Callbacks, ?comments_enabled(Opts), Acc) ->
+    maybe_comment(Rest, fun(Resume) -> integer(Resume, Stack, Callbacks, Opts, Acc) end);
+integer(<<>>, [], Callbacks, Opts, Acc) ->
+    {fold(end_of_stream, fold({number, lists:reverse(Acc)}, Callbacks)),
+        fun(Stream) -> integer(Stream, [], Callbacks, Opts, Acc) end};
+integer(Bin, Stack, Callbacks, Opts, Acc) ->
+    {incomplete, fun(Stream) -> integer(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts, Acc) end}.
+
+fraction(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) when ?is_nonzero(S) ->
+    fraction(Rest, Stack, Callbacks, Opts, [S] ++ Acc);
+fraction(<<?end_object/?encoding, Rest/binary>>, [object|Stack], Callbacks, Opts, Acc) ->
+    maybe_done(Rest, Stack, fold(end_object, fold({number, lists:reverse(Acc)}, Callbacks)), Opts);
+fraction(<<?end_array/?encoding, Rest/binary>>, [array|Stack], Callbacks, Opts, Acc) ->
+    maybe_done(Rest, Stack, fold(end_array, fold({number, lists:reverse(Acc)}, Callbacks)), Opts);
+fraction(<<?comma/?encoding, Rest/binary>>, [object|Stack], Callbacks, Opts, Acc) ->
+    key(Rest, [key|Stack], fold({number, lists:reverse(Acc)}, Callbacks), Opts);
+fraction(<<?comma/?encoding, Rest/binary>>, [array|_] = Stack, Callbacks, Opts, Acc) ->
+    value(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts);
+fraction(<<?zero/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
+    fraction(Rest, Stack, Callbacks, Opts, [?zero] ++ Acc);
+fraction(<<$e/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
+    e(Rest, Stack, Callbacks, Opts, "e" ++ Acc);
+fraction(<<$E/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
+    e(Rest, Stack, Callbacks, Opts, "e" ++ Acc);
+fraction(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) when ?is_whitespace(S) ->
+    maybe_done(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts);
+fraction(<<?solidus/?encoding, Rest/binary>>, Stack, Callbacks, ?comments_enabled(Opts), Acc) ->
+    maybe_comment(Rest, fun(Resume) -> fraction(Resume, Stack, Callbacks, Opts, Acc) end);
+fraction(<<>>, [], Callbacks, Opts, Acc) ->
+    {fold(end_of_stream, fold({number, lists:reverse(Acc)}, Callbacks)),
+        fun(Stream) -> fraction(Stream, [], Callbacks, Opts, Acc) end};
+fraction(Bin, Stack, Callbacks, Opts, Acc) ->
+    {incomplete, fun(Stream) -> fraction(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts, Acc) end}.
+
+
+e(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) when S =:= ?zero; ?is_nonzero(S) ->
+    exp(Rest, Stack, Callbacks, Opts, [S] ++ Acc);
+e(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) when S =:= ?positive; S =:= ?negative ->
+    ex(Rest, Stack, Callbacks, Opts, [S] ++ Acc);
+e(Bin, Stack, Callbacks, Opts, Acc) ->
+    {incomplete, fun(Stream) -> e(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts, Acc) end}.
+
+
+ex(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) when S =:= ?zero; ?is_nonzero(S) ->
+    exp(Rest, Stack, Callbacks, Opts, [S] ++ Acc);
+ex(Bin, Stack, Callbacks, Opts, Acc) ->
+    {incomplete, fun(Stream) -> ex(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts, Acc) end}.
+
+
+exp(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) when ?is_nonzero(S) ->
+    exp(Rest, Stack, Callbacks, Opts, [S] ++ Acc);
+exp(<<?end_object/?encoding, Rest/binary>>, [object|Stack], Callbacks, Opts, Acc) ->
+    maybe_done(Rest, Stack, fold(end_object, fold({number, lists:reverse(Acc)}, Callbacks)), Opts);
+exp(<<?end_array/?encoding, Rest/binary>>, [array|Stack], Callbacks, Opts, Acc) ->
+    maybe_done(Rest, Stack, fold(end_array, fold({number, lists:reverse(Acc)}, Callbacks)), Opts);
+exp(<<?comma/?encoding, Rest/binary>>, [object|Stack], Callbacks, Opts, Acc) ->
+    key(Rest, [key|Stack], fold({number, lists:reverse(Acc)}, Callbacks), Opts);
+exp(<<?comma/?encoding, Rest/binary>>, [array|_] = Stack, Callbacks, Opts, Acc) ->
+    value(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts);
+exp(<<?zero/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
+    exp(Rest, Stack, Callbacks, Opts, [?zero] ++ Acc);
+exp(<<?solidus/?encoding, Rest/binary>>, Stack, Callbacks, ?comments_enabled(Opts), Acc) ->
+    maybe_comment(Rest, fun(Resume) -> exp(Resume, Stack, Callbacks, Opts, Acc) end);
+exp(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) when ?is_whitespace(S) ->
+    maybe_done(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts);
+exp(<<>>, [], Callbacks, Opts, Acc) ->
+    {fold(end_of_stream, fold({number, lists:reverse(Acc)}, Callbacks)),
+        fun(Stream) -> exp(Stream, [], Callbacks, Opts, Acc) end};
+exp(Bin, Stack, Callbacks, Opts, Acc) ->
+    {incomplete, fun(Stream) -> exp(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts, Acc) end}.
+
+
+tr(<<$r/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    tru(Rest, Stack, Callbacks, Opts);
+tr(Bin, Stack, Callbacks, Opts) ->
+    {incomplete, fun(Stream) -> tr(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts) end}.
+
+
+tru(<<$u/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    true(Rest, Stack, Callbacks, Opts);
+tru(Bin, Stack, Callbacks, Opts) ->
+    {incomplete, fun(Stream) -> tru(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts) end}.
+
+
+true(<<$e/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    maybe_done(Rest, Stack, fold({literal, true}, Callbacks), Opts);
+true(Bin, Stack, Callbacks, Opts) ->
+    {incomplete, fun(Stream) -> true(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts) end}.
+
+
+fa(<<$a/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    fal(Rest, Stack, Callbacks, Opts);
+fa(Bin, Stack, Callbacks, Opts) ->
+    {incomplete, fun(Stream) -> fa(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts) end}.
+
+
+fal(<<$l/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    fals(Rest, Stack, Callbacks, Opts);
+fal(Bin, Stack, Callbacks, Opts) ->
+    {incomplete, fun(Stream) -> fal(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts) end}.
+
+
+fals(<<$s/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    false(Rest, Stack, Callbacks, Opts);
+fals(Bin, Stack, Callbacks, Opts) ->
+    {incomplete, fun(Stream) -> fals(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts) end}.
+
+
+false(<<$e/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    maybe_done(Rest, Stack, fold({literal, false}, Callbacks), Opts);
+false(Bin, Stack, Callbacks, Opts) ->
+    {incomplete, fun(Stream) -> false(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts) end}.
+
+
+nu(<<$u/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    nul(Rest, Stack, Callbacks, Opts);
+nu(Bin, Stack, Callbacks, Opts) ->
+    {incomplete, fun(Stream) -> nu(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts) end}.
+
+
+nul(<<$l/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    null(Rest, Stack, Callbacks, Opts);
+nul(Bin, Stack, Callbacks, Opts) ->
+    {incomplete, fun(Stream) -> nul(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts) end}.
+
+
+null(<<$l/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    maybe_done(Rest, Stack, fold({literal, null}, Callbacks), Opts);
+null(Bin, Stack, Callbacks, Opts) ->
+    {incomplete, fun(Stream) -> null(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts) end}.
+
+
+%% comments are c style, /* blah blah */ and are STRONGLY discouraged. any unicode
+%% character is valid in a comment, except, obviously the */ sequence which ends
+%% the comment. they're implemented as a closure called when the comment ends that
+%% returns execution to the point where the comment began. comments are not
+%% recorded in any way, simply parsed.
+
+maybe_comment(<<?star/?encoding, Rest/binary>>, Resume) ->  %% NOTE(review): pattern reconstructed; ?star assumed to be $* in jsx_common.hrl -- confirm
+    comment(Rest, Resume);
+maybe_comment(Bin, Resume) ->
+    {incomplete, fun(Stream) -> maybe_comment(<<Bin/binary, Stream/binary>>, Resume) end}.
+
+
+comment(<<?star/?encoding, Rest/binary>>, Resume) ->
+    maybe_comment_done(Rest, Resume);
+comment(<<_/?encoding, Rest/binary>>, Resume) ->
+    comment(Rest, Resume);
+comment(Bin, Resume) ->
+    {incomplete, fun(Stream) -> comment(<<Bin/binary, Stream/binary>>, Resume) end}.
+
+
+maybe_comment_done(<<?solidus/?encoding, Rest/binary>>, Resume) ->
+    Resume(Rest);
+maybe_comment_done(Bin, Resume) ->  %% NOTE(review): a '*' not followed by '/' appears to land here and stay incomplete -- confirm
+    {incomplete, fun(Stream) -> maybe_comment_done(<<Bin/binary, Stream/binary>>, Resume) end}.
\ No newline at end of file
diff --git a/src/jsx_utf8.erl b/src/jsx_utf8.erl
new file mode 100644
index 0000000..92a9c61
--- /dev/null
+++ b/src/jsx_utf8.erl
@@ -0,0 +1,476 @@
+%% The MIT License
+
+%% Copyright (c) 2010 Alisdair Sullivan
+
+%% Permission is hereby granted, free of charge, to any person obtaining a copy
+%% of this software and associated documentation files (the "Software"), to deal
+%% in the Software without restriction, including without limitation the rights
+%% to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+%% copies of the Software, and to permit persons to whom the Software is
+%% furnished to do so, subject to the following conditions:
+
+%% The above copyright notice and this permission notice shall be included in
+%% all copies or substantial portions of the Software.
+
+%% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+%% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+%% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+%% AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+%% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+%% OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+%% THE SOFTWARE.
+
+-module(jsx_utf8).
+-author("alisdairsullivan@yahoo.ca").
+
+-include("jsx_common.hrl").
+
+-export([start/4]).
+
+-define(encoding, utf8).
+
+
+%% callbacks to our handler are roughly equivalent to a fold over the events, incremental
+%% rather than all at once.
+
+fold(end_of_stream, {F, State}) ->
+    F(end_of_stream, State);
+fold(Event, {F, State}) when is_function(F) ->
+    {F, F(Event, State)}.
+
+
+%% this code is mostly autogenerated and mostly ugly. apologies. for more insight on
+%% Callbacks or Opts, see the comments accompanying decoder/2 (in jsx.erl). Stack
+%% is a stack of flags used to track depth and to keep track of whether we are
+%% returning from a value or a key inside objects. all pops, peeks and pushes are
+%% inlined. the code that handles naked values and comments is not optimized by the
+%% compiler for efficient matching, but you shouldn't be using naked values or comments
+%% anyways, they are horrible and contrary to the spec.
+
+start(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts) when ?is_whitespace(S) ->
+    start(Rest, Stack, Callbacks, Opts);
+start(<<?start_object/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    object(Rest, [key|Stack], fold(start_object, Callbacks), Opts);
+start(<<?start_array/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    array(Rest, [array|Stack], fold(start_array, Callbacks), Opts);
+start(<<?quote/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    string(Rest, Stack, Callbacks, Opts, []);
+start(<<$t/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    tr(Rest, Stack, Callbacks, Opts);
+start(<<$f/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    fa(Rest, Stack, Callbacks, Opts);
+start(<<$n/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    nu(Rest, Stack, Callbacks, Opts);
+start(<<?negative/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    negative(Rest, Stack, Callbacks, Opts, "-");
+start(<<?zero/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    zero(Rest, Stack, Callbacks, Opts, "0");
+start(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts) when ?is_nonzero(S) ->
+    integer(Rest, Stack, Callbacks, Opts, [S]);
+start(<<?solidus/?encoding, Rest/binary>>, Stack, Callbacks, ?comments_enabled(Opts)) ->
+    maybe_comment(Rest, fun(Resume) -> start(Resume, Stack, Callbacks, Opts) end);
+start(Bin, Stack, Callbacks, Opts) ->
+    {incomplete, fun(Stream) -> start(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts) end}.
+
+
+maybe_done(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts) when ?is_whitespace(S) ->
+    maybe_done(Rest, Stack, Callbacks, Opts);
+maybe_done(<<?end_object/?encoding, Rest/binary>>, [object|Stack], Callbacks, Opts) ->
+    maybe_done(Rest, Stack, fold(end_object, Callbacks), Opts);
+maybe_done(<<?end_array/?encoding, Rest/binary>>, [array|Stack], Callbacks, Opts) ->
+    maybe_done(Rest, Stack, fold(end_array, Callbacks), Opts);
+maybe_done(<<?comma/?encoding, Rest/binary>>, [object|Stack], Callbacks, Opts) ->
+    key(Rest, [key|Stack], Callbacks, Opts);
+maybe_done(<<?comma/?encoding, Rest/binary>>, [array|_] = Stack, Callbacks, Opts) ->
+    value(Rest, Stack, Callbacks, Opts);
+maybe_done(<<?solidus/?encoding, Rest/binary>>, Stack, Callbacks, ?comments_enabled(Opts)) ->
+    maybe_comment(Rest, fun(Resume) -> maybe_done(Resume, Stack, Callbacks, Opts) end);
+maybe_done(<<>>, [], Callbacks, Opts) ->
+    {fold(end_of_stream, Callbacks), fun(Stream) -> maybe_done(Stream, [], Callbacks, Opts) end};
+maybe_done(Bin, Stack, Callbacks, Opts) ->
+    {incomplete, fun(Stream) -> maybe_done(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts) end}.
+
+
+object(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts) when ?is_whitespace(S) ->
+    object(Rest, Stack, Callbacks, Opts);
+object(<<?quote/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    string(Rest, Stack, Callbacks, Opts, []);
+object(<<?end_object/?encoding, Rest/binary>>, [key|Stack], Callbacks, Opts) ->
+    maybe_done(Rest, Stack, fold(end_object, Callbacks), Opts);
+object(<<?solidus/?encoding, Rest/binary>>, Stack, Callbacks, ?comments_enabled(Opts)) ->
+    maybe_comment(Rest, fun(Resume) -> object(Resume, Stack, Callbacks, Opts) end);
+object(Bin, Stack, Callbacks, Opts) ->
+    {incomplete, fun(Stream) -> object(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts) end}.
+
+
+array(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts) when ?is_whitespace(S) ->
+    array(Rest, Stack, Callbacks, Opts);
+array(<<?quote/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    string(Rest, Stack, Callbacks, Opts, []);
+array(<<$t/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    tr(Rest, Stack, Callbacks, Opts);
+array(<<$f/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    fa(Rest, Stack, Callbacks, Opts);
+array(<<$n/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    nu(Rest, Stack, Callbacks, Opts);
+array(<<?negative/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    negative(Rest, Stack, Callbacks, Opts, "-");
+array(<<?zero/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    zero(Rest, Stack, Callbacks, Opts, "0");
+array(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts) when ?is_nonzero(S) ->
+    integer(Rest, Stack, Callbacks, Opts, [S]);
+array(<<?start_object/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    object(Rest, [key|Stack], fold(start_object, Callbacks), Opts);
+array(<<?start_array/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    array(Rest, [array|Stack], fold(start_array, Callbacks), Opts);
+array(<<?end_array/?encoding, Rest/binary>>, [array|Stack], Callbacks, Opts) ->
+    maybe_done(Rest, Stack, fold(end_array, Callbacks), Opts);
+array(<<?solidus/?encoding, Rest/binary>>, Stack, Callbacks, ?comments_enabled(Opts)) ->
+    maybe_comment(Rest, fun(Resume) -> array(Resume, Stack, Callbacks, Opts) end);
+array(Bin, Stack, Callbacks, Opts) ->
+    {incomplete, fun(Stream) -> array(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts) end}.
+
+
+value(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts) when ?is_whitespace(S) ->
+    value(Rest, Stack, Callbacks, Opts);
+value(<<?quote/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    string(Rest, Stack, Callbacks, Opts, []);
+value(<<$t/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    tr(Rest, Stack, Callbacks, Opts);
+value(<<$f/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    fa(Rest, Stack, Callbacks, Opts);
+value(<<$n/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    nu(Rest, Stack, Callbacks, Opts);
+value(<<?negative/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    negative(Rest, Stack, Callbacks, Opts, "-");
+value(<<?zero/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    zero(Rest, Stack, Callbacks, Opts, "0");
+value(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts) when ?is_nonzero(S) ->
+    integer(Rest, Stack, Callbacks, Opts, [S]);
+value(<<?start_object/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    object(Rest, [key|Stack], fold(start_object, Callbacks), Opts);
+value(<<?start_array/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    array(Rest, [array|Stack], fold(start_array, Callbacks), Opts);
+value(<<?solidus/?encoding, Rest/binary>>, Stack, Callbacks, ?comments_enabled(Opts)) ->
+    maybe_comment(Rest, fun(Resume) -> value(Resume, Stack, Callbacks, Opts) end);
+value(Bin, Stack, Callbacks, Opts) ->
+    {incomplete, fun(Stream) -> value(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts) end}.
+
+
+colon(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts) when ?is_whitespace(S) ->
+    colon(Rest, Stack, Callbacks, Opts);
+colon(<<?colon/?encoding, Rest/binary>>, [key|Stack], Callbacks, Opts) ->
+    value(Rest, [object|Stack], Callbacks, Opts);
+colon(<<?solidus/?encoding, Rest/binary>>, Stack, Callbacks, ?comments_enabled(Opts)) ->
+    maybe_comment(Rest, fun(Resume) -> colon(Resume, Stack, Callbacks, Opts) end);
+colon(Bin, Stack, Callbacks, Opts) ->
+    {incomplete, fun(Stream) -> colon(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts) end}.
+
+
+key(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts) when ?is_whitespace(S) ->
+    key(Rest, Stack, Callbacks, Opts);
+key(<<?quote/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
+    string(Rest, Stack, Callbacks, Opts, []);
+key(<<?solidus/?encoding, Rest/binary>>, Stack, Callbacks, ?comments_enabled(Opts)) ->
+    maybe_comment(Rest, fun(Resume) -> key(Resume, Stack, Callbacks, Opts) end);
+key(Bin, Stack, Callbacks, Opts) ->
+    {incomplete, fun(Stream) -> key(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts) end}.
+
+
+%% string has an additional parameter, an accumulator (Acc) used to hold the intermediate
+%% representation of the string being parsed. using a list of integers representing
+%% unicode codepoints is faster than constructing binaries, many of which will be
+%% converted back to lists by the user anyways.
+
+%% the clause starting with Bin is necessary for cases where a stream is broken at a
+%% point where it contains only a partial utf-8 sequence.
+
+string(<<?quote/?encoding, Rest/binary>>, [key|_] = Stack, Callbacks, Opts, Acc) ->
+    colon(Rest, Stack, fold({key, lists:reverse(Acc)}, Callbacks), Opts);
+string(<<?quote/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
+    maybe_done(Rest, Stack, fold({string, lists:reverse(Acc)}, Callbacks), Opts);
+string(<<?rsolidus/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
+    escape(Rest, Stack, Callbacks, Opts, Acc);
+string(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) when ?is_noncontrol(S) ->
+    string(Rest, Stack, Callbacks, Opts, [S] ++ Acc);
+string(Bin, Stack, Callbacks, Opts, Acc) ->
+    {incomplete, fun(Stream) -> string(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts, Acc) end}.
+
+
+%% only thing to note here is the additional accumulator passed to escaped_unicode used
+%% to hold the codepoint sequence. unnecessary, but nicer than using the string
+%% accumulator.
+
+escape(<<$b/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
+    string(Rest, Stack, Callbacks, Opts, "\b" ++ Acc);
+escape(<<$f/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
+    string(Rest, Stack, Callbacks, Opts, "\f" ++ Acc);
+escape(<<$n/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
+    string(Rest, Stack, Callbacks, Opts, "\n" ++ Acc);
+escape(<<$r/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
+    string(Rest, Stack, Callbacks, Opts, "\r" ++ Acc);
+escape(<<$t/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
+    string(Rest, Stack, Callbacks, Opts, "\t" ++ Acc);
+escape(<<$u/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
+    escaped_unicode(Rest, Stack, Callbacks, Opts, Acc, []);
+escape(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc)
+        when S =:= ?quote; S =:= ?solidus; S =:= ?rsolidus ->
+    string(Rest, Stack, Callbacks, Opts, [S] ++ Acc);
+escape(Bin, Stack, Callbacks, Opts, Acc) ->
+    {incomplete, fun(Stream) -> escape(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts, Acc) end}.
+
+
+%% this code is ugly and unfortunate, but so is json's handling of escaped unicode
+%% codepoint sequences. if the ascii option is present, the sequence is converted
+%% to a codepoint and inserted into the string if it represents an ascii value. if
+%% the codepoint option is present the sequence is converted and inserted as long
+%% as it represents a valid unicode codepoint. this means non-characters
+%% representable in 16 bits are not converted (the utf16 surrogates and the two
+%% special non-characters). any other option and no conversion is done.
%% String is the (reversed) accumulator of the enclosing string; the last argument
%% collects the hex digits seen so far, also in reverse. the first three clauses
%% fire on the fourth hex digit and differ only in the escaped_unicode option
%% matched in Opts. when the sequence is not converted it is pushed back onto the
%% string verbatim, as a reverse solidus, `u' and the four digits.
%%
%% NOTE(review): the binary patterns in this block were restored from context;
%% confirm them against the upstream file.

escaped_unicode(<<D/?encoding, Rest/binary>>,
        Stack,
        Callbacks,
        ?escaped_unicode_to_ascii(Opts),
        String,
        [C, B, A])
        when ?is_hex(D) ->
    case erlang:list_to_integer([A, B, C, D], 16) of
        %% only seven bit values are converted in ascii mode
        X when X < 128 ->
            string(Rest, Stack, Callbacks, Opts, [X | String]);
        _ ->
            string(Rest, Stack, Callbacks, Opts, [D, C, B, A, $u, ?rsolidus | String])
    end;
escaped_unicode(<<D/?encoding, Rest/binary>>,
        Stack,
        Callbacks,
        ?escaped_unicode_to_codepoint(Opts),
        String,
        [C, B, A])
        when ?is_hex(D) ->
    case erlang:list_to_integer([A, B, C, D], 16) of
        %% everything except the surrogate range d800-dfff and fffe/ffff
        X when X < 16#d800; X > 16#dfff, X < 16#fffe ->
            string(Rest, Stack, Callbacks, Opts, [X | String]);
        _ ->
            string(Rest, Stack, Callbacks, Opts, [D, C, B, A, $u, ?rsolidus | String])
    end;
escaped_unicode(<<D/?encoding, Rest/binary>>, Stack, Callbacks, Opts, String, [C, B, A])
        when ?is_hex(D) ->
    string(Rest, Stack, Callbacks, Opts, [D, C, B, A, $u, ?rsolidus | String]);
escaped_unicode(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts, String, Acc)
        when ?is_hex(S) ->
    escaped_unicode(Rest, Stack, Callbacks, Opts, String, [S | Acc]);
escaped_unicode(Bin, Stack, Callbacks, Opts, String, Acc) ->
    {incomplete, fun(Stream) ->
        escaped_unicode(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts, String, Acc)
    end}.


%% like strings, numbers are collected in an intermediate accumulator before
%% being emitted to the callback handler. no processing of numbers is done while
%% parsing, it's left for the user, though there are convenience functions to
%% convert them into erlang floats/integers in jsx_utils.erl.

%% TODO: actually write that jsx_utils.erl module mentioned above...

%% negative/5 requires a digit: `0' continues in zero/5, any other digit in
%% integer/5. Acc is the reversed list of characters consumed so far.

negative(<<$0/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
    zero(Rest, Stack, Callbacks, Opts, [$0 | Acc]);
negative(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) when ?is_nonzero(S) ->
    integer(Rest, Stack, Callbacks, Opts, [S | Acc]);
negative(Bin, Stack, Callbacks, Opts, Acc) ->
    {incomplete, fun(Stream) ->
        negative(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts, Acc)
    end}.
%% zero/5 is the state after a leading `0'. the number ends when a terminator is
%% seen: a closing brace or bracket emits {number, String} followed by end_object
%% or end_array and pops the stack, a comma emits the number and continues with
%% the next key (in an object) or value (in an array), and whitespace emits the
%% number and continues in maybe_done/4. a decimal point continues in fraction/5.
%% with an empty stack and no input left the number and end_of_stream are both
%% emitted, together with a continuation in case more of the stream arrives.
%%
%% NOTE(review): there is no exponent clause here, so input such as `0e1' falls
%% through to the final `incomplete' clause; confirm whether that is intended.
%% NOTE(review): the binary patterns in this block were restored from context;
%% confirm them against the upstream file.

zero(<<?end_object/?encoding, Rest/binary>>, [object|Stack], Callbacks, Opts, Acc) ->
    maybe_done(Rest, Stack, fold(end_object, fold({number, lists:reverse(Acc)}, Callbacks)), Opts);
zero(<<?end_array/?encoding, Rest/binary>>, [array|Stack], Callbacks, Opts, Acc) ->
    maybe_done(Rest, Stack, fold(end_array, fold({number, lists:reverse(Acc)}, Callbacks)), Opts);
zero(<<?comma/?encoding, Rest/binary>>, [object|Stack], Callbacks, Opts, Acc) ->
    key(Rest, [key|Stack], fold({number, lists:reverse(Acc)}, Callbacks), Opts);
zero(<<?comma/?encoding, Rest/binary>>, [array|_] = Stack, Callbacks, Opts, Acc) ->
    value(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts);
zero(<<?decimalpoint/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
    fraction(Rest, Stack, Callbacks, Opts, [?decimalpoint | Acc]);
zero(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) when ?is_whitespace(S) ->
    maybe_done(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts);
zero(<<?solidus/?encoding, Rest/binary>>, Stack, Callbacks, ?comments_enabled(Opts), Acc) ->
    maybe_comment(Rest, fun(Resume) -> zero(Resume, Stack, Callbacks, Opts, Acc) end);
zero(<<>>, [], Callbacks, Opts, Acc) ->
    {fold(end_of_stream, fold({number, lists:reverse(Acc)}, Callbacks)),
        fun(Stream) -> zero(Stream, [], Callbacks, Opts, Acc) end};
zero(Bin, Stack, Callbacks, Opts, Acc) ->
    {incomplete, fun(Stream) ->
        zero(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts, Acc)
    end}.
%% integer/5 collects the digits of the integer part of a number whose first
%% digit was not zero. digits (including zero) are pushed onto the reversed
%% accumulator, a decimal point continues in fraction/5 and `e'/`E' (always
%% recorded as a lowercase `e') continues in e/5. terminators behave as in the
%% other number states: a closing brace or bracket emits {number, String} followed
%% by end_object/end_array and pops the stack, a comma emits the number and
%% continues with the next key or value, whitespace emits the number and continues
%% in maybe_done/4. with an empty stack and no input left the number and
%% end_of_stream are emitted together with a continuation.
%%
%% NOTE(review): the binary patterns in this block were restored from context;
%% confirm them against the upstream file.

integer(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) when ?is_nonzero(S) ->
    integer(Rest, Stack, Callbacks, Opts, [S | Acc]);
integer(<<?end_object/?encoding, Rest/binary>>, [object|Stack], Callbacks, Opts, Acc) ->
    maybe_done(Rest, Stack, fold(end_object, fold({number, lists:reverse(Acc)}, Callbacks)), Opts);
integer(<<?end_array/?encoding, Rest/binary>>, [array|Stack], Callbacks, Opts, Acc) ->
    maybe_done(Rest, Stack, fold(end_array, fold({number, lists:reverse(Acc)}, Callbacks)), Opts);
integer(<<?comma/?encoding, Rest/binary>>, [object|Stack], Callbacks, Opts, Acc) ->
    key(Rest, [key|Stack], fold({number, lists:reverse(Acc)}, Callbacks), Opts);
integer(<<?comma/?encoding, Rest/binary>>, [array|_] = Stack, Callbacks, Opts, Acc) ->
    value(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts);
integer(<<?decimalpoint/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
    fraction(Rest, Stack, Callbacks, Opts, [?decimalpoint | Acc]);
integer(<<?zero/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
    integer(Rest, Stack, Callbacks, Opts, [?zero | Acc]);
integer(<<$e/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
    e(Rest, Stack, Callbacks, Opts, [$e | Acc]);
integer(<<$E/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
    e(Rest, Stack, Callbacks, Opts, [$e | Acc]);
integer(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) when ?is_whitespace(S) ->
    maybe_done(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts);
integer(<<?solidus/?encoding, Rest/binary>>, Stack, Callbacks, ?comments_enabled(Opts), Acc) ->
    maybe_comment(Rest, fun(Resume) -> integer(Resume, Stack, Callbacks, Opts, Acc) end);
integer(<<>>, [], Callbacks, Opts, Acc) ->
    {fold(end_of_stream, fold({number, lists:reverse(Acc)}, Callbacks)),
        fun(Stream) -> integer(Stream, [], Callbacks, Opts, Acc) end};
integer(Bin, Stack, Callbacks, Opts, Acc) ->
    {incomplete, fun(Stream) ->
        integer(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts, Acc)
    end}.
%% fraction/5 collects the digits after the decimal point. digits (including
%% zero) are pushed onto the reversed accumulator and `e'/`E' (always recorded as
%% a lowercase `e') continues in e/5. terminators behave as in the other number
%% states: a closing brace or bracket emits {number, String} followed by
%% end_object/end_array and pops the stack, a comma emits the number and continues
%% with the next key or value, whitespace emits the number and continues in
%% maybe_done/4. with an empty stack and no input left the number and
%% end_of_stream are emitted together with a continuation.
%%
%% NOTE(review): this state is entered directly after the decimal point, so a
%% terminator or exponent is accepted without any fractional digit (e.g. `1.');
%% confirm whether that leniency is intended.
%% NOTE(review): the binary patterns in this block were restored from context;
%% confirm them against the upstream file.

fraction(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) when ?is_nonzero(S) ->
    fraction(Rest, Stack, Callbacks, Opts, [S | Acc]);
fraction(<<?end_object/?encoding, Rest/binary>>, [object|Stack], Callbacks, Opts, Acc) ->
    maybe_done(Rest, Stack, fold(end_object, fold({number, lists:reverse(Acc)}, Callbacks)), Opts);
fraction(<<?end_array/?encoding, Rest/binary>>, [array|Stack], Callbacks, Opts, Acc) ->
    maybe_done(Rest, Stack, fold(end_array, fold({number, lists:reverse(Acc)}, Callbacks)), Opts);
fraction(<<?comma/?encoding, Rest/binary>>, [object|Stack], Callbacks, Opts, Acc) ->
    key(Rest, [key|Stack], fold({number, lists:reverse(Acc)}, Callbacks), Opts);
fraction(<<?comma/?encoding, Rest/binary>>, [array|_] = Stack, Callbacks, Opts, Acc) ->
    value(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts);
fraction(<<?zero/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
    fraction(Rest, Stack, Callbacks, Opts, [?zero | Acc]);
fraction(<<$e/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
    e(Rest, Stack, Callbacks, Opts, [$e | Acc]);
fraction(<<$E/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
    e(Rest, Stack, Callbacks, Opts, [$e | Acc]);
fraction(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) when ?is_whitespace(S) ->
    maybe_done(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts);
fraction(<<?solidus/?encoding, Rest/binary>>, Stack, Callbacks, ?comments_enabled(Opts), Acc) ->
    maybe_comment(Rest, fun(Resume) -> fraction(Resume, Stack, Callbacks, Opts, Acc) end);
fraction(<<>>, [], Callbacks, Opts, Acc) ->
    {fold(end_of_stream, fold({number, lists:reverse(Acc)}, Callbacks)),
        fun(Stream) -> fraction(Stream, [], Callbacks, Opts, Acc) end};
fraction(Bin, Stack, Callbacks, Opts, Acc) ->
    {incomplete, fun(Stream) ->
        fraction(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts, Acc)
    end}.
%% the exponent of a number is parsed by three states. e/5 runs directly after the
%% `e' and accepts either a digit (continuing in exp/5) or a sign (continuing in
%% ex/5). ex/5 runs after the sign and requires a digit. exp/5 collects the
%% remaining exponent digits and ends the number on a terminator, exactly like
%% the other number states. Acc is the reversed list of characters consumed so
%% far.
%%
%% NOTE(review): the binary patterns in this block were restored from context;
%% confirm them against the upstream file.

e(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc)
        when S =:= ?zero; ?is_nonzero(S) ->
    exp(Rest, Stack, Callbacks, Opts, [S | Acc]);
e(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc)
        when S =:= ?positive; S =:= ?negative ->
    ex(Rest, Stack, Callbacks, Opts, [S | Acc]);
e(Bin, Stack, Callbacks, Opts, Acc) ->
    {incomplete, fun(Stream) ->
        e(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts, Acc)
    end}.


ex(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc)
        when S =:= ?zero; ?is_nonzero(S) ->
    exp(Rest, Stack, Callbacks, Opts, [S | Acc]);
ex(Bin, Stack, Callbacks, Opts, Acc) ->
    {incomplete, fun(Stream) ->
        ex(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts, Acc)
    end}.


%% a closing brace or bracket emits {number, String} followed by
%% end_object/end_array and pops the stack, a comma emits the number and continues
%% with the next key or value, whitespace emits the number and continues in
%% maybe_done/4. with an empty stack and no input left the number and
%% end_of_stream are emitted together with a continuation.

exp(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) when ?is_nonzero(S) ->
    exp(Rest, Stack, Callbacks, Opts, [S | Acc]);
exp(<<?end_object/?encoding, Rest/binary>>, [object|Stack], Callbacks, Opts, Acc) ->
    maybe_done(Rest, Stack, fold(end_object, fold({number, lists:reverse(Acc)}, Callbacks)), Opts);
exp(<<?end_array/?encoding, Rest/binary>>, [array|Stack], Callbacks, Opts, Acc) ->
    maybe_done(Rest, Stack, fold(end_array, fold({number, lists:reverse(Acc)}, Callbacks)), Opts);
exp(<<?comma/?encoding, Rest/binary>>, [object|Stack], Callbacks, Opts, Acc) ->
    key(Rest, [key|Stack], fold({number, lists:reverse(Acc)}, Callbacks), Opts);
exp(<<?comma/?encoding, Rest/binary>>, [array|_] = Stack, Callbacks, Opts, Acc) ->
    value(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts);
exp(<<?zero/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) ->
    exp(Rest, Stack, Callbacks, Opts, [?zero | Acc]);
exp(<<?solidus/?encoding, Rest/binary>>, Stack, Callbacks, ?comments_enabled(Opts), Acc) ->
    maybe_comment(Rest, fun(Resume) -> exp(Resume, Stack, Callbacks, Opts, Acc) end);
exp(<<S/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) when ?is_whitespace(S) ->
    maybe_done(Rest, Stack, fold({number, lists:reverse(Acc)}, Callbacks), Opts);
exp(<<>>, [], Callbacks, Opts, Acc) ->
    {fold(end_of_stream, fold({number, lists:reverse(Acc)}, Callbacks)),
        fun(Stream) -> exp(Stream, [], Callbacks, Opts, Acc) end};
exp(Bin, Stack, Callbacks, Opts, Acc) ->
    {incomplete, fun(Stream) ->
        exp(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts, Acc)
    end}.
%% the literals true, false and null are matched one character per state: each
%% function below is named after the characters consumed once its first clause
%% has matched. the state that matches the last character of `true' or `false'
%% emits {literal, Atom} and continues in maybe_done/4. any input that is not the
%% expected character is reported as `incomplete', with a continuation that
%% prepends the unconsumed bytes to the next chunk of the stream and retries the
%% same state.

tr(<<$r/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
    tru(Rest, Stack, Callbacks, Opts);
tr(Bin, Stack, Callbacks, Opts) ->
    {incomplete, fun(Stream) ->
        tr(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts)
    end}.


tru(<<$u/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
    true(Rest, Stack, Callbacks, Opts);
tru(Bin, Stack, Callbacks, Opts) ->
    {incomplete, fun(Stream) ->
        tru(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts)
    end}.


true(<<$e/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
    maybe_done(Rest, Stack, fold({literal, true}, Callbacks), Opts);
true(Bin, Stack, Callbacks, Opts) ->
    {incomplete, fun(Stream) ->
        true(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts)
    end}.


fa(<<$a/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
    fal(Rest, Stack, Callbacks, Opts);
fa(Bin, Stack, Callbacks, Opts) ->
    {incomplete, fun(Stream) ->
        fa(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts)
    end}.


fal(<<$l/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
    fals(Rest, Stack, Callbacks, Opts);
fal(Bin, Stack, Callbacks, Opts) ->
    {incomplete, fun(Stream) ->
        fal(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts)
    end}.


fals(<<$s/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
    false(Rest, Stack, Callbacks, Opts);
fals(Bin, Stack, Callbacks, Opts) ->
    {incomplete, fun(Stream) ->
        fals(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts)
    end}.


false(<<$e/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
    maybe_done(Rest, Stack, fold({literal, false}, Callbacks), Opts);
false(Bin, Stack, Callbacks, Opts) ->
    {incomplete, fun(Stream) ->
        false(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts)
    end}.


nu(<<$u/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
    nul(Rest, Stack, Callbacks, Opts);
nu(Bin, Stack, Callbacks, Opts) ->
    {incomplete, fun(Stream) ->
        nu(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts)
    end}.


nul(<<$l/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
    null(Rest, Stack, Callbacks, Opts);
nul(Bin, Stack, Callbacks, Opts) ->
    {incomplete, fun(Stream) ->
        nul(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts)
    end}.
%% null/4 matches the final `l' of the literal null, emits {literal, null} and
%% continues in maybe_done/4. any other input is reported as `incomplete' with a
%% continuation that retries once more of the stream is supplied.

null(<<$l/?encoding, Rest/binary>>, Stack, Callbacks, Opts) ->
    maybe_done(Rest, Stack, fold({literal, null}, Callbacks), Opts);
null(Bin, Stack, Callbacks, Opts) ->
    {incomplete, fun(Stream) ->
        null(<<Bin/binary, Stream/binary>>, Stack, Callbacks, Opts)
    end}.


%% comments are c style, /* blah blah */ and are STRONGLY discouraged. any unicode
%% character is valid in a comment, except, obviously, the */ sequence which ends
%% the comment. they're implemented as a closure (Resume) called with the rest of
%% the stream when the comment ends, which returns execution to the point where
%% the comment began. comments are not recorded in any way, simply parsed.
%%
%% NOTE(review): the patterns matching the star and the solidus were restored from
%% context; the star is written as the character literal $* because the name of
%% the corresponding macro is not visible here. confirm against the upstream file.

%% the opening solidus has already been consumed by the caller; a star must follow.
maybe_comment(<<$*/?encoding, Rest/binary>>, Resume) ->
    comment(Rest, Resume);
maybe_comment(Bin, Resume) ->
    {incomplete, fun(Stream) ->
        maybe_comment(<<Bin/binary, Stream/binary>>, Resume)
    end}.


%% skips the comment body one codepoint at a time until a star is seen.
comment(<<$*/?encoding, Rest/binary>>, Resume) ->
    maybe_comment_done(Rest, Resume);
comment(<<_/?encoding, Rest/binary>>, Resume) ->
    comment(Rest, Resume);
comment(Bin, Resume) ->
    {incomplete, fun(Stream) ->
        comment(<<Bin/binary, Stream/binary>>, Resume)
    end}.


%% a star has just been seen inside a comment. a solidus ends the comment and
%% resumes the interrupted state. another star keeps this state (so `**/' still
%% ends the comment) and any other codepoint returns to the comment body; without
%% these two clauses a star that is not immediately followed by a solidus would
%% leave the parser reporting `incomplete' forever.
maybe_comment_done(<<?solidus/?encoding, Rest/binary>>, Resume) ->
    Resume(Rest);
maybe_comment_done(<<$*/?encoding, Rest/binary>>, Resume) ->
    maybe_comment_done(Rest, Resume);
maybe_comment_done(<<_/?encoding, Rest/binary>>, Resume) ->
    comment(Rest, Resume);
maybe_comment_done(Bin, Resume) ->
    {incomplete, fun(Stream) ->
        maybe_comment_done(<<Bin/binary, Stream/binary>>, Resume)
    end}.