From 7e88a14525594fadb4258d9a3853ecdeae323dc5 Mon Sep 17 00:00:00 2001 From: alisdair sullivan Date: Wed, 31 Aug 2011 18:52:01 -0700 Subject: [PATCH] massive simplification of api, operation and internals. removes all demo modules temporarily --- include/jsx_common.hrl | 144 --- include/jsx_decoder.hrl | 1049 -------------------- src/jsx_format.hrl => include/jsx_opts.hrl | 29 +- include/jsx_scanner.hrl | 683 +++++++++++++ include/jsx_tokenizer.hrl | 228 +++++ src/jsx_utf16.erl => include/jsx_types.hrl | 47 +- src/jsx.app.src | 11 +- src/jsx.erl | 237 +---- src/jsx_encoder.erl | 401 -------- src/jsx_format.erl | 275 ----- src/jsx_scanner.erl | 188 ++++ src/jsx_terms.erl | 500 ---------- src/jsx_tokenizer.erl | 168 ++++ src/jsx_utf16le.erl | 35 - src/jsx_utf32.erl | 35 - src/jsx_utf32le.erl | 35 - src/jsx_utf8.erl | 33 - src/jsx_utils.erl | 112 +-- src/jsx_verify.erl | 234 ----- 19 files changed, 1358 insertions(+), 3086 deletions(-) delete mode 100644 include/jsx_common.hrl delete mode 100644 include/jsx_decoder.hrl rename src/jsx_format.hrl => include/jsx_opts.hrl (74%) create mode 100644 include/jsx_scanner.hrl create mode 100644 include/jsx_tokenizer.hrl rename src/jsx_utf16.erl => include/jsx_types.hrl (53%) delete mode 100644 src/jsx_encoder.erl delete mode 100644 src/jsx_format.erl create mode 100644 src/jsx_scanner.erl delete mode 100644 src/jsx_terms.erl create mode 100644 src/jsx_tokenizer.erl delete mode 100644 src/jsx_utf16le.erl delete mode 100644 src/jsx_utf32.erl delete mode 100644 src/jsx_utf32le.erl delete mode 100644 src/jsx_utf8.erl delete mode 100644 src/jsx_verify.erl diff --git a/include/jsx_common.hrl b/include/jsx_common.hrl deleted file mode 100644 index 57e79a7..0000000 --- a/include/jsx_common.hrl +++ /dev/null @@ -1,144 +0,0 @@ -%% The MIT License - -%% Copyright (c) 2010 Alisdair Sullivan - -%% Permission is hereby granted, free of charge, to any person obtaining a copy -%% of this software and associated documentation files (the 
"Software"), to deal -%% in the Software without restriction, including without limitation the rights -%% to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -%% copies of the Software, and to permit persons to whom the Software is -%% furnished to do so, subject to the following conditions: - -%% The above copyright notice and this permission notice shall be included in -%% all copies or substantial portions of the Software. - -%% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -%% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -%% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -%% AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -%% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -%% OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -%% THE SOFTWARE. - - - -%% opts record for decoder/encoder --record(opts, { - loose_unicode = false, - encoding = auto, - escape_forward_slash = false, - iterate = false -}). - - - --define(is_utf_encoding(X), - X == utf8 - ; X == utf16 - ; X == utf32 - ; X == {utf16, little} - ; X == {utf32, little} -). - - - --type jsx_opts() :: [jsx_opt()]. --type jsx_opt() :: multi_term - | loose_unicode - | escape_forward_slashes - | {encoding, auto - | utf8 - | utf16 - | {utf16, little} - | utf32 - | {utf32, little} - }. - - --type jsx_event() :: start_object - | end_object - | start_array - | end_array - | end_json - | {key, list()} - | {string, list()} - | {integer, integer()} - | {float, float()} - | {literal, true} - | {literal, false} - | {literal, null}. - - --type jsx_encodeable() :: jsx_event() | [jsx_encodeable()]. - - --type jsx_iterator() :: jsx_decoder() | jsx_encoder(). - - -%% this probably doesn't work properly --type jsx_decoder() :: fun((binary()) -> jsx_iterator_result()). - --type jsx_encoder() :: fun((jsx_encodeable()) -> jsx_iterator_result()). 
- --type jsx_iterator_result() :: - {jsx, jsx_event(), fun(() -> jsx_iterator_result())} - | {jsx, [jsx_event()], fun(() -> jsx_iterator_result())} - | {jsx, incomplete, jsx_iterator()} - | {error, {badjson, any()}}. - - - --type supported_utf() :: utf8 - | utf16 - | {utf16, little} - | utf32 - | {utf32, little}. - - - -%% json specification --type jsx_array() :: [jsx_term()] | []. --type jsx_object() :: [{jsx_key(), jsx_term()}] | [{}]. - --type jsx_key() :: binary(). - --type jsx_term() :: jsx_array() - | jsx_object() - | jsx_string() - | jsx_number() - | true | false | null. - --type jsx_string() :: binary(). - --type jsx_number() :: float() | integer(). - - --type encoder_opts() :: [encoder_opt()]. --type encoder_opt() :: {strict, true | false} - | {encoding, supported_utf()} - | {space, integer()} - | space - | {indent, integer()} - | indent. - - --type decoder_opts() :: [decoder_opt()]. --type decoder_opt() :: {strict, true | false} - | {repeatable_keys, true | false} - | repeatable_keys - | {encoding, supported_utf()}. - - --type verify_opts() :: [verify_opt()]. --type verify_opt() :: {encoding, auto | supported_utf()} - | {repeated_keys, true | false} - | {naked_values, true | false}. - - --type format_opts() :: [format_opt()]. --type format_opt() :: {encoding, auto | supported_utf()} - | {space, integer()} - | space - | {indent, integer()} - | indent - | {output_encoding, supported_utf()}. 
\ No newline at end of file diff --git a/include/jsx_decoder.hrl b/include/jsx_decoder.hrl deleted file mode 100644 index f489e0d..0000000 --- a/include/jsx_decoder.hrl +++ /dev/null @@ -1,1049 +0,0 @@ -%% The MIT License - -%% Copyright (c) 2010 Alisdair Sullivan - -%% Permission is hereby granted, free of charge, to any person obtaining a copy -%% of this software and associated documentation files (the "Software"), to deal -%% in the Software without restriction, including without limitation the rights -%% to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -%% copies of the Software, and to permit persons to whom the Software is -%% furnished to do so, subject to the following conditions: - -%% The above copyright notice and this permission notice shall be included in -%% all copies or substantial portions of the Software. - -%% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -%% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -%% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -%% AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -%% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -%% OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -%% THE SOFTWARE. - - -%% this is the implementation of the utf backends for the jsx decoder. it's -%% included by the various jsx_utfxx.erl frontends and all modifications to -%% this file should take that into account - - --export([decoder/1]). 
- - -%% exported solely to facilitate stupid trick i shouldn't be using --export([start/4, - maybe_done/4, - done/4, - object/4, - array/4, - value/4, - colon/4, - key/4, - string/5, - escape/5, - escaped_unicode/6, - low_surrogate/6, - low_surrogate_u/6, - low_surrogate/7, - negative/5, - zero/5, - integer/5, - initial_decimal/5, - decimal/5, - e/5, - ex/5, - exp/5, - tr/4, - tru/4, - true/4, - fa/4, - fal/4, - fals/4, - false/4, - nu/4, - nul/4, - null/4 -]). - - - --spec decoder(Opts::#opts{}) -> jsx_decoder(). - -decoder(Opts) -> - fun(JSON) -> start(JSON, [], [], Opts) end. - - -%% whitespace --define(space, 16#20). --define(tab, 16#09). --define(cr, 16#0D). --define(newline, 16#0A). - -%% object delimiters --define(start_object, 16#7B). --define(end_object, 16#7D). - -%% array delimiters --define(start_array, 16#5B). --define(end_array, 16#5D). - -%% kv seperator --define(comma, 16#2C). --define(quote, 16#22). --define(colon, 16#3A). - -%% string escape sequences --define(escape, 16#5C). --define(rsolidus, 16#5C). --define(solidus, 16#2F). --define(formfeed, 16#0C). --define(backspace, 16#08). --define(unicode, 16#75). - -%% math --define(zero, 16#30). --define(decimalpoint, 16#2E). --define(negative, 16#2D). --define(positive, 16#2B). - - -%% some useful guards --define(is_hex(Symbol), - (Symbol >= $a andalso Symbol =< $z); (Symbol >= $A andalso Symbol =< $Z); - (Symbol >= $0 andalso Symbol =< $9) -). - --define(is_nonzero(Symbol), - Symbol >= $1 andalso Symbol =< $9 -). - --define(is_noncontrol(Symbol), - (Symbol >= ?space) -). - --define(is_whitespace(Symbol), - Symbol =:= ?space; Symbol =:= ?tab; Symbol =:= ?cr; Symbol =:= ?newline -). - - -%% utf8 is the default encoding --define(utf8, true). - --ifdef(utf16). --undef(utf8). --define(encoding, utf16). --define(utfx, utf16). --define(partial_codepoint(Bin), byte_size(Bin) < 2). --endif. - --ifdef(utf16le). --undef(utf8). --define(encoding, utf16le). --define(utfx, utf16-little). 
--define(partial_codepoint(Bin), byte_size(Bin) < 2). --endif. - --ifdef(utf32). --undef(utf8). --define(encoding, utf32). --define(utfx, utf32). --define(partial_codepoint(Bin), byte_size(Bin) < 4). --endif. - --ifdef(utf32le). --undef(utf8). --define(encoding, utf32le). --define(utfx, utf32-little). --define(partial_codepoint(Bin), byte_size(Bin) < 4). --endif. - --ifdef(utf8). --define(encoding, utf8). --define(utfx, utf8). --define(partial_codepoint(Bin), byte_size(Bin) < 1). --endif. - -%% when parsing strings, the naive detection of partial codepoints is -%% insufficient. this incredibly anal function should detect all badly formed -%% utf sequences --ifdef(utf8). -partial_utf(<<>>) -> true; -partial_utf(<>) when X >= 16#c2, X =< 16#df -> true; -partial_utf(<>) when X >= 16#e0, X =< 16#ef -> - case Rest of - <<>> -> true - ; <> when Y >= 16#80, Y =< 16#bf -> true - ; _ -> false - end; -partial_utf(<>) when X >= 16#f0, X =< 16#f4 -> - case Rest of - <<>> -> true - ; <> when Y >= 16#80, Y =< 16#bf -> true - ; <> when Y >= 16#80, Y =< 16#bf, Z >= 16#80, Z =< 16#bf -> true - ; _ -> false - end; -partial_utf(_) -> false. --endif. - --ifdef(utf16). -partial_utf(<<>>) -> true; -partial_utf(<<_X>>) -> true; -partial_utf(<>) when X >= 16#d8, X =< 16#df -> true; -partial_utf(<>) when X >= 16#d8, X =< 16#df, Z >= 16#dc, Z =< 16#df -> - true; -partial_utf(_) -> false. --endif. - --ifdef(utf16le). -partial_utf(<<>>) -> true; -%% this case is not strictly true, there are single bytes that should be -%% rejected, but they're rare enough they can be ignored -partial_utf(<<_X>>) -> true; -partial_utf(<<_Y, X>>) when X >= 16#d8, X =< 16#df -> true; -partial_utf(<<_Y, X, _Z>>) when X >= 16#d8, X =< 16#df -> true; -partial_utf(_) -> false. --endif. - --ifdef(utf32). -partial_utf(<<>>) -> true; -partial_utf(<<_>>) -> true; -partial_utf(<<_, _>>) -> true; -partial_utf(<<_, _, _>>) -> true; -partial_utf(_) -> false. --endif. - --ifdef(utf32le). 
-partial_utf(<<>>) -> true; -partial_utf(<<_>>) -> true; -partial_utf(<<_, _>>) -> true; -partial_utf(<<_, _, _>>) -> true; -partial_utf(_) -> false. --endif. - - -incomplete(State, Bin, T, Args) -> - case ?partial_codepoint(Bin) of - true -> - {jsx, incomplete, fun(Stream) - when is_binary(Stream) -> - erlang:apply(?MODULE, - State, - [<>, T] ++ Args - ) - ; (Else) -> {error, {badjson, Else}} - end} - ; false -> {error, {badjson, Bin}} - end. - - --ifndef(emit). - -%% takes a list of `events` to present to client code and formats them -%% appropriately -iterate_wrapper([], Next) -> Next(); -iterate_wrapper([Event|Events], Next) -> - {jsx, Event, fun() -> - iterate_wrapper(Events, Next) - end}. - - --define(emit(Event, State, Rest, T, Stack, Opts), - State(Rest, Event ++ T, Stack, Opts) -). - - -done(<>, T, [], Opts) when ?is_whitespace(S) -> - done(Rest, T, [], Opts); -done(<<>>, T, [], Opts=#opts{iterate = true}) -> - iterate_wrapper(lists:reverse([end_json] ++ T), fun() -> - incomplete(done, <<>>, [], [[], Opts]) - end); -done(<<>>, T, [], Opts) -> - {jsx, lists:reverse([end_json] ++ T), fun(end_stream) -> - done(<<>>, T, [], Opts) - ; (Stream) -> - done(Stream, T, [], Opts) - end}; -done(Bin, T, [], Opts) -> - incomplete(done, Bin, T, [[], Opts]). - - --else. - --define(next(State, Rest, T, Stack, Opts), - State(Rest, T, Stack, Opts) -). - -done(<>, T, [], Opts) when ?is_whitespace(S) -> - done(Rest, T, Opts); -done(<<>>, T, [], Opts) -> - ?emit([end_json], done, Rest, T, [], Opts); -done(Bin, T, [], Opts) -> - incomplete(done, Bin, T, [[], Opts]). - - --endif. 
- -start(<>, T, Stack, Opts) -> - ?emit([start_object], object, Rest, T, [key|Stack], Opts); -start(<>, T, Stack, Opts) -> - ?emit([start_array], array, Rest, T, [array|Stack], Opts); -start(<>, T, Stack, Opts) -> - string(Rest, T, Stack, Opts, []); -start(<<$t/?utfx, Rest/binary>>, T, Stack, Opts) -> - tr(Rest, T, Stack, Opts); -start(<<$f/?utfx, Rest/binary>>, T, Stack, Opts) -> - fa(Rest, T, Stack, Opts); -start(<<$n/?utfx, Rest/binary>>, T, Stack, Opts) -> - nu(Rest, T, Stack, Opts); -start(<>, T, Stack, Opts) -> - negative(Rest, T, Stack, Opts, "-"); -start(<>, T, Stack, Opts) -> - zero(Rest, T, Stack, Opts, "0"); -start(<>, T, Stack, Opts) when ?is_nonzero(S) -> - integer(Rest, T, Stack, Opts, [S]); -start(<>, T, Stack, Opts) when ?is_whitespace(S) -> - start(Rest, T, Stack, Opts); -start(Bin, T, Stack, Opts) -> - incomplete(start, Bin, T, [Stack, Opts]). - - -maybe_done(<>, T, [object|Stack], Opts) -> - ?emit([end_object], maybe_done, Rest, T, Stack, Opts); -maybe_done(<>, T, [array|Stack], Opts) -> - ?emit([end_array], maybe_done, Rest, T, Stack, Opts); -maybe_done(<>, T, [object|Stack], Opts) -> - key(Rest, T, [key|Stack], Opts); -maybe_done(<>, T, [array|_] = Stack, Opts) -> - value(Rest, T, Stack, Opts); -maybe_done(<>, T, Stack, Opts) when ?is_whitespace(S) -> - maybe_done(Rest, T, Stack, Opts); -maybe_done(Rest, T, [], Opts) -> - done(Rest, T, [], Opts); -maybe_done(Bin, T, Stack, Opts) -> - incomplete(maybe_done, Bin, T, [Stack, Opts]). - - -object(<>, T, Stack, Opts) -> - string(Rest, T, Stack, Opts, []); -object(<>, T, [key|Stack], Opts) -> - ?emit([end_object], maybe_done, Rest, T, Stack, Opts); -object(<>, T, Stack, Opts) when ?is_whitespace(S) -> - object(Rest, T, Stack, Opts); -object(Bin, T, Stack, Opts) -> - incomplete(object, Bin, T, [Stack, Opts]). 
- - -array(<>, T, Stack, Opts) -> - string(Rest, T, Stack, Opts, []); -array(<<$t/?utfx, Rest/binary>>, T, Stack, Opts) -> - tr(Rest, T, Stack, Opts); -array(<<$f/?utfx, Rest/binary>>, T, Stack, Opts) -> - fa(Rest, T, Stack, Opts); -array(<<$n/?utfx, Rest/binary>>, T, Stack, Opts) -> - nu(Rest, T, Stack, Opts); -array(<>, T, Stack, Opts) -> - negative(Rest, T, Stack, Opts, "-"); -array(<>, T, Stack, Opts) -> - zero(Rest, T, Stack, Opts, "0"); -array(<>, T, Stack, Opts) when ?is_nonzero(S) -> - integer(Rest, T, Stack, Opts, [S]); -array(<>, T, Stack, Opts) -> - ?emit([start_object], object, Rest, T, [key|Stack], Opts); -array(<>, T, Stack, Opts) -> - ?emit([start_array], array, Rest, T, [array|Stack], Opts); -array(<>, T, [array|Stack], Opts) -> - maybe_done(Rest, [end_array] ++ T, Stack, Opts); -array(<>, T, Stack, Opts) when ?is_whitespace(S) -> - array(Rest, T, Stack, Opts); -array(Bin, T, Stack, Opts) -> - incomplete(array, Bin, T, [Stack, Opts]). - - -value(<>, T, Stack, Opts) -> - string(Rest, T, Stack, Opts, []); -value(<<$t/?utfx, Rest/binary>>, T, Stack, Opts) -> - tr(Rest, T, Stack, Opts); -value(<<$f/?utfx, Rest/binary>>, T, Stack, Opts) -> - fa(Rest, T, Stack, Opts); -value(<<$n/?utfx, Rest/binary>>, T, Stack, Opts) -> - nu(Rest, T, Stack, Opts); -value(<>, T, Stack, Opts) -> - negative(Rest, T, Stack, Opts, "-"); -value(<>, T, Stack, Opts) -> - zero(Rest, T, Stack, Opts, "0"); -value(<>, T, Stack, Opts) when ?is_nonzero(S) -> - integer(Rest, T, Stack, Opts, [S]); -value(<>, T, Stack, Opts) -> - ?emit([start_object], object, Rest, T, [key|Stack], Opts); -value(<>, T, Stack, Opts) -> - ?emit([start_array], array, Rest, T, [array|Stack], Opts); -value(<>, T, Stack, Opts) when ?is_whitespace(S) -> - value(Rest, T, Stack, Opts); -value(Bin, T, Stack, Opts) -> - incomplete(value, Bin, T, [Stack, Opts]). 
- - -colon(<>, T, [key|Stack], Opts) -> - value(Rest, T, [object|Stack], Opts); -colon(<>, T, Stack, Opts) when ?is_whitespace(S) -> - colon(Rest, T, Stack, Opts); -colon(Bin, T, Stack, Opts) -> - incomplete(colon, Bin, T, [Stack, Opts]). - - -key(<>, T, Stack, Opts) -> - string(Rest, T, Stack, Opts, []); -key(<>, T, Stack, Opts) when ?is_whitespace(S) -> - key(Rest, T, Stack, Opts); -key(Bin, T, Stack, Opts) -> - incomplete(key, Bin, T, [Stack, Opts]). - - -%% string has an additional parameter, an accumulator (Acc) used to hold the -%% intermediate representation of the string being parsed. using a list of -%% integers representing unicode codepoints is faster than constructing -%% binaries, there's a branch kicking around which proves it -%% string uses partial_utf/1 to cease parsing when invalid encodings are -%% encountered rather than just checking remaining binary size like other -%% states to eliminate certain incomplete states -string(<>, T, [key|_] = Stack, Opts, Acc) -> - ?emit([{key, lists:reverse(Acc)}], colon, Rest, T, Stack, Opts); -string(<>, T, Stack, Opts, Acc) -> - ?emit([{string, lists:reverse(Acc)}], maybe_done, Rest, T, Stack, Opts); -string(<>, T, Stack, Opts, Acc) -> - escape(Rest, T, Stack, Opts, Acc); -%% things get dumb here. 
erlang doesn't properly restrict unicode non-characters -%% so you can't trust the codepoints it returns always -%% the range 32..16#fdcf is safe, so allow that -string(<>, T, Stack, Opts, Acc) - when ?is_noncontrol(S), S < 16#fdd0 -> - string(Rest, T, Stack, Opts, [S] ++ Acc); -%% the range 16#fdf0..16#fffd is also safe -string(<>, T, Stack, Opts, Acc) - when S > 16#fdef, S < 16#fffe -> - string(Rest, T, Stack, Opts, [S] ++ Acc); -%% yes, i think it's insane too -string(<>, T, Stack, Opts, Acc) - when S > 16#ffff andalso - S =/= 16#1fffe andalso S =/= 16#1ffff andalso - S =/= 16#2fffe andalso S =/= 16#2ffff andalso - S =/= 16#3fffe andalso S =/= 16#3ffff andalso - S =/= 16#4fffe andalso S =/= 16#4ffff andalso - S =/= 16#5fffe andalso S =/= 16#5ffff andalso - S =/= 16#6fffe andalso S =/= 16#6ffff andalso - S =/= 16#7fffe andalso S =/= 16#7ffff andalso - S =/= 16#8fffe andalso S =/= 16#8ffff andalso - S =/= 16#9fffe andalso S =/= 16#9ffff andalso - S =/= 16#afffe andalso S =/= 16#affff andalso - S =/= 16#bfffe andalso S =/= 16#bffff andalso - S =/= 16#cfffe andalso S =/= 16#cffff andalso - S =/= 16#dfffe andalso S =/= 16#dffff andalso - S =/= 16#efffe andalso S =/= 16#effff andalso - S =/= 16#ffffe andalso S =/= 16#fffff andalso - S =/= 16#10fffe andalso S =/= 16#10ffff -> - string(Rest, T, Stack, Opts, [S] ++ Acc); -string(Bin, T, Stack, Opts, Acc) -> - case partial_utf(Bin) of - true -> - {jsx, incomplete, fun(Stream) - when is_binary(Stream) -> - string(<>, T, Stack, Opts, Acc) - ; (Else) -> - {error, {badjson, Else}} - end} - ; false -> - case Opts#opts.loose_unicode of - true -> noncharacter(Bin, T, Stack, Opts, Acc) - ; false -> {error, {badjson, Bin}} - end - end. - - -%% we don't need to guard against partial utf here, because it's already taken -%% care of in string. theoretically, the last clause of noncharacter/4 is -%% unreachable --ifdef(utf8). 
-%% non-characters erlang doesn't recognize as non-characters, idiotically -noncharacter(<>, T, Stack, Opts, Acc) - when ?is_noncontrol(S) -> - string(Rest, T, Stack, Opts, [16#fffd] ++ Acc); -%% u+fffe and u+ffff -noncharacter(<<239, 191, X, Rest/binary>>, T, Stack, Opts, Acc) - when X == 190; X == 191 -> - string(Rest, T, Stack, Opts, [16#fffd] ++ Acc); -%% surrogates -noncharacter(<<237, X, _, Rest/binary>>, T, Stack, Opts, Acc) when X >= 160 -> - string(Rest, T, Stack, Opts, [16#fffd] ++ Acc); -noncharacter(Bin, _T, _Stack, _Opts, _Acc) -> - {error, {badjson, Bin}}. --endif. - --ifdef(utf16). -%% non-characters blah blah -noncharacter(<>, T, Stack, Opts, Acc) - when ?is_noncontrol(S) -> - string(Rest, T, Stack, Opts, [16#fffd] ++ Acc); -%% u+ffff and u+fffe -noncharacter(<<255, X, Rest/binary>>, T, Stack, Opts, Acc) - when X == 254; X == 255 -> - string(Rest, T, Stack, Opts, [16#fffd] ++ Acc); -%% surrogates -noncharacter(<>, T, Stack, Opts, Acc) - when X >= 216, X =< 223 -> - string(Rest, T, Stack, Opts, [16#fffd] ++ Acc); -noncharacter(Bin, _T, _Stack, _Opts, _Acc) -> - {error, {badjson, Bin}}. --endif. - --ifdef(utf16le). -%% non-characters blah blah -noncharacter(<>, T, Stack, Opts, Acc) - when ?is_noncontrol(S) -> - string(Rest, T, Stack, Opts, [16#fffd] ++ Acc); -%% u+ffff and u+fffe -noncharacter(<>, T, Stack, Opts, Acc) - when X == 254; X == 255 -> - string(Rest, T, Stack, Opts, [16#fffd] ++ Acc); -%% surrogates -noncharacter(<<_, X, Rest/binary>>, T, Stack, Opts, Acc) - when X >= 216, X =< 223 -> - string(Rest, T, Stack, Opts, [16#fffd] ++ Acc); -noncharacter(Bin, _T, _Stack, _Opts, _Acc) -> - {error, {badjson, Bin}}. --endif. - --ifdef(utf32). 
-%% non-characters blah blah -noncharacter(<>, T, Stack, Opts, Acc) - when ?is_noncontrol(S) -> - string(Rest, T, Stack, Opts, [16#fffd] ++ Acc); -%% u+ffff and u+fffe -noncharacter(<<0, 0, 255, X, Rest/binary>>, T, Stack, Opts, Acc) - when X == 254; X == 255 -> - string(Rest, T, Stack, Opts, [16#fffd] ++ Acc); -%% surrogates -noncharacter(<<0, 0, X, _, Rest/binary>>, T, Stack, Opts, Acc) - when X >= 216, X =< 223 -> - string(Rest, T, Stack, Opts, [16#fffd] ++ Acc); -noncharacter(Bin, _T, _Stack, _Opts, _Acc) -> - {error, {badjson, Bin}}. --endif. - --ifdef(utf32le). -%% non-characters blah blah -noncharacter(<>, T, Stack, Opts, Acc) - when ?is_noncontrol(S) -> - string(Rest, T, Stack, Opts, [16#fffd] ++ Acc); -%% u+ffff and u+fffe -noncharacter(<>, T, Stack, Opts, Acc) - when X == 254; X == 255 -> - string(Rest, T, Stack, Opts, [16#fffd] ++ Acc); -%% surrogates -noncharacter(<<_, X, 0, 0, Rest/binary>>, T, Stack, Opts, Acc) - when X >= 216, X =< 223 -> - string(Rest, T, Stack, Opts, [16#fffd] ++ Acc); -noncharacter(Bin, _T, _Stack, _Opts, _Acc) -> - {error, {badjson, Bin}}. --endif. - - -%% only thing to note here is the additional accumulator passed to -%% escaped_unicode used to hold the codepoint sequence. 
unescessary, but nicer -%% than using the string accumulator -escape(<<$b/?utfx, Rest/binary>>, T, Stack, Opts, Acc) -> - string(Rest, T, Stack, Opts, "\b" ++ Acc); -escape(<<$f/?utfx, Rest/binary>>, T, Stack, Opts, Acc) -> - string(Rest, T, Stack, Opts, "\f" ++ Acc); -escape(<<$n/?utfx, Rest/binary>>, T, Stack, Opts, Acc) -> - string(Rest, T, Stack, Opts, "\n" ++ Acc); -escape(<<$r/?utfx, Rest/binary>>, T, Stack, Opts, Acc) -> - string(Rest, T, Stack, Opts, "\r" ++ Acc); -escape(<<$t/?utfx, Rest/binary>>, T, Stack, Opts, Acc) -> - string(Rest, T, Stack, Opts, "\t" ++ Acc); -escape(<<$u/?utfx, Rest/binary>>, T, Stack, Opts, Acc) -> - escaped_unicode(Rest, T, Stack, Opts, Acc, []); -escape(<>, T, Stack, Opts, Acc) - when S =:= ?quote; S =:= ?solidus; S =:= ?rsolidus -> - string(Rest, T, Stack, Opts, [S] ++ Acc); -escape(Bin, T, Stack, Opts, Acc) -> - incomplete(escape, Bin, T, [Stack, Opts, Acc]). - - -%% this code is ugly and unfortunate, but so is json's handling of escaped -%% unicode codepoint sequences. 
-escaped_unicode(<>, T, Stack, Opts, String, [C, B, A]) - when ?is_hex(D) -> - case erlang:list_to_integer([A, B, C, D], 16) of - %% high surrogate, we need a low surrogate next - X when X >= 16#d800, X =< 16#dbff -> - low_surrogate(Rest, T, Stack, Opts, String, X) - %% non-characters, you're not allowed to exchange these - ; X when X == 16#fffe; X == 16#ffff; X >= 16#fdd0, X =< 16#fdef -> - case Opts#opts.loose_unicode of - true -> - string(Rest, T, Stack, Opts, [16#fffd] ++ String) - ; false -> - {error, {badjson, <>}} - end - %% allowing interchange of null bytes allows attackers to forge - %% malicious streams - ; X when X == 16#0000 -> - case Opts#opts.loose_unicode of - true -> - string(Rest, T, Stack, Opts, [16#fffd] ++ String) - ; false -> - {error, {badjson, <>}} - end - %% anything else - ; X -> - string(Rest, T, Stack, Opts, [X] ++ String) - end; -escaped_unicode(<>, T, Stack, Opts, String, Acc) - when ?is_hex(S) -> - escaped_unicode(Rest, T, Stack, Opts, String, [S] ++ Acc); -escaped_unicode(Bin, T, Stack, Opts, String, Acc) -> - incomplete(escaped_unicode, Bin, T, [Stack, Opts, String, Acc]). - - -low_surrogate(<>, T, Stack, Opts, String, High) -> - low_surrogate_u(Rest, T, Stack, Opts, String, High); -%% not an escaped codepoint, our high codepoint is illegal. dispatch back to -%% string to handle -low_surrogate(<> = Bin, T, Stack, Opts, String, _) -> - case Opts#opts.loose_unicode of - true -> - string(Bin, T, Stack, Opts, [16#fffd] ++ String) - ; false -> - {error, {badjson, <>}} - end; -low_surrogate(Bin, T, Stack, Opts, String, High) -> - incomplete(low_surrogate, Bin, T, [Stack, Opts, String, High]). 
- - -low_surrogate_u(<<$u/?utfx, Rest/binary>>, T, Stack, Opts, String, H) -> - low_surrogate(Rest, T, Stack, Opts, String, [], H); -%% not a low surrogate, dispatch back to string to handle, including the -%% rsolidus we parsed previously -low_surrogate_u(<> = Bin, T, Stack, Opts, String, _) -> - case Opts#opts.loose_unicode of - true -> - string(<>, - T, - Stack, - Opts, - [16#fffd] ++ String - ) - ; false -> - {error, {badjson, <>}} - end; -low_surrogate_u(Bin, T, Stack, Opts, String, H) -> - incomplete(low_surrogate_u, Bin, T, [Stack, Opts, String, H]). - - -low_surrogate(<>, T, Stack, Opts, String, [C, B, A], H) - when ?is_hex(D) -> - case erlang:list_to_integer([A, B, C, D], 16) of - X when X >= 16#dc00, X =< 16#dfff -> - V = surrogate_to_codepoint(H, X), - case V rem 16#10000 of Y when Y == 16#fffe; Y == 16#ffff -> - case Opts#opts.loose_unicode of - true -> - string(Rest, T, Stack, Opts, [16#fffd] ++ String) - ; false -> - {error, {badjson, <>}} - end - ; _ -> - string(Rest, T, Stack, Opts, [V] ++ String) - end - %% not a low surrogate, bad bad bad - ; _ -> - case Opts#opts.loose_unicode of - true -> - string(Rest, T, Stack, Opts, [16#fffd, 16#fffd] ++ String) - ; false -> - {error, {badjson, <>}} - end - end; -low_surrogate(<>, T, Stack, Opts, String, Acc, H) - when ?is_hex(S) -> - low_surrogate(Rest, T, Stack, Opts, String, [S] ++ Acc, H); -low_surrogate(Bin, T, Stack, Opts, String, Acc, H) -> - incomplete(low_surrogate, Bin, T, [Stack, Opts, String, Acc, H]). - - -%% stole this from the unicode spec -surrogate_to_codepoint(High, Low) -> - (High - 16#d800) * 16#400 + (Low - 16#dc00) + 16#10000. 
- - -%% like strings, numbers are collected in an intermediate accumulator before -%% being emitted to the callback handler -negative(<<$0/?utfx, Rest/binary>>, T, Stack, Opts, Acc) -> - zero(Rest, T, Stack, Opts, "0" ++ Acc); -negative(<>, T, Stack, Opts, Acc) when ?is_nonzero(S) -> - integer(Rest, T, Stack, Opts, [S] ++ Acc); -negative(Bin, T, Stack, Opts, Acc) -> - incomplete(negative, Bin, T, [Stack, Opts, Acc]). - - -zero(<>, T, [object|Stack], Opts, Acc) -> - ?emit([end_object, format_number(Acc)], maybe_done, Rest, T, Stack, Opts); -zero(<>, T, [array|Stack], Opts, Acc) -> - ?emit([end_array, format_number(Acc)], maybe_done, Rest, T, Stack, Opts); -zero(<>, T, [object|Stack], Opts, Acc) -> - ?emit([format_number(Acc)], key, Rest, T, [key|Stack], Opts); -zero(<>, T, [array|_] = Stack, Opts, Acc) -> - ?emit([format_number(Acc)], value, Rest, T, Stack, Opts); -zero(<>, T, Stack, Opts, Acc) -> - initial_decimal(Rest, T, Stack, Opts, {Acc, []}); -zero(<>, T, Stack, Opts, Acc) when ?is_whitespace(S) -> - ?emit([format_number(Acc)], maybe_done, Rest, T, Stack, Opts); -zero(<<>>, T, [], Opts, Acc) -> - {jsx, incomplete, fun(end_stream) -> - ?emit([format_number(Acc)], done, <<>>, T, [], Opts) - ; (Stream) when is_binary(Stream) -> - zero(Stream, T, [], Opts, Acc) - ; (Else) -> {error, {badjson, Else}} - end}; -zero(Bin, T, Stack, Opts, Acc) -> - incomplete(zero, Bin, T, [Stack, Opts, Acc]). 
- - -integer(<>, T, Stack, Opts, Acc) when ?is_nonzero(S) -> - integer(Rest, T, Stack, Opts, [S] ++ Acc); -integer(<>, T, [object|Stack], Opts, Acc) -> - ?emit([end_object, format_number(Acc)], maybe_done, Rest, T, Stack, Opts); -integer(<>, T, [array|Stack], Opts, Acc) -> - ?emit([end_array, format_number(Acc)], maybe_done, Rest, T, Stack, Opts); -integer(<>, T, [object|Stack], Opts, Acc) -> - ?emit([format_number(Acc)], key, Rest, T, [key|Stack], Opts); -integer(<>, T, [array|_] = Stack, Opts, Acc) -> - ?emit([format_number(Acc)], value, Rest, T, Stack, Opts); -integer(<>, T, Stack, Opts, Acc) -> - initial_decimal(Rest, T, Stack, Opts, {Acc, []}); -integer(<>, T, Stack, Opts, Acc) -> - integer(Rest, T, Stack, Opts, [?zero] ++ Acc); -integer(<>, T, Stack, Opts, Acc) when S =:= $e; S =:= $E -> - e(Rest, T, Stack, Opts, {Acc, [], []}); -integer(<>, T, Stack, Opts, Acc) when ?is_whitespace(S) -> - ?emit([format_number(Acc)], maybe_done, Rest, T, Stack, Opts); -integer(<<>>, T, [], Opts, Acc) -> - {jsx, incomplete, fun(end_stream) -> - ?emit([format_number(Acc)], done, <<>>, T, [], Opts) - ; (Stream) when is_binary(Stream) -> - integer(Stream, T, [], Opts, Acc) - ; (Else) -> {error, {badjson, Else}} - end}; -integer(Bin, T, Stack, Opts, Acc) -> - incomplete(integer, Bin, T, [Stack, Opts, Acc]). - - -initial_decimal(<>, T, Stack, Opts, {Int, Frac}) - when S =:= ?zero; ?is_nonzero(S) -> - decimal(Rest, T, Stack, Opts, {Int, [S] ++ Frac}); -initial_decimal(Bin, T, Stack, Opts, Acc) -> - incomplete(initial_decimal, Bin, T, [Stack, Opts, Acc]). 
- - -decimal(<>, T, Stack, Opts, {Int, Frac}) - when S=:= ?zero; ?is_nonzero(S) -> - decimal(Rest, T, Stack, Opts, {Int, [S] ++ Frac}); -decimal(<>, T, [object|Stack], Opts, Acc) -> - ?emit([end_object, format_number(Acc)], maybe_done, Rest, T, Stack, Opts); -decimal(<>, T, [array|Stack], Opts, Acc) -> - ?emit([end_array, format_number(Acc)], maybe_done, Rest, T, Stack, Opts); -decimal(<>, T, [object|Stack], Opts, Acc) -> - ?emit([format_number(Acc)], key, Rest, T, [key|Stack], Opts); -decimal(<>, T, [array|_] = Stack, Opts, Acc) -> - ?emit([format_number(Acc)], value, Rest, T, Stack, Opts); -decimal(<>, T, Stack, Opts, {Int, Frac}) - when S =:= $e; S =:= $E -> - e(Rest, T, Stack, Opts, {Int, Frac, []}); -decimal(<>, T, Stack, Opts, Acc) when ?is_whitespace(S) -> - ?emit([format_number(Acc)], maybe_done, Rest, T, Stack, Opts); -decimal(<<>>, T, [], Opts, Acc) -> - {jsx, incomplete, fun(end_stream) -> - ?emit([format_number(Acc)], done, <<>>, T, [], Opts) - ; (Stream) when is_binary(Stream) -> - decimal(Stream, T, [], Opts, Acc) - ; (Else) -> {error, {badjson, Else}} - end}; -decimal(Bin, T, Stack, Opts, Acc) -> - incomplete(decimal, Bin, T, [Stack, Opts, Acc]). - - -e(<>, T, Stack, Opts, {Int, Frac, Exp}) - when S =:= ?zero; ?is_nonzero(S) -> - exp(Rest, T, Stack, Opts, {Int, Frac, [S] ++ Exp}); -e(<>, T, Stack, Opts, {Int, Frac, Exp}) - when S =:= ?positive; S =:= ?negative -> - ex(Rest, T, Stack, Opts, {Int, Frac, [S] ++ Exp}); -e(Bin, T, Stack, Opts, Acc) -> - incomplete(e, Bin, T, [Stack, Opts, Acc]). - - -ex(<>, T, Stack, Opts, {Int, Frac, Exp}) - when S =:= ?zero; ?is_nonzero(S) -> - exp(Rest, T, Stack, Opts, {Int, Frac, [S] ++ Exp}); -ex(Bin, T, Stack, Opts, Acc) -> - incomplete(ex, Bin, T, [Stack, Opts, Acc]). 
- - -exp(<>, T, Stack, Opts, {Int, Frac, Exp}) - when S =:= ?zero; ?is_nonzero(S) -> - exp(Rest, T, Stack, Opts, {Int, Frac, [S] ++ Exp}); -exp(<>, T, [object|Stack], Opts, Acc) -> - ?emit([end_object, format_number(Acc)], maybe_done, Rest, T, Stack, Opts); -exp(<>, T, [array|Stack], Opts, Acc) -> - ?emit([end_array, format_number(Acc)], maybe_done, Rest, T, Stack, Opts); -exp(<>, T, [object|Stack], Opts, Acc) -> - ?emit([format_number(Acc)], key, Rest, T, [key|Stack], Opts); -exp(<>, T, [array|_] = Stack, Opts, Acc) -> - ?emit([format_number(Acc)], value, Rest, T, Stack, Opts); -exp(<>, T, Stack, Opts, Acc) when ?is_whitespace(S) -> - ?emit([format_number(Acc)], maybe_done, Rest, T, Stack, Opts); -exp(<<>>, T, [], Opts, Acc) -> - {jsx, incomplete, fun(end_stream) -> - ?emit([format_number(Acc)], done, <<>>, T, [], Opts) - ; (Stream) when is_binary(Stream) -> - exp(Stream, T, [], Opts, Acc) - ; (Else) -> {error, {badjson, Else}} - end}; -exp(Bin, T, Stack, Opts, Acc) -> - incomplete(exp, Bin, T, [Stack, Opts, Acc]). - - -format_number(Int) when is_list(Int) -> - {integer, list_to_integer(lists:reverse(Int))}; -format_number({Int, Frac}) -> - {float, list_to_float(lists:reverse(Frac ++ "." ++ Int))}; -format_number({Int, [], Exp}) -> - {float, list_to_float(lists:reverse(Exp ++ "e0." ++ Int))}; -format_number({Int, Frac, Exp}) -> - {float, list_to_float(lists:reverse(Exp ++ "e" ++ Frac ++ "." ++ Int))}. - - -tr(<<$r/?utfx, Rest/binary>>, T, Stack, Opts) -> - tru(Rest, T, Stack, Opts); -tr(Bin, T, Stack, Opts) -> - incomplete(tr, Bin, T, [Stack, Opts]). - - -tru(<<$u/?utfx, Rest/binary>>, T, Stack, Opts) -> - true(Rest, T, Stack, Opts); -tru(Bin, T, Stack, Opts) -> - incomplete(tru, Bin, T, [Stack, Opts]). - - -true(<<$e/?utfx, Rest/binary>>, T, Stack, Opts) -> - ?emit([{literal, true}], maybe_done, Rest, T, Stack, Opts); -true(Bin, T, Stack, Opts) -> - incomplete(true, Bin, T, [Stack, Opts]). 
- - -fa(<<$a/?utfx, Rest/binary>>, T, Stack, Opts) -> - fal(Rest, T, Stack, Opts); -fa(Bin, T, Stack, Opts) -> - incomplete(fa, Bin, T, [Stack, Opts]). - - -fal(<<$l/?utfx, Rest/binary>>, T, Stack, Opts) -> - fals(Rest, T, Stack, Opts); -fal(Bin, T, Stack, Opts) -> - incomplete(fal, Bin, T, [Stack, Opts]). - - -fals(<<$s/?utfx, Rest/binary>>, T, Stack, Opts) -> - false(Rest, T, Stack, Opts); -fals(Bin, T, Stack, Opts) -> - incomplete(fals, Bin, T, [Stack, Opts]). - - -false(<<$e/?utfx, Rest/binary>>, T, Stack, Opts) -> - ?emit([{literal, false}], maybe_done, Rest, T, Stack, Opts); -false(Bin, T, Stack, Opts) -> - incomplete(false, Bin, T, [Stack, Opts]). - - -nu(<<$u/?utfx, Rest/binary>>, T, Stack, Opts) -> - nul(Rest, T, Stack, Opts); -nu(Bin, T, Stack, Opts) -> - incomplete(nu, Bin, T, [Stack, Opts]). - - -nul(<<$l/?utfx, Rest/binary>>, T, Stack, Opts) -> - null(Rest, T, Stack, Opts); -nul(Bin, T, Stack, Opts) -> - incomplete(nul, Bin, T, [Stack, Opts]). - - -null(<<$l/?utfx, Rest/binary>>, T, Stack, Opts) -> - ?emit([{literal, null}], maybe_done, Rest, T, Stack, Opts); -null(Bin, T, Stack, Opts) -> - incomplete(null, Bin, T, [Stack, Opts]). - - - --ifdef(TEST). --include_lib("eunit/include/eunit.hrl"). - - -noncharacters_test_() -> - [ - {"noncharacters - badjson", - ?_assertEqual(check_bad(noncharacters()), []) - }, - {"noncharacters - replaced", - ?_assertEqual(check_replaced(noncharacters()), []) - } - ]. - -extended_noncharacters_test_() -> - [ - {"extended noncharacters - badjson", - ?_assertEqual(check_bad(extended_noncharacters()), []) - }, - {"extended noncharacters - replaced", - ?_assertEqual(check_replaced(extended_noncharacters()), []) - } - ]. - -surrogates_test_() -> - [ - {"surrogates - badjson", - ?_assertEqual(check_bad(surrogates()), []) - }, - {"surrogates - replaced", - ?_assertEqual(check_replaced(surrogates()), []) - } - ]. 
- -control_test_() -> - [ - {"control characters - badjson", - ?_assertEqual(check_bad(control_characters()), []) - } - ]. - -reserved_test_() -> - [ - {"reserved noncharacters - badjson", - ?_assertEqual(check_bad(reserved_space()), []) - }, - {"reserved noncharacters - replaced", - ?_assertEqual(check_replaced(reserved_space()), []) - } - ]. - -zero_test_() -> - [ - {"nullbyte - badjson", - ?_assertEqual(check_bad(zero()), []) - } - ]. - -good_characters_test_() -> - [ - {"acceptable codepoints", - ?_assertEqual(check_good(good()), []) - }, - {"acceptable extended", - ?_assertEqual(check_good(good_extended()), []) - } - ]. - - -check_bad(List) -> - lists:dropwhile(fun({_, {error, badjson}}) -> true ; (_) -> false end, - check(List, [], []) - ). - -check_replaced(List) -> - lists:dropwhile(fun({_, [{string, [16#fffd]}|_]}) -> - true - ; (_) -> - false - end, - check(List, [loose_unicode], []) - ). - -check_good(List) -> - lists:dropwhile(fun({_, [{string, _}]}) -> true ; (_) -> false end, - check(List, [], []) - ). - -check([], _Opts, Acc) -> Acc; -check([H|T], Opts, Acc) -> - R = decode(to_fake_utf(H, ?encoding), Opts), - check(T, Opts, [{H, R}] ++ Acc). - - -decode(JSON, Opts) -> - case (jsx:decoder(Opts))(JSON) of - {jsx, Events, _} -> loop(Events, []) - ; {error, {badjson, _}} -> {error, badjson} - end. - - -loop([end_json], Acc) -> lists:reverse(Acc); -loop([Event|Events], Acc) -> loop(Events, [Event] ++ Acc); -loop(_, _) -> {error, badjson}. - - - -noncharacters() -> lists:seq(16#fffe, 16#ffff). - -extended_noncharacters() -> - [16#1fffe, 16#1ffff, 16#2fffe, 16#2ffff] - ++ [16#3fffe, 16#3ffff, 16#4fffe, 16#4ffff] - ++ [16#5fffe, 16#5ffff, 16#6fffe, 16#6ffff] - ++ [16#7fffe, 16#7ffff, 16#8fffe, 16#8ffff] - ++ [16#9fffe, 16#9ffff, 16#afffe, 16#affff] - ++ [16#bfffe, 16#bffff, 16#cfffe, 16#cffff] - ++ [16#dfffe, 16#dffff, 16#efffe, 16#effff] - ++ [16#ffffe, 16#fffff, 16#10fffe, 16#10ffff]. - -surrogates() -> lists:seq(16#d800, 16#dfff). 
- -control_characters() -> lists:seq(1, 31). - -reserved_space() -> lists:seq(16#fdd0, 16#fdef). - -zero() -> [0]. - -good() -> [32, 33] - ++ lists:seq(16#23, 16#5b) - ++ lists:seq(16#5d, 16#d7ff) - ++ lists:seq(16#e000, 16#fdcf) - ++ lists:seq(16#fdf0, 16#fffd). - -good_extended() -> lists:seq(16#100000, 16#10fffd). - -%% erlang refuses to encode certain codepoints, so fake them all -to_fake_utf(N, utf8) when N < 16#0080 -> <<34/utf8, N:8, 34/utf8>>; -to_fake_utf(N, utf8) when N < 16#0800 -> - <<0:5, Y:5, X:6>> = <>, - <<34/utf8, 2#110:3, Y:5, 2#10:2, X:6, 34/utf8>>; -to_fake_utf(N, utf8) when N < 16#10000 -> - <> = <>, - <<34/utf8, 2#1110:4, Z:4, 2#10:2, Y:6, 2#10:2, X:6, 34/utf8>>; -to_fake_utf(N, utf8) -> - <<0:3, W:3, Z:6, Y:6, X:6>> = <>, - <<34/utf8, 2#11110:5, W:3, 2#10:2, Z:6, 2#10:2, Y:6, 2#10:2, X:6, 34/utf8>>; - -to_fake_utf(N, utf16) when N < 16#10000 -> <<34/utf16, N:16, 34/utf16>>; -to_fake_utf(N, utf16) -> <<34/utf16, N/utf16, 34/utf16>>; - -to_fake_utf(N, utf16le) when N < 16#10000 -> - <> = <>, - <<34, 0, B:8, A:8, 34, 0>>; -to_fake_utf(N, utf16le) -> <<34/utf16-little, N/utf16-little, 34/utf16-little>>; - -to_fake_utf(N, utf32) -> <<34/utf32, N:32, 34/utf32>>; - -to_fake_utf(N, utf32le) -> - <> = <>, - <<34/utf32-little, D:8, C:8, B:8, A:8, 34/utf32-little>>. - - --endif. \ No newline at end of file diff --git a/src/jsx_format.hrl b/include/jsx_opts.hrl similarity index 74% rename from src/jsx_format.hrl rename to include/jsx_opts.hrl index 689ff23..9f75bd9 100644 --- a/src/jsx_format.hrl +++ b/include/jsx_opts.hrl @@ -20,21 +20,20 @@ %% OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN %% THE SOFTWARE. - - --record(format_opts, { - space = 0, - indent = 0, - output_encoding = utf8 +-record(opts, { + loose_unicode = false, + escape_forward_slash = false }). --define(newline, $\n). --define(space, 16#20). %% ascii code for space --define(quote, $\"). --define(comma, $,). --define(colon, $:). --define(start_object, ${). 
--define(end_object, $}). --define(start_array, $[). --define(end_array, $]). \ No newline at end of file +parse_opts(Opts) -> + parse_opts(Opts, #opts{}). + +parse_opts([], Opts) -> + Opts; +parse_opts([loose_unicode|Rest], Opts) -> + parse_opts(Rest, Opts#opts{loose_unicode=true}); +parse_opts([escape_forward_slash|Rest], Opts) -> + parse_opts(Rest, Opts#opts{escape_forward_slash=true}); +parse_opts(_, _) -> + {error, badarg}. \ No newline at end of file diff --git a/include/jsx_scanner.hrl b/include/jsx_scanner.hrl new file mode 100644 index 0000000..fed4b97 --- /dev/null +++ b/include/jsx_scanner.hrl @@ -0,0 +1,683 @@ +%% The MIT License + +%% Copyright (c) 2010 Alisdair Sullivan + +%% Permission is hereby granted, free of charge, to any person obtaining a copy +%% of this software and associated documentation files (the "Software"), to deal +%% in the Software without restriction, including without limitation the rights +%% to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +%% copies of the Software, and to permit persons to whom the Software is +%% furnished to do so, subject to the following conditions: + +%% The above copyright notice and this permission notice shall be included in +%% all copies or substantial portions of the Software. + +%% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +%% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +%% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +%% AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +%% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +%% OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +%% THE SOFTWARE. + + +%% whitespace +-define(space, 16#20). +-define(tab, 16#09). +-define(cr, 16#0D). +-define(newline, 16#0A). + +%% object delimiters +-define(start_object, 16#7B). +-define(end_object, 16#7D). 
+ +%% array delimiters +-define(start_array, 16#5B). +-define(end_array, 16#5D). + +%% kv seperator +-define(comma, 16#2C). +-define(quote, 16#22). +-define(colon, 16#3A). + +%% string escape sequences +-define(escape, 16#5C). +-define(rsolidus, 16#5C). +-define(solidus, 16#2F). +-define(formfeed, 16#0C). +-define(backspace, 16#08). +-define(unicode, 16#75). + +%% math +-define(zero, 16#30). +-define(decimalpoint, 16#2E). +-define(negative, 16#2D). +-define(positive, 16#2B). + + +%% some useful guards +-define(is_hex(Symbol), + (Symbol >= $a andalso Symbol =< $z); (Symbol >= $A andalso Symbol =< $Z); + (Symbol >= $0 andalso Symbol =< $9) +). + +-define(is_nonzero(Symbol), + Symbol >= $1 andalso Symbol =< $9 +). + +-define(is_noncontrol(Symbol), + (Symbol >= ?space) +). + +-define(is_whitespace(Symbol), + Symbol =:= ?space; Symbol =:= ?tab; Symbol =:= ?cr; Symbol =:= ?newline +). + + +%% error, incomplete and event macros +-ifndef(error). +-define(error(Args), + erlang:error(badarg, Args) +). +-endif. + + +-ifndef(incomplete). +-define(incomplete(State, Rest, T, Stack, Opts), + {ok, lists:reverse(T), fun(Stream) when is_binary(Stream) -> + State(<>, [], Stack, Opts) + end + } +). +-define(incomplete(State, Rest, T, Stack, Opts, Acc), + {ok, T, fun(Stream) when is_binary(Stream) -> + State(<>, [], Stack, Opts, Acc) + end + } +). +-endif. + + +-ifndef(event). +-define(event(Event, State, Rest, T, Stack, Opts), + State(Rest, Event ++ T, Stack, Opts) +). +-endif. 
+ + +start(<>, T, Stack, Opts) -> + ?event([start_object], object, Rest, T, [key|Stack], Opts); +start(<>, T, Stack, Opts) -> + ?event([start_array], array, Rest, T, [array|Stack], Opts); +start(<>, T, Stack, Opts) -> + string(Rest, T, Stack, Opts, []); +start(<<$t, Rest/binary>>, T, Stack, Opts) -> + tr(Rest, T, Stack, Opts); +start(<<$f, Rest/binary>>, T, Stack, Opts) -> + fa(Rest, T, Stack, Opts); +start(<<$n, Rest/binary>>, T, Stack, Opts) -> + nu(Rest, T, Stack, Opts); +start(<>, T, Stack, Opts) -> + negative(Rest, T, Stack, Opts, "-"); +start(<>, T, Stack, Opts) -> + zero(Rest, T, Stack, Opts, "0"); +start(<>, T, Stack, Opts) when ?is_nonzero(S) -> + integer(Rest, T, Stack, Opts, [S]); +start(<>, T, Stack, Opts) when ?is_whitespace(S) -> + start(Rest, T, Stack, Opts); +start(<<>>, T, Stack, Opts) -> + ?incomplete(start, <<>>, T, Stack, Opts); +start(Bin, T, Stack, Opts) -> + ?error([Bin, T, Stack, Opts]). + + +object(<>, T, Stack, Opts) -> + string(Rest, T, Stack, Opts, []); +object(<>, T, [key|Stack], Opts) -> + ?event([end_object], maybe_done, Rest, T, Stack, Opts); +object(<>, T, Stack, Opts) when ?is_whitespace(S) -> + object(Rest, T, Stack, Opts); +object(<<>>, T, Stack, Opts) -> + ?incomplete(object, <<>>, T, Stack, Opts); +object(Bin, T, Stack, Opts) -> + ?error([Bin, T, Stack, Opts]). 
+ + +array(<>, T, Stack, Opts) -> + string(Rest, T, Stack, Opts, []); +array(<<$t, Rest/binary>>, T, Stack, Opts) -> + tr(Rest, T, Stack, Opts); +array(<<$f, Rest/binary>>, T, Stack, Opts) -> + fa(Rest, T, Stack, Opts); +array(<<$n, Rest/binary>>, T, Stack, Opts) -> + nu(Rest, T, Stack, Opts); +array(<>, T, Stack, Opts) -> + negative(Rest, T, Stack, Opts, "-"); +array(<>, T, Stack, Opts) -> + zero(Rest, T, Stack, Opts, "0"); +array(<>, T, Stack, Opts) when ?is_nonzero(S) -> + integer(Rest, T, Stack, Opts, [S]); +array(<>, T, Stack, Opts) -> + ?event([start_object], object, Rest, T, [key|Stack], Opts); +array(<>, T, Stack, Opts) -> + ?event([start_array], array, Rest, T, [array|Stack], Opts); +array(<>, T, [array|Stack], Opts) -> + maybe_done(Rest, [end_array] ++ T, Stack, Opts); +array(<>, T, Stack, Opts) when ?is_whitespace(S) -> + array(Rest, T, Stack, Opts); +array(<<>>, T, Stack, Opts) -> + ?incomplete(array, <<>>, T, Stack, Opts); +array(Bin, T, Stack, Opts) -> + ?error([Bin, T, Stack, Opts]). + + +value(<>, T, Stack, Opts) -> + string(Rest, T, Stack, Opts, []); +value(<<$t, Rest/binary>>, T, Stack, Opts) -> + tr(Rest, T, Stack, Opts); +value(<<$f, Rest/binary>>, T, Stack, Opts) -> + fa(Rest, T, Stack, Opts); +value(<<$n, Rest/binary>>, T, Stack, Opts) -> + nu(Rest, T, Stack, Opts); +value(<>, T, Stack, Opts) -> + negative(Rest, T, Stack, Opts, "-"); +value(<>, T, Stack, Opts) -> + zero(Rest, T, Stack, Opts, "0"); +value(<>, T, Stack, Opts) when ?is_nonzero(S) -> + integer(Rest, T, Stack, Opts, [S]); +value(<>, T, Stack, Opts) -> + ?event([start_object], object, Rest, T, [key|Stack], Opts); +value(<>, T, Stack, Opts) -> + ?event([start_array], array, Rest, T, [array|Stack], Opts); +value(<>, T, Stack, Opts) when ?is_whitespace(S) -> + value(Rest, T, Stack, Opts); +value(<<>>, T, Stack, Opts) -> + ?incomplete(value, <<>>, T, Stack, Opts); +value(Bin, T, Stack, Opts) -> + ?error([Bin, T, Stack, Opts]). 
+ + +colon(<>, T, [key|Stack], Opts) -> + value(Rest, T, [object|Stack], Opts); +colon(<>, T, Stack, Opts) when ?is_whitespace(S) -> + colon(Rest, T, Stack, Opts); +colon(<<>>, T, Stack, Opts) -> + ?incomplete(colon, <<>>, T, Stack, Opts); +colon(Bin, T, Stack, Opts) -> + ?error([Bin, T, Stack, Opts]). + + +key(<>, T, Stack, Opts) -> + string(Rest, T, Stack, Opts, []); +key(<>, T, Stack, Opts) when ?is_whitespace(S) -> + key(Rest, T, Stack, Opts); +key(<<>>, T, Stack, Opts) -> + ?incomplete(key, <<>>, T, Stack, Opts); +key(Bin, T, Stack, Opts) -> + ?error([Bin, T, Stack, Opts]). + + +%% string has an additional parameter, an accumulator (Acc) used to hold the +%% intermediate representation of the string being parsed. using a list of +%% integers representing unicode codepoints is faster than constructing +%% binaries, there's a branch kicking around which proves it +%% string uses partial_utf/1 to cease parsing when invalid encodings are +%% encountered rather than just checking remaining binary size like other +%% states to eliminate certain incomplete states +%% when parsing strings, the naive detection of partial codepoints is +%% insufficient. this incredibly anal function should detect all badly formed +%% utf sequences +partial_utf(<<>>) -> true; +partial_utf(<>) when X >= 16#c2, X =< 16#df -> true; +partial_utf(<>) when X >= 16#e0, X =< 16#ef -> + case Rest of + <<>> -> true + ; <> when Y >= 16#80, Y =< 16#bf -> true + ; _ -> false + end; +partial_utf(<>) when X >= 16#f0, X =< 16#f4 -> + case Rest of + <<>> -> true + ; <> when Y >= 16#80, Y =< 16#bf -> true + ; <> when Y >= 16#80, Y =< 16#bf, Z >= 16#80, Z =< 16#bf -> true + ; _ -> false + end; +partial_utf(_) -> false. 
+ +string(<>, T, [key|_] = Stack, Opts, Acc) -> + ?event([{key, lists:reverse(Acc)}], colon, Rest, T, Stack, Opts); +string(<>, T, Stack, Opts, Acc) -> + ?event([{string, lists:reverse(Acc)}], maybe_done, Rest, T, Stack, Opts); +string(<>, T, Stack, Opts, Acc) -> + escape(Rest, T, Stack, Opts, Acc); +%% things get dumb here. erlang doesn't properly restrict unicode non-characters +%% so you can't trust the codepoints it returns always +%% the range 32..16#fdcf is safe, so allow that +string(<>, T, Stack, Opts, Acc) + when ?is_noncontrol(S), S < 16#fdd0 -> + string(Rest, T, Stack, Opts, [S] ++ Acc); +%% the range 16#fdf0..16#fffd is also safe +string(<>, T, Stack, Opts, Acc) + when S > 16#fdef, S < 16#fffe -> + string(Rest, T, Stack, Opts, [S] ++ Acc); +%% yes, i think it's insane too +string(<>, T, Stack, Opts, Acc) + when S > 16#ffff andalso + S =/= 16#1fffe andalso S =/= 16#1ffff andalso + S =/= 16#2fffe andalso S =/= 16#2ffff andalso + S =/= 16#3fffe andalso S =/= 16#3ffff andalso + S =/= 16#4fffe andalso S =/= 16#4ffff andalso + S =/= 16#5fffe andalso S =/= 16#5ffff andalso + S =/= 16#6fffe andalso S =/= 16#6ffff andalso + S =/= 16#7fffe andalso S =/= 16#7ffff andalso + S =/= 16#8fffe andalso S =/= 16#8ffff andalso + S =/= 16#9fffe andalso S =/= 16#9ffff andalso + S =/= 16#afffe andalso S =/= 16#affff andalso + S =/= 16#bfffe andalso S =/= 16#bffff andalso + S =/= 16#cfffe andalso S =/= 16#cffff andalso + S =/= 16#dfffe andalso S =/= 16#dffff andalso + S =/= 16#efffe andalso S =/= 16#effff andalso + S =/= 16#ffffe andalso S =/= 16#fffff andalso + S =/= 16#10fffe andalso S =/= 16#10ffff -> + string(Rest, T, Stack, Opts, [S] ++ Acc); +string(Bin, T, Stack, Opts, Acc) -> + case partial_utf(Bin) of + true -> ?incomplete(string, Bin, T, Stack, Opts, Acc) + ; false -> + case Opts#opts.loose_unicode of + true -> noncharacter(Bin, T, Stack, Opts, Acc) + ; false -> ?error([Bin, T, Stack, Opts, Acc]) + end + end. 
+ +%% we don't need to guard against partial utf here, because it's already taken +%% care of in string. theoretically, the last clause of noncharacter/4 is +%% unreachable +%% non-characters erlang doesn't recognize as non-characters, idiotically +noncharacter(<>, T, Stack, Opts, Acc) + when ?is_noncontrol(S) -> + string(Rest, T, Stack, Opts, [16#fffd] ++ Acc); +%% u+fffe and u+ffff +noncharacter(<<239, 191, X, Rest/binary>>, T, Stack, Opts, Acc) + when X == 190; X == 191 -> + string(Rest, T, Stack, Opts, [16#fffd] ++ Acc); +%% surrogates +noncharacter(<<237, X, _, Rest/binary>>, T, Stack, Opts, Acc) when X >= 160 -> + string(Rest, T, Stack, Opts, [16#fffd] ++ Acc); +noncharacter(Bin, T, Stack, Opts, Acc) -> + ?error([Bin, T, Stack, Opts, Acc]). + + +escape(<<$b, Rest/binary>>, T, Stack, Opts, Acc) -> + string(Rest, T, Stack, Opts, "\b" ++ Acc); +escape(<<$f, Rest/binary>>, T, Stack, Opts, Acc) -> + string(Rest, T, Stack, Opts, "\f" ++ Acc); +escape(<<$n, Rest/binary>>, T, Stack, Opts, Acc) -> + string(Rest, T, Stack, Opts, "\n" ++ Acc); +escape(<<$r, Rest/binary>>, T, Stack, Opts, Acc) -> + string(Rest, T, Stack, Opts, "\r" ++ Acc); +escape(<<$t, Rest/binary>>, T, Stack, Opts, Acc) -> + string(Rest, T, Stack, Opts, "\t" ++ Acc); +escape(<<$u, Rest/binary>>, T, Stack, Opts, Acc) -> + escaped_unicode(Rest, T, Stack, Opts, {[], Acc}); +escape(<>, T, Stack, Opts, Acc) + when S =:= ?quote; S =:= ?solidus; S =:= ?rsolidus -> + string(Rest, T, Stack, Opts, [S] ++ Acc); +escape(<<>>, T, Stack, Opts, Acc) -> + ?incomplete(escape, <<>>, T, Stack, Opts, Acc); +escape(Bin, T, Stack, Opts, Acc) -> + ?error([Bin, T, Stack, Opts, Acc]). + + +%% this code is ugly and unfortunate, but so is json's handling of escaped +%% unicode codepoint sequences. 
+escaped_unicode(<>, T, Stack, Opts, {[C, B, A], String}) + when ?is_hex(D) -> + case erlang:list_to_integer([A, B, C, D], 16) of + %% high surrogate, we need a low surrogate next + X when X >= 16#d800, X =< 16#dbff -> + low_surrogate(Rest, T, Stack, Opts, {X, String}) + %% non-characters, you're not allowed to exchange these + ; X when X == 16#fffe; X == 16#ffff; X >= 16#fdd0, X =< 16#fdef -> + case Opts#opts.loose_unicode of + true -> + string(Rest, T, Stack, Opts, [16#fffd] ++ String) + ; false -> + ?error([<>, T, Stack, Opts, {[C, B, A], String}]) + end + %% allowing interchange of null bytes allows attackers to forge + %% malicious streams + ; X when X == 16#0000 -> + case Opts#opts.loose_unicode of + true -> + string(Rest, T, Stack, Opts, [16#fffd] ++ String) + ; false -> + ?error([<>, T, Stack, Opts, {[C, B, A], String}]) + end + %% anything else + ; X -> + string(Rest, T, Stack, Opts, [X] ++ String) + end; +escaped_unicode(<>, T, Stack, Opts, {Acc, String}) + when ?is_hex(S) -> + escaped_unicode(Rest, T, Stack, Opts, {[S] ++ Acc, String}); +escaped_unicode(<<>>, T, Stack, Opts, Acc) -> + ?incomplete(escaped_unicode, <<>>, T, Stack, Opts, Acc); +escaped_unicode(Bin, T, Stack, Opts, Acc) -> + ?error([Bin, T, Stack, Opts, Acc]). + + +low_surrogate(<>, T, Stack, Opts, Acc) -> + low_surrogate_u(Rest, T, Stack, Opts, Acc); +%% not an escaped codepoint, our high codepoint is illegal. dispatch back to +%% string to handle +low_surrogate(<> = Bin, T, Stack, Opts, {High, String}) -> + case Opts#opts.loose_unicode of + true -> + string(Bin, T, Stack, Opts, [16#fffd] ++ String) + ; false -> + ?error([<>, T, Stack, Opts, {High, String}]) + end; +low_surrogate(<<>>, T, Stack, Opts, Acc) -> + ?incomplete(low_surrogate, <<>>, T, Stack, Opts, Acc); +low_surrogate(Bin, T, Stack, Opts, Acc) -> + ?error([Bin, T, Stack, Opts, Acc]). 
+ + +low_surrogate_u(<<$u, Rest/binary>>, T, Stack, Opts, {High, String}) -> + low_surrogate_v(Rest, T, Stack, Opts, {[], High, String}); +%% not a low surrogate, dispatch back to string to handle, including the +%% rsolidus we parsed previously +low_surrogate_u(<> = Bin, T, Stack, Opts, {High, String}) -> + case Opts#opts.loose_unicode of + true -> + string(<>, + T, + Stack, + Opts, + [16#fffd] ++ String + ) + ; false -> + ?error([<>, T, Stack, Opts, {High, String}]) + end; +low_surrogate_u(<<>>, T, Stack, Opts, Acc) -> + ?incomplete(low_surrogate_u, <<>>, T, Stack, Opts, Acc); +low_surrogate_u(Bin, T, Stack, Opts, Acc) -> + ?error([Bin, T, Stack, Opts, Acc]). + + +low_surrogate_v(<>, T, Stack, Opts, {[C, B, A], High, String}) + when ?is_hex(D) -> + case erlang:list_to_integer([A, B, C, D], 16) of + X when X >= 16#dc00, X =< 16#dfff -> + V = surrogate_to_codepoint(High, X), + case V rem 16#10000 of Y when Y == 16#fffe; Y == 16#ffff -> + case Opts#opts.loose_unicode of + true -> + string(Rest, T, Stack, Opts, [16#fffd] ++ String) + ; false -> + ?error([<>, T, Stack, Opts, {[C, B, A], High, String}]) + end + ; _ -> + string(Rest, T, Stack, Opts, [V] ++ String) + end + %% not a low surrogate, bad bad bad + ; _ -> + case Opts#opts.loose_unicode of + true -> + string(Rest, T, Stack, Opts, [16#fffd, 16#fffd] ++ String) + ; false -> + ?error([<>, T, Stack, Opts, {[C, B, A], High, String}]) + end + end; +low_surrogate_v(<>, T, Stack, Opts, {Low, High, String}) + when ?is_hex(S) -> + low_surrogate_v(Rest, T, Stack, Opts, {[S] ++ Low, High, String}); +low_surrogate_v(<<>>, T, Stack, Opts, Acc) -> + ?incomplete(low_surrogate_v, <<>>, T, Stack, Opts, Acc); +low_surrogate_v(Bin, T, Stack, Opts, Acc) -> + ?error([Bin, T, Stack, Opts, Acc]). + + +%% stole this from the unicode spec +surrogate_to_codepoint(High, Low) -> + (High - 16#d800) * 16#400 + (Low - 16#dc00) + 16#10000. 
+ + +%% like strings, numbers are collected in an intermediate accumulator before +%% being emitted to the callback handler +negative(<<$0, Rest/binary>>, T, Stack, Opts, Acc) -> + zero(Rest, T, Stack, Opts, "0" ++ Acc); +negative(<>, T, Stack, Opts, Acc) when ?is_nonzero(S) -> + integer(Rest, T, Stack, Opts, [S] ++ Acc); +negative(<<>>, T, Stack, Opts, Acc) -> + ?incomplete(negative, <<>>, T, Stack, Opts, Acc); +negative(Bin, T, Stack, Opts, Acc) -> + ?error([Bin, T, Stack, Opts, Acc]). + + +zero(<>, T, [object|Stack], Opts, Acc) -> + ?event([end_object, format_number(Acc)], maybe_done, Rest, T, Stack, Opts); +zero(<>, T, [array|Stack], Opts, Acc) -> + ?event([end_array, format_number(Acc)], maybe_done, Rest, T, Stack, Opts); +zero(<>, T, [object|Stack], Opts, Acc) -> + ?event([format_number(Acc)], key, Rest, T, [key|Stack], Opts); +zero(<>, T, [array|_] = Stack, Opts, Acc) -> + ?event([format_number(Acc)], value, Rest, T, Stack, Opts); +zero(<>, T, Stack, Opts, Acc) -> + initial_decimal(Rest, T, Stack, Opts, {Acc, []}); +zero(<>, T, Stack, Opts, Acc) when ?is_whitespace(S) -> + ?event([format_number(Acc)], maybe_done, Rest, T, Stack, Opts); +zero(<<>>, T, Stack, Opts, Acc) -> + ?incomplete(zero, <<>>, T, Stack, Opts, Acc); +zero(Bin, T, Stack, Opts, Acc) -> + ?error([Bin, T, Stack, Opts, Acc]). 
+ + +integer(<>, T, Stack, Opts, Acc) when ?is_nonzero(S) -> + integer(Rest, T, Stack, Opts, [S] ++ Acc); +integer(<>, T, [object|Stack], Opts, Acc) -> + ?event([end_object, format_number(Acc)], maybe_done, Rest, T, Stack, Opts); +integer(<>, T, [array|Stack], Opts, Acc) -> + ?event([end_array, format_number(Acc)], maybe_done, Rest, T, Stack, Opts); +integer(<>, T, [object|Stack], Opts, Acc) -> + ?event([format_number(Acc)], key, Rest, T, [key|Stack], Opts); +integer(<>, T, [array|_] = Stack, Opts, Acc) -> + ?event([format_number(Acc)], value, Rest, T, Stack, Opts); +integer(<>, T, Stack, Opts, Acc) -> + initial_decimal(Rest, T, Stack, Opts, {Acc, []}); +integer(<>, T, Stack, Opts, Acc) -> + integer(Rest, T, Stack, Opts, [?zero] ++ Acc); +integer(<>, T, Stack, Opts, Acc) when S =:= $e; S =:= $E -> + e(Rest, T, Stack, Opts, {Acc, [], []}); +integer(<>, T, Stack, Opts, Acc) when ?is_whitespace(S) -> + ?event([format_number(Acc)], maybe_done, Rest, T, Stack, Opts); +integer(<<>>, T, Stack, Opts, Acc) -> + ?incomplete(integer, <<>>, T, Stack, Opts, Acc); +integer(Bin, T, Stack, Opts, Acc) -> + ?error([Bin, T, Stack, Opts, Acc]). + + +initial_decimal(<>, T, Stack, Opts, {Int, Frac}) + when S =:= ?zero; ?is_nonzero(S) -> + decimal(Rest, T, Stack, Opts, {Int, [S] ++ Frac}); +initial_decimal(<<>>, T, Stack, Opts, Acc) -> + ?incomplete(initial_decimal, <<>>, T, Stack, Opts, Acc); +initial_decimal(Bin, T, Stack, Opts, Acc) -> + ?error([Bin, T, Stack, Opts, Acc]). 
+ + +decimal(<>, T, Stack, Opts, {Int, Frac}) + when S=:= ?zero; ?is_nonzero(S) -> + decimal(Rest, T, Stack, Opts, {Int, [S] ++ Frac}); +decimal(<>, T, [object|Stack], Opts, Acc) -> + ?event([end_object, format_number(Acc)], maybe_done, Rest, T, Stack, Opts); +decimal(<>, T, [array|Stack], Opts, Acc) -> + ?event([end_array, format_number(Acc)], maybe_done, Rest, T, Stack, Opts); +decimal(<>, T, [object|Stack], Opts, Acc) -> + ?event([format_number(Acc)], key, Rest, T, [key|Stack], Opts); +decimal(<>, T, [array|_] = Stack, Opts, Acc) -> + ?event([format_number(Acc)], value, Rest, T, Stack, Opts); +decimal(<>, T, Stack, Opts, {Int, Frac}) + when S =:= $e; S =:= $E -> + e(Rest, T, Stack, Opts, {Int, Frac, []}); +decimal(<>, T, Stack, Opts, Acc) when ?is_whitespace(S) -> + ?event([format_number(Acc)], maybe_done, Rest, T, Stack, Opts); +decimal(<<>>, T, Stack, Opts, Acc) -> + ?incomplete(decimal, <<>>, T, Stack, Opts, Acc); +decimal(Bin, T, Stack, Opts, Acc) -> + ?error([Bin, T, Stack, Opts, Acc]). + + +e(<>, T, Stack, Opts, {Int, Frac, Exp}) + when S =:= ?zero; ?is_nonzero(S) -> + exp(Rest, T, Stack, Opts, {Int, Frac, [S] ++ Exp}); +e(<>, T, Stack, Opts, {Int, Frac, Exp}) + when S =:= ?positive; S =:= ?negative -> + ex(Rest, T, Stack, Opts, {Int, Frac, [S] ++ Exp}); +e(<<>>, T, Stack, Opts, Acc) -> + ?incomplete(e, <<>>, T, Stack, Opts, Acc); +e(Bin, T, Stack, Opts, Acc) -> + ?error([Bin, T, Stack, Opts, Acc]). + + +ex(<>, T, Stack, Opts, {Int, Frac, Exp}) + when S =:= ?zero; ?is_nonzero(S) -> + exp(Rest, T, Stack, Opts, {Int, Frac, [S] ++ Exp}); +ex(<<>>, T, Stack, Opts, Acc) -> + ?incomplete(ex, <<>>, T, Stack, Opts, Acc); +ex(Bin, T, Stack, Opts, Acc) -> + ?error([Bin, T, Stack, Opts, Acc]). 
+ + +exp(<>, T, Stack, Opts, {Int, Frac, Exp}) + when S =:= ?zero; ?is_nonzero(S) -> + exp(Rest, T, Stack, Opts, {Int, Frac, [S] ++ Exp}); +exp(<>, T, [object|Stack], Opts, Acc) -> + ?event([end_object, format_number(Acc)], maybe_done, Rest, T, Stack, Opts); +exp(<>, T, [array|Stack], Opts, Acc) -> + ?event([end_array, format_number(Acc)], maybe_done, Rest, T, Stack, Opts); +exp(<>, T, [object|Stack], Opts, Acc) -> + ?event([format_number(Acc)], key, Rest, T, [key|Stack], Opts); +exp(<>, T, [array|_] = Stack, Opts, Acc) -> + ?event([format_number(Acc)], value, Rest, T, Stack, Opts); +exp(<>, T, Stack, Opts, Acc) when ?is_whitespace(S) -> + ?event([format_number(Acc)], maybe_done, Rest, T, Stack, Opts); +exp(<<>>, T, Stack, Opts, Acc) -> + ?incomplete(exp, <<>>, T, Stack, Opts, Acc); +exp(Bin, T, Stack, Opts, Acc) -> + ?error([Bin, T, Stack, Opts, Acc]). + + +format_number(Int) when is_list(Int) -> + {integer, list_to_integer(lists:reverse(Int))}; +format_number({Int, Frac}) -> + {float, list_to_float(lists:reverse(Frac ++ "." ++ Int))}; +format_number({Int, [], Exp}) -> + {float, list_to_float(lists:reverse(Exp ++ "e0." ++ Int))}; +format_number({Int, Frac, Exp}) -> + {float, list_to_float(lists:reverse(Exp ++ "e" ++ Frac ++ "." ++ Int))}. + + +tr(<<$r, Rest/binary>>, T, Stack, Opts) -> + tru(Rest, T, Stack, Opts); +tr(<<>>, T, Stack, Opts) -> + ?incomplete(tr, <<>>, T, Stack, Opts); +tr(Bin, T, Stack, Opts) -> + ?error([Bin, T, Stack, Opts]). + + +tru(<<$u, Rest/binary>>, T, Stack, Opts) -> + true(Rest, T, Stack, Opts); +tru(<<>>, T, Stack, Opts) -> + ?incomplete(tru, <<>>, T, Stack, Opts); +tru(Bin, T, Stack, Opts) -> + ?error([Bin, T, Stack, Opts]). + + +true(<<$e, Rest/binary>>, T, Stack, Opts) -> + ?event([{literal, true}], maybe_done, Rest, T, Stack, Opts); +true(<<>>, T, Stack, Opts) -> + ?incomplete(true, <<>>, T, Stack, Opts); +true(Bin, T, Stack, Opts) -> + ?error([Bin, T, Stack, Opts]). 
+ + +fa(<<$a, Rest/binary>>, T, Stack, Opts) -> + fal(Rest, T, Stack, Opts); +fa(<<>>, T, Stack, Opts) -> + ?incomplete(fa, <<>>, T, Stack, Opts); +fa(Bin, T, Stack, Opts) -> + ?error([Bin, T, Stack, Opts]). + + +fal(<<$l, Rest/binary>>, T, Stack, Opts) -> + fals(Rest, T, Stack, Opts); +fal(<<>>, T, Stack, Opts) -> + ?incomplete(fal, <<>>, T, Stack, Opts); +fal(Bin, T, Stack, Opts) -> + ?error([Bin, T, Stack, Opts]). + + +fals(<<$s, Rest/binary>>, T, Stack, Opts) -> + false(Rest, T, Stack, Opts); +fals(<<>>, T, Stack, Opts) -> + ?incomplete(fals, <<>>, T, Stack, Opts); +fals(Bin, T, Stack, Opts) -> + ?error([Bin, T, Stack, Opts]). + + +false(<<$e, Rest/binary>>, T, Stack, Opts) -> + ?event([{literal, false}], maybe_done, Rest, T, Stack, Opts); +false(<<>>, T, Stack, Opts) -> + ?incomplete(false, <<>>, T, Stack, Opts); +false(Bin, T, Stack, Opts) -> + ?error([Bin, T, Stack, Opts]). + + +nu(<<$u, Rest/binary>>, T, Stack, Opts) -> + nul(Rest, T, Stack, Opts); +nu(<<>>, T, Stack, Opts) -> + ?incomplete(nu, <<>>, T, Stack, Opts); +nu(Bin, T, Stack, Opts) -> + ?error([Bin, T, Stack, Opts]). + + +nul(<<$l, Rest/binary>>, T, Stack, Opts) -> + null(Rest, T, Stack, Opts); +nul(<<>>, T, Stack, Opts) -> + ?incomplete(nul, <<>>, T, Stack, Opts); +nul(Bin, T, Stack, Opts) -> + ?error([Bin, T, Stack, Opts]). + + +null(<<$l, Rest/binary>>, T, Stack, Opts) -> + ?event([{literal, null}], maybe_done, Rest, T, Stack, Opts); +null(<<>>, T, Stack, Opts) -> + ?incomplete(null, <<>>, T, Stack, Opts); +null(Bin, T, Stack, Opts) -> + ?error([Bin, T, Stack, Opts]). 
+ + +maybe_done(<>, T, [object|Stack], Opts) -> + ?event([end_object], maybe_done, Rest, T, Stack, Opts); +maybe_done(<>, T, [array|Stack], Opts) -> + ?event([end_array], maybe_done, Rest, T, Stack, Opts); +maybe_done(<>, T, [object|Stack], Opts) -> + key(Rest, T, [key|Stack], Opts); +maybe_done(<>, T, [array|_] = Stack, Opts) -> + value(Rest, T, Stack, Opts); +maybe_done(<>, T, Stack, Opts) when ?is_whitespace(S) -> + maybe_done(Rest, T, Stack, Opts); +maybe_done(Rest, T, [], Opts) -> + ?event([end_json], done, Rest, T, [], Opts); +maybe_done(<<>>, T, Stack, Opts) -> + ?incomplete(maybe_done, <<>>, T, Stack, Opts); +maybe_done(Bin, T, Stack, Opts) -> + ?error([Bin, T, Stack, Opts]). + + +done(<>, T, [], Opts) when ?is_whitespace(S) -> + done(Rest, T, [], Opts); +done(<<>>, T, [], Opts) -> ?incomplete(done, <<>>, T, [], Opts); +done(Bin, T, [], Opts) -> ?error([Bin, T, [], Opts]). \ No newline at end of file diff --git a/include/jsx_tokenizer.hrl b/include/jsx_tokenizer.hrl new file mode 100644 index 0000000..051397b --- /dev/null +++ b/include/jsx_tokenizer.hrl @@ -0,0 +1,228 @@ +%% The MIT License + +%% Copyright (c) 2010 Alisdair Sullivan + +%% Permission is hereby granted, free of charge, to any person obtaining a copy +%% of this software and associated documentation files (the "Software"), to deal +%% in the Software without restriction, including without limitation the rights +%% to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +%% copies of the Software, and to permit persons to whom the Software is +%% furnished to do so, subject to the following conditions: + +%% The above copyright notice and this permission notice shall be included in +%% all copies or substantial portions of the Software. + +%% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +%% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +%% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +%% AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +%% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +%% OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +%% THE SOFTWARE. + + +-ifndef(error). +-define(error(Args), + erlang:error(badarg, Args) +). +-endif. + + +-ifndef(incomplete). +-define(incomplete(State, T, Stack, Opts), + {ok, lists:reverse(T), fun(Stream) when is_list(Stream) -> + State(Stream, [], Stack, Opts) + end + } +). +-endif. + + +-ifndef(event). +-define(event(Event, State, Rest, T, Stack, Opts), + State(Rest, Event ++ T, Stack, Opts) +). +-endif. + + + + +start({string, String}, [], [], Opts) when is_binary(String); is_list(String) -> + {ok, + [{string, unicode:characters_to_list(json_escape(String, Opts))}, end_json], + fun(X) when is_list(X) -> ?error([X, [], [], Opts]) end + }; +start({float, Float}, [], [], Opts) when is_float(Float) -> + {ok, + [{float, Float}, end_json], + fun(X) when is_list(X) -> ?error([X, [], [], Opts]) end + }; +start({integer, Int}, [], [], Opts) when is_integer(Int) -> + {ok, + [{integer, Int}, end_json], + fun(X) when is_list(X) -> ?error([X, [], [], Opts]) end + }; +start({literal, Atom}, [], [], Opts) when Atom == true; Atom == false; Atom == null -> + {ok, + [{literal, Atom}, end_json], + fun(X) when is_list(X) -> ?error([X, [], [], Opts]) end + }; +%% third parameter is a stack to match end_foos to start_foos +start(Forms, [], [], Opts) -> list_or_object(Forms, [], [], Opts). + + +list_or_object([start_object|Forms], T, Stack, Opts) -> + ?event([start_object], key, Forms, T, [object] ++ Stack, Opts); +list_or_object([start_array|Forms], T, Stack, Opts) -> + ?event([start_array], value, Forms, T, [array] ++ Stack, Opts); +list_or_object([], T, Stack, Opts) -> ?incomplete(list_or_object, T, Stack, Opts); +list_or_object(Forms, T, Stack, Opts) -> ?error([Forms, T, Stack, Opts]). 
+
+
+%% inside an object: the next form is either a key or the end of the object.
+%% key text (list or binary) is json escaped and emitted as a list
+key([{key, Name}|Rest], Acc, Stack, Opts) when is_list(Name); is_binary(Name) ->
+    Escaped = unicode:characters_to_list(json_escape(Name, Opts)),
+    ?event([{key, Escaped}], value, Rest, Acc, Stack, Opts);
+key([end_object|Rest], Acc, [object|Outer], Opts) ->
+    ?event([end_object], maybe_done, Rest, Acc, Outer, Opts);
+key([], Acc, Stack, Opts) ->
+    ?incomplete(key, Acc, Stack, Opts);
+key(Other, Acc, Stack, Opts) ->
+    ?error([Other, Acc, Stack, Opts]).
+
+
+%% a value is expected: scalars are emitted and control moves to maybe_done,
+%% start_object/start_array push a marker on the stack, end_array is accepted
+%% only when an array is on top of the stack
+value([{string, Text}|Rest], Acc, Stack, Opts)
+        when is_list(Text); is_binary(Text) ->
+    Escaped = unicode:characters_to_list(json_escape(Text, Opts)),
+    ?event([{string, Escaped}], maybe_done, Rest, Acc, Stack, Opts);
+value([{float, Num} = Event|Rest], Acc, Stack, Opts) when is_float(Num) ->
+    ?event([Event], maybe_done, Rest, Acc, Stack, Opts);
+value([{integer, Num} = Event|Rest], Acc, Stack, Opts) when is_integer(Num) ->
+    ?event([Event], maybe_done, Rest, Acc, Stack, Opts);
+value([{literal, Lit} = Event|Rest], Acc, Stack, Opts)
+        when Lit =:= true; Lit =:= false; Lit =:= null ->
+    ?event([Event], maybe_done, Rest, Acc, Stack, Opts);
+value([start_object|Rest], Acc, Stack, Opts) ->
+    ?event([start_object], key, Rest, Acc, [object|Stack], Opts);
+value([start_array|Rest], Acc, Stack, Opts) ->
+    ?event([start_array], maybe_done, Rest, Acc, [array|Stack], Opts);
+value([end_array|Rest], Acc, [array|Outer], Opts) ->
+    ?event([end_array], maybe_done, Rest, Acc, Outer, Opts);
+value([], Acc, Stack, Opts) ->
+    ?incomplete(value, Acc, Stack, Opts);
+value(Other, Acc, Stack, Opts) ->
+    ?error([Other, Acc, Stack, Opts]).
+
+
+%% a value has just been completed. with an empty stack only end_json may
+%% follow; otherwise close the innermost container or go back to key/4 or
+%% value/4 depending on what is on top of the stack
+maybe_done([end_json], T, [], Opts) ->
+    ?event([end_json], done, [], T, [], Opts);
+maybe_done([end_object|Forms], T, [object|Stack], Opts) ->
+    ?event([end_object], maybe_done, Forms, T, Stack, Opts);
+maybe_done([end_array|Forms], T, [array|Stack], Opts) ->
+    ?event([end_array], maybe_done, Forms, T, Stack, Opts);
+maybe_done(Forms, T, [object|_] = Stack, Opts) -> key(Forms, T, Stack, Opts);
+maybe_done(Forms, T, [array|_] = Stack, Opts) -> value(Forms, T, Stack, Opts);
+maybe_done([], T, Stack, Opts) -> ?incomplete(maybe_done, T, Stack, Opts);
+maybe_done(Forms, T, Stack, Opts) -> ?error([Forms, T, Stack, Opts]).
+
+
+%% terminal state. the collected events are returned through ?incomplete, like
+%% every other state, so the continuation restarts with an empty accumulator
+%% and a later call with [] does not hand out the same events a second time.
+%% any further form is an error
+done([], T, [], Opts) -> ?incomplete(done, T, [], Opts);
+done(Forms, T, Stack, Opts) -> ?error([Forms, T, Stack, Opts]).
+
+
+%% json string escaping, for utf8 binaries. escape the json control sequences to
+%% their json equivalent, escape other control characters to \uXXXX sequences,
+%% everything else should be a legal json string component
+%% the accumulator type follows the input: a binary for binaries, a list for
+%% lists
+json_escape(String, Opts) when is_binary(String) ->
+    json_escape(String, Opts, <<>>);
+json_escape(String, Opts) when is_list(String) ->
+    json_escape(String, Opts, []).
+ +%% double quote +json_escape(<<$\", Rest/binary>>, Opts, Acc) -> + json_escape(Rest, Opts, <>); +json_escape([$\"|Rest], Opts, Acc) -> + json_escape(Rest, Opts, [$\", $\\] ++ Acc); +%% backslash \ reverse solidus +json_escape(<<$\\, Rest/binary>>, Opts, Acc) -> + json_escape(Rest, Opts, <>); +json_escape([$\\|Rest], Opts, Acc) -> + json_escape(Rest, Opts, [$\\, $\\] ++ Acc); +%% backspace +json_escape(<<$\b, Rest/binary>>, Opts, Acc) -> + json_escape(Rest, Opts, <>); +json_escape([$\b|Rest], Opts, Acc) -> + json_escape(Rest, Opts, [$b, $\\] ++ Acc); +%% form feed +json_escape(<<$\f, Rest/binary>>, Opts, Acc) -> + json_escape(Rest, Opts, <>); +json_escape([$\f|Rest], Opts, Acc) -> + json_escape(Rest, Opts, [$f, $\\] ++ Acc); +%% newline +json_escape(<<$\n, Rest/binary>>, Opts, Acc) -> + json_escape(Rest, Opts, <>); +json_escape([$\n|Rest], Opts, Acc) -> + json_escape(Rest, Opts, [$n, $\\] ++ Acc); +%% cr +json_escape(<<$\r, Rest/binary>>, Opts, Acc) -> + json_escape(Rest, Opts, <>); +json_escape([$\r|Rest], Opts, Acc) -> + json_escape(Rest, Opts, [$r, $\\] ++ Acc); +%% tab +json_escape(<<$\t, Rest/binary>>, Opts, Acc) -> + json_escape(Rest, Opts, <>); +json_escape([$\t|Rest], Opts, Acc) -> + json_escape(Rest, Opts, [$t, $\\] ++ Acc); +%% other control characters +json_escape(<>, Opts, Acc) when C >= 0, C < $\s -> + json_escape(Rest, + Opts, + <> + ); +json_escape([C|Rest], Opts, Acc) when C >= 0, C < $\s -> + json_escape(Rest, Opts, lists:reverse(json_escape_sequence(C)) ++ Acc); +%% escape forward slashes -- optionally -- to faciliate microsoft's retarded +%% date format +json_escape(<<$/, Rest/binary>>, Opts=#opts{escape_forward_slash=true}, Acc) -> + json_escape(Rest, Opts, <>); +json_escape([$/|Rest], Opts=#opts{escape_forward_slash=true}, Acc) -> + json_escape(Rest, Opts, [$/, $\\] ++ Acc); +%% escape u+2028 and u+2029 to avoid problems with jsonp +json_escape(<>, Opts, Acc) + when C == 16#2028; C == 16#2029 -> + json_escape(Rest, + Opts, + <> + ); 
+json_escape([C|Rest], Opts, Acc) when C =:= 16#2028; C =:= 16#2029 -> + json_escape(Rest, Opts, lists:reverse(json_escape_sequence(C)) ++ Acc); +%% any other legal codepoint +json_escape(<>, Opts, Acc) -> + json_escape(Rest, Opts, <>); +json_escape([C|Rest], Opts, Acc) -> + json_escape(Rest, Opts, [C] ++ Acc); +json_escape(<<>>, _Opts, Acc) -> + Acc; +json_escape([], _Opts, Acc) -> + lists:reverse(Acc); +json_escape(Rest, Opts, Acc) -> + erlang:error(badarg, [Rest, Opts, Acc]). + + +%% convert a codepoint to it's \uXXXX equiv. +json_escape_sequence(X) -> + <> = <>, + [$\\, $u, (to_hex(A)), (to_hex(B)), (to_hex(C)), (to_hex(D))]. + + +to_hex(15) -> $f; +to_hex(14) -> $e; +to_hex(13) -> $d; +to_hex(12) -> $c; +to_hex(11) -> $b; +to_hex(10) -> $a; +to_hex(X) -> X + $0. \ No newline at end of file diff --git a/src/jsx_utf16.erl b/include/jsx_types.hrl similarity index 53% rename from src/jsx_utf16.erl rename to include/jsx_types.hrl index 131fc52..7f24eba 100644 --- a/src/jsx_utf16.erl +++ b/include/jsx_types.hrl @@ -21,16 +21,47 @@ %% THE SOFTWARE. +-type jsx_opts() :: [jsx_opt()]. +-type jsx_opt() :: multi_term + | loose_unicode + | escape_forward_slashes + | {encoding, auto + | utf8 + | utf16 + | {utf16, little} + | utf32 + | {utf32, little} + }. + --module(jsx_utf16). +-type jsx_event() :: start_object + | end_object + | start_array + | end_array + | end_json + | {key, list()} + | {string, list()} + | {integer, integer()} + | {float, float()} + | {literal, true} + | {literal, false} + | {literal, null}. --define(utf16, true). --include("../include/jsx_common.hrl"). --include("../include/jsx_decoder.hrl"). +-type jsx_encodeable() :: jsx_event() | [jsx_encodeable()]. -%% i've noticed you've noticed that there's no source here. very astute. see -%% jsx_decoder_template.hrl in the include directory. 
any mofications to this -%% module should be made there, but keep in mind other modules also include -%% that header +-type jsx_iterator() :: jsx_scanner() | jsx_tokenizer(). + + +-type jsx_scanner() :: fun((binary()) -> jsx_iterator_result()). + + +-type jsx_tokenizer() :: fun((jsx_encodeable()) -> jsx_iterator_result()). + + +-type jsx_iterator_result() :: + {jsx, jsx_event(), fun(() -> jsx_iterator_result())} + | {jsx, [jsx_event()], fun(() -> jsx_iterator_result())} + | {jsx, incomplete, jsx_iterator()} + | {error, {badjson, any()}}. \ No newline at end of file diff --git a/src/jsx.app.src b/src/jsx.app.src index 41a170b..3419ac1 100644 --- a/src/jsx.app.src +++ b/src/jsx.app.src @@ -4,15 +4,8 @@ {vsn, "0.10.0"}, {modules, [ jsx, - jsx_encoder, - jsx_utf8, - jsx_utf16, - jsx_utf16le, - jsx_utf32, - jsx_utf32le, - jsx_terms, - jsx_format, - jsx_verify, + jsx_tokenizer, + jsx_scanner, jsx_utils ]}, {registered, []}, diff --git a/src/jsx.erl b/src/jsx.erl index 0d63ec7..eed0705 100644 --- a/src/jsx.erl +++ b/src/jsx.erl @@ -25,149 +25,23 @@ %% the core parser api --export([parser/0, parser/1]). --export([decoder/0, decoder/1]). --export([encoder/0, encoder/1]). --export([term_to_json/1, term_to_json/2]). --export([json_to_term/1, json_to_term/2]). --export([is_json/1, is_json/2]). --export([format/1, format/2]). +-export([scanner/0, scanner/1]). + +-include("../include/jsx_types.hrl"). --include("../include/jsx_common.hrl"). +-spec scanner() -> jsx_scanner(). +scanner() -> scanner([]). - --spec parser() -> jsx_decoder(). - -parser() -> decoder([]). - - --spec parser(OptsList::jsx_opts()) -> jsx_decoder(). - -parser(OptsList) -> decoder(OptsList). - - --spec decoder() -> jsx_decoder(). - -decoder() -> decoder([]). - - --spec decoder(OptsList::jsx_opts()) -> jsx_decoder(). 
- - -decoder(OptsList) -> - case parse_opts(OptsList) of - {error, badarg} -> {error, badarg} - ; Opts -> - case Opts#opts.encoding of - utf8 -> jsx_utf8:decoder(Opts) - ; utf16 -> jsx_utf16:decoder(Opts) - ; utf32 -> jsx_utf32:decoder(Opts) - ; {utf16, little} -> jsx_utf16le:decoder(Opts) - ; {utf32, little} -> jsx_utf32le:decoder(Opts) - ; auto -> jsx_utils:detect_encoding(Opts) - ; _ -> {error, badarg} - end +-spec scanner(OptsList::jsx_opts()) -> jsx_scanner(). +scanner(OptsList) -> + fun(Stream) when is_binary(Stream) -> + (jsx_scanner:scanner(OptsList))(Stream) + ; (Stream) when is_list(Stream); is_tuple(Stream) -> + (jsx_tokenizer:tokenizer(OptsList))(Stream) end. --spec encoder() -> jsx_encoder(). - -encoder() -> encoder([]). - - --spec encoder(OptsList::jsx_opts()) -> jsx_encoder(). - -encoder(OptsList) -> - case parse_opts(OptsList) of - {error, badarg} -> {error, badarg} - ; Opts -> jsx_encoder:encoder(Opts) - end. - - --spec json_to_term(JSON::binary()) -> jsx_term(). - -json_to_term(JSON) -> - try json_to_term(JSON, []) - %% rethrow exception so internals aren't confusingly exposed to users - catch error:badarg -> erlang:error(badarg, [JSON]) - end. - - --spec json_to_term(JSON::binary(), Opts::decoder_opts()) -> jsx_term(). - -json_to_term(JSON, Opts) -> - jsx_terms:json_to_term(JSON, Opts). - - --spec term_to_json(JSON::jsx_term()) -> binary(). - -term_to_json(JSON) -> - try term_to_json(JSON, []) - %% rethrow exception so internals aren't confusingly exposed to users - catch error:badarg -> erlang:error(badarg, [JSON]) - end. - - --spec term_to_json(JSON::jsx_term(), Opts::encoder_opts()) -> binary(). - -term_to_json(JSON, Opts) -> - try jsx_terms:term_to_json(JSON, Opts) - %% rethrow exception so internals aren't confusingly exposed to users - catch error:badarg -> erlang:error(badarg, [JSON, Opts]) - end. - - --spec is_json(JSON::binary()) -> true | false - ; (Terms::list(jsx_encodeable())) -> true | false. - -is_json(JSON) -> - is_json(JSON, []). 
- - --spec is_json(JSON::binary(), Opts::verify_opts()) -> true | false - ; (Terms::list(jsx_encodeable()), Opts::verify_opts()) -> true | false. - -is_json(JSON, Opts) -> - jsx_verify:is_json(JSON, Opts). - - --spec format(JSON::binary()) -> binary() | iolist() - ; (Terms::list(jsx_encodeable())) -> binary() | iolist(). - -format(JSON) -> - format(JSON, []). - - --spec format(JSON::binary(), Opts::format_opts()) -> binary() | iolist() - ; (Terms::list(jsx_encodeable()), Opts::format_opts()) -> - binary() | iolist(). - -format(JSON, Opts) -> - jsx_format:format(JSON, Opts). - - - -parse_opts(Opts) -> - parse_opts(Opts, #opts{}). - -parse_opts([], Opts) -> - Opts; -parse_opts([loose_unicode|Rest], Opts) -> - parse_opts(Rest, Opts#opts{loose_unicode=true}); -parse_opts([iterate|Rest], Opts) -> - parse_opts(Rest, Opts#opts{iterate=true}); -parse_opts([escape_forward_slash|Rest], Opts) -> - parse_opts(Rest, Opts#opts{escape_forward_slash=true}); -parse_opts([{encoding, Encoding}|Rest], Opts) - when Encoding =:= utf8; Encoding =:= utf16; Encoding =:= utf32; - Encoding =:= {utf16,little}; Encoding =:= {utf32,little}; - Encoding =:= auto -> - parse_opts(Rest, Opts#opts{encoding=Encoding}); -parse_opts(_, _) -> - {error, badarg}. - - -ifdef(TEST). -include_lib("eunit/include/eunit.hrl"). 
@@ -179,9 +53,9 @@ jsx_decoder_test_() -> encoder_decoder_equiv_test_() -> [ {"encoder/decoder equivalency", - ?_assert(begin {jsx, X, _} = (jsx:decoder())( + ?_assert(begin {ok, X, _} = (jsx:scanner())( <<"[\"a\", 17, 3.14, true, {\"k\":false}, []]">> - ), X end =:= begin {jsx, Y, _} = (jsx:encoder())( + ), X end =:= begin {ok, Y, _} = (jsx:scanner())( [start_array, {string, <<"a">>}, {integer, 17}, @@ -193,7 +67,8 @@ encoder_decoder_equiv_test_() -> end_object, start_array, end_array, - end_array] + end_array, + end_json] ), Y end ) } @@ -201,44 +76,14 @@ encoder_decoder_equiv_test_() -> jsx_decoder_gen([]) -> []; -jsx_decoder_gen(Tests) -> - jsx_decoder_gen(Tests, [utf8, - utf16, - {utf16, little}, - utf32, - {utf32, little} - ]). - -jsx_decoder_gen([_Test|Rest], []) -> - jsx_decoder_gen(Rest); -jsx_decoder_gen([Test|_] = Tests, [Encoding|Encodings]) -> - Name = lists:flatten(proplists:get_value(name, Test) ++ " :: " ++ - io_lib:format("~p", [Encoding]) - ), - JSON = unicode:characters_to_binary(proplists:get_value(json, Test), - unicode, - Encoding - ), +jsx_decoder_gen([Test|Rest]) -> + Name = proplists:get_value(name, Test), + JSON = proplists:get_value(json, Test), JSX = proplists:get_value(jsx, Test), Flags = proplists:get_value(jsx_flags, Test, []), - {generator, - fun() -> - [{Name ++ " iterative", - ?_assertEqual(iterative_decode(JSON, Flags), JSX)} - | {generator, - fun() -> [{Name ++ " incremental", ?_assertEqual( - incremental_decode(JSON, Flags), JSX) - } | {generator, - fun() -> - [{Name, ?_assertEqual( - decode(JSON, Flags), JSX) - } | jsx_decoder_gen(Tests, Encodings)] - end} - ] - end} - ] - end - }. + {generator, fun() -> + [{Name, ?_assertEqual(decode(JSON, Flags), JSX)} | jsx_decoder_gen(Rest)] + end}. 
load_tests(Path) ->
@@ -254,7 +99,7 @@ load_tests([Test|Rest], Dir, Acc) ->
             ParsedTest = parse_tests(TestSpec, Dir),
             load_tests(Rest, Dir, [ParsedTest] ++ Acc)
         ; {error, _Reason} ->
-            erlang:error(Test)
+            erlang:error(badarg, [[Test|Rest], Dir, Acc])
     end.


@@ -264,7 +109,7 @@ parse_tests(TestSpec, Dir) ->
 parse_tests([{json, Path}|Rest], Dir, Acc) when is_list(Path) ->
     case file:read_file(Dir ++ "/" ++ Path) of
         {ok, Bin} -> parse_tests(Rest, Dir, [{json, Bin}] ++ Acc)
-        ; _ -> erlang:error(badarg)
+        ; _ -> erlang:error(badarg, [[{json, Path}|Rest], Dir, Acc])
     end;
 parse_tests([KV|Rest], Dir, Acc) ->
     parse_tests(Rest, Dir, [KV] ++ Acc);
@@ -273,34 +118,22 @@ parse_tests([], _Dir, Acc) ->


 decode(JSON, Flags) ->
-    P = jsx:decoder(Flags),
-    case P(JSON) of
-        {error, {badjson, _}} -> {error, badjson}
-        ; {jsx, incomplete, More} ->
-            case More(end_stream) of
-                {error, {badjson, _}} -> {error, badjson}
-                ; {jsx, T, _} -> T
-            end
-        ; {jsx, T, _} -> T
+    try
+        P = jsx:scanner(Flags),
+        {ok, X, More} = P(JSON),
+        {ok, Y, _More} = More(<<" ">>),
+        V = X ++ Y,
+        case lists:reverse(V) of
+            [end_json|_] -> V
+            ; _ -> {error, badjson}
+        end
+    catch
+        error:badarg -> {error, badjson}
     end.

-
-iterative_decode(JSON, Flags) ->
-    P = jsx:decoder([iterate] ++ Flags),
-    iterative_decode_loop(P(JSON), []).
-
-iterative_decode_loop({jsx, end_json, _Next}, Acc) ->
-    lists:reverse([end_json] ++ Acc);
-iterative_decode_loop({jsx, incomplete, More}, Acc) ->
-    iterative_decode_loop(More(end_stream), Acc);
-iterative_decode_loop({jsx, E, Next}, Acc) ->
-    iterative_decode_loop(Next(), [E] ++ Acc);
-iterative_decode_loop({error, {badjson, _Error}}, _Acc) ->
-    {error, badjson}.
-

 incremental_decode(<<C:1/binary, Rest/binary>>, Flags) ->
-    P = jsx:decoder([iterate] ++ Flags),
+    P = jsx:scanner([iterate] ++ Flags),
     incremental_decode_loop(P(C), Rest, []).
incremental_decode_loop({jsx, incomplete, Next}, <<>>, Acc) -> diff --git a/src/jsx_encoder.erl b/src/jsx_encoder.erl deleted file mode 100644 index 91cf336..0000000 --- a/src/jsx_encoder.erl +++ /dev/null @@ -1,401 +0,0 @@ -%% The MIT License - -%% Copyright (c) 2011 Alisdair Sullivan - -%% Permission is hereby granted, free of charge, to any person obtaining a copy -%% of this software and associated documentation files (the "Software"), to deal -%% in the Software without restriction, including without limitation the rights -%% to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -%% copies of the Software, and to permit persons to whom the Software is -%% furnished to do so, subject to the following conditions: - -%% The above copyright notice and this permission notice shall be included in -%% all copies or substantial portions of the Software. - -%% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -%% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -%% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -%% AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -%% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -%% OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -%% THE SOFTWARE. - - --module(jsx_encoder). - - --include("../include/jsx_common.hrl"). - - --export([start/3, - list_or_object/4, - key/4, - value/4, - maybe_done/4, - bad_json/2 -]). - --export([encoder/1]). - - --spec encoder(Opts::#opts{}) -> jsx_encoder(). - -encoder(Opts) -> - case Opts#opts.iterate of - true -> - fun(Forms) -> start(Forms, iterate, Opts) end - ; false -> - fun(Forms) -> start(Forms, [], Opts) end - end. 
- - - -%% emit takes a list of `events` to present to client code and formats them -%% appropriately -emit([], {State, Rest, T, Args}) -> - erlang:apply(?MODULE, State, [Rest, T] ++ Args); -emit([incomplete], {State, Rest, T, Args}) -> - {jsx, incomplete, fun(Stream) - when is_binary(Stream) -> - erlang:apply(?MODULE, - State, - [Rest ++ Stream, T] ++ Args - ) - ; (Else) -> {error, {badjson, Else}} - end}; -emit([Event|Events], {_State, _Rest, iterate, _Args} = Next) -> - {jsx, Event, fun() -> emit(Events, Next) end}; -emit([end_json|Events], {_State, _Rest, T, _Args} = Next) -> - {jsx, lists:reverse([end_json] ++ T), fun() -> emit(Events, Next) end}; -emit([Event|Events], {State, Rest, T, Args}) -> - emit(Events, {State, Rest, [Event] ++ T, Args}). - - -bad_json(Stream, _) -> {error, {badjson, Stream}}. - - -start({string, String}, T, Opts) when is_binary(String); is_list(String) -> - emit([{string, unicode:characters_to_list(json_escape(String, Opts))}, - end_json, - incomplete - ], - {bad_json, [], T, []} - ); -start({float, Float}, T, _Opts) when is_float(Float) -> - emit([{float, Float}, end_json, incomplete], {bad_json, [], T, []}); -start({integer, Int}, T, _Opts) when is_integer(Int) -> - emit([{integer, Int}, end_json, incomplete], {bad_json, [], T, []}); -start({literal, Atom}, T, _Opts) when Atom == true; Atom == false; Atom == null -> - emit([{literal, Atom}, end_json, incomplete], {bad_json, [], T, []}); -%% third parameter is a stack to match end_foos to start_foos -start(Forms, T, Opts) -> list_or_object(Forms, T, [], Opts). - - -list_or_object([start_object|Forms], T, Stack, Opts) -> - emit([start_object], {key, Forms, T, [[object] ++ Stack, Opts]}); -list_or_object([start_array|Forms], T, Stack, Opts) -> - emit([start_array], {value, Forms, T, [[array] ++ Stack, Opts]}); -list_or_object([], T, Stack, Opts) -> - emit([incomplete], {list_or_object, [], T, [Stack, Opts]}); -list_or_object(Forms, _, _, _) -> {error, {badjson, Forms}}. 
- - -key([{key, Key}|Forms], T, Stack, Opts) when is_binary(Key); is_list(Key) -> - emit([{key, unicode:characters_to_list(json_escape(Key, Opts))}], - {value, Forms, T, [Stack, Opts]} - ); -key([end_object|Forms], T, [object|Stack], Opts) -> - emit([end_object], {maybe_done, Forms, T, [Stack, Opts]}); -key([], T, Stack, Opts) -> - emit([incomplete], {key, [], T, [Stack, Opts]}); -key(Forms, _, _, _) -> {error, {badjson, Forms}}. - - -value([{string, S}|Forms], T, Stack, Opts) when is_binary(S); is_list(S) -> - emit([{string, unicode:characters_to_list(json_escape(S, Opts))}], - {maybe_done, Forms, T, [Stack, Opts]} - ); -value([{float, F}|Forms], T, Stack, Opts) when is_float(F) -> - emit([{float, F}], {maybe_done, Forms, T, [Stack, Opts]}); -value([{integer, I}|Forms], T, Stack, Opts) when is_integer(I) -> - emit([{integer, I}], {maybe_done, Forms, T, [Stack, Opts]}); -value([{literal, L}|Forms], T, Stack, Opts) - when L == true; L == false; L == null -> - emit([{literal, L}], {maybe_done, Forms, T, [Stack, Opts]}); -value([start_object|Forms], T, Stack, Opts) -> - emit([start_object], {key, Forms, T, [[object] ++ Stack, Opts]}); -value([start_array|Forms], T, Stack, Opts) -> - emit([start_array], {value, Forms, T, [[array] ++ Stack, Opts]}); -value([end_array|Forms], T, [array|Stack], Opts) -> - emit([end_array], {maybe_done, Forms, T, [Stack, Opts]}); -value([], T, Stack, Opts) -> - emit([incomplete], {value, [], T, [Stack, Opts]}); -value(Forms, _, _, _) -> {error, {badjson, Forms}}. 
- - -maybe_done([], T, [], _Opts) -> - emit([end_json, incomplete], {bad_json, [], T, []}); -maybe_done([end_json], T, [], _Opts) -> - emit([end_json, incomplete], {bad_json, [], T, []}); -maybe_done([end_object|Forms], T, [object|Stack], Opts) -> - emit([end_object], {maybe_done, Forms, T, [Stack, Opts]}); -maybe_done([end_array|Forms], T, [array|Stack], Opts) -> - emit([end_array], {maybe_done, Forms, T, [Stack, Opts]}); -maybe_done(Forms, T, [object|_] = Stack, Opts) -> key(Forms, T, Stack, Opts); -maybe_done(Forms, T, [array|_] = Stack, Opts) -> value(Forms, T, Stack, Opts); -maybe_done([], T, Stack, Opts) -> - emit([incomplete], {maybe_done, [], T, [Stack, Opts]}); -maybe_done(Forms, _, _, _) -> {error, {badjson, Forms}}. - - - -%% json string escaping, for utf8 binaries. escape the json control sequences to -%% their json equivalent, escape other control characters to \uXXXX sequences, -%% everything else should be a legal json string component -json_escape(String, Opts) when is_binary(String) -> - json_escape(String, Opts, <<>>); -json_escape(String, Opts) when is_list(String) -> - json_escape(String, Opts, []). 
- -%% double quote -json_escape(<<$\", Rest/binary>>, Opts, Acc) -> - json_escape(Rest, Opts, <>); -json_escape([$\"|Rest], Opts, Acc) -> - json_escape(Rest, Opts, [$\", $\\] ++ Acc); -%% backslash \ reverse solidus -json_escape(<<$\\, Rest/binary>>, Opts, Acc) -> - json_escape(Rest, Opts, <>); -json_escape([$\\|Rest], Opts, Acc) -> - json_escape(Rest, Opts, [$\\, $\\] ++ Acc); -%% backspace -json_escape(<<$\b, Rest/binary>>, Opts, Acc) -> - json_escape(Rest, Opts, <>); -json_escape([$\b|Rest], Opts, Acc) -> - json_escape(Rest, Opts, [$b, $\\] ++ Acc); -%% form feed -json_escape(<<$\f, Rest/binary>>, Opts, Acc) -> - json_escape(Rest, Opts, <>); -json_escape([$\f|Rest], Opts, Acc) -> - json_escape(Rest, Opts, [$f, $\\] ++ Acc); -%% newline -json_escape(<<$\n, Rest/binary>>, Opts, Acc) -> - json_escape(Rest, Opts, <>); -json_escape([$\n|Rest], Opts, Acc) -> - json_escape(Rest, Opts, [$n, $\\] ++ Acc); -%% cr -json_escape(<<$\r, Rest/binary>>, Opts, Acc) -> - json_escape(Rest, Opts, <>); -json_escape([$\r|Rest], Opts, Acc) -> - json_escape(Rest, Opts, [$r, $\\] ++ Acc); -%% tab -json_escape(<<$\t, Rest/binary>>, Opts, Acc) -> - json_escape(Rest, Opts, <>); -json_escape([$\t|Rest], Opts, Acc) -> - json_escape(Rest, Opts, [$t, $\\] ++ Acc); -%% other control characters -json_escape(<>, Opts, Acc) when C >= 0, C < $\s -> - json_escape(Rest, - Opts, - <> - ); -json_escape([C|Rest], Opts, Acc) when C >= 0, C < $\s -> - json_escape(Rest, Opts, lists:reverse(json_escape_sequence(C)) ++ Acc); -%% escape forward slashes -- optionally -- to faciliate microsoft's retarded -%% date format -json_escape(<<$/, Rest/binary>>, Opts=#opts{escape_forward_slash=true}, Acc) -> - json_escape(Rest, Opts, <>); -json_escape([$/|Rest], Opts=#opts{escape_forward_slash=true}, Acc) -> - json_escape(Rest, Opts, [$/, $\\] ++ Acc); -%% escape u+2028 and u+2029 to avoid problems with jsonp -json_escape(<>, Opts, Acc) - when C == 16#2028; C == 16#2029 -> - json_escape(Rest, - Opts, - <> - ); 
-json_escape([C|Rest], Opts, Acc) when C =:= 16#2028; C =:= 16#2029 -> - json_escape(Rest, Opts, lists:reverse(json_escape_sequence(C)) ++ Acc); -%% any other legal codepoint -json_escape(<>, Opts, Acc) -> - json_escape(Rest, Opts, <>); -json_escape([C|Rest], Opts, Acc) -> - json_escape(Rest, Opts, [C] ++ Acc); -json_escape(<<>>, _Opts, Acc) -> - Acc; -json_escape([], _Opts, Acc) -> - lists:reverse(Acc); -json_escape(_, _, _) -> - erlang:error(badarg). - - -%% convert a codepoint to it's \uXXXX equiv. -json_escape_sequence(X) -> - <> = <>, - [$\\, $u, (to_hex(A)), (to_hex(B)), (to_hex(C)), (to_hex(D))]. - - -to_hex(15) -> $f; -to_hex(14) -> $e; -to_hex(13) -> $d; -to_hex(12) -> $c; -to_hex(11) -> $b; -to_hex(10) -> $a; -to_hex(X) -> X + $0. - - - --ifdef(TEST). --include_lib("eunit/include/eunit.hrl"). - - -encode(Terms) -> - encode_simple(Terms) andalso encode_iterative(Terms). - - -encode_simple(Terms) -> - case (jsx:encoder([]))(Terms) of - {jsx, Terms, _} -> - true - %% matches [foo, end_json], aka naked terms - ; {jsx, [Terms, end_json], _} -> - true - ; {error, _} -> - false - end. - - -encode_iterative(Terms) -> - case loop((jsx:encoder([iterate]))(Terms), []) of - {ok, Terms} -> - true - %% matches naked terms - ; {ok, [Terms, end_json]} -> - true - ; {error, _} -> - false - end. - -loop({jsx, end_json, Next}, Acc) -> - {jsx, incomplete, F} = Next(), - {error, _} = F([]), - {ok, lists:reverse([end_json] ++ Acc)}; -loop({jsx, Event, Next}, Acc) -> - loop(Next(), [Event] ++ Acc). 
- - -encode_test_() -> - [ - {"empty object", ?_assert(encode([start_object, end_object, end_json]))}, - {"empty array", ?_assert(encode([start_array, end_array, end_json]))}, - {"nested empty objects", ?_assert(encode([start_object, - {key, "empty object"}, - start_object, - {key, "empty object"}, - start_object, - end_object, - end_object, - end_object, - end_json - ]))}, - {"nested empty arrays", ?_assert(encode([start_array, - start_array, - start_array, - end_array, - end_array, - end_array, - end_json - ]))}, - {"simple object", ?_assert(encode([start_object, - {key, "a"}, - {string, "hello"}, - {key, "b"}, - {integer, 1}, - {key, "c"}, - {float, 1.0}, - {key, "d"}, - {literal, true}, - end_object, - end_json - ]))}, - {"simple array", ?_assert(encode([start_array, - {string, "hello"}, - {integer, 1}, - {float, 1.0}, - {literal, true}, - end_array, - end_json - ]))}, - {"unbalanced array", ?_assertNot(encode([start_array, - end_array, - end_array, - end_json - ]))}, - {"naked string", ?_assert(encode({string, "hello"}))}, - {"naked literal", ?_assert(encode({literal, true}))}, - {"naked integer", ?_assert(encode({integer, 1}))}, - {"naked float", ?_assert(encode({float, 1.0}))} - ]. - - -binary_escape_test_() -> - [ - {"json string escaping", - ?_assert(json_escape( - <<"\"\\\b\f\n\r\t">>, #opts{} - ) =:= <<"\\\"\\\\\\b\\f\\n\\r\\t">> - ) - }, - {"json string hex escape", - ?_assert(json_escape( - <<1, 2, 3, 11, 26, 30, 31>>, #opts{} - ) =:= <<"\\u0001\\u0002\\u0003\\u000b\\u001a\\u001e\\u001f">> - ) - }, - {"jsonp protection", - ?_assert(json_escape( - <<226, 128, 168, 226, 128, 169>>, #opts{} - ) =:= <<"\\u2028\\u2029">> - ) - }, - {"microsoft i hate your date format", - ?_assert(json_escape(<<"/Date(1303502009425)/">>, - #opts{escape_forward_slash=true} - ) =:= <<"\\/Date(1303502009425)\\/">> - ) - } - ]. 
- - -string_escape_test_() -> - [ - {"json string escaping", - ?_assert(json_escape( - "\"\\\b\f\n\r\t", #opts{} - ) =:= "\\\"\\\\\\b\\f\\n\\r\\t" - ) - }, - {"json string hex escape", - ?_assert(json_escape( - [1, 2, 3, 11, 26, 30, 31], #opts{} - ) =:= "\\u0001\\u0002\\u0003\\u000b\\u001a\\u001e\\u001f" - ) - }, - {"jsonp protection", - ?_assert(json_escape( - [16#2028, 16#2029], #opts{} - ) =:= "\\u2028\\u2029" - ) - }, - {"microsoft i hate your date format", - ?_assert(json_escape("/Date(1303502009425)/", - #opts{escape_forward_slash=true} - ) =:= "\\/Date(1303502009425)\\/" - ) - } - ]. - --endif. \ No newline at end of file diff --git a/src/jsx_format.erl b/src/jsx_format.erl deleted file mode 100644 index a750aa7..0000000 --- a/src/jsx_format.erl +++ /dev/null @@ -1,275 +0,0 @@ -%% The MIT License - -%% Copyright (c) 2010 Alisdair Sullivan - -%% Permission is hereby granted, free of charge, to any person obtaining a copy -%% of this software and associated documentation files (the "Software"), to deal -%% in the Software without restriction, including without limitation the rights -%% to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -%% copies of the Software, and to permit persons to whom the Software is -%% furnished to do so, subject to the following conditions: - -%% The above copyright notice and this permission notice shall be included in -%% all copies or substantial portions of the Software. - -%% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -%% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -%% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -%% AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -%% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -%% OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -%% THE SOFTWARE. - - - --module(jsx_format). - - --export([format/2]). 
- - --include("../include/jsx_common.hrl"). --include("jsx_format.hrl"). - - - --spec format(JSON::binary(), Opts::format_opts()) -> - binary() | iolist() - ; (Terms::list(jsx_encodeable()), Opts::format_opts()) -> - binary() | iolist() - ; (F::jsx_iterator(), Opts::format_opts()) -> - binary() | iolist(). - -format(JSON, OptsList) when is_binary(JSON) -> - P = jsx:decoder([iterate] ++ extract_parser_opts(OptsList)), - format(fun() -> P(JSON) end, OptsList); -format(Terms, OptsList) when is_list(Terms); is_tuple(Terms) -> - P = jsx:encoder([iterate]), - format(fun() -> P(Terms) end, OptsList); -format(F, OptsList) when is_function(F) -> - Opts = parse_opts(OptsList, #format_opts{}), - {Continue, String} = format_something(F(), Opts, 0), - case Continue() of - {jsx, end_json, _} -> encode(String, Opts) - ; _ -> {error, badarg} - end. - - -parse_opts([{indent, Val}|Rest], Opts) -> - parse_opts(Rest, Opts#format_opts{indent = Val}); -parse_opts([indent|Rest], Opts) -> - parse_opts(Rest, Opts#format_opts{indent = 1}); -parse_opts([{space, Val}|Rest], Opts) -> - parse_opts(Rest, Opts#format_opts{space = Val}); -parse_opts([space|Rest], Opts) -> - parse_opts(Rest, Opts#format_opts{space = 1}); -parse_opts([{output_encoding, Val}|Rest], Opts) -> - parse_opts(Rest, Opts#format_opts{output_encoding = Val}); -parse_opts([_|Rest], Opts) -> - parse_opts(Rest, Opts); -parse_opts([], Opts) -> - Opts. - - -extract_parser_opts(Opts) -> - extract_parser_opts(Opts, []). - -extract_parser_opts([], Acc) -> Acc; -extract_parser_opts([{K,V}|Rest], Acc) -> - case lists:member(K, [encoding]) of - true -> [{K,V}] ++ Acc - ; false -> extract_parser_opts(Rest, Acc) - end; -extract_parser_opts([K|Rest], Acc) -> - case lists:member(K, [encoding]) of - true -> [K] ++ Acc - ; false -> extract_parser_opts(Rest, Acc) - end. 
- - -format_something({jsx, start_object, Next}, Opts, Level) -> - case Next() of - {jsx, end_object, Continue} -> - {Continue, [?start_object, ?end_object]} - ; Event -> - {Continue, Object} = format_object(Event, [], Opts, Level + 1), - {Continue, [?start_object, - Object, - indent(Opts, Level), - ?end_object - ]} - end; -format_something({jsx, start_array, Next}, Opts, Level) -> - case Next() of - {jsx, end_array, Continue} -> - {Continue, [?start_array, ?end_array]} - ; Event -> - {Continue, Object} = format_array(Event, [], Opts, Level + 1), - {Continue, [?start_array, Object, indent(Opts, Level), ?end_array]} - end; -format_something({jsx, {Type, Value}, Next}, _Opts, _Level) -> - {Next, [encode(Type, Value)]}. - - -format_object({jsx, end_object, Next}, Acc, _Opts, _Level) -> - {Next, Acc}; -format_object({jsx, {key, Key}, Next}, Acc, Opts, Level) -> - {Continue, Value} = format_something(Next(), Opts, Level), - case Continue() of - {jsx, end_object, NextNext} -> - {NextNext, [Acc, - indent(Opts, Level), - encode(string, Key), - ?colon, - space(Opts), - Value - ]} - ; Else -> - format_object(Else, - [Acc, - indent(Opts, Level), - encode(string, Key), - ?colon, - space(Opts), - Value, - ?comma, - space(Opts) - ], - Opts, - Level - ) - end. - - -format_array({jsx, end_array, Next}, Acc, _Opts, _Level) -> - {Next, Acc}; -format_array(Event, Acc, Opts, Level) -> - {Continue, Value} = format_something(Event, Opts, Level), - case Continue() of - {jsx, end_array, NextNext} -> - {NextNext, [Acc, indent(Opts, Level), Value]} - ; Else -> - format_array(Else, - [Acc, - indent(Opts, Level), - Value, - ?comma, - space(Opts) - ], - Opts, - Level - ) - end. 
- - -encode(Acc, Opts) when is_list(Acc) -> - case Opts#format_opts.output_encoding of - iolist -> Acc - ; UTF when ?is_utf_encoding(UTF) -> - unicode:characters_to_binary(Acc, utf8, UTF) - ; _ -> erlang:error(badarg) - end; -encode(string, String) -> - [?quote, String, ?quote]; -encode(literal, Literal) -> - erlang:atom_to_list(Literal); -encode(integer, Integer) -> - erlang:integer_to_list(Integer); -encode(float, Float) -> - jsx_utils:nice_decimal(Float). - - -indent(Opts, Level) -> - case Opts#format_opts.indent of - 0 -> [] - ; X when X > 0 -> - Indent = [ ?space || _ <- lists:seq(1, X) ], - indent(Indent, Level, [?newline]) - end. - -indent(_Indent, 0, Acc) -> - Acc; -indent(Indent, N, Acc) -> - indent(Indent, N - 1, [Acc, Indent]). - - -space(Opts) -> - case Opts#format_opts.space of - 0 -> [] - ; X when X > 0 -> [ ?space || _ <- lists:seq(1, X) ] - end. - - -%% eunit tests - --ifdef(TEST). --include_lib("eunit/include/eunit.hrl"). - -minify_test_() -> - [ - {"minify object", - ?_assert(format(<<" { \"key\" :\n\t \"value\"\r\r\r\n } ">>, - [] - ) =:= <<"{\"key\":\"value\"}">> - ) - }, - {"minify array", - ?_assert(format(<<" [\n\ttrue,\n\tfalse , \n \tnull\n] ">>, - [] - ) =:= <<"[true,false,null]">> - ) - } - ]. 
- -opts_test_() -> - [ - {"unspecified indent/space", - ?_assert(format(<<" [\n\ttrue,\n\tfalse,\n\tnull\n] ">>, - [space, indent] - ) =:= <<"[\n true, \n false, \n null\n]">> - ) - }, - {"specific indent/space", - ?_assert(format( - <<"\n{\n\"key\" : [],\n\"another key\" : true\n}\n">>, - [{space, 2}, {indent, 3}] - ) =:= <<"{\n \"key\": [], \n \"another key\": true\n}">> - ) - }, - {"nested structures", - ?_assert(format( - <<"[{\"key\":\"value\", - \"another key\": \"another value\" - }, - [[true, false, null]] - ]">>, - [{space, 2}, {indent, 2}] - ) =:= <<"[\n {\n \"key\": \"value\", \n \"another key\": \"another value\"\n }, \n [\n [\n true, \n false, \n null\n ]\n ]\n]">> - ) - }, - {"just spaces", - ?_assert(format(<<"[1,2,3]">>, - [{space, 2}] - ) =:= <<"[1, 2, 3]">> - ) - }, - {"just indent", - ?_assert(format(<<"[1.0, 2.0, 3.0]">>, - [{indent, 2}] - ) =:= <<"[\n 1.0,\n 2.0,\n 3.0\n]">> - ) - } - ]. - -terms_test_() -> - [ - {"terms", - ?_assert(format([start_object, - {key, <<"key">>}, - {string, <<"value">>}, - end_object - ], []) =:= <<"{\"key\":\"value\"}">> - )} - ]. - --endif. \ No newline at end of file diff --git a/src/jsx_scanner.erl b/src/jsx_scanner.erl new file mode 100644 index 0000000..733f6f6 --- /dev/null +++ b/src/jsx_scanner.erl @@ -0,0 +1,188 @@ +%% The MIT License + +%% Copyright (c) 2010 Alisdair Sullivan + +%% Permission is hereby granted, free of charge, to any person obtaining a copy +%% of this software and associated documentation files (the "Software"), to deal +%% in the Software without restriction, including without limitation the rights +%% to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +%% copies of the Software, and to permit persons to whom the Software is +%% furnished to do so, subject to the following conditions: + +%% The above copyright notice and this permission notice shall be included in +%% all copies or substantial portions of the Software. 
+ +%% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +%% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +%% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +%% AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +%% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +%% OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +%% THE SOFTWARE. + + +-module(jsx_scanner). + +-export([scanner/1]). + +-include("../include/jsx_types.hrl"). + +-spec scanner(Opts::jsx_opts()) -> jsx_scanner(). +scanner(Opts) -> + fun(JSON) -> start(JSON, [], [], parse_opts(Opts)) end. + +-include("../include/jsx_opts.hrl"). + +-include("../include/jsx_scanner.hrl"). + +-ifdef(TEST). +-include_lib("eunit/include/eunit.hrl"). + + +noncharacters_test_() -> + [ + {"noncharacters - badjson", + ?_assertEqual(check_bad(noncharacters()), []) + }, + {"noncharacters - replaced", + ?_assertEqual(check_replaced(noncharacters()), []) + } + ]. + +extended_noncharacters_test_() -> + [ + {"extended noncharacters - badjson", + ?_assertEqual(check_bad(extended_noncharacters()), []) + }, + {"extended noncharacters - replaced", + ?_assertEqual(check_replaced(extended_noncharacters()), []) + } + ]. + +surrogates_test_() -> + [ + {"surrogates - badjson", + ?_assertEqual(check_bad(surrogates()), []) + }, + {"surrogates - replaced", + ?_assertEqual(check_replaced(surrogates()), []) + } + ]. + +control_test_() -> + [ + {"control characters - badjson", + ?_assertEqual(check_bad(control_characters()), []) + } + ]. + +reserved_test_() -> + [ + {"reserved noncharacters - badjson", + ?_assertEqual(check_bad(reserved_space()), []) + }, + {"reserved noncharacters - replaced", + ?_assertEqual(check_replaced(reserved_space()), []) + } + ]. + +zero_test_() -> + [ + {"nullbyte - badjson", + ?_assertEqual(check_bad(zero()), []) + } + ]. 
+ +good_characters_test_() -> + [ + {"acceptable codepoints", + ?_assertEqual(check_good(good()), []) + }, + {"acceptable extended", + ?_assertEqual(check_good(good_extended()), []) + } + ]. + + +check_bad(List) -> + lists:dropwhile(fun({_, {error, badjson}}) -> true ; (_) -> false end, + check(List, [], []) + ). + +check_replaced(List) -> + lists:dropwhile(fun({_, [{string, [16#fffd]}|_]}) -> + true + ; (_) -> + false + end, + check(List, [loose_unicode], []) + ). + +check_good(List) -> + lists:dropwhile(fun({_, [{string, _}]}) -> true ; (_) -> false end, + check(List, [], []) + ). + +check([], _Opts, Acc) -> Acc; +check([H|T], Opts, Acc) -> + R = decode(to_fake_utf(H, utf8), Opts), + check(T, Opts, [{H, R}] ++ Acc). + + +decode(JSON, Opts) -> + try + {ok, Events, _} = (scanner(Opts))(JSON), + loop(Events, []) + catch + error:badarg -> {error, badjson} + end. + + +loop([end_json], Acc) -> lists:reverse(Acc); +loop([Event|Events], Acc) -> loop(Events, [Event] ++ Acc); +loop(_, _) -> {error, badjson}. + + + +noncharacters() -> lists:seq(16#fffe, 16#ffff). + +extended_noncharacters() -> + [16#1fffe, 16#1ffff, 16#2fffe, 16#2ffff] + ++ [16#3fffe, 16#3ffff, 16#4fffe, 16#4ffff] + ++ [16#5fffe, 16#5ffff, 16#6fffe, 16#6ffff] + ++ [16#7fffe, 16#7ffff, 16#8fffe, 16#8ffff] + ++ [16#9fffe, 16#9ffff, 16#afffe, 16#affff] + ++ [16#bfffe, 16#bffff, 16#cfffe, 16#cffff] + ++ [16#dfffe, 16#dffff, 16#efffe, 16#effff] + ++ [16#ffffe, 16#fffff, 16#10fffe, 16#10ffff]. + +surrogates() -> lists:seq(16#d800, 16#dfff). + +control_characters() -> lists:seq(1, 31). + +reserved_space() -> lists:seq(16#fdd0, 16#fdef). + +zero() -> [0]. + +good() -> [32, 33] + ++ lists:seq(16#23, 16#5b) + ++ lists:seq(16#5d, 16#d7ff) + ++ lists:seq(16#e000, 16#fdcf) + ++ lists:seq(16#fdf0, 16#fffd). + +good_extended() -> lists:seq(16#100000, 16#10fffd). 
+ +%% erlang refuses to encode certain codepoints, so fake them all +to_fake_utf(N, utf8) when N < 16#0080 -> <<34/utf8, N:8, 34/utf8>>; +to_fake_utf(N, utf8) when N < 16#0800 -> + <<0:5, Y:5, X:6>> = <>, + <<34/utf8, 2#110:3, Y:5, 2#10:2, X:6, 34/utf8>>; +to_fake_utf(N, utf8) when N < 16#10000 -> + <> = <>, + <<34/utf8, 2#1110:4, Z:4, 2#10:2, Y:6, 2#10:2, X:6, 34/utf8>>; +to_fake_utf(N, utf8) -> + <<0:3, W:3, Z:6, Y:6, X:6>> = <>, + <<34/utf8, 2#11110:5, W:3, 2#10:2, Z:6, 2#10:2, Y:6, 2#10:2, X:6, 34/utf8>>. + + +-endif. \ No newline at end of file diff --git a/src/jsx_terms.erl b/src/jsx_terms.erl deleted file mode 100644 index 9b6da8a..0000000 --- a/src/jsx_terms.erl +++ /dev/null @@ -1,500 +0,0 @@ -%% The MIT License - -%% Copyright (c) 2010 Alisdair Sullivan - -%% Permission is hereby granted, free of charge, to any person obtaining a copy -%% of this software and associated documentation files (the "Software"), to deal -%% in the Software without restriction, including without limitation the rights -%% to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -%% copies of the Software, and to permit persons to whom the Software is -%% furnished to do so, subject to the following conditions: - -%% The above copyright notice and this permission notice shall be included in -%% all copies or substantial portions of the Software. - -%% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -%% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -%% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -%% AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -%% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -%% OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -%% THE SOFTWARE. - - --module(jsx_terms). - --export([json_to_term/2, term_to_json/2]). - --include("../include/jsx_common.hrl"). 
- - --record(decoder_opts, { - strict = false, - encoding = auto, - repeatable_keys = true -}). - - --spec json_to_term(JSON::binary(), OptsList::decoder_opts()) -> - jsx_term() | {jsx, incomplete, fun()}. - -json_to_term(JSON, OptsList) -> - Opts = parse_opts(OptsList, #decoder_opts{}), - P = jsx:decoder([{encoding, Opts#decoder_opts.encoding}]), - case Opts#decoder_opts.strict of - true -> collect_strict(P(JSON), [[]], Opts) - ; false -> collect(P(JSON), [[]], Opts) - end. - - -%% the jsx formatter (pretty printer) can do most of the heavy lifting in -%% converting erlang terms to json strings - --record(encoder_opts, { - strict = false, - encoding = auto, - formatter_opts = [] -}). - - --spec term_to_json(JSON::jsx_term(), OptsList::encoder_opts()) -> - binary() | {jsx, incomplete, fun()}. - -term_to_json(List, OptsList) -> - Opts = parse_opts(OptsList, #encoder_opts{}), - case Opts#encoder_opts.strict of - true when is_list(List) -> continue - ; true -> erlang:error(badarg) - ; false -> continue - end, - case term_to_events(List) of - L when is_tuple(L) -> - jsx:format(L, Opts#encoder_opts.formatter_opts) - ; L when is_list(L) -> - jsx:format(lists:reverse(L), Opts#encoder_opts.formatter_opts) - end. 
- - -parse_opts([{strict, Val}|Rest], Opts = #decoder_opts{}) - when Val =:= true; Val =:= false -> - parse_opts(Rest, Opts#decoder_opts{strict = Val}); -parse_opts([strict|Rest], Opts = #decoder_opts{}) -> - parse_opts(Rest, Opts#decoder_opts{strict = true}); -parse_opts([{strict, Val}|Rest], Opts = #encoder_opts{}) - when Val =:= true; Val =:= false -> - parse_opts(Rest, Opts#encoder_opts{strict = Val}); -parse_opts([strict|Rest], Opts = #encoder_opts{}) -> - parse_opts(Rest, Opts#encoder_opts{strict = true}); -parse_opts([{encoding, Val}|Rest], Opts = #decoder_opts{}) - when Val =:= auto; Val =:= utf8; - Val =:= utf16; Val =:= {utf16,little}; - Val =:= utf32; Val =:= {utf32,little} -> - parse_opts(Rest, Opts#decoder_opts{encoding = Val}); -parse_opts([encoding|Rest], Opts = #decoder_opts{}) -> - parse_opts(Rest, Opts#decoder_opts{encoding = auto}); -parse_opts([{encoding, Val}|Rest], Opts = #encoder_opts{}) - when Val =:= auto; Val =:= utf8; - Val =:= utf16; Val =:= {utf16,little}; - Val =:= utf32; Val =:= {utf32,little} -> - parse_opts(Rest, Opts#encoder_opts{encoding = Val}); -parse_opts([encoding|Rest], Opts = #encoder_opts{}) -> - parse_opts(Rest, Opts#encoder_opts{encoding = auto}); -parse_opts([{indent, Val}|Rest], Opts = #encoder_opts{formatter_opts = F}) - when is_integer(Val) -> - parse_opts(Rest, Opts#encoder_opts{formatter_opts = [{indent, Val}] ++ F}); -parse_opts([indent|Rest], Opts = #encoder_opts{formatter_opts = F}) -> - parse_opts(Rest, Opts#encoder_opts{formatter_opts = [{indent, 1}] ++ F}); -parse_opts([{space, Val}|Rest], Opts = #encoder_opts{formatter_opts = F}) - when is_integer(Val) -> - parse_opts(Rest, Opts#encoder_opts{formatter_opts = [{space, Val}] ++ F}); -parse_opts([space|Rest], Opts = #encoder_opts{formatter_opts = F}) -> - parse_opts(Rest, Opts#encoder_opts{formatter_opts = [{space, 1}] ++ F}); -parse_opts([{output_encoding, Val}|Rest], Opts = #encoder_opts{formatter_opts = F}) - when Val =:= utf8; - Val =:= utf16; Val =:= 
{utf16,little}; - Val =:= utf32; Val =:= {utf32,little} -> - parse_opts(Rest, Opts#encoder_opts{formatter_opts = [{output_encoding, Val}] ++ F}); -parse_opts([{repeatable_keys, Val}|Rest], Opts = #decoder_opts{}) - when Val =:= true; Val =:= false -> - parse_opts(Rest, Opts#decoder_opts{repeatable_keys = Val}); -parse_opts([repeatable_keys|Rest], Opts = #decoder_opts{}) -> - parse_opts(Rest, Opts#decoder_opts{repeatable_keys = true}); -parse_opts([], Opts) -> - Opts. - - -%% ensure the first jsx event we get is start_object or start_array when running -%% in strict mode -collect_strict({jsx, [Start|Next], Next}, Acc, Opts) - when Start =:= start_object; Start =:= start_array -> - collect(Next, [[]|Acc], Opts); -collect_strict(_, _, _) -> erlang:error(badarg). - - -%% collect decoder events and convert to eep0018 format -collect({jsx, [Start|Next], _}, Acc, Opts) - when Start =:= start_object; Start =:= start_array -> - collect(Next, [[]|Acc], Opts); -collect({jsx, [Event, end_json], _}, _, Opts) -> - event(Event, Opts); -collect([Start|Next], Acc, Opts) - when Start =:= start_object; Start =:= start_array -> - collect(Next, [[]|Acc], Opts); -%% special case for empty object -collect([end_object|Next], [[], Parent|Rest], Opts) when is_list(Parent) -> - collect(Next, [[[{}]] ++ Parent] ++ Rest, Opts); -%% reverse the array/object accumulator before prepending it to it's parent -collect([end_object|Next], [Current, Parent|Rest], Opts) - when is_list(Parent) -> - collect(Next, [[lists:reverse(Current)] ++ Parent] ++ Rest, Opts); -collect([end_array|Next], [Current, Parent|Rest], Opts) - when is_list(Parent) -> - collect(Next, [[lists:reverse(Current)] ++ Parent] ++ Rest, Opts); -%% special case for empty object -collect([end_object|Next], [[], Key, Parent|Rest], Opts) -> - collect(Next, [[{Key, [{}]}] ++ Parent] ++ Rest, Opts); -collect([End|Next], [Current, Key, Parent|Rest], Opts) - when End =:= end_object; End =:= end_array -> - collect(Next, [[{Key, 
lists:reverse(Current)}] ++ Parent] ++ Rest, Opts); -collect([end_json], [[Acc]], _Opts) -> - Acc; -%% key can only be emitted inside of a json object, so just insert it directly -%% into the head of the accumulator and deal with it when we receive it's -%% paired value -collect([{key, _} = PreKey|Next], Acc, Opts) -> - Key = event(PreKey, Opts), - collect(Next, [Key] ++ Acc, Opts); -%% if our returned event is {jsx, incomplete, ...} try to force end and return -%% the Event if one is returned, else just return {incomplete, More/1} -collect({jsx, incomplete, More}, _Acc, Opts) -> - case More(end_stream) of - {jsx, [Event, end_json], _Next} -> event(Event, Opts) - ; _ -> {incomplete, More} - end; -%% check acc to see if we're inside an object or an array. because inside an -%% object context the events that fall this far are always preceded by a key -%% (which are binaries or atoms), if Current is a list, we're inside an array, -%% else, an object -collect([Event|Next], [Current|Rest], Opts) when is_list(Current) -> - collect(Next, [[event(Event, Opts)] ++ Current] ++ Rest, Opts); -%% delete any prior uses of current key -collect([Event|Next], [Key, Current|Rest], Opts=#decoder_opts{repeatable_keys=false}) -> - case proplists:is_defined(Key, Current) of - true -> - Acc = proplists:delete(Key, Current), - collect(Next, - [[{Key, event(Event, Opts)}] ++ Acc] ++ Rest, - Opts - ) - ; _ -> - collect(Next, - [[{Key, event(Event, Opts)}] ++ Current] ++ Rest, - Opts - ) - end; -collect([Event|Next], [Key, Current|Rest], Opts) -> - collect(Next, [[{Key, event(Event, Opts)}] ++ Current] ++ Rest, Opts); -%% any other event is an error -collect(_, _, _) -> erlang:error(badarg). 
- - -%% helper functions for converting jsx events to term format -event({string, String}, _Opts) -> unicode:characters_to_binary(String); -event({key, Key}, _Opts) -> unicode:characters_to_binary(Key); -event({integer, Integer}, _Opts) -> Integer; -event({float, Float}, _Opts) -> Float; -event({literal, Literal}, _Opts) -> Literal. - - -%% convert term format representation to jsx events. note special casing for the -%% empty object -term_to_events([{}]) -> - [end_object, start_object]; -term_to_events([First|_] = List) when is_tuple(First) -> - proplist_to_events(List, [start_object]); -term_to_events(List) when is_list(List) -> - list_to_events(List, [start_array]); -term_to_events(Term) -> - [Res] = term_to_event(Term), - Res. - - -proplist_to_events([{Key, Term}|Rest], Acc) -> - Event = term_to_event(Term), - EncodedKey = key_to_event(Key), - proplist_to_events(Rest, Event ++ EncodedKey ++ Acc); -proplist_to_events([], Acc) -> - [end_object] ++ Acc; -proplist_to_events(_, _) -> - erlang:error(badarg). - - -list_to_events([Term|Rest], Acc) -> - list_to_events(Rest, term_to_event(Term) ++ Acc); -list_to_events([], Acc) -> - [end_array] ++ Acc. - - -term_to_event(List) when is_list(List) -> - term_to_events(List); -term_to_event(Float) when is_float(Float) -> - [{float, Float}]; -term_to_event(Integer) when is_integer(Integer) -> - [{integer, Integer}]; -term_to_event(String) when is_binary(String) -> - [{string, json_escape(String)}]; -term_to_event(true) -> [{literal, true}]; -term_to_event(false) -> [{literal, false}]; -term_to_event(null) -> [{literal, null}]; -term_to_event(_) -> erlang:error(badarg). - - -key_to_event(Key) when is_binary(Key) -> - [{key, json_escape(Key)}]. - - -%% json string escaping, for utf8 binaries. escape the json control sequences to -%% their json equivalent, escape other control characters to \uXXXX sequences, -%% everything else should be a legal json string component -json_escape(String) -> - json_escape(String, <<>>). 
- -%% double quote -json_escape(<<$\", Rest/binary>>, Acc) -> - json_escape(Rest, <>); -%% backslash \ reverse solidus -json_escape(<<$\\, Rest/binary>>, Acc) -> - json_escape(Rest, <>); -%% backspace -json_escape(<<$\b, Rest/binary>>, Acc) -> - json_escape(Rest, <>); -%% form feed -json_escape(<<$\f, Rest/binary>>, Acc) -> - json_escape(Rest, <>); -%% newline -json_escape(<<$\n, Rest/binary>>, Acc) -> - json_escape(Rest, <>); -%% cr -json_escape(<<$\r, Rest/binary>>, Acc) -> - json_escape(Rest, <>); -%% tab -json_escape(<<$\t, Rest/binary>>, Acc) -> - json_escape(Rest, <>); -%% other control characters -json_escape(<>, Acc) when C >= 0, C < $\s -> - json_escape(Rest, <>); -%% escape u+2028 and u+2029 to avoid problems with jsonp -json_escape(<>, Acc) when C == 16#2028; C == 16#2029 -> - json_escape(Rest, <>); -%% any other legal codepoint -json_escape(<>, Acc) -> - json_escape(Rest, <>); -json_escape(<<>>, Acc) -> - Acc; -json_escape(_, _) -> - erlang:error(badarg). - - -%% convert a codepoint to it's \uXXXX equiv. -json_escape_sequence(X) -> - <> = <>, - <<$\\, $u, (to_hex(A)), (to_hex(B)), (to_hex(C)), (to_hex(D))>>. - - -to_hex(15) -> $f; -to_hex(14) -> $e; -to_hex(13) -> $d; -to_hex(12) -> $c; -to_hex(11) -> $b; -to_hex(10) -> $a; -to_hex(X) -> X + $0. - - - -%% eunit tests --ifdef(TEST). --include_lib("eunit/include/eunit.hrl"). 
- -decode_test_() -> - [ - {"empty object", ?_assert(json_to_term(<<"{}">>, []) =:= [{}])}, - {"empty array", ?_assert(json_to_term(<<"[]">>, []) =:= [])}, - {"simple object", - ?_assert(json_to_term( - <<"{\"a\": true, \"b\": true, \"c\": true}">>, [] - ) =:= [{<<"a">>, true}, {<<"b">>, true}, {<<"c">>, true}] - ) - }, - {"simple array", - ?_assert(json_to_term(<<"[true,true,true]">>, - [] - ) =:= [true, true, true] - ) - }, - {"nested structures", - ?_assert(json_to_term( - <<"{\"x\":[{\"x\":[{}, {}],\"y\":{}}, []],\"y\":{}}">>, [] - ) =:= [{<<"x">>, - [[{<<"x">>, [[{}], [{}]]}, {<<"y">>, [{}]}],[]]}, - {<<"y">>, [{}]} - ] - ) - }, - {"numbers", - ?_assert(json_to_term( - <<"[-100000000.0, -1, 0.0, 0, 1, 100000000, 10000000.0]">>, - [] - ) =:= [-100000000.0, -1, 0.0, 0, 1, 100000000, 10000000.0] - ) - }, - {"strings", - ?_assert(json_to_term(<<"[\"a string\"]">>, - [] - ) =:= [<<"a string">>]) - }, - {"literals", - ?_assert(json_to_term(<<"[true,false,null]">>, - [] - ) =:= [true,false,null] - ) - }, - {"naked true", - ?_assert(json_to_term(<<"true">>, []) =:= true) - }, - {"naked short number", - ?_assert(json_to_term(<<"1">>, []) =:= 1) - }, - {"naked float", ?_assert(json_to_term(<<"1.0">>, []) =:= 1.0)}, - {"naked string", - ?_assert(json_to_term(<<"\"hello world\"">>, - [] - ) =:= <<"hello world">> - ) - }, - {"strict mode", ?_assertError(badarg, json_to_term(<<"1.0">>, - [{strict, true}] - ) - )} - ]. 
- -encode_test_() -> - [ - {"empty object", ?_assert(term_to_json([{}], []) =:= <<"{}">>)}, - {"empty array", ?_assert(term_to_json([], []) =:= <<"[]">>)}, - {"simple object", - ?_assert(term_to_json([{<<"a">>, true}, {<<"b">>, true}], - [] - ) =:= <<"{\"a\":true,\"b\":true}">> - ) - }, - {"simple array", - ?_assert(term_to_json([true, true, true], - [] - ) =:= <<"[true,true,true]">> - ) - }, - {"nested structures", - ?_assert(term_to_json( - [{<<"x">>, - [[{<<"x">>, [[{}], [{}]]}, {<<"y">>, [{}]}],[]]}, - {<<"y">>, [{}]}], - [] - ) =:= <<"{\"x\":[{\"x\":[{},{}],\"y\":{}},[]],\"y\":{}}">> - ) - }, - {"numbers", - ?_assert(term_to_json( - [-10000000000.0, -1, 0.0, 0, 1, 10000000, 1000000000.0], - [] - ) =:= <<"[-1.0e10,-1,0.0,0,1,10000000,1.0e9]">> - ) - }, - {"strings", - ?_assert(term_to_json([<<"a string">>], - [] - ) =:= <<"[\"a string\"]">> - ) - }, - {"literals", - ?_assert(term_to_json([true,false,null], - [] - ) =:= <<"[true,false,null]">> - ) - }, - {"naked true", - ?_assert(term_to_json(true, []) =:= <<"true">>) - }, - {"naked number", - ?_assert(term_to_json(1, []) =:= <<"1">>) - }, - {"float", ?_assert(term_to_json(1.0, []) =:= <<"1.0">>)}, - {"naked string", - ?_assert(term_to_json(<<"hello world">>, []) - =:= <<"\"hello world\"">> - ) - }, - {"strict mode", ?_assertError(badarg, term_to_json(true, - [{strict, true}] - ) - )} - ]. 
- -repeated_keys_test_() -> - [ - {"simple repeated key", - ?_assert(json_to_term(<<"{\"a\":false,\"a\":true}">>, [{repeatable_keys, false}]) - =:= [{<<"a">>, true}] - ) - }, - {"simple repeated key - allowed", - ?_assert(json_to_term(<<"{\"a\":false,\"a\":true}">>, []) - =:= [{<<"a">>, false}, {<<"a">>, true}] - ) - }, - {"nested repeated key", - ?_assert(json_to_term( - <<"[{\"a\":false,\"a\":true},{\"a\":false,\"a\":true}]">>, - [{repeatable_keys, false}]) - =:= [[{<<"a">>, true}], [{<<"a">>, true}]] - ) - }, - {"nested repeated key - allowed", - ?_assert(json_to_term(<<"[{\"a\":false,\"a\":true},{\"a\":false,\"a\":true}]">>, []) - =:= [[{<<"a">>, false}, {<<"a">>, true}], [{<<"a">>, false}, {<<"a">>, true}]] - ) - }, - {"multiple keys", - ?_assert(json_to_term(<<"{\"a\":4,\"a\":3,\"a\":2,\"a\":1}">>, [{repeatable_keys, false}]) - =:= [{<<"a">>, 1}] - ) - }, - {"multiple keys", - ?_assert(json_to_term(<<"{\"a\":4,\"a\":3,\"a\":2,\"a\":1}">>, []) - =:= [{<<"a">>, 4}, {<<"a">>, 3}, {<<"a">>, 2}, {<<"a">>, 1}] - ) - } - ]. - -escape_test_() -> - [ - {"json string escaping", - ?_assert(json_escape( - <<"\"\\\b\f\n\r\t">> - ) =:= <<"\\\"\\\\\\b\\f\\n\\r\\t">> - ) - }, - {"json string hex escape", - ?_assert(json_escape( - <<1, 2, 3, 11, 26, 30, 31>> - ) =:= <<"\\u0001\\u0002\\u0003\\u000b\\u001a\\u001e\\u001f">> - ) - }, - {"jsonp protection", - ?_assert(json_escape( - <<226, 128, 168, 226, 128, 169>> - ) =:= <<"\\u2028\\u2029">> - ) - } - ]. - --endif. 
\ No newline at end of file diff --git a/src/jsx_tokenizer.erl b/src/jsx_tokenizer.erl new file mode 100644 index 0000000..7d25a3b --- /dev/null +++ b/src/jsx_tokenizer.erl @@ -0,0 +1,168 @@ +%% The MIT License + +%% Copyright (c) 2011 Alisdair Sullivan + +%% Permission is hereby granted, free of charge, to any person obtaining a copy +%% of this software and associated documentation files (the "Software"), to deal +%% in the Software without restriction, including without limitation the rights +%% to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +%% copies of the Software, and to permit persons to whom the Software is +%% furnished to do so, subject to the following conditions: + +%% The above copyright notice and this permission notice shall be included in +%% all copies or substantial portions of the Software. + +%% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +%% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +%% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +%% AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +%% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +%% OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +%% THE SOFTWARE. + + +-module(jsx_tokenizer). + + +-include("../include/jsx_types.hrl"). + + +-export([tokenizer/1]). + + +-spec tokenizer(OptsList::jsx_opts()) -> jsx_tokenizer(). +tokenizer(OptsList) -> + fun(Forms) -> start(Forms, [], [], parse_opts(OptsList)) end. + +-include("../include/jsx_opts.hrl"). + +-include("../include/jsx_tokenizer.hrl"). + +-ifdef(TEST). +-include_lib("eunit/include/eunit.hrl"). + +encode(Terms) -> + try case (jsx:scanner([]))(Terms) of + {ok, Terms, _} -> + true + %% matches [foo, end_json], aka naked terms + ; {ok, [Terms, end_json], _} -> + true + end + catch + error:badarg -> false + end. 
+ + +encode_test_() -> + [ + {"empty object", ?_assert(encode([start_object, end_object, end_json]))}, + {"empty array", ?_assert(encode([start_array, end_array, end_json]))}, + {"nested empty objects", ?_assert(encode([start_object, + {key, "empty object"}, + start_object, + {key, "empty object"}, + start_object, + end_object, + end_object, + end_object, + end_json + ]))}, + {"nested empty arrays", ?_assert(encode([start_array, + start_array, + start_array, + end_array, + end_array, + end_array, + end_json + ]))}, + {"simple object", ?_assert(encode([start_object, + {key, "a"}, + {string, "hello"}, + {key, "b"}, + {integer, 1}, + {key, "c"}, + {float, 1.0}, + {key, "d"}, + {literal, true}, + end_object, + end_json + ]))}, + {"simple array", ?_assert(encode([start_array, + {string, "hello"}, + {integer, 1}, + {float, 1.0}, + {literal, true}, + end_array, + end_json + ]))}, + {"unbalanced array", ?_assertNot(encode([start_array, + end_array, + end_array, + end_json + ]))}, + {"naked string", ?_assert(encode({string, "hello"}))}, + {"naked literal", ?_assert(encode({literal, true}))}, + {"naked integer", ?_assert(encode({integer, 1}))}, + {"naked float", ?_assert(encode({float, 1.0}))} + ]. + + +binary_escape_test_() -> + [ + {"json string escaping", + ?_assert(json_escape( + <<"\"\\\b\f\n\r\t">>, #opts{} + ) =:= <<"\\\"\\\\\\b\\f\\n\\r\\t">> + ) + }, + {"json string hex escape", + ?_assert(json_escape( + <<1, 2, 3, 11, 26, 30, 31>>, #opts{} + ) =:= <<"\\u0001\\u0002\\u0003\\u000b\\u001a\\u001e\\u001f">> + ) + }, + {"jsonp protection", + ?_assert(json_escape( + <<226, 128, 168, 226, 128, 169>>, #opts{} + ) =:= <<"\\u2028\\u2029">> + ) + }, + {"microsoft i hate your date format", + ?_assert(json_escape(<<"/Date(1303502009425)/">>, + #opts{escape_forward_slash=true} + ) =:= <<"\\/Date(1303502009425)\\/">> + ) + } + ]. 
+ + +string_escape_test_() -> + [ + {"json string escaping", + ?_assert(json_escape( + "\"\\\b\f\n\r\t", #opts{} + ) =:= "\\\"\\\\\\b\\f\\n\\r\\t" + ) + }, + {"json string hex escape", + ?_assert(json_escape( + [1, 2, 3, 11, 26, 30, 31], #opts{} + ) =:= "\\u0001\\u0002\\u0003\\u000b\\u001a\\u001e\\u001f" + ) + }, + {"jsonp protection", + ?_assert(json_escape( + [16#2028, 16#2029], #opts{} + ) =:= "\\u2028\\u2029" + ) + }, + {"microsoft i hate your date format", + ?_assert(json_escape("/Date(1303502009425)/", + #opts{escape_forward_slash=true} + ) =:= "\\/Date(1303502009425)\\/" + ) + } + ]. + +-endif. \ No newline at end of file diff --git a/src/jsx_utf16le.erl b/src/jsx_utf16le.erl deleted file mode 100644 index 83cd2b9..0000000 --- a/src/jsx_utf16le.erl +++ /dev/null @@ -1,35 +0,0 @@ -%% The MIT License - -%% Copyright (c) 2010 Alisdair Sullivan - -%% Permission is hereby granted, free of charge, to any person obtaining a copy -%% of this software and associated documentation files (the "Software"), to deal -%% in the Software without restriction, including without limitation the rights -%% to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -%% copies of the Software, and to permit persons to whom the Software is -%% furnished to do so, subject to the following conditions: - -%% The above copyright notice and this permission notice shall be included in -%% all copies or substantial portions of the Software. - -%% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -%% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -%% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -%% AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -%% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -%% OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -%% THE SOFTWARE. - - - --module(jsx_utf16le). 
- --define(utf16le, true). - --include("../include/jsx_common.hrl"). --include("../include/jsx_decoder.hrl"). - -%% i've noticed you've noticed that there's no source here. very astute. see -%% jsx_decoder_template.hrl in the include directory. any mofications to this -%% module should be made there, but keep in mind other modules also include -%% that header \ No newline at end of file diff --git a/src/jsx_utf32.erl b/src/jsx_utf32.erl deleted file mode 100644 index 4ebb80f..0000000 --- a/src/jsx_utf32.erl +++ /dev/null @@ -1,35 +0,0 @@ -%% The MIT License - -%% Copyright (c) 2010 Alisdair Sullivan - -%% Permission is hereby granted, free of charge, to any person obtaining a copy -%% of this software and associated documentation files (the "Software"), to deal -%% in the Software without restriction, including without limitation the rights -%% to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -%% copies of the Software, and to permit persons to whom the Software is -%% furnished to do so, subject to the following conditions: - -%% The above copyright notice and this permission notice shall be included in -%% all copies or substantial portions of the Software. - -%% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -%% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -%% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -%% AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -%% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -%% OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -%% THE SOFTWARE. - - - --module(jsx_utf32). - --define(utf32, true). - --include("../include/jsx_common.hrl"). --include("../include/jsx_decoder.hrl"). - -%% i've noticed you've noticed that there's no source here. very astute. see -%% jsx_decoder_template.hrl in the include directory. 
any mofications to this -%% module should be made there, but keep in mind other modules also include -%% that header \ No newline at end of file diff --git a/src/jsx_utf32le.erl b/src/jsx_utf32le.erl deleted file mode 100644 index b6406a7..0000000 --- a/src/jsx_utf32le.erl +++ /dev/null @@ -1,35 +0,0 @@ -%% The MIT License - -%% Copyright (c) 2010 Alisdair Sullivan - -%% Permission is hereby granted, free of charge, to any person obtaining a copy -%% of this software and associated documentation files (the "Software"), to deal -%% in the Software without restriction, including without limitation the rights -%% to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -%% copies of the Software, and to permit persons to whom the Software is -%% furnished to do so, subject to the following conditions: - -%% The above copyright notice and this permission notice shall be included in -%% all copies or substantial portions of the Software. - -%% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -%% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -%% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -%% AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -%% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -%% OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -%% THE SOFTWARE. - - - --module(jsx_utf32le). - --define(utf32le, true). - --include("../include/jsx_common.hrl"). --include("../include/jsx_decoder.hrl"). - -%% i've noticed you've noticed that there's no source here. very astute. see -%% jsx_decoder_template.hrl in the include directory. 
any mofications to this -%% module should be made there, but keep in mind other modules also include -%% that header \ No newline at end of file diff --git a/src/jsx_utf8.erl b/src/jsx_utf8.erl deleted file mode 100644 index 2d327b3..0000000 --- a/src/jsx_utf8.erl +++ /dev/null @@ -1,33 +0,0 @@ -%% The MIT License - -%% Copyright (c) 2010 Alisdair Sullivan - -%% Permission is hereby granted, free of charge, to any person obtaining a copy -%% of this software and associated documentation files (the "Software"), to deal -%% in the Software without restriction, including without limitation the rights -%% to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -%% copies of the Software, and to permit persons to whom the Software is -%% furnished to do so, subject to the following conditions: - -%% The above copyright notice and this permission notice shall be included in -%% all copies or substantial portions of the Software. - -%% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -%% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -%% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -%% AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -%% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -%% OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -%% THE SOFTWARE. - - - --module(jsx_utf8). - --include("../include/jsx_common.hrl"). --include("../include/jsx_decoder.hrl"). - -%% i've noticed you've noticed that there's no source here. very astute. see -%% jsx_decoder_template.hrl in the include directory. 
any mofications to this -%% module should be made there, but keep in mind other modules also include -%% that header \ No newline at end of file diff --git a/src/jsx_utils.erl b/src/jsx_utils.erl index 28705e1..11cd92e 100644 --- a/src/jsx_utils.erl +++ b/src/jsx_utils.erl @@ -23,21 +23,7 @@ -module(jsx_utils). --export([nice_decimal/1, detect_encoding/1, detect_encoding/2, collect/1]). - - --spec collect(F::function()) -> {jsx, list(), function()}. - -collect(F) when is_function(F) -> - collect(F(), []). - -collect({error, _}, _) -> {error, badarg}; -collect({jsx, incomplete, More}, Acc) -> - {jsx, incomplete, fun(Stream) -> collect(More(Stream), Acc) end}; -collect({jsx, end_json, Next}, Acc) -> - {jsx, lists:reverse([end_json] ++ Acc), Next}; -collect({jsx, Event, Next}, Acc) -> collect(Next(), [Event] ++ Acc). - +-export([nice_decimal/1]). %% conversion of floats to 'nice' decimal output. erlang's float implementation @@ -179,102 +165,6 @@ to_ascii(15) -> "f"; to_ascii(X) -> [X + 48]. %% ascii "1" is [49], "2" is [50], etc... -%% encoding detection -%% first check to see if there's a bom, if not, use the rfc4627 method for -%% determining encoding. this function makes some assumptions about the -%% validity of the stream which may delay failure later than if an encoding is -%% explicitly provided - -detect_encoding(OptsList) -> - fun(Stream) -> detect_encoding(Stream, OptsList) end. 
- -%% utf8 bom detection -detect_encoding(<<16#ef, 16#bb, 16#bf, Rest/binary>>, Opts) -> - (jsx_utf8:decoder(Opts))(Rest); -%% utf32-little bom detection (this has to come before utf16-little or it'll -%% match that) -detect_encoding(<<16#ff, 16#fe, 0, 0, Rest/binary>>, Opts) -> - (jsx_utf32le:decoder(Opts))(Rest); -%% utf16-big bom detection -detect_encoding(<<16#fe, 16#ff, Rest/binary>>, Opts) -> - (jsx_utf16:decoder(Opts))(Rest); -%% utf16-little bom detection -detect_encoding(<<16#ff, 16#fe, Rest/binary>>, Opts) -> - (jsx_utf16le:decoder(Opts))(Rest); -%% utf32-big bom detection -detect_encoding(<<0, 0, 16#fe, 16#ff, Rest/binary>>, Opts) -> - (jsx_utf32:decoder(Opts))(Rest); - -%% utf32-little null order detection -detect_encoding(<> = JSON, Opts) when X =/= 0 -> - (jsx_utf32le:decoder(Opts))(JSON); -%% utf32-big null order detection -detect_encoding(<<0, 0, 0, X, _Rest/binary>> = JSON, Opts) when X =/= 0 -> - (jsx_utf32:decoder(Opts))(JSON); -%% utf16-little null order detection -detect_encoding(<> = JSON, Opts) when X =/= 0 -> - (jsx_utf16le:decoder(Opts))(JSON); -%% utf16-big null order detection -detect_encoding(<<0, X, _, _, _Rest/binary>> = JSON, Opts) when X =/= 0 -> - (jsx_utf16:decoder(Opts))(JSON); -%% utf8 null order detection -detect_encoding(<> = JSON, Opts) when X =/= 0, Y =/= 0 -> - (jsx_utf8:decoder(Opts))(JSON); - -%% a problem, to autodetect naked single digits' encoding, there is not enough -%% data to conclusively determine the encoding correctly. 
below is an attempt -%% to solve the problem -detect_encoding(<>, Opts) when X =/= 0 -> - {jsx, incomplete, - fun(end_stream) -> - try - {jsx, incomplete, Next} = (jsx_utf8:decoder(Opts))(<>), - Next(end_stream) - catch - error:function_clause -> {error, {badjson, <>}} - ; error:{badmatch, _} -> {error, {badjson, <>}} - end - ; (Stream) -> detect_encoding(<>, Opts) - end - }; -detect_encoding(<<0, X>>, Opts) when X =/= 0 -> - {jsx, incomplete, - fun(end_stream) -> - try - {jsx, incomplete, Next} - = (jsx_utf16:decoder(Opts))(<<0, X>>), - Next(end_stream) - catch - error:function_clause -> {error, {badjson, <<0, X>>}} - ; error:{badmatch, _} -> {error, {badjson, <>}} - end - ; (Stream) -> detect_encoding(<<0, X, Stream/binary>>, Opts) - end - }; -detect_encoding(<>, Opts) when X =/= 0 -> - {jsx, incomplete, - fun(end_stream) -> - try - {jsx, incomplete, Next} - = (jsx_utf16le:decoder(Opts))(<>), - Next(end_stream) - catch - error:function_clause -> {error, {badjson, <>}} - ; error:{badmatch, _} -> {error, {badjson, <>}} - end - ; (Stream) -> detect_encoding(<>, Opts) - end - }; - -%% not enough input, request more -detect_encoding(Bin, Opts) -> - {jsx, incomplete, - fun(end_stream) -> {error, {badjson, Bin}} - ; (Stream) -> detect_encoding(<>, Opts) - end - }. - - %% eunit tests -ifdef(TEST). -include_lib("eunit/include/eunit.hrl"). 
diff --git a/src/jsx_verify.erl b/src/jsx_verify.erl deleted file mode 100644 index 1e9a3c5..0000000 --- a/src/jsx_verify.erl +++ /dev/null @@ -1,234 +0,0 @@ -%% The MIT License - -%% Copyright (c) 2010 Alisdair Sullivan - -%% Permission is hereby granted, free of charge, to any person obtaining a copy -%% of this software and associated documentation files (the "Software"), to deal -%% in the Software without restriction, including without limitation the rights -%% to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -%% copies of the Software, and to permit persons to whom the Software is -%% furnished to do so, subject to the following conditions: - -%% The above copyright notice and this permission notice shall be included in -%% all copies or substantial portions of the Software. - -%% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -%% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -%% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -%% AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -%% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -%% OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -%% THE SOFTWARE. - - --module(jsx_verify). - --export([is_json/2]). - --include("../include/jsx_common.hrl"). - - --record(verify_opts, { - repeated_keys = true, - naked_values = true -}). - - --spec is_json(JSON::binary(), Opts::verify_opts()) -> true | false - ; (Terms::list(jsx_encodeable()), Opts::verify_opts()) -> true | false. - -is_json(JSON, OptsList) when is_binary(JSON) -> - F = jsx:decoder(extract_parser_opts(OptsList)), - verify(F(JSON), parse_opts(OptsList)); -is_json(JSON, OptsList) when is_list(JSON) -> - F = jsx:encoder(extract_parser_opts(OptsList)), - verify(F(JSON), parse_opts(OptsList)). - - -extract_parser_opts(Opts) -> - extract_parser_opts(Opts, []). 
- -extract_parser_opts([], Acc) -> Acc; -extract_parser_opts([{K,V}|Rest], Acc) -> - case lists:member(K, [encoding]) of - true -> [{K,V}] ++ Acc - ; false -> extract_parser_opts(Rest, Acc) - end; -extract_parser_opts([K|Rest], Acc) -> - case lists:member(K, [encoding]) of - true -> [K] ++ Acc - ; false -> extract_parser_opts(Rest, Acc) - end. - - -parse_opts(Opts) -> parse_opts(Opts, #verify_opts{}). - -parse_opts([{repeated_keys, Val}|Rest], Opts) - when Val =:= true; Val =:= false -> - parse_opts(Rest, Opts#verify_opts{repeated_keys = Val}); -parse_opts([repeated_keys|Rest], Opts) -> - parse_opts(Rest, Opts#verify_opts{repeated_keys = true}); -parse_opts([{naked_values, Val}|Rest], Opts) - when Val =:= true; Val =:= false -> - parse_opts(Rest, Opts#verify_opts{naked_values = Val}); -parse_opts([naked_values|Rest], Opts) -> - parse_opts(Rest, Opts#verify_opts{naked_values = true}); -parse_opts([{encoding, _}|Rest], Opts) -> - parse_opts(Rest, Opts); -parse_opts([encoding|Rest], Opts) -> - parse_opts(Rest, Opts); -parse_opts([], Opts) -> - Opts. - - -verify({error, {badjson, _}}, _Opts) -> false; -verify({jsx, incomplete, More}, Opts) -> verify(More(end_stream), Opts); -verify({jsx, [First|Rest], _}, Opts=#verify_opts{naked_values=false}) -> - case First of - start_object -> verify(Rest, Opts, []) - ; start_array -> verify(Rest, Opts, []) - ; _ -> false - end; -verify({jsx, Terms, _}, Opts) -> verify(Terms, Opts, []). 
- -verify([end_json], _Opts, _Keys) -> true; - -%% allocate new key accumulator at start_object, discard it at end_object -verify([start_object|Rest], Opts=#verify_opts{repeated_keys=false}, Keys) -> - verify(Rest, Opts, [[]] ++ Keys); -verify([end_object|Rest], Opts=#verify_opts{repeated_keys=false}, [_|Keys]) -> - verify(Rest, Opts, Keys); - -%% check to see if key has already been encountered, if not add it to the key -%% accumulator and continue, else return false -verify([{key, Key}|Rest], Opts=#verify_opts{repeated_keys=false}, [Current|Keys]) -> - case lists:member(Key, Current) of - true -> false - ; false -> verify(Rest, Opts, [[Key] ++ Current] ++ Keys) - end; - -verify([_|Rest], Opts, Keys) -> verify(Rest, Opts, Keys); - -verify(_, _, _) -> false. - - - -%% eunit tests --ifdef(TEST). --include_lib("eunit/include/eunit.hrl"). - -true_test_() -> - [ - {"empty object", ?_assert(is_json(<<"{}">>, []) =:= true)}, - {"empty array", ?_assert(is_json(<<"[]">>, []) =:= true)}, - {"whitespace", - ?_assert(is_json(<<" \n \t \r [true] \t \n\r ">>, - [] - ) =:= true - ) - }, - {"nested terms", - ?_assert(is_json( - <<"[{ \"x\": [ {}, {}, {} ], \"y\": [{}] }, {}, [[[]]]]">>, - [] - ) =:= true - ) - }, - {"numbers", - ?_assert(is_json( - <<"[ -1.0, -1, -0, 0, 1e-1, 1, 1.0, 1e1 ]">>, - [] - ) =:= true - ) - }, - {"strings", - ?_assert(is_json( - <<"[ \"a\", \"string\", \"in\", \"multiple\", \"acts\" ]">>, - [] - ) =:= true - ) - }, - {"literals", - ?_assert(is_json(<<"[ true, false, null ]">>, []) =:= true) - }, - {"nested objects", - ?_assert(is_json(<<"{\"key\": { \"key\": true}}">>, []) =:= true) - }, - {"naked true", ?_assert(is_json(<<"true">>, []) =:= true)}, - {"naked number", ?_assert(is_json(<<"1">>, []) =:= true)}, - {"naked string", - ?_assert(is_json(<<"\"i am not really json\"">>, []) =:= true) - } - ]. 
- -false_test_() -> - [ - {"unbalanced list", ?_assert(is_json(<<"[[[]]">>, []) =:= false)}, - {"trailing comma", - ?_assert(is_json(<<"[ true, false, null, ]">>, []) =:= false) - } - ]. - -repeated_keys_test_() -> - [ - {"repeated key forbidden", - ?_assert(is_json( - <<"{\"key\": true, \"key\": true}">>, - [{repeated_keys, false}] - ) =:= false - ) - }, - {"repeated key allowed", - ?_assert(is_json( - <<"{\"key\": true, \"key\": true}">>, - [{repeated_keys, true}] - ) =:= true - ) - }, - {"repeated key nested", - ?_assert(is_json( - <<"{\"a\": {\"a\": {\"a\": true, \"a\":false}}}">>, - [{repeated_keys, false}] - ) =:= false - ) - } - ]. - -naked_value_test_() -> - [ - {"naked true", - ?_assert(is_json(<<"true">>, []) =:= true) - }, - {"naked number", - ?_assert(is_json(<<"1">>, []) =:= true) - }, - {"naked string", - ?_assert(is_json(<<"\"i am not json\"">>, []) =:= true) - }, - {"naked true", - ?_assert(is_json(<<"true">>, [{naked_values, false}]) =:= false) - }, - {"naked number", - ?_assert(is_json(<<"1">>, [{naked_values, false}]) =:= false) - }, - {"naked string", - ?_assert(is_json( - <<"\"i am not json\"">>, - [{naked_values, false}] - ) =:= false - ) - } - ]. - -terms_test_() -> - [ - {"terms", - ?_assert(is_json([start_object, - {key, <<"key">>}, - {string, <<"value">>}, - end_object - ], []) =:= true - )} - ]. - --endif. \ No newline at end of file