diff --git a/src/jsx.erl b/src/jsx.erl index b6a7698..7bb1ae1 100644 --- a/src/jsx.erl +++ b/src/jsx.erl @@ -36,7 +36,7 @@ decoder(Opts) -> decoder({F, _} = Callbacks, OptsList) when is_list(OptsList), is_function(F) -> start(Callbacks, OptsList); -decoder({{Mod, Fun}, State}, OptsList) when is_list(OptsList), is_atom(Mod), is_atom(Fun) -> +decoder({Mod, Fun, State}, OptsList) when is_list(OptsList), is_atom(Mod), is_atom(Fun) -> start({fun(E, S) -> Mod:Fun(E, S) end, State}, OptsList). start(Callbacks, OptsList) -> diff --git a/src/jsx_utf16.erl b/src/jsx_utf16.erl index f78bb54..b9822e0 100644 --- a/src/jsx_utf16.erl +++ b/src/jsx_utf16.erl @@ -193,8 +193,21 @@ string(<>, Stack, Callbacks, Opts, Acc) -> escape(Rest, Stack, Callbacks, Opts, Acc); string(<>, Stack, Callbacks, Opts, Acc) when ?is_noncontrol(S) -> string(Rest, Stack, Callbacks, Opts, [S] ++ Acc); -string(Bin, Stack, Callbacks, Opts, Acc) when byte_size(Bin) < 2 -> - {incomplete, fun(Stream) -> string(<>, Stack, Callbacks, Opts, Acc) end}. +string(Bin, Stack, Callbacks, Opts, Acc) -> + case partial_utf16(Bin) of + true -> + {incomplete, fun(Stream) -> string(<>, Stack, Callbacks, Opts, Acc) end} + ; false -> + erlang:error(function_clause) + end. + +partial_utf16(<<>>) -> true; +%% this case is not strictly true, there are single bytes that should be rejected, but +%% they're rare enough they can be ignored +partial_utf16(<<_X>>) -> true; +partial_utf16(<>) when X >= 16#d8, X =< 16#df -> true; +partial_utf16(<>) when X >= 16#d8, X =< 16#df, Z >= 16#dc, Z =< 16#df -> true; +partial_utf16(_) -> false. %% only thing to note here is the additional accumulator passed to escaped_unicode used diff --git a/src/jsx_utf16le.erl b/src/jsx_utf16le.erl index e9d6a04..7a78ff6 100644 --- a/src/jsx_utf16le.erl +++ b/src/jsx_utf16le.erl @@ -193,8 +193,21 @@ string(<>, Stack, Callbacks, Opts, Acc) -> escape(Rest, Stack, Callbacks, Opts, Acc); string(<>, Stack, Callbacks, Opts, Acc) when ?is_noncontrol(S) -> string(Rest, Stack, Callbacks, Opts, [S] ++ Acc); -string(Bin, Stack, Callbacks, Opts, Acc) when byte_size(Bin) < 2 -> - {incomplete, fun(Stream) -> string(<>, Stack, Callbacks, Opts, Acc) end}. +string(Bin, Stack, Callbacks, Opts, Acc) -> + case partial_utf16(Bin) of + true -> + {incomplete, fun(Stream) -> string(<>, Stack, Callbacks, Opts, Acc) end} + ; false -> + erlang:error(function_clause) + end. + +partial_utf16(<<>>) -> true; +%% this case is not strictly true, there are single bytes that should be rejected, but +%% they're rare enough they can be ignored +partial_utf16(<<_X>>) -> true; +partial_utf16(<<_Y, X>>) when X >= 16#d8, X =< 16#df -> true; +partial_utf16(<<_Y, X, _Z>>) when X >= 16#d8, X =< 16#df -> true; +partial_utf16(_) -> false. %% only thing to note here is the additional accumulator passed to escaped_unicode used diff --git a/src/jsx_utf8.erl b/src/jsx_utf8.erl index a5620eb..140159b 100644 --- a/src/jsx_utf8.erl +++ b/src/jsx_utf8.erl @@ -183,7 +183,8 @@ key(<<>>, Stack, Callbacks, Opts) -> %% converted back to lists by the user anyways. %% the clause starting with Bin is necessary for cases where a stream is broken at a -%% point where it contains only a partial utf-8 sequence. +%% point where it contains only a partial utf-8 sequence. we emulate a function_clause +%% error if the partial sequence is not valid utf-8 to maintain consistency of errors string(<>, [key|_] = Stack, Callbacks, Opts, Acc) -> colon(Rest, Stack, fold({key, lists:reverse(Acc)}, Callbacks), Opts); @@ -192,9 +193,32 @@ string(<>, Stack, Callbacks, Opts, Acc) -> string(<>, Stack, Callbacks, Opts, Acc) -> escape(Rest, Stack, Callbacks, Opts, Acc); string(<>, Stack, Callbacks, Opts, Acc) when ?is_noncontrol(S) -> - string(Rest, Stack, Callbacks, Opts, [S] ++ Acc); -string(<<>>, Stack, Callbacks, Opts, Acc) -> - {incomplete, fun(Stream) -> string(Stream, Stack, Callbacks, Opts, Acc) end}. + string(Rest, Stack, Callbacks, Opts, [S] ++ Acc); +string(Bin, Stack, Callbacks, Opts, Acc) -> + case partial_utf8(Bin) of + true -> + {incomplete, fun(Stream) -> string(<>, Stack, Callbacks, Opts, Acc) end} + ; false -> + erlang:error(function_clause) + end. + +%% in the case of broken (as in split over two halves of a stream) utf-8 input, +%% ensure that the half present is *possibly* valid + +partial_utf8(<<>>) -> true; +partial_utf8(<>) when X >= 16#c2, X =< 16#df -> true; +partial_utf8(<>) when X >= 16#e0, X =< 16#ef -> + case Rest of + <<>> -> true + ; <> when Y >= 16#80, Y =< 16#bf -> true + end; +partial_utf8(<>) when X >= 16#f0, X =< 16#f4 -> + case Rest of + <<>> -> true + ; <> when Y >= 16#80, Y =< 16#bf -> true + ; <> when Y >= 16#80, Y =< 16#bf, Z >= 16#80, Z =< 16#bf -> true + end; +partial_utf8(_) -> false. %% only thing to note here is the additional accumulator passed to escaped_unicode used @@ -346,9 +370,9 @@ decimal(<>, [array|_] = Stack, Callbacks, Opts, A decimal(<>, Stack, Callbacks, Opts, Acc) -> decimal(Rest, Stack, Callbacks, Opts, [?zero] ++ Acc); decimal(<<$e/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> - e(Rest, Stack, Callbacks, Opts, "e0." ++ Acc); + e(Rest, Stack, Callbacks, Opts, "e" ++ Acc); decimal(<<$E/?encoding, Rest/binary>>, Stack, Callbacks, Opts, Acc) -> - e(Rest, Stack, Callbacks, Opts, "e0." ++ Acc); + e(Rest, Stack, Callbacks, Opts, "e" ++ Acc); decimal(<>, Stack, Callbacks, Opts, Acc) when ?is_whitespace(S) -> maybe_done(Rest, Stack, fold({float, lists:reverse(Acc)}, Callbacks), Opts); decimal(<>, Stack, Callbacks, ?comments_enabled(Opts), Acc) ->