diff --git a/priv/test_cases/escaped_noncharacter.json b/priv/test_cases/escaped_noncharacter.json deleted file mode 100644 index e5c1b65..0000000 --- a/priv/test_cases/escaped_noncharacter.json +++ /dev/null @@ -1 +0,0 @@ -"\uffff" \ No newline at end of file diff --git a/priv/test_cases/escaped_noncharacter.test b/priv/test_cases/escaped_noncharacter.test deleted file mode 100644 index 4e20bc3..0000000 --- a/priv/test_cases/escaped_noncharacter.test +++ /dev/null @@ -1,3 +0,0 @@ -{name, "escaped noncharacter"}. -{jsx, {error, badjson}}. -{json, "escaped_noncharacter.json"}. \ No newline at end of file diff --git a/priv/test_cases/escaped_noncharacter_ext.json b/priv/test_cases/escaped_noncharacter_ext.json deleted file mode 100644 index f10ec2b..0000000 --- a/priv/test_cases/escaped_noncharacter_ext.json +++ /dev/null @@ -1 +0,0 @@ -"\ud83f\udfff" \ No newline at end of file diff --git a/priv/test_cases/escaped_noncharacter_ext.test b/priv/test_cases/escaped_noncharacter_ext.test deleted file mode 100644 index 7049148..0000000 --- a/priv/test_cases/escaped_noncharacter_ext.test +++ /dev/null @@ -1,3 +0,0 @@ -{name, "escaped noncharacter (extended)"}. -{jsx, {error, badjson}}. -{json, "escaped_noncharacter_ext.json"}. \ No newline at end of file diff --git a/priv/test_cases/escaped_noncharacter_ext_replaced.json b/priv/test_cases/escaped_noncharacter_ext_replaced.json deleted file mode 100644 index f10ec2b..0000000 --- a/priv/test_cases/escaped_noncharacter_ext_replaced.json +++ /dev/null @@ -1 +0,0 @@ -"\ud83f\udfff" \ No newline at end of file diff --git a/priv/test_cases/escaped_noncharacter_ext_replaced.test b/priv/test_cases/escaped_noncharacter_ext_replaced.test deleted file mode 100644 index 0a740b6..0000000 --- a/priv/test_cases/escaped_noncharacter_ext_replaced.test +++ /dev/null @@ -1,4 +0,0 @@ -{name, "escaped noncharacter (extended)"}. -{jsx, [{string, <<16#fffd/utf8>>}, end_json]}. -{json, "escaped_noncharacter_ext.json"}. -{jsx_flags, [loose_unicode]}. \ No newline at end of file diff --git a/priv/test_cases/escaped_noncharacter_replaced.json b/priv/test_cases/escaped_noncharacter_replaced.json deleted file mode 100644 index e5c1b65..0000000 --- a/priv/test_cases/escaped_noncharacter_replaced.json +++ /dev/null @@ -1 +0,0 @@ -"\uffff" \ No newline at end of file diff --git a/priv/test_cases/escaped_noncharacter_replaced.test b/priv/test_cases/escaped_noncharacter_replaced.test deleted file mode 100644 index 9c5faac..0000000 --- a/priv/test_cases/escaped_noncharacter_replaced.test +++ /dev/null @@ -1,4 +0,0 @@ -{name, "escaped noncharacter replacement"}. -{jsx, [{string,<<16#fffd/utf8>>},end_json]}. -{json, "escaped_noncharacter_replaced.json"}. -{jsx_flags, [loose_unicode]}. \ No newline at end of file diff --git a/priv/test_cases/escaped_reserved_a.json b/priv/test_cases/escaped_reserved_a.json deleted file mode 100644 index dab850b..0000000 --- a/priv/test_cases/escaped_reserved_a.json +++ /dev/null @@ -1 +0,0 @@ -"\ufdd0" \ No newline at end of file diff --git a/priv/test_cases/escaped_reserved_a.test b/priv/test_cases/escaped_reserved_a.test deleted file mode 100644 index 8a5cba2..0000000 --- a/priv/test_cases/escaped_reserved_a.test +++ /dev/null @@ -1,3 +0,0 @@ -{name, "escaped reserved a"}. -{jsx, {error, badjson}}. -{json, "escaped_reserved_a.json"}. \ No newline at end of file diff --git a/priv/test_cases/escaped_reserved_b.json b/priv/test_cases/escaped_reserved_b.json deleted file mode 100644 index be11b6e..0000000 --- a/priv/test_cases/escaped_reserved_b.json +++ /dev/null @@ -1 +0,0 @@ -"\ufdef" \ No newline at end of file diff --git a/priv/test_cases/escaped_reserved_b.test b/priv/test_cases/escaped_reserved_b.test deleted file mode 100644 index 414f024..0000000 --- a/priv/test_cases/escaped_reserved_b.test +++ /dev/null @@ -1,3 +0,0 @@ -{name, "escaped reserved b"}. -{jsx, {error, badjson}}. -{json, "escaped_reserved_b.json"}. \ No newline at end of file diff --git a/src/jsx_decoder.erl b/src/jsx_decoder.erl index b7ebe80..7d4faa8 100644 --- a/src/jsx_decoder.erl +++ b/src/jsx_decoder.erl @@ -130,6 +130,7 @@ decoder(Handler, State, Opts) -> -define(new_seq(C), [C]). -define(acc_seq(Seq, C), [C] ++ Seq). +-define(acc_seq(Seq, C, D), [C, D] ++ Seq). -define(end_seq(Seq), unicode:characters_to_binary(lists:reverse(Seq))). @@ -512,7 +513,7 @@ escape(<>, Handler, [Acc|Stack], Opts) -> escape(<>, Handler, [Acc|Stack], Opts = #opts{single_quotes=true}) -> string(Rest, Handler, [?acc_seq(Acc, ?singlequote)|Stack], Opts); escape(<<$u, Rest/binary>>, Handler, Stack, Opts) -> - escaped_unicode(Rest, Handler, [?new_seq()|Stack], Opts); + escaped_unicode(Rest, Handler, Stack, Opts); escape(<<>>, Handler, Stack, Opts) -> ?incomplete(escape, <<>>, Handler, Stack, Opts); escape(Bin, Handler, Stack, Opts) -> @@ -521,96 +522,74 @@ escape(Bin, Handler, Stack, Opts) -> %% this code is ugly and unfortunate, but so is json's handling of escaped %% unicode codepoint sequences. -escaped_unicode(<>, Handler, [[C,B,A], Acc|Stack], Opts) - when ?is_hex(D) -> +escaped_unicode(<>, Handler, [Acc|Stack], Opts) + when ?is_hex(A), ?is_hex(B), ?is_hex(C), ?is_hex(D) -> case erlang:list_to_integer([A, B, C, D], 16) of - %% high surrogate, we need a low surrogate next + %% high surrogate, dispatch to low surrogate X when X >= 16#d800, X =< 16#dbff -> low_surrogate(Rest, Handler, [X, Acc|Stack], Opts) - %% non-characters, you're not allowed to exchange these - ; X when X == 16#fffe; X == 16#ffff; X >= 16#fdd0, X =< 16#fdef -> + %% low surrogate, illegal in this position + ; X when X >= 16#dc00, X =< 16#dfff -> case Opts#opts.loose_unicode of - true -> - string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts) - ; false -> - ?error([<>, Handler, [[C,B,A], Acc|Stack], Opts]) + true -> string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts) + ; false -> ?error([<>, Handler, [Acc|Stack], Opts]) end %% anything else - ; X -> - string(Rest, Handler, [?acc_seq(Acc, X)|Stack], Opts) + ; X -> string(Rest, Handler, [?acc_seq(Acc, X)|Stack], Opts) end; -escaped_unicode(<>, Handler, [Acc|Stack], Opts) - when ?is_hex(S) -> - escaped_unicode(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts); -escaped_unicode(<<>>, Handler, Stack, Opts) -> - ?incomplete(escaped_unicode, <<>>, Handler, Stack, Opts); escaped_unicode(Bin, Handler, Stack, Opts) -> - ?error([Bin, Handler, Stack, Opts]). - - -low_surrogate(<>, Handler, Stack, Opts) -> - low_surrogate_u(Rest, Handler, Stack, Opts); -%% not an escaped codepoint, our high codepoint is illegal. dispatch back to -%% string to handle -low_surrogate(<> = Bin, Handler, [High, String|Stack], Opts) -> - case Opts#opts.loose_unicode of - true -> - string(Bin, Handler, [?acc_seq(String, 16#fffd)|Stack], Opts) - ; false -> - ?error([<>, Handler, [High, String|Stack], Opts]) - end; -low_surrogate(<<>>, Handler, Stack, Opts) -> - ?incomplete(low_surrogate, <<>>, Handler, Stack, Opts); -low_surrogate(Bin, Handler, Stack, Opts) -> - ?error([Bin, Handler, Stack, Opts]). - - -low_surrogate_u(<<$u, Rest/binary>>, Handler, Stack, Opts) -> - low_surrogate_v(Rest, Handler, [?new_seq()|Stack], Opts); -low_surrogate_u(<<>>, Handler, Stack, Opts) -> - ?incomplete(low_surrogate_u, <<>>, Handler, Stack, Opts); -%% not a low surrogate, dispatch back to string to handle, including the -%% rsolidus we parsed previously -low_surrogate_u(Bin, Handler, [High, String|Stack], Opts) -> - case Opts#opts.loose_unicode of - true -> - string(<>, Handler, [?acc_seq(String, 16#fffd)|Stack], Opts) - ; false -> - ?error([Bin, Handler, [High, String|Stack], Opts]) + case is_partial_escape(Bin) of + true -> ?incomplete(escaped_unicode, Bin, Handler, Stack, Opts) + ; false -> ?error([Bin, Handler, Stack, Opts]) end. -low_surrogate_v(<>, Handler, [[C,B,A], High, String|Stack], Opts) - when ?is_hex(D) -> +is_partial_escape(<>) when ?is_hex(A), ?is_hex(B), ?is_hex(C) -> true; +is_partial_escape(<>) when ?is_hex(A), ?is_hex(B) -> true; +is_partial_escape(<>) when ?is_hex(A) -> true; +is_partial_escape(<<>>) -> true; +is_partial_escape(_) -> false. + + +low_surrogate(<>, Handler, [High, Acc|Stack], Opts) + when ?is_hex(A), ?is_hex(B), ?is_hex(C), ?is_hex(D) -> case erlang:list_to_integer([A, B, C, D], 16) of - X when X >= 16#dc00, X =< 16#dfff -> - V = surrogate_to_codepoint(High, X), - case V rem 16#10000 of Y when Y == 16#fffe; Y == 16#ffff -> + X when X >= 16#dc00, X =< 16#dfff -> + Y = surrogate_to_codepoint(High, X), + case (Y =< 16#d800 orelse Y >= 16#e000) of + true -> string(Rest, Handler, [?acc_seq(Acc, Y)|Stack], Opts) + ; false -> case Opts#opts.loose_unicode of true -> - string(Rest, Handler, [?acc_seq(String, 16#fffd)|Stack], Opts) - ; false -> - ?error([<>, Handler, [[C,B,A], High, String|Stack], Opts]) + string(Rest, Handler, [?acc_seq(Acc, 16#fffd, 16#fffd)|Stack], Opts) + ; false -> + ?error([<>, Handler, [High, Acc|Stack], Opts]) end - ; _ -> - string(Rest, Handler, [?acc_seq(String, V)|Stack], Opts) end - %% not a low surrogate, bad bad bad ; _ -> case Opts#opts.loose_unicode of - true -> - string(Rest, Handler, [?acc_seq(?acc_seq(String, 16#fffd), 16#fffd)|Stack], Opts) - ; false -> - ?error([<>, Handler, [[C,B,A], High, String|Stack], Opts]) + true -> string(Rest, Handler, [?acc_seq(Acc, 16#fffd, 16#fffd)|Stack], Opts) + ; false -> ?error([<>, Handler, [High, Acc|Stack], Opts]) end end; -low_surrogate_v(<>, Handler, [Acc|Stack], Opts) - when ?is_hex(S) -> - low_surrogate_v(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts); -low_surrogate_v(<<>>, Handler, Stack, Opts) -> - ?incomplete(low_surrogate_v, <<>>, Handler, Stack, Opts); -low_surrogate_v(Bin, Handler, Stack, Opts) -> - ?error([Bin, Handler, Stack, Opts]). +low_surrogate(Bin, Handler, [High, Acc|Stack], Opts) -> + case is_partial_low(Bin) of + true -> ?incomplete(low_surrogate, Bin, Handler, [High, Acc|Stack], Opts) + ; false -> + case Opts#opts.loose_unicode of + true -> string(Bin, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts) + ; false -> ?error([Bin, Handler, [High, Acc|Stack], Opts]) + end + end. + + +is_partial_low(<>) when ?is_hex(A), ?is_hex(B), ?is_hex(C) -> true; +is_partial_low(<>) when ?is_hex(A), ?is_hex(B) -> true; +is_partial_low(<>) when ?is_hex(A) -> true; +is_partial_low(<>) -> true; +is_partial_low(<>) -> true; +is_partial_low(<<>>) -> true; +is_partial_low(_) -> false. %% stole this from the unicode spec