refactor of encoded codepoints with looser string restrictions

This commit is contained in:
alisdair sullivan 2012-03-27 14:44:02 -07:00
parent 672fe04c37
commit 42d6ef2c21
13 changed files with 50 additions and 97 deletions

View file

@ -1 +0,0 @@
"\uffff"

View file

@ -1,3 +0,0 @@
{name, "escaped noncharacter"}.
{jsx, {error, badjson}}.
{json, "escaped_noncharacter.json"}.

View file

@ -1 +0,0 @@
"\ud83f\udfff"

View file

@ -1,3 +0,0 @@
{name, "escaped noncharacter (extended)"}.
{jsx, {error, badjson}}.
{json, "escaped_noncharacter_ext.json"}.

View file

@ -1 +0,0 @@
"\ud83f\udfff"

View file

@ -1,4 +0,0 @@
{name, "escaped noncharacter (extended)"}.
{jsx, [{string, <<16#fffd/utf8>>}, end_json]}.
{json, "escaped_noncharacter_ext.json"}.
{jsx_flags, [loose_unicode]}.

View file

@ -1 +0,0 @@
"\uffff"

View file

@ -1,4 +0,0 @@
{name, "escaped noncharacter replacement"}.
{jsx, [{string,<<16#fffd/utf8>>},end_json]}.
{json, "escaped_noncharacter_replaced.json"}.
{jsx_flags, [loose_unicode]}.

View file

@ -1 +0,0 @@
"\ufdd0"

View file

@ -1,3 +0,0 @@
{name, "escaped reserved a"}.
{jsx, {error, badjson}}.
{json, "escaped_reserved_a.json"}.

View file

@ -1 +0,0 @@
"\ufdef"

View file

@ -1,3 +0,0 @@
{name, "escaped reserved b"}.
{jsx, {error, badjson}}.
{json, "escaped_reserved_b.json"}.

View file

@ -130,6 +130,7 @@ decoder(Handler, State, Opts) ->
%% helpers for building up a sequence of elements. the sequence is kept as a
%% list with the most recently added element at the head, and is only put back
%% in order by end_seq
%% start a sequence holding the single element C
-define(new_seq(C), [C]).
%% put C at the head of Seq
-define(acc_seq(Seq, C), [C] ++ Seq).
%% put two elements at the head of Seq in one step. C ends up in front of D,
%% so after end_seq reverses the list D comes out before C
-define(acc_seq(Seq, C, D), [C, D] ++ Seq).
%% reverse the accumulated list and hand it to unicode:characters_to_binary/1
-define(end_seq(Seq), unicode:characters_to_binary(lists:reverse(Seq))).
@ -512,7 +513,7 @@ escape(<<?doublequote, Rest/binary>>, Handler, [Acc|Stack], Opts) ->
escape(<<?singlequote, Rest/binary>>, Handler, [Acc|Stack], Opts = #opts{single_quotes=true}) ->
string(Rest, Handler, [?acc_seq(Acc, ?singlequote)|Stack], Opts);
escape(<<$u, Rest/binary>>, Handler, Stack, Opts) ->
escaped_unicode(Rest, Handler, [?new_seq()|Stack], Opts);
escaped_unicode(Rest, Handler, Stack, Opts);
escape(<<>>, Handler, Stack, Opts) ->
?incomplete(escape, <<>>, Handler, Stack, Opts);
escape(Bin, Handler, Stack, Opts) ->
@ -521,96 +522,74 @@ escape(Bin, Handler, Stack, Opts) ->
%% this code is ugly and unfortunate, but so is json's handling of escaped
%% unicode codepoint sequences.
escaped_unicode(<<D, Rest/binary>>, Handler, [[C,B,A], Acc|Stack], Opts)
when ?is_hex(D) ->
escaped_unicode(<<A, B, C, D, Rest/binary>>, Handler, [Acc|Stack], Opts)
when ?is_hex(A), ?is_hex(B), ?is_hex(C), ?is_hex(D) ->
case erlang:list_to_integer([A, B, C, D], 16) of
%% high surrogate, we need a low surrogate next
%% high surrogate, dispatch to low surrogate
X when X >= 16#d800, X =< 16#dbff ->
low_surrogate(Rest, Handler, [X, Acc|Stack], Opts)
%% non-characters, you're not allowed to exchange these
; X when X == 16#fffe; X == 16#ffff; X >= 16#fdd0, X =< 16#fdef ->
%% low surrogate, illegal in this position
; X when X >= 16#dc00, X =< 16#dfff ->
case Opts#opts.loose_unicode of
true ->
string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts)
; false ->
?error([<<D, Rest/binary>>, Handler, [[C,B,A], Acc|Stack], Opts])
true -> string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts)
; false -> ?error([<<A, B, C, D, Rest/binary>>, Handler, [Acc|Stack], Opts])
end
%% anything else
; X ->
string(Rest, Handler, [?acc_seq(Acc, X)|Stack], Opts)
; X -> string(Rest, Handler, [?acc_seq(Acc, X)|Stack], Opts)
end;
escaped_unicode(<<S, Rest/binary>>, Handler, [Acc|Stack], Opts)
when ?is_hex(S) ->
escaped_unicode(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts);
escaped_unicode(<<>>, Handler, Stack, Opts) ->
?incomplete(escaped_unicode, <<>>, Handler, Stack, Opts);
escaped_unicode(Bin, Handler, Stack, Opts) ->
?error([Bin, Handler, Stack, Opts]).
low_surrogate(<<?rsolidus, Rest/binary>>, Handler, Stack, Opts) ->
low_surrogate_u(Rest, Handler, Stack, Opts);
%% not an escaped codepoint, our high codepoint is illegal. dispatch back to
%% string to handle
low_surrogate(<<S, Rest/binary>> = Bin, Handler, [High, String|Stack], Opts) ->
case Opts#opts.loose_unicode of
true ->
string(Bin, Handler, [?acc_seq(String, 16#fffd)|Stack], Opts)
; false ->
?error([<<S, Rest/binary>>, Handler, [High, String|Stack], Opts])
end;
low_surrogate(<<>>, Handler, Stack, Opts) ->
?incomplete(low_surrogate, <<>>, Handler, Stack, Opts);
low_surrogate(Bin, Handler, Stack, Opts) ->
?error([Bin, Handler, Stack, Opts]).
low_surrogate_u(<<$u, Rest/binary>>, Handler, Stack, Opts) ->
low_surrogate_v(Rest, Handler, [?new_seq()|Stack], Opts);
low_surrogate_u(<<>>, Handler, Stack, Opts) ->
?incomplete(low_surrogate_u, <<>>, Handler, Stack, Opts);
%% not a low surrogate, dispatch back to string to handle, including the
%% rsolidus we parsed previously
low_surrogate_u(Bin, Handler, [High, String|Stack], Opts) ->
case Opts#opts.loose_unicode of
true ->
string(<<?rsolidus, Bin/binary>>, Handler, [?acc_seq(String, 16#fffd)|Stack], Opts)
; false ->
?error([Bin, Handler, [High, String|Stack], Opts])
case is_partial_escape(Bin) of
true -> ?incomplete(escaped_unicode, Bin, Handler, Stack, Opts)
; false -> ?error([Bin, Handler, Stack, Opts])
end.
low_surrogate_v(<<D, Rest/binary>>, Handler, [[C,B,A], High, String|Stack], Opts)
when ?is_hex(D) ->
%% is_partial_escape(Bin) -> true | false
%% true when Bin is zero to three bytes long and each byte present satisfies
%% ?is_hex, i.e. Bin is shorter than the four bytes the full escaped_unicode
%% clause matches on. every other input, including non binaries, is false.
%% the clauses are ordered shortest first; each matches exactly one length.
is_partial_escape(<<>>) -> true;
is_partial_escape(<<H1>>) when ?is_hex(H1) -> true;
is_partial_escape(<<H1, H2>>) when ?is_hex(H1), ?is_hex(H2) -> true;
is_partial_escape(<<H1, H2, H3>>) when ?is_hex(H1), ?is_hex(H2), ?is_hex(H3) -> true;
is_partial_escape(_Other) -> false.
low_surrogate(<<?rsolidus, $u, A, B, C, D, Rest/binary>>, Handler, [High, Acc|Stack], Opts)
when ?is_hex(A), ?is_hex(B), ?is_hex(C), ?is_hex(D) ->
case erlang:list_to_integer([A, B, C, D], 16) of
X when X >= 16#dc00, X =< 16#dfff ->
V = surrogate_to_codepoint(High, X),
case V rem 16#10000 of Y when Y == 16#fffe; Y == 16#ffff ->
Y = surrogate_to_codepoint(High, X),
case (Y =< 16#d800 orelse Y >= 16#e000) of
true -> string(Rest, Handler, [?acc_seq(Acc, Y)|Stack], Opts)
; false ->
case Opts#opts.loose_unicode of
true ->
string(Rest, Handler, [?acc_seq(String, 16#fffd)|Stack], Opts)
string(Rest, Handler, [?acc_seq(Acc, 16#fffd, 16#fffd)|Stack], Opts)
; false ->
?error([<<D, Rest/binary>>, Handler, [[C,B,A], High, String|Stack], Opts])
?error([<<?rsolidus, $u, A, B, C, D, Rest/binary>>, Handler, [High, Acc|Stack], Opts])
end
; _ ->
string(Rest, Handler, [?acc_seq(String, V)|Stack], Opts)
end
%% not a low surrogate, bad bad bad
; _ ->
case Opts#opts.loose_unicode of
true ->
string(Rest, Handler, [?acc_seq(?acc_seq(String, 16#fffd), 16#fffd)|Stack], Opts)
; false ->
?error([<<D, Rest/binary>>, Handler, [[C,B,A], High, String|Stack], Opts])
true -> string(Rest, Handler, [?acc_seq(Acc, 16#fffd, 16#fffd)|Stack], Opts)
; false -> ?error([<<?rsolidus, $u, A, B, C, D, Rest/binary>>, Handler, [High, Acc|Stack], Opts])
end
end;
low_surrogate_v(<<S, Rest/binary>>, Handler, [Acc|Stack], Opts)
when ?is_hex(S) ->
low_surrogate_v(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts);
low_surrogate_v(<<>>, Handler, Stack, Opts) ->
?incomplete(low_surrogate_v, <<>>, Handler, Stack, Opts);
low_surrogate_v(Bin, Handler, Stack, Opts) ->
?error([Bin, Handler, Stack, Opts]).
low_surrogate(Bin, Handler, [High, Acc|Stack], Opts) ->
case is_partial_low(Bin) of
true -> ?incomplete(low_surrogate, Bin, Handler, [High, Acc|Stack], Opts)
; false ->
case Opts#opts.loose_unicode of
true -> string(Bin, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts)
; false -> ?error([Bin, Handler, [High, Acc|Stack], Opts])
end
end.
%% is_partial_low(Bin) -> true | false
%% true when Bin is a strict prefix of the `?rsolidus, $u, H, H, H, H` shape
%% that low_surrogate matches on: nothing at all, the rsolidus alone, the
%% rsolidus and the u, or those two followed by one to three bytes that each
%% satisfy ?is_hex. every other input is false.
%% the clauses are ordered shortest first; each matches exactly one length.
is_partial_low(<<>>) -> true;
is_partial_low(<<?rsolidus>>) -> true;
is_partial_low(<<?rsolidus, $u>>) -> true;
is_partial_low(<<?rsolidus, $u, H1>>) when ?is_hex(H1) -> true;
is_partial_low(<<?rsolidus, $u, H1, H2>>) when ?is_hex(H1), ?is_hex(H2) -> true;
is_partial_low(<<?rsolidus, $u, H1, H2, H3>>) when ?is_hex(H1), ?is_hex(H2), ?is_hex(H3) -> true;
is_partial_low(_Other) -> false.
%% stole this from the unicode spec