refactor of encoded codepoints with looser string restrictions
This commit is contained in:
parent
672fe04c37
commit
42d6ef2c21
13 changed files with 50 additions and 97 deletions
|
@ -130,6 +130,7 @@ decoder(Handler, State, Opts) ->
|
|||
-define(new_seq(C), [C]).
|
||||
|
||||
-define(acc_seq(Seq, C), [C] ++ Seq).
|
||||
-define(acc_seq(Seq, C, D), [C, D] ++ Seq).
|
||||
|
||||
-define(end_seq(Seq), unicode:characters_to_binary(lists:reverse(Seq))).
|
||||
|
||||
|
@ -512,7 +513,7 @@ escape(<<?doublequote, Rest/binary>>, Handler, [Acc|Stack], Opts) ->
|
|||
escape(<<?singlequote, Rest/binary>>, Handler, [Acc|Stack], Opts = #opts{single_quotes=true}) ->
|
||||
string(Rest, Handler, [?acc_seq(Acc, ?singlequote)|Stack], Opts);
|
||||
escape(<<$u, Rest/binary>>, Handler, Stack, Opts) ->
|
||||
escaped_unicode(Rest, Handler, [?new_seq()|Stack], Opts);
|
||||
escaped_unicode(Rest, Handler, Stack, Opts);
|
||||
escape(<<>>, Handler, Stack, Opts) ->
|
||||
?incomplete(escape, <<>>, Handler, Stack, Opts);
|
||||
escape(Bin, Handler, Stack, Opts) ->
|
||||
|
@ -521,96 +522,74 @@ escape(Bin, Handler, Stack, Opts) ->
|
|||
|
||||
%% this code is ugly and unfortunate, but so is json's handling of escaped
|
||||
%% unicode codepoint sequences.
|
||||
escaped_unicode(<<D, Rest/binary>>, Handler, [[C,B,A], Acc|Stack], Opts)
|
||||
when ?is_hex(D) ->
|
||||
escaped_unicode(<<A, B, C, D, Rest/binary>>, Handler, [Acc|Stack], Opts)
|
||||
when ?is_hex(A), ?is_hex(B), ?is_hex(C), ?is_hex(D) ->
|
||||
case erlang:list_to_integer([A, B, C, D], 16) of
|
||||
%% high surrogate, we need a low surrogate next
|
||||
%% high surrogate, dispatch to low surrogate
|
||||
X when X >= 16#d800, X =< 16#dbff ->
|
||||
low_surrogate(Rest, Handler, [X, Acc|Stack], Opts)
|
||||
%% non-characters, you're not allowed to exchange these
|
||||
; X when X == 16#fffe; X == 16#ffff; X >= 16#fdd0, X =< 16#fdef ->
|
||||
%% low surrogate, illegal in this position
|
||||
; X when X >= 16#dc00, X =< 16#dfff ->
|
||||
case Opts#opts.loose_unicode of
|
||||
true ->
|
||||
string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts)
|
||||
; false ->
|
||||
?error([<<D, Rest/binary>>, Handler, [[C,B,A], Acc|Stack], Opts])
|
||||
true -> string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts)
|
||||
; false -> ?error([<<A, B, C, D, Rest/binary>>, Handler, [Acc|Stack], Opts])
|
||||
end
|
||||
%% anything else
|
||||
; X ->
|
||||
string(Rest, Handler, [?acc_seq(Acc, X)|Stack], Opts)
|
||||
; X -> string(Rest, Handler, [?acc_seq(Acc, X)|Stack], Opts)
|
||||
end;
|
||||
escaped_unicode(<<S, Rest/binary>>, Handler, [Acc|Stack], Opts)
|
||||
when ?is_hex(S) ->
|
||||
escaped_unicode(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts);
|
||||
escaped_unicode(<<>>, Handler, Stack, Opts) ->
|
||||
?incomplete(escaped_unicode, <<>>, Handler, Stack, Opts);
|
||||
escaped_unicode(Bin, Handler, Stack, Opts) ->
|
||||
?error([Bin, Handler, Stack, Opts]).
|
||||
|
||||
|
||||
low_surrogate(<<?rsolidus, Rest/binary>>, Handler, Stack, Opts) ->
|
||||
low_surrogate_u(Rest, Handler, Stack, Opts);
|
||||
%% not an escaped codepoint, our high codepoint is illegal. dispatch back to
|
||||
%% string to handle
|
||||
low_surrogate(<<S, Rest/binary>> = Bin, Handler, [High, String|Stack], Opts) ->
|
||||
case Opts#opts.loose_unicode of
|
||||
true ->
|
||||
string(Bin, Handler, [?acc_seq(String, 16#fffd)|Stack], Opts)
|
||||
; false ->
|
||||
?error([<<S, Rest/binary>>, Handler, [High, String|Stack], Opts])
|
||||
end;
|
||||
low_surrogate(<<>>, Handler, Stack, Opts) ->
|
||||
?incomplete(low_surrogate, <<>>, Handler, Stack, Opts);
|
||||
low_surrogate(Bin, Handler, Stack, Opts) ->
|
||||
?error([Bin, Handler, Stack, Opts]).
|
||||
|
||||
|
||||
low_surrogate_u(<<$u, Rest/binary>>, Handler, Stack, Opts) ->
|
||||
low_surrogate_v(Rest, Handler, [?new_seq()|Stack], Opts);
|
||||
low_surrogate_u(<<>>, Handler, Stack, Opts) ->
|
||||
?incomplete(low_surrogate_u, <<>>, Handler, Stack, Opts);
|
||||
%% not a low surrogate, dispatch back to string to handle, including the
|
||||
%% rsolidus we parsed previously
|
||||
low_surrogate_u(Bin, Handler, [High, String|Stack], Opts) ->
|
||||
case Opts#opts.loose_unicode of
|
||||
true ->
|
||||
string(<<?rsolidus, Bin/binary>>, Handler, [?acc_seq(String, 16#fffd)|Stack], Opts)
|
||||
; false ->
|
||||
?error([Bin, Handler, [High, String|Stack], Opts])
|
||||
case is_partial_escape(Bin) of
|
||||
true -> ?incomplete(escaped_unicode, Bin, Handler, Stack, Opts)
|
||||
; false -> ?error([Bin, Handler, Stack, Opts])
|
||||
end.
|
||||
|
||||
|
||||
low_surrogate_v(<<D, Rest/binary>>, Handler, [[C,B,A], High, String|Stack], Opts)
|
||||
when ?is_hex(D) ->
|
||||
is_partial_escape(<<A, B, C>>) when ?is_hex(A), ?is_hex(B), ?is_hex(C) -> true;
|
||||
is_partial_escape(<<A, B>>) when ?is_hex(A), ?is_hex(B) -> true;
|
||||
is_partial_escape(<<A>>) when ?is_hex(A) -> true;
|
||||
is_partial_escape(<<>>) -> true;
|
||||
is_partial_escape(_) -> false.
|
||||
|
||||
|
||||
low_surrogate(<<?rsolidus, $u, A, B, C, D, Rest/binary>>, Handler, [High, Acc|Stack], Opts)
|
||||
when ?is_hex(A), ?is_hex(B), ?is_hex(C), ?is_hex(D) ->
|
||||
case erlang:list_to_integer([A, B, C, D], 16) of
|
||||
X when X >= 16#dc00, X =< 16#dfff ->
|
||||
V = surrogate_to_codepoint(High, X),
|
||||
case V rem 16#10000 of Y when Y == 16#fffe; Y == 16#ffff ->
|
||||
X when X >= 16#dc00, X =< 16#dfff ->
|
||||
Y = surrogate_to_codepoint(High, X),
|
||||
case (Y =< 16#d800 orelse Y >= 16#e000) of
|
||||
true -> string(Rest, Handler, [?acc_seq(Acc, Y)|Stack], Opts)
|
||||
; false ->
|
||||
case Opts#opts.loose_unicode of
|
||||
true ->
|
||||
string(Rest, Handler, [?acc_seq(String, 16#fffd)|Stack], Opts)
|
||||
; false ->
|
||||
?error([<<D, Rest/binary>>, Handler, [[C,B,A], High, String|Stack], Opts])
|
||||
string(Rest, Handler, [?acc_seq(Acc, 16#fffd, 16#fffd)|Stack], Opts)
|
||||
; false ->
|
||||
?error([<<?rsolidus, $u, A, B, C, D, Rest/binary>>, Handler, [High, Acc|Stack], Opts])
|
||||
end
|
||||
; _ ->
|
||||
string(Rest, Handler, [?acc_seq(String, V)|Stack], Opts)
|
||||
end
|
||||
%% not a low surrogate, bad bad bad
|
||||
; _ ->
|
||||
case Opts#opts.loose_unicode of
|
||||
true ->
|
||||
string(Rest, Handler, [?acc_seq(?acc_seq(String, 16#fffd), 16#fffd)|Stack], Opts)
|
||||
; false ->
|
||||
?error([<<D, Rest/binary>>, Handler, [[C,B,A], High, String|Stack], Opts])
|
||||
true -> string(Rest, Handler, [?acc_seq(Acc, 16#fffd, 16#fffd)|Stack], Opts)
|
||||
; false -> ?error([<<?rsolidus, $u, A, B, C, D, Rest/binary>>, Handler, [High, Acc|Stack], Opts])
|
||||
end
|
||||
end;
|
||||
low_surrogate_v(<<S, Rest/binary>>, Handler, [Acc|Stack], Opts)
|
||||
when ?is_hex(S) ->
|
||||
low_surrogate_v(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts);
|
||||
low_surrogate_v(<<>>, Handler, Stack, Opts) ->
|
||||
?incomplete(low_surrogate_v, <<>>, Handler, Stack, Opts);
|
||||
low_surrogate_v(Bin, Handler, Stack, Opts) ->
|
||||
?error([Bin, Handler, Stack, Opts]).
|
||||
low_surrogate(Bin, Handler, [High, Acc|Stack], Opts) ->
|
||||
case is_partial_low(Bin) of
|
||||
true -> ?incomplete(low_surrogate, Bin, Handler, [High, Acc|Stack], Opts)
|
||||
; false ->
|
||||
case Opts#opts.loose_unicode of
|
||||
true -> string(Bin, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts)
|
||||
; false -> ?error([Bin, Handler, [High, Acc|Stack], Opts])
|
||||
end
|
||||
end.
|
||||
|
||||
|
||||
is_partial_low(<<?rsolidus, $u, A, B, C>>) when ?is_hex(A), ?is_hex(B), ?is_hex(C) -> true;
|
||||
is_partial_low(<<?rsolidus, $u, A, B>>) when ?is_hex(A), ?is_hex(B) -> true;
|
||||
is_partial_low(<<?rsolidus, $u, A>>) when ?is_hex(A) -> true;
|
||||
is_partial_low(<<?rsolidus, $u>>) -> true;
|
||||
is_partial_low(<<?rsolidus>>) -> true;
|
||||
is_partial_low(<<>>) -> true;
|
||||
is_partial_low(_) -> false.
|
||||
|
||||
|
||||
%% stole this from the unicode spec
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue