Refactor handling of string escapes and add test cases to catch edge cases with replaced UTF-8
This commit is contained in:
parent
f47e89e54f
commit
1646955038
1 changed file with 65 additions and 58 deletions
|
@ -74,8 +74,8 @@ decoder(Handler, State, Config) ->
|
||||||
|
|
||||||
%% some useful guards
|
%% some useful guards
|
||||||
-define(is_hex(Symbol),
|
-define(is_hex(Symbol),
|
||||||
(Symbol >= $a andalso Symbol =< $z);
|
(Symbol >= $a andalso Symbol =< $f) orelse
|
||||||
(Symbol >= $A andalso Symbol =< $Z);
|
(Symbol >= $A andalso Symbol =< $F) orelse
|
||||||
(Symbol >= $0 andalso Symbol =< $9)
|
(Symbol >= $0 andalso Symbol =< $9)
|
||||||
).
|
).
|
||||||
|
|
||||||
|
@ -394,7 +394,7 @@ string(<<90, Rest/binary>>, Handler, Acc, Stack, Config) ->
|
||||||
string(<<91, Rest/binary>>, Handler, Acc, Stack, Config) ->
|
string(<<91, Rest/binary>>, Handler, Acc, Stack, Config) ->
|
||||||
string(Rest, Handler, ?acc_seq(Acc, 91), Stack, Config);
|
string(Rest, Handler, ?acc_seq(Acc, 91), Stack, Config);
|
||||||
string(<<?rsolidus/utf8, Rest/binary>>, Handler, Acc, Stack, Config) ->
|
string(<<?rsolidus/utf8, Rest/binary>>, Handler, Acc, Stack, Config) ->
|
||||||
escape(Rest, Handler, Acc, Stack, Config);
|
unescape(Rest, Handler, Acc, Stack, Config);
|
||||||
string(<<93, Rest/binary>>, Handler, Acc, Stack, Config) ->
|
string(<<93, Rest/binary>>, Handler, Acc, Stack, Config) ->
|
||||||
string(Rest, Handler, ?acc_seq(Acc, 93), Stack, Config);
|
string(Rest, Handler, ?acc_seq(Acc, 93), Stack, Config);
|
||||||
string(<<94, Rest/binary>>, Handler, Acc, Stack, Config) ->
|
string(<<94, Rest/binary>>, Handler, Acc, Stack, Config) ->
|
||||||
|
@ -581,83 +581,74 @@ strip_continuations(Rest, Handler, Acc, Stack, Config, _) ->
|
||||||
string(Rest, Handler, ?acc_seq(Acc, 16#fffd), Stack, Config).
|
string(Rest, Handler, ?acc_seq(Acc, 16#fffd), Stack, Config).
|
||||||
|
|
||||||
|
|
||||||
escape(<<$b, Rest/binary>>, Handler, Acc, Stack, Config) ->
|
unescape(<<$b, Rest/binary>>, Handler, Acc, Stack, Config) ->
|
||||||
string(Rest, Handler, ?acc_seq(Acc, maybe_replace($\b, Config)), Stack, Config);
|
string(Rest, Handler, ?acc_seq(Acc, maybe_replace($\b, Config)), Stack, Config);
|
||||||
escape(<<$f, Rest/binary>>, Handler, Acc, Stack, Config) ->
|
unescape(<<$f, Rest/binary>>, Handler, Acc, Stack, Config) ->
|
||||||
string(Rest, Handler, ?acc_seq(Acc, maybe_replace($\f, Config)), Stack, Config);
|
string(Rest, Handler, ?acc_seq(Acc, maybe_replace($\f, Config)), Stack, Config);
|
||||||
escape(<<$n, Rest/binary>>, Handler, Acc, Stack, Config) ->
|
unescape(<<$n, Rest/binary>>, Handler, Acc, Stack, Config) ->
|
||||||
string(Rest, Handler, ?acc_seq(Acc, maybe_replace($\n, Config)), Stack, Config);
|
string(Rest, Handler, ?acc_seq(Acc, maybe_replace($\n, Config)), Stack, Config);
|
||||||
escape(<<$r, Rest/binary>>, Handler, Acc, Stack, Config) ->
|
unescape(<<$r, Rest/binary>>, Handler, Acc, Stack, Config) ->
|
||||||
string(Rest, Handler, ?acc_seq(Acc, maybe_replace($\r, Config)), Stack, Config);
|
string(Rest, Handler, ?acc_seq(Acc, maybe_replace($\r, Config)), Stack, Config);
|
||||||
escape(<<$t, Rest/binary>>, Handler, Acc, Stack, Config) ->
|
unescape(<<$t, Rest/binary>>, Handler, Acc, Stack, Config) ->
|
||||||
string(Rest, Handler, ?acc_seq(Acc, maybe_replace($\t, Config)), Stack, Config);
|
string(Rest, Handler, ?acc_seq(Acc, maybe_replace($\t, Config)), Stack, Config);
|
||||||
escape(<<?rsolidus, Rest/binary>>, Handler, Acc, Stack, Config) ->
|
unescape(<<?rsolidus, Rest/binary>>, Handler, Acc, Stack, Config) ->
|
||||||
string(Rest, Handler, ?acc_seq(Acc, maybe_replace($\\, Config)), Stack, Config);
|
string(Rest, Handler, ?acc_seq(Acc, maybe_replace($\\, Config)), Stack, Config);
|
||||||
escape(<<?solidus, Rest/binary>>, Handler, Acc, Stack, Config) ->
|
unescape(<<?solidus, Rest/binary>>, Handler, Acc, Stack, Config) ->
|
||||||
string(Rest, Handler, ?acc_seq(Acc, maybe_replace($/, Config)), Stack, Config);
|
string(Rest, Handler, ?acc_seq(Acc, maybe_replace($/, Config)), Stack, Config);
|
||||||
escape(<<?doublequote, Rest/binary>>, Handler, Acc, Stack, Config) ->
|
unescape(<<?doublequote, Rest/binary>>, Handler, Acc, Stack, Config) ->
|
||||||
string(Rest, Handler, ?acc_seq(Acc, maybe_replace($\", Config)), Stack, Config);
|
string(Rest, Handler, ?acc_seq(Acc, maybe_replace($\", Config)), Stack, Config);
|
||||||
escape(<<?singlequote, Rest/binary>>, Handler, Acc, Stack, Config = #config{single_quoted_strings=true}) ->
|
unescape(<<?singlequote, Rest/binary>>, Handler, Acc, Stack, Config=#config{single_quoted_strings=true}) ->
|
||||||
string(Rest, Handler, ?acc_seq(Acc, maybe_replace(?singlequote, Config)), Stack, Config);
|
string(Rest, Handler, ?acc_seq(Acc, maybe_replace(?singlequote, Config)), Stack, Config);
|
||||||
escape(<<$u, A, B, C, D, ?rsolidus, $u, W, X, Y, Z, Rest/binary>>, Handler, Acc, Stack, Config)
|
unescape(<<$u, $d, A, B, C, ?rsolidus, $u, $d, X, Y, Z, Rest/binary>>, Handler, Acc, Stack, Config)
|
||||||
when ?is_hex(A), ?is_hex(B), ?is_hex(C), ?is_hex(D), ?is_hex(W), ?is_hex(X), ?is_hex(Y), ?is_hex(Z) ->
|
when (A == $8 orelse A == $9 orelse A == $a orelse A == $b),
|
||||||
case {erlang:list_to_integer([A, B, C, D], 16), erlang:list_to_integer([W, X, Y, Z], 16)} of
|
(X == $c orelse X == $d orelse X == $e orelse X == $f),
|
||||||
{High, Low} when High >= 16#d800, High =< 16#dbff, Low >= 16#dc00, Low =< 16#dfff ->
|
?is_hex(B), ?is_hex(C), ?is_hex(Y), ?is_hex(Z)
|
||||||
case (High - 16#d800) * 16#400 + (Low - 16#dc00) + 16#10000 of
|
->
|
||||||
Codepoint when Codepoint =< 16#d800; Codepoint >= 16#e000 ->
|
High = erlang:list_to_integer([$d, A, B, C], 16),
|
||||||
string(Rest, Handler, ?acc_seq(Acc, maybe_replace(Codepoint, Config)), Stack, Config);
|
Low = erlang:list_to_integer([$d, X, Y, Z], 16),
|
||||||
_ when Config#config.replaced_bad_utf8 == true ->
|
Codepoint = (High - 16#d800) * 16#400 + (Low - 16#dc00) + 16#10000,
|
||||||
string(Rest, Handler, ?acc_seq(Acc, 16#fffd, 16#fffd), Stack, Config);
|
string(Rest, Handler, ?acc_seq(Acc, Codepoint), Stack, Config);
|
||||||
_ ->
|
unescape(<<$u, $d, A, B, C, ?rsolidus, $u, W, X, Y, Z, Rest/binary>>, Handler, Acc, Stack, Config)
|
||||||
?error([<<$u, A, B, C, D, ?rsolidus, $u, W, X, Y, Z, Rest/binary>>, Handler, Stack, Config])
|
when (A == $8 orelse A == $9 orelse A == $a orelse A == $b),
|
||||||
end;
|
?is_hex(B), ?is_hex(C), ?is_hex(W), ?is_hex(X), ?is_hex(Y), ?is_hex(Z)
|
||||||
_ ->
|
->
|
||||||
?error([<<$u, A, B, C, D, ?rsolidus, $u, W, X, Y, Z, Rest/binary>>, Handler, Stack, Config])
|
string(Rest, Handler, ?acc_seq(Acc, 16#fffd, 16#fffd), Stack, Config);
|
||||||
|
unescape(<<$u, $d, A, B, C, ?rsolidus, Rest/binary>> = Bin, Handler, Acc, Stack, Config)
|
||||||
|
when (A == $8 orelse A == $9 orelse A == $a orelse A == $b) andalso
|
||||||
|
?is_hex(B), ?is_hex(C)
|
||||||
|
->
|
||||||
|
case is_partial_escape(Rest) of
|
||||||
|
true -> ?incomplete(string, <<?rsolidus, Bin/binary>>, Handler, Acc, Stack, Config);
|
||||||
|
false when Config#config.replaced_bad_utf8 ->
|
||||||
|
string(<<?rsolidus, Rest/binary>>, Handler, ?acc_seq(Acc, 16#fffd), Stack, Config);
|
||||||
|
false -> ?error([Bin, Handler, Acc, Stack, Config])
|
||||||
end;
|
end;
|
||||||
escape(<<$u, A, B, C, D, Rest/binary>> = Bin, Handler, Acc, Stack, Config)
|
unescape(<<$u, $d, A, B, C>> = Bin, Handler, Acc, Stack, Config)
|
||||||
|
when (A == $8 orelse A == $9 orelse A == $a orelse A == $b) andalso
|
||||||
|
?is_hex(B), ?is_hex(C)
|
||||||
|
->
|
||||||
|
?incomplete(string, <<?rsolidus, Bin/binary>>, Handler, Acc, Stack, Config);
|
||||||
|
unescape(<<$u, A, B, C, D, Rest/binary>> = Bin, Handler, Acc, Stack, Config)
|
||||||
when ?is_hex(A), ?is_hex(B), ?is_hex(C), ?is_hex(D) ->
|
when ?is_hex(A), ?is_hex(B), ?is_hex(C), ?is_hex(D) ->
|
||||||
case erlang:list_to_integer([A, B, C, D], 16) of
|
case erlang:list_to_integer([A, B, C, D], 16) of
|
||||||
Codepoint when Codepoint >= 16#d800, Codepoint =< 16#dfff ->
|
Codepoint when Codepoint < 16#d800; Codepoint > 16#dfff ->
|
||||||
case is_partial_escape(Bin) of
|
|
||||||
true ->
|
|
||||||
?incomplete(string, <<?rsolidus, Bin/binary>>, Handler, Acc, Stack, Config);
|
|
||||||
false ->
|
|
||||||
?error([Bin, Handler, Acc, Stack, Config])
|
|
||||||
end;
|
|
||||||
Codepoint when Codepoint =< 16#d7ff; Codepoint >= 16#e000 ->
|
|
||||||
string(Rest, Handler, ?acc_seq(Acc, maybe_replace(Codepoint, Config)), Stack, Config);
|
string(Rest, Handler, ?acc_seq(Acc, maybe_replace(Codepoint, Config)), Stack, Config);
|
||||||
_ when Config#config.replaced_bad_utf8 == true ->
|
_ when Config#config.replaced_bad_utf8 ->
|
||||||
string(Rest, Handler, ?acc_seq(Acc, 16#fffd), Stack, Config);
|
string(Rest, Handler, ?acc_seq(Acc, 16#fffd), Stack, Config);
|
||||||
_ ->
|
_ -> ?error([Bin, Handler, Acc, Stack, Config])
|
||||||
?error([<<$u, A, B, C, D, Rest/binary>>, Handler, Acc, Stack, Config])
|
|
||||||
end;
|
end;
|
||||||
escape(Bin, Handler, Acc, Stack, Config=#config{ignored_bad_escapes=true}) ->
|
unescape(Bin, Handler, Acc, Stack, Config=#config{ignored_bad_escapes=true}) ->
|
||||||
string(Bin, Handler, ?acc_seq(Acc, ?rsolidus), Stack, Config);
|
string(Bin, Handler, ?acc_seq(Acc, ?rsolidus), Stack, Config);
|
||||||
escape(Bin, Handler, Acc, Stack, Config) ->
|
unescape(Bin, Handler, Acc, Stack, Config) ->
|
||||||
case is_partial_escape(Bin) of
|
case is_partial_escape(Bin) of
|
||||||
true -> ?incomplete(string, <<?rsolidus/utf8, Bin/binary>>, Handler, Acc, Stack, Config);
|
true -> ?incomplete(string, <<?rsolidus/utf8, Bin/binary>>, Handler, Acc, Stack, Config);
|
||||||
false -> ?error([Bin, Handler, Acc, Stack, Config])
|
false -> ?error([Bin, Handler, Acc, Stack, Config])
|
||||||
end.
|
end.
|
||||||
|
|
||||||
|
|
||||||
is_partial_escape(<<$u, A, B, C, D, ?rsolidus, $u, W, X, Y>>) ->
|
is_partial_escape(<<$u, A, B, C>>) when ?is_hex(A), ?is_hex(B), ?is_hex(C) -> true;
|
||||||
lists:all(fun(N) when ?is_hex(N) -> true; (_) -> false end, [A, B, C, D, W, X, Y]);
|
is_partial_escape(<<$u, A, B>>) when ?is_hex(A), ?is_hex(B) -> true;
|
||||||
is_partial_escape(<<$u, A, B, C, D, ?rsolidus, $u, W, X>>) ->
|
is_partial_escape(<<$u, A>>) when ?is_hex(A) -> true;
|
||||||
lists:all(fun(N) when ?is_hex(N) -> true; (_) -> false end, [A, B, C, D, W, X]);
|
|
||||||
is_partial_escape(<<$u, A, B, C, D, ?rsolidus, $u, W>>) ->
|
|
||||||
lists:all(fun(N) when ?is_hex(N) -> true; (_) -> false end, [A, B, C, D, W]);
|
|
||||||
is_partial_escape(<<$u, A, B, C, D, ?rsolidus, $u>>) ->
|
|
||||||
lists:all(fun(N) when ?is_hex(N) -> true; (_) -> false end, [A, B, C, D]);
|
|
||||||
is_partial_escape(<<$u, A, B, C, D, ?rsolidus>>) ->
|
|
||||||
lists:all(fun(N) when ?is_hex(N) -> true; (_) -> false end, [A, B, C, D]);
|
|
||||||
is_partial_escape(<<$u, A, B, C, D>>) ->
|
|
||||||
lists:all(fun(N) when ?is_hex(N) -> true; (_) -> false end, [A, B, C, D]);
|
|
||||||
is_partial_escape(<<$u, A, B, C>>) ->
|
|
||||||
lists:all(fun(N) when ?is_hex(N) -> true; (_) -> false end, [A, B, C]);
|
|
||||||
is_partial_escape(<<$u, A, B>>) ->
|
|
||||||
lists:all(fun(N) when ?is_hex(N) -> true; (_) -> false end, [A, B]);
|
|
||||||
is_partial_escape(<<$u, A>>) ->
|
|
||||||
lists:all(fun(N) when ?is_hex(N) -> true; (_) -> false end, [A]);
|
|
||||||
is_partial_escape(<<$u>>) -> true;
|
is_partial_escape(<<$u>>) -> true;
|
||||||
is_partial_escape(<<>>) -> true;
|
is_partial_escape(<<>>) -> true;
|
||||||
is_partial_escape(_) -> false.
|
is_partial_escape(_) -> false.
|
||||||
|
@ -1598,6 +1589,22 @@ unescape_test_() ->
|
||||||
{"unescape surrogate pair", ?_assertEqual(
|
{"unescape surrogate pair", ?_assertEqual(
|
||||||
<<16#10000/utf8>>,
|
<<16#10000/utf8>>,
|
||||||
unescape(<<"\\ud800\\udc00"/utf8>>, [])
|
unescape(<<"\\ud800\\udc00"/utf8>>, [])
|
||||||
|
)},
|
||||||
|
{"unescape bad high surrogate", ?_assertEqual(
|
||||||
|
<<16#fffd/utf8>>,
|
||||||
|
unescape(<<"\\udc00"/utf8>>, [replaced_bad_utf8])
|
||||||
|
)},
|
||||||
|
{"unescape naked high surrogate", ?_assertEqual(
|
||||||
|
<<16#fffd/utf8, "hello world">>,
|
||||||
|
unescape(<<"\\ud800hello world"/utf8>>, [replaced_bad_utf8])
|
||||||
|
)},
|
||||||
|
{"unescape naked low surrogate", ?_assertEqual(
|
||||||
|
<<16#fffd/utf8, "hello world">>,
|
||||||
|
unescape(<<"\\udc00hello world"/utf8>>, [replaced_bad_utf8])
|
||||||
|
)},
|
||||||
|
{"unescape bad surrogate pair", ?_assertEqual(
|
||||||
|
<<16#fffd/utf8, 16#fffd/utf8>>,
|
||||||
|
unescape(<<"\\ud800\\u0000">>, [replaced_bad_utf8])
|
||||||
)}
|
)}
|
||||||
].
|
].
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue