Adds a loose_unicode option that replaces badly formed unicode (escaped noncharacters and restricted codepoints, as well as unpaired surrogates) with U+FFFD instead of returning a badjson error
This commit is contained in:
parent
51d27bb3b5
commit
653205501c
15 changed files with 121 additions and 10 deletions
|
@ -34,6 +34,7 @@
|
|||
|
||||
-type jsx_opts() :: [jsx_opt()].
|
||||
-type jsx_opt() :: {multi_term, true | false}
|
||||
| loose_unicode
|
||||
| {encoding, auto
|
||||
| utf8
|
||||
| utf16
|
||||
|
|
|
@ -31,6 +31,7 @@
|
|||
%% opts record for decoder
|
||||
%% decoder options, filled in by parse_opts/2 from the caller's option list
%%   multi_term    - defaults to false; set by `multi_term` (true) or
%%                   `{multi_term, Value}`
%%   loose_unicode - defaults to false; the bare atom `loose_unicode` turns it
%%                   on, and the decoder then substitutes u+fffd for escaped
%%                   codepoints it would otherwise reject with badjson
%%   encoding      - defaults to auto; the parse_opts/2 clause visible in this
%%                   module skips `{encoding, _}` without storing it
-record(opts, {
|
||||
multi_term = false,
|
||||
loose_unicode = false,
|
||||
encoding = auto
|
||||
}).
|
||||
|
||||
|
@ -81,7 +82,7 @@
|
|||
).
|
||||
|
||||
-define(is_noncontrol(Symbol),
|
||||
Symbol >= ?space
|
||||
(Symbol >= ?space)
|
||||
).
|
||||
|
||||
-define(is_whitespace(Symbol),
|
||||
|
@ -138,6 +139,8 @@ parse_opts([{multi_term, Value}|Rest], Opts) ->
|
|||
parse_opts(Rest, Opts#opts{multi_term=Value});
|
||||
parse_opts([multi_term|Rest], Opts) ->
|
||||
parse_opts(Rest, Opts#opts{multi_term=true});
|
||||
parse_opts([loose_unicode|Rest], Opts) ->
|
||||
parse_opts(Rest, Opts#opts{loose_unicode=true});
|
||||
parse_opts([{encoding, _}|Rest], Opts) ->
|
||||
parse_opts(Rest, Opts);
|
||||
parse_opts(_, _) ->
|
||||
|
@ -350,7 +353,6 @@ key(Bin, Stack, Opts) ->
|
|||
%% states
|
||||
%% string state entry point: hands off to string/4 with an empty accumulator
string(Bin, Stack, Opts) -> string(Bin, Stack, Opts, <<>>).
|
||||
|
||||
|
||||
string(<<?quote/?utfx, Rest/binary>>, [key|_] = Stack, Opts, Acc) ->
|
||||
{jsx, {key, Acc}, fun() -> colon(Rest, Stack, Opts) end};
|
||||
string(<<?quote/?utfx, Rest/binary>>, Stack, Opts, Acc) ->
|
||||
|
@ -358,8 +360,37 @@ string(<<?quote/?utfx, Rest/binary>>, Stack, Opts, Acc) ->
|
|||
maybe_done(Rest, Stack, Opts)
|
||||
end};
|
||||
string(<<?rsolidus/?utfx, Rest/binary>>, Stack, Opts, Acc) ->
|
||||
escape(Rest, Stack, Opts, Acc);
|
||||
string(<<S/?utfx, Rest/binary>>, Stack, Opts, Acc) when ?is_noncontrol(S) ->
|
||||
escape(Rest, Stack, Opts, Acc);
|
||||
%% erlang's utf matching does not reject every unicode noncharacter, so a
%% codepoint that matched still has to be range checked before it is copied
%% into the accumulator. anything refused by the three clauses below falls
%% through to the later string/4 clauses.

%% 32..16#fdcf: non-control codepoints below the first noncharacter block
string(<<S/?utfx, Rest/binary>>, Stack, Opts, Acc)
        when ?is_noncontrol(S), S < 16#fdd0 ->
    string(Rest, Stack, Opts, <<Acc/binary, S/utf8>>);
%% 16#fdf0..16#fffd: above the u+fdd0..u+fdef noncharacters, below u+fffe
string(<<S/?utfx, Rest/binary>>, Stack, Opts, Acc)
        when S > 16#fdef, S < 16#fffe ->
    string(Rest, Stack, Opts, <<Acc/binary, S/utf8>>);
%% supplementary planes: the last two codepoints of every plane (u+1fffe,
%% u+1ffff, ... u+10fffe, u+10ffff) are noncharacters. their low sixteen bits
%% are 16#fffe or 16#ffff, so one mask test excludes all of them, u+10fffe
%% included. this clause is deliberately kept after the two basic plane
%% clauses; the original author judged that ordering to be the faster one
string(<<S/?utfx, Rest/binary>>, Stack, Opts, Acc)
        when S > 16#ffff, (S band 16#fffe) =/= 16#fffe ->
    string(Rest, Stack, Opts, <<Acc/binary, S/utf8>>);
|
||||
string(Bin, Stack, Opts, Acc) ->
|
||||
case partial_utf(Bin) of
|
||||
|
@ -380,12 +411,14 @@ partial_utf(<<X, Rest/binary>>) when X >= 16#e0, X =< 16#ef ->
|
|||
case Rest of
|
||||
<<>> -> true
|
||||
; <<Y>> when Y >= 16#80, Y =< 16#bf -> true
|
||||
; _ -> false
|
||||
end;
|
||||
%% a lead byte in 16#f0..16#f4 announces a four byte sequence. the input is
%% reported as partial only when that lead byte is followed by nothing, or by
%% one or two continuation bytes (16#80..16#bf) and nothing after them
partial_utf(<<Lead>>) when Lead >= 16#f0, Lead =< 16#f4 ->
    true;
partial_utf(<<Lead, C1>>)
        when Lead >= 16#f0, Lead =< 16#f4,
             C1 >= 16#80, C1 =< 16#bf ->
    true;
partial_utf(<<Lead, C1, C2>>)
        when Lead >= 16#f0, Lead =< 16#f4,
             C1 >= 16#80, C1 =< 16#bf,
             C2 >= 16#80, C2 =< 16#bf ->
    true;
%% everything else is not a truncated sequence
partial_utf(_) ->
    false.
|
||||
-endif.
|
||||
|
@ -467,11 +500,21 @@ escaped_unicode(<<D/?utfx, Rest/binary>>, Stack, Opts, String, [C, B, A])
|
|||
low_surrogate(Rest, Stack, Opts, String, X)
|
||||
%% non-characters, you're not allowed to exchange these
|
||||
; X when X == 16#fffe; X == 16#ffff; X >= 16#fdd0, X =< 16#fdef ->
|
||||
{error, {badjson, <<D/?utfx, Rest/binary>>}}
|
||||
case Opts#opts.loose_unicode of
|
||||
true ->
|
||||
string(Rest, Stack, Opts, <<String/binary, 16#fffd/utf8>>)
|
||||
; false ->
|
||||
{error, {badjson, <<D/?utfx, Rest/binary>>}}
|
||||
end
|
||||
%% allowing interchange of null bytes allows attackers to forge
|
||||
%% malicious streams
|
||||
; X when X == 16#0000 ->
|
||||
{error, {badjson, <<D/?utfx, Rest/binary>>}}
|
||||
case Opts#opts.loose_unicode of
|
||||
true ->
|
||||
string(Rest, Stack, Opts, <<String/binary, 16#fffd/utf8>>)
|
||||
; false ->
|
||||
{error, {badjson, <<D/?utfx, Rest/binary>>}}
|
||||
end
|
||||
%% anything else
|
||||
; X ->
|
||||
string(Rest, Stack, Opts, <<String/binary, X/utf8>>)
|
||||
|
@ -498,6 +541,14 @@ escaped_unicode(Bin, Stack, Opts, String, Acc) ->
|
|||
|
||||
low_surrogate(<<?rsolidus/?utfx, Rest/binary>>, Stack, Opts, String, High) ->
|
||||
low_surrogate_u(Rest, Stack, Opts, String, High);
|
||||
%% the next codepoint is not a reverse solidus, so no escape follows and the
%% high surrogate read earlier has no partner
low_surrogate(<<_/?utfx, _/binary>> = Unparsed, Stack, Opts, String, _High) ->
    case Opts#opts.loose_unicode of
        false ->
            {error, {badjson, Unparsed}};
        true ->
            %% u+fffd stands in for the lone high surrogate; the input is
            %% handed back to string/4 untouched
            string(Unparsed, Stack, Opts, <<String/binary, 16#fffd/utf8>>)
    end;
|
||||
low_surrogate(Bin, Stack, Opts, String, High) ->
|
||||
case ?partial_codepoint(Bin) of
|
||||
true ->
|
||||
|
@ -518,6 +569,19 @@ low_surrogate(Bin, Stack, Opts, String, High) ->
|
|||
|
||||
low_surrogate_u(<<$u/?utfx, Rest/binary>>, Stack, Opts, String, High) ->
|
||||
low_surrogate(Rest, Stack, Opts, String, [], High);
|
||||
%% the escape that follows the high surrogate is not a \u sequence, so the
%% high surrogate read earlier has no partner
low_surrogate_u(<<_/?utfx, _/binary>> = Unparsed, Stack, Opts, String, _High) ->
    case Opts#opts.loose_unicode of
        false ->
            {error, {badjson, Unparsed}};
        true ->
            %% u+fffd stands in for the lone high surrogate. the reverse
            %% solidus consumed earlier is put back in front of the input so
            %% string/4 gets to handle the whole escape itself
            Replayed = <<?rsolidus/?utfx, Unparsed/binary>>,
            string(Replayed, Stack, Opts, <<String/binary, 16#fffd/utf8>>)
    end;
|
||||
low_surrogate_u(Bin, Stack, Opts, String, High) ->
|
||||
case ?partial_codepoint(Bin) of
|
||||
true ->
|
||||
|
@ -543,14 +607,32 @@ low_surrogate(<<D/?utfx, Rest/binary>>, Stack, Opts, String, [C, B, A], High)
|
|||
V = surrogate_to_codepoint(High, X),
|
||||
case V rem 16#10000 of
|
||||
Y when Y == 16#fffe; Y == 16#ffff ->
|
||||
{error, {badjson, <<D/?utfx, Rest/binary>>}}
|
||||
case Opts#opts.loose_unicode of
|
||||
true ->
|
||||
string(Rest,
|
||||
Stack,
|
||||
Opts,
|
||||
<<String/binary, 16#fffd/utf8, 16#fffd/utf8>>
|
||||
)
|
||||
; false ->
|
||||
{error, {badjson, <<D/?utfx, Rest/binary>>}}
|
||||
end
|
||||
; Y ->
|
||||
io:format("~p ~p~n", [V, Y]),
|
||||
string(Rest, Stack, Opts, <<String/binary, V/utf8>>)
|
||||
end
|
||||
%% not a low surrogate, bad bad bad
|
||||
; _ ->
|
||||
{error, {badjson, <<D/?utfx, Rest/binary>>}}
|
||||
case Opts#opts.loose_unicode of
|
||||
true ->
|
||||
string(Rest,
|
||||
Stack,
|
||||
Opts,
|
||||
<<String/binary, 16#fffd/utf8, 16#fffd/utf8>>
|
||||
)
|
||||
; false ->
|
||||
{error, {badjson, <<D/?utfx, Rest/binary>>}}
|
||||
end
|
||||
end;
|
||||
low_surrogate(<<S/?utfx, Rest/binary>>, Stack, Opts, String, Acc, High)
|
||||
when ?is_hex(S) ->
|
||||
|
|
|
@ -204,10 +204,10 @@ detect_encoding(<<X, 0, 0, 0, _Rest/binary>> = JSON, Opts) when X =/= 0 ->
|
|||
detect_encoding(<<0, 0, 0, X, _Rest/binary>> = JSON, Opts) when X =/= 0 ->
|
||||
(jsx_utf32:decoder(Opts))(JSON);
|
||||
%% utf16-little null order detection
|
||||
detect_encoding(<<X, 0, _, 0, _Rest/binary>> = JSON, Opts) when X =/= 0 ->
|
||||
detect_encoding(<<X, 0, _, _, _Rest/binary>> = JSON, Opts) when X =/= 0 ->
|
||||
(jsx_utf16le:decoder(Opts))(JSON);
|
||||
%% utf16-big null order detection
|
||||
detect_encoding(<<0, X, 0, _, _Rest/binary>> = JSON, Opts) when X =/= 0 ->
|
||||
detect_encoding(<<0, X, _, _, _Rest/binary>> = JSON, Opts) when X =/= 0 ->
|
||||
(jsx_utf16:decoder(Opts))(JSON);
|
||||
%% utf8 null order detection
|
||||
detect_encoding(<<X, Y, _Rest/binary>> = JSON, Opts) when X =/= 0, Y =/= 0 ->
|
||||
|
|
1
test/cases/escaped_noncharacter_ext_replaced.json
Normal file
1
test/cases/escaped_noncharacter_ext_replaced.json
Normal file
|
@ -0,0 +1 @@
|
|||
"\ud83f\udfff"
|
4
test/cases/escaped_noncharacter_ext_replaced.test
Normal file
4
test/cases/escaped_noncharacter_ext_replaced.test
Normal file
|
@ -0,0 +1,4 @@
|
|||
%% with loose_unicode, an escaped surrogate pair that decodes to a
%% supplementary plane noncharacter is replaced by two u+fffd codepoints
{name, "escaped noncharacter (extended)"}.
{jsx, [{string, <<16#fffd/utf8, 16#fffd/utf8>>}, end_json]}.
{json, "escaped_noncharacter_ext_replaced.json"}.
{jsx_flags, [loose_unicode]}.
|
1
test/cases/escaped_noncharacter_replaced.json
Normal file
1
test/cases/escaped_noncharacter_replaced.json
Normal file
|
@ -0,0 +1 @@
|
|||
"\uffff"
|
4
test/cases/escaped_noncharacter_replaced.test
Normal file
4
test/cases/escaped_noncharacter_replaced.test
Normal file
|
@ -0,0 +1,4 @@
|
|||
{name, "escaped noncharacter replacement"}.
|
||||
{jsx, [{string,<<16#fffd/utf8>>},end_json]}.
|
||||
{json, "escaped_noncharacter_replaced.json"}.
|
||||
{jsx_flags, [loose_unicode]}.
|
1
test/cases/escaped_nullbyte_replaced.json
Normal file
1
test/cases/escaped_nullbyte_replaced.json
Normal file
|
@ -0,0 +1 @@
|
|||
"\u0000"
|
4
test/cases/escaped_nullbyte_replaced.test
Normal file
4
test/cases/escaped_nullbyte_replaced.test
Normal file
|
@ -0,0 +1,4 @@
|
|||
{name, "escaped nullbyte replaced"}.
|
||||
{jsx, [{string,<<16#fffd/utf8>>},end_json]}.
|
||||
{json, "escaped_nullbyte_replaced.json"}.
|
||||
{jsx_flags, [loose_unicode]}.
|
1
test/cases/noncharacter.json
Normal file
1
test/cases/noncharacter.json
Normal file
|
@ -0,0 +1 @@
|
|||
""
|
3
test/cases/noncharacter.test
Normal file
3
test/cases/noncharacter.test
Normal file
|
@ -0,0 +1,3 @@
|
|||
{name, "noncharacter"}.
|
||||
{jsx, {error, badjson}}.
|
||||
{json, "noncharacter.json"}.
|
1
test/cases/unpaired_surrogate.json
Normal file
1
test/cases/unpaired_surrogate.json
Normal file
|
@ -0,0 +1 @@
|
|||
["\ud801blah"]
|
3
test/cases/unpaired_surrogate.test
Normal file
3
test/cases/unpaired_surrogate.test
Normal file
|
@ -0,0 +1,3 @@
|
|||
{name, "unpaired_surrogate"}.
|
||||
{jsx, {error, badjson}}.
|
||||
{json, "unpaired_surrogate.json"}.
|
1
test/cases/unpaired_surrogate_replaced.json
Normal file
1
test/cases/unpaired_surrogate_replaced.json
Normal file
|
@ -0,0 +1 @@
|
|||
["\ud801blah"]
|
4
test/cases/unpaired_surrogate_replaced.test
Normal file
4
test/cases/unpaired_surrogate_replaced.test
Normal file
|
@ -0,0 +1,4 @@
|
|||
{name, "unpaired surrogate replaced"}.
|
||||
{jsx, [start_array,{string,<<16#fffd/utf8, "blah">>},end_array,end_json]}.
|
||||
{json, "unpaired_surrogate_replaced.json"}.
|
||||
{jsx_flags, [loose_unicode]}.
|
Loading…
Add table
Add a link
Reference in a new issue