From 653205501c8889fffaaaf4a20103d4266b0d6da6 Mon Sep 17 00:00:00 2001 From: alisdair sullivan Date: Wed, 27 Jul 2011 01:59:03 -0700 Subject: [PATCH 1/2] adds loose_unicode option which replaces badly formed unicode (escaped non-characters and restricted codepoints, as well as partial surrogates) with u+fffd instead of throwing an error --- src/jsx_common.hrl | 1 + src/jsx_decoder.hrl | 98 +++++++++++++++++-- src/jsx_utils.erl | 4 +- .../escaped_noncharacter_ext_replaced.json | 1 + .../escaped_noncharacter_ext_replaced.test | 4 + test/cases/escaped_noncharacter_replaced.json | 1 + test/cases/escaped_noncharacter_replaced.test | 4 + test/cases/escaped_nullbyte_replaced.json | 1 + test/cases/escaped_nullbyte_replaced.test | 4 + test/cases/noncharacter.json | 1 + test/cases/noncharacter.test | 3 + test/cases/unpaired_surrogate.json | 1 + test/cases/unpaired_surrogate.test | 3 + test/cases/unpaired_surrogate_replaced.json | 1 + test/cases/unpaired_surrogate_replaced.test | 4 + 15 files changed, 121 insertions(+), 10 deletions(-) create mode 100644 test/cases/escaped_noncharacter_ext_replaced.json create mode 100644 test/cases/escaped_noncharacter_ext_replaced.test create mode 100644 test/cases/escaped_noncharacter_replaced.json create mode 100644 test/cases/escaped_noncharacter_replaced.test create mode 100644 test/cases/escaped_nullbyte_replaced.json create mode 100644 test/cases/escaped_nullbyte_replaced.test create mode 100644 test/cases/noncharacter.json create mode 100644 test/cases/noncharacter.test create mode 100644 test/cases/unpaired_surrogate.json create mode 100644 test/cases/unpaired_surrogate.test create mode 100644 test/cases/unpaired_surrogate_replaced.json create mode 100644 test/cases/unpaired_surrogate_replaced.test diff --git a/src/jsx_common.hrl b/src/jsx_common.hrl index 6431434..9c950c9 100644 --- a/src/jsx_common.hrl +++ b/src/jsx_common.hrl @@ -34,6 +34,7 @@ -type jsx_opts() :: [jsx_opt()]. -type jsx_opt() :: {multi_term, true | false} + | loose_unicode | {encoding, auto | utf8 | utf16 diff --git a/src/jsx_decoder.hrl b/src/jsx_decoder.hrl index 6aa5b6b..d3a2fc4 100644 --- a/src/jsx_decoder.hrl +++ b/src/jsx_decoder.hrl @@ -31,6 +31,7 @@ %% opts record for decoder -record(opts, { multi_term = false, + loose_unicode = false, encoding = auto }). @@ -81,7 +82,7 @@ ). -define(is_noncontrol(Symbol), - Symbol >= ?space + (Symbol >= ?space) ). -define(is_whitespace(Symbol), @@ -138,6 +139,8 @@ parse_opts([{multi_term, Value}|Rest], Opts) -> parse_opts(Rest, Opts#opts{multi_term=Value}); parse_opts([multi_term|Rest], Opts) -> parse_opts(Rest, Opts#opts{multi_term=true}); +parse_opts([loose_unicode|Rest], Opts) -> + parse_opts(Rest, Opts#opts{loose_unicode=true}); parse_opts([{encoding, _}|Rest], Opts) -> parse_opts(Rest, Opts); parse_opts(_, _) -> @@ -350,7 +353,6 @@ key(Bin, Stack, Opts) -> %% states string(Bin, Stack, Opts) -> string(Bin, Stack, Opts, <<>>). - string(<>, [key|_] = Stack, Opts, Acc) -> {jsx, {key, Acc}, fun() -> colon(Rest, Stack, Opts) end}; string(<>, Stack, Opts, Acc) -> @@ -358,8 +360,37 @@ string(<>, Stack, Opts, Acc) -> maybe_done(Rest, Stack, Opts) end}; string(<>, Stack, Opts, Acc) -> - escape(Rest, Stack, Opts, Acc); -string(<>, Stack, Opts, Acc) when ?is_noncontrol(S) -> + escape(Rest, Stack, Opts, Acc); +%% things get dumb here. erlang doesn't properly restrict unicode non-characters +%% so you can't trust the codepoints it returns always +%% the range 32..16#fdcf is safe, so allow that +string(<>, Stack, Opts, Acc) + when ?is_noncontrol(S), S < 16#fdd0 -> + string(Rest, Stack, Opts, <>); +%% the range 16#fdf0..16#fffd is also safe +string(<>, Stack, Opts, Acc) + when S > 16#fdef, S < 16#fffe -> + string(Rest, Stack, Opts, <>); +%% i think doing it like this is faster than just putting this clause first. +%% yes, i think it's insane too +string(<>, Stack, Opts, Acc) + when S > 16#ffff andalso + S =/= 16#1fffe andalso S =/= 16#1ffff andalso + S =/= 16#2fffe andalso S =/= 16#2ffff andalso + S =/= 16#3fffe andalso S =/= 16#3ffff andalso + S =/= 16#4fffe andalso S =/= 16#4ffff andalso + S =/= 16#5fffe andalso S =/= 16#5ffff andalso + S =/= 16#6fffe andalso S =/= 16#6ffff andalso + S =/= 16#7fffe andalso S =/= 16#7ffff andalso + S =/= 16#8fffe andalso S =/= 16#8ffff andalso + S =/= 16#9fffe andalso S =/= 16#9ffff andalso + S =/= 16#afffe andalso S =/= 16#affff andalso + S =/= 16#bfffe andalso S =/= 16#bffff andalso + S =/= 16#cfffe andalso S =/= 16#cffff andalso + S =/= 16#dfffe andalso S =/= 16#dffff andalso + S =/= 16#efffe andalso S =/= 16#effff andalso + S =/= 16#ffffe andalso S =/= 16#fffff andalso + S =/= 16#101fffe andalso S =/= 16#10ffff -> string(Rest, Stack, Opts, <>); string(Bin, Stack, Opts, Acc) -> case partial_utf(Bin) of @@ -380,12 +411,14 @@ partial_utf(<>) when X >= 16#e0, X =< 16#ef -> case Rest of <<>> -> true ; <> when Y >= 16#80, Y =< 16#bf -> true + ; _ -> false end; partial_utf(<>) when X >= 16#f0, X =< 16#f4 -> case Rest of <<>> -> true ; <> when Y >= 16#80, Y =< 16#bf -> true ; <> when Y >= 16#80, Y =< 16#bf, Z >= 16#80, Z =< 16#bf -> true + ; _ -> false end; partial_utf(_) -> false. -endif. @@ -467,11 +500,21 @@ escaped_unicode(<>, Stack, Opts, String, [C, B, A]) low_surrogate(Rest, Stack, Opts, String, X) %% non-characters, you're not allowed to exchange these ; X when X == 16#fffe; X == 16#ffff; X >= 16#fdd0, X =< 16#fdef -> - {error, {badjson, <>}} + case Opts#opts.loose_unicode of + true -> + string(Rest, Stack, Opts, <>) + ; false -> + {error, {badjson, <>}} + end %% allowing interchange of null bytes allows attackers to forge %% malicious streams ; X when X == 16#0000 -> - {error, {badjson, <>}} + case Opts#opts.loose_unicode of + true -> + string(Rest, Stack, Opts, <>) + ; false -> + {error, {badjson, <>}} + end %% anything else ; X -> string(Rest, Stack, Opts, <>) @@ -498,6 +541,14 @@ escaped_unicode(Bin, Stack, Opts, String, Acc) -> low_surrogate(<>, Stack, Opts, String, High) -> low_surrogate_u(Rest, Stack, Opts, String, High); +%% not an escaped codepoint, our high codepoint is illegal +low_surrogate(<> = Bin, Stack, Opts, String, _) -> + case Opts#opts.loose_unicode of + true -> + string(Bin, Stack, Opts, <>) + ; false -> + {error, {badjson, <>}} + end; low_surrogate(Bin, Stack, Opts, String, High) -> case ?partial_codepoint(Bin) of true -> @@ -518,6 +569,19 @@ low_surrogate(Bin, Stack, Opts, String, High) -> low_surrogate_u(<<$u/?utfx, Rest/binary>>, Stack, Opts, String, High) -> low_surrogate(Rest, Stack, Opts, String, [], High); +%% not a low surrogate, dispatch back to string to handle, including the +%% rsolidus we parsed previously +low_surrogate_u(<> = Bin, Stack, Opts, String, _) -> + case Opts#opts.loose_unicode of + true -> + string(<>, + Stack, + Opts, + <> + ) + ; false -> + {error, {badjson, <>}} + end; low_surrogate_u(Bin, Stack, Opts, String, High) -> case ?partial_codepoint(Bin) of true -> @@ -543,14 +607,32 @@ low_surrogate(<>, Stack, Opts, String, [C, B, A], High) V = surrogate_to_codepoint(High, X), case V rem 16#10000 of Y when Y == 16#fffe; Y == 16#ffff -> - {error, {badjson, <>}} + case Opts#opts.loose_unicode of + true -> + string(Rest, + Stack, + Opts, + <> + ) + ; false -> + {error, {badjson, <>}} + end ; Y -> io:format("~p ~p~n", [V, Y]), string(Rest, Stack, Opts, <>) end %% not a low surrogate, bad bad bad ; _ -> - {error, {badjson, <>}} + case Opts#opts.loose_unicode of + true -> + string(Rest, + Stack, + Opts, + <> + ) + ; false -> + {error, {badjson, <>}} + end end; low_surrogate(<>, Stack, Opts, String, Acc, High) when ?is_hex(S) -> diff --git a/src/jsx_utils.erl b/src/jsx_utils.erl index 3a9ec1f..bb06334 100644 --- a/src/jsx_utils.erl +++ b/src/jsx_utils.erl @@ -204,10 +204,10 @@ detect_encoding(<> = JSON, Opts) when X =/= 0 -> detect_encoding(<<0, 0, 0, X, _Rest/binary>> = JSON, Opts) when X =/= 0 -> (jsx_utf32:decoder(Opts))(JSON); %% utf16-little null order detection -detect_encoding(<> = JSON, Opts) when X =/= 0 -> +detect_encoding(<> = JSON, Opts) when X =/= 0 -> (jsx_utf16le:decoder(Opts))(JSON); %% utf16-big null order detection -detect_encoding(<<0, X, 0, _, _Rest/binary>> = JSON, Opts) when X =/= 0 -> +detect_encoding(<<0, X, _, _, _Rest/binary>> = JSON, Opts) when X =/= 0 -> (jsx_utf16:decoder(Opts))(JSON); %% utf8 null order detection detect_encoding(<> = JSON, Opts) when X =/= 0, Y =/= 0 -> diff --git a/test/cases/escaped_noncharacter_ext_replaced.json b/test/cases/escaped_noncharacter_ext_replaced.json new file mode 100644 index 0000000..f10ec2b --- /dev/null +++ b/test/cases/escaped_noncharacter_ext_replaced.json @@ -0,0 +1 @@ +"\ud83f\udfff" \ No newline at end of file diff --git a/test/cases/escaped_noncharacter_ext_replaced.test b/test/cases/escaped_noncharacter_ext_replaced.test new file mode 100644 index 0000000..c2741f7 --- /dev/null +++ b/test/cases/escaped_noncharacter_ext_replaced.test @@ -0,0 +1,4 @@ +{name, "escaped noncharacter (extended)"}. +{jsx, [{string, <<16#fffd/utf8, 16#fffd/utf8>>}, end_json]}. +{json, "escaped_noncharacter_ext.json"}. +{jsx_flags, [loose_unicode]}. \ No newline at end of file diff --git a/test/cases/escaped_noncharacter_replaced.json b/test/cases/escaped_noncharacter_replaced.json new file mode 100644 index 0000000..e5c1b65 --- /dev/null +++ b/test/cases/escaped_noncharacter_replaced.json @@ -0,0 +1 @@ +"\uffff" \ No newline at end of file diff --git a/test/cases/escaped_noncharacter_replaced.test b/test/cases/escaped_noncharacter_replaced.test new file mode 100644 index 0000000..9c5faac --- /dev/null +++ b/test/cases/escaped_noncharacter_replaced.test @@ -0,0 +1,4 @@ +{name, "escaped noncharacter replacement"}. +{jsx, [{string,<<16#fffd/utf8>>},end_json]}. +{json, "escaped_noncharacter_replaced.json"}. +{jsx_flags, [loose_unicode]}. \ No newline at end of file diff --git a/test/cases/escaped_nullbyte_replaced.json b/test/cases/escaped_nullbyte_replaced.json new file mode 100644 index 0000000..ed6780d --- /dev/null +++ b/test/cases/escaped_nullbyte_replaced.json @@ -0,0 +1 @@ +"\u0000" \ No newline at end of file diff --git a/test/cases/escaped_nullbyte_replaced.test b/test/cases/escaped_nullbyte_replaced.test new file mode 100644 index 0000000..785acc8 --- /dev/null +++ b/test/cases/escaped_nullbyte_replaced.test @@ -0,0 +1,4 @@ +{name, "escaped nullbyte replaced"}. +{jsx, [{string,<<16#fffd/utf8>>},end_json]}. +{json, "escaped_nullbyte_replaced.json"}. +{jsx_flags, [loose_unicode]}. \ No newline at end of file diff --git a/test/cases/noncharacter.json b/test/cases/noncharacter.json new file mode 100644 index 0000000..09db417 --- /dev/null +++ b/test/cases/noncharacter.json @@ -0,0 +1 @@ +"﷐" \ No newline at end of file diff --git a/test/cases/noncharacter.test b/test/cases/noncharacter.test new file mode 100644 index 0000000..6b3732c --- /dev/null +++ b/test/cases/noncharacter.test @@ -0,0 +1,3 @@ +{name, "noncharacter"}. +{jsx, {error, badjson}}. +{json, "noncharacter.json"}. \ No newline at end of file diff --git a/test/cases/unpaired_surrogate.json b/test/cases/unpaired_surrogate.json new file mode 100644 index 0000000..32497a8 --- /dev/null +++ b/test/cases/unpaired_surrogate.json @@ -0,0 +1 @@ +["\ud801blah"] \ No newline at end of file diff --git a/test/cases/unpaired_surrogate.test b/test/cases/unpaired_surrogate.test new file mode 100644 index 0000000..e2da5c1 --- /dev/null +++ b/test/cases/unpaired_surrogate.test @@ -0,0 +1,3 @@ +{name, "unpaired_surrogate"}. +{jsx, {error, badjson}}. +{json, "unpaired_surrogate.json"}. diff --git a/test/cases/unpaired_surrogate_replaced.json b/test/cases/unpaired_surrogate_replaced.json new file mode 100644 index 0000000..32497a8 --- /dev/null +++ b/test/cases/unpaired_surrogate_replaced.json @@ -0,0 +1 @@ +["\ud801blah"] \ No newline at end of file diff --git a/test/cases/unpaired_surrogate_replaced.test b/test/cases/unpaired_surrogate_replaced.test new file mode 100644 index 0000000..7269bc2 --- /dev/null +++ b/test/cases/unpaired_surrogate_replaced.test @@ -0,0 +1,4 @@ +{name, "unpaired surrogate replaced"}. +{jsx, [start_array,{string,<<16#fffd/utf8, "blah">>},end_array,end_json]}. +{json, "unpaired_surrogate_replaced.json"}. +{jsx_flags, [loose_unicode]}. From 80e9381b42c525ce1588629fad05639218362aba Mon Sep 17 00:00:00 2001 From: alisdair sullivan Date: Wed, 27 Jul 2011 06:52:16 -0700 Subject: [PATCH 2/2] proper guarding and handling of noncharacters in json strings. more tests required --- src/jsx_decoder.hrl | 102 ++++++++++++++++++++++++-- test/cases/noncharacter_replaced.json | 1 + test/cases/noncharacter_replaced.test | 4 + test/cases/nullbyte_replaced.json | 1 + test/cases/nullbyte_replaced.test | 4 + 5 files changed, 107 insertions(+), 5 deletions(-) create mode 100644 test/cases/noncharacter_replaced.json create mode 100644 test/cases/noncharacter_replaced.test create mode 100644 test/cases/nullbyte_replaced.json create mode 100644 test/cases/nullbyte_replaced.test diff --git a/src/jsx_decoder.hrl b/src/jsx_decoder.hrl index d3a2fc4..2cb88cc 100644 --- a/src/jsx_decoder.hrl +++ b/src/jsx_decoder.hrl @@ -400,7 +400,11 @@ string(Bin, Stack, Opts, Acc) -> ; (Stream) -> string(<>, Stack, Opts, Acc) end} - ; false -> {error, {badjson, Bin}} + ; false -> + case Opts#opts.loose_unicode of + true -> noncharacter(Bin, Stack, Opts, Acc) + ; false -> {error, {badjson, Bin}} + end end. @@ -449,13 +453,101 @@ partial_utf(_) -> false. -endif. -ifdef(utf32). -partial_utf(<<_:32>>) -> false; -partial_utf(_) -> true. +partial_utf(<<>>) -> true; +partial_utf(<<_>>) -> true; +partial_utf(<<_, _>>) -> true; +partial_utf(<<_, _, _>>) -> true; +partial_utf(_) -> false. -endif. -ifdef(utf32le). -partial_utf(<<_:32>>) -> false; -partial_utf(_) -> true. +partial_utf(<<>>) -> true; +partial_utf(<<_>>) -> true; +partial_utf(<<_, _>>) -> true; +partial_utf(<<_, _, _>>) -> true; +partial_utf(_) -> false. +-endif. + + +-ifdef(utf8). +%% non-characters erlang doesn't recognize as non-characters, idiotically +noncharacter(<>, Stack, Opts, Acc) + when ?is_noncontrol(S) -> + string(Rest, Stack, Opts, <>); +%% u+fffe and u+ffff +noncharacter(<<239, 191, X, Rest/binary>>, Stack, Opts, Acc) + when X == 190; X == 191 -> + string(Rest, Stack, Opts, <>); +%% surrogates +noncharacter(<<237, X, _, Rest/binary>>, Stack, Opts, Acc) when X >= 160 -> + string(Rest, Stack, Opts, <>); +noncharacter(Bin, _Stack, _Opts, _Acc) -> + {error, {badjson, Bin}}. +-endif. + +-ifdef(utf16). +%% non-characters blah blah +noncharacter(<>, Stack, Opts, Acc) + when ?is_noncontrol(S) -> + string(Rest, Stack, Opts, <>); +%% u+ffff and u+fffe +noncharacter(<<255, X, Rest/binary>>, Stack, Opts, Acc) + when X == 253; X == 254 -> + string(Rest, Stack, Opts, <>); +%% surrogates +noncharacter(<>, Stack, Opts, Acc) + when X >= 216, X =< 223 -> + string(Rest, Stack, Opts, <>); +noncharacter(Bin, _Stack, _Opts, _Acc) -> + {error, {badjson, Bin}}. +-endif. + +-ifdef(utf16le). +noncharacter(<>, Stack, Opts, Acc) + when ?is_noncontrol(S) -> + string(Rest, Stack, Opts, <>); +%% u+ffff and u+fffe +noncharacter(<>, Stack, Opts, Acc) + when X == 253; X == 254 -> + string(Rest, Stack, Opts, <>); +%% surrogates +noncharacter(<<_, X, Rest/binary>>, Stack, Opts, Acc) + when X >= 216, X =< 223 -> + string(Rest, Stack, Opts, <>); +noncharacter(Bin, _Stack, _Opts, _Acc) -> + {error, {badjson, Bin}}. +-endif. + +-ifdef(utf32). +noncharacter(<>, Stack, Opts, Acc) + when ?is_noncontrol(S) -> + string(Rest, Stack, Opts, <>); +%% u+ffff and u+fffe +noncharacter(<<0, 0, 255, X, Rest/binary>>, Stack, Opts, Acc) + when X == 254; X == 255 -> + string(Rest, Stack, Opts, <>); +%% surrogates +noncharacter(<<0, 0, X, _, Rest/binary>>, Stack, Opts, Acc) + when X >= 216, X =< 223 -> + string(Rest, Stack, Opts, <>); +noncharacter(Bin, _Stack, _Opts, _Acc) -> + {error, {badjson, Bin}}. +-endif. + +-ifdef(utf32le). +noncharacter(<>, Stack, Opts, Acc) + when ?is_noncontrol(S) -> + string(Rest, Stack, Opts, <>); +%% u+ffff and u+fffe +noncharacter(<>, Stack, Opts, Acc) + when X == 254; X == 255 -> + string(Rest, Stack, Opts, <>); +%% surrogates +noncharacter(<<_, X, 0, 0, Rest/binary>>, Stack, Opts, Acc) + when X >= 216, X =< 223 -> + string(Rest, Stack, Opts, <>); +noncharacter(Bin, _Stack, _Opts, _Acc) -> + {error, {badjson, Bin}}. -endif. diff --git a/test/cases/noncharacter_replaced.json b/test/cases/noncharacter_replaced.json new file mode 100644 index 0000000..09db417 --- /dev/null +++ b/test/cases/noncharacter_replaced.json @@ -0,0 +1 @@ +"﷐" \ No newline at end of file diff --git a/test/cases/noncharacter_replaced.test b/test/cases/noncharacter_replaced.test new file mode 100644 index 0000000..0944886 --- /dev/null +++ b/test/cases/noncharacter_replaced.test @@ -0,0 +1,4 @@ +{name, "noncharacter replaced"}. +{jsx, [{string,<<16#fffd/utf8>>},end_json]}. +{json, "noncharacter_replaced.json"}. +{jsx_flags, [loose_unicode]}. \ No newline at end of file diff --git a/test/cases/nullbyte_replaced.json b/test/cases/nullbyte_replaced.json new file mode 100644 index 0000000..ed6780d --- /dev/null +++ b/test/cases/nullbyte_replaced.json @@ -0,0 +1 @@ +"\u0000" \ No newline at end of file diff --git a/test/cases/nullbyte_replaced.test b/test/cases/nullbyte_replaced.test new file mode 100644 index 0000000..9a909eb --- /dev/null +++ b/test/cases/nullbyte_replaced.test @@ -0,0 +1,4 @@ +{name, "nullbyte replaced"}. +{jsx, [{string,<<16#fffd/utf8>>},end_json]}. +{json, "nullbyte_replaced.json"}. +{jsx_flags, [loose_unicode]}. \ No newline at end of file