From 80e9381b42c525ce1588629fad05639218362aba Mon Sep 17 00:00:00 2001 From: alisdair sullivan Date: Wed, 27 Jul 2011 06:52:16 -0700 Subject: [PATCH] proper guarding and handling of noncharacters in json strings. more tests required --- src/jsx_decoder.hrl | 102 ++++++++++++++++++++++++-- test/cases/noncharacter_replaced.json | 1 + test/cases/noncharacter_replaced.test | 4 + test/cases/nullbyte_replaced.json | 1 + test/cases/nullbyte_replaced.test | 4 + 5 files changed, 107 insertions(+), 5 deletions(-) create mode 100644 test/cases/noncharacter_replaced.json create mode 100644 test/cases/noncharacter_replaced.test create mode 100644 test/cases/nullbyte_replaced.json create mode 100644 test/cases/nullbyte_replaced.test diff --git a/src/jsx_decoder.hrl b/src/jsx_decoder.hrl index d3a2fc4..2cb88cc 100644 --- a/src/jsx_decoder.hrl +++ b/src/jsx_decoder.hrl @@ -400,7 +400,11 @@ string(Bin, Stack, Opts, Acc) -> ; (Stream) -> string(<>, Stack, Opts, Acc) end} - ; false -> {error, {badjson, Bin}} + ; false -> + case Opts#opts.loose_unicode of + true -> noncharacter(Bin, Stack, Opts, Acc) + ; false -> {error, {badjson, Bin}} + end end. @@ -449,13 +453,101 @@ partial_utf(_) -> false. -endif. -ifdef(utf32). -partial_utf(<<_:32>>) -> false; -partial_utf(_) -> true. +partial_utf(<<>>) -> true; +partial_utf(<<_>>) -> true; +partial_utf(<<_, _>>) -> true; +partial_utf(<<_, _, _>>) -> true; +partial_utf(_) -> false. -endif. -ifdef(utf32le). -partial_utf(<<_:32>>) -> false; -partial_utf(_) -> true. +partial_utf(<<>>) -> true; +partial_utf(<<_>>) -> true; +partial_utf(<<_, _>>) -> true; +partial_utf(<<_, _, _>>) -> true; +partial_utf(_) -> false. +-endif. + + +-ifdef(utf8). 
+%% noncharacters erlang still decodes as valid codepoints; swap in u+fffd
+noncharacter(<<S/utf8, Rest/binary>>, Stack, Opts, Acc)
+        when ?is_noncontrol(S) ->
+    string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
+%% u+fffe and u+ffff (bytes 239, 191, 190 and 239, 191, 191)
+noncharacter(<<239, 191, X, Rest/binary>>, Stack, Opts, Acc)
+        when X == 190; X == 191 ->
+    string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
+%% surrogates (237 followed by a byte >= 160, then one continuation byte)
+noncharacter(<<237, X, _, Rest/binary>>, Stack, Opts, Acc) when X >= 160 ->
+    string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
+noncharacter(Bin, _Stack, _Opts, _Acc) ->
+    {error, {badjson, Bin}}.
+-endif.
+
+-ifdef(utf16).
+%% noncharacters erlang still decodes as valid codepoints; swap in u+fffd
+noncharacter(<<S/utf16, Rest/binary>>, Stack, Opts, Acc)
+        when ?is_noncontrol(S) ->
+    string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
+%% u+fffe and u+ffff (big endian bytes 255, 254 and 255, 255; 253 is u+fffd)
+noncharacter(<<255, X, Rest/binary>>, Stack, Opts, Acc)
+        when X == 254; X == 255 ->
+    string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
+%% surrogates (high byte 216..223 comes first in big endian)
+noncharacter(<<X, _, Rest/binary>>, Stack, Opts, Acc)
+        when X >= 216, X =< 223 ->
+    string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
+noncharacter(Bin, _Stack, _Opts, _Acc) ->
+    {error, {badjson, Bin}}.
+-endif.
+
+-ifdef(utf16le).
+noncharacter(<<S/utf16-little, Rest/binary>>, Stack, Opts, Acc)
+        when ?is_noncontrol(S) ->
+    string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
+%% u+fffe and u+ffff (little endian bytes 254, 255 and 255, 255)
+noncharacter(<<X, 255, Rest/binary>>, Stack, Opts, Acc)
+        when X == 254; X == 255 ->
+    string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
+%% surrogates (high byte 216..223 comes second in little endian)
+noncharacter(<<_, X, Rest/binary>>, Stack, Opts, Acc)
+        when X >= 216, X =< 223 ->
+    string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
+noncharacter(Bin, _Stack, _Opts, _Acc) ->
+    {error, {badjson, Bin}}.
+-endif.
+
+-ifdef(utf32).
+noncharacter(<<S/utf32, Rest/binary>>, Stack, Opts, Acc)
+        when ?is_noncontrol(S) ->
+    string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
+%% u+fffe and u+ffff (big endian bytes 0, 0, 255, 254 and 0, 0, 255, 255)
+noncharacter(<<0, 0, 255, X, Rest/binary>>, Stack, Opts, Acc)
+        when X == 254; X == 255 ->
+    string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
+%% surrogates (third byte 216..223 in big endian)
+noncharacter(<<0, 0, X, _, Rest/binary>>, Stack, Opts, Acc)
+        when X >= 216, X =< 223 ->
+    string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
+noncharacter(Bin, _Stack, _Opts, _Acc) ->
+    {error, {badjson, Bin}}.
+-endif.
+
+-ifdef(utf32le).
+noncharacter(<<S/utf32-little, Rest/binary>>, Stack, Opts, Acc)
+        when ?is_noncontrol(S) ->
+    string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
+%% u+fffe and u+ffff (little endian bytes 254, 255, 0, 0 and 255, 255, 0, 0)
+noncharacter(<<X, 255, 0, 0, Rest/binary>>, Stack, Opts, Acc)
+        when X == 254; X == 255 ->
+    string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
+%% surrogates (second byte 216..223 in little endian)
+noncharacter(<<_, X, 0, 0, Rest/binary>>, Stack, Opts, Acc)
+        when X >= 216, X =< 223 ->
+    string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
+noncharacter(Bin, _Stack, _Opts, _Acc) ->
+    {error, {badjson, Bin}}.
 -endif.
 
 
diff --git a/test/cases/noncharacter_replaced.json b/test/cases/noncharacter_replaced.json
new file mode 100644
index 0000000..09db417
--- /dev/null
+++ b/test/cases/noncharacter_replaced.json
@@ -0,0 +1 @@
+"﷐"
\ No newline at end of file
diff --git a/test/cases/noncharacter_replaced.test b/test/cases/noncharacter_replaced.test
new file mode 100644
index 0000000..0944886
--- /dev/null
+++ b/test/cases/noncharacter_replaced.test
@@ -0,0 +1,4 @@
+{name, "noncharacter replaced"}.
+{jsx, [{string,<<16#fffd/utf8>>},end_json]}.
+{json, "noncharacter_replaced.json"}.
+{jsx_flags, [loose_unicode]}.
\ No newline at end of file
diff --git a/test/cases/nullbyte_replaced.json b/test/cases/nullbyte_replaced.json
new file mode 100644
index 0000000..ed6780d
--- /dev/null
+++ b/test/cases/nullbyte_replaced.json
@@ -0,0 +1 @@
+"\u0000"
\ No newline at end of file
diff --git a/test/cases/nullbyte_replaced.test b/test/cases/nullbyte_replaced.test
new file mode 100644
index 0000000..9a909eb
--- /dev/null
+++ b/test/cases/nullbyte_replaced.test
@@ -0,0 +1,4 @@
+{name, "nullbyte replaced"}.
+{jsx, [{string,<<16#fffd/utf8>>},end_json]}.
+{json, "nullbyte_replaced.json"}.
+{jsx_flags, [loose_unicode]}.
\ No newline at end of file