From 9528216d154da25a0804353b94041cf8950a4e5b Mon Sep 17 00:00:00 2001 From: alisdair sullivan Date: Mon, 1 Dec 2014 09:20:17 +0000 Subject: [PATCH] allow unicode noncharacters and reserved space characters in json strings fixes #67 --- .travis.yml | 4 +- README.md | 10 ++-- src/jsx_decoder.erl | 93 ++++++------------------------------- src/jsx_encoder.erl | 4 +- src/jsx_parser.erl | 110 +++----------------------------------------- 5 files changed, 30 insertions(+), 191 deletions(-) diff --git a/.travis.yml b/.travis.yml index 5bd4215..3662941 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,7 @@ language: erlang script: rebar compile && rebar skip_deps=true eunit otp_release: + - 17.3 - 17.1 - 17.0 - R16B03-1 @@ -11,6 +12,3 @@ otp_release: - R15B02 - R15B01 - R15B - - R14B04 - - R14B03 - - R14B02 diff --git a/README.md b/README.md index 91cb3e9..bc7baad 100644 --- a/README.md +++ b/README.md @@ -223,9 +223,9 @@ see below | `datetime()` encountered otherwise are replaced with the replacement codepoint (`u+fffd`) all erlang strings are represented by **valid** `utf8` encoded binaries. the - encoder will check strings for conformance. noncharacters (like `u+ffff`) - are allowed in erlang utf8 encoded binaries, but will be replaced in strings - passed to the encoder (although, again, see [options](#option)) + encoder will check strings for conformance. badly formed `utf8` sequences may + be replaced with the replacement codepoint (`u+fffd`) according to the unicode + spec this implementation performs no normalization on strings beyond that detailed here. be careful when comparing strings as equivalent strings @@ -244,7 +244,8 @@ see below | `datetime()` * objects json objects are represented by erlang proplists. json maps may also be - encoded to json but the decoder will not produce maps + encoded to json and optionally decoded to maps (via the `return_maps` + option) the empty object has the special representation `[{}]` to differentiate it from the empty list. 
ambiguities like `[true, false]` prevent the use of @@ -349,6 +350,7 @@ option() = dirty_strings | stream | strict | {strict, [strict_option()]} + | uescape | unescaped_jsonp strict_option() = comments diff --git a/src/jsx_decoder.erl b/src/jsx_decoder.erl index db84f56..a006a0e 100644 --- a/src/jsx_decoder.erl +++ b/src/jsx_decoder.erl @@ -505,73 +505,29 @@ string(<<127, Rest/binary>>, Handler, Acc, Stack, Config) -> string(Rest, Handler, acc_seq(Acc, 127), Stack, Config); string(<>, Handler, Acc, Stack, Config=#config{dirty_strings=true}) -> string(Rest, Handler, acc_seq(Acc, C), Stack, Config); -string(<>, Handler, Acc, Stack, Config) when X >= 16#20, X < 16#2028 -> - string(Rest, Handler, acc_seq(Acc, X), Stack, Config); string(<>, Handler, Acc, Stack, Config) when X == 16#2028; X == 16#2029 -> string(Rest, Handler, acc_seq(Acc, maybe_replace(X, Config)), Stack, Config); -string(<>, Handler, Acc, Stack, Config) when X > 16#2029, X < 16#d800 -> +string(<>, Handler, Acc, Stack, Config) when X >= 16#80 -> string(Rest, Handler, acc_seq(Acc, X), Stack, Config); -string(<>, Handler, Acc, Stack, Config) when X > 16#dfff, X < 16#fdd0 -> - string(Rest, Handler, acc_seq(Acc, X), Stack, Config); -string(<>, Handler, Acc, Stack, Config) when X > 16#fdef, X < 16#fffe -> - string(Rest, Handler, acc_seq(Acc, X), Stack, Config); -string(<>, Handler, Acc, Stack, Config) when X >= 16#10000, X < 16#1fffe -> - string(Rest, Handler, acc_seq(Acc, X), Stack, Config); -string(<>, Handler, Acc, Stack, Config) when X >= 16#20000, X < 16#2fffe -> - string(Rest, Handler, acc_seq(Acc, X), Stack, Config); -string(<>, Handler, Acc, Stack, Config) when X >= 16#30000, X < 16#3fffe -> - string(Rest, Handler, acc_seq(Acc, X), Stack, Config); -string(<>, Handler, Acc, Stack, Config) when X >= 16#40000, X < 16#4fffe -> - string(Rest, Handler, acc_seq(Acc, X), Stack, Config); -string(<>, Handler, Acc, Stack, Config) when X >= 16#50000, X < 16#5fffe -> - string(Rest, Handler, acc_seq(Acc, X), Stack, 
Config); -string(<>, Handler, Acc, Stack, Config) when X >= 16#60000, X < 16#6fffe -> - string(Rest, Handler, acc_seq(Acc, X), Stack, Config); -string(<>, Handler, Acc, Stack, Config) when X >= 16#70000, X < 16#7fffe -> - string(Rest, Handler, acc_seq(Acc, X), Stack, Config); -string(<>, Handler, Acc, Stack, Config) when X >= 16#80000, X < 16#8fffe -> - string(Rest, Handler, acc_seq(Acc, X), Stack, Config); -string(<>, Handler, Acc, Stack, Config) when X >= 16#90000, X < 16#9fffe -> - string(Rest, Handler, acc_seq(Acc, X), Stack, Config); -string(<>, Handler, Acc, Stack, Config) when X >= 16#a0000, X < 16#afffe -> - string(Rest, Handler, acc_seq(Acc, X), Stack, Config); -string(<>, Handler, Acc, Stack, Config) when X >= 16#b0000, X < 16#bfffe -> - string(Rest, Handler, acc_seq(Acc, X), Stack, Config); -string(<>, Handler, Acc, Stack, Config) when X >= 16#c0000, X < 16#cfffe -> - string(Rest, Handler, acc_seq(Acc, X), Stack, Config); -string(<>, Handler, Acc, Stack, Config) when X >= 16#d0000, X < 16#dfffe -> - string(Rest, Handler, acc_seq(Acc, X), Stack, Config); -string(<>, Handler, Acc, Stack, Config) when X >= 16#e0000, X < 16#efffe -> - string(Rest, Handler, acc_seq(Acc, X), Stack, Config); -string(<>, Handler, Acc, Stack, Config) when X >= 16#f0000, X < 16#ffffe -> - string(Rest, Handler, acc_seq(Acc, X), Stack, Config); -string(<>, Handler, Acc, Stack, Config) when X >= 16#100000, X < 16#10fffe -> - string(Rest, Handler, acc_seq(Acc, X), Stack, Config); -%% partial utf8 codepoints. 
check that input could possibly be valid before attempting -%% to correct +%% u+fffe and u+ffff for R14BXX (subsequent runtimes happily match the preceding clause) +string(<<239, 191, 190, Rest/binary>>, Handler, Acc, Stack, Config) -> + string(Rest, Handler, acc_seq(Acc, 16#fffe), Stack, Config); +string(<<239, 191, 191, Rest/binary>>, Handler, Acc, Stack, Config) -> + string(Rest, Handler, acc_seq(Acc, 16#ffff), Stack, Config); +%% partial utf8 codepoints string(<<>>, Handler, Acc, Stack, Config) -> incomplete(string, <<>>, Handler, Acc, Stack, Config); -string(<>, Handler, Acc, Stack, Config) when X >= 16#c2, X =< 16#f4 -> +string(<>, Handler, Acc, Stack, Config) when X >= 2#11000000 -> incomplete(string, <>, Handler, Acc, Stack, Config); -string(<>, Handler, Acc, Stack, Config) when X >= 16#e0, X =< 16#f4, Y >= 16#80, Y =< 16#bf -> +string(<>, Handler, Acc, Stack, Config) when X >= 2#11100000, Y >= 2#10000000 -> incomplete(string, <>, Handler, Acc, Stack, Config); string(<>, Handler, Acc, Stack, Config) - when X >= 16#f0, X =< 16#f4, - Y >= 16#80, Y =< 16#bf, - Z >= 16#80, Z =< 16#bf -> + when X >= 2#11100000, Y >= 2#10000000, Z >= 2#10000000 -> incomplete(string, <>, Handler, Acc, Stack, Config); %% surrogates string(<<237, X, _, Rest/binary>>, Handler, Acc, Stack, Config=#config{strict_utf8=false}) when X >= 160 -> string(Rest, Handler, acc_seq(Acc, 16#fffd), Stack, Config); -%% u+xfffe, u+xffff, control codes and other noncharacters -string(<<_/utf8, Rest/binary>>, Handler, Acc, Stack, Config=#config{strict_utf8=false}) -> - string(Rest, Handler, acc_seq(Acc, 16#fffd), Stack, Config); -%% u+fffe and u+ffff for R14BXX (subsequent runtimes will happily match the -%% preceeding clause -string(<<239, 191, X, Rest/binary>>, Handler, Acc, Stack, Config=#config{strict_utf8=false}) - when X == 190; X == 191 -> - string(Rest, Handler, acc_seq(Acc, 16#fffd), Stack, Config); %% overlong encodings and missing continuations of a 2 byte sequence string(<>, Handler, Acc, Stack, 
Config=#config{strict_utf8=false}) when X >= 192, X =< 223 -> @@ -1268,27 +1224,16 @@ codepoints() -> lists:seq(35, 46) ++ lists:seq(48, 91) ++ lists:seq(93, 127) ++ - [16#2027, 16#202a, 16#d7ff, 16#e000, 16#fdcf, 16#fdf0, 16#fffd] ++ - [16#10000, 16#1fffd, 16#20000, 16#30000, 16#40000, 16#50000] ++ + [16#2027, 16#202a, 16#d7ff, 16#e000] ++ + lists:seq(16#fdd0, 16#ffff) ++ + [16#10000, 16#20000, 16#30000, 16#40000, 16#50000] ++ [16#60000, 16#70000, 16#80000, 16#90000, 16#a0000, 16#b0000] ++ [16#c0000, 16#d0000, 16#e0000, 16#f0000, 16#100000]. -reserved_space() -> lists:seq(16#fdd0, 16#fdef). +controls() -> lists:seq(0, 31). surrogates() -> lists:seq(16#d800, 16#dfff). -noncharacters() -> lists:seq(16#fffe, 16#ffff). - -extended_noncharacters() -> - [16#1fffe, 16#1ffff, 16#2fffe, 16#2ffff] ++ - [16#3fffe, 16#3ffff, 16#4fffe, 16#4ffff] ++ - [16#5fffe, 16#5ffff, 16#6fffe, 16#6ffff] ++ - [16#7fffe, 16#7ffff, 16#8fffe, 16#8ffff] ++ - [16#9fffe, 16#9ffff, 16#afffe, 16#affff] ++ - [16#bfffe, 16#bffff, 16#cfffe, 16#cffff] ++ - [16#dfffe, 16#dffff, 16#efffe, 16#effff] ++ - [16#ffffe, 16#fffff, 16#10fffe, 16#10ffff]. 
- %% erlang refuses to decode certain codepoints, so fake them all to_fake_utf8(N) when N < 16#0080 -> <<34/utf8, N:8, 34/utf8>>; @@ -1305,7 +1250,7 @@ to_fake_utf8(N) -> clean_string_test_() -> Clean = codepoints(), - Dirty = reserved_space() ++ surrogates() ++ noncharacters() ++ extended_noncharacters(), + Dirty = surrogates() ++ controls(), % clean codepoints [{"clean u+" ++ integer_to_list(Codepoint, 16), ?_assertEqual( [{string, <>}, end_json], @@ -1363,11 +1308,6 @@ dirty_string_test_() -> <<"[\"", 237, 160, 128, "\"]">>, [dirty_strings] }, - {"dirty 16#10ffff", - [start_array, {string, <<244, 143, 191, 191>>}, end_array, end_json], - <<"[\"", 244, 143, 191, 191, "\"]">>, - [dirty_strings] - }, {"dirty /", [start_array, {string, <<$/>>}, end_array, end_json], <<"[\"", $/, "\"]">>, @@ -1393,8 +1333,6 @@ dirty_string_test_() -> bad_utf8_test_() -> Cases = [ - {"noncharacter u+fffe", <<16#fffd/utf8>>, <<239, 191, 190>>}, - {"noncharacter u+ffff", <<16#fffd/utf8>>, <<239, 191, 191>>}, {"orphan continuation byte u+0080", <<16#fffd/utf8>>, <<16#0080>>}, {"orphan continuation byte u+00bf", <<16#fffd/utf8>>, <<16#00bf>>}, {"2 continuation bytes", @@ -1610,7 +1548,6 @@ embedded_single_quoted_string_test_() -> decode(<<34, "quoth the raven, 'nevermore'", 34>>, [{strict, [single_quotes]}]) )} ]. - ignored_bad_escapes_test_() -> diff --git a/src/jsx_encoder.erl b/src/jsx_encoder.erl index c631efd..d39d49a 100644 --- a/src/jsx_encoder.erl +++ b/src/jsx_encoder.erl @@ -103,8 +103,8 @@ custom_error_handler_test_() -> parser(self(), [{error_handler, Error}]) )}, {"string error", ?_assertEqual( - {value, [{string, <<239, 191, 191>>}]}, - parser(<<239, 191, 191>>, [{error_handler, Error}, strict]) + {value, [{string, <<237, 160, 128>>}]}, + parser(<<237, 160, 128>>, [{error_handler, Error}, strict]) )} ]. 
diff --git a/src/jsx_parser.erl b/src/jsx_parser.erl index c2969d3..d33d22a 100644 --- a/src/jsx_parser.erl +++ b/src/jsx_parser.erl @@ -355,53 +355,11 @@ clean(<>, Acc, Config=#config{uescape=true}) -> maybe_replace(X, Rest, Acc, Config); clean(<>, Acc, Config) when X == 16#2028; X == 16#2029 -> maybe_replace(X, Rest, Acc, Config); -clean(<>, Acc, Config) when X < 16#d800 -> - clean(Rest, [X] ++ Acc, Config); -clean(<>, Acc, Config) when X > 16#dfff, X < 16#fdd0 -> - clean(Rest, [X] ++ Acc, Config); -clean(<>, Acc, Config) when X > 16#fdef, X < 16#fffe -> - clean(Rest, [X] ++ Acc, Config); -clean(<>, Acc, Config) when X >= 16#10000, X < 16#1fffe -> - clean(Rest, [X] ++ Acc, Config); -clean(<>, Acc, Config) when X >= 16#20000, X < 16#2fffe -> - clean(Rest, [X] ++ Acc, Config); -clean(<>, Acc, Config) when X >= 16#30000, X < 16#3fffe -> - clean(Rest, [X] ++ Acc, Config); -clean(<>, Acc, Config) when X >= 16#40000, X < 16#4fffe -> - clean(Rest, [X] ++ Acc, Config); -clean(<>, Acc, Config) when X >= 16#50000, X < 16#5fffe -> - clean(Rest, [X] ++ Acc, Config); -clean(<>, Acc, Config) when X >= 16#60000, X < 16#6fffe -> - clean(Rest, [X] ++ Acc, Config); -clean(<>, Acc, Config) when X >= 16#70000, X < 16#7fffe -> - clean(Rest, [X] ++ Acc, Config); -clean(<>, Acc, Config) when X >= 16#80000, X < 16#8fffe -> - clean(Rest, [X] ++ Acc, Config); -clean(<>, Acc, Config) when X >= 16#90000, X < 16#9fffe -> - clean(Rest, [X] ++ Acc, Config); -clean(<>, Acc, Config) when X >= 16#a0000, X < 16#afffe -> - clean(Rest, [X] ++ Acc, Config); -clean(<>, Acc, Config) when X >= 16#b0000, X < 16#bfffe -> - clean(Rest, [X] ++ Acc, Config); -clean(<>, Acc, Config) when X >= 16#c0000, X < 16#cfffe -> - clean(Rest, [X] ++ Acc, Config); -clean(<>, Acc, Config) when X >= 16#d0000, X < 16#dfffe -> - clean(Rest, [X] ++ Acc, Config); -clean(<>, Acc, Config) when X >= 16#e0000, X < 16#efffe -> - clean(Rest, [X] ++ Acc, Config); -clean(<>, Acc, Config) when X >= 16#f0000, X < 16#ffffe -> - 
clean(Rest, [X] ++ Acc, Config); -clean(<>, Acc, Config) when X >= 16#100000, X < 16#10fffe -> +clean(<>, Acc, Config) -> clean(Rest, [X] ++ Acc, Config); %% surrogates clean(<<237, X, _, Rest/binary>>, Acc, Config) when X >= 160 -> maybe_replace(surrogate, Rest, Acc, Config); -%% noncharacters -clean(<<_/utf8, Rest/binary>>, Acc, Config) -> - maybe_replace(noncharacter, Rest, Acc, Config); -%% u+fffe and u+ffff for R14BXX -clean(<<239, 191, X, Rest/binary>>, Acc, Config) when X == 190; X == 191 -> - maybe_replace(noncharacter, Rest, Acc, Config); %% overlong encodings and missing continuations of a 2 byte sequence clean(<>, Acc, Config) when X >= 192, X =< 223 -> maybe_replace(badutf, strip_continuations(Rest, 1), Acc, Config); @@ -500,7 +458,7 @@ error_test_() -> {"value error", ?_assertError(badarg, parse([self()], []))}, {"maybe_done error", ?_assertError(badarg, parse([start_array, end_array, start_array, end_json], []))}, {"done error", ?_assertError(badarg, parse([{string, <<"">>}, {literal, true}, end_json], []))}, - {"string error", ?_assertError(badarg, parse([{string, <<239, 191, 191>>}, end_json], [strict]))} + {"string error", ?_assertError(badarg, parse([{string, <<237, 160, 128>>}, end_json], [strict]))} ]. @@ -520,8 +478,8 @@ custom_error_handler_test_() -> parse([{string, <<"">>}, {literal, true}, end_json], [{error_handler, Error}]) )}, {"string error", ?_assertEqual( - {value, [{string, <<239, 191, 191>>}, end_json]}, - parse([{string, <<239, 191, 191>>}, end_json], [{error_handler, Error}, strict]) + {value, [{string, <<237, 160, 128>>}, end_json]}, + parse([{string, <<237, 160, 128>>}, end_json], [{error_handler, Error}, strict]) )} ]. @@ -585,36 +543,20 @@ codepoints() -> ++ lists:seq(48, 91) ++ lists:seq(93, 16#2027) ++ lists:seq(16#202a, 16#d7ff) - ++ lists:seq(16#e000, 16#fdcf) - ++ lists:seq(16#fdf0, 16#fffd) + ++ lists:seq(16#e000, 16#ffff) ). 
extended_codepoints() -> unicode:characters_to_binary( - lists:seq(16#10000, 16#1fffd) ++ [ + lists:seq(16#10000, 16#1ffff) ++ [ 16#20000, 16#30000, 16#40000, 16#50000, 16#60000, 16#70000, 16#80000, 16#90000, 16#a0000, 16#b0000, 16#c0000, 16#d0000, 16#e0000, 16#f0000, 16#100000 ] ). -reserved_space() -> [ to_fake_utf8(N) || N <- lists:seq(16#fdd0, 16#fdef) ]. - surrogates() -> [ to_fake_utf8(N) || N <- lists:seq(16#d800, 16#dfff) ]. -noncharacters() -> [ to_fake_utf8(N) || N <- lists:seq(16#fffe, 16#ffff) ]. - -extended_noncharacters() -> - [ to_fake_utf8(N) || N <- [16#1fffe, 16#1ffff, 16#2fffe, 16#2ffff] - ++ [16#3fffe, 16#3ffff, 16#4fffe, 16#4ffff] - ++ [16#5fffe, 16#5ffff, 16#6fffe, 16#6ffff] - ++ [16#7fffe, 16#7ffff, 16#8fffe, 16#8ffff] - ++ [16#9fffe, 16#9ffff, 16#afffe, 16#affff] - ++ [16#bfffe, 16#bffff, 16#cfffe, 16#cffff] - ++ [16#dfffe, 16#dffff, 16#efffe, 16#effff] - ++ [16#ffffe, 16#fffff, 16#10fffe, 16#10ffff] - ]. - clean_string_helper(String) -> try clean_string(String, #config{strict_utf8=true}) of Clean -> Clean catch error:badarg -> {error, badarg} @@ -638,37 +580,13 @@ clean_string_test_() -> extended_codepoints(), clean_string(extended_codepoints(), #config{escaped_strings=true}) )}, - {"error reserved space", ?_assertEqual( - lists:duplicate(length(reserved_space()), {error, badarg}), - lists:map(fun(Codepoint) -> clean_string_helper(Codepoint) end, reserved_space()) - )}, {"error surrogates", ?_assertEqual( lists:duplicate(length(surrogates()), {error, badarg}), lists:map(fun(Codepoint) -> clean_string_helper(Codepoint) end, surrogates()) )}, - {"error noncharacters", ?_assertEqual( - lists:duplicate(length(noncharacters()), {error, badarg}), - lists:map(fun(Codepoint) -> clean_string_helper(Codepoint) end, noncharacters()) - )}, - {"error extended noncharacters", ?_assertEqual( - lists:duplicate(length(extended_noncharacters()), {error, badarg}), - lists:map(fun(Codepoint) -> clean_string_helper(Codepoint) end, extended_noncharacters()) - 
)}, - {"clean reserved space", ?_assertEqual( - lists:duplicate(length(reserved_space()), <<16#fffd/utf8>>), - lists:map(fun(Codepoint) -> clean_string(Codepoint, #config{}) end, reserved_space()) - )}, {"clean surrogates", ?_assertEqual( lists:duplicate(length(surrogates()), <<16#fffd/utf8>>), lists:map(fun(Codepoint) -> clean_string(Codepoint, #config{}) end, surrogates()) - )}, - {"clean noncharacters", ?_assertEqual( - lists:duplicate(length(noncharacters()), <<16#fffd/utf8>>), - lists:map(fun(Codepoint) -> clean_string(Codepoint, #config{}) end, noncharacters()) - )}, - {"clean extended noncharacters", ?_assertEqual( - lists:duplicate(length(extended_noncharacters()), <<16#fffd/utf8>>), - lists:map(fun(Codepoint) -> clean_string(Codepoint, #config{}) end, extended_noncharacters()) )} ]. @@ -844,22 +762,6 @@ escape_test_() -> bad_utf8_test_() -> [ - {"noncharacter u+fffe", ?_assertError( - badarg, - clean_string(to_fake_utf8(16#fffe), #config{strict_utf8=true}) - )}, - {"noncharacter u+fffe replaced", ?_assertEqual( - <<16#fffd/utf8>>, - clean_string(to_fake_utf8(16#fffe), #config{}) - )}, - {"noncharacter u+ffff", ?_assertError( - badarg, - clean_string(to_fake_utf8(16#ffff), #config{strict_utf8=true}) - )}, - {"noncharacter u+ffff replaced", ?_assertEqual( - <<16#fffd/utf8>>, - clean_string(to_fake_utf8(16#ffff), #config{}) - )}, {"orphan continuation byte u+0080", ?_assertError( badarg, clean_string(<<16#0080>>, #config{strict_utf8=true})