allow unicode noncharacters and reserved space characters in json

strings

fixes #67
This commit is contained in:
alisdair sullivan 2014-12-01 09:20:17 +00:00
parent 5f5e85914d
commit 9528216d15
5 changed files with 30 additions and 191 deletions

View file

@ -1,6 +1,7 @@
language: erlang
script: rebar compile && rebar skip_deps=true eunit
otp_release:
- 17.3
- 17.1
- 17.0
- R16B03-1
@ -11,6 +12,3 @@ otp_release:
- R15B02
- R15B01
- R15B
- R14B04
- R14B03
- R14B02

View file

@ -223,9 +223,9 @@ see below | `datetime()`
encountered otherwise are replaced with the replacement codepoint (`u+fffd`)
all erlang strings are represented by **valid** `utf8` encoded binaries. the
encoder will check strings for conformance. noncharacters (like `u+ffff`)
are allowed in erlang utf8 encoded binaries, but will be replaced in strings
passed to the encoder (although, again, see [options](#option))
encoder will check strings for conformance. badly formed `utf8` sequences may
be replaced with the replacement codepoint (`u+fffd`) according to the unicode
spec
this implementation performs no normalization on strings beyond that
detailed here. be careful when comparing strings as equivalent strings
@ -244,7 +244,8 @@ see below | `datetime()`
* objects
json objects are represented by erlang proplists. json maps may also be
encoded to json but the decoder will not produce maps
encoded to json and optionally decoded to maps (via the `return_maps`
option)
the empty object has the special representation `[{}]` to differentiate it
from the empty list. ambiguities like `[true, false]` prevent the use of
@ -349,6 +350,7 @@ option() = dirty_strings
| stream
| strict
| {strict, [strict_option()]}
| uescape
| unescaped_jsonp
strict_option() = comments

View file

@ -505,73 +505,29 @@ string(<<127, Rest/binary>>, Handler, Acc, Stack, Config) ->
string(Rest, Handler, acc_seq(Acc, 127), Stack, Config);
string(<<C, Rest/binary>>, Handler, Acc, Stack, Config=#config{dirty_strings=true}) ->
string(Rest, Handler, acc_seq(Acc, C), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#20, X < 16#2028 ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X == 16#2028; X == 16#2029 ->
string(Rest, Handler, acc_seq(Acc, maybe_replace(X, Config)), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X > 16#2029, X < 16#d800 ->
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#80 ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X > 16#dfff, X < 16#fdd0 ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X > 16#fdef, X < 16#fffe ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#10000, X < 16#1fffe ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#20000, X < 16#2fffe ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#30000, X < 16#3fffe ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#40000, X < 16#4fffe ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#50000, X < 16#5fffe ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#60000, X < 16#6fffe ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#70000, X < 16#7fffe ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#80000, X < 16#8fffe ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#90000, X < 16#9fffe ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#a0000, X < 16#afffe ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#b0000, X < 16#bfffe ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#c0000, X < 16#cfffe ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#d0000, X < 16#dfffe ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#e0000, X < 16#efffe ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#f0000, X < 16#ffffe ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#100000, X < 16#10fffe ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
%% partial utf8 codepoints. check that input could possibly be valid before attempting
%% to correct
%% u+fffe and u+ffff for R14BXX (subsequent runtimes will happily match preceeding
string(<<239, 191, 190, Rest/binary>>, Handler, Acc, Stack, Config) ->
string(Rest, Handler, acc_seq(Acc, 16#fffe), Stack, Config);
string(<<239, 191, 191, Rest/binary>>, Handler, Acc, Stack, Config) ->
string(Rest, Handler, acc_seq(Acc, 16#ffff), Stack, Config);
%% partial utf8 codepoints
string(<<>>, Handler, Acc, Stack, Config) ->
incomplete(string, <<>>, Handler, Acc, Stack, Config);
string(<<X>>, Handler, Acc, Stack, Config) when X >= 16#c2, X =< 16#f4 ->
string(<<X>>, Handler, Acc, Stack, Config) when X >= 2#11000000 ->
incomplete(string, <<X>>, Handler, Acc, Stack, Config);
string(<<X, Y>>, Handler, Acc, Stack, Config) when X >= 16#e0, X =< 16#f4, Y >= 16#80, Y =< 16#bf ->
string(<<X, Y>>, Handler, Acc, Stack, Config) when X >= 2#11100000, Y >= 2#10000000 ->
incomplete(string, <<X, Y>>, Handler, Acc, Stack, Config);
string(<<X, Y, Z>>, Handler, Acc, Stack, Config)
when X >= 16#f0, X =< 16#f4,
Y >= 16#80, Y =< 16#bf,
Z >= 16#80, Z =< 16#bf ->
when X >= 2#11100000, Y >= 2#10000000, Z >= 2#10000000 ->
incomplete(string, <<X, Y, Z>>, Handler, Acc, Stack, Config);
%% surrogates
string(<<237, X, _, Rest/binary>>, Handler, Acc, Stack, Config=#config{strict_utf8=false})
when X >= 160 ->
string(Rest, Handler, acc_seq(Acc, 16#fffd), Stack, Config);
%% u+xfffe, u+xffff, control codes and other noncharacters
string(<<_/utf8, Rest/binary>>, Handler, Acc, Stack, Config=#config{strict_utf8=false}) ->
string(Rest, Handler, acc_seq(Acc, 16#fffd), Stack, Config);
%% u+fffe and u+ffff for R14BXX (subsequent runtimes will happily match the
%% preceeding clause
string(<<239, 191, X, Rest/binary>>, Handler, Acc, Stack, Config=#config{strict_utf8=false})
when X == 190; X == 191 ->
string(Rest, Handler, acc_seq(Acc, 16#fffd), Stack, Config);
%% overlong encodings and missing continuations of a 2 byte sequence
string(<<X, Rest/binary>>, Handler, Acc, Stack, Config=#config{strict_utf8=false})
when X >= 192, X =< 223 ->
@ -1268,27 +1224,16 @@ codepoints() ->
lists:seq(35, 46) ++
lists:seq(48, 91) ++
lists:seq(93, 127) ++
[16#2027, 16#202a, 16#d7ff, 16#e000, 16#fdcf, 16#fdf0, 16#fffd] ++
[16#10000, 16#1fffd, 16#20000, 16#30000, 16#40000, 16#50000] ++
[16#2027, 16#202a, 16#d7ff, 16#e000] ++
lists:seq(16#fdd0, 16#ffff) ++
[16#10000, 16#20000, 16#30000, 16#40000, 16#50000] ++
[16#60000, 16#70000, 16#80000, 16#90000, 16#a0000, 16#b0000] ++
[16#c0000, 16#d0000, 16#e0000, 16#f0000, 16#100000].
reserved_space() -> lists:seq(16#fdd0, 16#fdef).
controls() -> lists:seq(0, 31).
surrogates() -> lists:seq(16#d800, 16#dfff).
noncharacters() -> lists:seq(16#fffe, 16#ffff).
extended_noncharacters() ->
[16#1fffe, 16#1ffff, 16#2fffe, 16#2ffff] ++
[16#3fffe, 16#3ffff, 16#4fffe, 16#4ffff] ++
[16#5fffe, 16#5ffff, 16#6fffe, 16#6ffff] ++
[16#7fffe, 16#7ffff, 16#8fffe, 16#8ffff] ++
[16#9fffe, 16#9ffff, 16#afffe, 16#affff] ++
[16#bfffe, 16#bffff, 16#cfffe, 16#cffff] ++
[16#dfffe, 16#dffff, 16#efffe, 16#effff] ++
[16#ffffe, 16#fffff, 16#10fffe, 16#10ffff].
%% erlang refuses to decode certain codepoints, so fake them all
to_fake_utf8(N) when N < 16#0080 -> <<34/utf8, N:8, 34/utf8>>;
@ -1305,7 +1250,7 @@ to_fake_utf8(N) ->
clean_string_test_() ->
Clean = codepoints(),
Dirty = reserved_space() ++ surrogates() ++ noncharacters() ++ extended_noncharacters(),
Dirty = surrogates() ++ controls(),
% clean codepoints
[{"clean u+" ++ integer_to_list(Codepoint, 16), ?_assertEqual(
[{string, <<Codepoint/utf8>>}, end_json],
@ -1363,11 +1308,6 @@ dirty_string_test_() ->
<<"[\"", 237, 160, 128, "\"]">>,
[dirty_strings]
},
{"dirty 16#10ffff",
[start_array, {string, <<244, 143, 191, 191>>}, end_array, end_json],
<<"[\"", 244, 143, 191, 191, "\"]">>,
[dirty_strings]
},
{"dirty /",
[start_array, {string, <<$/>>}, end_array, end_json],
<<"[\"", $/, "\"]">>,
@ -1393,8 +1333,6 @@ dirty_string_test_() ->
bad_utf8_test_() ->
Cases = [
{"noncharacter u+fffe", <<16#fffd/utf8>>, <<239, 191, 190>>},
{"noncharacter u+ffff", <<16#fffd/utf8>>, <<239, 191, 191>>},
{"orphan continuation byte u+0080", <<16#fffd/utf8>>, <<16#0080>>},
{"orphan continuation byte u+00bf", <<16#fffd/utf8>>, <<16#00bf>>},
{"2 continuation bytes",
@ -1610,7 +1548,6 @@ embedded_single_quoted_string_test_() ->
decode(<<34, "quoth the raven, 'nevermore'", 34>>, [{strict, [single_quotes]}])
)}
].
ignored_bad_escapes_test_() ->

View file

@ -103,8 +103,8 @@ custom_error_handler_test_() ->
parser(self(), [{error_handler, Error}])
)},
{"string error", ?_assertEqual(
{value, [{string, <<239, 191, 191>>}]},
parser(<<239, 191, 191>>, [{error_handler, Error}, strict])
{value, [{string, <<237, 160, 128>>}]},
parser(<<237, 160, 128>>, [{error_handler, Error}, strict])
)}
].

View file

@ -355,53 +355,11 @@ clean(<<X/utf8, Rest/binary>>, Acc, Config=#config{uescape=true}) ->
maybe_replace(X, Rest, Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X == 16#2028; X == 16#2029 ->
maybe_replace(X, Rest, Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X < 16#d800 ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X > 16#dfff, X < 16#fdd0 ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X > 16#fdef, X < 16#fffe ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#10000, X < 16#1fffe ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#20000, X < 16#2fffe ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#30000, X < 16#3fffe ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#40000, X < 16#4fffe ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#50000, X < 16#5fffe ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#60000, X < 16#6fffe ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#70000, X < 16#7fffe ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#80000, X < 16#8fffe ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#90000, X < 16#9fffe ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#a0000, X < 16#afffe ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#b0000, X < 16#bfffe ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#c0000, X < 16#cfffe ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#d0000, X < 16#dfffe ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#e0000, X < 16#efffe ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#f0000, X < 16#ffffe ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#100000, X < 16#10fffe ->
clean(<<X/utf8, Rest/binary>>, Acc, Config) ->
clean(Rest, [X] ++ Acc, Config);
%% surrogates
clean(<<237, X, _, Rest/binary>>, Acc, Config) when X >= 160 ->
maybe_replace(surrogate, Rest, Acc, Config);
%% noncharacters
clean(<<_/utf8, Rest/binary>>, Acc, Config) ->
maybe_replace(noncharacter, Rest, Acc, Config);
%% u+fffe and u+ffff for R14BXX
clean(<<239, 191, X, Rest/binary>>, Acc, Config) when X == 190; X == 191 ->
maybe_replace(noncharacter, Rest, Acc, Config);
%% overlong encodings and missing continuations of a 2 byte sequence
clean(<<X, Rest/binary>>, Acc, Config) when X >= 192, X =< 223 ->
maybe_replace(badutf, strip_continuations(Rest, 1), Acc, Config);
@ -500,7 +458,7 @@ error_test_() ->
{"value error", ?_assertError(badarg, parse([self()], []))},
{"maybe_done error", ?_assertError(badarg, parse([start_array, end_array, start_array, end_json], []))},
{"done error", ?_assertError(badarg, parse([{string, <<"">>}, {literal, true}, end_json], []))},
{"string error", ?_assertError(badarg, parse([{string, <<239, 191, 191>>}, end_json], [strict]))}
{"string error", ?_assertError(badarg, parse([{string, <<237, 160, 128>>}, end_json], [strict]))}
].
@ -520,8 +478,8 @@ custom_error_handler_test_() ->
parse([{string, <<"">>}, {literal, true}, end_json], [{error_handler, Error}])
)},
{"string error", ?_assertEqual(
{value, [{string, <<239, 191, 191>>}, end_json]},
parse([{string, <<239, 191, 191>>}, end_json], [{error_handler, Error}, strict])
{value, [{string, <<237, 160, 128>>}, end_json]},
parse([{string, <<237, 160, 128>>}, end_json], [{error_handler, Error}, strict])
)}
].
@ -585,36 +543,20 @@ codepoints() ->
++ lists:seq(48, 91)
++ lists:seq(93, 16#2027)
++ lists:seq(16#202a, 16#d7ff)
++ lists:seq(16#e000, 16#fdcf)
++ lists:seq(16#fdf0, 16#fffd)
++ lists:seq(16#e000, 16#ffff)
).
extended_codepoints() ->
unicode:characters_to_binary(
lists:seq(16#10000, 16#1fffd) ++ [
lists:seq(16#10000, 16#1ffff) ++ [
16#20000, 16#30000, 16#40000, 16#50000, 16#60000,
16#70000, 16#80000, 16#90000, 16#a0000, 16#b0000,
16#c0000, 16#d0000, 16#e0000, 16#f0000, 16#100000
]
).
reserved_space() -> [ to_fake_utf8(N) || N <- lists:seq(16#fdd0, 16#fdef) ].
surrogates() -> [ to_fake_utf8(N) || N <- lists:seq(16#d800, 16#dfff) ].
noncharacters() -> [ to_fake_utf8(N) || N <- lists:seq(16#fffe, 16#ffff) ].
extended_noncharacters() ->
[ to_fake_utf8(N) || N <- [16#1fffe, 16#1ffff, 16#2fffe, 16#2ffff]
++ [16#3fffe, 16#3ffff, 16#4fffe, 16#4ffff]
++ [16#5fffe, 16#5ffff, 16#6fffe, 16#6ffff]
++ [16#7fffe, 16#7ffff, 16#8fffe, 16#8ffff]
++ [16#9fffe, 16#9ffff, 16#afffe, 16#affff]
++ [16#bfffe, 16#bffff, 16#cfffe, 16#cffff]
++ [16#dfffe, 16#dffff, 16#efffe, 16#effff]
++ [16#ffffe, 16#fffff, 16#10fffe, 16#10ffff]
].
clean_string_helper(String) ->
try clean_string(String, #config{strict_utf8=true}) of Clean -> Clean
catch error:badarg -> {error, badarg}
@ -638,37 +580,13 @@ clean_string_test_() ->
extended_codepoints(),
clean_string(extended_codepoints(), #config{escaped_strings=true})
)},
{"error reserved space", ?_assertEqual(
lists:duplicate(length(reserved_space()), {error, badarg}),
lists:map(fun(Codepoint) -> clean_string_helper(Codepoint) end, reserved_space())
)},
{"error surrogates", ?_assertEqual(
lists:duplicate(length(surrogates()), {error, badarg}),
lists:map(fun(Codepoint) -> clean_string_helper(Codepoint) end, surrogates())
)},
{"error noncharacters", ?_assertEqual(
lists:duplicate(length(noncharacters()), {error, badarg}),
lists:map(fun(Codepoint) -> clean_string_helper(Codepoint) end, noncharacters())
)},
{"error extended noncharacters", ?_assertEqual(
lists:duplicate(length(extended_noncharacters()), {error, badarg}),
lists:map(fun(Codepoint) -> clean_string_helper(Codepoint) end, extended_noncharacters())
)},
{"clean reserved space", ?_assertEqual(
lists:duplicate(length(reserved_space()), <<16#fffd/utf8>>),
lists:map(fun(Codepoint) -> clean_string(Codepoint, #config{}) end, reserved_space())
)},
{"clean surrogates", ?_assertEqual(
lists:duplicate(length(surrogates()), <<16#fffd/utf8>>),
lists:map(fun(Codepoint) -> clean_string(Codepoint, #config{}) end, surrogates())
)},
{"clean noncharacters", ?_assertEqual(
lists:duplicate(length(noncharacters()), <<16#fffd/utf8>>),
lists:map(fun(Codepoint) -> clean_string(Codepoint, #config{}) end, noncharacters())
)},
{"clean extended noncharacters", ?_assertEqual(
lists:duplicate(length(extended_noncharacters()), <<16#fffd/utf8>>),
lists:map(fun(Codepoint) -> clean_string(Codepoint, #config{}) end, extended_noncharacters())
)}
].
@ -844,22 +762,6 @@ escape_test_() ->
bad_utf8_test_() ->
[
{"noncharacter u+fffe", ?_assertError(
badarg,
clean_string(to_fake_utf8(16#fffe), #config{strict_utf8=true})
)},
{"noncharacter u+fffe replaced", ?_assertEqual(
<<16#fffd/utf8>>,
clean_string(to_fake_utf8(16#fffe), #config{})
)},
{"noncharacter u+ffff", ?_assertError(
badarg,
clean_string(to_fake_utf8(16#ffff), #config{strict_utf8=true})
)},
{"noncharacter u+ffff replaced", ?_assertEqual(
<<16#fffd/utf8>>,
clean_string(to_fake_utf8(16#ffff), #config{})
)},
{"orphan continuation byte u+0080", ?_assertError(
badarg,
clean_string(<<16#0080>>, #config{strict_utf8=true})