allow unicode noncharacters and reserved space characters in json

strings

fixes #67
This commit is contained in:
alisdair sullivan 2014-12-01 09:20:17 +00:00
parent 5f5e85914d
commit 9528216d15
5 changed files with 30 additions and 191 deletions

View file

@ -1,6 +1,7 @@
language: erlang language: erlang
script: rebar compile && rebar skip_deps=true eunit script: rebar compile && rebar skip_deps=true eunit
otp_release: otp_release:
- 17.3
- 17.1 - 17.1
- 17.0 - 17.0
- R16B03-1 - R16B03-1
@ -11,6 +12,3 @@ otp_release:
- R15B02 - R15B02
- R15B01 - R15B01
- R15B - R15B
- R14B04
- R14B03
- R14B02

View file

@ -223,9 +223,9 @@ see below | `datetime()`
encountered otherwise are replaced with the replacement codepoint (`u+fffd`) encountered otherwise are replaced with the replacement codepoint (`u+fffd`)
all erlang strings are represented by **valid** `utf8` encoded binaries. the all erlang strings are represented by **valid** `utf8` encoded binaries. the
encoder will check strings for conformance. noncharacters (like `u+ffff`) encoder will check strings for conformance. badly formed `utf8` sequences may
are allowed in erlang utf8 encoded binaries, but will be replaced in strings be replaced with the replacement codepoint (`u+fffd`) according to the unicode
passed to the encoder (although, again, see [options](#option)) spec
this implementation performs no normalization on strings beyond that this implementation performs no normalization on strings beyond that
detailed here. be careful when comparing strings as equivalent strings detailed here. be careful when comparing strings as equivalent strings
@ -244,7 +244,8 @@ see below | `datetime()`
* objects * objects
json objects are represented by erlang proplists. json maps may also be json objects are represented by erlang proplists. json maps may also be
encoded to json but the decoder will not produce maps encoded to json and optionally decoded to maps (via the `return_maps`
option)
the empty object has the special representation `[{}]` to differentiate it the empty object has the special representation `[{}]` to differentiate it
from the empty list. ambiguities like `[true, false]` prevent the use of from the empty list. ambiguities like `[true, false]` prevent the use of
@ -349,6 +350,7 @@ option() = dirty_strings
| stream | stream
| strict | strict
| {strict, [strict_option()]} | {strict, [strict_option()]}
| uescape
| unescaped_jsonp | unescaped_jsonp
strict_option() = comments strict_option() = comments

View file

@ -505,73 +505,29 @@ string(<<127, Rest/binary>>, Handler, Acc, Stack, Config) ->
string(Rest, Handler, acc_seq(Acc, 127), Stack, Config); string(Rest, Handler, acc_seq(Acc, 127), Stack, Config);
string(<<C, Rest/binary>>, Handler, Acc, Stack, Config=#config{dirty_strings=true}) -> string(<<C, Rest/binary>>, Handler, Acc, Stack, Config=#config{dirty_strings=true}) ->
string(Rest, Handler, acc_seq(Acc, C), Stack, Config); string(Rest, Handler, acc_seq(Acc, C), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#20, X < 16#2028 ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X == 16#2028; X == 16#2029 -> string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X == 16#2028; X == 16#2029 ->
string(Rest, Handler, acc_seq(Acc, maybe_replace(X, Config)), Stack, Config); string(Rest, Handler, acc_seq(Acc, maybe_replace(X, Config)), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X > 16#2029, X < 16#d800 -> string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#80 ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config); string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X > 16#dfff, X < 16#fdd0 -> %% u+fffe and u+ffff for R14BXX (subsequent runtimes will happily match the preceding clause)
string(Rest, Handler, acc_seq(Acc, X), Stack, Config); string(<<239, 191, 190, Rest/binary>>, Handler, Acc, Stack, Config) ->
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X > 16#fdef, X < 16#fffe -> string(Rest, Handler, acc_seq(Acc, 16#fffe), Stack, Config);
string(Rest, Handler, acc_seq(Acc, X), Stack, Config); string(<<239, 191, 191, Rest/binary>>, Handler, Acc, Stack, Config) ->
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#10000, X < 16#1fffe -> string(Rest, Handler, acc_seq(Acc, 16#ffff), Stack, Config);
string(Rest, Handler, acc_seq(Acc, X), Stack, Config); %% partial utf8 codepoints
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#20000, X < 16#2fffe ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#30000, X < 16#3fffe ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#40000, X < 16#4fffe ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#50000, X < 16#5fffe ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#60000, X < 16#6fffe ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#70000, X < 16#7fffe ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#80000, X < 16#8fffe ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#90000, X < 16#9fffe ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#a0000, X < 16#afffe ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#b0000, X < 16#bfffe ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#c0000, X < 16#cfffe ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#d0000, X < 16#dfffe ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#e0000, X < 16#efffe ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#f0000, X < 16#ffffe ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#100000, X < 16#10fffe ->
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
%% partial utf8 codepoints. check that input could possibly be valid before attempting
%% to correct
string(<<>>, Handler, Acc, Stack, Config) -> string(<<>>, Handler, Acc, Stack, Config) ->
incomplete(string, <<>>, Handler, Acc, Stack, Config); incomplete(string, <<>>, Handler, Acc, Stack, Config);
string(<<X>>, Handler, Acc, Stack, Config) when X >= 16#c2, X =< 16#f4 -> string(<<X>>, Handler, Acc, Stack, Config) when X >= 2#11000000 ->
incomplete(string, <<X>>, Handler, Acc, Stack, Config); incomplete(string, <<X>>, Handler, Acc, Stack, Config);
string(<<X, Y>>, Handler, Acc, Stack, Config) when X >= 16#e0, X =< 16#f4, Y >= 16#80, Y =< 16#bf -> string(<<X, Y>>, Handler, Acc, Stack, Config) when X >= 2#11100000, Y >= 2#10000000 ->
incomplete(string, <<X, Y>>, Handler, Acc, Stack, Config); incomplete(string, <<X, Y>>, Handler, Acc, Stack, Config);
string(<<X, Y, Z>>, Handler, Acc, Stack, Config) string(<<X, Y, Z>>, Handler, Acc, Stack, Config)
when X >= 16#f0, X =< 16#f4, when X >= 2#11100000, Y >= 2#10000000, Z >= 2#10000000 ->
Y >= 16#80, Y =< 16#bf,
Z >= 16#80, Z =< 16#bf ->
incomplete(string, <<X, Y, Z>>, Handler, Acc, Stack, Config); incomplete(string, <<X, Y, Z>>, Handler, Acc, Stack, Config);
%% surrogates %% surrogates
string(<<237, X, _, Rest/binary>>, Handler, Acc, Stack, Config=#config{strict_utf8=false}) string(<<237, X, _, Rest/binary>>, Handler, Acc, Stack, Config=#config{strict_utf8=false})
when X >= 160 -> when X >= 160 ->
string(Rest, Handler, acc_seq(Acc, 16#fffd), Stack, Config); string(Rest, Handler, acc_seq(Acc, 16#fffd), Stack, Config);
%% u+xfffe, u+xffff, control codes and other noncharacters
string(<<_/utf8, Rest/binary>>, Handler, Acc, Stack, Config=#config{strict_utf8=false}) ->
string(Rest, Handler, acc_seq(Acc, 16#fffd), Stack, Config);
%% u+fffe and u+ffff for R14BXX (subsequent runtimes will happily match the
%% preceding clause)
string(<<239, 191, X, Rest/binary>>, Handler, Acc, Stack, Config=#config{strict_utf8=false})
when X == 190; X == 191 ->
string(Rest, Handler, acc_seq(Acc, 16#fffd), Stack, Config);
%% overlong encodings and missing continuations of a 2 byte sequence %% overlong encodings and missing continuations of a 2 byte sequence
string(<<X, Rest/binary>>, Handler, Acc, Stack, Config=#config{strict_utf8=false}) string(<<X, Rest/binary>>, Handler, Acc, Stack, Config=#config{strict_utf8=false})
when X >= 192, X =< 223 -> when X >= 192, X =< 223 ->
@ -1268,27 +1224,16 @@ codepoints() ->
lists:seq(35, 46) ++ lists:seq(35, 46) ++
lists:seq(48, 91) ++ lists:seq(48, 91) ++
lists:seq(93, 127) ++ lists:seq(93, 127) ++
[16#2027, 16#202a, 16#d7ff, 16#e000, 16#fdcf, 16#fdf0, 16#fffd] ++ [16#2027, 16#202a, 16#d7ff, 16#e000] ++
[16#10000, 16#1fffd, 16#20000, 16#30000, 16#40000, 16#50000] ++ lists:seq(16#fdd0, 16#ffff) ++
[16#10000, 16#20000, 16#30000, 16#40000, 16#50000] ++
[16#60000, 16#70000, 16#80000, 16#90000, 16#a0000, 16#b0000] ++ [16#60000, 16#70000, 16#80000, 16#90000, 16#a0000, 16#b0000] ++
[16#c0000, 16#d0000, 16#e0000, 16#f0000, 16#100000]. [16#c0000, 16#d0000, 16#e0000, 16#f0000, 16#100000].
reserved_space() -> lists:seq(16#fdd0, 16#fdef). controls() -> lists:seq(0, 31).
surrogates() -> lists:seq(16#d800, 16#dfff). surrogates() -> lists:seq(16#d800, 16#dfff).
noncharacters() -> lists:seq(16#fffe, 16#ffff).
extended_noncharacters() ->
[16#1fffe, 16#1ffff, 16#2fffe, 16#2ffff] ++
[16#3fffe, 16#3ffff, 16#4fffe, 16#4ffff] ++
[16#5fffe, 16#5ffff, 16#6fffe, 16#6ffff] ++
[16#7fffe, 16#7ffff, 16#8fffe, 16#8ffff] ++
[16#9fffe, 16#9ffff, 16#afffe, 16#affff] ++
[16#bfffe, 16#bffff, 16#cfffe, 16#cffff] ++
[16#dfffe, 16#dffff, 16#efffe, 16#effff] ++
[16#ffffe, 16#fffff, 16#10fffe, 16#10ffff].
%% erlang refuses to decode certain codepoints, so fake them all %% erlang refuses to decode certain codepoints, so fake them all
to_fake_utf8(N) when N < 16#0080 -> <<34/utf8, N:8, 34/utf8>>; to_fake_utf8(N) when N < 16#0080 -> <<34/utf8, N:8, 34/utf8>>;
@ -1305,7 +1250,7 @@ to_fake_utf8(N) ->
clean_string_test_() -> clean_string_test_() ->
Clean = codepoints(), Clean = codepoints(),
Dirty = reserved_space() ++ surrogates() ++ noncharacters() ++ extended_noncharacters(), Dirty = surrogates() ++ controls(),
% clean codepoints % clean codepoints
[{"clean u+" ++ integer_to_list(Codepoint, 16), ?_assertEqual( [{"clean u+" ++ integer_to_list(Codepoint, 16), ?_assertEqual(
[{string, <<Codepoint/utf8>>}, end_json], [{string, <<Codepoint/utf8>>}, end_json],
@ -1363,11 +1308,6 @@ dirty_string_test_() ->
<<"[\"", 237, 160, 128, "\"]">>, <<"[\"", 237, 160, 128, "\"]">>,
[dirty_strings] [dirty_strings]
}, },
{"dirty 16#10ffff",
[start_array, {string, <<244, 143, 191, 191>>}, end_array, end_json],
<<"[\"", 244, 143, 191, 191, "\"]">>,
[dirty_strings]
},
{"dirty /", {"dirty /",
[start_array, {string, <<$/>>}, end_array, end_json], [start_array, {string, <<$/>>}, end_array, end_json],
<<"[\"", $/, "\"]">>, <<"[\"", $/, "\"]">>,
@ -1393,8 +1333,6 @@ dirty_string_test_() ->
bad_utf8_test_() -> bad_utf8_test_() ->
Cases = [ Cases = [
{"noncharacter u+fffe", <<16#fffd/utf8>>, <<239, 191, 190>>},
{"noncharacter u+ffff", <<16#fffd/utf8>>, <<239, 191, 191>>},
{"orphan continuation byte u+0080", <<16#fffd/utf8>>, <<16#0080>>}, {"orphan continuation byte u+0080", <<16#fffd/utf8>>, <<16#0080>>},
{"orphan continuation byte u+00bf", <<16#fffd/utf8>>, <<16#00bf>>}, {"orphan continuation byte u+00bf", <<16#fffd/utf8>>, <<16#00bf>>},
{"2 continuation bytes", {"2 continuation bytes",
@ -1610,7 +1548,6 @@ embedded_single_quoted_string_test_() ->
decode(<<34, "quoth the raven, 'nevermore'", 34>>, [{strict, [single_quotes]}]) decode(<<34, "quoth the raven, 'nevermore'", 34>>, [{strict, [single_quotes]}])
)} )}
]. ].
ignored_bad_escapes_test_() -> ignored_bad_escapes_test_() ->

View file

@ -103,8 +103,8 @@ custom_error_handler_test_() ->
parser(self(), [{error_handler, Error}]) parser(self(), [{error_handler, Error}])
)}, )},
{"string error", ?_assertEqual( {"string error", ?_assertEqual(
{value, [{string, <<239, 191, 191>>}]}, {value, [{string, <<237, 160, 128>>}]},
parser(<<239, 191, 191>>, [{error_handler, Error}, strict]) parser(<<237, 160, 128>>, [{error_handler, Error}, strict])
)} )}
]. ].

View file

@ -355,53 +355,11 @@ clean(<<X/utf8, Rest/binary>>, Acc, Config=#config{uescape=true}) ->
maybe_replace(X, Rest, Acc, Config); maybe_replace(X, Rest, Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X == 16#2028; X == 16#2029 -> clean(<<X/utf8, Rest/binary>>, Acc, Config) when X == 16#2028; X == 16#2029 ->
maybe_replace(X, Rest, Acc, Config); maybe_replace(X, Rest, Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X < 16#d800 -> clean(<<X/utf8, Rest/binary>>, Acc, Config) ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X > 16#dfff, X < 16#fdd0 ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X > 16#fdef, X < 16#fffe ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#10000, X < 16#1fffe ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#20000, X < 16#2fffe ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#30000, X < 16#3fffe ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#40000, X < 16#4fffe ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#50000, X < 16#5fffe ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#60000, X < 16#6fffe ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#70000, X < 16#7fffe ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#80000, X < 16#8fffe ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#90000, X < 16#9fffe ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#a0000, X < 16#afffe ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#b0000, X < 16#bfffe ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#c0000, X < 16#cfffe ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#d0000, X < 16#dfffe ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#e0000, X < 16#efffe ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#f0000, X < 16#ffffe ->
clean(Rest, [X] ++ Acc, Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#100000, X < 16#10fffe ->
clean(Rest, [X] ++ Acc, Config); clean(Rest, [X] ++ Acc, Config);
%% surrogates %% surrogates
clean(<<237, X, _, Rest/binary>>, Acc, Config) when X >= 160 -> clean(<<237, X, _, Rest/binary>>, Acc, Config) when X >= 160 ->
maybe_replace(surrogate, Rest, Acc, Config); maybe_replace(surrogate, Rest, Acc, Config);
%% noncharacters
clean(<<_/utf8, Rest/binary>>, Acc, Config) ->
maybe_replace(noncharacter, Rest, Acc, Config);
%% u+fffe and u+ffff for R14BXX
clean(<<239, 191, X, Rest/binary>>, Acc, Config) when X == 190; X == 191 ->
maybe_replace(noncharacter, Rest, Acc, Config);
%% overlong encodings and missing continuations of a 2 byte sequence %% overlong encodings and missing continuations of a 2 byte sequence
clean(<<X, Rest/binary>>, Acc, Config) when X >= 192, X =< 223 -> clean(<<X, Rest/binary>>, Acc, Config) when X >= 192, X =< 223 ->
maybe_replace(badutf, strip_continuations(Rest, 1), Acc, Config); maybe_replace(badutf, strip_continuations(Rest, 1), Acc, Config);
@ -500,7 +458,7 @@ error_test_() ->
{"value error", ?_assertError(badarg, parse([self()], []))}, {"value error", ?_assertError(badarg, parse([self()], []))},
{"maybe_done error", ?_assertError(badarg, parse([start_array, end_array, start_array, end_json], []))}, {"maybe_done error", ?_assertError(badarg, parse([start_array, end_array, start_array, end_json], []))},
{"done error", ?_assertError(badarg, parse([{string, <<"">>}, {literal, true}, end_json], []))}, {"done error", ?_assertError(badarg, parse([{string, <<"">>}, {literal, true}, end_json], []))},
{"string error", ?_assertError(badarg, parse([{string, <<239, 191, 191>>}, end_json], [strict]))} {"string error", ?_assertError(badarg, parse([{string, <<237, 160, 128>>}, end_json], [strict]))}
]. ].
@ -520,8 +478,8 @@ custom_error_handler_test_() ->
parse([{string, <<"">>}, {literal, true}, end_json], [{error_handler, Error}]) parse([{string, <<"">>}, {literal, true}, end_json], [{error_handler, Error}])
)}, )},
{"string error", ?_assertEqual( {"string error", ?_assertEqual(
{value, [{string, <<239, 191, 191>>}, end_json]}, {value, [{string, <<237, 160, 128>>}, end_json]},
parse([{string, <<239, 191, 191>>}, end_json], [{error_handler, Error}, strict]) parse([{string, <<237, 160, 128>>}, end_json], [{error_handler, Error}, strict])
)} )}
]. ].
@ -585,36 +543,20 @@ codepoints() ->
++ lists:seq(48, 91) ++ lists:seq(48, 91)
++ lists:seq(93, 16#2027) ++ lists:seq(93, 16#2027)
++ lists:seq(16#202a, 16#d7ff) ++ lists:seq(16#202a, 16#d7ff)
++ lists:seq(16#e000, 16#fdcf) ++ lists:seq(16#e000, 16#ffff)
++ lists:seq(16#fdf0, 16#fffd)
). ).
extended_codepoints() -> extended_codepoints() ->
unicode:characters_to_binary( unicode:characters_to_binary(
lists:seq(16#10000, 16#1fffd) ++ [ lists:seq(16#10000, 16#1ffff) ++ [
16#20000, 16#30000, 16#40000, 16#50000, 16#60000, 16#20000, 16#30000, 16#40000, 16#50000, 16#60000,
16#70000, 16#80000, 16#90000, 16#a0000, 16#b0000, 16#70000, 16#80000, 16#90000, 16#a0000, 16#b0000,
16#c0000, 16#d0000, 16#e0000, 16#f0000, 16#100000 16#c0000, 16#d0000, 16#e0000, 16#f0000, 16#100000
] ]
). ).
reserved_space() -> [ to_fake_utf8(N) || N <- lists:seq(16#fdd0, 16#fdef) ].
surrogates() -> [ to_fake_utf8(N) || N <- lists:seq(16#d800, 16#dfff) ]. surrogates() -> [ to_fake_utf8(N) || N <- lists:seq(16#d800, 16#dfff) ].
noncharacters() -> [ to_fake_utf8(N) || N <- lists:seq(16#fffe, 16#ffff) ].
extended_noncharacters() ->
[ to_fake_utf8(N) || N <- [16#1fffe, 16#1ffff, 16#2fffe, 16#2ffff]
++ [16#3fffe, 16#3ffff, 16#4fffe, 16#4ffff]
++ [16#5fffe, 16#5ffff, 16#6fffe, 16#6ffff]
++ [16#7fffe, 16#7ffff, 16#8fffe, 16#8ffff]
++ [16#9fffe, 16#9ffff, 16#afffe, 16#affff]
++ [16#bfffe, 16#bffff, 16#cfffe, 16#cffff]
++ [16#dfffe, 16#dffff, 16#efffe, 16#effff]
++ [16#ffffe, 16#fffff, 16#10fffe, 16#10ffff]
].
clean_string_helper(String) -> clean_string_helper(String) ->
try clean_string(String, #config{strict_utf8=true}) of Clean -> Clean try clean_string(String, #config{strict_utf8=true}) of Clean -> Clean
catch error:badarg -> {error, badarg} catch error:badarg -> {error, badarg}
@ -638,37 +580,13 @@ clean_string_test_() ->
extended_codepoints(), extended_codepoints(),
clean_string(extended_codepoints(), #config{escaped_strings=true}) clean_string(extended_codepoints(), #config{escaped_strings=true})
)}, )},
{"error reserved space", ?_assertEqual(
lists:duplicate(length(reserved_space()), {error, badarg}),
lists:map(fun(Codepoint) -> clean_string_helper(Codepoint) end, reserved_space())
)},
{"error surrogates", ?_assertEqual( {"error surrogates", ?_assertEqual(
lists:duplicate(length(surrogates()), {error, badarg}), lists:duplicate(length(surrogates()), {error, badarg}),
lists:map(fun(Codepoint) -> clean_string_helper(Codepoint) end, surrogates()) lists:map(fun(Codepoint) -> clean_string_helper(Codepoint) end, surrogates())
)}, )},
{"error noncharacters", ?_assertEqual(
lists:duplicate(length(noncharacters()), {error, badarg}),
lists:map(fun(Codepoint) -> clean_string_helper(Codepoint) end, noncharacters())
)},
{"error extended noncharacters", ?_assertEqual(
lists:duplicate(length(extended_noncharacters()), {error, badarg}),
lists:map(fun(Codepoint) -> clean_string_helper(Codepoint) end, extended_noncharacters())
)},
{"clean reserved space", ?_assertEqual(
lists:duplicate(length(reserved_space()), <<16#fffd/utf8>>),
lists:map(fun(Codepoint) -> clean_string(Codepoint, #config{}) end, reserved_space())
)},
{"clean surrogates", ?_assertEqual( {"clean surrogates", ?_assertEqual(
lists:duplicate(length(surrogates()), <<16#fffd/utf8>>), lists:duplicate(length(surrogates()), <<16#fffd/utf8>>),
lists:map(fun(Codepoint) -> clean_string(Codepoint, #config{}) end, surrogates()) lists:map(fun(Codepoint) -> clean_string(Codepoint, #config{}) end, surrogates())
)},
{"clean noncharacters", ?_assertEqual(
lists:duplicate(length(noncharacters()), <<16#fffd/utf8>>),
lists:map(fun(Codepoint) -> clean_string(Codepoint, #config{}) end, noncharacters())
)},
{"clean extended noncharacters", ?_assertEqual(
lists:duplicate(length(extended_noncharacters()), <<16#fffd/utf8>>),
lists:map(fun(Codepoint) -> clean_string(Codepoint, #config{}) end, extended_noncharacters())
)} )}
]. ].
@ -844,22 +762,6 @@ escape_test_() ->
bad_utf8_test_() -> bad_utf8_test_() ->
[ [
{"noncharacter u+fffe", ?_assertError(
badarg,
clean_string(to_fake_utf8(16#fffe), #config{strict_utf8=true})
)},
{"noncharacter u+fffe replaced", ?_assertEqual(
<<16#fffd/utf8>>,
clean_string(to_fake_utf8(16#fffe), #config{})
)},
{"noncharacter u+ffff", ?_assertError(
badarg,
clean_string(to_fake_utf8(16#ffff), #config{strict_utf8=true})
)},
{"noncharacter u+ffff replaced", ?_assertEqual(
<<16#fffd/utf8>>,
clean_string(to_fake_utf8(16#ffff), #config{})
)},
{"orphan continuation byte u+0080", ?_assertError( {"orphan continuation byte u+0080", ?_assertError(
badarg, badarg,
clean_string(<<16#0080>>, #config{strict_utf8=true}) clean_string(<<16#0080>>, #config{strict_utf8=true})