allow unicode noncharacters and reserved space characters in json
strings. fixes #67
This commit is contained in:
parent
5f5e85914d
commit
9528216d15
5 changed files with 30 additions and 191 deletions
|
@ -1,6 +1,7 @@
|
||||||
language: erlang
|
language: erlang
|
||||||
script: rebar compile && rebar skip_deps=true eunit
|
script: rebar compile && rebar skip_deps=true eunit
|
||||||
otp_release:
|
otp_release:
|
||||||
|
- 17.3
|
||||||
- 17.1
|
- 17.1
|
||||||
- 17.0
|
- 17.0
|
||||||
- R16B03-1
|
- R16B03-1
|
||||||
|
@ -11,6 +12,3 @@ otp_release:
|
||||||
- R15B02
|
- R15B02
|
||||||
- R15B01
|
- R15B01
|
||||||
- R15B
|
- R15B
|
||||||
- R14B04
|
|
||||||
- R14B03
|
|
||||||
- R14B02
|
|
||||||
|
|
10
README.md
10
README.md
|
@ -223,9 +223,9 @@ see below | `datetime()`
|
||||||
encountered otherwise are replaced with the replacement codepoint (`u+fffd`)
|
encountered otherwise are replaced with the replacement codepoint (`u+fffd`)
|
||||||
|
|
||||||
all erlang strings are represented by **valid** `utf8` encoded binaries. the
|
all erlang strings are represented by **valid** `utf8` encoded binaries. the
|
||||||
encoder will check strings for conformance. noncharacters (like `u+ffff`)
|
encoder will check strings for conformance. badly formed `utf8` sequences may
|
||||||
are allowed in erlang utf8 encoded binaries, but will be replaced in strings
|
be replaced with the replacement codepoint (`u+fffd`) according to the unicode
|
||||||
passed to the encoder (although, again, see [options](#option))
|
spec
|
||||||
|
|
||||||
this implementation performs no normalization on strings beyond that
|
this implementation performs no normalization on strings beyond that
|
||||||
detailed here. be careful when comparing strings as equivalent strings
|
detailed here. be careful when comparing strings as equivalent strings
|
||||||
|
@ -244,7 +244,8 @@ see below | `datetime()`
|
||||||
* objects
|
* objects
|
||||||
|
|
||||||
json objects are represented by erlang proplists. json maps may also be
|
json objects are represented by erlang proplists. json maps may also be
|
||||||
encoded to json but the decoder will not produce maps
|
encoded to json and optionally decoded to maps (via the `return_maps`
|
||||||
|
option)
|
||||||
|
|
||||||
the empty object has the special representation `[{}]` to differentiate it
|
the empty object has the special representation `[{}]` to differentiate it
|
||||||
from the empty list. ambiguities like `[true, false]` prevent the use of
|
from the empty list. ambiguities like `[true, false]` prevent the use of
|
||||||
|
@ -349,6 +350,7 @@ option() = dirty_strings
|
||||||
| stream
|
| stream
|
||||||
| strict
|
| strict
|
||||||
| {strict, [strict_option()]}
|
| {strict, [strict_option()]}
|
||||||
|
| uescape
|
||||||
| unescaped_jsonp
|
| unescaped_jsonp
|
||||||
|
|
||||||
strict_option() = comments
|
strict_option() = comments
|
||||||
|
|
|
@ -505,73 +505,29 @@ string(<<127, Rest/binary>>, Handler, Acc, Stack, Config) ->
|
||||||
string(Rest, Handler, acc_seq(Acc, 127), Stack, Config);
|
string(Rest, Handler, acc_seq(Acc, 127), Stack, Config);
|
||||||
string(<<C, Rest/binary>>, Handler, Acc, Stack, Config=#config{dirty_strings=true}) ->
|
string(<<C, Rest/binary>>, Handler, Acc, Stack, Config=#config{dirty_strings=true}) ->
|
||||||
string(Rest, Handler, acc_seq(Acc, C), Stack, Config);
|
string(Rest, Handler, acc_seq(Acc, C), Stack, Config);
|
||||||
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#20, X < 16#2028 ->
|
|
||||||
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
|
|
||||||
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X == 16#2028; X == 16#2029 ->
|
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X == 16#2028; X == 16#2029 ->
|
||||||
string(Rest, Handler, acc_seq(Acc, maybe_replace(X, Config)), Stack, Config);
|
string(Rest, Handler, acc_seq(Acc, maybe_replace(X, Config)), Stack, Config);
|
||||||
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X > 16#2029, X < 16#d800 ->
|
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#80 ->
|
||||||
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
|
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
|
||||||
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X > 16#dfff, X < 16#fdd0 ->
|
%% u+fffe and u+ffff for R14BXX (subsequent runtimes will happily match the preceding clause)
|
||||||
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
|
string(<<239, 191, 190, Rest/binary>>, Handler, Acc, Stack, Config) ->
|
||||||
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X > 16#fdef, X < 16#fffe ->
|
string(Rest, Handler, acc_seq(Acc, 16#fffe), Stack, Config);
|
||||||
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
|
string(<<239, 191, 191, Rest/binary>>, Handler, Acc, Stack, Config) ->
|
||||||
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#10000, X < 16#1fffe ->
|
string(Rest, Handler, acc_seq(Acc, 16#ffff), Stack, Config);
|
||||||
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
|
%% partial utf8 codepoints
|
||||||
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#20000, X < 16#2fffe ->
|
|
||||||
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
|
|
||||||
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#30000, X < 16#3fffe ->
|
|
||||||
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
|
|
||||||
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#40000, X < 16#4fffe ->
|
|
||||||
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
|
|
||||||
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#50000, X < 16#5fffe ->
|
|
||||||
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
|
|
||||||
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#60000, X < 16#6fffe ->
|
|
||||||
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
|
|
||||||
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#70000, X < 16#7fffe ->
|
|
||||||
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
|
|
||||||
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#80000, X < 16#8fffe ->
|
|
||||||
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
|
|
||||||
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#90000, X < 16#9fffe ->
|
|
||||||
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
|
|
||||||
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#a0000, X < 16#afffe ->
|
|
||||||
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
|
|
||||||
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#b0000, X < 16#bfffe ->
|
|
||||||
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
|
|
||||||
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#c0000, X < 16#cfffe ->
|
|
||||||
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
|
|
||||||
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#d0000, X < 16#dfffe ->
|
|
||||||
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
|
|
||||||
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#e0000, X < 16#efffe ->
|
|
||||||
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
|
|
||||||
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#f0000, X < 16#ffffe ->
|
|
||||||
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
|
|
||||||
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config) when X >= 16#100000, X < 16#10fffe ->
|
|
||||||
string(Rest, Handler, acc_seq(Acc, X), Stack, Config);
|
|
||||||
%% partial utf8 codepoints. check that input could possibly be valid before attempting
|
|
||||||
%% to correct
|
|
||||||
string(<<>>, Handler, Acc, Stack, Config) ->
|
string(<<>>, Handler, Acc, Stack, Config) ->
|
||||||
incomplete(string, <<>>, Handler, Acc, Stack, Config);
|
incomplete(string, <<>>, Handler, Acc, Stack, Config);
|
||||||
string(<<X>>, Handler, Acc, Stack, Config) when X >= 16#c2, X =< 16#f4 ->
|
string(<<X>>, Handler, Acc, Stack, Config) when X >= 2#11000000 ->
|
||||||
incomplete(string, <<X>>, Handler, Acc, Stack, Config);
|
incomplete(string, <<X>>, Handler, Acc, Stack, Config);
|
||||||
string(<<X, Y>>, Handler, Acc, Stack, Config) when X >= 16#e0, X =< 16#f4, Y >= 16#80, Y =< 16#bf ->
|
string(<<X, Y>>, Handler, Acc, Stack, Config) when X >= 2#11100000, Y >= 2#10000000 ->
|
||||||
incomplete(string, <<X, Y>>, Handler, Acc, Stack, Config);
|
incomplete(string, <<X, Y>>, Handler, Acc, Stack, Config);
|
||||||
string(<<X, Y, Z>>, Handler, Acc, Stack, Config)
|
string(<<X, Y, Z>>, Handler, Acc, Stack, Config)
|
||||||
when X >= 16#f0, X =< 16#f4,
|
when X >= 2#11100000, Y >= 2#10000000, Z >= 2#10000000 ->
|
||||||
Y >= 16#80, Y =< 16#bf,
|
|
||||||
Z >= 16#80, Z =< 16#bf ->
|
|
||||||
incomplete(string, <<X, Y, Z>>, Handler, Acc, Stack, Config);
|
incomplete(string, <<X, Y, Z>>, Handler, Acc, Stack, Config);
|
||||||
%% surrogates
|
%% surrogates
|
||||||
string(<<237, X, _, Rest/binary>>, Handler, Acc, Stack, Config=#config{strict_utf8=false})
|
string(<<237, X, _, Rest/binary>>, Handler, Acc, Stack, Config=#config{strict_utf8=false})
|
||||||
when X >= 160 ->
|
when X >= 160 ->
|
||||||
string(Rest, Handler, acc_seq(Acc, 16#fffd), Stack, Config);
|
string(Rest, Handler, acc_seq(Acc, 16#fffd), Stack, Config);
|
||||||
%% u+xfffe, u+xffff, control codes and other noncharacters
|
|
||||||
string(<<_/utf8, Rest/binary>>, Handler, Acc, Stack, Config=#config{strict_utf8=false}) ->
|
|
||||||
string(Rest, Handler, acc_seq(Acc, 16#fffd), Stack, Config);
|
|
||||||
%% u+fffe and u+ffff for R14BXX (subsequent runtimes will happily match the
|
|
||||||
%% preceding clause)
|
|
||||||
string(<<239, 191, X, Rest/binary>>, Handler, Acc, Stack, Config=#config{strict_utf8=false})
|
|
||||||
when X == 190; X == 191 ->
|
|
||||||
string(Rest, Handler, acc_seq(Acc, 16#fffd), Stack, Config);
|
|
||||||
%% overlong encodings and missing continuations of a 2 byte sequence
|
%% overlong encodings and missing continuations of a 2 byte sequence
|
||||||
string(<<X, Rest/binary>>, Handler, Acc, Stack, Config=#config{strict_utf8=false})
|
string(<<X, Rest/binary>>, Handler, Acc, Stack, Config=#config{strict_utf8=false})
|
||||||
when X >= 192, X =< 223 ->
|
when X >= 192, X =< 223 ->
|
||||||
|
@ -1268,27 +1224,16 @@ codepoints() ->
|
||||||
lists:seq(35, 46) ++
|
lists:seq(35, 46) ++
|
||||||
lists:seq(48, 91) ++
|
lists:seq(48, 91) ++
|
||||||
lists:seq(93, 127) ++
|
lists:seq(93, 127) ++
|
||||||
[16#2027, 16#202a, 16#d7ff, 16#e000, 16#fdcf, 16#fdf0, 16#fffd] ++
|
[16#2027, 16#202a, 16#d7ff, 16#e000] ++
|
||||||
[16#10000, 16#1fffd, 16#20000, 16#30000, 16#40000, 16#50000] ++
|
lists:seq(16#fdd0, 16#ffff) ++
|
||||||
|
[16#10000, 16#20000, 16#30000, 16#40000, 16#50000] ++
|
||||||
[16#60000, 16#70000, 16#80000, 16#90000, 16#a0000, 16#b0000] ++
|
[16#60000, 16#70000, 16#80000, 16#90000, 16#a0000, 16#b0000] ++
|
||||||
[16#c0000, 16#d0000, 16#e0000, 16#f0000, 16#100000].
|
[16#c0000, 16#d0000, 16#e0000, 16#f0000, 16#100000].
|
||||||
|
|
||||||
reserved_space() -> lists:seq(16#fdd0, 16#fdef).
|
controls() -> lists:seq(0, 31).
|
||||||
|
|
||||||
surrogates() -> lists:seq(16#d800, 16#dfff).
|
surrogates() -> lists:seq(16#d800, 16#dfff).
|
||||||
|
|
||||||
noncharacters() -> lists:seq(16#fffe, 16#ffff).
|
|
||||||
|
|
||||||
extended_noncharacters() ->
|
|
||||||
[16#1fffe, 16#1ffff, 16#2fffe, 16#2ffff] ++
|
|
||||||
[16#3fffe, 16#3ffff, 16#4fffe, 16#4ffff] ++
|
|
||||||
[16#5fffe, 16#5ffff, 16#6fffe, 16#6ffff] ++
|
|
||||||
[16#7fffe, 16#7ffff, 16#8fffe, 16#8ffff] ++
|
|
||||||
[16#9fffe, 16#9ffff, 16#afffe, 16#affff] ++
|
|
||||||
[16#bfffe, 16#bffff, 16#cfffe, 16#cffff] ++
|
|
||||||
[16#dfffe, 16#dffff, 16#efffe, 16#effff] ++
|
|
||||||
[16#ffffe, 16#fffff, 16#10fffe, 16#10ffff].
|
|
||||||
|
|
||||||
|
|
||||||
%% erlang refuses to decode certain codepoints, so fake them all
|
%% erlang refuses to decode certain codepoints, so fake them all
|
||||||
to_fake_utf8(N) when N < 16#0080 -> <<34/utf8, N:8, 34/utf8>>;
|
to_fake_utf8(N) when N < 16#0080 -> <<34/utf8, N:8, 34/utf8>>;
|
||||||
|
@ -1305,7 +1250,7 @@ to_fake_utf8(N) ->
|
||||||
|
|
||||||
clean_string_test_() ->
|
clean_string_test_() ->
|
||||||
Clean = codepoints(),
|
Clean = codepoints(),
|
||||||
Dirty = reserved_space() ++ surrogates() ++ noncharacters() ++ extended_noncharacters(),
|
Dirty = surrogates() ++ controls(),
|
||||||
% clean codepoints
|
% clean codepoints
|
||||||
[{"clean u+" ++ integer_to_list(Codepoint, 16), ?_assertEqual(
|
[{"clean u+" ++ integer_to_list(Codepoint, 16), ?_assertEqual(
|
||||||
[{string, <<Codepoint/utf8>>}, end_json],
|
[{string, <<Codepoint/utf8>>}, end_json],
|
||||||
|
@ -1363,11 +1308,6 @@ dirty_string_test_() ->
|
||||||
<<"[\"", 237, 160, 128, "\"]">>,
|
<<"[\"", 237, 160, 128, "\"]">>,
|
||||||
[dirty_strings]
|
[dirty_strings]
|
||||||
},
|
},
|
||||||
{"dirty 16#10ffff",
|
|
||||||
[start_array, {string, <<244, 143, 191, 191>>}, end_array, end_json],
|
|
||||||
<<"[\"", 244, 143, 191, 191, "\"]">>,
|
|
||||||
[dirty_strings]
|
|
||||||
},
|
|
||||||
{"dirty /",
|
{"dirty /",
|
||||||
[start_array, {string, <<$/>>}, end_array, end_json],
|
[start_array, {string, <<$/>>}, end_array, end_json],
|
||||||
<<"[\"", $/, "\"]">>,
|
<<"[\"", $/, "\"]">>,
|
||||||
|
@ -1393,8 +1333,6 @@ dirty_string_test_() ->
|
||||||
|
|
||||||
bad_utf8_test_() ->
|
bad_utf8_test_() ->
|
||||||
Cases = [
|
Cases = [
|
||||||
{"noncharacter u+fffe", <<16#fffd/utf8>>, <<239, 191, 190>>},
|
|
||||||
{"noncharacter u+ffff", <<16#fffd/utf8>>, <<239, 191, 191>>},
|
|
||||||
{"orphan continuation byte u+0080", <<16#fffd/utf8>>, <<16#0080>>},
|
{"orphan continuation byte u+0080", <<16#fffd/utf8>>, <<16#0080>>},
|
||||||
{"orphan continuation byte u+00bf", <<16#fffd/utf8>>, <<16#00bf>>},
|
{"orphan continuation byte u+00bf", <<16#fffd/utf8>>, <<16#00bf>>},
|
||||||
{"2 continuation bytes",
|
{"2 continuation bytes",
|
||||||
|
@ -1610,7 +1548,6 @@ embedded_single_quoted_string_test_() ->
|
||||||
decode(<<34, "quoth the raven, 'nevermore'", 34>>, [{strict, [single_quotes]}])
|
decode(<<34, "quoth the raven, 'nevermore'", 34>>, [{strict, [single_quotes]}])
|
||||||
)}
|
)}
|
||||||
].
|
].
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
ignored_bad_escapes_test_() ->
|
ignored_bad_escapes_test_() ->
|
||||||
|
|
|
@ -103,8 +103,8 @@ custom_error_handler_test_() ->
|
||||||
parser(self(), [{error_handler, Error}])
|
parser(self(), [{error_handler, Error}])
|
||||||
)},
|
)},
|
||||||
{"string error", ?_assertEqual(
|
{"string error", ?_assertEqual(
|
||||||
{value, [{string, <<239, 191, 191>>}]},
|
{value, [{string, <<237, 160, 128>>}]},
|
||||||
parser(<<239, 191, 191>>, [{error_handler, Error}, strict])
|
parser(<<237, 160, 128>>, [{error_handler, Error}, strict])
|
||||||
)}
|
)}
|
||||||
].
|
].
|
||||||
|
|
||||||
|
|
|
@ -355,53 +355,11 @@ clean(<<X/utf8, Rest/binary>>, Acc, Config=#config{uescape=true}) ->
|
||||||
maybe_replace(X, Rest, Acc, Config);
|
maybe_replace(X, Rest, Acc, Config);
|
||||||
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X == 16#2028; X == 16#2029 ->
|
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X == 16#2028; X == 16#2029 ->
|
||||||
maybe_replace(X, Rest, Acc, Config);
|
maybe_replace(X, Rest, Acc, Config);
|
||||||
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X < 16#d800 ->
|
clean(<<X/utf8, Rest/binary>>, Acc, Config) ->
|
||||||
clean(Rest, [X] ++ Acc, Config);
|
|
||||||
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X > 16#dfff, X < 16#fdd0 ->
|
|
||||||
clean(Rest, [X] ++ Acc, Config);
|
|
||||||
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X > 16#fdef, X < 16#fffe ->
|
|
||||||
clean(Rest, [X] ++ Acc, Config);
|
|
||||||
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#10000, X < 16#1fffe ->
|
|
||||||
clean(Rest, [X] ++ Acc, Config);
|
|
||||||
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#20000, X < 16#2fffe ->
|
|
||||||
clean(Rest, [X] ++ Acc, Config);
|
|
||||||
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#30000, X < 16#3fffe ->
|
|
||||||
clean(Rest, [X] ++ Acc, Config);
|
|
||||||
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#40000, X < 16#4fffe ->
|
|
||||||
clean(Rest, [X] ++ Acc, Config);
|
|
||||||
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#50000, X < 16#5fffe ->
|
|
||||||
clean(Rest, [X] ++ Acc, Config);
|
|
||||||
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#60000, X < 16#6fffe ->
|
|
||||||
clean(Rest, [X] ++ Acc, Config);
|
|
||||||
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#70000, X < 16#7fffe ->
|
|
||||||
clean(Rest, [X] ++ Acc, Config);
|
|
||||||
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#80000, X < 16#8fffe ->
|
|
||||||
clean(Rest, [X] ++ Acc, Config);
|
|
||||||
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#90000, X < 16#9fffe ->
|
|
||||||
clean(Rest, [X] ++ Acc, Config);
|
|
||||||
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#a0000, X < 16#afffe ->
|
|
||||||
clean(Rest, [X] ++ Acc, Config);
|
|
||||||
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#b0000, X < 16#bfffe ->
|
|
||||||
clean(Rest, [X] ++ Acc, Config);
|
|
||||||
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#c0000, X < 16#cfffe ->
|
|
||||||
clean(Rest, [X] ++ Acc, Config);
|
|
||||||
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#d0000, X < 16#dfffe ->
|
|
||||||
clean(Rest, [X] ++ Acc, Config);
|
|
||||||
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#e0000, X < 16#efffe ->
|
|
||||||
clean(Rest, [X] ++ Acc, Config);
|
|
||||||
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#f0000, X < 16#ffffe ->
|
|
||||||
clean(Rest, [X] ++ Acc, Config);
|
|
||||||
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X >= 16#100000, X < 16#10fffe ->
|
|
||||||
clean(Rest, [X] ++ Acc, Config);
|
clean(Rest, [X] ++ Acc, Config);
|
||||||
%% surrogates
|
%% surrogates
|
||||||
clean(<<237, X, _, Rest/binary>>, Acc, Config) when X >= 160 ->
|
clean(<<237, X, _, Rest/binary>>, Acc, Config) when X >= 160 ->
|
||||||
maybe_replace(surrogate, Rest, Acc, Config);
|
maybe_replace(surrogate, Rest, Acc, Config);
|
||||||
%% noncharacters
|
|
||||||
clean(<<_/utf8, Rest/binary>>, Acc, Config) ->
|
|
||||||
maybe_replace(noncharacter, Rest, Acc, Config);
|
|
||||||
%% u+fffe and u+ffff for R14BXX
|
|
||||||
clean(<<239, 191, X, Rest/binary>>, Acc, Config) when X == 190; X == 191 ->
|
|
||||||
maybe_replace(noncharacter, Rest, Acc, Config);
|
|
||||||
%% overlong encodings and missing continuations of a 2 byte sequence
|
%% overlong encodings and missing continuations of a 2 byte sequence
|
||||||
clean(<<X, Rest/binary>>, Acc, Config) when X >= 192, X =< 223 ->
|
clean(<<X, Rest/binary>>, Acc, Config) when X >= 192, X =< 223 ->
|
||||||
maybe_replace(badutf, strip_continuations(Rest, 1), Acc, Config);
|
maybe_replace(badutf, strip_continuations(Rest, 1), Acc, Config);
|
||||||
|
@ -500,7 +458,7 @@ error_test_() ->
|
||||||
{"value error", ?_assertError(badarg, parse([self()], []))},
|
{"value error", ?_assertError(badarg, parse([self()], []))},
|
||||||
{"maybe_done error", ?_assertError(badarg, parse([start_array, end_array, start_array, end_json], []))},
|
{"maybe_done error", ?_assertError(badarg, parse([start_array, end_array, start_array, end_json], []))},
|
||||||
{"done error", ?_assertError(badarg, parse([{string, <<"">>}, {literal, true}, end_json], []))},
|
{"done error", ?_assertError(badarg, parse([{string, <<"">>}, {literal, true}, end_json], []))},
|
||||||
{"string error", ?_assertError(badarg, parse([{string, <<239, 191, 191>>}, end_json], [strict]))}
|
{"string error", ?_assertError(badarg, parse([{string, <<237, 160, 128>>}, end_json], [strict]))}
|
||||||
].
|
].
|
||||||
|
|
||||||
|
|
||||||
|
@ -520,8 +478,8 @@ custom_error_handler_test_() ->
|
||||||
parse([{string, <<"">>}, {literal, true}, end_json], [{error_handler, Error}])
|
parse([{string, <<"">>}, {literal, true}, end_json], [{error_handler, Error}])
|
||||||
)},
|
)},
|
||||||
{"string error", ?_assertEqual(
|
{"string error", ?_assertEqual(
|
||||||
{value, [{string, <<239, 191, 191>>}, end_json]},
|
{value, [{string, <<237, 160, 128>>}, end_json]},
|
||||||
parse([{string, <<239, 191, 191>>}, end_json], [{error_handler, Error}, strict])
|
parse([{string, <<237, 160, 128>>}, end_json], [{error_handler, Error}, strict])
|
||||||
)}
|
)}
|
||||||
].
|
].
|
||||||
|
|
||||||
|
@ -585,36 +543,20 @@ codepoints() ->
|
||||||
++ lists:seq(48, 91)
|
++ lists:seq(48, 91)
|
||||||
++ lists:seq(93, 16#2027)
|
++ lists:seq(93, 16#2027)
|
||||||
++ lists:seq(16#202a, 16#d7ff)
|
++ lists:seq(16#202a, 16#d7ff)
|
||||||
++ lists:seq(16#e000, 16#fdcf)
|
++ lists:seq(16#e000, 16#ffff)
|
||||||
++ lists:seq(16#fdf0, 16#fffd)
|
|
||||||
).
|
).
|
||||||
|
|
||||||
extended_codepoints() ->
|
extended_codepoints() ->
|
||||||
unicode:characters_to_binary(
|
unicode:characters_to_binary(
|
||||||
lists:seq(16#10000, 16#1fffd) ++ [
|
lists:seq(16#10000, 16#1ffff) ++ [
|
||||||
16#20000, 16#30000, 16#40000, 16#50000, 16#60000,
|
16#20000, 16#30000, 16#40000, 16#50000, 16#60000,
|
||||||
16#70000, 16#80000, 16#90000, 16#a0000, 16#b0000,
|
16#70000, 16#80000, 16#90000, 16#a0000, 16#b0000,
|
||||||
16#c0000, 16#d0000, 16#e0000, 16#f0000, 16#100000
|
16#c0000, 16#d0000, 16#e0000, 16#f0000, 16#100000
|
||||||
]
|
]
|
||||||
).
|
).
|
||||||
|
|
||||||
reserved_space() -> [ to_fake_utf8(N) || N <- lists:seq(16#fdd0, 16#fdef) ].
|
|
||||||
|
|
||||||
surrogates() -> [ to_fake_utf8(N) || N <- lists:seq(16#d800, 16#dfff) ].
|
surrogates() -> [ to_fake_utf8(N) || N <- lists:seq(16#d800, 16#dfff) ].
|
||||||
|
|
||||||
noncharacters() -> [ to_fake_utf8(N) || N <- lists:seq(16#fffe, 16#ffff) ].
|
|
||||||
|
|
||||||
extended_noncharacters() ->
|
|
||||||
[ to_fake_utf8(N) || N <- [16#1fffe, 16#1ffff, 16#2fffe, 16#2ffff]
|
|
||||||
++ [16#3fffe, 16#3ffff, 16#4fffe, 16#4ffff]
|
|
||||||
++ [16#5fffe, 16#5ffff, 16#6fffe, 16#6ffff]
|
|
||||||
++ [16#7fffe, 16#7ffff, 16#8fffe, 16#8ffff]
|
|
||||||
++ [16#9fffe, 16#9ffff, 16#afffe, 16#affff]
|
|
||||||
++ [16#bfffe, 16#bffff, 16#cfffe, 16#cffff]
|
|
||||||
++ [16#dfffe, 16#dffff, 16#efffe, 16#effff]
|
|
||||||
++ [16#ffffe, 16#fffff, 16#10fffe, 16#10ffff]
|
|
||||||
].
|
|
||||||
|
|
||||||
clean_string_helper(String) ->
|
clean_string_helper(String) ->
|
||||||
try clean_string(String, #config{strict_utf8=true}) of Clean -> Clean
|
try clean_string(String, #config{strict_utf8=true}) of Clean -> Clean
|
||||||
catch error:badarg -> {error, badarg}
|
catch error:badarg -> {error, badarg}
|
||||||
|
@ -638,37 +580,13 @@ clean_string_test_() ->
|
||||||
extended_codepoints(),
|
extended_codepoints(),
|
||||||
clean_string(extended_codepoints(), #config{escaped_strings=true})
|
clean_string(extended_codepoints(), #config{escaped_strings=true})
|
||||||
)},
|
)},
|
||||||
{"error reserved space", ?_assertEqual(
|
|
||||||
lists:duplicate(length(reserved_space()), {error, badarg}),
|
|
||||||
lists:map(fun(Codepoint) -> clean_string_helper(Codepoint) end, reserved_space())
|
|
||||||
)},
|
|
||||||
{"error surrogates", ?_assertEqual(
|
{"error surrogates", ?_assertEqual(
|
||||||
lists:duplicate(length(surrogates()), {error, badarg}),
|
lists:duplicate(length(surrogates()), {error, badarg}),
|
||||||
lists:map(fun(Codepoint) -> clean_string_helper(Codepoint) end, surrogates())
|
lists:map(fun(Codepoint) -> clean_string_helper(Codepoint) end, surrogates())
|
||||||
)},
|
)},
|
||||||
{"error noncharacters", ?_assertEqual(
|
|
||||||
lists:duplicate(length(noncharacters()), {error, badarg}),
|
|
||||||
lists:map(fun(Codepoint) -> clean_string_helper(Codepoint) end, noncharacters())
|
|
||||||
)},
|
|
||||||
{"error extended noncharacters", ?_assertEqual(
|
|
||||||
lists:duplicate(length(extended_noncharacters()), {error, badarg}),
|
|
||||||
lists:map(fun(Codepoint) -> clean_string_helper(Codepoint) end, extended_noncharacters())
|
|
||||||
)},
|
|
||||||
{"clean reserved space", ?_assertEqual(
|
|
||||||
lists:duplicate(length(reserved_space()), <<16#fffd/utf8>>),
|
|
||||||
lists:map(fun(Codepoint) -> clean_string(Codepoint, #config{}) end, reserved_space())
|
|
||||||
)},
|
|
||||||
{"clean surrogates", ?_assertEqual(
|
{"clean surrogates", ?_assertEqual(
|
||||||
lists:duplicate(length(surrogates()), <<16#fffd/utf8>>),
|
lists:duplicate(length(surrogates()), <<16#fffd/utf8>>),
|
||||||
lists:map(fun(Codepoint) -> clean_string(Codepoint, #config{}) end, surrogates())
|
lists:map(fun(Codepoint) -> clean_string(Codepoint, #config{}) end, surrogates())
|
||||||
)},
|
|
||||||
{"clean noncharacters", ?_assertEqual(
|
|
||||||
lists:duplicate(length(noncharacters()), <<16#fffd/utf8>>),
|
|
||||||
lists:map(fun(Codepoint) -> clean_string(Codepoint, #config{}) end, noncharacters())
|
|
||||||
)},
|
|
||||||
{"clean extended noncharacters", ?_assertEqual(
|
|
||||||
lists:duplicate(length(extended_noncharacters()), <<16#fffd/utf8>>),
|
|
||||||
lists:map(fun(Codepoint) -> clean_string(Codepoint, #config{}) end, extended_noncharacters())
|
|
||||||
)}
|
)}
|
||||||
].
|
].
|
||||||
|
|
||||||
|
@ -844,22 +762,6 @@ escape_test_() ->
|
||||||
|
|
||||||
bad_utf8_test_() ->
|
bad_utf8_test_() ->
|
||||||
[
|
[
|
||||||
{"noncharacter u+fffe", ?_assertError(
|
|
||||||
badarg,
|
|
||||||
clean_string(to_fake_utf8(16#fffe), #config{strict_utf8=true})
|
|
||||||
)},
|
|
||||||
{"noncharacter u+fffe replaced", ?_assertEqual(
|
|
||||||
<<16#fffd/utf8>>,
|
|
||||||
clean_string(to_fake_utf8(16#fffe), #config{})
|
|
||||||
)},
|
|
||||||
{"noncharacter u+ffff", ?_assertError(
|
|
||||||
badarg,
|
|
||||||
clean_string(to_fake_utf8(16#ffff), #config{strict_utf8=true})
|
|
||||||
)},
|
|
||||||
{"noncharacter u+ffff replaced", ?_assertEqual(
|
|
||||||
<<16#fffd/utf8>>,
|
|
||||||
clean_string(to_fake_utf8(16#ffff), #config{})
|
|
||||||
)},
|
|
||||||
{"orphan continuation byte u+0080", ?_assertError(
|
{"orphan continuation byte u+0080", ?_assertError(
|
||||||
badarg,
|
badarg,
|
||||||
clean_string(<<16#0080>>, #config{strict_utf8=true})
|
clean_string(<<16#0080>>, #config{strict_utf8=true})
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue