test every codepoint possible for replacement/badness
This commit is contained in:
parent
0e66a82b29
commit
cd4f4a8f1c
1 changed file with 185 additions and 7 deletions
|
@ -92,26 +92,31 @@
|
|||
|
||||
%% partial codepoint max size differs across encodings
|
||||
-ifdef(utf8).
|
||||
-define(encoding, utf8).
|
||||
-define(utfx, utf8).
|
||||
-define(partial_codepoint(Bin), byte_size(Bin) < 1).
|
||||
-endif.
|
||||
|
||||
-ifdef(utf16).
|
||||
-define(encoding, utf16).
|
||||
-define(utfx, utf16).
|
||||
-define(partial_codepoint(Bin), byte_size(Bin) < 2).
|
||||
-endif.
|
||||
|
||||
-ifdef(utf16le).
|
||||
-define(encoding, utf16le).
|
||||
-define(utfx, utf16-little).
|
||||
-define(partial_codepoint(Bin), byte_size(Bin) < 2).
|
||||
-endif.
|
||||
|
||||
-ifdef(utf32).
|
||||
-define(encoding, utf32).
|
||||
-define(utfx, utf32).
|
||||
-define(partial_codepoint(Bin), byte_size(Bin) < 4).
|
||||
-endif.
|
||||
|
||||
-ifdef(utf32le).
|
||||
-define(encoding, utf32le).
|
||||
-define(utfx, utf32-little).
|
||||
-define(partial_codepoint(Bin), byte_size(Bin) < 4).
|
||||
-endif.
|
||||
|
@ -390,7 +395,7 @@ string(<<S/?utfx, Rest/binary>>, Stack, Opts, Acc)
|
|||
S =/= 16#dfffe andalso S =/= 16#dffff andalso
|
||||
S =/= 16#efffe andalso S =/= 16#effff andalso
|
||||
S =/= 16#ffffe andalso S =/= 16#fffff andalso
|
||||
S =/= 16#101fffe andalso S =/= 16#10ffff ->
|
||||
S =/= 16#10fffe andalso S =/= 16#10ffff ->
|
||||
string(Rest, Stack, Opts, <<Acc/binary, S/utf8>>);
|
||||
string(Bin, Stack, Opts, Acc) ->
|
||||
case partial_utf(Bin) of
|
||||
|
@ -488,11 +493,11 @@ noncharacter(Bin, _Stack, _Opts, _Acc) ->
|
|||
-ifdef(utf16).
|
||||
%% noncharacters are swapped for the replacement character u+fffd
|
||||
noncharacter(<<S/utf16, Rest/binary>>, Stack, Opts, Acc)
|
||||
when ?is_noncontrol(S) ->
|
||||
when ?is_noncontrol(S), S < 16#fffe ->
|
||||
string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
|
||||
%% u+ffff and u+fffe
|
||||
noncharacter(<<255, X, Rest/binary>>, Stack, Opts, Acc)
|
||||
when X == 253; X == 254 ->
|
||||
when X == 254; X == 255 ->
|
||||
string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
|
||||
%% surrogates
|
||||
noncharacter(<<X, _, Rest/binary>>, Stack, Opts, Acc)
|
||||
|
@ -503,12 +508,13 @@ noncharacter(Bin, _Stack, _Opts, _Acc) ->
|
|||
-endif.
|
||||
|
||||
-ifdef(utf16le).
|
||||
%% noncharacters are swapped for the replacement character u+fffd
|
||||
noncharacter(<<S/utf16-little, Rest/binary>>, Stack, Opts, Acc)
|
||||
when ?is_noncontrol(S) ->
|
||||
when ?is_noncontrol(S), S < 16#fffe ->
|
||||
string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
|
||||
%% u+ffff and u+fffe
|
||||
noncharacter(<<X, 255, Rest/binary>>, Stack, Opts, Acc)
|
||||
when X == 253; X == 254 ->
|
||||
when X == 254; X == 255 ->
|
||||
string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
|
||||
%% surrogates
|
||||
noncharacter(<<_, X, Rest/binary>>, Stack, Opts, Acc)
|
||||
|
@ -519,6 +525,7 @@ noncharacter(Bin, _Stack, _Opts, _Acc) ->
|
|||
-endif.
|
||||
|
||||
-ifdef(utf32).
|
||||
%% noncharacters are swapped for the replacement character u+fffd
|
||||
noncharacter(<<S/utf32, Rest/binary>>, Stack, Opts, Acc)
|
||||
when ?is_noncontrol(S) ->
|
||||
string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
|
||||
|
@ -535,6 +542,7 @@ noncharacter(Bin, _Stack, _Opts, _Acc) ->
|
|||
-endif.
|
||||
|
||||
-ifdef(utf32le).
|
||||
%% noncharacters are swapped for the replacement character u+fffd
|
||||
noncharacter(<<S/utf32-little, Rest/binary>>, Stack, Opts, Acc)
|
||||
when ?is_noncontrol(S) ->
|
||||
string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
|
||||
|
@ -1006,7 +1014,6 @@ format_number({Int, [], Exp}) ->
|
|||
format_number({Int, Frac, Exp}) ->
|
||||
{float, list_to_float(lists:reverse(Exp ++ "e" ++ Frac ++ "." ++ Int))}.
|
||||
|
||||
|
||||
|
||||
|
||||
tr(<<$r/?utfx, Rest/binary>>, Stack, Opts) ->
|
||||
|
@ -1146,4 +1153,175 @@ null(Bin, Stack, Opts) ->
|
|||
null(<<Bin/binary, Stream/binary>>, Stack, Opts)
|
||||
end}
|
||||
; false -> {error, {badjson, Bin}}
|
||||
end.
|
||||
end.
|
||||
|
||||
|
||||
-ifdef(TEST).
|
||||
-include_lib("eunit/include/eunit.hrl").
|
||||
|
||||
|
||||
%% u+fffe and u+ffff: expected to be rejected with default options and
%% replaced with u+fffd under loose_unicode. Each check returns [] when
%% every codepoint behaved as expected.
%% EUnit's ?_assertEqual takes the expected value first, so failure
%% reports label the offending results as the actual value.
noncharacters_test_() ->
    [
        {"noncharacters - badjson",
            ?_assertEqual([], check_bad(noncharacters()))
        },
        {"noncharacters - replaced",
            ?_assertEqual([], check_replaced(noncharacters()))
        }
    ].
|
||||
|
||||
%% The u+xfffe / u+xffff noncharacters above the basic plane: expected to
%% be rejected with default options and replaced under loose_unicode.
%% Each check returns [] when every codepoint behaved as expected.
%% EUnit's ?_assertEqual takes the expected value first, so failure
%% reports label the offending results as the actual value.
extended_noncharacters_test_() ->
    [
        {"extended noncharacters - badjson",
            ?_assertEqual([], check_bad(extended_noncharacters()))
        },
        {"extended noncharacters - replaced",
            ?_assertEqual([], check_extended_replaced(extended_noncharacters()))
        }
    ].
|
||||
|
||||
%% Surrogates u+d800..u+dfff: expected to be rejected with default
%% options and replaced with u+fffd under loose_unicode. Each check
%% returns [] when every codepoint behaved as expected.
%% EUnit's ?_assertEqual takes the expected value first, so failure
%% reports label the offending results as the actual value.
surrogates_test_() ->
    [
        {"surrogates - badjson",
            ?_assertEqual([], check_bad(surrogates()))
        },
        {"surrogates - replaced",
            ?_assertEqual([], check_replaced(surrogates()))
        }
    ].
|
||||
|
||||
%% Control characters 1..31 inside a string: expected to be rejected
%% with default options. check_bad/1 returns [] when all were rejected.
%% EUnit's ?_assertEqual takes the expected value first, so failure
%% reports label the offending results as the actual value.
control_test_() ->
    [
        {"control characters - badjson",
            ?_assertEqual([], check_bad(control_characters()))
        }
    ].
|
||||
|
||||
%% Reserved noncharacters u+fdd0..u+fdef: expected to be rejected with
%% default options and replaced with u+fffd under loose_unicode. Each
%% check returns [] when every codepoint behaved as expected.
%% EUnit's ?_assertEqual takes the expected value first, so failure
%% reports label the offending results as the actual value.
reserved_test_() ->
    [
        {"reserved noncharacters - badjson",
            ?_assertEqual([], check_bad(reserved_space()))
        },
        {"reserved noncharacters - replaced",
            ?_assertEqual([], check_replaced(reserved_space()))
        }
    ].
|
||||
|
||||
%% A null byte inside a string: expected to be rejected with default
%% options. check_bad/1 returns [] when it was rejected.
%% EUnit's ?_assertEqual takes the expected value first, so failure
%% reports label the offending results as the actual value.
zero_test_() ->
    [
        {"nullbyte - badjson",
            ?_assertEqual([], check_bad(zero()))
        }
    ].
|
||||
|
||||
%% Codepoints that should decode to a string event with default options.
%% check_good/1 returns [] when every codepoint was accepted.
%% EUnit's ?_assertEqual takes the expected value first, so failure
%% reports label the offending results as the actual value.
good_characters_test_() ->
    [
        {"acceptable codepoints",
            ?_assertEqual([], check_good(good()))
        },
        {"acceptable extended",
            ?_assertEqual([], check_good(good_extended()))
        }
    ].
|
||||
|
||||
|
||||
%% Decode every codepoint in List with default options. Returns the
%% results from the first one that was not {error, badjson} onwards;
%% [] means every codepoint was rejected.
check_bad(List) ->
    Results = check(List, [], []),
    Rejected = fun
        ({_, {error, badjson}}) -> true;
        (_) -> false
    end,
    lists:dropwhile(Rejected, Results).
|
||||
|
||||
%% Decode every codepoint in List with loose_unicode. Returns the results
%% from the first one whose leading event is not a string holding exactly
%% u+fffd onwards; [] means every codepoint was replaced.
check_replaced(List) ->
    Results = check(List, [loose_unicode], []),
    Replaced = fun
        ({_, [{string, <<16#fffd/utf8>>} | _]}) -> true;
        (_) -> false
    end,
    lists:dropwhile(Replaced, Results).
|
||||
|
||||
%% Like check_replaced/1, but for codepoints above the basic plane.
%% For utf16 and utf16le the expected string is two replacement
%% characters; for every other encoding it is one.
%% NOTE(review): presumably each half of the surrogate pair is replaced
%% separately in the 16-bit encodings -- confirm against the decoder.
check_extended_replaced(List) ->
    Expected = case lists:member(?encoding, [utf16, utf16le]) of
        true -> <<16#fffd/utf8, 16#fffd/utf8>>;
        false -> <<16#fffd/utf8>>
    end,
    Replaced = fun
        ({_, [{string, S} | _]}) -> S =:= Expected;
        (_) -> false
    end,
    lists:dropwhile(Replaced, check(List, [loose_unicode], [])).
|
||||
|
||||
%% Decode every codepoint in List with default options. Returns the
%% results from the first one whose leading event is not a string
%% onwards; [] means every codepoint was accepted.
check_good(List) ->
    Results = check(List, [], []),
    Accepted = fun
        ({_, [{string, _} | _]}) -> true;
        (_) -> false
    end,
    lists:dropwhile(Accepted, Results).
|
||||
|
||||
%% Decode each codepoint of List as a quoted string in the module's
%% encoding, prepending {Codepoint, Result} to Acc. Results therefore
%% come back in reverse input order.
check(List, Opts, Acc) ->
    lists:foldl(
        fun(Codepoint, Results) ->
            Decoded = decode(to_fake_utf(Codepoint, ?encoding), Opts),
            [{Codepoint, Decoded} | Results]
        end,
        Acc,
        List
    ).
|
||||
|
||||
|
||||
%% Build a decoder from Opts, feed it JSON and collect the events it
%% emits (see loop/2 for the result shape).
decode(JSON, Opts) ->
    Decoder = decoder(Opts),
    FirstResult = Decoder(JSON),
    loop(FirstResult, []).
|
||||
|
||||
|
||||
%% Drive a decoder to completion. Returns the events seen before
%% end_json, in order. An incomplete result is resumed with end_stream.
%% Anything that is not a {jsx, _, _} tuple yields {error, badjson}.
loop({jsx, end_json, _}, Events) ->
    lists:reverse(Events);
loop({jsx, incomplete, Resume}, Events) ->
    loop(Resume(end_stream), Events);
loop({jsx, Event, Continue}, Events) ->
    loop(Continue(), [Event | Events]);
loop(_, _) ->
    {error, badjson}.
|
||||
|
||||
|
||||
|
||||
%% the two basic-plane noncharacters, u+fffe and u+ffff
noncharacters() -> [16#fffe, 16#ffff].
|
||||
|
||||
%% u+xfffe and u+xffff for each of planes 1 through 16, in ascending order
extended_noncharacters() ->
    [(Plane bsl 16) bor Low
        || Plane <- lists:seq(1, 16), Low <- [16#fffe, 16#ffff]].
|
||||
|
||||
%% every surrogate codepoint, u+d800 through u+dfff
surrogates() -> lists:seq(16#d800, 16#dfff, 1).
|
||||
|
||||
%% control characters u+0001 through u+001f (the null byte is covered by zero/0)
control_characters() -> lists:seq(16#01, 16#1f).
|
||||
|
||||
%% the reserved range u+fdd0 through u+fdef
reserved_space() -> [16#fdd0 + Offset || Offset <- lists:seq(0, 16#1f)].
|
||||
|
||||
%% the null byte on its own
zero() -> [16#00].
|
||||
|
||||
%% Basic-plane codepoints from u+0020 up to u+fffd, leaving out the
%% double quote (u+0022), the backslash (u+005c), the surrogates
%% (u+d800..u+dfff) and the reserved range u+fdd0..u+fdef.
good() ->
    lists:append([
        [16#20, 16#21],
        lists:seq(16#23, 16#5b),
        lists:seq(16#5d, 16#d7ff),
        lists:seq(16#e000, 16#fdcf),
        lists:seq(16#fdf0, 16#fffd)
    ]).
|
||||
|
||||
%% u+100000 through u+10fffd: plane 16 minus its two trailing noncharacters
good_extended() -> lists:seq(16#100000, 16#10fffd, 1).
|
||||
|
||||
%% Build the JSON string "<N>" (codepoint N between double quotes) in the
%% requested encoding. The bytes are laid out by hand because erlang
%% refuses to encode certain codepoints through the utf bit syntax types.
%% Only utf16/utf16le codepoints of u+10000 and up use the built-in
%% encoder.

%% utf8, one byte: 0xxxxxxx
to_fake_utf(N, utf8) when N < 16#0080 ->
    <<$", N:8, $">>;
%% utf8, two bytes: 110yyyyy 10xxxxxx
to_fake_utf(N, utf8) when N < 16#0800 ->
    <<_:5, Hi:5, Lo:6>> = <<N:16>>,
    <<$", 2#110:3, Hi:5, 2#10:2, Lo:6, $">>;
%% utf8, three bytes: 1110zzzz 10yyyyyy 10xxxxxx
to_fake_utf(N, utf8) when N < 16#10000 ->
    <<Hi:4, Mid:6, Lo:6>> = <<N:16>>,
    <<$", 2#1110:4, Hi:4, 2#10:2, Mid:6, 2#10:2, Lo:6, $">>;
%% utf8, four bytes: 11110www 10zzzzzz 10yyyyyy 10xxxxxx
to_fake_utf(N, utf8) ->
    <<0:3, Top:3, Hi:6, Mid:6, Lo:6>> = <<N:24>>,
    <<$", 2#11110:5, Top:3, 2#10:2, Hi:6, 2#10:2, Mid:6, 2#10:2, Lo:6, $">>;

%% utf16 big endian: one raw 16-bit unit below u+10000
to_fake_utf(N, utf16) when N < 16#10000 ->
    <<34:16, N:16, 34:16>>;
to_fake_utf(N, utf16) ->
    <<34:16, N/utf16, 34:16>>;

%% utf16 little endian: same layout with the bytes of each unit swapped
to_fake_utf(N, utf16le) when N < 16#10000 ->
    <<34:16/little, N:16/little, 34:16/little>>;
to_fake_utf(N, utf16le) ->
    <<34:16/little, N/utf16-little, 34:16/little>>;

%% utf32 big endian: every codepoint is one raw 32-bit unit
to_fake_utf(N, utf32) ->
    <<34:32, N:32, 34:32>>;

%% utf32 little endian
to_fake_utf(N, utf32le) ->
    <<34:32/little, N:32/little, 34:32/little>>.
|
||||
|
||||
|
||||
|
||||
|
||||
-endif.
|
Loading…
Add table
Add a link
Reference in a new issue