test every codepoint possible for replacement/badness

This commit is contained in:
alisdair sullivan 2011-07-28 18:47:58 -07:00
parent 0e66a82b29
commit cd4f4a8f1c

View file

@ -92,26 +92,31 @@
%% partial codepoint max size differs across encodings
%%
%% each branch below is selected by a compile time define naming the input
%% encoding (presumably exactly one is set per build -- TODO confirm against
%% the build configuration) and provides three macros:
%%   ?encoding                the name of the encoding as an atom
%%   ?utfx                    the bit syntax type specifier used when matching
%%                            a single codepoint out of the input binary
%%   ?partial_codepoint(Bin)  true when Bin holds fewer bytes than the
%%                            threshold given for the encoding
-ifdef(utf8).
-define(encoding, utf8).
-define(utfx, utf8).
-define(partial_codepoint(Bin), byte_size(Bin) < 1).
-endif.
-ifdef(utf16).
-define(encoding, utf16).
-define(utfx, utf16).
-define(partial_codepoint(Bin), byte_size(Bin) < 2).
-endif.
-ifdef(utf16le).
-define(encoding, utf16le).
%% little endian variants need the endianness flag in the type specifier
-define(utfx, utf16-little).
-define(partial_codepoint(Bin), byte_size(Bin) < 2).
-endif.
-ifdef(utf32).
-define(encoding, utf32).
-define(utfx, utf32).
-define(partial_codepoint(Bin), byte_size(Bin) < 4).
-endif.
-ifdef(utf32le).
-define(encoding, utf32le).
-define(utfx, utf32-little).
-define(partial_codepoint(Bin), byte_size(Bin) < 4).
-endif.
@ -390,7 +395,7 @@ string(<<S/?utfx, Rest/binary>>, Stack, Opts, Acc)
S =/= 16#dfffe andalso S =/= 16#dffff andalso
S =/= 16#efffe andalso S =/= 16#effff andalso
S =/= 16#ffffe andalso S =/= 16#fffff andalso
S =/= 16#101fffe andalso S =/= 16#10ffff ->
S =/= 16#10fffe andalso S =/= 16#10ffff ->
string(Rest, Stack, Opts, <<Acc/binary, S/utf8>>);
string(Bin, Stack, Opts, Acc) ->
case partial_utf(Bin) of
@ -488,11 +493,11 @@ noncharacter(Bin, _Stack, _Opts, _Acc) ->
-ifdef(utf16).
%% non-characters blah blah
noncharacter(<<S/utf16, Rest/binary>>, Stack, Opts, Acc)
when ?is_noncontrol(S) ->
when ?is_noncontrol(S), S < 16#fffe ->
string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
%% u+ffff and u+fffe
noncharacter(<<255, X, Rest/binary>>, Stack, Opts, Acc)
when X == 253; X == 254 ->
when X == 254; X == 255 ->
string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
%% surrogates
noncharacter(<<X, _, Rest/binary>>, Stack, Opts, Acc)
@ -503,12 +508,13 @@ noncharacter(Bin, _Stack, _Opts, _Acc) ->
-endif.
-ifdef(utf16le).
%% non-characters blah blah
noncharacter(<<S/utf16-little, Rest/binary>>, Stack, Opts, Acc)
when ?is_noncontrol(S) ->
when ?is_noncontrol(S), S < 16#fffe ->
string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
%% u+ffff and u+fffe
noncharacter(<<X, 255, Rest/binary>>, Stack, Opts, Acc)
when X == 253; X == 254 ->
when X == 254; X == 255 ->
string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
%% surrogates
noncharacter(<<_, X, Rest/binary>>, Stack, Opts, Acc)
@ -519,6 +525,7 @@ noncharacter(Bin, _Stack, _Opts, _Acc) ->
-endif.
-ifdef(utf32).
%% non-characters blah blah
noncharacter(<<S/utf32, Rest/binary>>, Stack, Opts, Acc)
when ?is_noncontrol(S) ->
string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
@ -535,6 +542,7 @@ noncharacter(Bin, _Stack, _Opts, _Acc) ->
-endif.
-ifdef(utf32le).
%% non-characters blah blah
noncharacter(<<S/utf32-little, Rest/binary>>, Stack, Opts, Acc)
when ?is_noncontrol(S) ->
string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
@ -1006,7 +1014,6 @@ format_number({Int, [], Exp}) ->
format_number({Int, Frac, Exp}) ->
{float, list_to_float(lists:reverse(Exp ++ "e" ++ Frac ++ "." ++ Int))}.
tr(<<$r/?utfx, Rest/binary>>, Stack, Opts) ->
@ -1146,4 +1153,175 @@ null(Bin, Stack, Opts) ->
null(<<Bin/binary, Stream/binary>>, Stack, Opts)
end}
; false -> {error, {badjson, Bin}}
end.
end.
-ifdef(TEST).
-include_lib("eunit/include/eunit.hrl").
%% u+fffe and u+ffff must be rejected by the decoder when no options are given
%% and replaced with u+fffd when loose_unicode is set. the check_* helpers
%% return [] only when every codepoint behaved as required.
%% ?_assertEqual takes the expected value first so that a failure report
%% labels the offending results as the actual value
noncharacters_test_() ->
    [
        {"noncharacters - badjson",
            ?_assertEqual([], check_bad(noncharacters()))
        },
        {"noncharacters - replaced",
            ?_assertEqual([], check_replaced(noncharacters()))
        }
    ].
%% the last two codepoints of planes 1 through 16 must be rejected by the
%% decoder when no options are given and replaced when loose_unicode is set.
%% the check_* helpers return [] only when every codepoint behaved as required.
%% ?_assertEqual takes the expected value first so that a failure report
%% labels the offending results as the actual value
extended_noncharacters_test_() ->
    [
        {"extended noncharacters - badjson",
            ?_assertEqual([], check_bad(extended_noncharacters()))
        },
        {"extended noncharacters - replaced",
            ?_assertEqual([], check_extended_replaced(extended_noncharacters()))
        }
    ].
%% every value in u+d800..u+dfff must be rejected by the decoder when no
%% options are given and replaced with u+fffd when loose_unicode is set.
%% the check_* helpers return [] only when every codepoint behaved as required.
%% ?_assertEqual takes the expected value first so that a failure report
%% labels the offending results as the actual value
surrogates_test_() ->
    [
        {"surrogates - badjson",
            ?_assertEqual([], check_bad(surrogates()))
        },
        {"surrogates - replaced",
            ?_assertEqual([], check_replaced(surrogates()))
        }
    ].
%% codepoints 1 through 31 must be rejected inside a string when the decoder
%% is given no options. check_bad returns [] only when every one was rejected.
%% ?_assertEqual takes the expected value first so that a failure report
%% labels the offending results as the actual value
control_test_() ->
    [
        {"control characters - badjson",
            ?_assertEqual([], check_bad(control_characters()))
        }
    ].
%% every value in u+fdd0..u+fdef must be rejected by the decoder when no
%% options are given and replaced with u+fffd when loose_unicode is set.
%% the check_* helpers return [] only when every codepoint behaved as required.
%% ?_assertEqual takes the expected value first so that a failure report
%% labels the offending results as the actual value
reserved_test_() ->
    [
        {"reserved noncharacters - badjson",
            ?_assertEqual([], check_bad(reserved_space()))
        },
        {"reserved noncharacters - replaced",
            ?_assertEqual([], check_replaced(reserved_space()))
        }
    ].
%% codepoint 0 must be rejected inside a string when the decoder is given no
%% options. check_bad returns [] only when it was rejected.
%% ?_assertEqual takes the expected value first so that a failure report
%% labels the offending results as the actual value
zero_test_() ->
    [
        {"nullbyte - badjson",
            ?_assertEqual([], check_bad(zero()))
        }
    ].
%% every codepoint produced by good/0 and good_extended/0 must decode to a
%% string event when the decoder is given no options. check_good returns []
%% only when every codepoint did so.
%% ?_assertEqual takes the expected value first so that a failure report
%% labels the offending results as the actual value
good_characters_test_() ->
    [
        {"acceptable codepoints",
            ?_assertEqual([], check_good(good()))
        },
        {"acceptable extended",
            ?_assertEqual([], check_good(good_extended()))
        }
    ].
%% decodes every codepoint in List with no options and skips over the results
%% for as long as they are {error, badjson}. returns whatever is left, starting
%% at the first result that was not rejected, so [] means all were rejected
check_bad(List) ->
    Rejected = fun
        ({_, {error, badjson}}) -> true;
        (_) -> false
    end,
    lists:dropwhile(Rejected, check(List, [], [])).
%% decodes every codepoint in List with loose_unicode set and skips over the
%% results for as long as the first event is a string holding exactly one
%% u+fffd. returns whatever is left, so [] means every codepoint was replaced
check_replaced(List) ->
    Replaced = fun
        ({_, [{string, <<16#fffd/utf8>>} | _]}) -> true;
        (_) -> false
    end,
    lists:dropwhile(Replaced, check(List, [loose_unicode], [])).
%% like check_replaced/1, but the replacement expected for the utf16 and
%% utf16le encodings is two u+fffd in a row rather than one. returns the
%% results left over from the first one whose leading event is not a string
%% equal to the expected replacement, so [] means every codepoint was replaced
check_extended_replaced(List) ->
    Expected = case lists:member(?encoding, [utf16, utf16le]) of
        true -> <<16#fffd/utf8, 16#fffd/utf8>>;
        false -> <<16#fffd/utf8>>
    end,
    Replaced = fun
        ({_, [{string, S} | _]}) when S =:= Expected -> true;
        (_) -> false
    end,
    lists:dropwhile(Replaced, check(List, [loose_unicode], [])).
%% decodes every codepoint in List with no options and skips over the results
%% for as long as the first event is a string, whatever its contents. returns
%% whatever is left, so [] means every codepoint was accepted
check_good(List) ->
    Accepted = fun
        ({_, [{string, _} | _]}) -> true;
        (_) -> false
    end,
    lists:dropwhile(Accepted, check(List, [], [])).
%% wraps each codepoint in List in a quoted string in the compiled in encoding,
%% decodes it with Opts and pairs the codepoint with the outcome. each pair is
%% pushed onto the front of Acc, so results come back in reverse input order
check(List, Opts, Acc) ->
    lists:foldl(
        fun(Codepoint, Results) ->
            Outcome = decode(to_fake_utf(Codepoint, ?encoding), Opts),
            [{Codepoint, Outcome} | Results]
        end,
        Acc,
        List
    ).
%% builds a decoder from Opts, feeds it JSON and hands the first result to
%% loop/2, which drives it to completion
decode(JSON, Opts) ->
    Decoder = decoder(Opts),
    FirstResult = Decoder(JSON),
    loop(FirstResult, []).
%% drives a decoder result to completion. events are collected in Acc and
%% returned in the order they were emitted once end_json is seen. an incomplete
%% result is resumed with end_stream. anything that is not a
%% {jsx, _, _} tuple yields {error, badjson}
loop(Result, Acc) ->
    case Result of
        {jsx, end_json, _} ->
            lists:reverse(Acc);
        {jsx, incomplete, More} ->
            loop(More(end_stream), Acc);
        {jsx, Event, Next} ->
            loop(Next(), [Event | Acc]);
        _ ->
            {error, badjson}
    end.
%% the two noncharacters at the end of the basic multilingual plane
noncharacters() ->
    [16#fffe, 16#ffff].
%% the last two codepoints (xfffe and xffff) of each of planes 1 through 16,
%% in ascending order
extended_noncharacters() ->
    [
        Plane * 16#10000 + Low
     || Plane <- lists:seq(1, 16), Low <- [16#fffe, 16#ffff]
    ].
%% every value in the surrogate range, u+d800 through u+dfff inclusive
surrogates() ->
    First = 16#d800,
    Last = 16#dfff,
    lists:seq(First, Last).
%% codepoints u+0001 through u+001f inclusive. u+0000 is covered by zero/0
control_characters() ->
    lists:seq(16#01, 16#1f).
%% the block of 32 codepoints u+fdd0 through u+fdef inclusive
reserved_space() ->
    First = 16#fdd0,
    Last = 16#fdef,
    lists:seq(First, Last).
%% codepoint 0 on its own, kept apart from control_characters/0
zero() ->
    [16#00].
%% codepoints from u+0020 up to u+fffd, leaving out the double quote (16#22),
%% the backslash (16#5c), u+d800..u+dfff and u+fdd0..u+fdef
good() ->
    lists:append([
        [16#20, 16#21],
        lists:seq(16#23, 16#5b),
        lists:seq(16#5d, 16#d7ff),
        lists:seq(16#e000, 16#fdcf),
        lists:seq(16#fdf0, 16#fffd)
    ]).
%% every codepoint in plane 16 below its two trailing noncharacters,
%% u+100000 through u+10fffd inclusive
good_extended() ->
    First = 16#100000,
    Last = 16#10fffd,
    lists:seq(First, Last).
%% erlang refuses to encode certain codepoints, so the encoded form of N is
%% assembled by hand. the result is N wrapped in a pair of double quotes, all
%% in the requested encoding. utf16 and utf16le fall back to the built in
%% encoder for codepoints of 16#10000 and above

%% utf8, one byte
to_fake_utf(N, utf8) when N < 16#0080 ->
    <<$", N:8, $">>;
%% utf8, two bytes: 110yyyyy 10xxxxxx
to_fake_utf(N, utf8) when N < 16#0800 ->
    <<0:5, High:5, Low:6>> = <<N:16>>,
    <<$", 2#110:3, High:5, 2#10:2, Low:6, $">>;
%% utf8, three bytes: 1110zzzz 10yyyyyy 10xxxxxx
to_fake_utf(N, utf8) when N < 16#10000 ->
    <<High:4, Mid:6, Low:6>> = <<N:16>>,
    <<$", 2#1110:4, High:4, 2#10:2, Mid:6, 2#10:2, Low:6, $">>;
%% utf8, four bytes: 11110www 10zzzzzz 10yyyyyy 10xxxxxx
to_fake_utf(N, utf8) ->
    <<0:3, Top:3, High:6, Mid:6, Low:6>> = <<N:24>>,
    <<$", 2#11110:5, Top:3, 2#10:2, High:6, 2#10:2, Mid:6, 2#10:2, Low:6, $">>;
%% utf16, a single big endian code unit written as a raw integer
to_fake_utf(N, utf16) when N < 16#10000 ->
    <<$"/utf16, N:16/big, $"/utf16>>;
to_fake_utf(N, utf16) ->
    <<$"/utf16, N/utf16, $"/utf16>>;
%% utf16le, a single little endian code unit written as a raw integer
to_fake_utf(N, utf16le) when N < 16#10000 ->
    <<$", 0, N:16/little, $", 0>>;
to_fake_utf(N, utf16le) ->
    <<$"/utf16-little, N/utf16-little, $"/utf16-little>>;
%% utf32 and utf32le, always one raw 32 bit integer
to_fake_utf(N, utf32) ->
    <<$"/utf32, N:32/big, $"/utf32>>;
to_fake_utf(N, utf32le) ->
    <<$"/utf32-little, N:32/little, $"/utf32-little>>.
-endif.