From cd4f4a8f1ccb14a282469b8a26ebe00bc878ef38 Mon Sep 17 00:00:00 2001 From: alisdair sullivan Date: Thu, 28 Jul 2011 18:47:58 -0700 Subject: [PATCH] test every codepoint possible for replacement/badness --- src/jsx_decoder.hrl | 192 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 185 insertions(+), 7 deletions(-) diff --git a/src/jsx_decoder.hrl b/src/jsx_decoder.hrl index 2cb88cc..a830411 100644 --- a/src/jsx_decoder.hrl +++ b/src/jsx_decoder.hrl @@ -92,26 +92,31 @@ %% partial codepoint max size differs across encodings -ifdef(utf8). +-define(encoding, utf8). -define(utfx, utf8). -define(partial_codepoint(Bin), byte_size(Bin) < 1). -endif. -ifdef(utf16). +-define(encoding, utf16). -define(utfx, utf16). -define(partial_codepoint(Bin), byte_size(Bin) < 2). -endif. -ifdef(utf16le). +-define(encoding, utf16le). -define(utfx, utf16-little). -define(partial_codepoint(Bin), byte_size(Bin) < 2). -endif. -ifdef(utf32). +-define(encoding, utf32). -define(utfx, utf32). -define(partial_codepoint(Bin), byte_size(Bin) < 4). -endif. -ifdef(utf32le). +-define(encoding, utf32le). -define(utfx, utf32-little). -define(partial_codepoint(Bin), byte_size(Bin) < 4). -endif. @@ -390,7 +395,7 @@ string(<>, Stack, Opts, Acc) S =/= 16#dfffe andalso S =/= 16#dffff andalso S =/= 16#efffe andalso S =/= 16#effff andalso S =/= 16#ffffe andalso S =/= 16#fffff andalso - S =/= 16#101fffe andalso S =/= 16#10ffff -> + S =/= 16#10fffe andalso S =/= 16#10ffff -> string(Rest, Stack, Opts, <>); string(Bin, Stack, Opts, Acc) -> case partial_utf(Bin) of @@ -488,11 +493,11 @@ noncharacter(Bin, _Stack, _Opts, _Acc) -> -ifdef(utf16). 
%% non-characters blah blah noncharacter(<>, Stack, Opts, Acc) - when ?is_noncontrol(S) -> + when ?is_noncontrol(S), S < 16#fffe -> string(Rest, Stack, Opts, <>); %% u+ffff and u+fffe noncharacter(<<255, X, Rest/binary>>, Stack, Opts, Acc) - when X == 253; X == 254 -> + when X == 254; X == 255 -> string(Rest, Stack, Opts, <>); %% surrogates noncharacter(<>, Stack, Opts, Acc) @@ -503,12 +508,13 @@ noncharacter(Bin, _Stack, _Opts, _Acc) -> -endif. -ifdef(utf16le). +%% non-characters blah blah noncharacter(<>, Stack, Opts, Acc) - when ?is_noncontrol(S) -> + when ?is_noncontrol(S), S < 16#fffe -> string(Rest, Stack, Opts, <>); %% u+ffff and u+fffe noncharacter(<>, Stack, Opts, Acc) - when X == 253; X == 254 -> + when X == 254; X == 255 -> string(Rest, Stack, Opts, <>); %% surrogates noncharacter(<<_, X, Rest/binary>>, Stack, Opts, Acc) @@ -519,6 +525,7 @@ noncharacter(Bin, _Stack, _Opts, _Acc) -> -endif. -ifdef(utf32). +%% non-characters blah blah noncharacter(<>, Stack, Opts, Acc) when ?is_noncontrol(S) -> string(Rest, Stack, Opts, <>); @@ -535,6 +542,7 @@ noncharacter(Bin, _Stack, _Opts, _Acc) -> -endif. -ifdef(utf32le). +%% non-characters blah blah noncharacter(<>, Stack, Opts, Acc) when ?is_noncontrol(S) -> string(Rest, Stack, Opts, <>); @@ -1006,7 +1014,6 @@ format_number({Int, [], Exp}) -> format_number({Int, Frac, Exp}) -> {float, list_to_float(lists:reverse(Exp ++ "e" ++ Frac ++ "." ++ Int))}. - tr(<<$r/?utfx, Rest/binary>>, Stack, Opts) -> @@ -1146,4 +1153,175 @@ null(Bin, Stack, Opts) -> null(<>, Stack, Opts) end} ; false -> {error, {badjson, Bin}} - end. \ No newline at end of file + end. + + +-ifdef(TEST). +-include_lib("eunit/include/eunit.hrl"). + + +noncharacters_test_() -> + [ + {"noncharacters - badjson", + ?_assertEqual(check_bad(noncharacters()), []) + }, + {"noncharacters - replaced", + ?_assertEqual(check_replaced(noncharacters()), []) + } + ]. 
%% Each generator below feeds a set of codepoints through the decoder and
%% expects the corresponding check_* helper to return [] (nothing left over).
%% ?_assertEqual takes (Expected, Actual), so [] goes first; that way a failure
%% report labels the offending results as the actual value.

%% U+nFFFE / U+nFFFF above the BMP: checked with default options and with
%% loose_unicode.
extended_noncharacters_test_() ->
    [
        {"extended noncharacters - badjson",
            ?_assertEqual([], check_bad(extended_noncharacters()))
        },
        {"extended noncharacters - replaced",
            ?_assertEqual([], check_extended_replaced(extended_noncharacters()))
        }
    ].

%% U+D800..U+DFFF: checked with default options and with loose_unicode.
surrogates_test_() ->
    [
        {"surrogates - badjson",
            ?_assertEqual([], check_bad(surrogates()))
        },
        {"surrogates - replaced",
            ?_assertEqual([], check_replaced(surrogates()))
        }
    ].

%% Codepoints 1..31: only the default-options (badjson) check is made.
control_test_() ->
    [
        {"control characters - badjson",
            ?_assertEqual([], check_bad(control_characters()))
        }
    ].

%% U+FDD0..U+FDEF: checked with default options and with loose_unicode.
reserved_test_() ->
    [
        {"reserved noncharacters - badjson",
            ?_assertEqual([], check_bad(reserved_space()))
        },
        {"reserved noncharacters - replaced",
            ?_assertEqual([], check_replaced(reserved_space()))
        }
    ].

%% Codepoint 0: only the default-options (badjson) check is made.
zero_test_() ->
    [
        {"nullbyte - badjson",
            ?_assertEqual([], check_bad(zero()))
        }
    ].

%% Codepoints expected to decode to a string event with default options.
good_characters_test_() ->
    [
        {"acceptable codepoints",
            ?_assertEqual([], check_good(good()))
        },
        {"acceptable extended",
            ?_assertEqual([], check_good(good_extended()))
        }
    ].


%% The check_* helpers decode every codepoint in List and drop the leading run
%% of {Codepoint, Result} pairs that look as expected. They return [] when all
%% pairs pass; otherwise the list starts at the first unexpected pair (and
%% includes everything after it, passing or not).

%% Expected result: {error, badjson} with default options.
check_bad(List) ->
    lists:dropwhile(
        fun({_, {error, badjson}}) -> true;
           (_) -> false
        end,
        check(List, [], [])
    ).

%% Expected result: first event is a string holding a single U+FFFD,
%% decoding with loose_unicode.
check_replaced(List) ->
    lists:dropwhile(
        fun({_, [{string, <<16#fffd/utf8>>} | _]}) -> true;
           (_) -> false
        end,
        check(List, [loose_unicode], [])
    ).

%% As check_replaced/1, but for the utf16 encodings the expected string holds
%% two U+FFFD characters (presumably one per half of the surrogate pair --
%% confirm against the decoder).
check_extended_replaced(List) ->
    Replace = case ?encoding of
        E when E =:= utf16; E =:= utf16le -> <<16#fffd/utf8, 16#fffd/utf8>>;
        _ -> <<16#fffd/utf8>>
    end,
    lists:dropwhile(
        fun({_, [{string, S} | _]}) -> S =:= Replace;
           (_) -> false
        end,
        check(List, [loose_unicode], [])
    ).

%% Expected result: first event is any string, with default options.
check_good(List) ->
    lists:dropwhile(
        fun({_, [{string, _} | _]}) -> true;
           (_) -> false
        end,
        check(List, [], [])
    ).

%% Decode each codepoint (wrapped in quotes by to_fake_utf/2, in the encoding
%% this header was compiled for) and accumulate {Codepoint, Result} pairs.
%% The accumulator is not reversed, so the last codepoint comes first.
check([], _Opts, Acc) -> Acc;
check([H | T], Opts, Acc) ->
    R = decode(to_fake_utf(H, ?encoding), Opts),
    check(T, Opts, [{H, R} | Acc]).


%% Run a decoder built from Opts over JSON and collect its events via loop/2.
decode(JSON, Opts) ->
    F = decoder(Opts),
    loop(F(JSON), []).
%% Drive a decoder result to completion, collecting events in emission order.
%% An `incomplete' result is resumed with end_stream; anything that is not a
%% {jsx, _, _} triple is reported as {error, badjson}.
loop({jsx, end_json, _}, Events) ->
    lists:reverse(Events);
loop({jsx, incomplete, Resume}, Events) ->
    loop(Resume(end_stream), Events);
loop({jsx, Event, Continue}, Events) ->
    loop(Continue(), [Event | Events]);
loop(_, _) ->
    {error, badjson}.



%% U+FFFE and U+FFFF.
noncharacters() -> [16#fffe, 16#ffff].

%% U+nFFFE and U+nFFFF for planes 1 through 16, in ascending order.
extended_noncharacters() ->
    [(Plane bsl 16) bor Low
        || Plane <- lists:seq(16#1, 16#10), Low <- [16#fffe, 16#ffff]].

%% U+D800..U+DFFF.
surrogates() -> lists:seq(16#d800, 16#dfff).

%% Codepoints 1..31 (0 is covered separately by zero/0).
control_characters() -> lists:seq(16#01, 16#1f).

%% U+FDD0..U+FDEF.
reserved_space() -> lists:seq(16#fdd0, 16#fdef).

%% The null codepoint on its own.
zero() -> [0].

%% Codepoints up to U+FFFD that lie outside the sets above, also skipping
%% 16#22 (double quote) and 16#5c (backslash).
good() ->
    Ranges = [
        {16#20, 16#21},
        {16#23, 16#5b},
        {16#5d, 16#d7ff},
        {16#e000, 16#fdcf},
        {16#fdf0, 16#fffd}
    ],
    [Codepoint || {First, Last} <- Ranges, Codepoint <- lists:seq(First, Last)].

%% U+100000..U+10FFFD.
good_extended() -> lists:seq(16#100000, 16#10fffd).
%% erlang refuses to encode certain codepoints, so fake them all
%%
%% to_fake_utf(N, Encoding) returns codepoint N between two double quote
%% characters (34), with the bytes for N assembled by hand rather than via the
%% /utfN segment types. Only codepoints >= 16#10000 in utf16/utf16le go through
%% the native segment type, since they need a surrogate pair.

%% utf8: one, two, three and four byte forms. N is split into the payload bits
%% of each byte and the fixed marker bits are prepended.
to_fake_utf(N, utf8) when N < 16#0080 -> <<34/utf8, N:8, 34/utf8>>;
to_fake_utf(N, utf8) when N < 16#0800 ->
    <<0:5, Y:5, X:6>> = <<N:16>>,
    <<34/utf8, 2#110:3, Y:5, 2#10:2, X:6, 34/utf8>>;
to_fake_utf(N, utf8) when N < 16#10000 ->
    <<Z:4, Y:6, X:6>> = <<N:16>>,
    <<34/utf8, 2#1110:4, Z:4, 2#10:2, Y:6, 2#10:2, X:6, 34/utf8>>;
to_fake_utf(N, utf8) ->
    <<0:3, W:3, Z:6, Y:6, X:6>> = <<N:24>>,
    <<34/utf8, 2#11110:5, W:3, 2#10:2, Z:6, 2#10:2, Y:6, 2#10:2, X:6, 34/utf8>>;

%% utf16 (big endian): BMP codepoints are written as a raw 16 bit unit.
to_fake_utf(N, utf16) when N < 16#10000 -> <<34/utf16, N:16, 34/utf16>>;
to_fake_utf(N, utf16) -> <<34/utf16, N/utf16, 34/utf16>>;

%% utf16le: same as utf16 with the two bytes of the unit swapped.
to_fake_utf(N, utf16le) when N < 16#10000 ->
    <<A:8, B:8>> = <<N:16>>,
    <<34, 0, B:8, A:8, 34, 0>>;
to_fake_utf(N, utf16le) -> <<34/utf16-little, N/utf16-little, 34/utf16-little>>;

%% utf32 (big endian): the codepoint as a raw 32 bit unit.
to_fake_utf(N, utf32) -> <<34/utf32, N:32, 34/utf32>>;

%% utf32le: the four bytes of the unit in reverse order.
to_fake_utf(N, utf32le) ->
    <<A:8, B:8, C:8, D:8>> = <<N:32>>,
    <<34/utf32-little, D:8, C:8, B:8, A:8, 34/utf32-little>>.




-endif.