From 036dd72ecf9d3c886cbf618a8250dddcdf19a22d Mon Sep 17 00:00:00 2001 From: alisdair sullivan Date: Tue, 20 Mar 2012 22:42:58 -0700 Subject: [PATCH] minor fixes for illegal utf8 sequences and better testing thereof --- src/jsx_encoder.erl | 130 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 116 insertions(+), 14 deletions(-) diff --git a/src/jsx_encoder.erl b/src/jsx_encoder.erl index f97bf02..9e362ee 100644 --- a/src/jsx_encoder.erl +++ b/src/jsx_encoder.erl @@ -118,7 +118,8 @@ check_string(<<C/utf8, Rest/binary>>) when C < 16#fdd0 -> check_string(<<C/utf8, Rest/binary>>) when C > 16#fdef, C < 16#fffe -> check_string(Rest); check_string(<<C/utf8, Rest/binary>>) - when C =/= 16#fffe andalso C =/= 16#ffff andalso + when C > 16#fffd andalso + C =/= 16#fffe andalso C =/= 16#ffff andalso C =/= 16#1fffe andalso C =/= 16#1ffff andalso C =/= 16#2fffe andalso C =/= 16#2ffff andalso C =/= 16#3fffe andalso C =/= 16#3ffff andalso @@ -140,7 +141,6 @@ check_string(<<>>) -> true; check_string(<<_, _/binary>>) -> false. clean_string(<<C/utf8, Rest/binary>>, Acc) when C >= 16#fdd0, C =< 16#fdef -> - io:format("1: ~p~n", [C]), clean_string(Rest, <<Acc/binary, 16#fffd/utf8>>); clean_string(<<C/utf8, Rest/binary>>, Acc) when C == 16#fffe orelse C == 16#ffff orelse @@ -160,11 +160,13 @@ clean_string(<<C/utf8, Rest/binary>>, Acc) C == 16#efffe orelse C == 16#effff orelse C == 16#ffffe orelse C == 16#fffff orelse C == 16#10fffe orelse C == 16#10ffff -> - io:format("2: ~p~n", [C]), clean_string(Rest, <<Acc/binary, 16#fffd/utf8>>); clean_string(<<C/utf8, Rest/binary>>, Acc) -> - io:format("3: ~p~n", [C]), clean_string(Rest, <<Acc/binary, C/utf8>>); +clean_string(<<237, X, _, Rest/binary>>, Acc) when X >= 160 -> + clean_string(Rest, <<Acc/binary, 16#fffd/utf8>>); +clean_string(<<_, Rest/binary>>, Acc) -> + clean_string(Rest, <<Acc/binary, 16#fffd/utf8>>); clean_string(<<>>, Acc) -> Acc. @@ -174,7 +176,10 @@ clean_string(<<>>, Acc) -> Acc. encode(Term) -> (encoder(jsx, [], []))(Term). -encode(Term, Opts) -> (encoder(jsx, [], Opts))(Term). +encode(Term, Opts) -> + try (encoder(jsx, [], Opts))(Term) + catch _:_ -> {error, badjson} + end. 
encode_test_() -> @@ -245,17 +250,114 @@ encode_test_() -> encode([{key, <<"value">>}]), [start_object, {key, <<"key">>}, {string, <<"value">>}, end_object, end_json] ) + } + ]. + +noncharacters_test_() -> + [ + {"noncharacters - badjson", + ?_assertEqual(check_bad(noncharacters()), []) }, - {"bad string", ?_assertError( - badarg, - encode([<<"a bad string: ", 16#ffff/utf8>>]) - ) - }, - {"allow bad string", ?_assertEqual( - encode([<<"a bad string: ", 16#1ffff/utf8>>], [loose_unicode]), - [start_array, {string, <<"a bad string: ", 16#fffd/utf8>>}, end_array, end_json] - ) + {"noncharacters - replaced", + ?_assertEqual(check_replaced(noncharacters()), []) } ]. +extended_noncharacters_test_() -> + [ + {"extended noncharacters - badjson", + ?_assertEqual(check_bad(extended_noncharacters()), []) + }, + {"extended noncharacters - replaced", + ?_assertEqual(check_replaced(extended_noncharacters()), []) + } + ]. + +surrogates_test_() -> + [ + {"surrogates - badjson", + ?_assertEqual(check_bad(surrogates()), []) + }, + {"surrogates - replaced", + ?_assertEqual(check_replaced(surrogates()), []) + } + ]. + +reserved_test_() -> + [ + {"reserved noncharacters - badjson", + ?_assertEqual(check_bad(reserved_space()), []) + }, + {"reserved noncharacters - replaced", + ?_assertEqual(check_replaced(reserved_space()), []) + } + ]. + +good_characters_test_() -> + [ + {"acceptable codepoints", + ?_assertEqual(check_good(good()), []) + }, + {"acceptable extended", + ?_assertEqual(check_good(good_extended()), []) + } + ]. + + +check_bad(List) -> + lists:dropwhile(fun({_, {error, badjson}}) -> true ; (_) -> false end, + check(List, [], []) + ). + +check_replaced(List) -> + lists:dropwhile(fun({_, [{string, <<16#fffd/utf8>>}|_]}) -> true + ; (_) -> false + end, + check(List, [loose_unicode], []) + ). + +check_good(List) -> + lists:dropwhile(fun({_, [{string, _}|_]}) -> true ; (_) -> false end, + check(List, [], []) + ). 
+ +check([], _Opts, Acc) -> Acc; +check([H|T], Opts, Acc) -> + R = encode(to_fake_utf(H, utf8), Opts), + check(T, Opts, [{H, R}] ++ Acc). + + + +noncharacters() -> lists:seq(16#fffe, 16#ffff). + +extended_noncharacters() -> + [16#1fffe, 16#1ffff, 16#2fffe, 16#2ffff] + ++ [16#3fffe, 16#3ffff, 16#4fffe, 16#4ffff] + ++ [16#5fffe, 16#5ffff, 16#6fffe, 16#6ffff] + ++ [16#7fffe, 16#7ffff, 16#8fffe, 16#8ffff] + ++ [16#9fffe, 16#9ffff, 16#afffe, 16#affff] + ++ [16#bfffe, 16#bffff, 16#cfffe, 16#cffff] + ++ [16#dfffe, 16#dffff, 16#efffe, 16#effff] + ++ [16#ffffe, 16#fffff, 16#10fffe, 16#10ffff]. + +surrogates() -> lists:seq(16#d800, 16#dfff). + +reserved_space() -> lists:seq(16#fdd0, 16#fdef). + +good() -> lists:seq(1, 16#d7ff) ++ lists:seq(16#e000, 16#fdcf) ++ lists:seq(16#fdf0, 16#fffd). + +good_extended() -> lists:seq(16#100000, 16#10fffd). + +%% erlang refuses to encode certain codepoints, so fake them all +to_fake_utf(N, utf8) when N < 16#0080 -> <<N:8>>; +to_fake_utf(N, utf8) when N < 16#0800 -> + <<0:5, Y:5, X:6>> = <<N:16>>, + <<2#110:3, Y:5, 2#10:2, X:6>>; +to_fake_utf(N, utf8) when N < 16#10000 -> + <<Z:4, Y:6, X:6>> = <<N:16>>, + <<2#1110:4, Z:4, 2#10:2, Y:6, 2#10:2, X:6>>; +to_fake_utf(N, utf8) -> + <<0:3, W:3, Z:6, Y:6, X:6>> = <<N:24>>, + <<2#11110:5, W:3, 2#10:2, Z:6, 2#10:2, Y:6, 2#10:2, X:6>>. + -endif. \ No newline at end of file