corrected handling of malformed utf8 sequences

This commit is contained in:
alisdair sullivan 2012-03-21 05:19:47 -07:00
parent f6089a0892
commit 978e75887a
2 changed files with 120 additions and 3 deletions

View file

@ -334,6 +334,20 @@ noncharacter(<<239, 191, X, Rest/binary>>, Handler, [Acc|Stack], Opts)
%% surrogates
%% a three byte sequence led by 237 whose second byte is >= 160; all three
%% bytes are consumed and a single u+fffd is accumulated in their place
noncharacter(<<237, X, _, Rest/binary>>, Handler, [Acc|Stack], Opts) when X >= 160 ->
string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts);
%% the u+Xfffe and u+Xffff noncharacters
%% four byte sequences whose third byte is 191 and whose last byte is 190 or
%% 191; the first two bytes (X, Y) select which plane the codepoint is in. the
%% whole sequence is consumed and replaced by a single u+fffd
noncharacter(<<X, Y, 191, Z, Rest/binary>>, Handler, [Acc|Stack], Opts)
when (
(X == 240 andalso Y == 159) orelse
(X == 240 andalso Y == 175) orelse
(X == 240 andalso Y == 191) orelse
(
(X == 241 orelse X == 242 orelse X == 243) andalso
(Y == 143 orelse Y == 159 orelse Y == 175 orelse Y == 191)
) orelse
(X == 244 andalso Y == 143)
) andalso (Z == 190 orelse Z == 191) ->
string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts);
%% anything else with at least one byte left: consume exactly one byte and
%% accumulate one u+fffd for it, then carry on with the remaining input. a run
%% of N stray bytes therefore yields N replacement characters
noncharacter(<<_, Rest/binary>>, Handler, [Acc|Stack], Opts) ->
string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts);
%% no clause above matched (e.g. Bin has no leading byte to consume, or the
%% stack does not have the expected [Acc|Stack] shape): hand the full state to
%% ?error
noncharacter(Bin, Handler, Stack, Opts) ->
?error([Bin, Handler, Stack, Opts]).
@ -1079,6 +1093,51 @@ good_characters_test_() ->
?_assertEqual(check_good(good_extended()), [])
}
].
%% strict (default) decoding must reject strings that contain malformed utf8.
%%
%% every input is wrapped in double quotes (34) so the malformed bytes are
%% presented to the decoder as the contents of a json string, exactly as the
%% loose_unicode tests do. a bare byte sequence such as <<128>> is not json at
%% all, so asserting on it would succeed without utf8 validation ever running
malformed_test_() ->
    Cases = [
        {"malformed codepoint with 1 byte", <<128>>},
        {"malformed codepoint with 2 bytes", <<128, 192>>},
        {"malformed codepoint with 3 bytes", <<128, 192, 192>>},
        {"malformed codepoint with 4 bytes", <<128, 192, 192, 192>>}
    ],
    [
        {Title, ?_assertEqual({error, badjson}, decode(<<34, Bytes/binary, 34>>))}
        || {Title, Bytes} <- Cases
    ].
%% with loose_unicode enabled, each byte of a malformed sequence inside a json
%% string is expected to decode to its own u+fffd replacement character
malformed_replaced_test_() ->
    Replacement = <<16#fffd/utf8>>,
    %% expected event list for a string made of Count replacement characters
    Events = fun(Count) ->
        [{string, binary:copy(Replacement, Count)}, end_json]
    end,
    [
        {"malformed codepoint with 1 byte",
            ?_assertEqual(
                Events(1),
                decode(<<34, 128, 34>>, [loose_unicode])
            )
        },
        {"malformed codepoint with 2 bytes",
            ?_assertEqual(
                Events(2),
                decode(<<34, 128, 192, 34>>, [loose_unicode])
            )
        },
        {"malformed codepoint with 3 bytes",
            ?_assertEqual(
                Events(3),
                decode(<<34, 128, 192, 192, 34>>, [loose_unicode])
            )
        },
        {"malformed codepoint with 4 bytes",
            ?_assertEqual(
                Events(4),
                decode(<<34, 128, 192, 192, 192, 34>>, [loose_unicode])
            )
        }
    ].
check_bad(List) ->
@ -1104,6 +1163,8 @@ check([H|T], Opts, Acc) ->
check(T, Opts, [{H, R}] ++ Acc).
%% test helper: decode JSON with an empty option list by delegating to decode/2
decode(JSON) -> decode(JSON, []).
decode(JSON, Opts) ->
try
(decoder(jsx, [], Opts))(JSON)

View file

@ -163,9 +163,29 @@ clean_string(<<C/utf8, Rest/binary>>, Acc)
C =/= 16#ffffe andalso C =/= 16#fffff andalso
C =/= 16#10fffe andalso C =/= 16#10ffff ->
clean_string(Rest, <<Acc/binary, C/utf8>>);
clean_string(<<X, _, _, Rest/binary>>, Acc) when X == 237; X == 239 ->
%% surrogates
clean_string(<<237, X, _, Rest/binary>>, Acc) when X >= 160 ->
clean_string(Rest, <<Acc/binary, 16#fffd/utf8>>);
clean_string(<<_, _, _, _, Rest/binary>>, Acc) ->
%% private use noncharacters
clean_string(<<239, 183, X, Rest/binary>>, Acc) when X >= 143, X =< 175 ->
clean_string(Rest, <<Acc/binary, 16#fffd/utf8>>);
%% u+fffe and u+ffff
clean_string(<<239, 191, X, Rest/binary>>, Acc) when X == 190; X == 191 ->
clean_string(Rest, <<Acc/binary, 16#fffd/utf8>>);
%% the u+Xfffe and u+Xffff noncharacters
clean_string(<<X, Y, 191, Z, Rest/binary>>, Acc)
when (
(X == 240 andalso Y == 159) orelse
(X == 240 andalso Y == 175) orelse
(X == 240 andalso Y == 191) orelse
(
(X == 241 orelse X == 242 orelse X == 243) andalso
(Y == 143 orelse Y == 159 orelse Y == 175 orelse Y == 191)
) orelse
(X == 244 andalso Y == 143)
) andalso (Z == 190 orelse Z == 191) ->
clean_string(Rest, <<Acc/binary, 16#fffd/utf8>>);
clean_string(<<_, Rest/binary>>, Acc) ->
clean_string(Rest, <<Acc/binary, 16#fffd/utf8>>);
clean_string(<<>>, Acc) -> Acc.
@ -302,7 +322,43 @@ good_characters_test_() ->
?_assertEqual(check_good(good_extended()), [])
}
].
%% without loose_unicode, encoding a binary that holds malformed utf8 is
%% expected to raise a badarg error, whatever the length of the bad sequence
malformed_test_() ->
    Inputs = [
        {"malformed codepoint with 1 byte", <<128>>},
        {"malformed codepoint with 2 bytes", <<128, 192>>},
        {"malformed codepoint with 3 bytes", <<128, 192, 192>>},
        {"malformed codepoint with 4 bytes", <<128, 192, 192, 192>>}
    ],
    [{Title, ?_assertError(badarg, encode(Bytes))} || {Title, Bytes} <- Inputs].
%% with loose_unicode enabled, each byte of a malformed sequence is expected to
%% be emitted as its own u+fffd replacement character in the resulting string
malformed_replaced_test_() ->
    Replacement = <<16#fffd/utf8>>,
    %% expected event list for a string made of Count replacement characters
    Events = fun(Count) ->
        [{string, binary:copy(Replacement, Count)}, end_json]
    end,
    [
        {"malformed codepoint with 1 byte",
            ?_assertEqual(Events(1), encode(<<128>>, [loose_unicode]))
        },
        {"malformed codepoint with 2 bytes",
            ?_assertEqual(Events(2), encode(<<128, 192>>, [loose_unicode]))
        },
        {"malformed codepoint with 3 bytes",
            ?_assertEqual(Events(3), encode(<<128, 192, 192>>, [loose_unicode]))
        },
        {"malformed codepoint with 4 bytes",
            ?_assertEqual(Events(4), encode(<<128, 192, 192, 192>>, [loose_unicode]))
        }
    ].
check_bad(List) ->
lists:dropwhile(fun({_, {error, badjson}}) -> true ; (_) -> false end,