corrected handling of malformed utf8 sequences
This commit is contained in:
parent
f6089a0892
commit
978e75887a
2 changed files with 120 additions and 3 deletions
|
@ -334,6 +334,20 @@ noncharacter(<<239, 191, X, Rest/binary>>, Handler, [Acc|Stack], Opts)
|
||||||
%% surrogates
|
%% surrogates
|
||||||
noncharacter(<<237, X, _, Rest/binary>>, Handler, [Acc|Stack], Opts) when X >= 160 ->
|
noncharacter(<<237, X, _, Rest/binary>>, Handler, [Acc|Stack], Opts) when X >= 160 ->
|
||||||
string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts);
|
string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts);
|
||||||
|
noncharacter(<<X, Y, 191, Z, Rest/binary>>, Handler, [Acc|Stack], Opts)
|
||||||
|
when (
|
||||||
|
(X == 240 andalso Y == 159) orelse
|
||||||
|
(X == 240 andalso Y == 175) orelse
|
||||||
|
(X == 240 andalso Y == 191) orelse
|
||||||
|
(
|
||||||
|
(X == 241 orelse X == 242 orelse X == 243) andalso
|
||||||
|
(Y == 143 orelse Y == 159 orelse Y == 175 orelse Y == 191)
|
||||||
|
) orelse
|
||||||
|
(X == 244 andalso Y == 143)
|
||||||
|
) andalso (Z == 190 orelse Z == 191) ->
|
||||||
|
string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts);
|
||||||
|
noncharacter(<<_, Rest/binary>>, Handler, [Acc|Stack], Opts) ->
|
||||||
|
string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts);
|
||||||
noncharacter(Bin, Handler, Stack, Opts) ->
|
noncharacter(Bin, Handler, Stack, Opts) ->
|
||||||
?error([Bin, Handler, Stack, Opts]).
|
?error([Bin, Handler, Stack, Opts]).
|
||||||
|
|
||||||
|
@ -1080,6 +1094,51 @@ good_characters_test_() ->
|
||||||
}
|
}
|
||||||
].
|
].
|
||||||
|
|
||||||
|
%% strict decoding (no options) must answer `{error, badjson}` for input made
%% of a lone utf8 continuation byte (128) followed by zero or more 192 bytes
%% NOTE(review): unlike malformed_replaced_test_/0 these inputs are not wrapped
%% in double quotes (34), so they are presumably rejected before a string is
%% ever entered -- confirm that is the intended coverage
malformed_test_() ->
    Cases = [
        {"malformed codepoint with 1 byte", <<128>>},
        {"malformed codepoint with 2 bytes", <<128, 192>>},
        {"malformed codepoint with 3 bytes", <<128, 192, 192>>},
        {"malformed codepoint with 4 bytes", <<128, 192, 192, 192>>}
    ],
    [
        {Title, ?_assertEqual({error, badjson}, decode(Bytes))}
            || {Title, Bytes} <- Cases
    ].
|
||||||
|
|
||||||
|
%% with `loose_unicode` set, a quoted string holding 128 followed by zero or
%% more 192 bytes must decode to a string event carrying one u+fffd
%% replacement character per offending input byte
malformed_replaced_test_() ->
    Replacement = <<16#fffd/utf8>>,
    %% {title, number of replacement characters expected, json input};
    %% 34 is the double quote delimiting the json string
    Cases = [
        {"malformed codepoint with 1 byte", 1, <<34, 128, 34>>},
        {"malformed codepoint with 2 bytes", 2, <<34, 128, 192, 34>>},
        {"malformed codepoint with 3 bytes", 3, <<34, 128, 192, 192, 34>>},
        {"malformed codepoint with 4 bytes", 4, <<34, 128, 192, 192, 192, 34>>}
    ],
    [
        {Title,
            ?_assertEqual(
                [{string, binary:copy(Replacement, Count)}, end_json],
                decode(JSON, [loose_unicode])
            )
        }
            || {Title, Count, JSON} <- Cases
    ].
|
||||||
|
|
||||||
|
|
||||||
check_bad(List) ->
|
check_bad(List) ->
|
||||||
lists:dropwhile(fun({_, {error, badjson}}) -> true ; (_) -> false end,
|
lists:dropwhile(fun({_, {error, badjson}}) -> true ; (_) -> false end,
|
||||||
|
@ -1104,6 +1163,8 @@ check([H|T], Opts, Acc) ->
|
||||||
check(T, Opts, [{H, R}] ++ Acc).
|
check(T, Opts, [{H, R}] ++ Acc).
|
||||||
|
|
||||||
|
|
||||||
|
%% test helper: decode with an empty option list
decode(JSON) ->
    decode(JSON, []).
|
||||||
|
|
||||||
decode(JSON, Opts) ->
|
decode(JSON, Opts) ->
|
||||||
try
|
try
|
||||||
(decoder(jsx, [], Opts))(JSON)
|
(decoder(jsx, [], Opts))(JSON)
|
||||||
|
|
|
@ -163,9 +163,29 @@ clean_string(<<C/utf8, Rest/binary>>, Acc)
|
||||||
C =/= 16#ffffe andalso C =/= 16#fffff andalso
|
C =/= 16#ffffe andalso C =/= 16#fffff andalso
|
||||||
C =/= 16#10fffe andalso C =/= 16#10ffff ->
|
C =/= 16#10fffe andalso C =/= 16#10ffff ->
|
||||||
clean_string(Rest, <<Acc/binary, C/utf8>>);
|
clean_string(Rest, <<Acc/binary, C/utf8>>);
|
||||||
clean_string(<<X, _, _, Rest/binary>>, Acc) when X == 237; X == 239 ->
|
%% surrogates
|
||||||
|
clean_string(<<237, X, _, Rest/binary>>, Acc) when X >= 160 ->
|
||||||
clean_string(Rest, <<Acc/binary, 16#fffd/utf8>>);
|
clean_string(Rest, <<Acc/binary, 16#fffd/utf8>>);
|
||||||
clean_string(<<_, _, _, _, Rest/binary>>, Acc) ->
|
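%% noncharacters u+fdd0 through u+fdef (these are not private use codepoints)
%% NOTE(review): X == 143 is u+fdcf, which is not a noncharacter -- presumably it never reaches this clause; confirm the lower bound should not be 144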
|
||||||
|
clean_string(<<239, 183, X, Rest/binary>>, Acc) when X >= 143, X =< 175 ->
|
||||||
|
clean_string(Rest, <<Acc/binary, 16#fffd/utf8>>);
|
||||||
|
%% u+fffe and u+ffff
|
||||||
|
clean_string(<<239, 191, X, Rest/binary>>, Acc) when X == 190; X == 191 ->
|
||||||
|
clean_string(Rest, <<Acc/binary, 16#fffd/utf8>>);
|
||||||
|
%% the u+Xfffe and u+Xffff noncharacters
|
||||||
|
clean_string(<<X, Y, 191, Z, Rest/binary>>, Acc)
|
||||||
|
when (
|
||||||
|
(X == 240 andalso Y == 159) orelse
|
||||||
|
(X == 240 andalso Y == 175) orelse
|
||||||
|
(X == 240 andalso Y == 191) orelse
|
||||||
|
(
|
||||||
|
(X == 241 orelse X == 242 orelse X == 243) andalso
|
||||||
|
(Y == 143 orelse Y == 159 orelse Y == 175 orelse Y == 191)
|
||||||
|
) orelse
|
||||||
|
(X == 244 andalso Y == 143)
|
||||||
|
) andalso (Z == 190 orelse Z == 191) ->
|
||||||
|
clean_string(Rest, <<Acc/binary, 16#fffd/utf8>>);
|
||||||
|
clean_string(<<_, Rest/binary>>, Acc) ->
|
||||||
clean_string(Rest, <<Acc/binary, 16#fffd/utf8>>);
|
clean_string(Rest, <<Acc/binary, 16#fffd/utf8>>);
|
||||||
clean_string(<<>>, Acc) -> Acc.
|
clean_string(<<>>, Acc) -> Acc.
|
||||||
|
|
||||||
|
@ -303,6 +323,42 @@ good_characters_test_() ->
|
||||||
}
|
}
|
||||||
].
|
].
|
||||||
|
|
||||||
|
%% strict encoding (no options) must raise `badarg` for a binary made of a
%% lone utf8 continuation byte (128) followed by zero or more 192 bytes
malformed_test_() ->
    Cases = [
        {"malformed codepoint with 1 byte", <<128>>},
        {"malformed codepoint with 2 bytes", <<128, 192>>},
        {"malformed codepoint with 3 bytes", <<128, 192, 192>>},
        {"malformed codepoint with 4 bytes", <<128, 192, 192, 192>>}
    ],
    [
        {Title, ?_assertError(badarg, encode(Bytes))}
            || {Title, Bytes} <- Cases
    ].
|
||||||
|
|
||||||
|
%% with `loose_unicode` set, encoding a binary of 128 followed by zero or more
%% 192 bytes must yield a string event carrying one u+fffd replacement
%% character per offending input byte
malformed_replaced_test_() ->
    Replacement = <<16#fffd/utf8>>,
    %% {title, number of replacement characters expected, input binary}
    Cases = [
        {"malformed codepoint with 1 byte", 1, <<128>>},
        {"malformed codepoint with 2 bytes", 2, <<128, 192>>},
        {"malformed codepoint with 3 bytes", 3, <<128, 192, 192>>},
        {"malformed codepoint with 4 bytes", 4, <<128, 192, 192, 192>>}
    ],
    [
        {Title,
            ?_assertEqual(
                [{string, binary:copy(Replacement, Count)}, end_json],
                encode(Bytes, [loose_unicode])
            )
        }
            || {Title, Count, Bytes} <- Cases
    ].
|
||||||
|
|
||||||
check_bad(List) ->
|
check_bad(List) ->
|
||||||
lists:dropwhile(fun({_, {error, badjson}}) -> true ; (_) -> false end,
|
lists:dropwhile(fun({_, {error, badjson}}) -> true ; (_) -> false end,
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue