corrected handling of malformed utf8 sequences
This commit is contained in:
parent
1453687080
commit
be89f5f395
2 changed files with 120 additions and 3 deletions
|
@ -334,6 +334,20 @@ noncharacter(<<239, 191, X, Rest/binary>>, Handler, [Acc|Stack], Opts)
|
|||
%% surrogates
|
||||
noncharacter(<<237, X, _, Rest/binary>>, Handler, [Acc|Stack], Opts) when X >= 160 ->
|
||||
string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts);
|
||||
noncharacter(<<X, Y, 191, Z, Rest/binary>>, Handler, [Acc|Stack], Opts)
|
||||
when (
|
||||
(X == 240 andalso Y == 159) orelse
|
||||
(X == 240 andalso Y == 175) orelse
|
||||
(X == 240 andalso Y == 191) orelse
|
||||
(
|
||||
(X == 241 orelse X == 242 orelse X == 243) andalso
|
||||
(Y == 143 orelse Y == 159 orelse Y == 175 orelse Y == 191)
|
||||
) orelse
|
||||
(X == 244 andalso Y == 143)
|
||||
) andalso (Z == 190 orelse Z == 191) ->
|
||||
string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts);
|
||||
noncharacter(<<_, Rest/binary>>, Handler, [Acc|Stack], Opts) ->
|
||||
string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts);
|
||||
noncharacter(Bin, Handler, Stack, Opts) ->
|
||||
?error([Bin, Handler, Stack, Opts]).
|
||||
|
||||
|
@ -1079,6 +1093,51 @@ good_characters_test_() ->
|
|||
?_assertEqual(check_good(good_extended()), [])
|
||||
}
|
||||
].
|
||||
|
||||
%% Strict decoding (decode/1, i.e. no loose_unicode option) of byte sequences
%% that are not well-formed UTF-8. Every case expects {error, badjson}.
%%
%% The first four cases feed the bare bytes as the whole document. The last
%% four wrap the same bytes in double quotes (34) so that they sit inside a
%% JSON string, mirroring the inputs used by malformed_replaced_test_/0; this
%% pins the strict counterpart of the replacement behaviour tested there.
%% NOTE(review): assumes decode/2 maps the decoder's failure on a malformed
%% byte inside a string to {error, badjson} -- confirm against decode/2.
malformed_test_() ->
    [
        {"malformed codepoint with 1 byte",
            ?_assertEqual({error, badjson}, decode(<<128>>))
        },
        {"malformed codepoint with 2 bytes",
            ?_assertEqual({error, badjson}, decode(<<128, 192>>))
        },
        {"malformed codepoint with 3 bytes",
            ?_assertEqual({error, badjson}, decode(<<128, 192, 192>>))
        },
        {"malformed codepoint with 4 bytes",
            ?_assertEqual({error, badjson}, decode(<<128, 192, 192, 192>>))
        },
        {"malformed codepoint with 1 byte inside a string",
            ?_assertEqual({error, badjson}, decode(<<34, 128, 34>>))
        },
        {"malformed codepoint with 2 bytes inside a string",
            ?_assertEqual({error, badjson}, decode(<<34, 128, 192, 34>>))
        },
        {"malformed codepoint with 3 bytes inside a string",
            ?_assertEqual({error, badjson}, decode(<<34, 128, 192, 192, 34>>))
        },
        {"malformed codepoint with 4 bytes inside a string",
            ?_assertEqual({error, badjson}, decode(<<34, 128, 192, 192, 192, 34>>))
        }
    ].
|
||||
|
||||
%% Decoding with the loose_unicode option: each malformed byte inside a quoted
%% string (34 is the double-quote byte) is expected to come back as one
%% U+FFFD replacement character, so N bad bytes yield N replacements.
malformed_replaced_test_() ->
    Replacement = <<16#fffd/utf8>>,
    %% Expected event list for a string made of Count replacement characters.
    Expected = fun(Count) ->
        [{string, binary:copy(Replacement, Count)}, end_json]
    end,
    [
        {"malformed codepoint with 1 byte",
            ?_assertEqual(
                Expected(1),
                decode(<<34, 128, 34>>, [loose_unicode])
            )
        },
        {"malformed codepoint with 2 bytes",
            ?_assertEqual(
                Expected(2),
                decode(<<34, 128, 192, 34>>, [loose_unicode])
            )
        },
        {"malformed codepoint with 3 bytes",
            ?_assertEqual(
                Expected(3),
                decode(<<34, 128, 192, 192, 34>>, [loose_unicode])
            )
        },
        {"malformed codepoint with 4 bytes",
            ?_assertEqual(
                Expected(4),
                decode(<<34, 128, 192, 192, 192, 34>>, [loose_unicode])
            )
        }
    ].
|
||||
|
||||
|
||||
check_bad(List) ->
|
||||
|
@ -1104,6 +1163,8 @@ check([H|T], Opts, Acc) ->
|
|||
check(T, Opts, [{H, R}] ++ Acc).
|
||||
|
||||
|
||||
%% Test helper: decode with an empty option list by delegating to decode/2.
decode(Input) ->
    decode(Input, []).
|
||||
|
||||
decode(JSON, Opts) ->
|
||||
try
|
||||
(decoder(jsx, [], Opts))(JSON)
|
||||
|
|
|
@ -163,9 +163,29 @@ clean_string(<<C/utf8, Rest/binary>>, Acc)
|
|||
C =/= 16#ffffe andalso C =/= 16#fffff andalso
|
||||
C =/= 16#10fffe andalso C =/= 16#10ffff ->
|
||||
clean_string(Rest, <<Acc/binary, C/utf8>>);
|
||||
clean_string(<<X, _, _, Rest/binary>>, Acc) when X == 237; X == 239 ->
|
||||
%% surrogates
|
||||
clean_string(<<237, X, _, Rest/binary>>, Acc) when X >= 160 ->
|
||||
clean_string(Rest, <<Acc/binary, 16#fffd/utf8>>);
|
||||
clean_string(<<_, _, _, _, Rest/binary>>, Acc) ->
|
||||
%% private use noncharacters
|
||||
clean_string(<<239, 183, X, Rest/binary>>, Acc) when X >= 143, X =< 175 ->
|
||||
clean_string(Rest, <<Acc/binary, 16#fffd/utf8>>);
|
||||
%% u+fffe and u+ffff
|
||||
clean_string(<<239, 191, X, Rest/binary>>, Acc) when X == 190; X == 191 ->
|
||||
clean_string(Rest, <<Acc/binary, 16#fffd/utf8>>);
|
||||
%% the u+Xfffe and u+Xffff noncharacters
|
||||
clean_string(<<X, Y, 191, Z, Rest/binary>>, Acc)
|
||||
when (
|
||||
(X == 240 andalso Y == 159) orelse
|
||||
(X == 240 andalso Y == 175) orelse
|
||||
(X == 240 andalso Y == 191) orelse
|
||||
(
|
||||
(X == 241 orelse X == 242 orelse X == 243) andalso
|
||||
(Y == 143 orelse Y == 159 orelse Y == 175 orelse Y == 191)
|
||||
) orelse
|
||||
(X == 244 andalso Y == 143)
|
||||
) andalso (Z == 190 orelse Z == 191) ->
|
||||
clean_string(Rest, <<Acc/binary, 16#fffd/utf8>>);
|
||||
clean_string(<<_, Rest/binary>>, Acc) ->
|
||||
clean_string(Rest, <<Acc/binary, 16#fffd/utf8>>);
|
||||
clean_string(<<>>, Acc) -> Acc.
|
||||
|
||||
|
@ -302,7 +322,43 @@ good_characters_test_() ->
|
|||
?_assertEqual(check_good(good_extended()), [])
|
||||
}
|
||||
].
|
||||
|
||||
|
||||
%% Encoding without options: each malformed UTF-8 input below is expected to
%% make encode/1 raise error:badarg.
malformed_test_() ->
    %% Build one lazy assertion per input binary.
    Rejects = fun(Bytes) -> ?_assertError(badarg, encode(Bytes)) end,
    [
        {"malformed codepoint with 1 byte", Rejects(<<128>>)},
        {"malformed codepoint with 2 bytes", Rejects(<<128, 192>>)},
        {"malformed codepoint with 3 bytes", Rejects(<<128, 192, 192>>)},
        {"malformed codepoint with 4 bytes", Rejects(<<128, 192, 192, 192>>)}
    ].
|
||||
|
||||
%% Encoding with the loose_unicode option: each malformed input byte is
%% expected to be emitted as one U+FFFD replacement character, so N bad bytes
%% yield a string of N replacements.
malformed_replaced_test_() ->
    Replacement = <<16#fffd/utf8>>,
    %% Expected event list for a string made of Count replacement characters.
    Expected = fun(Count) ->
        [{string, binary:copy(Replacement, Count)}, end_json]
    end,
    [
        {"malformed codepoint with 1 byte",
            ?_assertEqual(
                Expected(1),
                encode(<<128>>, [loose_unicode])
            )
        },
        {"malformed codepoint with 2 bytes",
            ?_assertEqual(
                Expected(2),
                encode(<<128, 192>>, [loose_unicode])
            )
        },
        {"malformed codepoint with 3 bytes",
            ?_assertEqual(
                Expected(3),
                encode(<<128, 192, 192>>, [loose_unicode])
            )
        },
        {"malformed codepoint with 4 bytes",
            ?_assertEqual(
                Expected(4),
                encode(<<128, 192, 192, 192>>, [loose_unicode])
            )
        }
    ].
|
||||
|
||||
check_bad(List) ->
|
||||
lists:dropwhile(fun({_, {error, badjson}}) -> true ; (_) -> false end,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue