bad utf8 tests for clean_string/2

This commit is contained in:
alisdair sullivan 2013-02-05 12:24:56 -08:00
parent d8cde35e45
commit 052a92d325

View file

@ -589,204 +589,167 @@ opts_test_() ->
]. ].
%% Test helper: runs clean_string/2 on Input using a default #opts{} record.
xcode(Input) ->
    xcode(Input, #opts{}).

%% Test helper around clean_string/2.
%% The literal option list [replaced_bad_utf8] is shorthand for an #opts{}
%% record with replaced_bad_utf8 set to true; anything else is handed to
%% clean_string/2 unchanged.
%% Returns whatever clean_string/2 returns, or {error, badarg} when it
%% raises error:badarg. Other exceptions are not caught.
xcode(Input, [replaced_bad_utf8]) ->
    xcode(Input, #opts{replaced_bad_utf8 = true});
xcode(Input, Config) ->
    try clean_string(Input, Config) of
        Cleaned -> Cleaned
    catch
        error:badarg -> {error, badarg}
    end.
%% Predicate: true exactly when Outcome is the tuple {error, badarg},
%% false for every other term.
is_bad(Outcome) ->
    Outcome =:= {error, badarg}.
bad_utf8_test_() -> bad_utf8_test_() ->
[ [
{"orphan continuation byte u+0080", {"orphan continuation byte u+0080",
?_assert(is_bad(xcode(<<16#0080>>))) ?_assertError(badarg, clean_string(<<16#0080>>, #opts{}))
}, },
{"orphan continuation byte u+0080 replaced", {"orphan continuation byte u+0080 replaced",
?_assertEqual(xcode(<<16#0080>>, [replaced_bad_utf8]), <<16#fffd/utf8>>) ?_assertEqual(<<16#fffd/utf8>>, clean_string(<<16#0080>>, #opts{replaced_bad_utf8=true}))
}, },
{"orphan continuation byte u+00bf", {"orphan continuation byte u+00bf",
?_assert(is_bad(xcode(<<16#00bf>>))) ?_assertError(badarg, clean_string(<<16#00bf>>, #opts{}))
}, },
{"orphan continuation byte u+00bf replaced", {"orphan continuation byte u+00bf replaced",
?_assertEqual(xcode(<<16#00bf>>, [replaced_bad_utf8]), <<16#fffd/utf8>>) ?_assertEqual(<<16#fffd/utf8>>, clean_string(<<16#00bf>>, #opts{replaced_bad_utf8=true}))
}, },
{"2 continuation bytes", {"2 continuation bytes",
?_assert(is_bad(xcode(<<(binary:copy(<<16#0080>>, 2))/binary>>))) ?_assertError(badarg, clean_string(<<(binary:copy(<<16#0080>>, 2))/binary>>, #opts{}))
},
{"2 continuation bytes replaced",
?_assertEqual(
xcode(<<(binary:copy(<<16#0080>>, 2))/binary>>, [replaced_bad_utf8]),
binary:copy(<<16#fffd/utf8>>, 2)
)
}, },
{"2 continuation bytes replaced", ?_assertEqual(
binary:copy(<<16#fffd/utf8>>, 2),
clean_string(<<(binary:copy(<<16#0080>>, 2))/binary>>, #opts{replaced_bad_utf8=true})
)},
{"3 continuation bytes", {"3 continuation bytes",
?_assert(is_bad(xcode(<<(binary:copy(<<16#0080>>, 3))/binary>>))) ?_assertError(badarg, clean_string(<<(binary:copy(<<16#0080>>, 3))/binary>>, #opts{}))
},
{"3 continuation bytes replaced",
?_assertEqual(
xcode(<<(binary:copy(<<16#0080>>, 3))/binary>>, [replaced_bad_utf8]),
binary:copy(<<16#fffd/utf8>>, 3)
)
}, },
{"3 continuation bytes replaced", ?_assertEqual(
binary:copy(<<16#fffd/utf8>>, 3),
clean_string(<<(binary:copy(<<16#0080>>, 3))/binary>>, #opts{replaced_bad_utf8=true})
)},
{"4 continuation bytes", {"4 continuation bytes",
?_assert(is_bad(xcode(<<(binary:copy(<<16#0080>>, 4))/binary>>))) ?_assertError(badarg, clean_string(<<(binary:copy(<<16#0080>>, 4))/binary>>, #opts{}))
},
{"4 continuation bytes replaced",
?_assertEqual(
xcode(<<(binary:copy(<<16#0080>>, 4))/binary>>, [replaced_bad_utf8]),
binary:copy(<<16#fffd/utf8>>, 4)
)
}, },
{"4 continuation bytes replaced", ?_assertEqual(
binary:copy(<<16#fffd/utf8>>, 4),
clean_string(<<(binary:copy(<<16#0080>>, 4))/binary>>, #opts{replaced_bad_utf8=true})
)},
{"5 continuation bytes", {"5 continuation bytes",
?_assert(is_bad(xcode(<<(binary:copy(<<16#0080>>, 5))/binary>>))) ?_assertError(badarg, clean_string(<<(binary:copy(<<16#0080>>, 5))/binary>>, #opts{}))
},
{"5 continuation bytes replaced",
?_assertEqual(
xcode(<<(binary:copy(<<16#0080>>, 5))/binary>>, [replaced_bad_utf8]),
binary:copy(<<16#fffd/utf8>>, 5)
)
}, },
{"5 continuation bytes replaced", ?_assertEqual(
binary:copy(<<16#fffd/utf8>>, 5),
clean_string(<<(binary:copy(<<16#0080>>, 5))/binary>>, #opts{replaced_bad_utf8=true})
)},
{"6 continuation bytes", {"6 continuation bytes",
?_assert(is_bad(xcode(<<(binary:copy(<<16#0080>>, 6))/binary>>))) ?_assertError(badarg, clean_string(<<(binary:copy(<<16#0080>>, 6))/binary>>, #opts{}))
}, },
{"6 continuation bytes replaced", {"6 continuation bytes replaced", ?_assertEqual(
?_assertEqual( binary:copy(<<16#fffd/utf8>>, 6),
xcode(<<(binary:copy(<<16#0080>>, 6))/binary>>, [replaced_bad_utf8]), clean_string(<<(binary:copy(<<16#0080>>, 6))/binary>>, #opts{replaced_bad_utf8=true})
binary:copy(<<16#fffd/utf8>>, 6) )},
{"all continuation bytes", ?_assertError(
badarg,
clean_string(<<(list_to_binary(lists:seq(16#0080, 16#00bf)))/binary>>, #opts{})
)},
{"all continuation bytes replaced", ?_assertEqual(
binary:copy(<<16#fffd/utf8>>, length(lists:seq(16#0080, 16#00bf))),
clean_string(
<<(list_to_binary(lists:seq(16#0080, 16#00bf)))/binary>>,
#opts{replaced_bad_utf8=true}
) )
}, )},
{"all continuation bytes", {"lonely start byte", ?_assertError(badarg, clean_string(<<16#00c0>>, #opts{}))},
?_assert(is_bad(xcode(<<(list_to_binary(lists:seq(16#0080, 16#00bf)))/binary>>))) {"lonely start byte replaced", ?_assertEqual(
}, <<16#fffd/utf8>>,
{"all continuation bytes replaced", clean_string(<<16#00c0>>, #opts{replaced_bad_utf8=true})
?_assertEqual( )},
xcode(<<(list_to_binary(lists:seq(16#0080, 16#00bf)))/binary>>, [replaced_bad_utf8]), {"lonely start bytes (2 byte)", ?_assertError(
binary:copy(<<16#fffd/utf8>>, length(lists:seq(16#0080, 16#00bf))) badarg,
) clean_string(<<16#00c0, 32, 16#00df>>, #opts{})
}, )},
{"lonely start byte", {"lonely start bytes (2 byte) replaced", ?_assertEqual(
?_assert(is_bad(xcode(<<16#00c0>>))) <<16#fffd/utf8, 32, 16#fffd/utf8>>,
}, clean_string(<<16#00c0, 32, 16#00df>>, #opts{replaced_bad_utf8=true})
{"lonely start byte replaced", )},
?_assertEqual( {"lonely start bytes (3 byte)", ?_assertError(
xcode(<<16#00c0>>, [replaced_bad_utf8]), badarg,
<<16#fffd/utf8>> clean_string(<<16#00e0, 32, 16#00ef>>, #opts{})
) )},
}, {"lonely start bytes (3 byte) replaced", ?_assertEqual(
{"lonely start bytes (2 byte)", <<16#fffd/utf8, 32, 16#fffd/utf8>>,
?_assert(is_bad(xcode(<<16#00c0, 32, 16#00df>>))) clean_string(<<16#00e0, 32, 16#00ef>>, #opts{replaced_bad_utf8=true})
}, )},
{"lonely start bytes (2 byte) replaced", {"lonely start bytes (4 byte)", ?_assertError(
?_assertEqual( badarg,
xcode(<<16#00c0, 32, 16#00df>>, [replaced_bad_utf8]), clean_string(<<16#00f0, 32, 16#00f7>>, #opts{})
<<16#fffd/utf8, 32, 16#fffd/utf8>> )},
) {"lonely start bytes (4 byte) replaced", ?_assertEqual(
}, <<16#fffd/utf8, 32, 16#fffd/utf8>>,
{"lonely start bytes (3 byte)", clean_string(<<16#00f0, 32, 16#00f7>>, #opts{replaced_bad_utf8=true})
?_assert(is_bad(xcode(<<16#00e0, 32, 16#00ef>>))) )},
}, {"missing continuation byte (3 byte)", ?_assertError(
{"lonely start bytes (3 byte) replaced", badarg,
?_assertEqual( clean_string(<<224, 160, 32>>, #opts{})
xcode(<<16#00e0, 32, 16#00ef>>, [replaced_bad_utf8]), )},
<<16#fffd/utf8, 32, 16#fffd/utf8>> {"missing continuation byte (3 byte) replaced", ?_assertEqual(
) <<16#fffd/utf8, 32>>,
}, clean_string(<<224, 160, 32>>, #opts{replaced_bad_utf8=true})
{"lonely start bytes (4 byte)", )},
?_assert(is_bad(xcode(<<16#00f0, 32, 16#00f7>>))) {"missing continuation byte (4 byte missing one)", ?_assertError(
}, badarg,
{"lonely start bytes (4 byte) replaced", clean_string(<<240, 144, 128, 32>>, #opts{})
?_assertEqual( )},
xcode(<<16#00f0, 32, 16#00f7>>, [replaced_bad_utf8]), {"missing continuation byte (4 byte missing one) replaced", ?_assertEqual(
<<16#fffd/utf8, 32, 16#fffd/utf8>> <<16#fffd/utf8, 32>>,
) clean_string(<<240, 144, 128, 32>>, #opts{replaced_bad_utf8=true})
}, )},
{"missing continuation byte (3 byte)", {"missing continuation byte (4 byte missing two)", ?_assertError(
?_assert(is_bad(xcode(<<224, 160, 32>>))) badarg,
}, clean_string(<<240, 144, 32>>, #opts{})
{"missing continuation byte (3 byte) replaced", )},
?_assertEqual( {"missing continuation byte (4 byte missing two) replaced", ?_assertEqual(
xcode(<<224, 160, 32>>, [replaced_bad_utf8]), <<16#fffd/utf8, 32>>,
<<16#fffd/utf8, 32>> clean_string(<<240, 144, 32>>, #opts{replaced_bad_utf8=true})
) )},
}, {"overlong encoding of u+002f (2 byte)", ?_assertError(
{"missing continuation byte (4 byte missing one)", badarg,
?_assert(is_bad(xcode(<<240, 144, 128, 32>>))) clean_string(<<16#c0, 16#af, 32>>, #opts{})
}, )},
{"missing continuation byte (4 byte missing one) replaced", {"overlong encoding of u+002f (2 byte) replaced", ?_assertEqual(
?_assertEqual( <<16#fffd/utf8, 32>>,
xcode(<<240, 144, 128, 32>>, [replaced_bad_utf8]), clean_string(<<16#c0, 16#af, 32>>, #opts{replaced_bad_utf8=true})
<<16#fffd/utf8, 32>> )},
) {"overlong encoding of u+002f (3 byte)", ?_assertError(
}, badarg,
{"missing continuation byte (4 byte missing two)", clean_string(<<16#e0, 16#80, 16#af, 32>>, #opts{})
?_assert(is_bad(xcode(<<240, 144, 32>>))) )},
}, {"overlong encoding of u+002f (3 byte) replaced", ?_assertEqual(
{"missing continuation byte (4 byte missing two) replaced", <<16#fffd/utf8, 32>>,
?_assertEqual( clean_string(<<16#e0, 16#80, 16#af, 32>>, #opts{replaced_bad_utf8=true})
xcode(<<240, 144, 32>>, [replaced_bad_utf8]), )},
<<16#fffd/utf8, 32>> {"overlong encoding of u+002f (4 byte)", ?_assertError(
) badarg,
}, clean_string(<<16#f0, 16#80, 16#80, 16#af, 32>>, #opts{})
{"overlong encoding of u+002f (2 byte)", )},
?_assert(is_bad(xcode(<<16#c0, 16#af, 32>>))) {"overlong encoding of u+002f (4 byte) replaced", ?_assertEqual(
}, <<16#fffd/utf8, 32>>,
{"overlong encoding of u+002f (2 byte) replaced", clean_string(<<16#f0, 16#80, 16#80, 16#af, 32>>, #opts{replaced_bad_utf8=true})
?_assertEqual( )},
xcode(<<16#c0, 16#af, 32>>, [replaced_bad_utf8]), {"highest overlong 2 byte sequence", ?_assertError(
<<16#fffd/utf8, 32>> badarg,
) clean_string(<<16#c1, 16#bf, 32>>, #opts{})
}, )},
{"overlong encoding of u+002f (3 byte)", {"highest overlong 2 byte sequence replaced", ?_assertEqual(
?_assert(is_bad(xcode(<<16#e0, 16#80, 16#af, 32>>))) <<16#fffd/utf8, 32>>,
}, clean_string(<<16#c1, 16#bf, 32>>, #opts{replaced_bad_utf8=true})
{"overlong encoding of u+002f (3 byte) replaced", )},
?_assertEqual( {"highest overlong 3 byte sequence", ?_assertError(
xcode(<<16#e0, 16#80, 16#af, 32>>, [replaced_bad_utf8]), badarg,
<<16#fffd/utf8, 32>> clean_string(<<16#e0, 16#9f, 16#bf, 32>>, #opts{})
) )},
}, {"highest overlong 3 byte sequence replaced", ?_assertEqual(
{"overlong encoding of u+002f (4 byte)", <<16#fffd/utf8, 32>>,
?_assert(is_bad(xcode(<<16#f0, 16#80, 16#80, 16#af, 32>>))) clean_string(<<16#e0, 16#9f, 16#bf, 32>>, #opts{replaced_bad_utf8=true})
}, )},
{"overlong encoding of u+002f (4 byte) replaced", {"highest overlong 4 byte sequence", ?_assertError(
?_assertEqual( badarg,
xcode(<<16#f0, 16#80, 16#80, 16#af, 32>>, [replaced_bad_utf8]), clean_string(<<16#f0, 16#8f, 16#bf, 16#bf, 32>>, #opts{})
<<16#fffd/utf8, 32>> )},
) {"highest overlong 4 byte sequence replaced", ?_assertEqual(
}, <<16#fffd/utf8, 32>>,
{"highest overlong 2 byte sequence", clean_string(<<16#f0, 16#8f, 16#bf, 16#bf, 32>>, #opts{replaced_bad_utf8=true})
?_assert(is_bad(xcode(<<16#c1, 16#bf, 32>>))) )}
},
{"highest overlong 2 byte sequence replaced",
?_assertEqual(
xcode(<<16#c1, 16#bf, 32>>, [replaced_bad_utf8]),
<<16#fffd/utf8, 32>>
)
},
{"highest overlong 3 byte sequence",
?_assert(is_bad(xcode(<<16#e0, 16#9f, 16#bf, 32>>)))
},
{"highest overlong 3 byte sequence replaced",
?_assertEqual(
xcode(<<16#e0, 16#9f, 16#bf, 32>>, [replaced_bad_utf8]),
<<16#fffd/utf8, 32>>
)
},
{"highest overlong 4 byte sequence",
?_assert(is_bad(xcode(<<16#f0, 16#8f, 16#bf, 16#bf, 32>>)))
},
{"highest overlong 4 byte sequence replaced",
?_assertEqual(
xcode(<<16#f0, 16#8f, 16#bf, 16#bf, 32>>, [replaced_bad_utf8]),
<<16#fffd/utf8, 32>>
)
}
]. ].