From 7a1bcc49923946e65d24296689b2b5a76d3ae37e Mon Sep 17 00:00:00 2001 From: alisdair sullivan Date: Tue, 27 Mar 2012 23:18:06 -0700 Subject: [PATCH] bad utf sequences were being replaced with u+fffd per byte, they are now correctly replaced with just a single u+fffd codepoint --- src/jsx_decoder.erl | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/jsx_decoder.erl b/src/jsx_decoder.erl index d814582..a69aa57 100644 --- a/src/jsx_decoder.erl +++ b/src/jsx_decoder.erl @@ -489,11 +489,20 @@ noncharacter(<<237, X, _, Rest/binary>>, Handler, [Acc|Stack], Opts) when X >= 1 %% u+fffe and u+ffff for R14BXX noncharacter(<<239, 191, X, Rest/binary>>, Handler, [Acc|Stack], Opts) when X == 190; X == 191 -> string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts); -%% bad utf8 +%% overlong and too short utf8 sequences +noncharacter(<<X, Rest/binary>>, Handler, [Acc|Stack], Opts) when X >= 192, X =< 253 -> + string(strip_continuations(Rest), Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts); +%% unexpected bytes noncharacter(<<_, Rest/binary>>, Handler, [Acc|Stack], Opts) -> string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts). +%% strips continuation bytes after bad utf bytes, guards against both too short and overlong sequences +strip_continuations(<<X, Rest/binary>>) when X >= 128, X =< 191 -> strip_continuations(Rest); +strip_continuations(Rest) -> Rest. + + + escape(<<$b, Rest/binary>>, Handler, [Acc|Stack], Opts) -> string(Rest, Handler, [?acc_seq(Acc, $\b)|Stack], Opts); escape(<<$f, Rest/binary>>, Handler, [Acc|Stack], Opts) ->