From 7a1bcc49923946e65d24296689b2b5a76d3ae37e Mon Sep 17 00:00:00 2001 From: alisdair sullivan Date: Tue, 27 Mar 2012 23:18:06 -0700 Subject: [PATCH 01/19] bad utf sequences were being replaced with u+fffd per byte, they are now correctly replaced with just a single u+fffd codepoint --- src/jsx_decoder.erl | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/jsx_decoder.erl b/src/jsx_decoder.erl index d814582..a69aa57 100644 --- a/src/jsx_decoder.erl +++ b/src/jsx_decoder.erl @@ -489,11 +489,20 @@ noncharacter(<<237, X, _, Rest/binary>>, Handler, [Acc|Stack], Opts) when X >= 1 %% u+fffe and u+ffff for R14BXX noncharacter(<<239, 191, X, Rest/binary>>, Handler, [Acc|Stack], Opts) when X == 190; X == 191 -> string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts); -%% bad utf8 +%% overlong and too short utf8 sequences +noncharacter(<>, Handler, [Acc|Stack], Opts) when X >= 192, X =< 253 -> + string(strip_continuations(Rest), Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts); +%% unexpected bytes noncharacter(<<_, Rest/binary>>, Handler, [Acc|Stack], Opts) -> string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts). +%% strips continuation bytes after bad utf bytes, guards against both too short and overlong sequences +strip_continuations(<>) when X >= 128, X =< 191 -> strip_continuations(Rest); +strip_continuations(Rest) -> Rest. + + + escape(<<$b, Rest/binary>>, Handler, [Acc|Stack], Opts) -> string(Rest, Handler, [?acc_seq(Acc, $\b)|Stack], Opts); escape(<<$f, Rest/binary>>, Handler, [Acc|Stack], Opts) -> From cf6dbd6480d205de89e4f0818922982e23093482 Mon Sep 17 00:00:00 2001 From: alisdair sullivan Date: Wed, 28 Mar 2012 19:34:49 -0700 Subject: [PATCH 02/19] properly guard noncharacters --- src/jsx_decoder.erl | 156 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 137 insertions(+), 19 deletions(-) diff --git a/src/jsx_decoder.erl b/src/jsx_decoder.erl index a69aa57..661d341 100644 --- a/src/jsx_decoder.erl +++ b/src/jsx_decoder.erl @@ -284,13 +284,18 @@ string(<<37, Rest/binary>>, Handler, [Acc|Stack], Opts) -> string(Rest, Handler, [?acc_seq(Acc, 37)|Stack], Opts); string(<<38, Rest/binary>>, Handler, [Acc|Stack], Opts) -> string(Rest, Handler, [?acc_seq(Acc, 38)|Stack], Opts); -string(<>, {Handler, State}, S, Opts = #opts{single_quotes=true}) -> - case S of - [Acc, single_quote, key|Stack] -> - colon(Rest, {Handler, Handler:handle_event({key, ?end_seq(Acc)}, State)}, [key|Stack], Opts); - [Acc, single_quote|Stack] -> - maybe_done(Rest, {Handler, Handler:handle_event({string, ?end_seq(Acc)}, State)}, Stack, Opts); - [Acc|Stack] -> +string(<>, {Handler, State}, [Acc|Stack], Opts) -> + case Opts#opts.single_quotes of + true -> + case Stack of + [single_quote, key|S] -> + colon(Rest, {Handler, Handler:handle_event({key, ?end_seq(Acc)}, State)}, [key|S], Opts) + ; [single_quote|S] -> + maybe_done(Rest, {Handler, Handler:handle_event({string, ?end_seq(Acc)}, State)}, S, Opts) + ; _ -> + string(Rest, {Handler, State}, [?acc_seq(Acc, ?singlequote)|Stack], Opts) + end + ; false -> string(Rest, {Handler, State}, [?acc_seq(Acc, ?singlequote)|Stack], Opts) end; string(<<40, Rest/binary>>, Handler, [Acc|Stack], Opts) -> @@ -469,8 +474,53 @@ string(<<126, Rest/binary>>, Handler, [Acc|Stack], Opts) -> string(Rest, Handler, [?acc_seq(Acc, 126)|Stack], Opts); string(<<127, Rest/binary>>, Handler, [Acc|Stack], Opts) -> string(Rest, Handler, [?acc_seq(Acc, 127)|Stack], Opts); -string(<>, Handler, [Acc|Stack], Opts) when ?is_noncontrol(S) -> - string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts); +string(<>, Handler, [Acc|Stack], Opts) -> + case S of + %% not strictly true, but exceptions are already taken care of in preceding clauses + S when S >= 16#20, S < 16#d800 -> + string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts) + ; S when S > 16#dfff, S < 16#fdd0 -> + string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts) + ; S when S > 16#fdef, S < 16#fffe -> + string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts) + ; S when S >= 16#10000, S < 16#1fffe -> + string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts) + ; S when S >= 16#20000, S < 16#2fffe -> + string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts) + ; S when S >= 16#30000, S < 16#3fffe -> + string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts) + ; S when S >= 16#40000, S < 16#4fffe -> + string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts) + ; S when S >= 16#50000, S < 16#5fffe -> + string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts) + ; S when S >= 16#60000, S < 16#6fffe -> + string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts) + ; S when S >= 16#70000, S < 16#7fffe -> + string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts) + ; S when S >= 16#80000, S < 16#8fffe -> + string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts) + ; S when S >= 16#90000, S < 16#9fffe -> + string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts) + ; S when S >= 16#a0000, S < 16#afffe -> + string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts) + ; S when S >= 16#b0000, S < 16#bfffe -> + string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts) + ; S when S >= 16#c0000, S < 16#cfffe -> + string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts) + ; S when S >= 16#d0000, S < 16#dfffe -> + string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts) + ; S when S >= 16#e0000, S < 16#efffe -> + string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts) + ; S when S >= 16#f0000, S < 16#ffffe -> + string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts) + ; S when S >= 16#100000, S < 16#10fffe -> + string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts) + ; _ -> + case Opts#opts.loose_unicode of + true -> noncharacter(<>, Handler, [Acc|Stack], Opts) + ; false -> ?error([<>, Handler, [Acc|Stack], Opts]) + end + end; string(Bin, Handler, Stack, Opts) -> case partial_utf(Bin) of true -> ?incomplete(string, Bin, Handler, Stack, Opts) @@ -489,18 +539,38 @@ noncharacter(<<237, X, _, Rest/binary>>, Handler, [Acc|Stack], Opts) when X >= 1 %% u+fffe and u+ffff for R14BXX noncharacter(<<239, 191, X, Rest/binary>>, Handler, [Acc|Stack], Opts) when X == 190; X == 191 -> string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts); -%% overlong and too short utf8 sequences -noncharacter(<>, Handler, [Acc|Stack], Opts) when X >= 192, X =< 253 -> - string(strip_continuations(Rest), Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts); -%% unexpected bytes +%% u+xfffe, u+xffff and other noncharacters +noncharacter(<<_/utf8, Rest/binary>>, Handler, [Acc|Stack], Opts) -> + string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts); +%% overlong encodings and missing continuations of a 2 byte sequence +noncharacter(<>, Handler, [Acc|Stack], Opts) when X >= 192, X =< 223 -> + string(strip_continuations(Rest, 1), Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts); +%% overlong encodings and missing continuations of a 3 byte sequence +noncharacter(<>, Handler, [Acc|Stack], Opts) when X >= 224, X =< 239 -> + string(strip_continuations(Rest, 2), Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts); +%% overlong encodings and missing continuations of a 4 byte sequence +noncharacter(<>, Handler, [Acc|Stack], Opts) when X >= 240, X =< 247 -> + string(strip_continuations(Rest, 3), Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts); +%% overlong encodings and missing continuations of a 4 byte sequence +noncharacter(<>, Handler, [Acc|Stack], Opts) when X >= 240, X =< 247 -> + string(strip_continuations(Rest, 3), Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts); +%% overlong encodings and missing continuations of a 5 byte sequence +noncharacter(<>, Handler, [Acc|Stack], Opts) when X >= 248, X =< 251 -> + string(strip_continuations(Rest, 4), Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts); +%% overlong encodings and missing continuations of a 6 byte sequence +noncharacter(<>, Handler, [Acc|Stack], Opts) when X == 252, X == 253 -> + string(strip_continuations(Rest, 5), Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts); +%% unexpected bytes, including orphan continuations noncharacter(<<_, Rest/binary>>, Handler, [Acc|Stack], Opts) -> string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts). -%% strips continuation bytes after bad utf bytes, guards against both too short and overlong sequences -strip_continuations(<>) when X >= 128, X =< 191 -> strip_continuations(Rest); -strip_continuations(Rest) -> Rest. - +%% strips continuation bytes after bad utf bytes, guards against both too short +%% and overlong sequences. N is the maximum number of bytes to strip +strip_continuations(Rest, 0) -> Rest; +strip_continuations(<>, N) when X >= 128, X =< 191 -> strip_continuations(Rest, N - 1); +%% not a continuation byte, dispatch back to string +strip_continuations(Rest, _) -> Rest. escape(<<$b, Rest/binary>>, Handler, [Acc|Stack], Opts) -> @@ -1164,6 +1234,26 @@ escape_forward_slash_test_() -> )} ]. +noncharacters_test_() -> + [ + {"noncharacters - badjson", + ?_assertEqual(check_bad(noncharacters()), []) + }, + {"noncharacters - replaced", + ?_assertEqual(check_replaced(noncharacters()), []) + } + ]. + +extended_noncharacters_test_() -> + [ + {"extended noncharacters - badjson", + ?_assertEqual(check_bad(extended_noncharacters()), []) + }, + {"extended noncharacters - replaced", + ?_assertEqual(check_replaced(extended_noncharacters()), []) + } + ]. + surrogates_test_() -> [ {"surrogates - badjson", @@ -1180,6 +1270,16 @@ control_test_() -> ?_assertEqual(check_bad(control_characters()), []) } ]. + +reserved_test_() -> + [ + {"reserved noncharacters - badjson", + ?_assertEqual(check_bad(reserved_space()), []) + }, + {"reserved noncharacters - replaced", + ?_assertEqual(check_replaced(reserved_space()), []) + } + ]. good_characters_test_() -> [ @@ -1270,13 +1370,31 @@ decode(JSON, Opts) -> end. +noncharacters() -> lists:seq(16#fffe, 16#ffff). + +extended_noncharacters() -> + [16#1fffe, 16#1ffff, 16#2fffe, 16#2ffff] + ++ [16#3fffe, 16#3ffff, 16#4fffe, 16#4ffff] + ++ [16#5fffe, 16#5ffff, 16#6fffe, 16#6ffff] + ++ [16#7fffe, 16#7ffff, 16#8fffe, 16#8ffff] + ++ [16#9fffe, 16#9ffff, 16#afffe, 16#affff] + ++ [16#bfffe, 16#bffff, 16#cfffe, 16#cffff] + ++ [16#dfffe, 16#dffff, 16#efffe, 16#effff] + ++ [16#ffffe, 16#fffff, 16#10fffe, 16#10ffff]. + surrogates() -> lists:seq(16#d800, 16#dfff). control_characters() -> lists:seq(1, 31). -good() -> [32, 33] ++ lists:seq(16#23, 16#5b) ++ lists:seq(16#5d, 16#d7ff) ++ lists:seq(16#e000, 16#fffd). +reserved_space() -> lists:seq(16#fdd0, 16#fdef). + +good() -> [32, 33] + ++ lists:seq(16#23, 16#5b) + ++ lists:seq(16#5d, 16#d7ff) + ++ lists:seq(16#e000, 16#fdcf) + ++ lists:seq(16#fdf0, 16#fffd). -good_extended() -> lists:seq(16#100000, 16#10ffff). +good_extended() -> lists:seq(16#100000, 16#10fffd). %% erlang refuses to encode certain codepoints, so fake them all to_fake_utf(N, utf8) when N < 16#0080 -> <<34/utf8, N:8, 34/utf8>>; From 9bdbf7969b614bab2a68e22e55df1be496253841 Mon Sep 17 00:00:00 2001 From: alisdair sullivan Date: Wed, 28 Mar 2012 19:50:57 -0700 Subject: [PATCH 03/19] noncharacters work with incompletes now --- src/jsx_decoder.erl | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/src/jsx_decoder.erl b/src/jsx_decoder.erl index 661d341..0fe7645 100644 --- a/src/jsx_decoder.erl +++ b/src/jsx_decoder.erl @@ -543,34 +543,39 @@ noncharacter(<<239, 191, X, Rest/binary>>, Handler, [Acc|Stack], Opts) when X == noncharacter(<<_/utf8, Rest/binary>>, Handler, [Acc|Stack], Opts) -> string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts); %% overlong encodings and missing continuations of a 2 byte sequence -noncharacter(<>, Handler, [Acc|Stack], Opts) when X >= 192, X =< 223 -> - string(strip_continuations(Rest, 1), Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts); +noncharacter(<>, Handler, Stack, Opts) when X >= 192, X =< 223 -> + strip_continuations(Rest, Handler, [1|Stack], Opts); %% overlong encodings and missing continuations of a 3 byte sequence -noncharacter(<>, Handler, [Acc|Stack], Opts) when X >= 224, X =< 239 -> - string(strip_continuations(Rest, 2), Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts); +noncharacter(<>, Handler, Stack, Opts) when X >= 224, X =< 239 -> + strip_continuations(Rest, Handler, [2|Stack], Opts); %% overlong encodings and missing continuations of a 4 byte sequence -noncharacter(<>, Handler, [Acc|Stack], Opts) when X >= 240, X =< 247 -> - string(strip_continuations(Rest, 3), Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts); -%% overlong encodings and missing continuations of a 4 byte sequence -noncharacter(<>, Handler, [Acc|Stack], Opts) when X >= 240, X =< 247 -> - string(strip_continuations(Rest, 3), Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts); +noncharacter(<>, Handler, Stack, Opts) when X >= 240, X =< 247 -> + strip_continuations(Rest, Handler, [3|Stack], Opts); %% overlong encodings and missing continuations of a 5 byte sequence -noncharacter(<>, Handler, [Acc|Stack], Opts) when X >= 248, X =< 251 -> - string(strip_continuations(Rest, 4), Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts); +noncharacter(<>, Handler, Stack, Opts) when X >= 248, X =< 251 -> + strip_continuations(Rest, Handler, [4|Stack], Opts); %% overlong encodings and missing continuations of a 6 byte sequence -noncharacter(<>, Handler, [Acc|Stack], Opts) when X == 252, X == 253 -> - string(strip_continuations(Rest, 5), Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts); +noncharacter(<>, Handler, Stack, Opts) when X == 252, X == 253 -> + strip_continuations(Rest, Handler, [5|Stack], Opts); %% unexpected bytes, including orphan continuations noncharacter(<<_, Rest/binary>>, Handler, [Acc|Stack], Opts) -> - string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts). + string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts); +noncharacter(<<>>, Handler, Stack, Opts) -> + ?incomplete(noncharacter, <<>>, Handler, Stack, Opts). %% strips continuation bytes after bad utf bytes, guards against both too short %% and overlong sequences. N is the maximum number of bytes to strip -strip_continuations(Rest, 0) -> Rest; -strip_continuations(<>, N) when X >= 128, X =< 191 -> strip_continuations(Rest, N - 1); +strip_continuations(Rest, Handler, [0, Acc|Stack], Opts) -> + string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts); +strip_continuations(<>, Handler, [N|Stack], Opts) when X >= 128, X =< 191 -> + strip_continuations(Rest, Handler, [N - 1|Stack], Opts); +%% incomplete +strip_continuations(<<>>, Handler, Stack, Opts) -> + ?incomplete(strip_continuations, <<>>, Handler, Stack, Opts); %% not a continuation byte, dispatch back to string -strip_continuations(Rest, _) -> Rest. +strip_continuations(Rest, Handler, [_, Acc|Stack], Opts) -> + string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts). escape(<<$b, Rest/binary>>, Handler, [Acc|Stack], Opts) -> From edbe4d16ab36f0b3544180974fb181ce17004d1f Mon Sep 17 00:00:00 2001 From: alisdair sullivan Date: Wed, 28 Mar 2012 20:17:58 -0700 Subject: [PATCH 04/19] fix clean string for bad utf8 --- src/jsx_encoder.erl | 129 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 123 insertions(+), 6 deletions(-) diff --git a/src/jsx_encoder.erl b/src/jsx_encoder.erl index 6f52c7a..6f48804 100644 --- a/src/jsx_encoder.erl +++ b/src/jsx_encoder.erl @@ -115,17 +115,90 @@ clean_string(Bin, Opts) -> is_clean(<<>>) -> true; -is_clean(<<_/utf8, Rest/binary>>) -> is_clean(Rest); +is_clean(<>) -> + case X of + X when X >= 16#0000, X < 16#d800 -> is_clean(Rest) + ; X when X > 16#dfff, X < 16#fdd0 -> is_clean(Rest) + ; X when X > 16#fdef, X < 16#fffe -> is_clean(Rest) + ; X when X >= 16#10000, X < 16#1fffe -> is_clean(Rest) + ; X when X >= 16#20000, X < 16#2fffe -> is_clean(Rest) + ; X when X >= 16#30000, X < 16#3fffe -> is_clean(Rest) + ; X when X >= 16#40000, X < 16#4fffe -> is_clean(Rest) + ; X when X >= 16#50000, X < 16#5fffe -> is_clean(Rest) + ; X when X >= 16#60000, X < 16#6fffe -> is_clean(Rest) + ; X when X >= 16#70000, X < 16#7fffe -> is_clean(Rest) + ; X when X >= 16#80000, X < 16#8fffe -> is_clean(Rest) + ; X when X >= 16#90000, X < 16#9fffe -> is_clean(Rest) + ; X when X >= 16#a0000, X < 16#afffe -> is_clean(Rest) + ; X when X >= 16#b0000, X < 16#bfffe -> is_clean(Rest) + ; X when X >= 16#c0000, X < 16#cfffe -> is_clean(Rest) + ; X when X >= 16#d0000, X < 16#dfffe -> is_clean(Rest) + ; X when X >= 16#e0000, X < 16#efffe -> is_clean(Rest) + ; X when X >= 16#f0000, X < 16#ffffe -> is_clean(Rest) + ; X when X >= 16#100000, X < 16#10fffe -> is_clean(Rest) + ; _ -> false + end; is_clean(_) -> false. clean_string(Bin, _Acc, Opts=#opts{loose_unicode=false}) -> ?error([Bin, Opts]); clean_string(<<>>, Acc, _Opts) -> unicode:characters_to_binary(lists:reverse(Acc)); -clean_string(<>, Acc, Opts) -> clean_string(Rest, [X] ++ Acc, Opts); +clean_string(<>, Acc, Opts) -> + case X of + X when X < 16#d800 -> clean_string(Rest, [X] ++ Acc, Opts) + ; X when X > 16#dfff, X < 16#fdd0 -> clean_string(Rest, [X] ++ Acc, Opts) + ; X when X > 16#fdef, X < 16#fffe -> clean_string(Rest, [X] ++ Acc, Opts) + ; X when X >= 16#10000, X < 16#1fffe -> clean_string(Rest, [X] ++ Acc, Opts) + ; X when X >= 16#20000, X < 16#2fffe -> clean_string(Rest, [X] ++ Acc, Opts) + ; X when X >= 16#30000, X < 16#3fffe -> clean_string(Rest, [X] ++ Acc, Opts) + ; X when X >= 16#40000, X < 16#4fffe -> clean_string(Rest, [X] ++ Acc, Opts) + ; X when X >= 16#50000, X < 16#5fffe -> clean_string(Rest, [X] ++ Acc, Opts) + ; X when X >= 16#60000, X < 16#6fffe -> clean_string(Rest, [X] ++ Acc, Opts) + ; X when X >= 16#70000, X < 16#7fffe -> clean_string(Rest, [X] ++ Acc, Opts) + ; X when X >= 16#80000, X < 16#8fffe -> clean_string(Rest, [X] ++ Acc, Opts) + ; X when X >= 16#90000, X < 16#9fffe -> clean_string(Rest, [X] ++ Acc, Opts) + ; X when X >= 16#a0000, X < 16#afffe -> clean_string(Rest, [X] ++ Acc, Opts) + ; X when X >= 16#b0000, X < 16#bfffe -> clean_string(Rest, [X] ++ Acc, Opts) + ; X when X >= 16#c0000, X < 16#cfffe -> clean_string(Rest, [X] ++ Acc, Opts) + ; X when X >= 16#d0000, X < 16#dfffe -> clean_string(Rest, [X] ++ Acc, Opts) + ; X when X >= 16#e0000, X < 16#efffe -> clean_string(Rest, [X] ++ Acc, Opts) + ; X when X >= 16#f0000, X < 16#ffffe -> clean_string(Rest, [X] ++ Acc, Opts) + ; X when X >= 16#100000, X < 16#10fffe -> clean_string(Rest, [X] ++ Acc, Opts) + ; _ -> clean_string(Rest, [16#fffd] ++ Acc, Opts) + end; %% surrogates -clean_string(<<237, X, _, Rest/binary>>, Acc, Opts) when X >= 160 -> clean_string(Rest, [16#fffd] ++ Acc, Opts); +clean_string(<<237, X, _, Rest/binary>>, Acc, Opts) when X >= 160 -> + clean_string(Rest, [16#fffd] ++ Acc, Opts); +%% u+fffe and u+ffff for R14BXX +clean_string(<<239, 191, X, Rest/binary>>, Acc, Opts) when X == 190; X == 191 -> + clean_string(Rest, [16#fffd] ++ Acc, Opts); +%% overlong encodings and missing continuations of a 2 byte sequence +clean_string(<>, Acc, Opts) when X >= 192, X =< 223 -> + clean_string(strip_continuations(Rest, 1), [16#fffd] ++ Acc, Opts); +%% overlong encodings and missing continuations of a 3 byte sequence +clean_string(<>, Acc, Opts) when X >= 224, X =< 239 -> + clean_string(strip_continuations(Rest, 2), [16#fffd] ++ Acc, Opts); +%% overlong encodings and missing continuations of a 4 byte sequence +clean_string(<>, Acc, Opts) when X >= 240, X =< 247 -> + clean_string(strip_continuations(Rest, 3), [16#fffd] ++ Acc, Opts); +%% overlong encodings and missing continuations of a 5 byte sequence +clean_string(<>, Acc, Opts) when X >= 248, X =< 251 -> + clean_string(strip_continuations(Rest, 4), [16#fffd] ++ Acc, Opts); +%% overlong encodings and missing continuations of a 6 byte sequence +clean_string(<>, Acc, Opts) when X == 252, X == 253 -> + clean_string(strip_continuations(Rest, 5), [16#fffd] ++ Acc, Opts); %% bad codepoints -clean_string(<<_, Rest/binary>>, Acc, Opts) -> clean_string(Rest, [16#fffd] ++ Acc, Opts). +clean_string(<<_, Rest/binary>>, Acc, Opts) -> + clean_string(Rest, [16#fffd] ++ Acc, Opts). + + +%% strips continuation bytes after bad utf bytes, guards against both too short +%% and overlong sequences. N is the maximum number of bytes to strip +strip_continuations(Rest, 0) -> Rest; +strip_continuations(<>, N) when X >= 128, X =< 191 -> + strip_continuations(Rest, N - 1); +%% not a continuation byte, dispatch back to clean_string +strip_continuations(Rest, _) -> Rest. -ifdef(TEST). @@ -230,6 +303,36 @@ good_characters_test_() -> } ]. +reserved_test_() -> + [ + {"reserved noncharacters - badjson", + ?_assertEqual(check_bad(reserved_space()), []) + }, + {"reserved noncharacters - replaced", + ?_assertEqual(check_replaced(reserved_space()), []) + } + ]. + +noncharacters_test_() -> + [ + {"noncharacters - badjson", + ?_assertEqual(check_bad(noncharacters()), []) + }, + {"noncharacters - replaced", + ?_assertEqual(check_replaced(noncharacters()), []) + } + ]. + +extended_noncharacters_test_() -> + [ + {"extended noncharacters - badjson", + ?_assertEqual(check_bad(extended_noncharacters()), []) + }, + {"extended noncharacters - replaced", + ?_assertEqual(check_replaced(extended_noncharacters()), []) + } + ]. + malformed_test_() -> [ {"malformed codepoint with 1 byte", ?_assertError(badarg, encode(<<128>>))}, @@ -290,11 +393,25 @@ check([H|T], Opts, Acc) -> check(T, Opts, [{H, R}] ++ Acc). +noncharacters() -> lists:seq(16#fffe, 16#ffff). + +extended_noncharacters() -> + [16#1fffe, 16#1ffff, 16#2fffe, 16#2ffff] + ++ [16#3fffe, 16#3ffff, 16#4fffe, 16#4ffff] + ++ [16#5fffe, 16#5ffff, 16#6fffe, 16#6ffff] + ++ [16#7fffe, 16#7ffff, 16#8fffe, 16#8ffff] + ++ [16#9fffe, 16#9ffff, 16#afffe, 16#affff] + ++ [16#bfffe, 16#bffff, 16#cfffe, 16#cffff] + ++ [16#dfffe, 16#dffff, 16#efffe, 16#effff] + ++ [16#ffffe, 16#fffff, 16#10fffe, 16#10ffff]. + surrogates() -> lists:seq(16#d800, 16#dfff). -good() -> lists:seq(1, 16#d7ff) ++ lists:seq(16#e000, 16#fffd). +reserved_space() -> lists:seq(16#fdd0, 16#fdef). + +good() -> lists:seq(16#0000, 16#d7ff) ++ lists:seq(16#e000, 16#fdcf) ++ lists:seq(16#fdf0, 16#fffd). -good_extended() -> lists:seq(16#100000, 16#10ffff). +good_extended() -> lists:seq(16#100000, 16#10fffd). %% erlang refuses to encode certain codepoints, so fake them all to_fake_utf(N, utf8) when N < 16#0080 -> <>; From 7e243bafd1416ca2795cc4429d6dc0e74a733c30 Mon Sep 17 00:00:00 2001 From: alisdair sullivan Date: Wed, 28 Mar 2012 21:23:35 -0700 Subject: [PATCH 05/19] remove tests for malformed 5 and 6 byte sequences --- src/jsx_decoder.erl | 57 ++------------------------------------------ src/jsx_encoder.erl | 34 -------------------------- src/jsx_utils.erl | 58 +++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 56 insertions(+), 93 deletions(-) diff --git a/src/jsx_decoder.erl b/src/jsx_decoder.erl index 0fe7645..7c5d295 100644 --- a/src/jsx_decoder.erl +++ b/src/jsx_decoder.erl @@ -517,8 +517,8 @@ string(<>, Handler, [Acc|Stack], Opts) -> string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts) ; _ -> case Opts#opts.loose_unicode of - true -> noncharacter(<>, Handler, [Acc|Stack], Opts) - ; false -> ?error([<>, Handler, [Acc|Stack], Opts]) + true -> noncharacter(<>, Handler, [Acc|Stack], Opts) + ; false -> ?error([<>, Handler, [Acc|Stack], Opts]) end end; string(Bin, Handler, Stack, Opts) -> @@ -551,12 +551,6 @@ noncharacter(<>, Handler, Stack, Opts) when X >= 224, X =< 239 - %% overlong encodings and missing continuations of a 4 byte sequence noncharacter(<>, Handler, Stack, Opts) when X >= 240, X =< 247 -> strip_continuations(Rest, Handler, [3|Stack], Opts); -%% overlong encodings and missing continuations of a 5 byte sequence -noncharacter(<>, Handler, Stack, Opts) when X >= 248, X =< 251 -> - strip_continuations(Rest, Handler, [4|Stack], Opts); -%% overlong encodings and missing continuations of a 6 byte sequence -noncharacter(<>, Handler, Stack, Opts) when X == 252, X == 253 -> - strip_continuations(Rest, Handler, [5|Stack], Opts); %% unexpected bytes, including orphan continuations noncharacter(<<_, Rest/binary>>, Handler, [Acc|Stack], Opts) -> string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts); @@ -1295,51 +1289,6 @@ good_characters_test_() -> ?_assertEqual(check_good(good_extended()), []) } ]. - -malformed_test_() -> - [ - {"malformed codepoint with 1 byte", - ?_assertEqual({error, badjson}, decode(<<128>>)) - }, - {"malformed codepoint with 2 bytes", - ?_assertEqual({error, badjson}, decode(<<128, 192>>)) - }, - {"malformed codepoint with 3 bytes", - ?_assertEqual({error, badjson}, decode(<<128, 192, 192>>)) - }, - {"malformed codepoint with 4 bytes", - ?_assertEqual({error, badjson}, decode(<<128, 192, 192, 192>>)) - } - ]. - -malformed_replaced_test_() -> - F = <<16#fffd/utf8>>, - [ - {"malformed codepoint with 1 byte", - ?_assertEqual( - [{string, <>}, end_json], - decode(<<34, 128, 34>>, [loose_unicode]) - ) - }, - {"malformed codepoint with 2 bytes", - ?_assertEqual( - [{string, <>}, end_json], - decode(<<34, 128, 192, 34>>, [loose_unicode]) - ) - }, - {"malformed codepoint with 3 bytes", - ?_assertEqual( - [{string, <>}, end_json], - decode(<<34, 128, 192, 192, 34>>, [loose_unicode]) - ) - }, - {"malformed codepoint with 4 bytes", - ?_assertEqual( - [{string, <>}, end_json], - decode(<<34, 128, 192, 192, 192, 34>>, [loose_unicode]) - ) - } - ]. check_bad(List) -> @@ -1365,8 +1314,6 @@ check([H|T], Opts, Acc) -> check(T, Opts, [{H, R}] ++ Acc). -decode(JSON) -> decode(JSON, []). - decode(JSON, Opts) -> try (decoder(jsx, [], Opts))(JSON) diff --git a/src/jsx_encoder.erl b/src/jsx_encoder.erl index 6f48804..f4655bc 100644 --- a/src/jsx_encoder.erl +++ b/src/jsx_encoder.erl @@ -181,12 +181,6 @@ clean_string(<>, Acc, Opts) when X >= 224, X =< 239 -> %% overlong encodings and missing continuations of a 4 byte sequence clean_string(<>, Acc, Opts) when X >= 240, X =< 247 -> clean_string(strip_continuations(Rest, 3), [16#fffd] ++ Acc, Opts); -%% overlong encodings and missing continuations of a 5 byte sequence -clean_string(<>, Acc, Opts) when X >= 248, X =< 251 -> - clean_string(strip_continuations(Rest, 4), [16#fffd] ++ Acc, Opts); -%% overlong encodings and missing continuations of a 6 byte sequence -clean_string(<>, Acc, Opts) when X == 252, X == 253 -> - clean_string(strip_continuations(Rest, 5), [16#fffd] ++ Acc, Opts); %% bad codepoints clean_string(<<_, Rest/binary>>, Acc, Opts) -> clean_string(Rest, [16#fffd] ++ Acc, Opts). @@ -341,34 +335,6 @@ malformed_test_() -> {"malformed codepoint with 4 bytes", ?_assertError(badarg, encode(<<128, 192, 192, 192>>))} ]. -malformed_replaced_test_() -> - F = <<16#fffd/utf8>>, - [ - {"malformed codepoint with 1 byte", - ?_assertEqual( - [{string, <>}, end_json], - encode(<<128>>, [loose_unicode]) - ) - }, - {"malformed codepoint with 2 bytes", - ?_assertEqual( - [{string, <>}, end_json], - encode(<<128, 192>>, [loose_unicode]) - ) - }, - {"malformed codepoint with 3 bytes", - ?_assertEqual( - [{string, <>}, end_json], - encode(<<128, 192, 192>>, [loose_unicode]) - ) - }, - {"malformed codepoint with 4 bytes", - ?_assertEqual( - [{string, <>}, end_json], - encode(<<128, 192, 192, 192>>, [loose_unicode]) - ) - } - ]. check_bad(List) -> lists:dropwhile(fun({_, {error, badjson}}) -> true ; (_) -> false end, diff --git a/src/jsx_utils.erl b/src/jsx_utils.erl index 462e31d..96a9f53 100644 --- a/src/jsx_utils.erl +++ b/src/jsx_utils.erl @@ -243,7 +243,7 @@ json_escape(Str, Opts, L, Len) when L < Len -> json_escape(<>, Opts, L + 3, Len); false -> B = unicode:characters_to_binary(json_escape_sequence(16#2028)), - json_escape(<>, Opts, L + size(B), Len + size(B) - size(<<16#2028/utf8>>)) + json_escape(<>, Opts, L + 6, Len + 3) end; <> -> case Opts#opts.no_jsonp_escapes of @@ -251,21 +251,65 @@ json_escape(Str, Opts, L, Len) when L < Len -> json_escape(<>, Opts, L + 3, Len); false -> B = unicode:characters_to_binary(json_escape_sequence(16#2029)), - json_escape(<>, Opts, L + size(B), Len + size(B) - size(<<16#2029/utf8>>)) + json_escape(<>, Opts, L + 6, Len + 3) end; <<_:L/binary, X/utf8, _/binary>> when X < 16#0080 -> json_escape(Str, Opts, L + 1, Len); <<_:L/binary, X/utf8, _/binary>> when X < 16#0800 -> json_escape(Str, Opts, L + 2, Len); - <<_:L/binary, X/utf8, _/binary>> when X < 16#10000 -> + <<_:L/binary, X/utf8, _/binary>> when X < 16#dcff -> json_escape(Str, Opts, L + 3, Len); - <<_:L/binary, _/utf8, _/binary>> -> + <<_:L/binary, X/utf8, _/binary>> when X > 16#dfff, X < 16#fdd0 -> + json_escape(Str, Opts, L + 3, Len); + <<_:L/binary, X/utf8, _/binary>> when X > 16#fdef, X < 16#fffe -> + json_escape(Str, Opts, L + 3, Len); + <> when X < 16#10000 -> + case Opts#opts.loose_unicode of + true -> json_escape(<>, Opts, L + 3, Len); + false -> erlang:error(badarg, [Str, Opts]) + end; + <> + when X == 16#1fffe; X == 16#1ffff; + X == 16#2fffe; X == 16#2ffff; + X == 16#3fffe; X == 16#3ffff; + X == 16#4fffe; X == 16#4ffff; + X == 16#5fffe; X == 16#5ffff; + X == 16#6fffe; X == 16#6ffff; + X == 16#7fffe; X == 16#7ffff; + X == 16#8fffe; X == 16#8ffff; + X == 16#9fffe; X == 16#9ffff; + X == 16#afffe; X == 16#affff; + X == 16#bfffe; X == 16#bffff; + X == 16#cfffe; X == 16#cffff; + X == 16#dfffe; X == 16#dffff; + X == 16#efffe; X == 16#effff; + X == 16#ffffe; X == 16#fffff; + X == 16#10fffe; X == 16#10ffff -> + case Opts#opts.loose_unicode of + true -> json_escape(<>, Opts, L + 3, Len - 1); + false -> erlang:error(badarg, [Str, Opts]) + end; + <<_:L/binary, X/utf8, _/binary>> when X >= 16#10000 -> json_escape(Str, Opts, L + 4, Len); <> when X >= 160 -> case Opts#opts.loose_unicode of true -> json_escape(<>, Opts, L + 3, Len); false -> erlang:error(badarg, [Str, Opts]) end; + <> when X == 190; X == 191 -> + case Opts#opts.loose_unicode of + true -> json_escape(<>, Opts, L + 3, Len); + false -> erlang:error(badarg, [Str, Opts]) + end; + <> when X >= 192, X =< 223 -> + {Rest, Stripped} = strip_continuations(T, 1, 0), + json_escape(<>, Opts, L + 3, Len + 2 - Stripped); + <> when X >= 224, X =< 239 -> + {Rest, Stripped} = strip_continuations(T, 2, 0), + json_escape(<>, Opts, L + 3, Len + 2 - Stripped); + <> when X >= 240, X =< 247 -> + {Rest, Stripped} = strip_continuations(T, 3, 0), + json_escape(<>, Opts, L + 3, Len + 2 - Stripped); <> -> case Opts#opts.loose_unicode of true -> json_escape(<>, Opts, L + 3, Len + 2); @@ -291,6 +335,12 @@ to_hex(15) -> $f; to_hex(X) -> X + 48. %% ascii "1" is [49], "2" is [50], etc... +strip_continuations(Bin, 0, N) -> {Bin, N}; +strip_continuations(<>, N, M) when X >= 128, X =< 191 -> + strip_continuations(Rest, N - 1, M + 1); +%% not a continuation byte +strip_continuations(Bin, _, N) -> {Bin, N}. + %% eunit tests -ifdef(TEST). From 2d2dd5f7c12cff9d93f7f797bc6f622d91a80bf3 Mon Sep 17 00:00:00 2001 From: alisdair sullivan Date: Wed, 28 Mar 2012 21:51:21 -0700 Subject: [PATCH 06/19] faster implementation of string cleaning --- src/jsx_encoder.erl | 142 ++++++++++++++++++-------------------------- 1 file changed, 59 insertions(+), 83 deletions(-) diff --git a/src/jsx_encoder.erl b/src/jsx_encoder.erl index f4655bc..573d870 100644 --- a/src/jsx_encoder.erl +++ b/src/jsx_encoder.erl @@ -106,93 +106,69 @@ fix_key(Key) when is_binary(Key) -> Key. clean_string(Bin, Opts) -> case Opts#opts.json_escape of true -> jsx_utils:json_escape(Bin, Opts); - false -> - case is_clean(Bin) of - true -> Bin; - false -> clean_string(Bin, [], Opts) + false -> clean_string(Bin, 0, size(Bin), Opts) + end. + + +clean_string(Str, Len, Len, _Opts) -> Str; +clean_string(Str, L, Len, Opts) -> + case Str of + <<_:L/binary, X/utf8, _/binary>> when X < 16#80 -> clean_string(Str, L + 1, Len, Opts) + ; <<_:L/binary, X/utf8, _/binary>> when X < 16#800 -> clean_string(Str, L + 2, Len, Opts) + ; <<_:L/binary, X/utf8, _/binary>> when X < 16#dcff -> clean_string(Str, L + 3, Len, Opts) + ; <<_:L/binary, X/utf8, _/binary>> when X > 16#dfff, X < 16#fdd0 -> clean_string(Str, L + 3, Len, Opts) + ; <<_:L/binary, X/utf8, _/binary>> when X > 16#fdef, X < 16#fffe -> clean_string(Str, L + 3, Len, Opts) + ; <<_:L/binary, X/utf8, _/binary>> when X >= 16#10000, X < 16#1fffe -> clean_string(Str, L + 4, Len, Opts) + ; <<_:L/binary, X/utf8, _/binary>> when X >= 16#20000, X < 16#2fffe -> clean_string(Str, L + 4, Len, Opts) + ; <<_:L/binary, X/utf8, _/binary>> when X >= 16#30000, X < 16#3fffe -> clean_string(Str, L + 4, Len, Opts) + ; <<_:L/binary, X/utf8, _/binary>> when X >= 16#40000, X < 16#4fffe -> clean_string(Str, L + 4, Len, Opts) + ; <<_:L/binary, X/utf8, _/binary>> when X >= 16#50000, X < 16#5fffe -> clean_string(Str, L + 4, Len, Opts) + ; <<_:L/binary, X/utf8, _/binary>> when X >= 16#60000, X < 16#6fffe -> clean_string(Str, L + 4, Len, Opts) + ; <<_:L/binary, X/utf8, _/binary>> when X >= 16#70000, X < 16#7fffe -> clean_string(Str, L + 4, Len, Opts) + ; <<_:L/binary, X/utf8, _/binary>> when X >= 16#80000, X < 16#8fffe -> clean_string(Str, L + 4, Len, Opts) + ; <<_:L/binary, X/utf8, _/binary>> when X >= 16#90000, X < 16#9fffe -> clean_string(Str, L + 4, Len, Opts) + ; <<_:L/binary, X/utf8, _/binary>> when X >= 16#a0000, X < 16#afffe -> clean_string(Str, L + 4, Len, Opts) + ; <<_:L/binary, X/utf8, _/binary>> when X >= 16#b0000, X < 16#bfffe -> clean_string(Str, L + 4, Len, Opts) + ; <<_:L/binary, X/utf8, _/binary>> when X >= 16#c0000, X < 16#cfffe -> clean_string(Str, L + 4, Len, Opts) + ; <<_:L/binary, X/utf8, _/binary>> when X >= 16#d0000, X < 16#dfffe -> clean_string(Str, L + 4, Len, Opts) + ; <<_:L/binary, X/utf8, _/binary>> when X >= 16#e0000, X < 16#efffe -> clean_string(Str, L + 4, Len, Opts) + ; <<_:L/binary, X/utf8, _/binary>> when X >= 16#f0000, X < 16#ffffe -> clean_string(Str, L + 4, Len, Opts) + ; <<_:L/binary, X/utf8, _/binary>> when X >= 16#100000, X < 16#10fffe -> clean_string(Str, L + 4, Len, Opts) + ; <> -> + case Opts#opts.loose_unicode of + true -> + case Rest of + %% surrogates + <<237, X, _, T/binary>> when X >= 160 -> + clean_string(<>, L + 3, Len, Opts) + %% u+fffe and u+ffff for R14BXX + ; <<239, 191, X, T/binary>> when X == 190; X == 191 -> + clean_string(<>, L + 3, Len, Opts) + %% overlong encodings and missing continuations of a 2 byte sequence + ; <> when X >= 192, X =< 223 -> + {Tail, Stripped} = strip_continuations(T, 1, 0), + clean_string(<>, L + 3, Len + 2 - Stripped, Opts) + %% overlong encodings and missing continuations of a 3 byte sequence + ; <> when X >= 224, X =< 239 -> + {Tail, Stripped} = strip_continuations(T, 2, 0), + clean_string(<>, L + 3, Len + 2 - Stripped, Opts) + ; <> when X >= 240, X =< 247 -> + {Tail, Stripped} = strip_continuations(T, 3, 0), + clean_string(<>, L + 3, Len + 2 - Stripped, Opts) + ; <<_, T/binary>> -> + clean_string(<>, L + 3, Len + 2, Opts) + end + ; false -> + erlang:error(badarg, [Str, Opts]) end end. -is_clean(<<>>) -> true; -is_clean(<>) -> - case X of - X when X >= 16#0000, X < 16#d800 -> is_clean(Rest) - ; X when X > 16#dfff, X < 16#fdd0 -> is_clean(Rest) - ; X when X > 16#fdef, X < 16#fffe -> is_clean(Rest) - ; X when X >= 16#10000, X < 16#1fffe -> is_clean(Rest) - ; X when X >= 16#20000, X < 16#2fffe -> is_clean(Rest) - ; X when X >= 16#30000, X < 16#3fffe -> is_clean(Rest) - ; X when X >= 16#40000, X < 16#4fffe -> is_clean(Rest) - ; X when X >= 16#50000, X < 16#5fffe -> is_clean(Rest) - ; X when X >= 16#60000, X < 16#6fffe -> is_clean(Rest) - ; X when X >= 16#70000, X < 16#7fffe -> is_clean(Rest) - ; X when X >= 16#80000, X < 16#8fffe -> is_clean(Rest) - ; X when X >= 16#90000, X < 16#9fffe -> is_clean(Rest) - ; X when X >= 16#a0000, X < 16#afffe -> is_clean(Rest) - ; X when X >= 16#b0000, X < 16#bfffe -> is_clean(Rest) - ; X when X >= 16#c0000, X < 16#cfffe -> is_clean(Rest) - ; X when X >= 16#d0000, X < 16#dfffe -> is_clean(Rest) - ; X when X >= 16#e0000, X < 16#efffe -> is_clean(Rest) - ; X when X >= 16#f0000, X < 16#ffffe -> is_clean(Rest) - ; X when X >= 16#100000, X < 16#10fffe -> is_clean(Rest) - ; _ -> false - end; -is_clean(_) -> false. - - -clean_string(Bin, _Acc, Opts=#opts{loose_unicode=false}) -> ?error([Bin, Opts]); -clean_string(<<>>, Acc, _Opts) -> unicode:characters_to_binary(lists:reverse(Acc)); -clean_string(<>, Acc, Opts) -> - case X of - X when X < 16#d800 -> clean_string(Rest, [X] ++ Acc, Opts) - ; X when X > 16#dfff, X < 16#fdd0 -> clean_string(Rest, [X] ++ Acc, Opts) - ; X when X > 16#fdef, X < 16#fffe -> clean_string(Rest, [X] ++ Acc, Opts) - ; X when X >= 16#10000, X < 16#1fffe -> clean_string(Rest, [X] ++ Acc, Opts) - ; X when X >= 16#20000, X < 16#2fffe -> clean_string(Rest, [X] ++ Acc, Opts) - ; X when X >= 16#30000, X < 16#3fffe -> clean_string(Rest, [X] ++ Acc, Opts) - ; X when X >= 16#40000, X < 16#4fffe -> clean_string(Rest, [X] ++ Acc, Opts) - ; X when X >= 16#50000, X < 16#5fffe -> clean_string(Rest, [X] ++ Acc, Opts) - ; X when X >= 16#60000, X < 16#6fffe -> clean_string(Rest, [X] ++ Acc, Opts) - ; X when X >= 16#70000, X < 16#7fffe -> clean_string(Rest, [X] ++ Acc, Opts) - ; X when X >= 16#80000, X < 16#8fffe -> clean_string(Rest, [X] ++ Acc, Opts) - ; X when X >= 16#90000, X < 16#9fffe -> clean_string(Rest, [X] ++ Acc, Opts) - ; X when X >= 16#a0000, X < 16#afffe -> clean_string(Rest, [X] ++ Acc, Opts) - ; X when X >= 16#b0000, X < 16#bfffe -> clean_string(Rest, [X] ++ Acc, Opts) - ; X when X >= 16#c0000, X < 16#cfffe -> clean_string(Rest, [X] ++ Acc, Opts) - ; X when X >= 16#d0000, X < 16#dfffe -> clean_string(Rest, [X] ++ Acc, Opts) - ; X when X >= 16#e0000, X < 16#efffe -> clean_string(Rest, [X] ++ Acc, Opts) - ; X when X >= 16#f0000, X < 16#ffffe -> clean_string(Rest, [X] ++ Acc, Opts) - ; X when X >= 16#100000, X < 16#10fffe -> clean_string(Rest, [X] ++ Acc, Opts) - ; _ -> clean_string(Rest, [16#fffd] ++ Acc, Opts) - end; -%% surrogates -clean_string(<<237, X, _, Rest/binary>>, Acc, Opts) when X >= 160 -> - clean_string(Rest, [16#fffd] ++ Acc, Opts); -%% u+fffe and u+ffff for R14BXX -clean_string(<<239, 191, X, Rest/binary>>, Acc, Opts) when X == 190; X == 191 -> - clean_string(Rest, [16#fffd] ++ Acc, Opts); -%% overlong encodings and missing continuations of a 2 byte sequence -clean_string(<>, Acc, Opts) when X >= 192, X =< 223 -> - clean_string(strip_continuations(Rest, 1), [16#fffd] ++ Acc, Opts); -%% overlong encodings and missing continuations of a 3 byte sequence -clean_string(<>, Acc, Opts) when X >= 224, X =< 239 -> - clean_string(strip_continuations(Rest, 2), [16#fffd] ++ Acc, Opts); -%% overlong encodings and missing continuations of a 4 byte sequence -clean_string(<>, Acc, Opts) when X >= 240, X =< 247 -> - clean_string(strip_continuations(Rest, 3), [16#fffd] ++ Acc, Opts); -%% bad codepoints -clean_string(<<_, Rest/binary>>, Acc, Opts) -> - clean_string(Rest, [16#fffd] ++ Acc, Opts). - - -%% strips continuation bytes after bad utf bytes, guards against both too short -%% and overlong sequences. N is the maximum number of bytes to strip -strip_continuations(Rest, 0) -> Rest; -strip_continuations(<>, N) when X >= 128, X =< 191 -> - strip_continuations(Rest, N - 1); -%% not a continuation byte, dispatch back to clean_string -strip_continuations(Rest, _) -> Rest. +strip_continuations(Bin, 0, N) -> {Bin, N}; +strip_continuations(<>, N, M) when X >= 128, X =< 191 -> + strip_continuations(Rest, N - 1, M + 1); +%% not a continuation byte +strip_continuations(Bin, _, N) -> {Bin, N}. -ifdef(TEST). From dc6a026e94851ef84475507e9977845dad650506 Mon Sep 17 00:00:00 2001 From: alisdair sullivan Date: Wed, 28 Mar 2012 23:31:07 -0700 Subject: [PATCH 07/19] add tests for bad utf sequences, fix failures --- src/jsx_decoder.erl | 201 +++++++++++++++++++++++++++++++++++++++ src/jsx_encoder.erl | 210 +++++++++++++++++++++++++++++++++++++++-- src/jsx_utils.erl | 225 ++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 622 insertions(+), 14 deletions(-) diff --git a/src/jsx_decoder.erl b/src/jsx_decoder.erl index 7c5d295..4ef6ed1 100644 --- a/src/jsx_decoder.erl +++ b/src/jsx_decoder.erl @@ -1041,6 +1041,207 @@ done(Bin, Handler, Stack, Opts) -> ?error([Bin, Handler, Stack, Opts]). -include_lib("eunit/include/eunit.hrl"). +xcode(Bin) -> xcode(Bin, []). + +xcode(Bin, Opts) -> + Size = size(Bin), + try jsx:to_term(<<34, Bin:Size/binary, 34>>, Opts) + catch error:badarg -> {error, badarg} + end. + + +is_bad({error, badarg}) -> true; +is_bad(_) -> false. + + +bad_utf8_test_() -> + [ + {"orphan continuation byte u+0080", + ?_assert(is_bad(xcode(<<16#0080>>))) + }, + {"orphan continuation byte u+0080 replaced", + ?_assertEqual(xcode(<<16#0080>>, [loose_unicode]), <<16#fffd/utf8>>) + }, + {"orphan continuation byte u+00bf", + ?_assert(is_bad(xcode(<<16#00bf>>))) + }, + {"orphan continuation byte u+00bf replaced", + ?_assertEqual(xcode(<<16#00bf>>, [loose_unicode]), <<16#fffd/utf8>>) + }, + {"2 continuation bytes", + ?_assert(is_bad(xcode(<<(binary:copy(<<16#0080>>, 2))/binary>>))) + }, + {"2 continuation bytes replaced", + ?_assertEqual( + xcode(<<(binary:copy(<<16#0080>>, 2))/binary>>, [loose_unicode]), + binary:copy(<<16#fffd/utf8>>, 2) + ) + }, + {"3 continuation bytes", + ?_assert(is_bad(xcode(<<(binary:copy(<<16#0080>>, 3))/binary>>))) + }, + {"3 continuation bytes replaced", + ?_assertEqual( + xcode(<<(binary:copy(<<16#0080>>, 3))/binary>>, [loose_unicode]), + binary:copy(<<16#fffd/utf8>>, 3) + ) + }, + {"4 continuation bytes", + ?_assert(is_bad(xcode(<<(binary:copy(<<16#0080>>, 4))/binary>>))) + }, + {"4 continuation bytes replaced", + ?_assertEqual( + xcode(<<(binary:copy(<<16#0080>>, 4))/binary>>, [loose_unicode]), + binary:copy(<<16#fffd/utf8>>, 4) + ) + }, + {"5 continuation bytes", + ?_assert(is_bad(xcode(<<(binary:copy(<<16#0080>>, 5))/binary>>))) + }, + {"5 continuation bytes replaced", + ?_assertEqual( + xcode(<<(binary:copy(<<16#0080>>, 5))/binary>>, [loose_unicode]), + binary:copy(<<16#fffd/utf8>>, 5) + ) + }, + {"6 continuation bytes", + ?_assert(is_bad(xcode(<<(binary:copy(<<16#0080>>, 6))/binary>>))) + }, + {"6 continuation bytes replaced", + ?_assertEqual( + xcode(<<(binary:copy(<<16#0080>>, 6))/binary>>, [loose_unicode]), + binary:copy(<<16#fffd/utf8>>, 6) + ) + }, + {"all continuation bytes", + ?_assert(is_bad(xcode(<<(list_to_binary(lists:seq(16#0080, 16#00bf)))/binary>>))) + }, + {"all continuation bytes replaced", + ?_assertEqual( + xcode(<<(list_to_binary(lists:seq(16#0080, 16#00bf)))/binary>>, [loose_unicode]), + binary:copy(<<16#fffd/utf8>>, length(lists:seq(16#0080, 16#00bf))) + ) + }, + {"lonely start byte", + ?_assert(is_bad(xcode(<<16#00c0>>))) + }, + {"lonely start byte replaced", + ?_assertEqual( + xcode(<<16#00c0>>, [loose_unicode]), + <<16#fffd/utf8>> + ) + }, + {"lonely start bytes (2 byte)", + ?_assert(is_bad(xcode(<<16#00c0, 32, 16#00df>>))) + }, + {"lonely start bytes (2 byte) replaced", + ?_assertEqual( + xcode(<<16#00c0, 32, 16#00df>>, [loose_unicode]), + <<16#fffd/utf8, 32, 16#fffd/utf8>> + ) + }, + {"lonely start bytes (3 byte)", + ?_assert(is_bad(xcode(<<16#00e0, 32, 16#00ef>>))) + }, + {"lonely start bytes (3 byte) replaced", + ?_assertEqual( + xcode(<<16#00e0, 32, 16#00ef>>, [loose_unicode]), + <<16#fffd/utf8, 32, 16#fffd/utf8>> + ) + }, + {"lonely start bytes (4 byte)", + ?_assert(is_bad(xcode(<<16#00f0, 32, 16#00f7>>))) + }, + {"lonely start bytes (4 byte) replaced", + ?_assertEqual( + xcode(<<16#00f0, 32, 16#00f7>>, [loose_unicode]), + <<16#fffd/utf8, 32, 16#fffd/utf8>> + ) + }, + {"missing continuation byte (3 byte)", + ?_assert(is_bad(xcode(<<224, 160, 32>>))) + }, + {"missing continuation byte (3 byte) replaced", + ?_assertEqual( + xcode(<<224, 160, 32>>, [loose_unicode]), + <<16#fffd/utf8, 32>> + ) + }, + {"missing continuation byte (4 byte missing one)", + ?_assert(is_bad(xcode(<<240, 144, 128, 32>>))) + }, + {"missing continuation byte2 (4 byte missing one) replaced", + ?_assertEqual( + xcode(<<240, 144, 128, 32>>, [loose_unicode]), + <<16#fffd/utf8, 32>> + ) + }, + {"missing continuation byte (4 byte missing two)", + ?_assert(is_bad(xcode(<<240, 144, 32>>))) + }, + {"missing continuation byte2 (4 byte missing two) replaced", + ?_assertEqual( + xcode(<<240, 144, 32>>, [loose_unicode]), + <<16#fffd/utf8, 32>> + ) + }, + {"overlong encoding of u+002f (2 byte)", + ?_assert(is_bad(xcode(<<16#c0, 16#af, 32>>))) + }, + {"overlong encoding of u+002f (2 byte) replaced", + ?_assertEqual( + xcode(<<16#c0, 16#af, 32>>, [loose_unicode]), + <<16#fffd/utf8, 32>> + ) + }, + {"overlong encoding of u+002f (3 byte)", + ?_assert(is_bad(xcode(<<16#e0, 16#80, 16#af, 32>>))) + }, + {"overlong encoding of u+002f (3 byte) replaced", + ?_assertEqual( + xcode(<<16#e0, 16#80, 16#af, 32>>, [loose_unicode]), + <<16#fffd/utf8, 32>> + ) + }, + {"overlong encoding of u+002f (4 byte)", + ?_assert(is_bad(xcode(<<16#f0, 16#80, 16#80, 16#af, 32>>))) + }, + {"overlong encoding of u+002f (4 byte) replaced", + ?_assertEqual( + xcode(<<16#f0, 16#80, 16#80, 16#af, 32>>, [loose_unicode]), + <<16#fffd/utf8, 32>> + ) + }, + {"highest overlong 2 byte sequence", + ?_assert(is_bad(xcode(<<16#c1, 16#bf, 32>>))) + }, + {"highest overlong 2 byte sequence replaced", + ?_assertEqual( + xcode(<<16#c1, 16#bf, 32>>, [loose_unicode]), + <<16#fffd/utf8, 32>> + ) + }, + {"highest overlong 3 byte sequence", + ?_assert(is_bad(xcode(<<16#e0, 16#9f, 16#bf, 32>>))) + }, + {"highest overlong 3 byte sequence replaced", + ?_assertEqual( + xcode(<<16#e0, 16#9f, 16#bf, 32>>, [loose_unicode]), + <<16#fffd/utf8, 32>> + ) + }, + {"highest overlong 4 byte sequence", + ?_assert(is_bad(xcode(<<16#f0, 16#8f, 16#bf, 16#bf, 32>>))) + }, + {"highest overlong 4 byte sequence replaced", + ?_assertEqual( + xcode(<<16#f0, 16#8f, 16#bf, 16#bf, 32>>, [loose_unicode]), + <<16#fffd/utf8, 32>> + ) + } + ]. + + comments_test_() -> [ {"preceeding // comment", ?_assertEqual( diff --git a/src/jsx_encoder.erl b/src/jsx_encoder.erl index 573d870..079d454 100644 --- a/src/jsx_encoder.erl +++ b/src/jsx_encoder.erl @@ -174,6 +174,208 @@ strip_continuations(Bin, _, N) -> {Bin, N}. -ifdef(TEST). -include_lib("eunit/include/eunit.hrl"). + +xcode(Bin) -> xcode(Bin, #opts{}). + +xcode(Bin, [loose_unicode]) -> xcode(Bin, #opts{loose_unicode=true}); +xcode(Bin, Opts) -> + try clean_string(Bin, Opts) + catch error:badarg -> {error, badarg} + end. + + +is_bad({error, badarg}) -> true; +is_bad(_) -> false. + + +bad_utf8_test_() -> + [ + {"orphan continuation byte u+0080", + ?_assert(is_bad(xcode(<<16#0080>>))) + }, + {"orphan continuation byte u+0080 replaced", + ?_assertEqual(xcode(<<16#0080>>, [loose_unicode]), <<16#fffd/utf8>>) + }, + {"orphan continuation byte u+00bf", + ?_assert(is_bad(xcode(<<16#00bf>>))) + }, + {"orphan continuation byte u+00bf replaced", + ?_assertEqual(xcode(<<16#00bf>>, [loose_unicode]), <<16#fffd/utf8>>) + }, + {"2 continuation bytes", + ?_assert(is_bad(xcode(<<(binary:copy(<<16#0080>>, 2))/binary>>))) + }, + {"2 continuation bytes replaced", + ?_assertEqual( + xcode(<<(binary:copy(<<16#0080>>, 2))/binary>>, [loose_unicode]), + binary:copy(<<16#fffd/utf8>>, 2) + ) + }, + {"3 continuation bytes", + ?_assert(is_bad(xcode(<<(binary:copy(<<16#0080>>, 3))/binary>>))) + }, + {"3 continuation bytes replaced", + ?_assertEqual( + xcode(<<(binary:copy(<<16#0080>>, 3))/binary>>, [loose_unicode]), + binary:copy(<<16#fffd/utf8>>, 3) + ) + }, + {"4 continuation bytes", + ?_assert(is_bad(xcode(<<(binary:copy(<<16#0080>>, 4))/binary>>))) + }, + {"4 continuation bytes replaced", + ?_assertEqual( + xcode(<<(binary:copy(<<16#0080>>, 4))/binary>>, [loose_unicode]), + binary:copy(<<16#fffd/utf8>>, 4) + ) + }, + {"5 continuation bytes", + ?_assert(is_bad(xcode(<<(binary:copy(<<16#0080>>, 5))/binary>>))) + }, + {"5 continuation bytes replaced", + ?_assertEqual( + xcode(<<(binary:copy(<<16#0080>>, 5))/binary>>, [loose_unicode]), + binary:copy(<<16#fffd/utf8>>, 5) + ) + }, + {"6 continuation bytes", + ?_assert(is_bad(xcode(<<(binary:copy(<<16#0080>>, 6))/binary>>))) + }, + {"6 continuation bytes replaced", + ?_assertEqual( + xcode(<<(binary:copy(<<16#0080>>, 6))/binary>>, [loose_unicode]), + binary:copy(<<16#fffd/utf8>>, 6) + ) + }, + {"all continuation bytes", + ?_assert(is_bad(xcode(<<(list_to_binary(lists:seq(16#0080, 16#00bf)))/binary>>))) + }, + {"all continuation bytes replaced", + ?_assertEqual( + xcode(<<(list_to_binary(lists:seq(16#0080, 16#00bf)))/binary>>, [loose_unicode]), + binary:copy(<<16#fffd/utf8>>, length(lists:seq(16#0080, 16#00bf))) + ) + }, + {"lonely start byte", + ?_assert(is_bad(xcode(<<16#00c0>>))) + }, + {"lonely start byte replaced", + ?_assertEqual( + xcode(<<16#00c0>>, [loose_unicode]), + <<16#fffd/utf8>> + ) + }, + {"lonely start bytes (2 byte)", + ?_assert(is_bad(xcode(<<16#00c0, 32, 16#00df>>))) + }, + {"lonely start bytes (2 byte) replaced", + ?_assertEqual( + xcode(<<16#00c0, 32, 16#00df>>, [loose_unicode]), + <<16#fffd/utf8, 32, 16#fffd/utf8>> + ) + }, + {"lonely start bytes (3 byte)", + ?_assert(is_bad(xcode(<<16#00e0, 32, 16#00ef>>))) + }, + {"lonely start bytes (3 byte) replaced", + ?_assertEqual( + xcode(<<16#00e0, 32, 16#00ef>>, [loose_unicode]), + <<16#fffd/utf8, 32, 16#fffd/utf8>> + ) + }, + {"lonely start bytes (4 byte)", + ?_assert(is_bad(xcode(<<16#00f0, 32, 16#00f7>>))) + }, + {"lonely start bytes (4 byte) replaced", + ?_assertEqual( + xcode(<<16#00f0, 32, 16#00f7>>, [loose_unicode]), + <<16#fffd/utf8, 32, 16#fffd/utf8>> + ) + }, + {"missing continuation byte (3 byte)", + ?_assert(is_bad(xcode(<<224, 160, 32>>))) + }, + {"missing continuation byte (3 byte) replaced", + ?_assertEqual( + xcode(<<224, 160, 32>>, [loose_unicode]), + <<16#fffd/utf8, 32>> + ) + }, + {"missing continuation byte (4 byte missing one)", + ?_assert(is_bad(xcode(<<240, 144, 128, 32>>))) + }, + {"missing continuation byte2 (4 byte missing one) replaced", + ?_assertEqual( + xcode(<<240, 144, 128, 32>>, [loose_unicode]), + <<16#fffd/utf8, 32>> + ) + }, + {"missing continuation byte (4 byte missing two)", + ?_assert(is_bad(xcode(<<240, 144, 32>>))) + }, + {"missing continuation byte2 (4 byte missing two) replaced", + ?_assertEqual( + xcode(<<240, 144, 32>>, [loose_unicode]), + <<16#fffd/utf8, 32>> + ) + }, + {"overlong encoding of u+002f (2 byte)", + ?_assert(is_bad(xcode(<<16#c0, 16#af, 32>>))) + }, + {"overlong encoding of u+002f (2 byte) replaced", + ?_assertEqual( + xcode(<<16#c0, 16#af, 32>>, [loose_unicode]), + <<16#fffd/utf8, 32>> + ) + }, + {"overlong encoding of u+002f (3 byte)", + ?_assert(is_bad(xcode(<<16#e0, 16#80, 16#af, 32>>))) + }, + {"overlong encoding of u+002f (3 byte) replaced", + ?_assertEqual( + xcode(<<16#e0, 16#80, 16#af, 32>>, [loose_unicode]), + <<16#fffd/utf8, 32>> + ) + }, + {"overlong encoding of u+002f (4 byte)", + ?_assert(is_bad(xcode(<<16#f0, 16#80, 16#80, 16#af, 32>>))) + }, + {"overlong encoding of u+002f (4 byte) replaced", + ?_assertEqual( + xcode(<<16#f0, 16#80, 16#80, 16#af, 32>>, [loose_unicode]), + <<16#fffd/utf8, 32>> + ) + }, + {"highest overlong 2 byte sequence", + ?_assert(is_bad(xcode(<<16#c1, 16#bf, 32>>))) + }, + {"highest overlong 2 byte sequence replaced", + ?_assertEqual( + xcode(<<16#c1, 16#bf, 32>>, [loose_unicode]), + <<16#fffd/utf8, 32>> + ) + }, + {"highest overlong 3 byte sequence", + ?_assert(is_bad(xcode(<<16#e0, 16#9f, 16#bf, 32>>))) + }, + {"highest overlong 3 byte sequence replaced", + ?_assertEqual( + xcode(<<16#e0, 16#9f, 16#bf, 32>>, [loose_unicode]), + <<16#fffd/utf8, 32>> + ) + }, + {"highest overlong 4 byte sequence", + ?_assert(is_bad(xcode(<<16#f0, 16#8f, 16#bf, 16#bf, 32>>))) + }, + {"highest overlong 4 byte sequence replaced", + ?_assertEqual( + xcode(<<16#f0, 16#8f, 16#bf, 16#bf, 32>>, [loose_unicode]), + <<16#fffd/utf8, 32>> + ) + } + ]. + + encode(Term) -> (encoder(jsx, [], []))(Term). encode(Term, Opts) -> @@ -303,14 +505,6 @@ extended_noncharacters_test_() -> } ]. -malformed_test_() -> - [ - {"malformed codepoint with 1 byte", ?_assertError(badarg, encode(<<128>>))}, - {"malformed codepoint with 2 bytes", ?_assertError(badarg, encode(<<128, 192>>))}, - {"malformed codepoint with 3 bytes", ?_assertError(badarg, encode(<<128, 192, 192>>))}, - {"malformed codepoint with 4 bytes", ?_assertError(badarg, encode(<<128, 192, 192, 192>>))} - ]. - check_bad(List) -> lists:dropwhile(fun({_, {error, badjson}}) -> true ; (_) -> false end, diff --git a/src/jsx_utils.erl b/src/jsx_utils.erl index 96a9f53..aea689c 100644 --- a/src/jsx_utils.erl +++ b/src/jsx_utils.erl @@ -302,14 +302,26 @@ json_escape(Str, Opts, L, Len) when L < Len -> false -> erlang:error(badarg, [Str, Opts]) end; <> when X >= 192, X =< 223 -> - {Rest, Stripped} = strip_continuations(T, 1, 0), - json_escape(<>, Opts, L + 3, Len + 2 - Stripped); + case Opts#opts.loose_unicode of + true -> + {Rest, Stripped} = strip_continuations(T, 1, 0), + json_escape(<>, Opts, L + 3, Len + 2 - Stripped); + false -> erlang:error(badarg, [Str, Opts]) + end; <> when X >= 224, X =< 239 -> - {Rest, Stripped} = strip_continuations(T, 2, 0), - json_escape(<>, Opts, L + 3, Len + 2 - Stripped); + case Opts#opts.loose_unicode of + true -> + {Rest, Stripped} = strip_continuations(T, 2, 0), + json_escape(<>, Opts, L + 3, Len + 2 - Stripped); + false -> erlang:error(badarg, [Str, Opts]) + end; <> when X >= 240, X =< 247 -> - {Rest, Stripped} = strip_continuations(T, 3, 0), - json_escape(<>, Opts, L + 3, Len + 2 - Stripped); + case Opts#opts.loose_unicode of + true -> + {Rest, Stripped} = strip_continuations(T, 3, 0), + json_escape(<>, Opts, L + 3, Len + 2 - Stripped); + false -> erlang:error(badarg, [Str, Opts]) + end; <> -> case Opts#opts.loose_unicode of true -> json_escape(<>, Opts, L + 3, Len + 2); @@ -347,6 +359,207 @@ strip_continuations(Bin, _, N) -> {Bin, N}. -include_lib("eunit/include/eunit.hrl"). +xcode(Bin) -> xcode(Bin, #opts{}). + +xcode(Bin, [loose_unicode]) -> xcode(Bin, #opts{loose_unicode=true}); +xcode(Bin, Opts) -> + try json_escape(Bin, Opts) + catch error:badarg -> {error, badarg} + end. + + +is_bad({error, badarg}) -> true; +is_bad(_) -> false. + + +bad_utf8_test_() -> + [ + {"orphan continuation byte u+0080", + ?_assert(is_bad(xcode(<<16#0080>>))) + }, + {"orphan continuation byte u+0080 replaced", + ?_assertEqual(xcode(<<16#0080>>, [loose_unicode]), <<16#fffd/utf8>>) + }, + {"orphan continuation byte u+00bf", + ?_assert(is_bad(xcode(<<16#00bf>>))) + }, + {"orphan continuation byte u+00bf replaced", + ?_assertEqual(xcode(<<16#00bf>>, [loose_unicode]), <<16#fffd/utf8>>) + }, + {"2 continuation bytes", + ?_assert(is_bad(xcode(<<(binary:copy(<<16#0080>>, 2))/binary>>))) + }, + {"2 continuation bytes replaced", + ?_assertEqual( + xcode(<<(binary:copy(<<16#0080>>, 2))/binary>>, [loose_unicode]), + binary:copy(<<16#fffd/utf8>>, 2) + ) + }, + {"3 continuation bytes", + ?_assert(is_bad(xcode(<<(binary:copy(<<16#0080>>, 3))/binary>>))) + }, + {"3 continuation bytes replaced", + ?_assertEqual( + xcode(<<(binary:copy(<<16#0080>>, 3))/binary>>, [loose_unicode]), + binary:copy(<<16#fffd/utf8>>, 3) + ) + }, + {"4 continuation bytes", + ?_assert(is_bad(xcode(<<(binary:copy(<<16#0080>>, 4))/binary>>))) + }, + {"4 continuation bytes replaced", + ?_assertEqual( + xcode(<<(binary:copy(<<16#0080>>, 4))/binary>>, [loose_unicode]), + binary:copy(<<16#fffd/utf8>>, 4) + ) + }, + {"5 continuation bytes", + ?_assert(is_bad(xcode(<<(binary:copy(<<16#0080>>, 5))/binary>>))) + }, + {"5 continuation bytes replaced", + ?_assertEqual( + xcode(<<(binary:copy(<<16#0080>>, 5))/binary>>, [loose_unicode]), + binary:copy(<<16#fffd/utf8>>, 5) + ) + }, + {"6 continuation bytes", + ?_assert(is_bad(xcode(<<(binary:copy(<<16#0080>>, 6))/binary>>))) + }, + {"6 continuation bytes replaced", + ?_assertEqual( + xcode(<<(binary:copy(<<16#0080>>, 6))/binary>>, [loose_unicode]), + binary:copy(<<16#fffd/utf8>>, 6) + ) + }, + {"all continuation bytes", + ?_assert(is_bad(xcode(<<(list_to_binary(lists:seq(16#0080, 16#00bf)))/binary>>))) + }, + {"all continuation bytes replaced", + ?_assertEqual( + xcode(<<(list_to_binary(lists:seq(16#0080, 16#00bf)))/binary>>, [loose_unicode]), + binary:copy(<<16#fffd/utf8>>, length(lists:seq(16#0080, 16#00bf))) + ) + }, + {"lonely start byte", + ?_assert(is_bad(xcode(<<16#00c0>>))) + }, + {"lonely start byte replaced", + ?_assertEqual( + xcode(<<16#00c0>>, [loose_unicode]), + <<16#fffd/utf8>> + ) + }, + {"lonely start bytes (2 byte)", + ?_assert(is_bad(xcode(<<16#00c0, 32, 16#00df>>))) + }, + {"lonely start bytes (2 byte) replaced", + ?_assertEqual( + xcode(<<16#00c0, 32, 16#00df>>, [loose_unicode]), + <<16#fffd/utf8, 32, 16#fffd/utf8>> + ) + }, + {"lonely start bytes (3 byte)", + ?_assert(is_bad(xcode(<<16#00e0, 32, 16#00ef>>))) + }, + {"lonely start bytes (3 byte) replaced", + ?_assertEqual( + xcode(<<16#00e0, 32, 16#00ef>>, [loose_unicode]), + <<16#fffd/utf8, 32, 16#fffd/utf8>> + ) + }, + {"lonely start bytes (4 byte)", + ?_assert(is_bad(xcode(<<16#00f0, 32, 16#00f7>>))) + }, + {"lonely start bytes (4 byte) replaced", + ?_assertEqual( + xcode(<<16#00f0, 32, 16#00f7>>, [loose_unicode]), + <<16#fffd/utf8, 32, 16#fffd/utf8>> + ) + }, + {"missing continuation byte (3 byte)", + ?_assert(is_bad(xcode(<<224, 160, 32>>))) + }, + {"missing continuation byte (3 byte) replaced", + ?_assertEqual( + xcode(<<224, 160, 32>>, [loose_unicode]), + <<16#fffd/utf8, 32>> + ) + }, + {"missing continuation byte (4 byte missing one)", + ?_assert(is_bad(xcode(<<240, 144, 128, 32>>))) + }, + {"missing continuation byte2 (4 byte missing one) replaced", + ?_assertEqual( + xcode(<<240, 144, 128, 32>>, [loose_unicode]), + <<16#fffd/utf8, 32>> + ) + }, + {"missing continuation byte (4 byte missing two)", + ?_assert(is_bad(xcode(<<240, 144, 32>>))) + }, + {"missing continuation byte2 (4 byte missing two) replaced", + ?_assertEqual( + xcode(<<240, 144, 32>>, [loose_unicode]), + <<16#fffd/utf8, 32>> + ) + }, + {"overlong encoding of u+002f (2 byte)", + ?_assert(is_bad(xcode(<<16#c0, 16#af, 32>>))) + }, + {"overlong encoding of u+002f (2 byte) replaced", + ?_assertEqual( + xcode(<<16#c0, 16#af, 32>>, [loose_unicode]), + <<16#fffd/utf8, 32>> + ) + }, + {"overlong encoding of u+002f (3 byte)", + ?_assert(is_bad(xcode(<<16#e0, 16#80, 16#af, 32>>))) + }, + {"overlong encoding of u+002f (3 byte) replaced", + ?_assertEqual( + xcode(<<16#e0, 16#80, 16#af, 32>>, [loose_unicode]), + <<16#fffd/utf8, 32>> + ) + }, + {"overlong encoding of u+002f (4 byte)", + ?_assert(is_bad(xcode(<<16#f0, 16#80, 16#80, 16#af, 32>>))) + }, + {"overlong encoding of u+002f (4 byte) replaced", + ?_assertEqual( + xcode(<<16#f0, 16#80, 16#80, 16#af, 32>>, [loose_unicode]), + <<16#fffd/utf8, 32>> + ) + }, + {"highest overlong 2 byte sequence", + ?_assert(is_bad(xcode(<<16#c1, 16#bf, 32>>))) + }, + {"highest overlong 2 byte sequence replaced", + ?_assertEqual( + xcode(<<16#c1, 16#bf, 32>>, [loose_unicode]), + <<16#fffd/utf8, 32>> + ) + }, + {"highest overlong 3 byte sequence", + ?_assert(is_bad(xcode(<<16#e0, 16#9f, 16#bf, 32>>))) + }, + {"highest overlong 3 byte sequence replaced", + ?_assertEqual( + xcode(<<16#e0, 16#9f, 16#bf, 32>>, [loose_unicode]), + <<16#fffd/utf8, 32>> + ) + }, + {"highest overlong 4 byte sequence", + ?_assert(is_bad(xcode(<<16#f0, 16#8f, 16#bf, 16#bf, 32>>))) + }, + {"highest overlong 4 byte sequence replaced", + ?_assertEqual( + xcode(<<16#f0, 16#8f, 16#bf, 16#bf, 32>>, [loose_unicode]), + <<16#fffd/utf8, 32>> + ) + } + ]. + + binary_escape_test_() -> [ {"json string escaping", From 65449753791a9dd259a32ac289cb6f191b7f63e1 Mon Sep 17 00:00:00 2001 From: alisdair sullivan Date: Wed, 28 Mar 2012 23:46:06 -0700 Subject: [PATCH 08/19] add more comprehensive checking to json_escape --- src/jsx_utils.erl | 136 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 127 insertions(+), 9 deletions(-) diff --git a/src/jsx_utils.erl b/src/jsx_utils.erl index aea689c..7c688f9 100644 --- a/src/jsx_utils.erl +++ b/src/jsx_utils.erl @@ -592,15 +592,6 @@ binary_escape_test_() -> <<"\\/Date(1303502009425)\\/">> ) }, - {"bad utf8", - ?_assertError(badarg, json_escape(<<32, 64, 128, 255>>, #opts{})) - }, - {"bad utf8 ok", - ?_assertEqual( - json_escape(<<32, 64, 128, 255>>, #opts{loose_unicode=true}), - <<32, 64, 16#fffd/utf8, 16#fffd/utf8>> - ) - }, {"bad surrogate", ?_assertError(badarg, json_escape(<<237, 160, 127>>, #opts{}))}, {"bad surrogate ok", ?_assertEqual( @@ -616,4 +607,131 @@ binary_escape_test_() -> } ]. + +surrogates_test_() -> + [ + {"surrogates - badjson", + ?_assertEqual(check_bad(surrogates()), []) + }, + {"surrogates - replaced", + ?_assertEqual(check_replaced(surrogates()), []) + } + ]. + + +good_characters_test_() -> + [ + {"acceptable codepoints", + ?_assertEqual(check_good(good()), []) + }, + {"acceptable extended", + ?_assertEqual(check_good(good_extended()), []) + } + ]. + + +reserved_test_() -> + [ + {"reserved noncharacters - badjson", + ?_assertEqual(check_bad(reserved_space()), []) + }, + {"reserved noncharacters - replaced", + ?_assertEqual(check_replaced(reserved_space()), []) + } + ]. + + +noncharacters_test_() -> + [ + {"noncharacters - badjson", + ?_assertEqual(check_bad(noncharacters()), []) + }, + {"noncharacters - replaced", + ?_assertEqual(check_replaced(noncharacters()), []) + } + ]. + + +extended_noncharacters_test_() -> + [ + {"extended noncharacters - badjson", + ?_assertEqual(check_bad(extended_noncharacters()), []) + }, + {"extended noncharacters - replaced", + ?_assertEqual(check_replaced(extended_noncharacters()), []) + } + ]. + + +check_bad(List) -> + lists:dropwhile(fun({_, {error, badjson}}) -> true ; (_) -> false end, + check(List, #opts{}, []) + ). + + +check_replaced(List) -> + lists:dropwhile(fun({_, <<16#fffd/utf8>>}) -> true + ; (_) -> false + end, + check(List, #opts{loose_unicode=true}, []) + ). + + +check_good(List) -> + lists:dropwhile(fun({_, _}) -> true ; (_) -> false end, + check(List, #opts{}, []) + ). + + +check([], _Opts, Acc) -> Acc; +check([H|T], Opts, Acc) -> + R = escape(to_fake_utf(H, utf8), Opts), + check(T, Opts, [{H, R}] ++ Acc). + + +escape(JSON, Opts) -> + try json_escape(JSON, Opts) + catch error:badarg -> {error, badjson} + end. + + +noncharacters() -> lists:seq(16#fffe, 16#ffff). + + +extended_noncharacters() -> + [16#1fffe, 16#1ffff, 16#2fffe, 16#2ffff] + ++ [16#3fffe, 16#3ffff, 16#4fffe, 16#4ffff] + ++ [16#5fffe, 16#5ffff, 16#6fffe, 16#6ffff] + ++ [16#7fffe, 16#7ffff, 16#8fffe, 16#8ffff] + ++ [16#9fffe, 16#9ffff, 16#afffe, 16#affff] + ++ [16#bfffe, 16#bffff, 16#cfffe, 16#cffff] + ++ [16#dfffe, 16#dffff, 16#efffe, 16#effff] + ++ [16#ffffe, 16#fffff, 16#10fffe, 16#10ffff]. + + +surrogates() -> lists:seq(16#d800, 16#dfff). + + +reserved_space() -> lists:seq(16#fdd0, 16#fdef). + + +good() -> lists:seq(16#0000, 16#d7ff) ++ lists:seq(16#e000, 16#fdcf) ++ lists:seq(16#fdf0, 16#fffd). + + +good_extended() -> lists:seq(16#100000, 16#10fffd). + + +%% erlang refuses to encode certain codepoints, so fake them all +to_fake_utf(N, utf8) when N < 16#0080 -> <>; +to_fake_utf(N, utf8) when N < 16#0800 -> + <<0:5, Y:5, X:6>> = <>, + <<2#110:3, Y:5, 2#10:2, X:6>>; +to_fake_utf(N, utf8) when N < 16#10000 -> + <> = <>, + <<2#1110:4, Z:4, 2#10:2, Y:6, 2#10:2, X:6>>; +to_fake_utf(N, utf8) -> + <<0:3, W:3, Z:6, Y:6, X:6>> = <>, + <<2#11110:5, W:3, 2#10:2, Z:6, 2#10:2, Y:6, 2#10:2, X:6>>. + + -endif. \ No newline at end of file From 9d2448669e8d8b64ced9d4d70c33db3fb58e6042 Mon Sep 17 00:00:00 2001 From: alisdair sullivan Date: Wed, 28 Mar 2012 23:46:18 -0700 Subject: [PATCH 09/19] whitespace --- src/jsx_decoder.erl | 21 +++++++++++++++++++-- src/jsx_encoder.erl | 21 ++++++++++++++++++--- 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/src/jsx_decoder.erl b/src/jsx_decoder.erl index 4ef6ed1..77ae2a6 100644 --- a/src/jsx_decoder.erl +++ b/src/jsx_decoder.erl @@ -1426,6 +1426,7 @@ comments_test_() -> )} ]. + escape_forward_slash_test_() -> [ {"escape forward slash test", ?_assertEqual( @@ -1434,6 +1435,7 @@ escape_forward_slash_test_() -> )} ]. + noncharacters_test_() -> [ {"noncharacters - badjson", @@ -1444,6 +1446,7 @@ noncharacters_test_() -> } ]. + extended_noncharacters_test_() -> [ {"extended noncharacters - badjson", @@ -1454,6 +1457,7 @@ extended_noncharacters_test_() -> } ]. + surrogates_test_() -> [ {"surrogates - badjson", @@ -1464,6 +1468,7 @@ surrogates_test_() -> } ]. + control_test_() -> [ {"control characters - badjson", @@ -1471,6 +1476,7 @@ control_test_() -> } ]. + reserved_test_() -> [ {"reserved noncharacters - badjson", @@ -1480,6 +1486,7 @@ reserved_test_() -> ?_assertEqual(check_replaced(reserved_space()), []) } ]. + good_characters_test_() -> [ @@ -1497,6 +1504,7 @@ check_bad(List) -> check(List, [], []) ). + check_replaced(List) -> lists:dropwhile(fun({_, [{string, <<16#fffd/utf8>>}|_]}) -> true ; (_) -> false @@ -1504,11 +1512,13 @@ check_replaced(List) -> check(List, [loose_unicode], []) ). + check_good(List) -> lists:dropwhile(fun({_, [{string, _}|_]}) -> true ; (_) -> false end, check(List, [], []) ). + check([], _Opts, Acc) -> Acc; check([H|T], Opts, Acc) -> R = decode(to_fake_utf(H, utf8), Opts), @@ -1524,7 +1534,8 @@ decode(JSON, Opts) -> noncharacters() -> lists:seq(16#fffe, 16#ffff). - + + extended_noncharacters() -> [16#1fffe, 16#1ffff, 16#2fffe, 16#2ffff] ++ [16#3fffe, 16#3ffff, 16#4fffe, 16#4ffff] @@ -1535,20 +1546,26 @@ extended_noncharacters() -> ++ [16#dfffe, 16#dffff, 16#efffe, 16#effff] ++ [16#ffffe, 16#fffff, 16#10fffe, 16#10ffff]. + surrogates() -> lists:seq(16#d800, 16#dfff). + control_characters() -> lists:seq(1, 31). + reserved_space() -> lists:seq(16#fdd0, 16#fdef). + good() -> [32, 33] ++ lists:seq(16#23, 16#5b) ++ lists:seq(16#5d, 16#d7ff) ++ lists:seq(16#e000, 16#fdcf) ++ lists:seq(16#fdf0, 16#fffd). - + + good_extended() -> lists:seq(16#100000, 16#10fffd). + %% erlang refuses to encode certain codepoints, so fake them all to_fake_utf(N, utf8) when N < 16#0080 -> <<34/utf8, N:8, 34/utf8>>; to_fake_utf(N, utf8) when N < 16#0800 -> diff --git a/src/jsx_encoder.erl b/src/jsx_encoder.erl index 079d454..b1e95f7 100644 --- a/src/jsx_encoder.erl +++ b/src/jsx_encoder.erl @@ -455,6 +455,7 @@ encode_test_() -> } ]. + surrogates_test_() -> [ {"surrogates - badjson", @@ -464,7 +465,8 @@ surrogates_test_() -> ?_assertEqual(check_replaced(surrogates()), []) } ]. - + + good_characters_test_() -> [ {"acceptable codepoints", @@ -475,6 +477,7 @@ good_characters_test_() -> } ]. + reserved_test_() -> [ {"reserved noncharacters - badjson", @@ -485,6 +488,7 @@ reserved_test_() -> } ]. + noncharacters_test_() -> [ {"noncharacters - badjson", @@ -495,6 +499,7 @@ noncharacters_test_() -> } ]. + extended_noncharacters_test_() -> [ {"extended noncharacters - badjson", @@ -511,6 +516,7 @@ check_bad(List) -> check(List, [], []) ). + check_replaced(List) -> lists:dropwhile(fun({_, [{string, <<16#fffd/utf8>>}|_]}) -> true ; (_) -> false @@ -518,11 +524,13 @@ check_replaced(List) -> check(List, [loose_unicode], []) ). + check_good(List) -> lists:dropwhile(fun({_, [{string, _}|_]}) -> true ; (_) -> false end, check(List, [], []) ). + check([], _Opts, Acc) -> Acc; check([H|T], Opts, Acc) -> R = encode(to_fake_utf(H, utf8), Opts), @@ -530,7 +538,8 @@ check([H|T], Opts, Acc) -> noncharacters() -> lists:seq(16#fffe, 16#ffff). - + + extended_noncharacters() -> [16#1fffe, 16#1ffff, 16#2fffe, 16#2ffff] ++ [16#3fffe, 16#3ffff, 16#4fffe, 16#4ffff] @@ -541,14 +550,19 @@ extended_noncharacters() -> ++ [16#dfffe, 16#dffff, 16#efffe, 16#effff] ++ [16#ffffe, 16#fffff, 16#10fffe, 16#10ffff]. + surrogates() -> lists:seq(16#d800, 16#dfff). + reserved_space() -> lists:seq(16#fdd0, 16#fdef). + good() -> lists:seq(16#0000, 16#d7ff) ++ lists:seq(16#e000, 16#fdcf) ++ lists:seq(16#fdf0, 16#fffd). - + + good_extended() -> lists:seq(16#100000, 16#10fffd). + %% erlang refuses to encode certain codepoints, so fake them all to_fake_utf(N, utf8) when N < 16#0080 -> <>; to_fake_utf(N, utf8) when N < 16#0800 -> @@ -561,4 +575,5 @@ to_fake_utf(N, utf8) -> <<0:3, W:3, Z:6, Y:6, X:6>> = <>, <<2#11110:5, W:3, 2#10:2, Z:6, 2#10:2, Y:6, 2#10:2, X:6>>. + -endif. \ No newline at end of file From 19e710da55cffc4dc639ff23ba7510b735163e6a Mon Sep 17 00:00:00 2001 From: alisdair sullivan Date: Thu, 29 Mar 2012 00:01:50 -0700 Subject: [PATCH 10/19] add dirty_strings option to bypass json escaping strings during encoding --- src/jsx_opts.hrl | 3 ++- src/jsx_utils.erl | 23 +++++++++++------------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/jsx_opts.hrl b/src/jsx_opts.hrl index a184dbe..942ae14 100644 --- a/src/jsx_opts.hrl +++ b/src/jsx_opts.hrl @@ -5,5 +5,6 @@ single_quotes = false, no_jsonp_escapes = false, comments = false, - json_escape = false + json_escape = false, + dirty_strings = false }). \ No newline at end of file diff --git a/src/jsx_utils.erl b/src/jsx_utils.erl index 7c688f9..479187f 100644 --- a/src/jsx_utils.erl +++ b/src/jsx_utils.erl @@ -51,6 +51,8 @@ parse_opts([comments|Rest], Opts) -> parse_opts(Rest, Opts#opts{comments=true}); parse_opts([json_escape|Rest], Opts) -> parse_opts(Rest, Opts#opts{json_escape=true}); +parse_opts([dirty_strings|Rest], Opts) -> + parse_opts(Rest, Opts#opts{json_escape=true}); parse_opts(_, _) -> {error, badarg}. @@ -63,7 +65,8 @@ valid_flags() -> single_quotes, no_jsonp_escapes, comments, - json_escape + json_escape, + dirty_strings ]. @@ -88,7 +91,10 @@ extract_parser_opts([K|Rest], Acc) -> %% everything else should be a legal json string component json_escape(String, Opts) when is_binary(String) -> - json_escape(String, Opts, 0, size(String)). + case Opts#opts.dirty_strings of + true -> String + ; false -> json_escape(String, Opts, 0, size(String)) + end. -define(control_character(X), @@ -592,17 +598,10 @@ binary_escape_test_() -> <<"\\/Date(1303502009425)\\/">> ) }, - {"bad surrogate", ?_assertError(badarg, json_escape(<<237, 160, 127>>, #opts{}))}, - {"bad surrogate ok", + {"dirty strings", ?_assertEqual( - json_escape(<<237, 160, 127>>, #opts{loose_unicode=true}), - <<16#fffd/utf8>> - ) - }, - {"all sizes of codepoints", - ?_assertEqual( - json_escape(unicode:characters_to_binary([0, 32, 16#80, 16#800, 16#10000]), #opts{}), - <<"\\u0000", 32/utf8, 16#80/utf8, 16#800/utf8, 16#10000/utf8>> + json_escape(<<"\\x25\\uffff">>, #opts{dirty_strings=true}), + <<"\\x25\\uffff">> ) } ]. From 73f4dadde59df22f226e3ed0ab73ffb937c091a0 Mon Sep 17 00:00:00 2001 From: alisdair sullivan Date: Thu, 29 Mar 2012 00:05:09 -0700 Subject: [PATCH 11/19] document dirty_strings in README --- README.markdown | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.markdown b/README.markdown index e9de71a..5b06ebe 100644 --- a/README.markdown +++ b/README.markdown @@ -127,6 +127,10 @@ javascript interpreters treat the codepoints `u+2028` and `u+2029` as significan json has no official comments but some parsers allow c style comments. this flag allows comments (both `// ...` and `/* ... */` style) anywhere whitespace is allowed +#### `dirty_strings` #### + +json escaping is lossy, it mutates the json string and repeated application can result in unwanted behaviour. if your strings are already escaped (or you'd like to force invalid strings into "json") use this flag to bypass escaping + ### incomplete input ### From 867199539c84ba90f1b7dd5f06b982ee6990b51a Mon Sep 17 00:00:00 2001 From: alisdair sullivan Date: Thu, 29 Mar 2012 00:13:08 -0700 Subject: [PATCH 12/19] add (failing) tests for bad escape sequences --- src/jsx_decoder.erl | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/jsx_decoder.erl b/src/jsx_decoder.erl index 77ae2a6..7015ac2 100644 --- a/src/jsx_decoder.erl +++ b/src/jsx_decoder.erl @@ -1242,6 +1242,19 @@ bad_utf8_test_() -> ]. +ignore_bad_escapes_test_() -> + [ + {"ignore unrecognized escape sequence", ?_assertEqual( + decode(<<"[\"\\x25\"]">>, [ignore_bad_escapes]), + [start_array, {string, <<"\\x25">>}, end_array, end_json] + )}, + {"ignore invalid \\uXXXX escape sequence", ?_assertEqual( + decode(<<"[\"\\uFFFF\"]">>, [ignore_bad_escapes]), + [start_array, {string, <<"\\uFFFF">>}, end_array, end_json] + )} + ]. + + comments_test_() -> [ {"preceeding // comment", ?_assertEqual( From 5f6d7c1f01dedef46ef6f37f759fb6873a988e76 Mon Sep 17 00:00:00 2001 From: alisdair sullivan Date: Thu, 29 Mar 2012 00:18:53 -0700 Subject: [PATCH 13/19] upon further consideration, ignoring bad \uuXXXX escape sequences could be dangerous, remove test --- src/jsx_decoder.erl | 6 ++---- src/jsx_opts.hrl | 3 ++- src/jsx_utils.erl | 5 ++++- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/jsx_decoder.erl b/src/jsx_decoder.erl index 7015ac2..a380022 100644 --- a/src/jsx_decoder.erl +++ b/src/jsx_decoder.erl @@ -594,6 +594,8 @@ escape(<<$u, Rest/binary>>, Handler, Stack, Opts) -> escaped_unicode(Rest, Handler, Stack, Opts); escape(<<>>, Handler, Stack, Opts) -> ?incomplete(escape, <<>>, Handler, Stack, Opts); +escape(Bin, Handler, [Acc|Stack], Opts=#opts{ignore_bad_escapes=true}) -> + string(Bin, Handler, [?acc_seq(Acc, ?rsolidus)|Stack], Opts); escape(Bin, Handler, Stack, Opts) -> ?error([Bin, Handler, Stack, Opts]). @@ -1247,10 +1249,6 @@ ignore_bad_escapes_test_() -> {"ignore unrecognized escape sequence", ?_assertEqual( decode(<<"[\"\\x25\"]">>, [ignore_bad_escapes]), [start_array, {string, <<"\\x25">>}, end_array, end_json] - )}, - {"ignore invalid \\uXXXX escape sequence", ?_assertEqual( - decode(<<"[\"\\uFFFF\"]">>, [ignore_bad_escapes]), - [start_array, {string, <<"\\uFFFF">>}, end_array, end_json] )} ]. diff --git a/src/jsx_opts.hrl b/src/jsx_opts.hrl index 942ae14..3db2dcb 100644 --- a/src/jsx_opts.hrl +++ b/src/jsx_opts.hrl @@ -6,5 +6,6 @@ no_jsonp_escapes = false, comments = false, json_escape = false, - dirty_strings = false + dirty_strings = false, + ignore_bad_escapes = false }). \ No newline at end of file diff --git a/src/jsx_utils.erl b/src/jsx_utils.erl index 479187f..71024d8 100644 --- a/src/jsx_utils.erl +++ b/src/jsx_utils.erl @@ -53,6 +53,8 @@ parse_opts([json_escape|Rest], Opts) -> parse_opts(Rest, Opts#opts{json_escape=true}); parse_opts([dirty_strings|Rest], Opts) -> parse_opts(Rest, Opts#opts{json_escape=true}); +parse_opts([ignore_bad_escapes|Rest], Opts) -> + parse_opts(Rest, Opts#opts{ignore_bad_escapes=true}); parse_opts(_, _) -> {error, badarg}. @@ -66,7 +68,8 @@ valid_flags() -> no_jsonp_escapes, comments, json_escape, - dirty_strings + dirty_strings, + ignore_bad_escapes ]. From f909a5bc6d6516d8b92d6073d16e0f9cc52903b2 Mon Sep 17 00:00:00 2001 From: alisdair sullivan Date: Thu, 29 Mar 2012 00:20:12 -0700 Subject: [PATCH 14/19] document ignore_bad_escapes --- README.markdown | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.markdown b/README.markdown index 5b06ebe..1d501fc 100644 --- a/README.markdown +++ b/README.markdown @@ -131,6 +131,10 @@ json has no official comments but some parsers allow c style comments. this flag json escaping is lossy, it mutates the json string and repeated application can result in unwanted behaviour. if your strings are already escaped (or you'd like to force invalid strings into "json") use this flag to bypass escaping +#### `ignore_bad_escapes` #### + +during decoding, ignore unrecognized escape sequences and leave them as is in the stream + ### incomplete input ### From c8ff83a39489072beb9f74de1b72be96f68c9264 Mon Sep 17 00:00:00 2001 From: alisdair sullivan Date: Thu, 29 Mar 2012 00:24:31 -0700 Subject: [PATCH 15/19] add relax option, a synonym for [loose_unicode, single_quotes, comments, ignore_bad_escapes] --- README.markdown | 4 ++++ src/jsx_utils.erl | 12 ++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/README.markdown b/README.markdown index 1d501fc..24281eb 100644 --- a/README.markdown +++ b/README.markdown @@ -135,6 +135,10 @@ json escaping is lossy, it mutates the json string and repeated application can during decoding, ignore unrecognized escape sequences and leave them as is in the stream +#### `relax` #### + +relax is a synonym for `[loose_unicode, single_quotes, comments, ignore_bad_escapes]` + ### incomplete input ### diff --git a/src/jsx_utils.erl b/src/jsx_utils.erl index 71024d8..5402875 100644 --- a/src/jsx_utils.erl +++ b/src/jsx_utils.erl @@ -52,9 +52,16 @@ parse_opts([comments|Rest], Opts) -> parse_opts([json_escape|Rest], Opts) -> parse_opts(Rest, Opts#opts{json_escape=true}); parse_opts([dirty_strings|Rest], Opts) -> - parse_opts(Rest, Opts#opts{json_escape=true}); + parse_opts(Rest, Opts#opts{dirty_strings=true}); parse_opts([ignore_bad_escapes|Rest], Opts) -> parse_opts(Rest, Opts#opts{ignore_bad_escapes=true}); +parse_opts([relax|Rest], Opts) -> + parse_opts(Rest, Opts#opts{ + loose_unicode = true, + single_quotes = true, + comments = true, + ignore_bad_escapes = true + }); parse_opts(_, _) -> {error, badarg}. @@ -69,7 +76,8 @@ valid_flags() -> comments, json_escape, dirty_strings, - ignore_bad_escapes + ignore_bad_escapes, + relax ]. From f277edabb558d4313ffa30811daf4cc4bc7c57b4 Mon Sep 17 00:00:00 2001 From: alisdair sullivan Date: Thu, 29 Mar 2012 06:03:14 -0700 Subject: [PATCH 16/19] opts tests --- src/jsx_utils.erl | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/src/jsx_utils.erl b/src/jsx_utils.erl index 5402875..7362ae7 100644 --- a/src/jsx_utils.erl +++ b/src/jsx_utils.erl @@ -618,6 +618,48 @@ binary_escape_test_() -> ]. +opts_test_() -> + [ + {"all flags", + ?_assertEqual( + parse_opts([ + loose_unicode, + escape_forward_slash, + explicit_end, + single_quotes, + no_jsonp_escapes, + comments, + json_escape, + dirty_strings, + ignore_bad_escapes + ]), + #opts{ + loose_unicode=true, + escape_forward_slash=true, + explicit_end=true, + single_quotes=true, + no_jsonp_escapes=true, + comments=true, + json_escape=true, + dirty_strings=true, + ignore_bad_escapes=true + } + ) + }, + {"relax flag", + ?_assertEqual( + parse_opts([relax]), + #opts{ + loose_unicode=true, + single_quotes=true, + comments=true, + ignore_bad_escapes=true + } + ) + } + ]. + + surrogates_test_() -> [ {"surrogates - badjson", From 63535b27d7f57540b0676939e85528d23ac4252c Mon Sep 17 00:00:00 2001 From: alisdair sullivan Date: Thu, 29 Mar 2012 06:25:49 -0700 Subject: [PATCH 17/19] check more extended codepoints --- src/jsx_decoder.erl | 5 ++++- src/jsx_encoder.erl | 5 ++++- src/jsx_utils.erl | 5 ++++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/jsx_decoder.erl b/src/jsx_decoder.erl index a380022..63df98e 100644 --- a/src/jsx_decoder.erl +++ b/src/jsx_decoder.erl @@ -1574,7 +1574,10 @@ good() -> [32, 33] ++ lists:seq(16#fdf0, 16#fffd). -good_extended() -> lists:seq(16#100000, 16#10fffd). +good_extended() -> [16#10000, 16#20000, 16#30000, 16#40000, 16#50000, + 16#60000, 16#70000, 16#80000, 16#90000, 16#a0000, + 16#b0000, 16#c0000, 16#d0000, 16#e0000, 16#f0000 + ] ++ lists:seq(16#100000, 16#10fffd). %% erlang refuses to encode certain codepoints, so fake them all diff --git a/src/jsx_encoder.erl b/src/jsx_encoder.erl index b1e95f7..bef380d 100644 --- a/src/jsx_encoder.erl +++ b/src/jsx_encoder.erl @@ -560,7 +560,10 @@ reserved_space() -> lists:seq(16#fdd0, 16#fdef). good() -> lists:seq(16#0000, 16#d7ff) ++ lists:seq(16#e000, 16#fdcf) ++ lists:seq(16#fdf0, 16#fffd). -good_extended() -> lists:seq(16#100000, 16#10fffd). +good_extended() -> [16#10000, 16#20000, 16#30000, 16#40000, 16#50000, + 16#60000, 16#70000, 16#80000, 16#90000, 16#a0000, + 16#b0000, 16#c0000, 16#d0000, 16#e0000, 16#f0000 + ] ++ lists:seq(16#100000, 16#10fffd). %% erlang refuses to encode certain codepoints, so fake them all diff --git a/src/jsx_utils.erl b/src/jsx_utils.erl index 7362ae7..93c0eea 100644 --- a/src/jsx_utils.erl +++ b/src/jsx_utils.erl @@ -770,7 +770,10 @@ reserved_space() -> lists:seq(16#fdd0, 16#fdef). good() -> lists:seq(16#0000, 16#d7ff) ++ lists:seq(16#e000, 16#fdcf) ++ lists:seq(16#fdf0, 16#fffd). -good_extended() -> lists:seq(16#100000, 16#10fffd). +good_extended() -> [16#10000, 16#20000, 16#30000, 16#40000, 16#50000, + 16#60000, 16#70000, 16#80000, 16#90000, 16#a0000, + 16#b0000, 16#c0000, 16#d0000, 16#e0000, 16#f0000 + ] ++ lists:seq(16#100000, 16#10fffd). %% erlang refuses to encode certain codepoints, so fake them all From 6f6a6601776d37b0f7089711cbd398b150a71ec3 Mon Sep 17 00:00:00 2001 From: alisdair sullivan Date: Thu, 29 Mar 2012 21:43:55 -0700 Subject: [PATCH 18/19] fix escaping problems in format/x --- src/jsx_decoder.erl | 13 +- src/jsx_encoder.erl | 87 ++++++--- src/jsx_to_json.erl | 13 +- src/jsx_utils.erl | 434 ++++---------------------------------------- 4 files changed, 105 insertions(+), 442 deletions(-) diff --git a/src/jsx_decoder.erl b/src/jsx_decoder.erl index 63df98e..a6c1451 100644 --- a/src/jsx_decoder.erl +++ b/src/jsx_decoder.erl @@ -270,11 +270,11 @@ string(<<33, Rest/binary>>, Handler, [Acc|Stack], Opts) -> string(<>, {Handler, State}, S, Opts) -> case S of [Acc, key|Stack] -> - colon(Rest, {Handler, Handler:handle_event({key, ?end_seq(Acc)}, State)}, [key|Stack], Opts); + colon(Rest, {Handler, Handler:handle_event({key, maybe_escape(?end_seq(Acc), Opts)}, State)}, [key|Stack], Opts); [_Acc, single_quote|_Stack] -> ?error([<>, {Handler, State}, S, Opts]); [Acc|Stack] -> - maybe_done(Rest, {Handler, Handler:handle_event({string, ?end_seq(Acc)}, State)}, Stack, Opts) + maybe_done(Rest, {Handler, Handler:handle_event({string, maybe_escape(?end_seq(Acc), Opts)}, State)}, Stack, Opts) end; string(<<35, Rest/binary>>, Handler, [Acc|Stack], Opts) -> string(Rest, Handler, [?acc_seq(Acc, 35)|Stack], Opts); @@ -289,9 +289,9 @@ string(<>, {Handler, State}, [Acc|Stack], Opts) -> true -> case Stack of [single_quote, key|S] -> - colon(Rest, {Handler, Handler:handle_event({key, ?end_seq(Acc)}, State)}, [key|S], Opts) + colon(Rest, {Handler, Handler:handle_event({key, maybe_escape(?end_seq(Acc), Opts)}, State)}, [key|S], Opts) ; [single_quote|S] -> - maybe_done(Rest, {Handler, Handler:handle_event({string, ?end_seq(Acc)}, State)}, S, Opts) + maybe_done(Rest, {Handler, Handler:handle_event({string, maybe_escape(?end_seq(Acc), Opts)}, State)}, S, Opts) ; _ -> string(Rest, {Handler, State}, [?acc_seq(Acc, ?singlequote)|Stack], Opts) end @@ -530,6 +530,11 @@ string(Bin, Handler, Stack, Opts) -> ; false -> ?error([Bin, Handler, Stack, Opts]) end end. + + +maybe_escape(Str, Opts=#opts{json_escape=true}) -> jsx_utils:json_escape(Str, Opts); +maybe_escape(Str, _Opts) -> Str. + %% we don't need to guard against partial utf here, because it's already taken %% care of in string diff --git a/src/jsx_encoder.erl b/src/jsx_encoder.erl index bef380d..f1d14f6 100644 --- a/src/jsx_encoder.erl +++ b/src/jsx_encoder.erl @@ -104,12 +104,41 @@ fix_key(Key) when is_binary(Key) -> Key. clean_string(Bin, Opts) -> - case Opts#opts.json_escape of - true -> jsx_utils:json_escape(Bin, Opts); - false -> clean_string(Bin, 0, size(Bin), Opts) + case Opts#opts.loose_unicode of + true -> jsx_utils:json_escape(clean_string(Bin, 0, size(Bin), Opts), Opts) + ; false -> + case is_clean(Bin) of + true -> jsx_utils:json_escape(Bin, Opts) + ; false -> erlang:error(badarg, [Bin, Opts]) + end end. +is_clean(<<>>) -> true; +is_clean(<>) when X < 16#80 -> is_clean(Rest); +is_clean(<>) when X < 16#800 -> is_clean(Rest); +is_clean(<>) when X < 16#dcff -> is_clean(Rest); +is_clean(<>) when X > 16#dfff, X < 16#fdd0 -> is_clean(Rest); +is_clean(<>) when X > 16#fdef, X < 16#fffe -> is_clean(Rest); +is_clean(<>) when X >= 16#10000, X < 16#1fffe -> is_clean(Rest); +is_clean(<>) when X >= 16#20000, X < 16#2fffe -> is_clean(Rest); +is_clean(<>) when X >= 16#30000, X < 16#3fffe -> is_clean(Rest); +is_clean(<>) when X >= 16#40000, X < 16#4fffe -> is_clean(Rest); +is_clean(<>) when X >= 16#50000, X < 16#5fffe -> is_clean(Rest); +is_clean(<>) when X >= 16#60000, X < 16#6fffe -> is_clean(Rest); +is_clean(<>) when X >= 16#70000, X < 16#7fffe -> is_clean(Rest); +is_clean(<>) when X >= 16#80000, X < 16#8fffe -> is_clean(Rest); +is_clean(<>) when X >= 16#90000, X < 16#9fffe -> is_clean(Rest); +is_clean(<>) when X >= 16#a0000, X < 16#afffe -> is_clean(Rest); +is_clean(<>) when X >= 16#b0000, X < 16#bfffe -> is_clean(Rest); +is_clean(<>) when X >= 16#c0000, X < 16#cfffe -> is_clean(Rest); +is_clean(<>) when X >= 16#d0000, X < 16#dfffe -> is_clean(Rest); +is_clean(<>) when X >= 16#e0000, X < 16#efffe -> is_clean(Rest); +is_clean(<>) when X >= 16#f0000, X < 16#ffffe -> is_clean(Rest); +is_clean(<>) when X >= 16#100000, X < 16#10fffe -> is_clean(Rest); +is_clean(Bin) -> erlang:error(badarg, [Bin]). + + clean_string(Str, Len, Len, _Opts) -> Str; clean_string(Str, L, Len, Opts) -> case Str of @@ -134,33 +163,31 @@ clean_string(Str, L, Len, Opts) -> ; <<_:L/binary, X/utf8, _/binary>> when X >= 16#e0000, X < 16#efffe -> clean_string(Str, L + 4, Len, Opts) ; <<_:L/binary, X/utf8, _/binary>> when X >= 16#f0000, X < 16#ffffe -> clean_string(Str, L + 4, Len, Opts) ; <<_:L/binary, X/utf8, _/binary>> when X >= 16#100000, X < 16#10fffe -> clean_string(Str, L + 4, Len, Opts) - ; <> -> - case Opts#opts.loose_unicode of - true -> - case Rest of - %% surrogates - <<237, X, _, T/binary>> when X >= 160 -> - clean_string(<>, L + 3, Len, Opts) - %% u+fffe and u+ffff for R14BXX - ; <<239, 191, X, T/binary>> when X == 190; X == 191 -> - clean_string(<>, L + 3, Len, Opts) - %% overlong encodings and missing continuations of a 2 byte sequence - ; <> when X >= 192, X =< 223 -> - {Tail, Stripped} = strip_continuations(T, 1, 0), - clean_string(<>, L + 3, Len + 2 - Stripped, Opts) - %% overlong encodings and missing continuations of a 3 byte sequence - ; <> when X >= 224, X =< 239 -> - {Tail, Stripped} = strip_continuations(T, 2, 0), - clean_string(<>, L + 3, Len + 2 - Stripped, Opts) - ; <> when X >= 240, X =< 247 -> - {Tail, Stripped} = strip_continuations(T, 3, 0), - clean_string(<>, L + 3, Len + 2 - Stripped, Opts) - ; <<_, T/binary>> -> - clean_string(<>, L + 3, Len + 2, Opts) - end - ; false -> - erlang:error(badarg, [Str, Opts]) - end + %% noncharacters + ; <> when X < 16#10000 -> + clean_string(<>, L + 3, Len, Opts) + ; <> -> + clean_string(<>, L + 4, Len, Opts) + %% surrogates + ; <> when X >= 160 -> + clean_string(<>, L + 3, Len, Opts) + %% u+fffe and u+ffff for R14BXX + ; <> when X == 190; X == 191 -> + clean_string(<>, L + 3, Len, Opts) + %% overlong encodings and missing continuations of a 2 byte sequence + ; <> when X >= 192, X =< 223 -> + {Tail, Stripped} = strip_continuations(T, 1, 0), + clean_string(<>, L + 3, Len + 2 - Stripped, Opts) + %% overlong encodings and missing continuations of a 3 byte sequence + ; <> when X >= 224, X =< 239 -> + {Tail, Stripped} = strip_continuations(T, 2, 0), + clean_string(<>, L + 3, Len + 2 - Stripped, Opts) + %% overlong encodings and missing continuations of a 4 byte sequence + ; <> when X >= 240, X =< 247 -> + {Tail, Stripped} = strip_continuations(T, 3, 0), + clean_string(<>, L + 3, Len + 2 - Stripped, Opts) + ; <> -> + clean_string(<>, L + 3, Len + 2, Opts) end. diff --git a/src/jsx_to_json.erl b/src/jsx_to_json.erl index 4d2ea14..739dcd7 100644 --- a/src/jsx_to_json.erl +++ b/src/jsx_to_json.erl @@ -39,7 +39,7 @@ -spec to_json(Source::any(), Opts::opts()) -> binary(). to_json(Source, Opts) when is_list(Opts) -> - (jsx:encoder(?MODULE, Opts, jsx_utils:extract_opts([json_escape] ++ Opts)))(Source). + (jsx:encoder(?MODULE, Opts, jsx_utils:extract_opts(Opts)))(Source). -spec format(Source::binary(), Opts::opts()) -> binary(). @@ -195,6 +195,9 @@ basic_format_test_() -> [{"naked float", ?_assertEqual(format(<<"1.23">>, []), <<"1.23">>)}] }, {"naked string", ?_assertEqual(format(<<"\"hi\"">>, []), <<"\"hi\"">>)}, + {"naked string with control character", ?_assertEqual( + format(<<"\"hi\\n\"">>, [json_escape]), <<"\"hi\\n\"">> + )}, {"naked literal", ?_assertEqual(format(<<"true">>, []), <<"true">>)}, {"simple object", ?_assertEqual( format(<<" { \"key\" :\n\t \"value\"\r\r\r\n } ">>, []), @@ -241,6 +244,9 @@ basic_to_json_test_() -> [{"naked float", ?_assertEqual(to_json(1.23, []) , <<"1.23">>)}] }, {"naked string", ?_assertEqual(to_json(<<"hi">>, []), <<"\"hi\"">>)}, + {"naked string with control character", ?_assertEqual( + to_json(<<"hi\n">>, [json_escape]), <<"\"hi\\n\"">> + )}, {"naked literal", ?_assertEqual(to_json(true, []), <<"true">>)}, {"simple object", ?_assertEqual( to_json( @@ -324,10 +330,5 @@ opts_test_() -> )} ]. -ext_opts_test_() -> - [{"extopts", ?_assertEqual( - format(<<"[]">>, [loose_unicode, {escape_forward_slash, true}]), - <<"[]">> - )}]. -endif. \ No newline at end of file diff --git a/src/jsx_utils.erl b/src/jsx_utils.erl index 93c0eea..ac1c9f3 100644 --- a/src/jsx_utils.erl +++ b/src/jsx_utils.erl @@ -280,70 +280,39 @@ json_escape(Str, Opts, L, Len) when L < Len -> json_escape(Str, Opts, L + 3, Len); <<_:L/binary, X/utf8, _/binary>> when X > 16#fdef, X < 16#fffe -> json_escape(Str, Opts, L + 3, Len); - <> when X < 16#10000 -> - case Opts#opts.loose_unicode of - true -> json_escape(<>, Opts, L + 3, Len); - false -> erlang:error(badarg, [Str, Opts]) - end; - <> - when X == 16#1fffe; X == 16#1ffff; - X == 16#2fffe; X == 16#2ffff; - X == 16#3fffe; X == 16#3ffff; - X == 16#4fffe; X == 16#4ffff; - X == 16#5fffe; X == 16#5ffff; - X == 16#6fffe; X == 16#6ffff; - X == 16#7fffe; X == 16#7ffff; - X == 16#8fffe; X == 16#8ffff; - X == 16#9fffe; X == 16#9ffff; - X == 16#afffe; X == 16#affff; - X == 16#bfffe; X == 16#bffff; - X == 16#cfffe; X == 16#cffff; - X == 16#dfffe; X == 16#dffff; - X == 16#efffe; X == 16#effff; - X == 16#ffffe; X == 16#fffff; - X == 16#10fffe; X == 16#10ffff -> - case Opts#opts.loose_unicode of - true -> json_escape(<>, Opts, L + 3, Len - 1); - false -> erlang:error(badarg, [Str, Opts]) - end; - <<_:L/binary, X/utf8, _/binary>> when X >= 16#10000 -> + <<_:L/binary, X/utf8, _/binary>> when X >= 16#10000, X < 16#1fffe -> json_escape(Str, Opts, L + 4, Len); - <> when X >= 160 -> - case Opts#opts.loose_unicode of - true -> json_escape(<>, Opts, L + 3, Len); - false -> erlang:error(badarg, [Str, Opts]) - end; - <> when X == 190; X == 191 -> - case Opts#opts.loose_unicode of - true -> json_escape(<>, Opts, L + 3, Len); - false -> erlang:error(badarg, [Str, Opts]) - end; - <> when X >= 192, X =< 223 -> - case Opts#opts.loose_unicode of - true -> - {Rest, Stripped} = strip_continuations(T, 1, 0), - json_escape(<>, Opts, L + 3, Len + 2 - Stripped); - false -> erlang:error(badarg, [Str, Opts]) - end; - <> when X >= 224, X =< 239 -> - case Opts#opts.loose_unicode of - true -> - {Rest, Stripped} = strip_continuations(T, 2, 0), - json_escape(<>, Opts, L + 3, Len + 2 - Stripped); - false -> erlang:error(badarg, [Str, Opts]) - end; - <> when X >= 240, X =< 247 -> - case Opts#opts.loose_unicode of - true -> - {Rest, Stripped} = strip_continuations(T, 3, 0), - json_escape(<>, Opts, L + 3, Len + 2 - Stripped); - false -> erlang:error(badarg, [Str, Opts]) - end; - <> -> - case Opts#opts.loose_unicode of - true -> json_escape(<>, Opts, L + 3, Len + 2); - false -> erlang:error(badarg, [Str, Opts]) - end + <<_:L/binary, X/utf8, _/binary>> when X >= 16#20000, X < 16#2fffe -> + json_escape(Str, Opts, L + 4, Len); + <<_:L/binary, X/utf8, _/binary>> when X >= 16#30000, X < 16#3fffe -> + json_escape(Str, Opts, L + 4, Len); + <<_:L/binary, X/utf8, _/binary>> when X >= 16#40000, X < 16#4fffe -> + json_escape(Str, Opts, L + 4, Len); + <<_:L/binary, X/utf8, _/binary>> when X >= 16#50000, X < 16#5fffe -> + json_escape(Str, Opts, L + 4, Len); + <<_:L/binary, X/utf8, _/binary>> when X >= 16#60000, X < 16#6fffe -> + json_escape(Str, Opts, L + 4, Len); + <<_:L/binary, X/utf8, _/binary>> when X >= 16#70000, X < 16#7fffe -> + json_escape(Str, Opts, L + 4, Len); + <<_:L/binary, X/utf8, _/binary>> when X >= 16#80000, X < 16#8fffe -> + json_escape(Str, Opts, L + 4, Len); + <<_:L/binary, X/utf8, _/binary>> when X >= 16#90000, X < 16#9fffe -> + json_escape(Str, Opts, L + 4, Len); + <<_:L/binary, X/utf8, _/binary>> when X >= 16#a0000, X < 16#afffe -> + json_escape(Str, Opts, L + 4, Len); + <<_:L/binary, X/utf8, _/binary>> when X >= 16#b0000, X < 16#bfffe -> + json_escape(Str, Opts, L + 4, Len); + <<_:L/binary, X/utf8, _/binary>> when X >= 16#c0000, X < 16#cfffe -> + json_escape(Str, Opts, L + 4, Len); + <<_:L/binary, X/utf8, _/binary>> when X >= 16#d0000, X < 16#dfffe -> + json_escape(Str, Opts, L + 4, Len); + <<_:L/binary, X/utf8, _/binary>> when X >= 16#e0000, X < 16#efffe -> + json_escape(Str, Opts, L + 4, Len); + <<_:L/binary, X/utf8, _/binary>> when X >= 16#f0000, X < 16#ffffe -> + json_escape(Str, Opts, L + 4, Len); + <<_:L/binary, X/utf8, _/binary>> when X >= 16#100000, X < 16#10fffe -> + json_escape(Str, Opts, L + 4, Len); + _ -> erlang:error(badarg, [Str, Opts]) end; json_escape(Str, _, L, Len) when L =:= Len -> Str. @@ -364,219 +333,11 @@ to_hex(15) -> $f; to_hex(X) -> X + 48. %% ascii "1" is [49], "2" is [50], etc... -strip_continuations(Bin, 0, N) -> {Bin, N}; -strip_continuations(<>, N, M) when X >= 128, X =< 191 -> - strip_continuations(Rest, N - 1, M + 1); -%% not a continuation byte -strip_continuations(Bin, _, N) -> {Bin, N}. - - %% eunit tests -ifdef(TEST). -include_lib("eunit/include/eunit.hrl"). -xcode(Bin) -> xcode(Bin, #opts{}). - -xcode(Bin, [loose_unicode]) -> xcode(Bin, #opts{loose_unicode=true}); -xcode(Bin, Opts) -> - try json_escape(Bin, Opts) - catch error:badarg -> {error, badarg} - end. - - -is_bad({error, badarg}) -> true; -is_bad(_) -> false. - - -bad_utf8_test_() -> - [ - {"orphan continuation byte u+0080", - ?_assert(is_bad(xcode(<<16#0080>>))) - }, - {"orphan continuation byte u+0080 replaced", - ?_assertEqual(xcode(<<16#0080>>, [loose_unicode]), <<16#fffd/utf8>>) - }, - {"orphan continuation byte u+00bf", - ?_assert(is_bad(xcode(<<16#00bf>>))) - }, - {"orphan continuation byte u+00bf replaced", - ?_assertEqual(xcode(<<16#00bf>>, [loose_unicode]), <<16#fffd/utf8>>) - }, - {"2 continuation bytes", - ?_assert(is_bad(xcode(<<(binary:copy(<<16#0080>>, 2))/binary>>))) - }, - {"2 continuation bytes replaced", - ?_assertEqual( - xcode(<<(binary:copy(<<16#0080>>, 2))/binary>>, [loose_unicode]), - binary:copy(<<16#fffd/utf8>>, 2) - ) - }, - {"3 continuation bytes", - ?_assert(is_bad(xcode(<<(binary:copy(<<16#0080>>, 3))/binary>>))) - }, - {"3 continuation bytes replaced", - ?_assertEqual( - xcode(<<(binary:copy(<<16#0080>>, 3))/binary>>, [loose_unicode]), - binary:copy(<<16#fffd/utf8>>, 3) - ) - }, - {"4 continuation bytes", - ?_assert(is_bad(xcode(<<(binary:copy(<<16#0080>>, 4))/binary>>))) - }, - {"4 continuation bytes replaced", - ?_assertEqual( - xcode(<<(binary:copy(<<16#0080>>, 4))/binary>>, [loose_unicode]), - binary:copy(<<16#fffd/utf8>>, 4) - ) - }, - {"5 continuation bytes", - ?_assert(is_bad(xcode(<<(binary:copy(<<16#0080>>, 5))/binary>>))) - }, - {"5 continuation bytes replaced", - ?_assertEqual( - xcode(<<(binary:copy(<<16#0080>>, 5))/binary>>, [loose_unicode]), - binary:copy(<<16#fffd/utf8>>, 5) - ) - }, - {"6 continuation bytes", - ?_assert(is_bad(xcode(<<(binary:copy(<<16#0080>>, 6))/binary>>))) - }, - {"6 continuation bytes replaced", - ?_assertEqual( - xcode(<<(binary:copy(<<16#0080>>, 6))/binary>>, [loose_unicode]), - binary:copy(<<16#fffd/utf8>>, 6) - ) - }, - {"all continuation bytes", - ?_assert(is_bad(xcode(<<(list_to_binary(lists:seq(16#0080, 16#00bf)))/binary>>))) - }, - {"all continuation bytes replaced", - ?_assertEqual( - xcode(<<(list_to_binary(lists:seq(16#0080, 16#00bf)))/binary>>, [loose_unicode]), - binary:copy(<<16#fffd/utf8>>, length(lists:seq(16#0080, 16#00bf))) - ) - }, - {"lonely start byte", - ?_assert(is_bad(xcode(<<16#00c0>>))) - }, - {"lonely start byte replaced", - ?_assertEqual( - xcode(<<16#00c0>>, [loose_unicode]), - <<16#fffd/utf8>> - ) - }, - {"lonely start bytes (2 byte)", - ?_assert(is_bad(xcode(<<16#00c0, 32, 16#00df>>))) - }, - {"lonely start bytes (2 byte) replaced", - ?_assertEqual( - xcode(<<16#00c0, 32, 16#00df>>, [loose_unicode]), - <<16#fffd/utf8, 32, 16#fffd/utf8>> - ) - }, - {"lonely start bytes (3 byte)", - ?_assert(is_bad(xcode(<<16#00e0, 32, 16#00ef>>))) - }, - {"lonely start bytes (3 byte) replaced", - ?_assertEqual( - xcode(<<16#00e0, 32, 16#00ef>>, [loose_unicode]), - <<16#fffd/utf8, 32, 16#fffd/utf8>> - ) - }, - {"lonely start bytes (4 byte)", - ?_assert(is_bad(xcode(<<16#00f0, 32, 16#00f7>>))) - }, - {"lonely start bytes (4 byte) replaced", - ?_assertEqual( - xcode(<<16#00f0, 32, 16#00f7>>, [loose_unicode]), - <<16#fffd/utf8, 32, 16#fffd/utf8>> - ) - }, - {"missing continuation byte (3 byte)", - ?_assert(is_bad(xcode(<<224, 160, 32>>))) - }, - {"missing continuation byte (3 byte) replaced", - ?_assertEqual( - xcode(<<224, 160, 32>>, [loose_unicode]), - <<16#fffd/utf8, 32>> - ) - }, - {"missing continuation byte (4 byte missing one)", - ?_assert(is_bad(xcode(<<240, 144, 128, 32>>))) - }, - {"missing continuation byte2 (4 byte missing one) replaced", - ?_assertEqual( - xcode(<<240, 144, 128, 32>>, [loose_unicode]), - <<16#fffd/utf8, 32>> - ) - }, - {"missing continuation byte (4 byte missing two)", - ?_assert(is_bad(xcode(<<240, 144, 32>>))) - }, - {"missing continuation byte2 (4 byte missing two) replaced", - ?_assertEqual( - xcode(<<240, 144, 32>>, [loose_unicode]), - <<16#fffd/utf8, 32>> - ) - }, - {"overlong encoding of u+002f (2 byte)", - ?_assert(is_bad(xcode(<<16#c0, 16#af, 32>>))) - }, - {"overlong encoding of u+002f (2 byte) replaced", - ?_assertEqual( - xcode(<<16#c0, 16#af, 32>>, [loose_unicode]), - <<16#fffd/utf8, 32>> - ) - }, - {"overlong encoding of u+002f (3 byte)", - ?_assert(is_bad(xcode(<<16#e0, 16#80, 16#af, 32>>))) - }, - {"overlong encoding of u+002f (3 byte) replaced", - ?_assertEqual( - xcode(<<16#e0, 16#80, 16#af, 32>>, [loose_unicode]), - <<16#fffd/utf8, 32>> - ) - }, - {"overlong encoding of u+002f (4 byte)", - ?_assert(is_bad(xcode(<<16#f0, 16#80, 16#80, 16#af, 32>>))) - }, - {"overlong encoding of u+002f (4 byte) replaced", - ?_assertEqual( - xcode(<<16#f0, 16#80, 16#80, 16#af, 32>>, [loose_unicode]), - <<16#fffd/utf8, 32>> - ) - }, - {"highest overlong 2 byte sequence", - ?_assert(is_bad(xcode(<<16#c1, 16#bf, 32>>))) - }, - {"highest overlong 2 byte sequence replaced", - ?_assertEqual( - xcode(<<16#c1, 16#bf, 32>>, [loose_unicode]), - <<16#fffd/utf8, 32>> - ) - }, - {"highest overlong 3 byte sequence", - ?_assert(is_bad(xcode(<<16#e0, 16#9f, 16#bf, 32>>))) - }, - {"highest overlong 3 byte sequence replaced", - ?_assertEqual( - xcode(<<16#e0, 16#9f, 16#bf, 32>>, [loose_unicode]), - <<16#fffd/utf8, 32>> - ) - }, - {"highest overlong 4 byte sequence", - ?_assert(is_bad(xcode(<<16#f0, 16#8f, 16#bf, 16#bf, 32>>))) - }, - {"highest overlong 4 byte sequence replaced", - ?_assertEqual( - xcode(<<16#f0, 16#8f, 16#bf, 16#bf, 32>>, [loose_unicode]), - <<16#fffd/utf8, 32>> - ) - } - ]. - - binary_escape_test_() -> [ {"json string escaping", @@ -629,7 +390,6 @@ opts_test_() -> single_quotes, no_jsonp_escapes, comments, - json_escape, dirty_strings, ignore_bad_escapes ]), @@ -640,7 +400,6 @@ opts_test_() -> single_quotes=true, no_jsonp_escapes=true, comments=true, - json_escape=true, dirty_strings=true, ignore_bad_escapes=true } @@ -660,133 +419,4 @@ opts_test_() -> ]. -surrogates_test_() -> - [ - {"surrogates - badjson", - ?_assertEqual(check_bad(surrogates()), []) - }, - {"surrogates - replaced", - ?_assertEqual(check_replaced(surrogates()), []) - } - ]. - - -good_characters_test_() -> - [ - {"acceptable codepoints", - ?_assertEqual(check_good(good()), []) - }, - {"acceptable extended", - ?_assertEqual(check_good(good_extended()), []) - } - ]. - - -reserved_test_() -> - [ - {"reserved noncharacters - badjson", - ?_assertEqual(check_bad(reserved_space()), []) - }, - {"reserved noncharacters - replaced", - ?_assertEqual(check_replaced(reserved_space()), []) - } - ]. - - -noncharacters_test_() -> - [ - {"noncharacters - badjson", - ?_assertEqual(check_bad(noncharacters()), []) - }, - {"noncharacters - replaced", - ?_assertEqual(check_replaced(noncharacters()), []) - } - ]. - - -extended_noncharacters_test_() -> - [ - {"extended noncharacters - badjson", - ?_assertEqual(check_bad(extended_noncharacters()), []) - }, - {"extended noncharacters - replaced", - ?_assertEqual(check_replaced(extended_noncharacters()), []) - } - ]. - - -check_bad(List) -> - lists:dropwhile(fun({_, {error, badjson}}) -> true ; (_) -> false end, - check(List, #opts{}, []) - ). - - -check_replaced(List) -> - lists:dropwhile(fun({_, <<16#fffd/utf8>>}) -> true - ; (_) -> false - end, - check(List, #opts{loose_unicode=true}, []) - ). - - -check_good(List) -> - lists:dropwhile(fun({_, _}) -> true ; (_) -> false end, - check(List, #opts{}, []) - ). - - -check([], _Opts, Acc) -> Acc; -check([H|T], Opts, Acc) -> - R = escape(to_fake_utf(H, utf8), Opts), - check(T, Opts, [{H, R}] ++ Acc). - - -escape(JSON, Opts) -> - try json_escape(JSON, Opts) - catch error:badarg -> {error, badjson} - end. - - -noncharacters() -> lists:seq(16#fffe, 16#ffff). - - -extended_noncharacters() -> - [16#1fffe, 16#1ffff, 16#2fffe, 16#2ffff] - ++ [16#3fffe, 16#3ffff, 16#4fffe, 16#4ffff] - ++ [16#5fffe, 16#5ffff, 16#6fffe, 16#6ffff] - ++ [16#7fffe, 16#7ffff, 16#8fffe, 16#8ffff] - ++ [16#9fffe, 16#9ffff, 16#afffe, 16#affff] - ++ [16#bfffe, 16#bffff, 16#cfffe, 16#cffff] - ++ [16#dfffe, 16#dffff, 16#efffe, 16#effff] - ++ [16#ffffe, 16#fffff, 16#10fffe, 16#10ffff]. - - -surrogates() -> lists:seq(16#d800, 16#dfff). - - -reserved_space() -> lists:seq(16#fdd0, 16#fdef). - - -good() -> lists:seq(16#0000, 16#d7ff) ++ lists:seq(16#e000, 16#fdcf) ++ lists:seq(16#fdf0, 16#fffd). - - -good_extended() -> [16#10000, 16#20000, 16#30000, 16#40000, 16#50000, - 16#60000, 16#70000, 16#80000, 16#90000, 16#a0000, - 16#b0000, 16#c0000, 16#d0000, 16#e0000, 16#f0000 - ] ++ lists:seq(16#100000, 16#10fffd). - - -%% erlang refuses to encode certain codepoints, so fake them all -to_fake_utf(N, utf8) when N < 16#0080 -> <>; -to_fake_utf(N, utf8) when N < 16#0800 -> - <<0:5, Y:5, X:6>> = <>, - <<2#110:3, Y:5, 2#10:2, X:6>>; -to_fake_utf(N, utf8) when N < 16#10000 -> - <> = <>, - <<2#1110:4, Z:4, 2#10:2, Y:6, 2#10:2, X:6>>; -to_fake_utf(N, utf8) -> - <<0:3, W:3, Z:6, Y:6, X:6>> = <>, - <<2#11110:5, W:3, 2#10:2, Z:6, 2#10:2, Y:6, 2#10:2, X:6>>. - - -endif. \ No newline at end of file From a6dee1690419dbaa0895350a7715ac2cd0aaa19b Mon Sep 17 00:00:00 2001 From: alisdair sullivan Date: Thu, 29 Mar 2012 21:48:24 -0700 Subject: [PATCH 19/19] bump version to 1.1.2 --- README.markdown | 12 ++---------- src/jsx.app.src | 2 +- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/README.markdown b/README.markdown index 24281eb..17b81ef 100644 --- a/README.markdown +++ b/README.markdown @@ -127,17 +127,9 @@ javascript interpreters treat the codepoints `u+2028` and `u+2029` as significan json has no official comments but some parsers allow c style comments. this flag allows comments (both `// ...` and `/* ... */` style) anywhere whitespace is allowed -#### `dirty_strings` #### +#### `json_escape` #### -json escaping is lossy, it mutates the json string and repeated application can result in unwanted behaviour. if your strings are already escaped (or you'd like to force invalid strings into "json") use this flag to bypass escaping - -#### `ignore_bad_escapes` #### - -during decoding, ignore unrecognized escape sequences and leave them as is in the stream - -#### `relax` #### - -relax is a synonym for `[loose_unicode, single_quotes, comments, ignore_bad_escapes]` +by default, both the encoder and decoder return strings as utf8 binaries appropriate for use in erlang. escape sequences that were present in decoded terms are converted into the appropriate codepoint and encoded terms are unaltered. this flag escapes strings for output in json, removing control codes and replacing them with the appropriate escapes ### incomplete input ### diff --git a/src/jsx.app.src b/src/jsx.app.src index 846abd4..5875359 100644 --- a/src/jsx.app.src +++ b/src/jsx.app.src @@ -1,7 +1,7 @@ {application, jsx, [ {description, "a streaming, evented json parsing toolkit"}, - {vsn, "1.1.1"}, + {vsn, "1.1.2"}, {modules, [ jsx, jsx_encoder,