diff --git a/src/jsx_decoder.erl b/src/jsx_decoder.erl index 4055b2e..f70e0f8 100644 --- a/src/jsx_decoder.erl +++ b/src/jsx_decoder.erl @@ -334,6 +334,20 @@ noncharacter(<<239, 191, X, Rest/binary>>, Handler, [Acc|Stack], Opts) %% surrogates noncharacter(<<237, X, _, Rest/binary>>, Handler, [Acc|Stack], Opts) when X >= 160 -> string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts); +noncharacter(<<X, Y, 191, Z, Rest/binary>>, Handler, [Acc|Stack], Opts) + when ( + (X == 240 andalso Y == 159) orelse + (X == 240 andalso Y == 175) orelse + (X == 240 andalso Y == 191) orelse + ( + (X == 241 orelse X == 242 orelse X == 243) andalso + (Y == 143 orelse Y == 159 orelse Y == 175 orelse Y == 191) + ) orelse + (X == 244 andalso Y == 143) + ) andalso (Z == 190 orelse Z == 191) -> + string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts); +noncharacter(<<_, Rest/binary>>, Handler, [Acc|Stack], Opts) -> + string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts); noncharacter(Bin, Handler, Stack, Opts) -> ?error([Bin, Handler, Stack, Opts]). @@ -1079,6 +1093,51 @@ good_characters_test_() -> ?_assertEqual(check_good(good_extended()), []) } ]. + +malformed_test_() -> + [ + {"malformed codepoint with 1 byte", + ?_assertEqual({error, badjson}, decode(<<128>>)) + }, + {"malformed codepoint with 2 bytes", + ?_assertEqual({error, badjson}, decode(<<128, 192>>)) + }, + {"malformed codepoint with 3 bytes", + ?_assertEqual({error, badjson}, decode(<<128, 192, 192>>)) + }, + {"malformed codepoint with 4 bytes", + ?_assertEqual({error, badjson}, decode(<<128, 192, 192, 192>>)) + } + ]. 
+ +malformed_replaced_test_() -> + F = <<16#fffd/utf8>>, + [ + {"malformed codepoint with 1 byte", + ?_assertEqual( + [{string, <<F/binary>>}, end_json], + decode(<<34, 128, 34>>, [loose_unicode]) + ) + }, + {"malformed codepoint with 2 bytes", + ?_assertEqual( + [{string, <<F/binary, F/binary>>}, end_json], + decode(<<34, 128, 192, 34>>, [loose_unicode]) + ) + }, + {"malformed codepoint with 3 bytes", + ?_assertEqual( + [{string, <<F/binary, F/binary, F/binary>>}, end_json], + decode(<<34, 128, 192, 192, 34>>, [loose_unicode]) + ) + }, + {"malformed codepoint with 4 bytes", + ?_assertEqual( + [{string, <<F/binary, F/binary, F/binary, F/binary>>}, end_json], + decode(<<34, 128, 192, 192, 192, 34>>, [loose_unicode]) + ) + } + ]. check_bad(List) -> @@ -1104,6 +1163,8 @@ check([H|T], Opts, Acc) -> check(T, Opts, [{H, R}] ++ Acc). +decode(JSON) -> decode(JSON, []). + decode(JSON, Opts) -> try (decoder(jsx, [], Opts))(JSON) diff --git a/src/jsx_encoder.erl b/src/jsx_encoder.erl index 4ee0fa5..5cd9934 100644 --- a/src/jsx_encoder.erl +++ b/src/jsx_encoder.erl @@ -163,9 +163,29 @@ clean_string(<<C/utf8, Rest/binary>>, Acc) C =/= 16#ffffe andalso C =/= 16#fffff andalso C =/= 16#10fffe andalso C =/= 16#10ffff -> clean_string(Rest, <<Acc/binary, C/utf8>>); -clean_string(<<X, _, _, Rest/binary>>, Acc) when X == 237; X == 239 -> +%% surrogates +clean_string(<<237, X, _, Rest/binary>>, Acc) when X >= 160 -> clean_string(Rest, <<Acc/binary, 16#fffd/utf8>>); -clean_string(<<_, _, _, _, Rest/binary>>, Acc) -> +%% private use noncharacters +clean_string(<<239, 183, X, Rest/binary>>, Acc) when X >= 143, X =< 175 -> + clean_string(Rest, <<Acc/binary, 16#fffd/utf8>>); +%% u+fffe and u+ffff +clean_string(<<239, 191, X, Rest/binary>>, Acc) when X == 190; X == 191 -> + clean_string(Rest, <<Acc/binary, 16#fffd/utf8>>); +%% the u+Xfffe and u+Xffff noncharacters +clean_string(<<X, Y, 191, Z, Rest/binary>>, Acc) + when ( + (X == 240 andalso Y == 159) orelse + (X == 240 andalso Y == 175) orelse + (X == 240 andalso Y == 191) orelse + ( + (X == 241 orelse X == 242 orelse X == 243) andalso + (Y == 143 orelse Y == 159 orelse Y == 175 orelse Y == 191) + ) orelse + (X == 244 andalso Y == 143) + ) andalso (Z == 190 orelse Z == 191) -> + clean_string(Rest, <<Acc/binary, 16#fffd/utf8>>); 
+clean_string(<<_, Rest/binary>>, Acc) -> clean_string(Rest, <<Acc/binary, 16#fffd/utf8>>); clean_string(<<>>, Acc) -> Acc. @@ -302,7 +322,43 @@ good_characters_test_() -> ?_assertEqual(check_good(good_extended()), []) } ]. - + +malformed_test_() -> + [ + {"malformed codepoint with 1 byte", ?_assertError(badarg, encode(<<128>>))}, + {"malformed codepoint with 2 bytes", ?_assertError(badarg, encode(<<128, 192>>))}, + {"malformed codepoint with 3 bytes", ?_assertError(badarg, encode(<<128, 192, 192>>))}, + {"malformed codepoint with 4 bytes", ?_assertError(badarg, encode(<<128, 192, 192, 192>>))} + ]. + +malformed_replaced_test_() -> + F = <<16#fffd/utf8>>, + [ + {"malformed codepoint with 1 byte", + ?_assertEqual( + [{string, <<F/binary>>}, end_json], + encode(<<128>>, [loose_unicode]) + ) + }, + {"malformed codepoint with 2 bytes", + ?_assertEqual( + [{string, <<F/binary, F/binary>>}, end_json], + encode(<<128, 192>>, [loose_unicode]) + ) + }, + {"malformed codepoint with 3 bytes", + ?_assertEqual( + [{string, <<F/binary, F/binary, F/binary>>}, end_json], + encode(<<128, 192, 192>>, [loose_unicode]) + ) + }, + {"malformed codepoint with 4 bytes", + ?_assertEqual( + [{string, <<F/binary, F/binary, F/binary, F/binary>>}, end_json], + encode(<<128, 192, 192, 192>>, [loose_unicode]) + ) + } + ]. check_bad(List) -> lists:dropwhile(fun({_, {error, badjson}}) -> true ; (_) -> false end,