diff --git a/src/jsx_encoder.erl b/src/jsx_encoder.erl index 18201bf..1dbd13f 100644 --- a/src/jsx_encoder.erl +++ b/src/jsx_encoder.erl @@ -23,7 +23,7 @@ -module(jsx_encoder). --export([encoder/3, clean_string/1]). +-export([encoder/3]). -spec encoder(Handler::module(), State::any(), Opts::jsx:opts()) -> jsx:encoder(). @@ -53,7 +53,7 @@ start(Term, {Handler, State}, Opts) -> value(String, {Handler, State}, Opts) when is_binary(String) -> - Handler:handle_event({string, clean_string(String, <<>>, Opts)}, State); + Handler:handle_event({string, clean_string(String, Opts)}, State); value(Float, {Handler, State}, _Opts) when is_float(Float) -> Handler:handle_event({float, Float}, State); value(Int, {Handler, State}, _Opts) when is_integer(Int) -> @@ -83,7 +83,7 @@ object([{Key, Value}|Rest], {Handler, State}, Opts) -> Handler, value( Value, - {Handler, Handler:handle_event({key, clean_string(fix_key(Key), <<>>, Opts)}, State)}, + {Handler, Handler:handle_event({key, clean_string(fix_key(Key), Opts)}, State)}, Opts ) }, @@ -103,98 +103,31 @@ fix_key(Key) when is_atom(Key) -> fix_key(atom_to_binary(Key, utf8)); fix_key(Key) when is_binary(Key) -> Key. -clean_string(Bin) -> clean_string(Bin, <<>>, #opts{json_escape=true}). +clean_string(Bin, Opts) -> + case Opts#opts.json_escape of + true -> jsx_utils:json_escape(Bin, Opts); + false -> + case is_clean(Bin) of + true -> Bin; + false -> clean_string(Bin, [], Opts) + end + end. -clean_string(<<$\", Rest/binary>>, Acc, Opts=#opts{json_escape=true}) -> - clean_string(Rest, <>, Opts); -clean_string(<<$\\, Rest/binary>>, Acc, Opts=#opts{json_escape=true}) -> - clean_string(Rest, <>, Opts); -clean_string(<<$\b, Rest/binary>>, Acc, Opts=#opts{json_escape=true}) -> - clean_string(Rest, <>, Opts); -clean_string(<<$\f, Rest/binary>>, Acc, Opts=#opts{json_escape=true}) -> - clean_string(Rest, <>, Opts); -clean_string(<<$\n, Rest/binary>>, Acc, Opts=#opts{json_escape=true}) -> - clean_string(Rest, <>, Opts); -clean_string(<<$\r, Rest/binary>>, Acc, Opts=#opts{json_escape=true}) -> - clean_string(Rest, <>, Opts); -clean_string(<<$\t, Rest/binary>>, Acc, Opts=#opts{json_escape=true}) -> - clean_string(Rest, <>, Opts); -clean_string(<<$/, Rest/binary>>, Acc, Opts=#opts{json_escape=true, escape_forward_slash=true}) -> - clean_string(Rest, <>, Opts); -clean_string(<<16#2028/utf8, Rest/binary>>, Acc, Opts=#opts{json_escape=true, no_jsonp_escapes=true}) -> - clean_string(Rest, <>, Opts); -clean_string(<<16#2029/utf8, Rest/binary>>, Acc, Opts=#opts{json_escape=true, no_jsonp_escapes=true}) -> - clean_string(Rest, <>, Opts); -clean_string(<<16#2028/utf8, Rest/binary>>, Acc, Opts=#opts{json_escape=true}) -> - clean_string(Rest, <>, Opts); -clean_string(<<16#2029/utf8, Rest/binary>>, Acc, Opts=#opts{json_escape=true}) -> - clean_string(Rest, <>, Opts); -clean_string(<>, Acc, Opts=#opts{json_escape=true}) when C < 32 -> - clean_string(Rest, <>, Opts); -clean_string(<>, Acc, Opts) when C < 16#fdd0 -> - clean_string(Rest, <>, Opts); -clean_string(<>, Acc, Opts) when C > 16#fdef, C < 16#fffe -> - clean_string(Rest, <>, Opts); -clean_string(<>, Acc, Opts) - when C > 16#ffff andalso - C =/= 16#1fffe andalso C =/= 16#1ffff andalso - C =/= 16#2fffe andalso C =/= 16#2ffff andalso - C =/= 16#3fffe andalso C =/= 16#3ffff andalso - C =/= 16#4fffe andalso C =/= 16#4ffff andalso - C =/= 16#5fffe andalso C =/= 16#5ffff andalso - C =/= 16#6fffe andalso C =/= 16#6ffff andalso - C =/= 16#7fffe andalso C =/= 16#7ffff andalso - C =/= 16#8fffe andalso C =/= 16#8ffff andalso - C =/= 16#9fffe andalso C =/= 16#9ffff andalso - C =/= 16#afffe andalso C =/= 16#affff andalso - C =/= 16#bfffe andalso C =/= 16#bffff andalso - C =/= 16#cfffe andalso C =/= 16#cffff andalso - C =/= 16#dfffe andalso C =/= 16#dffff andalso - C =/= 16#efffe andalso C =/= 16#effff andalso - C =/= 16#ffffe andalso C =/= 16#fffff andalso - C =/= 16#10fffe andalso C =/= 16#10ffff -> - clean_string(Rest, <>, Opts); + +is_clean(<<>>) -> true; +is_clean(<<_/utf8, Rest/binary>>) -> is_clean(Rest); +is_clean(_) -> false. + + +clean_string(Bin, _Acc, Opts=#opts{loose_unicode=false}) -> ?error([Bin, Opts]); +clean_string(<<>>, Acc, _Opts) -> unicode:characters_to_binary(lists:reverse(Acc)); +clean_string(<>, Acc, Opts) -> clean_string(Rest, [X] ++ Acc, Opts); %% surrogates -clean_string(<<237, X, _, Rest/binary>>, Acc, Opts=#opts{loose_unicode=true}) when X >= 160 -> - clean_string(Rest, <>, Opts); -%% private use noncharacters -clean_string(<<239, 183, X, Rest/binary>>, Acc, Opts=#opts{loose_unicode=true}) when X >= 143, X =< 175 -> - clean_string(Rest, <>, Opts); -%% u+fffe and u+ffff -clean_string(<<239, 191, X, Rest/binary>>, Acc, Opts=#opts{loose_unicode=true}) when X == 190; X == 191 -> - clean_string(Rest, <>, Opts); -%% the u+Xfffe and u+Xffff noncharacters -clean_string(<>, Acc, Opts=#opts{loose_unicode=true}) when ( - (X == 240 andalso Y == 159) orelse - (X == 240 andalso Y == 175) orelse - (X == 240 andalso Y == 191) orelse - ( - (X == 241 orelse X == 242 orelse X == 243) andalso - (Y == 143 orelse Y == 159 orelse Y == 175 orelse Y == 191) - ) orelse - (X == 244 andalso Y == 143) - ) andalso (Z == 190 orelse Z == 191) -> - clean_string(Rest, <>, Opts); -clean_string(<<_, Rest/binary>>, Acc, Opts=#opts{loose_unicode=true}) -> - clean_string(Rest, <>, Opts); -clean_string(<<>>, Acc, _) -> Acc; -clean_string(Bin, _Acc, Opts) -> erlang:error(badarg, [Bin, Opts]). - - -%% convert a codepoint to it's \uXXXX equiv. -json_escape_sequence(X) -> - <> = <>, - unicode:characters_to_binary([$\\, $u, (to_hex(A)), (to_hex(B)), (to_hex(C)), (to_hex(D))]). +clean_string(<<237, X, _, Rest/binary>>, Acc, Opts) when X >= 160 -> clean_string(Rest, [16#fffd] ++ Acc, Opts); +%% bad codepoints +clean_string(<<_, Rest/binary>>, Acc, Opts) -> clean_string(Rest, [16#fffd] ++ Acc, Opts). -to_hex(10) -> $a; -to_hex(11) -> $b; -to_hex(12) -> $c; -to_hex(13) -> $d; -to_hex(14) -> $e; -to_hex(15) -> $f; -to_hex(X) -> X + 48. %% ascii "1" is [49], "2" is [50], etc... - -ifdef(TEST). -include_lib("eunit/include/eunit.hrl"). @@ -276,26 +209,6 @@ encode_test_() -> ) } ]. - -noncharacters_test_() -> - [ - {"noncharacters - badjson", - ?_assertEqual(check_bad(noncharacters()), []) - }, - {"noncharacters - replaced", - ?_assertEqual(check_replaced(noncharacters()), []) - } - ]. - -extended_noncharacters_test_() -> - [ - {"extended noncharacters - badjson", - ?_assertEqual(check_bad(extended_noncharacters()), []) - }, - {"extended noncharacters - replaced", - ?_assertEqual(check_replaced(extended_noncharacters()), []) - } - ]. surrogates_test_() -> [ @@ -306,16 +219,6 @@ surrogates_test_() -> ?_assertEqual(check_replaced(surrogates()), []) } ]. - -reserved_test_() -> - [ - {"reserved noncharacters - badjson", - ?_assertEqual(check_bad(reserved_space()), []) - }, - {"reserved noncharacters - replaced", - ?_assertEqual(check_replaced(reserved_space()), []) - } - ]. good_characters_test_() -> [ @@ -387,26 +290,11 @@ check([H|T], Opts, Acc) -> check(T, Opts, [{H, R}] ++ Acc). - -noncharacters() -> lists:seq(16#fffe, 16#ffff). - -extended_noncharacters() -> - [16#1fffe, 16#1ffff, 16#2fffe, 16#2ffff] - ++ [16#3fffe, 16#3ffff, 16#4fffe, 16#4ffff] - ++ [16#5fffe, 16#5ffff, 16#6fffe, 16#6ffff] - ++ [16#7fffe, 16#7ffff, 16#8fffe, 16#8ffff] - ++ [16#9fffe, 16#9ffff, 16#afffe, 16#affff] - ++ [16#bfffe, 16#bffff, 16#cfffe, 16#cffff] - ++ [16#dfffe, 16#dffff, 16#efffe, 16#effff] - ++ [16#ffffe, 16#fffff, 16#10fffe, 16#10ffff]. - surrogates() -> lists:seq(16#d800, 16#dfff). -reserved_space() -> lists:seq(16#fdd0, 16#fdef). - -good() -> lists:seq(1, 16#d7ff) ++ lists:seq(16#e000, 16#fdcf) ++ lists:seq(16#fdf0, 16#fffd). +good() -> lists:seq(1, 16#d7ff) ++ lists:seq(16#e000, 16#ffff). -good_extended() -> lists:seq(16#100000, 16#10fffd). +good_extended() -> lists:seq(16#100000, 16#10ffff). %% erlang refuses to encode certain codepoints, so fake them all to_fake_utf(N, utf8) when N < 16#0080 -> <>;