slightly more efficient encoding of strings
This commit is contained in:
parent
41002bd10f
commit
04ea83266e
1 changed files with 26 additions and 138 deletions
|
@ -23,7 +23,7 @@
|
||||||
|
|
||||||
-module(jsx_encoder).
|
-module(jsx_encoder).
|
||||||
|
|
||||||
-export([encoder/3, clean_string/1]).
|
-export([encoder/3]).
|
||||||
|
|
||||||
-spec encoder(Handler::module(), State::any(), Opts::jsx:opts()) -> jsx:encoder().
|
-spec encoder(Handler::module(), State::any(), Opts::jsx:opts()) -> jsx:encoder().
|
||||||
|
|
||||||
|
@ -53,7 +53,7 @@ start(Term, {Handler, State}, Opts) ->
|
||||||
|
|
||||||
|
|
||||||
value(String, {Handler, State}, Opts) when is_binary(String) ->
|
value(String, {Handler, State}, Opts) when is_binary(String) ->
|
||||||
Handler:handle_event({string, clean_string(String, <<>>, Opts)}, State);
|
Handler:handle_event({string, clean_string(String, Opts)}, State);
|
||||||
value(Float, {Handler, State}, _Opts) when is_float(Float) ->
|
value(Float, {Handler, State}, _Opts) when is_float(Float) ->
|
||||||
Handler:handle_event({float, Float}, State);
|
Handler:handle_event({float, Float}, State);
|
||||||
value(Int, {Handler, State}, _Opts) when is_integer(Int) ->
|
value(Int, {Handler, State}, _Opts) when is_integer(Int) ->
|
||||||
|
@ -83,7 +83,7 @@ object([{Key, Value}|Rest], {Handler, State}, Opts) ->
|
||||||
Handler,
|
Handler,
|
||||||
value(
|
value(
|
||||||
Value,
|
Value,
|
||||||
{Handler, Handler:handle_event({key, clean_string(fix_key(Key), <<>>, Opts)}, State)},
|
{Handler, Handler:handle_event({key, clean_string(fix_key(Key), Opts)}, State)},
|
||||||
Opts
|
Opts
|
||||||
)
|
)
|
||||||
},
|
},
|
||||||
|
@ -103,98 +103,31 @@ fix_key(Key) when is_atom(Key) -> fix_key(atom_to_binary(Key, utf8));
|
||||||
fix_key(Key) when is_binary(Key) -> Key.
|
fix_key(Key) when is_binary(Key) -> Key.
|
||||||
|
|
||||||
|
|
||||||
clean_string(Bin) -> clean_string(Bin, <<>>, #opts{json_escape=true}).
|
clean_string(Bin, Opts) ->
|
||||||
|
case Opts#opts.json_escape of
|
||||||
|
true -> jsx_utils:json_escape(Bin, Opts);
|
||||||
|
false ->
|
||||||
|
case is_clean(Bin) of
|
||||||
|
true -> Bin;
|
||||||
|
false -> clean_string(Bin, [], Opts)
|
||||||
|
end
|
||||||
|
end.
|
||||||
|
|
||||||
clean_string(<<$\", Rest/binary>>, Acc, Opts=#opts{json_escape=true}) ->
|
|
||||||
clean_string(Rest, <<Acc/binary, $\\, $\">>, Opts);
|
is_clean(<<>>) -> true;
|
||||||
clean_string(<<$\\, Rest/binary>>, Acc, Opts=#opts{json_escape=true}) ->
|
is_clean(<<_/utf8, Rest/binary>>) -> is_clean(Rest);
|
||||||
clean_string(Rest, <<Acc/binary, $\\, $\\>>, Opts);
|
is_clean(_) -> false.
|
||||||
clean_string(<<$\b, Rest/binary>>, Acc, Opts=#opts{json_escape=true}) ->
|
|
||||||
clean_string(Rest, <<Acc/binary, $\\, $b>>, Opts);
|
|
||||||
clean_string(<<$\f, Rest/binary>>, Acc, Opts=#opts{json_escape=true}) ->
|
clean_string(Bin, _Acc, Opts=#opts{loose_unicode=false}) -> ?error([Bin, Opts]);
|
||||||
clean_string(Rest, <<Acc/binary, $\\, $f>>, Opts);
|
clean_string(<<>>, Acc, _Opts) -> unicode:characters_to_binary(lists:reverse(Acc));
|
||||||
clean_string(<<$\n, Rest/binary>>, Acc, Opts=#opts{json_escape=true}) ->
|
clean_string(<<X/utf8, Rest/binary>>, Acc, Opts) -> clean_string(Rest, [X] ++ Acc, Opts);
|
||||||
clean_string(Rest, <<Acc/binary, $\\, $n>>, Opts);
|
|
||||||
clean_string(<<$\r, Rest/binary>>, Acc, Opts=#opts{json_escape=true}) ->
|
|
||||||
clean_string(Rest, <<Acc/binary, $\\, $r>>, Opts);
|
|
||||||
clean_string(<<$\t, Rest/binary>>, Acc, Opts=#opts{json_escape=true}) ->
|
|
||||||
clean_string(Rest, <<Acc/binary, $\\, $t>>, Opts);
|
|
||||||
clean_string(<<$/, Rest/binary>>, Acc, Opts=#opts{json_escape=true, escape_forward_slash=true}) ->
|
|
||||||
clean_string(Rest, <<Acc/binary, $\\, $/>>, Opts);
|
|
||||||
clean_string(<<16#2028/utf8, Rest/binary>>, Acc, Opts=#opts{json_escape=true, no_jsonp_escapes=true}) ->
|
|
||||||
clean_string(Rest, <<Acc/binary, 16#2028/utf8>>, Opts);
|
|
||||||
clean_string(<<16#2029/utf8, Rest/binary>>, Acc, Opts=#opts{json_escape=true, no_jsonp_escapes=true}) ->
|
|
||||||
clean_string(Rest, <<Acc/binary, 16#2029/utf8>>, Opts);
|
|
||||||
clean_string(<<16#2028/utf8, Rest/binary>>, Acc, Opts=#opts{json_escape=true}) ->
|
|
||||||
clean_string(Rest, <<Acc/binary, (json_escape_sequence(16#2028))/binary>>, Opts);
|
|
||||||
clean_string(<<16#2029/utf8, Rest/binary>>, Acc, Opts=#opts{json_escape=true}) ->
|
|
||||||
clean_string(Rest, <<Acc/binary, (json_escape_sequence(16#2029))/binary>>, Opts);
|
|
||||||
clean_string(<<C/utf8, Rest/binary>>, Acc, Opts=#opts{json_escape=true}) when C < 32 ->
|
|
||||||
clean_string(Rest, <<Acc/binary, (json_escape_sequence(C))/binary>>, Opts);
|
|
||||||
clean_string(<<C/utf8, Rest/binary>>, Acc, Opts) when C < 16#fdd0 ->
|
|
||||||
clean_string(Rest, <<Acc/binary, C/utf8>>, Opts);
|
|
||||||
clean_string(<<C/utf8, Rest/binary>>, Acc, Opts) when C > 16#fdef, C < 16#fffe ->
|
|
||||||
clean_string(Rest, <<Acc/binary, C/utf8>>, Opts);
|
|
||||||
clean_string(<<C/utf8, Rest/binary>>, Acc, Opts)
|
|
||||||
when C > 16#ffff andalso
|
|
||||||
C =/= 16#1fffe andalso C =/= 16#1ffff andalso
|
|
||||||
C =/= 16#2fffe andalso C =/= 16#2ffff andalso
|
|
||||||
C =/= 16#3fffe andalso C =/= 16#3ffff andalso
|
|
||||||
C =/= 16#4fffe andalso C =/= 16#4ffff andalso
|
|
||||||
C =/= 16#5fffe andalso C =/= 16#5ffff andalso
|
|
||||||
C =/= 16#6fffe andalso C =/= 16#6ffff andalso
|
|
||||||
C =/= 16#7fffe andalso C =/= 16#7ffff andalso
|
|
||||||
C =/= 16#8fffe andalso C =/= 16#8ffff andalso
|
|
||||||
C =/= 16#9fffe andalso C =/= 16#9ffff andalso
|
|
||||||
C =/= 16#afffe andalso C =/= 16#affff andalso
|
|
||||||
C =/= 16#bfffe andalso C =/= 16#bffff andalso
|
|
||||||
C =/= 16#cfffe andalso C =/= 16#cffff andalso
|
|
||||||
C =/= 16#dfffe andalso C =/= 16#dffff andalso
|
|
||||||
C =/= 16#efffe andalso C =/= 16#effff andalso
|
|
||||||
C =/= 16#ffffe andalso C =/= 16#fffff andalso
|
|
||||||
C =/= 16#10fffe andalso C =/= 16#10ffff ->
|
|
||||||
clean_string(Rest, <<Acc/binary, C/utf8>>, Opts);
|
|
||||||
%% surrogates
|
%% surrogates
|
||||||
clean_string(<<237, X, _, Rest/binary>>, Acc, Opts=#opts{loose_unicode=true}) when X >= 160 ->
|
clean_string(<<237, X, _, Rest/binary>>, Acc, Opts) when X >= 160 -> clean_string(Rest, [16#fffd] ++ Acc, Opts);
|
||||||
clean_string(Rest, <<Acc/binary, 16#fffd/utf8>>, Opts);
|
%% bad codepoints
|
||||||
%% private use noncharacters
|
clean_string(<<_, Rest/binary>>, Acc, Opts) -> clean_string(Rest, [16#fffd] ++ Acc, Opts).
|
||||||
clean_string(<<239, 183, X, Rest/binary>>, Acc, Opts=#opts{loose_unicode=true}) when X >= 143, X =< 175 ->
|
|
||||||
clean_string(Rest, <<Acc/binary, 16#fffd/utf8>>, Opts);
|
|
||||||
%% u+fffe and u+ffff
|
|
||||||
clean_string(<<239, 191, X, Rest/binary>>, Acc, Opts=#opts{loose_unicode=true}) when X == 190; X == 191 ->
|
|
||||||
clean_string(Rest, <<Acc/binary, 16#fffd/utf8>>, Opts);
|
|
||||||
%% the u+Xfffe and u+Xffff noncharacters
|
|
||||||
clean_string(<<X, Y, 191, Z, Rest/binary>>, Acc, Opts=#opts{loose_unicode=true}) when (
|
|
||||||
(X == 240 andalso Y == 159) orelse
|
|
||||||
(X == 240 andalso Y == 175) orelse
|
|
||||||
(X == 240 andalso Y == 191) orelse
|
|
||||||
(
|
|
||||||
(X == 241 orelse X == 242 orelse X == 243) andalso
|
|
||||||
(Y == 143 orelse Y == 159 orelse Y == 175 orelse Y == 191)
|
|
||||||
) orelse
|
|
||||||
(X == 244 andalso Y == 143)
|
|
||||||
) andalso (Z == 190 orelse Z == 191) ->
|
|
||||||
clean_string(Rest, <<Acc/binary, 16#fffd/utf8>>, Opts);
|
|
||||||
clean_string(<<_, Rest/binary>>, Acc, Opts=#opts{loose_unicode=true}) ->
|
|
||||||
clean_string(Rest, <<Acc/binary, 16#fffd/utf8>>, Opts);
|
|
||||||
clean_string(<<>>, Acc, _) -> Acc;
|
|
||||||
clean_string(Bin, _Acc, Opts) -> erlang:error(badarg, [Bin, Opts]).
|
|
||||||
|
|
||||||
|
|
||||||
%% convert a codepoint to it's \uXXXX equiv.
|
|
||||||
json_escape_sequence(X) ->
|
|
||||||
<<A:4, B:4, C:4, D:4>> = <<X:16>>,
|
|
||||||
unicode:characters_to_binary([$\\, $u, (to_hex(A)), (to_hex(B)), (to_hex(C)), (to_hex(D))]).
|
|
||||||
|
|
||||||
|
|
||||||
to_hex(10) -> $a;
|
|
||||||
to_hex(11) -> $b;
|
|
||||||
to_hex(12) -> $c;
|
|
||||||
to_hex(13) -> $d;
|
|
||||||
to_hex(14) -> $e;
|
|
||||||
to_hex(15) -> $f;
|
|
||||||
to_hex(X) -> X + 48. %% ascii "1" is [49], "2" is [50], etc...
|
|
||||||
|
|
||||||
-ifdef(TEST).
|
-ifdef(TEST).
|
||||||
-include_lib("eunit/include/eunit.hrl").
|
-include_lib("eunit/include/eunit.hrl").
|
||||||
|
|
||||||
|
@ -276,26 +209,6 @@ encode_test_() ->
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
].
|
].
|
||||||
|
|
||||||
noncharacters_test_() ->
|
|
||||||
[
|
|
||||||
{"noncharacters - badjson",
|
|
||||||
?_assertEqual(check_bad(noncharacters()), [])
|
|
||||||
},
|
|
||||||
{"noncharacters - replaced",
|
|
||||||
?_assertEqual(check_replaced(noncharacters()), [])
|
|
||||||
}
|
|
||||||
].
|
|
||||||
|
|
||||||
extended_noncharacters_test_() ->
|
|
||||||
[
|
|
||||||
{"extended noncharacters - badjson",
|
|
||||||
?_assertEqual(check_bad(extended_noncharacters()), [])
|
|
||||||
},
|
|
||||||
{"extended noncharacters - replaced",
|
|
||||||
?_assertEqual(check_replaced(extended_noncharacters()), [])
|
|
||||||
}
|
|
||||||
].
|
|
||||||
|
|
||||||
surrogates_test_() ->
|
surrogates_test_() ->
|
||||||
[
|
[
|
||||||
|
@ -306,16 +219,6 @@ surrogates_test_() ->
|
||||||
?_assertEqual(check_replaced(surrogates()), [])
|
?_assertEqual(check_replaced(surrogates()), [])
|
||||||
}
|
}
|
||||||
].
|
].
|
||||||
|
|
||||||
reserved_test_() ->
|
|
||||||
[
|
|
||||||
{"reserved noncharacters - badjson",
|
|
||||||
?_assertEqual(check_bad(reserved_space()), [])
|
|
||||||
},
|
|
||||||
{"reserved noncharacters - replaced",
|
|
||||||
?_assertEqual(check_replaced(reserved_space()), [])
|
|
||||||
}
|
|
||||||
].
|
|
||||||
|
|
||||||
good_characters_test_() ->
|
good_characters_test_() ->
|
||||||
[
|
[
|
||||||
|
@ -387,26 +290,11 @@ check([H|T], Opts, Acc) ->
|
||||||
check(T, Opts, [{H, R}] ++ Acc).
|
check(T, Opts, [{H, R}] ++ Acc).
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
noncharacters() -> lists:seq(16#fffe, 16#ffff).
|
|
||||||
|
|
||||||
extended_noncharacters() ->
|
|
||||||
[16#1fffe, 16#1ffff, 16#2fffe, 16#2ffff]
|
|
||||||
++ [16#3fffe, 16#3ffff, 16#4fffe, 16#4ffff]
|
|
||||||
++ [16#5fffe, 16#5ffff, 16#6fffe, 16#6ffff]
|
|
||||||
++ [16#7fffe, 16#7ffff, 16#8fffe, 16#8ffff]
|
|
||||||
++ [16#9fffe, 16#9ffff, 16#afffe, 16#affff]
|
|
||||||
++ [16#bfffe, 16#bffff, 16#cfffe, 16#cffff]
|
|
||||||
++ [16#dfffe, 16#dffff, 16#efffe, 16#effff]
|
|
||||||
++ [16#ffffe, 16#fffff, 16#10fffe, 16#10ffff].
|
|
||||||
|
|
||||||
surrogates() -> lists:seq(16#d800, 16#dfff).
|
surrogates() -> lists:seq(16#d800, 16#dfff).
|
||||||
|
|
||||||
reserved_space() -> lists:seq(16#fdd0, 16#fdef).
|
good() -> lists:seq(1, 16#d7ff) ++ lists:seq(16#e000, 16#ffff).
|
||||||
|
|
||||||
good() -> lists:seq(1, 16#d7ff) ++ lists:seq(16#e000, 16#fdcf) ++ lists:seq(16#fdf0, 16#fffd).
|
|
||||||
|
|
||||||
good_extended() -> lists:seq(16#100000, 16#10fffd).
|
good_extended() -> lists:seq(16#100000, 16#10ffff).
|
||||||
|
|
||||||
%% erlang refuses to encode certain codepoints, so fake them all
|
%% erlang refuses to encode certain codepoints, so fake them all
|
||||||
to_fake_utf(N, utf8) when N < 16#0080 -> <<N:8>>;
|
to_fake_utf(N, utf8) when N < 16#0080 -> <<N:8>>;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue