From ecdd34fa378e91556d1cd5907f574d682984c37e Mon Sep 17 00:00:00 2001 From: alisdair sullivan Date: Sat, 30 Jul 2011 23:52:29 -0700 Subject: [PATCH] building strings as lists of codepoints, then converting them to binaries is faster than building binaries directly, or slicing segments out of binaries in the case of utf8. by more than 25% --- src/jsx_decoder.hrl | 71 ++++++++++++++++++++++----------------------- 1 file changed, 35 insertions(+), 36 deletions(-) diff --git a/src/jsx_decoder.hrl b/src/jsx_decoder.hrl index 714cb3d..c5dda0a 100644 --- a/src/jsx_decoder.hrl +++ b/src/jsx_decoder.hrl @@ -354,12 +354,12 @@ key(Bin, Stack, Opts) -> %% string uses partial_utf/1 to cease parsing when invalid encodings are %% encountered rather than just checking remaining binary size like other %% states -string(Bin, Stack, Opts) -> string(Bin, Stack, Opts, <<>>). +string(Bin, Stack, Opts) -> string(Bin, Stack, Opts, []). string(<>, [key|_] = Stack, Opts, Acc) -> - {jsx, {key, Acc}, fun() -> colon(Rest, Stack, Opts) end}; + {jsx, {key, unicode:characters_to_binary(lists:reverse(Acc))}, fun() -> colon(Rest, Stack, Opts) end}; string(<>, Stack, Opts, Acc) -> - {jsx, {string, Acc}, fun() -> + {jsx, {string, unicode:characters_to_binary(lists:reverse(Acc))}, fun() -> maybe_done(Rest, Stack, Opts) end}; string(<>, Stack, Opts, Acc) -> @@ -369,11 +369,11 @@ string(<>, Stack, Opts, Acc) -> %% the range 32..16#fdcf is safe, so allow that string(<>, Stack, Opts, Acc) when ?is_noncontrol(S), S < 16#fdd0 -> - string(Rest, Stack, Opts, <>); + string(Rest, Stack, Opts, [S] ++ Acc); %% the range 16#fdf0..16#fffd is also safe string(<>, Stack, Opts, Acc) when S > 16#fdef, S < 16#fffe -> - string(Rest, Stack, Opts, <>); + string(Rest, Stack, Opts, [S] ++ Acc); %% i think doing it like this is faster than just putting this clause first. %% yes, i think it's insane too string(<>, Stack, Opts, Acc) @@ -394,7 +394,7 @@ string(<>, Stack, Opts, Acc) S =/= 16#efffe andalso S =/= 16#effff andalso S =/= 16#ffffe andalso S =/= 16#fffff andalso S =/= 16#10fffe andalso S =/= 16#10ffff -> - string(Rest, Stack, Opts, <>); + string(Rest, Stack, Opts, [S] ++ Acc); string(Bin, Stack, Opts, Acc) -> case partial_utf(Bin) of true -> @@ -476,14 +476,14 @@ partial_utf(_) -> false. %% non-characters erlang doesn't recognize as non-characters, idiotically noncharacter(<>, Stack, Opts, Acc) when ?is_noncontrol(S) -> - string(Rest, Stack, Opts, <>); + string(Rest, Stack, Opts, [16#fffd] ++ Acc); %% u+fffe and u+ffff noncharacter(<<239, 191, X, Rest/binary>>, Stack, Opts, Acc) when X == 190; X == 191 -> - string(Rest, Stack, Opts, <>); + string(Rest, Stack, Opts, [16#fffd] ++ Acc); %% surrogates noncharacter(<<237, X, _, Rest/binary>>, Stack, Opts, Acc) when X >= 160 -> - string(Rest, Stack, Opts, <>); + string(Rest, Stack, Opts, [16#fffd] ++ Acc); noncharacter(Bin, _Stack, _Opts, _Acc) -> {error, {badjson, Bin}}. -endif. @@ -492,15 +492,15 @@ noncharacter(Bin, _Stack, _Opts, _Acc) -> %% non-characters blah blah noncharacter(<>, Stack, Opts, Acc) when ?is_noncontrol(S) -> - string(Rest, Stack, Opts, <>); + string(Rest, Stack, Opts, [16#fffd] ++ Acc); %% u+ffff and u+fffe noncharacter(<<255, X, Rest/binary>>, Stack, Opts, Acc) when X == 254; X == 255 -> - string(Rest, Stack, Opts, <>); + string(Rest, Stack, Opts, [16#fffd] ++ Acc); %% surrogates noncharacter(<>, Stack, Opts, Acc) when X >= 216, X =< 223 -> - string(Rest, Stack, Opts, <>); + string(Rest, Stack, Opts, [16#fffd] ++ Acc); noncharacter(Bin, _Stack, _Opts, _Acc) -> {error, {badjson, Bin}}. -endif. @@ -509,15 +509,15 @@ noncharacter(Bin, _Stack, _Opts, _Acc) -> %% non-characters blah blah noncharacter(<>, Stack, Opts, Acc) when ?is_noncontrol(S) -> - string(Rest, Stack, Opts, <>); + string(Rest, Stack, Opts, [16#fffd] ++ Acc); %% u+ffff and u+fffe noncharacter(<>, Stack, Opts, Acc) when X == 254; X == 255 -> - string(Rest, Stack, Opts, <>); + string(Rest, Stack, Opts, [16#fffd] ++ Acc); %% surrogates noncharacter(<<_, X, Rest/binary>>, Stack, Opts, Acc) when X >= 216, X =< 223 -> - string(Rest, Stack, Opts, <>); + string(Rest, Stack, Opts, [16#fffd] ++ Acc); noncharacter(Bin, _Stack, _Opts, _Acc) -> {error, {badjson, Bin}}. -endif. @@ -526,15 +526,15 @@ noncharacter(Bin, _Stack, _Opts, _Acc) -> %% non-characters blah blah noncharacter(<>, Stack, Opts, Acc) when ?is_noncontrol(S) -> - string(Rest, Stack, Opts, <>); + string(Rest, Stack, Opts, [16#fffd] ++ Acc); %% u+ffff and u+fffe noncharacter(<<0, 0, 255, X, Rest/binary>>, Stack, Opts, Acc) when X == 254; X == 255 -> - string(Rest, Stack, Opts, <>); + string(Rest, Stack, Opts, [16#fffd] ++ Acc); %% surrogates noncharacter(<<0, 0, X, _, Rest/binary>>, Stack, Opts, Acc) when X >= 216, X =< 223 -> - string(Rest, Stack, Opts, <>); + string(Rest, Stack, Opts, [16#fffd] ++ Acc); noncharacter(Bin, _Stack, _Opts, _Acc) -> {error, {badjson, Bin}}. -endif. @@ -543,15 +543,15 @@ noncharacter(Bin, _Stack, _Opts, _Acc) -> %% non-characters blah blah noncharacter(<>, Stack, Opts, Acc) when ?is_noncontrol(S) -> - string(Rest, Stack, Opts, <>); + string(Rest, Stack, Opts, [16#fffd] ++ Acc); %% u+ffff and u+fffe noncharacter(<>, Stack, Opts, Acc) when X == 254; X == 255 -> - string(Rest, Stack, Opts, <>); + string(Rest, Stack, Opts, [16#fffd] ++ Acc); %% surrogates noncharacter(<<_, X, 0, 0, Rest/binary>>, Stack, Opts, Acc) when X >= 216, X =< 223 -> - string(Rest, Stack, Opts, <>); + string(Rest, Stack, Opts, [16#fffd] ++ Acc); noncharacter(Bin, _Stack, _Opts, _Acc) -> {error, {badjson, Bin}}. -endif. @@ -561,20 +561,20 @@ noncharacter(Bin, _Stack, _Opts, _Acc) -> %% escaped_unicode used to hold the codepoint sequence. unescessary, but nicer %% than using the string accumulator escape(<<$b/?utfx, Rest/binary>>, Stack, Opts, Acc) -> - string(Rest, Stack, Opts, <>); + string(Rest, Stack, Opts, "\b" ++ Acc); escape(<<$f/?utfx, Rest/binary>>, Stack, Opts, Acc) -> - string(Rest, Stack, Opts, <>); + string(Rest, Stack, Opts, "\f" ++ Acc); escape(<<$n/?utfx, Rest/binary>>, Stack, Opts, Acc) -> - string(Rest, Stack, Opts, <>); + string(Rest, Stack, Opts, "\n" ++ Acc); escape(<<$r/?utfx, Rest/binary>>, Stack, Opts, Acc) -> - string(Rest, Stack, Opts, <>); + string(Rest, Stack, Opts, "\r" ++ Acc); escape(<<$t/?utfx, Rest/binary>>, Stack, Opts, Acc) -> - string(Rest, Stack, Opts, <>); + string(Rest, Stack, Opts, "\t" ++ Acc); escape(<<$u/?utfx, Rest/binary>>, Stack, Opts, Acc) -> escaped_unicode(Rest, Stack, Opts, Acc, []); escape(<>, Stack, Opts, Acc) when S =:= ?quote; S =:= ?solidus; S =:= ?rsolidus -> - string(Rest, Stack, Opts, <>); + string(Rest, Stack, Opts, [S] ++ Acc); escape(Bin, Stack, Opts, Acc) -> case ?partial_codepoint(Bin) of true -> @@ -600,7 +600,7 @@ escaped_unicode(<>, Stack, Opts, String, [C, B, A]) ; X when X == 16#fffe; X == 16#ffff; X >= 16#fdd0, X =< 16#fdef -> case Opts#opts.loose_unicode of true -> - string(Rest, Stack, Opts, <>) + string(Rest, Stack, Opts, [16#fffd] ++ String) ; false -> {error, {badjson, <>}} end @@ -609,13 +609,13 @@ escaped_unicode(<>, Stack, Opts, String, [C, B, A]) ; X when X == 16#0000 -> case Opts#opts.loose_unicode of true -> - string(Rest, Stack, Opts, <>) + string(Rest, Stack, Opts, [16#fffd] ++ String) ; false -> {error, {badjson, <>}} end %% anything else ; X -> - string(Rest, Stack, Opts, <>) + string(Rest, Stack, Opts, [X] ++ String) end; escaped_unicode(<>, Stack, Opts, String, Acc) when ?is_hex(S) -> @@ -643,7 +643,7 @@ low_surrogate(<>, Stack, Opts, String, High) -> low_surrogate(<> = Bin, Stack, Opts, String, _) -> case Opts#opts.loose_unicode of true -> - string(Bin, Stack, Opts, <>) + string(Bin, Stack, Opts, [16#fffd] ++ String) ; false -> {error, {badjson, <>}} end; @@ -675,7 +675,7 @@ low_surrogate_u(<> = Bin, Stack, Opts, String, _) -> string(<>, Stack, Opts, - <> + [16#fffd] ++ String ) ; false -> {error, {badjson, <>}} @@ -710,14 +710,13 @@ low_surrogate(<>, Stack, Opts, String, [C, B, A], High) string(Rest, Stack, Opts, - <> + [16#fffd] ++ String ) ; false -> {error, {badjson, <>}} end - ; Y -> - io:format("~p ~p~n", [V, Y]), - string(Rest, Stack, Opts, <>) + ; _ -> + string(Rest, Stack, Opts, [V] ++ String) end %% not a low surrogate, bad bad bad ; _ ->