diff --git a/src/jsx_decoder.erl b/src/jsx_decoder.erl index 4a049ed..da3bdd2 100644 --- a/src/jsx_decoder.erl +++ b/src/jsx_decoder.erl @@ -338,11 +338,8 @@ key(Bin, Handler, Stack, Config) -> ?error(key, Bin, Handler, Stack, Config). -%% explicitly whitelist ascii set for faster parsing. really? really. someone should -%% submit a patch that unrolls simple guards %% note that if you encounter an error from string and you can't find the clause that %% caused it here, it might be in unescape below - string(Bin, Handler, Stack, Config) -> string(Bin, Handler, [], Stack, Config). @@ -355,9 +352,10 @@ string(<>, Handler, Acc, Stack, Config) -> string(Rest, Handler, [Acc, maybe_replace(?solidus, Config)], Stack, Config); string(<>, Handler, Acc, Stack, Config) -> unescape(Rest, Handler, Acc, Stack, Config); -string(<>, Handler, Acc, Stack, Config=#config{uescape=true}) -> +%% TODO this is pretty gross and i don't like it +string(<> = Bin, Handler, Acc, Stack, Config=#config{uescape=true}) -> case X of - X when X < 16#80 -> string(Rest, Handler, [Acc, X], Stack, Config); + X when X < 16#80 -> count(Bin, Handler, Acc, Stack, Config); X -> string(Rest, Handler, [Acc, json_escape_sequence(X)], Stack, Config) end; %% u+2028 @@ -367,14 +365,11 @@ string(<<226, 128, 168, Rest/binary>>, Handler, Acc, Stack, Config) -> string(<<226, 128, 169, Rest/binary>>, Handler, Acc, Stack, Config) -> string(Rest, Handler, [Acc, maybe_replace(16#2029, Config)], Stack, Config); string(<<_/utf8, _/binary>> = Bin, Handler, Acc, Stack, Config) -> - Size = count(Bin, 0, Config), - <> = Bin, - string(Rest, Handler, [Acc, Clean], Stack, Config); -%% really, really dirty strings. if there's no valid utf8 we never reach `count` -%% and things get replaced instead of ignored + count(Bin, Handler, Acc, Stack, Config); +%% necessary for bytes that are badly formed utf8 that won't match in `count` string(<>, Handler, Acc, Stack, Config=#config{dirty_strings=true}) -> string(Rest, Handler, [Acc, X], Stack, Config); -%% u+fffe and u+ffff for R14BXX (subsequent runtimes will happily match preceeding +%% u+fffe and u+ffff for R14BXX (subsequent runtimes will happily match with /utf8 string(<<239, 191, 190, Rest/binary>>, Handler, Acc, Stack, Config) -> string(Rest, Handler, [Acc, <<16#fffe/utf8>>], Stack, Config); string(<<239, 191, 191, Rest/binary>>, Handler, Acc, Stack, Config) -> @@ -411,6 +406,14 @@ string(<<_, Rest/binary>>, Handler, Acc, Stack, Config=#config{strict_utf8=false string(Bin, Handler, Acc, Stack, Config) -> ?error(string, Bin, Handler, Acc, Stack, Config). +count(Bin, Handler, Acc, Stack, Config) -> + Size = count(Bin, 0, Config), + <> = Bin, + string(Rest, Handler, [Acc, Clean], Stack, Config). + + +%% explicitly whitelist ascii set for faster parsing. really? really. someone should +%% submit a patch that unrolls simple guards count(<<0, Rest/binary>>, N, Config) -> count(Rest, N + 1, Config); count(<<1, Rest/binary>>, N, Config) -> @@ -666,13 +669,12 @@ count(<<127, Rest/binary>>, N, Config) -> count(<<_, Rest/binary>>, N, Config=#config{dirty_strings=true}) -> count(Rest, N + 1, Config); count(<<_/utf8, _/binary>>, N, #config{uescape=true}) -> N; -%% u+2028 -count(<<226, 128, 168, _/binary>>, N, _) -> N; -%% u+2029 -count(<<226, 128, 169, _/binary>>, N, _) -> N; count(<>, N, Config) -> case X of X when X < 16#800 -> count(Rest, N + 2, Config); + %% jsonp escaping + 16#2028 -> N; + 16#2029 -> N; X when X < 16#10000 -> count(Rest, N + 3, Config); _ -> count(Rest, N + 4, Config) end; @@ -1007,7 +1009,6 @@ finish_number(Rest, Handler, Acc, Stack, Config) -> format_number({integer, Acc}) -> {integer, binary_to_integer(Acc)}; format_number({float, Acc}) -> {float, binary_to_float(Acc)}. -endif. - -ifdef(no_binary_to_whatever). format_number({integer, Acc}) -> {integer, list_to_integer(unicode:characters_to_list(Acc))}; format_number({float, Acc}) -> {float, list_to_float(unicode:characters_to_list(Acc))}. @@ -1471,6 +1472,7 @@ codepoints() -> [16#60000, 16#70000, 16#80000, 16#90000, 16#a0000, 16#b0000] ++ [16#c0000, 16#d0000, 16#e0000, 16#f0000, 16#100000]. + surrogates() -> lists:seq(16#d800, 16#dfff). diff --git a/src/jsx_parser.erl b/src/jsx_parser.erl index 7113c71..a274574 100644 --- a/src/jsx_parser.erl +++ b/src/jsx_parser.erl @@ -130,6 +130,7 @@ value(BadTokens, Handler, Stack, Config) when is_list(BadTokens) -> value(Token, Handler, Stack, Config) -> value([Token], Handler, Stack, Config). + object([end_object|Tokens], Handler, [object|Stack], Config) -> maybe_done(Tokens, handle_event(end_object, Handler, Config), Stack, Config); object([{key, Key}|Tokens], Handler, Stack, Config) @@ -153,6 +154,7 @@ object([], Handler, Stack, Config) -> object(Token, Handler, Stack, Config) -> object([Token], Handler, Stack, Config). + array([end_array|Tokens], Handler, [array|Stack], Config) -> maybe_done(Tokens, handle_event(end_array, Handler, Config), Stack, Config); array([], Handler, Stack, Config) -> @@ -162,6 +164,7 @@ array(Tokens, Handler, Stack, Config) when is_list(Tokens) -> array(Token, Handler, Stack, Config) -> array([Token], Handler, Stack, Config). + maybe_done([end_json], Handler, [], Config) -> done([end_json], Handler, [], Config); maybe_done(Tokens, Handler, [object|_] = Stack, Config) when is_list(Tokens) -> @@ -175,6 +178,7 @@ maybe_done(BadTokens, Handler, Stack, Config) when is_list(BadTokens) -> maybe_done(Token, Handler, Stack, Config) -> maybe_done([Token], Handler, Stack, Config). + done([], Handler, [], Config=#config{stream=true}) -> incomplete(done, Handler, [], Config); done(Tokens, Handler, [], Config) when Tokens == [end_json]; Tokens == [] -> @@ -195,23 +199,89 @@ clean_string(Bin, #config{dirty_strings=true}) -> Bin; clean_string(Bin, Config) -> clean(Bin, [], Config). -clean(<<>>, Acc, _) -> iolist_to_binary(Acc); -clean(<>, Acc, Config) when X < 16#20 -> - clean(Rest, [Acc, maybe_replace(X, Config)], Config); +%% unroll the control characters +clean(<<0, Rest/binary>>, Acc, Config) -> + clean(Rest, [Acc, maybe_replace(0, Config)], Config); +clean(<<1, Rest/binary>>, Acc, Config) -> + clean(Rest, [Acc, maybe_replace(1, Config)], Config); +clean(<<2, Rest/binary>>, Acc, Config) -> + clean(Rest, [Acc, maybe_replace(2, Config)], Config); +clean(<<3, Rest/binary>>, Acc, Config) -> + clean(Rest, [Acc, maybe_replace(3, Config)], Config); +clean(<<4, Rest/binary>>, Acc, Config) -> + clean(Rest, [Acc, maybe_replace(4, Config)], Config); +clean(<<5, Rest/binary>>, Acc, Config) -> + clean(Rest, [Acc, maybe_replace(5, Config)], Config); +clean(<<6, Rest/binary>>, Acc, Config) -> + clean(Rest, [Acc, maybe_replace(6, Config)], Config); +clean(<<7, Rest/binary>>, Acc, Config) -> + clean(Rest, [Acc, maybe_replace(7, Config)], Config); +clean(<<8, Rest/binary>>, Acc, Config) -> + clean(Rest, [Acc, maybe_replace(8, Config)], Config); +clean(<<9, Rest/binary>>, Acc, Config) -> + clean(Rest, [Acc, maybe_replace(9, Config)], Config); +clean(<<10, Rest/binary>>, Acc, Config) -> + clean(Rest, [Acc, maybe_replace(10, Config)], Config); +clean(<<11, Rest/binary>>, Acc, Config) -> + clean(Rest, [Acc, maybe_replace(11, Config)], Config); +clean(<<12, Rest/binary>>, Acc, Config) -> + clean(Rest, [Acc, maybe_replace(12, Config)], Config); +clean(<<13, Rest/binary>>, Acc, Config) -> + clean(Rest, [Acc, maybe_replace(13, Config)], Config); +clean(<<14, Rest/binary>>, Acc, Config) -> + clean(Rest, [Acc, maybe_replace(14, Config)], Config); +clean(<<15, Rest/binary>>, Acc, Config) -> + clean(Rest, [Acc, maybe_replace(15, Config)], Config); +clean(<<16, Rest/binary>>, Acc, Config) -> + clean(Rest, [Acc, maybe_replace(16, Config)], Config); +clean(<<17, Rest/binary>>, Acc, Config) -> + clean(Rest, [Acc, maybe_replace(17, Config)], Config); +clean(<<18, Rest/binary>>, Acc, Config) -> + clean(Rest, [Acc, maybe_replace(18, Config)], Config); +clean(<<19, Rest/binary>>, Acc, Config) -> + clean(Rest, [Acc, maybe_replace(19, Config)], Config); +clean(<<20, Rest/binary>>, Acc, Config) -> + clean(Rest, [Acc, maybe_replace(20, Config)], Config); +clean(<<21, Rest/binary>>, Acc, Config) -> + clean(Rest, [Acc, maybe_replace(21, Config)], Config); +clean(<<22, Rest/binary>>, Acc, Config) -> + clean(Rest, [Acc, maybe_replace(22, Config)], Config); +clean(<<23, Rest/binary>>, Acc, Config) -> + clean(Rest, [Acc, maybe_replace(23, Config)], Config); +clean(<<24, Rest/binary>>, Acc, Config) -> + clean(Rest, [Acc, maybe_replace(24, Config)], Config); +clean(<<25, Rest/binary>>, Acc, Config) -> + clean(Rest, [Acc, maybe_replace(25, Config)], Config); +clean(<<26, Rest/binary>>, Acc, Config) -> + clean(Rest, [Acc, maybe_replace(26, Config)], Config); +clean(<<27, Rest/binary>>, Acc, Config) -> + clean(Rest, [Acc, maybe_replace(27, Config)], Config); +clean(<<28, Rest/binary>>, Acc, Config) -> + clean(Rest, [Acc, maybe_replace(28, Config)], Config); +clean(<<29, Rest/binary>>, Acc, Config) -> + clean(Rest, [Acc, maybe_replace(29, Config)], Config); +clean(<<30, Rest/binary>>, Acc, Config) -> + clean(Rest, [Acc, maybe_replace(30, Config)], Config); +clean(<<31, Rest/binary>>, Acc, Config) -> + clean(Rest, [Acc, maybe_replace(31, Config)], Config); clean(<<34, Rest/binary>>, Acc, Config) -> clean(Rest, [Acc, maybe_replace(34, Config)], Config); clean(<<47, Rest/binary>>, Acc, Config) -> clean(Rest, [Acc, maybe_replace(47, Config)], Config); clean(<<92, Rest/binary>>, Acc, Config) -> clean(Rest, [Acc, maybe_replace(92, Config)], Config); -clean(<>, Acc, Config=#config{uescape=true}) when X >= 16#80 -> - clean(Rest, [Acc, json_escape_sequence(X)], Config); -clean(<>, Acc, Config) when X == 16#2028; X == 16#2029 -> - clean(Rest, [Acc, maybe_replace(X, Config)], Config); -clean(<<_/utf8, _/binary>> = Bin, Acc, Config) -> - Size = count(Bin, 0, Config), - <> = Bin, - clean(Rest, [Acc, Clean], Config); +clean(<> = Bin, Acc, Config=#config{uescape=true}) -> + case X of + X when X < 16#80 -> start_count(Bin, Acc, Config); + _ -> clean(Rest, [Acc, json_escape_sequence(X)], Config) + end; +%% u+2028 +clean(<<226, 128, 168, Rest/binary>>, Acc, Config) -> + clean(Rest, [Acc, maybe_replace(16#2028, Config)], Config); +%% u+2029 +clean(<<226, 128, 169, Rest/binary>>, Acc, Config) -> + clean(Rest, [Acc, maybe_replace(16#2029, Config)], Config); +clean(<<_/utf8, _/binary>> = Bin, Acc, Config) -> start_count(Bin, Acc, Config); %% surrogates clean(<<237, X, _, Rest/binary>>, Acc, Config) when X >= 160 -> clean(Rest, [Acc, maybe_replace(surrogate, Config)], Config); @@ -225,10 +295,17 @@ clean(<>, Acc, Config) when X >= 224, X =< 239 -> clean(<>, Acc, Config) when X >= 240, X =< 247 -> clean(strip_continuations(Rest, 3), [Acc, maybe_replace(badutf, Config)], Config); clean(<<_, Rest/binary>>, Acc, Config) -> - clean(Rest, [Acc, maybe_replace(badutf, Config)], Config). + clean(Rest, [Acc, maybe_replace(badutf, Config)], Config); +clean(<<>>, Acc, _) -> iolist_to_binary(Acc). -count(<<>>, N, _) -> N; +start_count(Bin, Acc, Config) -> + Size = count(Bin, 0, Config), + <> = Bin, + clean(Rest, [Acc, Clean], Config). + + +%% again, unrolling ascii makes a huge difference. sadly count(<<0, _/binary>>, N, _) -> N; count(<<1, _/binary>>, N, _) -> N; count(<<2, _/binary>>, N, _) -> N; @@ -451,17 +528,16 @@ count(<<126, Rest/binary>>, N, Config) -> count(<<127, Rest/binary>>, N, Config) -> count(Rest, N + 1, Config); count(<<_/utf8, _/binary>>, N, #config{uescape=true}) -> N; -%% u+2028 -count(<<226, 128, 168, _/binary>>, N, _) -> N; -%% u+2029 -count(<<226, 128, 169, _/binary>>, N, _) -> N; count(<>, N, Config) -> case X of X when X < 16#800 -> count(Rest, N + 2, Config); + 16#2028 -> N; + 16#2029 -> N; X when X < 16#10000 -> count(Rest, N + 3, Config); _ -> count(Rest, N + 4, Config) end; -count(<<_, _/binary>>, N, _) -> N. +count(<<_, _/binary>>, N, _) -> N; +count(<<>>, N, _) -> N. strip_continuations(Bin, 0) -> Bin; @@ -524,6 +600,7 @@ to_hex(X) -> X + 48. %% ascii "1" is [49], "2" is [50], etc... init([]) -> []. + -spec handle_event(Event::any(), Acc::list()) -> list(). handle_event(end_json, State) -> lists:reverse(State); @@ -631,6 +708,7 @@ codepoints() -> ++ lists:seq(16#e000, 16#ffff) ). + extended_codepoints() -> unicode:characters_to_binary( lists:seq(16#10000, 16#1ffff) ++ [ @@ -640,13 +718,16 @@ extended_codepoints() -> ] ). + surrogates() -> [ to_fake_utf8(N) || N <- lists:seq(16#d800, 16#dfff) ]. + clean_string_helper(String) -> try clean_string(String, #config{strict_utf8=true}) of Clean -> Clean catch error:badarg -> {error, badarg} end. + clean_string_test_() -> [ {"clean codepoints", ?_assertEqual( @@ -1069,4 +1150,5 @@ datetime_test_() -> )} ]. + -endif.