rework decoder and encoder slightly to be a little more sane

This commit is contained in:
alisdair sullivan 2014-12-11 06:57:49 +00:00
parent 3625aaeb66
commit 8592742ded
2 changed files with 118 additions and 34 deletions

View file

@ -338,11 +338,8 @@ key(Bin, Handler, Stack, Config) ->
?error(key, Bin, Handler, Stack, Config).
%% explicitly whitelist ascii set for faster parsing. really? really. someone should
%% submit a patch that unrolls simple guards
%% note that if you encounter an error from string and you can't find the clause that
%% caused it here, it might be in unescape below
%% entry point for parsing a string: delegates to string/5 with an empty
%% accumulator (`[]`)
string(Bin, Handler, Stack, Config) ->
string(Bin, Handler, [], Stack, Config).
@ -355,9 +352,10 @@ string(<<?solidus, Rest/binary>>, Handler, Acc, Stack, Config) ->
string(Rest, Handler, [Acc, maybe_replace(?solidus, Config)], Stack, Config);
string(<<?rsolidus/utf8, Rest/binary>>, Handler, Acc, Stack, Config) ->
unescape(Rest, Handler, Acc, Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config=#config{uescape=true}) ->
%% TODO this is pretty gross and i don't like it
string(<<X/utf8, Rest/binary>> = Bin, Handler, Acc, Stack, Config=#config{uescape=true}) ->
case X of
X when X < 16#80 -> string(Rest, Handler, [Acc, X], Stack, Config);
X when X < 16#80 -> count(Bin, Handler, Acc, Stack, Config);
X -> string(Rest, Handler, [Acc, json_escape_sequence(X)], Stack, Config)
end;
%% u+2028
@ -367,14 +365,11 @@ string(<<226, 128, 168, Rest/binary>>, Handler, Acc, Stack, Config) ->
string(<<226, 128, 169, Rest/binary>>, Handler, Acc, Stack, Config) ->
string(Rest, Handler, [Acc, maybe_replace(16#2029, Config)], Stack, Config);
string(<<_/utf8, _/binary>> = Bin, Handler, Acc, Stack, Config) ->
Size = count(Bin, 0, Config),
<<Clean:Size/binary, Rest/binary>> = Bin,
string(Rest, Handler, [Acc, Clean], Stack, Config);
%% really, really dirty strings. if there's no valid utf8 we never reach `count`
%% and things get replaced instead of ignored
count(Bin, Handler, Acc, Stack, Config);
%% necessary for bytes that are badly formed utf8 that won't match in `count`
string(<<X, Rest/binary>>, Handler, Acc, Stack, Config=#config{dirty_strings=true}) ->
string(Rest, Handler, [Acc, X], Stack, Config);
%% u+fffe and u+ffff for R14BXX (subsequent runtimes will happily match preceding
%% u+fffe and u+ffff for R14BXX (subsequent runtimes will happily match with /utf8)
string(<<239, 191, 190, Rest/binary>>, Handler, Acc, Stack, Config) ->
string(Rest, Handler, [Acc, <<16#fffe/utf8>>], Stack, Config);
string(<<239, 191, 191, Rest/binary>>, Handler, Acc, Stack, Config) ->
@ -411,6 +406,14 @@ string(<<_, Rest/binary>>, Handler, Acc, Stack, Config=#config{strict_utf8=false
string(Bin, Handler, Acc, Stack, Config) -> ?error(string, Bin, Handler, Acc, Stack, Config).
%% copies a run of bytes from the front of `Bin` into the accumulator in one
%% step instead of one byte at a time. count/3 (started at 0) reports how many
%% leading bytes to take; that prefix is appended to `Acc` as a single binary
%% and parsing resumes in string/5 with whatever follows it
count(Bin, Handler, Acc, Stack, Config) ->
%% `Size` must be bound before it can be used as a segment size below
Size = count(Bin, 0, Config),
<<Clean:Size/binary, Rest/binary>> = Bin,
string(Rest, Handler, [Acc, Clean], Stack, Config).
%% explicitly whitelist ascii set for faster parsing. really? really. someone should
%% submit a patch that unrolls simple guards
count(<<0, Rest/binary>>, N, Config) ->
count(Rest, N + 1, Config);
count(<<1, Rest/binary>>, N, Config) ->
@ -666,13 +669,12 @@ count(<<127, Rest/binary>>, N, Config) ->
count(<<_, Rest/binary>>, N, Config=#config{dirty_strings=true}) ->
count(Rest, N + 1, Config);
count(<<_/utf8, _/binary>>, N, #config{uescape=true}) -> N;
%% u+2028
count(<<226, 128, 168, _/binary>>, N, _) -> N;
%% u+2029
count(<<226, 128, 169, _/binary>>, N, _) -> N;
count(<<X/utf8, Rest/binary>>, N, Config) ->
case X of
X when X < 16#800 -> count(Rest, N + 2, Config);
%% jsonp escaping
16#2028 -> N;
16#2029 -> N;
X when X < 16#10000 -> count(Rest, N + 3, Config);
_ -> count(Rest, N + 4, Config)
end;
@ -1007,7 +1009,6 @@ finish_number(Rest, Handler, Acc, Stack, Config) ->
format_number({integer, Acc}) -> {integer, binary_to_integer(Acc)};
format_number({float, Acc}) -> {float, binary_to_float(Acc)}.
-endif.
-ifdef(no_binary_to_whatever).
format_number({integer, Acc}) -> {integer, list_to_integer(unicode:characters_to_list(Acc))};
format_number({float, Acc}) -> {float, list_to_float(unicode:characters_to_list(Acc))}.
@ -1471,6 +1472,7 @@ codepoints() ->
[16#60000, 16#70000, 16#80000, 16#90000, 16#a0000, 16#b0000] ++
[16#c0000, 16#d0000, 16#e0000, 16#f0000, 16#100000].
%% every integer from 16#d800 through 16#dfff inclusive (the utf-16 surrogate range)
surrogates() -> lists:seq(16#d800, 16#dfff).

View file

@ -130,6 +130,7 @@ value(BadTokens, Handler, Stack, Config) when is_list(BadTokens) ->
value(Token, Handler, Stack, Config) ->
value([Token], Handler, Stack, Config).
object([end_object|Tokens], Handler, [object|Stack], Config) ->
maybe_done(Tokens, handle_event(end_object, Handler, Config), Stack, Config);
object([{key, Key}|Tokens], Handler, Stack, Config)
@ -153,6 +154,7 @@ object([], Handler, Stack, Config) ->
object(Token, Handler, Stack, Config) ->
object([Token], Handler, Stack, Config).
array([end_array|Tokens], Handler, [array|Stack], Config) ->
maybe_done(Tokens, handle_event(end_array, Handler, Config), Stack, Config);
array([], Handler, Stack, Config) ->
@ -162,6 +164,7 @@ array(Tokens, Handler, Stack, Config) when is_list(Tokens) ->
array(Token, Handler, Stack, Config) ->
array([Token], Handler, Stack, Config).
maybe_done([end_json], Handler, [], Config) ->
done([end_json], Handler, [], Config);
maybe_done(Tokens, Handler, [object|_] = Stack, Config) when is_list(Tokens) ->
@ -175,6 +178,7 @@ maybe_done(BadTokens, Handler, Stack, Config) when is_list(BadTokens) ->
maybe_done(Token, Handler, Stack, Config) ->
maybe_done([Token], Handler, Stack, Config).
done([], Handler, [], Config=#config{stream=true}) ->
incomplete(done, Handler, [], Config);
done(Tokens, Handler, [], Config) when Tokens == [end_json]; Tokens == [] ->
@ -195,23 +199,89 @@ clean_string(Bin, #config{dirty_strings=true}) -> Bin;
clean_string(Bin, Config) -> clean(Bin, [], Config).
clean(<<>>, Acc, _) -> iolist_to_binary(Acc);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X < 16#20 ->
clean(Rest, [Acc, maybe_replace(X, Config)], Config);
%% unroll the control characters
clean(<<0, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(0, Config)], Config);
clean(<<1, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(1, Config)], Config);
clean(<<2, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(2, Config)], Config);
clean(<<3, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(3, Config)], Config);
clean(<<4, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(4, Config)], Config);
clean(<<5, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(5, Config)], Config);
clean(<<6, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(6, Config)], Config);
clean(<<7, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(7, Config)], Config);
clean(<<8, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(8, Config)], Config);
clean(<<9, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(9, Config)], Config);
clean(<<10, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(10, Config)], Config);
clean(<<11, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(11, Config)], Config);
clean(<<12, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(12, Config)], Config);
clean(<<13, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(13, Config)], Config);
clean(<<14, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(14, Config)], Config);
clean(<<15, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(15, Config)], Config);
clean(<<16, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(16, Config)], Config);
clean(<<17, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(17, Config)], Config);
clean(<<18, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(18, Config)], Config);
clean(<<19, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(19, Config)], Config);
clean(<<20, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(20, Config)], Config);
clean(<<21, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(21, Config)], Config);
clean(<<22, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(22, Config)], Config);
clean(<<23, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(23, Config)], Config);
clean(<<24, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(24, Config)], Config);
clean(<<25, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(25, Config)], Config);
clean(<<26, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(26, Config)], Config);
clean(<<27, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(27, Config)], Config);
clean(<<28, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(28, Config)], Config);
clean(<<29, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(29, Config)], Config);
clean(<<30, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(30, Config)], Config);
clean(<<31, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(31, Config)], Config);
clean(<<34, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(34, Config)], Config);
clean(<<47, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(47, Config)], Config);
clean(<<92, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(92, Config)], Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config=#config{uescape=true}) when X >= 16#80 ->
clean(Rest, [Acc, json_escape_sequence(X)], Config);
clean(<<X/utf8, Rest/binary>>, Acc, Config) when X == 16#2028; X == 16#2029 ->
clean(Rest, [Acc, maybe_replace(X, Config)], Config);
clean(<<_/utf8, _/binary>> = Bin, Acc, Config) ->
Size = count(Bin, 0, Config),
<<Clean:Size/binary, Rest/binary>> = Bin,
clean(Rest, [Acc, Clean], Config);
clean(<<X/utf8, Rest/binary>> = Bin, Acc, Config=#config{uescape=true}) ->
case X of
X when X < 16#80 -> start_count(Bin, Acc, Config);
_ -> clean(Rest, [Acc, json_escape_sequence(X)], Config)
end;
%% u+2028
clean(<<226, 128, 168, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(16#2028, Config)], Config);
%% u+2029
clean(<<226, 128, 169, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(16#2029, Config)], Config);
clean(<<_/utf8, _/binary>> = Bin, Acc, Config) -> start_count(Bin, Acc, Config);
%% surrogates
clean(<<237, X, _, Rest/binary>>, Acc, Config) when X >= 160 ->
clean(Rest, [Acc, maybe_replace(surrogate, Config)], Config);
@ -225,10 +295,17 @@ clean(<<X, Rest/binary>>, Acc, Config) when X >= 224, X =< 239 ->
clean(<<X, Rest/binary>>, Acc, Config) when X >= 240, X =< 247 ->
clean(strip_continuations(Rest, 3), [Acc, maybe_replace(badutf, Config)], Config);
clean(<<_, Rest/binary>>, Acc, Config) ->
clean(Rest, [Acc, maybe_replace(badutf, Config)], Config).
clean(Rest, [Acc, maybe_replace(badutf, Config)], Config);
clean(<<>>, Acc, _) -> iolist_to_binary(Acc).
count(<<>>, N, _) -> N;
%% copies a run of bytes from the front of `Bin` into the accumulator in one
%% step. count/3 (started at 0) reports how many leading bytes to take; that
%% prefix is appended to `Acc` as a single binary and cleaning resumes in
%% clean/3 with whatever follows it
start_count(Bin, Acc, Config) ->
%% `Size` must be bound before it can be used as a segment size below
Size = count(Bin, 0, Config),
<<Clean:Size/binary, Rest/binary>> = Bin,
clean(Rest, [Acc, Clean], Config).
%% again, unrolling ascii makes a huge difference. sadly
count(<<0, _/binary>>, N, _) -> N;
count(<<1, _/binary>>, N, _) -> N;
count(<<2, _/binary>>, N, _) -> N;
@ -451,17 +528,16 @@ count(<<126, Rest/binary>>, N, Config) ->
count(<<127, Rest/binary>>, N, Config) ->
count(Rest, N + 1, Config);
count(<<_/utf8, _/binary>>, N, #config{uescape=true}) -> N;
%% u+2028
count(<<226, 128, 168, _/binary>>, N, _) -> N;
%% u+2029
count(<<226, 128, 169, _/binary>>, N, _) -> N;
count(<<X/utf8, Rest/binary>>, N, Config) ->
case X of
X when X < 16#800 -> count(Rest, N + 2, Config);
16#2028 -> N;
16#2029 -> N;
X when X < 16#10000 -> count(Rest, N + 3, Config);
_ -> count(Rest, N + 4, Config)
end;
count(<<_, _/binary>>, N, _) -> N.
count(<<_, _/binary>>, N, _) -> N;
count(<<>>, N, _) -> N.
strip_continuations(Bin, 0) -> Bin;
@ -524,6 +600,7 @@ to_hex(X) -> X + 48. %% ascii "1" is [49], "2" is [50], etc...
init([]) -> [].
-spec handle_event(Event::any(), Acc::list()) -> list().
handle_event(end_json, State) -> lists:reverse(State);
@ -631,6 +708,7 @@ codepoints() ->
++ lists:seq(16#e000, 16#ffff)
).
extended_codepoints() ->
unicode:characters_to_binary(
lists:seq(16#10000, 16#1ffff) ++ [
@ -640,13 +718,16 @@ extended_codepoints() ->
]
).
%% one to_fake_utf8/1 result per integer from 16#d800 through 16#dfff, in
%% ascending order
surrogates() ->
    lists:map(fun to_fake_utf8/1, lists:seq(16#d800, 16#dfff)).
%% test helper: runs clean_string/2 with `strict_utf8=true` and returns its
%% result unchanged, except that a `badarg` error raised by the call is
%% returned as `{error, badarg}` instead of propagating
clean_string_helper(String) ->
    try
        clean_string(String, #config{strict_utf8=true})
    catch
        error:badarg -> {error, badarg}
    end.
clean_string_test_() ->
[
{"clean codepoints", ?_assertEqual(
@ -1069,4 +1150,5 @@ datetime_test_() ->
)}
].
-endif.