rework decoder and encoder slightly to be a little more sane

This commit is contained in:
alisdair sullivan 2014-12-11 06:57:49 +00:00
parent 3625aaeb66
commit 8592742ded
2 changed files with 118 additions and 34 deletions

View file

@ -338,11 +338,8 @@ key(Bin, Handler, Stack, Config) ->
?error(key, Bin, Handler, Stack, Config).
%% explicitly whitelist ascii set for faster parsing. really? really. someone should
%% submit a patch that unrolls simple guards
%% note that if you encounter an error from string and you can't find the clause that
%% caused it here, it might be in unescape below
string(Bin, Handler, Stack, Config) ->
string(Bin, Handler, [], Stack, Config).
@ -355,9 +352,10 @@ string(<<?solidus, Rest/binary>>, Handler, Acc, Stack, Config) ->
string(Rest, Handler, [Acc, maybe_replace(?solidus, Config)], Stack, Config);
string(<<?rsolidus/utf8, Rest/binary>>, Handler, Acc, Stack, Config) ->
unescape(Rest, Handler, Acc, Stack, Config);
string(<<X/utf8, Rest/binary>>, Handler, Acc, Stack, Config=#config{uescape=true}) ->
%% TODO this is pretty gross and i don't like it
string(<<X/utf8, Rest/binary>> = Bin, Handler, Acc, Stack, Config=#config{uescape=true}) ->
case X of
X when X < 16#80 -> string(Rest, Handler, [Acc, X], Stack, Config);
X when X < 16#80 -> count(Bin, Handler, Acc, Stack, Config);
X -> string(Rest, Handler, [Acc, json_escape_sequence(X)], Stack, Config)
end;
%% u+2028
@ -367,14 +365,11 @@ string(<<226, 128, 168, Rest/binary>>, Handler, Acc, Stack, Config) ->
string(<<226, 128, 169, Rest/binary>>, Handler, Acc, Stack, Config) ->
string(Rest, Handler, [Acc, maybe_replace(16#2029, Config)], Stack, Config);
string(<<_/utf8, _/binary>> = Bin, Handler, Acc, Stack, Config) ->
Size = count(Bin, 0, Config),
<<Clean:Size/binary, Rest/binary>> = Bin,
string(Rest, Handler, [Acc, Clean], Stack, Config);
%% really, really dirty strings. if there's no valid utf8 we never reach `count`
%% and things get replaced instead of ignored
count(Bin, Handler, Acc, Stack, Config);
%% necessary for bytes that are badly formed utf8 that won't match in `count`
string(<<X, Rest/binary>>, Handler, Acc, Stack, Config=#config{dirty_strings=true}) ->
string(Rest, Handler, [Acc, X], Stack, Config);
%% u+fffe and u+ffff for R14BXX (subsequent runtimes will happily match preceeding
%% u+fffe and u+ffff for R14BXX (subsequent runtimes will happily match with /utf8
string(<<239, 191, 190, Rest/binary>>, Handler, Acc, Stack, Config) ->
string(Rest, Handler, [Acc, <<16#fffe/utf8>>], Stack, Config);
string(<<239, 191, 191, Rest/binary>>, Handler, Acc, Stack, Config) ->
@ -411,6 +406,14 @@ string(<<_, Rest/binary>>, Handler, Acc, Stack, Config=#config{strict_utf8=false
string(Bin, Handler, Acc, Stack, Config) -> ?error(string, Bin, Handler, Acc, Stack, Config).
count(Bin, Handler, Acc, Stack, Config) ->
Size = count(Bin, 0, Config),
<<Clean:Size/binary, Rest/binary>> = Bin,
string(Rest, Handler, [Acc, Clean], Stack, Config).
%% explicitly whitelist ascii set for faster parsing. really? really. someone should
%% submit a patch that unrolls simple guards
count(<<0, Rest/binary>>, N, Config) ->
count(Rest, N + 1, Config);
count(<<1, Rest/binary>>, N, Config) ->
@ -666,13 +669,12 @@ count(<<127, Rest/binary>>, N, Config) ->
count(<<_, Rest/binary>>, N, Config=#config{dirty_strings=true}) ->
count(Rest, N + 1, Config);
count(<<_/utf8, _/binary>>, N, #config{uescape=true}) -> N;
%% u+2028
count(<<226, 128, 168, _/binary>>, N, _) -> N;
%% u+2029
count(<<226, 128, 169, _/binary>>, N, _) -> N;
count(<<X/utf8, Rest/binary>>, N, Config) ->
case X of
X when X < 16#800 -> count(Rest, N + 2, Config);
%% jsonp escaping
16#2028 -> N;
16#2029 -> N;
X when X < 16#10000 -> count(Rest, N + 3, Config);
_ -> count(Rest, N + 4, Config)
end;
@ -1007,7 +1009,6 @@ finish_number(Rest, Handler, Acc, Stack, Config) ->
format_number({integer, Acc}) -> {integer, binary_to_integer(Acc)};
format_number({float, Acc}) -> {float, binary_to_float(Acc)}.
-endif.
-ifdef(no_binary_to_whatever).
format_number({integer, Acc}) -> {integer, list_to_integer(unicode:characters_to_list(Acc))};
format_number({float, Acc}) -> {float, list_to_float(unicode:characters_to_list(Acc))}.
@ -1471,6 +1472,7 @@ codepoints() ->
[16#60000, 16#70000, 16#80000, 16#90000, 16#a0000, 16#b0000] ++
[16#c0000, 16#d0000, 16#e0000, 16#f0000, 16#100000].
surrogates() -> lists:seq(16#d800, 16#dfff).