Merge branch 'develop'

This commit is contained in:
alisdair sullivan 2012-03-29 21:48:57 -07:00
commit c80c1f7d40
7 changed files with 842 additions and 155 deletions

View file

@ -127,6 +127,10 @@ javascript interpreters treat the codepoints `u+2028` and `u+2029` as significan
json has no official comments but some parsers allow c style comments. this flag allows comments (both `// ...` and `/* ... */` style) anywhere whitespace is allowed
#### `json_escape` ####
by default, both the encoder and decoder return strings as utf8 binaries appropriate for use in erlang. escape sequences present in decoded terms are converted into the appropriate codepoints, and encoded terms are left unaltered. this flag instead escapes strings for output in json, replacing control codes with the appropriate escape sequences
### <a name="incompletes">incomplete input</a> ###

View file

@ -1,7 +1,7 @@
{application, jsx,
[
{description, "a streaming, evented json parsing toolkit"},
{vsn, "1.1.1"},
{vsn, "1.1.2"},
{modules, [
jsx,
jsx_encoder,

View file

@ -270,11 +270,11 @@ string(<<33, Rest/binary>>, Handler, [Acc|Stack], Opts) ->
string(<<?doublequote, Rest/binary>>, {Handler, State}, S, Opts) ->
case S of
[Acc, key|Stack] ->
colon(Rest, {Handler, Handler:handle_event({key, ?end_seq(Acc)}, State)}, [key|Stack], Opts);
colon(Rest, {Handler, Handler:handle_event({key, maybe_escape(?end_seq(Acc), Opts)}, State)}, [key|Stack], Opts);
[_Acc, single_quote|_Stack] ->
?error([<<?doublequote, Rest/binary>>, {Handler, State}, S, Opts]);
[Acc|Stack] ->
maybe_done(Rest, {Handler, Handler:handle_event({string, ?end_seq(Acc)}, State)}, Stack, Opts)
maybe_done(Rest, {Handler, Handler:handle_event({string, maybe_escape(?end_seq(Acc), Opts)}, State)}, Stack, Opts)
end;
string(<<35, Rest/binary>>, Handler, [Acc|Stack], Opts) ->
string(Rest, Handler, [?acc_seq(Acc, 35)|Stack], Opts);
@ -284,13 +284,18 @@ string(<<37, Rest/binary>>, Handler, [Acc|Stack], Opts) ->
string(Rest, Handler, [?acc_seq(Acc, 37)|Stack], Opts);
string(<<38, Rest/binary>>, Handler, [Acc|Stack], Opts) ->
string(Rest, Handler, [?acc_seq(Acc, 38)|Stack], Opts);
string(<<?singlequote, Rest/binary>>, {Handler, State}, S, Opts = #opts{single_quotes=true}) ->
case S of
[Acc, single_quote, key|Stack] ->
colon(Rest, {Handler, Handler:handle_event({key, ?end_seq(Acc)}, State)}, [key|Stack], Opts);
[Acc, single_quote|Stack] ->
maybe_done(Rest, {Handler, Handler:handle_event({string, ?end_seq(Acc)}, State)}, Stack, Opts);
[Acc|Stack] ->
string(<<?singlequote, Rest/binary>>, {Handler, State}, [Acc|Stack], Opts) ->
case Opts#opts.single_quotes of
true ->
case Stack of
[single_quote, key|S] ->
colon(Rest, {Handler, Handler:handle_event({key, maybe_escape(?end_seq(Acc), Opts)}, State)}, [key|S], Opts)
; [single_quote|S] ->
maybe_done(Rest, {Handler, Handler:handle_event({string, maybe_escape(?end_seq(Acc), Opts)}, State)}, S, Opts)
; _ ->
string(Rest, {Handler, State}, [?acc_seq(Acc, ?singlequote)|Stack], Opts)
end
; false ->
string(Rest, {Handler, State}, [?acc_seq(Acc, ?singlequote)|Stack], Opts)
end;
string(<<40, Rest/binary>>, Handler, [Acc|Stack], Opts) ->
@ -469,8 +474,53 @@ string(<<126, Rest/binary>>, Handler, [Acc|Stack], Opts) ->
string(Rest, Handler, [?acc_seq(Acc, 126)|Stack], Opts);
string(<<127, Rest/binary>>, Handler, [Acc|Stack], Opts) ->
string(Rest, Handler, [?acc_seq(Acc, 127)|Stack], Opts);
string(<<S/utf8, Rest/binary>>, Handler, [Acc|Stack], Opts) when ?is_noncontrol(S) ->
string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts);
string(<<S/utf8, Rest/binary>>, Handler, [Acc|Stack], Opts) ->
case S of
%% not strictly true, but exceptions are already taken care of in preceding clauses
S when S >= 16#20, S < 16#d800 ->
string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
; S when S > 16#dfff, S < 16#fdd0 ->
string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
; S when S > 16#fdef, S < 16#fffe ->
string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
; S when S >= 16#10000, S < 16#1fffe ->
string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
; S when S >= 16#20000, S < 16#2fffe ->
string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
; S when S >= 16#30000, S < 16#3fffe ->
string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
; S when S >= 16#40000, S < 16#4fffe ->
string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
; S when S >= 16#50000, S < 16#5fffe ->
string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
; S when S >= 16#60000, S < 16#6fffe ->
string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
; S when S >= 16#70000, S < 16#7fffe ->
string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
; S when S >= 16#80000, S < 16#8fffe ->
string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
; S when S >= 16#90000, S < 16#9fffe ->
string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
; S when S >= 16#a0000, S < 16#afffe ->
string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
; S when S >= 16#b0000, S < 16#bfffe ->
string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
; S when S >= 16#c0000, S < 16#cfffe ->
string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
; S when S >= 16#d0000, S < 16#dfffe ->
string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
; S when S >= 16#e0000, S < 16#efffe ->
string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
; S when S >= 16#f0000, S < 16#ffffe ->
string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
; S when S >= 16#100000, S < 16#10fffe ->
string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
; _ ->
case Opts#opts.loose_unicode of
true -> noncharacter(<<S/utf8, Rest/binary>>, Handler, [Acc|Stack], Opts)
; false -> ?error([<<S/utf8, Rest/binary>>, Handler, [Acc|Stack], Opts])
end
end;
string(Bin, Handler, Stack, Opts) ->
case partial_utf(Bin) of
true -> ?incomplete(string, Bin, Handler, Stack, Opts)
@ -480,6 +530,11 @@ string(Bin, Handler, Stack, Opts) ->
; false -> ?error([Bin, Handler, Stack, Opts])
end
end.
%% applied to every completed key or string before it is handed to the
%% handler: when the `json_escape` option is set the text is passed through
%% `jsx_utils:json_escape/2`, otherwise it is returned untouched
maybe_escape(String, #opts{json_escape=true} = Opts) ->
    jsx_utils:json_escape(String, Opts);
maybe_escape(String, _) ->
    String.
%% we don't need to guard against partial utf here, because it's already taken
%% care of in string
@ -489,8 +544,36 @@ noncharacter(<<237, X, _, Rest/binary>>, Handler, [Acc|Stack], Opts) when X >= 1
%% u+fffe and u+ffff for R14BXX
noncharacter(<<239, 191, X, Rest/binary>>, Handler, [Acc|Stack], Opts) when X == 190; X == 191 ->
string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts);
%% bad utf8
%% u+xfffe, u+xffff and other noncharacters
noncharacter(<<_/utf8, Rest/binary>>, Handler, [Acc|Stack], Opts) ->
string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts);
%% overlong encodings and missing continuations of a 2 byte sequence
noncharacter(<<X, Rest/binary>>, Handler, Stack, Opts) when X >= 192, X =< 223 ->
strip_continuations(Rest, Handler, [1|Stack], Opts);
%% overlong encodings and missing continuations of a 3 byte sequence
noncharacter(<<X, Rest/binary>>, Handler, Stack, Opts) when X >= 224, X =< 239 ->
strip_continuations(Rest, Handler, [2|Stack], Opts);
%% overlong encodings and missing continuations of a 4 byte sequence
noncharacter(<<X, Rest/binary>>, Handler, Stack, Opts) when X >= 240, X =< 247 ->
strip_continuations(Rest, Handler, [3|Stack], Opts);
%% unexpected bytes, including orphan continuations
noncharacter(<<_, Rest/binary>>, Handler, [Acc|Stack], Opts) ->
string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts);
noncharacter(<<>>, Handler, Stack, Opts) ->
?incomplete(noncharacter, <<>>, Handler, Stack, Opts).
%% strips continuation bytes after bad utf bytes, guards against both too short
%% and overlong sequences. N is the maximum number of bytes to strip
%%
%% the counter N travels on top of the stack (noncharacter/4 pushes 1, 2 or 3
%% depending on the start byte), directly above the string accumulator. however
%% the run of continuation bytes ends, the whole bad sequence is replaced by a
%% single u+fffd and control returns to string/4
%% budget exhausted: emit the replacement character and resume normal parsing
strip_continuations(Rest, Handler, [0, Acc|Stack], Opts) ->
string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts);
%% a continuation byte (2#10xxxxxx): drop it and decrement the budget
strip_continuations(<<X, Rest/binary>>, Handler, [N|Stack], Opts) when X >= 128, X =< 191 ->
strip_continuations(Rest, Handler, [N - 1|Stack], Opts);
%% incomplete
%% input ran out mid sequence; the counter stays on the stack for the resumed call
strip_continuations(<<>>, Handler, Stack, Opts) ->
?incomplete(strip_continuations, <<>>, Handler, Stack, Opts);
%% not a continuation byte, dispatch back to string
%% the unused counter is discarded and the byte itself is left for string/4
strip_continuations(Rest, Handler, [_, Acc|Stack], Opts) ->
string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts).
@ -516,6 +599,8 @@ escape(<<$u, Rest/binary>>, Handler, Stack, Opts) ->
escaped_unicode(Rest, Handler, Stack, Opts);
escape(<<>>, Handler, Stack, Opts) ->
?incomplete(escape, <<>>, Handler, Stack, Opts);
escape(Bin, Handler, [Acc|Stack], Opts=#opts{ignore_bad_escapes=true}) ->
string(Bin, Handler, [?acc_seq(Acc, ?rsolidus)|Stack], Opts);
escape(Bin, Handler, Stack, Opts) ->
?error([Bin, Handler, Stack, Opts]).
@ -963,6 +1048,216 @@ done(Bin, Handler, Stack, Opts) -> ?error([Bin, Handler, Stack, Opts]).
-include_lib("eunit/include/eunit.hrl").
%% test helper: wraps the raw bytes in double quotes so they form a json
%% string, decodes them with `jsx:to_term/2` and turns a `badarg` error into
%% `{error, badarg}` so tests can match on it
xcode(Raw) -> xcode(Raw, []).

xcode(Raw, Opts) ->
    Size = size(Raw),
    try jsx:to_term(<<34, Raw:Size/binary, 34>>, Opts) of
        Term -> Term
    catch
        error:badarg -> {error, badarg}
    end.
%% true only for the `{error, badarg}` marker produced by xcode/2
is_bad(Result) -> Result =:= {error, badarg}.
%% table driven checks for malformed utf8 inside a json string. each row is
%% {StrictName, LooseName, Input, Replaced}:
%%   * in strict mode `xcode(Input)` must report `{error, badarg}`
%%   * with `loose_unicode`, `xcode/2` must return Replaced, i.e. Input with
%%     every bad sequence collapsed to u+fffd
%% the expected value is passed first to ?_assertEqual so that eunit failure
%% reports label expected and actual correctly
bad_utf8_test_() ->
    Fffd = <<16#fffd/utf8>>,
    Continuations = lists:seq(16#0080, 16#00bf),
    Cases = [
        {"orphan continuation byte u+0080", "orphan continuation byte u+0080 replaced",
            <<16#0080>>, Fffd},
        {"orphan continuation byte u+00bf", "orphan continuation byte u+00bf replaced",
            <<16#00bf>>, Fffd},
        {"2 continuation bytes", "2 continuation bytes replaced",
            binary:copy(<<16#0080>>, 2), binary:copy(Fffd, 2)},
        {"3 continuation bytes", "3 continuation bytes replaced",
            binary:copy(<<16#0080>>, 3), binary:copy(Fffd, 3)},
        {"4 continuation bytes", "4 continuation bytes replaced",
            binary:copy(<<16#0080>>, 4), binary:copy(Fffd, 4)},
        {"5 continuation bytes", "5 continuation bytes replaced",
            binary:copy(<<16#0080>>, 5), binary:copy(Fffd, 5)},
        {"6 continuation bytes", "6 continuation bytes replaced",
            binary:copy(<<16#0080>>, 6), binary:copy(Fffd, 6)},
        {"all continuation bytes", "all continuation bytes replaced",
            list_to_binary(Continuations), binary:copy(Fffd, length(Continuations))},
        {"lonely start byte", "lonely start byte replaced",
            <<16#00c0>>, Fffd},
        {"lonely start bytes (2 byte)", "lonely start bytes (2 byte) replaced",
            <<16#00c0, 32, 16#00df>>, <<Fffd/binary, 32, Fffd/binary>>},
        {"lonely start bytes (3 byte)", "lonely start bytes (3 byte) replaced",
            <<16#00e0, 32, 16#00ef>>, <<Fffd/binary, 32, Fffd/binary>>},
        {"lonely start bytes (4 byte)", "lonely start bytes (4 byte) replaced",
            <<16#00f0, 32, 16#00f7>>, <<Fffd/binary, 32, Fffd/binary>>},
        {"missing continuation byte (3 byte)", "missing continuation byte (3 byte) replaced",
            <<224, 160, 32>>, <<Fffd/binary, 32>>},
        {"missing continuation byte (4 byte missing one)",
            "missing continuation byte2 (4 byte missing one) replaced",
            <<240, 144, 128, 32>>, <<Fffd/binary, 32>>},
        {"missing continuation byte (4 byte missing two)",
            "missing continuation byte2 (4 byte missing two) replaced",
            <<240, 144, 32>>, <<Fffd/binary, 32>>},
        {"overlong encoding of u+002f (2 byte)", "overlong encoding of u+002f (2 byte) replaced",
            <<16#c0, 16#af, 32>>, <<Fffd/binary, 32>>},
        {"overlong encoding of u+002f (3 byte)", "overlong encoding of u+002f (3 byte) replaced",
            <<16#e0, 16#80, 16#af, 32>>, <<Fffd/binary, 32>>},
        {"overlong encoding of u+002f (4 byte)", "overlong encoding of u+002f (4 byte) replaced",
            <<16#f0, 16#80, 16#80, 16#af, 32>>, <<Fffd/binary, 32>>},
        {"highest overlong 2 byte sequence", "highest overlong 2 byte sequence replaced",
            <<16#c1, 16#bf, 32>>, <<Fffd/binary, 32>>},
        {"highest overlong 3 byte sequence", "highest overlong 3 byte sequence replaced",
            <<16#e0, 16#9f, 16#bf, 32>>, <<Fffd/binary, 32>>},
        {"highest overlong 4 byte sequence", "highest overlong 4 byte sequence replaced",
            <<16#f0, 16#8f, 16#bf, 16#bf, 32>>, <<Fffd/binary, 32>>}
    ],
    %% keep the original ordering: strict test, then its loose counterpart
    lists:append([
        [
            {StrictName, ?_assert(is_bad(xcode(Input)))},
            {LooseName, ?_assertEqual(Replaced, xcode(Input, [loose_unicode]))}
        ]
        || {StrictName, LooseName, Input, Replaced} <- Cases
    ]).
%% with `ignore_bad_escapes` an unrecognized escape such as `\x` is kept
%% verbatim (reverse solidus included) instead of raising an error.
%% expected value first, per the ?_assertEqual(Expect, Expr) convention
ignore_bad_escapes_test_() ->
    [
        {"ignore unrecognized escape sequence", ?_assertEqual(
            [start_array, {string, <<"\\x25">>}, end_array, end_json],
            decode(<<"[\"\\x25\"]">>, [ignore_bad_escapes])
        )}
    ].
comments_test_() ->
[
{"preceeding // comment", ?_assertEqual(
@ -1147,6 +1442,7 @@ comments_test_() ->
)}
].
escape_forward_slash_test_() ->
[
{"escape forward slash test", ?_assertEqual(
@ -1155,6 +1451,29 @@ escape_forward_slash_test_() ->
)}
].
%% u+fffe and u+ffff: the strict and the loose_unicode check must both come
%% back with an empty list. expected value first, per ?_assertEqual(Expect, Expr)
noncharacters_test_() ->
    [
        {"noncharacters - badjson",
            ?_assertEqual([], check_bad(noncharacters()))
        },
        {"noncharacters - replaced",
            ?_assertEqual([], check_replaced(noncharacters()))
        }
    ].
%% the u+xfffe / u+xffff codepoints of the supplementary planes: the strict
%% and the loose_unicode check must both come back with an empty list.
%% expected value first, per ?_assertEqual(Expect, Expr)
extended_noncharacters_test_() ->
    [
        {"extended noncharacters - badjson",
            ?_assertEqual([], check_bad(extended_noncharacters()))
        },
        {"extended noncharacters - replaced",
            ?_assertEqual([], check_replaced(extended_noncharacters()))
        }
    ].
surrogates_test_() ->
[
{"surrogates - badjson",
@ -1165,12 +1484,25 @@ surrogates_test_() ->
}
].
%% codepoints 1..31 (see control_characters/0): the strict check must come
%% back with an empty list. expected value first, per ?_assertEqual(Expect, Expr)
control_test_() ->
    [
        {"control characters - badjson",
            ?_assertEqual([], check_bad(control_characters()))
        }
    ].
%% the reserved block u+fdd0..u+fdef (see reserved_space/0): the strict and
%% the loose_unicode check must both come back with an empty list.
%% expected value first, per ?_assertEqual(Expect, Expr)
reserved_test_() ->
    [
        {"reserved noncharacters - badjson",
            ?_assertEqual([], check_bad(reserved_space()))
        },
        {"reserved noncharacters - replaced",
            ?_assertEqual([], check_replaced(reserved_space()))
        }
    ].
good_characters_test_() ->
[
@ -1181,51 +1513,6 @@ good_characters_test_() ->
?_assertEqual(check_good(good_extended()), [])
}
].
malformed_test_() ->
[
{"malformed codepoint with 1 byte",
?_assertEqual({error, badjson}, decode(<<128>>))
},
{"malformed codepoint with 2 bytes",
?_assertEqual({error, badjson}, decode(<<128, 192>>))
},
{"malformed codepoint with 3 bytes",
?_assertEqual({error, badjson}, decode(<<128, 192, 192>>))
},
{"malformed codepoint with 4 bytes",
?_assertEqual({error, badjson}, decode(<<128, 192, 192, 192>>))
}
].
malformed_replaced_test_() ->
F = <<16#fffd/utf8>>,
[
{"malformed codepoint with 1 byte",
?_assertEqual(
[{string, <<F/binary>>}, end_json],
decode(<<34, 128, 34>>, [loose_unicode])
)
},
{"malformed codepoint with 2 bytes",
?_assertEqual(
[{string, <<F/binary, F/binary>>}, end_json],
decode(<<34, 128, 192, 34>>, [loose_unicode])
)
},
{"malformed codepoint with 3 bytes",
?_assertEqual(
[{string, <<F/binary, F/binary, F/binary>>}, end_json],
decode(<<34, 128, 192, 192, 34>>, [loose_unicode])
)
},
{"malformed codepoint with 4 bytes",
?_assertEqual(
[{string, <<F/binary, F/binary, F/binary, F/binary>>}, end_json],
decode(<<34, 128, 192, 192, 192, 34>>, [loose_unicode])
)
}
].
check_bad(List) ->
@ -1233,6 +1520,7 @@ check_bad(List) ->
check(List, [], [])
).
check_replaced(List) ->
lists:dropwhile(fun({_, [{string, <<16#fffd/utf8>>}|_]}) -> true
; (_) -> false
@ -1240,19 +1528,19 @@ check_replaced(List) ->
check(List, [loose_unicode], [])
).
%% decodes every codepoint in List with default options and drops the leading
%% results that start with a `{string, _}` event. the remainder begins at the
%% first codepoint that did not decode to a string, so `[]` means all were
%% accepted
check_good(List) ->
    DecodedToString =
        fun({_, [{string, _}|_]}) -> true;
           (_) -> false
        end,
    lists:dropwhile(DecodedToString, check(List, [], [])).
%% decodes the fake utf8 encoding of each codepoint with Opts and prepends
%% `{Codepoint, Result}` to Acc, so results come back in reverse input order
check(Codepoints, Opts, Acc) ->
    lists:foldl(
        fun(Codepoint, Results) ->
            [{Codepoint, decode(to_fake_utf(Codepoint, utf8), Opts)} | Results]
        end,
        Acc,
        Codepoints
    ).
decode(JSON) -> decode(JSON, []).
decode(JSON, Opts) ->
try
(decoder(jsx, [], Opts))(JSON)
@ -1261,13 +1549,41 @@ decode(JSON, Opts) ->
end.
noncharacters() -> lists:seq(16#fffe, 16#ffff).
%% the last two codepoints (xfffe and xffff) of each supplementary plane,
%% planes 1 through 16, in ascending order
extended_noncharacters() ->
    [Plane * 16#10000 + Low || Plane <- lists:seq(1, 16), Low <- [16#fffe, 16#ffff]].
surrogates() -> lists:seq(16#d800, 16#dfff).
control_characters() -> lists:seq(1, 31).
good() -> [32, 33] ++ lists:seq(16#23, 16#5b) ++ lists:seq(16#5d, 16#d7ff) ++ lists:seq(16#e000, 16#fffd).
good_extended() -> lists:seq(16#100000, 16#10ffff).
reserved_space() -> lists:seq(16#fdd0, 16#fdef).
good() -> [32, 33]
++ lists:seq(16#23, 16#5b)
++ lists:seq(16#5d, 16#d7ff)
++ lists:seq(16#e000, 16#fdcf)
++ lists:seq(16#fdf0, 16#fffd).
%% a sample of acceptable supplementary codepoints: the first codepoint of
%% planes 1 through 15, followed by all of plane 16 up to u+10fffd
good_extended() ->
    PlaneStarts = [Plane * 16#10000 || Plane <- lists:seq(1, 15)],
    PlaneStarts ++ lists:seq(16#100000, 16#10fffd).
%% erlang refuses to encode certain codepoints, so fake them all
to_fake_utf(N, utf8) when N < 16#0080 -> <<34/utf8, N:8, 34/utf8>>;

View file

@ -104,33 +104,305 @@ fix_key(Key) when is_binary(Key) -> Key.
clean_string(Bin, Opts) ->
case Opts#opts.json_escape of
true -> jsx_utils:json_escape(Bin, Opts);
false ->
case Opts#opts.loose_unicode of
true -> jsx_utils:json_escape(clean_string(Bin, 0, size(Bin), Opts), Opts)
; false ->
case is_clean(Bin) of
true -> Bin;
false -> clean_string(Bin, [], Opts)
true -> jsx_utils:json_escape(Bin, Opts)
; false -> erlang:error(badarg, [Bin, Opts])
end
end.
is_clean(<<>>) -> true;
is_clean(<<_/utf8, Rest/binary>>) -> is_clean(Rest);
is_clean(_) -> false.
is_clean(<<X/utf8, Rest/binary>>) when X < 16#80 -> is_clean(Rest);
is_clean(<<X/utf8, Rest/binary>>) when X < 16#800 -> is_clean(Rest);
is_clean(<<X/utf8, Rest/binary>>) when X < 16#dcff -> is_clean(Rest);
is_clean(<<X/utf8, Rest/binary>>) when X > 16#dfff, X < 16#fdd0 -> is_clean(Rest);
is_clean(<<X/utf8, Rest/binary>>) when X > 16#fdef, X < 16#fffe -> is_clean(Rest);
is_clean(<<X/utf8, Rest/binary>>) when X >= 16#10000, X < 16#1fffe -> is_clean(Rest);
is_clean(<<X/utf8, Rest/binary>>) when X >= 16#20000, X < 16#2fffe -> is_clean(Rest);
is_clean(<<X/utf8, Rest/binary>>) when X >= 16#30000, X < 16#3fffe -> is_clean(Rest);
is_clean(<<X/utf8, Rest/binary>>) when X >= 16#40000, X < 16#4fffe -> is_clean(Rest);
is_clean(<<X/utf8, Rest/binary>>) when X >= 16#50000, X < 16#5fffe -> is_clean(Rest);
is_clean(<<X/utf8, Rest/binary>>) when X >= 16#60000, X < 16#6fffe -> is_clean(Rest);
is_clean(<<X/utf8, Rest/binary>>) when X >= 16#70000, X < 16#7fffe -> is_clean(Rest);
is_clean(<<X/utf8, Rest/binary>>) when X >= 16#80000, X < 16#8fffe -> is_clean(Rest);
is_clean(<<X/utf8, Rest/binary>>) when X >= 16#90000, X < 16#9fffe -> is_clean(Rest);
is_clean(<<X/utf8, Rest/binary>>) when X >= 16#a0000, X < 16#afffe -> is_clean(Rest);
is_clean(<<X/utf8, Rest/binary>>) when X >= 16#b0000, X < 16#bfffe -> is_clean(Rest);
is_clean(<<X/utf8, Rest/binary>>) when X >= 16#c0000, X < 16#cfffe -> is_clean(Rest);
is_clean(<<X/utf8, Rest/binary>>) when X >= 16#d0000, X < 16#dfffe -> is_clean(Rest);
is_clean(<<X/utf8, Rest/binary>>) when X >= 16#e0000, X < 16#efffe -> is_clean(Rest);
is_clean(<<X/utf8, Rest/binary>>) when X >= 16#f0000, X < 16#ffffe -> is_clean(Rest);
is_clean(<<X/utf8, Rest/binary>>) when X >= 16#100000, X < 16#10fffe -> is_clean(Rest);
is_clean(Bin) -> erlang:error(badarg, [Bin]).
clean_string(Bin, _Acc, Opts=#opts{loose_unicode=false}) -> ?error([Bin, Opts]);
clean_string(<<>>, Acc, _Opts) -> unicode:characters_to_binary(lists:reverse(Acc));
clean_string(<<X/utf8, Rest/binary>>, Acc, Opts) -> clean_string(Rest, [X] ++ Acc, Opts);
%% surrogates
clean_string(<<237, X, _, Rest/binary>>, Acc, Opts) when X >= 160 -> clean_string(Rest, [16#fffd] ++ Acc, Opts);
%% bad codepoints
clean_string(<<_, Rest/binary>>, Acc, Opts) -> clean_string(Rest, [16#fffd] ++ Acc, Opts).
%% clean_string/4 scans Str from byte offset L and replaces every malformed
%% sequence, surrogate and noncharacter with u+fffd
%%   Str  - the binary being cleaned; rebuilt each time a replacement is made
%%   L    - byte offset of the next sequence to inspect
%%   Len  - byte size of Str; adjusted whenever a replacement changes the size
%%   Opts - passed along unchanged
%% returns the cleaned binary once L reaches Len
clean_string(Str, Len, Len, _Opts) -> Str;
clean_string(Str, L, Len, Opts) ->
    case Str of
        %% acceptable codepoints: step over the width of their utf8 encoding
        <<_:L/binary, X/utf8, _/binary>> when X < 16#80 ->
            clean_string(Str, L + 1, Len, Opts);
        <<_:L/binary, X/utf8, _/binary>> when X < 16#800 ->
            clean_string(Str, L + 2, Len, Opts);
        %% the utf8 segment type never yields a surrogate, so the three byte
        %% range is split only around the noncharacters
        <<_:L/binary, X/utf8, _/binary>> when X < 16#d800 ->
            clean_string(Str, L + 3, Len, Opts);
        <<_:L/binary, X/utf8, _/binary>> when X > 16#dfff, X < 16#fdd0 ->
            clean_string(Str, L + 3, Len, Opts);
        <<_:L/binary, X/utf8, _/binary>> when X > 16#fdef, X < 16#fffe ->
            clean_string(Str, L + 3, Len, Opts);
        %% supplementary planes: everything except the xfffe/xffff pair that
        %% ends each plane
        <<_:L/binary, X/utf8, _/binary>> when X >= 16#10000, (X band 16#ffff) < 16#fffe ->
            clean_string(Str, L + 4, Len, Opts);
        %% noncharacters in the bmp are three bytes wide, the same as u+fffd,
        %% so the size is unchanged
        <<H:L/binary, X/utf8, T/binary>> when X < 16#10000 ->
            clean_string(<<H:L/binary, 16#fffd/utf8, T/binary>>, L + 3, Len, Opts);
        %% supplementary noncharacters are four bytes wide: the replacement is
        %% one byte shorter, so step over three bytes and shrink Len by one
        <<H:L/binary, _/utf8, T/binary>> ->
            clean_string(<<H:L/binary, 16#fffd/utf8, T/binary>>, L + 3, Len - 1, Opts);
        %% surrogates
        <<H:L/binary, 237, X, _, T/binary>> when X >= 160 ->
            clean_string(<<H:L/binary, 16#fffd/utf8, T/binary>>, L + 3, Len, Opts);
        %% u+fffe and u+ffff for R14BXX
        <<H:L/binary, 239, 191, X, T/binary>> when X == 190; X == 191 ->
            clean_string(<<H:L/binary, 16#fffd/utf8, T/binary>>, L + 3, Len, Opts);
        %% overlong encodings and missing continuations: the start byte plus
        %% Stripped continuation bytes become the three bytes of u+fffd, hence
        %% the size changes by 2 - Stripped
        %% 2 byte sequence
        <<H:L/binary, X, T/binary>> when X >= 192, X =< 223 ->
            {Tail, Stripped} = strip_continuations(T, 1, 0),
            clean_string(<<H:L/binary, 16#fffd/utf8, Tail/binary>>, L + 3, Len + 2 - Stripped, Opts);
        %% 3 byte sequence
        <<H:L/binary, X, T/binary>> when X >= 224, X =< 239 ->
            {Tail, Stripped} = strip_continuations(T, 2, 0),
            clean_string(<<H:L/binary, 16#fffd/utf8, Tail/binary>>, L + 3, Len + 2 - Stripped, Opts);
        %% 4 byte sequence
        <<H:L/binary, X, T/binary>> when X >= 240, X =< 247 ->
            {Tail, Stripped} = strip_continuations(T, 3, 0),
            clean_string(<<H:L/binary, 16#fffd/utf8, Tail/binary>>, L + 3, Len + 2 - Stripped, Opts);
        %% any other single byte, including orphan continuation bytes
        <<H:L/binary, _, T/binary>> ->
            clean_string(<<H:L/binary, 16#fffd/utf8, T/binary>>, L + 3, Len + 2, Opts)
    end.


%% drops at most N leading continuation bytes (128..191) from Bin. returns
%% `{Remaining, Count}` where Count is the number of bytes actually dropped
strip_continuations(Bin, 0, N) -> {Bin, N};
strip_continuations(<<X, Rest/binary>>, N, M) when X >= 128, X =< 191 ->
    strip_continuations(Rest, N - 1, M + 1);
%% not a continuation byte
strip_continuations(Bin, _, N) -> {Bin, N}.
-ifdef(TEST).
-include_lib("eunit/include/eunit.hrl").
%% test helper: runs clean_string/2 on the raw bytes and turns a `badarg`
%% error into `{error, badarg}`. the option list `[loose_unicode]` is
%% shorthand for an `#opts{}` record with `loose_unicode` set
xcode(Raw) -> xcode(Raw, #opts{}).

xcode(Raw, [loose_unicode]) ->
    xcode(Raw, #opts{loose_unicode=true});
xcode(Raw, Opts) ->
    try clean_string(Raw, Opts) of
        Cleaned -> Cleaned
    catch
        error:badarg -> {error, badarg}
    end.
%% true only for the `{error, badarg}` marker produced by xcode/2
is_bad(Result) -> Result =:= {error, badarg}.
%% table driven checks for malformed utf8 handed to the encoder's string
%% cleaner. each row is {StrictName, LooseName, Input, Replaced}:
%%   * in strict mode `xcode(Input)` must report `{error, badarg}`
%%   * with `loose_unicode`, `xcode/2` must return Replaced, i.e. Input with
%%     every bad sequence collapsed to u+fffd
%% the expected value is passed first to ?_assertEqual so that eunit failure
%% reports label expected and actual correctly
bad_utf8_test_() ->
    Fffd = <<16#fffd/utf8>>,
    Continuations = lists:seq(16#0080, 16#00bf),
    Cases = [
        {"orphan continuation byte u+0080", "orphan continuation byte u+0080 replaced",
            <<16#0080>>, Fffd},
        {"orphan continuation byte u+00bf", "orphan continuation byte u+00bf replaced",
            <<16#00bf>>, Fffd},
        {"2 continuation bytes", "2 continuation bytes replaced",
            binary:copy(<<16#0080>>, 2), binary:copy(Fffd, 2)},
        {"3 continuation bytes", "3 continuation bytes replaced",
            binary:copy(<<16#0080>>, 3), binary:copy(Fffd, 3)},
        {"4 continuation bytes", "4 continuation bytes replaced",
            binary:copy(<<16#0080>>, 4), binary:copy(Fffd, 4)},
        {"5 continuation bytes", "5 continuation bytes replaced",
            binary:copy(<<16#0080>>, 5), binary:copy(Fffd, 5)},
        {"6 continuation bytes", "6 continuation bytes replaced",
            binary:copy(<<16#0080>>, 6), binary:copy(Fffd, 6)},
        {"all continuation bytes", "all continuation bytes replaced",
            list_to_binary(Continuations), binary:copy(Fffd, length(Continuations))},
        {"lonely start byte", "lonely start byte replaced",
            <<16#00c0>>, Fffd},
        {"lonely start bytes (2 byte)", "lonely start bytes (2 byte) replaced",
            <<16#00c0, 32, 16#00df>>, <<Fffd/binary, 32, Fffd/binary>>},
        {"lonely start bytes (3 byte)", "lonely start bytes (3 byte) replaced",
            <<16#00e0, 32, 16#00ef>>, <<Fffd/binary, 32, Fffd/binary>>},
        {"lonely start bytes (4 byte)", "lonely start bytes (4 byte) replaced",
            <<16#00f0, 32, 16#00f7>>, <<Fffd/binary, 32, Fffd/binary>>},
        {"missing continuation byte (3 byte)", "missing continuation byte (3 byte) replaced",
            <<224, 160, 32>>, <<Fffd/binary, 32>>},
        {"missing continuation byte (4 byte missing one)",
            "missing continuation byte2 (4 byte missing one) replaced",
            <<240, 144, 128, 32>>, <<Fffd/binary, 32>>},
        {"missing continuation byte (4 byte missing two)",
            "missing continuation byte2 (4 byte missing two) replaced",
            <<240, 144, 32>>, <<Fffd/binary, 32>>},
        {"overlong encoding of u+002f (2 byte)", "overlong encoding of u+002f (2 byte) replaced",
            <<16#c0, 16#af, 32>>, <<Fffd/binary, 32>>},
        {"overlong encoding of u+002f (3 byte)", "overlong encoding of u+002f (3 byte) replaced",
            <<16#e0, 16#80, 16#af, 32>>, <<Fffd/binary, 32>>},
        {"overlong encoding of u+002f (4 byte)", "overlong encoding of u+002f (4 byte) replaced",
            <<16#f0, 16#80, 16#80, 16#af, 32>>, <<Fffd/binary, 32>>},
        {"highest overlong 2 byte sequence", "highest overlong 2 byte sequence replaced",
            <<16#c1, 16#bf, 32>>, <<Fffd/binary, 32>>},
        {"highest overlong 3 byte sequence", "highest overlong 3 byte sequence replaced",
            <<16#e0, 16#9f, 16#bf, 32>>, <<Fffd/binary, 32>>},
        {"highest overlong 4 byte sequence", "highest overlong 4 byte sequence replaced",
            <<16#f0, 16#8f, 16#bf, 16#bf, 32>>, <<Fffd/binary, 32>>}
    ],
    %% keep the original ordering: strict test, then its loose counterpart
    lists:append([
        [
            {StrictName, ?_assert(is_bad(xcode(Input)))},
            {LooseName, ?_assertEqual(Replaced, xcode(Input, [loose_unicode]))}
        ]
        || {StrictName, LooseName, Input, Replaced} <- Cases
    ]).
encode(Term) -> (encoder(jsx, [], []))(Term).
encode(Term, Opts) ->
@ -210,6 +482,7 @@ encode_test_() ->
}
].
surrogates_test_() ->
[
{"surrogates - badjson",
@ -219,7 +492,8 @@ surrogates_test_() ->
?_assertEqual(check_replaced(surrogates()), [])
}
].
good_characters_test_() ->
[
{"acceptable codepoints",
@ -230,48 +504,46 @@ good_characters_test_() ->
}
].
malformed_test_() ->
[
{"malformed codepoint with 1 byte", ?_assertError(badarg, encode(<<128>>))},
{"malformed codepoint with 2 bytes", ?_assertError(badarg, encode(<<128, 192>>))},
{"malformed codepoint with 3 bytes", ?_assertError(badarg, encode(<<128, 192, 192>>))},
{"malformed codepoint with 4 bytes", ?_assertError(badarg, encode(<<128, 192, 192, 192>>))}
].
malformed_replaced_test_() ->
F = <<16#fffd/utf8>>,
reserved_test_() ->
[
{"malformed codepoint with 1 byte",
?_assertEqual(
[{string, <<F/binary>>}, end_json],
encode(<<128>>, [loose_unicode])
)
{"reserved noncharacters - badjson",
?_assertEqual(check_bad(reserved_space()), [])
},
{"malformed codepoint with 2 bytes",
?_assertEqual(
[{string, <<F/binary, F/binary>>}, end_json],
encode(<<128, 192>>, [loose_unicode])
)
},
{"malformed codepoint with 3 bytes",
?_assertEqual(
[{string, <<F/binary, F/binary, F/binary>>}, end_json],
encode(<<128, 192, 192>>, [loose_unicode])
)
},
{"malformed codepoint with 4 bytes",
?_assertEqual(
[{string, <<F/binary, F/binary, F/binary, F/binary>>}, end_json],
encode(<<128, 192, 192, 192>>, [loose_unicode])
)
{"reserved noncharacters - replaced",
?_assertEqual(check_replaced(reserved_space()), [])
}
].
%% u+fffe and u+ffff: the strict and the loose_unicode check must both come
%% back with an empty list. expected value first, per ?_assertEqual(Expect, Expr)
noncharacters_test_() ->
    [
        {"noncharacters - badjson",
            ?_assertEqual([], check_bad(noncharacters()))
        },
        {"noncharacters - replaced",
            ?_assertEqual([], check_replaced(noncharacters()))
        }
    ].
%% the u+xfffe / u+xffff codepoints of the supplementary planes: the strict
%% and the loose_unicode check must both come back with an empty list.
%% expected value first, per ?_assertEqual(Expect, Expr)
extended_noncharacters_test_() ->
    [
        {"extended noncharacters - badjson",
            ?_assertEqual([], check_bad(extended_noncharacters()))
        },
        {"extended noncharacters - replaced",
            ?_assertEqual([], check_replaced(extended_noncharacters()))
        }
    ].
%% encodes every codepoint in List with default options and drops the leading
%% results equal to `{error, badjson}`. the remainder begins at the first
%% codepoint that produced anything else, so `[]` means all were rejected
check_bad(List) ->
    IsBadJson =
        fun({_, {error, badjson}}) -> true;
           (_) -> false
        end,
    lists:dropwhile(IsBadJson, check(List, [], [])).
check_replaced(List) ->
lists:dropwhile(fun({_, [{string, <<16#fffd/utf8>>}|_]}) -> true
; (_) -> false
@ -279,22 +551,47 @@ check_replaced(List) ->
check(List, [loose_unicode], [])
).
%% runs every codepoint in List through check/3 with no options and strips the
%% leading results whose first event is a string. an empty return means every
%% codepoint encoded to a string event
check_good(List) ->
    Results = check(List, [], []),
    IsString = fun
        ({_, [{string, _}|_]}) -> true;
        (_) -> false
    end,
    lists:dropwhile(IsString, Results).
%% encodes each codepoint of the list (as a hand-built utf8 binary) with Opts
%% and pairs it with the result. pairs are pushed onto the front of Acc, so the
%% output is in reverse input order
check(Codepoints, Opts, Acc) ->
    Collect = fun(Codepoint, Results) ->
        [{Codepoint, encode(to_fake_utf(Codepoint, utf8), Opts)} | Results]
    end,
    lists:foldl(Collect, Acc, Codepoints).
%% the two codepoints u+fffe and u+ffff
noncharacters() ->
    [16#fffe, 16#ffff].
%% the last two codepoints (xfffe and xffff) of each plane from 1 through 16,
%% in ascending order
extended_noncharacters() ->
    [
        Plane * 16#10000 + Offset
     || Plane <- lists:seq(1, 16#10), Offset <- [16#fffe, 16#ffff]
    ].
%% the codepoints u+d800 through u+dfff
surrogates() -> lists:seq(16#d800, 16#dfff).
%% NOTE(review): good/0 and good_extended/0 are each defined twice in this
%% span. a module can hold only one definition per name/arity, so one of each
%% pair has to go. this first pair covers u+0001..u+d7ff, u+e000..u+fffd and
%% u+100000..u+10ffff
good() -> lists:seq(1, 16#d7ff) ++ lists:seq(16#e000, 16#fffd).
good_extended() -> lists:seq(16#100000, 16#10ffff).
%% the codepoints u+fdd0 through u+fdef
reserved_space() -> lists:seq(16#fdd0, 16#fdef).
%% this second pair starts at u+0000, leaves out u+fdd0..u+fdef, samples the
%% first codepoint of planes 1 through 15 and stops plane 16 at u+10fffd
good() -> lists:seq(16#0000, 16#d7ff) ++ lists:seq(16#e000, 16#fdcf) ++ lists:seq(16#fdf0, 16#fffd).
good_extended() -> [16#10000, 16#20000, 16#30000, 16#40000, 16#50000,
16#60000, 16#70000, 16#80000, 16#90000, 16#a0000,
16#b0000, 16#c0000, 16#d0000, 16#e0000, 16#f0000
] ++ lists:seq(16#100000, 16#10fffd).
%% erlang refuses to encode certain codepoints, so fake them all
to_fake_utf(N, utf8) when N < 16#0080 -> <<N:8>>;
@ -308,4 +605,5 @@ to_fake_utf(N, utf8) ->
<<0:3, W:3, Z:6, Y:6, X:6>> = <<N:24>>,
<<2#11110:5, W:3, 2#10:2, Z:6, 2#10:2, Y:6, 2#10:2, X:6>>.
-endif.

View file

@ -5,5 +5,7 @@
single_quotes = false,
no_jsonp_escapes = false,
comments = false,
json_escape = false
json_escape = false,
dirty_strings = false,
ignore_bad_escapes = false
}).

View file

@ -39,7 +39,7 @@
%% builds an encoder with jsx:encoder/3, passing ?MODULE as its first argument,
%% and applies that encoder to Source. only a list is accepted for Opts
%% NOTE(review): two bodies follow the clause head. they differ only in whether
%% json_escape is prepended to Opts before jsx_utils:extract_opts/1 is called;
%% only one can be kept
-spec to_json(Source::any(), Opts::opts()) -> binary().
to_json(Source, Opts) when is_list(Opts) ->
(jsx:encoder(?MODULE, Opts, jsx_utils:extract_opts([json_escape] ++ Opts)))(Source).
(jsx:encoder(?MODULE, Opts, jsx_utils:extract_opts(Opts)))(Source).
-spec format(Source::binary(), Opts::opts()) -> binary().
@ -195,6 +195,9 @@ basic_format_test_() ->
[{"naked float", ?_assertEqual(format(<<"1.23">>, []), <<"1.23">>)}]
},
{"naked string", ?_assertEqual(format(<<"\"hi\"">>, []), <<"\"hi\"">>)},
{"naked string with control character", ?_assertEqual(
format(<<"\"hi\\n\"">>, [json_escape]), <<"\"hi\\n\"">>
)},
{"naked literal", ?_assertEqual(format(<<"true">>, []), <<"true">>)},
{"simple object", ?_assertEqual(
format(<<" { \"key\" :\n\t \"value\"\r\r\r\n } ">>, []),
@ -241,6 +244,9 @@ basic_to_json_test_() ->
[{"naked float", ?_assertEqual(to_json(1.23, []) , <<"1.23">>)}]
},
{"naked string", ?_assertEqual(to_json(<<"hi">>, []), <<"\"hi\"">>)},
{"naked string with control character", ?_assertEqual(
to_json(<<"hi\n">>, [json_escape]), <<"\"hi\\n\"">>
)},
{"naked literal", ?_assertEqual(to_json(true, []), <<"true">>)},
{"simple object", ?_assertEqual(
to_json(
@ -324,10 +330,5 @@ opts_test_() ->
)}
].
%% format/2 given loose_unicode and an escape_forward_slash tuple must still
%% return an empty array unchanged
ext_opts_test_() ->
    Opts = [loose_unicode, {escape_forward_slash, true}],
    [
        {"extopts", ?_assertEqual(format(<<"[]">>, Opts), <<"[]">>)}
    ].
-endif.

View file

@ -51,6 +51,17 @@ parse_opts([comments|Rest], Opts) ->
parse_opts(Rest, Opts#opts{comments=true});
parse_opts([json_escape|Rest], Opts) ->
parse_opts(Rest, Opts#opts{json_escape=true});
parse_opts([dirty_strings|Rest], Opts) ->
parse_opts(Rest, Opts#opts{dirty_strings=true});
parse_opts([ignore_bad_escapes|Rest], Opts) ->
parse_opts(Rest, Opts#opts{ignore_bad_escapes=true});
parse_opts([relax|Rest], Opts) ->
parse_opts(Rest, Opts#opts{
loose_unicode = true,
single_quotes = true,
comments = true,
ignore_bad_escapes = true
});
parse_opts(_, _) ->
{error, badarg}.
@ -63,7 +74,10 @@ valid_flags() ->
single_quotes,
no_jsonp_escapes,
comments,
json_escape
json_escape,
dirty_strings,
ignore_bad_escapes,
relax
].
@ -88,7 +102,10 @@ extract_parser_opts([K|Rest], Acc) ->
%% everything else should be a legal json string component
%% two-argument entry point for escaping a binary string
%% NOTE(review): two bodies follow the clause head. the first always hands the
%% string to json_escape/4 starting at offset 0; the second returns String
%% untouched when the dirty_strings option is set and hands it to
%% json_escape/4 otherwise. only one can be kept
json_escape(String, Opts) when is_binary(String) ->
json_escape(String, Opts, 0, size(String)).
case Opts#opts.dirty_strings of
true -> String
; false -> json_escape(String, Opts, 0, size(String))
end.
-define(control_character(X),
@ -243,7 +260,7 @@ json_escape(Str, Opts, L, Len) when L < Len ->
json_escape(<<H/binary, 16#2028/utf8, T/binary>>, Opts, L + 3, Len);
false ->
B = unicode:characters_to_binary(json_escape_sequence(16#2028)),
json_escape(<<H/binary, B/binary, T/binary>>, Opts, L + size(B), Len + size(B) - size(<<16#2028/utf8>>))
json_escape(<<H/binary, B/binary, T/binary>>, Opts, L + 6, Len + 3)
end;
<<H:L/binary, 16#2029/utf8, T/binary>> ->
case Opts#opts.no_jsonp_escapes of
@ -251,26 +268,51 @@ json_escape(Str, Opts, L, Len) when L < Len ->
json_escape(<<H/binary, 16#2029/utf8, T/binary>>, Opts, L + 3, Len);
false ->
B = unicode:characters_to_binary(json_escape_sequence(16#2029)),
json_escape(<<H/binary, B/binary, T/binary>>, Opts, L + size(B), Len + size(B) - size(<<16#2029/utf8>>))
json_escape(<<H/binary, B/binary, T/binary>>, Opts, L + 6, Len + 3)
end;
<<_:L/binary, X/utf8, _/binary>> when X < 16#0080 ->
json_escape(Str, Opts, L + 1, Len);
<<_:L/binary, X/utf8, _/binary>> when X < 16#0800 ->
json_escape(Str, Opts, L + 2, Len);
<<_:L/binary, X/utf8, _/binary>> when X < 16#10000 ->
<<_:L/binary, X/utf8, _/binary>> when X < 16#dcff ->
json_escape(Str, Opts, L + 3, Len);
<<_:L/binary, _/utf8, _/binary>> ->
<<_:L/binary, X/utf8, _/binary>> when X > 16#dfff, X < 16#fdd0 ->
json_escape(Str, Opts, L + 3, Len);
<<_:L/binary, X/utf8, _/binary>> when X > 16#fdef, X < 16#fffe ->
json_escape(Str, Opts, L + 3, Len);
<<_:L/binary, X/utf8, _/binary>> when X >= 16#10000, X < 16#1fffe ->
json_escape(Str, Opts, L + 4, Len);
<<H:L/binary, 237, X, _, T/binary>> when X >= 160 ->
case Opts#opts.loose_unicode of
true -> json_escape(<<H/binary, 16#fffd/utf8, T/binary>>, Opts, L + 3, Len);
false -> erlang:error(badarg, [Str, Opts])
end;
<<H:L/binary, _, T/binary>> ->
case Opts#opts.loose_unicode of
true -> json_escape(<<H/binary, 16#fffd/utf8, T/binary>>, Opts, L + 3, Len + 2);
false -> erlang:error(badarg, [Str, Opts])
end
<<_:L/binary, X/utf8, _/binary>> when X >= 16#20000, X < 16#2fffe ->
json_escape(Str, Opts, L + 4, Len);
<<_:L/binary, X/utf8, _/binary>> when X >= 16#30000, X < 16#3fffe ->
json_escape(Str, Opts, L + 4, Len);
<<_:L/binary, X/utf8, _/binary>> when X >= 16#40000, X < 16#4fffe ->
json_escape(Str, Opts, L + 4, Len);
<<_:L/binary, X/utf8, _/binary>> when X >= 16#50000, X < 16#5fffe ->
json_escape(Str, Opts, L + 4, Len);
<<_:L/binary, X/utf8, _/binary>> when X >= 16#60000, X < 16#6fffe ->
json_escape(Str, Opts, L + 4, Len);
<<_:L/binary, X/utf8, _/binary>> when X >= 16#70000, X < 16#7fffe ->
json_escape(Str, Opts, L + 4, Len);
<<_:L/binary, X/utf8, _/binary>> when X >= 16#80000, X < 16#8fffe ->
json_escape(Str, Opts, L + 4, Len);
<<_:L/binary, X/utf8, _/binary>> when X >= 16#90000, X < 16#9fffe ->
json_escape(Str, Opts, L + 4, Len);
<<_:L/binary, X/utf8, _/binary>> when X >= 16#a0000, X < 16#afffe ->
json_escape(Str, Opts, L + 4, Len);
<<_:L/binary, X/utf8, _/binary>> when X >= 16#b0000, X < 16#bfffe ->
json_escape(Str, Opts, L + 4, Len);
<<_:L/binary, X/utf8, _/binary>> when X >= 16#c0000, X < 16#cfffe ->
json_escape(Str, Opts, L + 4, Len);
<<_:L/binary, X/utf8, _/binary>> when X >= 16#d0000, X < 16#dfffe ->
json_escape(Str, Opts, L + 4, Len);
<<_:L/binary, X/utf8, _/binary>> when X >= 16#e0000, X < 16#efffe ->
json_escape(Str, Opts, L + 4, Len);
<<_:L/binary, X/utf8, _/binary>> when X >= 16#f0000, X < 16#ffffe ->
json_escape(Str, Opts, L + 4, Len);
<<_:L/binary, X/utf8, _/binary>> when X >= 16#100000, X < 16#10fffe ->
json_escape(Str, Opts, L + 4, Len);
_ -> erlang:error(badarg, [Str, Opts])
end;
json_escape(Str, _, L, Len) when L =:= Len ->
Str.
@ -291,7 +333,6 @@ to_hex(15) -> $f;
to_hex(X) -> X + 48. %% ascii "1" is [49], "2" is [50], etc...
%% eunit tests
-ifdef(TEST).
-include_lib("eunit/include/eunit.hrl").
@ -329,28 +370,53 @@ binary_escape_test_() ->
<<"\\/Date(1303502009425)\\/">>
)
},
{"bad utf8",
?_assertError(badarg, json_escape(<<32, 64, 128, 255>>, #opts{}))
},
{"bad utf8 ok",
{"dirty strings",
?_assertEqual(
json_escape(<<32, 64, 128, 255>>, #opts{loose_unicode=true}),
<<32, 64, 16#fffd/utf8, 16#fffd/utf8>>
)
},
{"bad surrogate", ?_assertError(badarg, json_escape(<<237, 160, 127>>, #opts{}))},
{"bad surrogate ok",
?_assertEqual(
json_escape(<<237, 160, 127>>, #opts{loose_unicode=true}),
<<16#fffd/utf8>>
)
},
{"all sizes of codepoints",
?_assertEqual(
json_escape(unicode:characters_to_binary([0, 32, 16#80, 16#800, 16#10000]), #opts{}),
<<"\\u0000", 32/utf8, 16#80/utf8, 16#800/utf8, 16#10000/utf8>>
json_escape(<<"\\x25\\uffff">>, #opts{dirty_strings=true}),
<<"\\x25\\uffff">>
)
}
].
%% checks that parse_opts/1 turns option atoms into the matching #opts{} fields
%%
%% "all flags" passes every individual boolean flag, json_escape included, and
%% expects each corresponding field to be true. "relax flag" expects the single
%% relax atom to switch on loose_unicode, single_quotes, comments and
%% ignore_bad_escapes while leaving every other field at its default
opts_test_() ->
    [
        {"all flags",
            ?_assertEqual(
                parse_opts([
                    loose_unicode,
                    escape_forward_slash,
                    explicit_end,
                    single_quotes,
                    no_jsonp_escapes,
                    comments,
                    json_escape,
                    dirty_strings,
                    ignore_bad_escapes
                ]),
                #opts{
                    loose_unicode=true,
                    escape_forward_slash=true,
                    explicit_end=true,
                    single_quotes=true,
                    no_jsonp_escapes=true,
                    comments=true,
                    json_escape=true,
                    dirty_strings=true,
                    ignore_bad_escapes=true
                }
            )
        },
        {"relax flag",
            ?_assertEqual(
                parse_opts([relax]),
                #opts{
                    loose_unicode=true,
                    single_quotes=true,
                    comments=true,
                    ignore_bad_escapes=true
                }
            )
        }
    ].
-endif.