Adds the {escaped_unicode, replace} option, which replaces illegal escape sequences with the Unicode replacement character U+FFFD when they are encountered
This commit is contained in:
parent
b4eaf15dbe
commit
66f5e0b48a
6 changed files with 56 additions and 9 deletions
|
@ -33,7 +33,7 @@
|
||||||
|
|
||||||
|
|
||||||
-type jsx_opts() :: [jsx_opt()].
|
-type jsx_opts() :: [jsx_opt()].
|
||||||
-type jsx_opt() :: {escaped_unicode, ascii | codepoint | none}
|
-type jsx_opt() :: {escaped_unicode, ascii | codepoint | replace | none}
|
||||||
| {multi_term, true | false}
|
| {multi_term, true | false}
|
||||||
| {encoding, auto
|
| {encoding, auto
|
||||||
| utf8
|
| utf8
|
||||||
|
|
|
@ -135,7 +135,7 @@ parse_opts(Opts) ->
|
||||||
parse_opts([], Opts) ->
|
parse_opts([], Opts) ->
|
||||||
Opts;
|
Opts;
|
||||||
parse_opts([{escaped_unicode, Value}|Rest], Opts) ->
|
parse_opts([{escaped_unicode, Value}|Rest], Opts) ->
|
||||||
true = lists:member(Value, [ascii, codepoint, none]),
|
true = lists:member(Value, [ascii, codepoint, replace, none]),
|
||||||
parse_opts(Rest, Opts#opts{escaped_unicode=Value});
|
parse_opts(Rest, Opts#opts{escaped_unicode=Value});
|
||||||
parse_opts([{multi_term, Value}|Rest], Opts) ->
|
parse_opts([{multi_term, Value}|Rest], Opts) ->
|
||||||
true = lists:member(Value, [true, false]),
|
true = lists:member(Value, [true, false]),
|
||||||
|
@ -458,13 +458,17 @@ escape(Bin, Stack, Opts, Acc) ->
|
||||||
|
|
||||||
|
|
||||||
%% this code is ugly and unfortunate, but so is json's handling of escaped
|
%% this code is ugly and unfortunate, but so is json's handling of escaped
|
||||||
%% unicode codepoint sequences. if the ascii option is present, the sequence
|
%% unicode codepoint sequences.
|
||||||
%% is converted to a codepoint and inserted into the string if it represents
|
%% if the ascii option is present, the sequence is converted to a codepoint
|
||||||
%% an ascii value. if the codepoint option is present the sequence is
|
%% and inserted into the string if it represents an ascii value.
|
||||||
%% converted and inserted as long as it represents a valid unicode codepoint.
|
%% if the codepoint option is present the sequence is converted and inserted
|
||||||
%% this means non-characters representable in 16 bits are not converted (the
|
%% as long as it represents a valid unicode codepoint. this means
|
||||||
%5 utf16 surrogates and the two special non-characters). any other option and
|
%% non-characters representable in 16 bits are not converted (the utf16
|
||||||
%% no conversion is done
|
%% surrogates and the two special non-characters).
|
||||||
|
%% if the replace option is present sequences are converted as in codepoint
|
||||||
|
%% with the exception that the non-characters are replaced with u+fffd, the
|
||||||
|
%% unicode replacement character
|
||||||
|
%% with any other option, no conversion is done
|
||||||
escaped_unicode(<<D/?utfx, Rest/binary>>,
|
escaped_unicode(<<D/?utfx, Rest/binary>>,
|
||||||
Stack,
|
Stack,
|
||||||
#opts{escaped_unicode=ascii}=Opts,
|
#opts{escaped_unicode=ascii}=Opts,
|
||||||
|
@ -504,6 +508,33 @@ escaped_unicode(<<D/?utfx, Rest/binary>>,
|
||||||
; _ ->
|
; _ ->
|
||||||
string(Rest, Stack, Opts, [D, C, B, A, $u, ?rsolidus] ++ String)
|
string(Rest, Stack, Opts, [D, C, B, A, $u, ?rsolidus] ++ String)
|
||||||
end;
|
end;
|
||||||
|
escaped_unicode(<<D/?utfx, Rest/binary>>,
|
||||||
|
Stack,
|
||||||
|
#opts{escaped_unicode=replace}=Opts,
|
||||||
|
String,
|
||||||
|
[C, B, A])
|
||||||
|
when ?is_hex(D) ->
|
||||||
|
case erlang:list_to_integer([A, B, C, D], 16) of
|
||||||
|
X when X >= 16#dc00, X =< 16#dfff ->
|
||||||
|
case check_acc_for_surrogate(String) of
|
||||||
|
false ->
|
||||||
|
string(Rest,
|
||||||
|
Stack,
|
||||||
|
Opts,
|
||||||
|
[16#fffd] ++ String
|
||||||
|
)
|
||||||
|
; {Y, NewString} ->
|
||||||
|
string(Rest,
|
||||||
|
Stack,
|
||||||
|
Opts,
|
||||||
|
[surrogate_to_codepoint(Y, X)] ++ NewString
|
||||||
|
)
|
||||||
|
end
|
||||||
|
; X when X < 16#d800; X > 16#dfff, X < 16#fffe ->
|
||||||
|
string(Rest, Stack, Opts, [X] ++ String)
|
||||||
|
; _ ->
|
||||||
|
string(Rest, Stack, Opts, [16#fffd] ++ String)
|
||||||
|
end;
|
||||||
escaped_unicode(<<D/?utfx, Rest/binary>>, Stack, Opts, String, [C, B, A])
|
escaped_unicode(<<D/?utfx, Rest/binary>>, Stack, Opts, String, [C, B, A])
|
||||||
when ?is_hex(D) ->
|
when ?is_hex(D) ->
|
||||||
string(Rest, Stack, Opts, [D, C, B, A, $u, ?rsolidus] ++ String);
|
string(Rest, Stack, Opts, [D, C, B, A, $u, ?rsolidus] ++ String);
|
||||||
|
|
1
test/cases/unicode_replaced.json
Normal file
1
test/cases/unicode_replaced.json
Normal file
|
@ -0,0 +1 @@
|
||||||
|
[ "non-character: ", "\uffff" ]
|
7
test/cases/unicode_replaced.test
Normal file
7
test/cases/unicode_replaced.test
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
{name, "unicode_replaced"}.
|
||||||
|
{jsx, [start_array,
|
||||||
|
{string,"non-character: "},
|
||||||
|
{string,[16#fffd]},
|
||||||
|
end_array,end_json]}.
|
||||||
|
{json, "unicode_replaced.json"}.
|
||||||
|
{jsx_flags, [{escaped_unicode,replace}]}.
|
1
test/cases/unicode_to_codepoint_noncharacter.json
Normal file
1
test/cases/unicode_to_codepoint_noncharacter.json
Normal file
|
@ -0,0 +1 @@
|
||||||
|
[ "non-character: ", "\uffff" ]
|
7
test/cases/unicode_to_codepoint_noncharacter.test
Normal file
7
test/cases/unicode_to_codepoint_noncharacter.test
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
{name, "unicode_to_codepoint_noncharacter"}.
|
||||||
|
{jsx, [start_array,
|
||||||
|
{string,"non-character: "},
|
||||||
|
{string,"\\uffff"},
|
||||||
|
end_array,end_json]}.
|
||||||
|
{json, "unicode_to_codepoint_noncharacter.json"}.
|
||||||
|
{jsx_flags, [{escaped_unicode,codepoint}]}.
|
Loading…
Add table
Add a link
Reference in a new issue