From 66f5e0b48a1146c0bb4413eae7af1adea6e1f358 Mon Sep 17 00:00:00 2001 From: alisdair sullivan Date: Fri, 22 Jul 2011 23:47:35 -0700 Subject: [PATCH] adds {escaped_unicode, replace} option that replaces illegal escape sequences with the unicode replacement character u+fffd when encountered --- src/jsx_common.hrl | 2 +- src/jsx_decoder.hrl | 47 +++++++++++++++---- test/cases/unicode_replaced.json | 1 + test/cases/unicode_replaced.test | 7 +++ .../unicode_to_codepoint_noncharacter.json | 1 + .../unicode_to_codepoint_noncharacter.test | 7 +++ 6 files changed, 56 insertions(+), 9 deletions(-) create mode 100644 test/cases/unicode_replaced.json create mode 100644 test/cases/unicode_replaced.test create mode 100644 test/cases/unicode_to_codepoint_noncharacter.json create mode 100644 test/cases/unicode_to_codepoint_noncharacter.test diff --git a/src/jsx_common.hrl b/src/jsx_common.hrl index bbcec79..09c758c 100644 --- a/src/jsx_common.hrl +++ b/src/jsx_common.hrl @@ -33,7 +33,7 @@ -type jsx_opts() :: [jsx_opt()]. --type jsx_opt() :: {escaped_unicode, ascii | codepoint | none} +-type jsx_opt() :: {escaped_unicode, ascii | codepoint | replace | none} | {multi_term, true | false} | {encoding, auto | utf8 diff --git a/src/jsx_decoder.hrl b/src/jsx_decoder.hrl index 9caf60a..19983df 100644 --- a/src/jsx_decoder.hrl +++ b/src/jsx_decoder.hrl @@ -135,7 +135,7 @@ parse_opts(Opts) -> parse_opts([], Opts) -> Opts; parse_opts([{escaped_unicode, Value}|Rest], Opts) -> - true = lists:member(Value, [ascii, codepoint, none]), + true = lists:member(Value, [ascii, codepoint, replace, none]), parse_opts(Rest, Opts#opts{escaped_unicode=Value}); parse_opts([{multi_term, Value}|Rest], Opts) -> true = lists:member(Value, [true, false]), @@ -458,13 +458,17 @@ escape(Bin, Stack, Opts, Acc) -> %% this code is ugly and unfortunate, but so is json's handling of escaped -%% unicode codepoint sequences. if the ascii option is present, the sequence -%% is converted to a codepoint and inserted into the string if it represents -%% an ascii value. if the codepoint option is present the sequence is -%% converted and inserted as long as it represents a valid unicode codepoint. -%% this means non-characters representable in 16 bits are not converted (the -%5 utf16 surrogates and the two special non-characters). any other option and -%% no conversion is done +%% unicode codepoint sequences. +%% if the ascii option is present, the sequence is converted to a codepoint +%% and inserted into the string if it represents an ascii value. +%% if the codepoint option is present the sequence is converted and inserted +%% as long as it represents a valid unicode codepoint. this means +%% non-characters representable in 16 bits are not converted (the utf16 +%% surrogates and the two special non-characters). +%% if the replace option is present sequences are converted as in codepoint +%% with the exception that the non-characters are replaced with u+fffd, the +%% unicode replacement character +%% any other option and no conversion is done escaped_unicode(<>, Stack, #opts{escaped_unicode=ascii}=Opts, @@ -504,6 +508,33 @@ escaped_unicode(<>, ; _ -> string(Rest, Stack, Opts, [D, C, B, A, $u, ?rsolidus] ++ String) end; +escaped_unicode(<>, + Stack, + #opts{escaped_unicode=replace}=Opts, + String, + [C, B, A]) + when ?is_hex(D) -> + case erlang:list_to_integer([A, B, C, D], 16) of + X when X >= 16#dc00, X =< 16#dfff -> + case check_acc_for_surrogate(String) of + false -> + string(Rest, + Stack, + Opts, + [16#fffd] ++ String + ) + ; {Y, NewString} -> + string(Rest, + Stack, + Opts, + [surrogate_to_codepoint(Y, X)] ++ NewString + ) + end + ; X when X < 16#d800; X > 16#dfff, X < 16#fffe -> + string(Rest, Stack, Opts, [X] ++ String) + ; _ -> + string(Rest, Stack, Opts, [16#fffd] ++ String) + end; escaped_unicode(<>, Stack, Opts, String, [C, B, A]) when ?is_hex(D) -> string(Rest, Stack, Opts, [D, C, B, A, $u, ?rsolidus] ++ String); diff --git a/test/cases/unicode_replaced.json b/test/cases/unicode_replaced.json new file mode 100644 index 0000000..c8a71c9 --- /dev/null +++ b/test/cases/unicode_replaced.json @@ -0,0 +1 @@ +[ "non-character: ", "\uffff" ] \ No newline at end of file diff --git a/test/cases/unicode_replaced.test b/test/cases/unicode_replaced.test new file mode 100644 index 0000000..5cfe64b --- /dev/null +++ b/test/cases/unicode_replaced.test @@ -0,0 +1,7 @@ +{name, "unicode_replaced"}. +{jsx, [start_array, + {string,"non-character: "}, + {string,[16#fffd]}, + end_array,end_json]}. +{json, "unicode_replaced.json"}. +{jsx_flags, [{escaped_unicode,replace}]}. \ No newline at end of file diff --git a/test/cases/unicode_to_codepoint_noncharacter.json b/test/cases/unicode_to_codepoint_noncharacter.json new file mode 100644 index 0000000..c8a71c9 --- /dev/null +++ b/test/cases/unicode_to_codepoint_noncharacter.json @@ -0,0 +1 @@ +[ "non-character: ", "\uffff" ] \ No newline at end of file diff --git a/test/cases/unicode_to_codepoint_noncharacter.test b/test/cases/unicode_to_codepoint_noncharacter.test new file mode 100644 index 0000000..63558db --- /dev/null +++ b/test/cases/unicode_to_codepoint_noncharacter.test @@ -0,0 +1,7 @@ +{name, "unicode_to_codepoint_noncharacter"}. +{jsx, [start_array, + {string,"non-character: "}, + {string,"\\uffff"}, + end_array,end_json]}. +{json, "unicode_to_codepoint_noncharacter.json"}. +{jsx_flags, [{escaped_unicode,codepoint}]}. \ No newline at end of file