From 66f5e0b48a1146c0bb4413eae7af1adea6e1f358 Mon Sep 17 00:00:00 2001
From: alisdair sullivan <alisdairsullivan@yahoo.ca>
Date: Fri, 22 Jul 2011 23:47:35 -0700
Subject: [PATCH] adds {escaped_unicode, replace} option that replaces illegal
 escape sequences with the unicode replacement character u+fffd when
 encountered

---
 src/jsx_common.hrl                            |  2 +-
 src/jsx_decoder.hrl                           | 47 +++++++++++++++----
 test/cases/unicode_replaced.json              |  1 +
 test/cases/unicode_replaced.test              |  7 +++
 .../unicode_to_codepoint_noncharacter.json    |  1 +
 .../unicode_to_codepoint_noncharacter.test    |  7 +++
 6 files changed, 56 insertions(+), 9 deletions(-)
 create mode 100644 test/cases/unicode_replaced.json
 create mode 100644 test/cases/unicode_replaced.test
 create mode 100644 test/cases/unicode_to_codepoint_noncharacter.json
 create mode 100644 test/cases/unicode_to_codepoint_noncharacter.test

diff --git a/src/jsx_common.hrl b/src/jsx_common.hrl
index bbcec79..09c758c 100644
--- a/src/jsx_common.hrl
+++ b/src/jsx_common.hrl
@@ -33,7 +33,7 @@
 
 
 -type jsx_opts() :: [jsx_opt()].
--type jsx_opt() :: {escaped_unicode, ascii | codepoint | none}
+-type jsx_opt() :: {escaped_unicode, ascii | codepoint | replace | none}
     | {multi_term, true | false}
     | {encoding, auto 
         | utf8
diff --git a/src/jsx_decoder.hrl b/src/jsx_decoder.hrl
index 9caf60a..19983df 100644
--- a/src/jsx_decoder.hrl
+++ b/src/jsx_decoder.hrl
@@ -135,7 +135,7 @@ parse_opts(Opts) ->
 parse_opts([], Opts) ->
     Opts;
 parse_opts([{escaped_unicode, Value}|Rest], Opts) ->
-    true = lists:member(Value, [ascii, codepoint, none]),
+    true = lists:member(Value, [ascii, codepoint, replace, none]),
     parse_opts(Rest, Opts#opts{escaped_unicode=Value});
 parse_opts([{multi_term, Value}|Rest], Opts) ->
     true = lists:member(Value, [true, false]),
@@ -458,13 +458,17 @@ escape(Bin, Stack, Opts, Acc) ->
 
 
 %% this code is ugly and unfortunate, but so is json's handling of escaped 
-%%   unicode codepoint sequences. if the ascii option is present, the sequence 
-%%   is converted to a codepoint and inserted into the string if it represents 
-%%   an ascii value. if the codepoint option is present the sequence is 
-%%   converted and inserted as long as it represents a valid unicode codepoint. 
-%%   this means non-characters representable in 16 bits are not converted (the 
-%5   utf16 surrogates and the two special non-characters). any other option and 
-%%   no conversion is done
+%%   unicode codepoint sequences.
+%% if the ascii option is present, the sequence is converted to a codepoint 
+%%   and inserted into the string if it represents an ascii value. 
+%% if the codepoint option is present the sequence is converted and inserted 
+%%   as long as it represents a valid unicode codepoint. this means 
+%%   non-characters representable in 16 bits are not converted (the utf16 
+%%   surrogates and the two special non-characters). 
+%% if the replace option is present sequences are converted as in codepoint
+%%   with the exception that the non-characters are replaced with u+fffd, the
+%%   unicode replacement character
+%% with any other option, no conversion is done
 escaped_unicode(<<D/?utfx, Rest/binary>>, 
         Stack, 
         #opts{escaped_unicode=ascii}=Opts, 
@@ -504,6 +508,33 @@ escaped_unicode(<<D/?utfx, Rest/binary>>,
         ; _ ->
             string(Rest, Stack, Opts, [D, C, B, A, $u, ?rsolidus] ++ String)
     end;
+%% replace mode: the four hex digits are decoded; a codepoint below d800 or in
+%%   e000-fffd is inserted as is, a low surrogate is merged with the high
+%%   surrogate reported by check_acc_for_surrogate/1, all else becomes u+fffd
+%% NOTE(review): a high surrogate (d800-dbff) takes the final branch and is
+%%   replaced at once -- confirm check_acc_for_surrogate/1 can still pair it
+escaped_unicode(<<D/?utfx, Rest/binary>>,
+        Stack,
+        #opts{escaped_unicode=replace}=Opts,
+        String,
+        [C, B, A])
+    when ?is_hex(D) ->
+    case erlang:list_to_integer([A, B, C, D], 16) of
+        Low when Low >= 16#dc00, Low =< 16#dfff ->
+            case check_acc_for_surrogate(String) of
+                {High, Prefix} ->
+                    Pair = surrogate_to_codepoint(High, Low),
+                    string(Rest, Stack, Opts, [Pair | Prefix])
+                ; false ->
+                    string(Rest, Stack, Opts, [16#fffd | String])
+            end
+        ; Code when Code < 16#d800 ->
+            string(Rest, Stack, Opts, [Code | String])
+        ; Code when Code > 16#dfff, Code < 16#fffe ->
+            string(Rest, Stack, Opts, [Code | String])
+        ; _ ->
+            string(Rest, Stack, Opts, [16#fffd | String])
+    end;
 escaped_unicode(<<D/?utfx, Rest/binary>>, Stack, Opts, String, [C, B, A]) 
     when ?is_hex(D) ->
     string(Rest, Stack, Opts, [D, C, B, A, $u, ?rsolidus] ++ String);
diff --git a/test/cases/unicode_replaced.json b/test/cases/unicode_replaced.json
new file mode 100644
index 0000000..c8a71c9
--- /dev/null
+++ b/test/cases/unicode_replaced.json
@@ -0,0 +1 @@
+[ "non-character: ", "\uffff" ]
\ No newline at end of file
diff --git a/test/cases/unicode_replaced.test b/test/cases/unicode_replaced.test
new file mode 100644
index 0000000..5cfe64b
--- /dev/null
+++ b/test/cases/unicode_replaced.test
@@ -0,0 +1,7 @@
+{name, "unicode_replaced"}.
+{jsx, [start_array,
+ {string,"non-character: "},
+ {string,[16#fffd]},
+ end_array,end_json]}.
+{json, "unicode_replaced.json"}.
+{jsx_flags, [{escaped_unicode,replace}]}.
\ No newline at end of file
diff --git a/test/cases/unicode_to_codepoint_noncharacter.json b/test/cases/unicode_to_codepoint_noncharacter.json
new file mode 100644
index 0000000..c8a71c9
--- /dev/null
+++ b/test/cases/unicode_to_codepoint_noncharacter.json
@@ -0,0 +1 @@
+[ "non-character: ", "\uffff" ]
\ No newline at end of file
diff --git a/test/cases/unicode_to_codepoint_noncharacter.test b/test/cases/unicode_to_codepoint_noncharacter.test
new file mode 100644
index 0000000..63558db
--- /dev/null
+++ b/test/cases/unicode_to_codepoint_noncharacter.test
@@ -0,0 +1,7 @@
+{name, "unicode_to_codepoint_noncharacter"}.
+{jsx, [start_array,
+ {string,"non-character: "},
+ {string,"\\uffff"},
+ end_array,end_json]}.
+{json, "unicode_to_codepoint_noncharacter.json"}.
+{jsx_flags, [{escaped_unicode,codepoint}]}.
\ No newline at end of file