From 653205501c8889fffaaaf4a20103d4266b0d6da6 Mon Sep 17 00:00:00 2001
From: alisdair sullivan <alisdairsullivan@yahoo.ca>
Date: Wed, 27 Jul 2011 01:59:03 -0700
Subject: [PATCH 1/2] adds loose_unicode option which replaces badly formed
 unicode (escaped non-characters and restricted codepoints, as well as partial
 surrogates) with u+fffd instead of throwing an error

---
 src/jsx_common.hrl                            |  1 +
 src/jsx_decoder.hrl                           | 98 +++++++++++++++++--
 src/jsx_utils.erl                             |  4 +-
 .../escaped_noncharacter_ext_replaced.json    |  1 +
 .../escaped_noncharacter_ext_replaced.test    |  4 +
 test/cases/escaped_noncharacter_replaced.json |  1 +
 test/cases/escaped_noncharacter_replaced.test |  4 +
 test/cases/escaped_nullbyte_replaced.json     |  1 +
 test/cases/escaped_nullbyte_replaced.test     |  4 +
 test/cases/noncharacter.json                  |  1 +
 test/cases/noncharacter.test                  |  3 +
 test/cases/unpaired_surrogate.json            |  1 +
 test/cases/unpaired_surrogate.test            |  3 +
 test/cases/unpaired_surrogate_replaced.json   |  1 +
 test/cases/unpaired_surrogate_replaced.test   |  4 +
 15 files changed, 121 insertions(+), 10 deletions(-)
 create mode 100644 test/cases/escaped_noncharacter_ext_replaced.json
 create mode 100644 test/cases/escaped_noncharacter_ext_replaced.test
 create mode 100644 test/cases/escaped_noncharacter_replaced.json
 create mode 100644 test/cases/escaped_noncharacter_replaced.test
 create mode 100644 test/cases/escaped_nullbyte_replaced.json
 create mode 100644 test/cases/escaped_nullbyte_replaced.test
 create mode 100644 test/cases/noncharacter.json
 create mode 100644 test/cases/noncharacter.test
 create mode 100644 test/cases/unpaired_surrogate.json
 create mode 100644 test/cases/unpaired_surrogate.test
 create mode 100644 test/cases/unpaired_surrogate_replaced.json
 create mode 100644 test/cases/unpaired_surrogate_replaced.test

diff --git a/src/jsx_common.hrl b/src/jsx_common.hrl
index 6431434..9c950c9 100644
--- a/src/jsx_common.hrl
+++ b/src/jsx_common.hrl
@@ -34,6 +34,7 @@
 
 -type jsx_opts() :: [jsx_opt()].
 -type jsx_opt() :: {multi_term, true | false}
+    | loose_unicode
     | {encoding, auto 
         | utf8
         | utf16
diff --git a/src/jsx_decoder.hrl b/src/jsx_decoder.hrl
index 6aa5b6b..d3a2fc4 100644
--- a/src/jsx_decoder.hrl
+++ b/src/jsx_decoder.hrl
@@ -31,6 +31,7 @@
 %% opts record for decoder
 -record(opts, {
     multi_term = false,
+    loose_unicode = false,
     encoding = auto
 }).
 
@@ -81,7 +82,7 @@
 ).
 
 -define(is_noncontrol(Symbol),
-    Symbol >= ?space
+    (Symbol >= ?space)
 ).
 
 -define(is_whitespace(Symbol),
@@ -138,6 +139,8 @@ parse_opts([{multi_term, Value}|Rest], Opts) ->
     parse_opts(Rest, Opts#opts{multi_term=Value});
 parse_opts([multi_term|Rest], Opts) ->
     parse_opts(Rest, Opts#opts{multi_term=true});
+parse_opts([loose_unicode|Rest], Opts) ->
+    parse_opts(Rest, Opts#opts{loose_unicode=true});
 parse_opts([{encoding, _}|Rest], Opts) ->
     parse_opts(Rest, Opts);
 parse_opts(_, _) ->
@@ -350,7 +353,6 @@ key(Bin, Stack, Opts) ->
 %%   states
 string(Bin, Stack, Opts) -> string(Bin, Stack, Opts, <<>>).
 
-
 string(<<?quote/?utfx, Rest/binary>>, [key|_] = Stack, Opts, Acc) ->
     {jsx, {key, Acc}, fun() -> colon(Rest, Stack, Opts) end};
 string(<<?quote/?utfx, Rest/binary>>, Stack, Opts, Acc) ->
@@ -358,8 +360,37 @@ string(<<?quote/?utfx, Rest/binary>>, Stack, Opts, Acc) ->
         maybe_done(Rest, Stack, Opts)
     end};
 string(<<?rsolidus/?utfx, Rest/binary>>, Stack, Opts, Acc) ->
-    escape(Rest, Stack, Opts, Acc);   
-string(<<S/?utfx, Rest/binary>>, Stack, Opts, Acc) when ?is_noncontrol(S) ->
+    escape(Rest, Stack, Opts, Acc);
+%% things get dumb here. erlang doesn't properly restrict unicode non-characters
+%%   so you can't trust the codepoints it returns always
+%% the range 32..16#fdcf is safe, so allow that
+string(<<S/?utfx, Rest/binary>>, Stack, Opts, Acc)
+        when ?is_noncontrol(S), S < 16#fdd0 ->
+    string(Rest, Stack, Opts, <<Acc/binary, S/utf8>>);
+%% the range 16#fdf0..16#fffd is also safe
+string(<<S/?utfx, Rest/binary>>, Stack, Opts, Acc)
+        when S > 16#fdef, S < 16#fffe ->
+    string(Rest, Stack, Opts, <<Acc/binary, S/utf8>>);
+%% supplementary planes: accept any codepoint above 16#ffff except the two
+%%   noncharacters ending each plane. kept last; believed faster than first
+string(<<S/?utfx, Rest/binary>>, Stack, Opts, Acc)
+        when S > 16#ffff andalso
+            S =/= 16#1fffe andalso S =/= 16#1ffff andalso
+            S =/= 16#2fffe andalso S =/= 16#2ffff andalso
+            S =/= 16#3fffe andalso S =/= 16#3ffff andalso
+            S =/= 16#4fffe andalso S =/= 16#4ffff andalso
+            S =/= 16#5fffe andalso S =/= 16#5ffff andalso
+            S =/= 16#6fffe andalso S =/= 16#6ffff andalso
+            S =/= 16#7fffe andalso S =/= 16#7ffff andalso
+            S =/= 16#8fffe andalso S =/= 16#8ffff andalso
+            S =/= 16#9fffe andalso S =/= 16#9ffff andalso
+            S =/= 16#afffe andalso S =/= 16#affff andalso
+            S =/= 16#bfffe andalso S =/= 16#bffff andalso
+            S =/= 16#cfffe andalso S =/= 16#cffff andalso
+            S =/= 16#dfffe andalso S =/= 16#dffff andalso
+            S =/= 16#efffe andalso S =/= 16#effff andalso
+            S =/= 16#ffffe andalso S =/= 16#fffff andalso
+            S =/= 16#10fffe andalso S =/= 16#10ffff ->
+    string(Rest, Stack, Opts, <<Acc/binary, S/utf8>>);
 string(Bin, Stack, Opts, Acc) ->
     case partial_utf(Bin) of 
@@ -380,12 +411,14 @@ partial_utf(<<X, Rest/binary>>) when X >= 16#e0, X =< 16#ef ->
     case Rest of
         <<>> -> true
         ; <<Y>> when Y >= 16#80, Y =< 16#bf -> true
+        ; _ -> false
     end;
 partial_utf(<<X, Rest/binary>>) when X >= 16#f0, X =< 16#f4 ->
     case Rest of
         <<>> -> true
         ; <<Y>> when Y >= 16#80, Y =< 16#bf -> true
         ; <<Y, Z>> when Y >= 16#80, Y =< 16#bf, Z >= 16#80, Z =< 16#bf -> true
+        ; _ -> false
     end;
 partial_utf(_) -> false.    
 -endif.    
@@ -467,11 +500,21 @@ escaped_unicode(<<D/?utfx, Rest/binary>>, Stack, Opts, String, [C, B, A])
             low_surrogate(Rest, Stack, Opts, String, X)
         %% non-characters, you're not allowed to exchange these
         ; X when X == 16#fffe; X == 16#ffff; X >= 16#fdd0, X =< 16#fdef ->
-            {error, {badjson, <<D/?utfx, Rest/binary>>}}
+            case Opts#opts.loose_unicode of
+                true ->
+                    string(Rest, Stack, Opts, <<String/binary, 16#fffd/utf8>>)
+                ; false ->    
+                    {error, {badjson, <<D/?utfx, Rest/binary>>}}
+            end
         %% allowing interchange of null bytes allows attackers to forge
         %%   malicious streams
         ; X when X == 16#0000 ->
-            {error, {badjson, <<D/?utfx, Rest/binary>>}}
+            case Opts#opts.loose_unicode of
+                true ->
+                    string(Rest, Stack, Opts, <<String/binary, 16#fffd/utf8>>)
+                ; false ->    
+                    {error, {badjson, <<D/?utfx, Rest/binary>>}}
+            end
         %% anything else
         ; X ->
             string(Rest, Stack, Opts, <<String/binary, X/utf8>>)
@@ -498,6 +541,14 @@ escaped_unicode(Bin, Stack, Opts, String, Acc) ->
 
 low_surrogate(<<?rsolidus/?utfx, Rest/binary>>, Stack, Opts, String, High) ->
     low_surrogate_u(Rest, Stack, Opts, String, High);
+%% not an escaped codepoint, our high codepoint is illegal
+low_surrogate(<<S/?utfx, Rest/binary>> = Bin, Stack, Opts, String, _) ->
+    case Opts#opts.loose_unicode of
+        true ->
+            string(Bin, Stack, Opts, <<String/binary, 16#fffd/utf8>>)
+        ; false ->
+            {error, {badjson, <<S/?utfx, Rest/binary>>}}
+    end;
 low_surrogate(Bin, Stack, Opts, String, High) ->
     case ?partial_codepoint(Bin) of
         true -> 
@@ -518,6 +569,19 @@ low_surrogate(Bin, Stack, Opts, String, High) ->
     
 low_surrogate_u(<<$u/?utfx, Rest/binary>>, Stack, Opts, String, High) ->
     low_surrogate(Rest, Stack, Opts, String, [], High);
+%% not a low surrogate, dispatch back to string to handle, including the
+%%   rsolidus we parsed previously
+low_surrogate_u(<<S/?utfx, Rest/binary>> = Bin, Stack, Opts, String, _) ->
+    case Opts#opts.loose_unicode of
+        true ->
+            string(<<?rsolidus/?utfx, Bin/binary>>,
+                Stack,
+                Opts,
+                <<String/binary, 16#fffd/utf8>>
+            )
+        ; false ->
+            {error, {badjson, <<S/?utfx, Rest/binary>>}}
+    end;
 low_surrogate_u(Bin, Stack, Opts, String, High) ->
     case ?partial_codepoint(Bin) of
         true -> 
@@ -543,14 +607,32 @@ low_surrogate(<<D/?utfx, Rest/binary>>, Stack, Opts, String, [C, B, A], High)
             V = surrogate_to_codepoint(High, X),
             case V rem 16#10000 of
                 Y when Y == 16#fffe; Y == 16#ffff ->
-                    {error, {badjson, <<D/?utfx, Rest/binary>>}}
+                    case Opts#opts.loose_unicode of
+                        true ->
+                            string(Rest,
+                                Stack,
+                                Opts,
+                                <<String/binary, 16#fffd/utf8, 16#fffd/utf8>>
+                            )
+                        ; false ->    
+                            {error, {badjson, <<D/?utfx, Rest/binary>>}}
+                    end
                 ; Y ->
                     io:format("~p ~p~n", [V, Y]),
                     string(Rest, Stack, Opts, <<String/binary, V/utf8>>)
             end
         %% not a low surrogate, bad bad bad
         ; _ ->
-            {error, {badjson, <<D/?utfx, Rest/binary>>}}
+            case Opts#opts.loose_unicode of
+                true ->
+                    string(Rest,
+                        Stack,
+                        Opts,
+                        <<String/binary, 16#fffd/utf8, 16#fffd/utf8>>
+                    )
+                ; false ->    
+                    {error, {badjson, <<D/?utfx, Rest/binary>>}}
+            end
     end;
 low_surrogate(<<S/?utfx, Rest/binary>>, Stack, Opts, String, Acc, High) 
         when ?is_hex(S) ->
diff --git a/src/jsx_utils.erl b/src/jsx_utils.erl
index 3a9ec1f..bb06334 100644
--- a/src/jsx_utils.erl
+++ b/src/jsx_utils.erl
@@ -204,10 +204,10 @@ detect_encoding(<<X, 0, 0, 0, _Rest/binary>> = JSON, Opts) when X =/= 0 ->
 detect_encoding(<<0, 0, 0, X, _Rest/binary>> = JSON, Opts) when X =/= 0 ->
     (jsx_utf32:decoder(Opts))(JSON);
 %% utf16-little null order detection
-detect_encoding(<<X, 0, _, 0, _Rest/binary>> = JSON, Opts) when X =/= 0 ->
+detect_encoding(<<X, 0, _, _, _Rest/binary>> = JSON, Opts) when X =/= 0 ->
     (jsx_utf16le:decoder(Opts))(JSON);
 %% utf16-big null order detection
-detect_encoding(<<0, X, 0, _, _Rest/binary>> = JSON, Opts) when X =/= 0 ->
+detect_encoding(<<0, X, _, _, _Rest/binary>> = JSON, Opts) when X =/= 0 ->
     (jsx_utf16:decoder(Opts))(JSON);
 %% utf8 null order detection
 detect_encoding(<<X, Y, _Rest/binary>> = JSON, Opts) when X =/= 0, Y =/= 0 ->
diff --git a/test/cases/escaped_noncharacter_ext_replaced.json b/test/cases/escaped_noncharacter_ext_replaced.json
new file mode 100644
index 0000000..f10ec2b
--- /dev/null
+++ b/test/cases/escaped_noncharacter_ext_replaced.json
@@ -0,0 +1 @@
+"\ud83f\udfff"
\ No newline at end of file
diff --git a/test/cases/escaped_noncharacter_ext_replaced.test b/test/cases/escaped_noncharacter_ext_replaced.test
new file mode 100644
index 0000000..c2741f7
--- /dev/null
+++ b/test/cases/escaped_noncharacter_ext_replaced.test
@@ -0,0 +1,4 @@
+{name, "escaped noncharacter (extended)"}.
+{jsx, [{string, <<16#fffd/utf8, 16#fffd/utf8>>}, end_json]}.
+{json, "escaped_noncharacter_ext_replaced.json"}.
+{jsx_flags, [loose_unicode]}.
\ No newline at end of file
diff --git a/test/cases/escaped_noncharacter_replaced.json b/test/cases/escaped_noncharacter_replaced.json
new file mode 100644
index 0000000..e5c1b65
--- /dev/null
+++ b/test/cases/escaped_noncharacter_replaced.json
@@ -0,0 +1 @@
+"\uffff"
\ No newline at end of file
diff --git a/test/cases/escaped_noncharacter_replaced.test b/test/cases/escaped_noncharacter_replaced.test
new file mode 100644
index 0000000..9c5faac
--- /dev/null
+++ b/test/cases/escaped_noncharacter_replaced.test
@@ -0,0 +1,4 @@
+{name, "escaped noncharacter replacement"}.
+{jsx, [{string,<<16#fffd/utf8>>},end_json]}.
+{json, "escaped_noncharacter_replaced.json"}.
+{jsx_flags, [loose_unicode]}.
\ No newline at end of file
diff --git a/test/cases/escaped_nullbyte_replaced.json b/test/cases/escaped_nullbyte_replaced.json
new file mode 100644
index 0000000..ed6780d
--- /dev/null
+++ b/test/cases/escaped_nullbyte_replaced.json
@@ -0,0 +1 @@
+"\u0000"
\ No newline at end of file
diff --git a/test/cases/escaped_nullbyte_replaced.test b/test/cases/escaped_nullbyte_replaced.test
new file mode 100644
index 0000000..785acc8
--- /dev/null
+++ b/test/cases/escaped_nullbyte_replaced.test
@@ -0,0 +1,4 @@
+{name, "escaped nullbyte replaced"}.
+{jsx, [{string,<<16#fffd/utf8>>},end_json]}.
+{json, "escaped_nullbyte_replaced.json"}.
+{jsx_flags, [loose_unicode]}.
\ No newline at end of file
diff --git a/test/cases/noncharacter.json b/test/cases/noncharacter.json
new file mode 100644
index 0000000..09db417
--- /dev/null
+++ b/test/cases/noncharacter.json
@@ -0,0 +1 @@
+"﷐"
\ No newline at end of file
diff --git a/test/cases/noncharacter.test b/test/cases/noncharacter.test
new file mode 100644
index 0000000..6b3732c
--- /dev/null
+++ b/test/cases/noncharacter.test
@@ -0,0 +1,3 @@
+{name, "noncharacter"}.
+{jsx, {error, badjson}}.
+{json, "noncharacter.json"}.
\ No newline at end of file
diff --git a/test/cases/unpaired_surrogate.json b/test/cases/unpaired_surrogate.json
new file mode 100644
index 0000000..32497a8
--- /dev/null
+++ b/test/cases/unpaired_surrogate.json
@@ -0,0 +1 @@
+["\ud801blah"]
\ No newline at end of file
diff --git a/test/cases/unpaired_surrogate.test b/test/cases/unpaired_surrogate.test
new file mode 100644
index 0000000..e2da5c1
--- /dev/null
+++ b/test/cases/unpaired_surrogate.test
@@ -0,0 +1,3 @@
+{name, "unpaired_surrogate"}.
+{jsx, {error, badjson}}.
+{json, "unpaired_surrogate.json"}.
diff --git a/test/cases/unpaired_surrogate_replaced.json b/test/cases/unpaired_surrogate_replaced.json
new file mode 100644
index 0000000..32497a8
--- /dev/null
+++ b/test/cases/unpaired_surrogate_replaced.json
@@ -0,0 +1 @@
+["\ud801blah"]
\ No newline at end of file
diff --git a/test/cases/unpaired_surrogate_replaced.test b/test/cases/unpaired_surrogate_replaced.test
new file mode 100644
index 0000000..7269bc2
--- /dev/null
+++ b/test/cases/unpaired_surrogate_replaced.test
@@ -0,0 +1,4 @@
+{name, "unpaired surrogate replaced"}.
+{jsx, [start_array,{string,<<16#fffd/utf8, "blah">>},end_array,end_json]}.
+{json, "unpaired_surrogate_replaced.json"}.
+{jsx_flags, [loose_unicode]}.

From 80e9381b42c525ce1588629fad05639218362aba Mon Sep 17 00:00:00 2001
From: alisdair sullivan <alisdairsullivan@yahoo.ca>
Date: Wed, 27 Jul 2011 06:52:16 -0700
Subject: [PATCH 2/2] proper guarding and handling of noncharacters in json
 strings. more tests required

---
 src/jsx_decoder.hrl                   | 102 ++++++++++++++++++++++++--
 test/cases/noncharacter_replaced.json |   1 +
 test/cases/noncharacter_replaced.test |   4 +
 test/cases/nullbyte_replaced.json     |   1 +
 test/cases/nullbyte_replaced.test     |   4 +
 5 files changed, 107 insertions(+), 5 deletions(-)
 create mode 100644 test/cases/noncharacter_replaced.json
 create mode 100644 test/cases/noncharacter_replaced.test
 create mode 100644 test/cases/nullbyte_replaced.json
 create mode 100644 test/cases/nullbyte_replaced.test

diff --git a/src/jsx_decoder.hrl b/src/jsx_decoder.hrl
index d3a2fc4..2cb88cc 100644
--- a/src/jsx_decoder.hrl
+++ b/src/jsx_decoder.hrl
@@ -400,7 +400,11 @@ string(Bin, Stack, Opts, Acc) ->
                 ; (Stream) -> 
                     string(<<Bin/binary, Stream/binary>>, Stack, Opts, Acc)
             end}
-        ; false -> {error, {badjson, Bin}}
+        ; false ->
+            case Opts#opts.loose_unicode of
+                true -> noncharacter(Bin, Stack, Opts, Acc)
+                ; false -> {error, {badjson, Bin}}
+            end
     end.
 
     
@@ -449,13 +453,101 @@ partial_utf(_) -> false.
 -endif.
 
 -ifdef(utf32).
-partial_utf(<<_:32>>) -> false;
-partial_utf(_) -> true.
+partial_utf(<<>>) -> true;
+partial_utf(<<_>>) -> true;
+partial_utf(<<_, _>>) -> true;
+partial_utf(<<_, _, _>>) -> true;
+partial_utf(_) -> false.
 -endif.
 
 -ifdef(utf32le).
-partial_utf(<<_:32>>) -> false;
-partial_utf(_) -> true.
+partial_utf(<<>>) -> true;
+partial_utf(<<_>>) -> true;
+partial_utf(<<_, _>>) -> true;
+partial_utf(<<_, _, _>>) -> true;
+partial_utf(_) -> false.
+-endif.
+
+
+-ifdef(utf8).
+%% loose_unicode fallback from string/4: a decodable codepoint becomes u+fffd
+noncharacter(<<S/utf8, Rest/binary>>, Stack, Opts, Acc)
+        when ?is_noncontrol(S) ->
+    string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
+%% u+fffe and u+ffff as raw utf8 bytes (ef bf be / ef bf bf)
+noncharacter(<<239, 191, X, Rest/binary>>, Stack, Opts, Acc) 
+        when X == 190; X == 191 ->
+    string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
+%% utf8-encoded surrogates (ed, then a byte >= a0); third byte goes unchecked
+noncharacter(<<237, X, _, Rest/binary>>, Stack, Opts, Acc) when X >= 160 ->
+    string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
+noncharacter(Bin, _Stack, _Opts, _Acc) ->
+    {error, {badjson, Bin}}.
+-endif.
+
+-ifdef(utf16).
+%% loose_unicode fallback from string/4: a decodable codepoint becomes u+fffd
+noncharacter(<<S/utf16, Rest/binary>>, Stack, Opts, Acc)
+        when ?is_noncontrol(S) ->
+    string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
+%% u+fffe and u+ffff are the big endian byte pairs ff fe and ff ff
+noncharacter(<<255, X, Rest/binary>>, Stack, Opts, Acc)
+        when X == 254; X == 255 ->
+    string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
+%% lone surrogate code unit (high byte d8..df): drop it, emit one u+fffd
+noncharacter(<<X, _, Rest/binary>>, Stack, Opts, Acc)
+        when X >= 216, X =< 223 ->
+    string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
+noncharacter(Bin, _Stack, _Opts, _Acc) ->
+    {error, {badjson, Bin}}.
+-endif.
+
+-ifdef(utf16le).
+noncharacter(<<S/utf16-little, Rest/binary>>, Stack, Opts, Acc)
+        when ?is_noncontrol(S) ->
+    string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
+%% u+fffe and u+ffff are the little endian byte pairs fe ff and ff ff
+noncharacter(<<X, 255, Rest/binary>>, Stack, Opts, Acc)
+        when X == 254; X == 255 ->
+    string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
+%% lone surrogate code unit (second byte d8..df): drop it, emit one u+fffd
+noncharacter(<<_, X, Rest/binary>>, Stack, Opts, Acc)
+        when X >= 216, X =< 223 ->
+    string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
+noncharacter(Bin, _Stack, _Opts, _Acc) ->
+    {error, {badjson, Bin}}.
+-endif.
+
+-ifdef(utf32).
+noncharacter(<<S/utf32, Rest/binary>>, Stack, Opts, Acc)
+        when ?is_noncontrol(S) ->
+    string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
+%% u+fffe and u+ffff matched as raw big endian bytes (00 00 ff fe / ff)
+noncharacter(<<0, 0, 255, X, Rest/binary>>, Stack, Opts, Acc)
+        when X == 254; X == 255 ->
+    string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
+%% surrogate values (00 00 d8..df xx) matched as raw bytes
+noncharacter(<<0, 0, X, _, Rest/binary>>, Stack, Opts, Acc)
+        when X >= 216, X =< 223 ->
+    string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
+noncharacter(Bin, _Stack, _Opts, _Acc) ->
+    {error, {badjson, Bin}}.
+-endif.
+
+-ifdef(utf32le).
+noncharacter(<<S/utf32-little, Rest/binary>>, Stack, Opts, Acc)
+        when ?is_noncontrol(S) ->
+    string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
+%% u+fffe and u+ffff matched as raw little endian bytes (fe / ff, ff 00 00)
+noncharacter(<<X, 255, 0, 0, Rest/binary>>, Stack, Opts, Acc)
+        when X == 254; X == 255 ->
+    string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
+%% surrogate values (xx d8..df 00 00) matched as raw bytes
+noncharacter(<<_, X, 0, 0, Rest/binary>>, Stack, Opts, Acc)
+        when X >= 216, X =< 223 ->
+    string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
+noncharacter(Bin, _Stack, _Opts, _Acc) ->
+    {error, {badjson, Bin}}.
 -endif.
 
 
diff --git a/test/cases/noncharacter_replaced.json b/test/cases/noncharacter_replaced.json
new file mode 100644
index 0000000..09db417
--- /dev/null
+++ b/test/cases/noncharacter_replaced.json
@@ -0,0 +1 @@
+"﷐"
\ No newline at end of file
diff --git a/test/cases/noncharacter_replaced.test b/test/cases/noncharacter_replaced.test
new file mode 100644
index 0000000..0944886
--- /dev/null
+++ b/test/cases/noncharacter_replaced.test
@@ -0,0 +1,4 @@
+{name, "noncharacter replaced"}.
+{jsx, [{string,<<16#fffd/utf8>>},end_json]}.
+{json, "noncharacter_replaced.json"}.
+{jsx_flags, [loose_unicode]}.
\ No newline at end of file
diff --git a/test/cases/nullbyte_replaced.json b/test/cases/nullbyte_replaced.json
new file mode 100644
index 0000000..ed6780d
--- /dev/null
+++ b/test/cases/nullbyte_replaced.json
@@ -0,0 +1 @@
+"\u0000"
\ No newline at end of file
diff --git a/test/cases/nullbyte_replaced.test b/test/cases/nullbyte_replaced.test
new file mode 100644
index 0000000..9a909eb
--- /dev/null
+++ b/test/cases/nullbyte_replaced.test
@@ -0,0 +1,4 @@
+{name, "nullbyte replaced"}.
+{jsx, [{string,<<16#fffd/utf8>>},end_json]}.
+{json, "nullbyte_replaced.json"}.
+{jsx_flags, [loose_unicode]}.
\ No newline at end of file