Merge branch 'develop'

2012-03-29 21:48:57 -07:00 · 2012-03-29 21:48:57 -07:00 · c80c1f7d40
commit c80c1f7d40
parent 4f0bfb0317 a6dee16904
7 changed files with 842 additions and 155 deletions
--- a/README.markdown
+++ b/README.markdown
@ -127,6 +127,10 @@ javascript interpreters treat the codepoints `u+2028` and `u+2029` as significan

 json has no official comments but some parsers allow c style comments. this flag allows comments (both `// ...` and `/* ... */` style) anywhere whitespace is allowed

+#### `json_escape` ####
+
+by default, both the encoder and decoder return strings as utf8 binaries appropriate for use in erlang: escape sequences in decoded strings are converted into the codepoints they represent, and encoded terms are left unaltered. this flag instead escapes strings for output as json, replacing control codes with the appropriate escape sequences
+

 ### <a name="incompletes">incomplete input</a> ###

--- a/src/jsx.app.src
+++ b/src/jsx.app.src
@ -1,7 +1,7 @@
 {application, jsx,
 [
    {description, "a streaming, evented json parsing toolkit"},
-    {vsn, "1.1.1"},
+    {vsn, "1.1.2"},
    {modules, [
        jsx,
        jsx_encoder,
--- a/src/jsx_decoder.erl
+++ b/src/jsx_decoder.erl
@ -270,11 +270,11 @@ string(<<33, Rest/binary>>, Handler, [Acc|Stack], Opts) ->
 string(<<?doublequote, Rest/binary>>, {Handler, State}, S, Opts) ->
    case S of
        [Acc, key|Stack] ->
-            colon(Rest, {Handler, Handler:handle_event({key, ?end_seq(Acc)}, State)}, [key|Stack], Opts);
+            colon(Rest, {Handler, Handler:handle_event({key, maybe_escape(?end_seq(Acc), Opts)}, State)}, [key|Stack], Opts);
        [_Acc, single_quote|_Stack] ->
            ?error([<<?doublequote, Rest/binary>>, {Handler, State}, S, Opts]);
        [Acc|Stack] ->
-            maybe_done(Rest, {Handler, Handler:handle_event({string, ?end_seq(Acc)}, State)}, Stack, Opts)
+            maybe_done(Rest, {Handler, Handler:handle_event({string, maybe_escape(?end_seq(Acc), Opts)}, State)}, Stack, Opts)
    end;
 string(<<35, Rest/binary>>, Handler, [Acc|Stack], Opts) ->
    string(Rest, Handler, [?acc_seq(Acc, 35)|Stack], Opts);
@ -284,13 +284,18 @@ string(<<37, Rest/binary>>, Handler, [Acc|Stack], Opts) ->
    string(Rest, Handler, [?acc_seq(Acc, 37)|Stack], Opts);
 string(<<38, Rest/binary>>, Handler, [Acc|Stack], Opts) ->
    string(Rest, Handler, [?acc_seq(Acc, 38)|Stack], Opts);
-string(<<?singlequote, Rest/binary>>, {Handler, State}, S, Opts = #opts{single_quotes=true}) ->
-    case S of
-        [Acc, single_quote, key|Stack] ->
-            colon(Rest, {Handler, Handler:handle_event({key, ?end_seq(Acc)}, State)}, [key|Stack], Opts);
-        [Acc, single_quote|Stack] ->
-            maybe_done(Rest, {Handler, Handler:handle_event({string, ?end_seq(Acc)}, State)}, Stack, Opts);
-        [Acc|Stack] ->
+string(<<?singlequote, Rest/binary>>, {Handler, State}, [Acc|Stack], Opts) ->
+    case Opts#opts.single_quotes of
+        true ->
+            case Stack of
+                [single_quote, key|S] ->
+                    colon(Rest, {Handler, Handler:handle_event({key, maybe_escape(?end_seq(Acc), Opts)}, State)}, [key|S], Opts)
+                ; [single_quote|S] ->
+                    maybe_done(Rest, {Handler, Handler:handle_event({string, maybe_escape(?end_seq(Acc), Opts)}, State)}, S, Opts)
+                ; _ ->
+                    string(Rest, {Handler, State}, [?acc_seq(Acc, ?singlequote)|Stack], Opts)
+            end
+        ; false ->
            string(Rest, {Handler, State}, [?acc_seq(Acc, ?singlequote)|Stack], Opts)
    end;
 string(<<40, Rest/binary>>, Handler, [Acc|Stack], Opts) ->
@ -469,8 +474,53 @@ string(<<126, Rest/binary>>, Handler, [Acc|Stack], Opts) ->
    string(Rest, Handler, [?acc_seq(Acc, 126)|Stack], Opts);
 string(<<127, Rest/binary>>, Handler, [Acc|Stack], Opts) ->
    string(Rest, Handler, [?acc_seq(Acc, 127)|Stack], Opts);
-string(<<S/utf8, Rest/binary>>, Handler, [Acc|Stack], Opts) when ?is_noncontrol(S) ->
-    string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts);
+string(<<S/utf8, Rest/binary>>, Handler, [Acc|Stack], Opts) ->
+    case S of
+        %% not strictly true, but exceptions are already taken care of in preceding clauses
+        S when S >= 16#20, S < 16#d800 ->
+            string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
+        ; S when S > 16#dfff, S < 16#fdd0 ->
+            string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
+        ; S when S > 16#fdef, S < 16#fffe ->
+            string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
+        ; S when S >= 16#10000, S < 16#1fffe ->
+            string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
+        ; S when S >= 16#20000, S < 16#2fffe ->
+            string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
+        ; S when S >= 16#30000, S < 16#3fffe ->
+            string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
+        ; S when S >= 16#40000, S < 16#4fffe ->
+            string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
+        ; S when S >= 16#50000, S < 16#5fffe ->
+            string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
+        ; S when S >= 16#60000, S < 16#6fffe ->
+            string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
+        ; S when S >= 16#70000, S < 16#7fffe ->
+            string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
+        ; S when S >= 16#80000, S < 16#8fffe ->
+            string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
+        ; S when S >= 16#90000, S < 16#9fffe ->
+            string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
+        ; S when S >= 16#a0000, S < 16#afffe ->
+            string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
+        ; S when S >= 16#b0000, S < 16#bfffe ->
+            string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
+        ; S when S >= 16#c0000, S < 16#cfffe ->
+            string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
+        ; S when S >= 16#d0000, S < 16#dfffe ->
+            string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
+        ; S when S >= 16#e0000, S < 16#efffe ->
+            string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
+        ; S when S >= 16#f0000, S < 16#ffffe ->
+            string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
+        ; S when S >= 16#100000, S < 16#10fffe ->
+            string(Rest, Handler, [?acc_seq(Acc, S)|Stack], Opts)
+        ; _ ->
+            case Opts#opts.loose_unicode of
+                true -> noncharacter(<<S/utf8, Rest/binary>>, Handler, [Acc|Stack], Opts)
+                ; false -> ?error([<<S/utf8, Rest/binary>>, Handler, [Acc|Stack], Opts])
+            end
+    end;
 string(Bin, Handler, Stack, Opts) ->
    case partial_utf(Bin) of 
        true -> ?incomplete(string, Bin, Handler, Stack, Opts)
@ -480,6 +530,11 @@ string(Bin, Handler, Stack, Opts) ->
                ; false -> ?error([Bin, Handler, Stack, Opts])
            end
    end.
+
+
+maybe_escape(Str, Opts=#opts{json_escape=true}) -> jsx_utils:json_escape(Str, Opts);
+maybe_escape(Str, _Opts) -> Str.
+
    
 %% we don't need to guard against partial utf here, because it's already taken
 %%   care of in string
@ -489,8 +544,36 @@ noncharacter(<<237, X, _, Rest/binary>>, Handler, [Acc|Stack], Opts) when X >= 1
 %% u+fffe and u+ffff for R14BXX
 noncharacter(<<239, 191, X, Rest/binary>>, Handler, [Acc|Stack], Opts) when X == 190; X == 191 ->
    string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts);
-%% bad utf8
+%% u+xfffe, u+xffff and other noncharacters
+noncharacter(<<_/utf8, Rest/binary>>, Handler, [Acc|Stack], Opts) ->
+    string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts);
+%% overlong encodings and missing continuations of a 2 byte sequence
+noncharacter(<<X, Rest/binary>>, Handler, Stack, Opts) when X >= 192, X =< 223 ->
+    strip_continuations(Rest, Handler, [1|Stack], Opts);
+%% overlong encodings and missing continuations of a 3 byte sequence
+noncharacter(<<X, Rest/binary>>, Handler, Stack, Opts) when X >= 224, X =< 239 ->
+    strip_continuations(Rest, Handler, [2|Stack], Opts);
+%% overlong encodings and missing continuations of a 4 byte sequence
+noncharacter(<<X, Rest/binary>>, Handler, Stack, Opts) when X >= 240, X =< 247 ->
+    strip_continuations(Rest, Handler, [3|Stack], Opts);
+%% unexpected bytes, including orphan continuations
 noncharacter(<<_, Rest/binary>>, Handler, [Acc|Stack], Opts) ->
+    string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts);
+noncharacter(<<>>, Handler, Stack, Opts) ->
+    ?incomplete(noncharacter, <<>>, Handler, Stack, Opts).
+
+
+%% consumes up to N continuation bytes (16#80..16#bf) that trail a bad utf8 lead
+%%  byte, then adds a single u+fffd to the accumulator. N heads the stack
+strip_continuations(Rest, Handler, [0, Acc|Stack], Opts) ->
+    string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts);
+strip_continuations(<<X, Rest/binary>>, Handler,  [N|Stack], Opts) when X >= 128, X =< 191 ->
+    strip_continuations(Rest, Handler, [N - 1|Stack], Opts);
+%% input exhausted before the sequence ended, hand off to ?incomplete
+strip_continuations(<<>>, Handler, Stack, Opts) ->
+    ?incomplete(strip_continuations, <<>>, Handler, Stack, Opts);
+%% not a continuation byte, add the replacement and dispatch back to string
+strip_continuations(Rest, Handler, [_, Acc|Stack], Opts) ->
    string(Rest, Handler, [?acc_seq(Acc, 16#fffd)|Stack], Opts).


@ -516,6 +599,8 @@ escape(<<$u, Rest/binary>>, Handler, Stack, Opts) ->
    escaped_unicode(Rest, Handler, Stack, Opts);
 escape(<<>>, Handler, Stack, Opts) ->
    ?incomplete(escape, <<>>, Handler, Stack, Opts);
+escape(Bin, Handler, [Acc|Stack], Opts=#opts{ignore_bad_escapes=true}) ->
+    string(Bin, Handler, [?acc_seq(Acc, ?rsolidus)|Stack], Opts);
 escape(Bin, Handler, Stack, Opts) ->
    ?error([Bin, Handler, Stack, Opts]).

@ -963,6 +1048,216 @@ done(Bin, Handler, Stack, Opts) -> ?error([Bin, Handler, Stack, Opts]).
 -include_lib("eunit/include/eunit.hrl").


+xcode(Bin) -> xcode(Bin, []).
+
+xcode(Bin, Opts) ->
+    Size = size(Bin),
+    try jsx:to_term(<<34, Bin:Size/binary, 34>>, Opts)
+    catch error:badarg -> {error, badarg}
+    end.
+
+
+is_bad({error, badarg}) -> true;
+is_bad(_) -> false.
+
+
+bad_utf8_test_() ->
+    [
+        {"orphan continuation byte u+0080",
+            ?_assert(is_bad(xcode(<<16#0080>>)))
+        },
+        {"orphan continuation byte u+0080 replaced",
+            ?_assertEqual(xcode(<<16#0080>>, [loose_unicode]), <<16#fffd/utf8>>)
+        },
+        {"orphan continuation byte u+00bf",
+            ?_assert(is_bad(xcode(<<16#00bf>>)))
+        },
+        {"orphan continuation byte u+00bf replaced",
+            ?_assertEqual(xcode(<<16#00bf>>, [loose_unicode]), <<16#fffd/utf8>>)
+        },
+        {"2 continuation bytes",
+            ?_assert(is_bad(xcode(<<(binary:copy(<<16#0080>>, 2))/binary>>)))
+        },
+        {"2 continuation bytes replaced",
+            ?_assertEqual(
+                xcode(<<(binary:copy(<<16#0080>>, 2))/binary>>, [loose_unicode]),
+                binary:copy(<<16#fffd/utf8>>, 2)
+            )
+        },
+        {"3 continuation bytes",
+            ?_assert(is_bad(xcode(<<(binary:copy(<<16#0080>>, 3))/binary>>)))
+        },
+        {"3 continuation bytes replaced",
+            ?_assertEqual(
+                xcode(<<(binary:copy(<<16#0080>>, 3))/binary>>, [loose_unicode]),
+                binary:copy(<<16#fffd/utf8>>, 3)
+            )
+        },
+        {"4 continuation bytes",
+            ?_assert(is_bad(xcode(<<(binary:copy(<<16#0080>>, 4))/binary>>)))
+        },
+        {"4 continuation bytes replaced",
+            ?_assertEqual(
+                xcode(<<(binary:copy(<<16#0080>>, 4))/binary>>, [loose_unicode]),
+                binary:copy(<<16#fffd/utf8>>, 4)
+            )
+        },
+        {"5 continuation bytes",
+            ?_assert(is_bad(xcode(<<(binary:copy(<<16#0080>>, 5))/binary>>)))
+        },
+        {"5 continuation bytes replaced",
+            ?_assertEqual(
+                xcode(<<(binary:copy(<<16#0080>>, 5))/binary>>, [loose_unicode]),
+                binary:copy(<<16#fffd/utf8>>, 5)
+            )
+        },
+        {"6 continuation bytes",
+            ?_assert(is_bad(xcode(<<(binary:copy(<<16#0080>>, 6))/binary>>)))
+        },
+        {"6 continuation bytes replaced",
+            ?_assertEqual(
+                xcode(<<(binary:copy(<<16#0080>>, 6))/binary>>, [loose_unicode]),
+                binary:copy(<<16#fffd/utf8>>, 6)
+            )
+        },
+        {"all continuation bytes",
+            ?_assert(is_bad(xcode(<<(list_to_binary(lists:seq(16#0080, 16#00bf)))/binary>>)))
+        },        
+        {"all continuation bytes replaced",
+            ?_assertEqual(
+                xcode(<<(list_to_binary(lists:seq(16#0080, 16#00bf)))/binary>>, [loose_unicode]),
+                binary:copy(<<16#fffd/utf8>>, length(lists:seq(16#0080, 16#00bf)))
+            )
+        },
+        {"lonely start byte",
+            ?_assert(is_bad(xcode(<<16#00c0>>)))
+        },
+        {"lonely start byte replaced",
+            ?_assertEqual(
+                xcode(<<16#00c0>>, [loose_unicode]),
+                <<16#fffd/utf8>>
+            )
+        },
+        {"lonely start bytes (2 byte)",
+            ?_assert(is_bad(xcode(<<16#00c0, 32, 16#00df>>)))
+        },
+        {"lonely start bytes (2 byte) replaced",
+            ?_assertEqual(
+                xcode(<<16#00c0, 32, 16#00df>>, [loose_unicode]),
+                <<16#fffd/utf8, 32, 16#fffd/utf8>>
+            )
+        },
+        {"lonely start bytes (3 byte)",
+            ?_assert(is_bad(xcode(<<16#00e0, 32, 16#00ef>>)))
+        },
+        {"lonely start bytes (3 byte) replaced",
+            ?_assertEqual(
+                xcode(<<16#00e0, 32, 16#00ef>>, [loose_unicode]),
+                <<16#fffd/utf8, 32, 16#fffd/utf8>>
+            )
+        },
+        {"lonely start bytes (4 byte)",
+            ?_assert(is_bad(xcode(<<16#00f0, 32, 16#00f7>>)))
+        },
+        {"lonely start bytes (4 byte) replaced",
+            ?_assertEqual(
+                xcode(<<16#00f0, 32, 16#00f7>>, [loose_unicode]),
+                <<16#fffd/utf8, 32, 16#fffd/utf8>>
+            )
+        },
+        {"missing continuation byte (3 byte)",
+            ?_assert(is_bad(xcode(<<224, 160, 32>>)))
+        },
+        {"missing continuation byte (3 byte) replaced",
+            ?_assertEqual(
+                xcode(<<224, 160, 32>>, [loose_unicode]),
+                <<16#fffd/utf8, 32>>
+            )
+        },
+        {"missing continuation byte (4 byte missing one)",
+            ?_assert(is_bad(xcode(<<240, 144, 128, 32>>)))
+        },
+        {"missing continuation byte2 (4 byte missing one) replaced",
+            ?_assertEqual(
+                xcode(<<240, 144, 128, 32>>, [loose_unicode]),
+                <<16#fffd/utf8, 32>>
+            )
+        },
+        {"missing continuation byte (4 byte missing two)",
+            ?_assert(is_bad(xcode(<<240, 144, 32>>)))
+        },
+        {"missing continuation byte2 (4 byte missing two) replaced",
+            ?_assertEqual(
+                xcode(<<240, 144, 32>>, [loose_unicode]),
+                <<16#fffd/utf8, 32>>
+            )
+        },
+        {"overlong encoding of u+002f (2 byte)",
+            ?_assert(is_bad(xcode(<<16#c0, 16#af, 32>>)))
+        },
+        {"overlong encoding of u+002f (2 byte) replaced",
+            ?_assertEqual(
+                xcode(<<16#c0, 16#af, 32>>, [loose_unicode]),
+                <<16#fffd/utf8, 32>>
+            )
+        },
+        {"overlong encoding of u+002f (3 byte)",
+            ?_assert(is_bad(xcode(<<16#e0, 16#80, 16#af, 32>>)))
+        },
+        {"overlong encoding of u+002f (3 byte) replaced",
+            ?_assertEqual(
+                xcode(<<16#e0, 16#80, 16#af, 32>>, [loose_unicode]),
+                <<16#fffd/utf8, 32>>
+            )
+        },
+        {"overlong encoding of u+002f (4 byte)",
+            ?_assert(is_bad(xcode(<<16#f0, 16#80, 16#80, 16#af, 32>>)))
+        },
+        {"overlong encoding of u+002f (4 byte) replaced",
+            ?_assertEqual(
+                xcode(<<16#f0, 16#80, 16#80, 16#af, 32>>, [loose_unicode]),
+                <<16#fffd/utf8, 32>>
+            )
+        },
+        {"highest overlong 2 byte sequence",
+            ?_assert(is_bad(xcode(<<16#c1, 16#bf, 32>>)))
+        },
+        {"highest overlong 2 byte sequence replaced",
+            ?_assertEqual(
+                xcode(<<16#c1, 16#bf, 32>>, [loose_unicode]),
+                <<16#fffd/utf8, 32>>
+            )
+        },
+        {"highest overlong 3 byte sequence",
+            ?_assert(is_bad(xcode(<<16#e0, 16#9f, 16#bf, 32>>)))
+        },
+        {"highest overlong 3 byte sequence replaced",
+            ?_assertEqual(
+                xcode(<<16#e0, 16#9f, 16#bf, 32>>, [loose_unicode]),
+                <<16#fffd/utf8, 32>>
+            )
+        },
+        {"highest overlong 4 byte sequence",
+            ?_assert(is_bad(xcode(<<16#f0, 16#8f, 16#bf, 16#bf, 32>>)))
+        },
+        {"highest overlong 4 byte sequence replaced",
+            ?_assertEqual(
+                xcode(<<16#f0, 16#8f, 16#bf, 16#bf, 32>>, [loose_unicode]),
+                <<16#fffd/utf8, 32>>
+            )
+        }
+    ].
+
+
+ignore_bad_escapes_test_() ->
+    [
+        {"ignore unrecognized escape sequence", ?_assertEqual(
+            decode(<<"[\"\\x25\"]">>, [ignore_bad_escapes]),
+            [start_array, {string, <<"\\x25">>}, end_array, end_json]
+        )}
+    ].
+
+
 comments_test_() ->
    [
        {"preceeding // comment", ?_assertEqual(
@ -1147,6 +1442,7 @@ comments_test_() ->
        )}
    ].

+
 escape_forward_slash_test_() ->
    [
        {"escape forward slash test", ?_assertEqual(
@ -1155,6 +1451,29 @@ escape_forward_slash_test_() ->
        )}
    ].

+
+noncharacters_test_() ->
+    [
+        {"noncharacters - badjson",
+            ?_assertEqual(check_bad(noncharacters()), [])
+        },
+        {"noncharacters - replaced",
+            ?_assertEqual(check_replaced(noncharacters()), [])
+        }
+    ].
+
+
+extended_noncharacters_test_() ->
+    [
+        {"extended noncharacters - badjson",
+            ?_assertEqual(check_bad(extended_noncharacters()), [])
+        },
+        {"extended noncharacters - replaced",
+            ?_assertEqual(check_replaced(extended_noncharacters()), [])
+        }
+    ].
+
+
 surrogates_test_() ->
    [
        {"surrogates - badjson",
@ -1165,12 +1484,25 @@ surrogates_test_() ->
        }
    ].

+
 control_test_() ->
    [
        {"control characters - badjson",
            ?_assertEqual(check_bad(control_characters()), [])
        }
    ].
+
+
+reserved_test_() ->
+    [
+        {"reserved noncharacters - badjson",
+            ?_assertEqual(check_bad(reserved_space()), [])
+        },
+        {"reserved noncharacters - replaced",
+            ?_assertEqual(check_replaced(reserved_space()), [])
+        }
+    ].
+
    
 good_characters_test_() ->
    [
@ -1181,51 +1513,6 @@ good_characters_test_() ->
            ?_assertEqual(check_good(good_extended()), [])
        }
    ].
-
-malformed_test_() ->
-    [
-        {"malformed codepoint with 1 byte",
-            ?_assertEqual({error, badjson}, decode(<<128>>))
-        },
-        {"malformed codepoint with 2 bytes",
-            ?_assertEqual({error, badjson}, decode(<<128, 192>>))
-        },
-        {"malformed codepoint with 3 bytes",
-            ?_assertEqual({error, badjson}, decode(<<128, 192, 192>>))
-        },
-        {"malformed codepoint with 4 bytes",
-            ?_assertEqual({error, badjson}, decode(<<128, 192, 192, 192>>))
-        }
-    ].
-
-malformed_replaced_test_() ->
-    F = <<16#fffd/utf8>>,
-    [
-        {"malformed codepoint with 1 byte",
-            ?_assertEqual(
-                [{string, <<F/binary>>}, end_json],
-                decode(<<34, 128, 34>>, [loose_unicode])
-            )
-        },
-        {"malformed codepoint with 2 bytes",
-            ?_assertEqual(
-                [{string, <<F/binary, F/binary>>}, end_json],
-                decode(<<34, 128, 192, 34>>, [loose_unicode])
-            )
-        },
-        {"malformed codepoint with 3 bytes",
-            ?_assertEqual(
-                [{string, <<F/binary, F/binary, F/binary>>}, end_json],
-                decode(<<34, 128, 192, 192, 34>>, [loose_unicode])
-            )
-        },
-        {"malformed codepoint with 4 bytes",
-            ?_assertEqual(
-                [{string, <<F/binary, F/binary, F/binary, F/binary>>}, end_json],
-                decode(<<34, 128, 192, 192, 192, 34>>, [loose_unicode])
-            )
-        }
-    ].
    

 check_bad(List) ->
@ -1233,6 +1520,7 @@ check_bad(List) ->
        check(List, [], [])
    ).

+
 check_replaced(List) ->
    lists:dropwhile(fun({_, [{string, <<16#fffd/utf8>>}|_]}) -> true
            ; (_) -> false 
@ -1240,19 +1528,19 @@ check_replaced(List) ->
        check(List, [loose_unicode], [])
    ).

+
 check_good(List) ->
    lists:dropwhile(fun({_, [{string, _}|_]}) -> true ; (_) -> false end,
        check(List, [], [])
    ).

+
 check([], _Opts, Acc) -> Acc;
 check([H|T], Opts, Acc) ->
    R = decode(to_fake_utf(H, utf8), Opts),
    check(T, Opts, [{H, R}] ++ Acc).


-decode(JSON) -> decode(JSON, []).
-
 decode(JSON, Opts) ->
    try
        (decoder(jsx, [], Opts))(JSON)
@ -1261,13 +1549,41 @@ decode(JSON, Opts) ->
    end.


+noncharacters() -> lists:seq(16#fffe, 16#ffff).
+
+
+extended_noncharacters() ->
+    [16#1fffe, 16#1ffff, 16#2fffe, 16#2ffff]
+        ++ [16#3fffe, 16#3ffff, 16#4fffe, 16#4ffff]
+        ++ [16#5fffe, 16#5ffff, 16#6fffe, 16#6ffff]
+        ++ [16#7fffe, 16#7ffff, 16#8fffe, 16#8ffff]
+        ++ [16#9fffe, 16#9ffff, 16#afffe, 16#affff]
+        ++ [16#bfffe, 16#bffff, 16#cfffe, 16#cffff]
+        ++ [16#dfffe, 16#dffff, 16#efffe, 16#effff]
+        ++ [16#ffffe, 16#fffff, 16#10fffe, 16#10ffff].
+
+
 surrogates() -> lists:seq(16#d800, 16#dfff).

+
 control_characters() -> lists:seq(1, 31).

-good() -> [32, 33] ++ lists:seq(16#23, 16#5b) ++ lists:seq(16#5d, 16#d7ff) ++ lists:seq(16#e000, 16#fffd).
-            
-good_extended() -> lists:seq(16#100000, 16#10ffff).
+
+reserved_space() -> lists:seq(16#fdd0, 16#fdef).
+
+
+good() -> [32, 33]
+            ++ lists:seq(16#23, 16#5b)
+            ++ lists:seq(16#5d, 16#d7ff)
+            ++ lists:seq(16#e000, 16#fdcf)
+            ++ lists:seq(16#fdf0, 16#fffd).
+
+        
+good_extended() -> [16#10000, 16#20000, 16#30000, 16#40000, 16#50000,
+        16#60000, 16#70000, 16#80000, 16#90000, 16#a0000, 
+        16#b0000, 16#c0000, 16#d0000, 16#e0000, 16#f0000
+    ] ++ lists:seq(16#100000, 16#10fffd).
+

 %% erlang refuses to encode certain codepoints, so fake them all
 to_fake_utf(N, utf8) when N < 16#0080 -> <<34/utf8, N:8, 34/utf8>>;
--- a/src/jsx_encoder.erl
+++ b/src/jsx_encoder.erl
@ -104,33 +104,305 @@ fix_key(Key) when is_binary(Key) -> Key.


 clean_string(Bin, Opts) ->
-    case Opts#opts.json_escape of
-        true -> jsx_utils:json_escape(Bin, Opts);
-        false -> 
+    case Opts#opts.loose_unicode of
+        true -> jsx_utils:json_escape(clean_string(Bin, 0, size(Bin), Opts), Opts)
+        ; false ->
            case is_clean(Bin) of
-                true -> Bin;
-                false -> clean_string(Bin, [], Opts)
+                true -> jsx_utils:json_escape(Bin, Opts)
+                ; false -> erlang:error(badarg, [Bin, Opts])
            end
    end.


 is_clean(<<>>) -> true;
-is_clean(<<_/utf8, Rest/binary>>) -> is_clean(Rest);
-is_clean(_) -> false.
+is_clean(<<X/utf8, Rest/binary>>) when X < 16#80 -> is_clean(Rest);
+is_clean(<<X/utf8, Rest/binary>>) when X < 16#800 -> is_clean(Rest);
+is_clean(<<X/utf8, Rest/binary>>) when X < 16#dcff -> is_clean(Rest);
+is_clean(<<X/utf8, Rest/binary>>) when X > 16#dfff, X < 16#fdd0 -> is_clean(Rest);
+is_clean(<<X/utf8, Rest/binary>>) when X > 16#fdef, X < 16#fffe -> is_clean(Rest);
+is_clean(<<X/utf8, Rest/binary>>) when X >= 16#10000, X < 16#1fffe -> is_clean(Rest);
+is_clean(<<X/utf8, Rest/binary>>) when X >= 16#20000, X < 16#2fffe -> is_clean(Rest);
+is_clean(<<X/utf8, Rest/binary>>) when X >= 16#30000, X < 16#3fffe -> is_clean(Rest);
+is_clean(<<X/utf8, Rest/binary>>) when X >= 16#40000, X < 16#4fffe -> is_clean(Rest);
+is_clean(<<X/utf8, Rest/binary>>) when X >= 16#50000, X < 16#5fffe -> is_clean(Rest);
+is_clean(<<X/utf8, Rest/binary>>) when X >= 16#60000, X < 16#6fffe -> is_clean(Rest);
+is_clean(<<X/utf8, Rest/binary>>) when X >= 16#70000, X < 16#7fffe -> is_clean(Rest);
+is_clean(<<X/utf8, Rest/binary>>) when X >= 16#80000, X < 16#8fffe -> is_clean(Rest);
+is_clean(<<X/utf8, Rest/binary>>) when X >= 16#90000, X < 16#9fffe -> is_clean(Rest);
+is_clean(<<X/utf8, Rest/binary>>) when X >= 16#a0000, X < 16#afffe -> is_clean(Rest);
+is_clean(<<X/utf8, Rest/binary>>) when X >= 16#b0000, X < 16#bfffe -> is_clean(Rest);
+is_clean(<<X/utf8, Rest/binary>>) when X >= 16#c0000, X < 16#cfffe -> is_clean(Rest);
+is_clean(<<X/utf8, Rest/binary>>) when X >= 16#d0000, X < 16#dfffe -> is_clean(Rest);
+is_clean(<<X/utf8, Rest/binary>>) when X >= 16#e0000, X < 16#efffe -> is_clean(Rest);
+is_clean(<<X/utf8, Rest/binary>>) when X >= 16#f0000, X < 16#ffffe -> is_clean(Rest);
+is_clean(<<X/utf8, Rest/binary>>) when X >= 16#100000, X < 16#10fffe -> is_clean(Rest);
+is_clean(Bin) -> erlang:error(badarg, [Bin]).


-clean_string(Bin, _Acc, Opts=#opts{loose_unicode=false}) -> ?error([Bin, Opts]);
-clean_string(<<>>, Acc, _Opts) -> unicode:characters_to_binary(lists:reverse(Acc));
-clean_string(<<X/utf8, Rest/binary>>, Acc, Opts) -> clean_string(Rest, [X] ++ Acc, Opts);
-%% surrogates
-clean_string(<<237, X, _, Rest/binary>>, Acc, Opts) when X >= 160 -> clean_string(Rest, [16#fffd] ++ Acc, Opts);
-%% bad codepoints
-clean_string(<<_, Rest/binary>>, Acc, Opts) -> clean_string(Rest, [16#fffd] ++ Acc, Opts).
+%% clean_string(Str, L, Len, Opts) scans Str from byte offset L and replaces
+%% every byte sequence that is not an acceptable utf8 codepoint with u+fffd.
+%%   Str  - the binary being cleaned (rebuilt whenever a replacement is made)
+%%   L    - byte offset of the next sequence to inspect
+%%   Len  - byte size of Str; must be adjusted whenever a replacement changes
+%%          the size of Str (u+fffd is always 3 bytes when utf8 encoded)
+%% returns the cleaned binary once L reaches Len
+clean_string(Str, Len, Len, _Opts) -> Str;
+clean_string(Str, L, Len, Opts) ->
+    case Str of
+        <<_:L/binary, X/utf8, _/binary>> when X < 16#80 -> clean_string(Str, L + 1, Len, Opts)
+        ; <<_:L/binary, X/utf8, _/binary>> when X < 16#800 -> clean_string(Str, L + 2, Len, Opts)
+        ; <<_:L/binary, X/utf8, _/binary>> when X < 16#dcff -> clean_string(Str, L + 3, Len, Opts)
+        ; <<_:L/binary, X/utf8, _/binary>> when X > 16#dfff, X < 16#fdd0 -> clean_string(Str, L + 3, Len, Opts)
+        ; <<_:L/binary, X/utf8, _/binary>> when X > 16#fdef, X < 16#fffe -> clean_string(Str, L + 3, Len, Opts)
+        ; <<_:L/binary, X/utf8, _/binary>> when X >= 16#10000, X < 16#1fffe -> clean_string(Str, L + 4, Len, Opts)
+        ; <<_:L/binary, X/utf8, _/binary>> when X >= 16#20000, X < 16#2fffe -> clean_string(Str, L + 4, Len, Opts)
+        ; <<_:L/binary, X/utf8, _/binary>> when X >= 16#30000, X < 16#3fffe -> clean_string(Str, L + 4, Len, Opts)
+        ; <<_:L/binary, X/utf8, _/binary>> when X >= 16#40000, X < 16#4fffe -> clean_string(Str, L + 4, Len, Opts)
+        ; <<_:L/binary, X/utf8, _/binary>> when X >= 16#50000, X < 16#5fffe -> clean_string(Str, L + 4, Len, Opts)
+        ; <<_:L/binary, X/utf8, _/binary>> when X >= 16#60000, X < 16#6fffe -> clean_string(Str, L + 4, Len, Opts)
+        ; <<_:L/binary, X/utf8, _/binary>> when X >= 16#70000, X < 16#7fffe -> clean_string(Str, L + 4, Len, Opts)
+        ; <<_:L/binary, X/utf8, _/binary>> when X >= 16#80000, X < 16#8fffe -> clean_string(Str, L + 4, Len, Opts)
+        ; <<_:L/binary, X/utf8, _/binary>> when X >= 16#90000, X < 16#9fffe -> clean_string(Str, L + 4, Len, Opts)
+        ; <<_:L/binary, X/utf8, _/binary>> when X >= 16#a0000, X < 16#afffe -> clean_string(Str, L + 4, Len, Opts)
+        ; <<_:L/binary, X/utf8, _/binary>> when X >= 16#b0000, X < 16#bfffe -> clean_string(Str, L + 4, Len, Opts)
+        ; <<_:L/binary, X/utf8, _/binary>> when X >= 16#c0000, X < 16#cfffe -> clean_string(Str, L + 4, Len, Opts)
+        ; <<_:L/binary, X/utf8, _/binary>> when X >= 16#d0000, X < 16#dfffe -> clean_string(Str, L + 4, Len, Opts)
+        ; <<_:L/binary, X/utf8, _/binary>> when X >= 16#e0000, X < 16#efffe -> clean_string(Str, L + 4, Len, Opts)
+        ; <<_:L/binary, X/utf8, _/binary>> when X >= 16#f0000, X < 16#ffffe -> clean_string(Str, L + 4, Len, Opts)
+        ; <<_:L/binary, X/utf8, _/binary>> when X >= 16#100000, X < 16#10fffe -> clean_string(Str, L + 4, Len, Opts)
+        %% noncharacters
+        ; <<H:L/binary, X/utf8, T/binary>> when X < 16#10000 ->
+            clean_string(<<H:L/binary, 16#fffd/utf8, T/binary>>, L + 3, Len, Opts)
+        %% 4 byte noncharacter replaced by the 3 byte u+fffd: the string shrinks by one byte
+        ; <<H:L/binary, _/utf8, T/binary>> ->
+            clean_string(<<H:L/binary, 16#fffd/utf8, T/binary>>, L + 3, Len - 1, Opts)
+        %% surrogates
+        ; <<H:L/binary, 237, X, _, T/binary>> when X >= 160 ->
+            clean_string(<<H:L/binary, 16#fffd/utf8, T/binary>>, L + 3, Len, Opts)
+        %% u+fffe and u+ffff for R14BXX
+        ; <<H:L/binary, 239, 191, X, T/binary>> when X == 190; X == 191 ->
+            clean_string(<<H:L/binary, 16#fffd/utf8, T/binary>>, L + 3, Len, Opts)
+        %% overlong encodings and missing continuations of a 2 byte sequence
+        ; <<H:L/binary, X, T/binary>> when X >= 192, X =< 223 ->
+            {Tail, Stripped} = strip_continuations(T, 1, 0),
+            clean_string(<<H:L/binary, 16#fffd/utf8, Tail/binary>>, L + 3, Len + 2 - Stripped, Opts)
+        %% overlong encodings and missing continuations of a 3 byte sequence
+        ; <<H:L/binary, X, T/binary>> when X >= 224, X =< 239 ->
+            {Tail, Stripped} = strip_continuations(T, 2, 0),
+            clean_string(<<H:L/binary, 16#fffd/utf8, Tail/binary>>, L + 3, Len + 2 - Stripped, Opts)
+        %% overlong encodings and missing continuations of a 4 byte sequence
+        ; <<H:L/binary, X, T/binary>> when X >= 240, X =< 247 ->
+            {Tail, Stripped} = strip_continuations(T, 3, 0),
+            clean_string(<<H:L/binary, 16#fffd/utf8, Tail/binary>>, L + 3, Len + 2 - Stripped, Opts)
+        %% any other single bad byte: 1 byte becomes 3, so the string grows by two
+        ; <<H:L/binary, _, T/binary>> ->
+            clean_string(<<H:L/binary, 16#fffd/utf8, T/binary>>, L + 3, Len + 2, Opts)
+    end.
+
+
+%% strip_continuations(Bin, N, M) drops up to N leading bytes in the range
+%% 128..191 from Bin and returns {Rest, Count}, where Rest is the remaining
+%% binary and Count is M plus the number of bytes dropped. it stops early at
+%% the first byte outside that range or when Bin is exhausted
+strip_continuations(Bin, 0, N) -> {Bin, N};
+strip_continuations(<<X, Rest/binary>>, N, M) when X >= 128, X =< 191 ->
+    strip_continuations(Rest, N - 1, M + 1);
+%% not a continuation byte
+strip_continuations(Bin, _, N) -> {Bin, N}. 


 -ifdef(TEST).
 -include_lib("eunit/include/eunit.hrl").

+
+xcode(Bin) -> xcode(Bin, #opts{}).
+
+xcode(Bin, [loose_unicode]) -> xcode(Bin, #opts{loose_unicode=true});
+xcode(Bin, Opts) ->
+    try clean_string(Bin, Opts)
+    catch error:badarg -> {error, badarg}
+    end.
+
+
+is_bad({error, badarg}) -> true;
+is_bad(_) -> false.
+
+
+bad_utf8_test_() ->
+    [
+        {"orphan continuation byte u+0080",
+            ?_assert(is_bad(xcode(<<16#0080>>)))
+        },
+        {"orphan continuation byte u+0080 replaced",
+            ?_assertEqual(xcode(<<16#0080>>, [loose_unicode]), <<16#fffd/utf8>>)
+        },
+        {"orphan continuation byte u+00bf",
+            ?_assert(is_bad(xcode(<<16#00bf>>)))
+        },
+        {"orphan continuation byte u+00bf replaced",
+            ?_assertEqual(xcode(<<16#00bf>>, [loose_unicode]), <<16#fffd/utf8>>)
+        },
+        {"2 continuation bytes",
+            ?_assert(is_bad(xcode(<<(binary:copy(<<16#0080>>, 2))/binary>>)))
+        },
+        {"2 continuation bytes replaced",
+            ?_assertEqual(
+                xcode(<<(binary:copy(<<16#0080>>, 2))/binary>>, [loose_unicode]),
+                binary:copy(<<16#fffd/utf8>>, 2)
+            )
+        },
+        {"3 continuation bytes",
+            ?_assert(is_bad(xcode(<<(binary:copy(<<16#0080>>, 3))/binary>>)))
+        },
+        {"3 continuation bytes replaced",
+            ?_assertEqual(
+                xcode(<<(binary:copy(<<16#0080>>, 3))/binary>>, [loose_unicode]),
+                binary:copy(<<16#fffd/utf8>>, 3)
+            )
+        },
+        {"4 continuation bytes",
+            ?_assert(is_bad(xcode(<<(binary:copy(<<16#0080>>, 4))/binary>>)))
+        },
+        {"4 continuation bytes replaced",
+            ?_assertEqual(
+                xcode(<<(binary:copy(<<16#0080>>, 4))/binary>>, [loose_unicode]),
+                binary:copy(<<16#fffd/utf8>>, 4)
+            )
+        },
+        {"5 continuation bytes",
+            ?_assert(is_bad(xcode(<<(binary:copy(<<16#0080>>, 5))/binary>>)))
+        },
+        {"5 continuation bytes replaced",
+            ?_assertEqual(
+                xcode(<<(binary:copy(<<16#0080>>, 5))/binary>>, [loose_unicode]),
+                binary:copy(<<16#fffd/utf8>>, 5)
+            )
+        },
+        {"6 continuation bytes",
+            ?_assert(is_bad(xcode(<<(binary:copy(<<16#0080>>, 6))/binary>>)))
+        },
+        {"6 continuation bytes replaced",
+            ?_assertEqual(
+                xcode(<<(binary:copy(<<16#0080>>, 6))/binary>>, [loose_unicode]),
+                binary:copy(<<16#fffd/utf8>>, 6)
+            )
+        },
+        {"all continuation bytes",
+            ?_assert(is_bad(xcode(<<(list_to_binary(lists:seq(16#0080, 16#00bf)))/binary>>)))
+        },        
+        {"all continuation bytes replaced",
+            ?_assertEqual(
+                xcode(<<(list_to_binary(lists:seq(16#0080, 16#00bf)))/binary>>, [loose_unicode]),
+                binary:copy(<<16#fffd/utf8>>, length(lists:seq(16#0080, 16#00bf)))
+            )
+        },
+        {"lonely start byte",
+            ?_assert(is_bad(xcode(<<16#00c0>>)))
+        },
+        {"lonely start byte replaced",
+            ?_assertEqual(
+                xcode(<<16#00c0>>, [loose_unicode]),
+                <<16#fffd/utf8>>
+            )
+        },
+        {"lonely start bytes (2 byte)",
+            ?_assert(is_bad(xcode(<<16#00c0, 32, 16#00df>>)))
+        },
+        {"lonely start bytes (2 byte) replaced",
+            ?_assertEqual(
+                xcode(<<16#00c0, 32, 16#00df>>, [loose_unicode]),
+                <<16#fffd/utf8, 32, 16#fffd/utf8>>
+            )
+        },
+        {"lonely start bytes (3 byte)",
+            ?_assert(is_bad(xcode(<<16#00e0, 32, 16#00ef>>)))
+        },
+        {"lonely start bytes (3 byte) replaced",
+            ?_assertEqual(
+                xcode(<<16#00e0, 32, 16#00ef>>, [loose_unicode]),
+                <<16#fffd/utf8, 32, 16#fffd/utf8>>
+            )
+        },
+        {"lonely start bytes (4 byte)",
+            ?_assert(is_bad(xcode(<<16#00f0, 32, 16#00f7>>)))
+        },
+        {"lonely start bytes (4 byte) replaced",
+            ?_assertEqual(
+                xcode(<<16#00f0, 32, 16#00f7>>, [loose_unicode]),
+                <<16#fffd/utf8, 32, 16#fffd/utf8>>
+            )
+        },
+        {"missing continuation byte (3 byte)",
+            ?_assert(is_bad(xcode(<<224, 160, 32>>)))
+        },
+        {"missing continuation byte (3 byte) replaced",
+            ?_assertEqual(
+                xcode(<<224, 160, 32>>, [loose_unicode]),
+                <<16#fffd/utf8, 32>>
+            )
+        },
+        {"missing continuation byte (4 byte missing one)",
+            ?_assert(is_bad(xcode(<<240, 144, 128, 32>>)))
+        },
+        {"missing continuation byte2 (4 byte missing one) replaced",
+            ?_assertEqual(
+                xcode(<<240, 144, 128, 32>>, [loose_unicode]),
+                <<16#fffd/utf8, 32>>
+            )
+        },
+        {"missing continuation byte (4 byte missing two)",
+            ?_assert(is_bad(xcode(<<240, 144, 32>>)))
+        },
+        {"missing continuation byte2 (4 byte missing two) replaced",
+            ?_assertEqual(
+                xcode(<<240, 144, 32>>, [loose_unicode]),
+                <<16#fffd/utf8, 32>>
+            )
+        },
+        {"overlong encoding of u+002f (2 byte)",
+            ?_assert(is_bad(xcode(<<16#c0, 16#af, 32>>)))
+        },
+        {"overlong encoding of u+002f (2 byte) replaced",
+            ?_assertEqual(
+                xcode(<<16#c0, 16#af, 32>>, [loose_unicode]),
+                <<16#fffd/utf8, 32>>
+            )
+        },
+        {"overlong encoding of u+002f (3 byte)",
+            ?_assert(is_bad(xcode(<<16#e0, 16#80, 16#af, 32>>)))
+        },
+        {"overlong encoding of u+002f (3 byte) replaced",
+            ?_assertEqual(
+                xcode(<<16#e0, 16#80, 16#af, 32>>, [loose_unicode]),
+                <<16#fffd/utf8, 32>>
+            )
+        },
+        {"overlong encoding of u+002f (4 byte)",
+            ?_assert(is_bad(xcode(<<16#f0, 16#80, 16#80, 16#af, 32>>)))
+        },
+        {"overlong encoding of u+002f (4 byte) replaced",
+            ?_assertEqual(
+                xcode(<<16#f0, 16#80, 16#80, 16#af, 32>>, [loose_unicode]),
+                <<16#fffd/utf8, 32>>
+            )
+        },
+        {"highest overlong 2 byte sequence",
+            ?_assert(is_bad(xcode(<<16#c1, 16#bf, 32>>)))
+        },
+        {"highest overlong 2 byte sequence replaced",
+            ?_assertEqual(
+                xcode(<<16#c1, 16#bf, 32>>, [loose_unicode]),
+                <<16#fffd/utf8, 32>>
+            )
+        },
+        {"highest overlong 3 byte sequence",
+            ?_assert(is_bad(xcode(<<16#e0, 16#9f, 16#bf, 32>>)))
+        },
+        {"highest overlong 3 byte sequence replaced",
+            ?_assertEqual(
+                xcode(<<16#e0, 16#9f, 16#bf, 32>>, [loose_unicode]),
+                <<16#fffd/utf8, 32>>
+            )
+        },
+        {"highest overlong 4 byte sequence",
+            ?_assert(is_bad(xcode(<<16#f0, 16#8f, 16#bf, 16#bf, 32>>)))
+        },
+        {"highest overlong 4 byte sequence replaced",
+            ?_assertEqual(
+                xcode(<<16#f0, 16#8f, 16#bf, 16#bf, 32>>, [loose_unicode]),
+                <<16#fffd/utf8, 32>>
+            )
+        }
+    ].
+
+
 encode(Term) -> (encoder(jsx, [], []))(Term).

 encode(Term, Opts) ->
@ -210,6 +482,7 @@ encode_test_() ->
        }
    ].

+
 surrogates_test_() ->
    [
        {"surrogates - badjson",
@ -219,7 +492,8 @@ surrogates_test_() ->
            ?_assertEqual(check_replaced(surrogates()), [])
        }
    ].
-    
+
+
 good_characters_test_() ->
    [
        {"acceptable codepoints",
@ -230,48 +504,46 @@ good_characters_test_() ->
        }
    ].

-malformed_test_() ->
-    [
-        {"malformed codepoint with 1 byte", ?_assertError(badarg, encode(<<128>>))},
-        {"malformed codepoint with 2 bytes", ?_assertError(badarg, encode(<<128, 192>>))},
-        {"malformed codepoint with 3 bytes", ?_assertError(badarg, encode(<<128, 192, 192>>))},
-        {"malformed codepoint with 4 bytes", ?_assertError(badarg, encode(<<128, 192, 192, 192>>))}
-    ].

-malformed_replaced_test_() ->
-    F = <<16#fffd/utf8>>,
+reserved_test_() ->
    [
-        {"malformed codepoint with 1 byte",
-            ?_assertEqual(
-                [{string, <<F/binary>>}, end_json],
-                encode(<<128>>, [loose_unicode])
-            )
+        {"reserved noncharacters - badjson",
+            ?_assertEqual(check_bad(reserved_space()), [])
        },
-        {"malformed codepoint with 2 bytes",
-            ?_assertEqual(
-                [{string, <<F/binary, F/binary>>}, end_json],
-                encode(<<128, 192>>, [loose_unicode])
-            )
-        },
-        {"malformed codepoint with 3 bytes",
-            ?_assertEqual(
-                [{string, <<F/binary, F/binary, F/binary>>}, end_json],
-                encode(<<128, 192, 192>>, [loose_unicode])
-            )
-        },
-        {"malformed codepoint with 4 bytes",
-            ?_assertEqual(
-                [{string, <<F/binary, F/binary, F/binary, F/binary>>}, end_json],
-                encode(<<128, 192, 192, 192>>, [loose_unicode])
-            )
+        {"reserved noncharacters - replaced",
+            ?_assertEqual(check_replaced(reserved_space()), [])
        }
    ].

+
+noncharacters_test_() ->
+    [
+        {"noncharacters - badjson",
+            ?_assertEqual(check_bad(noncharacters()), [])
+        },
+        {"noncharacters - replaced",
+            ?_assertEqual(check_replaced(noncharacters()), [])
+        }
+    ].
+
+
+extended_noncharacters_test_() ->
+    [
+        {"extended noncharacters - badjson",
+            ?_assertEqual(check_bad(extended_noncharacters()), [])
+        },
+        {"extended noncharacters - replaced",
+            ?_assertEqual(check_replaced(extended_noncharacters()), [])
+        }
+    ].
+
+
 check_bad(List) ->
    lists:dropwhile(fun({_, {error, badjson}}) -> true ; (_) -> false end,
        check(List, [], [])
    ).

+
 check_replaced(List) ->
    lists:dropwhile(fun({_, [{string, <<16#fffd/utf8>>}|_]}) -> true
            ; (_) -> false 
@ -279,22 +551,47 @@ check_replaced(List) ->
        check(List, [loose_unicode], [])
    ).

+
 check_good(List) ->
    lists:dropwhile(fun({_, [{string, _}|_]}) -> true ; (_) -> false end,
        check(List, [], [])
    ).

+
 check([], _Opts, Acc) -> Acc;
 check([H|T], Opts, Acc) ->
    R = encode(to_fake_utf(H, utf8), Opts),
    check(T, Opts, [{H, R}] ++ Acc).
    

+noncharacters() -> lists:seq(16#fffe, 16#ffff).
+
+
+extended_noncharacters() ->
+    [16#1fffe, 16#1ffff, 16#2fffe, 16#2ffff]
+        ++ [16#3fffe, 16#3ffff, 16#4fffe, 16#4ffff]
+        ++ [16#5fffe, 16#5ffff, 16#6fffe, 16#6ffff]
+        ++ [16#7fffe, 16#7ffff, 16#8fffe, 16#8ffff]
+        ++ [16#9fffe, 16#9ffff, 16#afffe, 16#affff]
+        ++ [16#bfffe, 16#bffff, 16#cfffe, 16#cffff]
+        ++ [16#dfffe, 16#dffff, 16#efffe, 16#effff]
+        ++ [16#ffffe, 16#fffff, 16#10fffe, 16#10ffff].
+
+
 surrogates() -> lists:seq(16#d800, 16#dfff).

-good() -> lists:seq(1, 16#d7ff) ++ lists:seq(16#e000, 16#fffd).
-            
-good_extended() -> lists:seq(16#100000, 16#10ffff).
+
+reserved_space() -> lists:seq(16#fdd0, 16#fdef).
+
+
+good() -> lists:seq(16#0000, 16#d7ff) ++ lists:seq(16#e000, 16#fdcf) ++ lists:seq(16#fdf0, 16#fffd).
+
+
+good_extended() -> [16#10000, 16#20000, 16#30000, 16#40000, 16#50000,
+        16#60000, 16#70000, 16#80000, 16#90000, 16#a0000, 
+        16#b0000, 16#c0000, 16#d0000, 16#e0000, 16#f0000
+    ] ++ lists:seq(16#100000, 16#10fffd).
+

 %% erlang refuses to encode certain codepoints, so fake them all
 to_fake_utf(N, utf8) when N < 16#0080 -> <<N:8>>;
@ -308,4 +605,5 @@ to_fake_utf(N, utf8) ->
    <<0:3, W:3, Z:6, Y:6, X:6>> = <<N:24>>,
    <<2#11110:5, W:3, 2#10:2, Z:6, 2#10:2, Y:6, 2#10:2, X:6>>.

+
 -endif.
--- a/src/jsx_opts.hrl
+++ b/src/jsx_opts.hrl
@ -5,5 +5,7 @@
    single_quotes = false,
    no_jsonp_escapes = false,
    comments = false,
-    json_escape = false
+    json_escape = false,
+    dirty_strings = false,
+    ignore_bad_escapes = false
 }).
--- a/src/jsx_to_json.erl
+++ b/src/jsx_to_json.erl
@ -39,7 +39,7 @@
 -spec to_json(Source::any(), Opts::opts()) -> binary().
    
 to_json(Source, Opts) when is_list(Opts) ->
-    (jsx:encoder(?MODULE, Opts, jsx_utils:extract_opts([json_escape] ++ Opts)))(Source).
+    (jsx:encoder(?MODULE, Opts, jsx_utils:extract_opts(Opts)))(Source).


 -spec format(Source::binary(), Opts::opts()) -> binary().
@ -195,6 +195,9 @@ basic_format_test_() ->
            [{"naked float", ?_assertEqual(format(<<"1.23">>, []), <<"1.23">>)}]
        },
        {"naked string", ?_assertEqual(format(<<"\"hi\"">>, []), <<"\"hi\"">>)},
+        {"naked string with control character", ?_assertEqual(
+            format(<<"\"hi\\n\"">>, [json_escape]), <<"\"hi\\n\"">>
+        )},
        {"naked literal", ?_assertEqual(format(<<"true">>, []), <<"true">>)},
        {"simple object", ?_assertEqual(
            format(<<"  { \"key\"  :\n\t \"value\"\r\r\r\n }  ">>, []),
@ -241,6 +244,9 @@ basic_to_json_test_() ->
            [{"naked float", ?_assertEqual(to_json(1.23, []) , <<"1.23">>)}]
        },
        {"naked string", ?_assertEqual(to_json(<<"hi">>, []), <<"\"hi\"">>)},
+        {"naked string with control character", ?_assertEqual(
+            to_json(<<"hi\n">>, [json_escape]), <<"\"hi\\n\"">>
+        )},
        {"naked literal", ?_assertEqual(to_json(true, []), <<"true">>)},
        {"simple object", ?_assertEqual(
            to_json(
@ -324,10 +330,5 @@ opts_test_() ->
        )}
    ].

-ext_opts_test_() ->
-    [{"extopts", ?_assertEqual(
-        format(<<"[]">>, [loose_unicode, {escape_forward_slash, true}]),
-        <<"[]">>
-    )}].
    
 -endif.
--- a/src/jsx_utils.erl
+++ b/src/jsx_utils.erl
@ -51,6 +51,17 @@ parse_opts([comments|Rest], Opts) ->
    parse_opts(Rest, Opts#opts{comments=true});
 parse_opts([json_escape|Rest], Opts) ->
    parse_opts(Rest, Opts#opts{json_escape=true});
+parse_opts([dirty_strings|Rest], Opts) ->
+    parse_opts(Rest, Opts#opts{dirty_strings=true});
+parse_opts([ignore_bad_escapes|Rest], Opts) ->
+    parse_opts(Rest, Opts#opts{ignore_bad_escapes=true});
+parse_opts([relax|Rest], Opts) ->
+    parse_opts(Rest, Opts#opts{
+        loose_unicode = true,
+        single_quotes = true,
+        comments = true,
+        ignore_bad_escapes = true
+    });
 parse_opts(_, _) ->
    {error, badarg}.

@ -63,7 +74,10 @@ valid_flags() ->
        single_quotes,
        no_jsonp_escapes,
        comments,
-        json_escape
+        json_escape,
+        dirty_strings,
+        ignore_bad_escapes,
+        relax
    ].


@ -88,7 +102,10 @@ extract_parser_opts([K|Rest], Acc) ->
 %%  everything else should be a legal json string component

 json_escape(String, Opts) when is_binary(String) ->
-    json_escape(String, Opts, 0, size(String)).
+    case Opts#opts.dirty_strings of
+        true -> String
+        ; false -> json_escape(String, Opts, 0, size(String))
+    end.


 -define(control_character(X),
@ -243,7 +260,7 @@ json_escape(Str, Opts, L, Len) when L < Len ->
                    json_escape(<<H/binary, 16#2028/utf8, T/binary>>, Opts, L + 3, Len);
                false ->
                    B = unicode:characters_to_binary(json_escape_sequence(16#2028)),
-                    json_escape(<<H/binary, B/binary, T/binary>>, Opts, L + size(B), Len + size(B) - size(<<16#2028/utf8>>))
+                    json_escape(<<H/binary, B/binary, T/binary>>, Opts, L + 6, Len + 3)
            end;
        <<H:L/binary, 16#2029/utf8, T/binary>> ->
            case Opts#opts.no_jsonp_escapes of
@ -251,26 +268,51 @@ json_escape(Str, Opts, L, Len) when L < Len ->
                    json_escape(<<H/binary, 16#2029/utf8, T/binary>>, Opts, L + 3, Len);
                false ->
                    B = unicode:characters_to_binary(json_escape_sequence(16#2029)),
-                    json_escape(<<H/binary, B/binary, T/binary>>, Opts, L + size(B), Len + size(B) - size(<<16#2029/utf8>>))
+                    json_escape(<<H/binary, B/binary, T/binary>>, Opts, L + 6, Len + 3)
            end;
        <<_:L/binary, X/utf8, _/binary>> when X < 16#0080 ->   
            json_escape(Str, Opts, L + 1, Len);
        <<_:L/binary, X/utf8, _/binary>> when X < 16#0800 ->
            json_escape(Str, Opts, L + 2, Len);
-        <<_:L/binary, X/utf8, _/binary>> when X < 16#10000 ->
+        <<_:L/binary, X/utf8, _/binary>> when X < 16#dcff ->
            json_escape(Str, Opts, L + 3, Len);
-        <<_:L/binary, _/utf8, _/binary>> ->
+        <<_:L/binary, X/utf8, _/binary>> when X > 16#dfff, X < 16#fdd0 ->
+            json_escape(Str, Opts, L + 3, Len);
+        <<_:L/binary, X/utf8, _/binary>> when X > 16#fdef, X < 16#fffe ->
+            json_escape(Str, Opts, L + 3, Len);
+        <<_:L/binary, X/utf8, _/binary>> when X >= 16#10000, X < 16#1fffe ->
            json_escape(Str, Opts, L + 4, Len);
-        <<H:L/binary, 237, X, _, T/binary>> when X >= 160 ->
-            case Opts#opts.loose_unicode of
-                true -> json_escape(<<H/binary, 16#fffd/utf8, T/binary>>, Opts, L + 3, Len);
-                false -> erlang:error(badarg, [Str, Opts])
-            end;
-        <<H:L/binary, _, T/binary>> ->
-            case Opts#opts.loose_unicode of
-                true -> json_escape(<<H/binary, 16#fffd/utf8, T/binary>>, Opts, L + 3, Len + 2);
-                false -> erlang:error(badarg, [Str, Opts])
-            end            
+        <<_:L/binary, X/utf8, _/binary>> when X >= 16#20000, X < 16#2fffe ->
+            json_escape(Str, Opts, L + 4, Len);
+        <<_:L/binary, X/utf8, _/binary>> when X >= 16#30000, X < 16#3fffe ->
+            json_escape(Str, Opts, L + 4, Len);
+        <<_:L/binary, X/utf8, _/binary>> when X >= 16#40000, X < 16#4fffe ->
+            json_escape(Str, Opts, L + 4, Len);
+        <<_:L/binary, X/utf8, _/binary>> when X >= 16#50000, X < 16#5fffe ->
+            json_escape(Str, Opts, L + 4, Len);
+        <<_:L/binary, X/utf8, _/binary>> when X >= 16#60000, X < 16#6fffe ->
+            json_escape(Str, Opts, L + 4, Len);
+        <<_:L/binary, X/utf8, _/binary>> when X >= 16#70000, X < 16#7fffe ->
+            json_escape(Str, Opts, L + 4, Len);
+        <<_:L/binary, X/utf8, _/binary>> when X >= 16#80000, X < 16#8fffe ->
+            json_escape(Str, Opts, L + 4, Len);
+        <<_:L/binary, X/utf8, _/binary>> when X >= 16#90000, X < 16#9fffe ->
+            json_escape(Str, Opts, L + 4, Len);
+        <<_:L/binary, X/utf8, _/binary>> when X >= 16#a0000, X < 16#afffe ->
+            json_escape(Str, Opts, L + 4, Len);
+        <<_:L/binary, X/utf8, _/binary>> when X >= 16#b0000, X < 16#bfffe ->
+            json_escape(Str, Opts, L + 4, Len);
+        <<_:L/binary, X/utf8, _/binary>> when X >= 16#c0000, X < 16#cfffe ->
+            json_escape(Str, Opts, L + 4, Len);
+        <<_:L/binary, X/utf8, _/binary>> when X >= 16#d0000, X < 16#dfffe ->
+            json_escape(Str, Opts, L + 4, Len);
+        <<_:L/binary, X/utf8, _/binary>> when X >= 16#e0000, X < 16#efffe ->
+            json_escape(Str, Opts, L + 4, Len);
+        <<_:L/binary, X/utf8, _/binary>> when X >= 16#f0000, X < 16#ffffe ->
+            json_escape(Str, Opts, L + 4, Len);
+        <<_:L/binary, X/utf8, _/binary>> when X >= 16#100000, X < 16#10fffe ->
+            json_escape(Str, Opts, L + 4, Len);
+        _ -> erlang:error(badarg, [Str, Opts])
    end;
 json_escape(Str, _, L, Len) when L =:= Len ->
    Str.
@ -291,7 +333,6 @@ to_hex(15) -> $f;
 to_hex(X) -> X + 48.    %% ascii "1" is [49], "2" is [50], etc...


-
 %% eunit tests
 -ifdef(TEST).
 -include_lib("eunit/include/eunit.hrl").
@ -329,28 +370,53 @@ binary_escape_test_() ->
                <<"\\/Date(1303502009425)\\/">>
            )
        },
-        {"bad utf8",
-            ?_assertError(badarg, json_escape(<<32, 64, 128, 255>>, #opts{}))
-        },
-        {"bad utf8 ok",
+        {"dirty strings",
            ?_assertEqual(
-                json_escape(<<32, 64, 128, 255>>, #opts{loose_unicode=true}),
-                <<32, 64, 16#fffd/utf8, 16#fffd/utf8>>
-            )
-        },
-        {"bad surrogate", ?_assertError(badarg, json_escape(<<237, 160, 127>>, #opts{}))},
-        {"bad surrogate ok",
-            ?_assertEqual(
-                json_escape(<<237, 160, 127>>, #opts{loose_unicode=true}),
-                <<16#fffd/utf8>>
-            )
-        },
-        {"all sizes of codepoints",
-            ?_assertEqual(
-                json_escape(unicode:characters_to_binary([0, 32, 16#80, 16#800, 16#10000]), #opts{}),
-                <<"\\u0000", 32/utf8, 16#80/utf8, 16#800/utf8, 16#10000/utf8>>
+                json_escape(<<"\\x25\\uffff">>, #opts{dirty_strings=true}),
+                <<"\\x25\\uffff">>
            )
        }
    ].

+
+opts_test_() ->
+    [
+        {"all flags",
+            ?_assertEqual(
+                parse_opts([
+                    loose_unicode,
+                    escape_forward_slash,
+                    explicit_end,
+                    single_quotes,
+                    no_jsonp_escapes,
+                    comments,
+                    dirty_strings,
+                    ignore_bad_escapes
+                ]),
+                #opts{
+                    loose_unicode=true,
+                    escape_forward_slash=true,
+                    explicit_end=true,
+                    single_quotes=true,
+                    no_jsonp_escapes=true,
+                    comments=true,
+                    dirty_strings=true,
+                    ignore_bad_escapes=true
+                }
+            )
+        },
+        {"relax flag",
+            ?_assertEqual(
+                parse_opts([relax]),
+                #opts{
+                    loose_unicode=true,
+                    single_quotes=true,
+                    comments=true,
+                    ignore_bad_escapes=true
+                }
+            )
+        }
+    ].
+
+
 -endif.