adds loose_unicode option which replaces badly formed unicode (escaped noncharacters and restricted codepoints such as the escaped null, as well as unpaired surrogates) with u+fffd instead of returning a badjson error

2011-07-27 01:59:03 -07:00 · 2011-07-27 01:59:03 -07:00 · 653205501c
commit 653205501c
parent 51d27bb3b5
15 changed files with 121 additions and 10 deletions
--- a/src/jsx_common.hrl
+++ b/src/jsx_common.hrl
@ -34,6 +34,7 @@

 -type jsx_opts() :: [jsx_opt()].
 -type jsx_opt() :: {multi_term, true | false}
+    | loose_unicode
    | {encoding, auto 
        | utf8
        | utf16
--- a/src/jsx_decoder.hrl
+++ b/src/jsx_decoder.hrl
@ -31,6 +31,7 @@
 %% opts record for decoder
 -record(opts, {
    multi_term = false,
+    loose_unicode = false,    %% true: swap bad codepoints for u+fffd, no error
    encoding = auto
 }).

@ -81,7 +82,7 @@
 ).

 -define(is_noncontrol(Symbol),
-    Symbol >= ?space
+    (Symbol >= ?space)    %% parenthesised so the expansion is a single term
 ).

 -define(is_whitespace(Symbol),
@ -138,6 +139,8 @@ parse_opts([{multi_term, Value}|Rest], Opts) ->
    parse_opts(Rest, Opts#opts{multi_term=Value});
 parse_opts([multi_term|Rest], Opts) ->
    parse_opts(Rest, Opts#opts{multi_term=true});
+parse_opts([loose_unicode|Rest], Opts) ->
+    parse_opts(Rest, Opts#opts{loose_unicode=true});
 parse_opts([{encoding, _}|Rest], Opts) ->
    parse_opts(Rest, Opts);
 parse_opts(_, _) ->
@ -350,7 +353,6 @@ key(Bin, Stack, Opts) ->
 %%   states
 string(Bin, Stack, Opts) -> string(Bin, Stack, Opts, <<>>).

-
 string(<<?quote/?utfx, Rest/binary>>, [key|_] = Stack, Opts, Acc) ->
    {jsx, {key, Acc}, fun() -> colon(Rest, Stack, Opts) end};
 string(<<?quote/?utfx, Rest/binary>>, Stack, Opts, Acc) ->
@ -358,8 +360,37 @@ string(<<?quote/?utfx, Rest/binary>>, Stack, Opts, Acc) ->
        maybe_done(Rest, Stack, Opts)
    end};
 string(<<?rsolidus/?utfx, Rest/binary>>, Stack, Opts, Acc) ->
-    escape(Rest, Stack, Opts, Acc);   
-string(<<S/?utfx, Rest/binary>>, Stack, Opts, Acc) when ?is_noncontrol(S) ->
+    escape(Rest, Stack, Opts, Acc);
+%% erlang's utf decoding lets unicode noncharacters through, so the codepoints
+%%   it returns can't be trusted as-is and are range checked here
+%% the range 32..16#fdcf is safe, so allow that
+string(<<S/?utfx, Rest/binary>>, Stack, Opts, Acc)
+        when ?is_noncontrol(S), S < 16#fdd0 ->
+    string(Rest, Stack, Opts, <<Acc/binary, S/utf8>>);
+%% the range 16#fdf0..16#fffd is also safe
+string(<<S/?utfx, Rest/binary>>, Stack, Opts, Acc)
+        when S > 16#fdef, S < 16#fffe ->
+    string(Rest, Stack, Opts, <<Acc/binary, S/utf8>>);
+%% above 16#ffff everything is allowed except the last two codepoints of each
+%%   plane. kept as the last clause on the hunch that this ordering is faster
+string(<<S/?utfx, Rest/binary>>, Stack, Opts, Acc)
+        when S > 16#ffff andalso
+            S =/= 16#1fffe andalso S =/= 16#1ffff andalso
+            S =/= 16#2fffe andalso S =/= 16#2ffff andalso
+            S =/= 16#3fffe andalso S =/= 16#3ffff andalso
+            S =/= 16#4fffe andalso S =/= 16#4ffff andalso
+            S =/= 16#5fffe andalso S =/= 16#5ffff andalso
+            S =/= 16#6fffe andalso S =/= 16#6ffff andalso
+            S =/= 16#7fffe andalso S =/= 16#7ffff andalso
+            S =/= 16#8fffe andalso S =/= 16#8ffff andalso
+            S =/= 16#9fffe andalso S =/= 16#9ffff andalso
+            S =/= 16#afffe andalso S =/= 16#affff andalso
+            S =/= 16#bfffe andalso S =/= 16#bffff andalso
+            S =/= 16#cfffe andalso S =/= 16#cffff andalso
+            S =/= 16#dfffe andalso S =/= 16#dffff andalso
+            S =/= 16#efffe andalso S =/= 16#effff andalso
+            S =/= 16#ffffe andalso S =/= 16#fffff andalso
+            S =/= 16#10fffe andalso S =/= 16#10ffff ->
+    string(Rest, Stack, Opts, <<Acc/binary, S/utf8>>);
 string(Bin, Stack, Opts, Acc) ->
    case partial_utf(Bin) of 
@ -380,12 +411,14 @@ partial_utf(<<X, Rest/binary>>) when X >= 16#e0, X =< 16#ef ->
    case Rest of
        <<>> -> true
        ; <<Y>> when Y >= 16#80, Y =< 16#bf -> true
+        ; _ -> false
    end;
 partial_utf(<<X, Rest/binary>>) when X >= 16#f0, X =< 16#f4 ->
    case Rest of
        <<>> -> true
        ; <<Y>> when Y >= 16#80, Y =< 16#bf -> true
        ; <<Y, Z>> when Y >= 16#80, Y =< 16#bf, Z >= 16#80, Z =< 16#bf -> true
+        ; _ -> false
    end;
 partial_utf(_) -> false.    
 -endif.    
@ -467,11 +500,21 @@ escaped_unicode(<<D/?utfx, Rest/binary>>, Stack, Opts, String, [C, B, A])
            low_surrogate(Rest, Stack, Opts, String, X)
        %% non-characters, you're not allowed to exchange these
        ; X when X == 16#fffe; X == 16#ffff; X >= 16#fdd0, X =< 16#fdef ->
-            {error, {badjson, <<D/?utfx, Rest/binary>>}}
+            case Opts#opts.loose_unicode of
+                true ->
+                    string(Rest, Stack, Opts, <<String/binary, 16#fffd/utf8>>)
+                ; false ->    
+                    {error, {badjson, <<D/?utfx, Rest/binary>>}}
+            end
        %% allowing interchange of null bytes allows attackers to forge
        %%   malicious streams
        ; X when X == 16#0000 ->
-            {error, {badjson, <<D/?utfx, Rest/binary>>}}
+            case Opts#opts.loose_unicode of
+                true ->
+                    string(Rest, Stack, Opts, <<String/binary, 16#fffd/utf8>>)
+                ; false ->    
+                    {error, {badjson, <<D/?utfx, Rest/binary>>}}
+            end
        %% anything else
        ; X ->
            string(Rest, Stack, Opts, <<String/binary, X/utf8>>)
@ -498,6 +541,14 @@ escaped_unicode(Bin, Stack, Opts, String, Acc) ->

 low_surrogate(<<?rsolidus/?utfx, Rest/binary>>, Stack, Opts, String, High) ->
    low_surrogate_u(Rest, Stack, Opts, String, High);
+%% no escape follows, so the high surrogate is unpaired: replace it or fail
+low_surrogate(<<_/?utfx, _/binary>> = Bin, Stack, Opts, String, _) ->
+    case Opts#opts.loose_unicode of
+        true ->
+            string(Bin, Stack, Opts, <<String/binary, 16#fffd/utf8>>)
+        ; false ->
+            {error, {badjson, Bin}}
+    end;
 low_surrogate(Bin, Stack, Opts, String, High) ->
    case ?partial_codepoint(Bin) of
        true -> 
@ -518,6 +569,19 @@ low_surrogate(Bin, Stack, Opts, String, High) ->
    
 low_surrogate_u(<<$u/?utfx, Rest/binary>>, Stack, Opts, String, High) ->
    low_surrogate(Rest, Stack, Opts, String, [], High);
+%% the escape is not \u, so the high surrogate is unpaired. in loose mode emit
+%%   u+fffd and hand the escape (rsolidus restored) back to string
+low_surrogate_u(<<_/?utfx, _/binary>> = Bin, Stack, Opts, String, _) ->
+    case Opts#opts.loose_unicode of
+        true ->
+            string(<<?rsolidus/?utfx, Bin/binary>>,
+                Stack,
+                Opts,
+                <<String/binary, 16#fffd/utf8>>
+            )
+        ; false ->
+            {error, {badjson, Bin}}
+    end;
 low_surrogate_u(Bin, Stack, Opts, String, High) ->
    case ?partial_codepoint(Bin) of
        true -> 
@ -543,14 +607,32 @@ low_surrogate(<<D/?utfx, Rest/binary>>, Stack, Opts, String, [C, B, A], High)
            V = surrogate_to_codepoint(High, X),
            case V rem 16#10000 of
                Y when Y == 16#fffe; Y == 16#ffff ->
-                    {error, {badjson, <<D/?utfx, Rest/binary>>}}
+                    case Opts#opts.loose_unicode of
+                        true ->
+                            string(Rest,
+                                Stack,
+                                Opts,
+                                <<String/binary, 16#fffd/utf8, 16#fffd/utf8>>
+                            )
+                        ; false ->    
+                            {error, {badjson, <<D/?utfx, Rest/binary>>}}
+                    end
                ; Y ->
                    io:format("~p ~p~n", [V, Y]),
                    string(Rest, Stack, Opts, <<String/binary, V/utf8>>)
            end
        %% not a low surrogate, bad bad bad
        ; _ ->
-            {error, {badjson, <<D/?utfx, Rest/binary>>}}
+            case Opts#opts.loose_unicode of
+                true ->
+                    string(Rest,
+                        Stack,
+                        Opts,
+                        <<String/binary, 16#fffd/utf8, 16#fffd/utf8>>
+                    )
+                ; false ->    
+                    {error, {badjson, <<D/?utfx, Rest/binary>>}}
+            end
    end;
 low_surrogate(<<S/?utfx, Rest/binary>>, Stack, Opts, String, Acc, High) 
        when ?is_hex(S) ->
--- a/src/jsx_utils.erl
+++ b/src/jsx_utils.erl
@ -204,10 +204,10 @@ detect_encoding(<<X, 0, 0, 0, _Rest/binary>> = JSON, Opts) when X =/= 0 ->
 detect_encoding(<<0, 0, 0, X, _Rest/binary>> = JSON, Opts) when X =/= 0 ->
    (jsx_utf32:decoder(Opts))(JSON);
 %% utf16-little null order detection
-detect_encoding(<<X, 0, _, 0, _Rest/binary>> = JSON, Opts) when X =/= 0 ->
+detect_encoding(<<X, 0, _, _, _Rest/binary>> = JSON, Opts) when X =/= 0 ->
    (jsx_utf16le:decoder(Opts))(JSON);
 %% utf16-big null order detection
-detect_encoding(<<0, X, 0, _, _Rest/binary>> = JSON, Opts) when X =/= 0 ->
+detect_encoding(<<0, X, _, _, _Rest/binary>> = JSON, Opts) when X =/= 0 ->
    (jsx_utf16:decoder(Opts))(JSON);
 %% utf8 null order detection
 detect_encoding(<<X, Y, _Rest/binary>> = JSON, Opts) when X =/= 0, Y =/= 0 ->
--- a/test/cases/escaped_noncharacter_ext_replaced.json
+++ b/test/cases/escaped_noncharacter_ext_replaced.json
@ -0,0 +1 @@
+"\ud83f\udfff"
--- a/test/cases/escaped_noncharacter_ext_replaced.test
+++ b/test/cases/escaped_noncharacter_ext_replaced.test
@ -0,0 +1,4 @@
+{name, "escaped noncharacter (extended)"}.
+{jsx, [{string, <<16#fffd/utf8, 16#fffd/utf8>>}, end_json]}.
+{json, "escaped_noncharacter_ext_replaced.json"}.
+{jsx_flags, [loose_unicode]}.
--- a/test/cases/escaped_noncharacter_replaced.json
+++ b/test/cases/escaped_noncharacter_replaced.json
@ -0,0 +1 @@
+"\uffff"
--- a/test/cases/escaped_noncharacter_replaced.test
+++ b/test/cases/escaped_noncharacter_replaced.test
@ -0,0 +1,4 @@
+{name, "escaped noncharacter replacement"}.
+{jsx, [{string,<<16#fffd/utf8>>},end_json]}.
+{json, "escaped_noncharacter_replaced.json"}.
+{jsx_flags, [loose_unicode]}.
--- a/test/cases/escaped_nullbyte_replaced.json
+++ b/test/cases/escaped_nullbyte_replaced.json
@ -0,0 +1 @@
+"\u0000"
--- a/test/cases/escaped_nullbyte_replaced.test
+++ b/test/cases/escaped_nullbyte_replaced.test
@ -0,0 +1,4 @@
+{name, "escaped nullbyte replaced"}.
+{jsx, [{string,<<16#fffd/utf8>>},end_json]}.
+{json, "escaped_nullbyte_replaced.json"}.
+{jsx_flags, [loose_unicode]}.
--- a/test/cases/noncharacter.json
+++ b/test/cases/noncharacter.json
@ -0,0 +1 @@
+"﷐"
--- a/test/cases/noncharacter.test
+++ b/test/cases/noncharacter.test
@ -0,0 +1,3 @@
+{name, "noncharacter"}.
+{jsx, {error, badjson}}.
+{json, "noncharacter.json"}.
--- a/test/cases/unpaired_surrogate.json
+++ b/test/cases/unpaired_surrogate.json
@ -0,0 +1 @@
+["\ud801blah"]
--- a/test/cases/unpaired_surrogate.test
+++ b/test/cases/unpaired_surrogate.test
@ -0,0 +1,3 @@
+{name, "unpaired_surrogate"}.
+{jsx, {error, badjson}}.
+{json, "unpaired_surrogate.json"}.
--- a/test/cases/unpaired_surrogate_replaced.json
+++ b/test/cases/unpaired_surrogate_replaced.json
@ -0,0 +1 @@
+["\ud801blah"]
--- a/test/cases/unpaired_surrogate_replaced.test
+++ b/test/cases/unpaired_surrogate_replaced.test
@ -0,0 +1,4 @@
+{name, "unpaired surrogate replaced"}.
+{jsx, [start_array,{string,<<16#fffd/utf8, "blah">>},end_array,end_json]}.
+{json, "unpaired_surrogate_replaced.json"}.
+{jsx_flags, [loose_unicode]}.