From cd4f4a8f1ccb14a282469b8a26ebe00bc878ef38 Mon Sep 17 00:00:00 2001
From: alisdair sullivan <alisdairsullivan@yahoo.ca>
Date: Thu, 28 Jul 2011 18:47:58 -0700
Subject: [PATCH] test every codepoint possible for replacement/badness

---
 src/jsx_decoder.hrl | 192 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 185 insertions(+), 7 deletions(-)

diff --git a/src/jsx_decoder.hrl b/src/jsx_decoder.hrl
index 2cb88cc..a830411 100644
--- a/src/jsx_decoder.hrl
+++ b/src/jsx_decoder.hrl
@@ -92,26 +92,31 @@
 
 %% partial codepoint max size differs across encodings
 -ifdef(utf8).
+-define(encoding, utf8).
 -define(utfx, utf8).
 -define(partial_codepoint(Bin), byte_size(Bin) < 1).
 -endif.
 
 -ifdef(utf16).
+-define(encoding, utf16).
 -define(utfx, utf16).
 -define(partial_codepoint(Bin), byte_size(Bin) < 2).
 -endif.
 
 -ifdef(utf16le).
+-define(encoding, utf16le).
 -define(utfx, utf16-little).
 -define(partial_codepoint(Bin), byte_size(Bin) < 2).
 -endif.
     
 -ifdef(utf32).
+-define(encoding, utf32).
 -define(utfx, utf32).
 -define(partial_codepoint(Bin), byte_size(Bin) < 4).
 -endif.
 
 -ifdef(utf32le).
+-define(encoding, utf32le).
 -define(utfx, utf32-little).
 -define(partial_codepoint(Bin), byte_size(Bin) < 4).
 -endif.
@@ -390,7 +395,7 @@ string(<<S/?utfx, Rest/binary>>, Stack, Opts, Acc)
             S =/= 16#dfffe andalso S =/= 16#dffff andalso
             S =/= 16#efffe andalso S =/= 16#effff andalso
             S =/= 16#ffffe andalso S =/= 16#fffff andalso
-            S =/= 16#101fffe andalso S =/= 16#10ffff ->
+            S =/= 16#10fffe andalso S =/= 16#10ffff ->
     string(Rest, Stack, Opts, <<Acc/binary, S/utf8>>);
 string(Bin, Stack, Opts, Acc) ->
     case partial_utf(Bin) of 
@@ -488,11 +493,11 @@ noncharacter(Bin, _Stack, _Opts, _Acc) ->
 -ifdef(utf16).
 %% non-characters blah blah
 noncharacter(<<S/utf16, Rest/binary>>, Stack, Opts, Acc)
-        when ?is_noncontrol(S) ->
+        when ?is_noncontrol(S), S < 16#fffe ->
     string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
 %% u+ffff and u+fffe
 noncharacter(<<255, X, Rest/binary>>, Stack, Opts, Acc)
-        when X == 253; X == 254 ->
+        when X == 254; X == 255 ->
     string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
 %% surrogates
 noncharacter(<<X, _, Rest/binary>>, Stack, Opts, Acc)
@@ -503,12 +508,13 @@ noncharacter(Bin, _Stack, _Opts, _Acc) ->
 -endif.
 
 -ifdef(utf16le).
+%% codepoints below u+fffe that satisfy ?is_noncontrol are replaced with u+fffd
 noncharacter(<<S/utf16-little, Rest/binary>>, Stack, Opts, Acc)
-        when ?is_noncontrol(S) ->
+        when ?is_noncontrol(S), S < 16#fffe ->
     string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
 %% u+ffff and u+fffe
 noncharacter(<<X, 255, Rest/binary>>, Stack, Opts, Acc)
-        when X == 253; X == 254 ->
+        when X == 254; X == 255 ->
     string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
 %% surrogates
 noncharacter(<<_, X, Rest/binary>>, Stack, Opts, Acc)
@@ -519,6 +525,7 @@ noncharacter(Bin, _Stack, _Opts, _Acc) ->
 -endif.
 
 -ifdef(utf32).
+%% codepoints that satisfy ?is_noncontrol are replaced with u+fffd
 noncharacter(<<S/utf32, Rest/binary>>, Stack, Opts, Acc)
         when ?is_noncontrol(S) ->
     string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
@@ -535,6 +542,7 @@ noncharacter(Bin, _Stack, _Opts, _Acc) ->
 -endif.
 
 -ifdef(utf32le).
+%% codepoints that satisfy ?is_noncontrol are replaced with u+fffd
 noncharacter(<<S/utf32-little, Rest/binary>>, Stack, Opts, Acc)
         when ?is_noncontrol(S) ->
     string(Rest, Stack, Opts, <<Acc/binary, 16#fffd/utf8>>);
@@ -1006,7 +1014,6 @@ format_number({Int, [], Exp}) ->
 format_number({Int, Frac, Exp}) ->
     {float, list_to_float(lists:reverse(Exp ++ "e" ++ Frac ++ "." ++ Int))}.
             
-         
 
 
 tr(<<$r/?utfx, Rest/binary>>, Stack, Opts) ->
@@ -1146,4 +1153,175 @@ null(Bin, Stack, Opts) ->
                     null(<<Bin/binary, Stream/binary>>, Stack, Opts)
             end}
         ; false -> {error, {badjson, Bin}}
-    end.
\ No newline at end of file
+    end.
+    
+
+-ifdef(TEST).
+-include_lib("eunit/include/eunit.hrl").
+
+
+%% u+fffe/u+ffff: rejected by default, replaced by u+fffd with loose_unicode.
+%% the expected [] goes first so eunit reports the offenders as the value.
+noncharacters_test_() ->
+    [
+        {"noncharacters - badjson",
+            ?_assertEqual([], check_bad(noncharacters()))},
+        {"noncharacters - replaced",
+            ?_assertEqual([], check_replaced(noncharacters()))}
+    ].
+
+%% xfffe/xffff of planes 1-16: rejected by default; with loose_unicode the
+%% replacement expected depends on the encoding (check_extended_replaced/1).
+extended_noncharacters_test_() ->
+    [
+        {"extended noncharacters - badjson",
+            ?_assertEqual([], check_bad(extended_noncharacters()))},
+        {"extended noncharacters - replaced",
+            ?_assertEqual([], check_extended_replaced(extended_noncharacters()))}
+    ].
+
+%% u+d800..u+dfff: rejected by default, replaced by u+fffd with loose_unicode.
+%% the expected [] goes first so eunit reports the offenders as the value.
+surrogates_test_() ->
+    [
+        {"surrogates - badjson",
+            ?_assertEqual([], check_bad(surrogates()))},
+        {"surrogates - replaced",
+            ?_assertEqual([], check_replaced(surrogates()))}
+    ].
+
+%% u+0001..u+001f must all be rejected with default options (expected [] first)
+control_test_() ->
+    [
+        {"control characters - badjson",
+            ?_assertEqual([], check_bad(control_characters()))}
+    ].
+
+%% u+fdd0..u+fdef: rejected by default, replaced by u+fffd with loose_unicode.
+%% the expected [] goes first so eunit reports the offenders as the value.
+reserved_test_() ->
+    [
+        {"reserved noncharacters - badjson",
+            ?_assertEqual([], check_bad(reserved_space()))},
+        {"reserved noncharacters - replaced",
+            ?_assertEqual([], check_replaced(reserved_space()))}
+    ].
+
+%% u+0000 must be rejected with default options (expected [] first)
+zero_test_() ->
+    [
+        {"nullbyte - badjson",
+            ?_assertEqual([], check_bad(zero()))}
+    ].
+    
+%% every codepoint in good() and good_extended() must yield a string event
+%% first under default options; the expected [] goes first for eunit's report.
+good_characters_test_() ->
+    [
+        {"acceptable codepoints",
+            ?_assertEqual([], check_good(good()))},
+        {"acceptable extended",
+            ?_assertEqual([], check_good(good_extended()))}
+    ].
+    
+
+%% decodes List with default options and returns only the {Codepoint, Result}
+%% pairs that were not {error, badjson}, so a failed assertion shows just those
+check_bad(List) ->
+    [{C, R} || {C, R} <- check(List, [], []), R =/= {error, badjson}].
+
+%% loose_unicode decode; keeps only pairs whose first event is not a lone u+fffd
+check_replaced(List) ->
+    NotReplaced = fun({_, [{string, <<16#fffd/utf8>>}|_]}) -> false ; (_) -> true end,
+    lists:filter(NotReplaced, check(List, [loose_unicode], [])).
+
+%% loose_unicode decode; keeps pairs not equal to the expected string (utf16/utf16le: 2 u+fffd, else 1)
+check_extended_replaced(List) ->
+    Replace = case ?encoding of
+        E when E =:= utf16; E =:= utf16le -> <<16#fffd/utf8, 16#fffd/utf8>>;
+        _ -> <<16#fffd/utf8>>
+    end,
+    NotReplaced = fun({_, [{string, S}|_]}) -> S =/= Replace ; (_) -> true end,
+    lists:filter(NotReplaced, check(List, [loose_unicode], [])).
+
+%% default-options decode; keeps only pairs whose first event is not a string
+check_good(List) ->
+    NotString = fun({_, [{string, _}|_]}) -> false ; (_) -> true end,
+    lists:filter(NotString, check(List, [], [])).
+
+%% prepends {Codepoint, DecodeResult} to Acc for each codepoint of List, in order
+check(List, Opts, Acc) ->
+    Step = fun(C, A) -> [{C, decode(to_fake_utf(C, ?encoding), Opts)} | A] end,
+    lists:foldl(Step, Acc, List).
+
+
+%% feeds JSON to a decoder built from Opts and gathers the events via loop/2
+decode(JSON, Opts) ->
+    loop((decoder(Opts))(JSON), []).
+
+%% steps the decoder until end_json (events in order) or anything unexpected
+loop({jsx, end_json, _}, Events) -> lists:reverse(Events);
+loop({jsx, incomplete, Resume}, Events) -> loop(Resume(end_stream), Events);
+loop({jsx, Event, Resume}, Events) -> loop(Resume(), [Event | Events]);
+loop(_, _) -> {error, badjson}.
+    
+
+%% u+fffe and u+ffff; the per-plane equivalents are in extended_noncharacters/0
+noncharacters() -> [16#fffe, 16#ffff].
+    
+%% the last two codepoints (xfffe, xffff) of each plane from 1 to 16, in
+%% ascending order: 16#1fffe, 16#1ffff, 16#2fffe, ... 16#10fffe, 16#10ffff.
+%% the plane number occupies the bits above the low 16.
+extended_noncharacters() ->
+    [
+        (Plane bsl 16) bor Low
+        || Plane <- lists:seq(1, 16),
+           Low <- [16#fffe, 16#ffff]
+    ].
+
+surrogates() -> lists:seq(16#d800, 16#dfff).  % u+d800..u+dfff, 2048 codepoints
+
+control_characters() -> lists:seq(1, 31).  % u+0001..u+001f; u+0000 is zero/0
+
+reserved_space() -> lists:seq(16#fdd0, 16#fdef).  % u+fdd0..u+fdef, 32 codepoints
+
+zero() -> [0].  % u+0000 on its own
+
+%% every codepoint from u+0020 to u+fffd except `"` (16#22), `\` (16#5c),
+%% the surrogates() range and the reserved_space() range
+good() ->
+    lists:append([[32, 33], lists:seq(16#23, 16#5b), lists:seq(16#5d, 16#d7ff),
+        lists:seq(16#e000, 16#fdcf), lists:seq(16#fdf0, 16#fffd)]).
+            
+good_extended() -> lists:seq(16#100000, 16#10fffd).  % NOTE(review): only 16#100000 and up; 16#10000..16#fffff is not sampled
+
+%% erlang refuses to encode certain codepoints, so fake them all: lays out N by hand, wrapped in `"` (34)
+to_fake_utf(N, utf8) when N < 16#0080 -> <<34/utf8, N:8, 34/utf8>>;  % one byte
+to_fake_utf(N, utf8) when N < 16#0800 ->  % two bytes: 110yyyyy 10xxxxxx
+    <<0:5, Y:5, X:6>> = <<N:16>>,
+    <<34/utf8, 2#110:3, Y:5, 2#10:2, X:6, 34/utf8>>; 
+to_fake_utf(N, utf8) when N < 16#10000 ->  % three bytes: 1110zzzz 10yyyyyy 10xxxxxx
+    <<Z:4, Y:6, X:6>> = <<N:16>>,
+    <<34/utf8, 2#1110:4, Z:4, 2#10:2, Y:6, 2#10:2, X:6, 34/utf8>>;
+to_fake_utf(N, utf8) ->  % four bytes: 11110www 10zzzzzz 10yyyyyy 10xxxxxx
+    <<0:3, W:3, Z:6, Y:6, X:6>> = <<N:24>>,
+    <<34/utf8, 2#11110:5, W:3, 2#10:2, Z:6, 2#10:2, Y:6, 2#10:2, X:6, 34/utf8>>;
+%% utf16: below 16#10000 N is emitted as one raw 16 bit unit; otherwise erlang encodes it
+to_fake_utf(N, utf16) when N < 16#10000 -> <<34/utf16, N:16, 34/utf16>>;
+to_fake_utf(N, utf16) -> <<34/utf16, N/utf16, 34/utf16>>;
+%% utf16le: same split, with the two bytes of the raw unit swapped by hand
+to_fake_utf(N, utf16le) when N < 16#10000 ->
+    <<A:8, B:8>> = <<N:16>>,
+    <<34, 0, B:8, A:8, 34, 0>>;
+to_fake_utf(N, utf16le) -> <<34/utf16-little, N/utf16-little, 34/utf16-little>>;
+%% utf32: N as one raw 32 bit unit
+to_fake_utf(N, utf32) -> <<34/utf32, N:32, 34/utf32>>;
+%% utf32le: the four bytes of that unit in reverse order
+to_fake_utf(N, utf32le) ->
+    <<A:8, B:8, C:8, D:8>> = <<N:32>>,
+    <<34/utf32-little, D:8, C:8, B:8, A:8, 34/utf32-little>>.
+
+    
+
+
+-endif.
\ No newline at end of file