diff --git a/README.md b/README.md index 3550b15..92b5622 100644 --- a/README.md +++ b/README.md @@ -250,7 +250,9 @@ to understand and they prevented evolution of the encoding and decoding code the shorthand form of property lists using atoms as properties so all properties must be tuples. all keys must be encoded as in `string` or as atoms or integers (which will be escaped and converted to binaries for - presentation to handlers). values should be valid json values + presentation to handlers). values should be valid json values. repeated + keys are tolerated in json text decoded to erlang terms but are not allowed + in erlang terms encoded to json ### incomplete input ### diff --git a/src/jsx_encoder.erl b/src/jsx_encoder.erl index 12e5866..c631efd 100644 --- a/src/jsx_encoder.erl +++ b/src/jsx_encoder.erl @@ -103,7 +103,7 @@ custom_error_handler_test_() -> parser(self(), [{error_handler, Error}]) )}, {"string error", ?_assertEqual( - {string, [{string, <<239, 191, 191>>}]}, + {value, [{string, <<239, 191, 191>>}]}, parser(<<239, 191, 191>>, [{error_handler, Error}, strict]) )} ]. diff --git a/src/jsx_parser.erl b/src/jsx_parser.erl index c5c23a2..1daa2ef 100644 --- a/src/jsx_parser.erl +++ b/src/jsx_parser.erl @@ -88,7 +88,7 @@ handle_event(Event, {Handler, State}, _Config) -> {Handler, Handler:handle_event value([start_object|Tokens], Handler, Stack, Config) -> - object(Tokens, handle_event(start_object, Handler, Config), [object|Stack], Config); + object(Tokens, handle_event(start_object, Handler, Config), [{object, sets:new()}|Stack], Config); value([start_array|Tokens], Handler, Stack, Config) -> array(Tokens, handle_event(start_array, Handler, Config), [array|Stack], Config); value([{literal, Literal}|Tokens], Handler, Stack, Config) when Literal == true; Literal == false; Literal == null -> @@ -108,10 +108,10 @@ value([Number|Tokens], Handler, Stack, Config) when is_integer(Number) -> value([Number|Tokens], Handler, Stack, Config) when is_float(Number) -> value([{float, Number}] ++ Tokens, Handler, Stack, Config); value([{string, String}|Tokens], Handler, Stack, Config) when is_binary(String) -> - case clean_string(String, Tokens, Handler, Stack, Config) of - Clean when is_binary(Clean) -> - maybe_done(Tokens, handle_event({string, Clean}, Handler, Config), Stack, Config); - Error -> Error + try clean_string(String, Config) of Clean -> + maybe_done(Tokens, handle_event({string, Clean}, Handler, Config), Stack, Config) + catch error:badarg -> + ?error(value, [{string, String}|Tokens], Handler, Stack, Config) end; value([String|Tokens], Handler, Stack, Config) when is_binary(String) -> value([{string, String}] ++ Tokens, Handler, Stack, Config); @@ -126,19 +126,26 @@ value(BadTokens, Handler, Stack, Config) when is_list(BadTokens) -> value(Token, Handler, Stack, Config) -> value([Token], Handler, Stack, Config). -object([end_object|Tokens], Handler, [object|Stack], Config) -> +object([end_object|Tokens], Handler, [{object, _}|Stack], Config) -> maybe_done(Tokens, handle_event(end_object, Handler, Config), Stack, Config); -object([{key, Key}|Tokens], Handler, Stack, Config) when is_atom(Key); is_binary(Key); is_integer(Key) -> - case clean_string(fix_key(Key), Tokens, Handler, Stack, Config) of - Clean when is_binary(Clean) -> - value(Tokens, handle_event({key, Clean}, Handler, Config), Stack, Config); - Error -> Error - end; -object([Key|Tokens], Handler, Stack, Config) when is_atom(Key); is_binary(Key); is_integer(Key) -> - case clean_string(fix_key(Key), Tokens, Handler, Stack, Config) of - Clean when is_binary(Clean) -> - value(Tokens, handle_event({key, Clean}, Handler, Config), Stack, Config); - Error -> Error +object([{key, Key}|Tokens], Handler, Stack, Config) +when is_atom(Key); is_binary(Key); is_integer(Key) -> + object([Key|Tokens], Handler, Stack, Config); +object([Key|Tokens], Handler, [{object, Keys}|Stack], Config) +when is_atom(Key); is_binary(Key); is_integer(Key) -> + try + CleanKey = clean_string(fix_key(Key), Config), + case sets:is_element(CleanKey, Keys) of true -> erlang:error(badarg); _ -> ok end, + CleanKey + of K -> + value( + Tokens, + handle_event({key, K}, Handler, Config), + [{object, sets:add_element(K, Keys)}|Stack], + Config + ) + catch error:badarg -> + ?error(object, [{string, Key}|Tokens], Handler, Stack, Config) end; object([], Handler, Stack, Config) -> incomplete(object, Handler, Stack, Config); @@ -156,7 +163,7 @@ array(Token, Handler, Stack, Config) -> maybe_done([end_json], Handler, [], Config) -> done([end_json], Handler, [], Config); -maybe_done(Tokens, Handler, [object|_] = Stack, Config) when is_list(Tokens) -> +maybe_done(Tokens, Handler, [{object, _}|_] = Stack, Config) when is_list(Tokens) -> object(Tokens, Handler, Stack, Config); maybe_done(Tokens, Handler, [array|_] = Stack, Config) when is_list(Tokens) -> array(Tokens, Handler, Stack, Config); @@ -183,15 +190,13 @@ fix_key(Key) when is_integer(Key) -> list_to_binary(integer_to_list(Key)); fix_key(Key) when is_binary(Key) -> Key. -clean_string(Bin, Tokens, Handler, Stack, Config) -> - case clean_string(Bin, Config) of - {error, badarg} -> ?error(string, [{string, Bin}|Tokens], Handler, Stack, Config); +clean_string(Bin, #config{dirty_strings=true}) -> Bin; +clean_string(Bin, Config) -> + case clean(Bin, [], Config) of + {error, badarg} -> erlang:error(badarg); String -> String end. -clean_string(Bin, #config{dirty_strings=true}) -> Bin; -clean_string(Bin, Config) -> clean(Bin, [], Config). - %% escape and/or replace bad codepoints if requested clean(<<>>, Acc, _Config) -> unicode:characters_to_binary(lists:reverse(Acc)); @@ -484,7 +489,7 @@ custom_error_handler_test_() -> parse([{string, <<"">>}, {literal, true}, end_json], [{error_handler, Error}]) )}, {"string error", ?_assertEqual( - {string, [{string, <<239, 191, 191>>}, end_json]}, + {value, [{string, <<239, 191, 191>>}, end_json]}, parse([{string, <<239, 191, 191>>}, end_json], [{error_handler, Error}, strict]) )} ]. @@ -579,6 +584,10 @@ extended_noncharacters() -> ++ [16#ffffe, 16#fffff, 16#10fffe, 16#10ffff] ]. +clean_string_helper(String) -> + try clean_string(String, #config{strict_utf8=true}) of Clean -> Clean + catch error:badarg -> {error, badarg} + end. clean_string_test_() -> [ @@ -600,19 +609,19 @@ clean_string_test_() -> )}, {"error reserved space", ?_assertEqual( lists:duplicate(length(reserved_space()), {error, badarg}), - lists:map(fun(Codepoint) -> clean_string(Codepoint, #config{strict_utf8=true}) end, reserved_space()) + lists:map(fun(Codepoint) -> clean_string_helper(Codepoint) end, reserved_space()) )}, {"error surrogates", ?_assertEqual( lists:duplicate(length(surrogates()), {error, badarg}), - lists:map(fun(Codepoint) -> clean_string(Codepoint, #config{strict_utf8=true}) end, surrogates()) + lists:map(fun(Codepoint) -> clean_string_helper(Codepoint) end, surrogates()) )}, {"error noncharacters", ?_assertEqual( lists:duplicate(length(noncharacters()), {error, badarg}), - lists:map(fun(Codepoint) -> clean_string(Codepoint, #config{strict_utf8=true}) end, noncharacters()) + lists:map(fun(Codepoint) -> clean_string_helper(Codepoint) end, noncharacters()) )}, {"error extended noncharacters", ?_assertEqual( lists:duplicate(length(extended_noncharacters()), {error, badarg}), - lists:map(fun(Codepoint) -> clean_string(Codepoint, #config{strict_utf8=true}) end, extended_noncharacters()) + lists:map(fun(Codepoint) -> clean_string_helper(Codepoint) end, extended_noncharacters()) )}, {"clean reserved space", ?_assertEqual( lists:duplicate(length(reserved_space()), <<16#fffd/utf8>>), @@ -804,80 +813,80 @@ escape_test_() -> bad_utf8_test_() -> [ - {"noncharacter u+fffe", ?_assertEqual( - {error, badarg}, + {"noncharacter u+fffe", ?_assertError( + badarg, clean_string(to_fake_utf8(16#fffe), #config{strict_utf8=true}) )}, {"noncharacter u+fffe replaced", ?_assertEqual( <<16#fffd/utf8>>, clean_string(to_fake_utf8(16#fffe), #config{}) )}, - {"noncharacter u+ffff", ?_assertEqual( - {error, badarg}, + {"noncharacter u+ffff", ?_assertError( + badarg, clean_string(to_fake_utf8(16#ffff), #config{strict_utf8=true}) )}, {"noncharacter u+ffff replaced", ?_assertEqual( <<16#fffd/utf8>>, clean_string(to_fake_utf8(16#ffff), #config{}) )}, - {"orphan continuation byte u+0080", ?_assertEqual( - {error, badarg}, + {"orphan continuation byte u+0080", ?_assertError( + badarg, clean_string(<<16#0080>>, #config{strict_utf8=true}) )}, {"orphan continuation byte u+0080 replaced", ?_assertEqual( <<16#fffd/utf8>>, clean_string(<<16#0080>>, #config{}) )}, - {"orphan continuation byte u+00bf", ?_assertEqual( - {error, badarg}, + {"orphan continuation byte u+00bf", ?_assertError( + badarg, clean_string(<<16#00bf>>, #config{strict_utf8=true}) )}, {"orphan continuation byte u+00bf replaced", ?_assertEqual( <<16#fffd/utf8>>, clean_string(<<16#00bf>>, #config{}) )}, - {"2 continuation bytes", ?_assertEqual( - {error, badarg}, + {"2 continuation bytes", ?_assertError( + badarg, clean_string(<<(binary:copy(<<16#0080>>, 2))/binary>>, #config{strict_utf8=true}) )}, {"2 continuation bytes replaced", ?_assertEqual( binary:copy(<<16#fffd/utf8>>, 2), clean_string(<<(binary:copy(<<16#0080>>, 2))/binary>>, #config{}) )}, - {"3 continuation bytes", ?_assertEqual( - {error, badarg}, + {"3 continuation bytes", ?_assertError( + badarg, clean_string(<<(binary:copy(<<16#0080>>, 3))/binary>>, #config{strict_utf8=true}) )}, {"3 continuation bytes replaced", ?_assertEqual( binary:copy(<<16#fffd/utf8>>, 3), clean_string(<<(binary:copy(<<16#0080>>, 3))/binary>>, #config{}) )}, - {"4 continuation bytes", ?_assertEqual( - {error, badarg}, + {"4 continuation bytes", ?_assertError( + badarg, clean_string(<<(binary:copy(<<16#0080>>, 4))/binary>>, #config{strict_utf8=true}) )}, {"4 continuation bytes replaced", ?_assertEqual( binary:copy(<<16#fffd/utf8>>, 4), clean_string(<<(binary:copy(<<16#0080>>, 4))/binary>>, #config{}) )}, - {"5 continuation bytes", ?_assertEqual( - {error, badarg}, + {"5 continuation bytes", ?_assertError( + badarg, clean_string(<<(binary:copy(<<16#0080>>, 5))/binary>>, #config{strict_utf8=true}) )}, {"5 continuation bytes replaced", ?_assertEqual( binary:copy(<<16#fffd/utf8>>, 5), clean_string(<<(binary:copy(<<16#0080>>, 5))/binary>>, #config{}) )}, - {"6 continuation bytes", ?_assertEqual( - {error, badarg}, + {"6 continuation bytes", ?_assertError( + badarg, clean_string(<<(binary:copy(<<16#0080>>, 6))/binary>>, #config{strict_utf8=true}) )}, {"6 continuation bytes replaced", ?_assertEqual( binary:copy(<<16#fffd/utf8>>, 6), clean_string(<<(binary:copy(<<16#0080>>, 6))/binary>>, #config{}) )}, - {"all continuation bytes", ?_assertEqual( - {error, badarg}, + {"all continuation bytes", ?_assertError( + badarg, clean_string(<<(list_to_binary(lists:seq(16#0080, 16#00bf)))/binary>>, #config{strict_utf8=true}) )}, {"all continuation bytes replaced", ?_assertEqual( @@ -887,104 +896,104 @@ bad_utf8_test_() -> #config{} ) )}, - {"lonely start byte", ?_assertEqual( - {error, badarg}, + {"lonely start byte", ?_assertError( + badarg, clean_string(<<16#00c0>>, #config{strict_utf8=true}) )}, {"lonely start byte replaced", ?_assertEqual( <<16#fffd/utf8>>, clean_string(<<16#00c0>>, #config{}) )}, - {"lonely start bytes (2 byte)", ?_assertEqual( - {error, badarg}, + {"lonely start bytes (2 byte)", ?_assertError( + badarg, clean_string(<<16#00c0, 32, 16#00df>>, #config{strict_utf8=true}) )}, {"lonely start bytes (2 byte) replaced", ?_assertEqual( <<16#fffd/utf8, 32, 16#fffd/utf8>>, clean_string(<<16#00c0, 32, 16#00df>>, #config{}) )}, - {"lonely start bytes (3 byte)", ?_assertEqual( - {error, badarg}, + {"lonely start bytes (3 byte)", ?_assertError( + badarg, clean_string(<<16#00e0, 32, 16#00ef>>, #config{strict_utf8=true}) )}, {"lonely start bytes (3 byte) replaced", ?_assertEqual( <<16#fffd/utf8, 32, 16#fffd/utf8>>, clean_string(<<16#00e0, 32, 16#00ef>>, #config{}) )}, - {"lonely start bytes (4 byte)", ?_assertEqual( - {error, badarg}, + {"lonely start bytes (4 byte)", ?_assertError( + badarg, clean_string(<<16#00f0, 32, 16#00f7>>, #config{strict_utf8=true}) )}, {"lonely start bytes (4 byte) replaced", ?_assertEqual( <<16#fffd/utf8, 32, 16#fffd/utf8>>, clean_string(<<16#00f0, 32, 16#00f7>>, #config{}) )}, - {"missing continuation byte (3 byte)", ?_assertEqual( - {error, badarg}, + {"missing continuation byte (3 byte)", ?_assertError( + badarg, clean_string(<<224, 160, 32>>, #config{strict_utf8=true}) )}, {"missing continuation byte (3 byte) replaced", ?_assertEqual( <<16#fffd/utf8, 32>>, clean_string(<<224, 160, 32>>, #config{}) )}, - {"missing continuation byte (4 byte missing one)", ?_assertEqual( - {error, badarg}, + {"missing continuation byte (4 byte missing one)", ?_assertError( + badarg, clean_string(<<240, 144, 128, 32>>, #config{strict_utf8=true}) )}, {"missing continuation byte (4 byte missing one) replaced", ?_assertEqual( <<16#fffd/utf8, 32>>, clean_string(<<240, 144, 128, 32>>, #config{}) )}, - {"missing continuation byte (4 byte missing two)", ?_assertEqual( - {error, badarg}, + {"missing continuation byte (4 byte missing two)", ?_assertError( + badarg, clean_string(<<240, 144, 32>>, #config{strict_utf8=true}) )}, {"missing continuation byte (4 byte missing two) replaced", ?_assertEqual( <<16#fffd/utf8, 32>>, clean_string(<<240, 144, 32>>, #config{}) )}, - {"overlong encoding of u+002f (2 byte)", ?_assertEqual( - {error, badarg}, + {"overlong encoding of u+002f (2 byte)", ?_assertError( + badarg, clean_string(<<16#c0, 16#af, 32>>, #config{strict_utf8=true}) )}, {"overlong encoding of u+002f (2 byte) replaced", ?_assertEqual( <<16#fffd/utf8, 32>>, clean_string(<<16#c0, 16#af, 32>>, #config{}) )}, - {"overlong encoding of u+002f (3 byte)", ?_assertEqual( - {error, badarg}, + {"overlong encoding of u+002f (3 byte)", ?_assertError( + badarg, clean_string(<<16#e0, 16#80, 16#af, 32>>, #config{strict_utf8=true}) )}, {"overlong encoding of u+002f (3 byte) replaced", ?_assertEqual( <<16#fffd/utf8, 32>>, clean_string(<<16#e0, 16#80, 16#af, 32>>, #config{}) )}, - {"overlong encoding of u+002f (4 byte)", ?_assertEqual( - {error, badarg}, + {"overlong encoding of u+002f (4 byte)", ?_assertError( + badarg, clean_string(<<16#f0, 16#80, 16#80, 16#af, 32>>, #config{strict_utf8=true}) )}, {"overlong encoding of u+002f (4 byte) replaced", ?_assertEqual( <<16#fffd/utf8, 32>>, clean_string(<<16#f0, 16#80, 16#80, 16#af, 32>>, #config{}) )}, - {"highest overlong 2 byte sequence", ?_assertEqual( - {error, badarg}, + {"highest overlong 2 byte sequence", ?_assertError( + badarg, clean_string(<<16#c1, 16#bf, 32>>, #config{strict_utf8=true}) )}, {"highest overlong 2 byte sequence replaced", ?_assertEqual( <<16#fffd/utf8, 32>>, clean_string(<<16#c1, 16#bf, 32>>, #config{}) )}, - {"highest overlong 3 byte sequence", ?_assertEqual( - {error, badarg}, + {"highest overlong 3 byte sequence", ?_assertError( + badarg, clean_string(<<16#e0, 16#9f, 16#bf, 32>>, #config{strict_utf8=true}) )}, {"highest overlong 3 byte sequence replaced", ?_assertEqual( <<16#fffd/utf8, 32>>, clean_string(<<16#e0, 16#9f, 16#bf, 32>>, #config{}) )}, - {"highest overlong 4 byte sequence", ?_assertEqual( - {error, badarg}, + {"highest overlong 4 byte sequence", ?_assertError( + badarg, clean_string(<<16#f0, 16#8f, 16#bf, 16#bf, 32>>, #config{strict_utf8=true}) )}, {"highest overlong 4 byte sequence replaced", ?_assertEqual( @@ -1009,4 +1018,14 @@ fix_key_test_() -> {"integer key", ?_assertEqual(fix_key(123), <<"123">>)} ]. + +repeated_key_test_() -> + Parse = fun(Events, Config) -> (parser(?MODULE, [], Config))(Events ++ [end_json]) end, + [ + {"repeated key", ?_assertError( + badarg, + Parse([start_object, <<"key">>, true, <<"key">>, true, end_object], []) + )} + ]. + -endif.