From 95a87fa0d7c4b75d0828ecdd55d27f6afd370f70 Mon Sep 17 00:00:00 2001 From: alisdair sullivan Date: Mon, 24 Nov 2014 17:59:41 -0800 Subject: [PATCH] add `uescape` option for 7 bit clean output of strings --- src/jsx_config.erl | 13 ++++++++++--- src/jsx_config.hrl | 1 + src/jsx_parser.erl | 38 ++++++++++++++++++++++++++++++++++---- 3 files changed, 45 insertions(+), 7 deletions(-) diff --git a/src/jsx_config.erl b/src/jsx_config.erl index 7aaa654..f7b8d7a 100644 --- a/src/jsx_config.erl +++ b/src/jsx_config.erl @@ -65,6 +65,8 @@ parse_config([dirty_strings|Rest], Config) -> parse_config(Rest, Config#config{dirty_strings=true}); parse_config([repeat_keys|Rest], Config) -> parse_config(Rest, Config#config{repeat_keys=true}); +parse_config([uescape|Rest], Config) -> + parse_config(Rest, Config#config{uescape=true}); parse_config([strict|Rest], Config) -> parse_config(Rest, Config#config{strict_comments=true, strict_commas=true, @@ -151,6 +153,7 @@ valid_flags() -> repeat_keys, strict, stream, + uescape, error_handler, incomplete_handler ]. 
@@ -193,7 +196,8 @@ config_test_() -> strict_utf8 = true, strict_single_quotes = true, strict_escapes = true, - stream = true + stream = true, + uescape = true }, parse_config([escaped_forward_slashes, escaped_strings, @@ -201,7 +205,8 @@ config_test_() -> dirty_strings, repeat_keys, strict, - stream + stream, + uescape ]) ) }, @@ -271,6 +276,7 @@ config_to_list_test_() -> dirty_strings, repeat_keys, stream, + uescape, strict ], config_to_list( @@ -283,7 +289,8 @@ config_to_list_test_() -> strict_utf8 = true, strict_single_quotes = true, strict_escapes = true, - stream = true + stream = true, + uescape = true } ) )}, diff --git a/src/jsx_config.hrl b/src/jsx_config.hrl index 3ffb084..e72247a 100644 --- a/src/jsx_config.hrl +++ b/src/jsx_config.hrl @@ -10,6 +10,7 @@ strict_single_quotes = false :: boolean(), strict_escapes = false :: boolean(), stream = false :: boolean(), + uescape = false :: boolean(), error_handler = false :: false | jsx_config:handler(), incomplete_handler = false :: false | jsx_config:handler() }). 
diff --git a/src/jsx_parser.erl b/src/jsx_parser.erl index 7b88195..c2969d3 100644 --- a/src/jsx_parser.erl +++ b/src/jsx_parser.erl @@ -351,6 +351,8 @@ clean(<<124, Rest/binary>>, Acc, Config) -> clean(Rest, [124] ++ Acc, Config); clean(<<125, Rest/binary>>, Acc, Config) -> clean(Rest, [125] ++ Acc, Config); clean(<<126, Rest/binary>>, Acc, Config) -> clean(Rest, [126] ++ Acc, Config); clean(<<127, Rest/binary>>, Acc, Config) -> clean(Rest, [127] ++ Acc, Config); +clean(<<X/utf8, Rest/binary>>, Acc, Config=#config{uescape=true}) -> + maybe_replace(X, Rest, Acc, Config); clean(<<X/utf8, Rest/binary>>, Acc, Config) when X == 16#2028; X == 16#2029 -> maybe_replace(X, Rest, Acc, Config); clean(<<X/utf8, Rest/binary>>, Acc, Config) when X < 16#d800 -> @@ -439,13 +441,15 @@ maybe_replace($/, Rest, Acc, Config=#config{escaped_strings=true}) -> end; maybe_replace($\\, Rest, Acc, Config=#config{escaped_strings=true}) -> clean(Rest, [$\\, $\\] ++ Acc, Config); +maybe_replace(X, Rest, Acc, Config=#config{escaped_strings=true}) when X < 32 -> + clean(Rest, lists:reverse(json_escape_sequence(X)) ++ Acc, Config); +maybe_replace(X, Rest, Acc, Config=#config{uescape=true}) when is_integer(X) -> + clean(Rest, lists:reverse(json_escape_sequence(X)) ++ Acc, Config); maybe_replace(X, Rest, Acc, Config=#config{escaped_strings=true}) when X == 16#2028; X == 16#2029 -> case Config#config.unescaped_jsonp of true -> clean(Rest, [X] ++ Acc, Config); false -> clean(Rest, lists:reverse(json_escape_sequence(X)) ++ Acc, Config) end; -maybe_replace(X, Rest, Acc, Config=#config{escaped_strings=true}) when X < 32 -> - clean(Rest, lists:reverse(json_escape_sequence(X)) ++ Acc, Config); maybe_replace(Atom, _, _, #config{strict_utf8=true}) when is_atom(Atom) -> {error, badarg}; maybe_replace(noncharacter, Rest, Acc, Config) -> clean(Rest, [16#fffd] ++ Acc, Config); maybe_replace(surrogate, Rest, Acc, Config) -> clean(Rest, [16#fffd] ++ Acc, Config); @@ -454,9 +458,13 @@ maybe_replace(X, Rest, Acc, Config) -> clean(Rest, [X] ++ Acc, Config). 
%% convert a codepoint to it's \uXXXX equiv. -json_escape_sequence(X) -> +json_escape_sequence(X) when X < 65536 -> <<A:4, B:4, C:4, D:4>> = <<X:16>>, - [$\\, $u, (to_hex(A)), (to_hex(B)), (to_hex(C)), (to_hex(D))]. + [$\\, $u, (to_hex(A)), (to_hex(B)), (to_hex(C)), (to_hex(D))]; +json_escape_sequence(X) -> + Adjusted = X - 16#10000, + <<A:10, B:10>> = <<Adjusted:20>>, + json_escape_sequence(A + 16#d800) ++ json_escape_sequence(B + 16#dc00). to_hex(10) -> $a; @@ -1033,6 +1041,28 @@ json_escape_sequence_test_() -> {"json escape sequence test - 16#def", ?_assertEqual(json_escape_sequence(16#def), "\\u0def")} ]. +uescaped_test_() -> + [ + {"\"\\u0080\"", ?_assertEqual( + <<"\\u0080">>, + clean_string(<<128/utf8>>, #config{uescape=true}) + )}, + {"\"\\u8ca8\\u5481\\u3002\\u0091\\u0091\"", ?_assertEqual( + <<"\\u8ca8\\u5481\\u3002\\u0091\\u0091">>, + clean_string( + <<232,178,168,229,146,129,227,128,130,194,145,194,145>>, + #config{uescape=true} + ) + )}, + {"\"\\ud834\\udd1e\"", ?_assertEqual( + <<"\\ud834\\udd1e">>, + clean_string(<<240, 157, 132, 158>>, #config{uescape=true}) + )}, + {"\"\\ud83d\\ude0a\"", ?_assertEqual( + <<"\\ud83d\\ude0a">>, + clean_string(<<240, 159, 152, 138>>, #config{uescape=true}) + )} + ]. fix_key_test_() -> [