diff --git a/README.md b/README.md index bc7baad..5a3026a 100644 --- a/README.md +++ b/README.md @@ -430,7 +430,9 @@ additional options beyond these. see - `uescape` - escape all codepoints outside the ascii range for 7 bit clean output + escape all codepoints outside the ascii range for 7 bit clean output. note + this escaping takes place even if no other string escaping is requested (via + `escaped_strings`) - `unescaped_jsonp` diff --git a/src/jsx_decoder.erl b/src/jsx_decoder.erl index 58c039f..80e8749 100644 --- a/src/jsx_decoder.erl +++ b/src/jsx_decoder.erl @@ -310,6 +310,8 @@ string(<>, Handler, Acc, Stack, Config) -> string(Rest, Handler, [Acc, maybe_replace(?solidus, Config)], Stack, Config); string(<>, Handler, Acc, Stack, Config) -> unescape(Rest, Handler, Acc, Stack, Config); +string(<>, Handler, Acc, Stack, Config=#config{uescape=true}) when X >= 16#80 -> + string(Rest, Handler, [Acc, maybe_replace(X, Config)], Stack, Config); string(<>, Handler, Acc, Stack, Config) when X == 16#2028; X == 16#2029 -> string(Rest, Handler, [Acc, maybe_replace(X, Config)], Stack, Config); string(<<_/utf8, _/binary>> = Bin, Handler, Acc, Stack, Config) -> @@ -547,6 +549,7 @@ count(<<127, Rest/binary>>, N, Config) -> count(Rest, N + 1, Config); count(<<_, Rest/binary>>, N, Config=#config{dirty_strings=true}) -> count(Rest, N + 1, Config); +count(<<_/utf8, _/binary>>, N, #config{uescape=true}) -> N; count(<>, N, Config) when X < 16#800 -> count(Rest, N + 2, Config); count(<>, N, _) when X == 16#2028; X == 16#2029 -> N; @@ -694,13 +697,20 @@ maybe_replace(X, Config=#config{escaped_strings=true}) when X == 16#2028; X == end; maybe_replace(X, #config{escaped_strings=true}) when X < 32 -> json_escape_sequence(X); +%% escaped even if no other escaping requested! +maybe_replace(X, #config{uescape=true}) when X >= 16#80 -> + json_escape_sequence(X); maybe_replace(X, _Config) -> <>. %% convert a codepoint to it's \uXXXX equiv. json_escape_sequence(X) when X < 65536 -> <> = <>, - [$\\, $u, (to_hex(A)), (to_hex(B)), (to_hex(C)), (to_hex(D))]. + [$\\, $u, (to_hex(A)), (to_hex(B)), (to_hex(C)), (to_hex(D))]; +json_escape_sequence(X) -> + Adjusted = X - 16#10000, + <> = <>, + json_escape_sequence(A + 16#d800) ++ json_escape_sequence(B + 16#dc00). %% ascii "1" is [49], "2" is [50], etc... @@ -1524,6 +1534,30 @@ special_escape_test_() -> ]. +uescape_test_() -> + [ + {"\"\\u0080\"", ?_assertEqual( + [{string, <<"\\u0080">>}, end_json], + decode(<<34, 128/utf8, 34>>, [uescape]) + )}, + {"\"\\u8ca8\\u5481\\u3002\\u0091\\u0091\"", ?_assertEqual( + [{string, <<"\\u8ca8\\u5481\\u3002\\u0091\\u0091">>}, end_json], + decode( + <<34,232,178,168,229,146,129,227,128,130,194,145,194,145,34>>, + [uescape] + ) + )}, + {"\"\\ud834\\udd1e\"", ?_assertEqual( + [{string, <<"\\ud834\\udd1e">>}, end_json], + decode(<<34, 240, 157, 132, 158, 34>>, [uescape]) + )}, + {"\"\\ud83d\\ude0a\"", ?_assertEqual( + [{string, <<"\\ud83d\\ude0a">>}, end_json], + decode(<<34, 240, 159, 152, 138, 34>>, [uescape]) + )} + ]. + + single_quoted_string_test_() -> Cases = [ {"single quoted string", [{string, <<"hello world">>}, end_json], <<39, "hello world", 39>>},