diff --git a/src/jsx_utf16.erl b/src/jsx_utf16.erl
index b9822e0..281d08b 100644
--- a/src/jsx_utf16.erl
+++ b/src/jsx_utf16.erl
@@ -262,8 +262,15 @@
 escaped_unicode(<>, [C, B, A]) when ?is_hex(D) ->
     case erlang:list_to_integer([A, B, C, D], 16) of
-        X when X < 16#d800; X > 16#dfff, X < 16#fffe ->
-            string(Rest, Stack, Callbacks, Opts, [X] ++ String)
+        X when X >= 16#dc00, X =< 16#dfff ->
+            case check_acc_for_surrogate(String) of
+                false ->
+                    string(Rest, Stack, Callbacks, Opts, [D, C, B, A, $u, ?rsolidus] ++ String)
+                ; {Y, NewString} ->
+                    string(Rest, Stack, Callbacks, Opts, [surrogate_to_codepoint(Y, X)] ++ NewString)
+            end
+        ; X when X < 16#d800; X > 16#dfff, X < 16#fffe ->
+            string(Rest, Stack, Callbacks, Opts, [X] ++ String)
         ; _ ->
             string(Rest, Stack, Callbacks, Opts, [D, C, B, A, $u, ?rsolidus] ++ String)
     end;
@@ -273,6 +280,25 @@ escaped_unicode(<>, Stack, Callbacks, Opts, String, Ac
     escaped_unicode(Rest, Stack, Callbacks, Opts, String, [S] ++ Acc);
 escaped_unicode(Bin, Stack, Callbacks, Opts, String, Acc) when byte_size(Bin) < 2 ->
     {incomplete, fun(Stream) -> escaped_unicode(<>, Stack, Callbacks, Opts, String, Acc) end}.
+
+%% upon encountering an escaped low surrogate, check whether the head of the (reversed)
+%% accumulator holds an escaped high surrogate; returns {High, Rest} if so, otherwise false.
+
+check_acc_for_surrogate([D, C, B, A, $u, ?rsolidus|Rest])
+        when ?is_hex(D), ?is_hex(C), ?is_hex(B), ?is_hex(A) ->
+    case erlang:list_to_integer([A, B, C, D], 16) of
+        X when X >= 16#d800, X =< 16#dbff ->
+            {X, Rest};
+        _ ->
+            false
+    end;
+check_acc_for_surrogate(_) ->
+    false.
+
+%% combine a utf-16 surrogate pair into a codepoint (formula from the unicode spec); high surrogate first
+
+surrogate_to_codepoint(High, Low) ->
+    (High - 16#d800) * 16#400 + (Low - 16#dc00) + 16#10000.
%% like strings, numbers are collected in an intermediate accumulator before
diff --git a/src/jsx_utf16le.erl b/src/jsx_utf16le.erl
index 7a78ff6..e2decb3 100644
--- a/src/jsx_utf16le.erl
+++ b/src/jsx_utf16le.erl
@@ -262,8 +262,15 @@
 escaped_unicode(<>, [C, B, A]) when ?is_hex(D) ->
     case erlang:list_to_integer([A, B, C, D], 16) of
-        X when X < 16#d800; X > 16#dfff, X < 16#fffe ->
-            string(Rest, Stack, Callbacks, Opts, [X] ++ String)
+        X when X >= 16#dc00, X =< 16#dfff ->
+            case check_acc_for_surrogate(String) of
+                false ->
+                    string(Rest, Stack, Callbacks, Opts, [D, C, B, A, $u, ?rsolidus] ++ String)
+                ; {Y, NewString} ->
+                    string(Rest, Stack, Callbacks, Opts, [surrogate_to_codepoint(Y, X)] ++ NewString)
+            end
+        ; X when X < 16#d800; X > 16#dfff, X < 16#fffe ->
+            string(Rest, Stack, Callbacks, Opts, [X] ++ String)
         ; _ ->
             string(Rest, Stack, Callbacks, Opts, [D, C, B, A, $u, ?rsolidus] ++ String)
     end;
@@ -274,6 +281,25 @@ escaped_unicode(<>, Stack, Callbacks, Opts, String, Ac
 escaped_unicode(Bin, Stack, Callbacks, Opts, String, Acc) when byte_size(Bin) < 2 ->
     {incomplete, fun(Stream) -> escaped_unicode(<>, Stack, Callbacks, Opts, String, Acc) end}.
+%% upon encountering an escaped low surrogate, check whether the head of the (reversed)
+%% accumulator holds an escaped high surrogate; returns {High, Rest} if so, otherwise false.
+
+check_acc_for_surrogate([D, C, B, A, $u, ?rsolidus|Rest])
+        when ?is_hex(D), ?is_hex(C), ?is_hex(B), ?is_hex(A) ->
+    case erlang:list_to_integer([A, B, C, D], 16) of
+        X when X >= 16#d800, X =< 16#dbff ->
+            {X, Rest};
+        _ ->
+            false
+    end;
+check_acc_for_surrogate(_) ->
+    false.
+
+%% combine a utf-16 surrogate pair into a codepoint (formula from the unicode spec); high surrogate first
+
+surrogate_to_codepoint(High, Low) ->
+    (High - 16#d800) * 16#400 + (Low - 16#dc00) + 16#10000.
+
 %% like strings, numbers are collected in an intermediate accumulator before
 %% being emitted to the callback handler.
no processing of numbers is done in
diff --git a/src/jsx_utf32.erl b/src/jsx_utf32.erl
index 0264d44..95a1a27 100644
--- a/src/jsx_utf32.erl
+++ b/src/jsx_utf32.erl
@@ -249,8 +249,15 @@
 escaped_unicode(<>, [C, B, A]) when ?is_hex(D) ->
     case erlang:list_to_integer([A, B, C, D], 16) of
-        X when X < 16#d800; X > 16#dfff, X < 16#fffe ->
-            string(Rest, Stack, Callbacks, Opts, [X] ++ String)
+        X when X >= 16#dc00, X =< 16#dfff ->
+            case check_acc_for_surrogate(String) of
+                false ->
+                    string(Rest, Stack, Callbacks, Opts, [D, C, B, A, $u, ?rsolidus] ++ String)
+                ; {Y, NewString} ->
+                    string(Rest, Stack, Callbacks, Opts, [surrogate_to_codepoint(Y, X)] ++ NewString)
+            end
+        ; X when X < 16#d800; X > 16#dfff, X < 16#fffe ->
+            string(Rest, Stack, Callbacks, Opts, [X] ++ String)
         ; _ ->
             string(Rest, Stack, Callbacks, Opts, [D, C, B, A, $u, ?rsolidus] ++ String)
     end;
@@ -261,6 +268,25 @@ escaped_unicode(<>, Stack, Callbacks, Opts, String, Ac
 escaped_unicode(Bin, Stack, Callbacks, Opts, String, Acc) when byte_size(Bin) < 4 ->
     {incomplete, fun(Stream) -> escaped_unicode(<>, Stack, Callbacks, Opts, String, Acc) end}.
+%% upon encountering an escaped low surrogate, check whether the head of the (reversed)
+%% accumulator holds an escaped high surrogate; returns {High, Rest} if so, otherwise false.
+
+check_acc_for_surrogate([D, C, B, A, $u, ?rsolidus|Rest])
+        when ?is_hex(D), ?is_hex(C), ?is_hex(B), ?is_hex(A) ->
+    case erlang:list_to_integer([A, B, C, D], 16) of
+        X when X >= 16#d800, X =< 16#dbff ->
+            {X, Rest};
+        _ ->
+            false
+    end;
+check_acc_for_surrogate(_) ->
+    false.
+
+%% combine a utf-16 surrogate pair into a codepoint (formula from the unicode spec); high surrogate first
+
+surrogate_to_codepoint(High, Low) ->
+    (High - 16#d800) * 16#400 + (Low - 16#dc00) + 16#10000.
+
 %% like strings, numbers are collected in an intermediate accumulator before
 %% being emitted to the callback handler.
no processing of numbers is done in
@@ -482,4 +508,4 @@ comment(Bin, Resume) when byte_size(Bin) < 4 ->
 maybe_comment_done(<>, Resume) ->
     Resume(Rest);
 maybe_comment_done(Bin, Resume) when byte_size(Bin) < 4 ->
-    {incomplete, fun(Stream) -> maybe_comment_done(<>, Resume) end}.< 4
\ No newline at end of file
+    {incomplete, fun(Stream) -> maybe_comment_done(<>, Resume) end}.
\ No newline at end of file
diff --git a/src/jsx_utf32le.erl b/src/jsx_utf32le.erl
index a366088..6ce96bf 100644
--- a/src/jsx_utf32le.erl
+++ b/src/jsx_utf32le.erl
@@ -249,8 +249,15 @@
 escaped_unicode(<>, [C, B, A]) when ?is_hex(D) ->
     case erlang:list_to_integer([A, B, C, D], 16) of
-        X when X < 16#d800; X > 16#dfff, X < 16#fffe ->
-            string(Rest, Stack, Callbacks, Opts, [X] ++ String)
+        X when X >= 16#dc00, X =< 16#dfff ->
+            case check_acc_for_surrogate(String) of
+                false ->
+                    string(Rest, Stack, Callbacks, Opts, [D, C, B, A, $u, ?rsolidus] ++ String)
+                ; {Y, NewString} ->
+                    string(Rest, Stack, Callbacks, Opts, [surrogate_to_codepoint(Y, X)] ++ NewString)
+            end
+        ; X when X < 16#d800; X > 16#dfff, X < 16#fffe ->
+            string(Rest, Stack, Callbacks, Opts, [X] ++ String)
         ; _ ->
             string(Rest, Stack, Callbacks, Opts, [D, C, B, A, $u, ?rsolidus] ++ String)
     end;
@@ -261,6 +268,25 @@ escaped_unicode(<>, Stack, Callbacks, Opts, String, Ac
 escaped_unicode(Bin, Stack, Callbacks, Opts, String, Acc) when byte_size(Bin) < 4 ->
     {incomplete, fun(Stream) -> escaped_unicode(<>, Stack, Callbacks, Opts, String, Acc) end}.
+%% upon encountering an escaped low surrogate, check whether the head of the (reversed)
+%% accumulator holds an escaped high surrogate; returns {High, Rest} if so, otherwise false.
+
+check_acc_for_surrogate([D, C, B, A, $u, ?rsolidus|Rest])
+        when ?is_hex(D), ?is_hex(C), ?is_hex(B), ?is_hex(A) ->
+    case erlang:list_to_integer([A, B, C, D], 16) of
+        X when X >= 16#d800, X =< 16#dbff ->
+            {X, Rest};
+        _ ->
+            false
+    end;
+check_acc_for_surrogate(_) ->
+    false.
+
+%% combine a utf-16 surrogate pair into a codepoint (formula from the unicode spec); high surrogate first
+
+surrogate_to_codepoint(High, Low) ->
+    (High - 16#d800) * 16#400 + (Low - 16#dc00) + 16#10000.
+
 %% like strings, numbers are collected in an intermediate accumulator before
 %% being emitted to the callback handler. no processing of numbers is done in
diff --git a/src/jsx_utf8.erl b/src/jsx_utf8.erl
index 140159b..d7bca84 100644
--- a/src/jsx_utf8.erl
+++ b/src/jsx_utf8.erl
@@ -273,8 +273,15 @@
 escaped_unicode(<>, [C, B, A]) when ?is_hex(D) ->
     case erlang:list_to_integer([A, B, C, D], 16) of
-        X when X < 16#d800; X > 16#dfff, X < 16#fffe ->
-            string(Rest, Stack, Callbacks, Opts, [X] ++ String)
+        X when X >= 16#dc00, X =< 16#dfff ->
+            case check_acc_for_surrogate(String) of
+                false ->
+                    string(Rest, Stack, Callbacks, Opts, [D, C, B, A, $u, ?rsolidus] ++ String)
+                ; {Y, NewString} ->
+                    string(Rest, Stack, Callbacks, Opts, [surrogate_to_codepoint(Y, X)] ++ NewString)
+            end
+        ; X when X < 16#d800; X > 16#dfff, X < 16#fffe ->
+            string(Rest, Stack, Callbacks, Opts, [X] ++ String)
         ; _ ->
             string(Rest, Stack, Callbacks, Opts, [D, C, B, A, $u, ?rsolidus] ++ String)
     end;
@@ -285,6 +292,25 @@ escaped_unicode(<>, Stack, Callbacks, Opts, String, Ac
 escaped_unicode(<<>>, Stack, Callbacks, Opts, String, Acc) ->
     {incomplete, fun(Stream) -> escaped_unicode(Stream, Stack, Callbacks, Opts, String, Acc) end}.
+%% upon encountering an escaped low surrogate, check whether the head of the (reversed)
+%% accumulator holds an escaped high surrogate; returns {High, Rest} if so, otherwise false.
+
+check_acc_for_surrogate([D, C, B, A, $u, ?rsolidus|Rest])
+        when ?is_hex(D), ?is_hex(C), ?is_hex(B), ?is_hex(A) ->
+    case erlang:list_to_integer([A, B, C, D], 16) of
+        X when X >= 16#d800, X =< 16#dbff ->
+            {X, Rest};
+        _ ->
+            false
+    end;
+check_acc_for_surrogate(_) ->
+    false.
+
+%% combine a utf-16 surrogate pair into a codepoint (formula from the unicode spec); high surrogate first
+
+surrogate_to_codepoint(High, Low) ->
+    (High - 16#d800) * 16#400 + (Low - 16#dc00) + 16#10000.
+
 %% like strings, numbers are collected in an intermediate accumulator before
 %% being emitted to the callback handler.
no processing of numbers is done in