From e22ec340008052f155aebd87817c2a67d806cecd Mon Sep 17 00:00:00 2001 From: alisdair sullivan Date: Sat, 16 Apr 2011 22:18:59 -0700 Subject: [PATCH] moves encoding detection to jsx_utils --- src/jsx.erl | 93 +---------------------------------------------- src/jsx_utils.erl | 89 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 89 insertions(+), 93 deletions(-) diff --git a/src/jsx.erl b/src/jsx.erl index 76a6592..412deb5 100644 --- a/src/jsx.erl +++ b/src/jsx.erl @@ -56,7 +56,7 @@ parser(OptsList) -> ; utf32 -> jsx_utf32:parser(OptsList) ; {utf16, little} -> jsx_utf16le:parser(OptsList) ; {utf32, little} -> jsx_utf32le:parser(OptsList) - ; auto -> detect_encoding(OptsList) + ; auto -> jsx_utils:detect_encoding(OptsList) end. @@ -130,97 +130,6 @@ eventify([]) -> eventify([Next|Rest]) -> fun() -> {event, Next, eventify(Rest)} end. - - -%% internal functions - - -%% encoding detection -%% first check to see if there's a bom, if not, use the rfc4627 method for -%% determining encoding. this function makes some assumptions about the -%% validity of the stream which may delay failure later than if an encoding is -%% explicitly provided - -detect_encoding(OptsList) -> - fun(Stream) -> detect_encoding(Stream, OptsList) end. 
- -%% utf8 bom detection -detect_encoding(<<16#ef, 16#bb, 16#bf, Rest/binary>>, Opts) -> - (jsx_utf8:parser(Opts))(Rest); -%% utf32-little bom detection (this has to come before utf16-little or it'll -%% match that) -detect_encoding(<<16#ff, 16#fe, 0, 0, Rest/binary>>, Opts) -> - (jsx_utf32le:parser(Opts))(Rest); -%% utf16-big bom detection -detect_encoding(<<16#fe, 16#ff, Rest/binary>>, Opts) -> - (jsx_utf16:parser(Opts))(Rest); -%% utf16-little bom detection -detect_encoding(<<16#ff, 16#fe, Rest/binary>>, Opts) -> - (jsx_utf16le:parser(Opts))(Rest); -%% utf32-big bom detection -detect_encoding(<<0, 0, 16#fe, 16#ff, Rest/binary>>, Opts) -> - (jsx_utf32:parser(Opts))(Rest); - -%% utf32-little null order detection -detect_encoding(<<X, 0, 0, 0, _Rest/binary>> = JSON, Opts) when X =/= 0 -> - (jsx_utf32le:parser(Opts))(JSON); -%% utf32-big null order detection -detect_encoding(<<0, 0, 0, X, _Rest/binary>> = JSON, Opts) when X =/= 0 -> - (jsx_utf32:parser(Opts))(JSON); -%% utf16-little null order detection -detect_encoding(<<X, 0, _, 0, _Rest/binary>> = JSON, Opts) when X =/= 0 -> - (jsx_utf16le:parser(Opts))(JSON); -%% utf16-big null order detection -detect_encoding(<<0, X, 0, _, _Rest/binary>> = JSON, Opts) when X =/= 0 -> - (jsx_utf16:parser(Opts))(JSON); -%% utf8 null order detection -detect_encoding(<<X, Y, _Rest/binary>> = JSON, Opts) when X =/= 0, Y =/= 0 -> - (jsx_utf8:parser(Opts))(JSON); - -%% a problem, to autodetect naked single digits' encoding, there is not enough -%% data to conclusively determine the encoding correctly. 
below is an attempt -%% to solve the problem -detect_encoding(<<X>>, Opts) when X =/= 0 -> - {incomplete, - fun(end_stream) -> - try - {incomplete, Next} = (jsx_utf8:parser(Opts))(<<X>>), - Next(end_stream) - catch error:function_clause -> {error, {badjson, <<X>>}} - end - ; (Stream) -> detect_encoding(<<X, Stream/binary>>, Opts) - end - }; -detect_encoding(<<0, X>>, Opts) when X =/= 0 -> - {incomplete, - fun(end_stream) -> - try - {incomplete, Next} = (jsx_utf16:parser(Opts))(<<0, X>>), - Next(end_stream) - catch error:function_clause -> {error, {badjson, <<0, X>>}} - end - ; (Stream) -> detect_encoding(<<0, X, Stream/binary>>, Opts) - end - }; -detect_encoding(<<X, 0>>, Opts) when X =/= 0 -> - {incomplete, - fun(end_stream) -> - try - {incomplete, Next} = (jsx_utf16le:parser(Opts))(<<X, 0>>), - Next(end_stream) - catch error:function_clause -> {error, {badjson, <<X, 0>>}} - end - ; (Stream) -> detect_encoding(<<X, 0, Stream/binary>>, Opts) - end - }; - -%% not enough input, request more -detect_encoding(Bin, Opts) -> - {incomplete, - fun(end_stream) -> {error, {badjson, Bin}} - ; (Stream) -> detect_encoding(<<Bin/binary, Stream/binary>>, Opts) - end - }. -ifdef(TEST). diff --git a/src/jsx_utils.erl b/src/jsx_utils.erl index cf982de..406c6e4 100644 --- a/src/jsx_utils.erl +++ b/src/jsx_utils.erl @@ -23,7 +23,7 @@ -module(jsx_utils). --export([nice_decimal/1]). +-export([nice_decimal/1, detect_encoding/1, detect_encoding/2]). -ifdef(TEST). @@ -171,6 +171,93 @@ to_ascii(15) -> "f"; to_ascii(X) -> [X + 48]. %% ascii "1" is [49], "2" is [50], etc... +%% encoding detection +%% first check to see if there's a bom, if not, use the rfc4627 method for +%% determining encoding. this function makes some assumptions about the +%% validity of the stream which may delay failure later than if an encoding is +%% explicitly provided + +detect_encoding(OptsList) -> + fun(Stream) -> detect_encoding(Stream, OptsList) end. 
+ +%% utf8 bom detection +detect_encoding(<<16#ef, 16#bb, 16#bf, Rest/binary>>, Opts) -> + (jsx_utf8:parser(Opts))(Rest); +%% utf32-little bom detection (this has to come before utf16-little or it'll +%% match that) +detect_encoding(<<16#ff, 16#fe, 0, 0, Rest/binary>>, Opts) -> + (jsx_utf32le:parser(Opts))(Rest); +%% utf16-big bom detection +detect_encoding(<<16#fe, 16#ff, Rest/binary>>, Opts) -> + (jsx_utf16:parser(Opts))(Rest); +%% utf16-little bom detection +detect_encoding(<<16#ff, 16#fe, Rest/binary>>, Opts) -> + (jsx_utf16le:parser(Opts))(Rest); +%% utf32-big bom detection +detect_encoding(<<0, 0, 16#fe, 16#ff, Rest/binary>>, Opts) -> + (jsx_utf32:parser(Opts))(Rest); + +%% utf32-little null order detection +detect_encoding(<<X, 0, 0, 0, _Rest/binary>> = JSON, Opts) when X =/= 0 -> + (jsx_utf32le:parser(Opts))(JSON); +%% utf32-big null order detection +detect_encoding(<<0, 0, 0, X, _Rest/binary>> = JSON, Opts) when X =/= 0 -> + (jsx_utf32:parser(Opts))(JSON); +%% utf16-little null order detection +detect_encoding(<<X, 0, _, 0, _Rest/binary>> = JSON, Opts) when X =/= 0 -> + (jsx_utf16le:parser(Opts))(JSON); +%% utf16-big null order detection +detect_encoding(<<0, X, 0, _, _Rest/binary>> = JSON, Opts) when X =/= 0 -> + (jsx_utf16:parser(Opts))(JSON); +%% utf8 null order detection +detect_encoding(<<X, Y, _Rest/binary>> = JSON, Opts) when X =/= 0, Y =/= 0 -> + (jsx_utf8:parser(Opts))(JSON); + +%% a problem, to autodetect naked single digits' encoding, there is not enough +%% data to conclusively determine the encoding correctly. 
below is an attempt +%% to solve the problem +detect_encoding(<<X>>, Opts) when X =/= 0 -> + {incomplete, + fun(end_stream) -> + try + {incomplete, Next} = (jsx_utf8:parser(Opts))(<<X>>), + Next(end_stream) + catch error:function_clause -> {error, {badjson, <<X>>}} + end + ; (Stream) -> detect_encoding(<<X, Stream/binary>>, Opts) + end + }; +detect_encoding(<<0, X>>, Opts) when X =/= 0 -> + {incomplete, + fun(end_stream) -> + try + {incomplete, Next} = (jsx_utf16:parser(Opts))(<<0, X>>), + Next(end_stream) + catch error:function_clause -> {error, {badjson, <<0, X>>}} + end + ; (Stream) -> detect_encoding(<<0, X, Stream/binary>>, Opts) + end + }; +detect_encoding(<<X, 0>>, Opts) when X =/= 0 -> + {incomplete, + fun(end_stream) -> + try + {incomplete, Next} = (jsx_utf16le:parser(Opts))(<<X, 0>>), + Next(end_stream) + catch error:function_clause -> {error, {badjson, <<X, 0>>}} + end + ; (Stream) -> detect_encoding(<<X, 0, Stream/binary>>, Opts) + end + }; + +%% not enough input, request more +detect_encoding(Bin, Opts) -> + {incomplete, + fun(end_stream) -> {error, {badjson, Bin}} + ; (Stream) -> detect_encoding(<<Bin/binary, Stream/binary>>, Opts) + end + }. + %% eunit tests -ifdef(TEST).