From fa81a0b3b4133e89d815ff3a0c1806866726938a Mon Sep 17 00:00:00 2001 From: alisdair sullivan Date: Thu, 29 Jul 2010 22:59:23 -0700 Subject: [PATCH] updated README --- README | 0 README.markdown | 175 ++++++++++++++++++++++++++++++++++++++++++++++++ makefile | 9 ++- 3 files changed, 181 insertions(+), 3 deletions(-) delete mode 100644 README create mode 100644 README.markdown diff --git a/README b/README deleted file mode 100644 index e69de29..0000000 diff --git a/README.markdown b/README.markdown new file mode 100644 index 0000000..afc7057 --- /dev/null +++ b/README.markdown @@ -0,0 +1,175 @@ +jsx +=== + +basically yajl[1], but in erlang. born from a need for a stream based, incremental parser capable of outputting a number of representations + + + +overview +-------- + +jsx is a json parsing toolkit, to be used to convert json streams to arbitrary output representations. see: + + + %% an eep0018 (erlang extension proposal for json decoding) json decoder + + %% this is nearly complete, ignoring only the options json_to_term should accept + + %% [true,[{<<"key">>, <<"value">>}],1] = json_to_term(<<"[ true, { \"key\": \"value\" }, 1 ]">>). + + json_to_term(JSON) -> + P = jsx:parser(), + collect_start(P(JSON), [[]]). + + collect_start({event, Event, Next}, Acc) when Event =:= start_object; Event =:= start_array -> + collect(Next(), [[]|Acc]); + collect_start(_, _) -> + erlang:error(badarg). + + collect({event, Event, Next}, Acc) when Event =:= start_object; Event =:= start_array -> + collect(Next(), [[]|Acc]); + + %% special case for empty object + collect({event, end_object, Next}, [[], Parent|Rest]) -> + collect(Next(), [[[{}]] ++ Parent] ++ Rest); + %% reverse the array/object accumulator before prepending it to it's parent + collect({event, end_object, Next}, [Current, Parent|Rest]) when is_list(Parent) -> + collect(Next(), [[lists:reverse(Current)] ++ Parent] ++ Rest); + collect({event, end_array, Next}, [Current, Parent|Rest]) when is_list(Parent) -> + collect(Next(), [[lists:reverse(Current)] ++ Parent] ++ Rest); + collect({event, Start, Next}, [Current, Key, Parent|Rest]) + when Start =:= end_object; Start =:= end_array -> + collect(Next(), [[{Key, lists:reverse(Current)}] ++ Parent] ++ Rest); + + %% end of json is emitted asap (at close of array/object), Next() should return {incomplete, More} + %% and then More(end_stream) ensures the tail of the json binary is clean (whitespace only) + collect({event, end_json, Next}, [[Acc]]) -> + case Next() of + {incomplete, More} -> case More(end_stream) of + ok -> Acc + ; _ -> erlang:error(badarg) + end + ; _ -> erlang:error(badarg) + end; + + %% key can only be emitted inside of a json object, so just insert it directly into + %% the head of the accumulator and deal with it when we receive it's paired value + collect({event, {key, _} = PreKey, Next}, [Current|_] = Acc) -> + Key = event(PreKey), + case key_repeats(Key, Current) of + true -> erlang:error(badarg) + ; false -> collect(Next(), [Key] ++ Acc) + end; + + %% check acc to see if we're inside an object or an array. because inside an object + %% context the events that fall this far are always preceded by a key (which are + %% binaries or atoms), if Current is a list, we're inside an array, else, an + %% object + collect({event, Event, Next}, [Current|Rest]) when is_list(Current) -> + collect(Next(), [[event(Event)] ++ Current] ++ Rest); + collect({event, Event, Next}, [Key, Current|Rest]) -> + collect(Next(), [[{Key, event(Event)}] ++ Current] ++ Rest); + + %% any other event is an error + collect(_, _) -> erlang:error(badarg). + + + event({string, String}) -> + unicode:characters_to_binary(String); + event({key, Key}) -> + unicode:characters_to_binary(Key); + %% special case for negative zero + event({integer, "-0"}) -> + erlang:float(erlang:list_to_integer("-0")); + event({integer, Integer}) -> + erlang:list_to_integer(Integer); + event({float, Float}) -> + erlang:list_to_float(Float); + event({literal, Literal}) -> + Literal. + + + key_repeats(Key, [{Key, _Value}|_Rest]) -> true; + key_repeats(Key, [_|Rest]) -> key_repeats(Key, Rest); + key_repeats(_Key, []) -> false. + + + + %% the twitter json API has a ton of detail in it's stream, most of which + %% is ignorable. this parser extracts just the user screenname, the text + %% and the time and throws the rest away + + %% {"talentdeficit", "use jsx!", "Fri Jul 30 04:25:50 +0000 2010"} = tweet_to_tuple(...). + + tweet_to_tuple(Tweet) -> + P = jsx:parser(), + scan(P(Tweet), {name, text, time}). + + scan({event, {key, "screen_name"}, Next}, Acc) -> + collect_name(Next(), Acc); + scan({event, {key, "created_at"}, Next}, Acc) -> + collect_time(Next(), Acc); + scan({event, {key, "text"}, Next}, Acc) -> + collect_text(Next(), Acc); + scan({event, end_json, Next}, Acc) -> + Acc; + scan({_, _, Next}, Acc) -> + scan(Next(), Acc). + + collect_name({event, {string, Name}, Next}, {_, Text, Time}) -> + scan(Next(), {Name, Text, Time}). + collect_time({event, {string, Time}, Next}, {Name, Text, _}) -> + scan(Next(), {Name, Text, Time}). + collect_text({event, {string, Text}, Next}, {Name, _, Time}) -> + scan(Next(), {Name, Text, Time}). + + + +api +--- + +`jsx:parser/0` returns a function with arity 1. call it with a binary containing a properly encoded json stream and it returns one of the following values: + +* `{event, Event, Next}`: Event is described below and Next is a zero arity function that returns the next value +* `{incomplete, More}`: More is described below +* `{error, badjson}`: the parser has decided the json stream is not a valid json document + +`jsx:parser/1` has the same return signature as `jsx:parser/0` but accepts a proplist containing the following options: + +* `{comments, true | false}`: determines whether (/* c style */) comments are allowed. default is false +* `{escaped_unicode, ascii | codepoint | none}`: determines what escaped unicode sequences like `"\ua123"` are converted to in key/string events. ascii converts any sequences in the range 0-127 to their ascii value (`"\u0021"` would become 33), codepoint converts all valid sequences to their unicode codepoint value (`"\uabcd"` would become 43981), none does no conversion (`"\u0021"` would become the sequence 92, 117, 48, 48, 50, 49) +* `{encoding, utf8 | utf16 | {utf16, little} | utf32 | {utf32, little} | auto}`: determines which encoding to expect the binary to use. a contrary encoding will result in an error. auto will auto detect the encoding. auto is the default +* `{multi_term, true | false}`: normally, after the end of a json document only whitespace is allowed, passing this option with true alters parsing so after the end of a json document, another json document is permitted, with any amount of whitespace in between. default is false + +More is a new parser returned when parsing encounters the end of the stream supplied to the parser. calling it with a new stream resumes parsing as if the stream was not interrupted. because json documents may be followed by arbitrary whitespace, there is no unambiguous ending to a json stream. Next will always eventually return `{incomplete, More}`. to ensure the stream is clean and contains no garbage in the tail, call `More(end_stream)`. if `ok` is returned, parsing is complete. otherwise `{error, badjson}` will be returned + + + +events +------ + +a complete list of events. Next is described under *api* above: + +* `{event, start_object | end_object | start_array | end_array, Next}`: emitted when `{`, `}`, `[`, `]` are encountered by the parser in a legal position +* `{event, end_json, Next}`: emitted when the json document has been completed (the root object/array has been closed). note that this does NOT ensure the json document is valid, the tail must be checked to be free of invalid characters as described above under *api* +* `{event, {key, Key}, Next}`: an object key has been encountered. Key has the same format as String below +* `{event, {string, String}, Next}`: a string has been encountered. String is a list of unicode codepoints +* `{event, {integer, Integer}, Next}`: an integer had been encountered. Integer is a list of unicode codepoints that can be passed to `erlang:list_to_integer/1` to convert to an integer +* `{event, {float, Float}, Next}`: a float has been encountered. Float is a list of unicode codepoints that can be passed to `erlang:list_to_float/1` to convert to a float +* `{event, {literal, true | false | null}, Next}`: a json literal has been encountered. it will be the atom `true`, the atom `false` or the atom `null` + + +notes +----- + +to compile and install, run `make && make install` from the root of the project directory + +jsx supports utf8, utf16 (little and big endian) and utf32 (little and big endian). future support is planned for erlang iolists + + + + + + + +[1]: http://github.com/lloyd/yajl diff --git a/makefile b/makefile index d760d0b..016e34c 100644 --- a/makefile +++ b/makefile @@ -6,14 +6,17 @@ expand: ./priv/backends.escript create test: compile - ./priv/jsx_test.escript test/cases + ./test/jsx_test.escript test/cases prove: compile - prove ./priv/jsx_test.escript + prove ./test/jsx_test.escript clean: ./rebar clean ./priv/backends.escript clean package: compile - ./rebar install target=. \ No newline at end of file + ./rebar install target=. + +install: compile + ./rebar install \ No newline at end of file