From ad0b87aa3722415e42475ba4948aca0cb3621383 Mon Sep 17 00:00:00 2001 From: alisdair sullivan Date: Wed, 2 Jun 2010 23:02:45 -0700 Subject: [PATCH] readme updated --- README.markdown | 171 +++++++++++++++++++++++++++++++++++++- examples/jsx_prettify.erl | 4 + src/jsx.erl | 4 +- 3 files changed, 176 insertions(+), 3 deletions(-) diff --git a/README.markdown b/README.markdown index 7020479..8ac66e6 100644 --- a/README.markdown +++ b/README.markdown @@ -1,2 +1,171 @@ -%%% JSX: A Streaming, Evented JSON Parser Library %%% +# jsx: a streaming, event driven json parser library # + +## why another json parser written in erlang? ## + +none of the existing parsers support incremental parsing of json streams. none of the existing parsers are output representation neutral. none of the existing parsers are as fast as they could be. + + +## how do I use it? ## + +first, install it: + + chmod u+x rebar + ./rebar compile + ./rebar install + +next, start up an interactive session: + + 1> F = jsx:decoder(). + #Fun + 2> F(<<"[ \"some json\" ]">>). + {[start_array,{string,"some json"},end_array], + #Fun} + +that's it! + + +## that doesn't seem like a very friendly (or neutral...) output representation ## + +that's not meant for human consumption, or even use in your programs. it's a list of json events output by the parser that can be processed by a callback module to do whatever you want. there's a simple parser in examples you can check out to see what I mean: + + 1. jsx_parser:decode(<<"[ \"some json\" ]">>). + {ok,["some json"]} + +jsx_parser converts the events into a list containing a list of unicode codepoints (that conveniently is identical to what erlang considers a string). another example: + + 2. {ok, Dict} = jsx_parser:decode(<<"{ \"key\": 42 }">>). + {ok,{dict,1,16,16,8,80,48, + {[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]}, + {{[],[],[],[],[],[],[],[],[],[],[],[], + [["key"|42]], + [],[],[]}}}} + 3. dict:fetch("key", Dict). 
+ 42 + +the code that builds this representation from the list of events is less than 55 lines, not including comments. here's the fun that builds the default representation: + + fun(end_of_stream, State) -> lists:reverse(State) ;(Event, State) -> [Event] ++ State end + +that just collects the events into a list and returns them when finished. that's not all jsx can do: + + 4. Pretty = jsx_prettify:pretty(<<"[ \"life\", \"the universe\", \"everything\", { \"answer\": 42 } ]">>, []). + "[\n \"life\",\n \"the universe\",\n \"everything\",\n {\n \"answer\": 42\n }\n]" + 5. io:format("~s~n", [Pretty]). + [ + "life", + "the universe", + "everything", + { + "answer": 42 + } + ] + + +## the API ## + +there are three functions exported from jsx.erl (well, four, but don't use detect_encoding/4 directly): + +1. jsx:decoder/0 +2. jsx:decoder/1 +3. jsx:decoder/2 + +decoder/0 returns the default decoder (the one that just returns a list of events) while decoder/1 returns a decoder with the given options and the default callback handler. decoder/2 is the interesting one. it takes a callback handler and a proplist of options as specified below. + +return values are {Value, F}, {error, Error} or {incomplete, F}. see 'incremental parsing' below for an explanation of F. the only error currently returned is badjson, representing a failure to parse the json. other errors are unintentional, but may currently be encountered. + + +## callback handlers ## + +callback handlers are essentially a fold over the list of events. a function is called with the json event and an arbitrary term and a new term is returned to be passed to the function along with the next event. they're passed to decoder/2 either as {F, Term} where F is an anonymous function of arity 2 or {Module, Function, Term} where Module is a module that exports Function. 
Here's an example using the default callback function seen above: + + F = jsx:decoder({fun(end_of_stream, State) -> lists:reverse(State) ;(Event, State) -> [Event] ++ State end, []}, []) + +and an example from jsx_parser: + + F = jsx:decoder({jsx_parser, event, []}, []) + + +## options ## + +the second, mystery, argument seen above is a proplist of options used to configure the parser. currently there are only three possible options: + +#### comments #### + +possible values are true or false, the default is false. this allows c style comments in the json input stream in between any pair of events. + +#### escaped_unicode #### + +possible values are ascii, codepoint or none. default is codepoint. this option controls how json unicode escapes are handled. you know, those \uXXXX things. none doesn't do anything with them, leaving them in the string intact. ascii will convert any that represent ascii codepoints (including control characters) into their corresponding codepoint and treat all other escapes as none. codepoint will convert any that represent any unicode codepoint into its corresponding codepoint and leave unicode non-characters as strings. because json is silly, it will also attempt to convert utf16 surrogates encoded as json escapes into the correct codepoints. codepoint codepoint codepoint. + +#### encoding #### + +possible values are utf8, utf16, {utf16,little}, utf32, {utf32,little} and auto. default is auto. this just forces the parser to attempt to interpret the json as the chosen encoding. auto will autodetect the encoding. + + +## events ## + +the things the parser emits to your callback function. + + start_array, start_object, end_array, end_object + +these denote [, {, ] and } respectively. + + {key, [Codepoints]} + {string, [Codepoints]} + {integer, [Codepoints]} + {float, [Codepoints]} + +keys, strings, integers and floats are all returned as lists of unicode codepoints. if your json input is ascii, these are identical to erlang strings. 
otherwise, they can have non-printable characters in them and should be handled with the unicode module in the stdlib. if your input is utf8, occasionally you can get lists which appear to be valid latin-1; this is just coincidence. + +floats and integers are always ascii printable, and should always be passable to erlang:list\_to\_float and erlang:list\_to\_integer, respectively. + + {literal, true} + {literal, false} + {literal, null} + +atoms representing the json boolean values. (and null, javascript is terrible). + + end_of_stream + +the json stream has been successfully parsed completely. + + +## incremental parsing ## + +jsx is a stream parser. it can parse partial json input: + + 6. F = jsx:decoder(). + #Fun + 7. {_, G} = F(<<"[ tr">>). + {incomplete,#Fun} + 8. {_, H} = G(<<"ue ]">>). + {[start_array,{literal,true},end_array], + #Fun} + +even upon returning a value, the parser still returns a new decoder that can be called on subsequent input in the stream: + + 9. H(<<" ">>). + {[start_array,{literal,true},end_array], + #Fun} + +this can be used to make sure the tails of json input are clean or when parsing naked json numbers: + + 10. {_, I} = F(<<"1">>). + {[{integer,"1"}],#Fun} + 11. {_, J} = I(<<"2">>). + {[{integer,"12"}],#Fun} + 12. J(<<"3">>). 
+ {[{integer,"123"}],#Fun jsx_event(start_object, {Acc, Indent, Level, value}) -> {Acc ++ ",\n" ++ indent(Indent, Level) ++ "{", Indent, Level + 1, new}; +jsx_event(start_object, {Acc, Indent, Level, new}) -> + {Acc ++ ",\n" ++ indent(Indent, Level) ++ "{", Indent, Level + 1, new}; jsx_event(start_object, {Acc, Indent, Level, _}) -> {Acc ++ "{", Indent, Level + 1, new}; jsx_event(start_array, {Acc, Indent, Level, value}) -> {Acc ++ ",\n" ++ indent(Indent, Level) ++ "[", Indent, Level + 1, new}; +jsx_event(start_array, {Acc, Indent, Level, new}) -> + {Acc ++ ",\n" ++ indent(Indent, Level) ++ "[", Indent, Level + 1, new}; jsx_event(start_array, {Acc, Indent, Level, _}) -> {Acc ++ "[", Indent, Level + 1, new}; diff --git a/src/jsx.erl b/src/jsx.erl index 4c78fb6..6b3c2bd 100644 --- a/src/jsx.erl +++ b/src/jsx.erl @@ -45,8 +45,8 @@ start(Callbacks, OptsList) -> utf8 -> fun jsx_utf8:start/4 ; utf16 -> fun jsx_utf16:start/4 ; utf32 -> fun jsx_utf32:start/4 - ; utf16le -> fun jsx_utf16le:start/4 - ; utf32le -> fun jsx_utf32le:start/4 + ; {utf16, little} -> fun jsx_utf16le:start/4 + ; {utf32, little} -> fun jsx_utf32le:start/4 ; auto -> fun jsx:detect_encoding/4 end, start(Callbacks, Opts, F).