diff --git a/README.md b/README.md index 81f8dda..75e9960 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# jsx (v1.4.3) # +# jsx (v2.0) # an erlang application for consuming, producing and manipulating [json][json]. inspired by [yajl][yajl] @@ -116,24 +116,29 @@ false jsx is an erlang application for consuming, producing and manipulating [json][json] -json has a [spec][rfc4627] but common usage differs subtly. it's common -usage jsx attempts to address, with guidance from the spec +jsx follows the json [spec][rfc4627] as closely as possible with allowances for +real world usage -all json produced and consumed by jsx should be `utf8` encoded text or a -reasonable approximation thereof. ascii works too, but anything beyond that -i'm not going to make any promises. **especially** not latin1 +jsx is pragmatic. the json spec allows extensions so jsx extends the spec in a +number of ways. see the section on `strict` in [options](#option) below though -the [spec][rfc4627] thinks json values must be wrapped in a json array or -object but everyone else disagrees so jsx allows naked json values by default. -if you're a curmudgeon who's offended by this deviation here is a wrapper for -you: +there's not supposed to be any comments in json but when did comments ever hurt +anyone? json has no official comments but this parser allows c/c++ style comments. +anywhere whitespace is allowed you can insert comments (both `// ...` and `/* ... */`) + +all jsx decoder input should be `utf8` encoded binaries. sometimes you get binaries +that are almost but not quite valid utf8 whether due to improper escaping or poor +encoding. jsx replaces invalid codepoints and poorly formed sequences with the +unicode replacement character (`u+FFFD`) + +json only allows keys and strings to be delimited by double quotes (`u+0022`) but +javascript allows them to be delimited by single quotes (`u+0027`) as well. jsx +follows javascript in this. strings that start with single quotes can contain double +quotes but must end with single quotes and must escape any single quotes they contain + +json and jsx only recognize escape sequences as outlined in the json spec. it just +ignores bad escape sequences -```erlang -%% usage: `real_json(jsx:decode(JSON))` -real_json(Result) when is_list(Result) -> Result; -real_json(Result) when is_tuple(Result, 2) -> Result; -real_json(_) -> erlang:error(badarg). -``` ### json <-> erlang mapping ### @@ -218,22 +223,30 @@ real_json(_) -> erlang:error(badarg). ### incomplete input ### -jsx handles incomplete json texts. if a partial json text is parsed, rather than -returning a term from your callback handler, jsx returns `{incomplete, F}` where -`F` is a function with an identical API to the anonymous fun returned from -`decoder/3`, `encoder/3` or `parser/3`. it retains the internal state of the -parser at the point where input was exhausted. this allows you to parse as you -stream json over a socket or file descriptor, or to parse large json texts -without needing to keep them entirely in memory +jsx can handle incomplete json texts. if the option `stream` is passed to the decoder +or parser and if a partial json text is parsed, rather than returning a term from +your callback handler, jsx returns `{incomplete, F}` where `F` is a function with +an identical API to the anonymous fun returned from `decoder/3`, `encoder/3` or +`parser/3`. it retains the internal state of the parser at the point where input +was exhausted. this allows you to parse as you stream json over a socket or file +descriptor, or to parse large json texts without needing to keep them entirely in +memory -however, it is important to recognize that jsx is greedy by default. jsx will -consider the parsing complete if input is exhausted and the json text is not -unambiguously incomplete. this is mostly relevant when parsing bare numbers like -`<<"1234">>`. this could be a complete json integer or just the beginning of a -json integer that is being parsed incrementally. jsx will treat it as a whole -integer. calling jsx with the [option](#options) `stream` reverses this -behavior and never considers parsing complete until the `incomplete` function is -called with the argument `end_stream` +however, it is important to recognize that jsx is conservative by default. jsx will +not consider the parsing complete even when input is exhausted and the json text is +unambiguously incomplete. to end parsing call the `incomplete` function with the +argument `end_stream` like: + +```erlang +1> {incomplete, F} = jsx:decode(<<"[">>, [stream]). +{incomplete,#Fun} +2> F(end_stream). +** exception error: bad argument +3> {incomplete, G} = F(<<"]">>). +{incomplete,#Fun} +4> G(end_stream). +[] +``` ## data types ## @@ -262,6 +275,34 @@ json_text() = binary() a utf8 encoded binary containing a json string + +#### `token()` #### + +```erlang +event() = start_object + | end_object + | start_array + | end_array + | {key, binary()} + | {string, binary()} + | binary() + | {integer, integer()} + | integer() + | {float, float()} + | float() + | {literal, true} + | true + | {literal, false} + | false + | {literal, null} + | null + | end_json +``` + +the representation used during syntactic analysis. you can generate this +yourself and feed it to `jsx:parser/3` if you'd like to define your own +representations + #### `event()` #### ```erlang @@ -279,50 +320,32 @@ event() = start_object | end_json ``` -#### `token()` #### - -```erlang -token() = event() - | binary() - | {number, integer() | float()} - | integer() - | float() - | true - | false - | null -``` - -the representation used during syntactic analysis. you can generate this -yourself and feed it to `jsx:parser/3` if you'd like to define your own -representations +the subset of [`token()`](#token) emitted by the decoder and encoder to handlers #### `option()` #### ```erlang -option() = replaced_bad_utf8 - | escaped_forward_slashes - | single_quoted_strings - | unescaped_jsonp - | comments +option() = escaped_forward_slashes | escaped_strings + | unescaped_jsonp | dirty_strings - | ignored_bad_escapes - | relax + | strict + | {strict, [strict_option()]} | stream -``` + | {incomplete_handler, fun()} + | {error_handler, fun()} + +strict_option() = comments + | utf8 + | single_quotes + | escapes +``` jsx functions all take a common set of options. not all flags have meaning in all contexts, but they are always valid options. functions may have additional options beyond these. see [individual function documentation](#exports) for details -- `replaced_bad_utf8` - - json text input and json strings SHOULD be utf8 encoded binaries, - appropriately escaped as per the json spec. attempts are made to replace - invalid codepoints with `u+FFFD` as per the unicode spec when this option is - present. this applies both to malformed unicode and disallowed codepoints - - `escaped_forward_slashes` json strings are escaped according to the json spec. this means forward @@ -330,35 +353,6 @@ additional options beyond these. see are left unescaped. you may want to use this if you are embedding json directly into a html or xml document -- `single_quoted_strings` - - some parsers allow double quotes (`u+0022`) to be replaced by single quotes - (`u+0027`) to delimit keys and strings. this option allows json containing - single quotes as structural characters to be parsed without errors. note - that the parser expects strings to be terminated by the same quote type that - opened it and that single quotes must, obviously, be escaped within strings - delimited by single quotes - - double quotes must **always** be escaped, regardless of what kind of quotes - delimit the string they are found in - - the parser will never emit json with keys or strings delimited by single - quotes - -- `unescaped_jsonp` - - javascript interpreters treat the codepoints `u+2028` and `u+2029` as - significant whitespace. json strings that contain either of these codepoints - will be parsed incorrectly by some javascript interpreters. by default, - these codepoints are escaped (to `\u2028` and `\u2029`, respectively) to - retain compatibility. this option simply removes that escaping - -- `comments` - - json has no official comments but some parsers allow c/c++ style comments. - anywhere whitespace is allowed this flag allows comments (both `// ...` and - `/* ... */`) - - `escaped_strings` by default both the encoder and decoder return strings as utf8 binaries @@ -367,12 +361,14 @@ additional options beyond these. see unaltered. this flag escapes strings as if for output in json, removing control codes and problematic codepoints and replacing them with the appropriate escapes + +- `unescaped_jsonp` -- `ignored_bad_escapes` - - during decoding ignore unrecognized escape sequences and leave them as is in - the stream. note that combining this option with `escaped_strings` will - result in the escape character itself being escaped + javascript interpreters treat the codepoints `u+2028` and `u+2029` as + significant whitespace. json strings that contain either of these codepoints + will be parsed incorrectly by some javascript interpreters. by default, + these codepoints are escaped (to `\u2028` and `\u2029`, respectively) to + retain compatibility. this option simply removes that escaping - `dirty_strings` @@ -380,20 +376,39 @@ additional options beyond these. see can result in unwanted behaviour. if your strings are already escaped (or you'd like to force invalid strings into "json" you monster) use this flag to bypass escaping. this can also be used to read in **really** invalid json - strings. everything but escaped quotes are passed as is to the resulting - string term. note that this overrides `ignored_bad_escapes`, - `unescaped_jsonp` and `escaped_strings` + strings. everything between unescaped quotes are passed as is to the resulting + string term. note that this takes precedence over any other options + +- `strict` + + as mentioned [earlier](#description), jsx is pragmatic. if you're more of a + json purist or you're really into bdsm stricter adherence to the spec is + possible. the following restrictions are available + + * `comments` + + comments are disabled and result in a `badarg` error + + * `utf8` + + invalid codepoints and malformed unicode result in `badarg` errors + + * `single_quotes` + + only keys and strings delimited by double quotes (`u+0022`) are allowed. the + single quote (`u+0027`) results in a `badarg` error + + * `escapes` + + escape sequences not adhering to the json spec result in a `badarg` error + + any combination of these can be passed to jsx by using `{strict, [strict_option()]}`. + `strict` is equivalent to `{strict, [comments, bad_utf8, single_quotes, escapes]}` - `stream` see [incomplete input](#incomplete-input) -- `relax` - - relax is a synonym for `[replaced_bad_utf8, single_quoted_strings, comments, - ignored_bad_escapes]` for when you don't care how absolutely terrible your - json input is, you just want the parser to do the best it can - - `incomplete_handler` & `error_handler` the default incomplete and error handlers can be replaced with user defined @@ -432,7 +447,7 @@ parser(Module, Args, Opts) -> Fun((Tokens) -> any()) Opts = [option()] JSONText = json_text() JSONTerm = json_term() - Tokens = token() | [token()] + Tokens = event() | [event()] ``` jsx is a json compiler with interleaved tokenizing, syntactic analysis and @@ -465,7 +480,7 @@ decode(JSON, Opts) -> Term JSON = json_text() Term = json_term() - Opts = [option() | labels | {labels, Label} | {post_decode, F}] + Opts = [option() | labels | {labels, Label}] Label = binary | atom | existing_atom | attempt_atom F = fun((any()) -> any()) ``` @@ -482,18 +497,6 @@ new atoms to the atom table and will result in a `badarg` error if the atom does not exist. `attempt_atom` will convert keys to atoms when they exist, and leave them as binary otherwise -`{post_decode, F}` is a user defined function of arity 1 that is called on each -output value (objects, arrays, strings, numbers and literals). it may return any -value to be substituted in the returned term. for example: - -```erlang -1> F = fun(V) when is_list(V) -> V; (V) -> false end. -2> jsx:decode(<<"{\"a list\": [true, \"a string\", 1]}">>, [{post_decode, F}]). -[{<<"a list">>, [false, false, false]}] -``` - -declaring more than one post-decoder will result in a `badarg` error exception - raises a `badarg` error exception if input is not valid json @@ -505,7 +508,7 @@ encode(Term, Opts) -> JSON Term = json_term() JSON = json_text() - Opts = [option() | {pre_encode, F} | space | {space, N} | indent | {indent, N}] + Opts = [option() | space | {space, N} | indent | {indent, N}] F = fun((any()) -> any()) N = pos_integer() ``` @@ -519,18 +522,6 @@ the option `{indent, N}` inserts a newline and `N` spaces for each level of indentation in your json output. note that this overrides spaces inserted after a comma. `indent` is an alias for `{indent, 1}`. the default is `{indent, 0}` -`{pre_encode, F}` is a user defined function of arity 1 that is called on each -input value. it may return any valid json value to be substituted in the -returned json. for example: - -```erlang -1> F = fun(V) when is_list(V) -> V; (V) -> false end. -2> jsx:encode([{<<"a list">>, [true, <<"a string">>, 1]}], [{pre_encode, F}]). -<<"{\"a list\": [false, false, false]}">> -``` - -declaring more than one pre-encoder will result in a `badarg` error exception - raises a `badarg` error exception if input is not a valid [erlang representation of json](#json---erlang-mapping) @@ -664,16 +655,11 @@ following events must be handled: the end of a json array -- `{key, binary()}` - - a key in a json object. this is guaranteed to follow either `start_object` - or a json value. it will usually be a `utf8` encoded binary. see the - [options](#option) for possible exceptions - - `{string, binary()}` a json string. it will usually be a `utf8` encoded binary. see the - [options](#option) for possible exceptions + [options](#option) for possible exceptions. note that keys are also + json strings - `{integer, integer()}` @@ -711,4 +697,4 @@ jsx wouldn't be what it is without the contributions of [paul davis](https://git [rebar]: https://github.com/rebar/rebar [meck]: https://github.com/eproxus/meck [rfc4627]: http://tools.ietf.org/html/rfc4627 -[travis]: https://travis-ci.org/ +[travis]: https://travis-ci.org/ \ No newline at end of file