Comparing version 1.8.0 to 1.9.0
```diff
@@ -122,7 +122,16 @@ const varint = require('fast-varint')
 function encodeIdempotent(value, buffer, start) {
-  encode(value, buffer, start)
+  const len = encode(value, buffer, start)
   buffer._IS_BIPF_ENCODED = true
+  return len
 }
+
+function markIdempotent(buffer) {
+  buffer._IS_BIPF_ENCODED = true
+  return buffer
+}
+
+function isIdempotent(buffer) {
+  return !!buffer._IS_BIPF_ENCODED
+}
+
 function getEncodedLength(buffer, start) {
@@ -153,2 +162,4 @@ return varint.decode(buffer, start) >> TAG_SIZE
   encodeIdempotent,
+  markIdempotent,
+  isIdempotent,
   getType,
   getEncodedLength,
```
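As a quick illustration of the new helpers, here is a minimal sketch, assuming the exports shown in this diff together with the documented `encodingLength` and `allocAndEncode` API:

```js
const bipf = require('bipf')

// encodeIdempotent writes the value like encode(), tags the buffer as
// already-encoded BIPF, and (as of this change) returns the bytes written.
const value = { street: '123 Main St' }
const buf = Buffer.alloc(bipf.encodingLength(value))
const len = bipf.encodeIdempotent(value, buf, 0)
console.log(len === buf.length) // true

// A buffer produced by a plain encode is not tagged...
const plain = bipf.allocAndEncode(value)
console.log(bipf.isIdempotent(plain)) // false

// ...but markIdempotent tags it in place and returns the same buffer.
console.log(bipf.isIdempotent(bipf.markIdempotent(plain))) // true
```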
```diff
@@ -19,2 +19,4 @@ const varint = require('fast-varint')
   encodeIdempotent,
+  markIdempotent,
+  isIdempotent,
   encodingLength,
@@ -82,2 +84,4 @@ allocAndEncode,
   encodeIdempotent,
+  markIdempotent,
+  isIdempotent,
   decode,
```
```diff
 {
   "name": "bipf",
   "description": "binary in-place format",
-  "version": "1.8.0",
+  "version": "1.9.0",
   "homepage": "https://github.com/ssbc/bipf",
@@ -43,3 +43,3 @@ "repository": {
   },
-  "readme": "# BIPF\n\n… (full 1.8.0 README text) …"
+  "readme": "# BIPF\n\n… (full 1.9.0 README text; the only change adds the markIdempotent and isIdempotent sections shown in the README.md diff below) …"
 }
```
```diff
@@ -194,2 +194,15 @@ # BIPF
 
+### markIdempotent(buffer) => buffer
+
+does nothing else but *tag* the buffer as being a `bipf` buffer, such
+that you can place it in another encoded bipf, and it won't be "double
+encoded", it will just be embedded inside the larger buffer.
+
+returns the same buffer as the input.
+
+### isIdempotent(buffer) => boolean
+
+returns true if `buffer` received an `encodeIdempotent()` call or a
+`markIdempotent()` call.
+
 ### decode(buffer, start) => value
```
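The new README sections suggest the usage sketched below; this mirrors the README's existing `allocAndEncodeIdempotent` example, swapping in `markIdempotent` on a separately encoded buffer:

```js
const assert = require('assert')
const bipf = require('bipf')

const obj = { address: { street: '123 Main St' } }
const buf1 = bipf.allocAndEncode(obj)

// Encode the inner object separately, then mark it, so the outer encode
// embeds it as-is instead of double encoding it as a plain buffer value.
const innerBuf = bipf.markIdempotent(bipf.allocAndEncode({ street: '123 Main St' }))
assert(bipf.isIdempotent(innerBuf))

const buf2 = bipf.allocAndEncode({ address: innerBuf })
assert.deepStrictEqual(buf1, buf2) // embedded, not double encoded
```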
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package.